{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.1260716086737267, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002521432173474534, "grad_norm": 56.41114044189453, "learning_rate": 5.025125628140703e-09, "loss": 1.5821, "num_input_tokens_seen": 2097152, "step": 1 }, { "epoch": 0.0005042864346949068, "grad_norm": 31.409353256225586, "learning_rate": 1.0050251256281407e-08, "loss": 1.5937, "num_input_tokens_seen": 4194304, "step": 2 }, { "epoch": 0.0007564296520423601, "grad_norm": 21.739652633666992, "learning_rate": 1.5075376884422108e-08, "loss": 1.2442, "num_input_tokens_seen": 6291456, "step": 3 }, { "epoch": 0.0010085728693898135, "grad_norm": 20.685302734375, "learning_rate": 2.0100502512562813e-08, "loss": 0.8062, "num_input_tokens_seen": 8388608, "step": 4 }, { "epoch": 0.0012607160867372667, "grad_norm": 22.219989776611328, "learning_rate": 2.5125628140703518e-08, "loss": 1.1513, "num_input_tokens_seen": 10485760, "step": 5 }, { "epoch": 0.0015128593040847202, "grad_norm": 28.416399002075195, "learning_rate": 3.0150753768844216e-08, "loss": 1.634, "num_input_tokens_seen": 12582912, "step": 6 }, { "epoch": 0.0017650025214321734, "grad_norm": 23.952890396118164, "learning_rate": 3.517587939698492e-08, "loss": 1.1944, "num_input_tokens_seen": 14680064, "step": 7 }, { "epoch": 0.002017145738779627, "grad_norm": 20.75243377685547, "learning_rate": 4.0201005025125626e-08, "loss": 0.7753, "num_input_tokens_seen": 16777216, "step": 8 }, { "epoch": 0.0022692889561270802, "grad_norm": 25.797378540039062, "learning_rate": 4.522613065326633e-08, "loss": 1.5984, "num_input_tokens_seen": 18874368, "step": 9 }, { "epoch": 0.0025214321734745334, "grad_norm": 25.863649368286133, "learning_rate": 5.0251256281407036e-08, "loss": 1.5978, "num_input_tokens_seen": 20971520, "step": 10 }, { "epoch": 0.002773575390821987, "grad_norm": 18.696609497070312, "learning_rate": 5.527638190954774e-08, "loss": 1.2323, "num_input_tokens_seen": 23068672, "step": 11 }, { "epoch": 0.0030257186081694403, "grad_norm": 28.740385055541992, "learning_rate": 6.030150753768843e-08, "loss": 1.1786, "num_input_tokens_seen": 25165824, "step": 12 }, { "epoch": 0.0032778618255168935, "grad_norm": 21.161056518554688, "learning_rate": 6.532663316582915e-08, "loss": 0.7842, "num_input_tokens_seen": 27262976, "step": 13 }, { "epoch": 0.0035300050428643467, "grad_norm": 25.495088577270508, "learning_rate": 7.035175879396984e-08, "loss": 1.9987, "num_input_tokens_seen": 29360128, "step": 14 }, { "epoch": 0.0037821482602118004, "grad_norm": 24.420948028564453, "learning_rate": 7.537688442211055e-08, "loss": 1.1424, "num_input_tokens_seen": 31457280, "step": 15 }, { "epoch": 0.004034291477559254, "grad_norm": 19.922271728515625, "learning_rate": 8.040201005025125e-08, "loss": 1.1716, "num_input_tokens_seen": 33554432, "step": 16 }, { "epoch": 0.004286434694906707, "grad_norm": 25.040063858032227, "learning_rate": 8.542713567839196e-08, "loss": 0.8189, "num_input_tokens_seen": 35651584, "step": 17 }, { "epoch": 0.0045385779122541605, "grad_norm": 27.888629913330078, "learning_rate": 9.045226130653266e-08, "loss": 1.1743, "num_input_tokens_seen": 37748736, "step": 18 }, { "epoch": 0.004790721129601614, "grad_norm": 21.901092529296875, "learning_rate": 9.547738693467335e-08, "loss": 0.7951, "num_input_tokens_seen": 39845888, "step": 19 }, { "epoch": 0.005042864346949067, "grad_norm": 21.351625442504883, "learning_rate": 1.0050251256281407e-07, "loss": 1.2271, "num_input_tokens_seen": 41943040, "step": 20 }, { "epoch": 0.00529500756429652, "grad_norm": 21.482006072998047, "learning_rate": 1.0552763819095476e-07, "loss": 1.1908, "num_input_tokens_seen": 44040192, "step": 21 }, { "epoch": 0.005547150781643974, "grad_norm": 21.15386390686035, "learning_rate": 1.1055276381909548e-07, "loss": 1.2297, "num_input_tokens_seen": 46137344, "step": 22 }, { "epoch": 0.005799293998991427, "grad_norm": 23.156387329101562, "learning_rate": 1.1557788944723617e-07, "loss": 1.1766, "num_input_tokens_seen": 48234496, "step": 23 }, { "epoch": 0.006051437216338881, "grad_norm": 38.258697509765625, "learning_rate": 1.2060301507537687e-07, "loss": 1.4932, "num_input_tokens_seen": 50331648, "step": 24 }, { "epoch": 0.006303580433686334, "grad_norm": 20.798620223999023, "learning_rate": 1.2562814070351758e-07, "loss": 1.2943, "num_input_tokens_seen": 52428800, "step": 25 }, { "epoch": 0.006555723651033787, "grad_norm": 24.994922637939453, "learning_rate": 1.306532663316583e-07, "loss": 1.0768, "num_input_tokens_seen": 54525952, "step": 26 }, { "epoch": 0.00680786686838124, "grad_norm": 33.116146087646484, "learning_rate": 1.35678391959799e-07, "loss": 1.1369, "num_input_tokens_seen": 56623104, "step": 27 }, { "epoch": 0.0070600100857286935, "grad_norm": 39.03334426879883, "learning_rate": 1.4070351758793969e-07, "loss": 1.5513, "num_input_tokens_seen": 58720256, "step": 28 }, { "epoch": 0.007312153303076148, "grad_norm": 25.035110473632812, "learning_rate": 1.4572864321608038e-07, "loss": 1.2028, "num_input_tokens_seen": 60817408, "step": 29 }, { "epoch": 0.007564296520423601, "grad_norm": 21.068431854248047, "learning_rate": 1.507537688442211e-07, "loss": 1.1555, "num_input_tokens_seen": 62914560, "step": 30 }, { "epoch": 0.007816439737771054, "grad_norm": 35.82476043701172, "learning_rate": 1.5577889447236181e-07, "loss": 1.1723, "num_input_tokens_seen": 65011712, "step": 31 }, { "epoch": 0.008068582955118508, "grad_norm": 27.961219787597656, "learning_rate": 1.608040201005025e-07, "loss": 0.7226, "num_input_tokens_seen": 67108864, "step": 32 }, { "epoch": 0.00832072617246596, "grad_norm": 21.109777450561523, "learning_rate": 1.658291457286432e-07, "loss": 1.0722, "num_input_tokens_seen": 69206016, "step": 33 }, { "epoch": 0.008572869389813415, "grad_norm": 43.04289627075195, "learning_rate": 1.7085427135678392e-07, "loss": 1.1128, "num_input_tokens_seen": 71303168, "step": 34 }, { "epoch": 0.008825012607160867, "grad_norm": 26.515880584716797, "learning_rate": 1.7587939698492463e-07, "loss": 1.1254, "num_input_tokens_seen": 73400320, "step": 35 }, { "epoch": 0.009077155824508321, "grad_norm": 21.351062774658203, "learning_rate": 1.8090452261306533e-07, "loss": 0.7675, "num_input_tokens_seen": 75497472, "step": 36 }, { "epoch": 0.009329299041855773, "grad_norm": 23.136459350585938, "learning_rate": 1.8592964824120602e-07, "loss": 1.1374, "num_input_tokens_seen": 77594624, "step": 37 }, { "epoch": 0.009581442259203227, "grad_norm": 17.877473831176758, "learning_rate": 1.909547738693467e-07, "loss": 1.1101, "num_input_tokens_seen": 79691776, "step": 38 }, { "epoch": 0.009833585476550681, "grad_norm": 33.78788375854492, "learning_rate": 1.9597989949748743e-07, "loss": 1.0273, "num_input_tokens_seen": 81788928, "step": 39 }, { "epoch": 0.010085728693898134, "grad_norm": 32.83673858642578, "learning_rate": 2.0100502512562815e-07, "loss": 1.1025, "num_input_tokens_seen": 83886080, "step": 40 }, { "epoch": 0.010337871911245588, "grad_norm": 26.676027297973633, "learning_rate": 2.0603015075376884e-07, "loss": 1.8515, "num_input_tokens_seen": 85983232, "step": 41 }, { "epoch": 0.01059001512859304, "grad_norm": 26.88898468017578, "learning_rate": 2.1105527638190953e-07, "loss": 1.3322, "num_input_tokens_seen": 88080384, "step": 42 }, { "epoch": 0.010842158345940494, "grad_norm": 24.28297233581543, "learning_rate": 2.1608040201005022e-07, "loss": 0.9043, "num_input_tokens_seen": 90177536, "step": 43 }, { "epoch": 0.011094301563287948, "grad_norm": 15.659173011779785, "learning_rate": 2.2110552763819096e-07, "loss": 0.9169, "num_input_tokens_seen": 92274688, "step": 44 }, { "epoch": 0.0113464447806354, "grad_norm": 15.930516242980957, "learning_rate": 2.2613065326633166e-07, "loss": 0.9613, "num_input_tokens_seen": 94371840, "step": 45 }, { "epoch": 0.011598587997982855, "grad_norm": 14.883039474487305, "learning_rate": 2.3115577889447235e-07, "loss": 0.882, "num_input_tokens_seen": 96468992, "step": 46 }, { "epoch": 0.011850731215330307, "grad_norm": 25.84305191040039, "learning_rate": 2.3618090452261304e-07, "loss": 1.1471, "num_input_tokens_seen": 98566144, "step": 47 }, { "epoch": 0.012102874432677761, "grad_norm": 21.669544219970703, "learning_rate": 2.4120603015075373e-07, "loss": 0.9125, "num_input_tokens_seen": 100663296, "step": 48 }, { "epoch": 0.012355017650025214, "grad_norm": 15.483664512634277, "learning_rate": 2.4623115577889445e-07, "loss": 0.8492, "num_input_tokens_seen": 102760448, "step": 49 }, { "epoch": 0.012607160867372668, "grad_norm": 18.560636520385742, "learning_rate": 2.5125628140703517e-07, "loss": 0.9035, "num_input_tokens_seen": 104857600, "step": 50 }, { "epoch": 0.012859304084720122, "grad_norm": 14.719083786010742, "learning_rate": 2.562814070351759e-07, "loss": 0.8161, "num_input_tokens_seen": 106954752, "step": 51 }, { "epoch": 0.013111447302067574, "grad_norm": 21.655672073364258, "learning_rate": 2.613065326633166e-07, "loss": 0.572, "num_input_tokens_seen": 109051904, "step": 52 }, { "epoch": 0.013363590519415028, "grad_norm": 11.465034484863281, "learning_rate": 2.6633165829145727e-07, "loss": 0.807, "num_input_tokens_seen": 111149056, "step": 53 }, { "epoch": 0.01361573373676248, "grad_norm": 17.689987182617188, "learning_rate": 2.71356783919598e-07, "loss": 1.4423, "num_input_tokens_seen": 113246208, "step": 54 }, { "epoch": 0.013867876954109935, "grad_norm": 14.684429168701172, "learning_rate": 2.7638190954773865e-07, "loss": 0.8659, "num_input_tokens_seen": 115343360, "step": 55 }, { "epoch": 0.014120020171457387, "grad_norm": 12.435643196105957, "learning_rate": 2.8140703517587937e-07, "loss": 0.7607, "num_input_tokens_seen": 117440512, "step": 56 }, { "epoch": 0.014372163388804841, "grad_norm": 17.700153350830078, "learning_rate": 2.864321608040201e-07, "loss": 0.8607, "num_input_tokens_seen": 119537664, "step": 57 }, { "epoch": 0.014624306606152295, "grad_norm": 13.79918384552002, "learning_rate": 2.9145728643216075e-07, "loss": 0.7589, "num_input_tokens_seen": 121634816, "step": 58 }, { "epoch": 0.014876449823499747, "grad_norm": 15.207538604736328, "learning_rate": 2.964824120603015e-07, "loss": 0.4787, "num_input_tokens_seen": 123731968, "step": 59 }, { "epoch": 0.015128593040847202, "grad_norm": 10.523366928100586, "learning_rate": 3.015075376884422e-07, "loss": 0.6908, "num_input_tokens_seen": 125829120, "step": 60 }, { "epoch": 0.015380736258194654, "grad_norm": 8.412284851074219, "learning_rate": 3.065326633165829e-07, "loss": 0.6561, "num_input_tokens_seen": 127926272, "step": 61 }, { "epoch": 0.015632879475542108, "grad_norm": 9.98276138305664, "learning_rate": 3.1155778894472363e-07, "loss": 0.7216, "num_input_tokens_seen": 130023424, "step": 62 }, { "epoch": 0.01588502269288956, "grad_norm": 11.017064094543457, "learning_rate": 3.165829145728643e-07, "loss": 0.6223, "num_input_tokens_seen": 132120576, "step": 63 }, { "epoch": 0.016137165910237016, "grad_norm": 15.129839897155762, "learning_rate": 3.21608040201005e-07, "loss": 1.0373, "num_input_tokens_seen": 134217728, "step": 64 }, { "epoch": 0.01638930912758447, "grad_norm": 8.578692436218262, "learning_rate": 3.2663316582914573e-07, "loss": 0.5687, "num_input_tokens_seen": 136314880, "step": 65 }, { "epoch": 0.01664145234493192, "grad_norm": 13.31927490234375, "learning_rate": 3.316582914572864e-07, "loss": 1.0766, "num_input_tokens_seen": 138412032, "step": 66 }, { "epoch": 0.016893595562279373, "grad_norm": 8.775867462158203, "learning_rate": 3.366834170854271e-07, "loss": 0.5324, "num_input_tokens_seen": 140509184, "step": 67 }, { "epoch": 0.01714573877962683, "grad_norm": 12.085953712463379, "learning_rate": 3.4170854271356783e-07, "loss": 0.8601, "num_input_tokens_seen": 142606336, "step": 68 }, { "epoch": 0.01739788199697428, "grad_norm": 12.76360034942627, "learning_rate": 3.4673366834170855e-07, "loss": 0.5595, "num_input_tokens_seen": 144703488, "step": 69 }, { "epoch": 0.017650025214321734, "grad_norm": 10.255838394165039, "learning_rate": 3.5175879396984927e-07, "loss": 0.3496, "num_input_tokens_seen": 146800640, "step": 70 }, { "epoch": 0.01790216843166919, "grad_norm": 9.94809341430664, "learning_rate": 3.5678391959798993e-07, "loss": 0.5976, "num_input_tokens_seen": 148897792, "step": 71 }, { "epoch": 0.018154311649016642, "grad_norm": 7.37994384765625, "learning_rate": 3.6180904522613065e-07, "loss": 0.5241, "num_input_tokens_seen": 150994944, "step": 72 }, { "epoch": 0.018406454866364094, "grad_norm": 8.874433517456055, "learning_rate": 3.668341708542713e-07, "loss": 0.5629, "num_input_tokens_seen": 153092096, "step": 73 }, { "epoch": 0.018658598083711547, "grad_norm": 16.685457229614258, "learning_rate": 3.7185929648241203e-07, "loss": 0.3801, "num_input_tokens_seen": 155189248, "step": 74 }, { "epoch": 0.018910741301059002, "grad_norm": 11.288415908813477, "learning_rate": 3.7688442211055275e-07, "loss": 0.6093, "num_input_tokens_seen": 157286400, "step": 75 }, { "epoch": 0.019162884518406455, "grad_norm": 10.51889419555664, "learning_rate": 3.819095477386934e-07, "loss": 0.5053, "num_input_tokens_seen": 159383552, "step": 76 }, { "epoch": 0.019415027735753907, "grad_norm": 10.236724853515625, "learning_rate": 3.869346733668342e-07, "loss": 0.7537, "num_input_tokens_seen": 161480704, "step": 77 }, { "epoch": 0.019667170953101363, "grad_norm": 9.370979309082031, "learning_rate": 3.9195979899497485e-07, "loss": 0.5814, "num_input_tokens_seen": 163577856, "step": 78 }, { "epoch": 0.019919314170448815, "grad_norm": 12.056835174560547, "learning_rate": 3.9698492462311557e-07, "loss": 0.5178, "num_input_tokens_seen": 165675008, "step": 79 }, { "epoch": 0.020171457387796268, "grad_norm": 8.761493682861328, "learning_rate": 4.020100502512563e-07, "loss": 0.4851, "num_input_tokens_seen": 167772160, "step": 80 }, { "epoch": 0.02042360060514372, "grad_norm": 9.159887313842773, "learning_rate": 4.0703517587939696e-07, "loss": 0.4531, "num_input_tokens_seen": 169869312, "step": 81 }, { "epoch": 0.020675743822491176, "grad_norm": 9.923644065856934, "learning_rate": 4.120603015075377e-07, "loss": 0.5835, "num_input_tokens_seen": 171966464, "step": 82 }, { "epoch": 0.020927887039838628, "grad_norm": 8.762866973876953, "learning_rate": 4.1708542713567834e-07, "loss": 0.4772, "num_input_tokens_seen": 174063616, "step": 83 }, { "epoch": 0.02118003025718608, "grad_norm": 10.09272289276123, "learning_rate": 4.2211055276381906e-07, "loss": 0.7305, "num_input_tokens_seen": 176160768, "step": 84 }, { "epoch": 0.021432173474533536, "grad_norm": 8.009614944458008, "learning_rate": 4.271356783919598e-07, "loss": 0.4629, "num_input_tokens_seen": 178257920, "step": 85 }, { "epoch": 0.02168431669188099, "grad_norm": 8.284019470214844, "learning_rate": 4.3216080402010044e-07, "loss": 0.4368, "num_input_tokens_seen": 180355072, "step": 86 }, { "epoch": 0.02193645990922844, "grad_norm": 6.427061557769775, "learning_rate": 4.371859296482412e-07, "loss": 0.43, "num_input_tokens_seen": 182452224, "step": 87 }, { "epoch": 0.022188603126575897, "grad_norm": 12.255255699157715, "learning_rate": 4.4221105527638193e-07, "loss": 0.5879, "num_input_tokens_seen": 184549376, "step": 88 }, { "epoch": 0.02244074634392335, "grad_norm": 6.626727104187012, "learning_rate": 4.472361809045226e-07, "loss": 0.3916, "num_input_tokens_seen": 186646528, "step": 89 }, { "epoch": 0.0226928895612708, "grad_norm": 8.53348445892334, "learning_rate": 4.522613065326633e-07, "loss": 0.4768, "num_input_tokens_seen": 188743680, "step": 90 }, { "epoch": 0.022945032778618254, "grad_norm": 6.995331287384033, "learning_rate": 4.57286432160804e-07, "loss": 0.3988, "num_input_tokens_seen": 190840832, "step": 91 }, { "epoch": 0.02319717599596571, "grad_norm": 8.352548599243164, "learning_rate": 4.623115577889447e-07, "loss": 0.3706, "num_input_tokens_seen": 192937984, "step": 92 }, { "epoch": 0.023449319213313162, "grad_norm": 6.609560489654541, "learning_rate": 4.673366834170854e-07, "loss": 0.2459, "num_input_tokens_seen": 195035136, "step": 93 }, { "epoch": 0.023701462430660614, "grad_norm": 9.539324760437012, "learning_rate": 4.723618090452261e-07, "loss": 0.3865, "num_input_tokens_seen": 197132288, "step": 94 }, { "epoch": 0.02395360564800807, "grad_norm": 9.831944465637207, "learning_rate": 4.773869346733669e-07, "loss": 0.4022, "num_input_tokens_seen": 199229440, "step": 95 }, { "epoch": 0.024205748865355523, "grad_norm": 9.292588233947754, "learning_rate": 4.824120603015075e-07, "loss": 0.3543, "num_input_tokens_seen": 201326592, "step": 96 }, { "epoch": 0.024457892082702975, "grad_norm": 9.192462921142578, "learning_rate": 4.874371859296482e-07, "loss": 0.4336, "num_input_tokens_seen": 203423744, "step": 97 }, { "epoch": 0.024710035300050427, "grad_norm": 8.302521705627441, "learning_rate": 4.924623115577889e-07, "loss": 0.534, "num_input_tokens_seen": 205520896, "step": 98 }, { "epoch": 0.024962178517397883, "grad_norm": 9.702790260314941, "learning_rate": 4.974874371859296e-07, "loss": 0.5899, "num_input_tokens_seen": 207618048, "step": 99 }, { "epoch": 0.025214321734745335, "grad_norm": 7.346845626831055, "learning_rate": 5.025125628140703e-07, "loss": 0.3439, "num_input_tokens_seen": 209715200, "step": 100 }, { "epoch": 0.025466464952092788, "grad_norm": 6.6140265464782715, "learning_rate": 5.075376884422111e-07, "loss": 0.3779, "num_input_tokens_seen": 211812352, "step": 101 }, { "epoch": 0.025718608169440244, "grad_norm": 6.8121209144592285, "learning_rate": 5.125628140703518e-07, "loss": 0.403, "num_input_tokens_seen": 213909504, "step": 102 }, { "epoch": 0.025970751386787696, "grad_norm": 6.07421875, "learning_rate": 5.175879396984925e-07, "loss": 0.3473, "num_input_tokens_seen": 216006656, "step": 103 }, { "epoch": 0.026222894604135148, "grad_norm": 6.86598539352417, "learning_rate": 5.226130653266332e-07, "loss": 0.3054, "num_input_tokens_seen": 218103808, "step": 104 }, { "epoch": 0.0264750378214826, "grad_norm": 7.970452308654785, "learning_rate": 5.276381909547738e-07, "loss": 0.3693, "num_input_tokens_seen": 220200960, "step": 105 }, { "epoch": 0.026727181038830056, "grad_norm": 7.2236552238464355, "learning_rate": 5.326633165829145e-07, "loss": 0.2194, "num_input_tokens_seen": 222298112, "step": 106 }, { "epoch": 0.02697932425617751, "grad_norm": 5.257369518280029, "learning_rate": 5.376884422110553e-07, "loss": 0.2962, "num_input_tokens_seen": 224395264, "step": 107 }, { "epoch": 0.02723146747352496, "grad_norm": 6.920422077178955, "learning_rate": 5.42713567839196e-07, "loss": 0.3699, "num_input_tokens_seen": 226492416, "step": 108 }, { "epoch": 0.027483610690872417, "grad_norm": 9.312458992004395, "learning_rate": 5.477386934673367e-07, "loss": 0.3812, "num_input_tokens_seen": 228589568, "step": 109 }, { "epoch": 0.02773575390821987, "grad_norm": 9.935240745544434, "learning_rate": 5.527638190954773e-07, "loss": 0.4443, "num_input_tokens_seen": 230686720, "step": 110 }, { "epoch": 0.02798789712556732, "grad_norm": 5.373161315917969, "learning_rate": 5.57788944723618e-07, "loss": 0.264, "num_input_tokens_seen": 232783872, "step": 111 }, { "epoch": 0.028240040342914774, "grad_norm": 6.769862651824951, "learning_rate": 5.628140703517587e-07, "loss": 0.1686, "num_input_tokens_seen": 234881024, "step": 112 }, { "epoch": 0.02849218356026223, "grad_norm": 5.726578712463379, "learning_rate": 5.678391959798995e-07, "loss": 0.3396, "num_input_tokens_seen": 236978176, "step": 113 }, { "epoch": 0.028744326777609682, "grad_norm": 5.439636707305908, "learning_rate": 5.728643216080402e-07, "loss": 0.2733, "num_input_tokens_seen": 239075328, "step": 114 }, { "epoch": 0.028996469994957134, "grad_norm": 5.622605323791504, "learning_rate": 5.778894472361808e-07, "loss": 0.2998, "num_input_tokens_seen": 241172480, "step": 115 }, { "epoch": 0.02924861321230459, "grad_norm": 6.728963851928711, "learning_rate": 5.829145728643215e-07, "loss": 0.2549, "num_input_tokens_seen": 243269632, "step": 116 }, { "epoch": 0.029500756429652043, "grad_norm": 5.0983781814575195, "learning_rate": 5.879396984924622e-07, "loss": 0.2705, "num_input_tokens_seen": 245366784, "step": 117 }, { "epoch": 0.029752899646999495, "grad_norm": 7.3646721839904785, "learning_rate": 5.92964824120603e-07, "loss": 0.3242, "num_input_tokens_seen": 247463936, "step": 118 }, { "epoch": 0.03000504286434695, "grad_norm": 7.918598651885986, "learning_rate": 5.979899497487438e-07, "loss": 0.371, "num_input_tokens_seen": 249561088, "step": 119 }, { "epoch": 0.030257186081694403, "grad_norm": 7.411210536956787, "learning_rate": 6.030150753768844e-07, "loss": 0.2728, "num_input_tokens_seen": 251658240, "step": 120 }, { "epoch": 0.030509329299041855, "grad_norm": 5.8603129386901855, "learning_rate": 6.080402010050251e-07, "loss": 0.1854, "num_input_tokens_seen": 253755392, "step": 121 }, { "epoch": 0.030761472516389308, "grad_norm": 5.476680278778076, "learning_rate": 6.130653266331658e-07, "loss": 0.1831, "num_input_tokens_seen": 255852544, "step": 122 }, { "epoch": 0.031013615733736764, "grad_norm": 6.4667158126831055, "learning_rate": 6.180904522613065e-07, "loss": 0.1721, "num_input_tokens_seen": 257949696, "step": 123 }, { "epoch": 0.031265758951084216, "grad_norm": 5.928079605102539, "learning_rate": 6.231155778894473e-07, "loss": 0.2728, "num_input_tokens_seen": 260046848, "step": 124 }, { "epoch": 0.03151790216843167, "grad_norm": 7.0044755935668945, "learning_rate": 6.28140703517588e-07, "loss": 0.4037, "num_input_tokens_seen": 262144000, "step": 125 }, { "epoch": 0.03177004538577912, "grad_norm": 8.558830261230469, "learning_rate": 6.331658291457286e-07, "loss": 0.5263, "num_input_tokens_seen": 264241152, "step": 126 }, { "epoch": 0.032022188603126577, "grad_norm": 5.0764055252075195, "learning_rate": 6.381909547738693e-07, "loss": 0.2054, "num_input_tokens_seen": 266338304, "step": 127 }, { "epoch": 0.03227433182047403, "grad_norm": 5.459807872772217, "learning_rate": 6.4321608040201e-07, "loss": 0.2122, "num_input_tokens_seen": 268435456, "step": 128 }, { "epoch": 0.03252647503782148, "grad_norm": 5.658675670623779, "learning_rate": 6.482412060301507e-07, "loss": 0.2226, "num_input_tokens_seen": 270532608, "step": 129 }, { "epoch": 0.03277861825516894, "grad_norm": 5.613616466522217, "learning_rate": 6.532663316582915e-07, "loss": 0.2701, "num_input_tokens_seen": 272629760, "step": 130 }, { "epoch": 0.033030761472516386, "grad_norm": 9.082258224487305, "learning_rate": 6.582914572864321e-07, "loss": 0.3726, "num_input_tokens_seen": 274726912, "step": 131 }, { "epoch": 0.03328290468986384, "grad_norm": 4.047947406768799, "learning_rate": 6.633165829145728e-07, "loss": 0.1323, "num_input_tokens_seen": 276824064, "step": 132 }, { "epoch": 0.0335350479072113, "grad_norm": 5.141188144683838, "learning_rate": 6.683417085427135e-07, "loss": 0.2615, "num_input_tokens_seen": 278921216, "step": 133 }, { "epoch": 0.033787191124558746, "grad_norm": 4.637810707092285, "learning_rate": 6.733668341708542e-07, "loss": 0.2252, "num_input_tokens_seen": 281018368, "step": 134 }, { "epoch": 0.0340393343419062, "grad_norm": 5.142843723297119, "learning_rate": 6.783919597989949e-07, "loss": 0.1817, "num_input_tokens_seen": 283115520, "step": 135 }, { "epoch": 0.03429147755925366, "grad_norm": 7.557190418243408, "learning_rate": 6.834170854271357e-07, "loss": 0.2897, "num_input_tokens_seen": 285212672, "step": 136 }, { "epoch": 0.03454362077660111, "grad_norm": 6.585993766784668, "learning_rate": 6.884422110552764e-07, "loss": 0.227, "num_input_tokens_seen": 287309824, "step": 137 }, { "epoch": 0.03479576399394856, "grad_norm": 4.926968574523926, "learning_rate": 6.934673366834171e-07, "loss": 0.1573, "num_input_tokens_seen": 289406976, "step": 138 }, { "epoch": 0.03504790721129602, "grad_norm": 6.03431510925293, "learning_rate": 6.984924623115578e-07, "loss": 0.2187, "num_input_tokens_seen": 291504128, "step": 139 }, { "epoch": 0.03530005042864347, "grad_norm": 9.677518844604492, "learning_rate": 7.035175879396985e-07, "loss": 0.2295, "num_input_tokens_seen": 293601280, "step": 140 }, { "epoch": 0.03555219364599092, "grad_norm": 6.820138931274414, "learning_rate": 7.085427135678391e-07, "loss": 0.1944, "num_input_tokens_seen": 295698432, "step": 141 }, { "epoch": 0.03580433686333838, "grad_norm": 5.568108081817627, "learning_rate": 7.135678391959799e-07, "loss": 0.3113, "num_input_tokens_seen": 297795584, "step": 142 }, { "epoch": 0.03605648008068583, "grad_norm": 6.417880058288574, "learning_rate": 7.185929648241206e-07, "loss": 0.2932, "num_input_tokens_seen": 299892736, "step": 143 }, { "epoch": 0.036308623298033284, "grad_norm": 5.040261745452881, "learning_rate": 7.236180904522613e-07, "loss": 0.2076, "num_input_tokens_seen": 301989888, "step": 144 }, { "epoch": 0.03656076651538074, "grad_norm": 6.350996494293213, "learning_rate": 7.28643216080402e-07, "loss": 0.1714, "num_input_tokens_seen": 304087040, "step": 145 }, { "epoch": 0.03681290973272819, "grad_norm": 5.744927406311035, "learning_rate": 7.336683417085426e-07, "loss": 0.1948, "num_input_tokens_seen": 306184192, "step": 146 }, { "epoch": 0.037065052950075644, "grad_norm": 5.379306793212891, "learning_rate": 7.386934673366834e-07, "loss": 0.1971, "num_input_tokens_seen": 308281344, "step": 147 }, { "epoch": 0.03731719616742309, "grad_norm": 4.08986234664917, "learning_rate": 7.437185929648241e-07, "loss": 0.1319, "num_input_tokens_seen": 310378496, "step": 148 }, { "epoch": 0.03756933938477055, "grad_norm": 8.005187034606934, "learning_rate": 7.487437185929648e-07, "loss": 0.3227, "num_input_tokens_seen": 312475648, "step": 149 }, { "epoch": 0.037821482602118005, "grad_norm": 6.485504627227783, "learning_rate": 7.537688442211055e-07, "loss": 0.4005, "num_input_tokens_seen": 314572800, "step": 150 }, { "epoch": 0.038073625819465454, "grad_norm": 7.763909339904785, "learning_rate": 7.587939698492461e-07, "loss": 0.3537, "num_input_tokens_seen": 316669952, "step": 151 }, { "epoch": 0.03832576903681291, "grad_norm": 5.093461036682129, "learning_rate": 7.638190954773868e-07, "loss": 0.1321, "num_input_tokens_seen": 318767104, "step": 152 }, { "epoch": 0.038577912254160365, "grad_norm": 4.274379730224609, "learning_rate": 7.688442211055276e-07, "loss": 0.1623, "num_input_tokens_seen": 320864256, "step": 153 }, { "epoch": 0.038830055471507814, "grad_norm": 5.359605312347412, "learning_rate": 7.738693467336684e-07, "loss": 0.2337, "num_input_tokens_seen": 322961408, "step": 154 }, { "epoch": 0.03908219868885527, "grad_norm": 5.039738655090332, "learning_rate": 7.788944723618091e-07, "loss": 0.2028, "num_input_tokens_seen": 325058560, "step": 155 }, { "epoch": 0.039334341906202726, "grad_norm": 5.888302326202393, "learning_rate": 7.839195979899497e-07, "loss": 0.1418, "num_input_tokens_seen": 327155712, "step": 156 }, { "epoch": 0.039586485123550175, "grad_norm": 5.222049236297607, "learning_rate": 7.889447236180904e-07, "loss": 0.1474, "num_input_tokens_seen": 329252864, "step": 157 }, { "epoch": 0.03983862834089763, "grad_norm": 5.662126064300537, "learning_rate": 7.939698492462311e-07, "loss": 0.2008, "num_input_tokens_seen": 331350016, "step": 158 }, { "epoch": 0.040090771558245086, "grad_norm": 4.854446887969971, "learning_rate": 7.989949748743719e-07, "loss": 0.1373, "num_input_tokens_seen": 333447168, "step": 159 }, { "epoch": 0.040342914775592535, "grad_norm": 5.8150177001953125, "learning_rate": 8.040201005025126e-07, "loss": 0.2512, "num_input_tokens_seen": 335544320, "step": 160 }, { "epoch": 0.04059505799293999, "grad_norm": 5.4808526039123535, "learning_rate": 8.090452261306532e-07, "loss": 0.1379, "num_input_tokens_seen": 337641472, "step": 161 }, { "epoch": 0.04084720121028744, "grad_norm": 5.683319091796875, "learning_rate": 8.140703517587939e-07, "loss": 0.2061, "num_input_tokens_seen": 339738624, "step": 162 }, { "epoch": 0.041099344427634896, "grad_norm": 5.919990062713623, "learning_rate": 8.190954773869346e-07, "loss": 0.2115, "num_input_tokens_seen": 341835776, "step": 163 }, { "epoch": 0.04135148764498235, "grad_norm": 4.193869113922119, "learning_rate": 8.241206030150753e-07, "loss": 0.1766, "num_input_tokens_seen": 343932928, "step": 164 }, { "epoch": 0.0416036308623298, "grad_norm": 4.4601945877075195, "learning_rate": 8.291457286432161e-07, "loss": 0.1939, "num_input_tokens_seen": 346030080, "step": 165 }, { "epoch": 0.041855774079677256, "grad_norm": 5.21290922164917, "learning_rate": 8.341708542713567e-07, "loss": 0.1787, "num_input_tokens_seen": 348127232, "step": 166 }, { "epoch": 0.04210791729702471, "grad_norm": 5.489988327026367, "learning_rate": 8.391959798994974e-07, "loss": 0.1809, "num_input_tokens_seen": 350224384, "step": 167 }, { "epoch": 0.04236006051437216, "grad_norm": 4.026052474975586, "learning_rate": 8.442211055276381e-07, "loss": 0.1248, "num_input_tokens_seen": 352321536, "step": 168 }, { "epoch": 0.04261220373171962, "grad_norm": 4.203098297119141, "learning_rate": 8.492462311557788e-07, "loss": 0.1089, "num_input_tokens_seen": 354418688, "step": 169 }, { "epoch": 0.04286434694906707, "grad_norm": 6.0608296394348145, "learning_rate": 8.542713567839196e-07, "loss": 0.185, "num_input_tokens_seen": 356515840, "step": 170 }, { "epoch": 0.04311649016641452, "grad_norm": 5.297198295593262, "learning_rate": 8.592964824120602e-07, "loss": 0.119, "num_input_tokens_seen": 358612992, "step": 171 }, { "epoch": 0.04336863338376198, "grad_norm": 4.82717227935791, "learning_rate": 8.643216080402009e-07, "loss": 0.1275, "num_input_tokens_seen": 360710144, "step": 172 }, { "epoch": 0.04362077660110943, "grad_norm": 7.091985702514648, "learning_rate": 8.693467336683417e-07, "loss": 0.3237, "num_input_tokens_seen": 362807296, "step": 173 }, { "epoch": 0.04387291981845688, "grad_norm": 4.359028339385986, "learning_rate": 8.743718592964824e-07, "loss": 0.1272, "num_input_tokens_seen": 364904448, "step": 174 }, { "epoch": 0.04412506303580434, "grad_norm": 4.864053726196289, "learning_rate": 8.793969849246231e-07, "loss": 0.2115, "num_input_tokens_seen": 367001600, "step": 175 }, { "epoch": 0.044377206253151794, "grad_norm": 4.585638523101807, "learning_rate": 8.844221105527639e-07, "loss": 0.1753, "num_input_tokens_seen": 369098752, "step": 176 }, { "epoch": 0.04462934947049924, "grad_norm": 6.2548933029174805, "learning_rate": 8.894472361809045e-07, "loss": 0.2436, "num_input_tokens_seen": 371195904, "step": 177 }, { "epoch": 0.0448814926878467, "grad_norm": 4.619575023651123, "learning_rate": 8.944723618090452e-07, "loss": 0.2271, "num_input_tokens_seen": 373293056, "step": 178 }, { "epoch": 0.04513363590519415, "grad_norm": 4.505560398101807, "learning_rate": 8.994974874371859e-07, "loss": 0.1728, "num_input_tokens_seen": 375390208, "step": 179 }, { "epoch": 0.0453857791225416, "grad_norm": 4.657378196716309, "learning_rate": 9.045226130653266e-07, "loss": 0.2134, "num_input_tokens_seen": 377487360, "step": 180 }, { "epoch": 0.04563792233988906, "grad_norm": 3.5373897552490234, "learning_rate": 9.095477386934673e-07, "loss": 0.125, "num_input_tokens_seen": 379584512, "step": 181 }, { "epoch": 0.04589006555723651, "grad_norm": 4.476269721984863, "learning_rate": 9.14572864321608e-07, "loss": 0.1805, "num_input_tokens_seen": 381681664, "step": 182 }, { "epoch": 0.046142208774583963, "grad_norm": 4.5421881675720215, "learning_rate": 9.195979899497487e-07, "loss": 0.1296, "num_input_tokens_seen": 383778816, "step": 183 }, { "epoch": 0.04639435199193142, "grad_norm": 4.141582012176514, "learning_rate": 9.246231155778894e-07, "loss": 0.194, "num_input_tokens_seen": 385875968, "step": 184 }, { "epoch": 0.04664649520927887, "grad_norm": 6.524399757385254, "learning_rate": 9.296482412060301e-07, "loss": 0.1595, "num_input_tokens_seen": 387973120, "step": 185 }, { "epoch": 0.046898638426626324, "grad_norm": 4.473093509674072, "learning_rate": 9.346733668341708e-07, "loss": 0.1909, "num_input_tokens_seen": 390070272, "step": 186 }, { "epoch": 0.04715078164397378, "grad_norm": 5.006099224090576, "learning_rate": 9.396984924623114e-07, "loss": 0.215, "num_input_tokens_seen": 392167424, "step": 187 }, { "epoch": 0.04740292486132123, "grad_norm": 4.727731227874756, "learning_rate": 9.447236180904522e-07, "loss": 0.1874, "num_input_tokens_seen": 394264576, "step": 188 }, { "epoch": 0.047655068078668684, "grad_norm": 4.6576828956604, "learning_rate": 9.497487437185929e-07, "loss": 0.1889, "num_input_tokens_seen": 396361728, "step": 189 }, { "epoch": 0.04790721129601614, "grad_norm": 4.223318099975586, "learning_rate": 9.547738693467337e-07, "loss": 0.1432, "num_input_tokens_seen": 398458880, "step": 190 }, { "epoch": 0.04815935451336359, "grad_norm": 3.288745641708374, "learning_rate": 9.597989949748744e-07, "loss": 0.1361, "num_input_tokens_seen": 400556032, "step": 191 }, { "epoch": 0.048411497730711045, "grad_norm": 4.024937629699707, "learning_rate": 9.64824120603015e-07, "loss": 0.1285, "num_input_tokens_seen": 402653184, "step": 192 }, { "epoch": 0.048663640948058494, "grad_norm": 4.060795783996582, "learning_rate": 9.698492462311556e-07, "loss": 0.1472, "num_input_tokens_seen": 404750336, "step": 193 }, { "epoch": 0.04891578416540595, "grad_norm": 5.01156759262085, "learning_rate": 9.748743718592964e-07, "loss": 0.2541, "num_input_tokens_seen": 406847488, "step": 194 }, { "epoch": 0.049167927382753406, "grad_norm": 3.8259568214416504, "learning_rate": 9.79899497487437e-07, "loss": 0.176, "num_input_tokens_seen": 408944640, "step": 195 }, { "epoch": 0.049420070600100854, "grad_norm": 4.526422500610352, "learning_rate": 9.849246231155778e-07, "loss": 0.2161, "num_input_tokens_seen": 411041792, "step": 196 }, { "epoch": 0.04967221381744831, "grad_norm": 4.0646867752075195, "learning_rate": 9.899497487437185e-07, "loss": 0.1361, "num_input_tokens_seen": 413138944, "step": 197 }, { "epoch": 0.049924357034795766, "grad_norm": 4.822361946105957, "learning_rate": 9.949748743718592e-07, "loss": 0.1678, "num_input_tokens_seen": 415236096, "step": 198 }, { "epoch": 0.050176500252143215, "grad_norm": 5.335970878601074, "learning_rate": 1e-06, "loss": 0.138, "num_input_tokens_seen": 417333248, "step": 199 }, { "epoch": 0.05042864346949067, "grad_norm": 4.283322811126709, "learning_rate": 9.999998435084117e-07, "loss": 0.1599, "num_input_tokens_seen": 419430400, "step": 200 }, { "epoch": 0.05068078668683813, "grad_norm": 3.6955955028533936, "learning_rate": 9.999993740337564e-07, "loss": 0.1203, "num_input_tokens_seen": 421527552, "step": 201 }, { "epoch": 0.050932929904185575, "grad_norm": 4.380987167358398, "learning_rate": 9.999985915763598e-07, "loss": 0.2069, "num_input_tokens_seen": 423624704, "step": 202 }, { "epoch": 0.05118507312153303, "grad_norm": 3.827716588973999, "learning_rate": 9.999974961367668e-07, "loss": 0.1987, "num_input_tokens_seen": 425721856, "step": 203 }, { "epoch": 0.05143721633888049, "grad_norm": 3.8995583057403564, "learning_rate": 9.999960877157389e-07, "loss": 0.1473, "num_input_tokens_seen": 427819008, "step": 204 }, { "epoch": 0.051689359556227936, "grad_norm": 3.6740832328796387, "learning_rate": 9.99994366314256e-07, "loss": 0.1348, "num_input_tokens_seen": 429916160, "step": 205 }, { "epoch": 0.05194150277357539, "grad_norm": 3.7553346157073975, "learning_rate": 9.99992331933515e-07, "loss": 0.1463, "num_input_tokens_seen": 432013312, "step": 206 }, { "epoch": 0.05219364599092285, "grad_norm": 4.992524147033691, "learning_rate": 9.99989984574931e-07, "loss": 0.2349, "num_input_tokens_seen": 434110464, "step": 207 }, { "epoch": 0.052445789208270296, "grad_norm": 4.383981704711914, "learning_rate": 9.99987324240137e-07, "loss": 0.1552, "num_input_tokens_seen": 436207616, "step": 208 }, { "epoch": 0.05269793242561775, "grad_norm": 4.6292619705200195, "learning_rate": 9.999843509309827e-07, "loss": 0.1998, "num_input_tokens_seen": 438304768, "step": 209 }, { "epoch": 0.0529500756429652, "grad_norm": 3.5693604946136475, "learning_rate": 9.999810646495363e-07, "loss": 0.1409, "num_input_tokens_seen": 440401920, "step": 210 }, { "epoch": 0.05320221886031266, "grad_norm": 4.460555553436279, "learning_rate": 9.999774653980837e-07, "loss": 0.2005, "num_input_tokens_seen": 442499072, "step": 211 }, { "epoch": 0.05345436207766011, "grad_norm": 3.6692800521850586, "learning_rate": 9.99973553179128e-07, "loss": 0.1358, "num_input_tokens_seen": 444596224, "step": 212 }, { "epoch": 0.05370650529500756, "grad_norm": 3.4849557876586914, "learning_rate": 9.999693279953903e-07, "loss": 0.1199, "num_input_tokens_seen": 446693376, "step": 213 }, { "epoch": 0.05395864851235502, "grad_norm": 3.9747097492218018, "learning_rate": 9.999647898498095e-07, "loss": 0.1885, "num_input_tokens_seen": 448790528, "step": 214 }, { "epoch": 0.05421079172970247, "grad_norm": 4.172543525695801, "learning_rate": 9.999599387455416e-07, "loss": 0.2118, "num_input_tokens_seen": 450887680, "step": 215 }, { "epoch": 0.05446293494704992, "grad_norm": 3.811913013458252, "learning_rate": 9.999547746859607e-07, "loss": 0.1973, "num_input_tokens_seen": 452984832, "step": 216 }, { "epoch": 0.05471507816439738, "grad_norm": 3.7271082401275635, "learning_rate": 9.999492976746585e-07, "loss": 0.2219, "num_input_tokens_seen": 455081984, "step": 217 }, { "epoch": 0.054967221381744834, "grad_norm": 4.112778186798096, "learning_rate": 9.999435077154446e-07, "loss": 0.1748, "num_input_tokens_seen": 457179136, "step": 218 }, { "epoch": 0.05521936459909228, "grad_norm": 6.517294883728027, "learning_rate": 9.99937404812346e-07, "loss": 0.3107, "num_input_tokens_seen": 459276288, "step": 219 }, { "epoch": 0.05547150781643974, "grad_norm": 4.02686071395874, "learning_rate": 9.99930988969607e-07, "loss": 0.0861, "num_input_tokens_seen": 461373440, "step": 220 }, { "epoch": 0.055723651033787194, "grad_norm": 3.6635353565216064, "learning_rate": 9.999242601916902e-07, "loss": 0.2132, "num_input_tokens_seen": 463470592, "step": 221 }, { "epoch": 0.05597579425113464, "grad_norm": 4.417490005493164, "learning_rate": 9.999172184832756e-07, "loss": 0.2374, "num_input_tokens_seen": 465567744, "step": 222 }, { "epoch": 0.0562279374684821, "grad_norm": 3.173140048980713, "learning_rate": 9.99909863849261e-07, "loss": 0.1771, "num_input_tokens_seen": 467664896, "step": 223 }, { "epoch": 0.05648008068582955, "grad_norm": 5.276343822479248, "learning_rate": 9.999021962947612e-07, "loss": 0.1569, "num_input_tokens_seen": 469762048, "step": 224 }, { "epoch": 0.056732223903177004, "grad_norm": 4.241299629211426, "learning_rate": 9.998942158251096e-07, "loss": 0.2738, "num_input_tokens_seen": 471859200, "step": 225 }, { "epoch": 0.05698436712052446, "grad_norm": 4.36360502243042, "learning_rate": 9.998859224458565e-07, "loss": 0.2735, "num_input_tokens_seen": 473956352, "step": 226 }, { "epoch": 0.05723651033787191, "grad_norm": 5.051778316497803, "learning_rate": 9.998773161627701e-07, "loss": 0.1831, "num_input_tokens_seen": 476053504, "step": 227 }, { "epoch": 0.057488653555219364, "grad_norm": 3.883115291595459, "learning_rate": 9.998683969818364e-07, "loss": 0.1617, "num_input_tokens_seen": 478150656, "step": 228 }, { "epoch": 0.05774079677256682, "grad_norm": 3.9679079055786133, "learning_rate": 9.998591649092588e-07, "loss": 0.1273, "num_input_tokens_seen": 480247808, "step": 229 }, { "epoch": 0.05799293998991427, "grad_norm": 6.0246901512146, "learning_rate": 9.998496199514582e-07, "loss": 0.1463, "num_input_tokens_seen": 482344960, "step": 230 }, { "epoch": 0.058245083207261725, "grad_norm": 3.684004545211792, "learning_rate": 9.998397621150734e-07, "loss": 0.1422, "num_input_tokens_seen": 484442112, "step": 231 }, { "epoch": 0.05849722642460918, "grad_norm": 5.111332416534424, "learning_rate": 9.998295914069606e-07, "loss": 0.2197, "num_input_tokens_seen": 486539264, "step": 232 }, { "epoch": 0.05874936964195663, "grad_norm": 3.0218448638916016, "learning_rate": 9.99819107834194e-07, "loss": 0.1219, "num_input_tokens_seen": 488636416, "step": 233 }, { "epoch": 0.059001512859304085, "grad_norm": 3.564114570617676, "learning_rate": 9.99808311404065e-07, "loss": 0.1983, "num_input_tokens_seen": 490733568, "step": 234 }, { "epoch": 0.05925365607665154, "grad_norm": 6.091875076293945, "learning_rate": 9.997972021240824e-07, "loss": 0.2782, "num_input_tokens_seen": 492830720, "step": 235 }, { "epoch": 0.05950579929399899, "grad_norm": 4.984955787658691, "learning_rate": 9.997857800019734e-07, "loss": 0.2658, "num_input_tokens_seen": 494927872, "step": 236 }, { "epoch": 0.059757942511346446, "grad_norm": 4.2022705078125, "learning_rate": 9.997740450456819e-07, "loss": 0.1511, "num_input_tokens_seen": 497025024, "step": 237 }, { "epoch": 0.0600100857286939, "grad_norm": 4.631911277770996, "learning_rate": 9.997619972633701e-07, "loss": 0.1874, "num_input_tokens_seen": 499122176, "step": 238 }, { "epoch": 0.06026222894604135, "grad_norm": 3.489034414291382, "learning_rate": 9.99749636663417e-07, "loss": 0.1684, "num_input_tokens_seen": 501219328, "step": 239 }, { "epoch": 0.060514372163388806, "grad_norm": 5.1144185066223145, "learning_rate": 9.997369632544202e-07, "loss": 0.1834, "num_input_tokens_seen": 503316480, "step": 240 }, { "epoch": 0.060766515380736255, "grad_norm": 5.526945114135742, "learning_rate": 9.997239770451938e-07, "loss": 0.2135, "num_input_tokens_seen": 505413632, "step": 241 }, { "epoch": 0.06101865859808371, "grad_norm": 6.000234127044678, "learning_rate": 9.997106780447705e-07, "loss": 0.2248, "num_input_tokens_seen": 507510784, "step": 242 }, { "epoch": 0.06127080181543117, "grad_norm": 3.4181573390960693, "learning_rate": 9.99697066262399e-07, "loss": 0.0903, "num_input_tokens_seen": 509607936, "step": 243 }, { "epoch": 0.061522945032778616, "grad_norm": 3.6254003047943115, "learning_rate": 9.996831417075477e-07, "loss": 0.1507, "num_input_tokens_seen": 511705088, "step": 244 }, { "epoch": 0.06177508825012607, "grad_norm": 3.7657456398010254, "learning_rate": 9.996689043899005e-07, "loss": 0.1569, "num_input_tokens_seen": 513802240, "step": 245 }, { "epoch": 0.06202723146747353, "grad_norm": 4.642493724822998, "learning_rate": 9.996543543193604e-07, "loss": 0.1187, "num_input_tokens_seen": 515899392, "step": 246 }, { "epoch": 0.062279374684820976, "grad_norm": 3.632336378097534, "learning_rate": 9.996394915060468e-07, "loss": 0.1736, "num_input_tokens_seen": 517996544, "step": 247 }, { "epoch": 0.06253151790216843, "grad_norm": 4.491301536560059, "learning_rate": 9.99624315960297e-07, "loss": 0.2351, "num_input_tokens_seen": 520093696, "step": 248 }, { "epoch": 0.06278366111951589, "grad_norm": 2.526890277862549, "learning_rate": 9.996088276926661e-07, "loss": 0.1088, "num_input_tokens_seen": 522190848, "step": 249 }, { "epoch": 0.06303580433686334, "grad_norm": 4.402822971343994, "learning_rate": 9.995930267139266e-07, "loss": 0.1189, "num_input_tokens_seen": 524288000, "step": 250 }, { "epoch": 0.06328794755421079, "grad_norm": 2.893916368484497, "learning_rate": 9.99576913035068e-07, "loss": 0.1003, "num_input_tokens_seen": 526385152, "step": 251 }, { "epoch": 0.06354009077155824, "grad_norm": 4.437779426574707, "learning_rate": 9.995604866672978e-07, "loss": 0.21, "num_input_tokens_seen": 528482304, "step": 252 }, { "epoch": 0.0637922339889057, "grad_norm": 7.890944957733154, "learning_rate": 9.995437476220408e-07, "loss": 0.3668, "num_input_tokens_seen": 530579456, "step": 253 }, { "epoch": 0.06404437720625315, "grad_norm": 3.5893633365631104, "learning_rate": 9.995266959109396e-07, "loss": 0.1771, "num_input_tokens_seen": 532676608, "step": 254 }, { "epoch": 0.06429652042360061, "grad_norm": 4.691050052642822, "learning_rate": 9.995093315458534e-07, "loss": 0.1696, "num_input_tokens_seen": 534773760, "step": 255 }, { "epoch": 0.06454866364094806, "grad_norm": 2.8213396072387695, "learning_rate": 9.9949165453886e-07, "loss": 0.1364, "num_input_tokens_seen": 536870912, "step": 256 }, { "epoch": 0.0648008068582955, "grad_norm": 4.529366493225098, "learning_rate": 9.994736649022539e-07, "loss": 0.1749, "num_input_tokens_seen": 538968064, "step": 257 }, { "epoch": 0.06505295007564296, "grad_norm": 3.919793128967285, "learning_rate": 9.99455362648547e-07, "loss": 0.1611, "num_input_tokens_seen": 541065216, "step": 258 }, { "epoch": 0.06530509329299042, "grad_norm": 4.9372711181640625, "learning_rate": 9.994367477904695e-07, "loss": 0.2556, "num_input_tokens_seen": 543162368, "step": 259 }, { "epoch": 0.06555723651033787, "grad_norm": 5.533105850219727, "learning_rate": 9.994178203409674e-07, "loss": 0.1598, "num_input_tokens_seen": 545259520, "step": 260 }, { "epoch": 0.06580937972768533, "grad_norm": 4.164669990539551, "learning_rate": 9.993985803132057e-07, "loss": 0.1743, "num_input_tokens_seen": 547356672, "step": 261 }, { "epoch": 0.06606152294503277, "grad_norm": 3.924823045730591, "learning_rate": 9.993790277205662e-07, "loss": 0.169, "num_input_tokens_seen": 549453824, "step": 262 }, { "epoch": 0.06631366616238023, "grad_norm": 3.045861005783081, "learning_rate": 9.993591625766477e-07, "loss": 0.1027, "num_input_tokens_seen": 551550976, "step": 263 }, { "epoch": 0.06656580937972768, "grad_norm": 2.7366058826446533, "learning_rate": 9.993389848952673e-07, "loss": 0.1027, "num_input_tokens_seen": 553648128, "step": 264 }, { "epoch": 0.06681795259707514, "grad_norm": 4.305903434753418, "learning_rate": 9.993184946904586e-07, "loss": 0.0899, "num_input_tokens_seen": 555745280, "step": 265 }, { "epoch": 0.0670700958144226, "grad_norm": 4.169579029083252, "learning_rate": 9.992976919764728e-07, "loss": 0.1555, "num_input_tokens_seen": 557842432, "step": 266 }, { "epoch": 0.06732223903177005, "grad_norm": 2.866806983947754, "learning_rate": 9.992765767677789e-07, "loss": 0.1226, "num_input_tokens_seen": 559939584, "step": 267 }, { "epoch": 0.06757438224911749, "grad_norm": 3.6884562969207764, "learning_rate": 9.992551490790626e-07, "loss": 0.1359, "num_input_tokens_seen": 562036736, "step": 268 }, { "epoch": 0.06782652546646495, "grad_norm": 4.731523513793945, "learning_rate": 9.992334089252278e-07, "loss": 0.1438, "num_input_tokens_seen": 564133888, "step": 269 }, { "epoch": 0.0680786686838124, "grad_norm": 3.90913724899292, "learning_rate": 9.992113563213944e-07, "loss": 0.1596, "num_input_tokens_seen": 566231040, "step": 270 }, { "epoch": 0.06833081190115986, "grad_norm": 3.4404547214508057, "learning_rate": 9.99188991282901e-07, "loss": 0.165, "num_input_tokens_seen": 568328192, "step": 271 }, { "epoch": 0.06858295511850732, "grad_norm": 2.840576648712158, "learning_rate": 9.991663138253025e-07, "loss": 0.109, "num_input_tokens_seen": 570425344, "step": 272 }, { "epoch": 0.06883509833585477, "grad_norm": 4.362993240356445, "learning_rate": 9.991433239643716e-07, "loss": 0.209, "num_input_tokens_seen": 572522496, "step": 273 }, { "epoch": 0.06908724155320221, "grad_norm": 4.26267671585083, "learning_rate": 9.991200217160984e-07, "loss": 0.0746, "num_input_tokens_seen": 574619648, "step": 274 }, { "epoch": 0.06933938477054967, "grad_norm": 3.7214324474334717, "learning_rate": 9.990964070966895e-07, "loss": 0.1395, "num_input_tokens_seen": 576716800, "step": 275 }, { "epoch": 0.06959152798789713, "grad_norm": 4.263853549957275, "learning_rate": 9.9907248012257e-07, "loss": 0.1919, "num_input_tokens_seen": 578813952, "step": 276 }, { "epoch": 0.06984367120524458, "grad_norm": 3.7660653591156006, "learning_rate": 9.99048240810381e-07, "loss": 0.1362, "num_input_tokens_seen": 580911104, "step": 277 }, { "epoch": 0.07009581442259204, "grad_norm": 3.3318731784820557, "learning_rate": 9.990236891769818e-07, "loss": 0.0849, "num_input_tokens_seen": 583008256, "step": 278 }, { "epoch": 0.07034795763993948, "grad_norm": 3.9983317852020264, "learning_rate": 9.98998825239448e-07, "loss": 0.1731, "num_input_tokens_seen": 585105408, "step": 279 }, { "epoch": 0.07060010085728693, "grad_norm": 3.032134532928467, "learning_rate": 9.98973649015073e-07, "loss": 0.1278, "num_input_tokens_seen": 587202560, "step": 280 }, { "epoch": 0.07085224407463439, "grad_norm": 3.8470921516418457, "learning_rate": 9.98948160521368e-07, "loss": 0.103, "num_input_tokens_seen": 589299712, "step": 281 }, { "epoch": 0.07110438729198185, "grad_norm": 2.935425043106079, "learning_rate": 9.989223597760598e-07, "loss": 0.1472, "num_input_tokens_seen": 591396864, "step": 282 }, { "epoch": 0.0713565305093293, "grad_norm": 3.791640043258667, "learning_rate": 9.988962467970938e-07, "loss": 0.1743, "num_input_tokens_seen": 593494016, "step": 283 }, { "epoch": 0.07160867372667676, "grad_norm": 2.616250991821289, "learning_rate": 9.988698216026322e-07, "loss": 0.0769, "num_input_tokens_seen": 595591168, "step": 284 }, { "epoch": 0.0718608169440242, "grad_norm": 3.309394359588623, "learning_rate": 9.988430842110538e-07, "loss": 0.1357, "num_input_tokens_seen": 597688320, "step": 285 }, { "epoch": 0.07211296016137166, "grad_norm": 4.600468635559082, "learning_rate": 9.988160346409551e-07, "loss": 0.1178, "num_input_tokens_seen": 599785472, "step": 286 }, { "epoch": 0.07236510337871911, "grad_norm": 3.2695717811584473, "learning_rate": 9.987886729111496e-07, "loss": 0.1122, "num_input_tokens_seen": 601882624, "step": 287 }, { "epoch": 0.07261724659606657, "grad_norm": 2.7870922088623047, "learning_rate": 9.98760999040668e-07, "loss": 0.0995, "num_input_tokens_seen": 603979776, "step": 288 }, { "epoch": 0.07286938981341402, "grad_norm": 3.2872393131256104, "learning_rate": 9.987330130487576e-07, "loss": 0.1314, "num_input_tokens_seen": 606076928, "step": 289 }, { "epoch": 0.07312153303076148, "grad_norm": 4.210444927215576, "learning_rate": 9.987047149548833e-07, "loss": 0.1435, "num_input_tokens_seen": 608174080, "step": 290 }, { "epoch": 0.07337367624810892, "grad_norm": 3.661651372909546, "learning_rate": 9.986761047787274e-07, "loss": 0.1075, "num_input_tokens_seen": 610271232, "step": 291 }, { "epoch": 0.07362581946545638, "grad_norm": 4.133707046508789, "learning_rate": 9.986471825401882e-07, "loss": 0.1977, "num_input_tokens_seen": 612368384, "step": 292 }, { "epoch": 0.07387796268280383, "grad_norm": 4.6356072425842285, "learning_rate": 9.98617948259382e-07, "loss": 0.1703, "num_input_tokens_seen": 614465536, "step": 293 }, { "epoch": 0.07413010590015129, "grad_norm": 3.9383256435394287, "learning_rate": 9.985884019566416e-07, "loss": 0.1848, "num_input_tokens_seen": 616562688, "step": 294 }, { "epoch": 0.07438224911749874, "grad_norm": 4.793269157409668, "learning_rate": 9.985585436525168e-07, "loss": 0.1488, "num_input_tokens_seen": 618659840, "step": 295 }, { "epoch": 0.07463439233484619, "grad_norm": 6.518699645996094, "learning_rate": 9.98528373367775e-07, "loss": 0.24, "num_input_tokens_seen": 620756992, "step": 296 }, { "epoch": 0.07488653555219364, "grad_norm": 3.71830415725708, "learning_rate": 9.984978911234003e-07, "loss": 0.1444, "num_input_tokens_seen": 622854144, "step": 297 }, { "epoch": 0.0751386787695411, "grad_norm": 3.535399913787842, "learning_rate": 9.984670969405932e-07, "loss": 0.145, "num_input_tokens_seen": 624951296, "step": 298 }, { "epoch": 0.07539082198688855, "grad_norm": 2.5828938484191895, "learning_rate": 9.984359908407716e-07, "loss": 0.1091, "num_input_tokens_seen": 627048448, "step": 299 }, { "epoch": 0.07564296520423601, "grad_norm": 3.900514841079712, "learning_rate": 9.984045728455707e-07, "loss": 0.1672, "num_input_tokens_seen": 629145600, "step": 300 }, { "epoch": 0.07589510842158347, "grad_norm": 4.364770412445068, "learning_rate": 9.98372842976842e-07, "loss": 0.2678, "num_input_tokens_seen": 631242752, "step": 301 }, { "epoch": 0.07614725163893091, "grad_norm": 3.6578245162963867, "learning_rate": 9.983408012566545e-07, "loss": 0.1238, "num_input_tokens_seen": 633339904, "step": 302 }, { "epoch": 0.07639939485627836, "grad_norm": 3.067723512649536, "learning_rate": 9.983084477072936e-07, "loss": 0.092, "num_input_tokens_seen": 635437056, "step": 303 }, { "epoch": 0.07665153807362582, "grad_norm": 2.8249781131744385, "learning_rate": 9.982757823512619e-07, "loss": 0.1065, "num_input_tokens_seen": 637534208, "step": 304 }, { "epoch": 0.07690368129097327, "grad_norm": 3.4561619758605957, "learning_rate": 9.982428052112784e-07, "loss": 0.1463, "num_input_tokens_seen": 639631360, "step": 305 }, { "epoch": 0.07715582450832073, "grad_norm": 4.192049503326416, "learning_rate": 9.982095163102796e-07, "loss": 0.1127, "num_input_tokens_seen": 641728512, "step": 306 }, { "epoch": 0.07740796772566819, "grad_norm": 2.888293743133545, "learning_rate": 9.981759156714185e-07, "loss": 0.113, "num_input_tokens_seen": 643825664, "step": 307 }, { "epoch": 0.07766011094301563, "grad_norm": 3.8195247650146484, "learning_rate": 9.981420033180651e-07, "loss": 0.1601, "num_input_tokens_seen": 645922816, "step": 308 }, { "epoch": 0.07791225416036308, "grad_norm": 3.721971035003662, "learning_rate": 9.98107779273806e-07, "loss": 0.1443, "num_input_tokens_seen": 648019968, "step": 309 }, { "epoch": 0.07816439737771054, "grad_norm": 3.4332494735717773, "learning_rate": 9.980732435624441e-07, "loss": 0.1503, "num_input_tokens_seen": 650117120, "step": 310 }, { "epoch": 0.078416540595058, "grad_norm": 2.9033710956573486, "learning_rate": 9.980383962080003e-07, "loss": 0.073, "num_input_tokens_seen": 652214272, "step": 311 }, { "epoch": 0.07866868381240545, "grad_norm": 3.597287178039551, "learning_rate": 9.980032372347116e-07, "loss": 0.1596, "num_input_tokens_seen": 654311424, "step": 312 }, { "epoch": 0.0789208270297529, "grad_norm": 3.0851659774780273, "learning_rate": 9.97967766667031e-07, "loss": 0.1188, "num_input_tokens_seen": 656408576, "step": 313 }, { "epoch": 0.07917297024710035, "grad_norm": 2.279250144958496, "learning_rate": 9.979319845296296e-07, "loss": 0.0974, "num_input_tokens_seen": 658505728, "step": 314 }, { "epoch": 0.0794251134644478, "grad_norm": 4.360164165496826, "learning_rate": 9.978958908473941e-07, "loss": 0.1992, "num_input_tokens_seen": 660602880, "step": 315 }, { "epoch": 0.07967725668179526, "grad_norm": 2.8060495853424072, "learning_rate": 9.978594856454288e-07, "loss": 0.1314, "num_input_tokens_seen": 662700032, "step": 316 }, { "epoch": 0.07992939989914272, "grad_norm": 4.089578628540039, "learning_rate": 9.978227689490536e-07, "loss": 0.1807, "num_input_tokens_seen": 664797184, "step": 317 }, { "epoch": 0.08018154311649017, "grad_norm": 3.043846368789673, "learning_rate": 9.977857407838061e-07, "loss": 0.1208, "num_input_tokens_seen": 666894336, "step": 318 }, { "epoch": 0.08043368633383761, "grad_norm": 2.2600390911102295, "learning_rate": 9.9774840117544e-07, "loss": 0.076, "num_input_tokens_seen": 668991488, "step": 319 }, { "epoch": 0.08068582955118507, "grad_norm": 3.115410089492798, "learning_rate": 9.977107501499253e-07, "loss": 0.1118, "num_input_tokens_seen": 671088640, "step": 320 }, { "epoch": 0.08093797276853253, "grad_norm": 3.720118761062622, "learning_rate": 9.976727877334493e-07, "loss": 0.1518, "num_input_tokens_seen": 673185792, "step": 321 }, { "epoch": 0.08119011598587998, "grad_norm": 3.6921238899230957, "learning_rate": 9.976345139524152e-07, "loss": 0.1261, "num_input_tokens_seen": 675282944, "step": 322 }, { "epoch": 0.08144225920322744, "grad_norm": 3.162914752960205, "learning_rate": 9.975959288334438e-07, "loss": 0.1038, "num_input_tokens_seen": 677380096, "step": 323 }, { "epoch": 0.08169440242057488, "grad_norm": 3.166231870651245, "learning_rate": 9.97557032403371e-07, "loss": 0.1294, "num_input_tokens_seen": 679477248, "step": 324 }, { "epoch": 0.08194654563792234, "grad_norm": 3.0747804641723633, "learning_rate": 9.975178246892507e-07, "loss": 0.1425, "num_input_tokens_seen": 681574400, "step": 325 }, { "epoch": 0.08219868885526979, "grad_norm": 3.0979673862457275, "learning_rate": 9.974783057183519e-07, "loss": 0.1586, "num_input_tokens_seen": 683671552, "step": 326 }, { "epoch": 0.08245083207261725, "grad_norm": 4.019197940826416, "learning_rate": 9.974384755181609e-07, "loss": 0.1663, "num_input_tokens_seen": 685768704, "step": 327 }, { "epoch": 0.0827029752899647, "grad_norm": 2.6061339378356934, "learning_rate": 9.973983341163807e-07, "loss": 0.0851, "num_input_tokens_seen": 687865856, "step": 328 }, { "epoch": 0.08295511850731216, "grad_norm": 3.0148558616638184, "learning_rate": 9.9735788154093e-07, "loss": 0.0966, "num_input_tokens_seen": 689963008, "step": 329 }, { "epoch": 0.0832072617246596, "grad_norm": 2.6705162525177, "learning_rate": 9.973171178199447e-07, "loss": 0.0839, "num_input_tokens_seen": 692060160, "step": 330 }, { "epoch": 0.08345940494200706, "grad_norm": 4.910850524902344, "learning_rate": 9.972760429817763e-07, "loss": 0.1695, "num_input_tokens_seen": 694157312, "step": 331 }, { "epoch": 0.08371154815935451, "grad_norm": 3.358743190765381, "learning_rate": 9.972346570549932e-07, "loss": 0.0935, "num_input_tokens_seen": 696254464, "step": 332 }, { "epoch": 0.08396369137670197, "grad_norm": 3.214064598083496, "learning_rate": 9.971929600683802e-07, "loss": 0.0848, "num_input_tokens_seen": 698351616, "step": 333 }, { "epoch": 0.08421583459404942, "grad_norm": 4.408289432525635, "learning_rate": 9.971509520509381e-07, "loss": 0.1624, "num_input_tokens_seen": 700448768, "step": 334 }, { "epoch": 0.08446797781139688, "grad_norm": 4.276678085327148, "learning_rate": 9.971086330318845e-07, "loss": 0.1458, "num_input_tokens_seen": 702545920, "step": 335 }, { "epoch": 0.08472012102874432, "grad_norm": 2.518461227416992, "learning_rate": 9.97066003040653e-07, "loss": 0.0934, "num_input_tokens_seen": 704643072, "step": 336 }, { "epoch": 0.08497226424609178, "grad_norm": 2.8323476314544678, "learning_rate": 9.970230621068932e-07, "loss": 0.1324, "num_input_tokens_seen": 706740224, "step": 337 }, { "epoch": 0.08522440746343923, "grad_norm": 2.8873610496520996, "learning_rate": 9.969798102604717e-07, "loss": 0.1292, "num_input_tokens_seen": 708837376, "step": 338 }, { "epoch": 0.08547655068078669, "grad_norm": 2.796959638595581, "learning_rate": 9.969362475314708e-07, "loss": 0.1086, "num_input_tokens_seen": 710934528, "step": 339 }, { "epoch": 0.08572869389813415, "grad_norm": 4.745234966278076, "learning_rate": 9.968923739501892e-07, "loss": 0.2212, "num_input_tokens_seen": 713031680, "step": 340 }, { "epoch": 0.08598083711548159, "grad_norm": 4.436620235443115, "learning_rate": 9.968481895471417e-07, "loss": 0.1376, "num_input_tokens_seen": 715128832, "step": 341 }, { "epoch": 0.08623298033282904, "grad_norm": 4.772200584411621, "learning_rate": 9.968036943530592e-07, "loss": 0.193, "num_input_tokens_seen": 717225984, "step": 342 }, { "epoch": 0.0864851235501765, "grad_norm": 3.2390449047088623, "learning_rate": 9.967588883988893e-07, "loss": 0.0999, "num_input_tokens_seen": 719323136, "step": 343 }, { "epoch": 0.08673726676752395, "grad_norm": 3.936569929122925, "learning_rate": 9.967137717157951e-07, "loss": 0.1634, "num_input_tokens_seen": 721420288, "step": 344 }, { "epoch": 0.08698940998487141, "grad_norm": 3.647679567337036, "learning_rate": 9.966683443351564e-07, "loss": 0.1798, "num_input_tokens_seen": 723517440, "step": 345 }, { "epoch": 0.08724155320221887, "grad_norm": 2.8842921257019043, "learning_rate": 9.966226062885682e-07, "loss": 0.1033, "num_input_tokens_seen": 725614592, "step": 346 }, { "epoch": 0.08749369641956631, "grad_norm": 6.5264434814453125, "learning_rate": 9.965765576078424e-07, "loss": 0.2729, "num_input_tokens_seen": 727711744, "step": 347 }, { "epoch": 0.08774583963691376, "grad_norm": 3.786755084991455, "learning_rate": 9.96530198325007e-07, "loss": 0.1233, "num_input_tokens_seen": 729808896, "step": 348 }, { "epoch": 0.08799798285426122, "grad_norm": 3.994030237197876, "learning_rate": 9.964835284723052e-07, "loss": 0.1229, "num_input_tokens_seen": 731906048, "step": 349 }, { "epoch": 0.08825012607160868, "grad_norm": 4.352416038513184, "learning_rate": 9.96436548082197e-07, "loss": 0.1501, "num_input_tokens_seen": 734003200, "step": 350 }, { "epoch": 0.08850226928895613, "grad_norm": 3.238286018371582, "learning_rate": 9.963892571873584e-07, "loss": 0.1314, "num_input_tokens_seen": 736100352, "step": 351 }, { "epoch": 0.08875441250630359, "grad_norm": 2.75301456451416, "learning_rate": 9.963416558206806e-07, "loss": 0.1137, "num_input_tokens_seen": 738197504, "step": 352 }, { "epoch": 0.08900655572365103, "grad_norm": 3.3911097049713135, "learning_rate": 9.962937440152712e-07, "loss": 0.0976, "num_input_tokens_seen": 740294656, "step": 353 }, { "epoch": 0.08925869894099848, "grad_norm": 2.7000679969787598, "learning_rate": 9.962455218044542e-07, "loss": 0.063, "num_input_tokens_seen": 742391808, "step": 354 }, { "epoch": 0.08951084215834594, "grad_norm": 3.3619422912597656, "learning_rate": 9.961969892217688e-07, "loss": 0.1167, "num_input_tokens_seen": 744488960, "step": 355 }, { "epoch": 0.0897629853756934, "grad_norm": 2.421957015991211, "learning_rate": 9.9614814630097e-07, "loss": 0.1184, "num_input_tokens_seen": 746586112, "step": 356 }, { "epoch": 0.09001512859304085, "grad_norm": 3.2838544845581055, "learning_rate": 9.960989930760294e-07, "loss": 0.1133, "num_input_tokens_seen": 748683264, "step": 357 }, { "epoch": 0.0902672718103883, "grad_norm": 4.716813564300537, "learning_rate": 9.960495295811337e-07, "loss": 0.152, "num_input_tokens_seen": 750780416, "step": 358 }, { "epoch": 0.09051941502773575, "grad_norm": 3.567866563796997, "learning_rate": 9.959997558506857e-07, "loss": 0.1348, "num_input_tokens_seen": 752877568, "step": 359 }, { "epoch": 0.0907715582450832, "grad_norm": 8.155049324035645, "learning_rate": 9.959496719193039e-07, "loss": 0.1658, "num_input_tokens_seen": 754974720, "step": 360 }, { "epoch": 0.09102370146243066, "grad_norm": 4.341349124908447, "learning_rate": 9.958992778218226e-07, "loss": 0.1635, "num_input_tokens_seen": 757071872, "step": 361 }, { "epoch": 0.09127584467977812, "grad_norm": 4.6380815505981445, "learning_rate": 9.95848573593292e-07, "loss": 0.1715, "num_input_tokens_seen": 759169024, "step": 362 }, { "epoch": 0.09152798789712557, "grad_norm": 3.3967676162719727, "learning_rate": 9.957975592689774e-07, "loss": 0.106, "num_input_tokens_seen": 761266176, "step": 363 }, { "epoch": 0.09178013111447302, "grad_norm": 2.9890308380126953, "learning_rate": 9.957462348843607e-07, "loss": 0.1163, "num_input_tokens_seen": 763363328, "step": 364 }, { "epoch": 0.09203227433182047, "grad_norm": 2.564323663711548, "learning_rate": 9.956946004751386e-07, "loss": 0.1217, "num_input_tokens_seen": 765460480, "step": 365 }, { "epoch": 0.09228441754916793, "grad_norm": 4.0984697341918945, "learning_rate": 9.956426560772238e-07, "loss": 0.1801, "num_input_tokens_seen": 767557632, "step": 366 }, { "epoch": 0.09253656076651538, "grad_norm": 2.5396645069122314, "learning_rate": 9.955904017267444e-07, "loss": 0.1272, "num_input_tokens_seen": 769654784, "step": 367 }, { "epoch": 0.09278870398386284, "grad_norm": 3.0213351249694824, "learning_rate": 9.955378374600447e-07, "loss": 0.121, "num_input_tokens_seen": 771751936, "step": 368 }, { "epoch": 0.09304084720121028, "grad_norm": 3.8049328327178955, "learning_rate": 9.954849633136839e-07, "loss": 0.102, "num_input_tokens_seen": 773849088, "step": 369 }, { "epoch": 0.09329299041855774, "grad_norm": 3.4090912342071533, "learning_rate": 9.95431779324437e-07, "loss": 0.1179, "num_input_tokens_seen": 775946240, "step": 370 }, { "epoch": 0.09354513363590519, "grad_norm": 2.5929131507873535, "learning_rate": 9.95378285529294e-07, "loss": 0.1106, "num_input_tokens_seen": 778043392, "step": 371 }, { "epoch": 0.09379727685325265, "grad_norm": 3.6183884143829346, "learning_rate": 9.953244819654615e-07, "loss": 0.1029, "num_input_tokens_seen": 780140544, "step": 372 }, { "epoch": 0.0940494200706001, "grad_norm": 3.812199354171753, "learning_rate": 9.952703686703604e-07, "loss": 0.0838, "num_input_tokens_seen": 782237696, "step": 373 }, { "epoch": 0.09430156328794756, "grad_norm": 5.054091453552246, "learning_rate": 9.952159456816275e-07, "loss": 0.2415, "num_input_tokens_seen": 784334848, "step": 374 }, { "epoch": 0.094553706505295, "grad_norm": 2.739720582962036, "learning_rate": 9.951612130371151e-07, "loss": 0.1198, "num_input_tokens_seen": 786432000, "step": 375 }, { "epoch": 0.09480584972264246, "grad_norm": 3.5317635536193848, "learning_rate": 9.951061707748907e-07, "loss": 0.0951, "num_input_tokens_seen": 788529152, "step": 376 }, { "epoch": 0.09505799293998991, "grad_norm": 2.7190043926239014, "learning_rate": 9.95050818933237e-07, "loss": 0.0918, "num_input_tokens_seen": 790626304, "step": 377 }, { "epoch": 0.09531013615733737, "grad_norm": 2.244220495223999, "learning_rate": 9.949951575506528e-07, "loss": 0.0987, "num_input_tokens_seen": 792723456, "step": 378 }, { "epoch": 0.09556227937468482, "grad_norm": 2.4800469875335693, "learning_rate": 9.94939186665851e-07, "loss": 0.112, "num_input_tokens_seen": 794820608, "step": 379 }, { "epoch": 0.09581442259203228, "grad_norm": 2.934340238571167, "learning_rate": 9.948829063177606e-07, "loss": 0.0914, "num_input_tokens_seen": 796917760, "step": 380 }, { "epoch": 0.09606656580937972, "grad_norm": 4.361299991607666, "learning_rate": 9.948263165455256e-07, "loss": 0.1366, "num_input_tokens_seen": 799014912, "step": 381 }, { "epoch": 0.09631870902672718, "grad_norm": 5.58315372467041, "learning_rate": 9.947694173885051e-07, "loss": 0.1444, "num_input_tokens_seen": 801112064, "step": 382 }, { "epoch": 0.09657085224407463, "grad_norm": 2.2215416431427, "learning_rate": 9.947122088862737e-07, "loss": 0.1324, "num_input_tokens_seen": 803209216, "step": 383 }, { "epoch": 0.09682299546142209, "grad_norm": 3.1041672229766846, "learning_rate": 9.946546910786208e-07, "loss": 0.1451, "num_input_tokens_seen": 805306368, "step": 384 }, { "epoch": 0.09707513867876955, "grad_norm": 3.4068877696990967, "learning_rate": 9.945968640055513e-07, "loss": 0.1318, "num_input_tokens_seen": 807403520, "step": 385 }, { "epoch": 0.09732728189611699, "grad_norm": 2.2413580417633057, "learning_rate": 9.945387277072845e-07, "loss": 0.0665, "num_input_tokens_seen": 809500672, "step": 386 }, { "epoch": 0.09757942511346444, "grad_norm": 2.360349655151367, "learning_rate": 9.944802822242558e-07, "loss": 0.0752, "num_input_tokens_seen": 811597824, "step": 387 }, { "epoch": 0.0978315683308119, "grad_norm": 2.0612034797668457, "learning_rate": 9.944215275971148e-07, "loss": 0.0661, "num_input_tokens_seen": 813694976, "step": 388 }, { "epoch": 0.09808371154815936, "grad_norm": 2.8129661083221436, "learning_rate": 9.943624638667263e-07, "loss": 0.0991, "num_input_tokens_seen": 815792128, "step": 389 }, { "epoch": 0.09833585476550681, "grad_norm": 3.179905891418457, "learning_rate": 9.943030910741707e-07, "loss": 0.166, "num_input_tokens_seen": 817889280, "step": 390 }, { "epoch": 0.09858799798285427, "grad_norm": 3.191718816757202, "learning_rate": 9.942434092607423e-07, "loss": 0.1583, "num_input_tokens_seen": 819986432, "step": 391 }, { "epoch": 0.09884014120020171, "grad_norm": 2.8753068447113037, "learning_rate": 9.941834184679511e-07, "loss": 0.1463, "num_input_tokens_seen": 822083584, "step": 392 }, { "epoch": 0.09909228441754916, "grad_norm": 2.709397315979004, "learning_rate": 9.94123118737522e-07, "loss": 0.103, "num_input_tokens_seen": 824180736, "step": 393 }, { "epoch": 0.09934442763489662, "grad_norm": 3.7003681659698486, "learning_rate": 9.94062510111394e-07, "loss": 0.1539, "num_input_tokens_seen": 826277888, "step": 394 }, { "epoch": 0.09959657085224408, "grad_norm": 4.4324631690979, "learning_rate": 9.94001592631722e-07, "loss": 0.1915, "num_input_tokens_seen": 828375040, "step": 395 }, { "epoch": 0.09984871406959153, "grad_norm": 4.082291126251221, "learning_rate": 9.93940366340875e-07, "loss": 0.2416, "num_input_tokens_seen": 830472192, "step": 396 }, { "epoch": 0.10010085728693899, "grad_norm": 2.7822890281677246, "learning_rate": 9.938788312814374e-07, "loss": 0.1053, "num_input_tokens_seen": 832569344, "step": 397 }, { "epoch": 0.10035300050428643, "grad_norm": 2.376317024230957, "learning_rate": 9.938169874962072e-07, "loss": 0.0785, "num_input_tokens_seen": 834666496, "step": 398 }, { "epoch": 0.10060514372163389, "grad_norm": 6.018281936645508, "learning_rate": 9.937548350281987e-07, "loss": 0.1501, "num_input_tokens_seen": 836763648, "step": 399 }, { "epoch": 0.10085728693898134, "grad_norm": 2.6437666416168213, "learning_rate": 9.936923739206391e-07, "loss": 0.1259, "num_input_tokens_seen": 838860800, "step": 400 }, { "epoch": 0.1011094301563288, "grad_norm": 3.112172842025757, "learning_rate": 9.936296042169723e-07, "loss": 0.1747, "num_input_tokens_seen": 840957952, "step": 401 }, { "epoch": 0.10136157337367625, "grad_norm": 7.632992744445801, "learning_rate": 9.93566525960855e-07, "loss": 0.0882, "num_input_tokens_seen": 843055104, "step": 402 }, { "epoch": 0.1016137165910237, "grad_norm": 3.4459123611450195, "learning_rate": 9.935031391961599e-07, "loss": 0.1184, "num_input_tokens_seen": 845152256, "step": 403 }, { "epoch": 0.10186585980837115, "grad_norm": 3.6913039684295654, "learning_rate": 9.93439443966973e-07, "loss": 0.1121, "num_input_tokens_seen": 847249408, "step": 404 }, { "epoch": 0.1021180030257186, "grad_norm": 3.291170835494995, "learning_rate": 9.933754403175956e-07, "loss": 0.1317, "num_input_tokens_seen": 849346560, "step": 405 }, { "epoch": 0.10237014624306606, "grad_norm": 5.224982738494873, "learning_rate": 9.93311128292544e-07, "loss": 0.2308, "num_input_tokens_seen": 851443712, "step": 406 }, { "epoch": 0.10262228946041352, "grad_norm": 3.043541193008423, "learning_rate": 9.932465079365477e-07, "loss": 0.1293, "num_input_tokens_seen": 853540864, "step": 407 }, { "epoch": 0.10287443267776097, "grad_norm": 3.613516092300415, "learning_rate": 9.931815792945515e-07, "loss": 0.2023, "num_input_tokens_seen": 855638016, "step": 408 }, { "epoch": 0.10312657589510842, "grad_norm": 3.9032676219940186, "learning_rate": 9.931163424117148e-07, "loss": 0.1554, "num_input_tokens_seen": 857735168, "step": 409 }, { "epoch": 0.10337871911245587, "grad_norm": 2.2143468856811523, "learning_rate": 9.930507973334106e-07, "loss": 0.1014, "num_input_tokens_seen": 859832320, "step": 410 }, { "epoch": 0.10363086232980333, "grad_norm": 3.722890615463257, "learning_rate": 9.92984944105227e-07, "loss": 0.1072, "num_input_tokens_seen": 861929472, "step": 411 }, { "epoch": 0.10388300554715078, "grad_norm": 3.3566651344299316, "learning_rate": 9.929187827729658e-07, "loss": 0.1597, "num_input_tokens_seen": 864026624, "step": 412 }, { "epoch": 0.10413514876449824, "grad_norm": 2.243074655532837, "learning_rate": 9.928523133826437e-07, "loss": 0.0799, "num_input_tokens_seen": 866123776, "step": 413 }, { "epoch": 0.1043872919818457, "grad_norm": 2.4208436012268066, "learning_rate": 9.927855359804914e-07, "loss": 0.1441, "num_input_tokens_seen": 868220928, "step": 414 }, { "epoch": 0.10463943519919314, "grad_norm": 3.7958076000213623, "learning_rate": 9.927184506129535e-07, "loss": 0.1769, "num_input_tokens_seen": 870318080, "step": 415 }, { "epoch": 0.10489157841654059, "grad_norm": 2.1095194816589355, "learning_rate": 9.926510573266894e-07, "loss": 0.0626, "num_input_tokens_seen": 872415232, "step": 416 }, { "epoch": 0.10514372163388805, "grad_norm": 2.22505784034729, "learning_rate": 9.925833561685718e-07, "loss": 0.0868, "num_input_tokens_seen": 874512384, "step": 417 }, { "epoch": 0.1053958648512355, "grad_norm": 2.8599283695220947, "learning_rate": 9.92515347185689e-07, "loss": 0.1311, "num_input_tokens_seen": 876609536, "step": 418 }, { "epoch": 0.10564800806858296, "grad_norm": 3.1945903301239014, "learning_rate": 9.924470304253418e-07, "loss": 0.0906, "num_input_tokens_seen": 878706688, "step": 419 }, { "epoch": 0.1059001512859304, "grad_norm": 5.766541481018066, "learning_rate": 9.92378405935046e-07, "loss": 0.1588, "num_input_tokens_seen": 880803840, "step": 420 }, { "epoch": 0.10615229450327786, "grad_norm": 2.077852249145508, "learning_rate": 9.92309473762531e-07, "loss": 0.0958, "num_input_tokens_seen": 882900992, "step": 421 }, { "epoch": 0.10640443772062531, "grad_norm": 3.552129030227661, "learning_rate": 9.922402339557405e-07, "loss": 0.1314, "num_input_tokens_seen": 884998144, "step": 422 }, { "epoch": 0.10665658093797277, "grad_norm": 2.371065855026245, "learning_rate": 9.92170686562832e-07, "loss": 0.1129, "num_input_tokens_seen": 887095296, "step": 423 }, { "epoch": 0.10690872415532023, "grad_norm": 3.874335289001465, "learning_rate": 9.921008316321768e-07, "loss": 0.1691, "num_input_tokens_seen": 889192448, "step": 424 }, { "epoch": 0.10716086737266768, "grad_norm": 2.733494520187378, "learning_rate": 9.920306692123609e-07, "loss": 0.1126, "num_input_tokens_seen": 891289600, "step": 425 }, { "epoch": 0.10741301059001512, "grad_norm": 2.3687491416931152, "learning_rate": 9.919601993521829e-07, "loss": 0.1028, "num_input_tokens_seen": 893386752, "step": 426 }, { "epoch": 0.10766515380736258, "grad_norm": 2.3049280643463135, "learning_rate": 9.91889422100656e-07, "loss": 0.0865, "num_input_tokens_seen": 895483904, "step": 427 }, { "epoch": 0.10791729702471003, "grad_norm": 2.899887800216675, "learning_rate": 9.918183375070073e-07, "loss": 0.1258, "num_input_tokens_seen": 897581056, "step": 428 }, { "epoch": 0.10816944024205749, "grad_norm": 4.081860065460205, "learning_rate": 9.917469456206773e-07, "loss": 0.0931, "num_input_tokens_seen": 899678208, "step": 429 }, { "epoch": 0.10842158345940495, "grad_norm": 3.0482466220855713, "learning_rate": 9.916752464913201e-07, "loss": 0.1039, "num_input_tokens_seen": 901775360, "step": 430 }, { "epoch": 0.10867372667675239, "grad_norm": 3.3849377632141113, "learning_rate": 9.916032401688042e-07, "loss": 0.1661, "num_input_tokens_seen": 903872512, "step": 431 }, { "epoch": 0.10892586989409984, "grad_norm": 3.4006130695343018, "learning_rate": 9.91530926703211e-07, "loss": 0.121, "num_input_tokens_seen": 905969664, "step": 432 }, { "epoch": 0.1091780131114473, "grad_norm": 4.100249290466309, "learning_rate": 9.91458306144836e-07, "loss": 0.1976, "num_input_tokens_seen": 908066816, "step": 433 }, { "epoch": 0.10943015632879476, "grad_norm": 2.491917610168457, "learning_rate": 9.913853785441878e-07, "loss": 0.1019, "num_input_tokens_seen": 910163968, "step": 434 }, { "epoch": 0.10968229954614221, "grad_norm": 4.087813377380371, "learning_rate": 9.913121439519893e-07, "loss": 0.1673, "num_input_tokens_seen": 912261120, "step": 435 }, { "epoch": 0.10993444276348967, "grad_norm": 2.377880334854126, "learning_rate": 9.912386024191763e-07, "loss": 0.1184, "num_input_tokens_seen": 914358272, "step": 436 }, { "epoch": 0.11018658598083711, "grad_norm": 2.745607376098633, "learning_rate": 9.911647539968981e-07, "loss": 0.0917, "num_input_tokens_seen": 916455424, "step": 437 }, { "epoch": 0.11043872919818457, "grad_norm": 4.707367897033691, "learning_rate": 9.91090598736518e-07, "loss": 0.2128, "num_input_tokens_seen": 918552576, "step": 438 }, { "epoch": 0.11069087241553202, "grad_norm": 3.578786611557007, "learning_rate": 9.910161366896119e-07, "loss": 0.1235, "num_input_tokens_seen": 920649728, "step": 439 }, { "epoch": 0.11094301563287948, "grad_norm": 2.3904166221618652, "learning_rate": 9.909413679079697e-07, "loss": 0.1139, "num_input_tokens_seen": 922746880, "step": 440 }, { "epoch": 0.11119515885022693, "grad_norm": 3.1667914390563965, "learning_rate": 9.908662924435946e-07, "loss": 0.157, "num_input_tokens_seen": 924844032, "step": 441 }, { "epoch": 0.11144730206757439, "grad_norm": 4.515403747558594, "learning_rate": 9.907909103487027e-07, "loss": 0.1837, "num_input_tokens_seen": 926941184, "step": 442 }, { "epoch": 0.11169944528492183, "grad_norm": 1.9842240810394287, "learning_rate": 9.907152216757239e-07, "loss": 0.1077, "num_input_tokens_seen": 929038336, "step": 443 }, { "epoch": 0.11195158850226929, "grad_norm": 3.713541030883789, "learning_rate": 9.906392264773008e-07, "loss": 0.1401, "num_input_tokens_seen": 931135488, "step": 444 }, { "epoch": 0.11220373171961674, "grad_norm": 2.7595789432525635, "learning_rate": 9.905629248062895e-07, "loss": 0.1262, "num_input_tokens_seen": 933232640, "step": 445 }, { "epoch": 0.1124558749369642, "grad_norm": 3.375941038131714, "learning_rate": 9.904863167157591e-07, "loss": 0.1777, "num_input_tokens_seen": 935329792, "step": 446 }, { "epoch": 0.11270801815431165, "grad_norm": 2.2114899158477783, "learning_rate": 9.904094022589923e-07, "loss": 0.0785, "num_input_tokens_seen": 937426944, "step": 447 }, { "epoch": 0.1129601613716591, "grad_norm": 3.5571250915527344, "learning_rate": 9.90332181489484e-07, "loss": 0.1771, "num_input_tokens_seen": 939524096, "step": 448 }, { "epoch": 0.11321230458900655, "grad_norm": 4.025667667388916, "learning_rate": 9.902546544609432e-07, "loss": 0.1424, "num_input_tokens_seen": 941621248, "step": 449 }, { "epoch": 0.11346444780635401, "grad_norm": 2.804630994796753, "learning_rate": 9.901768212272906e-07, "loss": 0.1722, "num_input_tokens_seen": 943718400, "step": 450 }, { "epoch": 0.11371659102370146, "grad_norm": 2.183051824569702, "learning_rate": 9.900986818426612e-07, "loss": 0.0876, "num_input_tokens_seen": 945815552, "step": 451 }, { "epoch": 0.11396873424104892, "grad_norm": 2.7712557315826416, "learning_rate": 9.900202363614025e-07, "loss": 0.1148, "num_input_tokens_seen": 947912704, "step": 452 }, { "epoch": 0.11422087745839637, "grad_norm": 3.2009191513061523, "learning_rate": 9.899414848380743e-07, "loss": 0.1514, "num_input_tokens_seen": 950009856, "step": 453 }, { "epoch": 0.11447302067574382, "grad_norm": 3.8625547885894775, "learning_rate": 9.8986242732745e-07, "loss": 0.1811, "num_input_tokens_seen": 952107008, "step": 454 }, { "epoch": 0.11472516389309127, "grad_norm": 2.4320788383483887, "learning_rate": 9.897830638845153e-07, "loss": 0.1304, "num_input_tokens_seen": 954204160, "step": 455 }, { "epoch": 0.11497730711043873, "grad_norm": 2.825261354446411, "learning_rate": 9.897033945644692e-07, "loss": 0.1156, "num_input_tokens_seen": 956301312, "step": 456 }, { "epoch": 0.11522945032778618, "grad_norm": 9.34619426727295, "learning_rate": 9.89623419422723e-07, "loss": 0.0738, "num_input_tokens_seen": 958398464, "step": 457 }, { "epoch": 0.11548159354513364, "grad_norm": 3.386025905609131, "learning_rate": 9.895431385149007e-07, "loss": 0.1693, "num_input_tokens_seen": 960495616, "step": 458 }, { "epoch": 0.1157337367624811, "grad_norm": 3.9842169284820557, "learning_rate": 9.894625518968396e-07, "loss": 0.0836, "num_input_tokens_seen": 962592768, "step": 459 }, { "epoch": 0.11598587997982854, "grad_norm": 4.544926166534424, "learning_rate": 9.893816596245886e-07, "loss": 0.2216, "num_input_tokens_seen": 964689920, "step": 460 }, { "epoch": 0.116238023197176, "grad_norm": 3.3318898677825928, "learning_rate": 9.8930046175441e-07, "loss": 0.1638, "num_input_tokens_seen": 966787072, "step": 461 }, { "epoch": 0.11649016641452345, "grad_norm": 2.5450119972229004, "learning_rate": 9.892189583427785e-07, "loss": 0.1472, "num_input_tokens_seen": 968884224, "step": 462 }, { "epoch": 0.1167423096318709, "grad_norm": 5.197476863861084, "learning_rate": 9.891371494463812e-07, "loss": 0.1708, "num_input_tokens_seen": 970981376, "step": 463 }, { "epoch": 0.11699445284921836, "grad_norm": 2.857074499130249, "learning_rate": 9.890550351221176e-07, "loss": 0.0968, "num_input_tokens_seen": 973078528, "step": 464 }, { "epoch": 0.1172465960665658, "grad_norm": 2.8476240634918213, "learning_rate": 9.889726154270997e-07, "loss": 0.1504, "num_input_tokens_seen": 975175680, "step": 465 }, { "epoch": 0.11749873928391326, "grad_norm": 6.322744369506836, "learning_rate": 9.888898904186517e-07, "loss": 0.1249, "num_input_tokens_seen": 977272832, "step": 466 }, { "epoch": 0.11775088250126071, "grad_norm": 3.161973237991333, "learning_rate": 9.888068601543106e-07, "loss": 0.2604, "num_input_tokens_seen": 979369984, "step": 467 }, { "epoch": 0.11800302571860817, "grad_norm": 2.0370872020721436, "learning_rate": 9.887235246918255e-07, "loss": 0.0983, "num_input_tokens_seen": 981467136, "step": 468 }, { "epoch": 0.11825516893595563, "grad_norm": 3.568608283996582, "learning_rate": 9.886398840891576e-07, "loss": 0.1531, "num_input_tokens_seen": 983564288, "step": 469 }, { "epoch": 0.11850731215330308, "grad_norm": 2.3104538917541504, "learning_rate": 9.885559384044805e-07, "loss": 0.1091, "num_input_tokens_seen": 985661440, "step": 470 }, { "epoch": 0.11875945537065052, "grad_norm": 3.4569497108459473, "learning_rate": 9.884716876961798e-07, "loss": 0.1195, "num_input_tokens_seen": 987758592, "step": 471 }, { "epoch": 0.11901159858799798, "grad_norm": 3.131441354751587, "learning_rate": 9.883871320228534e-07, "loss": 0.1564, "num_input_tokens_seen": 989855744, "step": 472 }, { "epoch": 0.11926374180534544, "grad_norm": 3.427337646484375, "learning_rate": 9.883022714433116e-07, "loss": 0.1911, "num_input_tokens_seen": 991952896, "step": 473 }, { "epoch": 0.11951588502269289, "grad_norm": 3.554757833480835, "learning_rate": 9.882171060165764e-07, "loss": 0.1489, "num_input_tokens_seen": 994050048, "step": 474 }, { "epoch": 0.11976802824004035, "grad_norm": 2.5964512825012207, "learning_rate": 9.881316358018816e-07, "loss": 0.0662, "num_input_tokens_seen": 996147200, "step": 475 }, { "epoch": 0.1200201714573878, "grad_norm": 3.2962310314178467, "learning_rate": 9.880458608586737e-07, "loss": 0.1555, "num_input_tokens_seen": 998244352, "step": 476 }, { "epoch": 0.12027231467473525, "grad_norm": 2.869269371032715, "learning_rate": 9.879597812466105e-07, "loss": 0.0795, "num_input_tokens_seen": 1000341504, "step": 477 }, { "epoch": 0.1205244578920827, "grad_norm": 2.913670778274536, "learning_rate": 9.878733970255618e-07, "loss": 0.1329, "num_input_tokens_seen": 1002438656, "step": 478 }, { "epoch": 0.12077660110943016, "grad_norm": 3.124332904815674, "learning_rate": 9.877867082556097e-07, "loss": 0.1538, "num_input_tokens_seen": 1004535808, "step": 479 }, { "epoch": 0.12102874432677761, "grad_norm": 3.5321497917175293, "learning_rate": 9.876997149970477e-07, "loss": 0.1714, "num_input_tokens_seen": 1006632960, "step": 480 }, { "epoch": 0.12128088754412507, "grad_norm": 3.904442071914673, "learning_rate": 9.87612417310381e-07, "loss": 0.1452, "num_input_tokens_seen": 1008730112, "step": 481 }, { "epoch": 0.12153303076147251, "grad_norm": 3.534336805343628, "learning_rate": 9.87524815256327e-07, "loss": 0.1589, "num_input_tokens_seen": 1010827264, "step": 482 }, { "epoch": 0.12178517397881997, "grad_norm": 3.5298209190368652, "learning_rate": 9.874369088958145e-07, "loss": 0.1413, "num_input_tokens_seen": 1012924416, "step": 483 }, { "epoch": 0.12203731719616742, "grad_norm": 3.4223012924194336, "learning_rate": 9.873486982899837e-07, "loss": 0.1552, "num_input_tokens_seen": 1015021568, "step": 484 }, { "epoch": 0.12228946041351488, "grad_norm": 2.560487747192383, "learning_rate": 9.872601835001869e-07, "loss": 0.1192, "num_input_tokens_seen": 1017118720, "step": 485 }, { "epoch": 0.12254160363086233, "grad_norm": 2.099520683288574, "learning_rate": 9.871713645879878e-07, "loss": 0.1125, "num_input_tokens_seen": 1019215872, "step": 486 }, { "epoch": 0.12279374684820979, "grad_norm": 3.477560520172119, "learning_rate": 9.870822416151614e-07, "loss": 0.1485, "num_input_tokens_seen": 1021313024, "step": 487 }, { "epoch": 0.12304589006555723, "grad_norm": 2.9200782775878906, "learning_rate": 9.869928146436942e-07, "loss": 0.0596, "num_input_tokens_seen": 1023410176, "step": 488 }, { "epoch": 0.12329803328290469, "grad_norm": 2.3703415393829346, "learning_rate": 9.86903083735785e-07, "loss": 0.1163, "num_input_tokens_seen": 1025507328, "step": 489 }, { "epoch": 0.12355017650025214, "grad_norm": 2.2664389610290527, "learning_rate": 9.868130489538425e-07, "loss": 0.0712, "num_input_tokens_seen": 1027604480, "step": 490 }, { "epoch": 0.1238023197175996, "grad_norm": 1.798887848854065, "learning_rate": 9.867227103604877e-07, "loss": 0.0709, "num_input_tokens_seen": 1029701632, "step": 491 }, { "epoch": 0.12405446293494705, "grad_norm": 3.6567928791046143, "learning_rate": 9.86632068018553e-07, "loss": 0.1474, "num_input_tokens_seen": 1031798784, "step": 492 }, { "epoch": 0.1243066061522945, "grad_norm": 2.8362531661987305, "learning_rate": 9.865411219910815e-07, "loss": 0.1235, "num_input_tokens_seen": 1033895936, "step": 493 }, { "epoch": 0.12455874936964195, "grad_norm": 2.423952341079712, "learning_rate": 9.86449872341328e-07, "loss": 0.1048, "num_input_tokens_seen": 1035993088, "step": 494 }, { "epoch": 0.12481089258698941, "grad_norm": 2.4268240928649902, "learning_rate": 9.863583191327583e-07, "loss": 0.1063, "num_input_tokens_seen": 1038090240, "step": 495 }, { "epoch": 0.12506303580433686, "grad_norm": 2.1852941513061523, "learning_rate": 9.862664624290494e-07, "loss": 0.0932, "num_input_tokens_seen": 1040187392, "step": 496 }, { "epoch": 0.12531517902168432, "grad_norm": 3.1700496673583984, "learning_rate": 9.86174302294089e-07, "loss": 0.1174, "num_input_tokens_seen": 1042284544, "step": 497 }, { "epoch": 0.12556732223903178, "grad_norm": 3.2374541759490967, "learning_rate": 9.860818387919762e-07, "loss": 0.1251, "num_input_tokens_seen": 1044381696, "step": 498 }, { "epoch": 0.12581946545637923, "grad_norm": 2.62046217918396, "learning_rate": 9.859890719870213e-07, "loss": 0.0991, "num_input_tokens_seen": 1046478848, "step": 499 }, { "epoch": 0.1260716086737267, "grad_norm": 3.053370237350464, "learning_rate": 9.85896001943745e-07, "loss": 0.1612, "num_input_tokens_seen": 1048576000, "step": 500 } ], "logging_steps": 1.0, "max_steps": 3966, "num_input_tokens_seen": 1048576000, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.902112919650304e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }