diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.1260716086737267, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0002521432173474534, + "grad_norm": 56.41114044189453, + "learning_rate": 5.025125628140703e-09, + "loss": 1.5821, + "num_input_tokens_seen": 2097152, + "step": 1 + }, + { + "epoch": 0.0005042864346949068, + "grad_norm": 31.409353256225586, + "learning_rate": 1.0050251256281407e-08, + "loss": 1.5937, + "num_input_tokens_seen": 4194304, + "step": 2 + }, + { + "epoch": 0.0007564296520423601, + "grad_norm": 21.739652633666992, + "learning_rate": 1.5075376884422108e-08, + "loss": 1.2442, + "num_input_tokens_seen": 6291456, + "step": 3 + }, + { + "epoch": 0.0010085728693898135, + "grad_norm": 20.685302734375, + "learning_rate": 2.0100502512562813e-08, + "loss": 0.8062, + "num_input_tokens_seen": 8388608, + "step": 4 + }, + { + "epoch": 0.0012607160867372667, + "grad_norm": 22.219989776611328, + "learning_rate": 2.5125628140703518e-08, + "loss": 1.1513, + "num_input_tokens_seen": 10485760, + "step": 5 + }, + { + "epoch": 0.0015128593040847202, + "grad_norm": 28.416399002075195, + "learning_rate": 3.0150753768844216e-08, + "loss": 1.634, + "num_input_tokens_seen": 12582912, + "step": 6 + }, + { + "epoch": 0.0017650025214321734, + "grad_norm": 23.952890396118164, + "learning_rate": 3.517587939698492e-08, + "loss": 1.1944, + "num_input_tokens_seen": 14680064, + "step": 7 + }, + { + "epoch": 0.002017145738779627, + "grad_norm": 20.75243377685547, + "learning_rate": 4.0201005025125626e-08, + "loss": 0.7753, + "num_input_tokens_seen": 16777216, + "step": 8 + }, + { + "epoch": 0.0022692889561270802, + "grad_norm": 25.797378540039062, + "learning_rate": 4.522613065326633e-08, + "loss": 1.5984, + "num_input_tokens_seen": 18874368, + "step": 9 + }, + { + "epoch": 0.0025214321734745334, + "grad_norm": 25.863649368286133, + "learning_rate": 5.0251256281407036e-08, + "loss": 1.5978, + "num_input_tokens_seen": 20971520, + "step": 10 + }, + { + "epoch": 0.002773575390821987, + "grad_norm": 18.696609497070312, + "learning_rate": 5.527638190954774e-08, + "loss": 1.2323, + "num_input_tokens_seen": 23068672, + "step": 11 + }, + { + "epoch": 0.0030257186081694403, + "grad_norm": 28.740385055541992, + "learning_rate": 6.030150753768843e-08, + "loss": 1.1786, + "num_input_tokens_seen": 25165824, + "step": 12 + }, + { + "epoch": 0.0032778618255168935, + "grad_norm": 21.161056518554688, + "learning_rate": 6.532663316582915e-08, + "loss": 0.7842, + "num_input_tokens_seen": 27262976, + "step": 13 + }, + { + "epoch": 0.0035300050428643467, + "grad_norm": 25.495088577270508, + "learning_rate": 7.035175879396984e-08, + "loss": 1.9987, + "num_input_tokens_seen": 29360128, + "step": 14 + }, + { + "epoch": 0.0037821482602118004, + "grad_norm": 24.420948028564453, + "learning_rate": 7.537688442211055e-08, + "loss": 1.1424, + "num_input_tokens_seen": 31457280, + "step": 15 + }, + { + "epoch": 0.004034291477559254, + "grad_norm": 19.922271728515625, + "learning_rate": 8.040201005025125e-08, + "loss": 1.1716, + "num_input_tokens_seen": 33554432, + "step": 16 + }, + { + "epoch": 0.004286434694906707, + "grad_norm": 25.040063858032227, + "learning_rate": 8.542713567839196e-08, + "loss": 0.8189, + "num_input_tokens_seen": 35651584, + "step": 17 + }, + { + "epoch": 0.0045385779122541605, + "grad_norm": 27.888629913330078, + "learning_rate": 9.045226130653266e-08, + "loss": 1.1743, + "num_input_tokens_seen": 37748736, + "step": 18 + }, + { + "epoch": 0.004790721129601614, + "grad_norm": 21.901092529296875, + "learning_rate": 9.547738693467335e-08, + "loss": 0.7951, + "num_input_tokens_seen": 39845888, + "step": 19 + }, + { + "epoch": 0.005042864346949067, + "grad_norm": 21.351625442504883, + "learning_rate": 1.0050251256281407e-07, + "loss": 1.2271, + "num_input_tokens_seen": 41943040, + "step": 20 + }, + { + "epoch": 0.00529500756429652, + "grad_norm": 21.482006072998047, + "learning_rate": 1.0552763819095476e-07, + "loss": 1.1908, + "num_input_tokens_seen": 44040192, + "step": 21 + }, + { + "epoch": 0.005547150781643974, + "grad_norm": 21.15386390686035, + "learning_rate": 1.1055276381909548e-07, + "loss": 1.2297, + "num_input_tokens_seen": 46137344, + "step": 22 + }, + { + "epoch": 0.005799293998991427, + "grad_norm": 23.156387329101562, + "learning_rate": 1.1557788944723617e-07, + "loss": 1.1766, + "num_input_tokens_seen": 48234496, + "step": 23 + }, + { + "epoch": 0.006051437216338881, + "grad_norm": 38.258697509765625, + "learning_rate": 1.2060301507537687e-07, + "loss": 1.4932, + "num_input_tokens_seen": 50331648, + "step": 24 + }, + { + "epoch": 0.006303580433686334, + "grad_norm": 20.798620223999023, + "learning_rate": 1.2562814070351758e-07, + "loss": 1.2943, + "num_input_tokens_seen": 52428800, + "step": 25 + }, + { + "epoch": 0.006555723651033787, + "grad_norm": 24.994922637939453, + "learning_rate": 1.306532663316583e-07, + "loss": 1.0768, + "num_input_tokens_seen": 54525952, + "step": 26 + }, + { + "epoch": 0.00680786686838124, + "grad_norm": 33.116146087646484, + "learning_rate": 1.35678391959799e-07, + "loss": 1.1369, + "num_input_tokens_seen": 56623104, + "step": 27 + }, + { + "epoch": 0.0070600100857286935, + "grad_norm": 39.03334426879883, + "learning_rate": 1.4070351758793969e-07, + "loss": 1.5513, + "num_input_tokens_seen": 58720256, + "step": 28 + }, + { + "epoch": 0.007312153303076148, + "grad_norm": 25.035110473632812, + "learning_rate": 1.4572864321608038e-07, + "loss": 1.2028, + "num_input_tokens_seen": 60817408, + "step": 29 + }, + { + "epoch": 0.007564296520423601, + "grad_norm": 21.068431854248047, + "learning_rate": 1.507537688442211e-07, + "loss": 1.1555, + "num_input_tokens_seen": 62914560, + "step": 30 + }, + { + "epoch": 0.007816439737771054, + "grad_norm": 35.82476043701172, + "learning_rate": 1.5577889447236181e-07, + "loss": 1.1723, + "num_input_tokens_seen": 65011712, + "step": 31 + }, + { + "epoch": 0.008068582955118508, + "grad_norm": 27.961219787597656, + "learning_rate": 1.608040201005025e-07, + "loss": 0.7226, + "num_input_tokens_seen": 67108864, + "step": 32 + }, + { + "epoch": 0.00832072617246596, + "grad_norm": 21.109777450561523, + "learning_rate": 1.658291457286432e-07, + "loss": 1.0722, + "num_input_tokens_seen": 69206016, + "step": 33 + }, + { + "epoch": 0.008572869389813415, + "grad_norm": 43.04289627075195, + "learning_rate": 1.7085427135678392e-07, + "loss": 1.1128, + "num_input_tokens_seen": 71303168, + "step": 34 + }, + { + "epoch": 0.008825012607160867, + "grad_norm": 26.515880584716797, + "learning_rate": 1.7587939698492463e-07, + "loss": 1.1254, + "num_input_tokens_seen": 73400320, + "step": 35 + }, + { + "epoch": 0.009077155824508321, + "grad_norm": 21.351062774658203, + "learning_rate": 1.8090452261306533e-07, + "loss": 0.7675, + "num_input_tokens_seen": 75497472, + "step": 36 + }, + { + "epoch": 0.009329299041855773, + "grad_norm": 23.136459350585938, + "learning_rate": 1.8592964824120602e-07, + "loss": 1.1374, + "num_input_tokens_seen": 77594624, + "step": 37 + }, + { + "epoch": 0.009581442259203227, + "grad_norm": 17.877473831176758, + "learning_rate": 1.909547738693467e-07, + "loss": 1.1101, + "num_input_tokens_seen": 79691776, + "step": 38 + }, + { + "epoch": 0.009833585476550681, + "grad_norm": 33.78788375854492, + "learning_rate": 1.9597989949748743e-07, + "loss": 1.0273, + "num_input_tokens_seen": 81788928, + "step": 39 + }, + { + "epoch": 0.010085728693898134, + "grad_norm": 32.83673858642578, + "learning_rate": 2.0100502512562815e-07, + "loss": 1.1025, + "num_input_tokens_seen": 83886080, + "step": 40 + }, + { + "epoch": 0.010337871911245588, + "grad_norm": 26.676027297973633, + "learning_rate": 2.0603015075376884e-07, + "loss": 1.8515, + "num_input_tokens_seen": 85983232, + "step": 41 + }, + { + "epoch": 0.01059001512859304, + "grad_norm": 26.88898468017578, + "learning_rate": 2.1105527638190953e-07, + "loss": 1.3322, + "num_input_tokens_seen": 88080384, + "step": 42 + }, + { + "epoch": 0.010842158345940494, + "grad_norm": 24.28297233581543, + "learning_rate": 2.1608040201005022e-07, + "loss": 0.9043, + "num_input_tokens_seen": 90177536, + "step": 43 + }, + { + "epoch": 0.011094301563287948, + "grad_norm": 15.659173011779785, + "learning_rate": 2.2110552763819096e-07, + "loss": 0.9169, + "num_input_tokens_seen": 92274688, + "step": 44 + }, + { + "epoch": 0.0113464447806354, + "grad_norm": 15.930516242980957, + "learning_rate": 2.2613065326633166e-07, + "loss": 0.9613, + "num_input_tokens_seen": 94371840, + "step": 45 + }, + { + "epoch": 0.011598587997982855, + "grad_norm": 14.883039474487305, + "learning_rate": 2.3115577889447235e-07, + "loss": 0.882, + "num_input_tokens_seen": 96468992, + "step": 46 + }, + { + "epoch": 0.011850731215330307, + "grad_norm": 25.84305191040039, + "learning_rate": 2.3618090452261304e-07, + "loss": 1.1471, + "num_input_tokens_seen": 98566144, + "step": 47 + }, + { + "epoch": 0.012102874432677761, + "grad_norm": 21.669544219970703, + "learning_rate": 2.4120603015075373e-07, + "loss": 0.9125, + "num_input_tokens_seen": 100663296, + "step": 48 + }, + { + "epoch": 0.012355017650025214, + "grad_norm": 15.483664512634277, + "learning_rate": 2.4623115577889445e-07, + "loss": 0.8492, + "num_input_tokens_seen": 102760448, + "step": 49 + }, + { + "epoch": 0.012607160867372668, + "grad_norm": 18.560636520385742, + "learning_rate": 2.5125628140703517e-07, + "loss": 0.9035, + "num_input_tokens_seen": 104857600, + "step": 50 + }, + { + "epoch": 0.012859304084720122, + "grad_norm": 14.719083786010742, + "learning_rate": 2.562814070351759e-07, + "loss": 0.8161, + "num_input_tokens_seen": 106954752, + "step": 51 + }, + { + "epoch": 0.013111447302067574, + "grad_norm": 21.655672073364258, + "learning_rate": 2.613065326633166e-07, + "loss": 0.572, + "num_input_tokens_seen": 109051904, + "step": 52 + }, + { + "epoch": 0.013363590519415028, + "grad_norm": 11.465034484863281, + "learning_rate": 2.6633165829145727e-07, + "loss": 0.807, + "num_input_tokens_seen": 111149056, + "step": 53 + }, + { + "epoch": 0.01361573373676248, + "grad_norm": 17.689987182617188, + "learning_rate": 2.71356783919598e-07, + "loss": 1.4423, + "num_input_tokens_seen": 113246208, + "step": 54 + }, + { + "epoch": 0.013867876954109935, + "grad_norm": 14.684429168701172, + "learning_rate": 2.7638190954773865e-07, + "loss": 0.8659, + "num_input_tokens_seen": 115343360, + "step": 55 + }, + { + "epoch": 0.014120020171457387, + "grad_norm": 12.435643196105957, + "learning_rate": 2.8140703517587937e-07, + "loss": 0.7607, + "num_input_tokens_seen": 117440512, + "step": 56 + }, + { + "epoch": 0.014372163388804841, + "grad_norm": 17.700153350830078, + "learning_rate": 2.864321608040201e-07, + "loss": 0.8607, + "num_input_tokens_seen": 119537664, + "step": 57 + }, + { + "epoch": 0.014624306606152295, + "grad_norm": 13.79918384552002, + "learning_rate": 2.9145728643216075e-07, + "loss": 0.7589, + "num_input_tokens_seen": 121634816, + "step": 58 + }, + { + "epoch": 0.014876449823499747, + "grad_norm": 15.207538604736328, + "learning_rate": 2.964824120603015e-07, + "loss": 0.4787, + "num_input_tokens_seen": 123731968, + "step": 59 + }, + { + "epoch": 0.015128593040847202, + "grad_norm": 10.523366928100586, + "learning_rate": 3.015075376884422e-07, + "loss": 0.6908, + "num_input_tokens_seen": 125829120, + "step": 60 + }, + { + "epoch": 0.015380736258194654, + "grad_norm": 8.412284851074219, + "learning_rate": 3.065326633165829e-07, + "loss": 0.6561, + "num_input_tokens_seen": 127926272, + "step": 61 + }, + { + "epoch": 0.015632879475542108, + "grad_norm": 9.98276138305664, + "learning_rate": 3.1155778894472363e-07, + "loss": 0.7216, + "num_input_tokens_seen": 130023424, + "step": 62 + }, + { + "epoch": 0.01588502269288956, + "grad_norm": 11.017064094543457, + "learning_rate": 3.165829145728643e-07, + "loss": 0.6223, + "num_input_tokens_seen": 132120576, + "step": 63 + }, + { + "epoch": 0.016137165910237016, + "grad_norm": 15.129839897155762, + "learning_rate": 3.21608040201005e-07, + "loss": 1.0373, + "num_input_tokens_seen": 134217728, + "step": 64 + }, + { + "epoch": 0.01638930912758447, + "grad_norm": 8.578692436218262, + "learning_rate": 3.2663316582914573e-07, + "loss": 0.5687, + "num_input_tokens_seen": 136314880, + "step": 65 + }, + { + "epoch": 0.01664145234493192, + "grad_norm": 13.31927490234375, + "learning_rate": 3.316582914572864e-07, + "loss": 1.0766, + "num_input_tokens_seen": 138412032, + "step": 66 + }, + { + "epoch": 0.016893595562279373, + "grad_norm": 8.775867462158203, + "learning_rate": 3.366834170854271e-07, + "loss": 0.5324, + "num_input_tokens_seen": 140509184, + "step": 67 + }, + { + "epoch": 0.01714573877962683, + "grad_norm": 12.085953712463379, + "learning_rate": 3.4170854271356783e-07, + "loss": 0.8601, + "num_input_tokens_seen": 142606336, + "step": 68 + }, + { + "epoch": 0.01739788199697428, + "grad_norm": 12.76360034942627, + "learning_rate": 3.4673366834170855e-07, + "loss": 0.5595, + "num_input_tokens_seen": 144703488, + "step": 69 + }, + { + "epoch": 0.017650025214321734, + "grad_norm": 10.255838394165039, + "learning_rate": 3.5175879396984927e-07, + "loss": 0.3496, + "num_input_tokens_seen": 146800640, + "step": 70 + }, + { + "epoch": 0.01790216843166919, + "grad_norm": 9.94809341430664, + "learning_rate": 3.5678391959798993e-07, + "loss": 0.5976, + "num_input_tokens_seen": 148897792, + "step": 71 + }, + { + "epoch": 0.018154311649016642, + "grad_norm": 7.37994384765625, + "learning_rate": 3.6180904522613065e-07, + "loss": 0.5241, + "num_input_tokens_seen": 150994944, + "step": 72 + }, + { + "epoch": 0.018406454866364094, + "grad_norm": 8.874433517456055, + "learning_rate": 3.668341708542713e-07, + "loss": 0.5629, + "num_input_tokens_seen": 153092096, + "step": 73 + }, + { + "epoch": 0.018658598083711547, + "grad_norm": 16.685457229614258, + "learning_rate": 3.7185929648241203e-07, + "loss": 0.3801, + "num_input_tokens_seen": 155189248, + "step": 74 + }, + { + "epoch": 0.018910741301059002, + "grad_norm": 11.288415908813477, + "learning_rate": 3.7688442211055275e-07, + "loss": 0.6093, + "num_input_tokens_seen": 157286400, + "step": 75 + }, + { + "epoch": 0.019162884518406455, + "grad_norm": 10.51889419555664, + "learning_rate": 3.819095477386934e-07, + "loss": 0.5053, + "num_input_tokens_seen": 159383552, + "step": 76 + }, + { + "epoch": 0.019415027735753907, + "grad_norm": 10.236724853515625, + "learning_rate": 3.869346733668342e-07, + "loss": 0.7537, + "num_input_tokens_seen": 161480704, + "step": 77 + }, + { + "epoch": 0.019667170953101363, + "grad_norm": 9.370979309082031, + "learning_rate": 3.9195979899497485e-07, + "loss": 0.5814, + "num_input_tokens_seen": 163577856, + "step": 78 + }, + { + "epoch": 0.019919314170448815, + "grad_norm": 12.056835174560547, + "learning_rate": 3.9698492462311557e-07, + "loss": 0.5178, + "num_input_tokens_seen": 165675008, + "step": 79 + }, + { + "epoch": 0.020171457387796268, + "grad_norm": 8.761493682861328, + "learning_rate": 4.020100502512563e-07, + "loss": 0.4851, + "num_input_tokens_seen": 167772160, + "step": 80 + }, + { + "epoch": 0.02042360060514372, + "grad_norm": 9.159887313842773, + "learning_rate": 4.0703517587939696e-07, + "loss": 0.4531, + "num_input_tokens_seen": 169869312, + "step": 81 + }, + { + "epoch": 0.020675743822491176, + "grad_norm": 9.923644065856934, + "learning_rate": 4.120603015075377e-07, + "loss": 0.5835, + "num_input_tokens_seen": 171966464, + "step": 82 + }, + { + "epoch": 0.020927887039838628, + "grad_norm": 8.762866973876953, + "learning_rate": 4.1708542713567834e-07, + "loss": 0.4772, + "num_input_tokens_seen": 174063616, + "step": 83 + }, + { + "epoch": 0.02118003025718608, + "grad_norm": 10.09272289276123, + "learning_rate": 4.2211055276381906e-07, + "loss": 0.7305, + "num_input_tokens_seen": 176160768, + "step": 84 + }, + { + "epoch": 0.021432173474533536, + "grad_norm": 8.009614944458008, + "learning_rate": 4.271356783919598e-07, + "loss": 0.4629, + "num_input_tokens_seen": 178257920, + "step": 85 + }, + { + "epoch": 0.02168431669188099, + "grad_norm": 8.284019470214844, + "learning_rate": 4.3216080402010044e-07, + "loss": 0.4368, + "num_input_tokens_seen": 180355072, + "step": 86 + }, + { + "epoch": 0.02193645990922844, + "grad_norm": 6.427061557769775, + "learning_rate": 4.371859296482412e-07, + "loss": 0.43, + "num_input_tokens_seen": 182452224, + "step": 87 + }, + { + "epoch": 0.022188603126575897, + "grad_norm": 12.255255699157715, + "learning_rate": 4.4221105527638193e-07, + "loss": 0.5879, + "num_input_tokens_seen": 184549376, + "step": 88 + }, + { + "epoch": 0.02244074634392335, + "grad_norm": 6.626727104187012, + "learning_rate": 4.472361809045226e-07, + "loss": 0.3916, + "num_input_tokens_seen": 186646528, + "step": 89 + }, + { + "epoch": 0.0226928895612708, + "grad_norm": 8.53348445892334, + "learning_rate": 4.522613065326633e-07, + "loss": 0.4768, + "num_input_tokens_seen": 188743680, + "step": 90 + }, + { + "epoch": 0.022945032778618254, + "grad_norm": 6.995331287384033, + "learning_rate": 4.57286432160804e-07, + "loss": 0.3988, + "num_input_tokens_seen": 190840832, + "step": 91 + }, + { + "epoch": 0.02319717599596571, + "grad_norm": 8.352548599243164, + "learning_rate": 4.623115577889447e-07, + "loss": 0.3706, + "num_input_tokens_seen": 192937984, + "step": 92 + }, + { + "epoch": 0.023449319213313162, + "grad_norm": 6.609560489654541, + "learning_rate": 4.673366834170854e-07, + "loss": 0.2459, + "num_input_tokens_seen": 195035136, + "step": 93 + }, + { + "epoch": 0.023701462430660614, + "grad_norm": 9.539324760437012, + "learning_rate": 4.723618090452261e-07, + "loss": 0.3865, + "num_input_tokens_seen": 197132288, + "step": 94 + }, + { + "epoch": 0.02395360564800807, + "grad_norm": 9.831944465637207, + "learning_rate": 4.773869346733669e-07, + "loss": 0.4022, + "num_input_tokens_seen": 199229440, + "step": 95 + }, + { + "epoch": 0.024205748865355523, + "grad_norm": 9.292588233947754, + "learning_rate": 4.824120603015075e-07, + "loss": 0.3543, + "num_input_tokens_seen": 201326592, + "step": 96 + }, + { + "epoch": 0.024457892082702975, + "grad_norm": 9.192462921142578, + "learning_rate": 4.874371859296482e-07, + "loss": 0.4336, + "num_input_tokens_seen": 203423744, + "step": 97 + }, + { + "epoch": 0.024710035300050427, + "grad_norm": 8.302521705627441, + "learning_rate": 4.924623115577889e-07, + "loss": 0.534, + "num_input_tokens_seen": 205520896, + "step": 98 + }, + { + "epoch": 0.024962178517397883, + "grad_norm": 9.702790260314941, + "learning_rate": 4.974874371859296e-07, + "loss": 0.5899, + "num_input_tokens_seen": 207618048, + "step": 99 + }, + { + "epoch": 0.025214321734745335, + "grad_norm": 7.346845626831055, + "learning_rate": 5.025125628140703e-07, + "loss": 0.3439, + "num_input_tokens_seen": 209715200, + "step": 100 + }, + { + "epoch": 0.025466464952092788, + "grad_norm": 6.6140265464782715, + "learning_rate": 5.075376884422111e-07, + "loss": 0.3779, + "num_input_tokens_seen": 211812352, + "step": 101 + }, + { + "epoch": 0.025718608169440244, + "grad_norm": 6.8121209144592285, + "learning_rate": 5.125628140703518e-07, + "loss": 0.403, + "num_input_tokens_seen": 213909504, + "step": 102 + }, + { + "epoch": 0.025970751386787696, + "grad_norm": 6.07421875, + "learning_rate": 5.175879396984925e-07, + "loss": 0.3473, + "num_input_tokens_seen": 216006656, + "step": 103 + }, + { + "epoch": 0.026222894604135148, + "grad_norm": 6.86598539352417, + "learning_rate": 5.226130653266332e-07, + "loss": 0.3054, + "num_input_tokens_seen": 218103808, + "step": 104 + }, + { + "epoch": 0.0264750378214826, + "grad_norm": 7.970452308654785, + "learning_rate": 5.276381909547738e-07, + "loss": 0.3693, + "num_input_tokens_seen": 220200960, + "step": 105 + }, + { + "epoch": 0.026727181038830056, + "grad_norm": 7.2236552238464355, + "learning_rate": 5.326633165829145e-07, + "loss": 0.2194, + "num_input_tokens_seen": 222298112, + "step": 106 + }, + { + "epoch": 0.02697932425617751, + "grad_norm": 5.257369518280029, + "learning_rate": 5.376884422110553e-07, + "loss": 0.2962, + "num_input_tokens_seen": 224395264, + "step": 107 + }, + { + "epoch": 0.02723146747352496, + "grad_norm": 6.920422077178955, + "learning_rate": 5.42713567839196e-07, + "loss": 0.3699, + "num_input_tokens_seen": 226492416, + "step": 108 + }, + { + "epoch": 0.027483610690872417, + "grad_norm": 9.312458992004395, + "learning_rate": 5.477386934673367e-07, + "loss": 0.3812, + "num_input_tokens_seen": 228589568, + "step": 109 + }, + { + "epoch": 0.02773575390821987, + "grad_norm": 9.935240745544434, + "learning_rate": 5.527638190954773e-07, + "loss": 0.4443, + "num_input_tokens_seen": 230686720, + "step": 110 + }, + { + "epoch": 0.02798789712556732, + "grad_norm": 5.373161315917969, + "learning_rate": 5.57788944723618e-07, + "loss": 0.264, + "num_input_tokens_seen": 232783872, + "step": 111 + }, + { + "epoch": 0.028240040342914774, + "grad_norm": 6.769862651824951, + "learning_rate": 5.628140703517587e-07, + "loss": 0.1686, + "num_input_tokens_seen": 234881024, + "step": 112 + }, + { + "epoch": 0.02849218356026223, + "grad_norm": 5.726578712463379, + "learning_rate": 5.678391959798995e-07, + "loss": 0.3396, + "num_input_tokens_seen": 236978176, + "step": 113 + }, + { + "epoch": 0.028744326777609682, + "grad_norm": 5.439636707305908, + "learning_rate": 5.728643216080402e-07, + "loss": 0.2733, + "num_input_tokens_seen": 239075328, + "step": 114 + }, + { + "epoch": 0.028996469994957134, + "grad_norm": 5.622605323791504, + "learning_rate": 5.778894472361808e-07, + "loss": 0.2998, + "num_input_tokens_seen": 241172480, + "step": 115 + }, + { + "epoch": 0.02924861321230459, + "grad_norm": 6.728963851928711, + "learning_rate": 5.829145728643215e-07, + "loss": 0.2549, + "num_input_tokens_seen": 243269632, + "step": 116 + }, + { + "epoch": 0.029500756429652043, + "grad_norm": 5.0983781814575195, + "learning_rate": 5.879396984924622e-07, + "loss": 0.2705, + "num_input_tokens_seen": 245366784, + "step": 117 + }, + { + "epoch": 0.029752899646999495, + "grad_norm": 7.3646721839904785, + "learning_rate": 5.92964824120603e-07, + "loss": 0.3242, + "num_input_tokens_seen": 247463936, + "step": 118 + }, + { + "epoch": 0.03000504286434695, + "grad_norm": 7.918598651885986, + "learning_rate": 5.979899497487438e-07, + "loss": 0.371, + "num_input_tokens_seen": 249561088, + "step": 119 + }, + { + "epoch": 0.030257186081694403, + "grad_norm": 7.411210536956787, + "learning_rate": 6.030150753768844e-07, + "loss": 0.2728, + "num_input_tokens_seen": 251658240, + "step": 120 + }, + { + "epoch": 0.030509329299041855, + "grad_norm": 5.8603129386901855, + "learning_rate": 6.080402010050251e-07, + "loss": 0.1854, + "num_input_tokens_seen": 253755392, + "step": 121 + }, + { + "epoch": 0.030761472516389308, + "grad_norm": 5.476680278778076, + "learning_rate": 6.130653266331658e-07, + "loss": 0.1831, + "num_input_tokens_seen": 255852544, + "step": 122 + }, + { + "epoch": 0.031013615733736764, + "grad_norm": 6.4667158126831055, + "learning_rate": 6.180904522613065e-07, + "loss": 0.1721, + "num_input_tokens_seen": 257949696, + "step": 123 + }, + { + "epoch": 0.031265758951084216, + "grad_norm": 5.928079605102539, + "learning_rate": 6.231155778894473e-07, + "loss": 0.2728, + "num_input_tokens_seen": 260046848, + "step": 124 + }, + { + "epoch": 0.03151790216843167, + "grad_norm": 7.0044755935668945, + "learning_rate": 6.28140703517588e-07, + "loss": 0.4037, + "num_input_tokens_seen": 262144000, + "step": 125 + }, + { + "epoch": 0.03177004538577912, + "grad_norm": 8.558830261230469, + "learning_rate": 6.331658291457286e-07, + "loss": 0.5263, + "num_input_tokens_seen": 264241152, + "step": 126 + }, + { + "epoch": 0.032022188603126577, + "grad_norm": 5.0764055252075195, + "learning_rate": 6.381909547738693e-07, + "loss": 0.2054, + "num_input_tokens_seen": 266338304, + "step": 127 + }, + { + "epoch": 0.03227433182047403, + "grad_norm": 5.459807872772217, + "learning_rate": 6.4321608040201e-07, + "loss": 0.2122, + "num_input_tokens_seen": 268435456, + "step": 128 + }, + { + "epoch": 0.03252647503782148, + "grad_norm": 5.658675670623779, + "learning_rate": 6.482412060301507e-07, + "loss": 0.2226, + "num_input_tokens_seen": 270532608, + "step": 129 + }, + { + "epoch": 0.03277861825516894, + "grad_norm": 5.613616466522217, + "learning_rate": 6.532663316582915e-07, + "loss": 0.2701, + "num_input_tokens_seen": 272629760, + "step": 130 + }, + { + "epoch": 0.033030761472516386, + "grad_norm": 9.082258224487305, + "learning_rate": 6.582914572864321e-07, + "loss": 0.3726, + "num_input_tokens_seen": 274726912, + "step": 131 + }, + { + "epoch": 0.03328290468986384, + "grad_norm": 4.047947406768799, + "learning_rate": 6.633165829145728e-07, + "loss": 0.1323, + "num_input_tokens_seen": 276824064, + "step": 132 + }, + { + "epoch": 0.0335350479072113, + "grad_norm": 5.141188144683838, + "learning_rate": 6.683417085427135e-07, + "loss": 0.2615, + "num_input_tokens_seen": 278921216, + "step": 133 + }, + { + "epoch": 0.033787191124558746, + "grad_norm": 4.637810707092285, + "learning_rate": 6.733668341708542e-07, + "loss": 0.2252, + "num_input_tokens_seen": 281018368, + "step": 134 + }, + { + "epoch": 0.0340393343419062, + "grad_norm": 5.142843723297119, + "learning_rate": 6.783919597989949e-07, + "loss": 0.1817, + "num_input_tokens_seen": 283115520, + "step": 135 + }, + { + "epoch": 0.03429147755925366, + "grad_norm": 7.557190418243408, + "learning_rate": 6.834170854271357e-07, + "loss": 0.2897, + "num_input_tokens_seen": 285212672, + "step": 136 + }, + { + "epoch": 0.03454362077660111, + "grad_norm": 6.585993766784668, + "learning_rate": 6.884422110552764e-07, + "loss": 0.227, + "num_input_tokens_seen": 287309824, + "step": 137 + }, + { + "epoch": 0.03479576399394856, + "grad_norm": 4.926968574523926, + "learning_rate": 6.934673366834171e-07, + "loss": 0.1573, + "num_input_tokens_seen": 289406976, + "step": 138 + }, + { + "epoch": 0.03504790721129602, + "grad_norm": 6.03431510925293, + "learning_rate": 6.984924623115578e-07, + "loss": 0.2187, + "num_input_tokens_seen": 291504128, + "step": 139 + }, + { + "epoch": 0.03530005042864347, + "grad_norm": 9.677518844604492, + "learning_rate": 7.035175879396985e-07, + "loss": 0.2295, + "num_input_tokens_seen": 293601280, + "step": 140 + }, + { + "epoch": 0.03555219364599092, + "grad_norm": 6.820138931274414, + "learning_rate": 7.085427135678391e-07, + "loss": 0.1944, + "num_input_tokens_seen": 295698432, + "step": 141 + }, + { + "epoch": 0.03580433686333838, + "grad_norm": 5.568108081817627, + "learning_rate": 7.135678391959799e-07, + "loss": 0.3113, + "num_input_tokens_seen": 297795584, + "step": 142 + }, + { + "epoch": 0.03605648008068583, + "grad_norm": 6.417880058288574, + "learning_rate": 7.185929648241206e-07, + "loss": 0.2932, + "num_input_tokens_seen": 299892736, + "step": 143 + }, + { + "epoch": 0.036308623298033284, + "grad_norm": 5.040261745452881, + "learning_rate": 7.236180904522613e-07, + "loss": 0.2076, + "num_input_tokens_seen": 301989888, + "step": 144 + }, + { + "epoch": 0.03656076651538074, + "grad_norm": 6.350996494293213, + "learning_rate": 7.28643216080402e-07, + "loss": 0.1714, + "num_input_tokens_seen": 304087040, + "step": 145 + }, + { + "epoch": 0.03681290973272819, + "grad_norm": 5.744927406311035, + "learning_rate": 7.336683417085426e-07, + "loss": 0.1948, + "num_input_tokens_seen": 306184192, + "step": 146 + }, + { + "epoch": 0.037065052950075644, + "grad_norm": 5.379306793212891, + "learning_rate": 7.386934673366834e-07, + "loss": 0.1971, + "num_input_tokens_seen": 308281344, + "step": 147 + }, + { + "epoch": 0.03731719616742309, + "grad_norm": 4.08986234664917, + "learning_rate": 7.437185929648241e-07, + "loss": 0.1319, + "num_input_tokens_seen": 310378496, + "step": 148 + }, + { + "epoch": 0.03756933938477055, + "grad_norm": 8.005187034606934, + "learning_rate": 7.487437185929648e-07, + "loss": 0.3227, + "num_input_tokens_seen": 312475648, + "step": 149 + }, + { + "epoch": 0.037821482602118005, + "grad_norm": 6.485504627227783, + "learning_rate": 7.537688442211055e-07, + "loss": 0.4005, + "num_input_tokens_seen": 314572800, + "step": 150 + }, + { + "epoch": 0.038073625819465454, + "grad_norm": 7.763909339904785, + "learning_rate": 7.587939698492461e-07, + "loss": 0.3537, + "num_input_tokens_seen": 316669952, + "step": 151 + }, + { + "epoch": 0.03832576903681291, + "grad_norm": 5.093461036682129, + "learning_rate": 7.638190954773868e-07, + "loss": 0.1321, + "num_input_tokens_seen": 318767104, + "step": 152 + }, + { + "epoch": 0.038577912254160365, + "grad_norm": 4.274379730224609, + "learning_rate": 7.688442211055276e-07, + "loss": 0.1623, + "num_input_tokens_seen": 320864256, + "step": 153 + }, + { + "epoch": 0.038830055471507814, + "grad_norm": 5.359605312347412, + "learning_rate": 7.738693467336684e-07, + "loss": 0.2337, + "num_input_tokens_seen": 322961408, + "step": 154 + }, + { + "epoch": 0.03908219868885527, + "grad_norm": 5.039738655090332, + "learning_rate": 7.788944723618091e-07, + "loss": 0.2028, + "num_input_tokens_seen": 325058560, + "step": 155 + }, + { + "epoch": 0.039334341906202726, + "grad_norm": 5.888302326202393, + "learning_rate": 7.839195979899497e-07, + "loss": 0.1418, + "num_input_tokens_seen": 327155712, + "step": 156 + }, + { + "epoch": 0.039586485123550175, + "grad_norm": 5.222049236297607, + "learning_rate": 7.889447236180904e-07, + "loss": 0.1474, + "num_input_tokens_seen": 329252864, + "step": 157 + }, + { + "epoch": 0.03983862834089763, + "grad_norm": 5.662126064300537, + "learning_rate": 7.939698492462311e-07, + "loss": 0.2008, + "num_input_tokens_seen": 331350016, + "step": 158 + }, + { + "epoch": 0.040090771558245086, + "grad_norm": 4.854446887969971, + "learning_rate": 7.989949748743719e-07, + "loss": 0.1373, + "num_input_tokens_seen": 333447168, + "step": 159 + }, + { + "epoch": 0.040342914775592535, + "grad_norm": 5.8150177001953125, + "learning_rate": 8.040201005025126e-07, + "loss": 0.2512, + "num_input_tokens_seen": 335544320, + "step": 160 + }, + { + "epoch": 0.04059505799293999, + "grad_norm": 5.4808526039123535, + "learning_rate": 8.090452261306532e-07, + "loss": 0.1379, + "num_input_tokens_seen": 337641472, + "step": 161 + }, + { + "epoch": 0.04084720121028744, + "grad_norm": 5.683319091796875, + "learning_rate": 8.140703517587939e-07, + "loss": 0.2061, + "num_input_tokens_seen": 339738624, + "step": 162 + }, + { + "epoch": 0.041099344427634896, + "grad_norm": 5.919990062713623, + "learning_rate": 8.190954773869346e-07, + "loss": 0.2115, + "num_input_tokens_seen": 341835776, + "step": 163 + }, + { + "epoch": 0.04135148764498235, + "grad_norm": 4.193869113922119, + "learning_rate": 8.241206030150753e-07, + "loss": 0.1766, + "num_input_tokens_seen": 343932928, + "step": 164 + }, + { + "epoch": 0.0416036308623298, + "grad_norm": 4.4601945877075195, + "learning_rate": 8.291457286432161e-07, + "loss": 0.1939, + "num_input_tokens_seen": 346030080, + "step": 165 + }, + { + "epoch": 0.041855774079677256, + "grad_norm": 5.21290922164917, + "learning_rate": 8.341708542713567e-07, + "loss": 0.1787, + "num_input_tokens_seen": 348127232, + "step": 166 + }, + { + "epoch": 0.04210791729702471, + "grad_norm": 5.489988327026367, + "learning_rate": 8.391959798994974e-07, + "loss": 0.1809, + "num_input_tokens_seen": 350224384, + "step": 167 + }, + { + "epoch": 0.04236006051437216, + "grad_norm": 4.026052474975586, + "learning_rate": 8.442211055276381e-07, + "loss": 0.1248, + "num_input_tokens_seen": 352321536, + "step": 168 + }, + { + "epoch": 0.04261220373171962, + "grad_norm": 4.203098297119141, + "learning_rate": 8.492462311557788e-07, + "loss": 0.1089, + "num_input_tokens_seen": 354418688, + "step": 169 + }, + { + "epoch": 0.04286434694906707, + "grad_norm": 6.0608296394348145, + "learning_rate": 8.542713567839196e-07, + "loss": 0.185, + "num_input_tokens_seen": 356515840, + "step": 170 + }, + { + "epoch": 0.04311649016641452, + "grad_norm": 5.297198295593262, + "learning_rate": 8.592964824120602e-07, + "loss": 0.119, + "num_input_tokens_seen": 358612992, + "step": 171 + }, + { + "epoch": 0.04336863338376198, + "grad_norm": 4.82717227935791, + "learning_rate": 8.643216080402009e-07, + "loss": 0.1275, + "num_input_tokens_seen": 360710144, + "step": 172 + }, + { + "epoch": 0.04362077660110943, + "grad_norm": 7.091985702514648, + "learning_rate": 8.693467336683417e-07, + "loss": 0.3237, + "num_input_tokens_seen": 362807296, + "step": 173 + }, + { + "epoch": 0.04387291981845688, + "grad_norm": 4.359028339385986, + "learning_rate": 8.743718592964824e-07, + "loss": 0.1272, + "num_input_tokens_seen": 364904448, + "step": 174 + }, + { + "epoch": 0.04412506303580434, + "grad_norm": 4.864053726196289, + "learning_rate": 8.793969849246231e-07, + "loss": 0.2115, + "num_input_tokens_seen": 367001600, + "step": 175 + }, + { + "epoch": 0.044377206253151794, + "grad_norm": 4.585638523101807, + "learning_rate": 8.844221105527639e-07, + "loss": 0.1753, + "num_input_tokens_seen": 369098752, + "step": 176 + }, + { + "epoch": 0.04462934947049924, + "grad_norm": 6.2548933029174805, + "learning_rate": 8.894472361809045e-07, + "loss": 0.2436, + "num_input_tokens_seen": 371195904, + "step": 177 + }, + { + "epoch": 0.0448814926878467, + "grad_norm": 4.619575023651123, + "learning_rate": 8.944723618090452e-07, + "loss": 0.2271, + "num_input_tokens_seen": 373293056, + "step": 178 + }, + { + "epoch": 0.04513363590519415, + "grad_norm": 4.505560398101807, + "learning_rate": 8.994974874371859e-07, + "loss": 0.1728, + "num_input_tokens_seen": 375390208, + "step": 179 + }, + { + "epoch": 0.0453857791225416, + "grad_norm": 4.657378196716309, + "learning_rate": 9.045226130653266e-07, + "loss": 0.2134, + "num_input_tokens_seen": 377487360, + "step": 180 + }, + { + "epoch": 0.04563792233988906, + "grad_norm": 3.5373897552490234, + "learning_rate": 9.095477386934673e-07, + "loss": 0.125, + "num_input_tokens_seen": 379584512, + "step": 181 + }, + { + "epoch": 0.04589006555723651, + "grad_norm": 4.476269721984863, + "learning_rate": 9.14572864321608e-07, + "loss": 0.1805, + "num_input_tokens_seen": 381681664, + "step": 182 + }, + { + "epoch": 0.046142208774583963, + "grad_norm": 4.5421881675720215, + "learning_rate": 9.195979899497487e-07, + "loss": 0.1296, + "num_input_tokens_seen": 383778816, + "step": 183 + }, + { + "epoch": 0.04639435199193142, + "grad_norm": 4.141582012176514, + "learning_rate": 9.246231155778894e-07, + "loss": 0.194, + "num_input_tokens_seen": 385875968, + "step": 184 + }, + { + "epoch": 0.04664649520927887, + "grad_norm": 6.524399757385254, + "learning_rate": 9.296482412060301e-07, + "loss": 0.1595, + "num_input_tokens_seen": 387973120, + "step": 185 + }, + { + "epoch": 0.046898638426626324, + "grad_norm": 4.473093509674072, + "learning_rate": 9.346733668341708e-07, + "loss": 0.1909, + "num_input_tokens_seen": 390070272, + "step": 186 + }, + { + "epoch": 0.04715078164397378, + "grad_norm": 5.006099224090576, + "learning_rate": 9.396984924623114e-07, + "loss": 0.215, + "num_input_tokens_seen": 392167424, + "step": 187 + }, + { + "epoch": 0.04740292486132123, + "grad_norm": 4.727731227874756, + "learning_rate": 9.447236180904522e-07, + "loss": 0.1874, + "num_input_tokens_seen": 394264576, + "step": 188 + }, + { + "epoch": 0.047655068078668684, + "grad_norm": 4.6576828956604, + "learning_rate": 9.497487437185929e-07, + "loss": 0.1889, + "num_input_tokens_seen": 396361728, + "step": 189 + }, + { + "epoch": 0.04790721129601614, + "grad_norm": 4.223318099975586, + "learning_rate": 9.547738693467337e-07, + "loss": 0.1432, + "num_input_tokens_seen": 398458880, + "step": 190 + }, + { + "epoch": 0.04815935451336359, + "grad_norm": 3.288745641708374, + "learning_rate": 9.597989949748744e-07, + "loss": 0.1361, + "num_input_tokens_seen": 400556032, + "step": 191 + }, + { + "epoch": 0.048411497730711045, + "grad_norm": 4.024937629699707, + "learning_rate": 9.64824120603015e-07, + "loss": 0.1285, + "num_input_tokens_seen": 402653184, + "step": 192 + }, + { + "epoch": 0.048663640948058494, + "grad_norm": 4.060795783996582, + "learning_rate": 9.698492462311556e-07, + "loss": 0.1472, + "num_input_tokens_seen": 404750336, + "step": 193 + }, + { + "epoch": 0.04891578416540595, + "grad_norm": 5.01156759262085, + "learning_rate": 9.748743718592964e-07, + "loss": 0.2541, + "num_input_tokens_seen": 406847488, + "step": 194 + }, + { + "epoch": 0.049167927382753406, + "grad_norm": 3.8259568214416504, + "learning_rate": 9.79899497487437e-07, + "loss": 0.176, + "num_input_tokens_seen": 408944640, + "step": 195 + }, + { + "epoch": 0.049420070600100854, + "grad_norm": 4.526422500610352, + "learning_rate": 9.849246231155778e-07, + "loss": 0.2161, + "num_input_tokens_seen": 411041792, + "step": 196 + }, + { + "epoch": 0.04967221381744831, + "grad_norm": 4.0646867752075195, + "learning_rate": 9.899497487437185e-07, + "loss": 0.1361, + "num_input_tokens_seen": 413138944, + "step": 197 + }, + { + "epoch": 0.049924357034795766, + "grad_norm": 4.822361946105957, + "learning_rate": 9.949748743718592e-07, + "loss": 0.1678, + "num_input_tokens_seen": 415236096, + "step": 198 + }, + { + "epoch": 0.050176500252143215, + "grad_norm": 5.335970878601074, + "learning_rate": 1e-06, + "loss": 0.138, + "num_input_tokens_seen": 417333248, + "step": 199 + }, + { + "epoch": 0.05042864346949067, + "grad_norm": 4.283322811126709, + "learning_rate": 9.999998435084117e-07, + "loss": 0.1599, + "num_input_tokens_seen": 419430400, + "step": 200 + }, + { + "epoch": 0.05068078668683813, + "grad_norm": 3.6955955028533936, + "learning_rate": 9.999993740337564e-07, + "loss": 0.1203, + "num_input_tokens_seen": 421527552, + "step": 201 + }, + { + "epoch": 0.050932929904185575, + "grad_norm": 4.380987167358398, + "learning_rate": 9.999985915763598e-07, + "loss": 0.2069, + "num_input_tokens_seen": 423624704, + "step": 202 + }, + { + "epoch": 0.05118507312153303, + "grad_norm": 3.827716588973999, + "learning_rate": 9.999974961367668e-07, + "loss": 0.1987, + "num_input_tokens_seen": 425721856, + "step": 203 + }, + { + "epoch": 0.05143721633888049, + "grad_norm": 3.8995583057403564, + "learning_rate": 9.999960877157389e-07, + "loss": 0.1473, + "num_input_tokens_seen": 427819008, + "step": 204 + }, + { + "epoch": 0.051689359556227936, + "grad_norm": 3.6740832328796387, + "learning_rate": 9.99994366314256e-07, + "loss": 0.1348, + "num_input_tokens_seen": 429916160, + "step": 205 + }, + { + "epoch": 0.05194150277357539, + "grad_norm": 3.7553346157073975, + "learning_rate": 9.99992331933515e-07, + "loss": 0.1463, + "num_input_tokens_seen": 432013312, + "step": 206 + }, + { + "epoch": 0.05219364599092285, + "grad_norm": 4.992524147033691, + "learning_rate": 9.99989984574931e-07, + "loss": 0.2349, + "num_input_tokens_seen": 434110464, + "step": 207 + }, + { + "epoch": 0.052445789208270296, + "grad_norm": 4.383981704711914, + "learning_rate": 9.99987324240137e-07, + "loss": 0.1552, + "num_input_tokens_seen": 436207616, + "step": 208 + }, + { + "epoch": 0.05269793242561775, + "grad_norm": 4.6292619705200195, + "learning_rate": 9.999843509309827e-07, + "loss": 0.1998, + "num_input_tokens_seen": 438304768, + "step": 209 + }, + { + "epoch": 0.0529500756429652, + "grad_norm": 3.5693604946136475, + "learning_rate": 9.999810646495363e-07, + "loss": 0.1409, + "num_input_tokens_seen": 440401920, + "step": 210 + }, + { + "epoch": 0.05320221886031266, + "grad_norm": 4.460555553436279, + "learning_rate": 9.999774653980837e-07, + "loss": 0.2005, + "num_input_tokens_seen": 442499072, + "step": 211 + }, + { + "epoch": 0.05345436207766011, + "grad_norm": 3.6692800521850586, + "learning_rate": 9.99973553179128e-07, + "loss": 0.1358, + "num_input_tokens_seen": 444596224, + "step": 212 + }, + { + "epoch": 0.05370650529500756, + "grad_norm": 3.4849557876586914, + "learning_rate": 9.999693279953903e-07, + "loss": 0.1199, + "num_input_tokens_seen": 446693376, + "step": 213 + }, + { + "epoch": 0.05395864851235502, + "grad_norm": 3.9747097492218018, + "learning_rate": 9.999647898498095e-07, + "loss": 0.1885, + "num_input_tokens_seen": 448790528, + "step": 214 + }, + { + "epoch": 0.05421079172970247, + "grad_norm": 4.172543525695801, + "learning_rate": 9.999599387455416e-07, + "loss": 0.2118, + "num_input_tokens_seen": 450887680, + "step": 215 + }, + { + "epoch": 0.05446293494704992, + "grad_norm": 3.811913013458252, + "learning_rate": 9.999547746859607e-07, + "loss": 0.1973, + "num_input_tokens_seen": 452984832, + "step": 216 + }, + { + "epoch": 0.05471507816439738, + "grad_norm": 3.7271082401275635, + "learning_rate": 9.999492976746585e-07, + "loss": 0.2219, + "num_input_tokens_seen": 455081984, + "step": 217 + }, + { + "epoch": 0.054967221381744834, + "grad_norm": 4.112778186798096, + "learning_rate": 9.999435077154446e-07, + "loss": 0.1748, + "num_input_tokens_seen": 457179136, + "step": 218 + }, + { + "epoch": 0.05521936459909228, + "grad_norm": 6.517294883728027, + "learning_rate": 9.99937404812346e-07, + "loss": 0.3107, + "num_input_tokens_seen": 459276288, + "step": 219 + }, + { + "epoch": 0.05547150781643974, + "grad_norm": 4.02686071395874, + "learning_rate": 9.99930988969607e-07, + "loss": 0.0861, + "num_input_tokens_seen": 461373440, + "step": 220 + }, + { + "epoch": 0.055723651033787194, + "grad_norm": 3.6635353565216064, + "learning_rate": 9.999242601916902e-07, + "loss": 0.2132, + "num_input_tokens_seen": 463470592, + "step": 221 + }, + { + "epoch": 0.05597579425113464, + "grad_norm": 4.417490005493164, + "learning_rate": 9.999172184832756e-07, + "loss": 0.2374, + "num_input_tokens_seen": 465567744, + "step": 222 + }, + { + "epoch": 0.0562279374684821, + "grad_norm": 3.173140048980713, + "learning_rate": 9.99909863849261e-07, + "loss": 0.1771, + "num_input_tokens_seen": 467664896, + "step": 223 + }, + { + "epoch": 0.05648008068582955, + "grad_norm": 5.276343822479248, + "learning_rate": 9.999021962947612e-07, + "loss": 0.1569, + "num_input_tokens_seen": 469762048, + "step": 224 + }, + { + "epoch": 0.056732223903177004, + "grad_norm": 4.241299629211426, + "learning_rate": 9.998942158251096e-07, + "loss": 0.2738, + "num_input_tokens_seen": 471859200, + "step": 225 + }, + { + "epoch": 0.05698436712052446, + "grad_norm": 4.36360502243042, + "learning_rate": 9.998859224458565e-07, + "loss": 0.2735, + "num_input_tokens_seen": 473956352, + "step": 226 + }, + { + "epoch": 0.05723651033787191, + "grad_norm": 5.051778316497803, + "learning_rate": 9.998773161627701e-07, + "loss": 0.1831, + "num_input_tokens_seen": 476053504, + "step": 227 + }, + { + "epoch": 0.057488653555219364, + "grad_norm": 3.883115291595459, + "learning_rate": 9.998683969818364e-07, + "loss": 0.1617, + "num_input_tokens_seen": 478150656, + "step": 228 + }, + { + "epoch": 0.05774079677256682, + "grad_norm": 3.9679079055786133, + "learning_rate": 9.998591649092588e-07, + "loss": 0.1273, + "num_input_tokens_seen": 480247808, + "step": 229 + }, + { + "epoch": 0.05799293998991427, + "grad_norm": 6.0246901512146, + "learning_rate": 9.998496199514582e-07, + "loss": 0.1463, + "num_input_tokens_seen": 482344960, + "step": 230 + }, + { + "epoch": 0.058245083207261725, + "grad_norm": 3.684004545211792, + "learning_rate": 9.998397621150734e-07, + "loss": 0.1422, + "num_input_tokens_seen": 484442112, + "step": 231 + }, + { + "epoch": 0.05849722642460918, + "grad_norm": 5.111332416534424, + "learning_rate": 9.998295914069606e-07, + "loss": 0.2197, + "num_input_tokens_seen": 486539264, + "step": 232 + }, + { + "epoch": 0.05874936964195663, + "grad_norm": 3.0218448638916016, + "learning_rate": 9.99819107834194e-07, + "loss": 0.1219, + "num_input_tokens_seen": 488636416, + "step": 233 + }, + { + "epoch": 0.059001512859304085, + "grad_norm": 3.564114570617676, + "learning_rate": 9.99808311404065e-07, + "loss": 0.1983, + "num_input_tokens_seen": 490733568, + "step": 234 + }, + { + "epoch": 0.05925365607665154, + "grad_norm": 6.091875076293945, + "learning_rate": 9.997972021240824e-07, + "loss": 0.2782, + "num_input_tokens_seen": 492830720, + "step": 235 + }, + { + "epoch": 0.05950579929399899, + "grad_norm": 4.984955787658691, + "learning_rate": 9.997857800019734e-07, + "loss": 0.2658, + "num_input_tokens_seen": 494927872, + "step": 236 + }, + { + "epoch": 0.059757942511346446, + "grad_norm": 4.2022705078125, + "learning_rate": 9.997740450456819e-07, + "loss": 0.1511, + "num_input_tokens_seen": 497025024, + "step": 237 + }, + { + "epoch": 0.0600100857286939, + "grad_norm": 4.631911277770996, + "learning_rate": 9.997619972633701e-07, + "loss": 0.1874, + "num_input_tokens_seen": 499122176, + "step": 238 + }, + { + "epoch": 0.06026222894604135, + "grad_norm": 3.489034414291382, + "learning_rate": 9.99749636663417e-07, + "loss": 0.1684, + "num_input_tokens_seen": 501219328, + "step": 239 + }, + { + "epoch": 0.060514372163388806, + "grad_norm": 5.1144185066223145, + "learning_rate": 9.997369632544202e-07, + "loss": 0.1834, + "num_input_tokens_seen": 503316480, + "step": 240 + }, + { + "epoch": 0.060766515380736255, + "grad_norm": 5.526945114135742, + "learning_rate": 9.997239770451938e-07, + "loss": 0.2135, + "num_input_tokens_seen": 505413632, + "step": 241 + }, + { + "epoch": 0.06101865859808371, + "grad_norm": 6.000234127044678, + "learning_rate": 9.997106780447705e-07, + "loss": 0.2248, + "num_input_tokens_seen": 507510784, + "step": 242 + }, + { + "epoch": 0.06127080181543117, + "grad_norm": 3.4181573390960693, + "learning_rate": 9.99697066262399e-07, + "loss": 0.0903, + "num_input_tokens_seen": 509607936, + "step": 243 + }, + { + "epoch": 0.061522945032778616, + "grad_norm": 3.6254003047943115, + "learning_rate": 9.996831417075477e-07, + "loss": 0.1507, + "num_input_tokens_seen": 511705088, + "step": 244 + }, + { + "epoch": 0.06177508825012607, + "grad_norm": 3.7657456398010254, + "learning_rate": 9.996689043899005e-07, + "loss": 0.1569, + "num_input_tokens_seen": 513802240, + "step": 245 + }, + { + "epoch": 0.06202723146747353, + "grad_norm": 4.642493724822998, + "learning_rate": 9.996543543193604e-07, + "loss": 0.1187, + "num_input_tokens_seen": 515899392, + "step": 246 + }, + { + "epoch": 0.062279374684820976, + "grad_norm": 3.632336378097534, + "learning_rate": 9.996394915060468e-07, + "loss": 0.1736, + "num_input_tokens_seen": 517996544, + "step": 247 + }, + { + "epoch": 0.06253151790216843, + "grad_norm": 4.491301536560059, + "learning_rate": 9.99624315960297e-07, + "loss": 0.2351, + "num_input_tokens_seen": 520093696, + "step": 248 + }, + { + "epoch": 0.06278366111951589, + "grad_norm": 2.526890277862549, + "learning_rate": 9.996088276926661e-07, + "loss": 0.1088, + "num_input_tokens_seen": 522190848, + "step": 249 + }, + { + "epoch": 0.06303580433686334, + "grad_norm": 4.402822971343994, + "learning_rate": 9.995930267139266e-07, + "loss": 0.1189, + "num_input_tokens_seen": 524288000, + "step": 250 + }, + { + "epoch": 0.06328794755421079, + "grad_norm": 2.893916368484497, + "learning_rate": 9.99576913035068e-07, + "loss": 0.1003, + "num_input_tokens_seen": 526385152, + "step": 251 + }, + { + "epoch": 0.06354009077155824, + "grad_norm": 4.437779426574707, + "learning_rate": 9.995604866672978e-07, + "loss": 0.21, + "num_input_tokens_seen": 528482304, + "step": 252 + }, + { + "epoch": 0.0637922339889057, + "grad_norm": 7.890944957733154, + "learning_rate": 9.995437476220408e-07, + "loss": 0.3668, + "num_input_tokens_seen": 530579456, + "step": 253 + }, + { + "epoch": 0.06404437720625315, + "grad_norm": 3.5893633365631104, + "learning_rate": 9.995266959109396e-07, + "loss": 0.1771, + "num_input_tokens_seen": 532676608, + "step": 254 + }, + { + "epoch": 0.06429652042360061, + "grad_norm": 4.691050052642822, + "learning_rate": 9.995093315458534e-07, + "loss": 0.1696, + "num_input_tokens_seen": 534773760, + "step": 255 + }, + { + "epoch": 0.06454866364094806, + "grad_norm": 2.8213396072387695, + "learning_rate": 9.9949165453886e-07, + "loss": 0.1364, + "num_input_tokens_seen": 536870912, + "step": 256 + }, + { + "epoch": 0.0648008068582955, + "grad_norm": 4.529366493225098, + "learning_rate": 9.994736649022539e-07, + "loss": 0.1749, + "num_input_tokens_seen": 538968064, + "step": 257 + }, + { + "epoch": 0.06505295007564296, + "grad_norm": 3.919793128967285, + "learning_rate": 9.99455362648547e-07, + "loss": 0.1611, + "num_input_tokens_seen": 541065216, + "step": 258 + }, + { + "epoch": 0.06530509329299042, + "grad_norm": 4.9372711181640625, + "learning_rate": 9.994367477904695e-07, + "loss": 0.2556, + "num_input_tokens_seen": 543162368, + "step": 259 + }, + { + "epoch": 0.06555723651033787, + "grad_norm": 5.533105850219727, + "learning_rate": 9.994178203409674e-07, + "loss": 0.1598, + "num_input_tokens_seen": 545259520, + "step": 260 + }, + { + "epoch": 0.06580937972768533, + "grad_norm": 4.164669990539551, + "learning_rate": 9.993985803132057e-07, + "loss": 0.1743, + "num_input_tokens_seen": 547356672, + "step": 261 + }, + { + "epoch": 0.06606152294503277, + "grad_norm": 3.924823045730591, + "learning_rate": 9.993790277205662e-07, + "loss": 0.169, + "num_input_tokens_seen": 549453824, + "step": 262 + }, + { + "epoch": 0.06631366616238023, + "grad_norm": 3.045861005783081, + "learning_rate": 9.993591625766477e-07, + "loss": 0.1027, + "num_input_tokens_seen": 551550976, + "step": 263 + }, + { + "epoch": 0.06656580937972768, + "grad_norm": 2.7366058826446533, + "learning_rate": 9.993389848952673e-07, + "loss": 0.1027, + "num_input_tokens_seen": 553648128, + "step": 264 + }, + { + "epoch": 0.06681795259707514, + "grad_norm": 4.305903434753418, + "learning_rate": 9.993184946904586e-07, + "loss": 0.0899, + "num_input_tokens_seen": 555745280, + "step": 265 + }, + { + "epoch": 0.0670700958144226, + "grad_norm": 4.169579029083252, + "learning_rate": 9.992976919764728e-07, + "loss": 0.1555, + "num_input_tokens_seen": 557842432, + "step": 266 + }, + { + "epoch": 0.06732223903177005, + "grad_norm": 2.866806983947754, + "learning_rate": 9.992765767677789e-07, + "loss": 0.1226, + "num_input_tokens_seen": 559939584, + "step": 267 + }, + { + "epoch": 0.06757438224911749, + "grad_norm": 3.6884562969207764, + "learning_rate": 9.992551490790626e-07, + "loss": 0.1359, + "num_input_tokens_seen": 562036736, + "step": 268 + }, + { + "epoch": 0.06782652546646495, + "grad_norm": 4.731523513793945, + "learning_rate": 9.992334089252278e-07, + "loss": 0.1438, + "num_input_tokens_seen": 564133888, + "step": 269 + }, + { + "epoch": 0.0680786686838124, + "grad_norm": 3.90913724899292, + "learning_rate": 9.992113563213944e-07, + "loss": 0.1596, + "num_input_tokens_seen": 566231040, + "step": 270 + }, + { + "epoch": 0.06833081190115986, + "grad_norm": 3.4404547214508057, + "learning_rate": 9.99188991282901e-07, + "loss": 0.165, + "num_input_tokens_seen": 568328192, + "step": 271 + }, + { + "epoch": 0.06858295511850732, + "grad_norm": 2.840576648712158, + "learning_rate": 9.991663138253025e-07, + "loss": 0.109, + "num_input_tokens_seen": 570425344, + "step": 272 + }, + { + "epoch": 0.06883509833585477, + "grad_norm": 4.362993240356445, + "learning_rate": 9.991433239643716e-07, + "loss": 0.209, + "num_input_tokens_seen": 572522496, + "step": 273 + }, + { + "epoch": 0.06908724155320221, + "grad_norm": 4.26267671585083, + "learning_rate": 9.991200217160984e-07, + "loss": 0.0746, + "num_input_tokens_seen": 574619648, + "step": 274 + }, + { + "epoch": 0.06933938477054967, + "grad_norm": 3.7214324474334717, + "learning_rate": 9.990964070966895e-07, + "loss": 0.1395, + "num_input_tokens_seen": 576716800, + "step": 275 + }, + { + "epoch": 0.06959152798789713, + "grad_norm": 4.263853549957275, + "learning_rate": 9.9907248012257e-07, + "loss": 0.1919, + "num_input_tokens_seen": 578813952, + "step": 276 + }, + { + "epoch": 0.06984367120524458, + "grad_norm": 3.7660653591156006, + "learning_rate": 9.99048240810381e-07, + "loss": 0.1362, + "num_input_tokens_seen": 580911104, + "step": 277 + }, + { + "epoch": 0.07009581442259204, + "grad_norm": 3.3318731784820557, + "learning_rate": 9.990236891769818e-07, + "loss": 0.0849, + "num_input_tokens_seen": 583008256, + "step": 278 + }, + { + "epoch": 0.07034795763993948, + "grad_norm": 3.9983317852020264, + "learning_rate": 9.98998825239448e-07, + "loss": 0.1731, + "num_input_tokens_seen": 585105408, + "step": 279 + }, + { + "epoch": 0.07060010085728693, + "grad_norm": 3.032134532928467, + "learning_rate": 9.98973649015073e-07, + "loss": 0.1278, + "num_input_tokens_seen": 587202560, + "step": 280 + }, + { + "epoch": 0.07085224407463439, + "grad_norm": 3.8470921516418457, + "learning_rate": 9.98948160521368e-07, + "loss": 0.103, + "num_input_tokens_seen": 589299712, + "step": 281 + }, + { + "epoch": 0.07110438729198185, + "grad_norm": 2.935425043106079, + "learning_rate": 9.989223597760598e-07, + "loss": 0.1472, + "num_input_tokens_seen": 591396864, + "step": 282 + }, + { + "epoch": 0.0713565305093293, + "grad_norm": 3.791640043258667, + "learning_rate": 9.988962467970938e-07, + "loss": 0.1743, + "num_input_tokens_seen": 593494016, + "step": 283 + }, + { + "epoch": 0.07160867372667676, + "grad_norm": 2.616250991821289, + "learning_rate": 9.988698216026322e-07, + "loss": 0.0769, + "num_input_tokens_seen": 595591168, + "step": 284 + }, + { + "epoch": 0.0718608169440242, + "grad_norm": 3.309394359588623, + "learning_rate": 9.988430842110538e-07, + "loss": 0.1357, + "num_input_tokens_seen": 597688320, + "step": 285 + }, + { + "epoch": 0.07211296016137166, + "grad_norm": 4.600468635559082, + "learning_rate": 9.988160346409551e-07, + "loss": 0.1178, + "num_input_tokens_seen": 599785472, + "step": 286 + }, + { + "epoch": 0.07236510337871911, + "grad_norm": 3.2695717811584473, + "learning_rate": 9.987886729111496e-07, + "loss": 0.1122, + "num_input_tokens_seen": 601882624, + "step": 287 + }, + { + "epoch": 0.07261724659606657, + "grad_norm": 2.7870922088623047, + "learning_rate": 9.98760999040668e-07, + "loss": 0.0995, + "num_input_tokens_seen": 603979776, + "step": 288 + }, + { + "epoch": 0.07286938981341402, + "grad_norm": 3.2872393131256104, + "learning_rate": 9.987330130487576e-07, + "loss": 0.1314, + "num_input_tokens_seen": 606076928, + "step": 289 + }, + { + "epoch": 0.07312153303076148, + "grad_norm": 4.210444927215576, + "learning_rate": 9.987047149548833e-07, + "loss": 0.1435, + "num_input_tokens_seen": 608174080, + "step": 290 + }, + { + "epoch": 0.07337367624810892, + "grad_norm": 3.661651372909546, + "learning_rate": 9.986761047787274e-07, + "loss": 0.1075, + "num_input_tokens_seen": 610271232, + "step": 291 + }, + { + "epoch": 0.07362581946545638, + "grad_norm": 4.133707046508789, + "learning_rate": 9.986471825401882e-07, + "loss": 0.1977, + "num_input_tokens_seen": 612368384, + "step": 292 + }, + { + "epoch": 0.07387796268280383, + "grad_norm": 4.6356072425842285, + "learning_rate": 9.98617948259382e-07, + "loss": 0.1703, + "num_input_tokens_seen": 614465536, + "step": 293 + }, + { + "epoch": 0.07413010590015129, + "grad_norm": 3.9383256435394287, + "learning_rate": 9.985884019566416e-07, + "loss": 0.1848, + "num_input_tokens_seen": 616562688, + "step": 294 + }, + { + "epoch": 0.07438224911749874, + "grad_norm": 4.793269157409668, + "learning_rate": 9.985585436525168e-07, + "loss": 0.1488, + "num_input_tokens_seen": 618659840, + "step": 295 + }, + { + "epoch": 0.07463439233484619, + "grad_norm": 6.518699645996094, + "learning_rate": 9.98528373367775e-07, + "loss": 0.24, + "num_input_tokens_seen": 620756992, + "step": 296 + }, + { + "epoch": 0.07488653555219364, + "grad_norm": 3.71830415725708, + "learning_rate": 9.984978911234003e-07, + "loss": 0.1444, + "num_input_tokens_seen": 622854144, + "step": 297 + }, + { + "epoch": 0.0751386787695411, + "grad_norm": 3.535399913787842, + "learning_rate": 9.984670969405932e-07, + "loss": 0.145, + "num_input_tokens_seen": 624951296, + "step": 298 + }, + { + "epoch": 0.07539082198688855, + "grad_norm": 2.5828938484191895, + "learning_rate": 9.984359908407716e-07, + "loss": 0.1091, + "num_input_tokens_seen": 627048448, + "step": 299 + }, + { + "epoch": 0.07564296520423601, + "grad_norm": 3.900514841079712, + "learning_rate": 9.984045728455707e-07, + "loss": 0.1672, + "num_input_tokens_seen": 629145600, + "step": 300 + }, + { + "epoch": 0.07589510842158347, + "grad_norm": 4.364770412445068, + "learning_rate": 9.98372842976842e-07, + "loss": 0.2678, + "num_input_tokens_seen": 631242752, + "step": 301 + }, + { + "epoch": 0.07614725163893091, + "grad_norm": 3.6578245162963867, + "learning_rate": 9.983408012566545e-07, + "loss": 0.1238, + "num_input_tokens_seen": 633339904, + "step": 302 + }, + { + "epoch": 0.07639939485627836, + "grad_norm": 3.067723512649536, + "learning_rate": 9.983084477072936e-07, + "loss": 0.092, + "num_input_tokens_seen": 635437056, + "step": 303 + }, + { + "epoch": 0.07665153807362582, + "grad_norm": 2.8249781131744385, + "learning_rate": 9.982757823512619e-07, + "loss": 0.1065, + "num_input_tokens_seen": 637534208, + "step": 304 + }, + { + "epoch": 0.07690368129097327, + "grad_norm": 3.4561619758605957, + "learning_rate": 9.982428052112784e-07, + "loss": 0.1463, + "num_input_tokens_seen": 639631360, + "step": 305 + }, + { + "epoch": 0.07715582450832073, + "grad_norm": 4.192049503326416, + "learning_rate": 9.982095163102796e-07, + "loss": 0.1127, + "num_input_tokens_seen": 641728512, + "step": 306 + }, + { + "epoch": 0.07740796772566819, + "grad_norm": 2.888293743133545, + "learning_rate": 9.981759156714185e-07, + "loss": 0.113, + "num_input_tokens_seen": 643825664, + "step": 307 + }, + { + "epoch": 0.07766011094301563, + "grad_norm": 3.8195247650146484, + "learning_rate": 9.981420033180651e-07, + "loss": 0.1601, + "num_input_tokens_seen": 645922816, + "step": 308 + }, + { + "epoch": 0.07791225416036308, + "grad_norm": 3.721971035003662, + "learning_rate": 9.98107779273806e-07, + "loss": 0.1443, + "num_input_tokens_seen": 648019968, + "step": 309 + }, + { + "epoch": 0.07816439737771054, + "grad_norm": 3.4332494735717773, + "learning_rate": 9.980732435624441e-07, + "loss": 0.1503, + "num_input_tokens_seen": 650117120, + "step": 310 + }, + { + "epoch": 0.078416540595058, + "grad_norm": 2.9033710956573486, + "learning_rate": 9.980383962080003e-07, + "loss": 0.073, + "num_input_tokens_seen": 652214272, + "step": 311 + }, + { + "epoch": 0.07866868381240545, + "grad_norm": 3.597287178039551, + "learning_rate": 9.980032372347116e-07, + "loss": 0.1596, + "num_input_tokens_seen": 654311424, + "step": 312 + }, + { + "epoch": 0.0789208270297529, + "grad_norm": 3.0851659774780273, + "learning_rate": 9.97967766667031e-07, + "loss": 0.1188, + "num_input_tokens_seen": 656408576, + "step": 313 + }, + { + "epoch": 0.07917297024710035, + "grad_norm": 2.279250144958496, + "learning_rate": 9.979319845296296e-07, + "loss": 0.0974, + "num_input_tokens_seen": 658505728, + "step": 314 + }, + { + "epoch": 0.0794251134644478, + "grad_norm": 4.360164165496826, + "learning_rate": 9.978958908473941e-07, + "loss": 0.1992, + "num_input_tokens_seen": 660602880, + "step": 315 + }, + { + "epoch": 0.07967725668179526, + "grad_norm": 2.8060495853424072, + "learning_rate": 9.978594856454288e-07, + "loss": 0.1314, + "num_input_tokens_seen": 662700032, + "step": 316 + }, + { + "epoch": 0.07992939989914272, + "grad_norm": 4.089578628540039, + "learning_rate": 9.978227689490536e-07, + "loss": 0.1807, + "num_input_tokens_seen": 664797184, + "step": 317 + }, + { + "epoch": 0.08018154311649017, + "grad_norm": 3.043846368789673, + "learning_rate": 9.977857407838061e-07, + "loss": 0.1208, + "num_input_tokens_seen": 666894336, + "step": 318 + }, + { + "epoch": 0.08043368633383761, + "grad_norm": 2.2600390911102295, + "learning_rate": 9.9774840117544e-07, + "loss": 0.076, + "num_input_tokens_seen": 668991488, + "step": 319 + }, + { + "epoch": 0.08068582955118507, + "grad_norm": 3.115410089492798, + "learning_rate": 9.977107501499253e-07, + "loss": 0.1118, + "num_input_tokens_seen": 671088640, + "step": 320 + }, + { + "epoch": 0.08093797276853253, + "grad_norm": 3.720118761062622, + "learning_rate": 9.976727877334493e-07, + "loss": 0.1518, + "num_input_tokens_seen": 673185792, + "step": 321 + }, + { + "epoch": 0.08119011598587998, + "grad_norm": 3.6921238899230957, + "learning_rate": 9.976345139524152e-07, + "loss": 0.1261, + "num_input_tokens_seen": 675282944, + "step": 322 + }, + { + "epoch": 0.08144225920322744, + "grad_norm": 3.162914752960205, + "learning_rate": 9.975959288334438e-07, + "loss": 0.1038, + "num_input_tokens_seen": 677380096, + "step": 323 + }, + { + "epoch": 0.08169440242057488, + "grad_norm": 3.166231870651245, + "learning_rate": 9.97557032403371e-07, + "loss": 0.1294, + "num_input_tokens_seen": 679477248, + "step": 324 + }, + { + "epoch": 0.08194654563792234, + "grad_norm": 3.0747804641723633, + "learning_rate": 9.975178246892507e-07, + "loss": 0.1425, + "num_input_tokens_seen": 681574400, + "step": 325 + }, + { + "epoch": 0.08219868885526979, + "grad_norm": 3.0979673862457275, + "learning_rate": 9.974783057183519e-07, + "loss": 0.1586, + "num_input_tokens_seen": 683671552, + "step": 326 + }, + { + "epoch": 0.08245083207261725, + "grad_norm": 4.019197940826416, + "learning_rate": 9.974384755181609e-07, + "loss": 0.1663, + "num_input_tokens_seen": 685768704, + "step": 327 + }, + { + "epoch": 0.0827029752899647, + "grad_norm": 2.6061339378356934, + "learning_rate": 9.973983341163807e-07, + "loss": 0.0851, + "num_input_tokens_seen": 687865856, + "step": 328 + }, + { + "epoch": 0.08295511850731216, + "grad_norm": 3.0148558616638184, + "learning_rate": 9.9735788154093e-07, + "loss": 0.0966, + "num_input_tokens_seen": 689963008, + "step": 329 + }, + { + "epoch": 0.0832072617246596, + "grad_norm": 2.6705162525177, + "learning_rate": 9.973171178199447e-07, + "loss": 0.0839, + "num_input_tokens_seen": 692060160, + "step": 330 + }, + { + "epoch": 0.08345940494200706, + "grad_norm": 4.910850524902344, + "learning_rate": 9.972760429817763e-07, + "loss": 0.1695, + "num_input_tokens_seen": 694157312, + "step": 331 + }, + { + "epoch": 0.08371154815935451, + "grad_norm": 3.358743190765381, + "learning_rate": 9.972346570549932e-07, + "loss": 0.0935, + "num_input_tokens_seen": 696254464, + "step": 332 + }, + { + "epoch": 0.08396369137670197, + "grad_norm": 3.214064598083496, + "learning_rate": 9.971929600683802e-07, + "loss": 0.0848, + "num_input_tokens_seen": 698351616, + "step": 333 + }, + { + "epoch": 0.08421583459404942, + "grad_norm": 4.408289432525635, + "learning_rate": 9.971509520509381e-07, + "loss": 0.1624, + "num_input_tokens_seen": 700448768, + "step": 334 + }, + { + "epoch": 0.08446797781139688, + "grad_norm": 4.276678085327148, + "learning_rate": 9.971086330318845e-07, + "loss": 0.1458, + "num_input_tokens_seen": 702545920, + "step": 335 + }, + { + "epoch": 0.08472012102874432, + "grad_norm": 2.518461227416992, + "learning_rate": 9.97066003040653e-07, + "loss": 0.0934, + "num_input_tokens_seen": 704643072, + "step": 336 + }, + { + "epoch": 0.08497226424609178, + "grad_norm": 2.8323476314544678, + "learning_rate": 9.970230621068932e-07, + "loss": 0.1324, + "num_input_tokens_seen": 706740224, + "step": 337 + }, + { + "epoch": 0.08522440746343923, + "grad_norm": 2.8873610496520996, + "learning_rate": 9.969798102604717e-07, + "loss": 0.1292, + "num_input_tokens_seen": 708837376, + "step": 338 + }, + { + "epoch": 0.08547655068078669, + "grad_norm": 2.796959638595581, + "learning_rate": 9.969362475314708e-07, + "loss": 0.1086, + "num_input_tokens_seen": 710934528, + "step": 339 + }, + { + "epoch": 0.08572869389813415, + "grad_norm": 4.745234966278076, + "learning_rate": 9.968923739501892e-07, + "loss": 0.2212, + "num_input_tokens_seen": 713031680, + "step": 340 + }, + { + "epoch": 0.08598083711548159, + "grad_norm": 4.436620235443115, + "learning_rate": 9.968481895471417e-07, + "loss": 0.1376, + "num_input_tokens_seen": 715128832, + "step": 341 + }, + { + "epoch": 0.08623298033282904, + "grad_norm": 4.772200584411621, + "learning_rate": 9.968036943530592e-07, + "loss": 0.193, + "num_input_tokens_seen": 717225984, + "step": 342 + }, + { + "epoch": 0.0864851235501765, + "grad_norm": 3.2390449047088623, + "learning_rate": 9.967588883988893e-07, + "loss": 0.0999, + "num_input_tokens_seen": 719323136, + "step": 343 + }, + { + "epoch": 0.08673726676752395, + "grad_norm": 3.936569929122925, + "learning_rate": 9.967137717157951e-07, + "loss": 0.1634, + "num_input_tokens_seen": 721420288, + "step": 344 + }, + { + "epoch": 0.08698940998487141, + "grad_norm": 3.647679567337036, + "learning_rate": 9.966683443351564e-07, + "loss": 0.1798, + "num_input_tokens_seen": 723517440, + "step": 345 + }, + { + "epoch": 0.08724155320221887, + "grad_norm": 2.8842921257019043, + "learning_rate": 9.966226062885682e-07, + "loss": 0.1033, + "num_input_tokens_seen": 725614592, + "step": 346 + }, + { + "epoch": 0.08749369641956631, + "grad_norm": 6.5264434814453125, + "learning_rate": 9.965765576078424e-07, + "loss": 0.2729, + "num_input_tokens_seen": 727711744, + "step": 347 + }, + { + "epoch": 0.08774583963691376, + "grad_norm": 3.786755084991455, + "learning_rate": 9.96530198325007e-07, + "loss": 0.1233, + "num_input_tokens_seen": 729808896, + "step": 348 + }, + { + "epoch": 0.08799798285426122, + "grad_norm": 3.994030237197876, + "learning_rate": 9.964835284723052e-07, + "loss": 0.1229, + "num_input_tokens_seen": 731906048, + "step": 349 + }, + { + "epoch": 0.08825012607160868, + "grad_norm": 4.352416038513184, + "learning_rate": 9.96436548082197e-07, + "loss": 0.1501, + "num_input_tokens_seen": 734003200, + "step": 350 + }, + { + "epoch": 0.08850226928895613, + "grad_norm": 3.238286018371582, + "learning_rate": 9.963892571873584e-07, + "loss": 0.1314, + "num_input_tokens_seen": 736100352, + "step": 351 + }, + { + "epoch": 0.08875441250630359, + "grad_norm": 2.75301456451416, + "learning_rate": 9.963416558206806e-07, + "loss": 0.1137, + "num_input_tokens_seen": 738197504, + "step": 352 + }, + { + "epoch": 0.08900655572365103, + "grad_norm": 3.3911097049713135, + "learning_rate": 9.962937440152712e-07, + "loss": 0.0976, + "num_input_tokens_seen": 740294656, + "step": 353 + }, + { + "epoch": 0.08925869894099848, + "grad_norm": 2.7000679969787598, + "learning_rate": 9.962455218044542e-07, + "loss": 0.063, + "num_input_tokens_seen": 742391808, + "step": 354 + }, + { + "epoch": 0.08951084215834594, + "grad_norm": 3.3619422912597656, + "learning_rate": 9.961969892217688e-07, + "loss": 0.1167, + "num_input_tokens_seen": 744488960, + "step": 355 + }, + { + "epoch": 0.0897629853756934, + "grad_norm": 2.421957015991211, + "learning_rate": 9.9614814630097e-07, + "loss": 0.1184, + "num_input_tokens_seen": 746586112, + "step": 356 + }, + { + "epoch": 0.09001512859304085, + "grad_norm": 3.2838544845581055, + "learning_rate": 9.960989930760294e-07, + "loss": 0.1133, + "num_input_tokens_seen": 748683264, + "step": 357 + }, + { + "epoch": 0.0902672718103883, + "grad_norm": 4.716813564300537, + "learning_rate": 9.960495295811337e-07, + "loss": 0.152, + "num_input_tokens_seen": 750780416, + "step": 358 + }, + { + "epoch": 0.09051941502773575, + "grad_norm": 3.567866563796997, + "learning_rate": 9.959997558506857e-07, + "loss": 0.1348, + "num_input_tokens_seen": 752877568, + "step": 359 + }, + { + "epoch": 0.0907715582450832, + "grad_norm": 8.155049324035645, + "learning_rate": 9.959496719193039e-07, + "loss": 0.1658, + "num_input_tokens_seen": 754974720, + "step": 360 + }, + { + "epoch": 0.09102370146243066, + "grad_norm": 4.341349124908447, + "learning_rate": 9.958992778218226e-07, + "loss": 0.1635, + "num_input_tokens_seen": 757071872, + "step": 361 + }, + { + "epoch": 0.09127584467977812, + "grad_norm": 4.6380815505981445, + "learning_rate": 9.95848573593292e-07, + "loss": 0.1715, + "num_input_tokens_seen": 759169024, + "step": 362 + }, + { + "epoch": 0.09152798789712557, + "grad_norm": 3.3967676162719727, + "learning_rate": 9.957975592689774e-07, + "loss": 0.106, + "num_input_tokens_seen": 761266176, + "step": 363 + }, + { + "epoch": 0.09178013111447302, + "grad_norm": 2.9890308380126953, + "learning_rate": 9.957462348843607e-07, + "loss": 0.1163, + "num_input_tokens_seen": 763363328, + "step": 364 + }, + { + "epoch": 0.09203227433182047, + "grad_norm": 2.564323663711548, + "learning_rate": 9.956946004751386e-07, + "loss": 0.1217, + "num_input_tokens_seen": 765460480, + "step": 365 + }, + { + "epoch": 0.09228441754916793, + "grad_norm": 4.0984697341918945, + "learning_rate": 9.956426560772238e-07, + "loss": 0.1801, + "num_input_tokens_seen": 767557632, + "step": 366 + }, + { + "epoch": 0.09253656076651538, + "grad_norm": 2.5396645069122314, + "learning_rate": 9.955904017267444e-07, + "loss": 0.1272, + "num_input_tokens_seen": 769654784, + "step": 367 + }, + { + "epoch": 0.09278870398386284, + "grad_norm": 3.0213351249694824, + "learning_rate": 9.955378374600447e-07, + "loss": 0.121, + "num_input_tokens_seen": 771751936, + "step": 368 + }, + { + "epoch": 0.09304084720121028, + "grad_norm": 3.8049328327178955, + "learning_rate": 9.954849633136839e-07, + "loss": 0.102, + "num_input_tokens_seen": 773849088, + "step": 369 + }, + { + "epoch": 0.09329299041855774, + "grad_norm": 3.4090912342071533, + "learning_rate": 9.95431779324437e-07, + "loss": 0.1179, + "num_input_tokens_seen": 775946240, + "step": 370 + }, + { + "epoch": 0.09354513363590519, + "grad_norm": 2.5929131507873535, + "learning_rate": 9.95378285529294e-07, + "loss": 0.1106, + "num_input_tokens_seen": 778043392, + "step": 371 + }, + { + "epoch": 0.09379727685325265, + "grad_norm": 3.6183884143829346, + "learning_rate": 9.953244819654615e-07, + "loss": 0.1029, + "num_input_tokens_seen": 780140544, + "step": 372 + }, + { + "epoch": 0.0940494200706001, + "grad_norm": 3.812199354171753, + "learning_rate": 9.952703686703604e-07, + "loss": 0.0838, + "num_input_tokens_seen": 782237696, + "step": 373 + }, + { + "epoch": 0.09430156328794756, + "grad_norm": 5.054091453552246, + "learning_rate": 9.952159456816275e-07, + "loss": 0.2415, + "num_input_tokens_seen": 784334848, + "step": 374 + }, + { + "epoch": 0.094553706505295, + "grad_norm": 2.739720582962036, + "learning_rate": 9.951612130371151e-07, + "loss": 0.1198, + "num_input_tokens_seen": 786432000, + "step": 375 + }, + { + "epoch": 0.09480584972264246, + "grad_norm": 3.5317635536193848, + "learning_rate": 9.951061707748907e-07, + "loss": 0.0951, + "num_input_tokens_seen": 788529152, + "step": 376 + }, + { + "epoch": 0.09505799293998991, + "grad_norm": 2.7190043926239014, + "learning_rate": 9.95050818933237e-07, + "loss": 0.0918, + "num_input_tokens_seen": 790626304, + "step": 377 + }, + { + "epoch": 0.09531013615733737, + "grad_norm": 2.244220495223999, + "learning_rate": 9.949951575506528e-07, + "loss": 0.0987, + "num_input_tokens_seen": 792723456, + "step": 378 + }, + { + "epoch": 0.09556227937468482, + "grad_norm": 2.4800469875335693, + "learning_rate": 9.94939186665851e-07, + "loss": 0.112, + "num_input_tokens_seen": 794820608, + "step": 379 + }, + { + "epoch": 0.09581442259203228, + "grad_norm": 2.934340238571167, + "learning_rate": 9.948829063177606e-07, + "loss": 0.0914, + "num_input_tokens_seen": 796917760, + "step": 380 + }, + { + "epoch": 0.09606656580937972, + "grad_norm": 4.361299991607666, + "learning_rate": 9.948263165455256e-07, + "loss": 0.1366, + "num_input_tokens_seen": 799014912, + "step": 381 + }, + { + "epoch": 0.09631870902672718, + "grad_norm": 5.58315372467041, + "learning_rate": 9.947694173885051e-07, + "loss": 0.1444, + "num_input_tokens_seen": 801112064, + "step": 382 + }, + { + "epoch": 0.09657085224407463, + "grad_norm": 2.2215416431427, + "learning_rate": 9.947122088862737e-07, + "loss": 0.1324, + "num_input_tokens_seen": 803209216, + "step": 383 + }, + { + "epoch": 0.09682299546142209, + "grad_norm": 3.1041672229766846, + "learning_rate": 9.946546910786208e-07, + "loss": 0.1451, + "num_input_tokens_seen": 805306368, + "step": 384 + }, + { + "epoch": 0.09707513867876955, + "grad_norm": 3.4068877696990967, + "learning_rate": 9.945968640055513e-07, + "loss": 0.1318, + "num_input_tokens_seen": 807403520, + "step": 385 + }, + { + "epoch": 0.09732728189611699, + "grad_norm": 2.2413580417633057, + "learning_rate": 9.945387277072845e-07, + "loss": 0.0665, + "num_input_tokens_seen": 809500672, + "step": 386 + }, + { + "epoch": 0.09757942511346444, + "grad_norm": 2.360349655151367, + "learning_rate": 9.944802822242558e-07, + "loss": 0.0752, + "num_input_tokens_seen": 811597824, + "step": 387 + }, + { + "epoch": 0.0978315683308119, + "grad_norm": 2.0612034797668457, + "learning_rate": 9.944215275971148e-07, + "loss": 0.0661, + "num_input_tokens_seen": 813694976, + "step": 388 + }, + { + "epoch": 0.09808371154815936, + "grad_norm": 2.8129661083221436, + "learning_rate": 9.943624638667263e-07, + "loss": 0.0991, + "num_input_tokens_seen": 815792128, + "step": 389 + }, + { + "epoch": 0.09833585476550681, + "grad_norm": 3.179905891418457, + "learning_rate": 9.943030910741707e-07, + "loss": 0.166, + "num_input_tokens_seen": 817889280, + "step": 390 + }, + { + "epoch": 0.09858799798285427, + "grad_norm": 3.191718816757202, + "learning_rate": 9.942434092607423e-07, + "loss": 0.1583, + "num_input_tokens_seen": 819986432, + "step": 391 + }, + { + "epoch": 0.09884014120020171, + "grad_norm": 2.8753068447113037, + "learning_rate": 9.941834184679511e-07, + "loss": 0.1463, + "num_input_tokens_seen": 822083584, + "step": 392 + }, + { + "epoch": 0.09909228441754916, + "grad_norm": 2.709397315979004, + "learning_rate": 9.94123118737522e-07, + "loss": 0.103, + "num_input_tokens_seen": 824180736, + "step": 393 + }, + { + "epoch": 0.09934442763489662, + "grad_norm": 3.7003681659698486, + "learning_rate": 9.94062510111394e-07, + "loss": 0.1539, + "num_input_tokens_seen": 826277888, + "step": 394 + }, + { + "epoch": 0.09959657085224408, + "grad_norm": 4.4324631690979, + "learning_rate": 9.94001592631722e-07, + "loss": 0.1915, + "num_input_tokens_seen": 828375040, + "step": 395 + }, + { + "epoch": 0.09984871406959153, + "grad_norm": 4.082291126251221, + "learning_rate": 9.93940366340875e-07, + "loss": 0.2416, + "num_input_tokens_seen": 830472192, + "step": 396 + }, + { + "epoch": 0.10010085728693899, + "grad_norm": 2.7822890281677246, + "learning_rate": 9.938788312814374e-07, + "loss": 0.1053, + "num_input_tokens_seen": 832569344, + "step": 397 + }, + { + "epoch": 0.10035300050428643, + "grad_norm": 2.376317024230957, + "learning_rate": 9.938169874962072e-07, + "loss": 0.0785, + "num_input_tokens_seen": 834666496, + "step": 398 + }, + { + "epoch": 0.10060514372163389, + "grad_norm": 6.018281936645508, + "learning_rate": 9.937548350281987e-07, + "loss": 0.1501, + "num_input_tokens_seen": 836763648, + "step": 399 + }, + { + "epoch": 0.10085728693898134, + "grad_norm": 2.6437666416168213, + "learning_rate": 9.936923739206391e-07, + "loss": 0.1259, + "num_input_tokens_seen": 838860800, + "step": 400 + }, + { + "epoch": 0.1011094301563288, + "grad_norm": 3.112172842025757, + "learning_rate": 9.936296042169723e-07, + "loss": 0.1747, + "num_input_tokens_seen": 840957952, + "step": 401 + }, + { + "epoch": 0.10136157337367625, + "grad_norm": 7.632992744445801, + "learning_rate": 9.93566525960855e-07, + "loss": 0.0882, + "num_input_tokens_seen": 843055104, + "step": 402 + }, + { + "epoch": 0.1016137165910237, + "grad_norm": 3.4459123611450195, + "learning_rate": 9.935031391961599e-07, + "loss": 0.1184, + "num_input_tokens_seen": 845152256, + "step": 403 + }, + { + "epoch": 0.10186585980837115, + "grad_norm": 3.6913039684295654, + "learning_rate": 9.93439443966973e-07, + "loss": 0.1121, + "num_input_tokens_seen": 847249408, + "step": 404 + }, + { + "epoch": 0.1021180030257186, + "grad_norm": 3.291170835494995, + "learning_rate": 9.933754403175956e-07, + "loss": 0.1317, + "num_input_tokens_seen": 849346560, + "step": 405 + }, + { + "epoch": 0.10237014624306606, + "grad_norm": 5.224982738494873, + "learning_rate": 9.93311128292544e-07, + "loss": 0.2308, + "num_input_tokens_seen": 851443712, + "step": 406 + }, + { + "epoch": 0.10262228946041352, + "grad_norm": 3.043541193008423, + "learning_rate": 9.932465079365477e-07, + "loss": 0.1293, + "num_input_tokens_seen": 853540864, + "step": 407 + }, + { + "epoch": 0.10287443267776097, + "grad_norm": 3.613516092300415, + "learning_rate": 9.931815792945515e-07, + "loss": 0.2023, + "num_input_tokens_seen": 855638016, + "step": 408 + }, + { + "epoch": 0.10312657589510842, + "grad_norm": 3.9032676219940186, + "learning_rate": 9.931163424117148e-07, + "loss": 0.1554, + "num_input_tokens_seen": 857735168, + "step": 409 + }, + { + "epoch": 0.10337871911245587, + "grad_norm": 2.2143468856811523, + "learning_rate": 9.930507973334106e-07, + "loss": 0.1014, + "num_input_tokens_seen": 859832320, + "step": 410 + }, + { + "epoch": 0.10363086232980333, + "grad_norm": 3.722890615463257, + "learning_rate": 9.92984944105227e-07, + "loss": 0.1072, + "num_input_tokens_seen": 861929472, + "step": 411 + }, + { + "epoch": 0.10388300554715078, + "grad_norm": 3.3566651344299316, + "learning_rate": 9.929187827729658e-07, + "loss": 0.1597, + "num_input_tokens_seen": 864026624, + "step": 412 + }, + { + "epoch": 0.10413514876449824, + "grad_norm": 2.243074655532837, + "learning_rate": 9.928523133826437e-07, + "loss": 0.0799, + "num_input_tokens_seen": 866123776, + "step": 413 + }, + { + "epoch": 0.1043872919818457, + "grad_norm": 2.4208436012268066, + "learning_rate": 9.927855359804914e-07, + "loss": 0.1441, + "num_input_tokens_seen": 868220928, + "step": 414 + }, + { + "epoch": 0.10463943519919314, + "grad_norm": 3.7958076000213623, + "learning_rate": 9.927184506129535e-07, + "loss": 0.1769, + "num_input_tokens_seen": 870318080, + "step": 415 + }, + { + "epoch": 0.10489157841654059, + "grad_norm": 2.1095194816589355, + "learning_rate": 9.926510573266894e-07, + "loss": 0.0626, + "num_input_tokens_seen": 872415232, + "step": 416 + }, + { + "epoch": 0.10514372163388805, + "grad_norm": 2.22505784034729, + "learning_rate": 9.925833561685718e-07, + "loss": 0.0868, + "num_input_tokens_seen": 874512384, + "step": 417 + }, + { + "epoch": 0.1053958648512355, + "grad_norm": 2.8599283695220947, + "learning_rate": 9.92515347185689e-07, + "loss": 0.1311, + "num_input_tokens_seen": 876609536, + "step": 418 + }, + { + "epoch": 0.10564800806858296, + "grad_norm": 3.1945903301239014, + "learning_rate": 9.924470304253418e-07, + "loss": 0.0906, + "num_input_tokens_seen": 878706688, + "step": 419 + }, + { + "epoch": 0.1059001512859304, + "grad_norm": 5.766541481018066, + "learning_rate": 9.92378405935046e-07, + "loss": 0.1588, + "num_input_tokens_seen": 880803840, + "step": 420 + }, + { + "epoch": 0.10615229450327786, + "grad_norm": 2.077852249145508, + "learning_rate": 9.92309473762531e-07, + "loss": 0.0958, + "num_input_tokens_seen": 882900992, + "step": 421 + }, + { + "epoch": 0.10640443772062531, + "grad_norm": 3.552129030227661, + "learning_rate": 9.922402339557405e-07, + "loss": 0.1314, + "num_input_tokens_seen": 884998144, + "step": 422 + }, + { + "epoch": 0.10665658093797277, + "grad_norm": 2.371065855026245, + "learning_rate": 9.92170686562832e-07, + "loss": 0.1129, + "num_input_tokens_seen": 887095296, + "step": 423 + }, + { + "epoch": 0.10690872415532023, + "grad_norm": 3.874335289001465, + "learning_rate": 9.921008316321768e-07, + "loss": 0.1691, + "num_input_tokens_seen": 889192448, + "step": 424 + }, + { + "epoch": 0.10716086737266768, + "grad_norm": 2.733494520187378, + "learning_rate": 9.920306692123609e-07, + "loss": 0.1126, + "num_input_tokens_seen": 891289600, + "step": 425 + }, + { + "epoch": 0.10741301059001512, + "grad_norm": 2.3687491416931152, + "learning_rate": 9.919601993521829e-07, + "loss": 0.1028, + "num_input_tokens_seen": 893386752, + "step": 426 + }, + { + "epoch": 0.10766515380736258, + "grad_norm": 2.3049280643463135, + "learning_rate": 9.91889422100656e-07, + "loss": 0.0865, + "num_input_tokens_seen": 895483904, + "step": 427 + }, + { + "epoch": 0.10791729702471003, + "grad_norm": 2.899887800216675, + "learning_rate": 9.918183375070073e-07, + "loss": 0.1258, + "num_input_tokens_seen": 897581056, + "step": 428 + }, + { + "epoch": 0.10816944024205749, + "grad_norm": 4.081860065460205, + "learning_rate": 9.917469456206773e-07, + "loss": 0.0931, + "num_input_tokens_seen": 899678208, + "step": 429 + }, + { + "epoch": 0.10842158345940495, + "grad_norm": 3.0482466220855713, + "learning_rate": 9.916752464913201e-07, + "loss": 0.1039, + "num_input_tokens_seen": 901775360, + "step": 430 + }, + { + "epoch": 0.10867372667675239, + "grad_norm": 3.3849377632141113, + "learning_rate": 9.916032401688042e-07, + "loss": 0.1661, + "num_input_tokens_seen": 903872512, + "step": 431 + }, + { + "epoch": 0.10892586989409984, + "grad_norm": 3.4006130695343018, + "learning_rate": 9.91530926703211e-07, + "loss": 0.121, + "num_input_tokens_seen": 905969664, + "step": 432 + }, + { + "epoch": 0.1091780131114473, + "grad_norm": 4.100249290466309, + "learning_rate": 9.91458306144836e-07, + "loss": 0.1976, + "num_input_tokens_seen": 908066816, + "step": 433 + }, + { + "epoch": 0.10943015632879476, + "grad_norm": 2.491917610168457, + "learning_rate": 9.913853785441878e-07, + "loss": 0.1019, + "num_input_tokens_seen": 910163968, + "step": 434 + }, + { + "epoch": 0.10968229954614221, + "grad_norm": 4.087813377380371, + "learning_rate": 9.913121439519893e-07, + "loss": 0.1673, + "num_input_tokens_seen": 912261120, + "step": 435 + }, + { + "epoch": 0.10993444276348967, + "grad_norm": 2.377880334854126, + "learning_rate": 9.912386024191763e-07, + "loss": 0.1184, + "num_input_tokens_seen": 914358272, + "step": 436 + }, + { + "epoch": 0.11018658598083711, + "grad_norm": 2.745607376098633, + "learning_rate": 9.911647539968981e-07, + "loss": 0.0917, + "num_input_tokens_seen": 916455424, + "step": 437 + }, + { + "epoch": 0.11043872919818457, + "grad_norm": 4.707367897033691, + "learning_rate": 9.91090598736518e-07, + "loss": 0.2128, + "num_input_tokens_seen": 918552576, + "step": 438 + }, + { + "epoch": 0.11069087241553202, + "grad_norm": 3.578786611557007, + "learning_rate": 9.910161366896119e-07, + "loss": 0.1235, + "num_input_tokens_seen": 920649728, + "step": 439 + }, + { + "epoch": 0.11094301563287948, + "grad_norm": 2.3904166221618652, + "learning_rate": 9.909413679079697e-07, + "loss": 0.1139, + "num_input_tokens_seen": 922746880, + "step": 440 + }, + { + "epoch": 0.11119515885022693, + "grad_norm": 3.1667914390563965, + "learning_rate": 9.908662924435946e-07, + "loss": 0.157, + "num_input_tokens_seen": 924844032, + "step": 441 + }, + { + "epoch": 0.11144730206757439, + "grad_norm": 4.515403747558594, + "learning_rate": 9.907909103487027e-07, + "loss": 0.1837, + "num_input_tokens_seen": 926941184, + "step": 442 + }, + { + "epoch": 0.11169944528492183, + "grad_norm": 1.9842240810394287, + "learning_rate": 9.907152216757239e-07, + "loss": 0.1077, + "num_input_tokens_seen": 929038336, + "step": 443 + }, + { + "epoch": 0.11195158850226929, + "grad_norm": 3.713541030883789, + "learning_rate": 9.906392264773008e-07, + "loss": 0.1401, + "num_input_tokens_seen": 931135488, + "step": 444 + }, + { + "epoch": 0.11220373171961674, + "grad_norm": 2.7595789432525635, + "learning_rate": 9.905629248062895e-07, + "loss": 0.1262, + "num_input_tokens_seen": 933232640, + "step": 445 + }, + { + "epoch": 0.1124558749369642, + "grad_norm": 3.375941038131714, + "learning_rate": 9.904863167157591e-07, + "loss": 0.1777, + "num_input_tokens_seen": 935329792, + "step": 446 + }, + { + "epoch": 0.11270801815431165, + "grad_norm": 2.2114899158477783, + "learning_rate": 9.904094022589923e-07, + "loss": 0.0785, + "num_input_tokens_seen": 937426944, + "step": 447 + }, + { + "epoch": 0.1129601613716591, + "grad_norm": 3.5571250915527344, + "learning_rate": 9.90332181489484e-07, + "loss": 0.1771, + "num_input_tokens_seen": 939524096, + "step": 448 + }, + { + "epoch": 0.11321230458900655, + "grad_norm": 4.025667667388916, + "learning_rate": 9.902546544609432e-07, + "loss": 0.1424, + "num_input_tokens_seen": 941621248, + "step": 449 + }, + { + "epoch": 0.11346444780635401, + "grad_norm": 2.804630994796753, + "learning_rate": 9.901768212272906e-07, + "loss": 0.1722, + "num_input_tokens_seen": 943718400, + "step": 450 + }, + { + "epoch": 0.11371659102370146, + "grad_norm": 2.183051824569702, + "learning_rate": 9.900986818426612e-07, + "loss": 0.0876, + "num_input_tokens_seen": 945815552, + "step": 451 + }, + { + "epoch": 0.11396873424104892, + "grad_norm": 2.7712557315826416, + "learning_rate": 9.900202363614025e-07, + "loss": 0.1148, + "num_input_tokens_seen": 947912704, + "step": 452 + }, + { + "epoch": 0.11422087745839637, + "grad_norm": 3.2009191513061523, + "learning_rate": 9.899414848380743e-07, + "loss": 0.1514, + "num_input_tokens_seen": 950009856, + "step": 453 + }, + { + "epoch": 0.11447302067574382, + "grad_norm": 3.8625547885894775, + "learning_rate": 9.8986242732745e-07, + "loss": 0.1811, + "num_input_tokens_seen": 952107008, + "step": 454 + }, + { + "epoch": 0.11472516389309127, + "grad_norm": 2.4320788383483887, + "learning_rate": 9.897830638845153e-07, + "loss": 0.1304, + "num_input_tokens_seen": 954204160, + "step": 455 + }, + { + "epoch": 0.11497730711043873, + "grad_norm": 2.825261354446411, + "learning_rate": 9.897033945644692e-07, + "loss": 0.1156, + "num_input_tokens_seen": 956301312, + "step": 456 + }, + { + "epoch": 0.11522945032778618, + "grad_norm": 9.34619426727295, + "learning_rate": 9.89623419422723e-07, + "loss": 0.0738, + "num_input_tokens_seen": 958398464, + "step": 457 + }, + { + "epoch": 0.11548159354513364, + "grad_norm": 3.386025905609131, + "learning_rate": 9.895431385149007e-07, + "loss": 0.1693, + "num_input_tokens_seen": 960495616, + "step": 458 + }, + { + "epoch": 0.1157337367624811, + "grad_norm": 3.9842169284820557, + "learning_rate": 9.894625518968396e-07, + "loss": 0.0836, + "num_input_tokens_seen": 962592768, + "step": 459 + }, + { + "epoch": 0.11598587997982854, + "grad_norm": 4.544926166534424, + "learning_rate": 9.893816596245886e-07, + "loss": 0.2216, + "num_input_tokens_seen": 964689920, + "step": 460 + }, + { + "epoch": 0.116238023197176, + "grad_norm": 3.3318898677825928, + "learning_rate": 9.8930046175441e-07, + "loss": 0.1638, + "num_input_tokens_seen": 966787072, + "step": 461 + }, + { + "epoch": 0.11649016641452345, + "grad_norm": 2.5450119972229004, + "learning_rate": 9.892189583427785e-07, + "loss": 0.1472, + "num_input_tokens_seen": 968884224, + "step": 462 + }, + { + "epoch": 0.1167423096318709, + "grad_norm": 5.197476863861084, + "learning_rate": 9.891371494463812e-07, + "loss": 0.1708, + "num_input_tokens_seen": 970981376, + "step": 463 + }, + { + "epoch": 0.11699445284921836, + "grad_norm": 2.857074499130249, + "learning_rate": 9.890550351221176e-07, + "loss": 0.0968, + "num_input_tokens_seen": 973078528, + "step": 464 + }, + { + "epoch": 0.1172465960665658, + "grad_norm": 2.8476240634918213, + "learning_rate": 9.889726154270997e-07, + "loss": 0.1504, + "num_input_tokens_seen": 975175680, + "step": 465 + }, + { + "epoch": 0.11749873928391326, + "grad_norm": 6.322744369506836, + "learning_rate": 9.888898904186517e-07, + "loss": 0.1249, + "num_input_tokens_seen": 977272832, + "step": 466 + }, + { + "epoch": 0.11775088250126071, + "grad_norm": 3.161973237991333, + "learning_rate": 9.888068601543106e-07, + "loss": 0.2604, + "num_input_tokens_seen": 979369984, + "step": 467 + }, + { + "epoch": 0.11800302571860817, + "grad_norm": 2.0370872020721436, + "learning_rate": 9.887235246918255e-07, + "loss": 0.0983, + "num_input_tokens_seen": 981467136, + "step": 468 + }, + { + "epoch": 0.11825516893595563, + "grad_norm": 3.568608283996582, + "learning_rate": 9.886398840891576e-07, + "loss": 0.1531, + "num_input_tokens_seen": 983564288, + "step": 469 + }, + { + "epoch": 0.11850731215330308, + "grad_norm": 2.3104538917541504, + "learning_rate": 9.885559384044805e-07, + "loss": 0.1091, + "num_input_tokens_seen": 985661440, + "step": 470 + }, + { + "epoch": 0.11875945537065052, + "grad_norm": 3.4569497108459473, + "learning_rate": 9.884716876961798e-07, + "loss": 0.1195, + "num_input_tokens_seen": 987758592, + "step": 471 + }, + { + "epoch": 0.11901159858799798, + "grad_norm": 3.131441354751587, + "learning_rate": 9.883871320228534e-07, + "loss": 0.1564, + "num_input_tokens_seen": 989855744, + "step": 472 + }, + { + "epoch": 0.11926374180534544, + "grad_norm": 3.427337646484375, + "learning_rate": 9.883022714433116e-07, + "loss": 0.1911, + "num_input_tokens_seen": 991952896, + "step": 473 + }, + { + "epoch": 0.11951588502269289, + "grad_norm": 3.554757833480835, + "learning_rate": 9.882171060165764e-07, + "loss": 0.1489, + "num_input_tokens_seen": 994050048, + "step": 474 + }, + { + "epoch": 0.11976802824004035, + "grad_norm": 2.5964512825012207, + "learning_rate": 9.881316358018816e-07, + "loss": 0.0662, + "num_input_tokens_seen": 996147200, + "step": 475 + }, + { + "epoch": 0.1200201714573878, + "grad_norm": 3.2962310314178467, + "learning_rate": 9.880458608586737e-07, + "loss": 0.1555, + "num_input_tokens_seen": 998244352, + "step": 476 + }, + { + "epoch": 0.12027231467473525, + "grad_norm": 2.869269371032715, + "learning_rate": 9.879597812466105e-07, + "loss": 0.0795, + "num_input_tokens_seen": 1000341504, + "step": 477 + }, + { + "epoch": 0.1205244578920827, + "grad_norm": 2.913670778274536, + "learning_rate": 9.878733970255618e-07, + "loss": 0.1329, + "num_input_tokens_seen": 1002438656, + "step": 478 + }, + { + "epoch": 0.12077660110943016, + "grad_norm": 3.124332904815674, + "learning_rate": 9.877867082556097e-07, + "loss": 0.1538, + "num_input_tokens_seen": 1004535808, + "step": 479 + }, + { + "epoch": 0.12102874432677761, + "grad_norm": 3.5321497917175293, + "learning_rate": 9.876997149970477e-07, + "loss": 0.1714, + "num_input_tokens_seen": 1006632960, + "step": 480 + }, + { + "epoch": 0.12128088754412507, + "grad_norm": 3.904442071914673, + "learning_rate": 9.87612417310381e-07, + "loss": 0.1452, + "num_input_tokens_seen": 1008730112, + "step": 481 + }, + { + "epoch": 0.12153303076147251, + "grad_norm": 3.534336805343628, + "learning_rate": 9.87524815256327e-07, + "loss": 0.1589, + "num_input_tokens_seen": 1010827264, + "step": 482 + }, + { + "epoch": 0.12178517397881997, + "grad_norm": 3.5298209190368652, + "learning_rate": 9.874369088958145e-07, + "loss": 0.1413, + "num_input_tokens_seen": 1012924416, + "step": 483 + }, + { + "epoch": 0.12203731719616742, + "grad_norm": 3.4223012924194336, + "learning_rate": 9.873486982899837e-07, + "loss": 0.1552, + "num_input_tokens_seen": 1015021568, + "step": 484 + }, + { + "epoch": 0.12228946041351488, + "grad_norm": 2.560487747192383, + "learning_rate": 9.872601835001869e-07, + "loss": 0.1192, + "num_input_tokens_seen": 1017118720, + "step": 485 + }, + { + "epoch": 0.12254160363086233, + "grad_norm": 2.099520683288574, + "learning_rate": 9.871713645879878e-07, + "loss": 0.1125, + "num_input_tokens_seen": 1019215872, + "step": 486 + }, + { + "epoch": 0.12279374684820979, + "grad_norm": 3.477560520172119, + "learning_rate": 9.870822416151614e-07, + "loss": 0.1485, + "num_input_tokens_seen": 1021313024, + "step": 487 + }, + { + "epoch": 0.12304589006555723, + "grad_norm": 2.9200782775878906, + "learning_rate": 9.869928146436942e-07, + "loss": 0.0596, + "num_input_tokens_seen": 1023410176, + "step": 488 + }, + { + "epoch": 0.12329803328290469, + "grad_norm": 2.3703415393829346, + "learning_rate": 9.86903083735785e-07, + "loss": 0.1163, + "num_input_tokens_seen": 1025507328, + "step": 489 + }, + { + "epoch": 0.12355017650025214, + "grad_norm": 2.2664389610290527, + "learning_rate": 9.868130489538425e-07, + "loss": 0.0712, + "num_input_tokens_seen": 1027604480, + "step": 490 + }, + { + "epoch": 0.1238023197175996, + "grad_norm": 1.798887848854065, + "learning_rate": 9.867227103604877e-07, + "loss": 0.0709, + "num_input_tokens_seen": 1029701632, + "step": 491 + }, + { + "epoch": 0.12405446293494705, + "grad_norm": 3.6567928791046143, + "learning_rate": 9.86632068018553e-07, + "loss": 0.1474, + "num_input_tokens_seen": 1031798784, + "step": 492 + }, + { + "epoch": 0.1243066061522945, + "grad_norm": 2.8362531661987305, + "learning_rate": 9.865411219910815e-07, + "loss": 0.1235, + "num_input_tokens_seen": 1033895936, + "step": 493 + }, + { + "epoch": 0.12455874936964195, + "grad_norm": 2.423952341079712, + "learning_rate": 9.86449872341328e-07, + "loss": 0.1048, + "num_input_tokens_seen": 1035993088, + "step": 494 + }, + { + "epoch": 0.12481089258698941, + "grad_norm": 2.4268240928649902, + "learning_rate": 9.863583191327583e-07, + "loss": 0.1063, + "num_input_tokens_seen": 1038090240, + "step": 495 + }, + { + "epoch": 0.12506303580433686, + "grad_norm": 2.1852941513061523, + "learning_rate": 9.862664624290494e-07, + "loss": 0.0932, + "num_input_tokens_seen": 1040187392, + "step": 496 + }, + { + "epoch": 0.12531517902168432, + "grad_norm": 3.1700496673583984, + "learning_rate": 9.86174302294089e-07, + "loss": 0.1174, + "num_input_tokens_seen": 1042284544, + "step": 497 + }, + { + "epoch": 0.12556732223903178, + "grad_norm": 3.2374541759490967, + "learning_rate": 9.860818387919762e-07, + "loss": 0.1251, + "num_input_tokens_seen": 1044381696, + "step": 498 + }, + { + "epoch": 0.12581946545637923, + "grad_norm": 2.62046217918396, + "learning_rate": 9.859890719870213e-07, + "loss": 0.0991, + "num_input_tokens_seen": 1046478848, + "step": 499 + }, + { + "epoch": 0.1260716086737267, + "grad_norm": 3.053370237350464, + "learning_rate": 9.85896001943745e-07, + "loss": 0.1612, + "num_input_tokens_seen": 1048576000, + "step": 500 + } + ], + "logging_steps": 1.0, + "max_steps": 3966, + "num_input_tokens_seen": 1048576000, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.902112919650304e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}