|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.1260716086737267, |
|
"eval_steps": 500, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0002521432173474534, |
|
"grad_norm": 56.41114044189453, |
|
"learning_rate": 5.025125628140703e-09, |
|
"loss": 1.5821, |
|
"num_input_tokens_seen": 2097152, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0005042864346949068, |
|
"grad_norm": 31.409353256225586, |
|
"learning_rate": 1.0050251256281407e-08, |
|
"loss": 1.5937, |
|
"num_input_tokens_seen": 4194304, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0007564296520423601, |
|
"grad_norm": 21.739652633666992, |
|
"learning_rate": 1.5075376884422108e-08, |
|
"loss": 1.2442, |
|
"num_input_tokens_seen": 6291456, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0010085728693898135, |
|
"grad_norm": 20.685302734375, |
|
"learning_rate": 2.0100502512562813e-08, |
|
"loss": 0.8062, |
|
"num_input_tokens_seen": 8388608, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.0012607160867372667, |
|
"grad_norm": 22.219989776611328, |
|
"learning_rate": 2.5125628140703518e-08, |
|
"loss": 1.1513, |
|
"num_input_tokens_seen": 10485760, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0015128593040847202, |
|
"grad_norm": 28.416399002075195, |
|
"learning_rate": 3.0150753768844216e-08, |
|
"loss": 1.634, |
|
"num_input_tokens_seen": 12582912, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0017650025214321734, |
|
"grad_norm": 23.952890396118164, |
|
"learning_rate": 3.517587939698492e-08, |
|
"loss": 1.1944, |
|
"num_input_tokens_seen": 14680064, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.002017145738779627, |
|
"grad_norm": 20.75243377685547, |
|
"learning_rate": 4.0201005025125626e-08, |
|
"loss": 0.7753, |
|
"num_input_tokens_seen": 16777216, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.0022692889561270802, |
|
"grad_norm": 25.797378540039062, |
|
"learning_rate": 4.522613065326633e-08, |
|
"loss": 1.5984, |
|
"num_input_tokens_seen": 18874368, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.0025214321734745334, |
|
"grad_norm": 25.863649368286133, |
|
"learning_rate": 5.0251256281407036e-08, |
|
"loss": 1.5978, |
|
"num_input_tokens_seen": 20971520, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.002773575390821987, |
|
"grad_norm": 18.696609497070312, |
|
"learning_rate": 5.527638190954774e-08, |
|
"loss": 1.2323, |
|
"num_input_tokens_seen": 23068672, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.0030257186081694403, |
|
"grad_norm": 28.740385055541992, |
|
"learning_rate": 6.030150753768843e-08, |
|
"loss": 1.1786, |
|
"num_input_tokens_seen": 25165824, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.0032778618255168935, |
|
"grad_norm": 21.161056518554688, |
|
"learning_rate": 6.532663316582915e-08, |
|
"loss": 0.7842, |
|
"num_input_tokens_seen": 27262976, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.0035300050428643467, |
|
"grad_norm": 25.495088577270508, |
|
"learning_rate": 7.035175879396984e-08, |
|
"loss": 1.9987, |
|
"num_input_tokens_seen": 29360128, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.0037821482602118004, |
|
"grad_norm": 24.420948028564453, |
|
"learning_rate": 7.537688442211055e-08, |
|
"loss": 1.1424, |
|
"num_input_tokens_seen": 31457280, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.004034291477559254, |
|
"grad_norm": 19.922271728515625, |
|
"learning_rate": 8.040201005025125e-08, |
|
"loss": 1.1716, |
|
"num_input_tokens_seen": 33554432, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.004286434694906707, |
|
"grad_norm": 25.040063858032227, |
|
"learning_rate": 8.542713567839196e-08, |
|
"loss": 0.8189, |
|
"num_input_tokens_seen": 35651584, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.0045385779122541605, |
|
"grad_norm": 27.888629913330078, |
|
"learning_rate": 9.045226130653266e-08, |
|
"loss": 1.1743, |
|
"num_input_tokens_seen": 37748736, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.004790721129601614, |
|
"grad_norm": 21.901092529296875, |
|
"learning_rate": 9.547738693467335e-08, |
|
"loss": 0.7951, |
|
"num_input_tokens_seen": 39845888, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.005042864346949067, |
|
"grad_norm": 21.351625442504883, |
|
"learning_rate": 1.0050251256281407e-07, |
|
"loss": 1.2271, |
|
"num_input_tokens_seen": 41943040, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.00529500756429652, |
|
"grad_norm": 21.482006072998047, |
|
"learning_rate": 1.0552763819095476e-07, |
|
"loss": 1.1908, |
|
"num_input_tokens_seen": 44040192, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.005547150781643974, |
|
"grad_norm": 21.15386390686035, |
|
"learning_rate": 1.1055276381909548e-07, |
|
"loss": 1.2297, |
|
"num_input_tokens_seen": 46137344, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.005799293998991427, |
|
"grad_norm": 23.156387329101562, |
|
"learning_rate": 1.1557788944723617e-07, |
|
"loss": 1.1766, |
|
"num_input_tokens_seen": 48234496, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.006051437216338881, |
|
"grad_norm": 38.258697509765625, |
|
"learning_rate": 1.2060301507537687e-07, |
|
"loss": 1.4932, |
|
"num_input_tokens_seen": 50331648, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.006303580433686334, |
|
"grad_norm": 20.798620223999023, |
|
"learning_rate": 1.2562814070351758e-07, |
|
"loss": 1.2943, |
|
"num_input_tokens_seen": 52428800, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.006555723651033787, |
|
"grad_norm": 24.994922637939453, |
|
"learning_rate": 1.306532663316583e-07, |
|
"loss": 1.0768, |
|
"num_input_tokens_seen": 54525952, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.00680786686838124, |
|
"grad_norm": 33.116146087646484, |
|
"learning_rate": 1.35678391959799e-07, |
|
"loss": 1.1369, |
|
"num_input_tokens_seen": 56623104, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.0070600100857286935, |
|
"grad_norm": 39.03334426879883, |
|
"learning_rate": 1.4070351758793969e-07, |
|
"loss": 1.5513, |
|
"num_input_tokens_seen": 58720256, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.007312153303076148, |
|
"grad_norm": 25.035110473632812, |
|
"learning_rate": 1.4572864321608038e-07, |
|
"loss": 1.2028, |
|
"num_input_tokens_seen": 60817408, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.007564296520423601, |
|
"grad_norm": 21.068431854248047, |
|
"learning_rate": 1.507537688442211e-07, |
|
"loss": 1.1555, |
|
"num_input_tokens_seen": 62914560, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.007816439737771054, |
|
"grad_norm": 35.82476043701172, |
|
"learning_rate": 1.5577889447236181e-07, |
|
"loss": 1.1723, |
|
"num_input_tokens_seen": 65011712, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.008068582955118508, |
|
"grad_norm": 27.961219787597656, |
|
"learning_rate": 1.608040201005025e-07, |
|
"loss": 0.7226, |
|
"num_input_tokens_seen": 67108864, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.00832072617246596, |
|
"grad_norm": 21.109777450561523, |
|
"learning_rate": 1.658291457286432e-07, |
|
"loss": 1.0722, |
|
"num_input_tokens_seen": 69206016, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.008572869389813415, |
|
"grad_norm": 43.04289627075195, |
|
"learning_rate": 1.7085427135678392e-07, |
|
"loss": 1.1128, |
|
"num_input_tokens_seen": 71303168, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.008825012607160867, |
|
"grad_norm": 26.515880584716797, |
|
"learning_rate": 1.7587939698492463e-07, |
|
"loss": 1.1254, |
|
"num_input_tokens_seen": 73400320, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.009077155824508321, |
|
"grad_norm": 21.351062774658203, |
|
"learning_rate": 1.8090452261306533e-07, |
|
"loss": 0.7675, |
|
"num_input_tokens_seen": 75497472, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.009329299041855773, |
|
"grad_norm": 23.136459350585938, |
|
"learning_rate": 1.8592964824120602e-07, |
|
"loss": 1.1374, |
|
"num_input_tokens_seen": 77594624, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.009581442259203227, |
|
"grad_norm": 17.877473831176758, |
|
"learning_rate": 1.909547738693467e-07, |
|
"loss": 1.1101, |
|
"num_input_tokens_seen": 79691776, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.009833585476550681, |
|
"grad_norm": 33.78788375854492, |
|
"learning_rate": 1.9597989949748743e-07, |
|
"loss": 1.0273, |
|
"num_input_tokens_seen": 81788928, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.010085728693898134, |
|
"grad_norm": 32.83673858642578, |
|
"learning_rate": 2.0100502512562815e-07, |
|
"loss": 1.1025, |
|
"num_input_tokens_seen": 83886080, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.010337871911245588, |
|
"grad_norm": 26.676027297973633, |
|
"learning_rate": 2.0603015075376884e-07, |
|
"loss": 1.8515, |
|
"num_input_tokens_seen": 85983232, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.01059001512859304, |
|
"grad_norm": 26.88898468017578, |
|
"learning_rate": 2.1105527638190953e-07, |
|
"loss": 1.3322, |
|
"num_input_tokens_seen": 88080384, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.010842158345940494, |
|
"grad_norm": 24.28297233581543, |
|
"learning_rate": 2.1608040201005022e-07, |
|
"loss": 0.9043, |
|
"num_input_tokens_seen": 90177536, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.011094301563287948, |
|
"grad_norm": 15.659173011779785, |
|
"learning_rate": 2.2110552763819096e-07, |
|
"loss": 0.9169, |
|
"num_input_tokens_seen": 92274688, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.0113464447806354, |
|
"grad_norm": 15.930516242980957, |
|
"learning_rate": 2.2613065326633166e-07, |
|
"loss": 0.9613, |
|
"num_input_tokens_seen": 94371840, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.011598587997982855, |
|
"grad_norm": 14.883039474487305, |
|
"learning_rate": 2.3115577889447235e-07, |
|
"loss": 0.882, |
|
"num_input_tokens_seen": 96468992, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.011850731215330307, |
|
"grad_norm": 25.84305191040039, |
|
"learning_rate": 2.3618090452261304e-07, |
|
"loss": 1.1471, |
|
"num_input_tokens_seen": 98566144, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.012102874432677761, |
|
"grad_norm": 21.669544219970703, |
|
"learning_rate": 2.4120603015075373e-07, |
|
"loss": 0.9125, |
|
"num_input_tokens_seen": 100663296, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.012355017650025214, |
|
"grad_norm": 15.483664512634277, |
|
"learning_rate": 2.4623115577889445e-07, |
|
"loss": 0.8492, |
|
"num_input_tokens_seen": 102760448, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.012607160867372668, |
|
"grad_norm": 18.560636520385742, |
|
"learning_rate": 2.5125628140703517e-07, |
|
"loss": 0.9035, |
|
"num_input_tokens_seen": 104857600, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.012859304084720122, |
|
"grad_norm": 14.719083786010742, |
|
"learning_rate": 2.562814070351759e-07, |
|
"loss": 0.8161, |
|
"num_input_tokens_seen": 106954752, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.013111447302067574, |
|
"grad_norm": 21.655672073364258, |
|
"learning_rate": 2.613065326633166e-07, |
|
"loss": 0.572, |
|
"num_input_tokens_seen": 109051904, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.013363590519415028, |
|
"grad_norm": 11.465034484863281, |
|
"learning_rate": 2.6633165829145727e-07, |
|
"loss": 0.807, |
|
"num_input_tokens_seen": 111149056, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.01361573373676248, |
|
"grad_norm": 17.689987182617188, |
|
"learning_rate": 2.71356783919598e-07, |
|
"loss": 1.4423, |
|
"num_input_tokens_seen": 113246208, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.013867876954109935, |
|
"grad_norm": 14.684429168701172, |
|
"learning_rate": 2.7638190954773865e-07, |
|
"loss": 0.8659, |
|
"num_input_tokens_seen": 115343360, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.014120020171457387, |
|
"grad_norm": 12.435643196105957, |
|
"learning_rate": 2.8140703517587937e-07, |
|
"loss": 0.7607, |
|
"num_input_tokens_seen": 117440512, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.014372163388804841, |
|
"grad_norm": 17.700153350830078, |
|
"learning_rate": 2.864321608040201e-07, |
|
"loss": 0.8607, |
|
"num_input_tokens_seen": 119537664, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.014624306606152295, |
|
"grad_norm": 13.79918384552002, |
|
"learning_rate": 2.9145728643216075e-07, |
|
"loss": 0.7589, |
|
"num_input_tokens_seen": 121634816, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.014876449823499747, |
|
"grad_norm": 15.207538604736328, |
|
"learning_rate": 2.964824120603015e-07, |
|
"loss": 0.4787, |
|
"num_input_tokens_seen": 123731968, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.015128593040847202, |
|
"grad_norm": 10.523366928100586, |
|
"learning_rate": 3.015075376884422e-07, |
|
"loss": 0.6908, |
|
"num_input_tokens_seen": 125829120, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.015380736258194654, |
|
"grad_norm": 8.412284851074219, |
|
"learning_rate": 3.065326633165829e-07, |
|
"loss": 0.6561, |
|
"num_input_tokens_seen": 127926272, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.015632879475542108, |
|
"grad_norm": 9.98276138305664, |
|
"learning_rate": 3.1155778894472363e-07, |
|
"loss": 0.7216, |
|
"num_input_tokens_seen": 130023424, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.01588502269288956, |
|
"grad_norm": 11.017064094543457, |
|
"learning_rate": 3.165829145728643e-07, |
|
"loss": 0.6223, |
|
"num_input_tokens_seen": 132120576, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.016137165910237016, |
|
"grad_norm": 15.129839897155762, |
|
"learning_rate": 3.21608040201005e-07, |
|
"loss": 1.0373, |
|
"num_input_tokens_seen": 134217728, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.01638930912758447, |
|
"grad_norm": 8.578692436218262, |
|
"learning_rate": 3.2663316582914573e-07, |
|
"loss": 0.5687, |
|
"num_input_tokens_seen": 136314880, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.01664145234493192, |
|
"grad_norm": 13.31927490234375, |
|
"learning_rate": 3.316582914572864e-07, |
|
"loss": 1.0766, |
|
"num_input_tokens_seen": 138412032, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.016893595562279373, |
|
"grad_norm": 8.775867462158203, |
|
"learning_rate": 3.366834170854271e-07, |
|
"loss": 0.5324, |
|
"num_input_tokens_seen": 140509184, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.01714573877962683, |
|
"grad_norm": 12.085953712463379, |
|
"learning_rate": 3.4170854271356783e-07, |
|
"loss": 0.8601, |
|
"num_input_tokens_seen": 142606336, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.01739788199697428, |
|
"grad_norm": 12.76360034942627, |
|
"learning_rate": 3.4673366834170855e-07, |
|
"loss": 0.5595, |
|
"num_input_tokens_seen": 144703488, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.017650025214321734, |
|
"grad_norm": 10.255838394165039, |
|
"learning_rate": 3.5175879396984927e-07, |
|
"loss": 0.3496, |
|
"num_input_tokens_seen": 146800640, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.01790216843166919, |
|
"grad_norm": 9.94809341430664, |
|
"learning_rate": 3.5678391959798993e-07, |
|
"loss": 0.5976, |
|
"num_input_tokens_seen": 148897792, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.018154311649016642, |
|
"grad_norm": 7.37994384765625, |
|
"learning_rate": 3.6180904522613065e-07, |
|
"loss": 0.5241, |
|
"num_input_tokens_seen": 150994944, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.018406454866364094, |
|
"grad_norm": 8.874433517456055, |
|
"learning_rate": 3.668341708542713e-07, |
|
"loss": 0.5629, |
|
"num_input_tokens_seen": 153092096, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.018658598083711547, |
|
"grad_norm": 16.685457229614258, |
|
"learning_rate": 3.7185929648241203e-07, |
|
"loss": 0.3801, |
|
"num_input_tokens_seen": 155189248, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.018910741301059002, |
|
"grad_norm": 11.288415908813477, |
|
"learning_rate": 3.7688442211055275e-07, |
|
"loss": 0.6093, |
|
"num_input_tokens_seen": 157286400, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.019162884518406455, |
|
"grad_norm": 10.51889419555664, |
|
"learning_rate": 3.819095477386934e-07, |
|
"loss": 0.5053, |
|
"num_input_tokens_seen": 159383552, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.019415027735753907, |
|
"grad_norm": 10.236724853515625, |
|
"learning_rate": 3.869346733668342e-07, |
|
"loss": 0.7537, |
|
"num_input_tokens_seen": 161480704, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.019667170953101363, |
|
"grad_norm": 9.370979309082031, |
|
"learning_rate": 3.9195979899497485e-07, |
|
"loss": 0.5814, |
|
"num_input_tokens_seen": 163577856, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.019919314170448815, |
|
"grad_norm": 12.056835174560547, |
|
"learning_rate": 3.9698492462311557e-07, |
|
"loss": 0.5178, |
|
"num_input_tokens_seen": 165675008, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.020171457387796268, |
|
"grad_norm": 8.761493682861328, |
|
"learning_rate": 4.020100502512563e-07, |
|
"loss": 0.4851, |
|
"num_input_tokens_seen": 167772160, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.02042360060514372, |
|
"grad_norm": 9.159887313842773, |
|
"learning_rate": 4.0703517587939696e-07, |
|
"loss": 0.4531, |
|
"num_input_tokens_seen": 169869312, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.020675743822491176, |
|
"grad_norm": 9.923644065856934, |
|
"learning_rate": 4.120603015075377e-07, |
|
"loss": 0.5835, |
|
"num_input_tokens_seen": 171966464, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.020927887039838628, |
|
"grad_norm": 8.762866973876953, |
|
"learning_rate": 4.1708542713567834e-07, |
|
"loss": 0.4772, |
|
"num_input_tokens_seen": 174063616, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.02118003025718608, |
|
"grad_norm": 10.09272289276123, |
|
"learning_rate": 4.2211055276381906e-07, |
|
"loss": 0.7305, |
|
"num_input_tokens_seen": 176160768, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.021432173474533536, |
|
"grad_norm": 8.009614944458008, |
|
"learning_rate": 4.271356783919598e-07, |
|
"loss": 0.4629, |
|
"num_input_tokens_seen": 178257920, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.02168431669188099, |
|
"grad_norm": 8.284019470214844, |
|
"learning_rate": 4.3216080402010044e-07, |
|
"loss": 0.4368, |
|
"num_input_tokens_seen": 180355072, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.02193645990922844, |
|
"grad_norm": 6.427061557769775, |
|
"learning_rate": 4.371859296482412e-07, |
|
"loss": 0.43, |
|
"num_input_tokens_seen": 182452224, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.022188603126575897, |
|
"grad_norm": 12.255255699157715, |
|
"learning_rate": 4.4221105527638193e-07, |
|
"loss": 0.5879, |
|
"num_input_tokens_seen": 184549376, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.02244074634392335, |
|
"grad_norm": 6.626727104187012, |
|
"learning_rate": 4.472361809045226e-07, |
|
"loss": 0.3916, |
|
"num_input_tokens_seen": 186646528, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.0226928895612708, |
|
"grad_norm": 8.53348445892334, |
|
"learning_rate": 4.522613065326633e-07, |
|
"loss": 0.4768, |
|
"num_input_tokens_seen": 188743680, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.022945032778618254, |
|
"grad_norm": 6.995331287384033, |
|
"learning_rate": 4.57286432160804e-07, |
|
"loss": 0.3988, |
|
"num_input_tokens_seen": 190840832, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.02319717599596571, |
|
"grad_norm": 8.352548599243164, |
|
"learning_rate": 4.623115577889447e-07, |
|
"loss": 0.3706, |
|
"num_input_tokens_seen": 192937984, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.023449319213313162, |
|
"grad_norm": 6.609560489654541, |
|
"learning_rate": 4.673366834170854e-07, |
|
"loss": 0.2459, |
|
"num_input_tokens_seen": 195035136, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.023701462430660614, |
|
"grad_norm": 9.539324760437012, |
|
"learning_rate": 4.723618090452261e-07, |
|
"loss": 0.3865, |
|
"num_input_tokens_seen": 197132288, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.02395360564800807, |
|
"grad_norm": 9.831944465637207, |
|
"learning_rate": 4.773869346733669e-07, |
|
"loss": 0.4022, |
|
"num_input_tokens_seen": 199229440, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.024205748865355523, |
|
"grad_norm": 9.292588233947754, |
|
"learning_rate": 4.824120603015075e-07, |
|
"loss": 0.3543, |
|
"num_input_tokens_seen": 201326592, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.024457892082702975, |
|
"grad_norm": 9.192462921142578, |
|
"learning_rate": 4.874371859296482e-07, |
|
"loss": 0.4336, |
|
"num_input_tokens_seen": 203423744, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.024710035300050427, |
|
"grad_norm": 8.302521705627441, |
|
"learning_rate": 4.924623115577889e-07, |
|
"loss": 0.534, |
|
"num_input_tokens_seen": 205520896, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.024962178517397883, |
|
"grad_norm": 9.702790260314941, |
|
"learning_rate": 4.974874371859296e-07, |
|
"loss": 0.5899, |
|
"num_input_tokens_seen": 207618048, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.025214321734745335, |
|
"grad_norm": 7.346845626831055, |
|
"learning_rate": 5.025125628140703e-07, |
|
"loss": 0.3439, |
|
"num_input_tokens_seen": 209715200, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.025466464952092788, |
|
"grad_norm": 6.6140265464782715, |
|
"learning_rate": 5.075376884422111e-07, |
|
"loss": 0.3779, |
|
"num_input_tokens_seen": 211812352, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.025718608169440244, |
|
"grad_norm": 6.8121209144592285, |
|
"learning_rate": 5.125628140703518e-07, |
|
"loss": 0.403, |
|
"num_input_tokens_seen": 213909504, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.025970751386787696, |
|
"grad_norm": 6.07421875, |
|
"learning_rate": 5.175879396984925e-07, |
|
"loss": 0.3473, |
|
"num_input_tokens_seen": 216006656, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.026222894604135148, |
|
"grad_norm": 6.86598539352417, |
|
"learning_rate": 5.226130653266332e-07, |
|
"loss": 0.3054, |
|
"num_input_tokens_seen": 218103808, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.0264750378214826, |
|
"grad_norm": 7.970452308654785, |
|
"learning_rate": 5.276381909547738e-07, |
|
"loss": 0.3693, |
|
"num_input_tokens_seen": 220200960, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.026727181038830056, |
|
"grad_norm": 7.2236552238464355, |
|
"learning_rate": 5.326633165829145e-07, |
|
"loss": 0.2194, |
|
"num_input_tokens_seen": 222298112, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.02697932425617751, |
|
"grad_norm": 5.257369518280029, |
|
"learning_rate": 5.376884422110553e-07, |
|
"loss": 0.2962, |
|
"num_input_tokens_seen": 224395264, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.02723146747352496, |
|
"grad_norm": 6.920422077178955, |
|
"learning_rate": 5.42713567839196e-07, |
|
"loss": 0.3699, |
|
"num_input_tokens_seen": 226492416, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.027483610690872417, |
|
"grad_norm": 9.312458992004395, |
|
"learning_rate": 5.477386934673367e-07, |
|
"loss": 0.3812, |
|
"num_input_tokens_seen": 228589568, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.02773575390821987, |
|
"grad_norm": 9.935240745544434, |
|
"learning_rate": 5.527638190954773e-07, |
|
"loss": 0.4443, |
|
"num_input_tokens_seen": 230686720, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.02798789712556732, |
|
"grad_norm": 5.373161315917969, |
|
"learning_rate": 5.57788944723618e-07, |
|
"loss": 0.264, |
|
"num_input_tokens_seen": 232783872, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.028240040342914774, |
|
"grad_norm": 6.769862651824951, |
|
"learning_rate": 5.628140703517587e-07, |
|
"loss": 0.1686, |
|
"num_input_tokens_seen": 234881024, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.02849218356026223, |
|
"grad_norm": 5.726578712463379, |
|
"learning_rate": 5.678391959798995e-07, |
|
"loss": 0.3396, |
|
"num_input_tokens_seen": 236978176, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.028744326777609682, |
|
"grad_norm": 5.439636707305908, |
|
"learning_rate": 5.728643216080402e-07, |
|
"loss": 0.2733, |
|
"num_input_tokens_seen": 239075328, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.028996469994957134, |
|
"grad_norm": 5.622605323791504, |
|
"learning_rate": 5.778894472361808e-07, |
|
"loss": 0.2998, |
|
"num_input_tokens_seen": 241172480, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.02924861321230459, |
|
"grad_norm": 6.728963851928711, |
|
"learning_rate": 5.829145728643215e-07, |
|
"loss": 0.2549, |
|
"num_input_tokens_seen": 243269632, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.029500756429652043, |
|
"grad_norm": 5.0983781814575195, |
|
"learning_rate": 5.879396984924622e-07, |
|
"loss": 0.2705, |
|
"num_input_tokens_seen": 245366784, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.029752899646999495, |
|
"grad_norm": 7.3646721839904785, |
|
"learning_rate": 5.92964824120603e-07, |
|
"loss": 0.3242, |
|
"num_input_tokens_seen": 247463936, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.03000504286434695, |
|
"grad_norm": 7.918598651885986, |
|
"learning_rate": 5.979899497487438e-07, |
|
"loss": 0.371, |
|
"num_input_tokens_seen": 249561088, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.030257186081694403, |
|
"grad_norm": 7.411210536956787, |
|
"learning_rate": 6.030150753768844e-07, |
|
"loss": 0.2728, |
|
"num_input_tokens_seen": 251658240, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.030509329299041855, |
|
"grad_norm": 5.8603129386901855, |
|
"learning_rate": 6.080402010050251e-07, |
|
"loss": 0.1854, |
|
"num_input_tokens_seen": 253755392, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.030761472516389308, |
|
"grad_norm": 5.476680278778076, |
|
"learning_rate": 6.130653266331658e-07, |
|
"loss": 0.1831, |
|
"num_input_tokens_seen": 255852544, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.031013615733736764, |
|
"grad_norm": 6.4667158126831055, |
|
"learning_rate": 6.180904522613065e-07, |
|
"loss": 0.1721, |
|
"num_input_tokens_seen": 257949696, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.031265758951084216, |
|
"grad_norm": 5.928079605102539, |
|
"learning_rate": 6.231155778894473e-07, |
|
"loss": 0.2728, |
|
"num_input_tokens_seen": 260046848, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.03151790216843167, |
|
"grad_norm": 7.0044755935668945, |
|
"learning_rate": 6.28140703517588e-07, |
|
"loss": 0.4037, |
|
"num_input_tokens_seen": 262144000, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.03177004538577912, |
|
"grad_norm": 8.558830261230469, |
|
"learning_rate": 6.331658291457286e-07, |
|
"loss": 0.5263, |
|
"num_input_tokens_seen": 264241152, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.032022188603126577, |
|
"grad_norm": 5.0764055252075195, |
|
"learning_rate": 6.381909547738693e-07, |
|
"loss": 0.2054, |
|
"num_input_tokens_seen": 266338304, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.03227433182047403, |
|
"grad_norm": 5.459807872772217, |
|
"learning_rate": 6.4321608040201e-07, |
|
"loss": 0.2122, |
|
"num_input_tokens_seen": 268435456, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.03252647503782148, |
|
"grad_norm": 5.658675670623779, |
|
"learning_rate": 6.482412060301507e-07, |
|
"loss": 0.2226, |
|
"num_input_tokens_seen": 270532608, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.03277861825516894, |
|
"grad_norm": 5.613616466522217, |
|
"learning_rate": 6.532663316582915e-07, |
|
"loss": 0.2701, |
|
"num_input_tokens_seen": 272629760, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.033030761472516386, |
|
"grad_norm": 9.082258224487305, |
|
"learning_rate": 6.582914572864321e-07, |
|
"loss": 0.3726, |
|
"num_input_tokens_seen": 274726912, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.03328290468986384, |
|
"grad_norm": 4.047947406768799, |
|
"learning_rate": 6.633165829145728e-07, |
|
"loss": 0.1323, |
|
"num_input_tokens_seen": 276824064, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.0335350479072113, |
|
"grad_norm": 5.141188144683838, |
|
"learning_rate": 6.683417085427135e-07, |
|
"loss": 0.2615, |
|
"num_input_tokens_seen": 278921216, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.033787191124558746, |
|
"grad_norm": 4.637810707092285, |
|
"learning_rate": 6.733668341708542e-07, |
|
"loss": 0.2252, |
|
"num_input_tokens_seen": 281018368, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.0340393343419062, |
|
"grad_norm": 5.142843723297119, |
|
"learning_rate": 6.783919597989949e-07, |
|
"loss": 0.1817, |
|
"num_input_tokens_seen": 283115520, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.03429147755925366, |
|
"grad_norm": 7.557190418243408, |
|
"learning_rate": 6.834170854271357e-07, |
|
"loss": 0.2897, |
|
"num_input_tokens_seen": 285212672, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.03454362077660111, |
|
"grad_norm": 6.585993766784668, |
|
"learning_rate": 6.884422110552764e-07, |
|
"loss": 0.227, |
|
"num_input_tokens_seen": 287309824, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.03479576399394856, |
|
"grad_norm": 4.926968574523926, |
|
"learning_rate": 6.934673366834171e-07, |
|
"loss": 0.1573, |
|
"num_input_tokens_seen": 289406976, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.03504790721129602, |
|
"grad_norm": 6.03431510925293, |
|
"learning_rate": 6.984924623115578e-07, |
|
"loss": 0.2187, |
|
"num_input_tokens_seen": 291504128, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.03530005042864347, |
|
"grad_norm": 9.677518844604492, |
|
"learning_rate": 7.035175879396985e-07, |
|
"loss": 0.2295, |
|
"num_input_tokens_seen": 293601280, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.03555219364599092, |
|
"grad_norm": 6.820138931274414, |
|
"learning_rate": 7.085427135678391e-07, |
|
"loss": 0.1944, |
|
"num_input_tokens_seen": 295698432, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.03580433686333838, |
|
"grad_norm": 5.568108081817627, |
|
"learning_rate": 7.135678391959799e-07, |
|
"loss": 0.3113, |
|
"num_input_tokens_seen": 297795584, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.03605648008068583, |
|
"grad_norm": 6.417880058288574, |
|
"learning_rate": 7.185929648241206e-07, |
|
"loss": 0.2932, |
|
"num_input_tokens_seen": 299892736, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.036308623298033284, |
|
"grad_norm": 5.040261745452881, |
|
"learning_rate": 7.236180904522613e-07, |
|
"loss": 0.2076, |
|
"num_input_tokens_seen": 301989888, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.03656076651538074, |
|
"grad_norm": 6.350996494293213, |
|
"learning_rate": 7.28643216080402e-07, |
|
"loss": 0.1714, |
|
"num_input_tokens_seen": 304087040, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.03681290973272819, |
|
"grad_norm": 5.744927406311035, |
|
"learning_rate": 7.336683417085426e-07, |
|
"loss": 0.1948, |
|
"num_input_tokens_seen": 306184192, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.037065052950075644, |
|
"grad_norm": 5.379306793212891, |
|
"learning_rate": 7.386934673366834e-07, |
|
"loss": 0.1971, |
|
"num_input_tokens_seen": 308281344, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.03731719616742309, |
|
"grad_norm": 4.08986234664917, |
|
"learning_rate": 7.437185929648241e-07, |
|
"loss": 0.1319, |
|
"num_input_tokens_seen": 310378496, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.03756933938477055, |
|
"grad_norm": 8.005187034606934, |
|
"learning_rate": 7.487437185929648e-07, |
|
"loss": 0.3227, |
|
"num_input_tokens_seen": 312475648, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.037821482602118005, |
|
"grad_norm": 6.485504627227783, |
|
"learning_rate": 7.537688442211055e-07, |
|
"loss": 0.4005, |
|
"num_input_tokens_seen": 314572800, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.038073625819465454, |
|
"grad_norm": 7.763909339904785, |
|
"learning_rate": 7.587939698492461e-07, |
|
"loss": 0.3537, |
|
"num_input_tokens_seen": 316669952, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.03832576903681291, |
|
"grad_norm": 5.093461036682129, |
|
"learning_rate": 7.638190954773868e-07, |
|
"loss": 0.1321, |
|
"num_input_tokens_seen": 318767104, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.038577912254160365, |
|
"grad_norm": 4.274379730224609, |
|
"learning_rate": 7.688442211055276e-07, |
|
"loss": 0.1623, |
|
"num_input_tokens_seen": 320864256, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.038830055471507814, |
|
"grad_norm": 5.359605312347412, |
|
"learning_rate": 7.738693467336684e-07, |
|
"loss": 0.2337, |
|
"num_input_tokens_seen": 322961408, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.03908219868885527, |
|
"grad_norm": 5.039738655090332, |
|
"learning_rate": 7.788944723618091e-07, |
|
"loss": 0.2028, |
|
"num_input_tokens_seen": 325058560, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.039334341906202726, |
|
"grad_norm": 5.888302326202393, |
|
"learning_rate": 7.839195979899497e-07, |
|
"loss": 0.1418, |
|
"num_input_tokens_seen": 327155712, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.039586485123550175, |
|
"grad_norm": 5.222049236297607, |
|
"learning_rate": 7.889447236180904e-07, |
|
"loss": 0.1474, |
|
"num_input_tokens_seen": 329252864, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.03983862834089763, |
|
"grad_norm": 5.662126064300537, |
|
"learning_rate": 7.939698492462311e-07, |
|
"loss": 0.2008, |
|
"num_input_tokens_seen": 331350016, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.040090771558245086, |
|
"grad_norm": 4.854446887969971, |
|
"learning_rate": 7.989949748743719e-07, |
|
"loss": 0.1373, |
|
"num_input_tokens_seen": 333447168, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.040342914775592535, |
|
"grad_norm": 5.8150177001953125, |
|
"learning_rate": 8.040201005025126e-07, |
|
"loss": 0.2512, |
|
"num_input_tokens_seen": 335544320, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.04059505799293999, |
|
"grad_norm": 5.4808526039123535, |
|
"learning_rate": 8.090452261306532e-07, |
|
"loss": 0.1379, |
|
"num_input_tokens_seen": 337641472, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.04084720121028744, |
|
"grad_norm": 5.683319091796875, |
|
"learning_rate": 8.140703517587939e-07, |
|
"loss": 0.2061, |
|
"num_input_tokens_seen": 339738624, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.041099344427634896, |
|
"grad_norm": 5.919990062713623, |
|
"learning_rate": 8.190954773869346e-07, |
|
"loss": 0.2115, |
|
"num_input_tokens_seen": 341835776, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.04135148764498235, |
|
"grad_norm": 4.193869113922119, |
|
"learning_rate": 8.241206030150753e-07, |
|
"loss": 0.1766, |
|
"num_input_tokens_seen": 343932928, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.0416036308623298, |
|
"grad_norm": 4.4601945877075195, |
|
"learning_rate": 8.291457286432161e-07, |
|
"loss": 0.1939, |
|
"num_input_tokens_seen": 346030080, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.041855774079677256, |
|
"grad_norm": 5.21290922164917, |
|
"learning_rate": 8.341708542713567e-07, |
|
"loss": 0.1787, |
|
"num_input_tokens_seen": 348127232, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.04210791729702471, |
|
"grad_norm": 5.489988327026367, |
|
"learning_rate": 8.391959798994974e-07, |
|
"loss": 0.1809, |
|
"num_input_tokens_seen": 350224384, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.04236006051437216, |
|
"grad_norm": 4.026052474975586, |
|
"learning_rate": 8.442211055276381e-07, |
|
"loss": 0.1248, |
|
"num_input_tokens_seen": 352321536, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.04261220373171962, |
|
"grad_norm": 4.203098297119141, |
|
"learning_rate": 8.492462311557788e-07, |
|
"loss": 0.1089, |
|
"num_input_tokens_seen": 354418688, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.04286434694906707, |
|
"grad_norm": 6.0608296394348145, |
|
"learning_rate": 8.542713567839196e-07, |
|
"loss": 0.185, |
|
"num_input_tokens_seen": 356515840, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.04311649016641452, |
|
"grad_norm": 5.297198295593262, |
|
"learning_rate": 8.592964824120602e-07, |
|
"loss": 0.119, |
|
"num_input_tokens_seen": 358612992, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.04336863338376198, |
|
"grad_norm": 4.82717227935791, |
|
"learning_rate": 8.643216080402009e-07, |
|
"loss": 0.1275, |
|
"num_input_tokens_seen": 360710144, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.04362077660110943, |
|
"grad_norm": 7.091985702514648, |
|
"learning_rate": 8.693467336683417e-07, |
|
"loss": 0.3237, |
|
"num_input_tokens_seen": 362807296, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.04387291981845688, |
|
"grad_norm": 4.359028339385986, |
|
"learning_rate": 8.743718592964824e-07, |
|
"loss": 0.1272, |
|
"num_input_tokens_seen": 364904448, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.04412506303580434, |
|
"grad_norm": 4.864053726196289, |
|
"learning_rate": 8.793969849246231e-07, |
|
"loss": 0.2115, |
|
"num_input_tokens_seen": 367001600, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.044377206253151794, |
|
"grad_norm": 4.585638523101807, |
|
"learning_rate": 8.844221105527639e-07, |
|
"loss": 0.1753, |
|
"num_input_tokens_seen": 369098752, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.04462934947049924, |
|
"grad_norm": 6.2548933029174805, |
|
"learning_rate": 8.894472361809045e-07, |
|
"loss": 0.2436, |
|
"num_input_tokens_seen": 371195904, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.0448814926878467, |
|
"grad_norm": 4.619575023651123, |
|
"learning_rate": 8.944723618090452e-07, |
|
"loss": 0.2271, |
|
"num_input_tokens_seen": 373293056, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.04513363590519415, |
|
"grad_norm": 4.505560398101807, |
|
"learning_rate": 8.994974874371859e-07, |
|
"loss": 0.1728, |
|
"num_input_tokens_seen": 375390208, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.0453857791225416, |
|
"grad_norm": 4.657378196716309, |
|
"learning_rate": 9.045226130653266e-07, |
|
"loss": 0.2134, |
|
"num_input_tokens_seen": 377487360, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.04563792233988906, |
|
"grad_norm": 3.5373897552490234, |
|
"learning_rate": 9.095477386934673e-07, |
|
"loss": 0.125, |
|
"num_input_tokens_seen": 379584512, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.04589006555723651, |
|
"grad_norm": 4.476269721984863, |
|
"learning_rate": 9.14572864321608e-07, |
|
"loss": 0.1805, |
|
"num_input_tokens_seen": 381681664, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.046142208774583963, |
|
"grad_norm": 4.5421881675720215, |
|
"learning_rate": 9.195979899497487e-07, |
|
"loss": 0.1296, |
|
"num_input_tokens_seen": 383778816, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.04639435199193142, |
|
"grad_norm": 4.141582012176514, |
|
"learning_rate": 9.246231155778894e-07, |
|
"loss": 0.194, |
|
"num_input_tokens_seen": 385875968, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.04664649520927887, |
|
"grad_norm": 6.524399757385254, |
|
"learning_rate": 9.296482412060301e-07, |
|
"loss": 0.1595, |
|
"num_input_tokens_seen": 387973120, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.046898638426626324, |
|
"grad_norm": 4.473093509674072, |
|
"learning_rate": 9.346733668341708e-07, |
|
"loss": 0.1909, |
|
"num_input_tokens_seen": 390070272, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.04715078164397378, |
|
"grad_norm": 5.006099224090576, |
|
"learning_rate": 9.396984924623114e-07, |
|
"loss": 0.215, |
|
"num_input_tokens_seen": 392167424, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.04740292486132123, |
|
"grad_norm": 4.727731227874756, |
|
"learning_rate": 9.447236180904522e-07, |
|
"loss": 0.1874, |
|
"num_input_tokens_seen": 394264576, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.047655068078668684, |
|
"grad_norm": 4.6576828956604, |
|
"learning_rate": 9.497487437185929e-07, |
|
"loss": 0.1889, |
|
"num_input_tokens_seen": 396361728, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.04790721129601614, |
|
"grad_norm": 4.223318099975586, |
|
"learning_rate": 9.547738693467337e-07, |
|
"loss": 0.1432, |
|
"num_input_tokens_seen": 398458880, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.04815935451336359, |
|
"grad_norm": 3.288745641708374, |
|
"learning_rate": 9.597989949748744e-07, |
|
"loss": 0.1361, |
|
"num_input_tokens_seen": 400556032, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.048411497730711045, |
|
"grad_norm": 4.024937629699707, |
|
"learning_rate": 9.64824120603015e-07, |
|
"loss": 0.1285, |
|
"num_input_tokens_seen": 402653184, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.048663640948058494, |
|
"grad_norm": 4.060795783996582, |
|
"learning_rate": 9.698492462311556e-07, |
|
"loss": 0.1472, |
|
"num_input_tokens_seen": 404750336, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.04891578416540595, |
|
"grad_norm": 5.01156759262085, |
|
"learning_rate": 9.748743718592964e-07, |
|
"loss": 0.2541, |
|
"num_input_tokens_seen": 406847488, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.049167927382753406, |
|
"grad_norm": 3.8259568214416504, |
|
"learning_rate": 9.79899497487437e-07, |
|
"loss": 0.176, |
|
"num_input_tokens_seen": 408944640, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.049420070600100854, |
|
"grad_norm": 4.526422500610352, |
|
"learning_rate": 9.849246231155778e-07, |
|
"loss": 0.2161, |
|
"num_input_tokens_seen": 411041792, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.04967221381744831, |
|
"grad_norm": 4.0646867752075195, |
|
"learning_rate": 9.899497487437185e-07, |
|
"loss": 0.1361, |
|
"num_input_tokens_seen": 413138944, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.049924357034795766, |
|
"grad_norm": 4.822361946105957, |
|
"learning_rate": 9.949748743718592e-07, |
|
"loss": 0.1678, |
|
"num_input_tokens_seen": 415236096, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.050176500252143215, |
|
"grad_norm": 5.335970878601074, |
|
"learning_rate": 1e-06, |
|
"loss": 0.138, |
|
"num_input_tokens_seen": 417333248, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.05042864346949067, |
|
"grad_norm": 4.283322811126709, |
|
"learning_rate": 9.999998435084117e-07, |
|
"loss": 0.1599, |
|
"num_input_tokens_seen": 419430400, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.05068078668683813, |
|
"grad_norm": 3.6955955028533936, |
|
"learning_rate": 9.999993740337564e-07, |
|
"loss": 0.1203, |
|
"num_input_tokens_seen": 421527552, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.050932929904185575, |
|
"grad_norm": 4.380987167358398, |
|
"learning_rate": 9.999985915763598e-07, |
|
"loss": 0.2069, |
|
"num_input_tokens_seen": 423624704, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.05118507312153303, |
|
"grad_norm": 3.827716588973999, |
|
"learning_rate": 9.999974961367668e-07, |
|
"loss": 0.1987, |
|
"num_input_tokens_seen": 425721856, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.05143721633888049, |
|
"grad_norm": 3.8995583057403564, |
|
"learning_rate": 9.999960877157389e-07, |
|
"loss": 0.1473, |
|
"num_input_tokens_seen": 427819008, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.051689359556227936, |
|
"grad_norm": 3.6740832328796387, |
|
"learning_rate": 9.99994366314256e-07, |
|
"loss": 0.1348, |
|
"num_input_tokens_seen": 429916160, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.05194150277357539, |
|
"grad_norm": 3.7553346157073975, |
|
"learning_rate": 9.99992331933515e-07, |
|
"loss": 0.1463, |
|
"num_input_tokens_seen": 432013312, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.05219364599092285, |
|
"grad_norm": 4.992524147033691, |
|
"learning_rate": 9.99989984574931e-07, |
|
"loss": 0.2349, |
|
"num_input_tokens_seen": 434110464, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.052445789208270296, |
|
"grad_norm": 4.383981704711914, |
|
"learning_rate": 9.99987324240137e-07, |
|
"loss": 0.1552, |
|
"num_input_tokens_seen": 436207616, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.05269793242561775, |
|
"grad_norm": 4.6292619705200195, |
|
"learning_rate": 9.999843509309827e-07, |
|
"loss": 0.1998, |
|
"num_input_tokens_seen": 438304768, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.0529500756429652, |
|
"grad_norm": 3.5693604946136475, |
|
"learning_rate": 9.999810646495363e-07, |
|
"loss": 0.1409, |
|
"num_input_tokens_seen": 440401920, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.05320221886031266, |
|
"grad_norm": 4.460555553436279, |
|
"learning_rate": 9.999774653980837e-07, |
|
"loss": 0.2005, |
|
"num_input_tokens_seen": 442499072, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.05345436207766011, |
|
"grad_norm": 3.6692800521850586, |
|
"learning_rate": 9.99973553179128e-07, |
|
"loss": 0.1358, |
|
"num_input_tokens_seen": 444596224, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.05370650529500756, |
|
"grad_norm": 3.4849557876586914, |
|
"learning_rate": 9.999693279953903e-07, |
|
"loss": 0.1199, |
|
"num_input_tokens_seen": 446693376, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.05395864851235502, |
|
"grad_norm": 3.9747097492218018, |
|
"learning_rate": 9.999647898498095e-07, |
|
"loss": 0.1885, |
|
"num_input_tokens_seen": 448790528, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.05421079172970247, |
|
"grad_norm": 4.172543525695801, |
|
"learning_rate": 9.999599387455416e-07, |
|
"loss": 0.2118, |
|
"num_input_tokens_seen": 450887680, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.05446293494704992, |
|
"grad_norm": 3.811913013458252, |
|
"learning_rate": 9.999547746859607e-07, |
|
"loss": 0.1973, |
|
"num_input_tokens_seen": 452984832, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.05471507816439738, |
|
"grad_norm": 3.7271082401275635, |
|
"learning_rate": 9.999492976746585e-07, |
|
"loss": 0.2219, |
|
"num_input_tokens_seen": 455081984, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.054967221381744834, |
|
"grad_norm": 4.112778186798096, |
|
"learning_rate": 9.999435077154446e-07, |
|
"loss": 0.1748, |
|
"num_input_tokens_seen": 457179136, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.05521936459909228, |
|
"grad_norm": 6.517294883728027, |
|
"learning_rate": 9.99937404812346e-07, |
|
"loss": 0.3107, |
|
"num_input_tokens_seen": 459276288, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.05547150781643974, |
|
"grad_norm": 4.02686071395874, |
|
"learning_rate": 9.99930988969607e-07, |
|
"loss": 0.0861, |
|
"num_input_tokens_seen": 461373440, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.055723651033787194, |
|
"grad_norm": 3.6635353565216064, |
|
"learning_rate": 9.999242601916902e-07, |
|
"loss": 0.2132, |
|
"num_input_tokens_seen": 463470592, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.05597579425113464, |
|
"grad_norm": 4.417490005493164, |
|
"learning_rate": 9.999172184832756e-07, |
|
"loss": 0.2374, |
|
"num_input_tokens_seen": 465567744, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.0562279374684821, |
|
"grad_norm": 3.173140048980713, |
|
"learning_rate": 9.99909863849261e-07, |
|
"loss": 0.1771, |
|
"num_input_tokens_seen": 467664896, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.05648008068582955, |
|
"grad_norm": 5.276343822479248, |
|
"learning_rate": 9.999021962947612e-07, |
|
"loss": 0.1569, |
|
"num_input_tokens_seen": 469762048, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.056732223903177004, |
|
"grad_norm": 4.241299629211426, |
|
"learning_rate": 9.998942158251096e-07, |
|
"loss": 0.2738, |
|
"num_input_tokens_seen": 471859200, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.05698436712052446, |
|
"grad_norm": 4.36360502243042, |
|
"learning_rate": 9.998859224458565e-07, |
|
"loss": 0.2735, |
|
"num_input_tokens_seen": 473956352, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.05723651033787191, |
|
"grad_norm": 5.051778316497803, |
|
"learning_rate": 9.998773161627701e-07, |
|
"loss": 0.1831, |
|
"num_input_tokens_seen": 476053504, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.057488653555219364, |
|
"grad_norm": 3.883115291595459, |
|
"learning_rate": 9.998683969818364e-07, |
|
"loss": 0.1617, |
|
"num_input_tokens_seen": 478150656, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.05774079677256682, |
|
"grad_norm": 3.9679079055786133, |
|
"learning_rate": 9.998591649092588e-07, |
|
"loss": 0.1273, |
|
"num_input_tokens_seen": 480247808, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.05799293998991427, |
|
"grad_norm": 6.0246901512146, |
|
"learning_rate": 9.998496199514582e-07, |
|
"loss": 0.1463, |
|
"num_input_tokens_seen": 482344960, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.058245083207261725, |
|
"grad_norm": 3.684004545211792, |
|
"learning_rate": 9.998397621150734e-07, |
|
"loss": 0.1422, |
|
"num_input_tokens_seen": 484442112, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.05849722642460918, |
|
"grad_norm": 5.111332416534424, |
|
"learning_rate": 9.998295914069606e-07, |
|
"loss": 0.2197, |
|
"num_input_tokens_seen": 486539264, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.05874936964195663, |
|
"grad_norm": 3.0218448638916016, |
|
"learning_rate": 9.99819107834194e-07, |
|
"loss": 0.1219, |
|
"num_input_tokens_seen": 488636416, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.059001512859304085, |
|
"grad_norm": 3.564114570617676, |
|
"learning_rate": 9.99808311404065e-07, |
|
"loss": 0.1983, |
|
"num_input_tokens_seen": 490733568, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.05925365607665154, |
|
"grad_norm": 6.091875076293945, |
|
"learning_rate": 9.997972021240824e-07, |
|
"loss": 0.2782, |
|
"num_input_tokens_seen": 492830720, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.05950579929399899, |
|
"grad_norm": 4.984955787658691, |
|
"learning_rate": 9.997857800019734e-07, |
|
"loss": 0.2658, |
|
"num_input_tokens_seen": 494927872, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.059757942511346446, |
|
"grad_norm": 4.2022705078125, |
|
"learning_rate": 9.997740450456819e-07, |
|
"loss": 0.1511, |
|
"num_input_tokens_seen": 497025024, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.0600100857286939, |
|
"grad_norm": 4.631911277770996, |
|
"learning_rate": 9.997619972633701e-07, |
|
"loss": 0.1874, |
|
"num_input_tokens_seen": 499122176, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.06026222894604135, |
|
"grad_norm": 3.489034414291382, |
|
"learning_rate": 9.99749636663417e-07, |
|
"loss": 0.1684, |
|
"num_input_tokens_seen": 501219328, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.060514372163388806, |
|
"grad_norm": 5.1144185066223145, |
|
"learning_rate": 9.997369632544202e-07, |
|
"loss": 0.1834, |
|
"num_input_tokens_seen": 503316480, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.060766515380736255, |
|
"grad_norm": 5.526945114135742, |
|
"learning_rate": 9.997239770451938e-07, |
|
"loss": 0.2135, |
|
"num_input_tokens_seen": 505413632, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.06101865859808371, |
|
"grad_norm": 6.000234127044678, |
|
"learning_rate": 9.997106780447705e-07, |
|
"loss": 0.2248, |
|
"num_input_tokens_seen": 507510784, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.06127080181543117, |
|
"grad_norm": 3.4181573390960693, |
|
"learning_rate": 9.99697066262399e-07, |
|
"loss": 0.0903, |
|
"num_input_tokens_seen": 509607936, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.061522945032778616, |
|
"grad_norm": 3.6254003047943115, |
|
"learning_rate": 9.996831417075477e-07, |
|
"loss": 0.1507, |
|
"num_input_tokens_seen": 511705088, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.06177508825012607, |
|
"grad_norm": 3.7657456398010254, |
|
"learning_rate": 9.996689043899005e-07, |
|
"loss": 0.1569, |
|
"num_input_tokens_seen": 513802240, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.06202723146747353, |
|
"grad_norm": 4.642493724822998, |
|
"learning_rate": 9.996543543193604e-07, |
|
"loss": 0.1187, |
|
"num_input_tokens_seen": 515899392, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.062279374684820976, |
|
"grad_norm": 3.632336378097534, |
|
"learning_rate": 9.996394915060468e-07, |
|
"loss": 0.1736, |
|
"num_input_tokens_seen": 517996544, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.06253151790216843, |
|
"grad_norm": 4.491301536560059, |
|
"learning_rate": 9.99624315960297e-07, |
|
"loss": 0.2351, |
|
"num_input_tokens_seen": 520093696, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.06278366111951589, |
|
"grad_norm": 2.526890277862549, |
|
"learning_rate": 9.996088276926661e-07, |
|
"loss": 0.1088, |
|
"num_input_tokens_seen": 522190848, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.06303580433686334, |
|
"grad_norm": 4.402822971343994, |
|
"learning_rate": 9.995930267139266e-07, |
|
"loss": 0.1189, |
|
"num_input_tokens_seen": 524288000, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.06328794755421079, |
|
"grad_norm": 2.893916368484497, |
|
"learning_rate": 9.99576913035068e-07, |
|
"loss": 0.1003, |
|
"num_input_tokens_seen": 526385152, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.06354009077155824, |
|
"grad_norm": 4.437779426574707, |
|
"learning_rate": 9.995604866672978e-07, |
|
"loss": 0.21, |
|
"num_input_tokens_seen": 528482304, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.0637922339889057, |
|
"grad_norm": 7.890944957733154, |
|
"learning_rate": 9.995437476220408e-07, |
|
"loss": 0.3668, |
|
"num_input_tokens_seen": 530579456, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.06404437720625315, |
|
"grad_norm": 3.5893633365631104, |
|
"learning_rate": 9.995266959109396e-07, |
|
"loss": 0.1771, |
|
"num_input_tokens_seen": 532676608, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.06429652042360061, |
|
"grad_norm": 4.691050052642822, |
|
"learning_rate": 9.995093315458534e-07, |
|
"loss": 0.1696, |
|
"num_input_tokens_seen": 534773760, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.06454866364094806, |
|
"grad_norm": 2.8213396072387695, |
|
"learning_rate": 9.9949165453886e-07, |
|
"loss": 0.1364, |
|
"num_input_tokens_seen": 536870912, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.0648008068582955, |
|
"grad_norm": 4.529366493225098, |
|
"learning_rate": 9.994736649022539e-07, |
|
"loss": 0.1749, |
|
"num_input_tokens_seen": 538968064, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.06505295007564296, |
|
"grad_norm": 3.919793128967285, |
|
"learning_rate": 9.99455362648547e-07, |
|
"loss": 0.1611, |
|
"num_input_tokens_seen": 541065216, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.06530509329299042, |
|
"grad_norm": 4.9372711181640625, |
|
"learning_rate": 9.994367477904695e-07, |
|
"loss": 0.2556, |
|
"num_input_tokens_seen": 543162368, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.06555723651033787, |
|
"grad_norm": 5.533105850219727, |
|
"learning_rate": 9.994178203409674e-07, |
|
"loss": 0.1598, |
|
"num_input_tokens_seen": 545259520, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.06580937972768533, |
|
"grad_norm": 4.164669990539551, |
|
"learning_rate": 9.993985803132057e-07, |
|
"loss": 0.1743, |
|
"num_input_tokens_seen": 547356672, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.06606152294503277, |
|
"grad_norm": 3.924823045730591, |
|
"learning_rate": 9.993790277205662e-07, |
|
"loss": 0.169, |
|
"num_input_tokens_seen": 549453824, |
|
"step": 262 |
|
}, |
|
    {
      "epoch": 0.06631366616238023,
      "grad_norm": 3.045861005783081,
      "learning_rate": 9.993591625766477e-07,
      "loss": 0.1027,
      "num_input_tokens_seen": 551550976,
      "step": 263
    },
    {
      "epoch": 0.06656580937972768,
      "grad_norm": 2.7366058826446533,
      "learning_rate": 9.993389848952673e-07,
      "loss": 0.1027,
      "num_input_tokens_seen": 553648128,
      "step": 264
    },
    {
      "epoch": 0.06681795259707514,
      "grad_norm": 4.305903434753418,
      "learning_rate": 9.993184946904586e-07,
      "loss": 0.0899,
      "num_input_tokens_seen": 555745280,
      "step": 265
    },
    {
      "epoch": 0.0670700958144226,
      "grad_norm": 4.169579029083252,
      "learning_rate": 9.992976919764728e-07,
      "loss": 0.1555,
      "num_input_tokens_seen": 557842432,
      "step": 266
    },
    {
      "epoch": 0.06732223903177005,
      "grad_norm": 2.866806983947754,
      "learning_rate": 9.992765767677789e-07,
      "loss": 0.1226,
      "num_input_tokens_seen": 559939584,
      "step": 267
    },
    {
      "epoch": 0.06757438224911749,
      "grad_norm": 3.6884562969207764,
      "learning_rate": 9.992551490790626e-07,
      "loss": 0.1359,
      "num_input_tokens_seen": 562036736,
      "step": 268
    },
    {
      "epoch": 0.06782652546646495,
      "grad_norm": 4.731523513793945,
      "learning_rate": 9.992334089252278e-07,
      "loss": 0.1438,
      "num_input_tokens_seen": 564133888,
      "step": 269
    },
    {
      "epoch": 0.0680786686838124,
      "grad_norm": 3.90913724899292,
      "learning_rate": 9.992113563213944e-07,
      "loss": 0.1596,
      "num_input_tokens_seen": 566231040,
      "step": 270
    },
    {
      "epoch": 0.06833081190115986,
      "grad_norm": 3.4404547214508057,
      "learning_rate": 9.99188991282901e-07,
      "loss": 0.165,
      "num_input_tokens_seen": 568328192,
      "step": 271
    },
    {
      "epoch": 0.06858295511850732,
      "grad_norm": 2.840576648712158,
      "learning_rate": 9.991663138253025e-07,
      "loss": 0.109,
      "num_input_tokens_seen": 570425344,
      "step": 272
    },
    {
      "epoch": 0.06883509833585477,
      "grad_norm": 4.362993240356445,
      "learning_rate": 9.991433239643716e-07,
      "loss": 0.209,
      "num_input_tokens_seen": 572522496,
      "step": 273
    },
    {
      "epoch": 0.06908724155320221,
      "grad_norm": 4.26267671585083,
      "learning_rate": 9.991200217160984e-07,
      "loss": 0.0746,
      "num_input_tokens_seen": 574619648,
      "step": 274
    },
    {
      "epoch": 0.06933938477054967,
      "grad_norm": 3.7214324474334717,
      "learning_rate": 9.990964070966895e-07,
      "loss": 0.1395,
      "num_input_tokens_seen": 576716800,
      "step": 275
    },
    {
      "epoch": 0.06959152798789713,
      "grad_norm": 4.263853549957275,
      "learning_rate": 9.9907248012257e-07,
      "loss": 0.1919,
      "num_input_tokens_seen": 578813952,
      "step": 276
    },
    {
      "epoch": 0.06984367120524458,
      "grad_norm": 3.7660653591156006,
      "learning_rate": 9.99048240810381e-07,
      "loss": 0.1362,
      "num_input_tokens_seen": 580911104,
      "step": 277
    },
    {
      "epoch": 0.07009581442259204,
      "grad_norm": 3.3318731784820557,
      "learning_rate": 9.990236891769818e-07,
      "loss": 0.0849,
      "num_input_tokens_seen": 583008256,
      "step": 278
    },
    {
      "epoch": 0.07034795763993948,
      "grad_norm": 3.9983317852020264,
      "learning_rate": 9.98998825239448e-07,
      "loss": 0.1731,
      "num_input_tokens_seen": 585105408,
      "step": 279
    },
    {
      "epoch": 0.07060010085728693,
      "grad_norm": 3.032134532928467,
      "learning_rate": 9.98973649015073e-07,
      "loss": 0.1278,
      "num_input_tokens_seen": 587202560,
      "step": 280
    },
    {
      "epoch": 0.07085224407463439,
      "grad_norm": 3.8470921516418457,
      "learning_rate": 9.98948160521368e-07,
      "loss": 0.103,
      "num_input_tokens_seen": 589299712,
      "step": 281
    },
    {
      "epoch": 0.07110438729198185,
      "grad_norm": 2.935425043106079,
      "learning_rate": 9.989223597760598e-07,
      "loss": 0.1472,
      "num_input_tokens_seen": 591396864,
      "step": 282
    },
    {
      "epoch": 0.0713565305093293,
      "grad_norm": 3.791640043258667,
      "learning_rate": 9.988962467970938e-07,
      "loss": 0.1743,
      "num_input_tokens_seen": 593494016,
      "step": 283
    },
    {
      "epoch": 0.07160867372667676,
      "grad_norm": 2.616250991821289,
      "learning_rate": 9.988698216026322e-07,
      "loss": 0.0769,
      "num_input_tokens_seen": 595591168,
      "step": 284
    },
    {
      "epoch": 0.0718608169440242,
      "grad_norm": 3.309394359588623,
      "learning_rate": 9.988430842110538e-07,
      "loss": 0.1357,
      "num_input_tokens_seen": 597688320,
      "step": 285
    },
    {
      "epoch": 0.07211296016137166,
      "grad_norm": 4.600468635559082,
      "learning_rate": 9.988160346409551e-07,
      "loss": 0.1178,
      "num_input_tokens_seen": 599785472,
      "step": 286
    },
    {
      "epoch": 0.07236510337871911,
      "grad_norm": 3.2695717811584473,
      "learning_rate": 9.987886729111496e-07,
      "loss": 0.1122,
      "num_input_tokens_seen": 601882624,
      "step": 287
    },
    {
      "epoch": 0.07261724659606657,
      "grad_norm": 2.7870922088623047,
      "learning_rate": 9.98760999040668e-07,
      "loss": 0.0995,
      "num_input_tokens_seen": 603979776,
      "step": 288
    },
    {
      "epoch": 0.07286938981341402,
      "grad_norm": 3.2872393131256104,
      "learning_rate": 9.987330130487576e-07,
      "loss": 0.1314,
      "num_input_tokens_seen": 606076928,
      "step": 289
    },
    {
      "epoch": 0.07312153303076148,
      "grad_norm": 4.210444927215576,
      "learning_rate": 9.987047149548833e-07,
      "loss": 0.1435,
      "num_input_tokens_seen": 608174080,
      "step": 290
    },
    {
      "epoch": 0.07337367624810892,
      "grad_norm": 3.661651372909546,
      "learning_rate": 9.986761047787274e-07,
      "loss": 0.1075,
      "num_input_tokens_seen": 610271232,
      "step": 291
    },
    {
      "epoch": 0.07362581946545638,
      "grad_norm": 4.133707046508789,
      "learning_rate": 9.986471825401882e-07,
      "loss": 0.1977,
      "num_input_tokens_seen": 612368384,
      "step": 292
    },
    {
      "epoch": 0.07387796268280383,
      "grad_norm": 4.6356072425842285,
      "learning_rate": 9.98617948259382e-07,
      "loss": 0.1703,
      "num_input_tokens_seen": 614465536,
      "step": 293
    },
    {
      "epoch": 0.07413010590015129,
      "grad_norm": 3.9383256435394287,
      "learning_rate": 9.985884019566416e-07,
      "loss": 0.1848,
      "num_input_tokens_seen": 616562688,
      "step": 294
    },
    {
      "epoch": 0.07438224911749874,
      "grad_norm": 4.793269157409668,
      "learning_rate": 9.985585436525168e-07,
      "loss": 0.1488,
      "num_input_tokens_seen": 618659840,
      "step": 295
    },
    {
      "epoch": 0.07463439233484619,
      "grad_norm": 6.518699645996094,
      "learning_rate": 9.98528373367775e-07,
      "loss": 0.24,
      "num_input_tokens_seen": 620756992,
      "step": 296
    },
    {
      "epoch": 0.07488653555219364,
      "grad_norm": 3.71830415725708,
      "learning_rate": 9.984978911234003e-07,
      "loss": 0.1444,
      "num_input_tokens_seen": 622854144,
      "step": 297
    },
    {
      "epoch": 0.0751386787695411,
      "grad_norm": 3.535399913787842,
      "learning_rate": 9.984670969405932e-07,
      "loss": 0.145,
      "num_input_tokens_seen": 624951296,
      "step": 298
    },
    {
      "epoch": 0.07539082198688855,
      "grad_norm": 2.5828938484191895,
      "learning_rate": 9.984359908407716e-07,
      "loss": 0.1091,
      "num_input_tokens_seen": 627048448,
      "step": 299
    },
    {
      "epoch": 0.07564296520423601,
      "grad_norm": 3.900514841079712,
      "learning_rate": 9.984045728455707e-07,
      "loss": 0.1672,
      "num_input_tokens_seen": 629145600,
      "step": 300
    },
    {
      "epoch": 0.07589510842158347,
      "grad_norm": 4.364770412445068,
      "learning_rate": 9.98372842976842e-07,
      "loss": 0.2678,
      "num_input_tokens_seen": 631242752,
      "step": 301
    },
    {
      "epoch": 0.07614725163893091,
      "grad_norm": 3.6578245162963867,
      "learning_rate": 9.983408012566545e-07,
      "loss": 0.1238,
      "num_input_tokens_seen": 633339904,
      "step": 302
    },
    {
      "epoch": 0.07639939485627836,
      "grad_norm": 3.067723512649536,
      "learning_rate": 9.983084477072936e-07,
      "loss": 0.092,
      "num_input_tokens_seen": 635437056,
      "step": 303
    },
    {
      "epoch": 0.07665153807362582,
      "grad_norm": 2.8249781131744385,
      "learning_rate": 9.982757823512619e-07,
      "loss": 0.1065,
      "num_input_tokens_seen": 637534208,
      "step": 304
    },
    {
      "epoch": 0.07690368129097327,
      "grad_norm": 3.4561619758605957,
      "learning_rate": 9.982428052112784e-07,
      "loss": 0.1463,
      "num_input_tokens_seen": 639631360,
      "step": 305
    },
    {
      "epoch": 0.07715582450832073,
      "grad_norm": 4.192049503326416,
      "learning_rate": 9.982095163102796e-07,
      "loss": 0.1127,
      "num_input_tokens_seen": 641728512,
      "step": 306
    },
    {
      "epoch": 0.07740796772566819,
      "grad_norm": 2.888293743133545,
      "learning_rate": 9.981759156714185e-07,
      "loss": 0.113,
      "num_input_tokens_seen": 643825664,
      "step": 307
    },
    {
      "epoch": 0.07766011094301563,
      "grad_norm": 3.8195247650146484,
      "learning_rate": 9.981420033180651e-07,
      "loss": 0.1601,
      "num_input_tokens_seen": 645922816,
      "step": 308
    },
    {
      "epoch": 0.07791225416036308,
      "grad_norm": 3.721971035003662,
      "learning_rate": 9.98107779273806e-07,
      "loss": 0.1443,
      "num_input_tokens_seen": 648019968,
      "step": 309
    },
    {
      "epoch": 0.07816439737771054,
      "grad_norm": 3.4332494735717773,
      "learning_rate": 9.980732435624441e-07,
      "loss": 0.1503,
      "num_input_tokens_seen": 650117120,
      "step": 310
    },
    {
      "epoch": 0.078416540595058,
      "grad_norm": 2.9033710956573486,
      "learning_rate": 9.980383962080003e-07,
      "loss": 0.073,
      "num_input_tokens_seen": 652214272,
      "step": 311
    },
    {
      "epoch": 0.07866868381240545,
      "grad_norm": 3.597287178039551,
      "learning_rate": 9.980032372347116e-07,
      "loss": 0.1596,
      "num_input_tokens_seen": 654311424,
      "step": 312
    },
    {
      "epoch": 0.0789208270297529,
      "grad_norm": 3.0851659774780273,
      "learning_rate": 9.97967766667031e-07,
      "loss": 0.1188,
      "num_input_tokens_seen": 656408576,
      "step": 313
    },
    {
      "epoch": 0.07917297024710035,
      "grad_norm": 2.279250144958496,
      "learning_rate": 9.979319845296296e-07,
      "loss": 0.0974,
      "num_input_tokens_seen": 658505728,
      "step": 314
    },
    {
      "epoch": 0.0794251134644478,
      "grad_norm": 4.360164165496826,
      "learning_rate": 9.978958908473941e-07,
      "loss": 0.1992,
      "num_input_tokens_seen": 660602880,
      "step": 315
    },
    {
      "epoch": 0.07967725668179526,
      "grad_norm": 2.8060495853424072,
      "learning_rate": 9.978594856454288e-07,
      "loss": 0.1314,
      "num_input_tokens_seen": 662700032,
      "step": 316
    },
    {
      "epoch": 0.07992939989914272,
      "grad_norm": 4.089578628540039,
      "learning_rate": 9.978227689490536e-07,
      "loss": 0.1807,
      "num_input_tokens_seen": 664797184,
      "step": 317
    },
    {
      "epoch": 0.08018154311649017,
      "grad_norm": 3.043846368789673,
      "learning_rate": 9.977857407838061e-07,
      "loss": 0.1208,
      "num_input_tokens_seen": 666894336,
      "step": 318
    },
    {
      "epoch": 0.08043368633383761,
      "grad_norm": 2.2600390911102295,
      "learning_rate": 9.9774840117544e-07,
      "loss": 0.076,
      "num_input_tokens_seen": 668991488,
      "step": 319
    },
    {
      "epoch": 0.08068582955118507,
      "grad_norm": 3.115410089492798,
      "learning_rate": 9.977107501499253e-07,
      "loss": 0.1118,
      "num_input_tokens_seen": 671088640,
      "step": 320
    },
    {
      "epoch": 0.08093797276853253,
      "grad_norm": 3.720118761062622,
      "learning_rate": 9.976727877334493e-07,
      "loss": 0.1518,
      "num_input_tokens_seen": 673185792,
      "step": 321
    },
    {
      "epoch": 0.08119011598587998,
      "grad_norm": 3.6921238899230957,
      "learning_rate": 9.976345139524152e-07,
      "loss": 0.1261,
      "num_input_tokens_seen": 675282944,
      "step": 322
    },
    {
      "epoch": 0.08144225920322744,
      "grad_norm": 3.162914752960205,
      "learning_rate": 9.975959288334438e-07,
      "loss": 0.1038,
      "num_input_tokens_seen": 677380096,
      "step": 323
    },
    {
      "epoch": 0.08169440242057488,
      "grad_norm": 3.166231870651245,
      "learning_rate": 9.97557032403371e-07,
      "loss": 0.1294,
      "num_input_tokens_seen": 679477248,
      "step": 324
    },
    {
      "epoch": 0.08194654563792234,
      "grad_norm": 3.0747804641723633,
      "learning_rate": 9.975178246892507e-07,
      "loss": 0.1425,
      "num_input_tokens_seen": 681574400,
      "step": 325
    },
    {
      "epoch": 0.08219868885526979,
      "grad_norm": 3.0979673862457275,
      "learning_rate": 9.974783057183519e-07,
      "loss": 0.1586,
      "num_input_tokens_seen": 683671552,
      "step": 326
    },
    {
      "epoch": 0.08245083207261725,
      "grad_norm": 4.019197940826416,
      "learning_rate": 9.974384755181609e-07,
      "loss": 0.1663,
      "num_input_tokens_seen": 685768704,
      "step": 327
    },
    {
      "epoch": 0.0827029752899647,
      "grad_norm": 2.6061339378356934,
      "learning_rate": 9.973983341163807e-07,
      "loss": 0.0851,
      "num_input_tokens_seen": 687865856,
      "step": 328
    },
    {
      "epoch": 0.08295511850731216,
      "grad_norm": 3.0148558616638184,
      "learning_rate": 9.9735788154093e-07,
      "loss": 0.0966,
      "num_input_tokens_seen": 689963008,
      "step": 329
    },
    {
      "epoch": 0.0832072617246596,
      "grad_norm": 2.6705162525177,
      "learning_rate": 9.973171178199447e-07,
      "loss": 0.0839,
      "num_input_tokens_seen": 692060160,
      "step": 330
    },
    {
      "epoch": 0.08345940494200706,
      "grad_norm": 4.910850524902344,
      "learning_rate": 9.972760429817763e-07,
      "loss": 0.1695,
      "num_input_tokens_seen": 694157312,
      "step": 331
    },
    {
      "epoch": 0.08371154815935451,
      "grad_norm": 3.358743190765381,
      "learning_rate": 9.972346570549932e-07,
      "loss": 0.0935,
      "num_input_tokens_seen": 696254464,
      "step": 332
    },
    {
      "epoch": 0.08396369137670197,
      "grad_norm": 3.214064598083496,
      "learning_rate": 9.971929600683802e-07,
      "loss": 0.0848,
      "num_input_tokens_seen": 698351616,
      "step": 333
    },
    {
      "epoch": 0.08421583459404942,
      "grad_norm": 4.408289432525635,
      "learning_rate": 9.971509520509381e-07,
      "loss": 0.1624,
      "num_input_tokens_seen": 700448768,
      "step": 334
    },
    {
      "epoch": 0.08446797781139688,
      "grad_norm": 4.276678085327148,
      "learning_rate": 9.971086330318845e-07,
      "loss": 0.1458,
      "num_input_tokens_seen": 702545920,
      "step": 335
    },
    {
      "epoch": 0.08472012102874432,
      "grad_norm": 2.518461227416992,
      "learning_rate": 9.97066003040653e-07,
      "loss": 0.0934,
      "num_input_tokens_seen": 704643072,
      "step": 336
    },
    {
      "epoch": 0.08497226424609178,
      "grad_norm": 2.8323476314544678,
      "learning_rate": 9.970230621068932e-07,
      "loss": 0.1324,
      "num_input_tokens_seen": 706740224,
      "step": 337
    },
    {
      "epoch": 0.08522440746343923,
      "grad_norm": 2.8873610496520996,
      "learning_rate": 9.969798102604717e-07,
      "loss": 0.1292,
      "num_input_tokens_seen": 708837376,
      "step": 338
    },
    {
      "epoch": 0.08547655068078669,
      "grad_norm": 2.796959638595581,
      "learning_rate": 9.969362475314708e-07,
      "loss": 0.1086,
      "num_input_tokens_seen": 710934528,
      "step": 339
    },
    {
      "epoch": 0.08572869389813415,
      "grad_norm": 4.745234966278076,
      "learning_rate": 9.968923739501892e-07,
      "loss": 0.2212,
      "num_input_tokens_seen": 713031680,
      "step": 340
    },
    {
      "epoch": 0.08598083711548159,
      "grad_norm": 4.436620235443115,
      "learning_rate": 9.968481895471417e-07,
      "loss": 0.1376,
      "num_input_tokens_seen": 715128832,
      "step": 341
    },
    {
      "epoch": 0.08623298033282904,
      "grad_norm": 4.772200584411621,
      "learning_rate": 9.968036943530592e-07,
      "loss": 0.193,
      "num_input_tokens_seen": 717225984,
      "step": 342
    },
    {
      "epoch": 0.0864851235501765,
      "grad_norm": 3.2390449047088623,
      "learning_rate": 9.967588883988893e-07,
      "loss": 0.0999,
      "num_input_tokens_seen": 719323136,
      "step": 343
    },
    {
      "epoch": 0.08673726676752395,
      "grad_norm": 3.936569929122925,
      "learning_rate": 9.967137717157951e-07,
      "loss": 0.1634,
      "num_input_tokens_seen": 721420288,
      "step": 344
    },
    {
      "epoch": 0.08698940998487141,
      "grad_norm": 3.647679567337036,
      "learning_rate": 9.966683443351564e-07,
      "loss": 0.1798,
      "num_input_tokens_seen": 723517440,
      "step": 345
    },
    {
      "epoch": 0.08724155320221887,
      "grad_norm": 2.8842921257019043,
      "learning_rate": 9.966226062885682e-07,
      "loss": 0.1033,
      "num_input_tokens_seen": 725614592,
      "step": 346
    },
    {
      "epoch": 0.08749369641956631,
      "grad_norm": 6.5264434814453125,
      "learning_rate": 9.965765576078424e-07,
      "loss": 0.2729,
      "num_input_tokens_seen": 727711744,
      "step": 347
    },
    {
      "epoch": 0.08774583963691376,
      "grad_norm": 3.786755084991455,
      "learning_rate": 9.96530198325007e-07,
      "loss": 0.1233,
      "num_input_tokens_seen": 729808896,
      "step": 348
    },
    {
      "epoch": 0.08799798285426122,
      "grad_norm": 3.994030237197876,
      "learning_rate": 9.964835284723052e-07,
      "loss": 0.1229,
      "num_input_tokens_seen": 731906048,
      "step": 349
    },
    {
      "epoch": 0.08825012607160868,
      "grad_norm": 4.352416038513184,
      "learning_rate": 9.96436548082197e-07,
      "loss": 0.1501,
      "num_input_tokens_seen": 734003200,
      "step": 350
    },
    {
      "epoch": 0.08850226928895613,
      "grad_norm": 3.238286018371582,
      "learning_rate": 9.963892571873584e-07,
      "loss": 0.1314,
      "num_input_tokens_seen": 736100352,
      "step": 351
    },
    {
      "epoch": 0.08875441250630359,
      "grad_norm": 2.75301456451416,
      "learning_rate": 9.963416558206806e-07,
      "loss": 0.1137,
      "num_input_tokens_seen": 738197504,
      "step": 352
    },
    {
      "epoch": 0.08900655572365103,
      "grad_norm": 3.3911097049713135,
      "learning_rate": 9.962937440152712e-07,
      "loss": 0.0976,
      "num_input_tokens_seen": 740294656,
      "step": 353
    },
    {
      "epoch": 0.08925869894099848,
      "grad_norm": 2.7000679969787598,
      "learning_rate": 9.962455218044542e-07,
      "loss": 0.063,
      "num_input_tokens_seen": 742391808,
      "step": 354
    },
    {
      "epoch": 0.08951084215834594,
      "grad_norm": 3.3619422912597656,
      "learning_rate": 9.961969892217688e-07,
      "loss": 0.1167,
      "num_input_tokens_seen": 744488960,
      "step": 355
    },
    {
      "epoch": 0.0897629853756934,
      "grad_norm": 2.421957015991211,
      "learning_rate": 9.9614814630097e-07,
      "loss": 0.1184,
      "num_input_tokens_seen": 746586112,
      "step": 356
    },
    {
      "epoch": 0.09001512859304085,
      "grad_norm": 3.2838544845581055,
      "learning_rate": 9.960989930760294e-07,
      "loss": 0.1133,
      "num_input_tokens_seen": 748683264,
      "step": 357
    },
    {
      "epoch": 0.0902672718103883,
      "grad_norm": 4.716813564300537,
      "learning_rate": 9.960495295811337e-07,
      "loss": 0.152,
      "num_input_tokens_seen": 750780416,
      "step": 358
    },
    {
      "epoch": 0.09051941502773575,
      "grad_norm": 3.567866563796997,
      "learning_rate": 9.959997558506857e-07,
      "loss": 0.1348,
      "num_input_tokens_seen": 752877568,
      "step": 359
    },
    {
      "epoch": 0.0907715582450832,
      "grad_norm": 8.155049324035645,
      "learning_rate": 9.959496719193039e-07,
      "loss": 0.1658,
      "num_input_tokens_seen": 754974720,
      "step": 360
    },
    {
      "epoch": 0.09102370146243066,
      "grad_norm": 4.341349124908447,
      "learning_rate": 9.958992778218226e-07,
      "loss": 0.1635,
      "num_input_tokens_seen": 757071872,
      "step": 361
    },
    {
      "epoch": 0.09127584467977812,
      "grad_norm": 4.6380815505981445,
      "learning_rate": 9.95848573593292e-07,
      "loss": 0.1715,
      "num_input_tokens_seen": 759169024,
      "step": 362
    },
    {
      "epoch": 0.09152798789712557,
      "grad_norm": 3.3967676162719727,
      "learning_rate": 9.957975592689774e-07,
      "loss": 0.106,
      "num_input_tokens_seen": 761266176,
      "step": 363
    },
    {
      "epoch": 0.09178013111447302,
      "grad_norm": 2.9890308380126953,
      "learning_rate": 9.957462348843607e-07,
      "loss": 0.1163,
      "num_input_tokens_seen": 763363328,
      "step": 364
    },
    {
      "epoch": 0.09203227433182047,
      "grad_norm": 2.564323663711548,
      "learning_rate": 9.956946004751386e-07,
      "loss": 0.1217,
      "num_input_tokens_seen": 765460480,
      "step": 365
    },
    {
      "epoch": 0.09228441754916793,
      "grad_norm": 4.0984697341918945,
      "learning_rate": 9.956426560772238e-07,
      "loss": 0.1801,
      "num_input_tokens_seen": 767557632,
      "step": 366
    },
    {
      "epoch": 0.09253656076651538,
      "grad_norm": 2.5396645069122314,
      "learning_rate": 9.955904017267444e-07,
      "loss": 0.1272,
      "num_input_tokens_seen": 769654784,
      "step": 367
    },
    {
      "epoch": 0.09278870398386284,
      "grad_norm": 3.0213351249694824,
      "learning_rate": 9.955378374600447e-07,
      "loss": 0.121,
      "num_input_tokens_seen": 771751936,
      "step": 368
    },
    {
      "epoch": 0.09304084720121028,
      "grad_norm": 3.8049328327178955,
      "learning_rate": 9.954849633136839e-07,
      "loss": 0.102,
      "num_input_tokens_seen": 773849088,
      "step": 369
    },
    {
      "epoch": 0.09329299041855774,
      "grad_norm": 3.4090912342071533,
      "learning_rate": 9.95431779324437e-07,
      "loss": 0.1179,
      "num_input_tokens_seen": 775946240,
      "step": 370
    },
    {
      "epoch": 0.09354513363590519,
      "grad_norm": 2.5929131507873535,
      "learning_rate": 9.95378285529294e-07,
      "loss": 0.1106,
      "num_input_tokens_seen": 778043392,
      "step": 371
    },
    {
      "epoch": 0.09379727685325265,
      "grad_norm": 3.6183884143829346,
      "learning_rate": 9.953244819654615e-07,
      "loss": 0.1029,
      "num_input_tokens_seen": 780140544,
      "step": 372
    },
    {
      "epoch": 0.0940494200706001,
      "grad_norm": 3.812199354171753,
      "learning_rate": 9.952703686703604e-07,
      "loss": 0.0838,
      "num_input_tokens_seen": 782237696,
      "step": 373
    },
    {
      "epoch": 0.09430156328794756,
      "grad_norm": 5.054091453552246,
      "learning_rate": 9.952159456816275e-07,
      "loss": 0.2415,
      "num_input_tokens_seen": 784334848,
      "step": 374
    },
    {
      "epoch": 0.094553706505295,
      "grad_norm": 2.739720582962036,
      "learning_rate": 9.951612130371151e-07,
      "loss": 0.1198,
      "num_input_tokens_seen": 786432000,
      "step": 375
    },
    {
      "epoch": 0.09480584972264246,
      "grad_norm": 3.5317635536193848,
      "learning_rate": 9.951061707748907e-07,
      "loss": 0.0951,
      "num_input_tokens_seen": 788529152,
      "step": 376
    },
    {
      "epoch": 0.09505799293998991,
      "grad_norm": 2.7190043926239014,
      "learning_rate": 9.95050818933237e-07,
      "loss": 0.0918,
      "num_input_tokens_seen": 790626304,
      "step": 377
    },
    {
      "epoch": 0.09531013615733737,
      "grad_norm": 2.244220495223999,
      "learning_rate": 9.949951575506528e-07,
      "loss": 0.0987,
      "num_input_tokens_seen": 792723456,
      "step": 378
    },
    {
      "epoch": 0.09556227937468482,
      "grad_norm": 2.4800469875335693,
      "learning_rate": 9.94939186665851e-07,
      "loss": 0.112,
      "num_input_tokens_seen": 794820608,
      "step": 379
    },
    {
      "epoch": 0.09581442259203228,
      "grad_norm": 2.934340238571167,
      "learning_rate": 9.948829063177606e-07,
      "loss": 0.0914,
      "num_input_tokens_seen": 796917760,
      "step": 380
    },
    {
      "epoch": 0.09606656580937972,
      "grad_norm": 4.361299991607666,
      "learning_rate": 9.948263165455256e-07,
      "loss": 0.1366,
      "num_input_tokens_seen": 799014912,
      "step": 381
    },
    {
      "epoch": 0.09631870902672718,
      "grad_norm": 5.58315372467041,
      "learning_rate": 9.947694173885051e-07,
      "loss": 0.1444,
      "num_input_tokens_seen": 801112064,
      "step": 382
    },
    {
      "epoch": 0.09657085224407463,
      "grad_norm": 2.2215416431427,
      "learning_rate": 9.947122088862737e-07,
      "loss": 0.1324,
      "num_input_tokens_seen": 803209216,
      "step": 383
    },
    {
      "epoch": 0.09682299546142209,
      "grad_norm": 3.1041672229766846,
      "learning_rate": 9.946546910786208e-07,
      "loss": 0.1451,
      "num_input_tokens_seen": 805306368,
      "step": 384
    },
    {
      "epoch": 0.09707513867876955,
      "grad_norm": 3.4068877696990967,
      "learning_rate": 9.945968640055513e-07,
      "loss": 0.1318,
      "num_input_tokens_seen": 807403520,
      "step": 385
    },
    {
      "epoch": 0.09732728189611699,
      "grad_norm": 2.2413580417633057,
      "learning_rate": 9.945387277072845e-07,
      "loss": 0.0665,
      "num_input_tokens_seen": 809500672,
      "step": 386
    },
    {
      "epoch": 0.09757942511346444,
      "grad_norm": 2.360349655151367,
      "learning_rate": 9.944802822242558e-07,
      "loss": 0.0752,
      "num_input_tokens_seen": 811597824,
      "step": 387
    },
    {
      "epoch": 0.0978315683308119,
      "grad_norm": 2.0612034797668457,
      "learning_rate": 9.944215275971148e-07,
      "loss": 0.0661,
      "num_input_tokens_seen": 813694976,
      "step": 388
    },
    {
      "epoch": 0.09808371154815936,
      "grad_norm": 2.8129661083221436,
      "learning_rate": 9.943624638667263e-07,
      "loss": 0.0991,
      "num_input_tokens_seen": 815792128,
      "step": 389
    },
    {
      "epoch": 0.09833585476550681,
      "grad_norm": 3.179905891418457,
      "learning_rate": 9.943030910741707e-07,
      "loss": 0.166,
      "num_input_tokens_seen": 817889280,
      "step": 390
    },
    {
      "epoch": 0.09858799798285427,
      "grad_norm": 3.191718816757202,
      "learning_rate": 9.942434092607423e-07,
      "loss": 0.1583,
      "num_input_tokens_seen": 819986432,
      "step": 391
    },
    {
      "epoch": 0.09884014120020171,
      "grad_norm": 2.8753068447113037,
      "learning_rate": 9.941834184679511e-07,
      "loss": 0.1463,
      "num_input_tokens_seen": 822083584,
      "step": 392
    },
    {
      "epoch": 0.09909228441754916,
      "grad_norm": 2.709397315979004,
      "learning_rate": 9.94123118737522e-07,
      "loss": 0.103,
      "num_input_tokens_seen": 824180736,
      "step": 393
    },
    {
      "epoch": 0.09934442763489662,
      "grad_norm": 3.7003681659698486,
      "learning_rate": 9.94062510111394e-07,
      "loss": 0.1539,
      "num_input_tokens_seen": 826277888,
      "step": 394
    },
    {
      "epoch": 0.09959657085224408,
      "grad_norm": 4.4324631690979,
      "learning_rate": 9.94001592631722e-07,
      "loss": 0.1915,
      "num_input_tokens_seen": 828375040,
      "step": 395
    },
    {
      "epoch": 0.09984871406959153,
      "grad_norm": 4.082291126251221,
      "learning_rate": 9.93940366340875e-07,
      "loss": 0.2416,
      "num_input_tokens_seen": 830472192,
      "step": 396
    },
    {
      "epoch": 0.10010085728693899,
      "grad_norm": 2.7822890281677246,
      "learning_rate": 9.938788312814374e-07,
      "loss": 0.1053,
      "num_input_tokens_seen": 832569344,
      "step": 397
    },
    {
      "epoch": 0.10035300050428643,
      "grad_norm": 2.376317024230957,
      "learning_rate": 9.938169874962072e-07,
      "loss": 0.0785,
      "num_input_tokens_seen": 834666496,
      "step": 398
    },
    {
      "epoch": 0.10060514372163389,
      "grad_norm": 6.018281936645508,
      "learning_rate": 9.937548350281987e-07,
      "loss": 0.1501,
      "num_input_tokens_seen": 836763648,
      "step": 399
    },
    {
      "epoch": 0.10085728693898134,
      "grad_norm": 2.6437666416168213,
      "learning_rate": 9.936923739206391e-07,
      "loss": 0.1259,
      "num_input_tokens_seen": 838860800,
      "step": 400
    },
    {
      "epoch": 0.1011094301563288,
      "grad_norm": 3.112172842025757,
      "learning_rate": 9.936296042169723e-07,
      "loss": 0.1747,
      "num_input_tokens_seen": 840957952,
      "step": 401
    },
    {
      "epoch": 0.10136157337367625,
      "grad_norm": 7.632992744445801,
      "learning_rate": 9.93566525960855e-07,
      "loss": 0.0882,
      "num_input_tokens_seen": 843055104,
      "step": 402
    },
    {
      "epoch": 0.1016137165910237,
      "grad_norm": 3.4459123611450195,
      "learning_rate": 9.935031391961599e-07,
      "loss": 0.1184,
      "num_input_tokens_seen": 845152256,
      "step": 403
    },
    {
      "epoch": 0.10186585980837115,
      "grad_norm": 3.6913039684295654,
      "learning_rate": 9.93439443966973e-07,
      "loss": 0.1121,
      "num_input_tokens_seen": 847249408,
      "step": 404
    },
    {
      "epoch": 0.1021180030257186,
      "grad_norm": 3.291170835494995,
      "learning_rate": 9.933754403175956e-07,
      "loss": 0.1317,
      "num_input_tokens_seen": 849346560,
      "step": 405
    },
    {
      "epoch": 0.10237014624306606,
      "grad_norm": 5.224982738494873,
      "learning_rate": 9.93311128292544e-07,
      "loss": 0.2308,
      "num_input_tokens_seen": 851443712,
      "step": 406
    },
    {
      "epoch": 0.10262228946041352,
      "grad_norm": 3.043541193008423,
      "learning_rate": 9.932465079365477e-07,
      "loss": 0.1293,
      "num_input_tokens_seen": 853540864,
      "step": 407
    },
    {
      "epoch": 0.10287443267776097,
      "grad_norm": 3.613516092300415,
      "learning_rate": 9.931815792945515e-07,
      "loss": 0.2023,
      "num_input_tokens_seen": 855638016,
      "step": 408
    },
    {
      "epoch": 0.10312657589510842,
      "grad_norm": 3.9032676219940186,
      "learning_rate": 9.931163424117148e-07,
      "loss": 0.1554,
      "num_input_tokens_seen": 857735168,
      "step": 409
    },
    {
      "epoch": 0.10337871911245587,
      "grad_norm": 2.2143468856811523,
      "learning_rate": 9.930507973334106e-07,
      "loss": 0.1014,
      "num_input_tokens_seen": 859832320,
      "step": 410
    },
    {
      "epoch": 0.10363086232980333,
      "grad_norm": 3.722890615463257,
      "learning_rate": 9.92984944105227e-07,
      "loss": 0.1072,
      "num_input_tokens_seen": 861929472,
      "step": 411
    },
    {
      "epoch": 0.10388300554715078,
      "grad_norm": 3.3566651344299316,
      "learning_rate": 9.929187827729658e-07,
      "loss": 0.1597,
      "num_input_tokens_seen": 864026624,
      "step": 412
    },
    {
      "epoch": 0.10413514876449824,
      "grad_norm": 2.243074655532837,
      "learning_rate": 9.928523133826437e-07,
      "loss": 0.0799,
      "num_input_tokens_seen": 866123776,
      "step": 413
    },
    {
      "epoch": 0.1043872919818457,
      "grad_norm": 2.4208436012268066,
      "learning_rate": 9.927855359804914e-07,
      "loss": 0.1441,
      "num_input_tokens_seen": 868220928,
      "step": 414
    },
    {
      "epoch": 0.10463943519919314,
      "grad_norm": 3.7958076000213623,
      "learning_rate": 9.927184506129535e-07,
      "loss": 0.1769,
      "num_input_tokens_seen": 870318080,
      "step": 415
    },
    {
      "epoch": 0.10489157841654059,
      "grad_norm": 2.1095194816589355,
      "learning_rate": 9.926510573266894e-07,
      "loss": 0.0626,
      "num_input_tokens_seen": 872415232,
      "step": 416
    },
    {
      "epoch": 0.10514372163388805,
      "grad_norm": 2.22505784034729,
      "learning_rate": 9.925833561685718e-07,
      "loss": 0.0868,
      "num_input_tokens_seen": 874512384,
      "step": 417
    },
    {
      "epoch": 0.1053958648512355,
      "grad_norm": 2.8599283695220947,
      "learning_rate": 9.92515347185689e-07,
      "loss": 0.1311,
      "num_input_tokens_seen": 876609536,
      "step": 418
    },
    {
      "epoch": 0.10564800806858296,
      "grad_norm": 3.1945903301239014,
      "learning_rate": 9.924470304253418e-07,
      "loss": 0.0906,
      "num_input_tokens_seen": 878706688,
      "step": 419
    },
    {
      "epoch": 0.1059001512859304,
      "grad_norm": 5.766541481018066,
      "learning_rate": 9.92378405935046e-07,
      "loss": 0.1588,
      "num_input_tokens_seen": 880803840,
      "step": 420
    },
    {
      "epoch": 0.10615229450327786,
      "grad_norm": 2.077852249145508,
      "learning_rate": 9.92309473762531e-07,
      "loss": 0.0958,
      "num_input_tokens_seen": 882900992,
      "step": 421
    },
    {
      "epoch": 0.10640443772062531,
      "grad_norm": 3.552129030227661,
      "learning_rate": 9.922402339557405e-07,
      "loss": 0.1314,
      "num_input_tokens_seen": 884998144,
      "step": 422
    },
    {
      "epoch": 0.10665658093797277,
      "grad_norm": 2.371065855026245,
      "learning_rate": 9.92170686562832e-07,
      "loss": 0.1129,
      "num_input_tokens_seen": 887095296,
      "step": 423
    },
    {
      "epoch": 0.10690872415532023,
      "grad_norm": 3.874335289001465,
      "learning_rate": 9.921008316321768e-07,
      "loss": 0.1691,
      "num_input_tokens_seen": 889192448,
      "step": 424
    },
    {
      "epoch": 0.10716086737266768,
      "grad_norm": 2.733494520187378,
      "learning_rate": 9.920306692123609e-07,
      "loss": 0.1126,
      "num_input_tokens_seen": 891289600,
      "step": 425
    },
    {
      "epoch": 0.10741301059001512,
      "grad_norm": 2.3687491416931152,
      "learning_rate": 9.919601993521829e-07,
      "loss": 0.1028,
      "num_input_tokens_seen": 893386752,
      "step": 426
    },
    {
      "epoch": 0.10766515380736258,
      "grad_norm": 2.3049280643463135,
      "learning_rate": 9.91889422100656e-07,
      "loss": 0.0865,
      "num_input_tokens_seen": 895483904,
      "step": 427
    },
    {
      "epoch": 0.10791729702471003,
      "grad_norm": 2.899887800216675,
      "learning_rate": 9.918183375070073e-07,
      "loss": 0.1258,
      "num_input_tokens_seen": 897581056,
      "step": 428
    },
    {
      "epoch": 0.10816944024205749,
      "grad_norm": 4.081860065460205,
      "learning_rate": 9.917469456206773e-07,
      "loss": 0.0931,
      "num_input_tokens_seen": 899678208,
      "step": 429
    },
    {
      "epoch": 0.10842158345940495,
      "grad_norm": 3.0482466220855713,
      "learning_rate": 9.916752464913201e-07,
      "loss": 0.1039,
      "num_input_tokens_seen": 901775360,
      "step": 430
    },
    {
      "epoch": 0.10867372667675239,
      "grad_norm": 3.3849377632141113,
      "learning_rate": 9.916032401688042e-07,
      "loss": 0.1661,
      "num_input_tokens_seen": 903872512,
      "step": 431
    },
    {
      "epoch": 0.10892586989409984,
      "grad_norm": 3.4006130695343018,
      "learning_rate": 9.91530926703211e-07,
      "loss": 0.121,
      "num_input_tokens_seen": 905969664,
      "step": 432
    },
    {
      "epoch": 0.1091780131114473,
      "grad_norm": 4.100249290466309,
      "learning_rate": 9.91458306144836e-07,
      "loss": 0.1976,
      "num_input_tokens_seen": 908066816,
      "step": 433
    },
    {
      "epoch": 0.10943015632879476,
      "grad_norm": 2.491917610168457,
      "learning_rate": 9.913853785441878e-07,
      "loss": 0.1019,
      "num_input_tokens_seen": 910163968,
      "step": 434
    },
    {
      "epoch": 0.10968229954614221,
      "grad_norm": 4.087813377380371,
      "learning_rate": 9.913121439519893e-07,
      "loss": 0.1673,
      "num_input_tokens_seen": 912261120,
      "step": 435
    },
    {
      "epoch": 0.10993444276348967,
      "grad_norm": 2.377880334854126,
      "learning_rate": 9.912386024191763e-07,
      "loss": 0.1184,
      "num_input_tokens_seen": 914358272,
      "step": 436
    },
    {
      "epoch": 0.11018658598083711,
      "grad_norm": 2.745607376098633,
      "learning_rate": 9.911647539968981e-07,
      "loss": 0.0917,
      "num_input_tokens_seen": 916455424,
      "step": 437
    },
    {
      "epoch": 0.11043872919818457,
      "grad_norm": 4.707367897033691,
      "learning_rate": 9.91090598736518e-07,
      "loss": 0.2128,
      "num_input_tokens_seen": 918552576,
      "step": 438
    },
    {
      "epoch": 0.11069087241553202,
      "grad_norm": 3.578786611557007,
      "learning_rate": 9.910161366896119e-07,
      "loss": 0.1235,
      "num_input_tokens_seen": 920649728,
      "step": 439
    },
    {
      "epoch": 0.11094301563287948,
      "grad_norm": 2.3904166221618652,
      "learning_rate": 9.909413679079697e-07,
      "loss": 0.1139,
      "num_input_tokens_seen": 922746880,
      "step": 440
    },
    {
      "epoch": 0.11119515885022693,
      "grad_norm": 3.1667914390563965,
      "learning_rate": 9.908662924435946e-07,
      "loss": 0.157,
      "num_input_tokens_seen": 924844032,
      "step": 441
    },
    {
      "epoch": 0.11144730206757439,
      "grad_norm": 4.515403747558594,
      "learning_rate": 9.907909103487027e-07,
      "loss": 0.1837,
      "num_input_tokens_seen": 926941184,
      "step": 442
    },
    {
      "epoch": 0.11169944528492183,
      "grad_norm": 1.9842240810394287,
      "learning_rate": 9.907152216757239e-07,
      "loss": 0.1077,
      "num_input_tokens_seen": 929038336,
      "step": 443
    },
    {
      "epoch": 0.11195158850226929,
      "grad_norm": 3.713541030883789,
      "learning_rate": 9.906392264773008e-07,
      "loss": 0.1401,
      "num_input_tokens_seen": 931135488,
      "step": 444
    },
    {
      "epoch": 0.11220373171961674,
      "grad_norm": 2.7595789432525635,
      "learning_rate": 9.905629248062895e-07,
      "loss": 0.1262,
      "num_input_tokens_seen": 933232640,
      "step": 445
    },
    {
      "epoch": 0.1124558749369642,
      "grad_norm": 3.375941038131714,
      "learning_rate": 9.904863167157591e-07,
      "loss": 0.1777,
      "num_input_tokens_seen": 935329792,
      "step": 446
    },
    {
      "epoch": 0.11270801815431165,
      "grad_norm": 2.2114899158477783,
      "learning_rate": 9.904094022589923e-07,
      "loss": 0.0785,
      "num_input_tokens_seen": 937426944,
      "step": 447
    },
    {
      "epoch": 0.1129601613716591,
      "grad_norm": 3.5571250915527344,
      "learning_rate": 9.90332181489484e-07,
      "loss": 0.1771,
      "num_input_tokens_seen": 939524096,
      "step": 448
    },
    {
      "epoch": 0.11321230458900655,
      "grad_norm": 4.025667667388916,
      "learning_rate": 9.902546544609432e-07,
      "loss": 0.1424,
      "num_input_tokens_seen": 941621248,
      "step": 449
    },
    {
      "epoch": 0.11346444780635401,
      "grad_norm": 2.804630994796753,
      "learning_rate": 9.901768212272906e-07,
      "loss": 0.1722,
      "num_input_tokens_seen": 943718400,
      "step": 450
    },
    {
      "epoch": 0.11371659102370146,
      "grad_norm": 2.183051824569702,
      "learning_rate": 9.900986818426612e-07,
      "loss": 0.0876,
      "num_input_tokens_seen": 945815552,
      "step": 451
    },
    {
      "epoch": 0.11396873424104892,
      "grad_norm": 2.7712557315826416,
      "learning_rate": 9.900202363614025e-07,
      "loss": 0.1148,
      "num_input_tokens_seen": 947912704,
      "step": 452
    },
    {
      "epoch": 0.11422087745839637,
      "grad_norm": 3.2009191513061523,
      "learning_rate": 9.899414848380743e-07,
      "loss": 0.1514,
      "num_input_tokens_seen": 950009856,
      "step": 453
    },
    {
      "epoch": 0.11447302067574382,
      "grad_norm": 3.8625547885894775,
      "learning_rate": 9.8986242732745e-07,
      "loss": 0.1811,
      "num_input_tokens_seen": 952107008,
      "step": 454
    },
    {
      "epoch": 0.11472516389309127,
      "grad_norm": 2.4320788383483887,
      "learning_rate": 9.897830638845153e-07,
      "loss": 0.1304,
      "num_input_tokens_seen": 954204160,
      "step": 455
    },
    {
      "epoch": 0.11497730711043873,
      "grad_norm": 2.825261354446411,
      "learning_rate": 9.897033945644692e-07,
      "loss": 0.1156,
      "num_input_tokens_seen": 956301312,
      "step": 456
    },
    {
      "epoch": 0.11522945032778618,
      "grad_norm": 9.34619426727295,
      "learning_rate": 9.89623419422723e-07,
      "loss": 0.0738,
      "num_input_tokens_seen": 958398464,
      "step": 457
    },
    {
      "epoch": 0.11548159354513364,
      "grad_norm": 3.386025905609131,
      "learning_rate": 9.895431385149007e-07,
      "loss": 0.1693,
      "num_input_tokens_seen": 960495616,
      "step": 458
    },
    {
      "epoch": 0.1157337367624811,
      "grad_norm": 3.9842169284820557,
      "learning_rate": 9.894625518968396e-07,
      "loss": 0.0836,
      "num_input_tokens_seen": 962592768,
      "step": 459
    },
    {
      "epoch": 0.11598587997982854,
      "grad_norm": 4.544926166534424,
      "learning_rate": 9.893816596245886e-07,
      "loss": 0.2216,
      "num_input_tokens_seen": 964689920,
      "step": 460
    },
    {
      "epoch": 0.116238023197176,
      "grad_norm": 3.3318898677825928,
      "learning_rate": 9.8930046175441e-07,
      "loss": 0.1638,
      "num_input_tokens_seen": 966787072,
      "step": 461
    },
    {
      "epoch": 0.11649016641452345,
      "grad_norm": 2.5450119972229004,
      "learning_rate": 9.892189583427785e-07,
      "loss": 0.1472,
      "num_input_tokens_seen": 968884224,
      "step": 462
    },
    {
      "epoch": 0.1167423096318709,
      "grad_norm": 5.197476863861084,
      "learning_rate": 9.891371494463812e-07,
      "loss": 0.1708,
      "num_input_tokens_seen": 970981376,
      "step": 463
    },
    {
      "epoch": 0.11699445284921836,
      "grad_norm": 2.857074499130249,
      "learning_rate": 9.890550351221176e-07,
      "loss": 0.0968,
      "num_input_tokens_seen": 973078528,
      "step": 464
    },
    {
      "epoch": 0.1172465960665658,
      "grad_norm": 2.8476240634918213,
      "learning_rate": 9.889726154270997e-07,
      "loss": 0.1504,
      "num_input_tokens_seen": 975175680,
      "step": 465
    },
    {
      "epoch": 0.11749873928391326,
      "grad_norm": 6.322744369506836,
      "learning_rate": 9.888898904186517e-07,
      "loss": 0.1249,
      "num_input_tokens_seen": 977272832,
      "step": 466
    },
    {
      "epoch": 0.11775088250126071,
      "grad_norm": 3.161973237991333,
      "learning_rate": 9.888068601543106e-07,
      "loss": 0.2604,
      "num_input_tokens_seen": 979369984,
      "step": 467
    },
    {
      "epoch": 0.11800302571860817,
      "grad_norm": 2.0370872020721436,
      "learning_rate": 9.887235246918255e-07,
      "loss": 0.0983,
      "num_input_tokens_seen": 981467136,
      "step": 468
    },
    {
      "epoch": 0.11825516893595563,
      "grad_norm": 3.568608283996582,
      "learning_rate": 9.886398840891576e-07,
      "loss": 0.1531,
      "num_input_tokens_seen": 983564288,
      "step": 469
    },
    {
      "epoch": 0.11850731215330308,
      "grad_norm": 2.3104538917541504,
      "learning_rate": 9.885559384044805e-07,
      "loss": 0.1091,
      "num_input_tokens_seen": 985661440,
      "step": 470
    },
    {
      "epoch": 0.11875945537065052,
      "grad_norm": 3.4569497108459473,
      "learning_rate": 9.884716876961798e-07,
      "loss": 0.1195,
      "num_input_tokens_seen": 987758592,
      "step": 471
    },
    {
      "epoch": 0.11901159858799798,
      "grad_norm": 3.131441354751587,
      "learning_rate": 9.883871320228534e-07,
      "loss": 0.1564,
      "num_input_tokens_seen": 989855744,
      "step": 472
    },
    {
      "epoch": 0.11926374180534544,
      "grad_norm": 3.427337646484375,
      "learning_rate": 9.883022714433116e-07,
      "loss": 0.1911,
      "num_input_tokens_seen": 991952896,
      "step": 473
    },
    {
      "epoch": 0.11951588502269289,
      "grad_norm": 3.554757833480835,
      "learning_rate": 9.882171060165764e-07,
      "loss": 0.1489,
      "num_input_tokens_seen": 994050048,
      "step": 474
    },
    {
      "epoch": 0.11976802824004035,
      "grad_norm": 2.5964512825012207,
      "learning_rate": 9.881316358018816e-07,
      "loss": 0.0662,
      "num_input_tokens_seen": 996147200,
      "step": 475
    },
    {
      "epoch": 0.1200201714573878,
      "grad_norm": 3.2962310314178467,
      "learning_rate": 9.880458608586737e-07,
      "loss": 0.1555,
      "num_input_tokens_seen": 998244352,
      "step": 476
    },
    {
      "epoch": 0.12027231467473525,
      "grad_norm": 2.869269371032715,
      "learning_rate": 9.879597812466105e-07,
      "loss": 0.0795,
      "num_input_tokens_seen": 1000341504,
      "step": 477
    },
    {
      "epoch": 0.1205244578920827,
      "grad_norm": 2.913670778274536,
      "learning_rate": 9.878733970255618e-07,
      "loss": 0.1329,
      "num_input_tokens_seen": 1002438656,
      "step": 478
    },
    {
      "epoch": 0.12077660110943016,
      "grad_norm": 3.124332904815674,
      "learning_rate": 9.877867082556097e-07,
      "loss": 0.1538,
      "num_input_tokens_seen": 1004535808,
      "step": 479
    },
    {
      "epoch": 0.12102874432677761,
      "grad_norm": 3.5321497917175293,
      "learning_rate": 9.876997149970477e-07,
      "loss": 0.1714,
      "num_input_tokens_seen": 1006632960,
      "step": 480
    },
    {
      "epoch": 0.12128088754412507,
      "grad_norm": 3.904442071914673,
      "learning_rate": 9.87612417310381e-07,
      "loss": 0.1452,
      "num_input_tokens_seen": 1008730112,
      "step": 481
    },
    {
      "epoch": 0.12153303076147251,
      "grad_norm": 3.534336805343628,
      "learning_rate": 9.87524815256327e-07,
      "loss": 0.1589,
      "num_input_tokens_seen": 1010827264,
      "step": 482
    },
    {
      "epoch": 0.12178517397881997,
      "grad_norm": 3.5298209190368652,
      "learning_rate": 9.874369088958145e-07,
      "loss": 0.1413,
      "num_input_tokens_seen": 1012924416,
      "step": 483
    },
    {
      "epoch": 0.12203731719616742,
      "grad_norm": 3.4223012924194336,
      "learning_rate": 9.873486982899837e-07,
      "loss": 0.1552,
      "num_input_tokens_seen": 1015021568,
      "step": 484
    },
    {
      "epoch": 0.12228946041351488,
      "grad_norm": 2.560487747192383,
      "learning_rate": 9.872601835001869e-07,
      "loss": 0.1192,
      "num_input_tokens_seen": 1017118720,
      "step": 485
    },
    {
      "epoch": 0.12254160363086233,
      "grad_norm": 2.099520683288574,
      "learning_rate": 9.871713645879878e-07,
      "loss": 0.1125,
      "num_input_tokens_seen": 1019215872,
      "step": 486
    },
    {
      "epoch": 0.12279374684820979,
      "grad_norm": 3.477560520172119,
      "learning_rate": 9.870822416151614e-07,
      "loss": 0.1485,
      "num_input_tokens_seen": 1021313024,
      "step": 487
    },
    {
      "epoch": 0.12304589006555723,
      "grad_norm": 2.9200782775878906,
      "learning_rate": 9.869928146436942e-07,
      "loss": 0.0596,
      "num_input_tokens_seen": 1023410176,
      "step": 488
    },
    {
      "epoch": 0.12329803328290469,
      "grad_norm": 2.3703415393829346,
      "learning_rate": 9.86903083735785e-07,
      "loss": 0.1163,
      "num_input_tokens_seen": 1025507328,
      "step": 489
    },
    {
      "epoch": 0.12355017650025214,
      "grad_norm": 2.2664389610290527,
      "learning_rate": 9.868130489538425e-07,
      "loss": 0.0712,
      "num_input_tokens_seen": 1027604480,
      "step": 490
    },
    {
      "epoch": 0.1238023197175996,
      "grad_norm": 1.798887848854065,
      "learning_rate": 9.867227103604877e-07,
      "loss": 0.0709,
      "num_input_tokens_seen": 1029701632,
      "step": 491
    },
    {
      "epoch": 0.12405446293494705,
      "grad_norm": 3.6567928791046143,
      "learning_rate": 9.86632068018553e-07,
      "loss": 0.1474,
      "num_input_tokens_seen": 1031798784,
      "step": 492
    },
    {
      "epoch": 0.1243066061522945,
      "grad_norm": 2.8362531661987305,
      "learning_rate": 9.865411219910815e-07,
      "loss": 0.1235,
      "num_input_tokens_seen": 1033895936,
      "step": 493
    },
    {
      "epoch": 0.12455874936964195,
      "grad_norm": 2.423952341079712,
      "learning_rate": 9.86449872341328e-07,
      "loss": 0.1048,
      "num_input_tokens_seen": 1035993088,
      "step": 494
    },
    {
      "epoch": 0.12481089258698941,
      "grad_norm": 2.4268240928649902,
      "learning_rate": 9.863583191327583e-07,
      "loss": 0.1063,
      "num_input_tokens_seen": 1038090240,
      "step": 495
    },
    {
      "epoch": 0.12506303580433686,
      "grad_norm": 2.1852941513061523,
      "learning_rate": 9.862664624290494e-07,
      "loss": 0.0932,
      "num_input_tokens_seen": 1040187392,
      "step": 496
    },
    {
      "epoch": 0.12531517902168432,
      "grad_norm": 3.1700496673583984,
      "learning_rate": 9.86174302294089e-07,
      "loss": 0.1174,
      "num_input_tokens_seen": 1042284544,
      "step": 497
    },
    {
      "epoch": 0.12556732223903178,
      "grad_norm": 3.2374541759490967,
      "learning_rate": 9.860818387919762e-07,
      "loss": 0.1251,
      "num_input_tokens_seen": 1044381696,
      "step": 498
    },
    {
      "epoch": 0.12581946545637923,
      "grad_norm": 2.62046217918396,
      "learning_rate": 9.859890719870213e-07,
      "loss": 0.0991,
      "num_input_tokens_seen": 1046478848,
      "step": 499
    },
    {
      "epoch": 0.1260716086737267,
      "grad_norm": 3.053370237350464,
      "learning_rate": 9.85896001943745e-07,
      "loss": 0.1612,
      "num_input_tokens_seen": 1048576000,
      "step": 500
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 3966,
  "num_input_tokens_seen": 1048576000,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 5.902112919650304e+18,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}