bookclaim-1e-6-16-unmasked / trainer_state.json
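The rest of this file is the raw training state checkpoint written by the Hugging Face Trainer: a few run-level fields (`epoch`, `global_step`, `eval_steps`) followed by a `log_history` array with one entry per logging step. As a minimal sketch of how the entries below can be consumed, assuming only Python's standard library and that the file is saved locally as `trainer_state.json` (the key names come directly from the records that follow):

```python
# Minimal sketch: load this trainer_state.json and summarize its log_history.
# Assumes the standard Trainer schema seen below ("log_history", "loss",
# "learning_rate", "step", "num_input_tokens_seen"); adjust if your file differs.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only training-loss entries (eval entries, if any, lack a "loss" key).
logs = [e for e in state["log_history"] if "loss" in e]

print(f"logged steps : {len(logs)}")
print(f"first loss   : {logs[0]['loss']:.4f} (step {logs[0]['step']})")
print(f"last loss    : {logs[-1]['loss']:.4f} (step {logs[-1]['step']})")
print(f"peak lr      : {max(e['learning_rate'] for e in logs):.3e}")
print(f"tokens seen  : {logs[-1]['num_input_tokens_seen']:,}")
```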
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.1260716086737267,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0002521432173474534,
"grad_norm": 56.41114044189453,
"learning_rate": 5.025125628140703e-09,
"loss": 1.5821,
"num_input_tokens_seen": 2097152,
"step": 1
},
{
"epoch": 0.0005042864346949068,
"grad_norm": 31.409353256225586,
"learning_rate": 1.0050251256281407e-08,
"loss": 1.5937,
"num_input_tokens_seen": 4194304,
"step": 2
},
{
"epoch": 0.0007564296520423601,
"grad_norm": 21.739652633666992,
"learning_rate": 1.5075376884422108e-08,
"loss": 1.2442,
"num_input_tokens_seen": 6291456,
"step": 3
},
{
"epoch": 0.0010085728693898135,
"grad_norm": 20.685302734375,
"learning_rate": 2.0100502512562813e-08,
"loss": 0.8062,
"num_input_tokens_seen": 8388608,
"step": 4
},
{
"epoch": 0.0012607160867372667,
"grad_norm": 22.219989776611328,
"learning_rate": 2.5125628140703518e-08,
"loss": 1.1513,
"num_input_tokens_seen": 10485760,
"step": 5
},
{
"epoch": 0.0015128593040847202,
"grad_norm": 28.416399002075195,
"learning_rate": 3.0150753768844216e-08,
"loss": 1.634,
"num_input_tokens_seen": 12582912,
"step": 6
},
{
"epoch": 0.0017650025214321734,
"grad_norm": 23.952890396118164,
"learning_rate": 3.517587939698492e-08,
"loss": 1.1944,
"num_input_tokens_seen": 14680064,
"step": 7
},
{
"epoch": 0.002017145738779627,
"grad_norm": 20.75243377685547,
"learning_rate": 4.0201005025125626e-08,
"loss": 0.7753,
"num_input_tokens_seen": 16777216,
"step": 8
},
{
"epoch": 0.0022692889561270802,
"grad_norm": 25.797378540039062,
"learning_rate": 4.522613065326633e-08,
"loss": 1.5984,
"num_input_tokens_seen": 18874368,
"step": 9
},
{
"epoch": 0.0025214321734745334,
"grad_norm": 25.863649368286133,
"learning_rate": 5.0251256281407036e-08,
"loss": 1.5978,
"num_input_tokens_seen": 20971520,
"step": 10
},
{
"epoch": 0.002773575390821987,
"grad_norm": 18.696609497070312,
"learning_rate": 5.527638190954774e-08,
"loss": 1.2323,
"num_input_tokens_seen": 23068672,
"step": 11
},
{
"epoch": 0.0030257186081694403,
"grad_norm": 28.740385055541992,
"learning_rate": 6.030150753768843e-08,
"loss": 1.1786,
"num_input_tokens_seen": 25165824,
"step": 12
},
{
"epoch": 0.0032778618255168935,
"grad_norm": 21.161056518554688,
"learning_rate": 6.532663316582915e-08,
"loss": 0.7842,
"num_input_tokens_seen": 27262976,
"step": 13
},
{
"epoch": 0.0035300050428643467,
"grad_norm": 25.495088577270508,
"learning_rate": 7.035175879396984e-08,
"loss": 1.9987,
"num_input_tokens_seen": 29360128,
"step": 14
},
{
"epoch": 0.0037821482602118004,
"grad_norm": 24.420948028564453,
"learning_rate": 7.537688442211055e-08,
"loss": 1.1424,
"num_input_tokens_seen": 31457280,
"step": 15
},
{
"epoch": 0.004034291477559254,
"grad_norm": 19.922271728515625,
"learning_rate": 8.040201005025125e-08,
"loss": 1.1716,
"num_input_tokens_seen": 33554432,
"step": 16
},
{
"epoch": 0.004286434694906707,
"grad_norm": 25.040063858032227,
"learning_rate": 8.542713567839196e-08,
"loss": 0.8189,
"num_input_tokens_seen": 35651584,
"step": 17
},
{
"epoch": 0.0045385779122541605,
"grad_norm": 27.888629913330078,
"learning_rate": 9.045226130653266e-08,
"loss": 1.1743,
"num_input_tokens_seen": 37748736,
"step": 18
},
{
"epoch": 0.004790721129601614,
"grad_norm": 21.901092529296875,
"learning_rate": 9.547738693467335e-08,
"loss": 0.7951,
"num_input_tokens_seen": 39845888,
"step": 19
},
{
"epoch": 0.005042864346949067,
"grad_norm": 21.351625442504883,
"learning_rate": 1.0050251256281407e-07,
"loss": 1.2271,
"num_input_tokens_seen": 41943040,
"step": 20
},
{
"epoch": 0.00529500756429652,
"grad_norm": 21.482006072998047,
"learning_rate": 1.0552763819095476e-07,
"loss": 1.1908,
"num_input_tokens_seen": 44040192,
"step": 21
},
{
"epoch": 0.005547150781643974,
"grad_norm": 21.15386390686035,
"learning_rate": 1.1055276381909548e-07,
"loss": 1.2297,
"num_input_tokens_seen": 46137344,
"step": 22
},
{
"epoch": 0.005799293998991427,
"grad_norm": 23.156387329101562,
"learning_rate": 1.1557788944723617e-07,
"loss": 1.1766,
"num_input_tokens_seen": 48234496,
"step": 23
},
{
"epoch": 0.006051437216338881,
"grad_norm": 38.258697509765625,
"learning_rate": 1.2060301507537687e-07,
"loss": 1.4932,
"num_input_tokens_seen": 50331648,
"step": 24
},
{
"epoch": 0.006303580433686334,
"grad_norm": 20.798620223999023,
"learning_rate": 1.2562814070351758e-07,
"loss": 1.2943,
"num_input_tokens_seen": 52428800,
"step": 25
},
{
"epoch": 0.006555723651033787,
"grad_norm": 24.994922637939453,
"learning_rate": 1.306532663316583e-07,
"loss": 1.0768,
"num_input_tokens_seen": 54525952,
"step": 26
},
{
"epoch": 0.00680786686838124,
"grad_norm": 33.116146087646484,
"learning_rate": 1.35678391959799e-07,
"loss": 1.1369,
"num_input_tokens_seen": 56623104,
"step": 27
},
{
"epoch": 0.0070600100857286935,
"grad_norm": 39.03334426879883,
"learning_rate": 1.4070351758793969e-07,
"loss": 1.5513,
"num_input_tokens_seen": 58720256,
"step": 28
},
{
"epoch": 0.007312153303076148,
"grad_norm": 25.035110473632812,
"learning_rate": 1.4572864321608038e-07,
"loss": 1.2028,
"num_input_tokens_seen": 60817408,
"step": 29
},
{
"epoch": 0.007564296520423601,
"grad_norm": 21.068431854248047,
"learning_rate": 1.507537688442211e-07,
"loss": 1.1555,
"num_input_tokens_seen": 62914560,
"step": 30
},
{
"epoch": 0.007816439737771054,
"grad_norm": 35.82476043701172,
"learning_rate": 1.5577889447236181e-07,
"loss": 1.1723,
"num_input_tokens_seen": 65011712,
"step": 31
},
{
"epoch": 0.008068582955118508,
"grad_norm": 27.961219787597656,
"learning_rate": 1.608040201005025e-07,
"loss": 0.7226,
"num_input_tokens_seen": 67108864,
"step": 32
},
{
"epoch": 0.00832072617246596,
"grad_norm": 21.109777450561523,
"learning_rate": 1.658291457286432e-07,
"loss": 1.0722,
"num_input_tokens_seen": 69206016,
"step": 33
},
{
"epoch": 0.008572869389813415,
"grad_norm": 43.04289627075195,
"learning_rate": 1.7085427135678392e-07,
"loss": 1.1128,
"num_input_tokens_seen": 71303168,
"step": 34
},
{
"epoch": 0.008825012607160867,
"grad_norm": 26.515880584716797,
"learning_rate": 1.7587939698492463e-07,
"loss": 1.1254,
"num_input_tokens_seen": 73400320,
"step": 35
},
{
"epoch": 0.009077155824508321,
"grad_norm": 21.351062774658203,
"learning_rate": 1.8090452261306533e-07,
"loss": 0.7675,
"num_input_tokens_seen": 75497472,
"step": 36
},
{
"epoch": 0.009329299041855773,
"grad_norm": 23.136459350585938,
"learning_rate": 1.8592964824120602e-07,
"loss": 1.1374,
"num_input_tokens_seen": 77594624,
"step": 37
},
{
"epoch": 0.009581442259203227,
"grad_norm": 17.877473831176758,
"learning_rate": 1.909547738693467e-07,
"loss": 1.1101,
"num_input_tokens_seen": 79691776,
"step": 38
},
{
"epoch": 0.009833585476550681,
"grad_norm": 33.78788375854492,
"learning_rate": 1.9597989949748743e-07,
"loss": 1.0273,
"num_input_tokens_seen": 81788928,
"step": 39
},
{
"epoch": 0.010085728693898134,
"grad_norm": 32.83673858642578,
"learning_rate": 2.0100502512562815e-07,
"loss": 1.1025,
"num_input_tokens_seen": 83886080,
"step": 40
},
{
"epoch": 0.010337871911245588,
"grad_norm": 26.676027297973633,
"learning_rate": 2.0603015075376884e-07,
"loss": 1.8515,
"num_input_tokens_seen": 85983232,
"step": 41
},
{
"epoch": 0.01059001512859304,
"grad_norm": 26.88898468017578,
"learning_rate": 2.1105527638190953e-07,
"loss": 1.3322,
"num_input_tokens_seen": 88080384,
"step": 42
},
{
"epoch": 0.010842158345940494,
"grad_norm": 24.28297233581543,
"learning_rate": 2.1608040201005022e-07,
"loss": 0.9043,
"num_input_tokens_seen": 90177536,
"step": 43
},
{
"epoch": 0.011094301563287948,
"grad_norm": 15.659173011779785,
"learning_rate": 2.2110552763819096e-07,
"loss": 0.9169,
"num_input_tokens_seen": 92274688,
"step": 44
},
{
"epoch": 0.0113464447806354,
"grad_norm": 15.930516242980957,
"learning_rate": 2.2613065326633166e-07,
"loss": 0.9613,
"num_input_tokens_seen": 94371840,
"step": 45
},
{
"epoch": 0.011598587997982855,
"grad_norm": 14.883039474487305,
"learning_rate": 2.3115577889447235e-07,
"loss": 0.882,
"num_input_tokens_seen": 96468992,
"step": 46
},
{
"epoch": 0.011850731215330307,
"grad_norm": 25.84305191040039,
"learning_rate": 2.3618090452261304e-07,
"loss": 1.1471,
"num_input_tokens_seen": 98566144,
"step": 47
},
{
"epoch": 0.012102874432677761,
"grad_norm": 21.669544219970703,
"learning_rate": 2.4120603015075373e-07,
"loss": 0.9125,
"num_input_tokens_seen": 100663296,
"step": 48
},
{
"epoch": 0.012355017650025214,
"grad_norm": 15.483664512634277,
"learning_rate": 2.4623115577889445e-07,
"loss": 0.8492,
"num_input_tokens_seen": 102760448,
"step": 49
},
{
"epoch": 0.012607160867372668,
"grad_norm": 18.560636520385742,
"learning_rate": 2.5125628140703517e-07,
"loss": 0.9035,
"num_input_tokens_seen": 104857600,
"step": 50
},
{
"epoch": 0.012859304084720122,
"grad_norm": 14.719083786010742,
"learning_rate": 2.562814070351759e-07,
"loss": 0.8161,
"num_input_tokens_seen": 106954752,
"step": 51
},
{
"epoch": 0.013111447302067574,
"grad_norm": 21.655672073364258,
"learning_rate": 2.613065326633166e-07,
"loss": 0.572,
"num_input_tokens_seen": 109051904,
"step": 52
},
{
"epoch": 0.013363590519415028,
"grad_norm": 11.465034484863281,
"learning_rate": 2.6633165829145727e-07,
"loss": 0.807,
"num_input_tokens_seen": 111149056,
"step": 53
},
{
"epoch": 0.01361573373676248,
"grad_norm": 17.689987182617188,
"learning_rate": 2.71356783919598e-07,
"loss": 1.4423,
"num_input_tokens_seen": 113246208,
"step": 54
},
{
"epoch": 0.013867876954109935,
"grad_norm": 14.684429168701172,
"learning_rate": 2.7638190954773865e-07,
"loss": 0.8659,
"num_input_tokens_seen": 115343360,
"step": 55
},
{
"epoch": 0.014120020171457387,
"grad_norm": 12.435643196105957,
"learning_rate": 2.8140703517587937e-07,
"loss": 0.7607,
"num_input_tokens_seen": 117440512,
"step": 56
},
{
"epoch": 0.014372163388804841,
"grad_norm": 17.700153350830078,
"learning_rate": 2.864321608040201e-07,
"loss": 0.8607,
"num_input_tokens_seen": 119537664,
"step": 57
},
{
"epoch": 0.014624306606152295,
"grad_norm": 13.79918384552002,
"learning_rate": 2.9145728643216075e-07,
"loss": 0.7589,
"num_input_tokens_seen": 121634816,
"step": 58
},
{
"epoch": 0.014876449823499747,
"grad_norm": 15.207538604736328,
"learning_rate": 2.964824120603015e-07,
"loss": 0.4787,
"num_input_tokens_seen": 123731968,
"step": 59
},
{
"epoch": 0.015128593040847202,
"grad_norm": 10.523366928100586,
"learning_rate": 3.015075376884422e-07,
"loss": 0.6908,
"num_input_tokens_seen": 125829120,
"step": 60
},
{
"epoch": 0.015380736258194654,
"grad_norm": 8.412284851074219,
"learning_rate": 3.065326633165829e-07,
"loss": 0.6561,
"num_input_tokens_seen": 127926272,
"step": 61
},
{
"epoch": 0.015632879475542108,
"grad_norm": 9.98276138305664,
"learning_rate": 3.1155778894472363e-07,
"loss": 0.7216,
"num_input_tokens_seen": 130023424,
"step": 62
},
{
"epoch": 0.01588502269288956,
"grad_norm": 11.017064094543457,
"learning_rate": 3.165829145728643e-07,
"loss": 0.6223,
"num_input_tokens_seen": 132120576,
"step": 63
},
{
"epoch": 0.016137165910237016,
"grad_norm": 15.129839897155762,
"learning_rate": 3.21608040201005e-07,
"loss": 1.0373,
"num_input_tokens_seen": 134217728,
"step": 64
},
{
"epoch": 0.01638930912758447,
"grad_norm": 8.578692436218262,
"learning_rate": 3.2663316582914573e-07,
"loss": 0.5687,
"num_input_tokens_seen": 136314880,
"step": 65
},
{
"epoch": 0.01664145234493192,
"grad_norm": 13.31927490234375,
"learning_rate": 3.316582914572864e-07,
"loss": 1.0766,
"num_input_tokens_seen": 138412032,
"step": 66
},
{
"epoch": 0.016893595562279373,
"grad_norm": 8.775867462158203,
"learning_rate": 3.366834170854271e-07,
"loss": 0.5324,
"num_input_tokens_seen": 140509184,
"step": 67
},
{
"epoch": 0.01714573877962683,
"grad_norm": 12.085953712463379,
"learning_rate": 3.4170854271356783e-07,
"loss": 0.8601,
"num_input_tokens_seen": 142606336,
"step": 68
},
{
"epoch": 0.01739788199697428,
"grad_norm": 12.76360034942627,
"learning_rate": 3.4673366834170855e-07,
"loss": 0.5595,
"num_input_tokens_seen": 144703488,
"step": 69
},
{
"epoch": 0.017650025214321734,
"grad_norm": 10.255838394165039,
"learning_rate": 3.5175879396984927e-07,
"loss": 0.3496,
"num_input_tokens_seen": 146800640,
"step": 70
},
{
"epoch": 0.01790216843166919,
"grad_norm": 9.94809341430664,
"learning_rate": 3.5678391959798993e-07,
"loss": 0.5976,
"num_input_tokens_seen": 148897792,
"step": 71
},
{
"epoch": 0.018154311649016642,
"grad_norm": 7.37994384765625,
"learning_rate": 3.6180904522613065e-07,
"loss": 0.5241,
"num_input_tokens_seen": 150994944,
"step": 72
},
{
"epoch": 0.018406454866364094,
"grad_norm": 8.874433517456055,
"learning_rate": 3.668341708542713e-07,
"loss": 0.5629,
"num_input_tokens_seen": 153092096,
"step": 73
},
{
"epoch": 0.018658598083711547,
"grad_norm": 16.685457229614258,
"learning_rate": 3.7185929648241203e-07,
"loss": 0.3801,
"num_input_tokens_seen": 155189248,
"step": 74
},
{
"epoch": 0.018910741301059002,
"grad_norm": 11.288415908813477,
"learning_rate": 3.7688442211055275e-07,
"loss": 0.6093,
"num_input_tokens_seen": 157286400,
"step": 75
},
{
"epoch": 0.019162884518406455,
"grad_norm": 10.51889419555664,
"learning_rate": 3.819095477386934e-07,
"loss": 0.5053,
"num_input_tokens_seen": 159383552,
"step": 76
},
{
"epoch": 0.019415027735753907,
"grad_norm": 10.236724853515625,
"learning_rate": 3.869346733668342e-07,
"loss": 0.7537,
"num_input_tokens_seen": 161480704,
"step": 77
},
{
"epoch": 0.019667170953101363,
"grad_norm": 9.370979309082031,
"learning_rate": 3.9195979899497485e-07,
"loss": 0.5814,
"num_input_tokens_seen": 163577856,
"step": 78
},
{
"epoch": 0.019919314170448815,
"grad_norm": 12.056835174560547,
"learning_rate": 3.9698492462311557e-07,
"loss": 0.5178,
"num_input_tokens_seen": 165675008,
"step": 79
},
{
"epoch": 0.020171457387796268,
"grad_norm": 8.761493682861328,
"learning_rate": 4.020100502512563e-07,
"loss": 0.4851,
"num_input_tokens_seen": 167772160,
"step": 80
},
{
"epoch": 0.02042360060514372,
"grad_norm": 9.159887313842773,
"learning_rate": 4.0703517587939696e-07,
"loss": 0.4531,
"num_input_tokens_seen": 169869312,
"step": 81
},
{
"epoch": 0.020675743822491176,
"grad_norm": 9.923644065856934,
"learning_rate": 4.120603015075377e-07,
"loss": 0.5835,
"num_input_tokens_seen": 171966464,
"step": 82
},
{
"epoch": 0.020927887039838628,
"grad_norm": 8.762866973876953,
"learning_rate": 4.1708542713567834e-07,
"loss": 0.4772,
"num_input_tokens_seen": 174063616,
"step": 83
},
{
"epoch": 0.02118003025718608,
"grad_norm": 10.09272289276123,
"learning_rate": 4.2211055276381906e-07,
"loss": 0.7305,
"num_input_tokens_seen": 176160768,
"step": 84
},
{
"epoch": 0.021432173474533536,
"grad_norm": 8.009614944458008,
"learning_rate": 4.271356783919598e-07,
"loss": 0.4629,
"num_input_tokens_seen": 178257920,
"step": 85
},
{
"epoch": 0.02168431669188099,
"grad_norm": 8.284019470214844,
"learning_rate": 4.3216080402010044e-07,
"loss": 0.4368,
"num_input_tokens_seen": 180355072,
"step": 86
},
{
"epoch": 0.02193645990922844,
"grad_norm": 6.427061557769775,
"learning_rate": 4.371859296482412e-07,
"loss": 0.43,
"num_input_tokens_seen": 182452224,
"step": 87
},
{
"epoch": 0.022188603126575897,
"grad_norm": 12.255255699157715,
"learning_rate": 4.4221105527638193e-07,
"loss": 0.5879,
"num_input_tokens_seen": 184549376,
"step": 88
},
{
"epoch": 0.02244074634392335,
"grad_norm": 6.626727104187012,
"learning_rate": 4.472361809045226e-07,
"loss": 0.3916,
"num_input_tokens_seen": 186646528,
"step": 89
},
{
"epoch": 0.0226928895612708,
"grad_norm": 8.53348445892334,
"learning_rate": 4.522613065326633e-07,
"loss": 0.4768,
"num_input_tokens_seen": 188743680,
"step": 90
},
{
"epoch": 0.022945032778618254,
"grad_norm": 6.995331287384033,
"learning_rate": 4.57286432160804e-07,
"loss": 0.3988,
"num_input_tokens_seen": 190840832,
"step": 91
},
{
"epoch": 0.02319717599596571,
"grad_norm": 8.352548599243164,
"learning_rate": 4.623115577889447e-07,
"loss": 0.3706,
"num_input_tokens_seen": 192937984,
"step": 92
},
{
"epoch": 0.023449319213313162,
"grad_norm": 6.609560489654541,
"learning_rate": 4.673366834170854e-07,
"loss": 0.2459,
"num_input_tokens_seen": 195035136,
"step": 93
},
{
"epoch": 0.023701462430660614,
"grad_norm": 9.539324760437012,
"learning_rate": 4.723618090452261e-07,
"loss": 0.3865,
"num_input_tokens_seen": 197132288,
"step": 94
},
{
"epoch": 0.02395360564800807,
"grad_norm": 9.831944465637207,
"learning_rate": 4.773869346733669e-07,
"loss": 0.4022,
"num_input_tokens_seen": 199229440,
"step": 95
},
{
"epoch": 0.024205748865355523,
"grad_norm": 9.292588233947754,
"learning_rate": 4.824120603015075e-07,
"loss": 0.3543,
"num_input_tokens_seen": 201326592,
"step": 96
},
{
"epoch": 0.024457892082702975,
"grad_norm": 9.192462921142578,
"learning_rate": 4.874371859296482e-07,
"loss": 0.4336,
"num_input_tokens_seen": 203423744,
"step": 97
},
{
"epoch": 0.024710035300050427,
"grad_norm": 8.302521705627441,
"learning_rate": 4.924623115577889e-07,
"loss": 0.534,
"num_input_tokens_seen": 205520896,
"step": 98
},
{
"epoch": 0.024962178517397883,
"grad_norm": 9.702790260314941,
"learning_rate": 4.974874371859296e-07,
"loss": 0.5899,
"num_input_tokens_seen": 207618048,
"step": 99
},
{
"epoch": 0.025214321734745335,
"grad_norm": 7.346845626831055,
"learning_rate": 5.025125628140703e-07,
"loss": 0.3439,
"num_input_tokens_seen": 209715200,
"step": 100
},
{
"epoch": 0.025466464952092788,
"grad_norm": 6.6140265464782715,
"learning_rate": 5.075376884422111e-07,
"loss": 0.3779,
"num_input_tokens_seen": 211812352,
"step": 101
},
{
"epoch": 0.025718608169440244,
"grad_norm": 6.8121209144592285,
"learning_rate": 5.125628140703518e-07,
"loss": 0.403,
"num_input_tokens_seen": 213909504,
"step": 102
},
{
"epoch": 0.025970751386787696,
"grad_norm": 6.07421875,
"learning_rate": 5.175879396984925e-07,
"loss": 0.3473,
"num_input_tokens_seen": 216006656,
"step": 103
},
{
"epoch": 0.026222894604135148,
"grad_norm": 6.86598539352417,
"learning_rate": 5.226130653266332e-07,
"loss": 0.3054,
"num_input_tokens_seen": 218103808,
"step": 104
},
{
"epoch": 0.0264750378214826,
"grad_norm": 7.970452308654785,
"learning_rate": 5.276381909547738e-07,
"loss": 0.3693,
"num_input_tokens_seen": 220200960,
"step": 105
},
{
"epoch": 0.026727181038830056,
"grad_norm": 7.2236552238464355,
"learning_rate": 5.326633165829145e-07,
"loss": 0.2194,
"num_input_tokens_seen": 222298112,
"step": 106
},
{
"epoch": 0.02697932425617751,
"grad_norm": 5.257369518280029,
"learning_rate": 5.376884422110553e-07,
"loss": 0.2962,
"num_input_tokens_seen": 224395264,
"step": 107
},
{
"epoch": 0.02723146747352496,
"grad_norm": 6.920422077178955,
"learning_rate": 5.42713567839196e-07,
"loss": 0.3699,
"num_input_tokens_seen": 226492416,
"step": 108
},
{
"epoch": 0.027483610690872417,
"grad_norm": 9.312458992004395,
"learning_rate": 5.477386934673367e-07,
"loss": 0.3812,
"num_input_tokens_seen": 228589568,
"step": 109
},
{
"epoch": 0.02773575390821987,
"grad_norm": 9.935240745544434,
"learning_rate": 5.527638190954773e-07,
"loss": 0.4443,
"num_input_tokens_seen": 230686720,
"step": 110
},
{
"epoch": 0.02798789712556732,
"grad_norm": 5.373161315917969,
"learning_rate": 5.57788944723618e-07,
"loss": 0.264,
"num_input_tokens_seen": 232783872,
"step": 111
},
{
"epoch": 0.028240040342914774,
"grad_norm": 6.769862651824951,
"learning_rate": 5.628140703517587e-07,
"loss": 0.1686,
"num_input_tokens_seen": 234881024,
"step": 112
},
{
"epoch": 0.02849218356026223,
"grad_norm": 5.726578712463379,
"learning_rate": 5.678391959798995e-07,
"loss": 0.3396,
"num_input_tokens_seen": 236978176,
"step": 113
},
{
"epoch": 0.028744326777609682,
"grad_norm": 5.439636707305908,
"learning_rate": 5.728643216080402e-07,
"loss": 0.2733,
"num_input_tokens_seen": 239075328,
"step": 114
},
{
"epoch": 0.028996469994957134,
"grad_norm": 5.622605323791504,
"learning_rate": 5.778894472361808e-07,
"loss": 0.2998,
"num_input_tokens_seen": 241172480,
"step": 115
},
{
"epoch": 0.02924861321230459,
"grad_norm": 6.728963851928711,
"learning_rate": 5.829145728643215e-07,
"loss": 0.2549,
"num_input_tokens_seen": 243269632,
"step": 116
},
{
"epoch": 0.029500756429652043,
"grad_norm": 5.0983781814575195,
"learning_rate": 5.879396984924622e-07,
"loss": 0.2705,
"num_input_tokens_seen": 245366784,
"step": 117
},
{
"epoch": 0.029752899646999495,
"grad_norm": 7.3646721839904785,
"learning_rate": 5.92964824120603e-07,
"loss": 0.3242,
"num_input_tokens_seen": 247463936,
"step": 118
},
{
"epoch": 0.03000504286434695,
"grad_norm": 7.918598651885986,
"learning_rate": 5.979899497487438e-07,
"loss": 0.371,
"num_input_tokens_seen": 249561088,
"step": 119
},
{
"epoch": 0.030257186081694403,
"grad_norm": 7.411210536956787,
"learning_rate": 6.030150753768844e-07,
"loss": 0.2728,
"num_input_tokens_seen": 251658240,
"step": 120
},
{
"epoch": 0.030509329299041855,
"grad_norm": 5.8603129386901855,
"learning_rate": 6.080402010050251e-07,
"loss": 0.1854,
"num_input_tokens_seen": 253755392,
"step": 121
},
{
"epoch": 0.030761472516389308,
"grad_norm": 5.476680278778076,
"learning_rate": 6.130653266331658e-07,
"loss": 0.1831,
"num_input_tokens_seen": 255852544,
"step": 122
},
{
"epoch": 0.031013615733736764,
"grad_norm": 6.4667158126831055,
"learning_rate": 6.180904522613065e-07,
"loss": 0.1721,
"num_input_tokens_seen": 257949696,
"step": 123
},
{
"epoch": 0.031265758951084216,
"grad_norm": 5.928079605102539,
"learning_rate": 6.231155778894473e-07,
"loss": 0.2728,
"num_input_tokens_seen": 260046848,
"step": 124
},
{
"epoch": 0.03151790216843167,
"grad_norm": 7.0044755935668945,
"learning_rate": 6.28140703517588e-07,
"loss": 0.4037,
"num_input_tokens_seen": 262144000,
"step": 125
},
{
"epoch": 0.03177004538577912,
"grad_norm": 8.558830261230469,
"learning_rate": 6.331658291457286e-07,
"loss": 0.5263,
"num_input_tokens_seen": 264241152,
"step": 126
},
{
"epoch": 0.032022188603126577,
"grad_norm": 5.0764055252075195,
"learning_rate": 6.381909547738693e-07,
"loss": 0.2054,
"num_input_tokens_seen": 266338304,
"step": 127
},
{
"epoch": 0.03227433182047403,
"grad_norm": 5.459807872772217,
"learning_rate": 6.4321608040201e-07,
"loss": 0.2122,
"num_input_tokens_seen": 268435456,
"step": 128
},
{
"epoch": 0.03252647503782148,
"grad_norm": 5.658675670623779,
"learning_rate": 6.482412060301507e-07,
"loss": 0.2226,
"num_input_tokens_seen": 270532608,
"step": 129
},
{
"epoch": 0.03277861825516894,
"grad_norm": 5.613616466522217,
"learning_rate": 6.532663316582915e-07,
"loss": 0.2701,
"num_input_tokens_seen": 272629760,
"step": 130
},
{
"epoch": 0.033030761472516386,
"grad_norm": 9.082258224487305,
"learning_rate": 6.582914572864321e-07,
"loss": 0.3726,
"num_input_tokens_seen": 274726912,
"step": 131
},
{
"epoch": 0.03328290468986384,
"grad_norm": 4.047947406768799,
"learning_rate": 6.633165829145728e-07,
"loss": 0.1323,
"num_input_tokens_seen": 276824064,
"step": 132
},
{
"epoch": 0.0335350479072113,
"grad_norm": 5.141188144683838,
"learning_rate": 6.683417085427135e-07,
"loss": 0.2615,
"num_input_tokens_seen": 278921216,
"step": 133
},
{
"epoch": 0.033787191124558746,
"grad_norm": 4.637810707092285,
"learning_rate": 6.733668341708542e-07,
"loss": 0.2252,
"num_input_tokens_seen": 281018368,
"step": 134
},
{
"epoch": 0.0340393343419062,
"grad_norm": 5.142843723297119,
"learning_rate": 6.783919597989949e-07,
"loss": 0.1817,
"num_input_tokens_seen": 283115520,
"step": 135
},
{
"epoch": 0.03429147755925366,
"grad_norm": 7.557190418243408,
"learning_rate": 6.834170854271357e-07,
"loss": 0.2897,
"num_input_tokens_seen": 285212672,
"step": 136
},
{
"epoch": 0.03454362077660111,
"grad_norm": 6.585993766784668,
"learning_rate": 6.884422110552764e-07,
"loss": 0.227,
"num_input_tokens_seen": 287309824,
"step": 137
},
{
"epoch": 0.03479576399394856,
"grad_norm": 4.926968574523926,
"learning_rate": 6.934673366834171e-07,
"loss": 0.1573,
"num_input_tokens_seen": 289406976,
"step": 138
},
{
"epoch": 0.03504790721129602,
"grad_norm": 6.03431510925293,
"learning_rate": 6.984924623115578e-07,
"loss": 0.2187,
"num_input_tokens_seen": 291504128,
"step": 139
},
{
"epoch": 0.03530005042864347,
"grad_norm": 9.677518844604492,
"learning_rate": 7.035175879396985e-07,
"loss": 0.2295,
"num_input_tokens_seen": 293601280,
"step": 140
},
{
"epoch": 0.03555219364599092,
"grad_norm": 6.820138931274414,
"learning_rate": 7.085427135678391e-07,
"loss": 0.1944,
"num_input_tokens_seen": 295698432,
"step": 141
},
{
"epoch": 0.03580433686333838,
"grad_norm": 5.568108081817627,
"learning_rate": 7.135678391959799e-07,
"loss": 0.3113,
"num_input_tokens_seen": 297795584,
"step": 142
},
{
"epoch": 0.03605648008068583,
"grad_norm": 6.417880058288574,
"learning_rate": 7.185929648241206e-07,
"loss": 0.2932,
"num_input_tokens_seen": 299892736,
"step": 143
},
{
"epoch": 0.036308623298033284,
"grad_norm": 5.040261745452881,
"learning_rate": 7.236180904522613e-07,
"loss": 0.2076,
"num_input_tokens_seen": 301989888,
"step": 144
},
{
"epoch": 0.03656076651538074,
"grad_norm": 6.350996494293213,
"learning_rate": 7.28643216080402e-07,
"loss": 0.1714,
"num_input_tokens_seen": 304087040,
"step": 145
},
{
"epoch": 0.03681290973272819,
"grad_norm": 5.744927406311035,
"learning_rate": 7.336683417085426e-07,
"loss": 0.1948,
"num_input_tokens_seen": 306184192,
"step": 146
},
{
"epoch": 0.037065052950075644,
"grad_norm": 5.379306793212891,
"learning_rate": 7.386934673366834e-07,
"loss": 0.1971,
"num_input_tokens_seen": 308281344,
"step": 147
},
{
"epoch": 0.03731719616742309,
"grad_norm": 4.08986234664917,
"learning_rate": 7.437185929648241e-07,
"loss": 0.1319,
"num_input_tokens_seen": 310378496,
"step": 148
},
{
"epoch": 0.03756933938477055,
"grad_norm": 8.005187034606934,
"learning_rate": 7.487437185929648e-07,
"loss": 0.3227,
"num_input_tokens_seen": 312475648,
"step": 149
},
{
"epoch": 0.037821482602118005,
"grad_norm": 6.485504627227783,
"learning_rate": 7.537688442211055e-07,
"loss": 0.4005,
"num_input_tokens_seen": 314572800,
"step": 150
},
{
"epoch": 0.038073625819465454,
"grad_norm": 7.763909339904785,
"learning_rate": 7.587939698492461e-07,
"loss": 0.3537,
"num_input_tokens_seen": 316669952,
"step": 151
},
{
"epoch": 0.03832576903681291,
"grad_norm": 5.093461036682129,
"learning_rate": 7.638190954773868e-07,
"loss": 0.1321,
"num_input_tokens_seen": 318767104,
"step": 152
},
{
"epoch": 0.038577912254160365,
"grad_norm": 4.274379730224609,
"learning_rate": 7.688442211055276e-07,
"loss": 0.1623,
"num_input_tokens_seen": 320864256,
"step": 153
},
{
"epoch": 0.038830055471507814,
"grad_norm": 5.359605312347412,
"learning_rate": 7.738693467336684e-07,
"loss": 0.2337,
"num_input_tokens_seen": 322961408,
"step": 154
},
{
"epoch": 0.03908219868885527,
"grad_norm": 5.039738655090332,
"learning_rate": 7.788944723618091e-07,
"loss": 0.2028,
"num_input_tokens_seen": 325058560,
"step": 155
},
{
"epoch": 0.039334341906202726,
"grad_norm": 5.888302326202393,
"learning_rate": 7.839195979899497e-07,
"loss": 0.1418,
"num_input_tokens_seen": 327155712,
"step": 156
},
{
"epoch": 0.039586485123550175,
"grad_norm": 5.222049236297607,
"learning_rate": 7.889447236180904e-07,
"loss": 0.1474,
"num_input_tokens_seen": 329252864,
"step": 157
},
{
"epoch": 0.03983862834089763,
"grad_norm": 5.662126064300537,
"learning_rate": 7.939698492462311e-07,
"loss": 0.2008,
"num_input_tokens_seen": 331350016,
"step": 158
},
{
"epoch": 0.040090771558245086,
"grad_norm": 4.854446887969971,
"learning_rate": 7.989949748743719e-07,
"loss": 0.1373,
"num_input_tokens_seen": 333447168,
"step": 159
},
{
"epoch": 0.040342914775592535,
"grad_norm": 5.8150177001953125,
"learning_rate": 8.040201005025126e-07,
"loss": 0.2512,
"num_input_tokens_seen": 335544320,
"step": 160
},
{
"epoch": 0.04059505799293999,
"grad_norm": 5.4808526039123535,
"learning_rate": 8.090452261306532e-07,
"loss": 0.1379,
"num_input_tokens_seen": 337641472,
"step": 161
},
{
"epoch": 0.04084720121028744,
"grad_norm": 5.683319091796875,
"learning_rate": 8.140703517587939e-07,
"loss": 0.2061,
"num_input_tokens_seen": 339738624,
"step": 162
},
{
"epoch": 0.041099344427634896,
"grad_norm": 5.919990062713623,
"learning_rate": 8.190954773869346e-07,
"loss": 0.2115,
"num_input_tokens_seen": 341835776,
"step": 163
},
{
"epoch": 0.04135148764498235,
"grad_norm": 4.193869113922119,
"learning_rate": 8.241206030150753e-07,
"loss": 0.1766,
"num_input_tokens_seen": 343932928,
"step": 164
},
{
"epoch": 0.0416036308623298,
"grad_norm": 4.4601945877075195,
"learning_rate": 8.291457286432161e-07,
"loss": 0.1939,
"num_input_tokens_seen": 346030080,
"step": 165
},
{
"epoch": 0.041855774079677256,
"grad_norm": 5.21290922164917,
"learning_rate": 8.341708542713567e-07,
"loss": 0.1787,
"num_input_tokens_seen": 348127232,
"step": 166
},
{
"epoch": 0.04210791729702471,
"grad_norm": 5.489988327026367,
"learning_rate": 8.391959798994974e-07,
"loss": 0.1809,
"num_input_tokens_seen": 350224384,
"step": 167
},
{
"epoch": 0.04236006051437216,
"grad_norm": 4.026052474975586,
"learning_rate": 8.442211055276381e-07,
"loss": 0.1248,
"num_input_tokens_seen": 352321536,
"step": 168
},
{
"epoch": 0.04261220373171962,
"grad_norm": 4.203098297119141,
"learning_rate": 8.492462311557788e-07,
"loss": 0.1089,
"num_input_tokens_seen": 354418688,
"step": 169
},
{
"epoch": 0.04286434694906707,
"grad_norm": 6.0608296394348145,
"learning_rate": 8.542713567839196e-07,
"loss": 0.185,
"num_input_tokens_seen": 356515840,
"step": 170
},
{
"epoch": 0.04311649016641452,
"grad_norm": 5.297198295593262,
"learning_rate": 8.592964824120602e-07,
"loss": 0.119,
"num_input_tokens_seen": 358612992,
"step": 171
},
{
"epoch": 0.04336863338376198,
"grad_norm": 4.82717227935791,
"learning_rate": 8.643216080402009e-07,
"loss": 0.1275,
"num_input_tokens_seen": 360710144,
"step": 172
},
{
"epoch": 0.04362077660110943,
"grad_norm": 7.091985702514648,
"learning_rate": 8.693467336683417e-07,
"loss": 0.3237,
"num_input_tokens_seen": 362807296,
"step": 173
},
{
"epoch": 0.04387291981845688,
"grad_norm": 4.359028339385986,
"learning_rate": 8.743718592964824e-07,
"loss": 0.1272,
"num_input_tokens_seen": 364904448,
"step": 174
},
{
"epoch": 0.04412506303580434,
"grad_norm": 4.864053726196289,
"learning_rate": 8.793969849246231e-07,
"loss": 0.2115,
"num_input_tokens_seen": 367001600,
"step": 175
},
{
"epoch": 0.044377206253151794,
"grad_norm": 4.585638523101807,
"learning_rate": 8.844221105527639e-07,
"loss": 0.1753,
"num_input_tokens_seen": 369098752,
"step": 176
},
{
"epoch": 0.04462934947049924,
"grad_norm": 6.2548933029174805,
"learning_rate": 8.894472361809045e-07,
"loss": 0.2436,
"num_input_tokens_seen": 371195904,
"step": 177
},
{
"epoch": 0.0448814926878467,
"grad_norm": 4.619575023651123,
"learning_rate": 8.944723618090452e-07,
"loss": 0.2271,
"num_input_tokens_seen": 373293056,
"step": 178
},
{
"epoch": 0.04513363590519415,
"grad_norm": 4.505560398101807,
"learning_rate": 8.994974874371859e-07,
"loss": 0.1728,
"num_input_tokens_seen": 375390208,
"step": 179
},
{
"epoch": 0.0453857791225416,
"grad_norm": 4.657378196716309,
"learning_rate": 9.045226130653266e-07,
"loss": 0.2134,
"num_input_tokens_seen": 377487360,
"step": 180
},
{
"epoch": 0.04563792233988906,
"grad_norm": 3.5373897552490234,
"learning_rate": 9.095477386934673e-07,
"loss": 0.125,
"num_input_tokens_seen": 379584512,
"step": 181
},
{
"epoch": 0.04589006555723651,
"grad_norm": 4.476269721984863,
"learning_rate": 9.14572864321608e-07,
"loss": 0.1805,
"num_input_tokens_seen": 381681664,
"step": 182
},
{
"epoch": 0.046142208774583963,
"grad_norm": 4.5421881675720215,
"learning_rate": 9.195979899497487e-07,
"loss": 0.1296,
"num_input_tokens_seen": 383778816,
"step": 183
},
{
"epoch": 0.04639435199193142,
"grad_norm": 4.141582012176514,
"learning_rate": 9.246231155778894e-07,
"loss": 0.194,
"num_input_tokens_seen": 385875968,
"step": 184
},
{
"epoch": 0.04664649520927887,
"grad_norm": 6.524399757385254,
"learning_rate": 9.296482412060301e-07,
"loss": 0.1595,
"num_input_tokens_seen": 387973120,
"step": 185
},
{
"epoch": 0.046898638426626324,
"grad_norm": 4.473093509674072,
"learning_rate": 9.346733668341708e-07,
"loss": 0.1909,
"num_input_tokens_seen": 390070272,
"step": 186
},
{
"epoch": 0.04715078164397378,
"grad_norm": 5.006099224090576,
"learning_rate": 9.396984924623114e-07,
"loss": 0.215,
"num_input_tokens_seen": 392167424,
"step": 187
},
{
"epoch": 0.04740292486132123,
"grad_norm": 4.727731227874756,
"learning_rate": 9.447236180904522e-07,
"loss": 0.1874,
"num_input_tokens_seen": 394264576,
"step": 188
},
{
"epoch": 0.047655068078668684,
"grad_norm": 4.6576828956604,
"learning_rate": 9.497487437185929e-07,
"loss": 0.1889,
"num_input_tokens_seen": 396361728,
"step": 189
},
{
"epoch": 0.04790721129601614,
"grad_norm": 4.223318099975586,
"learning_rate": 9.547738693467337e-07,
"loss": 0.1432,
"num_input_tokens_seen": 398458880,
"step": 190
},
{
"epoch": 0.04815935451336359,
"grad_norm": 3.288745641708374,
"learning_rate": 9.597989949748744e-07,
"loss": 0.1361,
"num_input_tokens_seen": 400556032,
"step": 191
},
{
"epoch": 0.048411497730711045,
"grad_norm": 4.024937629699707,
"learning_rate": 9.64824120603015e-07,
"loss": 0.1285,
"num_input_tokens_seen": 402653184,
"step": 192
},
{
"epoch": 0.048663640948058494,
"grad_norm": 4.060795783996582,
"learning_rate": 9.698492462311556e-07,
"loss": 0.1472,
"num_input_tokens_seen": 404750336,
"step": 193
},
{
"epoch": 0.04891578416540595,
"grad_norm": 5.01156759262085,
"learning_rate": 9.748743718592964e-07,
"loss": 0.2541,
"num_input_tokens_seen": 406847488,
"step": 194
},
{
"epoch": 0.049167927382753406,
"grad_norm": 3.8259568214416504,
"learning_rate": 9.79899497487437e-07,
"loss": 0.176,
"num_input_tokens_seen": 408944640,
"step": 195
},
{
"epoch": 0.049420070600100854,
"grad_norm": 4.526422500610352,
"learning_rate": 9.849246231155778e-07,
"loss": 0.2161,
"num_input_tokens_seen": 411041792,
"step": 196
},
{
"epoch": 0.04967221381744831,
"grad_norm": 4.0646867752075195,
"learning_rate": 9.899497487437185e-07,
"loss": 0.1361,
"num_input_tokens_seen": 413138944,
"step": 197
},
{
"epoch": 0.049924357034795766,
"grad_norm": 4.822361946105957,
"learning_rate": 9.949748743718592e-07,
"loss": 0.1678,
"num_input_tokens_seen": 415236096,
"step": 198
},
{
"epoch": 0.050176500252143215,
"grad_norm": 5.335970878601074,
"learning_rate": 1e-06,
"loss": 0.138,
"num_input_tokens_seen": 417333248,
"step": 199
},
{
"epoch": 0.05042864346949067,
"grad_norm": 4.283322811126709,
"learning_rate": 9.999998435084117e-07,
"loss": 0.1599,
"num_input_tokens_seen": 419430400,
"step": 200
},
{
"epoch": 0.05068078668683813,
"grad_norm": 3.6955955028533936,
"learning_rate": 9.999993740337564e-07,
"loss": 0.1203,
"num_input_tokens_seen": 421527552,
"step": 201
},
{
"epoch": 0.050932929904185575,
"grad_norm": 4.380987167358398,
"learning_rate": 9.999985915763598e-07,
"loss": 0.2069,
"num_input_tokens_seen": 423624704,
"step": 202
},
{
"epoch": 0.05118507312153303,
"grad_norm": 3.827716588973999,
"learning_rate": 9.999974961367668e-07,
"loss": 0.1987,
"num_input_tokens_seen": 425721856,
"step": 203
},
{
"epoch": 0.05143721633888049,
"grad_norm": 3.8995583057403564,
"learning_rate": 9.999960877157389e-07,
"loss": 0.1473,
"num_input_tokens_seen": 427819008,
"step": 204
},
{
"epoch": 0.051689359556227936,
"grad_norm": 3.6740832328796387,
"learning_rate": 9.99994366314256e-07,
"loss": 0.1348,
"num_input_tokens_seen": 429916160,
"step": 205
},
{
"epoch": 0.05194150277357539,
"grad_norm": 3.7553346157073975,
"learning_rate": 9.99992331933515e-07,
"loss": 0.1463,
"num_input_tokens_seen": 432013312,
"step": 206
},
{
"epoch": 0.05219364599092285,
"grad_norm": 4.992524147033691,
"learning_rate": 9.99989984574931e-07,
"loss": 0.2349,
"num_input_tokens_seen": 434110464,
"step": 207
},
{
"epoch": 0.052445789208270296,
"grad_norm": 4.383981704711914,
"learning_rate": 9.99987324240137e-07,
"loss": 0.1552,
"num_input_tokens_seen": 436207616,
"step": 208
},
{
"epoch": 0.05269793242561775,
"grad_norm": 4.6292619705200195,
"learning_rate": 9.999843509309827e-07,
"loss": 0.1998,
"num_input_tokens_seen": 438304768,
"step": 209
},
{
"epoch": 0.0529500756429652,
"grad_norm": 3.5693604946136475,
"learning_rate": 9.999810646495363e-07,
"loss": 0.1409,
"num_input_tokens_seen": 440401920,
"step": 210
},
{
"epoch": 0.05320221886031266,
"grad_norm": 4.460555553436279,
"learning_rate": 9.999774653980837e-07,
"loss": 0.2005,
"num_input_tokens_seen": 442499072,
"step": 211
},
{
"epoch": 0.05345436207766011,
"grad_norm": 3.6692800521850586,
"learning_rate": 9.99973553179128e-07,
"loss": 0.1358,
"num_input_tokens_seen": 444596224,
"step": 212
},
{
"epoch": 0.05370650529500756,
"grad_norm": 3.4849557876586914,
"learning_rate": 9.999693279953903e-07,
"loss": 0.1199,
"num_input_tokens_seen": 446693376,
"step": 213
},
{
"epoch": 0.05395864851235502,
"grad_norm": 3.9747097492218018,
"learning_rate": 9.999647898498095e-07,
"loss": 0.1885,
"num_input_tokens_seen": 448790528,
"step": 214
},
{
"epoch": 0.05421079172970247,
"grad_norm": 4.172543525695801,
"learning_rate": 9.999599387455416e-07,
"loss": 0.2118,
"num_input_tokens_seen": 450887680,
"step": 215
},
{
"epoch": 0.05446293494704992,
"grad_norm": 3.811913013458252,
"learning_rate": 9.999547746859607e-07,
"loss": 0.1973,
"num_input_tokens_seen": 452984832,
"step": 216
},
{
"epoch": 0.05471507816439738,
"grad_norm": 3.7271082401275635,
"learning_rate": 9.999492976746585e-07,
"loss": 0.2219,
"num_input_tokens_seen": 455081984,
"step": 217
},
{
"epoch": 0.054967221381744834,
"grad_norm": 4.112778186798096,
"learning_rate": 9.999435077154446e-07,
"loss": 0.1748,
"num_input_tokens_seen": 457179136,
"step": 218
},
{
"epoch": 0.05521936459909228,
"grad_norm": 6.517294883728027,
"learning_rate": 9.99937404812346e-07,
"loss": 0.3107,
"num_input_tokens_seen": 459276288,
"step": 219
},
{
"epoch": 0.05547150781643974,
"grad_norm": 4.02686071395874,
"learning_rate": 9.99930988969607e-07,
"loss": 0.0861,
"num_input_tokens_seen": 461373440,
"step": 220
},
{
"epoch": 0.055723651033787194,
"grad_norm": 3.6635353565216064,
"learning_rate": 9.999242601916902e-07,
"loss": 0.2132,
"num_input_tokens_seen": 463470592,
"step": 221
},
{
"epoch": 0.05597579425113464,
"grad_norm": 4.417490005493164,
"learning_rate": 9.999172184832756e-07,
"loss": 0.2374,
"num_input_tokens_seen": 465567744,
"step": 222
},
{
"epoch": 0.0562279374684821,
"grad_norm": 3.173140048980713,
"learning_rate": 9.99909863849261e-07,
"loss": 0.1771,
"num_input_tokens_seen": 467664896,
"step": 223
},
{
"epoch": 0.05648008068582955,
"grad_norm": 5.276343822479248,
"learning_rate": 9.999021962947612e-07,
"loss": 0.1569,
"num_input_tokens_seen": 469762048,
"step": 224
},
{
"epoch": 0.056732223903177004,
"grad_norm": 4.241299629211426,
"learning_rate": 9.998942158251096e-07,
"loss": 0.2738,
"num_input_tokens_seen": 471859200,
"step": 225
},
{
"epoch": 0.05698436712052446,
"grad_norm": 4.36360502243042,
"learning_rate": 9.998859224458565e-07,
"loss": 0.2735,
"num_input_tokens_seen": 473956352,
"step": 226
},
{
"epoch": 0.05723651033787191,
"grad_norm": 5.051778316497803,
"learning_rate": 9.998773161627701e-07,
"loss": 0.1831,
"num_input_tokens_seen": 476053504,
"step": 227
},
{
"epoch": 0.057488653555219364,
"grad_norm": 3.883115291595459,
"learning_rate": 9.998683969818364e-07,
"loss": 0.1617,
"num_input_tokens_seen": 478150656,
"step": 228
},
{
"epoch": 0.05774079677256682,
"grad_norm": 3.9679079055786133,
"learning_rate": 9.998591649092588e-07,
"loss": 0.1273,
"num_input_tokens_seen": 480247808,
"step": 229
},
{
"epoch": 0.05799293998991427,
"grad_norm": 6.0246901512146,
"learning_rate": 9.998496199514582e-07,
"loss": 0.1463,
"num_input_tokens_seen": 482344960,
"step": 230
},
{
"epoch": 0.058245083207261725,
"grad_norm": 3.684004545211792,
"learning_rate": 9.998397621150734e-07,
"loss": 0.1422,
"num_input_tokens_seen": 484442112,
"step": 231
},
{
"epoch": 0.05849722642460918,
"grad_norm": 5.111332416534424,
"learning_rate": 9.998295914069606e-07,
"loss": 0.2197,
"num_input_tokens_seen": 486539264,
"step": 232
},
{
"epoch": 0.05874936964195663,
"grad_norm": 3.0218448638916016,
"learning_rate": 9.99819107834194e-07,
"loss": 0.1219,
"num_input_tokens_seen": 488636416,
"step": 233
},
{
"epoch": 0.059001512859304085,
"grad_norm": 3.564114570617676,
"learning_rate": 9.99808311404065e-07,
"loss": 0.1983,
"num_input_tokens_seen": 490733568,
"step": 234
},
{
"epoch": 0.05925365607665154,
"grad_norm": 6.091875076293945,
"learning_rate": 9.997972021240824e-07,
"loss": 0.2782,
"num_input_tokens_seen": 492830720,
"step": 235
},
{
"epoch": 0.05950579929399899,
"grad_norm": 4.984955787658691,
"learning_rate": 9.997857800019734e-07,
"loss": 0.2658,
"num_input_tokens_seen": 494927872,
"step": 236
},
{
"epoch": 0.059757942511346446,
"grad_norm": 4.2022705078125,
"learning_rate": 9.997740450456819e-07,
"loss": 0.1511,
"num_input_tokens_seen": 497025024,
"step": 237
},
{
"epoch": 0.0600100857286939,
"grad_norm": 4.631911277770996,
"learning_rate": 9.997619972633701e-07,
"loss": 0.1874,
"num_input_tokens_seen": 499122176,
"step": 238
},
{
"epoch": 0.06026222894604135,
"grad_norm": 3.489034414291382,
"learning_rate": 9.99749636663417e-07,
"loss": 0.1684,
"num_input_tokens_seen": 501219328,
"step": 239
},
{
"epoch": 0.060514372163388806,
"grad_norm": 5.1144185066223145,
"learning_rate": 9.997369632544202e-07,
"loss": 0.1834,
"num_input_tokens_seen": 503316480,
"step": 240
},
{
"epoch": 0.060766515380736255,
"grad_norm": 5.526945114135742,
"learning_rate": 9.997239770451938e-07,
"loss": 0.2135,
"num_input_tokens_seen": 505413632,
"step": 241
},
{
"epoch": 0.06101865859808371,
"grad_norm": 6.000234127044678,
"learning_rate": 9.997106780447705e-07,
"loss": 0.2248,
"num_input_tokens_seen": 507510784,
"step": 242
},
{
"epoch": 0.06127080181543117,
"grad_norm": 3.4181573390960693,
"learning_rate": 9.99697066262399e-07,
"loss": 0.0903,
"num_input_tokens_seen": 509607936,
"step": 243
},
{
"epoch": 0.061522945032778616,
"grad_norm": 3.6254003047943115,
"learning_rate": 9.996831417075477e-07,
"loss": 0.1507,
"num_input_tokens_seen": 511705088,
"step": 244
},
{
"epoch": 0.06177508825012607,
"grad_norm": 3.7657456398010254,
"learning_rate": 9.996689043899005e-07,
"loss": 0.1569,
"num_input_tokens_seen": 513802240,
"step": 245
},
{
"epoch": 0.06202723146747353,
"grad_norm": 4.642493724822998,
"learning_rate": 9.996543543193604e-07,
"loss": 0.1187,
"num_input_tokens_seen": 515899392,
"step": 246
},
{
"epoch": 0.062279374684820976,
"grad_norm": 3.632336378097534,
"learning_rate": 9.996394915060468e-07,
"loss": 0.1736,
"num_input_tokens_seen": 517996544,
"step": 247
},
{
"epoch": 0.06253151790216843,
"grad_norm": 4.491301536560059,
"learning_rate": 9.99624315960297e-07,
"loss": 0.2351,
"num_input_tokens_seen": 520093696,
"step": 248
},
{
"epoch": 0.06278366111951589,
"grad_norm": 2.526890277862549,
"learning_rate": 9.996088276926661e-07,
"loss": 0.1088,
"num_input_tokens_seen": 522190848,
"step": 249
},
{
"epoch": 0.06303580433686334,
"grad_norm": 4.402822971343994,
"learning_rate": 9.995930267139266e-07,
"loss": 0.1189,
"num_input_tokens_seen": 524288000,
"step": 250
},
{
"epoch": 0.06328794755421079,
"grad_norm": 2.893916368484497,
"learning_rate": 9.99576913035068e-07,
"loss": 0.1003,
"num_input_tokens_seen": 526385152,
"step": 251
},
{
"epoch": 0.06354009077155824,
"grad_norm": 4.437779426574707,
"learning_rate": 9.995604866672978e-07,
"loss": 0.21,
"num_input_tokens_seen": 528482304,
"step": 252
},
{
"epoch": 0.0637922339889057,
"grad_norm": 7.890944957733154,
"learning_rate": 9.995437476220408e-07,
"loss": 0.3668,
"num_input_tokens_seen": 530579456,
"step": 253
},
{
"epoch": 0.06404437720625315,
"grad_norm": 3.5893633365631104,
"learning_rate": 9.995266959109396e-07,
"loss": 0.1771,
"num_input_tokens_seen": 532676608,
"step": 254
},
{
"epoch": 0.06429652042360061,
"grad_norm": 4.691050052642822,
"learning_rate": 9.995093315458534e-07,
"loss": 0.1696,
"num_input_tokens_seen": 534773760,
"step": 255
},
{
"epoch": 0.06454866364094806,
"grad_norm": 2.8213396072387695,
"learning_rate": 9.9949165453886e-07,
"loss": 0.1364,
"num_input_tokens_seen": 536870912,
"step": 256
},
{
"epoch": 0.0648008068582955,
"grad_norm": 4.529366493225098,
"learning_rate": 9.994736649022539e-07,
"loss": 0.1749,
"num_input_tokens_seen": 538968064,
"step": 257
},
{
"epoch": 0.06505295007564296,
"grad_norm": 3.919793128967285,
"learning_rate": 9.99455362648547e-07,
"loss": 0.1611,
"num_input_tokens_seen": 541065216,
"step": 258
},
{
"epoch": 0.06530509329299042,
"grad_norm": 4.9372711181640625,
"learning_rate": 9.994367477904695e-07,
"loss": 0.2556,
"num_input_tokens_seen": 543162368,
"step": 259
},
{
"epoch": 0.06555723651033787,
"grad_norm": 5.533105850219727,
"learning_rate": 9.994178203409674e-07,
"loss": 0.1598,
"num_input_tokens_seen": 545259520,
"step": 260
},
{
"epoch": 0.06580937972768533,
"grad_norm": 4.164669990539551,
"learning_rate": 9.993985803132057e-07,
"loss": 0.1743,
"num_input_tokens_seen": 547356672,
"step": 261
},
{
"epoch": 0.06606152294503277,
"grad_norm": 3.924823045730591,
"learning_rate": 9.993790277205662e-07,
"loss": 0.169,
"num_input_tokens_seen": 549453824,
"step": 262
},
{
"epoch": 0.06631366616238023,
"grad_norm": 3.045861005783081,
"learning_rate": 9.993591625766477e-07,
"loss": 0.1027,
"num_input_tokens_seen": 551550976,
"step": 263
},
{
"epoch": 0.06656580937972768,
"grad_norm": 2.7366058826446533,
"learning_rate": 9.993389848952673e-07,
"loss": 0.1027,
"num_input_tokens_seen": 553648128,
"step": 264
},
{
"epoch": 0.06681795259707514,
"grad_norm": 4.305903434753418,
"learning_rate": 9.993184946904586e-07,
"loss": 0.0899,
"num_input_tokens_seen": 555745280,
"step": 265
},
{
"epoch": 0.0670700958144226,
"grad_norm": 4.169579029083252,
"learning_rate": 9.992976919764728e-07,
"loss": 0.1555,
"num_input_tokens_seen": 557842432,
"step": 266
},
{
"epoch": 0.06732223903177005,
"grad_norm": 2.866806983947754,
"learning_rate": 9.992765767677789e-07,
"loss": 0.1226,
"num_input_tokens_seen": 559939584,
"step": 267
},
{
"epoch": 0.06757438224911749,
"grad_norm": 3.6884562969207764,
"learning_rate": 9.992551490790626e-07,
"loss": 0.1359,
"num_input_tokens_seen": 562036736,
"step": 268
},
{
"epoch": 0.06782652546646495,
"grad_norm": 4.731523513793945,
"learning_rate": 9.992334089252278e-07,
"loss": 0.1438,
"num_input_tokens_seen": 564133888,
"step": 269
},
{
"epoch": 0.0680786686838124,
"grad_norm": 3.90913724899292,
"learning_rate": 9.992113563213944e-07,
"loss": 0.1596,
"num_input_tokens_seen": 566231040,
"step": 270
},
{
"epoch": 0.06833081190115986,
"grad_norm": 3.4404547214508057,
"learning_rate": 9.99188991282901e-07,
"loss": 0.165,
"num_input_tokens_seen": 568328192,
"step": 271
},
{
"epoch": 0.06858295511850732,
"grad_norm": 2.840576648712158,
"learning_rate": 9.991663138253025e-07,
"loss": 0.109,
"num_input_tokens_seen": 570425344,
"step": 272
},
{
"epoch": 0.06883509833585477,
"grad_norm": 4.362993240356445,
"learning_rate": 9.991433239643716e-07,
"loss": 0.209,
"num_input_tokens_seen": 572522496,
"step": 273
},
{
"epoch": 0.06908724155320221,
"grad_norm": 4.26267671585083,
"learning_rate": 9.991200217160984e-07,
"loss": 0.0746,
"num_input_tokens_seen": 574619648,
"step": 274
},
{
"epoch": 0.06933938477054967,
"grad_norm": 3.7214324474334717,
"learning_rate": 9.990964070966895e-07,
"loss": 0.1395,
"num_input_tokens_seen": 576716800,
"step": 275
},
{
"epoch": 0.06959152798789713,
"grad_norm": 4.263853549957275,
"learning_rate": 9.9907248012257e-07,
"loss": 0.1919,
"num_input_tokens_seen": 578813952,
"step": 276
},
{
"epoch": 0.06984367120524458,
"grad_norm": 3.7660653591156006,
"learning_rate": 9.99048240810381e-07,
"loss": 0.1362,
"num_input_tokens_seen": 580911104,
"step": 277
},
{
"epoch": 0.07009581442259204,
"grad_norm": 3.3318731784820557,
"learning_rate": 9.990236891769818e-07,
"loss": 0.0849,
"num_input_tokens_seen": 583008256,
"step": 278
},
{
"epoch": 0.07034795763993948,
"grad_norm": 3.9983317852020264,
"learning_rate": 9.98998825239448e-07,
"loss": 0.1731,
"num_input_tokens_seen": 585105408,
"step": 279
},
{
"epoch": 0.07060010085728693,
"grad_norm": 3.032134532928467,
"learning_rate": 9.98973649015073e-07,
"loss": 0.1278,
"num_input_tokens_seen": 587202560,
"step": 280
},
{
"epoch": 0.07085224407463439,
"grad_norm": 3.8470921516418457,
"learning_rate": 9.98948160521368e-07,
"loss": 0.103,
"num_input_tokens_seen": 589299712,
"step": 281
},
{
"epoch": 0.07110438729198185,
"grad_norm": 2.935425043106079,
"learning_rate": 9.989223597760598e-07,
"loss": 0.1472,
"num_input_tokens_seen": 591396864,
"step": 282
},
{
"epoch": 0.0713565305093293,
"grad_norm": 3.791640043258667,
"learning_rate": 9.988962467970938e-07,
"loss": 0.1743,
"num_input_tokens_seen": 593494016,
"step": 283
},
{
"epoch": 0.07160867372667676,
"grad_norm": 2.616250991821289,
"learning_rate": 9.988698216026322e-07,
"loss": 0.0769,
"num_input_tokens_seen": 595591168,
"step": 284
},
{
"epoch": 0.0718608169440242,
"grad_norm": 3.309394359588623,
"learning_rate": 9.988430842110538e-07,
"loss": 0.1357,
"num_input_tokens_seen": 597688320,
"step": 285
},
{
"epoch": 0.07211296016137166,
"grad_norm": 4.600468635559082,
"learning_rate": 9.988160346409551e-07,
"loss": 0.1178,
"num_input_tokens_seen": 599785472,
"step": 286
},
{
"epoch": 0.07236510337871911,
"grad_norm": 3.2695717811584473,
"learning_rate": 9.987886729111496e-07,
"loss": 0.1122,
"num_input_tokens_seen": 601882624,
"step": 287
},
{
"epoch": 0.07261724659606657,
"grad_norm": 2.7870922088623047,
"learning_rate": 9.98760999040668e-07,
"loss": 0.0995,
"num_input_tokens_seen": 603979776,
"step": 288
},
{
"epoch": 0.07286938981341402,
"grad_norm": 3.2872393131256104,
"learning_rate": 9.987330130487576e-07,
"loss": 0.1314,
"num_input_tokens_seen": 606076928,
"step": 289
},
{
"epoch": 0.07312153303076148,
"grad_norm": 4.210444927215576,
"learning_rate": 9.987047149548833e-07,
"loss": 0.1435,
"num_input_tokens_seen": 608174080,
"step": 290
},
{
"epoch": 0.07337367624810892,
"grad_norm": 3.661651372909546,
"learning_rate": 9.986761047787274e-07,
"loss": 0.1075,
"num_input_tokens_seen": 610271232,
"step": 291
},
{
"epoch": 0.07362581946545638,
"grad_norm": 4.133707046508789,
"learning_rate": 9.986471825401882e-07,
"loss": 0.1977,
"num_input_tokens_seen": 612368384,
"step": 292
},
{
"epoch": 0.07387796268280383,
"grad_norm": 4.6356072425842285,
"learning_rate": 9.98617948259382e-07,
"loss": 0.1703,
"num_input_tokens_seen": 614465536,
"step": 293
},
{
"epoch": 0.07413010590015129,
"grad_norm": 3.9383256435394287,
"learning_rate": 9.985884019566416e-07,
"loss": 0.1848,
"num_input_tokens_seen": 616562688,
"step": 294
},
{
"epoch": 0.07438224911749874,
"grad_norm": 4.793269157409668,
"learning_rate": 9.985585436525168e-07,
"loss": 0.1488,
"num_input_tokens_seen": 618659840,
"step": 295
},
{
"epoch": 0.07463439233484619,
"grad_norm": 6.518699645996094,
"learning_rate": 9.98528373367775e-07,
"loss": 0.24,
"num_input_tokens_seen": 620756992,
"step": 296
},
{
"epoch": 0.07488653555219364,
"grad_norm": 3.71830415725708,
"learning_rate": 9.984978911234003e-07,
"loss": 0.1444,
"num_input_tokens_seen": 622854144,
"step": 297
},
{
"epoch": 0.0751386787695411,
"grad_norm": 3.535399913787842,
"learning_rate": 9.984670969405932e-07,
"loss": 0.145,
"num_input_tokens_seen": 624951296,
"step": 298
},
{
"epoch": 0.07539082198688855,
"grad_norm": 2.5828938484191895,
"learning_rate": 9.984359908407716e-07,
"loss": 0.1091,
"num_input_tokens_seen": 627048448,
"step": 299
},
{
"epoch": 0.07564296520423601,
"grad_norm": 3.900514841079712,
"learning_rate": 9.984045728455707e-07,
"loss": 0.1672,
"num_input_tokens_seen": 629145600,
"step": 300
},
{
"epoch": 0.07589510842158347,
"grad_norm": 4.364770412445068,
"learning_rate": 9.98372842976842e-07,
"loss": 0.2678,
"num_input_tokens_seen": 631242752,
"step": 301
},
{
"epoch": 0.07614725163893091,
"grad_norm": 3.6578245162963867,
"learning_rate": 9.983408012566545e-07,
"loss": 0.1238,
"num_input_tokens_seen": 633339904,
"step": 302
},
{
"epoch": 0.07639939485627836,
"grad_norm": 3.067723512649536,
"learning_rate": 9.983084477072936e-07,
"loss": 0.092,
"num_input_tokens_seen": 635437056,
"step": 303
},
{
"epoch": 0.07665153807362582,
"grad_norm": 2.8249781131744385,
"learning_rate": 9.982757823512619e-07,
"loss": 0.1065,
"num_input_tokens_seen": 637534208,
"step": 304
},
{
"epoch": 0.07690368129097327,
"grad_norm": 3.4561619758605957,
"learning_rate": 9.982428052112784e-07,
"loss": 0.1463,
"num_input_tokens_seen": 639631360,
"step": 305
},
{
"epoch": 0.07715582450832073,
"grad_norm": 4.192049503326416,
"learning_rate": 9.982095163102796e-07,
"loss": 0.1127,
"num_input_tokens_seen": 641728512,
"step": 306
},
{
"epoch": 0.07740796772566819,
"grad_norm": 2.888293743133545,
"learning_rate": 9.981759156714185e-07,
"loss": 0.113,
"num_input_tokens_seen": 643825664,
"step": 307
},
{
"epoch": 0.07766011094301563,
"grad_norm": 3.8195247650146484,
"learning_rate": 9.981420033180651e-07,
"loss": 0.1601,
"num_input_tokens_seen": 645922816,
"step": 308
},
{
"epoch": 0.07791225416036308,
"grad_norm": 3.721971035003662,
"learning_rate": 9.98107779273806e-07,
"loss": 0.1443,
"num_input_tokens_seen": 648019968,
"step": 309
},
{
"epoch": 0.07816439737771054,
"grad_norm": 3.4332494735717773,
"learning_rate": 9.980732435624441e-07,
"loss": 0.1503,
"num_input_tokens_seen": 650117120,
"step": 310
},
{
"epoch": 0.078416540595058,
"grad_norm": 2.9033710956573486,
"learning_rate": 9.980383962080003e-07,
"loss": 0.073,
"num_input_tokens_seen": 652214272,
"step": 311
},
{
"epoch": 0.07866868381240545,
"grad_norm": 3.597287178039551,
"learning_rate": 9.980032372347116e-07,
"loss": 0.1596,
"num_input_tokens_seen": 654311424,
"step": 312
},
{
"epoch": 0.0789208270297529,
"grad_norm": 3.0851659774780273,
"learning_rate": 9.97967766667031e-07,
"loss": 0.1188,
"num_input_tokens_seen": 656408576,
"step": 313
},
{
"epoch": 0.07917297024710035,
"grad_norm": 2.279250144958496,
"learning_rate": 9.979319845296296e-07,
"loss": 0.0974,
"num_input_tokens_seen": 658505728,
"step": 314
},
{
"epoch": 0.0794251134644478,
"grad_norm": 4.360164165496826,
"learning_rate": 9.978958908473941e-07,
"loss": 0.1992,
"num_input_tokens_seen": 660602880,
"step": 315
},
{
"epoch": 0.07967725668179526,
"grad_norm": 2.8060495853424072,
"learning_rate": 9.978594856454288e-07,
"loss": 0.1314,
"num_input_tokens_seen": 662700032,
"step": 316
},
{
"epoch": 0.07992939989914272,
"grad_norm": 4.089578628540039,
"learning_rate": 9.978227689490536e-07,
"loss": 0.1807,
"num_input_tokens_seen": 664797184,
"step": 317
},
{
"epoch": 0.08018154311649017,
"grad_norm": 3.043846368789673,
"learning_rate": 9.977857407838061e-07,
"loss": 0.1208,
"num_input_tokens_seen": 666894336,
"step": 318
},
{
"epoch": 0.08043368633383761,
"grad_norm": 2.2600390911102295,
"learning_rate": 9.9774840117544e-07,
"loss": 0.076,
"num_input_tokens_seen": 668991488,
"step": 319
},
{
"epoch": 0.08068582955118507,
"grad_norm": 3.115410089492798,
"learning_rate": 9.977107501499253e-07,
"loss": 0.1118,
"num_input_tokens_seen": 671088640,
"step": 320
},
{
"epoch": 0.08093797276853253,
"grad_norm": 3.720118761062622,
"learning_rate": 9.976727877334493e-07,
"loss": 0.1518,
"num_input_tokens_seen": 673185792,
"step": 321
},
{
"epoch": 0.08119011598587998,
"grad_norm": 3.6921238899230957,
"learning_rate": 9.976345139524152e-07,
"loss": 0.1261,
"num_input_tokens_seen": 675282944,
"step": 322
},
{
"epoch": 0.08144225920322744,
"grad_norm": 3.162914752960205,
"learning_rate": 9.975959288334438e-07,
"loss": 0.1038,
"num_input_tokens_seen": 677380096,
"step": 323
},
{
"epoch": 0.08169440242057488,
"grad_norm": 3.166231870651245,
"learning_rate": 9.97557032403371e-07,
"loss": 0.1294,
"num_input_tokens_seen": 679477248,
"step": 324
},
{
"epoch": 0.08194654563792234,
"grad_norm": 3.0747804641723633,
"learning_rate": 9.975178246892507e-07,
"loss": 0.1425,
"num_input_tokens_seen": 681574400,
"step": 325
},
{
"epoch": 0.08219868885526979,
"grad_norm": 3.0979673862457275,
"learning_rate": 9.974783057183519e-07,
"loss": 0.1586,
"num_input_tokens_seen": 683671552,
"step": 326
},
{
"epoch": 0.08245083207261725,
"grad_norm": 4.019197940826416,
"learning_rate": 9.974384755181609e-07,
"loss": 0.1663,
"num_input_tokens_seen": 685768704,
"step": 327
},
{
"epoch": 0.0827029752899647,
"grad_norm": 2.6061339378356934,
"learning_rate": 9.973983341163807e-07,
"loss": 0.0851,
"num_input_tokens_seen": 687865856,
"step": 328
},
{
"epoch": 0.08295511850731216,
"grad_norm": 3.0148558616638184,
"learning_rate": 9.9735788154093e-07,
"loss": 0.0966,
"num_input_tokens_seen": 689963008,
"step": 329
},
{
"epoch": 0.0832072617246596,
"grad_norm": 2.6705162525177,
"learning_rate": 9.973171178199447e-07,
"loss": 0.0839,
"num_input_tokens_seen": 692060160,
"step": 330
},
{
"epoch": 0.08345940494200706,
"grad_norm": 4.910850524902344,
"learning_rate": 9.972760429817763e-07,
"loss": 0.1695,
"num_input_tokens_seen": 694157312,
"step": 331
},
{
"epoch": 0.08371154815935451,
"grad_norm": 3.358743190765381,
"learning_rate": 9.972346570549932e-07,
"loss": 0.0935,
"num_input_tokens_seen": 696254464,
"step": 332
},
{
"epoch": 0.08396369137670197,
"grad_norm": 3.214064598083496,
"learning_rate": 9.971929600683802e-07,
"loss": 0.0848,
"num_input_tokens_seen": 698351616,
"step": 333
},
{
"epoch": 0.08421583459404942,
"grad_norm": 4.408289432525635,
"learning_rate": 9.971509520509381e-07,
"loss": 0.1624,
"num_input_tokens_seen": 700448768,
"step": 334
},
{
"epoch": 0.08446797781139688,
"grad_norm": 4.276678085327148,
"learning_rate": 9.971086330318845e-07,
"loss": 0.1458,
"num_input_tokens_seen": 702545920,
"step": 335
},
{
"epoch": 0.08472012102874432,
"grad_norm": 2.518461227416992,
"learning_rate": 9.97066003040653e-07,
"loss": 0.0934,
"num_input_tokens_seen": 704643072,
"step": 336
},
{
"epoch": 0.08497226424609178,
"grad_norm": 2.8323476314544678,
"learning_rate": 9.970230621068932e-07,
"loss": 0.1324,
"num_input_tokens_seen": 706740224,
"step": 337
},
{
"epoch": 0.08522440746343923,
"grad_norm": 2.8873610496520996,
"learning_rate": 9.969798102604717e-07,
"loss": 0.1292,
"num_input_tokens_seen": 708837376,
"step": 338
},
{
"epoch": 0.08547655068078669,
"grad_norm": 2.796959638595581,
"learning_rate": 9.969362475314708e-07,
"loss": 0.1086,
"num_input_tokens_seen": 710934528,
"step": 339
},
{
"epoch": 0.08572869389813415,
"grad_norm": 4.745234966278076,
"learning_rate": 9.968923739501892e-07,
"loss": 0.2212,
"num_input_tokens_seen": 713031680,
"step": 340
},
{
"epoch": 0.08598083711548159,
"grad_norm": 4.436620235443115,
"learning_rate": 9.968481895471417e-07,
"loss": 0.1376,
"num_input_tokens_seen": 715128832,
"step": 341
},
{
"epoch": 0.08623298033282904,
"grad_norm": 4.772200584411621,
"learning_rate": 9.968036943530592e-07,
"loss": 0.193,
"num_input_tokens_seen": 717225984,
"step": 342
},
{
"epoch": 0.0864851235501765,
"grad_norm": 3.2390449047088623,
"learning_rate": 9.967588883988893e-07,
"loss": 0.0999,
"num_input_tokens_seen": 719323136,
"step": 343
},
{
"epoch": 0.08673726676752395,
"grad_norm": 3.936569929122925,
"learning_rate": 9.967137717157951e-07,
"loss": 0.1634,
"num_input_tokens_seen": 721420288,
"step": 344
},
{
"epoch": 0.08698940998487141,
"grad_norm": 3.647679567337036,
"learning_rate": 9.966683443351564e-07,
"loss": 0.1798,
"num_input_tokens_seen": 723517440,
"step": 345
},
{
"epoch": 0.08724155320221887,
"grad_norm": 2.8842921257019043,
"learning_rate": 9.966226062885682e-07,
"loss": 0.1033,
"num_input_tokens_seen": 725614592,
"step": 346
},
{
"epoch": 0.08749369641956631,
"grad_norm": 6.5264434814453125,
"learning_rate": 9.965765576078424e-07,
"loss": 0.2729,
"num_input_tokens_seen": 727711744,
"step": 347
},
{
"epoch": 0.08774583963691376,
"grad_norm": 3.786755084991455,
"learning_rate": 9.96530198325007e-07,
"loss": 0.1233,
"num_input_tokens_seen": 729808896,
"step": 348
},
{
"epoch": 0.08799798285426122,
"grad_norm": 3.994030237197876,
"learning_rate": 9.964835284723052e-07,
"loss": 0.1229,
"num_input_tokens_seen": 731906048,
"step": 349
},
{
"epoch": 0.08825012607160868,
"grad_norm": 4.352416038513184,
"learning_rate": 9.96436548082197e-07,
"loss": 0.1501,
"num_input_tokens_seen": 734003200,
"step": 350
},
{
"epoch": 0.08850226928895613,
"grad_norm": 3.238286018371582,
"learning_rate": 9.963892571873584e-07,
"loss": 0.1314,
"num_input_tokens_seen": 736100352,
"step": 351
},
{
"epoch": 0.08875441250630359,
"grad_norm": 2.75301456451416,
"learning_rate": 9.963416558206806e-07,
"loss": 0.1137,
"num_input_tokens_seen": 738197504,
"step": 352
},
{
"epoch": 0.08900655572365103,
"grad_norm": 3.3911097049713135,
"learning_rate": 9.962937440152712e-07,
"loss": 0.0976,
"num_input_tokens_seen": 740294656,
"step": 353
},
{
"epoch": 0.08925869894099848,
"grad_norm": 2.7000679969787598,
"learning_rate": 9.962455218044542e-07,
"loss": 0.063,
"num_input_tokens_seen": 742391808,
"step": 354
},
{
"epoch": 0.08951084215834594,
"grad_norm": 3.3619422912597656,
"learning_rate": 9.961969892217688e-07,
"loss": 0.1167,
"num_input_tokens_seen": 744488960,
"step": 355
},
{
"epoch": 0.0897629853756934,
"grad_norm": 2.421957015991211,
"learning_rate": 9.9614814630097e-07,
"loss": 0.1184,
"num_input_tokens_seen": 746586112,
"step": 356
},
{
"epoch": 0.09001512859304085,
"grad_norm": 3.2838544845581055,
"learning_rate": 9.960989930760294e-07,
"loss": 0.1133,
"num_input_tokens_seen": 748683264,
"step": 357
},
{
"epoch": 0.0902672718103883,
"grad_norm": 4.716813564300537,
"learning_rate": 9.960495295811337e-07,
"loss": 0.152,
"num_input_tokens_seen": 750780416,
"step": 358
},
{
"epoch": 0.09051941502773575,
"grad_norm": 3.567866563796997,
"learning_rate": 9.959997558506857e-07,
"loss": 0.1348,
"num_input_tokens_seen": 752877568,
"step": 359
},
{
"epoch": 0.0907715582450832,
"grad_norm": 8.155049324035645,
"learning_rate": 9.959496719193039e-07,
"loss": 0.1658,
"num_input_tokens_seen": 754974720,
"step": 360
},
{
"epoch": 0.09102370146243066,
"grad_norm": 4.341349124908447,
"learning_rate": 9.958992778218226e-07,
"loss": 0.1635,
"num_input_tokens_seen": 757071872,
"step": 361
},
{
"epoch": 0.09127584467977812,
"grad_norm": 4.6380815505981445,
"learning_rate": 9.95848573593292e-07,
"loss": 0.1715,
"num_input_tokens_seen": 759169024,
"step": 362
},
{
"epoch": 0.09152798789712557,
"grad_norm": 3.3967676162719727,
"learning_rate": 9.957975592689774e-07,
"loss": 0.106,
"num_input_tokens_seen": 761266176,
"step": 363
},
{
"epoch": 0.09178013111447302,
"grad_norm": 2.9890308380126953,
"learning_rate": 9.957462348843607e-07,
"loss": 0.1163,
"num_input_tokens_seen": 763363328,
"step": 364
},
{
"epoch": 0.09203227433182047,
"grad_norm": 2.564323663711548,
"learning_rate": 9.956946004751386e-07,
"loss": 0.1217,
"num_input_tokens_seen": 765460480,
"step": 365
},
{
"epoch": 0.09228441754916793,
"grad_norm": 4.0984697341918945,
"learning_rate": 9.956426560772238e-07,
"loss": 0.1801,
"num_input_tokens_seen": 767557632,
"step": 366
},
{
"epoch": 0.09253656076651538,
"grad_norm": 2.5396645069122314,
"learning_rate": 9.955904017267444e-07,
"loss": 0.1272,
"num_input_tokens_seen": 769654784,
"step": 367
},
{
"epoch": 0.09278870398386284,
"grad_norm": 3.0213351249694824,
"learning_rate": 9.955378374600447e-07,
"loss": 0.121,
"num_input_tokens_seen": 771751936,
"step": 368
},
{
"epoch": 0.09304084720121028,
"grad_norm": 3.8049328327178955,
"learning_rate": 9.954849633136839e-07,
"loss": 0.102,
"num_input_tokens_seen": 773849088,
"step": 369
},
{
"epoch": 0.09329299041855774,
"grad_norm": 3.4090912342071533,
"learning_rate": 9.95431779324437e-07,
"loss": 0.1179,
"num_input_tokens_seen": 775946240,
"step": 370
},
{
"epoch": 0.09354513363590519,
"grad_norm": 2.5929131507873535,
"learning_rate": 9.95378285529294e-07,
"loss": 0.1106,
"num_input_tokens_seen": 778043392,
"step": 371
},
{
"epoch": 0.09379727685325265,
"grad_norm": 3.6183884143829346,
"learning_rate": 9.953244819654615e-07,
"loss": 0.1029,
"num_input_tokens_seen": 780140544,
"step": 372
},
{
"epoch": 0.0940494200706001,
"grad_norm": 3.812199354171753,
"learning_rate": 9.952703686703604e-07,
"loss": 0.0838,
"num_input_tokens_seen": 782237696,
"step": 373
},
{
"epoch": 0.09430156328794756,
"grad_norm": 5.054091453552246,
"learning_rate": 9.952159456816275e-07,
"loss": 0.2415,
"num_input_tokens_seen": 784334848,
"step": 374
},
{
"epoch": 0.094553706505295,
"grad_norm": 2.739720582962036,
"learning_rate": 9.951612130371151e-07,
"loss": 0.1198,
"num_input_tokens_seen": 786432000,
"step": 375
},
{
"epoch": 0.09480584972264246,
"grad_norm": 3.5317635536193848,
"learning_rate": 9.951061707748907e-07,
"loss": 0.0951,
"num_input_tokens_seen": 788529152,
"step": 376
},
{
"epoch": 0.09505799293998991,
"grad_norm": 2.7190043926239014,
"learning_rate": 9.95050818933237e-07,
"loss": 0.0918,
"num_input_tokens_seen": 790626304,
"step": 377
},
{
"epoch": 0.09531013615733737,
"grad_norm": 2.244220495223999,
"learning_rate": 9.949951575506528e-07,
"loss": 0.0987,
"num_input_tokens_seen": 792723456,
"step": 378
},
{
"epoch": 0.09556227937468482,
"grad_norm": 2.4800469875335693,
"learning_rate": 9.94939186665851e-07,
"loss": 0.112,
"num_input_tokens_seen": 794820608,
"step": 379
},
{
"epoch": 0.09581442259203228,
"grad_norm": 2.934340238571167,
"learning_rate": 9.948829063177606e-07,
"loss": 0.0914,
"num_input_tokens_seen": 796917760,
"step": 380
},
{
"epoch": 0.09606656580937972,
"grad_norm": 4.361299991607666,
"learning_rate": 9.948263165455256e-07,
"loss": 0.1366,
"num_input_tokens_seen": 799014912,
"step": 381
},
{
"epoch": 0.09631870902672718,
"grad_norm": 5.58315372467041,
"learning_rate": 9.947694173885051e-07,
"loss": 0.1444,
"num_input_tokens_seen": 801112064,
"step": 382
},
{
"epoch": 0.09657085224407463,
"grad_norm": 2.2215416431427,
"learning_rate": 9.947122088862737e-07,
"loss": 0.1324,
"num_input_tokens_seen": 803209216,
"step": 383
},
{
"epoch": 0.09682299546142209,
"grad_norm": 3.1041672229766846,
"learning_rate": 9.946546910786208e-07,
"loss": 0.1451,
"num_input_tokens_seen": 805306368,
"step": 384
},
{
"epoch": 0.09707513867876955,
"grad_norm": 3.4068877696990967,
"learning_rate": 9.945968640055513e-07,
"loss": 0.1318,
"num_input_tokens_seen": 807403520,
"step": 385
},
{
"epoch": 0.09732728189611699,
"grad_norm": 2.2413580417633057,
"learning_rate": 9.945387277072845e-07,
"loss": 0.0665,
"num_input_tokens_seen": 809500672,
"step": 386
},
{
"epoch": 0.09757942511346444,
"grad_norm": 2.360349655151367,
"learning_rate": 9.944802822242558e-07,
"loss": 0.0752,
"num_input_tokens_seen": 811597824,
"step": 387
},
{
"epoch": 0.0978315683308119,
"grad_norm": 2.0612034797668457,
"learning_rate": 9.944215275971148e-07,
"loss": 0.0661,
"num_input_tokens_seen": 813694976,
"step": 388
},
{
"epoch": 0.09808371154815936,
"grad_norm": 2.8129661083221436,
"learning_rate": 9.943624638667263e-07,
"loss": 0.0991,
"num_input_tokens_seen": 815792128,
"step": 389
},
{
"epoch": 0.09833585476550681,
"grad_norm": 3.179905891418457,
"learning_rate": 9.943030910741707e-07,
"loss": 0.166,
"num_input_tokens_seen": 817889280,
"step": 390
},
{
"epoch": 0.09858799798285427,
"grad_norm": 3.191718816757202,
"learning_rate": 9.942434092607423e-07,
"loss": 0.1583,
"num_input_tokens_seen": 819986432,
"step": 391
},
{
"epoch": 0.09884014120020171,
"grad_norm": 2.8753068447113037,
"learning_rate": 9.941834184679511e-07,
"loss": 0.1463,
"num_input_tokens_seen": 822083584,
"step": 392
},
{
"epoch": 0.09909228441754916,
"grad_norm": 2.709397315979004,
"learning_rate": 9.94123118737522e-07,
"loss": 0.103,
"num_input_tokens_seen": 824180736,
"step": 393
},
{
"epoch": 0.09934442763489662,
"grad_norm": 3.7003681659698486,
"learning_rate": 9.94062510111394e-07,
"loss": 0.1539,
"num_input_tokens_seen": 826277888,
"step": 394
},
{
"epoch": 0.09959657085224408,
"grad_norm": 4.4324631690979,
"learning_rate": 9.94001592631722e-07,
"loss": 0.1915,
"num_input_tokens_seen": 828375040,
"step": 395
},
{
"epoch": 0.09984871406959153,
"grad_norm": 4.082291126251221,
"learning_rate": 9.93940366340875e-07,
"loss": 0.2416,
"num_input_tokens_seen": 830472192,
"step": 396
},
{
"epoch": 0.10010085728693899,
"grad_norm": 2.7822890281677246,
"learning_rate": 9.938788312814374e-07,
"loss": 0.1053,
"num_input_tokens_seen": 832569344,
"step": 397
},
{
"epoch": 0.10035300050428643,
"grad_norm": 2.376317024230957,
"learning_rate": 9.938169874962072e-07,
"loss": 0.0785,
"num_input_tokens_seen": 834666496,
"step": 398
},
{
"epoch": 0.10060514372163389,
"grad_norm": 6.018281936645508,
"learning_rate": 9.937548350281987e-07,
"loss": 0.1501,
"num_input_tokens_seen": 836763648,
"step": 399
},
{
"epoch": 0.10085728693898134,
"grad_norm": 2.6437666416168213,
"learning_rate": 9.936923739206391e-07,
"loss": 0.1259,
"num_input_tokens_seen": 838860800,
"step": 400
},
{
"epoch": 0.1011094301563288,
"grad_norm": 3.112172842025757,
"learning_rate": 9.936296042169723e-07,
"loss": 0.1747,
"num_input_tokens_seen": 840957952,
"step": 401
},
{
"epoch": 0.10136157337367625,
"grad_norm": 7.632992744445801,
"learning_rate": 9.93566525960855e-07,
"loss": 0.0882,
"num_input_tokens_seen": 843055104,
"step": 402
},
{
"epoch": 0.1016137165910237,
"grad_norm": 3.4459123611450195,
"learning_rate": 9.935031391961599e-07,
"loss": 0.1184,
"num_input_tokens_seen": 845152256,
"step": 403
},
{
"epoch": 0.10186585980837115,
"grad_norm": 3.6913039684295654,
"learning_rate": 9.93439443966973e-07,
"loss": 0.1121,
"num_input_tokens_seen": 847249408,
"step": 404
},
{
"epoch": 0.1021180030257186,
"grad_norm": 3.291170835494995,
"learning_rate": 9.933754403175956e-07,
"loss": 0.1317,
"num_input_tokens_seen": 849346560,
"step": 405
},
{
"epoch": 0.10237014624306606,
"grad_norm": 5.224982738494873,
"learning_rate": 9.93311128292544e-07,
"loss": 0.2308,
"num_input_tokens_seen": 851443712,
"step": 406
},
{
"epoch": 0.10262228946041352,
"grad_norm": 3.043541193008423,
"learning_rate": 9.932465079365477e-07,
"loss": 0.1293,
"num_input_tokens_seen": 853540864,
"step": 407
},
{
"epoch": 0.10287443267776097,
"grad_norm": 3.613516092300415,
"learning_rate": 9.931815792945515e-07,
"loss": 0.2023,
"num_input_tokens_seen": 855638016,
"step": 408
},
{
"epoch": 0.10312657589510842,
"grad_norm": 3.9032676219940186,
"learning_rate": 9.931163424117148e-07,
"loss": 0.1554,
"num_input_tokens_seen": 857735168,
"step": 409
},
{
"epoch": 0.10337871911245587,
"grad_norm": 2.2143468856811523,
"learning_rate": 9.930507973334106e-07,
"loss": 0.1014,
"num_input_tokens_seen": 859832320,
"step": 410
},
{
"epoch": 0.10363086232980333,
"grad_norm": 3.722890615463257,
"learning_rate": 9.92984944105227e-07,
"loss": 0.1072,
"num_input_tokens_seen": 861929472,
"step": 411
},
{
"epoch": 0.10388300554715078,
"grad_norm": 3.3566651344299316,
"learning_rate": 9.929187827729658e-07,
"loss": 0.1597,
"num_input_tokens_seen": 864026624,
"step": 412
},
{
"epoch": 0.10413514876449824,
"grad_norm": 2.243074655532837,
"learning_rate": 9.928523133826437e-07,
"loss": 0.0799,
"num_input_tokens_seen": 866123776,
"step": 413
},
{
"epoch": 0.1043872919818457,
"grad_norm": 2.4208436012268066,
"learning_rate": 9.927855359804914e-07,
"loss": 0.1441,
"num_input_tokens_seen": 868220928,
"step": 414
},
{
"epoch": 0.10463943519919314,
"grad_norm": 3.7958076000213623,
"learning_rate": 9.927184506129535e-07,
"loss": 0.1769,
"num_input_tokens_seen": 870318080,
"step": 415
},
{
"epoch": 0.10489157841654059,
"grad_norm": 2.1095194816589355,
"learning_rate": 9.926510573266894e-07,
"loss": 0.0626,
"num_input_tokens_seen": 872415232,
"step": 416
},
{
"epoch": 0.10514372163388805,
"grad_norm": 2.22505784034729,
"learning_rate": 9.925833561685718e-07,
"loss": 0.0868,
"num_input_tokens_seen": 874512384,
"step": 417
},
{
"epoch": 0.1053958648512355,
"grad_norm": 2.8599283695220947,
"learning_rate": 9.92515347185689e-07,
"loss": 0.1311,
"num_input_tokens_seen": 876609536,
"step": 418
},
{
"epoch": 0.10564800806858296,
"grad_norm": 3.1945903301239014,
"learning_rate": 9.924470304253418e-07,
"loss": 0.0906,
"num_input_tokens_seen": 878706688,
"step": 419
},
{
"epoch": 0.1059001512859304,
"grad_norm": 5.766541481018066,
"learning_rate": 9.92378405935046e-07,
"loss": 0.1588,
"num_input_tokens_seen": 880803840,
"step": 420
},
{
"epoch": 0.10615229450327786,
"grad_norm": 2.077852249145508,
"learning_rate": 9.92309473762531e-07,
"loss": 0.0958,
"num_input_tokens_seen": 882900992,
"step": 421
},
{
"epoch": 0.10640443772062531,
"grad_norm": 3.552129030227661,
"learning_rate": 9.922402339557405e-07,
"loss": 0.1314,
"num_input_tokens_seen": 884998144,
"step": 422
},
{
"epoch": 0.10665658093797277,
"grad_norm": 2.371065855026245,
"learning_rate": 9.92170686562832e-07,
"loss": 0.1129,
"num_input_tokens_seen": 887095296,
"step": 423
},
{
"epoch": 0.10690872415532023,
"grad_norm": 3.874335289001465,
"learning_rate": 9.921008316321768e-07,
"loss": 0.1691,
"num_input_tokens_seen": 889192448,
"step": 424
},
{
"epoch": 0.10716086737266768,
"grad_norm": 2.733494520187378,
"learning_rate": 9.920306692123609e-07,
"loss": 0.1126,
"num_input_tokens_seen": 891289600,
"step": 425
},
{
"epoch": 0.10741301059001512,
"grad_norm": 2.3687491416931152,
"learning_rate": 9.919601993521829e-07,
"loss": 0.1028,
"num_input_tokens_seen": 893386752,
"step": 426
},
{
"epoch": 0.10766515380736258,
"grad_norm": 2.3049280643463135,
"learning_rate": 9.91889422100656e-07,
"loss": 0.0865,
"num_input_tokens_seen": 895483904,
"step": 427
},
{
"epoch": 0.10791729702471003,
"grad_norm": 2.899887800216675,
"learning_rate": 9.918183375070073e-07,
"loss": 0.1258,
"num_input_tokens_seen": 897581056,
"step": 428
},
{
"epoch": 0.10816944024205749,
"grad_norm": 4.081860065460205,
"learning_rate": 9.917469456206773e-07,
"loss": 0.0931,
"num_input_tokens_seen": 899678208,
"step": 429
},
{
"epoch": 0.10842158345940495,
"grad_norm": 3.0482466220855713,
"learning_rate": 9.916752464913201e-07,
"loss": 0.1039,
"num_input_tokens_seen": 901775360,
"step": 430
},
{
"epoch": 0.10867372667675239,
"grad_norm": 3.3849377632141113,
"learning_rate": 9.916032401688042e-07,
"loss": 0.1661,
"num_input_tokens_seen": 903872512,
"step": 431
},
{
"epoch": 0.10892586989409984,
"grad_norm": 3.4006130695343018,
"learning_rate": 9.91530926703211e-07,
"loss": 0.121,
"num_input_tokens_seen": 905969664,
"step": 432
},
{
"epoch": 0.1091780131114473,
"grad_norm": 4.100249290466309,
"learning_rate": 9.91458306144836e-07,
"loss": 0.1976,
"num_input_tokens_seen": 908066816,
"step": 433
},
{
"epoch": 0.10943015632879476,
"grad_norm": 2.491917610168457,
"learning_rate": 9.913853785441878e-07,
"loss": 0.1019,
"num_input_tokens_seen": 910163968,
"step": 434
},
{
"epoch": 0.10968229954614221,
"grad_norm": 4.087813377380371,
"learning_rate": 9.913121439519893e-07,
"loss": 0.1673,
"num_input_tokens_seen": 912261120,
"step": 435
},
{
"epoch": 0.10993444276348967,
"grad_norm": 2.377880334854126,
"learning_rate": 9.912386024191763e-07,
"loss": 0.1184,
"num_input_tokens_seen": 914358272,
"step": 436
},
{
"epoch": 0.11018658598083711,
"grad_norm": 2.745607376098633,
"learning_rate": 9.911647539968981e-07,
"loss": 0.0917,
"num_input_tokens_seen": 916455424,
"step": 437
},
{
"epoch": 0.11043872919818457,
"grad_norm": 4.707367897033691,
"learning_rate": 9.91090598736518e-07,
"loss": 0.2128,
"num_input_tokens_seen": 918552576,
"step": 438
},
{
"epoch": 0.11069087241553202,
"grad_norm": 3.578786611557007,
"learning_rate": 9.910161366896119e-07,
"loss": 0.1235,
"num_input_tokens_seen": 920649728,
"step": 439
},
{
"epoch": 0.11094301563287948,
"grad_norm": 2.3904166221618652,
"learning_rate": 9.909413679079697e-07,
"loss": 0.1139,
"num_input_tokens_seen": 922746880,
"step": 440
},
{
"epoch": 0.11119515885022693,
"grad_norm": 3.1667914390563965,
"learning_rate": 9.908662924435946e-07,
"loss": 0.157,
"num_input_tokens_seen": 924844032,
"step": 441
},
{
"epoch": 0.11144730206757439,
"grad_norm": 4.515403747558594,
"learning_rate": 9.907909103487027e-07,
"loss": 0.1837,
"num_input_tokens_seen": 926941184,
"step": 442
},
{
"epoch": 0.11169944528492183,
"grad_norm": 1.9842240810394287,
"learning_rate": 9.907152216757239e-07,
"loss": 0.1077,
"num_input_tokens_seen": 929038336,
"step": 443
},
{
"epoch": 0.11195158850226929,
"grad_norm": 3.713541030883789,
"learning_rate": 9.906392264773008e-07,
"loss": 0.1401,
"num_input_tokens_seen": 931135488,
"step": 444
},
{
"epoch": 0.11220373171961674,
"grad_norm": 2.7595789432525635,
"learning_rate": 9.905629248062895e-07,
"loss": 0.1262,
"num_input_tokens_seen": 933232640,
"step": 445
},
{
"epoch": 0.1124558749369642,
"grad_norm": 3.375941038131714,
"learning_rate": 9.904863167157591e-07,
"loss": 0.1777,
"num_input_tokens_seen": 935329792,
"step": 446
},
{
"epoch": 0.11270801815431165,
"grad_norm": 2.2114899158477783,
"learning_rate": 9.904094022589923e-07,
"loss": 0.0785,
"num_input_tokens_seen": 937426944,
"step": 447
},
{
"epoch": 0.1129601613716591,
"grad_norm": 3.5571250915527344,
"learning_rate": 9.90332181489484e-07,
"loss": 0.1771,
"num_input_tokens_seen": 939524096,
"step": 448
},
{
"epoch": 0.11321230458900655,
"grad_norm": 4.025667667388916,
"learning_rate": 9.902546544609432e-07,
"loss": 0.1424,
"num_input_tokens_seen": 941621248,
"step": 449
},
{
"epoch": 0.11346444780635401,
"grad_norm": 2.804630994796753,
"learning_rate": 9.901768212272906e-07,
"loss": 0.1722,
"num_input_tokens_seen": 943718400,
"step": 450
},
{
"epoch": 0.11371659102370146,
"grad_norm": 2.183051824569702,
"learning_rate": 9.900986818426612e-07,
"loss": 0.0876,
"num_input_tokens_seen": 945815552,
"step": 451
},
{
"epoch": 0.11396873424104892,
"grad_norm": 2.7712557315826416,
"learning_rate": 9.900202363614025e-07,
"loss": 0.1148,
"num_input_tokens_seen": 947912704,
"step": 452
},
{
"epoch": 0.11422087745839637,
"grad_norm": 3.2009191513061523,
"learning_rate": 9.899414848380743e-07,
"loss": 0.1514,
"num_input_tokens_seen": 950009856,
"step": 453
},
{
"epoch": 0.11447302067574382,
"grad_norm": 3.8625547885894775,
"learning_rate": 9.8986242732745e-07,
"loss": 0.1811,
"num_input_tokens_seen": 952107008,
"step": 454
},
{
"epoch": 0.11472516389309127,
"grad_norm": 2.4320788383483887,
"learning_rate": 9.897830638845153e-07,
"loss": 0.1304,
"num_input_tokens_seen": 954204160,
"step": 455
},
{
"epoch": 0.11497730711043873,
"grad_norm": 2.825261354446411,
"learning_rate": 9.897033945644692e-07,
"loss": 0.1156,
"num_input_tokens_seen": 956301312,
"step": 456
},
{
"epoch": 0.11522945032778618,
"grad_norm": 9.34619426727295,
"learning_rate": 9.89623419422723e-07,
"loss": 0.0738,
"num_input_tokens_seen": 958398464,
"step": 457
},
{
"epoch": 0.11548159354513364,
"grad_norm": 3.386025905609131,
"learning_rate": 9.895431385149007e-07,
"loss": 0.1693,
"num_input_tokens_seen": 960495616,
"step": 458
},
{
"epoch": 0.1157337367624811,
"grad_norm": 3.9842169284820557,
"learning_rate": 9.894625518968396e-07,
"loss": 0.0836,
"num_input_tokens_seen": 962592768,
"step": 459
},
{
"epoch": 0.11598587997982854,
"grad_norm": 4.544926166534424,
"learning_rate": 9.893816596245886e-07,
"loss": 0.2216,
"num_input_tokens_seen": 964689920,
"step": 460
},
{
"epoch": 0.116238023197176,
"grad_norm": 3.3318898677825928,
"learning_rate": 9.8930046175441e-07,
"loss": 0.1638,
"num_input_tokens_seen": 966787072,
"step": 461
},
{
"epoch": 0.11649016641452345,
"grad_norm": 2.5450119972229004,
"learning_rate": 9.892189583427785e-07,
"loss": 0.1472,
"num_input_tokens_seen": 968884224,
"step": 462
},
{
"epoch": 0.1167423096318709,
"grad_norm": 5.197476863861084,
"learning_rate": 9.891371494463812e-07,
"loss": 0.1708,
"num_input_tokens_seen": 970981376,
"step": 463
},
{
"epoch": 0.11699445284921836,
"grad_norm": 2.857074499130249,
"learning_rate": 9.890550351221176e-07,
"loss": 0.0968,
"num_input_tokens_seen": 973078528,
"step": 464
},
{
"epoch": 0.1172465960665658,
"grad_norm": 2.8476240634918213,
"learning_rate": 9.889726154270997e-07,
"loss": 0.1504,
"num_input_tokens_seen": 975175680,
"step": 465
},
{
"epoch": 0.11749873928391326,
"grad_norm": 6.322744369506836,
"learning_rate": 9.888898904186517e-07,
"loss": 0.1249,
"num_input_tokens_seen": 977272832,
"step": 466
},
{
"epoch": 0.11775088250126071,
"grad_norm": 3.161973237991333,
"learning_rate": 9.888068601543106e-07,
"loss": 0.2604,
"num_input_tokens_seen": 979369984,
"step": 467
},
{
"epoch": 0.11800302571860817,
"grad_norm": 2.0370872020721436,
"learning_rate": 9.887235246918255e-07,
"loss": 0.0983,
"num_input_tokens_seen": 981467136,
"step": 468
},
{
"epoch": 0.11825516893595563,
"grad_norm": 3.568608283996582,
"learning_rate": 9.886398840891576e-07,
"loss": 0.1531,
"num_input_tokens_seen": 983564288,
"step": 469
},
{
"epoch": 0.11850731215330308,
"grad_norm": 2.3104538917541504,
"learning_rate": 9.885559384044805e-07,
"loss": 0.1091,
"num_input_tokens_seen": 985661440,
"step": 470
},
{
"epoch": 0.11875945537065052,
"grad_norm": 3.4569497108459473,
"learning_rate": 9.884716876961798e-07,
"loss": 0.1195,
"num_input_tokens_seen": 987758592,
"step": 471
},
{
"epoch": 0.11901159858799798,
"grad_norm": 3.131441354751587,
"learning_rate": 9.883871320228534e-07,
"loss": 0.1564,
"num_input_tokens_seen": 989855744,
"step": 472
},
{
"epoch": 0.11926374180534544,
"grad_norm": 3.427337646484375,
"learning_rate": 9.883022714433116e-07,
"loss": 0.1911,
"num_input_tokens_seen": 991952896,
"step": 473
},
{
"epoch": 0.11951588502269289,
"grad_norm": 3.554757833480835,
"learning_rate": 9.882171060165764e-07,
"loss": 0.1489,
"num_input_tokens_seen": 994050048,
"step": 474
},
{
"epoch": 0.11976802824004035,
"grad_norm": 2.5964512825012207,
"learning_rate": 9.881316358018816e-07,
"loss": 0.0662,
"num_input_tokens_seen": 996147200,
"step": 475
},
{
"epoch": 0.1200201714573878,
"grad_norm": 3.2962310314178467,
"learning_rate": 9.880458608586737e-07,
"loss": 0.1555,
"num_input_tokens_seen": 998244352,
"step": 476
},
{
"epoch": 0.12027231467473525,
"grad_norm": 2.869269371032715,
"learning_rate": 9.879597812466105e-07,
"loss": 0.0795,
"num_input_tokens_seen": 1000341504,
"step": 477
},
{
"epoch": 0.1205244578920827,
"grad_norm": 2.913670778274536,
"learning_rate": 9.878733970255618e-07,
"loss": 0.1329,
"num_input_tokens_seen": 1002438656,
"step": 478
},
{
"epoch": 0.12077660110943016,
"grad_norm": 3.124332904815674,
"learning_rate": 9.877867082556097e-07,
"loss": 0.1538,
"num_input_tokens_seen": 1004535808,
"step": 479
},
{
"epoch": 0.12102874432677761,
"grad_norm": 3.5321497917175293,
"learning_rate": 9.876997149970477e-07,
"loss": 0.1714,
"num_input_tokens_seen": 1006632960,
"step": 480
},
{
"epoch": 0.12128088754412507,
"grad_norm": 3.904442071914673,
"learning_rate": 9.87612417310381e-07,
"loss": 0.1452,
"num_input_tokens_seen": 1008730112,
"step": 481
},
{
"epoch": 0.12153303076147251,
"grad_norm": 3.534336805343628,
"learning_rate": 9.87524815256327e-07,
"loss": 0.1589,
"num_input_tokens_seen": 1010827264,
"step": 482
},
{
"epoch": 0.12178517397881997,
"grad_norm": 3.5298209190368652,
"learning_rate": 9.874369088958145e-07,
"loss": 0.1413,
"num_input_tokens_seen": 1012924416,
"step": 483
},
{
"epoch": 0.12203731719616742,
"grad_norm": 3.4223012924194336,
"learning_rate": 9.873486982899837e-07,
"loss": 0.1552,
"num_input_tokens_seen": 1015021568,
"step": 484
},
{
"epoch": 0.12228946041351488,
"grad_norm": 2.560487747192383,
"learning_rate": 9.872601835001869e-07,
"loss": 0.1192,
"num_input_tokens_seen": 1017118720,
"step": 485
},
{
"epoch": 0.12254160363086233,
"grad_norm": 2.099520683288574,
"learning_rate": 9.871713645879878e-07,
"loss": 0.1125,
"num_input_tokens_seen": 1019215872,
"step": 486
},
{
"epoch": 0.12279374684820979,
"grad_norm": 3.477560520172119,
"learning_rate": 9.870822416151614e-07,
"loss": 0.1485,
"num_input_tokens_seen": 1021313024,
"step": 487
},
{
"epoch": 0.12304589006555723,
"grad_norm": 2.9200782775878906,
"learning_rate": 9.869928146436942e-07,
"loss": 0.0596,
"num_input_tokens_seen": 1023410176,
"step": 488
},
{
"epoch": 0.12329803328290469,
"grad_norm": 2.3703415393829346,
"learning_rate": 9.86903083735785e-07,
"loss": 0.1163,
"num_input_tokens_seen": 1025507328,
"step": 489
},
{
"epoch": 0.12355017650025214,
"grad_norm": 2.2664389610290527,
"learning_rate": 9.868130489538425e-07,
"loss": 0.0712,
"num_input_tokens_seen": 1027604480,
"step": 490
},
{
"epoch": 0.1238023197175996,
"grad_norm": 1.798887848854065,
"learning_rate": 9.867227103604877e-07,
"loss": 0.0709,
"num_input_tokens_seen": 1029701632,
"step": 491
},
{
"epoch": 0.12405446293494705,
"grad_norm": 3.6567928791046143,
"learning_rate": 9.86632068018553e-07,
"loss": 0.1474,
"num_input_tokens_seen": 1031798784,
"step": 492
},
{
"epoch": 0.1243066061522945,
"grad_norm": 2.8362531661987305,
"learning_rate": 9.865411219910815e-07,
"loss": 0.1235,
"num_input_tokens_seen": 1033895936,
"step": 493
},
{
"epoch": 0.12455874936964195,
"grad_norm": 2.423952341079712,
"learning_rate": 9.86449872341328e-07,
"loss": 0.1048,
"num_input_tokens_seen": 1035993088,
"step": 494
},
{
"epoch": 0.12481089258698941,
"grad_norm": 2.4268240928649902,
"learning_rate": 9.863583191327583e-07,
"loss": 0.1063,
"num_input_tokens_seen": 1038090240,
"step": 495
},
{
"epoch": 0.12506303580433686,
"grad_norm": 2.1852941513061523,
"learning_rate": 9.862664624290494e-07,
"loss": 0.0932,
"num_input_tokens_seen": 1040187392,
"step": 496
},
{
"epoch": 0.12531517902168432,
"grad_norm": 3.1700496673583984,
"learning_rate": 9.86174302294089e-07,
"loss": 0.1174,
"num_input_tokens_seen": 1042284544,
"step": 497
},
{
"epoch": 0.12556732223903178,
"grad_norm": 3.2374541759490967,
"learning_rate": 9.860818387919762e-07,
"loss": 0.1251,
"num_input_tokens_seen": 1044381696,
"step": 498
},
{
"epoch": 0.12581946545637923,
"grad_norm": 2.62046217918396,
"learning_rate": 9.859890719870213e-07,
"loss": 0.0991,
"num_input_tokens_seen": 1046478848,
"step": 499
},
{
"epoch": 0.1260716086737267,
"grad_norm": 3.053370237350464,
"learning_rate": 9.85896001943745e-07,
"loss": 0.1612,
"num_input_tokens_seen": 1048576000,
"step": 500
}
],
"logging_steps": 1.0,
"max_steps": 3966,
"num_input_tokens_seen": 1048576000,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.902112919650304e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}