Safetensors
English
llama
ProLong-512k-8B-WritingPrompts / trainer_state.json
chtmp223's picture
Upload folder using huggingface_hub
ba0d393 verified
raw
history blame
194 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 906,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0011037527593818985,
"grad_norm": 35.722469329833984,
"learning_rate": 2.173913043478261e-07,
"loss": 2.1083,
"num_input_tokens_seen": 65536,
"step": 1
},
{
"epoch": 0.002207505518763797,
"grad_norm": 25.2639102935791,
"learning_rate": 4.347826086956522e-07,
"loss": 1.3569,
"num_input_tokens_seen": 131072,
"step": 2
},
{
"epoch": 0.0033112582781456954,
"grad_norm": 31.971759796142578,
"learning_rate": 6.521739130434783e-07,
"loss": 2.3622,
"num_input_tokens_seen": 196608,
"step": 3
},
{
"epoch": 0.004415011037527594,
"grad_norm": 17.20412826538086,
"learning_rate": 8.695652173913044e-07,
"loss": 1.1356,
"num_input_tokens_seen": 262144,
"step": 4
},
{
"epoch": 0.005518763796909493,
"grad_norm": 23.274988174438477,
"learning_rate": 1.0869565217391306e-06,
"loss": 1.2981,
"num_input_tokens_seen": 327680,
"step": 5
},
{
"epoch": 0.006622516556291391,
"grad_norm": 10.878853797912598,
"learning_rate": 1.3043478260869566e-06,
"loss": 0.5643,
"num_input_tokens_seen": 393216,
"step": 6
},
{
"epoch": 0.00772626931567329,
"grad_norm": 13.176485061645508,
"learning_rate": 1.521739130434783e-06,
"loss": 0.6975,
"num_input_tokens_seen": 458752,
"step": 7
},
{
"epoch": 0.008830022075055188,
"grad_norm": 16.82210350036621,
"learning_rate": 1.7391304347826088e-06,
"loss": 0.9322,
"num_input_tokens_seen": 524288,
"step": 8
},
{
"epoch": 0.009933774834437087,
"grad_norm": 12.083537101745605,
"learning_rate": 1.956521739130435e-06,
"loss": 0.7018,
"num_input_tokens_seen": 589824,
"step": 9
},
{
"epoch": 0.011037527593818985,
"grad_norm": 11.75355052947998,
"learning_rate": 2.173913043478261e-06,
"loss": 0.5493,
"num_input_tokens_seen": 655360,
"step": 10
},
{
"epoch": 0.012141280353200883,
"grad_norm": 18.269977569580078,
"learning_rate": 2.391304347826087e-06,
"loss": 0.7736,
"num_input_tokens_seen": 720896,
"step": 11
},
{
"epoch": 0.013245033112582781,
"grad_norm": 12.028225898742676,
"learning_rate": 2.6086956521739132e-06,
"loss": 0.7684,
"num_input_tokens_seen": 786432,
"step": 12
},
{
"epoch": 0.01434878587196468,
"grad_norm": 12.417435646057129,
"learning_rate": 2.8260869565217393e-06,
"loss": 0.8609,
"num_input_tokens_seen": 851968,
"step": 13
},
{
"epoch": 0.01545253863134658,
"grad_norm": 9.14816665649414,
"learning_rate": 3.043478260869566e-06,
"loss": 0.3993,
"num_input_tokens_seen": 917504,
"step": 14
},
{
"epoch": 0.016556291390728478,
"grad_norm": 11.77620792388916,
"learning_rate": 3.2608695652173914e-06,
"loss": 0.8576,
"num_input_tokens_seen": 983040,
"step": 15
},
{
"epoch": 0.017660044150110375,
"grad_norm": 6.811140060424805,
"learning_rate": 3.4782608695652175e-06,
"loss": 0.2984,
"num_input_tokens_seen": 1048576,
"step": 16
},
{
"epoch": 0.018763796909492272,
"grad_norm": 10.383139610290527,
"learning_rate": 3.6956521739130436e-06,
"loss": 0.784,
"num_input_tokens_seen": 1114112,
"step": 17
},
{
"epoch": 0.019867549668874173,
"grad_norm": 8.979279518127441,
"learning_rate": 3.91304347826087e-06,
"loss": 0.5068,
"num_input_tokens_seen": 1179648,
"step": 18
},
{
"epoch": 0.02097130242825607,
"grad_norm": 19.544546127319336,
"learning_rate": 4.130434782608696e-06,
"loss": 1.2048,
"num_input_tokens_seen": 1245184,
"step": 19
},
{
"epoch": 0.02207505518763797,
"grad_norm": 8.711148262023926,
"learning_rate": 4.347826086956522e-06,
"loss": 0.4331,
"num_input_tokens_seen": 1310720,
"step": 20
},
{
"epoch": 0.023178807947019868,
"grad_norm": 9.380172729492188,
"learning_rate": 4.565217391304348e-06,
"loss": 0.6358,
"num_input_tokens_seen": 1376256,
"step": 21
},
{
"epoch": 0.024282560706401765,
"grad_norm": 9.575282096862793,
"learning_rate": 4.782608695652174e-06,
"loss": 0.7366,
"num_input_tokens_seen": 1441788,
"step": 22
},
{
"epoch": 0.025386313465783666,
"grad_norm": 5.655991554260254,
"learning_rate": 5e-06,
"loss": 0.3091,
"num_input_tokens_seen": 1507324,
"step": 23
},
{
"epoch": 0.026490066225165563,
"grad_norm": 8.588181495666504,
"learning_rate": 5.2173913043478265e-06,
"loss": 0.7247,
"num_input_tokens_seen": 1572860,
"step": 24
},
{
"epoch": 0.02759381898454746,
"grad_norm": 9.483624458312988,
"learning_rate": 5.4347826086956525e-06,
"loss": 0.8751,
"num_input_tokens_seen": 1638396,
"step": 25
},
{
"epoch": 0.02869757174392936,
"grad_norm": 7.005770683288574,
"learning_rate": 5.652173913043479e-06,
"loss": 0.5548,
"num_input_tokens_seen": 1703932,
"step": 26
},
{
"epoch": 0.029801324503311258,
"grad_norm": 8.58704662322998,
"learning_rate": 5.8695652173913055e-06,
"loss": 0.8409,
"num_input_tokens_seen": 1769468,
"step": 27
},
{
"epoch": 0.03090507726269316,
"grad_norm": 6.881068706512451,
"learning_rate": 6.086956521739132e-06,
"loss": 0.469,
"num_input_tokens_seen": 1835004,
"step": 28
},
{
"epoch": 0.03200883002207505,
"grad_norm": 7.618829250335693,
"learning_rate": 6.304347826086958e-06,
"loss": 0.4414,
"num_input_tokens_seen": 1900540,
"step": 29
},
{
"epoch": 0.033112582781456956,
"grad_norm": 9.56350326538086,
"learning_rate": 6.521739130434783e-06,
"loss": 0.7077,
"num_input_tokens_seen": 1966076,
"step": 30
},
{
"epoch": 0.03421633554083885,
"grad_norm": 11.033548355102539,
"learning_rate": 6.739130434782609e-06,
"loss": 0.883,
"num_input_tokens_seen": 2031612,
"step": 31
},
{
"epoch": 0.03532008830022075,
"grad_norm": 8.739025115966797,
"learning_rate": 6.956521739130435e-06,
"loss": 0.6193,
"num_input_tokens_seen": 2097148,
"step": 32
},
{
"epoch": 0.03642384105960265,
"grad_norm": 8.231757164001465,
"learning_rate": 7.173913043478261e-06,
"loss": 0.5358,
"num_input_tokens_seen": 2162684,
"step": 33
},
{
"epoch": 0.037527593818984545,
"grad_norm": 7.457165241241455,
"learning_rate": 7.391304347826087e-06,
"loss": 0.6193,
"num_input_tokens_seen": 2228220,
"step": 34
},
{
"epoch": 0.03863134657836645,
"grad_norm": 7.066882133483887,
"learning_rate": 7.608695652173914e-06,
"loss": 0.5043,
"num_input_tokens_seen": 2293756,
"step": 35
},
{
"epoch": 0.039735099337748346,
"grad_norm": 7.486729621887207,
"learning_rate": 7.82608695652174e-06,
"loss": 0.5171,
"num_input_tokens_seen": 2359292,
"step": 36
},
{
"epoch": 0.04083885209713024,
"grad_norm": 7.478753089904785,
"learning_rate": 8.043478260869566e-06,
"loss": 0.6266,
"num_input_tokens_seen": 2424828,
"step": 37
},
{
"epoch": 0.04194260485651214,
"grad_norm": 9.259384155273438,
"learning_rate": 8.260869565217392e-06,
"loss": 0.9509,
"num_input_tokens_seen": 2490364,
"step": 38
},
{
"epoch": 0.04304635761589404,
"grad_norm": 6.441883087158203,
"learning_rate": 8.478260869565218e-06,
"loss": 0.5386,
"num_input_tokens_seen": 2555900,
"step": 39
},
{
"epoch": 0.04415011037527594,
"grad_norm": 8.299201011657715,
"learning_rate": 8.695652173913044e-06,
"loss": 0.8399,
"num_input_tokens_seen": 2621436,
"step": 40
},
{
"epoch": 0.04525386313465784,
"grad_norm": 6.8729095458984375,
"learning_rate": 8.91304347826087e-06,
"loss": 0.6116,
"num_input_tokens_seen": 2686972,
"step": 41
},
{
"epoch": 0.046357615894039736,
"grad_norm": 6.9387311935424805,
"learning_rate": 9.130434782608697e-06,
"loss": 0.5301,
"num_input_tokens_seen": 2752508,
"step": 42
},
{
"epoch": 0.04746136865342163,
"grad_norm": 6.831243991851807,
"learning_rate": 9.347826086956523e-06,
"loss": 0.513,
"num_input_tokens_seen": 2818044,
"step": 43
},
{
"epoch": 0.04856512141280353,
"grad_norm": 9.532112121582031,
"learning_rate": 9.565217391304349e-06,
"loss": 1.0538,
"num_input_tokens_seen": 2883580,
"step": 44
},
{
"epoch": 0.04966887417218543,
"grad_norm": 8.001193046569824,
"learning_rate": 9.782608695652175e-06,
"loss": 0.5253,
"num_input_tokens_seen": 2949116,
"step": 45
},
{
"epoch": 0.05077262693156733,
"grad_norm": 8.312308311462402,
"learning_rate": 1e-05,
"loss": 0.5761,
"num_input_tokens_seen": 3014652,
"step": 46
},
{
"epoch": 0.05187637969094923,
"grad_norm": 6.9460859298706055,
"learning_rate": 9.999969974871272e-06,
"loss": 0.5442,
"num_input_tokens_seen": 3080188,
"step": 47
},
{
"epoch": 0.052980132450331126,
"grad_norm": 8.236810684204102,
"learning_rate": 9.999879899885757e-06,
"loss": 0.7561,
"num_input_tokens_seen": 3145724,
"step": 48
},
{
"epoch": 0.05408388520971302,
"grad_norm": 5.9857964515686035,
"learning_rate": 9.99972977624546e-06,
"loss": 0.4112,
"num_input_tokens_seen": 3211260,
"step": 49
},
{
"epoch": 0.05518763796909492,
"grad_norm": 9.374804496765137,
"learning_rate": 9.999519605953706e-06,
"loss": 0.7197,
"num_input_tokens_seen": 3276796,
"step": 50
},
{
"epoch": 0.056291390728476824,
"grad_norm": 11.54023551940918,
"learning_rate": 9.999249391815115e-06,
"loss": 1.0819,
"num_input_tokens_seen": 3342332,
"step": 51
},
{
"epoch": 0.05739514348785872,
"grad_norm": 5.593741416931152,
"learning_rate": 9.998919137435558e-06,
"loss": 0.4057,
"num_input_tokens_seen": 3407868,
"step": 52
},
{
"epoch": 0.05849889624724062,
"grad_norm": 5.5571489334106445,
"learning_rate": 9.998528847222116e-06,
"loss": 0.415,
"num_input_tokens_seen": 3473404,
"step": 53
},
{
"epoch": 0.059602649006622516,
"grad_norm": 7.452723503112793,
"learning_rate": 9.998078526383018e-06,
"loss": 0.6491,
"num_input_tokens_seen": 3538940,
"step": 54
},
{
"epoch": 0.06070640176600441,
"grad_norm": 5.862286567687988,
"learning_rate": 9.99756818092757e-06,
"loss": 0.3988,
"num_input_tokens_seen": 3604476,
"step": 55
},
{
"epoch": 0.06181015452538632,
"grad_norm": 5.775778770446777,
"learning_rate": 9.996997817666077e-06,
"loss": 0.4672,
"num_input_tokens_seen": 3670012,
"step": 56
},
{
"epoch": 0.06291390728476821,
"grad_norm": 5.253780841827393,
"learning_rate": 9.996367444209756e-06,
"loss": 0.3624,
"num_input_tokens_seen": 3735548,
"step": 57
},
{
"epoch": 0.0640176600441501,
"grad_norm": 7.225327968597412,
"learning_rate": 9.995677068970624e-06,
"loss": 0.7668,
"num_input_tokens_seen": 3801084,
"step": 58
},
{
"epoch": 0.06512141280353201,
"grad_norm": 6.573258876800537,
"learning_rate": 9.994926701161394e-06,
"loss": 0.5316,
"num_input_tokens_seen": 3866620,
"step": 59
},
{
"epoch": 0.06622516556291391,
"grad_norm": 7.839727878570557,
"learning_rate": 9.99411635079535e-06,
"loss": 0.8709,
"num_input_tokens_seen": 3932156,
"step": 60
},
{
"epoch": 0.0673289183222958,
"grad_norm": 7.096211910247803,
"learning_rate": 9.993246028686216e-06,
"loss": 0.6911,
"num_input_tokens_seen": 3997692,
"step": 61
},
{
"epoch": 0.0684326710816777,
"grad_norm": 8.092432022094727,
"learning_rate": 9.992315746448009e-06,
"loss": 0.8105,
"num_input_tokens_seen": 4063228,
"step": 62
},
{
"epoch": 0.0695364238410596,
"grad_norm": 6.5860490798950195,
"learning_rate": 9.991325516494876e-06,
"loss": 0.5278,
"num_input_tokens_seen": 4128764,
"step": 63
},
{
"epoch": 0.0706401766004415,
"grad_norm": 7.127208709716797,
"learning_rate": 9.990275352040943e-06,
"loss": 0.6808,
"num_input_tokens_seen": 4194300,
"step": 64
},
{
"epoch": 0.0717439293598234,
"grad_norm": 7.488225936889648,
"learning_rate": 9.989165267100137e-06,
"loss": 0.7354,
"num_input_tokens_seen": 4259836,
"step": 65
},
{
"epoch": 0.0728476821192053,
"grad_norm": 6.8002800941467285,
"learning_rate": 9.987995276485984e-06,
"loss": 0.6493,
"num_input_tokens_seen": 4325372,
"step": 66
},
{
"epoch": 0.0739514348785872,
"grad_norm": 6.0314459800720215,
"learning_rate": 9.986765395811425e-06,
"loss": 0.4735,
"num_input_tokens_seen": 4390908,
"step": 67
},
{
"epoch": 0.07505518763796909,
"grad_norm": 8.056852340698242,
"learning_rate": 9.985475641488608e-06,
"loss": 0.8006,
"num_input_tokens_seen": 4456444,
"step": 68
},
{
"epoch": 0.076158940397351,
"grad_norm": 7.1111741065979,
"learning_rate": 9.984126030728659e-06,
"loss": 0.7747,
"num_input_tokens_seen": 4521980,
"step": 69
},
{
"epoch": 0.0772626931567329,
"grad_norm": 6.619105339050293,
"learning_rate": 9.982716581541462e-06,
"loss": 0.4753,
"num_input_tokens_seen": 4587516,
"step": 70
},
{
"epoch": 0.07836644591611479,
"grad_norm": 5.534825801849365,
"learning_rate": 9.981247312735412e-06,
"loss": 0.567,
"num_input_tokens_seen": 4653052,
"step": 71
},
{
"epoch": 0.07947019867549669,
"grad_norm": 5.347917556762695,
"learning_rate": 9.979718243917172e-06,
"loss": 0.496,
"num_input_tokens_seen": 4718588,
"step": 72
},
{
"epoch": 0.08057395143487858,
"grad_norm": 6.386154651641846,
"learning_rate": 9.978129395491402e-06,
"loss": 0.4872,
"num_input_tokens_seen": 4784124,
"step": 73
},
{
"epoch": 0.08167770419426049,
"grad_norm": 6.805239200592041,
"learning_rate": 9.976480788660494e-06,
"loss": 0.5869,
"num_input_tokens_seen": 4849660,
"step": 74
},
{
"epoch": 0.08278145695364239,
"grad_norm": 6.585273265838623,
"learning_rate": 9.974772445424283e-06,
"loss": 0.4738,
"num_input_tokens_seen": 4915196,
"step": 75
},
{
"epoch": 0.08388520971302428,
"grad_norm": 6.134422779083252,
"learning_rate": 9.973004388579758e-06,
"loss": 0.4763,
"num_input_tokens_seen": 4980732,
"step": 76
},
{
"epoch": 0.08498896247240618,
"grad_norm": 7.285155296325684,
"learning_rate": 9.971176641720756e-06,
"loss": 0.6983,
"num_input_tokens_seen": 5046268,
"step": 77
},
{
"epoch": 0.08609271523178808,
"grad_norm": 7.901010513305664,
"learning_rate": 9.96928922923765e-06,
"loss": 0.7429,
"num_input_tokens_seen": 5111804,
"step": 78
},
{
"epoch": 0.08719646799116998,
"grad_norm": 6.154994964599609,
"learning_rate": 9.967342176317018e-06,
"loss": 0.4834,
"num_input_tokens_seen": 5177340,
"step": 79
},
{
"epoch": 0.08830022075055188,
"grad_norm": 7.007679462432861,
"learning_rate": 9.96533550894131e-06,
"loss": 0.655,
"num_input_tokens_seen": 5242876,
"step": 80
},
{
"epoch": 0.08940397350993377,
"grad_norm": 6.794103622436523,
"learning_rate": 9.963269253888504e-06,
"loss": 0.7023,
"num_input_tokens_seen": 5308412,
"step": 81
},
{
"epoch": 0.09050772626931568,
"grad_norm": 7.166098117828369,
"learning_rate": 9.961143438731741e-06,
"loss": 0.8764,
"num_input_tokens_seen": 5373948,
"step": 82
},
{
"epoch": 0.09161147902869757,
"grad_norm": 6.217060565948486,
"learning_rate": 9.958958091838969e-06,
"loss": 0.5542,
"num_input_tokens_seen": 5439484,
"step": 83
},
{
"epoch": 0.09271523178807947,
"grad_norm": 6.699713706970215,
"learning_rate": 9.95671324237255e-06,
"loss": 0.6756,
"num_input_tokens_seen": 5505020,
"step": 84
},
{
"epoch": 0.09381898454746136,
"grad_norm": 5.587467670440674,
"learning_rate": 9.954408920288884e-06,
"loss": 0.4274,
"num_input_tokens_seen": 5570556,
"step": 85
},
{
"epoch": 0.09492273730684327,
"grad_norm": 5.275640964508057,
"learning_rate": 9.952045156337998e-06,
"loss": 0.3499,
"num_input_tokens_seen": 5636092,
"step": 86
},
{
"epoch": 0.09602649006622517,
"grad_norm": 7.246316432952881,
"learning_rate": 9.949621982063145e-06,
"loss": 0.657,
"num_input_tokens_seen": 5701628,
"step": 87
},
{
"epoch": 0.09713024282560706,
"grad_norm": 6.412764549255371,
"learning_rate": 9.947139429800377e-06,
"loss": 0.5475,
"num_input_tokens_seen": 5767164,
"step": 88
},
{
"epoch": 0.09823399558498896,
"grad_norm": 7.114440441131592,
"learning_rate": 9.94459753267812e-06,
"loss": 0.7146,
"num_input_tokens_seen": 5832700,
"step": 89
},
{
"epoch": 0.09933774834437085,
"grad_norm": 7.031120300292969,
"learning_rate": 9.941996324616723e-06,
"loss": 0.6916,
"num_input_tokens_seen": 5898236,
"step": 90
},
{
"epoch": 0.10044150110375276,
"grad_norm": 4.999560356140137,
"learning_rate": 9.939335840328011e-06,
"loss": 0.4196,
"num_input_tokens_seen": 5963772,
"step": 91
},
{
"epoch": 0.10154525386313466,
"grad_norm": 6.941257953643799,
"learning_rate": 9.93661611531482e-06,
"loss": 0.5311,
"num_input_tokens_seen": 6029308,
"step": 92
},
{
"epoch": 0.10264900662251655,
"grad_norm": 6.328064441680908,
"learning_rate": 9.933837185870526e-06,
"loss": 0.6167,
"num_input_tokens_seen": 6094844,
"step": 93
},
{
"epoch": 0.10375275938189846,
"grad_norm": 5.660717010498047,
"learning_rate": 9.930999089078556e-06,
"loss": 0.4391,
"num_input_tokens_seen": 6160380,
"step": 94
},
{
"epoch": 0.10485651214128035,
"grad_norm": 5.730586051940918,
"learning_rate": 9.928101862811899e-06,
"loss": 0.5525,
"num_input_tokens_seen": 6225916,
"step": 95
},
{
"epoch": 0.10596026490066225,
"grad_norm": 5.740535259246826,
"learning_rate": 9.925145545732598e-06,
"loss": 0.5216,
"num_input_tokens_seen": 6291452,
"step": 96
},
{
"epoch": 0.10706401766004416,
"grad_norm": 5.135792255401611,
"learning_rate": 9.922130177291228e-06,
"loss": 0.4671,
"num_input_tokens_seen": 6356988,
"step": 97
},
{
"epoch": 0.10816777041942605,
"grad_norm": 7.077984809875488,
"learning_rate": 9.919055797726377e-06,
"loss": 1.0276,
"num_input_tokens_seen": 6422524,
"step": 98
},
{
"epoch": 0.10927152317880795,
"grad_norm": 7.4840192794799805,
"learning_rate": 9.915922448064111e-06,
"loss": 0.8116,
"num_input_tokens_seen": 6488060,
"step": 99
},
{
"epoch": 0.11037527593818984,
"grad_norm": 4.238962173461914,
"learning_rate": 9.912730170117419e-06,
"loss": 0.391,
"num_input_tokens_seen": 6553596,
"step": 100
},
{
"epoch": 0.11147902869757174,
"grad_norm": 5.252936840057373,
"learning_rate": 9.909479006485658e-06,
"loss": 0.4316,
"num_input_tokens_seen": 6619132,
"step": 101
},
{
"epoch": 0.11258278145695365,
"grad_norm": 5.10698127746582,
"learning_rate": 9.906169000553989e-06,
"loss": 0.4543,
"num_input_tokens_seen": 6684668,
"step": 102
},
{
"epoch": 0.11368653421633554,
"grad_norm": 4.487974643707275,
"learning_rate": 9.902800196492788e-06,
"loss": 0.4114,
"num_input_tokens_seen": 6750204,
"step": 103
},
{
"epoch": 0.11479028697571744,
"grad_norm": 5.283864974975586,
"learning_rate": 9.89937263925707e-06,
"loss": 0.4567,
"num_input_tokens_seen": 6815740,
"step": 104
},
{
"epoch": 0.11589403973509933,
"grad_norm": 7.552896976470947,
"learning_rate": 9.895886374585877e-06,
"loss": 0.8429,
"num_input_tokens_seen": 6881276,
"step": 105
},
{
"epoch": 0.11699779249448124,
"grad_norm": 4.633214950561523,
"learning_rate": 9.892341449001673e-06,
"loss": 0.3508,
"num_input_tokens_seen": 6946812,
"step": 106
},
{
"epoch": 0.11810154525386314,
"grad_norm": 7.490325450897217,
"learning_rate": 9.888737909809725e-06,
"loss": 0.7319,
"num_input_tokens_seen": 7012348,
"step": 107
},
{
"epoch": 0.11920529801324503,
"grad_norm": 6.352238655090332,
"learning_rate": 9.885075805097464e-06,
"loss": 0.5916,
"num_input_tokens_seen": 7077884,
"step": 108
},
{
"epoch": 0.12030905077262694,
"grad_norm": 6.156204700469971,
"learning_rate": 9.881355183733857e-06,
"loss": 0.6457,
"num_input_tokens_seen": 7143420,
"step": 109
},
{
"epoch": 0.12141280353200883,
"grad_norm": 6.884950637817383,
"learning_rate": 9.877576095368738e-06,
"loss": 0.6972,
"num_input_tokens_seen": 7208956,
"step": 110
},
{
"epoch": 0.12251655629139073,
"grad_norm": 6.656806468963623,
"learning_rate": 9.873738590432162e-06,
"loss": 0.6563,
"num_input_tokens_seen": 7274492,
"step": 111
},
{
"epoch": 0.12362030905077263,
"grad_norm": 6.270849704742432,
"learning_rate": 9.869842720133715e-06,
"loss": 0.5774,
"num_input_tokens_seen": 7340028,
"step": 112
},
{
"epoch": 0.12472406181015452,
"grad_norm": 6.998918533325195,
"learning_rate": 9.865888536461851e-06,
"loss": 0.5853,
"num_input_tokens_seen": 7405564,
"step": 113
},
{
"epoch": 0.12582781456953643,
"grad_norm": 6.9652299880981445,
"learning_rate": 9.861876092183174e-06,
"loss": 0.634,
"num_input_tokens_seen": 7471100,
"step": 114
},
{
"epoch": 0.12693156732891833,
"grad_norm": 7.60606575012207,
"learning_rate": 9.857805440841758e-06,
"loss": 0.9336,
"num_input_tokens_seen": 7536636,
"step": 115
},
{
"epoch": 0.1280353200883002,
"grad_norm": 5.321383953094482,
"learning_rate": 9.853676636758415e-06,
"loss": 0.4438,
"num_input_tokens_seen": 7602172,
"step": 116
},
{
"epoch": 0.1291390728476821,
"grad_norm": 6.2487592697143555,
"learning_rate": 9.849489735029975e-06,
"loss": 0.7418,
"num_input_tokens_seen": 7667708,
"step": 117
},
{
"epoch": 0.13024282560706402,
"grad_norm": 5.844862461090088,
"learning_rate": 9.845244791528563e-06,
"loss": 0.704,
"num_input_tokens_seen": 7733244,
"step": 118
},
{
"epoch": 0.13134657836644592,
"grad_norm": 6.298166275024414,
"learning_rate": 9.840941862900825e-06,
"loss": 0.8102,
"num_input_tokens_seen": 7798780,
"step": 119
},
{
"epoch": 0.13245033112582782,
"grad_norm": 5.498508453369141,
"learning_rate": 9.836581006567207e-06,
"loss": 0.4503,
"num_input_tokens_seen": 7864316,
"step": 120
},
{
"epoch": 0.1335540838852097,
"grad_norm": 6.927279472351074,
"learning_rate": 9.832162280721157e-06,
"loss": 0.8501,
"num_input_tokens_seen": 7929852,
"step": 121
},
{
"epoch": 0.1346578366445916,
"grad_norm": 4.555235385894775,
"learning_rate": 9.827685744328374e-06,
"loss": 0.3771,
"num_input_tokens_seen": 7995388,
"step": 122
},
{
"epoch": 0.1357615894039735,
"grad_norm": 6.517794609069824,
"learning_rate": 9.823151457126006e-06,
"loss": 0.8107,
"num_input_tokens_seen": 8060924,
"step": 123
},
{
"epoch": 0.1368653421633554,
"grad_norm": 5.046365737915039,
"learning_rate": 9.818559479621851e-06,
"loss": 0.3973,
"num_input_tokens_seen": 8126460,
"step": 124
},
{
"epoch": 0.13796909492273732,
"grad_norm": 5.294122695922852,
"learning_rate": 9.813909873093565e-06,
"loss": 0.4441,
"num_input_tokens_seen": 8191996,
"step": 125
},
{
"epoch": 0.1390728476821192,
"grad_norm": 5.88740873336792,
"learning_rate": 9.809202699587828e-06,
"loss": 0.4989,
"num_input_tokens_seen": 8257532,
"step": 126
},
{
"epoch": 0.1401766004415011,
"grad_norm": 5.602629661560059,
"learning_rate": 9.804438021919525e-06,
"loss": 0.4466,
"num_input_tokens_seen": 8323068,
"step": 127
},
{
"epoch": 0.141280353200883,
"grad_norm": 5.889889240264893,
"learning_rate": 9.799615903670904e-06,
"loss": 0.5564,
"num_input_tokens_seen": 8388604,
"step": 128
},
{
"epoch": 0.1423841059602649,
"grad_norm": 5.074821949005127,
"learning_rate": 9.794736409190732e-06,
"loss": 0.3879,
"num_input_tokens_seen": 8454140,
"step": 129
},
{
"epoch": 0.1434878587196468,
"grad_norm": 5.404687881469727,
"learning_rate": 9.789799603593433e-06,
"loss": 0.4141,
"num_input_tokens_seen": 8519676,
"step": 130
},
{
"epoch": 0.1445916114790287,
"grad_norm": 5.067564010620117,
"learning_rate": 9.784805552758213e-06,
"loss": 0.4376,
"num_input_tokens_seen": 8585212,
"step": 131
},
{
"epoch": 0.1456953642384106,
"grad_norm": 4.754221439361572,
"learning_rate": 9.779754323328192e-06,
"loss": 0.3985,
"num_input_tokens_seen": 8650748,
"step": 132
},
{
"epoch": 0.1467991169977925,
"grad_norm": 5.799423694610596,
"learning_rate": 9.77464598270951e-06,
"loss": 0.6592,
"num_input_tokens_seen": 8716284,
"step": 133
},
{
"epoch": 0.1479028697571744,
"grad_norm": 5.599109172821045,
"learning_rate": 9.76948059907043e-06,
"loss": 0.5591,
"num_input_tokens_seen": 8781820,
"step": 134
},
{
"epoch": 0.1490066225165563,
"grad_norm": 5.459147930145264,
"learning_rate": 9.764258241340421e-06,
"loss": 0.5448,
"num_input_tokens_seen": 8847356,
"step": 135
},
{
"epoch": 0.15011037527593818,
"grad_norm": 4.705541133880615,
"learning_rate": 9.758978979209243e-06,
"loss": 0.4279,
"num_input_tokens_seen": 8912892,
"step": 136
},
{
"epoch": 0.15121412803532008,
"grad_norm": 5.681971073150635,
"learning_rate": 9.753642883126018e-06,
"loss": 0.559,
"num_input_tokens_seen": 8978428,
"step": 137
},
{
"epoch": 0.152317880794702,
"grad_norm": 4.888025760650635,
"learning_rate": 9.748250024298291e-06,
"loss": 0.4626,
"num_input_tokens_seen": 9043964,
"step": 138
},
{
"epoch": 0.1534216335540839,
"grad_norm": 5.069499969482422,
"learning_rate": 9.742800474691075e-06,
"loss": 0.4097,
"num_input_tokens_seen": 9109500,
"step": 139
},
{
"epoch": 0.1545253863134658,
"grad_norm": 6.779152870178223,
"learning_rate": 9.73729430702589e-06,
"loss": 0.6859,
"num_input_tokens_seen": 9175036,
"step": 140
},
{
"epoch": 0.15562913907284767,
"grad_norm": 5.773662567138672,
"learning_rate": 9.731731594779807e-06,
"loss": 0.6244,
"num_input_tokens_seen": 9240568,
"step": 141
},
{
"epoch": 0.15673289183222958,
"grad_norm": 4.693535804748535,
"learning_rate": 9.726112412184441e-06,
"loss": 0.3684,
"num_input_tokens_seen": 9306104,
"step": 142
},
{
"epoch": 0.15783664459161148,
"grad_norm": 4.742260932922363,
"learning_rate": 9.72043683422499e-06,
"loss": 0.4194,
"num_input_tokens_seen": 9371640,
"step": 143
},
{
"epoch": 0.15894039735099338,
"grad_norm": 5.286125659942627,
"learning_rate": 9.71470493663921e-06,
"loss": 0.4763,
"num_input_tokens_seen": 9437176,
"step": 144
},
{
"epoch": 0.1600441501103753,
"grad_norm": 5.083088397979736,
"learning_rate": 9.708916795916418e-06,
"loss": 0.4952,
"num_input_tokens_seen": 9502712,
"step": 145
},
{
"epoch": 0.16114790286975716,
"grad_norm": 4.462346076965332,
"learning_rate": 9.703072489296467e-06,
"loss": 0.4201,
"num_input_tokens_seen": 9568248,
"step": 146
},
{
"epoch": 0.16225165562913907,
"grad_norm": 5.618736267089844,
"learning_rate": 9.697172094768717e-06,
"loss": 0.5632,
"num_input_tokens_seen": 9633784,
"step": 147
},
{
"epoch": 0.16335540838852097,
"grad_norm": 5.792170524597168,
"learning_rate": 9.691215691070994e-06,
"loss": 0.5519,
"num_input_tokens_seen": 9699320,
"step": 148
},
{
"epoch": 0.16445916114790288,
"grad_norm": 6.814916133880615,
"learning_rate": 9.685203357688536e-06,
"loss": 0.5686,
"num_input_tokens_seen": 9764856,
"step": 149
},
{
"epoch": 0.16556291390728478,
"grad_norm": 5.402688503265381,
"learning_rate": 9.679135174852934e-06,
"loss": 0.5319,
"num_input_tokens_seen": 9830392,
"step": 150
},
{
"epoch": 0.16666666666666666,
"grad_norm": 6.655113220214844,
"learning_rate": 9.673011223541067e-06,
"loss": 0.6354,
"num_input_tokens_seen": 9895928,
"step": 151
},
{
"epoch": 0.16777041942604856,
"grad_norm": 6.647359371185303,
"learning_rate": 9.666831585474012e-06,
"loss": 0.8729,
"num_input_tokens_seen": 9961464,
"step": 152
},
{
"epoch": 0.16887417218543047,
"grad_norm": 8.241610527038574,
"learning_rate": 9.660596343115958e-06,
"loss": 1.1005,
"num_input_tokens_seen": 10027000,
"step": 153
},
{
"epoch": 0.16997792494481237,
"grad_norm": 5.4942626953125,
"learning_rate": 9.65430557967311e-06,
"loss": 0.5956,
"num_input_tokens_seen": 10092536,
"step": 154
},
{
"epoch": 0.17108167770419427,
"grad_norm": 5.064987659454346,
"learning_rate": 9.647959379092568e-06,
"loss": 0.4986,
"num_input_tokens_seen": 10158072,
"step": 155
},
{
"epoch": 0.17218543046357615,
"grad_norm": 3.928574323654175,
"learning_rate": 9.641557826061218e-06,
"loss": 0.3439,
"num_input_tokens_seen": 10223608,
"step": 156
},
{
"epoch": 0.17328918322295805,
"grad_norm": 4.649381160736084,
"learning_rate": 9.635101006004596e-06,
"loss": 0.4185,
"num_input_tokens_seen": 10289144,
"step": 157
},
{
"epoch": 0.17439293598233996,
"grad_norm": 6.195332050323486,
"learning_rate": 9.628589005085745e-06,
"loss": 0.6933,
"num_input_tokens_seen": 10354680,
"step": 158
},
{
"epoch": 0.17549668874172186,
"grad_norm": 5.165769577026367,
"learning_rate": 9.622021910204074e-06,
"loss": 0.525,
"num_input_tokens_seen": 10420216,
"step": 159
},
{
"epoch": 0.17660044150110377,
"grad_norm": 5.248317241668701,
"learning_rate": 9.615399808994192e-06,
"loss": 0.4611,
"num_input_tokens_seen": 10485752,
"step": 160
},
{
"epoch": 0.17770419426048564,
"grad_norm": 4.948761940002441,
"learning_rate": 9.608722789824739e-06,
"loss": 0.4007,
"num_input_tokens_seen": 10551288,
"step": 161
},
{
"epoch": 0.17880794701986755,
"grad_norm": 6.128936290740967,
"learning_rate": 9.601990941797208e-06,
"loss": 0.6054,
"num_input_tokens_seen": 10616824,
"step": 162
},
{
"epoch": 0.17991169977924945,
"grad_norm": 6.17040491104126,
"learning_rate": 9.595204354744756e-06,
"loss": 0.5762,
"num_input_tokens_seen": 10682360,
"step": 163
},
{
"epoch": 0.18101545253863136,
"grad_norm": 5.761812210083008,
"learning_rate": 9.588363119231004e-06,
"loss": 0.5555,
"num_input_tokens_seen": 10747896,
"step": 164
},
{
"epoch": 0.18211920529801323,
"grad_norm": 5.079399108886719,
"learning_rate": 9.581467326548834e-06,
"loss": 0.3842,
"num_input_tokens_seen": 10813432,
"step": 165
},
{
"epoch": 0.18322295805739514,
"grad_norm": 5.7329607009887695,
"learning_rate": 9.57451706871916e-06,
"loss": 0.6035,
"num_input_tokens_seen": 10878968,
"step": 166
},
{
"epoch": 0.18432671081677704,
"grad_norm": 5.196740627288818,
"learning_rate": 9.567512438489711e-06,
"loss": 0.4072,
"num_input_tokens_seen": 10944504,
"step": 167
},
{
"epoch": 0.18543046357615894,
"grad_norm": 5.155704498291016,
"learning_rate": 9.560453529333787e-06,
"loss": 0.4642,
"num_input_tokens_seen": 11010040,
"step": 168
},
{
"epoch": 0.18653421633554085,
"grad_norm": 6.3926568031311035,
"learning_rate": 9.55334043544901e-06,
"loss": 0.8744,
"num_input_tokens_seen": 11075576,
"step": 169
},
{
"epoch": 0.18763796909492272,
"grad_norm": 3.620082378387451,
"learning_rate": 9.546173251756076e-06,
"loss": 0.2314,
"num_input_tokens_seen": 11141112,
"step": 170
},
{
"epoch": 0.18874172185430463,
"grad_norm": 5.067852973937988,
"learning_rate": 9.538952073897477e-06,
"loss": 0.3696,
"num_input_tokens_seen": 11206648,
"step": 171
},
{
"epoch": 0.18984547461368653,
"grad_norm": 4.713092803955078,
"learning_rate": 9.531676998236236e-06,
"loss": 0.4657,
"num_input_tokens_seen": 11272184,
"step": 172
},
{
"epoch": 0.19094922737306844,
"grad_norm": 5.534994125366211,
"learning_rate": 9.52434812185461e-06,
"loss": 0.5065,
"num_input_tokens_seen": 11337720,
"step": 173
},
{
"epoch": 0.19205298013245034,
"grad_norm": 5.428729057312012,
"learning_rate": 9.516965542552804e-06,
"loss": 0.4614,
"num_input_tokens_seen": 11403256,
"step": 174
},
{
"epoch": 0.19315673289183222,
"grad_norm": 4.462728977203369,
"learning_rate": 9.509529358847655e-06,
"loss": 0.383,
"num_input_tokens_seen": 11468792,
"step": 175
},
{
"epoch": 0.19426048565121412,
"grad_norm": 4.043033123016357,
"learning_rate": 9.502039669971336e-06,
"loss": 0.3162,
"num_input_tokens_seen": 11534328,
"step": 176
},
{
"epoch": 0.19536423841059603,
"grad_norm": 4.644472122192383,
"learning_rate": 9.494496575870007e-06,
"loss": 0.4098,
"num_input_tokens_seen": 11599864,
"step": 177
},
{
"epoch": 0.19646799116997793,
"grad_norm": 4.948442459106445,
"learning_rate": 9.486900177202503e-06,
"loss": 0.4331,
"num_input_tokens_seen": 11665400,
"step": 178
},
{
"epoch": 0.19757174392935983,
"grad_norm": 6.079590320587158,
"learning_rate": 9.479250575338977e-06,
"loss": 0.5996,
"num_input_tokens_seen": 11730936,
"step": 179
},
{
"epoch": 0.1986754966887417,
"grad_norm": 6.818512439727783,
"learning_rate": 9.471547872359552e-06,
"loss": 0.7914,
"num_input_tokens_seen": 11796472,
"step": 180
},
{
"epoch": 0.1997792494481236,
"grad_norm": 6.684516429901123,
"learning_rate": 9.463792171052965e-06,
"loss": 0.7554,
"num_input_tokens_seen": 11862008,
"step": 181
},
{
"epoch": 0.20088300220750552,
"grad_norm": 4.4812517166137695,
"learning_rate": 9.45598357491518e-06,
"loss": 0.3983,
"num_input_tokens_seen": 11927544,
"step": 182
},
{
"epoch": 0.20198675496688742,
"grad_norm": 5.318398475646973,
"learning_rate": 9.448122188148026e-06,
"loss": 0.5978,
"num_input_tokens_seen": 11993080,
"step": 183
},
{
"epoch": 0.20309050772626933,
"grad_norm": 5.358767032623291,
"learning_rate": 9.440208115657789e-06,
"loss": 0.4899,
"num_input_tokens_seen": 12058616,
"step": 184
},
{
"epoch": 0.2041942604856512,
"grad_norm": 4.549075126647949,
"learning_rate": 9.432241463053823e-06,
"loss": 0.3754,
"num_input_tokens_seen": 12124152,
"step": 185
},
{
"epoch": 0.2052980132450331,
"grad_norm": 6.748514175415039,
"learning_rate": 9.424222336647135e-06,
"loss": 0.8162,
"num_input_tokens_seen": 12189688,
"step": 186
},
{
"epoch": 0.206401766004415,
"grad_norm": 5.219625473022461,
"learning_rate": 9.416150843448974e-06,
"loss": 0.506,
"num_input_tokens_seen": 12255224,
"step": 187
},
{
"epoch": 0.20750551876379691,
"grad_norm": 5.062671661376953,
"learning_rate": 9.408027091169391e-06,
"loss": 0.437,
"num_input_tokens_seen": 12320760,
"step": 188
},
{
"epoch": 0.20860927152317882,
"grad_norm": 4.459550857543945,
"learning_rate": 9.399851188215815e-06,
"loss": 0.3918,
"num_input_tokens_seen": 12386296,
"step": 189
},
{
"epoch": 0.2097130242825607,
"grad_norm": 5.297883987426758,
"learning_rate": 9.391623243691595e-06,
"loss": 0.4505,
"num_input_tokens_seen": 12451832,
"step": 190
},
{
"epoch": 0.2108167770419426,
"grad_norm": 4.688574314117432,
"learning_rate": 9.38334336739455e-06,
"loss": 0.4077,
"num_input_tokens_seen": 12517368,
"step": 191
},
{
"epoch": 0.2119205298013245,
"grad_norm": 6.45738410949707,
"learning_rate": 9.375011669815504e-06,
"loss": 0.6966,
"num_input_tokens_seen": 12582904,
"step": 192
},
{
"epoch": 0.2130242825607064,
"grad_norm": 5.424302577972412,
"learning_rate": 9.366628262136808e-06,
"loss": 0.499,
"num_input_tokens_seen": 12648440,
"step": 193
},
{
"epoch": 0.2141280353200883,
"grad_norm": 5.937360763549805,
"learning_rate": 9.35819325623086e-06,
"loss": 0.6072,
"num_input_tokens_seen": 12713976,
"step": 194
},
{
"epoch": 0.2152317880794702,
"grad_norm": 5.085829734802246,
"learning_rate": 9.34970676465861e-06,
"loss": 0.4112,
"num_input_tokens_seen": 12779512,
"step": 195
},
{
"epoch": 0.2163355408388521,
"grad_norm": 4.746096611022949,
"learning_rate": 9.34116890066806e-06,
"loss": 0.3946,
"num_input_tokens_seen": 12845048,
"step": 196
},
{
"epoch": 0.217439293598234,
"grad_norm": 4.703492641448975,
"learning_rate": 9.332579778192749e-06,
"loss": 0.425,
"num_input_tokens_seen": 12910584,
"step": 197
},
{
"epoch": 0.2185430463576159,
"grad_norm": 5.84251594543457,
"learning_rate": 9.323939511850237e-06,
"loss": 0.6691,
"num_input_tokens_seen": 12976120,
"step": 198
},
{
"epoch": 0.2196467991169978,
"grad_norm": 3.8289952278137207,
"learning_rate": 9.31524821694057e-06,
"loss": 0.3155,
"num_input_tokens_seen": 13041656,
"step": 199
},
{
"epoch": 0.22075055187637968,
"grad_norm": 3.90834379196167,
"learning_rate": 9.30650600944475e-06,
"loss": 0.2806,
"num_input_tokens_seen": 13107192,
"step": 200
},
{
"epoch": 0.22185430463576158,
"grad_norm": 4.902582168579102,
"learning_rate": 9.297713006023183e-06,
"loss": 0.3854,
"num_input_tokens_seen": 13172728,
"step": 201
},
{
"epoch": 0.2229580573951435,
"grad_norm": 6.226442813873291,
"learning_rate": 9.28886932401411e-06,
"loss": 0.558,
"num_input_tokens_seen": 13238264,
"step": 202
},
{
"epoch": 0.2240618101545254,
"grad_norm": 5.094548225402832,
"learning_rate": 9.279975081432063e-06,
"loss": 0.4484,
"num_input_tokens_seen": 13303800,
"step": 203
},
{
"epoch": 0.2251655629139073,
"grad_norm": 4.96065616607666,
"learning_rate": 9.27103039696628e-06,
"loss": 0.4513,
"num_input_tokens_seen": 13369336,
"step": 204
},
{
"epoch": 0.22626931567328917,
"grad_norm": 3.6584391593933105,
"learning_rate": 9.262035389979113e-06,
"loss": 0.2415,
"num_input_tokens_seen": 13434872,
"step": 205
},
{
"epoch": 0.22737306843267108,
"grad_norm": 5.138461112976074,
"learning_rate": 9.252990180504451e-06,
"loss": 0.5035,
"num_input_tokens_seen": 13500408,
"step": 206
},
{
"epoch": 0.22847682119205298,
"grad_norm": 4.905426025390625,
"learning_rate": 9.243894889246106e-06,
"loss": 0.4478,
"num_input_tokens_seen": 13565944,
"step": 207
},
{
"epoch": 0.22958057395143489,
"grad_norm": 4.798696517944336,
"learning_rate": 9.234749637576206e-06,
"loss": 0.4719,
"num_input_tokens_seen": 13631480,
"step": 208
},
{
"epoch": 0.2306843267108168,
"grad_norm": 5.787559986114502,
"learning_rate": 9.22555454753358e-06,
"loss": 0.6204,
"num_input_tokens_seen": 13697016,
"step": 209
},
{
"epoch": 0.23178807947019867,
"grad_norm": 6.1379265785217285,
"learning_rate": 9.216309741822119e-06,
"loss": 0.641,
"num_input_tokens_seen": 13762552,
"step": 210
},
{
"epoch": 0.23289183222958057,
"grad_norm": 6.323113441467285,
"learning_rate": 9.20701534380915e-06,
"loss": 0.6088,
"num_input_tokens_seen": 13828088,
"step": 211
},
{
"epoch": 0.23399558498896247,
"grad_norm": 8.215484619140625,
"learning_rate": 9.197671477523785e-06,
"loss": 0.6663,
"num_input_tokens_seen": 13893624,
"step": 212
},
{
"epoch": 0.23509933774834438,
"grad_norm": 5.999568462371826,
"learning_rate": 9.188278267655255e-06,
"loss": 0.6261,
"num_input_tokens_seen": 13959160,
"step": 213
},
{
"epoch": 0.23620309050772628,
"grad_norm": 6.217502593994141,
"learning_rate": 9.178835839551273e-06,
"loss": 0.7274,
"num_input_tokens_seen": 14024696,
"step": 214
},
{
"epoch": 0.23730684326710816,
"grad_norm": 5.5007405281066895,
"learning_rate": 9.169344319216334e-06,
"loss": 0.4803,
"num_input_tokens_seen": 14090232,
"step": 215
},
{
"epoch": 0.23841059602649006,
"grad_norm": 4.78103494644165,
"learning_rate": 9.159803833310046e-06,
"loss": 0.432,
"num_input_tokens_seen": 14155768,
"step": 216
},
{
"epoch": 0.23951434878587197,
"grad_norm": 4.399326801300049,
"learning_rate": 9.150214509145439e-06,
"loss": 0.391,
"num_input_tokens_seen": 14221304,
"step": 217
},
{
"epoch": 0.24061810154525387,
"grad_norm": 5.642846584320068,
"learning_rate": 9.140576474687263e-06,
"loss": 0.5047,
"num_input_tokens_seen": 14286840,
"step": 218
},
{
"epoch": 0.24172185430463577,
"grad_norm": 5.760812282562256,
"learning_rate": 9.13088985855029e-06,
"loss": 0.6133,
"num_input_tokens_seen": 14352376,
"step": 219
},
{
"epoch": 0.24282560706401765,
"grad_norm": 5.308858394622803,
"learning_rate": 9.121154789997583e-06,
"loss": 0.4889,
"num_input_tokens_seen": 14417912,
"step": 220
},
{
"epoch": 0.24392935982339956,
"grad_norm": 5.612556457519531,
"learning_rate": 9.11137139893878e-06,
"loss": 0.4994,
"num_input_tokens_seen": 14483448,
"step": 221
},
{
"epoch": 0.24503311258278146,
"grad_norm": 4.570278167724609,
"learning_rate": 9.101539815928358e-06,
"loss": 0.4871,
"num_input_tokens_seen": 14548984,
"step": 222
},
{
"epoch": 0.24613686534216336,
"grad_norm": 6.837000846862793,
"learning_rate": 9.091660172163894e-06,
"loss": 0.8181,
"num_input_tokens_seen": 14614520,
"step": 223
},
{
"epoch": 0.24724061810154527,
"grad_norm": 7.090153217315674,
"learning_rate": 9.08173259948431e-06,
"loss": 0.8306,
"num_input_tokens_seen": 14680056,
"step": 224
},
{
"epoch": 0.24834437086092714,
"grad_norm": 4.480409622192383,
"learning_rate": 9.071757230368117e-06,
"loss": 0.4458,
"num_input_tokens_seen": 14745592,
"step": 225
},
{
"epoch": 0.24944812362030905,
"grad_norm": 5.686323642730713,
"learning_rate": 9.061734197931645e-06,
"loss": 0.5,
"num_input_tokens_seen": 14811128,
"step": 226
},
{
"epoch": 0.25055187637969095,
"grad_norm": 5.53110933303833,
"learning_rate": 9.051663635927265e-06,
"loss": 0.5392,
"num_input_tokens_seen": 14876664,
"step": 227
},
{
"epoch": 0.25165562913907286,
"grad_norm": 4.834395408630371,
"learning_rate": 9.04154567874161e-06,
"loss": 0.3774,
"num_input_tokens_seen": 14942200,
"step": 228
},
{
"epoch": 0.25275938189845476,
"grad_norm": 5.39569091796875,
"learning_rate": 9.031380461393774e-06,
"loss": 0.5263,
"num_input_tokens_seen": 15007736,
"step": 229
},
{
"epoch": 0.25386313465783666,
"grad_norm": 4.755312919616699,
"learning_rate": 9.021168119533522e-06,
"loss": 0.3951,
"num_input_tokens_seen": 15073272,
"step": 230
},
{
"epoch": 0.25496688741721857,
"grad_norm": 5.514160633087158,
"learning_rate": 9.010908789439463e-06,
"loss": 0.6618,
"num_input_tokens_seen": 15138808,
"step": 231
},
{
"epoch": 0.2560706401766004,
"grad_norm": 4.2921552658081055,
"learning_rate": 9.000602608017243e-06,
"loss": 0.4315,
"num_input_tokens_seen": 15204344,
"step": 232
},
{
"epoch": 0.2571743929359823,
"grad_norm": 5.0842108726501465,
"learning_rate": 8.99024971279772e-06,
"loss": 0.484,
"num_input_tokens_seen": 15269880,
"step": 233
},
{
"epoch": 0.2582781456953642,
"grad_norm": 7.238823890686035,
"learning_rate": 8.979850241935122e-06,
"loss": 0.7677,
"num_input_tokens_seen": 15335416,
"step": 234
},
{
"epoch": 0.25938189845474613,
"grad_norm": 4.493621349334717,
"learning_rate": 8.969404334205203e-06,
"loss": 0.3926,
"num_input_tokens_seen": 15400952,
"step": 235
},
{
"epoch": 0.26048565121412803,
"grad_norm": 5.966931343078613,
"learning_rate": 8.958912129003395e-06,
"loss": 0.5586,
"num_input_tokens_seen": 15466488,
"step": 236
},
{
"epoch": 0.26158940397350994,
"grad_norm": 6.150123596191406,
"learning_rate": 8.948373766342952e-06,
"loss": 0.7247,
"num_input_tokens_seen": 15532024,
"step": 237
},
{
"epoch": 0.26269315673289184,
"grad_norm": 5.65477180480957,
"learning_rate": 8.937789386853067e-06,
"loss": 0.452,
"num_input_tokens_seen": 15597560,
"step": 238
},
{
"epoch": 0.26379690949227375,
"grad_norm": 6.099196910858154,
"learning_rate": 8.927159131777013e-06,
"loss": 0.6872,
"num_input_tokens_seen": 15663096,
"step": 239
},
{
"epoch": 0.26490066225165565,
"grad_norm": 5.515918254852295,
"learning_rate": 8.916483142970244e-06,
"loss": 0.6076,
"num_input_tokens_seen": 15728632,
"step": 240
},
{
"epoch": 0.26600441501103755,
"grad_norm": 5.490879058837891,
"learning_rate": 8.905761562898514e-06,
"loss": 0.5676,
"num_input_tokens_seen": 15794168,
"step": 241
},
{
"epoch": 0.2671081677704194,
"grad_norm": 4.504538536071777,
"learning_rate": 8.894994534635962e-06,
"loss": 0.3909,
"num_input_tokens_seen": 15859704,
"step": 242
},
{
"epoch": 0.2682119205298013,
"grad_norm": 5.765637397766113,
"learning_rate": 8.884182201863218e-06,
"loss": 0.6022,
"num_input_tokens_seen": 15925240,
"step": 243
},
{
"epoch": 0.2693156732891832,
"grad_norm": 7.247265338897705,
"learning_rate": 8.873324708865473e-06,
"loss": 0.3683,
"num_input_tokens_seen": 15990776,
"step": 244
},
{
"epoch": 0.2704194260485651,
"grad_norm": 6.174973964691162,
"learning_rate": 8.862422200530561e-06,
"loss": 0.6992,
"num_input_tokens_seen": 16056312,
"step": 245
},
{
"epoch": 0.271523178807947,
"grad_norm": 5.999027729034424,
"learning_rate": 8.85147482234702e-06,
"loss": 0.745,
"num_input_tokens_seen": 16121848,
"step": 246
},
{
"epoch": 0.2726269315673289,
"grad_norm": 4.824187278747559,
"learning_rate": 8.840482720402159e-06,
"loss": 0.4242,
"num_input_tokens_seen": 16187384,
"step": 247
},
{
"epoch": 0.2737306843267108,
"grad_norm": 6.063555717468262,
"learning_rate": 8.829446041380099e-06,
"loss": 0.5956,
"num_input_tokens_seen": 16252920,
"step": 248
},
{
"epoch": 0.27483443708609273,
"grad_norm": 5.309985160827637,
"learning_rate": 8.818364932559822e-06,
"loss": 0.4649,
"num_input_tokens_seen": 16318456,
"step": 249
},
{
"epoch": 0.27593818984547464,
"grad_norm": 4.762182712554932,
"learning_rate": 8.807239541813204e-06,
"loss": 0.4554,
"num_input_tokens_seen": 16383992,
"step": 250
},
{
"epoch": 0.27704194260485654,
"grad_norm": 5.908174514770508,
"learning_rate": 8.796070017603037e-06,
"loss": 0.4796,
"num_input_tokens_seen": 16449528,
"step": 251
},
{
"epoch": 0.2781456953642384,
"grad_norm": 5.626619338989258,
"learning_rate": 8.784856508981062e-06,
"loss": 0.4437,
"num_input_tokens_seen": 16515064,
"step": 252
},
{
"epoch": 0.2792494481236203,
"grad_norm": 5.213916301727295,
"learning_rate": 8.773599165585957e-06,
"loss": 0.3765,
"num_input_tokens_seen": 16580600,
"step": 253
},
{
"epoch": 0.2803532008830022,
"grad_norm": 3.7691640853881836,
"learning_rate": 8.762298137641363e-06,
"loss": 0.2522,
"num_input_tokens_seen": 16646136,
"step": 254
},
{
"epoch": 0.2814569536423841,
"grad_norm": 7.3836236000061035,
"learning_rate": 8.750953575953862e-06,
"loss": 0.6886,
"num_input_tokens_seen": 16711672,
"step": 255
},
{
"epoch": 0.282560706401766,
"grad_norm": 6.279829978942871,
"learning_rate": 8.739565631910983e-06,
"loss": 0.729,
"num_input_tokens_seen": 16777208,
"step": 256
},
{
"epoch": 0.2836644591611479,
"grad_norm": 6.2540507316589355,
"learning_rate": 8.728134457479158e-06,
"loss": 0.5763,
"num_input_tokens_seen": 16842744,
"step": 257
},
{
"epoch": 0.2847682119205298,
"grad_norm": 5.088672161102295,
"learning_rate": 8.716660205201715e-06,
"loss": 0.3749,
"num_input_tokens_seen": 16908280,
"step": 258
},
{
"epoch": 0.2858719646799117,
"grad_norm": 5.244248390197754,
"learning_rate": 8.705143028196834e-06,
"loss": 0.5395,
"num_input_tokens_seen": 16973816,
"step": 259
},
{
"epoch": 0.2869757174392936,
"grad_norm": 5.609091758728027,
"learning_rate": 8.693583080155501e-06,
"loss": 0.4133,
"num_input_tokens_seen": 17039352,
"step": 260
},
{
"epoch": 0.28807947019867547,
"grad_norm": 4.83781099319458,
"learning_rate": 8.681980515339464e-06,
"loss": 0.3799,
"num_input_tokens_seen": 17104888,
"step": 261
},
{
"epoch": 0.2891832229580574,
"grad_norm": 5.5107340812683105,
"learning_rate": 8.670335488579166e-06,
"loss": 0.4834,
"num_input_tokens_seen": 17170424,
"step": 262
},
{
"epoch": 0.2902869757174393,
"grad_norm": 5.0106201171875,
"learning_rate": 8.658648155271688e-06,
"loss": 0.4059,
"num_input_tokens_seen": 17235960,
"step": 263
},
{
"epoch": 0.2913907284768212,
"grad_norm": 4.87216329574585,
"learning_rate": 8.646918671378666e-06,
"loss": 0.4187,
"num_input_tokens_seen": 17301496,
"step": 264
},
{
"epoch": 0.2924944812362031,
"grad_norm": 5.391574859619141,
"learning_rate": 8.635147193424219e-06,
"loss": 0.4873,
"num_input_tokens_seen": 17367032,
"step": 265
},
{
"epoch": 0.293598233995585,
"grad_norm": 5.076900959014893,
"learning_rate": 8.623333878492853e-06,
"loss": 0.4802,
"num_input_tokens_seen": 17432568,
"step": 266
},
{
"epoch": 0.2947019867549669,
"grad_norm": 5.86137580871582,
"learning_rate": 8.61147888422737e-06,
"loss": 0.5354,
"num_input_tokens_seen": 17498104,
"step": 267
},
{
"epoch": 0.2958057395143488,
"grad_norm": 4.704822540283203,
"learning_rate": 8.59958236882676e-06,
"loss": 0.4128,
"num_input_tokens_seen": 17563640,
"step": 268
},
{
"epoch": 0.2969094922737307,
"grad_norm": 5.304955005645752,
"learning_rate": 8.587644491044094e-06,
"loss": 0.3959,
"num_input_tokens_seen": 17629176,
"step": 269
},
{
"epoch": 0.2980132450331126,
"grad_norm": 4.6488518714904785,
"learning_rate": 8.575665410184398e-06,
"loss": 0.4585,
"num_input_tokens_seen": 17694712,
"step": 270
},
{
"epoch": 0.29911699779249445,
"grad_norm": 5.93010139465332,
"learning_rate": 8.563645286102539e-06,
"loss": 0.6202,
"num_input_tokens_seen": 17760248,
"step": 271
},
{
"epoch": 0.30022075055187636,
"grad_norm": 5.770275115966797,
"learning_rate": 8.551584279201085e-06,
"loss": 0.499,
"num_input_tokens_seen": 17825784,
"step": 272
},
{
"epoch": 0.30132450331125826,
"grad_norm": 6.36545991897583,
"learning_rate": 8.539482550428158e-06,
"loss": 0.7024,
"num_input_tokens_seen": 17891320,
"step": 273
},
{
"epoch": 0.30242825607064017,
"grad_norm": 5.636227130889893,
"learning_rate": 8.527340261275302e-06,
"loss": 0.6154,
"num_input_tokens_seen": 17956856,
"step": 274
},
{
"epoch": 0.30353200883002207,
"grad_norm": 4.823254585266113,
"learning_rate": 8.515157573775309e-06,
"loss": 0.3441,
"num_input_tokens_seen": 18022392,
"step": 275
},
{
"epoch": 0.304635761589404,
"grad_norm": 5.449868679046631,
"learning_rate": 8.50293465050008e-06,
"loss": 0.5015,
"num_input_tokens_seen": 18087928,
"step": 276
},
{
"epoch": 0.3057395143487859,
"grad_norm": 4.312854766845703,
"learning_rate": 8.490671654558427e-06,
"loss": 0.3685,
"num_input_tokens_seen": 18153464,
"step": 277
},
{
"epoch": 0.3068432671081678,
"grad_norm": 6.949029445648193,
"learning_rate": 8.478368749593925e-06,
"loss": 0.6666,
"num_input_tokens_seen": 18219000,
"step": 278
},
{
"epoch": 0.3079470198675497,
"grad_norm": 6.440571308135986,
"learning_rate": 8.466026099782708e-06,
"loss": 0.8144,
"num_input_tokens_seen": 18284536,
"step": 279
},
{
"epoch": 0.3090507726269316,
"grad_norm": 4.756875038146973,
"learning_rate": 8.453643869831289e-06,
"loss": 0.4319,
"num_input_tokens_seen": 18350072,
"step": 280
},
{
"epoch": 0.31015452538631344,
"grad_norm": 7.117915153503418,
"learning_rate": 8.441222224974353e-06,
"loss": 0.7307,
"num_input_tokens_seen": 18415608,
"step": 281
},
{
"epoch": 0.31125827814569534,
"grad_norm": 6.1996684074401855,
"learning_rate": 8.428761330972562e-06,
"loss": 0.6961,
"num_input_tokens_seen": 18481144,
"step": 282
},
{
"epoch": 0.31236203090507725,
"grad_norm": 5.32656192779541,
"learning_rate": 8.416261354110334e-06,
"loss": 0.4869,
"num_input_tokens_seen": 18546680,
"step": 283
},
{
"epoch": 0.31346578366445915,
"grad_norm": 5.852849960327148,
"learning_rate": 8.403722461193635e-06,
"loss": 0.5949,
"num_input_tokens_seen": 18612216,
"step": 284
},
{
"epoch": 0.31456953642384106,
"grad_norm": 4.754979133605957,
"learning_rate": 8.391144819547742e-06,
"loss": 0.4291,
"num_input_tokens_seen": 18677752,
"step": 285
},
{
"epoch": 0.31567328918322296,
"grad_norm": 6.007636547088623,
"learning_rate": 8.378528597015011e-06,
"loss": 0.542,
"num_input_tokens_seen": 18743288,
"step": 286
},
{
"epoch": 0.31677704194260486,
"grad_norm": 6.076221942901611,
"learning_rate": 8.365873961952648e-06,
"loss": 0.7025,
"num_input_tokens_seen": 18808824,
"step": 287
},
{
"epoch": 0.31788079470198677,
"grad_norm": 4.541327476501465,
"learning_rate": 8.35318108323045e-06,
"loss": 0.4121,
"num_input_tokens_seen": 18874360,
"step": 288
},
{
"epoch": 0.3189845474613687,
"grad_norm": 4.576253414154053,
"learning_rate": 8.340450130228558e-06,
"loss": 0.3526,
"num_input_tokens_seen": 18939896,
"step": 289
},
{
"epoch": 0.3200883002207506,
"grad_norm": 5.6783976554870605,
"learning_rate": 8.327681272835197e-06,
"loss": 0.5399,
"num_input_tokens_seen": 19005432,
"step": 290
},
{
"epoch": 0.3211920529801324,
"grad_norm": 5.336645603179932,
"learning_rate": 8.314874681444404e-06,
"loss": 0.4894,
"num_input_tokens_seen": 19070968,
"step": 291
},
{
"epoch": 0.32229580573951433,
"grad_norm": 5.151770114898682,
"learning_rate": 8.30203052695376e-06,
"loss": 0.4226,
"num_input_tokens_seen": 19136504,
"step": 292
},
{
"epoch": 0.32339955849889623,
"grad_norm": 5.253474712371826,
"learning_rate": 8.289148980762105e-06,
"loss": 0.5135,
"num_input_tokens_seen": 19202040,
"step": 293
},
{
"epoch": 0.32450331125827814,
"grad_norm": 5.810973644256592,
"learning_rate": 8.276230214767254e-06,
"loss": 0.6277,
"num_input_tokens_seen": 19267576,
"step": 294
},
{
"epoch": 0.32560706401766004,
"grad_norm": 5.979196548461914,
"learning_rate": 8.263274401363704e-06,
"loss": 0.6719,
"num_input_tokens_seen": 19333112,
"step": 295
},
{
"epoch": 0.32671081677704195,
"grad_norm": 4.947514533996582,
"learning_rate": 8.250281713440323e-06,
"loss": 0.4234,
"num_input_tokens_seen": 19398648,
"step": 296
},
{
"epoch": 0.32781456953642385,
"grad_norm": 4.194842338562012,
"learning_rate": 8.237252324378059e-06,
"loss": 0.335,
"num_input_tokens_seen": 19464184,
"step": 297
},
{
"epoch": 0.32891832229580575,
"grad_norm": 4.941613674163818,
"learning_rate": 8.224186408047616e-06,
"loss": 0.3597,
"num_input_tokens_seen": 19529720,
"step": 298
},
{
"epoch": 0.33002207505518766,
"grad_norm": 5.901616096496582,
"learning_rate": 8.211084138807138e-06,
"loss": 0.6489,
"num_input_tokens_seen": 19595256,
"step": 299
},
{
"epoch": 0.33112582781456956,
"grad_norm": 4.655982971191406,
"learning_rate": 8.197945691499876e-06,
"loss": 0.4378,
"num_input_tokens_seen": 19660792,
"step": 300
},
{
"epoch": 0.3322295805739514,
"grad_norm": 4.98732852935791,
"learning_rate": 8.184771241451862e-06,
"loss": 0.4636,
"num_input_tokens_seen": 19726328,
"step": 301
},
{
"epoch": 0.3333333333333333,
"grad_norm": 5.869397163391113,
"learning_rate": 8.17156096446957e-06,
"loss": 0.6995,
"num_input_tokens_seen": 19791864,
"step": 302
},
{
"epoch": 0.3344370860927152,
"grad_norm": 4.8156418800354,
"learning_rate": 8.158315036837557e-06,
"loss": 0.4852,
"num_input_tokens_seen": 19857400,
"step": 303
},
{
"epoch": 0.3355408388520971,
"grad_norm": 7.32762336730957,
"learning_rate": 8.14503363531613e-06,
"loss": 0.9684,
"num_input_tokens_seen": 19922936,
"step": 304
},
{
"epoch": 0.336644591611479,
"grad_norm": 5.988272190093994,
"learning_rate": 8.131716937138973e-06,
"loss": 0.6892,
"num_input_tokens_seen": 19988472,
"step": 305
},
{
"epoch": 0.33774834437086093,
"grad_norm": 5.299745559692383,
"learning_rate": 8.11836512001079e-06,
"loss": 0.4995,
"num_input_tokens_seen": 20054008,
"step": 306
},
{
"epoch": 0.33885209713024284,
"grad_norm": 5.244733810424805,
"learning_rate": 8.10497836210492e-06,
"loss": 0.5387,
"num_input_tokens_seen": 20119544,
"step": 307
},
{
"epoch": 0.33995584988962474,
"grad_norm": 4.309633255004883,
"learning_rate": 8.091556842060981e-06,
"loss": 0.3757,
"num_input_tokens_seen": 20185080,
"step": 308
},
{
"epoch": 0.34105960264900664,
"grad_norm": 4.323147296905518,
"learning_rate": 8.07810073898247e-06,
"loss": 0.3871,
"num_input_tokens_seen": 20250616,
"step": 309
},
{
"epoch": 0.34216335540838855,
"grad_norm": 5.932765007019043,
"learning_rate": 8.064610232434375e-06,
"loss": 0.687,
"num_input_tokens_seen": 20316152,
"step": 310
},
{
"epoch": 0.3432671081677704,
"grad_norm": 5.740137100219727,
"learning_rate": 8.051085502440782e-06,
"loss": 0.6566,
"num_input_tokens_seen": 20381688,
"step": 311
},
{
"epoch": 0.3443708609271523,
"grad_norm": 5.602391719818115,
"learning_rate": 8.037526729482474e-06,
"loss": 0.6846,
"num_input_tokens_seen": 20447224,
"step": 312
},
{
"epoch": 0.3454746136865342,
"grad_norm": 6.104027271270752,
"learning_rate": 8.02393409449452e-06,
"loss": 0.5781,
"num_input_tokens_seen": 20512760,
"step": 313
},
{
"epoch": 0.3465783664459161,
"grad_norm": 3.9661595821380615,
"learning_rate": 8.010307778863859e-06,
"loss": 0.2769,
"num_input_tokens_seen": 20578296,
"step": 314
},
{
"epoch": 0.347682119205298,
"grad_norm": 4.286974906921387,
"learning_rate": 7.996647964426883e-06,
"loss": 0.311,
"num_input_tokens_seen": 20643832,
"step": 315
},
{
"epoch": 0.3487858719646799,
"grad_norm": 5.434468746185303,
"learning_rate": 7.982954833467007e-06,
"loss": 0.6516,
"num_input_tokens_seen": 20709368,
"step": 316
},
{
"epoch": 0.3498896247240618,
"grad_norm": 4.0702223777771,
"learning_rate": 7.969228568712242e-06,
"loss": 0.353,
"num_input_tokens_seen": 20774904,
"step": 317
},
{
"epoch": 0.3509933774834437,
"grad_norm": 6.999876499176025,
"learning_rate": 7.95546935333275e-06,
"loss": 0.891,
"num_input_tokens_seen": 20840440,
"step": 318
},
{
"epoch": 0.35209713024282563,
"grad_norm": 3.8806591033935547,
"learning_rate": 7.941677370938404e-06,
"loss": 0.2951,
"num_input_tokens_seen": 20905976,
"step": 319
},
{
"epoch": 0.35320088300220753,
"grad_norm": 6.465142726898193,
"learning_rate": 7.927852805576334e-06,
"loss": 0.7898,
"num_input_tokens_seen": 20971512,
"step": 320
},
{
"epoch": 0.3543046357615894,
"grad_norm": 5.034674167633057,
"learning_rate": 7.913995841728477e-06,
"loss": 0.4284,
"num_input_tokens_seen": 21037048,
"step": 321
},
{
"epoch": 0.3554083885209713,
"grad_norm": 6.744060039520264,
"learning_rate": 7.90010666430911e-06,
"loss": 0.7808,
"num_input_tokens_seen": 21102584,
"step": 322
},
{
"epoch": 0.3565121412803532,
"grad_norm": 4.225239276885986,
"learning_rate": 7.886185458662383e-06,
"loss": 0.2769,
"num_input_tokens_seen": 21168120,
"step": 323
},
{
"epoch": 0.3576158940397351,
"grad_norm": 4.372021675109863,
"learning_rate": 7.872232410559848e-06,
"loss": 0.3404,
"num_input_tokens_seen": 21233656,
"step": 324
},
{
"epoch": 0.358719646799117,
"grad_norm": 5.2710185050964355,
"learning_rate": 7.85824770619798e-06,
"loss": 0.4023,
"num_input_tokens_seen": 21299192,
"step": 325
},
{
"epoch": 0.3598233995584989,
"grad_norm": 4.083426475524902,
"learning_rate": 7.844231532195686e-06,
"loss": 0.2934,
"num_input_tokens_seen": 21364728,
"step": 326
},
{
"epoch": 0.3609271523178808,
"grad_norm": 5.436210632324219,
"learning_rate": 7.830184075591829e-06,
"loss": 0.4998,
"num_input_tokens_seen": 21430264,
"step": 327
},
{
"epoch": 0.3620309050772627,
"grad_norm": 5.057102203369141,
"learning_rate": 7.816105523842712e-06,
"loss": 0.5121,
"num_input_tokens_seen": 21495800,
"step": 328
},
{
"epoch": 0.3631346578366446,
"grad_norm": 6.161040306091309,
"learning_rate": 7.801996064819594e-06,
"loss": 0.5667,
"num_input_tokens_seen": 21561336,
"step": 329
},
{
"epoch": 0.36423841059602646,
"grad_norm": 4.909401893615723,
"learning_rate": 7.787855886806174e-06,
"loss": 0.414,
"num_input_tokens_seen": 21626872,
"step": 330
},
{
"epoch": 0.36534216335540837,
"grad_norm": 6.199487209320068,
"learning_rate": 7.773685178496084e-06,
"loss": 0.6352,
"num_input_tokens_seen": 21692408,
"step": 331
},
{
"epoch": 0.36644591611479027,
"grad_norm": 5.417155742645264,
"learning_rate": 7.759484128990359e-06,
"loss": 0.5171,
"num_input_tokens_seen": 21757944,
"step": 332
},
{
"epoch": 0.3675496688741722,
"grad_norm": 5.248231410980225,
"learning_rate": 7.745252927794929e-06,
"loss": 0.5954,
"num_input_tokens_seen": 21823480,
"step": 333
},
{
"epoch": 0.3686534216335541,
"grad_norm": 4.734684944152832,
"learning_rate": 7.730991764818083e-06,
"loss": 0.4274,
"num_input_tokens_seen": 21889016,
"step": 334
},
{
"epoch": 0.369757174392936,
"grad_norm": 5.899268627166748,
"learning_rate": 7.716700830367937e-06,
"loss": 0.6043,
"num_input_tokens_seen": 21954552,
"step": 335
},
{
"epoch": 0.3708609271523179,
"grad_norm": 5.965540885925293,
"learning_rate": 7.702380315149885e-06,
"loss": 0.5565,
"num_input_tokens_seen": 22020088,
"step": 336
},
{
"epoch": 0.3719646799116998,
"grad_norm": 3.7075252532958984,
"learning_rate": 7.68803041026407e-06,
"loss": 0.2761,
"num_input_tokens_seen": 22085624,
"step": 337
},
{
"epoch": 0.3730684326710817,
"grad_norm": 5.284467697143555,
"learning_rate": 7.673651307202816e-06,
"loss": 0.4702,
"num_input_tokens_seen": 22151160,
"step": 338
},
{
"epoch": 0.3741721854304636,
"grad_norm": 4.557376861572266,
"learning_rate": 7.659243197848091e-06,
"loss": 0.3558,
"num_input_tokens_seen": 22216696,
"step": 339
},
{
"epoch": 0.37527593818984545,
"grad_norm": 5.523560523986816,
"learning_rate": 7.644806274468936e-06,
"loss": 0.5721,
"num_input_tokens_seen": 22282232,
"step": 340
},
{
"epoch": 0.37637969094922735,
"grad_norm": 5.48103141784668,
"learning_rate": 7.630340729718896e-06,
"loss": 0.6207,
"num_input_tokens_seen": 22347768,
"step": 341
},
{
"epoch": 0.37748344370860926,
"grad_norm": 4.440031051635742,
"learning_rate": 7.6158467566334584e-06,
"loss": 0.3645,
"num_input_tokens_seen": 22413304,
"step": 342
},
{
"epoch": 0.37858719646799116,
"grad_norm": 4.76084566116333,
"learning_rate": 7.6013245486274685e-06,
"loss": 0.4351,
"num_input_tokens_seen": 22478840,
"step": 343
},
{
"epoch": 0.37969094922737306,
"grad_norm": 6.258664131164551,
"learning_rate": 7.58677429949256e-06,
"loss": 0.6918,
"num_input_tokens_seen": 22544376,
"step": 344
},
{
"epoch": 0.38079470198675497,
"grad_norm": 4.8109660148620605,
"learning_rate": 7.572196203394553e-06,
"loss": 0.4623,
"num_input_tokens_seen": 22609912,
"step": 345
},
{
"epoch": 0.3818984547461369,
"grad_norm": 4.017666339874268,
"learning_rate": 7.557590454870874e-06,
"loss": 0.3383,
"num_input_tokens_seen": 22675448,
"step": 346
},
{
"epoch": 0.3830022075055188,
"grad_norm": 4.424713134765625,
"learning_rate": 7.5429572488279615e-06,
"loss": 0.3711,
"num_input_tokens_seen": 22740984,
"step": 347
},
{
"epoch": 0.3841059602649007,
"grad_norm": 6.114891529083252,
"learning_rate": 7.5282967805386555e-06,
"loss": 0.7094,
"num_input_tokens_seen": 22806520,
"step": 348
},
{
"epoch": 0.3852097130242826,
"grad_norm": 4.288976669311523,
"learning_rate": 7.5136092456396e-06,
"loss": 0.3381,
"num_input_tokens_seen": 22872056,
"step": 349
},
{
"epoch": 0.38631346578366443,
"grad_norm": 3.8139936923980713,
"learning_rate": 7.498894840128632e-06,
"loss": 0.2501,
"num_input_tokens_seen": 22937592,
"step": 350
},
{
"epoch": 0.38741721854304634,
"grad_norm": 4.9862060546875,
"learning_rate": 7.484153760362155e-06,
"loss": 0.4111,
"num_input_tokens_seen": 23003128,
"step": 351
},
{
"epoch": 0.38852097130242824,
"grad_norm": 6.541782379150391,
"learning_rate": 7.4693862030525356e-06,
"loss": 0.8444,
"num_input_tokens_seen": 23068664,
"step": 352
},
{
"epoch": 0.38962472406181015,
"grad_norm": 4.731116771697998,
"learning_rate": 7.454592365265464e-06,
"loss": 0.3405,
"num_input_tokens_seen": 23134200,
"step": 353
},
{
"epoch": 0.39072847682119205,
"grad_norm": 5.749923229217529,
"learning_rate": 7.439772444417337e-06,
"loss": 0.6451,
"num_input_tokens_seen": 23199736,
"step": 354
},
{
"epoch": 0.39183222958057395,
"grad_norm": 5.7721099853515625,
"learning_rate": 7.424926638272609e-06,
"loss": 0.5034,
"num_input_tokens_seen": 23265272,
"step": 355
},
{
"epoch": 0.39293598233995586,
"grad_norm": 4.80233907699585,
"learning_rate": 7.410055144941168e-06,
"loss": 0.3973,
"num_input_tokens_seen": 23330808,
"step": 356
},
{
"epoch": 0.39403973509933776,
"grad_norm": 5.965678691864014,
"learning_rate": 7.395158162875681e-06,
"loss": 0.5132,
"num_input_tokens_seen": 23396344,
"step": 357
},
{
"epoch": 0.39514348785871967,
"grad_norm": 5.89133882522583,
"learning_rate": 7.380235890868946e-06,
"loss": 0.6651,
"num_input_tokens_seen": 23461880,
"step": 358
},
{
"epoch": 0.39624724061810157,
"grad_norm": 4.942079067230225,
"learning_rate": 7.365288528051251e-06,
"loss": 0.4079,
"num_input_tokens_seen": 23527412,
"step": 359
},
{
"epoch": 0.3973509933774834,
"grad_norm": 4.461698055267334,
"learning_rate": 7.350316273887702e-06,
"loss": 0.3752,
"num_input_tokens_seen": 23592948,
"step": 360
},
{
"epoch": 0.3984547461368653,
"grad_norm": 5.262115001678467,
"learning_rate": 7.335319328175571e-06,
"loss": 0.6089,
"num_input_tokens_seen": 23658484,
"step": 361
},
{
"epoch": 0.3995584988962472,
"grad_norm": 6.512784957885742,
"learning_rate": 7.3202978910416225e-06,
"loss": 0.9327,
"num_input_tokens_seen": 23724020,
"step": 362
},
{
"epoch": 0.40066225165562913,
"grad_norm": 3.6190109252929688,
"learning_rate": 7.305252162939451e-06,
"loss": 0.2887,
"num_input_tokens_seen": 23789556,
"step": 363
},
{
"epoch": 0.40176600441501104,
"grad_norm": 4.493287086486816,
"learning_rate": 7.290182344646799e-06,
"loss": 0.3774,
"num_input_tokens_seen": 23855092,
"step": 364
},
{
"epoch": 0.40286975717439294,
"grad_norm": 6.982156276702881,
"learning_rate": 7.275088637262881e-06,
"loss": 1.0283,
"num_input_tokens_seen": 23920628,
"step": 365
},
{
"epoch": 0.40397350993377484,
"grad_norm": 5.538138389587402,
"learning_rate": 7.259971242205702e-06,
"loss": 0.6455,
"num_input_tokens_seen": 23986164,
"step": 366
},
{
"epoch": 0.40507726269315675,
"grad_norm": 3.606851100921631,
"learning_rate": 7.244830361209366e-06,
"loss": 0.2485,
"num_input_tokens_seen": 24051700,
"step": 367
},
{
"epoch": 0.40618101545253865,
"grad_norm": 5.6115851402282715,
"learning_rate": 7.229666196321383e-06,
"loss": 0.6441,
"num_input_tokens_seen": 24117236,
"step": 368
},
{
"epoch": 0.40728476821192056,
"grad_norm": 4.510439395904541,
"learning_rate": 7.214478949899976e-06,
"loss": 0.3399,
"num_input_tokens_seen": 24182772,
"step": 369
},
{
"epoch": 0.4083885209713024,
"grad_norm": 5.339845657348633,
"learning_rate": 7.199268824611382e-06,
"loss": 0.4898,
"num_input_tokens_seen": 24248308,
"step": 370
},
{
"epoch": 0.4094922737306843,
"grad_norm": 3.8770904541015625,
"learning_rate": 7.18403602342714e-06,
"loss": 0.2596,
"num_input_tokens_seen": 24313844,
"step": 371
},
{
"epoch": 0.4105960264900662,
"grad_norm": 3.9396862983703613,
"learning_rate": 7.168780749621394e-06,
"loss": 0.2945,
"num_input_tokens_seen": 24379380,
"step": 372
},
{
"epoch": 0.4116997792494481,
"grad_norm": 4.604616165161133,
"learning_rate": 7.1535032067681684e-06,
"loss": 0.2875,
"num_input_tokens_seen": 24444916,
"step": 373
},
{
"epoch": 0.41280353200883,
"grad_norm": 4.745989799499512,
"learning_rate": 7.138203598738659e-06,
"loss": 0.2887,
"num_input_tokens_seen": 24510448,
"step": 374
},
{
"epoch": 0.4139072847682119,
"grad_norm": 5.695927619934082,
"learning_rate": 7.122882129698514e-06,
"loss": 0.4992,
"num_input_tokens_seen": 24575984,
"step": 375
},
{
"epoch": 0.41501103752759383,
"grad_norm": 4.267956256866455,
"learning_rate": 7.107539004105097e-06,
"loss": 0.322,
"num_input_tokens_seen": 24641520,
"step": 376
},
{
"epoch": 0.41611479028697573,
"grad_norm": 6.005370140075684,
"learning_rate": 7.092174426704779e-06,
"loss": 0.5286,
"num_input_tokens_seen": 24707056,
"step": 377
},
{
"epoch": 0.41721854304635764,
"grad_norm": 4.701905727386475,
"learning_rate": 7.076788602530182e-06,
"loss": 0.4565,
"num_input_tokens_seen": 24772592,
"step": 378
},
{
"epoch": 0.41832229580573954,
"grad_norm": 5.479290962219238,
"learning_rate": 7.061381736897468e-06,
"loss": 0.3361,
"num_input_tokens_seen": 24838128,
"step": 379
},
{
"epoch": 0.4194260485651214,
"grad_norm": 4.56320333480835,
"learning_rate": 7.0459540354035775e-06,
"loss": 0.3394,
"num_input_tokens_seen": 24903664,
"step": 380
},
{
"epoch": 0.4205298013245033,
"grad_norm": 5.685725212097168,
"learning_rate": 7.0305057039235e-06,
"loss": 0.5699,
"num_input_tokens_seen": 24969200,
"step": 381
},
{
"epoch": 0.4216335540838852,
"grad_norm": 6.919594764709473,
"learning_rate": 7.015036948607519e-06,
"loss": 0.8107,
"num_input_tokens_seen": 25034736,
"step": 382
},
{
"epoch": 0.4227373068432671,
"grad_norm": 5.660834312438965,
"learning_rate": 6.999547975878467e-06,
"loss": 0.5774,
"num_input_tokens_seen": 25100272,
"step": 383
},
{
"epoch": 0.423841059602649,
"grad_norm": 4.428837776184082,
"learning_rate": 6.984038992428967e-06,
"loss": 0.3895,
"num_input_tokens_seen": 25165808,
"step": 384
},
{
"epoch": 0.4249448123620309,
"grad_norm": 6.41189432144165,
"learning_rate": 6.968510205218671e-06,
"loss": 0.8248,
"num_input_tokens_seen": 25231344,
"step": 385
},
{
"epoch": 0.4260485651214128,
"grad_norm": 4.905003547668457,
"learning_rate": 6.952961821471509e-06,
"loss": 0.4435,
"num_input_tokens_seen": 25296880,
"step": 386
},
{
"epoch": 0.4271523178807947,
"grad_norm": 4.218749046325684,
"learning_rate": 6.937394048672912e-06,
"loss": 0.3755,
"num_input_tokens_seen": 25362416,
"step": 387
},
{
"epoch": 0.4282560706401766,
"grad_norm": 5.433783054351807,
"learning_rate": 6.921807094567051e-06,
"loss": 0.6033,
"num_input_tokens_seen": 25427952,
"step": 388
},
{
"epoch": 0.42935982339955847,
"grad_norm": 5.018522262573242,
"learning_rate": 6.906201167154061e-06,
"loss": 0.4313,
"num_input_tokens_seen": 25493488,
"step": 389
},
{
"epoch": 0.4304635761589404,
"grad_norm": 5.442930698394775,
"learning_rate": 6.890576474687264e-06,
"loss": 0.5557,
"num_input_tokens_seen": 25559024,
"step": 390
},
{
"epoch": 0.4315673289183223,
"grad_norm": 6.9419145584106445,
"learning_rate": 6.8749332256703975e-06,
"loss": 0.7915,
"num_input_tokens_seen": 25624560,
"step": 391
},
{
"epoch": 0.4326710816777042,
"grad_norm": 5.017683982849121,
"learning_rate": 6.85927162885482e-06,
"loss": 0.4263,
"num_input_tokens_seen": 25690096,
"step": 392
},
{
"epoch": 0.4337748344370861,
"grad_norm": 5.351146221160889,
"learning_rate": 6.843591893236742e-06,
"loss": 0.5095,
"num_input_tokens_seen": 25755632,
"step": 393
},
{
"epoch": 0.434878587196468,
"grad_norm": 4.434185028076172,
"learning_rate": 6.827894228054416e-06,
"loss": 0.4626,
"num_input_tokens_seen": 25821168,
"step": 394
},
{
"epoch": 0.4359823399558499,
"grad_norm": 4.385143280029297,
"learning_rate": 6.812178842785364e-06,
"loss": 0.3621,
"num_input_tokens_seen": 25886704,
"step": 395
},
{
"epoch": 0.4370860927152318,
"grad_norm": 4.1273112297058105,
"learning_rate": 6.796445947143571e-06,
"loss": 0.3593,
"num_input_tokens_seen": 25952240,
"step": 396
},
{
"epoch": 0.4381898454746137,
"grad_norm": 6.839285373687744,
"learning_rate": 6.780695751076685e-06,
"loss": 0.9328,
"num_input_tokens_seen": 26017776,
"step": 397
},
{
"epoch": 0.4392935982339956,
"grad_norm": 3.5457375049591064,
"learning_rate": 6.7649284647632285e-06,
"loss": 0.283,
"num_input_tokens_seen": 26083312,
"step": 398
},
{
"epoch": 0.44039735099337746,
"grad_norm": 3.7174270153045654,
"learning_rate": 6.749144298609776e-06,
"loss": 0.2656,
"num_input_tokens_seen": 26148848,
"step": 399
},
{
"epoch": 0.44150110375275936,
"grad_norm": 5.119399547576904,
"learning_rate": 6.733343463248163e-06,
"loss": 0.4559,
"num_input_tokens_seen": 26214384,
"step": 400
},
{
"epoch": 0.44260485651214126,
"grad_norm": 4.70241641998291,
"learning_rate": 6.717526169532658e-06,
"loss": 0.4606,
"num_input_tokens_seen": 26279920,
"step": 401
},
{
"epoch": 0.44370860927152317,
"grad_norm": 4.453240871429443,
"learning_rate": 6.701692628537169e-06,
"loss": 0.3758,
"num_input_tokens_seen": 26345456,
"step": 402
},
{
"epoch": 0.4448123620309051,
"grad_norm": 4.099554061889648,
"learning_rate": 6.685843051552405e-06,
"loss": 0.2946,
"num_input_tokens_seen": 26410992,
"step": 403
},
{
"epoch": 0.445916114790287,
"grad_norm": 5.848024368286133,
"learning_rate": 6.669977650083075e-06,
"loss": 0.574,
"num_input_tokens_seen": 26476528,
"step": 404
},
{
"epoch": 0.4470198675496689,
"grad_norm": 5.945405960083008,
"learning_rate": 6.654096635845054e-06,
"loss": 0.6731,
"num_input_tokens_seen": 26542064,
"step": 405
},
{
"epoch": 0.4481236203090508,
"grad_norm": 4.190430164337158,
"learning_rate": 6.638200220762563e-06,
"loss": 0.2969,
"num_input_tokens_seen": 26607600,
"step": 406
},
{
"epoch": 0.4492273730684327,
"grad_norm": 4.02309513092041,
"learning_rate": 6.622288616965343e-06,
"loss": 0.3044,
"num_input_tokens_seen": 26673136,
"step": 407
},
{
"epoch": 0.4503311258278146,
"grad_norm": 5.067698955535889,
"learning_rate": 6.60636203678581e-06,
"loss": 0.4959,
"num_input_tokens_seen": 26738672,
"step": 408
},
{
"epoch": 0.45143487858719644,
"grad_norm": 4.1457695960998535,
"learning_rate": 6.590420692756247e-06,
"loss": 0.3315,
"num_input_tokens_seen": 26804208,
"step": 409
},
{
"epoch": 0.45253863134657835,
"grad_norm": 6.310359001159668,
"learning_rate": 6.574464797605938e-06,
"loss": 0.6599,
"num_input_tokens_seen": 26869744,
"step": 410
},
{
"epoch": 0.45364238410596025,
"grad_norm": 5.186681270599365,
"learning_rate": 6.558494564258362e-06,
"loss": 0.4739,
"num_input_tokens_seen": 26935280,
"step": 411
},
{
"epoch": 0.45474613686534215,
"grad_norm": 4.929074764251709,
"learning_rate": 6.542510205828316e-06,
"loss": 0.4659,
"num_input_tokens_seen": 27000816,
"step": 412
},
{
"epoch": 0.45584988962472406,
"grad_norm": 4.5425944328308105,
"learning_rate": 6.5265119356191005e-06,
"loss": 0.3948,
"num_input_tokens_seen": 27066352,
"step": 413
},
{
"epoch": 0.45695364238410596,
"grad_norm": 5.008086681365967,
"learning_rate": 6.51049996711966e-06,
"loss": 0.4228,
"num_input_tokens_seen": 27131888,
"step": 414
},
{
"epoch": 0.45805739514348787,
"grad_norm": 5.23047399520874,
"learning_rate": 6.494474514001734e-06,
"loss": 0.3959,
"num_input_tokens_seen": 27197424,
"step": 415
},
{
"epoch": 0.45916114790286977,
"grad_norm": 5.282271385192871,
"learning_rate": 6.478435790117007e-06,
"loss": 0.5117,
"num_input_tokens_seen": 27262960,
"step": 416
},
{
"epoch": 0.4602649006622517,
"grad_norm": 4.828380584716797,
"learning_rate": 6.462384009494257e-06,
"loss": 0.2912,
"num_input_tokens_seen": 27328496,
"step": 417
},
{
"epoch": 0.4613686534216336,
"grad_norm": 4.036655902862549,
"learning_rate": 6.446319386336499e-06,
"loss": 0.3067,
"num_input_tokens_seen": 27394032,
"step": 418
},
{
"epoch": 0.4624724061810154,
"grad_norm": 4.4774274826049805,
"learning_rate": 6.430242135018121e-06,
"loss": 0.3897,
"num_input_tokens_seen": 27459568,
"step": 419
},
{
"epoch": 0.46357615894039733,
"grad_norm": 5.575551509857178,
"learning_rate": 6.414152470082031e-06,
"loss": 0.5586,
"num_input_tokens_seen": 27525104,
"step": 420
},
{
"epoch": 0.46467991169977924,
"grad_norm": 4.964656829833984,
"learning_rate": 6.3980506062367884e-06,
"loss": 0.3839,
"num_input_tokens_seen": 27590640,
"step": 421
},
{
"epoch": 0.46578366445916114,
"grad_norm": 6.3939948081970215,
"learning_rate": 6.3819367583537425e-06,
"loss": 0.274,
"num_input_tokens_seen": 27656176,
"step": 422
},
{
"epoch": 0.46688741721854304,
"grad_norm": 6.026232719421387,
"learning_rate": 6.365811141464162e-06,
"loss": 0.5634,
"num_input_tokens_seen": 27721712,
"step": 423
},
{
"epoch": 0.46799116997792495,
"grad_norm": 5.401932716369629,
"learning_rate": 6.349673970756371e-06,
"loss": 0.4866,
"num_input_tokens_seen": 27787248,
"step": 424
},
{
"epoch": 0.46909492273730685,
"grad_norm": 5.220919609069824,
"learning_rate": 6.33352546157287e-06,
"loss": 0.4014,
"num_input_tokens_seen": 27852784,
"step": 425
},
{
"epoch": 0.47019867549668876,
"grad_norm": 5.735633373260498,
"learning_rate": 6.317365829407465e-06,
"loss": 0.489,
"num_input_tokens_seen": 27918320,
"step": 426
},
{
"epoch": 0.47130242825607066,
"grad_norm": 4.342199325561523,
"learning_rate": 6.301195289902395e-06,
"loss": 0.3543,
"num_input_tokens_seen": 27983856,
"step": 427
},
{
"epoch": 0.47240618101545256,
"grad_norm": 5.337205410003662,
"learning_rate": 6.2850140588454515e-06,
"loss": 0.4572,
"num_input_tokens_seen": 28049392,
"step": 428
},
{
"epoch": 0.4735099337748344,
"grad_norm": 4.443479061126709,
"learning_rate": 6.268822352167097e-06,
"loss": 0.3907,
"num_input_tokens_seen": 28114928,
"step": 429
},
{
"epoch": 0.4746136865342163,
"grad_norm": 4.553615570068359,
"learning_rate": 6.252620385937591e-06,
"loss": 0.4246,
"num_input_tokens_seen": 28180464,
"step": 430
},
{
"epoch": 0.4757174392935982,
"grad_norm": 5.831062316894531,
"learning_rate": 6.236408376364097e-06,
"loss": 0.6506,
"num_input_tokens_seen": 28246000,
"step": 431
},
{
"epoch": 0.4768211920529801,
"grad_norm": 5.902902603149414,
"learning_rate": 6.220186539787806e-06,
"loss": 0.6119,
"num_input_tokens_seen": 28311536,
"step": 432
},
{
"epoch": 0.47792494481236203,
"grad_norm": 4.526601314544678,
"learning_rate": 6.20395509268104e-06,
"loss": 0.3519,
"num_input_tokens_seen": 28377072,
"step": 433
},
{
"epoch": 0.47902869757174393,
"grad_norm": 5.268987655639648,
"learning_rate": 6.187714251644375e-06,
"loss": 0.5892,
"num_input_tokens_seen": 28442608,
"step": 434
},
{
"epoch": 0.48013245033112584,
"grad_norm": 5.396267414093018,
"learning_rate": 6.171464233403734e-06,
"loss": 0.4768,
"num_input_tokens_seen": 28508144,
"step": 435
},
{
"epoch": 0.48123620309050774,
"grad_norm": 6.279263019561768,
"learning_rate": 6.155205254807524e-06,
"loss": 0.7685,
"num_input_tokens_seen": 28573680,
"step": 436
},
{
"epoch": 0.48233995584988965,
"grad_norm": 4.332390785217285,
"learning_rate": 6.138937532823701e-06,
"loss": 0.3726,
"num_input_tokens_seen": 28639216,
"step": 437
},
{
"epoch": 0.48344370860927155,
"grad_norm": 5.152309417724609,
"learning_rate": 6.1226612845369134e-06,
"loss": 0.4462,
"num_input_tokens_seen": 28704752,
"step": 438
},
{
"epoch": 0.4845474613686534,
"grad_norm": 5.474704742431641,
"learning_rate": 6.1063767271455834e-06,
"loss": 0.6085,
"num_input_tokens_seen": 28770288,
"step": 439
},
{
"epoch": 0.4856512141280353,
"grad_norm": 5.3773345947265625,
"learning_rate": 6.090084077959013e-06,
"loss": 0.5532,
"num_input_tokens_seen": 28835824,
"step": 440
},
{
"epoch": 0.4867549668874172,
"grad_norm": 3.462773323059082,
"learning_rate": 6.073783554394486e-06,
"loss": 0.2709,
"num_input_tokens_seen": 28901360,
"step": 441
},
{
"epoch": 0.4878587196467991,
"grad_norm": 5.682385444641113,
"learning_rate": 6.057475373974366e-06,
"loss": 0.6188,
"num_input_tokens_seen": 28966896,
"step": 442
},
{
"epoch": 0.488962472406181,
"grad_norm": 5.999266624450684,
"learning_rate": 6.041159754323196e-06,
"loss": 0.6366,
"num_input_tokens_seen": 29032432,
"step": 443
},
{
"epoch": 0.4900662251655629,
"grad_norm": 6.69478702545166,
"learning_rate": 6.024836913164787e-06,
"loss": 0.5834,
"num_input_tokens_seen": 29097968,
"step": 444
},
{
"epoch": 0.4911699779249448,
"grad_norm": 6.7099103927612305,
"learning_rate": 6.008507068319318e-06,
"loss": 0.7872,
"num_input_tokens_seen": 29163504,
"step": 445
},
{
"epoch": 0.4922737306843267,
"grad_norm": 4.720966815948486,
"learning_rate": 5.992170437700436e-06,
"loss": 0.3586,
"num_input_tokens_seen": 29229040,
"step": 446
},
{
"epoch": 0.49337748344370863,
"grad_norm": 4.880152225494385,
"learning_rate": 5.9758272393123305e-06,
"loss": 0.3405,
"num_input_tokens_seen": 29294576,
"step": 447
},
{
"epoch": 0.49448123620309054,
"grad_norm": 5.520662784576416,
"learning_rate": 5.959477691246842e-06,
"loss": 0.5912,
"num_input_tokens_seen": 29360112,
"step": 448
},
{
"epoch": 0.4955849889624724,
"grad_norm": 4.44854211807251,
"learning_rate": 5.943122011680542e-06,
"loss": 0.4973,
"num_input_tokens_seen": 29425648,
"step": 449
},
{
"epoch": 0.4966887417218543,
"grad_norm": 3.7942519187927246,
"learning_rate": 5.926760418871823e-06,
"loss": 0.2792,
"num_input_tokens_seen": 29491184,
"step": 450
},
{
"epoch": 0.4977924944812362,
"grad_norm": 4.290163993835449,
"learning_rate": 5.910393131157987e-06,
"loss": 0.3856,
"num_input_tokens_seen": 29556720,
"step": 451
},
{
"epoch": 0.4988962472406181,
"grad_norm": 6.079360485076904,
"learning_rate": 5.894020366952331e-06,
"loss": 0.6704,
"num_input_tokens_seen": 29622256,
"step": 452
},
{
"epoch": 0.5,
"grad_norm": 4.259561061859131,
"learning_rate": 5.8776423447412366e-06,
"loss": 0.4156,
"num_input_tokens_seen": 29687792,
"step": 453
},
{
"epoch": 0.5011037527593819,
"grad_norm": 5.737578392028809,
"learning_rate": 5.861259283081246e-06,
"loss": 0.5608,
"num_input_tokens_seen": 29753328,
"step": 454
},
{
"epoch": 0.5022075055187638,
"grad_norm": 4.720670700073242,
"learning_rate": 5.844871400596154e-06,
"loss": 0.4287,
"num_input_tokens_seen": 29818864,
"step": 455
},
{
"epoch": 0.5033112582781457,
"grad_norm": 3.384713649749756,
"learning_rate": 5.828478915974084e-06,
"loss": 0.2397,
"num_input_tokens_seen": 29884400,
"step": 456
},
{
"epoch": 0.5044150110375276,
"grad_norm": 5.180420398712158,
"learning_rate": 5.812082047964578e-06,
"loss": 0.5351,
"num_input_tokens_seen": 29949936,
"step": 457
},
{
"epoch": 0.5055187637969095,
"grad_norm": 5.582147121429443,
"learning_rate": 5.795681015375664e-06,
"loss": 0.5918,
"num_input_tokens_seen": 30015472,
"step": 458
},
{
"epoch": 0.5066225165562914,
"grad_norm": 5.289458274841309,
"learning_rate": 5.779276037070951e-06,
"loss": 0.504,
"num_input_tokens_seen": 30081008,
"step": 459
},
{
"epoch": 0.5077262693156733,
"grad_norm": 4.269794464111328,
"learning_rate": 5.762867331966698e-06,
"loss": 0.3156,
"num_input_tokens_seen": 30146544,
"step": 460
},
{
"epoch": 0.5088300220750552,
"grad_norm": 4.807360649108887,
"learning_rate": 5.746455119028896e-06,
"loss": 0.3911,
"num_input_tokens_seen": 30212080,
"step": 461
},
{
"epoch": 0.5099337748344371,
"grad_norm": 5.4606122970581055,
"learning_rate": 5.730039617270353e-06,
"loss": 0.4696,
"num_input_tokens_seen": 30277616,
"step": 462
},
{
"epoch": 0.5110375275938189,
"grad_norm": 4.116583824157715,
"learning_rate": 5.7136210457477546e-06,
"loss": 0.2897,
"num_input_tokens_seen": 30343152,
"step": 463
},
{
"epoch": 0.5121412803532008,
"grad_norm": 3.7440638542175293,
"learning_rate": 5.697199623558758e-06,
"loss": 0.3166,
"num_input_tokens_seen": 30408688,
"step": 464
},
{
"epoch": 0.5132450331125827,
"grad_norm": 7.0095014572143555,
"learning_rate": 5.680775569839058e-06,
"loss": 0.695,
"num_input_tokens_seen": 30474224,
"step": 465
},
{
"epoch": 0.5143487858719646,
"grad_norm": 3.876681089401245,
"learning_rate": 5.664349103759467e-06,
"loss": 0.2507,
"num_input_tokens_seen": 30539760,
"step": 466
},
{
"epoch": 0.5154525386313465,
"grad_norm": 4.9294753074646,
"learning_rate": 5.647920444522986e-06,
"loss": 0.4078,
"num_input_tokens_seen": 30605296,
"step": 467
},
{
"epoch": 0.5165562913907285,
"grad_norm": 5.85141658782959,
"learning_rate": 5.631489811361891e-06,
"loss": 0.6855,
"num_input_tokens_seen": 30670832,
"step": 468
},
{
"epoch": 0.5176600441501104,
"grad_norm": 4.421255588531494,
"learning_rate": 5.615057423534788e-06,
"loss": 0.3744,
"num_input_tokens_seen": 30736368,
"step": 469
},
{
"epoch": 0.5187637969094923,
"grad_norm": 4.160279750823975,
"learning_rate": 5.5986235003237065e-06,
"loss": 0.3238,
"num_input_tokens_seen": 30801904,
"step": 470
},
{
"epoch": 0.5198675496688742,
"grad_norm": 5.7004194259643555,
"learning_rate": 5.5821882610311625e-06,
"loss": 0.5394,
"num_input_tokens_seen": 30867440,
"step": 471
},
{
"epoch": 0.5209713024282561,
"grad_norm": 6.271990776062012,
"learning_rate": 5.565751924977232e-06,
"loss": 0.4037,
"num_input_tokens_seen": 30932976,
"step": 472
},
{
"epoch": 0.522075055187638,
"grad_norm": 4.508204936981201,
"learning_rate": 5.549314711496631e-06,
"loss": 0.4116,
"num_input_tokens_seen": 30998512,
"step": 473
},
{
"epoch": 0.5231788079470199,
"grad_norm": 4.369766712188721,
"learning_rate": 5.532876839935779e-06,
"loss": 0.2638,
"num_input_tokens_seen": 31064048,
"step": 474
},
{
"epoch": 0.5242825607064018,
"grad_norm": 5.024233341217041,
"learning_rate": 5.516438529649883e-06,
"loss": 0.4689,
"num_input_tokens_seen": 31129584,
"step": 475
},
{
"epoch": 0.5253863134657837,
"grad_norm": 4.348725318908691,
"learning_rate": 5.500000000000001e-06,
"loss": 0.3456,
"num_input_tokens_seen": 31195120,
"step": 476
},
{
"epoch": 0.5264900662251656,
"grad_norm": 4.22805643081665,
"learning_rate": 5.483561470350118e-06,
"loss": 0.2993,
"num_input_tokens_seen": 31260656,
"step": 477
},
{
"epoch": 0.5275938189845475,
"grad_norm": 5.037574291229248,
"learning_rate": 5.467123160064222e-06,
"loss": 0.4853,
"num_input_tokens_seen": 31326192,
"step": 478
},
{
"epoch": 0.5286975717439294,
"grad_norm": 4.518187522888184,
"learning_rate": 5.4506852885033715e-06,
"loss": 0.1977,
"num_input_tokens_seen": 31391728,
"step": 479
},
{
"epoch": 0.5298013245033113,
"grad_norm": 5.805148601531982,
"learning_rate": 5.434248075022769e-06,
"loss": 0.6275,
"num_input_tokens_seen": 31457264,
"step": 480
},
{
"epoch": 0.5309050772626932,
"grad_norm": 4.804733753204346,
"learning_rate": 5.417811738968839e-06,
"loss": 0.3938,
"num_input_tokens_seen": 31522800,
"step": 481
},
{
"epoch": 0.5320088300220751,
"grad_norm": 4.533995151519775,
"learning_rate": 5.401376499676294e-06,
"loss": 0.3928,
"num_input_tokens_seen": 31588336,
"step": 482
},
{
"epoch": 0.5331125827814569,
"grad_norm": 4.422305583953857,
"learning_rate": 5.384942576465215e-06,
"loss": 0.3949,
"num_input_tokens_seen": 31653872,
"step": 483
},
{
"epoch": 0.5342163355408388,
"grad_norm": 5.579964637756348,
"learning_rate": 5.368510188638113e-06,
"loss": 0.4996,
"num_input_tokens_seen": 31719408,
"step": 484
},
{
"epoch": 0.5353200883002207,
"grad_norm": 4.969320774078369,
"learning_rate": 5.3520795554770155e-06,
"loss": 0.3976,
"num_input_tokens_seen": 31784944,
"step": 485
},
{
"epoch": 0.5364238410596026,
"grad_norm": 4.212143421173096,
"learning_rate": 5.3356508962405355e-06,
"loss": 0.309,
"num_input_tokens_seen": 31850480,
"step": 486
},
{
"epoch": 0.5375275938189845,
"grad_norm": 3.8276007175445557,
"learning_rate": 5.319224430160943e-06,
"loss": 0.2525,
"num_input_tokens_seen": 31916016,
"step": 487
},
{
"epoch": 0.5386313465783664,
"grad_norm": 5.205665111541748,
"learning_rate": 5.302800376441244e-06,
"loss": 0.4736,
"num_input_tokens_seen": 31981552,
"step": 488
},
{
"epoch": 0.5397350993377483,
"grad_norm": 4.5672807693481445,
"learning_rate": 5.286378954252247e-06,
"loss": 0.3377,
"num_input_tokens_seen": 32047088,
"step": 489
},
{
"epoch": 0.5408388520971302,
"grad_norm": 5.600037097930908,
"learning_rate": 5.269960382729649e-06,
"loss": 0.5481,
"num_input_tokens_seen": 32112624,
"step": 490
},
{
"epoch": 0.5419426048565121,
"grad_norm": 5.772684097290039,
"learning_rate": 5.2535448809711046e-06,
"loss": 0.4308,
"num_input_tokens_seen": 32178160,
"step": 491
},
{
"epoch": 0.543046357615894,
"grad_norm": 5.687682628631592,
"learning_rate": 5.237132668033303e-06,
"loss": 0.497,
"num_input_tokens_seen": 32243696,
"step": 492
},
{
"epoch": 0.5441501103752759,
"grad_norm": 4.946759223937988,
"learning_rate": 5.220723962929052e-06,
"loss": 0.4745,
"num_input_tokens_seen": 32309232,
"step": 493
},
{
"epoch": 0.5452538631346578,
"grad_norm": 5.427900791168213,
"learning_rate": 5.204318984624338e-06,
"loss": 0.505,
"num_input_tokens_seen": 32374768,
"step": 494
},
{
"epoch": 0.5463576158940397,
"grad_norm": 5.026839256286621,
"learning_rate": 5.187917952035424e-06,
"loss": 0.4926,
"num_input_tokens_seen": 32440304,
"step": 495
},
{
"epoch": 0.5474613686534217,
"grad_norm": 3.4422245025634766,
"learning_rate": 5.171521084025917e-06,
"loss": 0.2596,
"num_input_tokens_seen": 32505840,
"step": 496
},
{
"epoch": 0.5485651214128036,
"grad_norm": 5.574887752532959,
"learning_rate": 5.155128599403849e-06,
"loss": 0.4804,
"num_input_tokens_seen": 32571376,
"step": 497
},
{
"epoch": 0.5496688741721855,
"grad_norm": 4.954028129577637,
"learning_rate": 5.138740716918755e-06,
"loss": 0.5245,
"num_input_tokens_seen": 32636912,
"step": 498
},
{
"epoch": 0.5507726269315674,
"grad_norm": 6.411076068878174,
"learning_rate": 5.122357655258765e-06,
"loss": 0.5285,
"num_input_tokens_seen": 32702448,
"step": 499
},
{
"epoch": 0.5518763796909493,
"grad_norm": 6.448341369628906,
"learning_rate": 5.105979633047669e-06,
"loss": 0.6117,
"num_input_tokens_seen": 32767984,
"step": 500
},
{
"epoch": 0.5529801324503312,
"grad_norm": 5.298100471496582,
"learning_rate": 5.0896068688420146e-06,
"loss": 0.5242,
"num_input_tokens_seen": 32833520,
"step": 501
},
{
"epoch": 0.5540838852097131,
"grad_norm": 5.838515281677246,
"learning_rate": 5.07323958112818e-06,
"loss": 0.6421,
"num_input_tokens_seen": 32899056,
"step": 502
},
{
"epoch": 0.5551876379690949,
"grad_norm": 4.797820091247559,
"learning_rate": 5.056877988319459e-06,
"loss": 0.4227,
"num_input_tokens_seen": 32964592,
"step": 503
},
{
"epoch": 0.5562913907284768,
"grad_norm": 4.745166301727295,
"learning_rate": 5.04052230875316e-06,
"loss": 0.4165,
"num_input_tokens_seen": 33030128,
"step": 504
},
{
"epoch": 0.5573951434878587,
"grad_norm": 3.404788017272949,
"learning_rate": 5.024172760687671e-06,
"loss": 0.256,
"num_input_tokens_seen": 33095664,
"step": 505
},
{
"epoch": 0.5584988962472406,
"grad_norm": 5.247468948364258,
"learning_rate": 5.007829562299567e-06,
"loss": 0.5258,
"num_input_tokens_seen": 33161200,
"step": 506
},
{
"epoch": 0.5596026490066225,
"grad_norm": 4.767642021179199,
"learning_rate": 4.991492931680684e-06,
"loss": 0.4969,
"num_input_tokens_seen": 33226736,
"step": 507
},
{
"epoch": 0.5607064017660044,
"grad_norm": 5.1648101806640625,
"learning_rate": 4.975163086835216e-06,
"loss": 0.554,
"num_input_tokens_seen": 33292272,
"step": 508
},
{
"epoch": 0.5618101545253863,
"grad_norm": 5.1125264167785645,
"learning_rate": 4.958840245676806e-06,
"loss": 0.4574,
"num_input_tokens_seen": 33357808,
"step": 509
},
{
"epoch": 0.5629139072847682,
"grad_norm": 5.249313831329346,
"learning_rate": 4.9425246260256345e-06,
"loss": 0.5128,
"num_input_tokens_seen": 33423344,
"step": 510
},
{
"epoch": 0.5640176600441501,
"grad_norm": 3.5619633197784424,
"learning_rate": 4.9262164456055165e-06,
"loss": 0.2696,
"num_input_tokens_seen": 33488880,
"step": 511
},
{
"epoch": 0.565121412803532,
"grad_norm": 4.751221656799316,
"learning_rate": 4.909915922040989e-06,
"loss": 0.4366,
"num_input_tokens_seen": 33554416,
"step": 512
},
{
"epoch": 0.5662251655629139,
"grad_norm": 4.820193767547607,
"learning_rate": 4.893623272854417e-06,
"loss": 0.3903,
"num_input_tokens_seen": 33619952,
"step": 513
},
{
"epoch": 0.5673289183222958,
"grad_norm": 6.389187335968018,
"learning_rate": 4.877338715463087e-06,
"loss": 0.6647,
"num_input_tokens_seen": 33685488,
"step": 514
},
{
"epoch": 0.5684326710816777,
"grad_norm": 5.3614301681518555,
"learning_rate": 4.861062467176302e-06,
"loss": 0.4495,
"num_input_tokens_seen": 33751024,
"step": 515
},
{
"epoch": 0.5695364238410596,
"grad_norm": 5.853780746459961,
"learning_rate": 4.844794745192479e-06,
"loss": 0.5787,
"num_input_tokens_seen": 33816560,
"step": 516
},
{
"epoch": 0.5706401766004415,
"grad_norm": 4.756037712097168,
"learning_rate": 4.828535766596266e-06,
"loss": 0.409,
"num_input_tokens_seen": 33882096,
"step": 517
},
{
"epoch": 0.5717439293598234,
"grad_norm": 4.552085876464844,
"learning_rate": 4.8122857483556285e-06,
"loss": 0.4346,
"num_input_tokens_seen": 33947632,
"step": 518
},
{
"epoch": 0.5728476821192053,
"grad_norm": 5.546624660491943,
"learning_rate": 4.796044907318961e-06,
"loss": 0.4542,
"num_input_tokens_seen": 34013168,
"step": 519
},
{
"epoch": 0.5739514348785872,
"grad_norm": 3.965635061264038,
"learning_rate": 4.779813460212197e-06,
"loss": 0.3123,
"num_input_tokens_seen": 34078704,
"step": 520
},
{
"epoch": 0.5750551876379691,
"grad_norm": 3.4151055812835693,
"learning_rate": 4.763591623635905e-06,
"loss": 0.1869,
"num_input_tokens_seen": 34144240,
"step": 521
},
{
"epoch": 0.5761589403973509,
"grad_norm": 3.900256395339966,
"learning_rate": 4.747379614062411e-06,
"loss": 0.2873,
"num_input_tokens_seen": 34209776,
"step": 522
},
{
"epoch": 0.5772626931567328,
"grad_norm": 4.467334270477295,
"learning_rate": 4.731177647832905e-06,
"loss": 0.3625,
"num_input_tokens_seen": 34275312,
"step": 523
},
{
"epoch": 0.5783664459161147,
"grad_norm": 4.834880828857422,
"learning_rate": 4.714985941154551e-06,
"loss": 0.4265,
"num_input_tokens_seen": 34340848,
"step": 524
},
{
"epoch": 0.5794701986754967,
"grad_norm": 5.462576866149902,
"learning_rate": 4.698804710097607e-06,
"loss": 0.5406,
"num_input_tokens_seen": 34406384,
"step": 525
},
{
"epoch": 0.5805739514348786,
"grad_norm": 5.588657379150391,
"learning_rate": 4.682634170592537e-06,
"loss": 0.553,
"num_input_tokens_seen": 34471920,
"step": 526
},
{
"epoch": 0.5816777041942605,
"grad_norm": 4.872191429138184,
"learning_rate": 4.6664745384271315e-06,
"loss": 0.5007,
"num_input_tokens_seen": 34537456,
"step": 527
},
{
"epoch": 0.5827814569536424,
"grad_norm": 5.454434394836426,
"learning_rate": 4.650326029243629e-06,
"loss": 0.6074,
"num_input_tokens_seen": 34602992,
"step": 528
},
{
"epoch": 0.5838852097130243,
"grad_norm": 3.7384860515594482,
"learning_rate": 4.634188858535839e-06,
"loss": 0.2471,
"num_input_tokens_seen": 34668528,
"step": 529
},
{
"epoch": 0.5849889624724062,
"grad_norm": 5.02782678604126,
"learning_rate": 4.61806324164626e-06,
"loss": 0.446,
"num_input_tokens_seen": 34734064,
"step": 530
},
{
"epoch": 0.5860927152317881,
"grad_norm": 4.343999862670898,
"learning_rate": 4.601949393763215e-06,
"loss": 0.4206,
"num_input_tokens_seen": 34799600,
"step": 531
},
{
"epoch": 0.58719646799117,
"grad_norm": 6.126996040344238,
"learning_rate": 4.58584752991797e-06,
"loss": 0.6584,
"num_input_tokens_seen": 34865136,
"step": 532
},
{
"epoch": 0.5883002207505519,
"grad_norm": 4.696587085723877,
"learning_rate": 4.56975786498188e-06,
"loss": 0.5208,
"num_input_tokens_seen": 34930672,
"step": 533
},
{
"epoch": 0.5894039735099338,
"grad_norm": 4.739029884338379,
"learning_rate": 4.553680613663504e-06,
"loss": 0.4503,
"num_input_tokens_seen": 34996208,
"step": 534
},
{
"epoch": 0.5905077262693157,
"grad_norm": 4.531418323516846,
"learning_rate": 4.537615990505744e-06,
"loss": 0.4225,
"num_input_tokens_seen": 35061744,
"step": 535
},
{
"epoch": 0.5916114790286976,
"grad_norm": 4.888784885406494,
"learning_rate": 4.521564209882995e-06,
"loss": 0.4602,
"num_input_tokens_seen": 35127280,
"step": 536
},
{
"epoch": 0.5927152317880795,
"grad_norm": 5.615579605102539,
"learning_rate": 4.505525485998267e-06,
"loss": 0.5887,
"num_input_tokens_seen": 35192816,
"step": 537
},
{
"epoch": 0.5938189845474614,
"grad_norm": 4.535574913024902,
"learning_rate": 4.489500032880342e-06,
"loss": 0.4209,
"num_input_tokens_seen": 35258352,
"step": 538
},
{
"epoch": 0.5949227373068433,
"grad_norm": 5.854379177093506,
"learning_rate": 4.473488064380901e-06,
"loss": 0.672,
"num_input_tokens_seen": 35323888,
"step": 539
},
{
"epoch": 0.5960264900662252,
"grad_norm": 4.801560878753662,
"learning_rate": 4.457489794171685e-06,
"loss": 0.4111,
"num_input_tokens_seen": 35389424,
"step": 540
},
{
"epoch": 0.5971302428256071,
"grad_norm": 4.031239032745361,
"learning_rate": 4.44150543574164e-06,
"loss": 0.2854,
"num_input_tokens_seen": 35454960,
"step": 541
},
{
"epoch": 0.5982339955849889,
"grad_norm": 4.758004665374756,
"learning_rate": 4.4255352023940616e-06,
"loss": 0.403,
"num_input_tokens_seen": 35520496,
"step": 542
},
{
"epoch": 0.5993377483443708,
"grad_norm": 6.281385898590088,
"learning_rate": 4.4095793072437554e-06,
"loss": 0.5664,
"num_input_tokens_seen": 35586032,
"step": 543
},
{
"epoch": 0.6004415011037527,
"grad_norm": 4.875401973724365,
"learning_rate": 4.393637963214191e-06,
"loss": 0.4567,
"num_input_tokens_seen": 35651568,
"step": 544
},
{
"epoch": 0.6015452538631346,
"grad_norm": 6.389124870300293,
"learning_rate": 4.37771138303466e-06,
"loss": 0.8247,
"num_input_tokens_seen": 35717104,
"step": 545
},
{
"epoch": 0.6026490066225165,
"grad_norm": 4.922704696655273,
"learning_rate": 4.3617997792374365e-06,
"loss": 0.5074,
"num_input_tokens_seen": 35782640,
"step": 546
},
{
"epoch": 0.6037527593818984,
"grad_norm": 5.83562707901001,
"learning_rate": 4.345903364154949e-06,
"loss": 0.5406,
"num_input_tokens_seen": 35848176,
"step": 547
},
{
"epoch": 0.6048565121412803,
"grad_norm": 5.12996244430542,
"learning_rate": 4.330022349916928e-06,
"loss": 0.5394,
"num_input_tokens_seen": 35913712,
"step": 548
},
{
"epoch": 0.6059602649006622,
"grad_norm": 5.372644424438477,
"learning_rate": 4.314156948447596e-06,
"loss": 0.6657,
"num_input_tokens_seen": 35979248,
"step": 549
},
{
"epoch": 0.6070640176600441,
"grad_norm": 5.526101112365723,
"learning_rate": 4.298307371462833e-06,
"loss": 0.5652,
"num_input_tokens_seen": 36044784,
"step": 550
},
{
"epoch": 0.608167770419426,
"grad_norm": 2.293391227722168,
"learning_rate": 4.282473830467342e-06,
"loss": 0.1381,
"num_input_tokens_seen": 36110320,
"step": 551
},
{
"epoch": 0.609271523178808,
"grad_norm": 3.795881509780884,
"learning_rate": 4.26665653675184e-06,
"loss": 0.2672,
"num_input_tokens_seen": 36175856,
"step": 552
},
{
"epoch": 0.6103752759381899,
"grad_norm": 3.400076389312744,
"learning_rate": 4.250855701390225e-06,
"loss": 0.2625,
"num_input_tokens_seen": 36241392,
"step": 553
},
{
"epoch": 0.6114790286975718,
"grad_norm": 5.294395446777344,
"learning_rate": 4.235071535236773e-06,
"loss": 0.5561,
"num_input_tokens_seen": 36306928,
"step": 554
},
{
"epoch": 0.6125827814569537,
"grad_norm": 3.94831919670105,
"learning_rate": 4.219304248923316e-06,
"loss": 0.287,
"num_input_tokens_seen": 36372464,
"step": 555
},
{
"epoch": 0.6136865342163356,
"grad_norm": 4.536540985107422,
"learning_rate": 4.203554052856431e-06,
"loss": 0.3303,
"num_input_tokens_seen": 36438000,
"step": 556
},
{
"epoch": 0.6147902869757175,
"grad_norm": 3.447150230407715,
"learning_rate": 4.187821157214638e-06,
"loss": 0.2098,
"num_input_tokens_seen": 36503536,
"step": 557
},
{
"epoch": 0.6158940397350994,
"grad_norm": 4.320523738861084,
"learning_rate": 4.1721057719455845e-06,
"loss": 0.2808,
"num_input_tokens_seen": 36569072,
"step": 558
},
{
"epoch": 0.6169977924944813,
"grad_norm": 6.25832986831665,
"learning_rate": 4.156408106763259e-06,
"loss": 0.6881,
"num_input_tokens_seen": 36634608,
"step": 559
},
{
"epoch": 0.6181015452538632,
"grad_norm": 5.774060249328613,
"learning_rate": 4.1407283711451795e-06,
"loss": 0.5689,
"num_input_tokens_seen": 36700144,
"step": 560
},
{
"epoch": 0.6192052980132451,
"grad_norm": 5.2640461921691895,
"learning_rate": 4.125066774329605e-06,
"loss": 0.4055,
"num_input_tokens_seen": 36765680,
"step": 561
},
{
"epoch": 0.6203090507726269,
"grad_norm": 4.976436614990234,
"learning_rate": 4.109423525312738e-06,
"loss": 0.3318,
"num_input_tokens_seen": 36831216,
"step": 562
},
{
"epoch": 0.6214128035320088,
"grad_norm": 5.786040306091309,
"learning_rate": 4.093798832845941e-06,
"loss": 0.3969,
"num_input_tokens_seen": 36896752,
"step": 563
},
{
"epoch": 0.6225165562913907,
"grad_norm": 5.763451099395752,
"learning_rate": 4.078192905432949e-06,
"loss": 0.4451,
"num_input_tokens_seen": 36962288,
"step": 564
},
{
"epoch": 0.6236203090507726,
"grad_norm": 4.722074031829834,
"learning_rate": 4.0626059513270885e-06,
"loss": 0.3197,
"num_input_tokens_seen": 37027824,
"step": 565
},
{
"epoch": 0.6247240618101545,
"grad_norm": 4.204012870788574,
"learning_rate": 4.047038178528494e-06,
"loss": 0.2824,
"num_input_tokens_seen": 37093360,
"step": 566
},
{
"epoch": 0.6258278145695364,
"grad_norm": 4.942794322967529,
"learning_rate": 4.0314897947813315e-06,
"loss": 0.4326,
"num_input_tokens_seen": 37158896,
"step": 567
},
{
"epoch": 0.6269315673289183,
"grad_norm": 4.9892730712890625,
"learning_rate": 4.015961007571036e-06,
"loss": 0.3502,
"num_input_tokens_seen": 37224432,
"step": 568
},
{
"epoch": 0.6280353200883002,
"grad_norm": 4.182072639465332,
"learning_rate": 4.000452024121534e-06,
"loss": 0.3456,
"num_input_tokens_seen": 37289968,
"step": 569
},
{
"epoch": 0.6291390728476821,
"grad_norm": 3.5467734336853027,
"learning_rate": 3.9849630513924844e-06,
"loss": 0.2242,
"num_input_tokens_seen": 37355504,
"step": 570
},
{
"epoch": 0.630242825607064,
"grad_norm": 4.3329033851623535,
"learning_rate": 3.9694942960765035e-06,
"loss": 0.397,
"num_input_tokens_seen": 37421040,
"step": 571
},
{
"epoch": 0.6313465783664459,
"grad_norm": 4.063337326049805,
"learning_rate": 3.954045964596425e-06,
"loss": 0.2883,
"num_input_tokens_seen": 37486576,
"step": 572
},
{
"epoch": 0.6324503311258278,
"grad_norm": 5.374694347381592,
"learning_rate": 3.938618263102534e-06,
"loss": 0.5777,
"num_input_tokens_seen": 37552112,
"step": 573
},
{
"epoch": 0.6335540838852097,
"grad_norm": 5.350125789642334,
"learning_rate": 3.923211397469818e-06,
"loss": 0.5059,
"num_input_tokens_seen": 37617648,
"step": 574
},
{
"epoch": 0.6346578366445916,
"grad_norm": 4.029745101928711,
"learning_rate": 3.9078255732952244e-06,
"loss": 0.3395,
"num_input_tokens_seen": 37683184,
"step": 575
},
{
"epoch": 0.6357615894039735,
"grad_norm": 4.23199987411499,
"learning_rate": 3.8924609958949035e-06,
"loss": 0.3145,
"num_input_tokens_seen": 37748720,
"step": 576
},
{
"epoch": 0.6368653421633554,
"grad_norm": 3.817701578140259,
"learning_rate": 3.877117870301488e-06,
"loss": 0.2616,
"num_input_tokens_seen": 37814256,
"step": 577
},
{
"epoch": 0.6379690949227373,
"grad_norm": 5.496271133422852,
"learning_rate": 3.861796401261341e-06,
"loss": 0.4336,
"num_input_tokens_seen": 37879792,
"step": 578
},
{
"epoch": 0.6390728476821192,
"grad_norm": 5.724914073944092,
"learning_rate": 3.846496793231834e-06,
"loss": 0.5434,
"num_input_tokens_seen": 37945328,
"step": 579
},
{
"epoch": 0.6401766004415012,
"grad_norm": 5.538677215576172,
"learning_rate": 3.8312192503786085e-06,
"loss": 0.4194,
"num_input_tokens_seen": 38010864,
"step": 580
},
{
"epoch": 0.6412803532008831,
"grad_norm": 4.735373020172119,
"learning_rate": 3.81596397657286e-06,
"loss": 0.3671,
"num_input_tokens_seen": 38076400,
"step": 581
},
{
"epoch": 0.6423841059602649,
"grad_norm": 4.600485324859619,
"learning_rate": 3.80073117538862e-06,
"loss": 0.2745,
"num_input_tokens_seen": 38141936,
"step": 582
},
{
"epoch": 0.6434878587196468,
"grad_norm": 5.746383190155029,
"learning_rate": 3.785521050100025e-06,
"loss": 0.5071,
"num_input_tokens_seen": 38207472,
"step": 583
},
{
"epoch": 0.6445916114790287,
"grad_norm": 5.705986976623535,
"learning_rate": 3.7703338036786195e-06,
"loss": 0.4335,
"num_input_tokens_seen": 38273008,
"step": 584
},
{
"epoch": 0.6456953642384106,
"grad_norm": 6.0024614334106445,
"learning_rate": 3.7551696387906365e-06,
"loss": 0.4688,
"num_input_tokens_seen": 38338544,
"step": 585
},
{
"epoch": 0.6467991169977925,
"grad_norm": 6.416266918182373,
"learning_rate": 3.7400287577942994e-06,
"loss": 0.5509,
"num_input_tokens_seen": 38404080,
"step": 586
},
{
"epoch": 0.6479028697571744,
"grad_norm": 5.125373840332031,
"learning_rate": 3.7249113627371203e-06,
"loss": 0.4493,
"num_input_tokens_seen": 38469616,
"step": 587
},
{
"epoch": 0.6490066225165563,
"grad_norm": 5.068411827087402,
"learning_rate": 3.7098176553532015e-06,
"loss": 0.4476,
"num_input_tokens_seen": 38535152,
"step": 588
},
{
"epoch": 0.6501103752759382,
"grad_norm": 5.842297554016113,
"learning_rate": 3.6947478370605516e-06,
"loss": 0.5361,
"num_input_tokens_seen": 38600688,
"step": 589
},
{
"epoch": 0.6512141280353201,
"grad_norm": 4.763930320739746,
"learning_rate": 3.6797021089583794e-06,
"loss": 0.4917,
"num_input_tokens_seen": 38666224,
"step": 590
},
{
"epoch": 0.652317880794702,
"grad_norm": 3.0710020065307617,
"learning_rate": 3.66468067182443e-06,
"loss": 0.1964,
"num_input_tokens_seen": 38731760,
"step": 591
},
{
"epoch": 0.6534216335540839,
"grad_norm": 4.4214887619018555,
"learning_rate": 3.649683726112299e-06,
"loss": 0.3169,
"num_input_tokens_seen": 38797296,
"step": 592
},
{
"epoch": 0.6545253863134658,
"grad_norm": 5.3136396408081055,
"learning_rate": 3.6347114719487496e-06,
"loss": 0.454,
"num_input_tokens_seen": 38862832,
"step": 593
},
{
"epoch": 0.6556291390728477,
"grad_norm": 5.324778079986572,
"learning_rate": 3.6197641091310553e-06,
"loss": 0.5138,
"num_input_tokens_seen": 38928368,
"step": 594
},
{
"epoch": 0.6567328918322296,
"grad_norm": 5.356673717498779,
"learning_rate": 3.6048418371243222e-06,
"loss": 0.4474,
"num_input_tokens_seen": 38993904,
"step": 595
},
{
"epoch": 0.6578366445916115,
"grad_norm": 5.669276237487793,
"learning_rate": 3.5899448550588335e-06,
"loss": 0.6614,
"num_input_tokens_seen": 39059440,
"step": 596
},
{
"epoch": 0.6589403973509934,
"grad_norm": 4.830669403076172,
"learning_rate": 3.5750733617273914e-06,
"loss": 0.4255,
"num_input_tokens_seen": 39124976,
"step": 597
},
{
"epoch": 0.6600441501103753,
"grad_norm": 5.008633613586426,
"learning_rate": 3.560227555582665e-06,
"loss": 0.2942,
"num_input_tokens_seen": 39190508,
"step": 598
},
{
"epoch": 0.6611479028697572,
"grad_norm": 3.8209006786346436,
"learning_rate": 3.5454076347345367e-06,
"loss": 0.2513,
"num_input_tokens_seen": 39256044,
"step": 599
},
{
"epoch": 0.6622516556291391,
"grad_norm": 4.039699554443359,
"learning_rate": 3.5306137969474663e-06,
"loss": 0.3439,
"num_input_tokens_seen": 39321580,
"step": 600
},
{
"epoch": 0.6633554083885209,
"grad_norm": 5.03836727142334,
"learning_rate": 3.515846239637846e-06,
"loss": 0.3721,
"num_input_tokens_seen": 39387116,
"step": 601
},
{
"epoch": 0.6644591611479028,
"grad_norm": 5.898911476135254,
"learning_rate": 3.5011051598713707e-06,
"loss": 0.5748,
"num_input_tokens_seen": 39452652,
"step": 602
},
{
"epoch": 0.6655629139072847,
"grad_norm": 6.085241794586182,
"learning_rate": 3.4863907543604e-06,
"loss": 0.6649,
"num_input_tokens_seen": 39518188,
"step": 603
},
{
"epoch": 0.6666666666666666,
"grad_norm": 5.235047340393066,
"learning_rate": 3.4717032194613455e-06,
"loss": 0.4869,
"num_input_tokens_seen": 39583724,
"step": 604
},
{
"epoch": 0.6677704194260485,
"grad_norm": 3.5258564949035645,
"learning_rate": 3.45704275117204e-06,
"loss": 0.2515,
"num_input_tokens_seen": 39649260,
"step": 605
},
{
"epoch": 0.6688741721854304,
"grad_norm": 4.411483287811279,
"learning_rate": 3.4424095451291273e-06,
"loss": 0.3673,
"num_input_tokens_seen": 39714796,
"step": 606
},
{
"epoch": 0.6699779249448123,
"grad_norm": 4.131246566772461,
"learning_rate": 3.4278037966054505e-06,
"loss": 0.3262,
"num_input_tokens_seen": 39780332,
"step": 607
},
{
"epoch": 0.6710816777041942,
"grad_norm": 6.033511161804199,
"learning_rate": 3.4132257005074424e-06,
"loss": 0.6029,
"num_input_tokens_seen": 39845868,
"step": 608
},
{
"epoch": 0.6721854304635762,
"grad_norm": 4.950568675994873,
"learning_rate": 3.3986754513725308e-06,
"loss": 0.356,
"num_input_tokens_seen": 39911404,
"step": 609
},
{
"epoch": 0.673289183222958,
"grad_norm": 5.005128860473633,
"learning_rate": 3.3841532433665425e-06,
"loss": 0.431,
"num_input_tokens_seen": 39976940,
"step": 610
},
{
"epoch": 0.67439293598234,
"grad_norm": 6.317650318145752,
"learning_rate": 3.369659270281106e-06,
"loss": 0.7201,
"num_input_tokens_seen": 40042476,
"step": 611
},
{
"epoch": 0.6754966887417219,
"grad_norm": 4.954858779907227,
"learning_rate": 3.3551937255310656e-06,
"loss": 0.4634,
"num_input_tokens_seen": 40108012,
"step": 612
},
{
"epoch": 0.6766004415011038,
"grad_norm": 5.401790142059326,
"learning_rate": 3.3407568021519086e-06,
"loss": 0.4785,
"num_input_tokens_seen": 40173548,
"step": 613
},
{
"epoch": 0.6777041942604857,
"grad_norm": 3.7082345485687256,
"learning_rate": 3.326348692797185e-06,
"loss": 0.2134,
"num_input_tokens_seen": 40239084,
"step": 614
},
{
"epoch": 0.6788079470198676,
"grad_norm": 4.993045806884766,
"learning_rate": 3.3119695897359318e-06,
"loss": 0.4226,
"num_input_tokens_seen": 40304620,
"step": 615
},
{
"epoch": 0.6799116997792495,
"grad_norm": 7.229109764099121,
"learning_rate": 3.2976196848501164e-06,
"loss": 0.5581,
"num_input_tokens_seen": 40370156,
"step": 616
},
{
"epoch": 0.6810154525386314,
"grad_norm": 5.062275409698486,
"learning_rate": 3.2832991696320647e-06,
"loss": 0.3896,
"num_input_tokens_seen": 40435692,
"step": 617
},
{
"epoch": 0.6821192052980133,
"grad_norm": 4.842869281768799,
"learning_rate": 3.2690082351819176e-06,
"loss": 0.3345,
"num_input_tokens_seen": 40501228,
"step": 618
},
{
"epoch": 0.6832229580573952,
"grad_norm": 4.732944011688232,
"learning_rate": 3.254747072205072e-06,
"loss": 0.3487,
"num_input_tokens_seen": 40566764,
"step": 619
},
{
"epoch": 0.6843267108167771,
"grad_norm": 5.033801078796387,
"learning_rate": 3.2405158710096437e-06,
"loss": 0.3981,
"num_input_tokens_seen": 40632300,
"step": 620
},
{
"epoch": 0.6854304635761589,
"grad_norm": 4.653602600097656,
"learning_rate": 3.2263148215039188e-06,
"loss": 0.4127,
"num_input_tokens_seen": 40697836,
"step": 621
},
{
"epoch": 0.6865342163355408,
"grad_norm": 5.868643760681152,
"learning_rate": 3.2121441131938257e-06,
"loss": 0.4875,
"num_input_tokens_seen": 40763372,
"step": 622
},
{
"epoch": 0.6876379690949227,
"grad_norm": 4.774325847625732,
"learning_rate": 3.198003935180406e-06,
"loss": 0.3392,
"num_input_tokens_seen": 40828908,
"step": 623
},
{
"epoch": 0.6887417218543046,
"grad_norm": 4.566282749176025,
"learning_rate": 3.183894476157288e-06,
"loss": 0.2351,
"num_input_tokens_seen": 40894444,
"step": 624
},
{
"epoch": 0.6898454746136865,
"grad_norm": 6.035506725311279,
"learning_rate": 3.1698159244081728e-06,
"loss": 0.5228,
"num_input_tokens_seen": 40959980,
"step": 625
},
{
"epoch": 0.6909492273730684,
"grad_norm": 5.202334880828857,
"learning_rate": 3.1557684678043145e-06,
"loss": 0.4605,
"num_input_tokens_seen": 41025516,
"step": 626
},
{
"epoch": 0.6920529801324503,
"grad_norm": 5.597423553466797,
"learning_rate": 3.1417522938020227e-06,
"loss": 0.4955,
"num_input_tokens_seen": 41091052,
"step": 627
},
{
"epoch": 0.6931567328918322,
"grad_norm": 4.580325603485107,
"learning_rate": 3.127767589440154e-06,
"loss": 0.3477,
"num_input_tokens_seen": 41156588,
"step": 628
},
{
"epoch": 0.6942604856512141,
"grad_norm": 5.499457836151123,
"learning_rate": 3.1138145413376187e-06,
"loss": 0.4553,
"num_input_tokens_seen": 41222124,
"step": 629
},
{
"epoch": 0.695364238410596,
"grad_norm": 4.949913024902344,
"learning_rate": 3.0998933356908933e-06,
"loss": 0.36,
"num_input_tokens_seen": 41287660,
"step": 630
},
{
"epoch": 0.6964679911699779,
"grad_norm": 4.248452663421631,
"learning_rate": 3.086004158271526e-06,
"loss": 0.298,
"num_input_tokens_seen": 41353196,
"step": 631
},
{
"epoch": 0.6975717439293598,
"grad_norm": 3.7569615840911865,
"learning_rate": 3.072147194423668e-06,
"loss": 0.2398,
"num_input_tokens_seen": 41418732,
"step": 632
},
{
"epoch": 0.6986754966887417,
"grad_norm": 4.620909690856934,
"learning_rate": 3.058322629061598e-06,
"loss": 0.3882,
"num_input_tokens_seen": 41484268,
"step": 633
},
{
"epoch": 0.6997792494481236,
"grad_norm": 6.98662805557251,
"learning_rate": 3.044530646667251e-06,
"loss": 0.5345,
"num_input_tokens_seen": 41549804,
"step": 634
},
{
"epoch": 0.7008830022075055,
"grad_norm": 5.646053791046143,
"learning_rate": 3.0307714312877588e-06,
"loss": 0.4899,
"num_input_tokens_seen": 41615340,
"step": 635
},
{
"epoch": 0.7019867549668874,
"grad_norm": 11.439247131347656,
"learning_rate": 3.0170451665329936e-06,
"loss": 0.4995,
"num_input_tokens_seen": 41680876,
"step": 636
},
{
"epoch": 0.7030905077262694,
"grad_norm": 5.286317825317383,
"learning_rate": 3.0033520355731182e-06,
"loss": 0.4077,
"num_input_tokens_seen": 41746412,
"step": 637
},
{
"epoch": 0.7041942604856513,
"grad_norm": 6.521766662597656,
"learning_rate": 2.9896922211361423e-06,
"loss": 0.5231,
"num_input_tokens_seen": 41811948,
"step": 638
},
{
"epoch": 0.7052980132450332,
"grad_norm": 4.848811149597168,
"learning_rate": 2.9760659055054826e-06,
"loss": 0.3481,
"num_input_tokens_seen": 41877484,
"step": 639
},
{
"epoch": 0.7064017660044151,
"grad_norm": 4.405468940734863,
"learning_rate": 2.962473270517528e-06,
"loss": 0.3645,
"num_input_tokens_seen": 41943020,
"step": 640
},
{
"epoch": 0.7075055187637969,
"grad_norm": 3.9719290733337402,
"learning_rate": 2.94891449755922e-06,
"loss": 0.2403,
"num_input_tokens_seen": 42008556,
"step": 641
},
{
"epoch": 0.7086092715231788,
"grad_norm": 5.281460762023926,
"learning_rate": 2.9353897675656267e-06,
"loss": 0.4094,
"num_input_tokens_seen": 42074092,
"step": 642
},
{
"epoch": 0.7097130242825607,
"grad_norm": 3.758775234222412,
"learning_rate": 2.9218992610175324e-06,
"loss": 0.2112,
"num_input_tokens_seen": 42139628,
"step": 643
},
{
"epoch": 0.7108167770419426,
"grad_norm": 6.650103569030762,
"learning_rate": 2.9084431579390204e-06,
"loss": 0.6355,
"num_input_tokens_seen": 42205164,
"step": 644
},
{
"epoch": 0.7119205298013245,
"grad_norm": 5.46085786819458,
"learning_rate": 2.8950216378950824e-06,
"loss": 0.5132,
"num_input_tokens_seen": 42270700,
"step": 645
},
{
"epoch": 0.7130242825607064,
"grad_norm": 5.055593967437744,
"learning_rate": 2.8816348799892134e-06,
"loss": 0.3778,
"num_input_tokens_seen": 42336236,
"step": 646
},
{
"epoch": 0.7141280353200883,
"grad_norm": 3.933264970779419,
"learning_rate": 2.868283062861028e-06,
"loss": 0.2851,
"num_input_tokens_seen": 42401772,
"step": 647
},
{
"epoch": 0.7152317880794702,
"grad_norm": 5.225867748260498,
"learning_rate": 2.854966364683872e-06,
"loss": 0.4315,
"num_input_tokens_seen": 42467308,
"step": 648
},
{
"epoch": 0.7163355408388521,
"grad_norm": 4.521208763122559,
"learning_rate": 2.8416849631624453e-06,
"loss": 0.3846,
"num_input_tokens_seen": 42532844,
"step": 649
},
{
"epoch": 0.717439293598234,
"grad_norm": 4.635922908782959,
"learning_rate": 2.8284390355304325e-06,
"loss": 0.5581,
"num_input_tokens_seen": 42598380,
"step": 650
},
{
"epoch": 0.7185430463576159,
"grad_norm": 4.856995105743408,
"learning_rate": 2.8152287585481384e-06,
"loss": 0.4154,
"num_input_tokens_seen": 42663916,
"step": 651
},
{
"epoch": 0.7196467991169978,
"grad_norm": 4.650574684143066,
"learning_rate": 2.802054308500125e-06,
"loss": 0.4506,
"num_input_tokens_seen": 42729452,
"step": 652
},
{
"epoch": 0.7207505518763797,
"grad_norm": 5.974058628082275,
"learning_rate": 2.7889158611928647e-06,
"loss": 0.406,
"num_input_tokens_seen": 42794988,
"step": 653
},
{
"epoch": 0.7218543046357616,
"grad_norm": 3.3415307998657227,
"learning_rate": 2.775813591952385e-06,
"loss": 0.2127,
"num_input_tokens_seen": 42860524,
"step": 654
},
{
"epoch": 0.7229580573951435,
"grad_norm": 5.096837997436523,
"learning_rate": 2.7627476756219416e-06,
"loss": 0.4521,
"num_input_tokens_seen": 42926060,
"step": 655
},
{
"epoch": 0.7240618101545254,
"grad_norm": 5.722452640533447,
"learning_rate": 2.7497182865596785e-06,
"loss": 0.488,
"num_input_tokens_seen": 42991596,
"step": 656
},
{
"epoch": 0.7251655629139073,
"grad_norm": 5.530818462371826,
"learning_rate": 2.7367255986362995e-06,
"loss": 0.5543,
"num_input_tokens_seen": 43057132,
"step": 657
},
{
"epoch": 0.7262693156732892,
"grad_norm": 4.10623836517334,
"learning_rate": 2.7237697852327465e-06,
"loss": 0.2923,
"num_input_tokens_seen": 43122668,
"step": 658
},
{
"epoch": 0.7273730684326711,
"grad_norm": 5.076680660247803,
"learning_rate": 2.7108510192378956e-06,
"loss": 0.4849,
"num_input_tokens_seen": 43188204,
"step": 659
},
{
"epoch": 0.7284768211920529,
"grad_norm": 5.36618185043335,
"learning_rate": 2.697969473046239e-06,
"loss": 0.504,
"num_input_tokens_seen": 43253740,
"step": 660
},
{
"epoch": 0.7295805739514348,
"grad_norm": 3.818769693374634,
"learning_rate": 2.685125318555595e-06,
"loss": 0.2422,
"num_input_tokens_seen": 43319276,
"step": 661
},
{
"epoch": 0.7306843267108167,
"grad_norm": 4.558277606964111,
"learning_rate": 2.672318727164803e-06,
"loss": 0.3608,
"num_input_tokens_seen": 43384812,
"step": 662
},
{
"epoch": 0.7317880794701986,
"grad_norm": 3.284147262573242,
"learning_rate": 2.659549869771442e-06,
"loss": 0.1414,
"num_input_tokens_seen": 43450348,
"step": 663
},
{
"epoch": 0.7328918322295805,
"grad_norm": 4.764499187469482,
"learning_rate": 2.646818916769551e-06,
"loss": 0.3462,
"num_input_tokens_seen": 43515884,
"step": 664
},
{
"epoch": 0.7339955849889624,
"grad_norm": 6.250286102294922,
"learning_rate": 2.6341260380473522e-06,
"loss": 0.584,
"num_input_tokens_seen": 43581420,
"step": 665
},
{
"epoch": 0.7350993377483444,
"grad_norm": 5.290203094482422,
"learning_rate": 2.621471402984991e-06,
"loss": 0.4575,
"num_input_tokens_seen": 43646956,
"step": 666
},
{
"epoch": 0.7362030905077263,
"grad_norm": 4.591466903686523,
"learning_rate": 2.60885518045226e-06,
"loss": 0.3306,
"num_input_tokens_seen": 43712492,
"step": 667
},
{
"epoch": 0.7373068432671082,
"grad_norm": 3.301421642303467,
"learning_rate": 2.5962775388063653e-06,
"loss": 0.1541,
"num_input_tokens_seen": 43778028,
"step": 668
},
{
"epoch": 0.7384105960264901,
"grad_norm": 4.60337495803833,
"learning_rate": 2.5837386458896665e-06,
"loss": 0.3604,
"num_input_tokens_seen": 43843564,
"step": 669
},
{
"epoch": 0.739514348785872,
"grad_norm": 5.7256879806518555,
"learning_rate": 2.5712386690274405e-06,
"loss": 0.5413,
"num_input_tokens_seen": 43909100,
"step": 670
},
{
"epoch": 0.7406181015452539,
"grad_norm": 4.792141914367676,
"learning_rate": 2.55877777502565e-06,
"loss": 0.3497,
"num_input_tokens_seen": 43974636,
"step": 671
},
{
"epoch": 0.7417218543046358,
"grad_norm": 4.639016628265381,
"learning_rate": 2.5463561301687122e-06,
"loss": 0.3359,
"num_input_tokens_seen": 44040172,
"step": 672
},
{
"epoch": 0.7428256070640177,
"grad_norm": 6.675306797027588,
"learning_rate": 2.533973900217292e-06,
"loss": 0.6212,
"num_input_tokens_seen": 44105708,
"step": 673
},
{
"epoch": 0.7439293598233996,
"grad_norm": 4.351767063140869,
"learning_rate": 2.521631250406076e-06,
"loss": 0.3184,
"num_input_tokens_seen": 44171244,
"step": 674
},
{
"epoch": 0.7450331125827815,
"grad_norm": 5.042402744293213,
"learning_rate": 2.5093283454415753e-06,
"loss": 0.3898,
"num_input_tokens_seen": 44236780,
"step": 675
},
{
"epoch": 0.7461368653421634,
"grad_norm": 5.825693130493164,
"learning_rate": 2.4970653494999233e-06,
"loss": 0.5072,
"num_input_tokens_seen": 44302316,
"step": 676
},
{
"epoch": 0.7472406181015453,
"grad_norm": 3.8188884258270264,
"learning_rate": 2.484842426224692e-06,
"loss": 0.2332,
"num_input_tokens_seen": 44367852,
"step": 677
},
{
"epoch": 0.7483443708609272,
"grad_norm": 4.3476033210754395,
"learning_rate": 2.4726597387247e-06,
"loss": 0.307,
"num_input_tokens_seen": 44433388,
"step": 678
},
{
"epoch": 0.7494481236203091,
"grad_norm": 5.082565784454346,
"learning_rate": 2.4605174495718426e-06,
"loss": 0.3315,
"num_input_tokens_seen": 44498924,
"step": 679
},
{
"epoch": 0.7505518763796909,
"grad_norm": 5.1927032470703125,
"learning_rate": 2.4484157207989172e-06,
"loss": 0.4357,
"num_input_tokens_seen": 44564460,
"step": 680
},
{
"epoch": 0.7516556291390728,
"grad_norm": 4.175013542175293,
"learning_rate": 2.4363547138974615e-06,
"loss": 0.3239,
"num_input_tokens_seen": 44629996,
"step": 681
},
{
"epoch": 0.7527593818984547,
"grad_norm": 3.897489070892334,
"learning_rate": 2.4243345898156036e-06,
"loss": 0.2688,
"num_input_tokens_seen": 44695532,
"step": 682
},
{
"epoch": 0.7538631346578366,
"grad_norm": 3.912888288497925,
"learning_rate": 2.4123555089559084e-06,
"loss": 0.2775,
"num_input_tokens_seen": 44761068,
"step": 683
},
{
"epoch": 0.7549668874172185,
"grad_norm": 3.3684606552124023,
"learning_rate": 2.4004176311732407e-06,
"loss": 0.1913,
"num_input_tokens_seen": 44826604,
"step": 684
},
{
"epoch": 0.7560706401766004,
"grad_norm": 2.597721815109253,
"learning_rate": 2.388521115772631e-06,
"loss": 0.1278,
"num_input_tokens_seen": 44892140,
"step": 685
},
{
"epoch": 0.7571743929359823,
"grad_norm": 5.179257869720459,
"learning_rate": 2.3766661215071473e-06,
"loss": 0.4902,
"num_input_tokens_seen": 44957676,
"step": 686
},
{
"epoch": 0.7582781456953642,
"grad_norm": 5.601065635681152,
"learning_rate": 2.364852806575782e-06,
"loss": 0.5226,
"num_input_tokens_seen": 45023212,
"step": 687
},
{
"epoch": 0.7593818984547461,
"grad_norm": 4.68391752243042,
"learning_rate": 2.353081328621335e-06,
"loss": 0.3309,
"num_input_tokens_seen": 45088748,
"step": 688
},
{
"epoch": 0.760485651214128,
"grad_norm": 4.620096683502197,
"learning_rate": 2.3413518447283145e-06,
"loss": 0.3436,
"num_input_tokens_seen": 45154284,
"step": 689
},
{
"epoch": 0.7615894039735099,
"grad_norm": 3.8575119972229004,
"learning_rate": 2.329664511420835e-06,
"loss": 0.2027,
"num_input_tokens_seen": 45219820,
"step": 690
},
{
"epoch": 0.7626931567328918,
"grad_norm": 5.794271945953369,
"learning_rate": 2.3180194846605367e-06,
"loss": 0.5122,
"num_input_tokens_seen": 45285356,
"step": 691
},
{
"epoch": 0.7637969094922737,
"grad_norm": 3.642765998840332,
"learning_rate": 2.3064169198444988e-06,
"loss": 0.1909,
"num_input_tokens_seen": 45350892,
"step": 692
},
{
"epoch": 0.7649006622516556,
"grad_norm": 4.802554130554199,
"learning_rate": 2.2948569718031665e-06,
"loss": 0.3171,
"num_input_tokens_seen": 45416428,
"step": 693
},
{
"epoch": 0.7660044150110376,
"grad_norm": 3.871217727661133,
"learning_rate": 2.283339794798286e-06,
"loss": 0.2115,
"num_input_tokens_seen": 45481964,
"step": 694
},
{
"epoch": 0.7671081677704195,
"grad_norm": 4.514727592468262,
"learning_rate": 2.2718655425208443e-06,
"loss": 0.2803,
"num_input_tokens_seen": 45547500,
"step": 695
},
{
"epoch": 0.7682119205298014,
"grad_norm": 4.732895851135254,
"learning_rate": 2.26043436808902e-06,
"loss": 0.3749,
"num_input_tokens_seen": 45613036,
"step": 696
},
{
"epoch": 0.7693156732891833,
"grad_norm": 4.663212776184082,
"learning_rate": 2.2490464240461386e-06,
"loss": 0.2932,
"num_input_tokens_seen": 45678572,
"step": 697
},
{
"epoch": 0.7704194260485652,
"grad_norm": 4.421245574951172,
"learning_rate": 2.2377018623586392e-06,
"loss": 0.2619,
"num_input_tokens_seen": 45744108,
"step": 698
},
{
"epoch": 0.7715231788079471,
"grad_norm": 4.790342807769775,
"learning_rate": 2.2264008344140444e-06,
"loss": 0.351,
"num_input_tokens_seen": 45809644,
"step": 699
},
{
"epoch": 0.7726269315673289,
"grad_norm": 4.666688919067383,
"learning_rate": 2.2151434910189397e-06,
"loss": 0.307,
"num_input_tokens_seen": 45875180,
"step": 700
},
{
"epoch": 0.7737306843267108,
"grad_norm": 6.7703704833984375,
"learning_rate": 2.2039299823969623e-06,
"loss": 0.5548,
"num_input_tokens_seen": 45940716,
"step": 701
},
{
"epoch": 0.7748344370860927,
"grad_norm": 4.884865760803223,
"learning_rate": 2.1927604581867974e-06,
"loss": 0.2819,
"num_input_tokens_seen": 46006252,
"step": 702
},
{
"epoch": 0.7759381898454746,
"grad_norm": 3.6003012657165527,
"learning_rate": 2.1816350674401804e-06,
"loss": 0.2164,
"num_input_tokens_seen": 46071788,
"step": 703
},
{
"epoch": 0.7770419426048565,
"grad_norm": 4.006162166595459,
"learning_rate": 2.1705539586199037e-06,
"loss": 0.2567,
"num_input_tokens_seen": 46137324,
"step": 704
},
{
"epoch": 0.7781456953642384,
"grad_norm": 5.422812461853027,
"learning_rate": 2.159517279597844e-06,
"loss": 0.4782,
"num_input_tokens_seen": 46202860,
"step": 705
},
{
"epoch": 0.7792494481236203,
"grad_norm": 5.053647041320801,
"learning_rate": 2.148525177652982e-06,
"loss": 0.2962,
"num_input_tokens_seen": 46268396,
"step": 706
},
{
"epoch": 0.7803532008830022,
"grad_norm": 4.703584671020508,
"learning_rate": 2.1375777994694415e-06,
"loss": 0.3387,
"num_input_tokens_seen": 46333932,
"step": 707
},
{
"epoch": 0.7814569536423841,
"grad_norm": 6.285885810852051,
"learning_rate": 2.1266752911345293e-06,
"loss": 0.7101,
"num_input_tokens_seen": 46399468,
"step": 708
},
{
"epoch": 0.782560706401766,
"grad_norm": 5.295046329498291,
"learning_rate": 2.1158177981367832e-06,
"loss": 0.4221,
"num_input_tokens_seen": 46465004,
"step": 709
},
{
"epoch": 0.7836644591611479,
"grad_norm": 5.126113414764404,
"learning_rate": 2.1050054653640382e-06,
"loss": 0.499,
"num_input_tokens_seen": 46530540,
"step": 710
},
{
"epoch": 0.7847682119205298,
"grad_norm": 4.821897983551025,
"learning_rate": 2.0942384371014858e-06,
"loss": 0.3448,
"num_input_tokens_seen": 46596076,
"step": 711
},
{
"epoch": 0.7858719646799117,
"grad_norm": 5.19854211807251,
"learning_rate": 2.083516857029757e-06,
"loss": 0.5352,
"num_input_tokens_seen": 46661612,
"step": 712
},
{
"epoch": 0.7869757174392936,
"grad_norm": 4.652998924255371,
"learning_rate": 2.072840868222989e-06,
"loss": 0.3713,
"num_input_tokens_seen": 46727148,
"step": 713
},
{
"epoch": 0.7880794701986755,
"grad_norm": 4.630868434906006,
"learning_rate": 2.0622106131469346e-06,
"loss": 0.3579,
"num_input_tokens_seen": 46792684,
"step": 714
},
{
"epoch": 0.7891832229580574,
"grad_norm": 5.569841384887695,
"learning_rate": 2.0516262336570504e-06,
"loss": 0.6049,
"num_input_tokens_seen": 46858220,
"step": 715
},
{
"epoch": 0.7902869757174393,
"grad_norm": 4.281322479248047,
"learning_rate": 2.0410878709966055e-06,
"loss": 0.2668,
"num_input_tokens_seen": 46923756,
"step": 716
},
{
"epoch": 0.7913907284768212,
"grad_norm": 6.267445087432861,
"learning_rate": 2.0305956657947993e-06,
"loss": 0.5742,
"num_input_tokens_seen": 46989292,
"step": 717
},
{
"epoch": 0.7924944812362031,
"grad_norm": 6.611531734466553,
"learning_rate": 2.0201497580648804e-06,
"loss": 0.6532,
"num_input_tokens_seen": 47054828,
"step": 718
},
{
"epoch": 0.7935982339955849,
"grad_norm": 5.3773040771484375,
"learning_rate": 2.0097502872022808e-06,
"loss": 0.4038,
"num_input_tokens_seen": 47120364,
"step": 719
},
{
"epoch": 0.7947019867549668,
"grad_norm": 5.617459774017334,
"learning_rate": 1.999397391982758e-06,
"loss": 0.4642,
"num_input_tokens_seen": 47185900,
"step": 720
},
{
"epoch": 0.7958057395143487,
"grad_norm": 4.279394149780273,
"learning_rate": 1.98909121056054e-06,
"loss": 0.2961,
"num_input_tokens_seen": 47251436,
"step": 721
},
{
"epoch": 0.7969094922737306,
"grad_norm": 4.948429107666016,
"learning_rate": 1.97883188046648e-06,
"loss": 0.298,
"num_input_tokens_seen": 47316972,
"step": 722
},
{
"epoch": 0.7980132450331126,
"grad_norm": 4.897294521331787,
"learning_rate": 1.9686195386062253e-06,
"loss": 0.406,
"num_input_tokens_seen": 47382508,
"step": 723
},
{
"epoch": 0.7991169977924945,
"grad_norm": 3.2944583892822266,
"learning_rate": 1.958454321258391e-06,
"loss": 0.1767,
"num_input_tokens_seen": 47448044,
"step": 724
},
{
"epoch": 0.8002207505518764,
"grad_norm": 5.691059589385986,
"learning_rate": 1.948336364072736e-06,
"loss": 0.5323,
"num_input_tokens_seen": 47513580,
"step": 725
},
{
"epoch": 0.8013245033112583,
"grad_norm": 3.72227144241333,
"learning_rate": 1.9382658020683572e-06,
"loss": 0.2576,
"num_input_tokens_seen": 47579116,
"step": 726
},
{
"epoch": 0.8024282560706402,
"grad_norm": 5.053169250488281,
"learning_rate": 1.928242769631884e-06,
"loss": 0.4782,
"num_input_tokens_seen": 47644652,
"step": 727
},
{
"epoch": 0.8035320088300221,
"grad_norm": 3.867647171020508,
"learning_rate": 1.918267400515691e-06,
"loss": 0.2233,
"num_input_tokens_seen": 47710188,
"step": 728
},
{
"epoch": 0.804635761589404,
"grad_norm": 6.746331691741943,
"learning_rate": 1.9083398278361077e-06,
"loss": 0.5753,
"num_input_tokens_seen": 47775724,
"step": 729
},
{
"epoch": 0.8057395143487859,
"grad_norm": 5.674907207489014,
"learning_rate": 1.8984601840716443e-06,
"loss": 0.4902,
"num_input_tokens_seen": 47841260,
"step": 730
},
{
"epoch": 0.8068432671081678,
"grad_norm": 5.640385150909424,
"learning_rate": 1.8886286010612226e-06,
"loss": 0.5091,
"num_input_tokens_seen": 47906796,
"step": 731
},
{
"epoch": 0.8079470198675497,
"grad_norm": 4.219995975494385,
"learning_rate": 1.8788452100024185e-06,
"loss": 0.2676,
"num_input_tokens_seen": 47972332,
"step": 732
},
{
"epoch": 0.8090507726269316,
"grad_norm": 4.815901279449463,
"learning_rate": 1.8691101414497104e-06,
"loss": 0.3313,
"num_input_tokens_seen": 48037868,
"step": 733
},
{
"epoch": 0.8101545253863135,
"grad_norm": 5.200927734375,
"learning_rate": 1.8594235253127373e-06,
"loss": 0.4606,
"num_input_tokens_seen": 48103404,
"step": 734
},
{
"epoch": 0.8112582781456954,
"grad_norm": 4.116132736206055,
"learning_rate": 1.8497854908545632e-06,
"loss": 0.2862,
"num_input_tokens_seen": 48168940,
"step": 735
},
{
"epoch": 0.8123620309050773,
"grad_norm": 4.191470623016357,
"learning_rate": 1.840196166689956e-06,
"loss": 0.248,
"num_input_tokens_seen": 48234476,
"step": 736
},
{
"epoch": 0.8134657836644592,
"grad_norm": 4.818159103393555,
"learning_rate": 1.8306556807836673e-06,
"loss": 0.3043,
"num_input_tokens_seen": 48300012,
"step": 737
},
{
"epoch": 0.8145695364238411,
"grad_norm": 5.291959762573242,
"learning_rate": 1.8211641604487276e-06,
"loss": 0.3915,
"num_input_tokens_seen": 48365548,
"step": 738
},
{
"epoch": 0.8156732891832229,
"grad_norm": 3.9926469326019287,
"learning_rate": 1.811721732344745e-06,
"loss": 0.2789,
"num_input_tokens_seen": 48431084,
"step": 739
},
{
"epoch": 0.8167770419426048,
"grad_norm": 3.875976085662842,
"learning_rate": 1.8023285224762182e-06,
"loss": 0.2372,
"num_input_tokens_seen": 48496620,
"step": 740
},
{
"epoch": 0.8178807947019867,
"grad_norm": 4.194228649139404,
"learning_rate": 1.792984656190851e-06,
"loss": 0.3044,
"num_input_tokens_seen": 48562156,
"step": 741
},
{
"epoch": 0.8189845474613686,
"grad_norm": 3.9738593101501465,
"learning_rate": 1.7836902581778814e-06,
"loss": 0.2313,
"num_input_tokens_seen": 48627688,
"step": 742
},
{
"epoch": 0.8200883002207505,
"grad_norm": 5.357481479644775,
"learning_rate": 1.7744454524664206e-06,
"loss": 0.3658,
"num_input_tokens_seen": 48693224,
"step": 743
},
{
"epoch": 0.8211920529801324,
"grad_norm": 4.926805019378662,
"learning_rate": 1.7652503624237943e-06,
"loss": 0.3554,
"num_input_tokens_seen": 48758760,
"step": 744
},
{
"epoch": 0.8222958057395143,
"grad_norm": 4.7101149559021,
"learning_rate": 1.7561051107538957e-06,
"loss": 0.3042,
"num_input_tokens_seen": 48824296,
"step": 745
},
{
"epoch": 0.8233995584988962,
"grad_norm": 7.876413822174072,
"learning_rate": 1.7470098194955502e-06,
"loss": 0.9089,
"num_input_tokens_seen": 48889832,
"step": 746
},
{
"epoch": 0.8245033112582781,
"grad_norm": 7.759825706481934,
"learning_rate": 1.737964610020888e-06,
"loss": 0.7701,
"num_input_tokens_seen": 48955368,
"step": 747
},
{
"epoch": 0.82560706401766,
"grad_norm": 5.320918083190918,
"learning_rate": 1.7289696030337217e-06,
"loss": 0.473,
"num_input_tokens_seen": 49020904,
"step": 748
},
{
"epoch": 0.826710816777042,
"grad_norm": 4.480899810791016,
"learning_rate": 1.7200249185679373e-06,
"loss": 0.279,
"num_input_tokens_seen": 49086440,
"step": 749
},
{
"epoch": 0.8278145695364238,
"grad_norm": 3.9195127487182617,
"learning_rate": 1.7111306759858915e-06,
"loss": 0.2524,
"num_input_tokens_seen": 49151976,
"step": 750
},
{
"epoch": 0.8289183222958058,
"grad_norm": 4.489572525024414,
"learning_rate": 1.7022869939768189e-06,
"loss": 0.3206,
"num_input_tokens_seen": 49217512,
"step": 751
},
{
"epoch": 0.8300220750551877,
"grad_norm": 4.9796247482299805,
"learning_rate": 1.6934939905552483e-06,
"loss": 0.3519,
"num_input_tokens_seen": 49283048,
"step": 752
},
{
"epoch": 0.8311258278145696,
"grad_norm": 4.599273204803467,
"learning_rate": 1.6847517830594304e-06,
"loss": 0.2959,
"num_input_tokens_seen": 49348584,
"step": 753
},
{
"epoch": 0.8322295805739515,
"grad_norm": 4.5784502029418945,
"learning_rate": 1.676060488149765e-06,
"loss": 0.3448,
"num_input_tokens_seen": 49414120,
"step": 754
},
{
"epoch": 0.8333333333333334,
"grad_norm": 4.545161247253418,
"learning_rate": 1.6674202218072528e-06,
"loss": 0.2993,
"num_input_tokens_seen": 49479656,
"step": 755
},
{
"epoch": 0.8344370860927153,
"grad_norm": 5.198947429656982,
"learning_rate": 1.6588310993319411e-06,
"loss": 0.3496,
"num_input_tokens_seen": 49545192,
"step": 756
},
{
"epoch": 0.8355408388520972,
"grad_norm": 6.070314407348633,
"learning_rate": 1.6502932353413911e-06,
"loss": 0.6534,
"num_input_tokens_seen": 49610728,
"step": 757
},
{
"epoch": 0.8366445916114791,
"grad_norm": 3.886892557144165,
"learning_rate": 1.641806743769142e-06,
"loss": 0.2371,
"num_input_tokens_seen": 49676264,
"step": 758
},
{
"epoch": 0.8377483443708609,
"grad_norm": 4.761082649230957,
"learning_rate": 1.633371737863194e-06,
"loss": 0.3792,
"num_input_tokens_seen": 49741800,
"step": 759
},
{
"epoch": 0.8388520971302428,
"grad_norm": 4.743200778961182,
"learning_rate": 1.6249883301844977e-06,
"loss": 0.2798,
"num_input_tokens_seen": 49807336,
"step": 760
},
{
"epoch": 0.8399558498896247,
"grad_norm": 4.598114967346191,
"learning_rate": 1.616656632605451e-06,
"loss": 0.3047,
"num_input_tokens_seen": 49872872,
"step": 761
},
{
"epoch": 0.8410596026490066,
"grad_norm": 4.357061386108398,
"learning_rate": 1.6083767563084056e-06,
"loss": 0.2891,
"num_input_tokens_seen": 49938408,
"step": 762
},
{
"epoch": 0.8421633554083885,
"grad_norm": 5.4613938331604,
"learning_rate": 1.6001488117841855e-06,
"loss": 0.3881,
"num_input_tokens_seen": 50003944,
"step": 763
},
{
"epoch": 0.8432671081677704,
"grad_norm": 5.408591270446777,
"learning_rate": 1.5919729088306093e-06,
"loss": 0.4353,
"num_input_tokens_seen": 50069480,
"step": 764
},
{
"epoch": 0.8443708609271523,
"grad_norm": 4.4144697189331055,
"learning_rate": 1.5838491565510275e-06,
"loss": 0.3202,
"num_input_tokens_seen": 50135016,
"step": 765
},
{
"epoch": 0.8454746136865342,
"grad_norm": 4.777880668640137,
"learning_rate": 1.5757776633528654e-06,
"loss": 0.3601,
"num_input_tokens_seen": 50200552,
"step": 766
},
{
"epoch": 0.8465783664459161,
"grad_norm": 5.162795066833496,
"learning_rate": 1.5677585369461796e-06,
"loss": 0.4695,
"num_input_tokens_seen": 50266088,
"step": 767
},
{
"epoch": 0.847682119205298,
"grad_norm": 4.954742431640625,
"learning_rate": 1.5597918843422132e-06,
"loss": 0.4612,
"num_input_tokens_seen": 50331624,
"step": 768
},
{
"epoch": 0.8487858719646799,
"grad_norm": 4.867616653442383,
"learning_rate": 1.5518778118519754e-06,
"loss": 0.3481,
"num_input_tokens_seen": 50397160,
"step": 769
},
{
"epoch": 0.8498896247240618,
"grad_norm": 5.606635093688965,
"learning_rate": 1.5440164250848205e-06,
"loss": 0.3699,
"num_input_tokens_seen": 50462696,
"step": 770
},
{
"epoch": 0.8509933774834437,
"grad_norm": 4.597039699554443,
"learning_rate": 1.5362078289470369e-06,
"loss": 0.2831,
"num_input_tokens_seen": 50528232,
"step": 771
},
{
"epoch": 0.8520971302428256,
"grad_norm": 4.647115707397461,
"learning_rate": 1.5284521276404498e-06,
"loss": 0.3296,
"num_input_tokens_seen": 50593768,
"step": 772
},
{
"epoch": 0.8532008830022075,
"grad_norm": 7.4029083251953125,
"learning_rate": 1.520749424661026e-06,
"loss": 0.8206,
"num_input_tokens_seen": 50659304,
"step": 773
},
{
"epoch": 0.8543046357615894,
"grad_norm": 5.793678283691406,
"learning_rate": 1.513099822797498e-06,
"loss": 0.5007,
"num_input_tokens_seen": 50724840,
"step": 774
},
{
"epoch": 0.8554083885209713,
"grad_norm": 5.196356296539307,
"learning_rate": 1.5055034241299933e-06,
"loss": 0.3584,
"num_input_tokens_seen": 50790376,
"step": 775
},
{
"epoch": 0.8565121412803532,
"grad_norm": 3.680781841278076,
"learning_rate": 1.4979603300286655e-06,
"loss": 0.2379,
"num_input_tokens_seen": 50855912,
"step": 776
},
{
"epoch": 0.8576158940397351,
"grad_norm": 5.659000873565674,
"learning_rate": 1.490470641152345e-06,
"loss": 0.4034,
"num_input_tokens_seen": 50921448,
"step": 777
},
{
"epoch": 0.8587196467991169,
"grad_norm": 4.814398288726807,
"learning_rate": 1.4830344574471986e-06,
"loss": 0.3384,
"num_input_tokens_seen": 50986984,
"step": 778
},
{
"epoch": 0.8598233995584988,
"grad_norm": 3.8748183250427246,
"learning_rate": 1.475651878145391e-06,
"loss": 0.2315,
"num_input_tokens_seen": 51052520,
"step": 779
},
{
"epoch": 0.8609271523178808,
"grad_norm": 4.726593017578125,
"learning_rate": 1.4683230017637653e-06,
"loss": 0.39,
"num_input_tokens_seen": 51118056,
"step": 780
},
{
"epoch": 0.8620309050772627,
"grad_norm": 4.92621374130249,
"learning_rate": 1.4610479261025247e-06,
"loss": 0.3703,
"num_input_tokens_seen": 51183592,
"step": 781
},
{
"epoch": 0.8631346578366446,
"grad_norm": 2.963569164276123,
"learning_rate": 1.4538267482439264e-06,
"loss": 0.1459,
"num_input_tokens_seen": 51249128,
"step": 782
},
{
"epoch": 0.8642384105960265,
"grad_norm": 4.367040634155273,
"learning_rate": 1.4466595645509917e-06,
"loss": 0.2845,
"num_input_tokens_seen": 51314664,
"step": 783
},
{
"epoch": 0.8653421633554084,
"grad_norm": 6.132776260375977,
"learning_rate": 1.4395464706662155e-06,
"loss": 0.3871,
"num_input_tokens_seen": 51380200,
"step": 784
},
{
"epoch": 0.8664459161147903,
"grad_norm": 4.9763593673706055,
"learning_rate": 1.4324875615102896e-06,
"loss": 0.2819,
"num_input_tokens_seen": 51445736,
"step": 785
},
{
"epoch": 0.8675496688741722,
"grad_norm": 4.710178375244141,
"learning_rate": 1.4254829312808405e-06,
"loss": 0.2911,
"num_input_tokens_seen": 51511272,
"step": 786
},
{
"epoch": 0.8686534216335541,
"grad_norm": 5.016818523406982,
"learning_rate": 1.4185326734511667e-06,
"loss": 0.3404,
"num_input_tokens_seen": 51576808,
"step": 787
},
{
"epoch": 0.869757174392936,
"grad_norm": 4.105848789215088,
"learning_rate": 1.4116368807689968e-06,
"loss": 0.255,
"num_input_tokens_seen": 51642344,
"step": 788
},
{
"epoch": 0.8708609271523179,
"grad_norm": 3.7043731212615967,
"learning_rate": 1.4047956452552458e-06,
"loss": 0.2043,
"num_input_tokens_seen": 51707880,
"step": 789
},
{
"epoch": 0.8719646799116998,
"grad_norm": 4.634637832641602,
"learning_rate": 1.3980090582027943e-06,
"loss": 0.302,
"num_input_tokens_seen": 51773416,
"step": 790
},
{
"epoch": 0.8730684326710817,
"grad_norm": 5.685799598693848,
"learning_rate": 1.3912772101752628e-06,
"loss": 0.3963,
"num_input_tokens_seen": 51838952,
"step": 791
},
{
"epoch": 0.8741721854304636,
"grad_norm": 6.1849212646484375,
"learning_rate": 1.384600191005809e-06,
"loss": 0.4956,
"num_input_tokens_seen": 51904488,
"step": 792
},
{
"epoch": 0.8752759381898455,
"grad_norm": 4.928698539733887,
"learning_rate": 1.3779780897959266e-06,
"loss": 0.3066,
"num_input_tokens_seen": 51970024,
"step": 793
},
{
"epoch": 0.8763796909492274,
"grad_norm": 4.356123447418213,
"learning_rate": 1.3714109949142568e-06,
"loss": 0.2387,
"num_input_tokens_seen": 52035560,
"step": 794
},
{
"epoch": 0.8774834437086093,
"grad_norm": 3.3618569374084473,
"learning_rate": 1.3648989939954065e-06,
"loss": 0.1518,
"num_input_tokens_seen": 52101096,
"step": 795
},
{
"epoch": 0.8785871964679912,
"grad_norm": 5.220973968505859,
"learning_rate": 1.3584421739387831e-06,
"loss": 0.3637,
"num_input_tokens_seen": 52166632,
"step": 796
},
{
"epoch": 0.8796909492273731,
"grad_norm": 3.5688204765319824,
"learning_rate": 1.3520406209074333e-06,
"loss": 0.1974,
"num_input_tokens_seen": 52232168,
"step": 797
},
{
"epoch": 0.8807947019867549,
"grad_norm": 5.112353324890137,
"learning_rate": 1.3456944203268918e-06,
"loss": 0.3774,
"num_input_tokens_seen": 52297704,
"step": 798
},
{
"epoch": 0.8818984547461368,
"grad_norm": 2.9946765899658203,
"learning_rate": 1.3394036568840423e-06,
"loss": 0.1356,
"num_input_tokens_seen": 52363240,
"step": 799
},
{
"epoch": 0.8830022075055187,
"grad_norm": 6.761628150939941,
"learning_rate": 1.3331684145259897e-06,
"loss": 0.5675,
"num_input_tokens_seen": 52428776,
"step": 800
},
{
"epoch": 0.8841059602649006,
"grad_norm": 3.472043991088867,
"learning_rate": 1.3269887764589338e-06,
"loss": 0.1852,
"num_input_tokens_seen": 52494312,
"step": 801
},
{
"epoch": 0.8852097130242825,
"grad_norm": 4.0303497314453125,
"learning_rate": 1.3208648251470662e-06,
"loss": 0.2197,
"num_input_tokens_seen": 52559848,
"step": 802
},
{
"epoch": 0.8863134657836644,
"grad_norm": 6.038300514221191,
"learning_rate": 1.314796642311465e-06,
"loss": 0.5266,
"num_input_tokens_seen": 52625384,
"step": 803
},
{
"epoch": 0.8874172185430463,
"grad_norm": 4.095331192016602,
"learning_rate": 1.3087843089290072e-06,
"loss": 0.2284,
"num_input_tokens_seen": 52690920,
"step": 804
},
{
"epoch": 0.8885209713024282,
"grad_norm": 3.6835544109344482,
"learning_rate": 1.3028279052312836e-06,
"loss": 0.1985,
"num_input_tokens_seen": 52756456,
"step": 805
},
{
"epoch": 0.8896247240618101,
"grad_norm": 6.688685417175293,
"learning_rate": 1.2969275107035344e-06,
"loss": 0.5688,
"num_input_tokens_seen": 52821992,
"step": 806
},
{
"epoch": 0.890728476821192,
"grad_norm": 4.240151882171631,
"learning_rate": 1.291083204083584e-06,
"loss": 0.2624,
"num_input_tokens_seen": 52887528,
"step": 807
},
{
"epoch": 0.891832229580574,
"grad_norm": 4.761137008666992,
"learning_rate": 1.2852950633607922e-06,
"loss": 0.2818,
"num_input_tokens_seen": 52953064,
"step": 808
},
{
"epoch": 0.8929359823399559,
"grad_norm": 5.820546627044678,
"learning_rate": 1.2795631657750113e-06,
"loss": 0.4492,
"num_input_tokens_seen": 53018600,
"step": 809
},
{
"epoch": 0.8940397350993378,
"grad_norm": 3.614879846572876,
"learning_rate": 1.2738875878155593e-06,
"loss": 0.1956,
"num_input_tokens_seen": 53084136,
"step": 810
},
{
"epoch": 0.8951434878587197,
"grad_norm": 6.228170394897461,
"learning_rate": 1.268268405220195e-06,
"loss": 0.5272,
"num_input_tokens_seen": 53149672,
"step": 811
},
{
"epoch": 0.8962472406181016,
"grad_norm": 5.280664920806885,
"learning_rate": 1.2627056929741096e-06,
"loss": 0.3557,
"num_input_tokens_seen": 53215208,
"step": 812
},
{
"epoch": 0.8973509933774835,
"grad_norm": 4.466437816619873,
"learning_rate": 1.257199525308927e-06,
"loss": 0.2463,
"num_input_tokens_seen": 53280744,
"step": 813
},
{
"epoch": 0.8984547461368654,
"grad_norm": 4.941433429718018,
"learning_rate": 1.2517499757017098e-06,
"loss": 0.342,
"num_input_tokens_seen": 53346280,
"step": 814
},
{
"epoch": 0.8995584988962473,
"grad_norm": 3.8996658325195312,
"learning_rate": 1.2463571168739825e-06,
"loss": 0.2186,
"num_input_tokens_seen": 53411816,
"step": 815
},
{
"epoch": 0.9006622516556292,
"grad_norm": 5.6412811279296875,
"learning_rate": 1.2410210207907579e-06,
"loss": 0.4572,
"num_input_tokens_seen": 53477352,
"step": 816
},
{
"epoch": 0.9017660044150111,
"grad_norm": 4.444023132324219,
"learning_rate": 1.2357417586595803e-06,
"loss": 0.2967,
"num_input_tokens_seen": 53542888,
"step": 817
},
{
"epoch": 0.9028697571743929,
"grad_norm": 5.247828483581543,
"learning_rate": 1.23051940092957e-06,
"loss": 0.3782,
"num_input_tokens_seen": 53608424,
"step": 818
},
{
"epoch": 0.9039735099337748,
"grad_norm": 4.472630977630615,
"learning_rate": 1.2253540172904894e-06,
"loss": 0.2586,
"num_input_tokens_seen": 53673960,
"step": 819
},
{
"epoch": 0.9050772626931567,
"grad_norm": 6.463611602783203,
"learning_rate": 1.2202456766718092e-06,
"loss": 0.4628,
"num_input_tokens_seen": 53739496,
"step": 820
},
{
"epoch": 0.9061810154525386,
"grad_norm": 4.195343017578125,
"learning_rate": 1.2151944472417888e-06,
"loss": 0.2325,
"num_input_tokens_seen": 53805032,
"step": 821
},
{
"epoch": 0.9072847682119205,
"grad_norm": 4.714290618896484,
"learning_rate": 1.2102003964065693e-06,
"loss": 0.351,
"num_input_tokens_seen": 53870568,
"step": 822
},
{
"epoch": 0.9083885209713024,
"grad_norm": 5.385950565338135,
"learning_rate": 1.205263590809268e-06,
"loss": 0.3268,
"num_input_tokens_seen": 53936104,
"step": 823
},
{
"epoch": 0.9094922737306843,
"grad_norm": 5.244440078735352,
"learning_rate": 1.200384096329096e-06,
"loss": 0.4413,
"num_input_tokens_seen": 54001640,
"step": 824
},
{
"epoch": 0.9105960264900662,
"grad_norm": 6.079404354095459,
"learning_rate": 1.1955619780804757e-06,
"loss": 0.4869,
"num_input_tokens_seen": 54067176,
"step": 825
},
{
"epoch": 0.9116997792494481,
"grad_norm": 4.891379356384277,
"learning_rate": 1.190797300412174e-06,
"loss": 0.3709,
"num_input_tokens_seen": 54132712,
"step": 826
},
{
"epoch": 0.91280353200883,
"grad_norm": 4.037010192871094,
"learning_rate": 1.1860901269064366e-06,
"loss": 0.2608,
"num_input_tokens_seen": 54198248,
"step": 827
},
{
"epoch": 0.9139072847682119,
"grad_norm": 3.740389347076416,
"learning_rate": 1.1814405203781503e-06,
"loss": 0.1963,
"num_input_tokens_seen": 54263784,
"step": 828
},
{
"epoch": 0.9150110375275938,
"grad_norm": 4.0328779220581055,
"learning_rate": 1.1768485428739963e-06,
"loss": 0.2375,
"num_input_tokens_seen": 54329320,
"step": 829
},
{
"epoch": 0.9161147902869757,
"grad_norm": 6.169814109802246,
"learning_rate": 1.1723142556716265e-06,
"loss": 0.5289,
"num_input_tokens_seen": 54394856,
"step": 830
},
{
"epoch": 0.9172185430463576,
"grad_norm": 4.388846397399902,
"learning_rate": 1.167837719278844e-06,
"loss": 0.2342,
"num_input_tokens_seen": 54460392,
"step": 831
},
{
"epoch": 0.9183222958057395,
"grad_norm": 4.988517761230469,
"learning_rate": 1.1634189934327954e-06,
"loss": 0.3133,
"num_input_tokens_seen": 54525928,
"step": 832
},
{
"epoch": 0.9194260485651214,
"grad_norm": 5.121459484100342,
"learning_rate": 1.1590581370991758e-06,
"loss": 0.3411,
"num_input_tokens_seen": 54591464,
"step": 833
},
{
"epoch": 0.9205298013245033,
"grad_norm": 4.2386345863342285,
"learning_rate": 1.1547552084714394e-06,
"loss": 0.2751,
"num_input_tokens_seen": 54657000,
"step": 834
},
{
"epoch": 0.9216335540838853,
"grad_norm": 3.978631019592285,
"learning_rate": 1.1505102649700243e-06,
"loss": 0.2063,
"num_input_tokens_seen": 54722536,
"step": 835
},
{
"epoch": 0.9227373068432672,
"grad_norm": 5.66132926940918,
"learning_rate": 1.1463233632415866e-06,
"loss": 0.4821,
"num_input_tokens_seen": 54788072,
"step": 836
},
{
"epoch": 0.9238410596026491,
"grad_norm": 5.7866692543029785,
"learning_rate": 1.1421945591582428e-06,
"loss": 0.4443,
"num_input_tokens_seen": 54853608,
"step": 837
},
{
"epoch": 0.9249448123620309,
"grad_norm": 5.27299690246582,
"learning_rate": 1.1381239078168262e-06,
"loss": 0.4269,
"num_input_tokens_seen": 54919144,
"step": 838
},
{
"epoch": 0.9260485651214128,
"grad_norm": 4.533444404602051,
"learning_rate": 1.1341114635381506e-06,
"loss": 0.3032,
"num_input_tokens_seen": 54984680,
"step": 839
},
{
"epoch": 0.9271523178807947,
"grad_norm": 7.473748207092285,
"learning_rate": 1.1301572798662849e-06,
"loss": 0.6928,
"num_input_tokens_seen": 55050216,
"step": 840
},
{
"epoch": 0.9282560706401766,
"grad_norm": 4.378798007965088,
"learning_rate": 1.1262614095678395e-06,
"loss": 0.303,
"num_input_tokens_seen": 55115752,
"step": 841
},
{
"epoch": 0.9293598233995585,
"grad_norm": 5.152125835418701,
"learning_rate": 1.1224239046312627e-06,
"loss": 0.5211,
"num_input_tokens_seen": 55181288,
"step": 842
},
{
"epoch": 0.9304635761589404,
"grad_norm": 4.320618152618408,
"learning_rate": 1.1186448162661444e-06,
"loss": 0.2154,
"num_input_tokens_seen": 55246824,
"step": 843
},
{
"epoch": 0.9315673289183223,
"grad_norm": 5.254551887512207,
"learning_rate": 1.1149241949025365e-06,
"loss": 0.2749,
"num_input_tokens_seen": 55312360,
"step": 844
},
{
"epoch": 0.9326710816777042,
"grad_norm": 5.376104354858398,
"learning_rate": 1.1112620901902775e-06,
"loss": 0.4807,
"num_input_tokens_seen": 55377896,
"step": 845
},
{
"epoch": 0.9337748344370861,
"grad_norm": 3.8265469074249268,
"learning_rate": 1.1076585509983285e-06,
"loss": 0.2671,
"num_input_tokens_seen": 55443432,
"step": 846
},
{
"epoch": 0.934878587196468,
"grad_norm": 4.158047199249268,
"learning_rate": 1.104113625414124e-06,
"loss": 0.2879,
"num_input_tokens_seen": 55508968,
"step": 847
},
{
"epoch": 0.9359823399558499,
"grad_norm": 5.733949661254883,
"learning_rate": 1.1006273607429305e-06,
"loss": 0.4776,
"num_input_tokens_seen": 55574504,
"step": 848
},
{
"epoch": 0.9370860927152318,
"grad_norm": 5.860391616821289,
"learning_rate": 1.0971998035072123e-06,
"loss": 0.4113,
"num_input_tokens_seen": 55640040,
"step": 849
},
{
"epoch": 0.9381898454746137,
"grad_norm": 4.877603530883789,
"learning_rate": 1.0938309994460127e-06,
"loss": 0.3053,
"num_input_tokens_seen": 55705576,
"step": 850
},
{
"epoch": 0.9392935982339956,
"grad_norm": 4.361584663391113,
"learning_rate": 1.090520993514343e-06,
"loss": 0.2688,
"num_input_tokens_seen": 55771112,
"step": 851
},
{
"epoch": 0.9403973509933775,
"grad_norm": 4.158669471740723,
"learning_rate": 1.0872698298825822e-06,
"loss": 0.2407,
"num_input_tokens_seen": 55836648,
"step": 852
},
{
"epoch": 0.9415011037527594,
"grad_norm": 3.3524842262268066,
"learning_rate": 1.08407755193589e-06,
"loss": 0.1298,
"num_input_tokens_seen": 55902184,
"step": 853
},
{
"epoch": 0.9426048565121413,
"grad_norm": 7.272517204284668,
"learning_rate": 1.0809442022736238e-06,
"loss": 0.5958,
"num_input_tokens_seen": 55967720,
"step": 854
},
{
"epoch": 0.9437086092715232,
"grad_norm": 4.141610622406006,
"learning_rate": 1.0778698227087736e-06,
"loss": 0.2352,
"num_input_tokens_seen": 56033256,
"step": 855
},
{
"epoch": 0.9448123620309051,
"grad_norm": 6.14682674407959,
"learning_rate": 1.0748544542674028e-06,
"loss": 0.5763,
"num_input_tokens_seen": 56098792,
"step": 856
},
{
"epoch": 0.9459161147902869,
"grad_norm": 4.349286079406738,
"learning_rate": 1.0718981371881004e-06,
"loss": 0.2543,
"num_input_tokens_seen": 56164328,
"step": 857
},
{
"epoch": 0.9470198675496688,
"grad_norm": 5.373385906219482,
"learning_rate": 1.0690009109214443e-06,
"loss": 0.3312,
"num_input_tokens_seen": 56229864,
"step": 858
},
{
"epoch": 0.9481236203090507,
"grad_norm": 5.748344421386719,
"learning_rate": 1.0661628141294758e-06,
"loss": 0.4741,
"num_input_tokens_seen": 56295400,
"step": 859
},
{
"epoch": 0.9492273730684326,
"grad_norm": 5.545693397521973,
"learning_rate": 1.0633838846851817e-06,
"loss": 0.4531,
"num_input_tokens_seen": 56360936,
"step": 860
},
{
"epoch": 0.9503311258278145,
"grad_norm": 5.274410247802734,
"learning_rate": 1.0606641596719908e-06,
"loss": 0.3835,
"num_input_tokens_seen": 56426472,
"step": 861
},
{
"epoch": 0.9514348785871964,
"grad_norm": 4.2041826248168945,
"learning_rate": 1.0580036753832781e-06,
"loss": 0.2417,
"num_input_tokens_seen": 56492008,
"step": 862
},
{
"epoch": 0.9525386313465783,
"grad_norm": 5.932213306427002,
"learning_rate": 1.0554024673218808e-06,
"loss": 0.4734,
"num_input_tokens_seen": 56557544,
"step": 863
},
{
"epoch": 0.9536423841059603,
"grad_norm": 5.501502990722656,
"learning_rate": 1.0528605701996232e-06,
"loss": 0.5387,
"num_input_tokens_seen": 56623080,
"step": 864
},
{
"epoch": 0.9547461368653422,
"grad_norm": 4.631185054779053,
"learning_rate": 1.0503780179368569e-06,
"loss": 0.3111,
"num_input_tokens_seen": 56688616,
"step": 865
},
{
"epoch": 0.9558498896247241,
"grad_norm": 5.278607368469238,
"learning_rate": 1.047954843662004e-06,
"loss": 0.4427,
"num_input_tokens_seen": 56754152,
"step": 866
},
{
"epoch": 0.956953642384106,
"grad_norm": 3.5461151599884033,
"learning_rate": 1.0455910797111182e-06,
"loss": 0.1917,
"num_input_tokens_seen": 56819688,
"step": 867
},
{
"epoch": 0.9580573951434879,
"grad_norm": 3.519580125808716,
"learning_rate": 1.043286757627451e-06,
"loss": 0.1873,
"num_input_tokens_seen": 56885224,
"step": 868
},
{
"epoch": 0.9591611479028698,
"grad_norm": 6.359206199645996,
"learning_rate": 1.0410419081610324e-06,
"loss": 0.5259,
"num_input_tokens_seen": 56950760,
"step": 869
},
{
"epoch": 0.9602649006622517,
"grad_norm": 5.1878557205200195,
"learning_rate": 1.0388565612682591e-06,
"loss": 0.3572,
"num_input_tokens_seen": 57016296,
"step": 870
},
{
"epoch": 0.9613686534216336,
"grad_norm": 5.292546272277832,
"learning_rate": 1.0367307461114976e-06,
"loss": 0.2851,
"num_input_tokens_seen": 57081832,
"step": 871
},
{
"epoch": 0.9624724061810155,
"grad_norm": 4.41323709487915,
"learning_rate": 1.0346644910586912e-06,
"loss": 0.2781,
"num_input_tokens_seen": 57147368,
"step": 872
},
{
"epoch": 0.9635761589403974,
"grad_norm": 5.156498908996582,
"learning_rate": 1.0326578236829837e-06,
"loss": 0.378,
"num_input_tokens_seen": 57212904,
"step": 873
},
{
"epoch": 0.9646799116997793,
"grad_norm": 4.593717575073242,
"learning_rate": 1.0307107707623509e-06,
"loss": 0.2814,
"num_input_tokens_seen": 57278440,
"step": 874
},
{
"epoch": 0.9657836644591612,
"grad_norm": 4.606041431427002,
"learning_rate": 1.0288233582792448e-06,
"loss": 0.3716,
"num_input_tokens_seen": 57343976,
"step": 875
},
{
"epoch": 0.9668874172185431,
"grad_norm": 4.7908124923706055,
"learning_rate": 1.0269956114202435e-06,
"loss": 0.3239,
"num_input_tokens_seen": 57409512,
"step": 876
},
{
"epoch": 0.9679911699779249,
"grad_norm": 5.706357002258301,
"learning_rate": 1.0252275545757185e-06,
"loss": 0.4617,
"num_input_tokens_seen": 57475048,
"step": 877
},
{
"epoch": 0.9690949227373068,
"grad_norm": 6.367629051208496,
"learning_rate": 1.0235192113395068e-06,
"loss": 0.5561,
"num_input_tokens_seen": 57540584,
"step": 878
},
{
"epoch": 0.9701986754966887,
"grad_norm": 4.593899250030518,
"learning_rate": 1.0218706045085982e-06,
"loss": 0.2952,
"num_input_tokens_seen": 57606120,
"step": 879
},
{
"epoch": 0.9713024282560706,
"grad_norm": 6.838583946228027,
"learning_rate": 1.0202817560828287e-06,
"loss": 0.3793,
"num_input_tokens_seen": 57671656,
"step": 880
},
{
"epoch": 0.9724061810154525,
"grad_norm": 3.928147315979004,
"learning_rate": 1.0187526872645888e-06,
"loss": 0.1679,
"num_input_tokens_seen": 57737192,
"step": 881
},
{
"epoch": 0.9735099337748344,
"grad_norm": 4.4882330894470215,
"learning_rate": 1.0172834184585406e-06,
"loss": 0.2663,
"num_input_tokens_seen": 57802728,
"step": 882
},
{
"epoch": 0.9746136865342163,
"grad_norm": 3.8216326236724854,
"learning_rate": 1.0158739692713428e-06,
"loss": 0.1723,
"num_input_tokens_seen": 57868264,
"step": 883
},
{
"epoch": 0.9757174392935982,
"grad_norm": 4.116147041320801,
"learning_rate": 1.0145243585113936e-06,
"loss": 0.1924,
"num_input_tokens_seen": 57933800,
"step": 884
},
{
"epoch": 0.9768211920529801,
"grad_norm": 4.032740592956543,
"learning_rate": 1.0132346041885756e-06,
"loss": 0.2267,
"num_input_tokens_seen": 57999336,
"step": 885
},
{
"epoch": 0.977924944812362,
"grad_norm": 6.308236122131348,
"learning_rate": 1.0120047235140178e-06,
"loss": 0.4259,
"num_input_tokens_seen": 58064872,
"step": 886
},
{
"epoch": 0.9790286975717439,
"grad_norm": 6.122150421142578,
"learning_rate": 1.0108347328998642e-06,
"loss": 0.4391,
"num_input_tokens_seen": 58130408,
"step": 887
},
{
"epoch": 0.9801324503311258,
"grad_norm": 3.8909456729888916,
"learning_rate": 1.0097246479590569e-06,
"loss": 0.2207,
"num_input_tokens_seen": 58195944,
"step": 888
},
{
"epoch": 0.9812362030905077,
"grad_norm": 4.869685173034668,
"learning_rate": 1.008674483505126e-06,
"loss": 0.3534,
"num_input_tokens_seen": 58261480,
"step": 889
},
{
"epoch": 0.9823399558498896,
"grad_norm": 5.161973476409912,
"learning_rate": 1.0076842535519936e-06,
"loss": 0.3576,
"num_input_tokens_seen": 58327016,
"step": 890
},
{
"epoch": 0.9834437086092715,
"grad_norm": 6.185498237609863,
"learning_rate": 1.0067539713137842e-06,
"loss": 0.5,
"num_input_tokens_seen": 58392552,
"step": 891
},
{
"epoch": 0.9845474613686535,
"grad_norm": 4.712432384490967,
"learning_rate": 1.0058836492046506e-06,
"loss": 0.2778,
"num_input_tokens_seen": 58458088,
"step": 892
},
{
"epoch": 0.9856512141280354,
"grad_norm": 4.5431389808654785,
"learning_rate": 1.0050732988386082e-06,
"loss": 0.2725,
"num_input_tokens_seen": 58523624,
"step": 893
},
{
"epoch": 0.9867549668874173,
"grad_norm": 5.538437843322754,
"learning_rate": 1.0043229310293782e-06,
"loss": 0.3557,
"num_input_tokens_seen": 58589160,
"step": 894
},
{
"epoch": 0.9878587196467992,
"grad_norm": 5.204857349395752,
"learning_rate": 1.0036325557902454e-06,
"loss": 0.3684,
"num_input_tokens_seen": 58654696,
"step": 895
},
{
"epoch": 0.9889624724061811,
"grad_norm": 3.494764804840088,
"learning_rate": 1.0030021823339229e-06,
"loss": 0.1827,
"num_input_tokens_seen": 58720232,
"step": 896
},
{
"epoch": 0.9900662251655629,
"grad_norm": 5.376889705657959,
"learning_rate": 1.0024318190724313e-06,
"loss": 0.429,
"num_input_tokens_seen": 58785768,
"step": 897
},
{
"epoch": 0.9911699779249448,
"grad_norm": 4.6836838722229,
"learning_rate": 1.0019214736169832e-06,
"loss": 0.326,
"num_input_tokens_seen": 58851304,
"step": 898
},
{
"epoch": 0.9922737306843267,
"grad_norm": 4.479922771453857,
"learning_rate": 1.0014711527778844e-06,
"loss": 0.2861,
"num_input_tokens_seen": 58916840,
"step": 899
},
{
"epoch": 0.9933774834437086,
"grad_norm": 4.647156715393066,
"learning_rate": 1.0010808625644427e-06,
"loss": 0.3314,
"num_input_tokens_seen": 58982376,
"step": 900
},
{
"epoch": 0.9944812362030905,
"grad_norm": 6.767654895782471,
"learning_rate": 1.000750608184886e-06,
"loss": 0.5011,
"num_input_tokens_seen": 59047912,
"step": 901
},
{
"epoch": 0.9955849889624724,
"grad_norm": 4.77541446685791,
"learning_rate": 1.0004803940462948e-06,
"loss": 0.2887,
"num_input_tokens_seen": 59113448,
"step": 902
},
{
"epoch": 0.9966887417218543,
"grad_norm": 4.733397483825684,
"learning_rate": 1.0002702237545419e-06,
"loss": 0.2941,
"num_input_tokens_seen": 59178984,
"step": 903
},
{
"epoch": 0.9977924944812362,
"grad_norm": 4.308443069458008,
"learning_rate": 1.0001201001142449e-06,
"loss": 0.2109,
"num_input_tokens_seen": 59244520,
"step": 904
},
{
"epoch": 0.9988962472406181,
"grad_norm": 5.600368022918701,
"learning_rate": 1.000030025128729e-06,
"loss": 0.3868,
"num_input_tokens_seen": 59310056,
"step": 905
},
{
"epoch": 1.0,
"grad_norm": 5.8607497215271,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.4341,
"num_input_tokens_seen": 59375592,
"step": 906
},
{
"epoch": 1.0,
"num_input_tokens_seen": 59375592,
"step": 906,
"total_flos": 6.684140179701105e+17,
"train_loss": 0.4684678308715094,
"train_runtime": 15557.945,
"train_samples_per_second": 0.233,
"train_steps_per_second": 0.058
}
],
"logging_steps": 1.0,
"max_steps": 906,
"num_input_tokens_seen": 59375592,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.684140179701105e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}