8b-zuckqa-TTTTS / trainer_state.json
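For reference, this is the standard trainer_state.json written by the Hugging Face Transformers Trainer: a few top-level fields (epoch, global_step, eval_steps, ...) plus a log_history array with one record per logging step (step, loss, learning_rate, grad_norm). A minimal sketch for reading it back with the standard library only; the local filename and the printed summary are illustrative assumptions, not part of the original file:

```python
import json

# Assumption: the JSON below has been saved locally as "trainer_state.json".
with open("trainer_state.json") as f:
    state = json.load(f)

# Each log_history entry is one logging step; keep only records that carry a loss.
records = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in records]
losses = [e["loss"] for e in records]
lrs = [e["learning_rate"] for e in records]

print(f"global_step={state['global_step']}, logged points={len(records)}")
print(f"first loss={losses[0]}, last loss={losses[-1]}, last lr={lrs[-1]}")
```
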
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1250,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0008,
"grad_norm": 5.9686970710754395,
"learning_rate": 4.9999921043206356e-06,
"loss": 6.1536,
"step": 1
},
{
"epoch": 0.0016,
"grad_norm": 5.313859462738037,
"learning_rate": 4.999968417332415e-06,
"loss": 1.8192,
"step": 2
},
{
"epoch": 0.0024,
"grad_norm": 3.8611130714416504,
"learning_rate": 4.999928939184958e-06,
"loss": 5.7147,
"step": 3
},
{
"epoch": 0.0032,
"grad_norm": 8.215139389038086,
"learning_rate": 4.99987367012763e-06,
"loss": 1.9633,
"step": 4
},
{
"epoch": 0.004,
"grad_norm": 2.859307050704956,
"learning_rate": 4.999802610509541e-06,
"loss": 5.4413,
"step": 5
},
{
"epoch": 0.0048,
"grad_norm": 10.999748229980469,
"learning_rate": 4.999715760779541e-06,
"loss": 1.9931,
"step": 6
},
{
"epoch": 0.0056,
"grad_norm": 2.5857369899749756,
"learning_rate": 4.999613121486222e-06,
"loss": 5.2138,
"step": 7
},
{
"epoch": 0.0064,
"grad_norm": 4.739017009735107,
"learning_rate": 4.9994946932779076e-06,
"loss": 1.5203,
"step": 8
},
{
"epoch": 0.0072,
"grad_norm": 2.03410267829895,
"learning_rate": 4.999360476902656e-06,
"loss": 5.1349,
"step": 9
},
{
"epoch": 0.008,
"grad_norm": 4.154623508453369,
"learning_rate": 4.99921047320825e-06,
"loss": 1.6121,
"step": 10
},
{
"epoch": 0.0088,
"grad_norm": 1.8263472318649292,
"learning_rate": 4.999044683142196e-06,
"loss": 4.9737,
"step": 11
},
{
"epoch": 0.0096,
"grad_norm": 4.39143705368042,
"learning_rate": 4.998863107751711e-06,
"loss": 1.4866,
"step": 12
},
{
"epoch": 0.0104,
"grad_norm": 1.6841758489608765,
"learning_rate": 4.998665748183727e-06,
"loss": 5.0078,
"step": 13
},
{
"epoch": 0.0112,
"grad_norm": 4.099013805389404,
"learning_rate": 4.998452605684874e-06,
"loss": 1.6304,
"step": 14
},
{
"epoch": 0.012,
"grad_norm": 1.6769129037857056,
"learning_rate": 4.9982236816014735e-06,
"loss": 4.8359,
"step": 15
},
{
"epoch": 0.0128,
"grad_norm": 3.6601269245147705,
"learning_rate": 4.9979789773795365e-06,
"loss": 1.6408,
"step": 16
},
{
"epoch": 0.0136,
"grad_norm": 1.6234138011932373,
"learning_rate": 4.997718494564747e-06,
"loss": 4.9268,
"step": 17
},
{
"epoch": 0.0144,
"grad_norm": 4.540709018707275,
"learning_rate": 4.9974422348024565e-06,
"loss": 1.4653,
"step": 18
},
{
"epoch": 0.0152,
"grad_norm": 2.201477527618408,
"learning_rate": 4.997150199837671e-06,
"loss": 4.8255,
"step": 19
},
{
"epoch": 0.016,
"grad_norm": 3.3589704036712646,
"learning_rate": 4.996842391515045e-06,
"loss": 1.3599,
"step": 20
},
{
"epoch": 0.0168,
"grad_norm": 1.7828714847564697,
"learning_rate": 4.996518811778858e-06,
"loss": 4.7924,
"step": 21
},
{
"epoch": 0.0176,
"grad_norm": 4.722200870513916,
"learning_rate": 4.99617946267302e-06,
"loss": 1.8165,
"step": 22
},
{
"epoch": 0.0184,
"grad_norm": 1.5609182119369507,
"learning_rate": 4.995824346341041e-06,
"loss": 4.8322,
"step": 23
},
{
"epoch": 0.0192,
"grad_norm": 3.8967134952545166,
"learning_rate": 4.995453465026033e-06,
"loss": 1.49,
"step": 24
},
{
"epoch": 0.02,
"grad_norm": 2.199491024017334,
"learning_rate": 4.9950668210706795e-06,
"loss": 4.6516,
"step": 25
},
{
"epoch": 0.0208,
"grad_norm": 4.164550304412842,
"learning_rate": 4.994664416917236e-06,
"loss": 1.3359,
"step": 26
},
{
"epoch": 0.0216,
"grad_norm": 1.9813035726547241,
"learning_rate": 4.994246255107506e-06,
"loss": 4.6697,
"step": 27
},
{
"epoch": 0.0224,
"grad_norm": 5.564512729644775,
"learning_rate": 4.993812338282826e-06,
"loss": 1.6348,
"step": 28
},
{
"epoch": 0.0232,
"grad_norm": 2.7316086292266846,
"learning_rate": 4.993362669184051e-06,
"loss": 4.4999,
"step": 29
},
{
"epoch": 0.024,
"grad_norm": 4.501605987548828,
"learning_rate": 4.992897250651535e-06,
"loss": 1.4643,
"step": 30
},
{
"epoch": 0.0248,
"grad_norm": 1.8927552700042725,
"learning_rate": 4.992416085625115e-06,
"loss": 4.7085,
"step": 31
},
{
"epoch": 0.0256,
"grad_norm": 4.785287380218506,
"learning_rate": 4.9919191771440905e-06,
"loss": 1.3398,
"step": 32
},
{
"epoch": 0.0264,
"grad_norm": 2.4881515502929688,
"learning_rate": 4.991406528347206e-06,
"loss": 4.5912,
"step": 33
},
{
"epoch": 0.0272,
"grad_norm": 4.189312934875488,
"learning_rate": 4.990878142472628e-06,
"loss": 1.4647,
"step": 34
},
{
"epoch": 0.028,
"grad_norm": 2.654892921447754,
"learning_rate": 4.990334022857932e-06,
"loss": 4.4038,
"step": 35
},
{
"epoch": 0.0288,
"grad_norm": 5.841195583343506,
"learning_rate": 4.989774172940071e-06,
"loss": 1.5347,
"step": 36
},
{
"epoch": 0.0296,
"grad_norm": 3.269841432571411,
"learning_rate": 4.989198596255361e-06,
"loss": 4.4978,
"step": 37
},
{
"epoch": 0.0304,
"grad_norm": 3.6912543773651123,
"learning_rate": 4.988607296439459e-06,
"loss": 1.3615,
"step": 38
},
{
"epoch": 0.0312,
"grad_norm": 3.773468255996704,
"learning_rate": 4.988000277227334e-06,
"loss": 4.4462,
"step": 39
},
{
"epoch": 0.032,
"grad_norm": 4.216678142547607,
"learning_rate": 4.9873775424532515e-06,
"loss": 1.3803,
"step": 40
},
{
"epoch": 0.0328,
"grad_norm": 4.231056213378906,
"learning_rate": 4.98673909605074e-06,
"loss": 4.4349,
"step": 41
},
{
"epoch": 0.0336,
"grad_norm": 4.05332088470459,
"learning_rate": 4.986084942052577e-06,
"loss": 1.3321,
"step": 42
},
{
"epoch": 0.0344,
"grad_norm": 3.9502322673797607,
"learning_rate": 4.985415084590752e-06,
"loss": 4.2693,
"step": 43
},
{
"epoch": 0.0352,
"grad_norm": 8.568007469177246,
"learning_rate": 4.984729527896451e-06,
"loss": 1.6135,
"step": 44
},
{
"epoch": 0.036,
"grad_norm": 4.460508346557617,
"learning_rate": 4.984028276300021e-06,
"loss": 4.4412,
"step": 45
},
{
"epoch": 0.0368,
"grad_norm": 7.591355323791504,
"learning_rate": 4.9833113342309495e-06,
"loss": 1.6569,
"step": 46
},
{
"epoch": 0.0376,
"grad_norm": 5.810396194458008,
"learning_rate": 4.9825787062178315e-06,
"loss": 4.1632,
"step": 47
},
{
"epoch": 0.0384,
"grad_norm": 10.894949913024902,
"learning_rate": 4.9818303968883445e-06,
"loss": 1.6721,
"step": 48
},
{
"epoch": 0.0392,
"grad_norm": 4.217193126678467,
"learning_rate": 4.981066410969215e-06,
"loss": 4.1738,
"step": 49
},
{
"epoch": 0.04,
"grad_norm": 8.75684642791748,
"learning_rate": 4.980286753286196e-06,
"loss": 1.3856,
"step": 50
},
{
"epoch": 0.0408,
"grad_norm": 3.8983495235443115,
"learning_rate": 4.9794914287640264e-06,
"loss": 4.0982,
"step": 51
},
{
"epoch": 0.0416,
"grad_norm": 6.7597527503967285,
"learning_rate": 4.978680442426409e-06,
"loss": 1.4406,
"step": 52
},
{
"epoch": 0.0424,
"grad_norm": 5.493980407714844,
"learning_rate": 4.977853799395976e-06,
"loss": 4.3028,
"step": 53
},
{
"epoch": 0.0432,
"grad_norm": 7.1781487464904785,
"learning_rate": 4.977011504894253e-06,
"loss": 1.4716,
"step": 54
},
{
"epoch": 0.044,
"grad_norm": 4.196126937866211,
"learning_rate": 4.9761535642416284e-06,
"loss": 4.1292,
"step": 55
},
{
"epoch": 0.0448,
"grad_norm": 7.720696926116943,
"learning_rate": 4.975279982857324e-06,
"loss": 1.5968,
"step": 56
},
{
"epoch": 0.0456,
"grad_norm": 1.6588771343231201,
"learning_rate": 4.974390766259353e-06,
"loss": 4.2463,
"step": 57
},
{
"epoch": 0.0464,
"grad_norm": 10.156800270080566,
"learning_rate": 4.973485920064491e-06,
"loss": 1.4834,
"step": 58
},
{
"epoch": 0.0472,
"grad_norm": 1.59371018409729,
"learning_rate": 4.972565449988238e-06,
"loss": 4.0996,
"step": 59
},
{
"epoch": 0.048,
"grad_norm": 8.833647727966309,
"learning_rate": 4.971629361844785e-06,
"loss": 1.6226,
"step": 60
},
{
"epoch": 0.0488,
"grad_norm": 1.8904303312301636,
"learning_rate": 4.970677661546972e-06,
"loss": 4.1373,
"step": 61
},
{
"epoch": 0.0496,
"grad_norm": 7.343002796173096,
"learning_rate": 4.969710355106256e-06,
"loss": 1.5989,
"step": 62
},
{
"epoch": 0.0504,
"grad_norm": 1.5326752662658691,
"learning_rate": 4.968727448632669e-06,
"loss": 4.067,
"step": 63
},
{
"epoch": 0.0512,
"grad_norm": 5.595536708831787,
"learning_rate": 4.967728948334784e-06,
"loss": 1.515,
"step": 64
},
{
"epoch": 0.052,
"grad_norm": 2.240656852722168,
"learning_rate": 4.96671486051967e-06,
"loss": 3.9452,
"step": 65
},
{
"epoch": 0.0528,
"grad_norm": 8.656717300415039,
"learning_rate": 4.965685191592859e-06,
"loss": 1.7592,
"step": 66
},
{
"epoch": 0.0536,
"grad_norm": 1.6276272535324097,
"learning_rate": 4.964639948058297e-06,
"loss": 3.9894,
"step": 67
},
{
"epoch": 0.0544,
"grad_norm": 5.7422075271606445,
"learning_rate": 4.963579136518312e-06,
"loss": 1.5689,
"step": 68
},
{
"epoch": 0.0552,
"grad_norm": 1.9765911102294922,
"learning_rate": 4.962502763673566e-06,
"loss": 4.0761,
"step": 69
},
{
"epoch": 0.056,
"grad_norm": 6.2184224128723145,
"learning_rate": 4.961410836323014e-06,
"loss": 1.5643,
"step": 70
},
{
"epoch": 0.0568,
"grad_norm": 1.7013366222381592,
"learning_rate": 4.960303361363863e-06,
"loss": 3.9535,
"step": 71
},
{
"epoch": 0.0576,
"grad_norm": 5.7151713371276855,
"learning_rate": 4.959180345791528e-06,
"loss": 1.3778,
"step": 72
},
{
"epoch": 0.0584,
"grad_norm": 2.092637777328491,
"learning_rate": 4.958041796699583e-06,
"loss": 4.043,
"step": 73
},
{
"epoch": 0.0592,
"grad_norm": 6.953094482421875,
"learning_rate": 4.956887721279726e-06,
"loss": 1.4149,
"step": 74
},
{
"epoch": 0.06,
"grad_norm": 2.5431764125823975,
"learning_rate": 4.9557181268217225e-06,
"loss": 4.1433,
"step": 75
},
{
"epoch": 0.0608,
"grad_norm": 5.6638665199279785,
"learning_rate": 4.954533020713367e-06,
"loss": 1.3123,
"step": 76
},
{
"epoch": 0.0616,
"grad_norm": 2.033217668533325,
"learning_rate": 4.953332410440434e-06,
"loss": 4.12,
"step": 77
},
{
"epoch": 0.0624,
"grad_norm": 5.832539081573486,
"learning_rate": 4.952116303586631e-06,
"loss": 1.4276,
"step": 78
},
{
"epoch": 0.0632,
"grad_norm": 1.4119787216186523,
"learning_rate": 4.95088470783355e-06,
"loss": 3.9499,
"step": 79
},
{
"epoch": 0.064,
"grad_norm": 5.931257247924805,
"learning_rate": 4.949637630960618e-06,
"loss": 1.5232,
"step": 80
},
{
"epoch": 0.0648,
"grad_norm": 1.5532656908035278,
"learning_rate": 4.94837508084505e-06,
"loss": 3.9162,
"step": 81
},
{
"epoch": 0.0656,
"grad_norm": 5.160223007202148,
"learning_rate": 4.947097065461801e-06,
"loss": 1.7749,
"step": 82
},
{
"epoch": 0.0664,
"grad_norm": 1.274683952331543,
"learning_rate": 4.945803592883509e-06,
"loss": 3.9429,
"step": 83
},
{
"epoch": 0.0672,
"grad_norm": 4.50646448135376,
"learning_rate": 4.94449467128045e-06,
"loss": 1.3428,
"step": 84
},
{
"epoch": 0.068,
"grad_norm": 2.7638394832611084,
"learning_rate": 4.943170308920484e-06,
"loss": 4.0664,
"step": 85
},
{
"epoch": 0.0688,
"grad_norm": 5.305659770965576,
"learning_rate": 4.9418305141690045e-06,
"loss": 1.6382,
"step": 86
},
{
"epoch": 0.0696,
"grad_norm": 1.672782301902771,
"learning_rate": 4.940475295488882e-06,
"loss": 3.9736,
"step": 87
},
{
"epoch": 0.0704,
"grad_norm": 4.357553482055664,
"learning_rate": 4.939104661440415e-06,
"loss": 1.2025,
"step": 88
},
{
"epoch": 0.0712,
"grad_norm": 1.9459145069122314,
"learning_rate": 4.937718620681273e-06,
"loss": 3.8823,
"step": 89
},
{
"epoch": 0.072,
"grad_norm": 4.6320085525512695,
"learning_rate": 4.9363171819664434e-06,
"loss": 1.4891,
"step": 90
},
{
"epoch": 0.0728,
"grad_norm": 1.9804147481918335,
"learning_rate": 4.934900354148173e-06,
"loss": 3.673,
"step": 91
},
{
"epoch": 0.0736,
"grad_norm": 5.650574684143066,
"learning_rate": 4.933468146175918e-06,
"loss": 1.6462,
"step": 92
},
{
"epoch": 0.0744,
"grad_norm": 2.002102851867676,
"learning_rate": 4.9320205670962815e-06,
"loss": 3.9996,
"step": 93
},
{
"epoch": 0.0752,
"grad_norm": 5.602189540863037,
"learning_rate": 4.930557626052961e-06,
"loss": 1.57,
"step": 94
},
{
"epoch": 0.076,
"grad_norm": 1.618115782737732,
"learning_rate": 4.929079332286685e-06,
"loss": 3.9771,
"step": 95
},
{
"epoch": 0.0768,
"grad_norm": 4.976815223693848,
"learning_rate": 4.927585695135162e-06,
"loss": 1.3109,
"step": 96
},
{
"epoch": 0.0776,
"grad_norm": 1.5383416414260864,
"learning_rate": 4.926076724033016e-06,
"loss": 3.943,
"step": 97
},
{
"epoch": 0.0784,
"grad_norm": 5.538623809814453,
"learning_rate": 4.924552428511727e-06,
"loss": 1.5928,
"step": 98
},
{
"epoch": 0.0792,
"grad_norm": 1.1636689901351929,
"learning_rate": 4.923012818199576e-06,
"loss": 3.9089,
"step": 99
},
{
"epoch": 0.08,
"grad_norm": 5.035048484802246,
"learning_rate": 4.921457902821578e-06,
"loss": 1.709,
"step": 100
},
{
"epoch": 0.0808,
"grad_norm": 1.3163026571273804,
"learning_rate": 4.919887692199423e-06,
"loss": 3.9234,
"step": 101
},
{
"epoch": 0.0816,
"grad_norm": 4.93280029296875,
"learning_rate": 4.9183021962514145e-06,
"loss": 1.4215,
"step": 102
},
{
"epoch": 0.0824,
"grad_norm": 2.1531784534454346,
"learning_rate": 4.9167014249924075e-06,
"loss": 3.8196,
"step": 103
},
{
"epoch": 0.0832,
"grad_norm": 4.800553798675537,
"learning_rate": 4.915085388533743e-06,
"loss": 1.573,
"step": 104
},
{
"epoch": 0.084,
"grad_norm": 1.383305311203003,
"learning_rate": 4.913454097083185e-06,
"loss": 3.9708,
"step": 105
},
{
"epoch": 0.0848,
"grad_norm": 4.389811038970947,
"learning_rate": 4.911807560944858e-06,
"loss": 1.3961,
"step": 106
},
{
"epoch": 0.0856,
"grad_norm": 1.5299296379089355,
"learning_rate": 4.910145790519177e-06,
"loss": 3.8796,
"step": 107
},
{
"epoch": 0.0864,
"grad_norm": 5.052987575531006,
"learning_rate": 4.90846879630279e-06,
"loss": 1.3103,
"step": 108
},
{
"epoch": 0.0872,
"grad_norm": 1.417496919631958,
"learning_rate": 4.906776588888502e-06,
"loss": 3.9388,
"step": 109
},
{
"epoch": 0.088,
"grad_norm": 4.012498378753662,
"learning_rate": 4.905069178965215e-06,
"loss": 1.1366,
"step": 110
},
{
"epoch": 0.0888,
"grad_norm": 1.2801809310913086,
"learning_rate": 4.903346577317859e-06,
"loss": 3.872,
"step": 111
},
{
"epoch": 0.0896,
"grad_norm": 5.76353120803833,
"learning_rate": 4.901608794827321e-06,
"loss": 1.5188,
"step": 112
},
{
"epoch": 0.0904,
"grad_norm": 1.5510302782058716,
"learning_rate": 4.89985584247038e-06,
"loss": 3.807,
"step": 113
},
{
"epoch": 0.0912,
"grad_norm": 4.934327125549316,
"learning_rate": 4.898087731319637e-06,
"loss": 1.6052,
"step": 114
},
{
"epoch": 0.092,
"grad_norm": 1.849161982536316,
"learning_rate": 4.89630447254344e-06,
"loss": 3.8367,
"step": 115
},
{
"epoch": 0.0928,
"grad_norm": 5.75076150894165,
"learning_rate": 4.894506077405824e-06,
"loss": 1.6729,
"step": 116
},
{
"epoch": 0.0936,
"grad_norm": 1.3285000324249268,
"learning_rate": 4.892692557266429e-06,
"loss": 3.9178,
"step": 117
},
{
"epoch": 0.0944,
"grad_norm": 5.176731586456299,
"learning_rate": 4.8908639235804324e-06,
"loss": 1.3498,
"step": 118
},
{
"epoch": 0.0952,
"grad_norm": 2.258445978164673,
"learning_rate": 4.88902018789848e-06,
"loss": 3.9289,
"step": 119
},
{
"epoch": 0.096,
"grad_norm": 4.080480575561523,
"learning_rate": 4.887161361866608e-06,
"loss": 1.2727,
"step": 120
},
{
"epoch": 0.0968,
"grad_norm": 1.3605031967163086,
"learning_rate": 4.8852874572261715e-06,
"loss": 3.8425,
"step": 121
},
{
"epoch": 0.0976,
"grad_norm": 4.4306135177612305,
"learning_rate": 4.883398485813772e-06,
"loss": 1.4429,
"step": 122
},
{
"epoch": 0.0984,
"grad_norm": 1.9310946464538574,
"learning_rate": 4.881494459561177e-06,
"loss": 3.7989,
"step": 123
},
{
"epoch": 0.0992,
"grad_norm": 5.516058444976807,
"learning_rate": 4.879575390495254e-06,
"loss": 1.6466,
"step": 124
},
{
"epoch": 0.1,
"grad_norm": 1.665083646774292,
"learning_rate": 4.8776412907378845e-06,
"loss": 3.7725,
"step": 125
},
{
"epoch": 0.1008,
"grad_norm": 5.122972011566162,
"learning_rate": 4.8756921725058935e-06,
"loss": 1.4164,
"step": 126
},
{
"epoch": 0.1016,
"grad_norm": 1.7785176038742065,
"learning_rate": 4.873728048110973e-06,
"loss": 3.8428,
"step": 127
},
{
"epoch": 0.1024,
"grad_norm": 4.19711446762085,
"learning_rate": 4.871748929959598e-06,
"loss": 1.4346,
"step": 128
},
{
"epoch": 0.1032,
"grad_norm": 1.5167326927185059,
"learning_rate": 4.869754830552956e-06,
"loss": 3.7787,
"step": 129
},
{
"epoch": 0.104,
"grad_norm": 4.343649387359619,
"learning_rate": 4.867745762486862e-06,
"loss": 1.4161,
"step": 130
},
{
"epoch": 0.1048,
"grad_norm": 1.7682503461837769,
"learning_rate": 4.86572173845168e-06,
"loss": 3.7656,
"step": 131
},
{
"epoch": 0.1056,
"grad_norm": 5.387735843658447,
"learning_rate": 4.863682771232249e-06,
"loss": 1.5529,
"step": 132
},
{
"epoch": 0.1064,
"grad_norm": 1.6323776245117188,
"learning_rate": 4.861628873707792e-06,
"loss": 3.7581,
"step": 133
},
{
"epoch": 0.1072,
"grad_norm": 4.973332405090332,
"learning_rate": 4.859560058851844e-06,
"loss": 1.3401,
"step": 134
},
{
"epoch": 0.108,
"grad_norm": 2.288790464401245,
"learning_rate": 4.857476339732162e-06,
"loss": 3.5462,
"step": 135
},
{
"epoch": 0.1088,
"grad_norm": 4.954509735107422,
"learning_rate": 4.855377729510648e-06,
"loss": 1.4214,
"step": 136
},
{
"epoch": 0.1096,
"grad_norm": 1.466504693031311,
"learning_rate": 4.8532642414432675e-06,
"loss": 3.7383,
"step": 137
},
{
"epoch": 0.1104,
"grad_norm": 4.507660865783691,
"learning_rate": 4.851135888879958e-06,
"loss": 1.429,
"step": 138
},
{
"epoch": 0.1112,
"grad_norm": 1.4335397481918335,
"learning_rate": 4.8489926852645505e-06,
"loss": 3.8185,
"step": 139
},
{
"epoch": 0.112,
"grad_norm": 5.188979148864746,
"learning_rate": 4.846834644134686e-06,
"loss": 1.288,
"step": 140
},
{
"epoch": 0.1128,
"grad_norm": 1.4267185926437378,
"learning_rate": 4.844661779121723e-06,
"loss": 3.7755,
"step": 141
},
{
"epoch": 0.1136,
"grad_norm": 4.5999555587768555,
"learning_rate": 4.842474103950658e-06,
"loss": 1.4337,
"step": 142
},
{
"epoch": 0.1144,
"grad_norm": 1.5960358381271362,
"learning_rate": 4.8402716324400375e-06,
"loss": 3.8674,
"step": 143
},
{
"epoch": 0.1152,
"grad_norm": 4.50584077835083,
"learning_rate": 4.838054378501868e-06,
"loss": 1.4054,
"step": 144
},
{
"epoch": 0.116,
"grad_norm": 2.3714451789855957,
"learning_rate": 4.8358223561415304e-06,
"loss": 3.6878,
"step": 145
},
{
"epoch": 0.1168,
"grad_norm": 4.409125328063965,
"learning_rate": 4.833575579457691e-06,
"loss": 1.4443,
"step": 146
},
{
"epoch": 0.1176,
"grad_norm": 1.876566767692566,
"learning_rate": 4.831314062642213e-06,
"loss": 3.9204,
"step": 147
},
{
"epoch": 0.1184,
"grad_norm": 4.678242206573486,
"learning_rate": 4.829037819980065e-06,
"loss": 1.3475,
"step": 148
},
{
"epoch": 0.1192,
"grad_norm": 1.5604186058044434,
"learning_rate": 4.8267468658492335e-06,
"loss": 3.8065,
"step": 149
},
{
"epoch": 0.12,
"grad_norm": 4.738994598388672,
"learning_rate": 4.824441214720629e-06,
"loss": 1.2868,
"step": 150
},
{
"epoch": 0.1208,
"grad_norm": 1.2587168216705322,
"learning_rate": 4.822120881157998e-06,
"loss": 3.8178,
"step": 151
},
{
"epoch": 0.1216,
"grad_norm": 4.9535298347473145,
"learning_rate": 4.819785879817827e-06,
"loss": 1.4865,
"step": 152
},
{
"epoch": 0.1224,
"grad_norm": 1.3460506200790405,
"learning_rate": 4.8174362254492555e-06,
"loss": 3.7509,
"step": 153
},
{
"epoch": 0.1232,
"grad_norm": 6.2948832511901855,
"learning_rate": 4.815071932893976e-06,
"loss": 1.6562,
"step": 154
},
{
"epoch": 0.124,
"grad_norm": 1.2623156309127808,
"learning_rate": 4.812693017086145e-06,
"loss": 3.7352,
"step": 155
},
{
"epoch": 0.1248,
"grad_norm": 4.746945858001709,
"learning_rate": 4.810299493052289e-06,
"loss": 1.4701,
"step": 156
},
{
"epoch": 0.1256,
"grad_norm": 1.41659414768219,
"learning_rate": 4.807891375911207e-06,
"loss": 3.7158,
"step": 157
},
{
"epoch": 0.1264,
"grad_norm": 5.151709079742432,
"learning_rate": 4.805468680873874e-06,
"loss": 1.5235,
"step": 158
},
{
"epoch": 0.1272,
"grad_norm": 1.1390382051467896,
"learning_rate": 4.803031423243349e-06,
"loss": 3.7685,
"step": 159
},
{
"epoch": 0.128,
"grad_norm": 4.6451802253723145,
"learning_rate": 4.800579618414677e-06,
"loss": 1.3374,
"step": 160
},
{
"epoch": 0.1288,
"grad_norm": 2.0730605125427246,
"learning_rate": 4.798113281874788e-06,
"loss": 3.7551,
"step": 161
},
{
"epoch": 0.1296,
"grad_norm": 4.244422435760498,
"learning_rate": 4.7956324292024045e-06,
"loss": 1.4507,
"step": 162
},
{
"epoch": 0.1304,
"grad_norm": 1.437325119972229,
"learning_rate": 4.7931370760679415e-06,
"loss": 3.8459,
"step": 163
},
{
"epoch": 0.1312,
"grad_norm": 4.308803558349609,
"learning_rate": 4.790627238233405e-06,
"loss": 1.4397,
"step": 164
},
{
"epoch": 0.132,
"grad_norm": 1.3514691591262817,
"learning_rate": 4.788102931552294e-06,
"loss": 3.7826,
"step": 165
},
{
"epoch": 0.1328,
"grad_norm": 4.431159973144531,
"learning_rate": 4.785564171969503e-06,
"loss": 1.3688,
"step": 166
},
{
"epoch": 0.1336,
"grad_norm": 1.9444341659545898,
"learning_rate": 4.783010975521216e-06,
"loss": 3.786,
"step": 167
},
{
"epoch": 0.1344,
"grad_norm": 4.421632289886475,
"learning_rate": 4.78044335833481e-06,
"loss": 1.3799,
"step": 168
},
{
"epoch": 0.1352,
"grad_norm": 1.30320143699646,
"learning_rate": 4.777861336628751e-06,
"loss": 3.7414,
"step": 169
},
{
"epoch": 0.136,
"grad_norm": 4.836937427520752,
"learning_rate": 4.775264926712489e-06,
"loss": 1.3762,
"step": 170
},
{
"epoch": 0.1368,
"grad_norm": 1.720489501953125,
"learning_rate": 4.772654144986364e-06,
"loss": 3.7693,
"step": 171
},
{
"epoch": 0.1376,
"grad_norm": 4.573201656341553,
"learning_rate": 4.77002900794149e-06,
"loss": 1.4831,
"step": 172
},
{
"epoch": 0.1384,
"grad_norm": 1.4767590761184692,
"learning_rate": 4.767389532159659e-06,
"loss": 3.7936,
"step": 173
},
{
"epoch": 0.1392,
"grad_norm": 4.3813090324401855,
"learning_rate": 4.764735734313236e-06,
"loss": 1.3468,
"step": 174
},
{
"epoch": 0.14,
"grad_norm": 1.5614203214645386,
"learning_rate": 4.762067631165049e-06,
"loss": 3.8268,
"step": 175
},
{
"epoch": 0.1408,
"grad_norm": 4.7881317138671875,
"learning_rate": 4.75938523956829e-06,
"loss": 1.6201,
"step": 176
},
{
"epoch": 0.1416,
"grad_norm": 1.2957278490066528,
"learning_rate": 4.756688576466398e-06,
"loss": 3.7073,
"step": 177
},
{
"epoch": 0.1424,
"grad_norm": 4.188969612121582,
"learning_rate": 4.753977658892967e-06,
"loss": 1.4572,
"step": 178
},
{
"epoch": 0.1432,
"grad_norm": 2.046276330947876,
"learning_rate": 4.751252503971624e-06,
"loss": 3.6809,
"step": 179
},
{
"epoch": 0.144,
"grad_norm": 4.05677604675293,
"learning_rate": 4.748513128915928e-06,
"loss": 1.3311,
"step": 180
},
{
"epoch": 0.1448,
"grad_norm": 1.2244303226470947,
"learning_rate": 4.7457595510292615e-06,
"loss": 3.8316,
"step": 181
},
{
"epoch": 0.1456,
"grad_norm": 4.775726795196533,
"learning_rate": 4.74299178770472e-06,
"loss": 1.5603,
"step": 182
},
{
"epoch": 0.1464,
"grad_norm": 1.41436767578125,
"learning_rate": 4.740209856424998e-06,
"loss": 3.7105,
"step": 183
},
{
"epoch": 0.1472,
"grad_norm": 5.448317527770996,
"learning_rate": 4.737413774762287e-06,
"loss": 1.2361,
"step": 184
},
{
"epoch": 0.148,
"grad_norm": 1.222730040550232,
"learning_rate": 4.73460356037816e-06,
"loss": 3.8072,
"step": 185
},
{
"epoch": 0.1488,
"grad_norm": 4.413971900939941,
"learning_rate": 4.731779231023456e-06,
"loss": 1.6303,
"step": 186
},
{
"epoch": 0.1496,
"grad_norm": 1.4510987997055054,
"learning_rate": 4.728940804538176e-06,
"loss": 3.6988,
"step": 187
},
{
"epoch": 0.1504,
"grad_norm": 4.780493259429932,
"learning_rate": 4.726088298851362e-06,
"loss": 1.1804,
"step": 188
},
{
"epoch": 0.1512,
"grad_norm": 1.5533583164215088,
"learning_rate": 4.723221731980993e-06,
"loss": 3.6128,
"step": 189
},
{
"epoch": 0.152,
"grad_norm": 4.775524616241455,
"learning_rate": 4.720341122033862e-06,
"loss": 1.5147,
"step": 190
},
{
"epoch": 0.1528,
"grad_norm": 1.6876249313354492,
"learning_rate": 4.717446487205466e-06,
"loss": 3.7315,
"step": 191
},
{
"epoch": 0.1536,
"grad_norm": 3.9606497287750244,
"learning_rate": 4.714537845779894e-06,
"loss": 1.3284,
"step": 192
},
{
"epoch": 0.1544,
"grad_norm": 1.2425357103347778,
"learning_rate": 4.7116152161297045e-06,
"loss": 3.7983,
"step": 193
},
{
"epoch": 0.1552,
"grad_norm": 3.9687187671661377,
"learning_rate": 4.708678616715815e-06,
"loss": 1.3479,
"step": 194
},
{
"epoch": 0.156,
"grad_norm": 1.5664615631103516,
"learning_rate": 4.705728066087384e-06,
"loss": 3.7247,
"step": 195
},
{
"epoch": 0.1568,
"grad_norm": 4.444562911987305,
"learning_rate": 4.702763582881692e-06,
"loss": 1.2835,
"step": 196
},
{
"epoch": 0.1576,
"grad_norm": 1.8698633909225464,
"learning_rate": 4.699785185824026e-06,
"loss": 3.8091,
"step": 197
},
{
"epoch": 0.1584,
"grad_norm": 4.637014389038086,
"learning_rate": 4.696792893727562e-06,
"loss": 1.3871,
"step": 198
},
{
"epoch": 0.1592,
"grad_norm": 1.3571611642837524,
"learning_rate": 4.693786725493242e-06,
"loss": 3.7813,
"step": 199
},
{
"epoch": 0.16,
"grad_norm": 4.458593368530273,
"learning_rate": 4.690766700109659e-06,
"loss": 1.4933,
"step": 200
},
{
"epoch": 0.1608,
"grad_norm": 1.5887341499328613,
"learning_rate": 4.687732836652935e-06,
"loss": 3.6873,
"step": 201
},
{
"epoch": 0.1616,
"grad_norm": 6.06688928604126,
"learning_rate": 4.684685154286599e-06,
"loss": 1.312,
"step": 202
},
{
"epoch": 0.1624,
"grad_norm": 1.5234293937683105,
"learning_rate": 4.6816236722614694e-06,
"loss": 3.7146,
"step": 203
},
{
"epoch": 0.1632,
"grad_norm": 4.001331806182861,
"learning_rate": 4.6785484099155324e-06,
"loss": 1.4507,
"step": 204
},
{
"epoch": 0.164,
"grad_norm": 1.5702141523361206,
"learning_rate": 4.675459386673815e-06,
"loss": 3.6801,
"step": 205
},
{
"epoch": 0.1648,
"grad_norm": 3.6314635276794434,
"learning_rate": 4.672356622048266e-06,
"loss": 1.2263,
"step": 206
},
{
"epoch": 0.1656,
"grad_norm": 1.422735571861267,
"learning_rate": 4.669240135637635e-06,
"loss": 3.6963,
"step": 207
},
{
"epoch": 0.1664,
"grad_norm": 4.454765796661377,
"learning_rate": 4.666109947127343e-06,
"loss": 1.1784,
"step": 208
},
{
"epoch": 0.1672,
"grad_norm": 2.0289947986602783,
"learning_rate": 4.662966076289363e-06,
"loss": 3.8096,
"step": 209
},
{
"epoch": 0.168,
"grad_norm": 4.10106086730957,
"learning_rate": 4.659808542982089e-06,
"loss": 1.3621,
"step": 210
},
{
"epoch": 0.1688,
"grad_norm": 1.7755879163742065,
"learning_rate": 4.65663736715022e-06,
"loss": 3.6229,
"step": 211
},
{
"epoch": 0.1696,
"grad_norm": 3.9878623485565186,
"learning_rate": 4.653452568824625e-06,
"loss": 1.3814,
"step": 212
},
{
"epoch": 0.1704,
"grad_norm": 1.2768726348876953,
"learning_rate": 4.650254168122222e-06,
"loss": 3.7008,
"step": 213
},
{
"epoch": 0.1712,
"grad_norm": 3.8291852474212646,
"learning_rate": 4.647042185245848e-06,
"loss": 1.3145,
"step": 214
},
{
"epoch": 0.172,
"grad_norm": 1.5507771968841553,
"learning_rate": 4.6438166404841316e-06,
"loss": 3.6915,
"step": 215
},
{
"epoch": 0.1728,
"grad_norm": 4.554000377655029,
"learning_rate": 4.640577554211366e-06,
"loss": 1.2244,
"step": 216
},
{
"epoch": 0.1736,
"grad_norm": 1.2744420766830444,
"learning_rate": 4.637324946887384e-06,
"loss": 3.7756,
"step": 217
},
{
"epoch": 0.1744,
"grad_norm": 5.061426162719727,
"learning_rate": 4.634058839057417e-06,
"loss": 1.479,
"step": 218
},
{
"epoch": 0.1752,
"grad_norm": 1.7611600160598755,
"learning_rate": 4.63077925135198e-06,
"loss": 3.7824,
"step": 219
},
{
"epoch": 0.176,
"grad_norm": 5.889009952545166,
"learning_rate": 4.62748620448673e-06,
"loss": 1.4081,
"step": 220
},
{
"epoch": 0.1768,
"grad_norm": 1.560341238975525,
"learning_rate": 4.624179719262342e-06,
"loss": 3.7535,
"step": 221
},
{
"epoch": 0.1776,
"grad_norm": 4.9289231300354,
"learning_rate": 4.620859816564371e-06,
"loss": 1.4075,
"step": 222
},
{
"epoch": 0.1784,
"grad_norm": 1.3027839660644531,
"learning_rate": 4.6175265173631304e-06,
"loss": 3.7511,
"step": 223
},
{
"epoch": 0.1792,
"grad_norm": 4.20517635345459,
"learning_rate": 4.6141798427135475e-06,
"loss": 1.2056,
"step": 224
},
{
"epoch": 0.18,
"grad_norm": 1.9253166913986206,
"learning_rate": 4.610819813755038e-06,
"loss": 3.5762,
"step": 225
},
{
"epoch": 0.1808,
"grad_norm": 4.654662609100342,
"learning_rate": 4.607446451711372e-06,
"loss": 1.4106,
"step": 226
},
{
"epoch": 0.1816,
"grad_norm": 1.6170463562011719,
"learning_rate": 4.604059777890537e-06,
"loss": 3.5927,
"step": 227
},
{
"epoch": 0.1824,
"grad_norm": 4.272345066070557,
"learning_rate": 4.6006598136846056e-06,
"loss": 1.3751,
"step": 228
},
{
"epoch": 0.1832,
"grad_norm": 1.1468439102172852,
"learning_rate": 4.5972465805696e-06,
"loss": 3.7235,
"step": 229
},
{
"epoch": 0.184,
"grad_norm": 4.337528705596924,
"learning_rate": 4.593820100105355e-06,
"loss": 1.212,
"step": 230
},
{
"epoch": 0.1848,
"grad_norm": 1.6321645975112915,
"learning_rate": 4.590380393935383e-06,
"loss": 3.7544,
"step": 231
},
{
"epoch": 0.1856,
"grad_norm": 4.132114410400391,
"learning_rate": 4.586927483786739e-06,
"loss": 1.4566,
"step": 232
},
{
"epoch": 0.1864,
"grad_norm": 1.6077178716659546,
"learning_rate": 4.583461391469879e-06,
"loss": 3.6934,
"step": 233
},
{
"epoch": 0.1872,
"grad_norm": 4.226905345916748,
"learning_rate": 4.579982138878527e-06,
"loss": 1.5507,
"step": 234
},
{
"epoch": 0.188,
"grad_norm": 1.280689001083374,
"learning_rate": 4.576489747989532e-06,
"loss": 3.77,
"step": 235
},
{
"epoch": 0.1888,
"grad_norm": 3.9274861812591553,
"learning_rate": 4.572984240862733e-06,
"loss": 1.5939,
"step": 236
},
{
"epoch": 0.1896,
"grad_norm": 1.420904278755188,
"learning_rate": 4.56946563964082e-06,
"loss": 3.5977,
"step": 237
},
{
"epoch": 0.1904,
"grad_norm": 4.135627746582031,
"learning_rate": 4.5659339665491894e-06,
"loss": 1.2989,
"step": 238
},
{
"epoch": 0.1912,
"grad_norm": 1.301414966583252,
"learning_rate": 4.562389243895807e-06,
"loss": 3.6786,
"step": 239
},
{
"epoch": 0.192,
"grad_norm": 4.637629508972168,
"learning_rate": 4.558831494071069e-06,
"loss": 1.4187,
"step": 240
},
{
"epoch": 0.1928,
"grad_norm": 1.2166482210159302,
"learning_rate": 4.555260739547657e-06,
"loss": 3.6755,
"step": 241
},
{
"epoch": 0.1936,
"grad_norm": 3.494554281234741,
"learning_rate": 4.551677002880395e-06,
"loss": 1.0023,
"step": 242
},
{
"epoch": 0.1944,
"grad_norm": 1.2456482648849487,
"learning_rate": 4.548080306706114e-06,
"loss": 3.7268,
"step": 243
},
{
"epoch": 0.1952,
"grad_norm": 3.789717674255371,
"learning_rate": 4.544470673743502e-06,
"loss": 1.1345,
"step": 244
},
{
"epoch": 0.196,
"grad_norm": 1.615335464477539,
"learning_rate": 4.54084812679296e-06,
"loss": 3.5679,
"step": 245
},
{
"epoch": 0.1968,
"grad_norm": 4.087082862854004,
"learning_rate": 4.537212688736466e-06,
"loss": 1.5294,
"step": 246
},
{
"epoch": 0.1976,
"grad_norm": 1.3239346742630005,
"learning_rate": 4.533564382537421e-06,
"loss": 3.8232,
"step": 247
},
{
"epoch": 0.1984,
"grad_norm": 3.6679818630218506,
"learning_rate": 4.529903231240511e-06,
"loss": 1.1619,
"step": 248
},
{
"epoch": 0.1992,
"grad_norm": 1.6263890266418457,
"learning_rate": 4.526229257971556e-06,
"loss": 3.7185,
"step": 249
},
{
"epoch": 0.2,
"grad_norm": 4.270927429199219,
"learning_rate": 4.522542485937369e-06,
"loss": 1.4918,
"step": 250
},
{
"epoch": 0.2008,
"grad_norm": 1.6562573909759521,
"learning_rate": 4.518842938425606e-06,
"loss": 3.7609,
"step": 251
},
{
"epoch": 0.2016,
"grad_norm": 4.229763031005859,
"learning_rate": 4.5151306388046175e-06,
"loss": 1.1358,
"step": 252
},
{
"epoch": 0.2024,
"grad_norm": 1.3031507730484009,
"learning_rate": 4.511405610523309e-06,
"loss": 3.6721,
"step": 253
},
{
"epoch": 0.2032,
"grad_norm": 4.729180335998535,
"learning_rate": 4.507667877110982e-06,
"loss": 1.5732,
"step": 254
},
{
"epoch": 0.204,
"grad_norm": 1.4898425340652466,
"learning_rate": 4.503917462177192e-06,
"loss": 3.6121,
"step": 255
},
{
"epoch": 0.2048,
"grad_norm": 4.497402667999268,
"learning_rate": 4.500154389411598e-06,
"loss": 1.3272,
"step": 256
},
{
"epoch": 0.2056,
"grad_norm": 1.141797423362732,
"learning_rate": 4.496378682583813e-06,
"loss": 3.6704,
"step": 257
},
{
"epoch": 0.2064,
"grad_norm": 4.572139739990234,
"learning_rate": 4.492590365543253e-06,
"loss": 1.4076,
"step": 258
},
{
"epoch": 0.2072,
"grad_norm": 1.6577672958374023,
"learning_rate": 4.488789462218988e-06,
"loss": 3.6953,
"step": 259
},
{
"epoch": 0.208,
"grad_norm": 4.384160041809082,
"learning_rate": 4.4849759966195885e-06,
"loss": 1.2979,
"step": 260
},
{
"epoch": 0.2088,
"grad_norm": 1.2096525430679321,
"learning_rate": 4.4811499928329775e-06,
"loss": 3.7744,
"step": 261
},
{
"epoch": 0.2096,
"grad_norm": 4.4223246574401855,
"learning_rate": 4.477311475026271e-06,
"loss": 1.3639,
"step": 262
},
{
"epoch": 0.2104,
"grad_norm": 1.2359306812286377,
"learning_rate": 4.473460467445637e-06,
"loss": 3.6689,
"step": 263
},
{
"epoch": 0.2112,
"grad_norm": 4.513794898986816,
"learning_rate": 4.469596994416131e-06,
"loss": 1.2571,
"step": 264
},
{
"epoch": 0.212,
"grad_norm": 1.4100075960159302,
"learning_rate": 4.465721080341547e-06,
"loss": 3.669,
"step": 265
},
{
"epoch": 0.2128,
"grad_norm": 4.375431537628174,
"learning_rate": 4.4618327497042676e-06,
"loss": 1.3244,
"step": 266
},
{
"epoch": 0.2136,
"grad_norm": 1.1597020626068115,
"learning_rate": 4.457932027065102e-06,
"loss": 3.7463,
"step": 267
},
{
"epoch": 0.2144,
"grad_norm": 4.304786682128906,
"learning_rate": 4.4540189370631315e-06,
"loss": 1.2498,
"step": 268
},
{
"epoch": 0.2152,
"grad_norm": 1.5611578226089478,
"learning_rate": 4.450093504415562e-06,
"loss": 3.7,
"step": 269
},
{
"epoch": 0.216,
"grad_norm": 4.710305213928223,
"learning_rate": 4.446155753917559e-06,
"loss": 1.4829,
"step": 270
},
{
"epoch": 0.2168,
"grad_norm": 1.0595712661743164,
"learning_rate": 4.442205710442095e-06,
"loss": 3.7709,
"step": 271
},
{
"epoch": 0.2176,
"grad_norm": 4.113396644592285,
"learning_rate": 4.43824339893979e-06,
"loss": 1.4732,
"step": 272
},
{
"epoch": 0.2184,
"grad_norm": 1.346928358078003,
"learning_rate": 4.434268844438758e-06,
"loss": 3.6034,
"step": 273
},
{
"epoch": 0.2192,
"grad_norm": 4.2482452392578125,
"learning_rate": 4.4302820720444454e-06,
"loss": 1.3669,
"step": 274
},
{
"epoch": 0.22,
"grad_norm": 1.1629118919372559,
"learning_rate": 4.426283106939474e-06,
"loss": 3.7432,
"step": 275
},
{
"epoch": 0.2208,
"grad_norm": 3.7786972522735596,
"learning_rate": 4.422271974383479e-06,
"loss": 1.3379,
"step": 276
},
{
"epoch": 0.2216,
"grad_norm": 1.7842165231704712,
"learning_rate": 4.418248699712955e-06,
"loss": 3.6675,
"step": 277
},
{
"epoch": 0.2224,
"grad_norm": 3.950294017791748,
"learning_rate": 4.414213308341092e-06,
"loss": 1.5301,
"step": 278
},
{
"epoch": 0.2232,
"grad_norm": 1.4630101919174194,
"learning_rate": 4.410165825757613e-06,
"loss": 3.571,
"step": 279
},
{
"epoch": 0.224,
"grad_norm": 4.155986309051514,
"learning_rate": 4.40610627752862e-06,
"loss": 1.3453,
"step": 280
},
{
"epoch": 0.2248,
"grad_norm": 1.698153018951416,
"learning_rate": 4.402034689296425e-06,
"loss": 3.6699,
"step": 281
},
{
"epoch": 0.2256,
"grad_norm": 4.893118858337402,
"learning_rate": 4.397951086779392e-06,
"loss": 1.6296,
"step": 282
},
{
"epoch": 0.2264,
"grad_norm": 1.9244930744171143,
"learning_rate": 4.393855495771774e-06,
"loss": 3.728,
"step": 283
},
{
"epoch": 0.2272,
"grad_norm": 4.7193827629089355,
"learning_rate": 4.389747942143549e-06,
"loss": 1.3797,
"step": 284
},
{
"epoch": 0.228,
"grad_norm": 1.3077738285064697,
"learning_rate": 4.38562845184026e-06,
"loss": 3.7899,
"step": 285
},
{
"epoch": 0.2288,
"grad_norm": 4.431347370147705,
"learning_rate": 4.381497050882845e-06,
"loss": 1.6555,
"step": 286
},
{
"epoch": 0.2296,
"grad_norm": 1.5692718029022217,
"learning_rate": 4.377353765367479e-06,
"loss": 3.6771,
"step": 287
},
{
"epoch": 0.2304,
"grad_norm": 3.9838104248046875,
"learning_rate": 4.373198621465405e-06,
"loss": 1.1383,
"step": 288
},
{
"epoch": 0.2312,
"grad_norm": 1.101969838142395,
"learning_rate": 4.369031645422768e-06,
"loss": 3.6786,
"step": 289
},
{
"epoch": 0.232,
"grad_norm": 4.563289165496826,
"learning_rate": 4.364852863560456e-06,
"loss": 1.2641,
"step": 290
},
{
"epoch": 0.2328,
"grad_norm": 1.3112094402313232,
"learning_rate": 4.360662302273926e-06,
"loss": 3.7925,
"step": 291
},
{
"epoch": 0.2336,
"grad_norm": 4.193509578704834,
"learning_rate": 4.356459988033039e-06,
"loss": 1.1937,
"step": 292
},
{
"epoch": 0.2344,
"grad_norm": 1.167222499847412,
"learning_rate": 4.352245947381897e-06,
"loss": 3.6606,
"step": 293
},
{
"epoch": 0.2352,
"grad_norm": 5.211182117462158,
"learning_rate": 4.348020206938672e-06,
"loss": 1.5236,
"step": 294
},
{
"epoch": 0.236,
"grad_norm": 1.5906448364257812,
"learning_rate": 4.343782793395435e-06,
"loss": 3.6172,
"step": 295
},
{
"epoch": 0.2368,
"grad_norm": 4.557344913482666,
"learning_rate": 4.3395337335179945e-06,
"loss": 1.2071,
"step": 296
},
{
"epoch": 0.2376,
"grad_norm": 1.5080584287643433,
"learning_rate": 4.3352730541457215e-06,
"loss": 3.5182,
"step": 297
},
{
"epoch": 0.2384,
"grad_norm": 4.691150665283203,
"learning_rate": 4.331000782191384e-06,
"loss": 1.4428,
"step": 298
},
{
"epoch": 0.2392,
"grad_norm": 1.2369650602340698,
"learning_rate": 4.32671694464097e-06,
"loss": 3.6389,
"step": 299
},
{
"epoch": 0.24,
"grad_norm": 5.130438327789307,
"learning_rate": 4.322421568553529e-06,
"loss": 1.4164,
"step": 300
},
{
"epoch": 0.2408,
"grad_norm": 1.76595938205719,
"learning_rate": 4.318114681060989e-06,
"loss": 3.5655,
"step": 301
},
{
"epoch": 0.2416,
"grad_norm": 4.4846954345703125,
"learning_rate": 4.3137963093679945e-06,
"loss": 1.4369,
"step": 302
},
{
"epoch": 0.2424,
"grad_norm": 1.5124865770339966,
"learning_rate": 4.309466480751726e-06,
"loss": 3.5159,
"step": 303
},
{
"epoch": 0.2432,
"grad_norm": 4.232130527496338,
"learning_rate": 4.305125222561736e-06,
"loss": 1.5252,
"step": 304
},
{
"epoch": 0.244,
"grad_norm": 1.544097900390625,
"learning_rate": 4.3007725622197675e-06,
"loss": 3.7571,
"step": 305
},
{
"epoch": 0.2448,
"grad_norm": 3.7335703372955322,
"learning_rate": 4.296408527219592e-06,
"loss": 1.2674,
"step": 306
},
{
"epoch": 0.2456,
"grad_norm": 1.2222108840942383,
"learning_rate": 4.2920331451268246e-06,
"loss": 3.6799,
"step": 307
},
{
"epoch": 0.2464,
"grad_norm": 4.682336807250977,
"learning_rate": 4.2876464435787576e-06,
"loss": 1.3907,
"step": 308
},
{
"epoch": 0.2472,
"grad_norm": 1.7839024066925049,
"learning_rate": 4.283248450284182e-06,
"loss": 3.4632,
"step": 309
},
{
"epoch": 0.248,
"grad_norm": 4.441279411315918,
"learning_rate": 4.278839193023214e-06,
"loss": 1.4755,
"step": 310
},
{
"epoch": 0.2488,
"grad_norm": 1.5365478992462158,
"learning_rate": 4.274418699647117e-06,
"loss": 3.5074,
"step": 311
},
{
"epoch": 0.2496,
"grad_norm": 4.5583062171936035,
"learning_rate": 4.269986998078132e-06,
"loss": 1.681,
"step": 312
},
{
"epoch": 0.2504,
"grad_norm": 1.4559458494186401,
"learning_rate": 4.265544116309294e-06,
"loss": 3.5942,
"step": 313
},
{
"epoch": 0.2512,
"grad_norm": 4.114186763763428,
"learning_rate": 4.2610900824042575e-06,
"loss": 1.6586,
"step": 314
},
{
"epoch": 0.252,
"grad_norm": 1.3927795886993408,
"learning_rate": 4.256624924497124e-06,
"loss": 3.6604,
"step": 315
},
{
"epoch": 0.2528,
"grad_norm": 3.7071781158447266,
"learning_rate": 4.2521486707922545e-06,
"loss": 1.3165,
"step": 316
},
{
"epoch": 0.2536,
"grad_norm": 1.5977774858474731,
"learning_rate": 4.247661349564103e-06,
"loss": 3.71,
"step": 317
},
{
"epoch": 0.2544,
"grad_norm": 4.849422931671143,
"learning_rate": 4.243162989157027e-06,
"loss": 1.4173,
"step": 318
},
{
"epoch": 0.2552,
"grad_norm": 1.525455355644226,
"learning_rate": 4.2386536179851175e-06,
"loss": 3.5833,
"step": 319
},
{
"epoch": 0.256,
"grad_norm": 4.420166969299316,
"learning_rate": 4.234133264532012e-06,
"loss": 1.2962,
"step": 320
},
{
"epoch": 0.2568,
"grad_norm": 1.18903386592865,
"learning_rate": 4.229601957350722e-06,
"loss": 3.6984,
"step": 321
},
{
"epoch": 0.2576,
"grad_norm": 3.8449833393096924,
"learning_rate": 4.225059725063444e-06,
"loss": 1.3112,
"step": 322
},
{
"epoch": 0.2584,
"grad_norm": 1.7980787754058838,
"learning_rate": 4.220506596361387e-06,
"loss": 3.5587,
"step": 323
},
{
"epoch": 0.2592,
"grad_norm": 3.5607681274414062,
"learning_rate": 4.215942600004586e-06,
"loss": 1.2554,
"step": 324
},
{
"epoch": 0.26,
"grad_norm": 1.572067379951477,
"learning_rate": 4.211367764821722e-06,
"loss": 3.7133,
"step": 325
},
{
"epoch": 0.2608,
"grad_norm": 10.11608600616455,
"learning_rate": 4.206782119709942e-06,
"loss": 1.5166,
"step": 326
},
{
"epoch": 0.2616,
"grad_norm": 1.5986098051071167,
"learning_rate": 4.202185693634671e-06,
"loss": 3.6253,
"step": 327
},
{
"epoch": 0.2624,
"grad_norm": 3.9274239540100098,
"learning_rate": 4.197578515629435e-06,
"loss": 1.311,
"step": 328
},
{
"epoch": 0.2632,
"grad_norm": 1.2195369005203247,
"learning_rate": 4.192960614795676e-06,
"loss": 3.7322,
"step": 329
},
{
"epoch": 0.264,
"grad_norm": 4.052531719207764,
"learning_rate": 4.188332020302561e-06,
"loss": 1.3612,
"step": 330
},
{
"epoch": 0.2648,
"grad_norm": 1.4489315748214722,
"learning_rate": 4.183692761386813e-06,
"loss": 3.534,
"step": 331
},
{
"epoch": 0.2656,
"grad_norm": 5.4260053634643555,
"learning_rate": 4.1790428673525104e-06,
"loss": 1.523,
"step": 332
},
{
"epoch": 0.2664,
"grad_norm": 1.6070371866226196,
"learning_rate": 4.1743823675709115e-06,
"loss": 3.4917,
"step": 333
},
{
"epoch": 0.2672,
"grad_norm": 4.363175392150879,
"learning_rate": 4.1697112914802665e-06,
"loss": 1.6258,
"step": 334
},
{
"epoch": 0.268,
"grad_norm": 1.6007026433944702,
"learning_rate": 4.16502966858563e-06,
"loss": 3.575,
"step": 335
},
{
"epoch": 0.2688,
"grad_norm": 4.8055419921875,
"learning_rate": 4.160337528458676e-06,
"loss": 1.7682,
"step": 336
},
{
"epoch": 0.2696,
"grad_norm": 1.2397737503051758,
"learning_rate": 4.155634900737513e-06,
"loss": 3.6629,
"step": 337
},
{
"epoch": 0.2704,
"grad_norm": 4.131043910980225,
"learning_rate": 4.150921815126493e-06,
"loss": 1.5988,
"step": 338
},
{
"epoch": 0.2712,
"grad_norm": 1.2639617919921875,
"learning_rate": 4.146198301396025e-06,
"loss": 3.5698,
"step": 339
},
{
"epoch": 0.272,
"grad_norm": 4.381173610687256,
"learning_rate": 4.141464389382392e-06,
"loss": 1.3198,
"step": 340
},
{
"epoch": 0.2728,
"grad_norm": 1.440491795539856,
"learning_rate": 4.136720108987552e-06,
"loss": 3.6658,
"step": 341
},
{
"epoch": 0.2736,
"grad_norm": 8.941045761108398,
"learning_rate": 4.13196549017896e-06,
"loss": 1.2674,
"step": 342
},
{
"epoch": 0.2744,
"grad_norm": 1.5544283390045166,
"learning_rate": 4.127200562989372e-06,
"loss": 3.5196,
"step": 343
},
{
"epoch": 0.2752,
"grad_norm": 4.094554424285889,
"learning_rate": 4.122425357516658e-06,
"loss": 1.2112,
"step": 344
},
{
"epoch": 0.276,
"grad_norm": 1.1563968658447266,
"learning_rate": 4.117639903923611e-06,
"loss": 3.6399,
"step": 345
},
{
"epoch": 0.2768,
"grad_norm": 4.3765482902526855,
"learning_rate": 4.112844232437757e-06,
"loss": 1.3016,
"step": 346
},
{
"epoch": 0.2776,
"grad_norm": 1.073043704032898,
"learning_rate": 4.108038373351163e-06,
"loss": 3.6758,
"step": 347
},
{
"epoch": 0.2784,
"grad_norm": 4.243771553039551,
"learning_rate": 4.103222357020248e-06,
"loss": 1.4512,
"step": 348
},
{
"epoch": 0.2792,
"grad_norm": 1.4195610284805298,
"learning_rate": 4.098396213865587e-06,
"loss": 3.6391,
"step": 349
},
{
"epoch": 0.28,
"grad_norm": 4.04062032699585,
"learning_rate": 4.093559974371725e-06,
"loss": 1.2876,
"step": 350
},
{
"epoch": 0.2808,
"grad_norm": 1.384352207183838,
"learning_rate": 4.0887136690869774e-06,
"loss": 3.6527,
"step": 351
},
{
"epoch": 0.2816,
"grad_norm": 4.134579181671143,
"learning_rate": 4.083857328623243e-06,
"loss": 1.3498,
"step": 352
},
{
"epoch": 0.2824,
"grad_norm": 1.8394545316696167,
"learning_rate": 4.078990983655807e-06,
"loss": 3.5694,
"step": 353
},
{
"epoch": 0.2832,
"grad_norm": 4.24132776260376,
"learning_rate": 4.07411466492315e-06,
"loss": 1.6123,
"step": 354
},
{
"epoch": 0.284,
"grad_norm": 1.1497430801391602,
"learning_rate": 4.069228403226751e-06,
"loss": 3.6655,
"step": 355
},
{
"epoch": 0.2848,
"grad_norm": 3.8187551498413086,
"learning_rate": 4.064332229430895e-06,
"loss": 1.4159,
"step": 356
},
{
"epoch": 0.2856,
"grad_norm": 1.5703147649765015,
"learning_rate": 4.059426174462476e-06,
"loss": 3.5892,
"step": 357
},
{
"epoch": 0.2864,
"grad_norm": 4.054878234863281,
"learning_rate": 4.054510269310803e-06,
"loss": 1.3898,
"step": 358
},
{
"epoch": 0.2872,
"grad_norm": 1.7447679042816162,
"learning_rate": 4.049584545027406e-06,
"loss": 3.5291,
"step": 359
},
{
"epoch": 0.288,
"grad_norm": 3.6220648288726807,
"learning_rate": 4.044649032725836e-06,
"loss": 1.1255,
"step": 360
},
{
"epoch": 0.2888,
"grad_norm": 1.4866344928741455,
"learning_rate": 4.039703763581472e-06,
"loss": 3.647,
"step": 361
},
{
"epoch": 0.2896,
"grad_norm": 4.575165271759033,
"learning_rate": 4.034748768831319e-06,
"loss": 1.3781,
"step": 362
},
{
"epoch": 0.2904,
"grad_norm": 1.0558618307113647,
"learning_rate": 4.02978407977382e-06,
"loss": 3.6163,
"step": 363
},
{
"epoch": 0.2912,
"grad_norm": 4.454329490661621,
"learning_rate": 4.024809727768648e-06,
"loss": 1.3233,
"step": 364
},
{
"epoch": 0.292,
"grad_norm": 1.3956743478775024,
"learning_rate": 4.019825744236514e-06,
"loss": 3.5997,
"step": 365
},
{
"epoch": 0.2928,
"grad_norm": 4.550688743591309,
"learning_rate": 4.014832160658966e-06,
"loss": 1.4364,
"step": 366
},
{
"epoch": 0.2936,
"grad_norm": 1.2573503255844116,
"learning_rate": 4.009829008578192e-06,
"loss": 3.6729,
"step": 367
},
{
"epoch": 0.2944,
"grad_norm": 4.038947582244873,
"learning_rate": 4.004816319596822e-06,
"loss": 1.2911,
"step": 368
},
{
"epoch": 0.2952,
"grad_norm": 1.9488675594329834,
"learning_rate": 3.999794125377721e-06,
"loss": 3.5393,
"step": 369
},
{
"epoch": 0.296,
"grad_norm": 4.447761535644531,
"learning_rate": 3.9947624576437975e-06,
"loss": 1.5997,
"step": 370
},
{
"epoch": 0.2968,
"grad_norm": 1.2472996711730957,
"learning_rate": 3.989721348177801e-06,
"loss": 3.6067,
"step": 371
},
{
"epoch": 0.2976,
"grad_norm": 4.081388473510742,
"learning_rate": 3.984670828822118e-06,
"loss": 1.4171,
"step": 372
},
{
"epoch": 0.2984,
"grad_norm": 1.7100144624710083,
"learning_rate": 3.979610931478574e-06,
"loss": 3.7103,
"step": 373
},
{
"epoch": 0.2992,
"grad_norm": 4.408793926239014,
"learning_rate": 3.97454168810823e-06,
"loss": 1.3243,
"step": 374
},
{
"epoch": 0.3,
"grad_norm": 1.326974868774414,
"learning_rate": 3.969463130731183e-06,
"loss": 3.6149,
"step": 375
},
{
"epoch": 0.3008,
"grad_norm": 4.624994277954102,
"learning_rate": 3.964375291426361e-06,
"loss": 1.5994,
"step": 376
},
{
"epoch": 0.3016,
"grad_norm": 1.3679853677749634,
"learning_rate": 3.959278202331323e-06,
"loss": 3.5478,
"step": 377
},
{
"epoch": 0.3024,
"grad_norm": 4.432180881500244,
"learning_rate": 3.954171895642052e-06,
"loss": 1.4198,
"step": 378
},
{
"epoch": 0.3032,
"grad_norm": 1.0665056705474854,
"learning_rate": 3.949056403612758e-06,
"loss": 3.7173,
"step": 379
},
{
"epoch": 0.304,
"grad_norm": 3.6534807682037354,
"learning_rate": 3.943931758555669e-06,
"loss": 1.2773,
"step": 380
},
{
"epoch": 0.3048,
"grad_norm": 1.4018532037734985,
"learning_rate": 3.938797992840828e-06,
"loss": 3.5796,
"step": 381
},
{
"epoch": 0.3056,
"grad_norm": 4.3174357414245605,
"learning_rate": 3.933655138895889e-06,
"loss": 1.0747,
"step": 382
},
{
"epoch": 0.3064,
"grad_norm": 1.893721342086792,
"learning_rate": 3.928503229205913e-06,
"loss": 3.5452,
"step": 383
},
{
"epoch": 0.3072,
"grad_norm": 4.509764194488525,
"learning_rate": 3.923342296313162e-06,
"loss": 1.4684,
"step": 384
},
{
"epoch": 0.308,
"grad_norm": 1.2628504037857056,
"learning_rate": 3.918172372816892e-06,
"loss": 3.5872,
"step": 385
},
{
"epoch": 0.3088,
"grad_norm": 3.868783712387085,
"learning_rate": 3.91299349137315e-06,
"loss": 1.316,
"step": 386
},
{
"epoch": 0.3096,
"grad_norm": 1.3258881568908691,
"learning_rate": 3.907805684694567e-06,
"loss": 3.6877,
"step": 387
},
{
"epoch": 0.3104,
"grad_norm": 3.9455106258392334,
"learning_rate": 3.9026089855501475e-06,
"loss": 1.2362,
"step": 388
},
{
"epoch": 0.3112,
"grad_norm": 1.0947574377059937,
"learning_rate": 3.8974034267650695e-06,
"loss": 3.735,
"step": 389
},
{
"epoch": 0.312,
"grad_norm": 4.135454177856445,
"learning_rate": 3.89218904122047e-06,
"loss": 1.3921,
"step": 390
},
{
"epoch": 0.3128,
"grad_norm": 1.3168636560440063,
"learning_rate": 3.886965861853243e-06,
"loss": 3.5585,
"step": 391
},
{
"epoch": 0.3136,
"grad_norm": 3.532658100128174,
"learning_rate": 3.881733921655829e-06,
"loss": 1.2495,
"step": 392
},
{
"epoch": 0.3144,
"grad_norm": 1.3559529781341553,
"learning_rate": 3.876493253676004e-06,
"loss": 3.561,
"step": 393
},
{
"epoch": 0.3152,
"grad_norm": 4.4542036056518555,
"learning_rate": 3.871243891016676e-06,
"loss": 1.3177,
"step": 394
},
{
"epoch": 0.316,
"grad_norm": 1.6158586740493774,
"learning_rate": 3.8659858668356735e-06,
"loss": 3.623,
"step": 395
},
{
"epoch": 0.3168,
"grad_norm": 4.352112293243408,
"learning_rate": 3.8607192143455325e-06,
"loss": 1.3388,
"step": 396
},
{
"epoch": 0.3176,
"grad_norm": 1.2379918098449707,
"learning_rate": 3.855443966813295e-06,
"loss": 3.6086,
"step": 397
},
{
"epoch": 0.3184,
"grad_norm": 4.482300758361816,
"learning_rate": 3.85016015756029e-06,
"loss": 1.3943,
"step": 398
},
{
"epoch": 0.3192,
"grad_norm": 1.632942795753479,
"learning_rate": 3.844867819961928e-06,
"loss": 3.5682,
"step": 399
},
{
"epoch": 0.32,
"grad_norm": 4.9489521980285645,
"learning_rate": 3.839566987447492e-06,
"loss": 1.1445,
"step": 400
},
{
"epoch": 0.3208,
"grad_norm": 1.3084850311279297,
"learning_rate": 3.8342576934999184e-06,
"loss": 3.7127,
"step": 401
},
{
"epoch": 0.3216,
"grad_norm": 3.8171467781066895,
"learning_rate": 3.828939971655595e-06,
"loss": 1.302,
"step": 402
},
{
"epoch": 0.3224,
"grad_norm": 1.2390443086624146,
"learning_rate": 3.823613855504144e-06,
"loss": 3.5798,
"step": 403
},
{
"epoch": 0.3232,
"grad_norm": 4.057291507720947,
"learning_rate": 3.8182793786882065e-06,
"loss": 1.3189,
"step": 404
},
{
"epoch": 0.324,
"grad_norm": 1.3859179019927979,
"learning_rate": 3.8129365749032398e-06,
"loss": 3.6643,
"step": 405
},
{
"epoch": 0.3248,
"grad_norm": 4.864846706390381,
"learning_rate": 3.807585477897296e-06,
"loss": 1.4575,
"step": 406
},
{
"epoch": 0.3256,
"grad_norm": 1.0886560678482056,
"learning_rate": 3.802226121470811e-06,
"loss": 3.7321,
"step": 407
},
{
"epoch": 0.3264,
"grad_norm": 3.940027952194214,
"learning_rate": 3.796858539476394e-06,
"loss": 1.2742,
"step": 408
},
{
"epoch": 0.3272,
"grad_norm": 1.2309926748275757,
"learning_rate": 3.7914827658186104e-06,
"loss": 3.5766,
"step": 409
},
{
"epoch": 0.328,
"grad_norm": 4.414444446563721,
"learning_rate": 3.7860988344537664e-06,
"loss": 1.2858,
"step": 410
},
{
"epoch": 0.3288,
"grad_norm": 1.0498713254928589,
"learning_rate": 3.7807067793897006e-06,
"loss": 3.6743,
"step": 411
},
{
"epoch": 0.3296,
"grad_norm": 4.1902313232421875,
"learning_rate": 3.775306634685562e-06,
"loss": 1.4446,
"step": 412
},
{
"epoch": 0.3304,
"grad_norm": 1.1650660037994385,
"learning_rate": 3.7698984344516e-06,
"loss": 3.6178,
"step": 413
},
{
"epoch": 0.3312,
"grad_norm": 4.5790910720825195,
"learning_rate": 3.7644822128489476e-06,
"loss": 1.5761,
"step": 414
},
{
"epoch": 0.332,
"grad_norm": 1.0688635110855103,
"learning_rate": 3.7590580040894025e-06,
"loss": 3.689,
"step": 415
},
{
"epoch": 0.3328,
"grad_norm": 4.05617094039917,
"learning_rate": 3.7536258424352164e-06,
"loss": 1.6174,
"step": 416
},
{
"epoch": 0.3336,
"grad_norm": 1.236042857170105,
"learning_rate": 3.7481857621988734e-06,
"loss": 3.6902,
"step": 417
},
{
"epoch": 0.3344,
"grad_norm": 4.205336093902588,
"learning_rate": 3.742737797742878e-06,
"loss": 1.3125,
"step": 418
},
{
"epoch": 0.3352,
"grad_norm": 1.460862159729004,
"learning_rate": 3.737281983479534e-06,
"loss": 3.503,
"step": 419
},
{
"epoch": 0.336,
"grad_norm": 4.190709114074707,
"learning_rate": 3.731818353870729e-06,
"loss": 1.2207,
"step": 420
},
{
"epoch": 0.3368,
"grad_norm": 2.0372729301452637,
"learning_rate": 3.726346943427719e-06,
"loss": 3.5128,
"step": 421
},
{
"epoch": 0.3376,
"grad_norm": 4.000549793243408,
"learning_rate": 3.7208677867109042e-06,
"loss": 1.244,
"step": 422
},
{
"epoch": 0.3384,
"grad_norm": 1.509992003440857,
"learning_rate": 3.7153809183296174e-06,
"loss": 3.6028,
"step": 423
},
{
"epoch": 0.3392,
"grad_norm": 3.7690091133117676,
"learning_rate": 3.7098863729418997e-06,
"loss": 1.1382,
"step": 424
},
{
"epoch": 0.34,
"grad_norm": 1.0848690271377563,
"learning_rate": 3.7043841852542884e-06,
"loss": 3.7097,
"step": 425
},
{
"epoch": 0.3408,
"grad_norm": 4.2273359298706055,
"learning_rate": 3.6988743900215895e-06,
"loss": 1.3459,
"step": 426
},
{
"epoch": 0.3416,
"grad_norm": 1.30433189868927,
"learning_rate": 3.6933570220466654e-06,
"loss": 3.5762,
"step": 427
},
{
"epoch": 0.3424,
"grad_norm": 3.894927740097046,
"learning_rate": 3.6878321161802106e-06,
"loss": 1.411,
"step": 428
},
{
"epoch": 0.3432,
"grad_norm": 1.23166024684906,
"learning_rate": 3.682299707320532e-06,
"loss": 3.7625,
"step": 429
},
{
"epoch": 0.344,
"grad_norm": 4.281452655792236,
"learning_rate": 3.6767598304133325e-06,
"loss": 1.2892,
"step": 430
},
{
"epoch": 0.3448,
"grad_norm": 1.510961890220642,
"learning_rate": 3.6712125204514836e-06,
"loss": 3.5778,
"step": 431
},
{
"epoch": 0.3456,
"grad_norm": 3.6072661876678467,
"learning_rate": 3.665657812474812e-06,
"loss": 1.2145,
"step": 432
},
{
"epoch": 0.3464,
"grad_norm": 1.6257572174072266,
"learning_rate": 3.660095741569871e-06,
"loss": 3.7148,
"step": 433
},
{
"epoch": 0.3472,
"grad_norm": 4.151918411254883,
"learning_rate": 3.654526342869724e-06,
"loss": 1.3151,
"step": 434
},
{
"epoch": 0.348,
"grad_norm": 1.7173959016799927,
"learning_rate": 3.6489496515537204e-06,
"loss": 3.5563,
"step": 435
},
{
"epoch": 0.3488,
"grad_norm": 3.5843987464904785,
"learning_rate": 3.643365702847272e-06,
"loss": 1.1541,
"step": 436
},
{
"epoch": 0.3496,
"grad_norm": 1.2119823694229126,
"learning_rate": 3.6377745320216346e-06,
"loss": 3.6086,
"step": 437
},
{
"epoch": 0.3504,
"grad_norm": 4.704022407531738,
"learning_rate": 3.632176174393682e-06,
"loss": 1.5989,
"step": 438
},
{
"epoch": 0.3512,
"grad_norm": 1.3486601114273071,
"learning_rate": 3.6265706653256837e-06,
"loss": 3.6383,
"step": 439
},
{
"epoch": 0.352,
"grad_norm": 4.133458614349365,
"learning_rate": 3.6209580402250816e-06,
"loss": 1.2559,
"step": 440
},
{
"epoch": 0.3528,
"grad_norm": 1.3388392925262451,
"learning_rate": 3.615338334544265e-06,
"loss": 3.6902,
"step": 441
},
{
"epoch": 0.3536,
"grad_norm": 4.311944961547852,
"learning_rate": 3.6097115837803504e-06,
"loss": 1.1318,
"step": 442
},
{
"epoch": 0.3544,
"grad_norm": 1.4599226713180542,
"learning_rate": 3.604077823474954e-06,
"loss": 3.6407,
"step": 443
},
{
"epoch": 0.3552,
"grad_norm": 4.284412384033203,
"learning_rate": 3.5984370892139663e-06,
"loss": 1.4261,
"step": 444
},
{
"epoch": 0.356,
"grad_norm": 1.4893653392791748,
"learning_rate": 3.5927894166273324e-06,
"loss": 3.6037,
"step": 445
},
{
"epoch": 0.3568,
"grad_norm": 3.953293800354004,
"learning_rate": 3.5871348413888207e-06,
"loss": 1.2646,
"step": 446
},
{
"epoch": 0.3576,
"grad_norm": 1.2986643314361572,
"learning_rate": 3.5814733992158025e-06,
"loss": 3.5551,
"step": 447
},
{
"epoch": 0.3584,
"grad_norm": 4.767986297607422,
"learning_rate": 3.5758051258690223e-06,
"loss": 1.6051,
"step": 448
},
{
"epoch": 0.3592,
"grad_norm": 1.4707053899765015,
"learning_rate": 3.5701300571523757e-06,
"loss": 3.4898,
"step": 449
},
{
"epoch": 0.36,
"grad_norm": 4.075262546539307,
"learning_rate": 3.564448228912682e-06,
"loss": 1.0939,
"step": 450
},
{
"epoch": 0.3608,
"grad_norm": 1.6893370151519775,
"learning_rate": 3.558759677039455e-06,
"loss": 3.524,
"step": 451
},
{
"epoch": 0.3616,
"grad_norm": 4.155539035797119,
"learning_rate": 3.553064437464682e-06,
"loss": 1.3009,
"step": 452
},
{
"epoch": 0.3624,
"grad_norm": 1.3253870010375977,
"learning_rate": 3.5473625461625884e-06,
"loss": 3.5764,
"step": 453
},
{
"epoch": 0.3632,
"grad_norm": 4.075945854187012,
"learning_rate": 3.54165403914942e-06,
"loss": 1.2607,
"step": 454
},
{
"epoch": 0.364,
"grad_norm": 1.059866189956665,
"learning_rate": 3.535938952483211e-06,
"loss": 3.6742,
"step": 455
},
{
"epoch": 0.3648,
"grad_norm": 4.110774993896484,
"learning_rate": 3.5302173222635526e-06,
"loss": 1.4106,
"step": 456
},
{
"epoch": 0.3656,
"grad_norm": 1.3632076978683472,
"learning_rate": 3.5244891846313733e-06,
"loss": 3.6836,
"step": 457
},
{
"epoch": 0.3664,
"grad_norm": 3.705369472503662,
"learning_rate": 3.518754575768702e-06,
"loss": 1.3081,
"step": 458
},
{
"epoch": 0.3672,
"grad_norm": 1.1472023725509644,
"learning_rate": 3.5130135318984454e-06,
"loss": 3.6175,
"step": 459
},
{
"epoch": 0.368,
"grad_norm": 3.85665225982666,
"learning_rate": 3.507266089284157e-06,
"loss": 1.3936,
"step": 460
},
{
"epoch": 0.3688,
"grad_norm": 1.0957272052764893,
"learning_rate": 3.501512284229807e-06,
"loss": 3.6699,
"step": 461
},
{
"epoch": 0.3696,
"grad_norm": 5.635092735290527,
"learning_rate": 3.4957521530795576e-06,
"loss": 1.5143,
"step": 462
},
{
"epoch": 0.3704,
"grad_norm": 1.2065218687057495,
"learning_rate": 3.4899857322175252e-06,
"loss": 3.6554,
"step": 463
},
{
"epoch": 0.3712,
"grad_norm": 3.687448263168335,
"learning_rate": 3.484213058067559e-06,
"loss": 1.3567,
"step": 464
},
{
"epoch": 0.372,
"grad_norm": 1.4137887954711914,
"learning_rate": 3.4784341670930067e-06,
"loss": 3.5039,
"step": 465
},
{
"epoch": 0.3728,
"grad_norm": 3.735736131668091,
"learning_rate": 3.4726490957964836e-06,
"loss": 1.1562,
"step": 466
},
{
"epoch": 0.3736,
"grad_norm": 1.429471731185913,
"learning_rate": 3.466857880719645e-06,
"loss": 3.4816,
"step": 467
},
{
"epoch": 0.3744,
"grad_norm": 3.8104074001312256,
"learning_rate": 3.4610605584429526e-06,
"loss": 1.2771,
"step": 468
},
{
"epoch": 0.3752,
"grad_norm": 1.0887689590454102,
"learning_rate": 3.455257165585444e-06,
"loss": 3.6168,
"step": 469
},
{
"epoch": 0.376,
"grad_norm": 4.246683120727539,
"learning_rate": 3.4494477388045035e-06,
"loss": 1.4563,
"step": 470
},
{
"epoch": 0.3768,
"grad_norm": 1.181482195854187,
"learning_rate": 3.443632314795627e-06,
"loss": 3.5803,
"step": 471
},
{
"epoch": 0.3776,
"grad_norm": 4.463985443115234,
"learning_rate": 3.4378109302921946e-06,
"loss": 1.3947,
"step": 472
},
{
"epoch": 0.3784,
"grad_norm": 2.0847549438476562,
"learning_rate": 3.4319836220652334e-06,
"loss": 3.5447,
"step": 473
},
{
"epoch": 0.3792,
"grad_norm": 3.957758903503418,
"learning_rate": 3.4261504269231904e-06,
"loss": 1.3876,
"step": 474
},
{
"epoch": 0.38,
"grad_norm": 1.2002718448638916,
"learning_rate": 3.4203113817116955e-06,
"loss": 3.6171,
"step": 475
},
{
"epoch": 0.3808,
"grad_norm": 3.7537636756896973,
"learning_rate": 3.4144665233133318e-06,
"loss": 1.3785,
"step": 476
},
{
"epoch": 0.3816,
"grad_norm": 1.081315517425537,
"learning_rate": 3.408615888647402e-06,
"loss": 3.6535,
"step": 477
},
{
"epoch": 0.3824,
"grad_norm": 4.511240005493164,
"learning_rate": 3.402759514669694e-06,
"loss": 1.5004,
"step": 478
},
{
"epoch": 0.3832,
"grad_norm": 1.60770845413208,
"learning_rate": 3.3968974383722497e-06,
"loss": 3.6355,
"step": 479
},
{
"epoch": 0.384,
"grad_norm": 4.516547679901123,
"learning_rate": 3.391029696783127e-06,
"loss": 1.2093,
"step": 480
},
{
"epoch": 0.3848,
"grad_norm": 1.8860230445861816,
"learning_rate": 3.385156326966173e-06,
"loss": 3.5089,
"step": 481
},
{
"epoch": 0.3856,
"grad_norm": 4.554468631744385,
"learning_rate": 3.379277366020782e-06,
"loss": 1.477,
"step": 482
},
{
"epoch": 0.3864,
"grad_norm": 1.258987307548523,
"learning_rate": 3.3733928510816677e-06,
"loss": 3.583,
"step": 483
},
{
"epoch": 0.3872,
"grad_norm": 4.783546447753906,
"learning_rate": 3.3675028193186243e-06,
"loss": 1.5192,
"step": 484
},
{
"epoch": 0.388,
"grad_norm": 1.0193849802017212,
"learning_rate": 3.3616073079362925e-06,
"loss": 3.629,
"step": 485
},
{
"epoch": 0.3888,
"grad_norm": 4.146661758422852,
"learning_rate": 3.3557063541739283e-06,
"loss": 1.2621,
"step": 486
},
{
"epoch": 0.3896,
"grad_norm": 1.25571608543396,
"learning_rate": 3.349799995305162e-06,
"loss": 3.5985,
"step": 487
},
{
"epoch": 0.3904,
"grad_norm": 4.230064868927002,
"learning_rate": 3.343888268637765e-06,
"loss": 1.232,
"step": 488
},
{
"epoch": 0.3912,
"grad_norm": 1.292047142982483,
"learning_rate": 3.337971211513417e-06,
"loss": 3.587,
"step": 489
},
{
"epoch": 0.392,
"grad_norm": 4.458502769470215,
"learning_rate": 3.332048861307467e-06,
"loss": 1.5272,
"step": 490
},
{
"epoch": 0.3928,
"grad_norm": 1.4470558166503906,
"learning_rate": 3.3261212554286977e-06,
"loss": 3.617,
"step": 491
},
{
"epoch": 0.3936,
"grad_norm": 3.8012030124664307,
"learning_rate": 3.320188431319088e-06,
"loss": 1.2316,
"step": 492
},
{
"epoch": 0.3944,
"grad_norm": 1.446913242340088,
"learning_rate": 3.3142504264535808e-06,
"loss": 3.6562,
"step": 493
},
{
"epoch": 0.3952,
"grad_norm": 4.147583961486816,
"learning_rate": 3.308307278339842e-06,
"loss": 1.3471,
"step": 494
},
{
"epoch": 0.396,
"grad_norm": 1.4276149272918701,
"learning_rate": 3.3023590245180237e-06,
"loss": 3.5495,
"step": 495
},
{
"epoch": 0.3968,
"grad_norm": 3.8174455165863037,
"learning_rate": 3.296405702560532e-06,
"loss": 1.0808,
"step": 496
},
{
"epoch": 0.3976,
"grad_norm": 1.4224337339401245,
"learning_rate": 3.2904473500717826e-06,
"loss": 3.5136,
"step": 497
},
{
"epoch": 0.3984,
"grad_norm": 4.157987117767334,
"learning_rate": 3.284484004687969e-06,
"loss": 1.3679,
"step": 498
},
{
"epoch": 0.3992,
"grad_norm": 1.2928471565246582,
"learning_rate": 3.278515704076821e-06,
"loss": 3.6342,
"step": 499
},
{
"epoch": 0.4,
"grad_norm": 4.097792625427246,
"learning_rate": 3.272542485937369e-06,
"loss": 1.3664,
"step": 500
},
{
"epoch": 0.4008,
"grad_norm": 1.1602492332458496,
"learning_rate": 3.2665643879997054e-06,
"loss": 3.6839,
"step": 501
},
{
"epoch": 0.4016,
"grad_norm": 3.862520456314087,
"learning_rate": 3.2605814480247454e-06,
"loss": 1.4261,
"step": 502
},
{
"epoch": 0.4024,
"grad_norm": 1.335418462753296,
"learning_rate": 3.2545937038039904e-06,
"loss": 3.599,
"step": 503
},
{
"epoch": 0.4032,
"grad_norm": 4.205375671386719,
"learning_rate": 3.2486011931592863e-06,
"loss": 1.5577,
"step": 504
},
{
"epoch": 0.404,
"grad_norm": 1.6254982948303223,
"learning_rate": 3.2426039539425875e-06,
"loss": 3.4938,
"step": 505
},
{
"epoch": 0.4048,
"grad_norm": 4.060510158538818,
"learning_rate": 3.2366020240357166e-06,
"loss": 1.3317,
"step": 506
},
{
"epoch": 0.4056,
"grad_norm": 1.3750642538070679,
"learning_rate": 3.2305954413501252e-06,
"loss": 3.5692,
"step": 507
},
{
"epoch": 0.4064,
"grad_norm": 4.146080017089844,
"learning_rate": 3.2245842438266526e-06,
"loss": 1.1754,
"step": 508
},
{
"epoch": 0.4072,
"grad_norm": 1.4431229829788208,
"learning_rate": 3.2185684694352913e-06,
"loss": 3.4761,
"step": 509
},
{
"epoch": 0.408,
"grad_norm": 3.423323392868042,
"learning_rate": 3.2125481561749406e-06,
"loss": 1.2221,
"step": 510
},
{
"epoch": 0.4088,
"grad_norm": 1.5508882999420166,
"learning_rate": 3.2065233420731717e-06,
"loss": 3.6483,
"step": 511
},
{
"epoch": 0.4096,
"grad_norm": 3.5361711978912354,
"learning_rate": 3.2004940651859844e-06,
"loss": 1.1119,
"step": 512
},
{
"epoch": 0.4104,
"grad_norm": 1.326869010925293,
"learning_rate": 3.194460363597569e-06,
"loss": 3.5423,
"step": 513
},
{
"epoch": 0.4112,
"grad_norm": 4.03769588470459,
"learning_rate": 3.188422275420063e-06,
"loss": 1.4117,
"step": 514
},
{
"epoch": 0.412,
"grad_norm": 1.3623450994491577,
"learning_rate": 3.1823798387933134e-06,
"loss": 3.498,
"step": 515
},
{
"epoch": 0.4128,
"grad_norm": 4.137259483337402,
"learning_rate": 3.1763330918846347e-06,
"loss": 1.2982,
"step": 516
},
{
"epoch": 0.4136,
"grad_norm": 1.067256212234497,
"learning_rate": 3.1702820728885657e-06,
"loss": 3.7067,
"step": 517
},
{
"epoch": 0.4144,
"grad_norm": 4.063728332519531,
"learning_rate": 3.164226820026632e-06,
"loss": 1.3187,
"step": 518
},
{
"epoch": 0.4152,
"grad_norm": 1.2824773788452148,
"learning_rate": 3.1581673715471007e-06,
"loss": 3.5527,
"step": 519
},
{
"epoch": 0.416,
"grad_norm": 3.7093420028686523,
"learning_rate": 3.152103765724743e-06,
"loss": 1.1281,
"step": 520
},
{
"epoch": 0.4168,
"grad_norm": 1.288455843925476,
"learning_rate": 3.1460360408605866e-06,
"loss": 3.5115,
"step": 521
},
{
"epoch": 0.4176,
"grad_norm": 4.3098063468933105,
"learning_rate": 3.1399642352816825e-06,
"loss": 1.3113,
"step": 522
},
{
"epoch": 0.4184,
"grad_norm": 1.1683874130249023,
"learning_rate": 3.1338883873408517e-06,
"loss": 3.6437,
"step": 523
},
{
"epoch": 0.4192,
"grad_norm": 4.025966167449951,
"learning_rate": 3.127808535416454e-06,
"loss": 1.2751,
"step": 524
},
{
"epoch": 0.42,
"grad_norm": 1.7916266918182373,
"learning_rate": 3.121724717912138e-06,
"loss": 3.5067,
"step": 525
},
{
"epoch": 0.4208,
"grad_norm": 4.328076362609863,
"learning_rate": 3.1156369732566006e-06,
"loss": 1.6473,
"step": 526
},
{
"epoch": 0.4216,
"grad_norm": 1.400840163230896,
"learning_rate": 3.109545339903347e-06,
"loss": 3.5727,
"step": 527
},
{
"epoch": 0.4224,
"grad_norm": 3.689484119415283,
"learning_rate": 3.1034498563304435e-06,
"loss": 1.3867,
"step": 528
},
{
"epoch": 0.4232,
"grad_norm": 1.0594552755355835,
"learning_rate": 3.0973505610402767e-06,
"loss": 3.7167,
"step": 529
},
{
"epoch": 0.424,
"grad_norm": 4.328317642211914,
"learning_rate": 3.0912474925593124e-06,
"loss": 1.5036,
"step": 530
},
{
"epoch": 0.4248,
"grad_norm": 1.1060447692871094,
"learning_rate": 3.085140689437846e-06,
"loss": 3.6933,
"step": 531
},
{
"epoch": 0.4256,
"grad_norm": 4.118087291717529,
"learning_rate": 3.0790301902497664e-06,
"loss": 1.3451,
"step": 532
},
{
"epoch": 0.4264,
"grad_norm": 1.254740595817566,
"learning_rate": 3.072916033592307e-06,
"loss": 3.5871,
"step": 533
},
{
"epoch": 0.4272,
"grad_norm": 4.144657611846924,
"learning_rate": 3.0667982580858047e-06,
"loss": 1.4215,
"step": 534
},
{
"epoch": 0.428,
"grad_norm": 1.1598517894744873,
"learning_rate": 3.0606769023734535e-06,
"loss": 3.6583,
"step": 535
},
{
"epoch": 0.4288,
"grad_norm": 4.24267578125,
"learning_rate": 3.0545520051210637e-06,
"loss": 1.2563,
"step": 536
},
{
"epoch": 0.4296,
"grad_norm": 1.5326381921768188,
"learning_rate": 3.048423605016815e-06,
"loss": 3.5047,
"step": 537
},
{
"epoch": 0.4304,
"grad_norm": 4.730625629425049,
"learning_rate": 3.042291740771014e-06,
"loss": 1.3603,
"step": 538
},
{
"epoch": 0.4312,
"grad_norm": 1.132880687713623,
"learning_rate": 3.036156451115846e-06,
"loss": 3.6709,
"step": 539
},
{
"epoch": 0.432,
"grad_norm": 3.7942590713500977,
"learning_rate": 3.0300177748051375e-06,
"loss": 1.3794,
"step": 540
},
{
"epoch": 0.4328,
"grad_norm": 1.4315778017044067,
"learning_rate": 3.0238757506141013e-06,
"loss": 3.5769,
"step": 541
},
{
"epoch": 0.4336,
"grad_norm": 3.5602166652679443,
"learning_rate": 3.0177304173391038e-06,
"loss": 1.2704,
"step": 542
},
{
"epoch": 0.4344,
"grad_norm": 1.1675716638565063,
"learning_rate": 3.0115818137974066e-06,
"loss": 3.5886,
"step": 543
},
{
"epoch": 0.4352,
"grad_norm": 4.345582962036133,
"learning_rate": 3.0054299788269343e-06,
"loss": 1.4216,
"step": 544
},
{
"epoch": 0.436,
"grad_norm": 1.762725591659546,
"learning_rate": 2.9992749512860177e-06,
"loss": 3.4446,
"step": 545
},
{
"epoch": 0.4368,
"grad_norm": 3.993100643157959,
"learning_rate": 2.9931167700531575e-06,
"loss": 1.343,
"step": 546
},
{
"epoch": 0.4376,
"grad_norm": 1.2319386005401611,
"learning_rate": 2.9869554740267726e-06,
"loss": 3.603,
"step": 547
},
{
"epoch": 0.4384,
"grad_norm": 4.317058086395264,
"learning_rate": 2.9807911021249573e-06,
"loss": 1.4564,
"step": 548
},
{
"epoch": 0.4392,
"grad_norm": 1.6317486763000488,
"learning_rate": 2.9746236932852355e-06,
"loss": 3.5411,
"step": 549
},
{
"epoch": 0.44,
"grad_norm": 3.8238189220428467,
"learning_rate": 2.9684532864643123e-06,
"loss": 1.1421,
"step": 550
},
{
"epoch": 0.4408,
"grad_norm": 1.9044779539108276,
"learning_rate": 2.9622799206378306e-06,
"loss": 3.6848,
"step": 551
},
{
"epoch": 0.4416,
"grad_norm": 3.827505588531494,
"learning_rate": 2.956103634800126e-06,
"loss": 1.3386,
"step": 552
},
{
"epoch": 0.4424,
"grad_norm": 1.3661056756973267,
"learning_rate": 2.949924467963975e-06,
"loss": 3.4422,
"step": 553
},
{
"epoch": 0.4432,
"grad_norm": 4.082735538482666,
"learning_rate": 2.943742459160354e-06,
"loss": 1.3541,
"step": 554
},
{
"epoch": 0.444,
"grad_norm": 1.28450345993042,
"learning_rate": 2.9375576474381907e-06,
"loss": 3.5994,
"step": 555
},
{
"epoch": 0.4448,
"grad_norm": 3.4685943126678467,
"learning_rate": 2.9313700718641167e-06,
"loss": 1.4483,
"step": 556
},
{
"epoch": 0.4456,
"grad_norm": 1.7730368375778198,
"learning_rate": 2.925179771522223e-06,
"loss": 3.6276,
"step": 557
},
{
"epoch": 0.4464,
"grad_norm": 3.9150004386901855,
"learning_rate": 2.9189867855138103e-06,
"loss": 1.3486,
"step": 558
},
{
"epoch": 0.4472,
"grad_norm": 1.5707478523254395,
"learning_rate": 2.912791152957145e-06,
"loss": 3.5531,
"step": 559
},
{
"epoch": 0.448,
"grad_norm": 4.4283766746521,
"learning_rate": 2.9065929129872097e-06,
"loss": 1.4254,
"step": 560
},
{
"epoch": 0.4488,
"grad_norm": 1.5481115579605103,
"learning_rate": 2.900392104755455e-06,
"loss": 3.4633,
"step": 561
},
{
"epoch": 0.4496,
"grad_norm": 3.5355985164642334,
"learning_rate": 2.8941887674295573e-06,
"loss": 1.3703,
"step": 562
},
{
"epoch": 0.4504,
"grad_norm": 1.2419151067733765,
"learning_rate": 2.887982940193165e-06,
"loss": 3.6656,
"step": 563
},
{
"epoch": 0.4512,
"grad_norm": 4.397960186004639,
"learning_rate": 2.8817746622456585e-06,
"loss": 1.338,
"step": 564
},
{
"epoch": 0.452,
"grad_norm": 1.4676947593688965,
"learning_rate": 2.875563972801893e-06,
"loss": 3.6548,
"step": 565
},
{
"epoch": 0.4528,
"grad_norm": 4.111155033111572,
"learning_rate": 2.8693509110919597e-06,
"loss": 1.3694,
"step": 566
},
{
"epoch": 0.4536,
"grad_norm": 1.7541122436523438,
"learning_rate": 2.863135516360932e-06,
"loss": 3.4508,
"step": 567
},
{
"epoch": 0.4544,
"grad_norm": 4.085772514343262,
"learning_rate": 2.8569178278686222e-06,
"loss": 1.3314,
"step": 568
},
{
"epoch": 0.4552,
"grad_norm": 1.2001174688339233,
"learning_rate": 2.85069788488933e-06,
"loss": 3.5885,
"step": 569
},
{
"epoch": 0.456,
"grad_norm": 4.38803768157959,
"learning_rate": 2.844475726711595e-06,
"loss": 1.1816,
"step": 570
},
{
"epoch": 0.4568,
"grad_norm": 1.2394533157348633,
"learning_rate": 2.8382513926379508e-06,
"loss": 3.6019,
"step": 571
},
{
"epoch": 0.4576,
"grad_norm": 4.420421600341797,
"learning_rate": 2.832024921984674e-06,
"loss": 1.4351,
"step": 572
},
{
"epoch": 0.4584,
"grad_norm": 1.2522428035736084,
"learning_rate": 2.825796354081537e-06,
"loss": 3.6141,
"step": 573
},
{
"epoch": 0.4592,
"grad_norm": 4.002085208892822,
"learning_rate": 2.8195657282715595e-06,
"loss": 1.1009,
"step": 574
},
{
"epoch": 0.46,
"grad_norm": 1.433961272239685,
"learning_rate": 2.813333083910761e-06,
"loss": 3.6517,
"step": 575
},
{
"epoch": 0.4608,
"grad_norm": 4.165874004364014,
"learning_rate": 2.807098460367911e-06,
"loss": 1.3473,
"step": 576
},
{
"epoch": 0.4616,
"grad_norm": 1.468865990638733,
"learning_rate": 2.800861897024279e-06,
"loss": 3.6747,
"step": 577
},
{
"epoch": 0.4624,
"grad_norm": 4.306812286376953,
"learning_rate": 2.79462343327339e-06,
"loss": 1.416,
"step": 578
},
{
"epoch": 0.4632,
"grad_norm": 1.0383753776550293,
"learning_rate": 2.7883831085207707e-06,
"loss": 3.575,
"step": 579
},
{
"epoch": 0.464,
"grad_norm": 4.186305999755859,
"learning_rate": 2.7821409621837042e-06,
"loss": 1.5874,
"step": 580
},
{
"epoch": 0.4648,
"grad_norm": 1.3052856922149658,
"learning_rate": 2.7758970336909795e-06,
"loss": 3.6154,
"step": 581
},
{
"epoch": 0.4656,
"grad_norm": 3.598694324493408,
"learning_rate": 2.7696513624826422e-06,
"loss": 1.2231,
"step": 582
},
{
"epoch": 0.4664,
"grad_norm": 1.3978124856948853,
"learning_rate": 2.763403988009746e-06,
"loss": 3.5403,
"step": 583
},
{
"epoch": 0.4672,
"grad_norm": 3.618967056274414,
"learning_rate": 2.7571549497341044e-06,
"loss": 1.29,
"step": 584
},
{
"epoch": 0.468,
"grad_norm": 1.4016177654266357,
"learning_rate": 2.7509042871280373e-06,
"loss": 3.6256,
"step": 585
},
{
"epoch": 0.4688,
"grad_norm": 3.9204423427581787,
"learning_rate": 2.7446520396741293e-06,
"loss": 1.4597,
"step": 586
},
{
"epoch": 0.4696,
"grad_norm": 1.4617024660110474,
"learning_rate": 2.7383982468649715e-06,
"loss": 3.482,
"step": 587
},
{
"epoch": 0.4704,
"grad_norm": 4.012588024139404,
"learning_rate": 2.73214294820292e-06,
"loss": 1.2928,
"step": 588
},
{
"epoch": 0.4712,
"grad_norm": 1.4617540836334229,
"learning_rate": 2.725886183199839e-06,
"loss": 3.626,
"step": 589
},
{
"epoch": 0.472,
"grad_norm": 3.5914876461029053,
"learning_rate": 2.7196279913768587e-06,
"loss": 1.3148,
"step": 590
},
{
"epoch": 0.4728,
"grad_norm": 1.4136903285980225,
"learning_rate": 2.713368412264118e-06,
"loss": 3.5289,
"step": 591
},
{
"epoch": 0.4736,
"grad_norm": 3.7139124870300293,
"learning_rate": 2.7071074854005206e-06,
"loss": 1.3292,
"step": 592
},
{
"epoch": 0.4744,
"grad_norm": 1.2121789455413818,
"learning_rate": 2.700845250333486e-06,
"loss": 3.6458,
"step": 593
},
{
"epoch": 0.4752,
"grad_norm": 4.53924036026001,
"learning_rate": 2.694581746618691e-06,
"loss": 1.3469,
"step": 594
},
{
"epoch": 0.476,
"grad_norm": 1.2464954853057861,
"learning_rate": 2.688317013819832e-06,
"loss": 3.5712,
"step": 595
},
{
"epoch": 0.4768,
"grad_norm": 3.8551762104034424,
"learning_rate": 2.682051091508365e-06,
"loss": 1.3476,
"step": 596
},
{
"epoch": 0.4776,
"grad_norm": 1.2209997177124023,
"learning_rate": 2.67578401926326e-06,
"loss": 3.6444,
"step": 597
},
{
"epoch": 0.4784,
"grad_norm": 4.334421634674072,
"learning_rate": 2.6695158366707526e-06,
"loss": 1.4771,
"step": 598
},
{
"epoch": 0.4792,
"grad_norm": 1.5928137302398682,
"learning_rate": 2.6632465833240895e-06,
"loss": 3.4254,
"step": 599
},
{
"epoch": 0.48,
"grad_norm": 3.963142156600952,
"learning_rate": 2.6569762988232838e-06,
"loss": 1.3901,
"step": 600
},
{
"epoch": 0.4808,
"grad_norm": 1.2593353986740112,
"learning_rate": 2.6507050227748595e-06,
"loss": 3.5619,
"step": 601
},
{
"epoch": 0.4816,
"grad_norm": 4.629072189331055,
"learning_rate": 2.6444327947916037e-06,
"loss": 1.5413,
"step": 602
},
{
"epoch": 0.4824,
"grad_norm": 1.2204415798187256,
"learning_rate": 2.6381596544923184e-06,
"loss": 3.6041,
"step": 603
},
{
"epoch": 0.4832,
"grad_norm": 4.39404821395874,
"learning_rate": 2.6318856415015664e-06,
"loss": 1.1507,
"step": 604
},
{
"epoch": 0.484,
"grad_norm": 1.2167773246765137,
"learning_rate": 2.625610795449424e-06,
"loss": 3.5377,
"step": 605
},
{
"epoch": 0.4848,
"grad_norm": 4.067314624786377,
"learning_rate": 2.6193351559712294e-06,
"loss": 1.3543,
"step": 606
},
{
"epoch": 0.4856,
"grad_norm": 1.054069995880127,
"learning_rate": 2.6130587627073315e-06,
"loss": 3.678,
"step": 607
},
{
"epoch": 0.4864,
"grad_norm": 4.561433792114258,
"learning_rate": 2.606781655302843e-06,
"loss": 1.5264,
"step": 608
},
{
"epoch": 0.4872,
"grad_norm": 1.6582963466644287,
"learning_rate": 2.6005038734073833e-06,
"loss": 3.4737,
"step": 609
},
{
"epoch": 0.488,
"grad_norm": 4.4807233810424805,
"learning_rate": 2.594225456674837e-06,
"loss": 1.5468,
"step": 610
},
{
"epoch": 0.4888,
"grad_norm": 1.4780353307724,
"learning_rate": 2.5879464447630947e-06,
"loss": 3.6692,
"step": 611
},
{
"epoch": 0.4896,
"grad_norm": 4.209949493408203,
"learning_rate": 2.58166687733381e-06,
"loss": 1.2275,
"step": 612
},
{
"epoch": 0.4904,
"grad_norm": 1.4267958402633667,
"learning_rate": 2.575386794052142e-06,
"loss": 3.4531,
"step": 613
},
{
"epoch": 0.4912,
"grad_norm": 3.8919217586517334,
"learning_rate": 2.569106234586511e-06,
"loss": 1.3178,
"step": 614
},
{
"epoch": 0.492,
"grad_norm": 1.4168897867202759,
"learning_rate": 2.5628252386083443e-06,
"loss": 3.4955,
"step": 615
},
{
"epoch": 0.4928,
"grad_norm": 3.9594831466674805,
"learning_rate": 2.5565438457918247e-06,
"loss": 1.3968,
"step": 616
},
{
"epoch": 0.4936,
"grad_norm": 1.1420923471450806,
"learning_rate": 2.5502620958136444e-06,
"loss": 3.6264,
"step": 617
},
{
"epoch": 0.4944,
"grad_norm": 4.060093402862549,
"learning_rate": 2.5439800283527495e-06,
"loss": 1.3898,
"step": 618
},
{
"epoch": 0.4952,
"grad_norm": 1.4885039329528809,
"learning_rate": 2.537697683090093e-06,
"loss": 3.492,
"step": 619
},
{
"epoch": 0.496,
"grad_norm": 4.163914203643799,
"learning_rate": 2.531415099708382e-06,
"loss": 1.1859,
"step": 620
},
{
"epoch": 0.4968,
"grad_norm": 1.1269545555114746,
"learning_rate": 2.525132317891827e-06,
"loss": 3.5954,
"step": 621
},
{
"epoch": 0.4976,
"grad_norm": 4.090238571166992,
"learning_rate": 2.518849377325893e-06,
"loss": 1.3966,
"step": 622
},
{
"epoch": 0.4984,
"grad_norm": 1.5226904153823853,
"learning_rate": 2.5125663176970475e-06,
"loss": 3.6323,
"step": 623
},
{
"epoch": 0.4992,
"grad_norm": 3.7972140312194824,
"learning_rate": 2.5062831786925102e-06,
"loss": 1.39,
"step": 624
},
{
"epoch": 0.5,
"grad_norm": 1.4045028686523438,
"learning_rate": 2.5e-06,
"loss": 3.5625,
"step": 625
},
{
"epoch": 0.5008,
"grad_norm": 3.8131749629974365,
"learning_rate": 2.4937168213074906e-06,
"loss": 1.2028,
"step": 626
},
{
"epoch": 0.5016,
"grad_norm": 2.0082039833068848,
"learning_rate": 2.487433682302953e-06,
"loss": 3.3618,
"step": 627
},
{
"epoch": 0.5024,
"grad_norm": 4.199687957763672,
"learning_rate": 2.4811506226741077e-06,
"loss": 1.2716,
"step": 628
},
{
"epoch": 0.5032,
"grad_norm": 1.1121747493743896,
"learning_rate": 2.474867682108174e-06,
"loss": 3.5795,
"step": 629
},
{
"epoch": 0.504,
"grad_norm": 3.8342783451080322,
"learning_rate": 2.4685849002916184e-06,
"loss": 1.2034,
"step": 630
},
{
"epoch": 0.5048,
"grad_norm": 1.6797664165496826,
"learning_rate": 2.4623023169099074e-06,
"loss": 3.5073,
"step": 631
},
{
"epoch": 0.5056,
"grad_norm": 5.292508125305176,
"learning_rate": 2.456019971647251e-06,
"loss": 1.3187,
"step": 632
},
{
"epoch": 0.5064,
"grad_norm": 1.0428590774536133,
"learning_rate": 2.449737904186357e-06,
"loss": 3.6168,
"step": 633
},
{
"epoch": 0.5072,
"grad_norm": 3.81816029548645,
"learning_rate": 2.4434561542081765e-06,
"loss": 1.3212,
"step": 634
},
{
"epoch": 0.508,
"grad_norm": 1.0982403755187988,
"learning_rate": 2.4371747613916566e-06,
"loss": 3.6012,
"step": 635
},
{
"epoch": 0.5088,
"grad_norm": 4.740167617797852,
"learning_rate": 2.4308937654134893e-06,
"loss": 1.3399,
"step": 636
},
{
"epoch": 0.5096,
"grad_norm": 1.26600980758667,
"learning_rate": 2.4246132059478582e-06,
"loss": 3.5275,
"step": 637
},
{
"epoch": 0.5104,
"grad_norm": 4.418180465698242,
"learning_rate": 2.4183331226661913e-06,
"loss": 1.4019,
"step": 638
},
{
"epoch": 0.5112,
"grad_norm": 2.0348660945892334,
"learning_rate": 2.4120535552369057e-06,
"loss": 3.5616,
"step": 639
},
{
"epoch": 0.512,
"grad_norm": 3.7417869567871094,
"learning_rate": 2.4057745433251637e-06,
"loss": 1.3269,
"step": 640
},
{
"epoch": 0.5128,
"grad_norm": 1.818655252456665,
"learning_rate": 2.3994961265926166e-06,
"loss": 3.5734,
"step": 641
},
{
"epoch": 0.5136,
"grad_norm": 3.8714828491210938,
"learning_rate": 2.3932183446971584e-06,
"loss": 1.3336,
"step": 642
},
{
"epoch": 0.5144,
"grad_norm": 1.1985024213790894,
"learning_rate": 2.386941237292669e-06,
"loss": 3.5905,
"step": 643
},
{
"epoch": 0.5152,
"grad_norm": 3.901711940765381,
"learning_rate": 2.3806648440287715e-06,
"loss": 1.1541,
"step": 644
},
{
"epoch": 0.516,
"grad_norm": 1.3076053857803345,
"learning_rate": 2.3743892045505764e-06,
"loss": 3.6319,
"step": 645
},
{
"epoch": 0.5168,
"grad_norm": 3.9768855571746826,
"learning_rate": 2.368114358498434e-06,
"loss": 1.5297,
"step": 646
},
{
"epoch": 0.5176,
"grad_norm": 1.135161280632019,
"learning_rate": 2.361840345507683e-06,
"loss": 3.6021,
"step": 647
},
{
"epoch": 0.5184,
"grad_norm": 3.6397156715393066,
"learning_rate": 2.355567205208397e-06,
"loss": 1.3282,
"step": 648
},
{
"epoch": 0.5192,
"grad_norm": 1.3913445472717285,
"learning_rate": 2.3492949772251418e-06,
"loss": 3.4597,
"step": 649
},
{
"epoch": 0.52,
"grad_norm": 3.9108190536499023,
"learning_rate": 2.3430237011767166e-06,
"loss": 1.0836,
"step": 650
},
{
"epoch": 0.5208,
"grad_norm": 1.6176162958145142,
"learning_rate": 2.3367534166759105e-06,
"loss": 3.5934,
"step": 651
},
{
"epoch": 0.5216,
"grad_norm": 3.639057159423828,
"learning_rate": 2.3304841633292487e-06,
"loss": 1.2418,
"step": 652
},
{
"epoch": 0.5224,
"grad_norm": 1.5021276473999023,
"learning_rate": 2.324215980736741e-06,
"loss": 3.4284,
"step": 653
},
{
"epoch": 0.5232,
"grad_norm": 5.434640407562256,
"learning_rate": 2.317948908491636e-06,
"loss": 1.3802,
"step": 654
},
{
"epoch": 0.524,
"grad_norm": 1.7329832315444946,
"learning_rate": 2.3116829861801687e-06,
"loss": 3.4577,
"step": 655
},
{
"epoch": 0.5248,
"grad_norm": 3.633262872695923,
"learning_rate": 2.305418253381309e-06,
"loss": 1.1311,
"step": 656
},
{
"epoch": 0.5256,
"grad_norm": 1.2898222208023071,
"learning_rate": 2.299154749666515e-06,
"loss": 3.5833,
"step": 657
},
{
"epoch": 0.5264,
"grad_norm": 3.3343076705932617,
"learning_rate": 2.2928925145994798e-06,
"loss": 1.2565,
"step": 658
},
{
"epoch": 0.5272,
"grad_norm": 1.1492732763290405,
"learning_rate": 2.286631587735883e-06,
"loss": 3.6572,
"step": 659
},
{
"epoch": 0.528,
"grad_norm": 4.284005165100098,
"learning_rate": 2.280372008623142e-06,
"loss": 1.4464,
"step": 660
},
{
"epoch": 0.5288,
"grad_norm": 1.7030223608016968,
"learning_rate": 2.274113816800161e-06,
"loss": 3.4687,
"step": 661
},
{
"epoch": 0.5296,
"grad_norm": 4.307010650634766,
"learning_rate": 2.267857051797081e-06,
"loss": 1.3294,
"step": 662
},
{
"epoch": 0.5304,
"grad_norm": 1.5467772483825684,
"learning_rate": 2.261601753135029e-06,
"loss": 3.5568,
"step": 663
},
{
"epoch": 0.5312,
"grad_norm": 3.650076150894165,
"learning_rate": 2.255347960325871e-06,
"loss": 1.3358,
"step": 664
},
{
"epoch": 0.532,
"grad_norm": 1.5734375715255737,
"learning_rate": 2.2490957128719627e-06,
"loss": 3.4565,
"step": 665
},
{
"epoch": 0.5328,
"grad_norm": 3.6878743171691895,
"learning_rate": 2.2428450502658964e-06,
"loss": 1.1379,
"step": 666
},
{
"epoch": 0.5336,
"grad_norm": 1.115048885345459,
"learning_rate": 2.2365960119902543e-06,
"loss": 3.6159,
"step": 667
},
{
"epoch": 0.5344,
"grad_norm": 4.451643943786621,
"learning_rate": 2.2303486375173586e-06,
"loss": 1.3798,
"step": 668
},
{
"epoch": 0.5352,
"grad_norm": 1.2209587097167969,
"learning_rate": 2.224102966309021e-06,
"loss": 3.5913,
"step": 669
},
{
"epoch": 0.536,
"grad_norm": 3.687743663787842,
"learning_rate": 2.2178590378162957e-06,
"loss": 1.2116,
"step": 670
},
{
"epoch": 0.5368,
"grad_norm": 1.4728742837905884,
"learning_rate": 2.2116168914792293e-06,
"loss": 3.5415,
"step": 671
},
{
"epoch": 0.5376,
"grad_norm": 3.96630859375,
"learning_rate": 2.205376566726611e-06,
"loss": 1.3889,
"step": 672
},
{
"epoch": 0.5384,
"grad_norm": 1.215154767036438,
"learning_rate": 2.1991381029757216e-06,
"loss": 3.5867,
"step": 673
},
{
"epoch": 0.5392,
"grad_norm": 3.8956687450408936,
"learning_rate": 2.19290153963209e-06,
"loss": 1.5616,
"step": 674
},
{
"epoch": 0.54,
"grad_norm": 1.183532476425171,
"learning_rate": 2.186666916089239e-06,
"loss": 3.5136,
"step": 675
},
{
"epoch": 0.5408,
"grad_norm": 3.5824153423309326,
"learning_rate": 2.1804342717284414e-06,
"loss": 1.2544,
"step": 676
},
{
"epoch": 0.5416,
"grad_norm": 1.325810432434082,
"learning_rate": 2.174203645918464e-06,
"loss": 3.5406,
"step": 677
},
{
"epoch": 0.5424,
"grad_norm": 3.4541144371032715,
"learning_rate": 2.1679750780153265e-06,
"loss": 1.3576,
"step": 678
},
{
"epoch": 0.5432,
"grad_norm": 1.5813454389572144,
"learning_rate": 2.1617486073620496e-06,
"loss": 3.4813,
"step": 679
},
{
"epoch": 0.544,
"grad_norm": 3.9602949619293213,
"learning_rate": 2.155524273288405e-06,
"loss": 1.426,
"step": 680
},
{
"epoch": 0.5448,
"grad_norm": 1.4534196853637695,
"learning_rate": 2.1493021151106704e-06,
"loss": 3.5585,
"step": 681
},
{
"epoch": 0.5456,
"grad_norm": 3.9135422706604004,
"learning_rate": 2.143082172131378e-06,
"loss": 1.3641,
"step": 682
},
{
"epoch": 0.5464,
"grad_norm": 1.6020511388778687,
"learning_rate": 2.1368644836390684e-06,
"loss": 3.5024,
"step": 683
},
{
"epoch": 0.5472,
"grad_norm": 4.677028179168701,
"learning_rate": 2.130649088908041e-06,
"loss": 1.366,
"step": 684
},
{
"epoch": 0.548,
"grad_norm": 1.4928466081619263,
"learning_rate": 2.1244360271981073e-06,
"loss": 3.5495,
"step": 685
},
{
"epoch": 0.5488,
"grad_norm": 4.278928279876709,
"learning_rate": 2.1182253377543428e-06,
"loss": 1.3534,
"step": 686
},
{
"epoch": 0.5496,
"grad_norm": 1.3462296724319458,
"learning_rate": 2.1120170598068353e-06,
"loss": 3.6396,
"step": 687
},
{
"epoch": 0.5504,
"grad_norm": 5.2212653160095215,
"learning_rate": 2.1058112325704436e-06,
"loss": 1.3357,
"step": 688
},
{
"epoch": 0.5512,
"grad_norm": 1.1819498538970947,
"learning_rate": 2.0996078952445453e-06,
"loss": 3.6596,
"step": 689
},
{
"epoch": 0.552,
"grad_norm": 3.7068729400634766,
"learning_rate": 2.093407087012791e-06,
"loss": 1.3518,
"step": 690
},
{
"epoch": 0.5528,
"grad_norm": 1.0458273887634277,
"learning_rate": 2.0872088470428553e-06,
"loss": 3.607,
"step": 691
},
{
"epoch": 0.5536,
"grad_norm": 4.25509786605835,
"learning_rate": 2.08101321448619e-06,
"loss": 1.4629,
"step": 692
},
{
"epoch": 0.5544,
"grad_norm": 1.1481705904006958,
"learning_rate": 2.0748202284777775e-06,
"loss": 3.6161,
"step": 693
},
{
"epoch": 0.5552,
"grad_norm": 3.934365749359131,
"learning_rate": 2.0686299281358837e-06,
"loss": 1.4318,
"step": 694
},
{
"epoch": 0.556,
"grad_norm": 1.4977188110351562,
"learning_rate": 2.0624423525618097e-06,
"loss": 3.6224,
"step": 695
},
{
"epoch": 0.5568,
"grad_norm": 3.6773321628570557,
"learning_rate": 2.0562575408396475e-06,
"loss": 1.1651,
"step": 696
},
{
"epoch": 0.5576,
"grad_norm": 1.449863314628601,
"learning_rate": 2.0500755320360263e-06,
"loss": 3.6073,
"step": 697
},
{
"epoch": 0.5584,
"grad_norm": 3.81058406829834,
"learning_rate": 2.0438963651998747e-06,
"loss": 1.2255,
"step": 698
},
{
"epoch": 0.5592,
"grad_norm": 1.1542376279830933,
"learning_rate": 2.0377200793621694e-06,
"loss": 3.6066,
"step": 699
},
{
"epoch": 0.56,
"grad_norm": 4.023213863372803,
"learning_rate": 2.031546713535688e-06,
"loss": 1.3477,
"step": 700
},
{
"epoch": 0.5608,
"grad_norm": 1.3673769235610962,
"learning_rate": 2.0253763067147657e-06,
"loss": 3.5453,
"step": 701
},
{
"epoch": 0.5616,
"grad_norm": 4.080592155456543,
"learning_rate": 2.019208897875043e-06,
"loss": 1.4669,
"step": 702
},
{
"epoch": 0.5624,
"grad_norm": 1.4954679012298584,
"learning_rate": 2.0130445259732282e-06,
"loss": 3.4227,
"step": 703
},
{
"epoch": 0.5632,
"grad_norm": 4.1900248527526855,
"learning_rate": 2.006883229946843e-06,
"loss": 1.4427,
"step": 704
},
{
"epoch": 0.564,
"grad_norm": 1.4168885946273804,
"learning_rate": 2.0007250487139827e-06,
"loss": 3.6209,
"step": 705
},
{
"epoch": 0.5648,
"grad_norm": 3.834075450897217,
"learning_rate": 1.994570021173067e-06,
"loss": 1.2146,
"step": 706
},
{
"epoch": 0.5656,
"grad_norm": 1.18809974193573,
"learning_rate": 1.9884181862025938e-06,
"loss": 3.5612,
"step": 707
},
{
"epoch": 0.5664,
"grad_norm": 3.8719165325164795,
"learning_rate": 1.9822695826608975e-06,
"loss": 1.4709,
"step": 708
},
{
"epoch": 0.5672,
"grad_norm": 1.2471320629119873,
"learning_rate": 1.9761242493858987e-06,
"loss": 3.5347,
"step": 709
},
{
"epoch": 0.568,
"grad_norm": 3.889285087585449,
"learning_rate": 1.969982225194864e-06,
"loss": 1.1893,
"step": 710
},
{
"epoch": 0.5688,
"grad_norm": 1.6830719709396362,
"learning_rate": 1.9638435488841543e-06,
"loss": 3.3654,
"step": 711
},
{
"epoch": 0.5696,
"grad_norm": 3.806553363800049,
"learning_rate": 1.957708259228987e-06,
"loss": 1.179,
"step": 712
},
{
"epoch": 0.5704,
"grad_norm": 1.273412823677063,
"learning_rate": 1.9515763949831852e-06,
"loss": 3.5977,
"step": 713
},
{
"epoch": 0.5712,
"grad_norm": 3.846447229385376,
"learning_rate": 1.945447994878937e-06,
"loss": 1.559,
"step": 714
},
{
"epoch": 0.572,
"grad_norm": 1.3436466455459595,
"learning_rate": 1.9393230976265478e-06,
"loss": 3.6578,
"step": 715
},
{
"epoch": 0.5728,
"grad_norm": 3.7785065174102783,
"learning_rate": 1.933201741914196e-06,
"loss": 1.4349,
"step": 716
},
{
"epoch": 0.5736,
"grad_norm": 1.8797110319137573,
"learning_rate": 1.9270839664076937e-06,
"loss": 3.545,
"step": 717
},
{
"epoch": 0.5744,
"grad_norm": 4.088225841522217,
"learning_rate": 1.920969809750234e-06,
"loss": 1.31,
"step": 718
},
{
"epoch": 0.5752,
"grad_norm": 1.348626732826233,
"learning_rate": 1.9148593105621542e-06,
"loss": 3.5437,
"step": 719
},
{
"epoch": 0.576,
"grad_norm": 3.5283923149108887,
"learning_rate": 1.908752507440689e-06,
"loss": 1.179,
"step": 720
},
{
"epoch": 0.5768,
"grad_norm": 1.4678329229354858,
"learning_rate": 1.9026494389597239e-06,
"loss": 3.5683,
"step": 721
},
{
"epoch": 0.5776,
"grad_norm": 4.486749172210693,
"learning_rate": 1.8965501436695578e-06,
"loss": 1.2648,
"step": 722
},
{
"epoch": 0.5784,
"grad_norm": 1.4773081541061401,
"learning_rate": 1.8904546600966539e-06,
"loss": 3.5973,
"step": 723
},
{
"epoch": 0.5792,
"grad_norm": 4.043974876403809,
"learning_rate": 1.8843630267434e-06,
"loss": 1.425,
"step": 724
},
{
"epoch": 0.58,
"grad_norm": 1.2826696634292603,
"learning_rate": 1.8782752820878636e-06,
"loss": 3.5147,
"step": 725
},
{
"epoch": 0.5808,
"grad_norm": 3.6155593395233154,
"learning_rate": 1.872191464583547e-06,
"loss": 1.4485,
"step": 726
},
{
"epoch": 0.5816,
"grad_norm": 1.2381564378738403,
"learning_rate": 1.8661116126591492e-06,
"loss": 3.64,
"step": 727
},
{
"epoch": 0.5824,
"grad_norm": 4.1232380867004395,
"learning_rate": 1.8600357647183188e-06,
"loss": 1.3699,
"step": 728
},
{
"epoch": 0.5832,
"grad_norm": 1.070135474205017,
"learning_rate": 1.8539639591394131e-06,
"loss": 3.5735,
"step": 729
},
{
"epoch": 0.584,
"grad_norm": 3.9993014335632324,
"learning_rate": 1.8478962342752584e-06,
"loss": 1.46,
"step": 730
},
{
"epoch": 0.5848,
"grad_norm": 1.5479552745819092,
"learning_rate": 1.8418326284528997e-06,
"loss": 3.431,
"step": 731
},
{
"epoch": 0.5856,
"grad_norm": 4.261895656585693,
"learning_rate": 1.8357731799733686e-06,
"loss": 1.5391,
"step": 732
},
{
"epoch": 0.5864,
"grad_norm": 0.9864424467086792,
"learning_rate": 1.8297179271114345e-06,
"loss": 3.6108,
"step": 733
},
{
"epoch": 0.5872,
"grad_norm": 4.133561134338379,
"learning_rate": 1.8236669081153657e-06,
"loss": 1.3051,
"step": 734
},
{
"epoch": 0.588,
"grad_norm": 1.7257312536239624,
"learning_rate": 1.8176201612066874e-06,
"loss": 3.5698,
"step": 735
},
{
"epoch": 0.5888,
"grad_norm": 3.8284997940063477,
"learning_rate": 1.8115777245799383e-06,
"loss": 1.1011,
"step": 736
},
{
"epoch": 0.5896,
"grad_norm": 1.4894834756851196,
"learning_rate": 1.8055396364024318e-06,
"loss": 3.5975,
"step": 737
},
{
"epoch": 0.5904,
"grad_norm": 4.291233539581299,
"learning_rate": 1.7995059348140165e-06,
"loss": 1.4558,
"step": 738
},
{
"epoch": 0.5912,
"grad_norm": 1.2095164060592651,
"learning_rate": 1.7934766579268292e-06,
"loss": 3.5745,
"step": 739
},
{
"epoch": 0.592,
"grad_norm": 4.15226936340332,
"learning_rate": 1.7874518438250598e-06,
"loss": 1.4725,
"step": 740
},
{
"epoch": 0.5928,
"grad_norm": 1.2965120077133179,
"learning_rate": 1.7814315305647095e-06,
"loss": 3.5479,
"step": 741
},
{
"epoch": 0.5936,
"grad_norm": 3.704596519470215,
"learning_rate": 1.7754157561733476e-06,
"loss": 1.2924,
"step": 742
},
{
"epoch": 0.5944,
"grad_norm": 1.8090176582336426,
"learning_rate": 1.7694045586498754e-06,
"loss": 3.418,
"step": 743
},
{
"epoch": 0.5952,
"grad_norm": 3.9790186882019043,
"learning_rate": 1.7633979759642844e-06,
"loss": 1.4173,
"step": 744
},
{
"epoch": 0.596,
"grad_norm": 1.8232885599136353,
"learning_rate": 1.7573960460574133e-06,
"loss": 3.5081,
"step": 745
},
{
"epoch": 0.5968,
"grad_norm": 3.6959445476531982,
"learning_rate": 1.7513988068407145e-06,
"loss": 1.2422,
"step": 746
},
{
"epoch": 0.5976,
"grad_norm": 1.4322175979614258,
"learning_rate": 1.7454062961960102e-06,
"loss": 3.5851,
"step": 747
},
{
"epoch": 0.5984,
"grad_norm": 3.444291591644287,
"learning_rate": 1.7394185519752546e-06,
"loss": 1.2407,
"step": 748
},
{
"epoch": 0.5992,
"grad_norm": 1.024861454963684,
"learning_rate": 1.7334356120002956e-06,
"loss": 3.6587,
"step": 749
},
{
"epoch": 0.6,
"grad_norm": 4.007371425628662,
"learning_rate": 1.7274575140626318e-06,
"loss": 1.3341,
"step": 750
},
{
"epoch": 0.6008,
"grad_norm": 1.387477159500122,
"learning_rate": 1.7214842959231796e-06,
"loss": 3.5696,
"step": 751
},
{
"epoch": 0.6016,
"grad_norm": 3.6198816299438477,
"learning_rate": 1.7155159953120315e-06,
"loss": 1.1709,
"step": 752
},
{
"epoch": 0.6024,
"grad_norm": 1.5271052122116089,
"learning_rate": 1.7095526499282172e-06,
"loss": 3.5466,
"step": 753
},
{
"epoch": 0.6032,
"grad_norm": 4.3780317306518555,
"learning_rate": 1.703594297439469e-06,
"loss": 1.4056,
"step": 754
},
{
"epoch": 0.604,
"grad_norm": 1.0889999866485596,
"learning_rate": 1.6976409754819767e-06,
"loss": 3.6382,
"step": 755
},
{
"epoch": 0.6048,
"grad_norm": 4.148120403289795,
"learning_rate": 1.6916927216601593e-06,
"loss": 1.3061,
"step": 756
},
{
"epoch": 0.6056,
"grad_norm": 1.0028917789459229,
"learning_rate": 1.6857495735464196e-06,
"loss": 3.6111,
"step": 757
},
{
"epoch": 0.6064,
"grad_norm": 3.956118583679199,
"learning_rate": 1.6798115686809125e-06,
"loss": 1.4431,
"step": 758
},
{
"epoch": 0.6072,
"grad_norm": 1.1292115449905396,
"learning_rate": 1.673878744571304e-06,
"loss": 3.6654,
"step": 759
},
{
"epoch": 0.608,
"grad_norm": 3.675584554672241,
"learning_rate": 1.6679511386925337e-06,
"loss": 1.2957,
"step": 760
},
{
"epoch": 0.6088,
"grad_norm": 1.6884305477142334,
"learning_rate": 1.6620287884865831e-06,
"loss": 3.471,
"step": 761
},
{
"epoch": 0.6096,
"grad_norm": 3.8323042392730713,
"learning_rate": 1.656111731362236e-06,
"loss": 1.1559,
"step": 762
},
{
"epoch": 0.6104,
"grad_norm": 1.2776001691818237,
"learning_rate": 1.650200004694839e-06,
"loss": 3.5601,
"step": 763
},
{
"epoch": 0.6112,
"grad_norm": 3.951807737350464,
"learning_rate": 1.6442936458260723e-06,
"loss": 1.2963,
"step": 764
},
{
"epoch": 0.612,
"grad_norm": 1.0104762315750122,
"learning_rate": 1.6383926920637077e-06,
"loss": 3.6454,
"step": 765
},
{
"epoch": 0.6128,
"grad_norm": 3.8364481925964355,
"learning_rate": 1.6324971806813766e-06,
"loss": 1.2477,
"step": 766
},
{
"epoch": 0.6136,
"grad_norm": 1.404075264930725,
"learning_rate": 1.6266071489183327e-06,
"loss": 3.5319,
"step": 767
},
{
"epoch": 0.6144,
"grad_norm": 3.647761583328247,
"learning_rate": 1.620722633979219e-06,
"loss": 1.3192,
"step": 768
},
{
"epoch": 0.6152,
"grad_norm": 1.2602980136871338,
"learning_rate": 1.6148436730338279e-06,
"loss": 3.5468,
"step": 769
},
{
"epoch": 0.616,
"grad_norm": 4.292653560638428,
"learning_rate": 1.6089703032168736e-06,
"loss": 1.1626,
"step": 770
},
{
"epoch": 0.6168,
"grad_norm": 1.8109797239303589,
"learning_rate": 1.6031025616277512e-06,
"loss": 3.5154,
"step": 771
},
{
"epoch": 0.6176,
"grad_norm": 4.427074909210205,
"learning_rate": 1.5972404853303061e-06,
"loss": 1.1841,
"step": 772
},
{
"epoch": 0.6184,
"grad_norm": 1.114534854888916,
"learning_rate": 1.591384111352599e-06,
"loss": 3.5374,
"step": 773
},
{
"epoch": 0.6192,
"grad_norm": 3.930265426635742,
"learning_rate": 1.585533476686669e-06,
"loss": 1.203,
"step": 774
},
{
"epoch": 0.62,
"grad_norm": 1.7864525318145752,
"learning_rate": 1.5796886182883053e-06,
"loss": 3.4942,
"step": 775
},
{
"epoch": 0.6208,
"grad_norm": 4.248049259185791,
"learning_rate": 1.5738495730768104e-06,
"loss": 1.5361,
"step": 776
},
{
"epoch": 0.6216,
"grad_norm": 1.1578404903411865,
"learning_rate": 1.5680163779347668e-06,
"loss": 3.5659,
"step": 777
},
{
"epoch": 0.6224,
"grad_norm": 4.111908435821533,
"learning_rate": 1.5621890697078069e-06,
"loss": 1.582,
"step": 778
},
{
"epoch": 0.6232,
"grad_norm": 1.2350143194198608,
"learning_rate": 1.5563676852043738e-06,
"loss": 3.5397,
"step": 779
},
{
"epoch": 0.624,
"grad_norm": 4.6647562980651855,
"learning_rate": 1.5505522611954977e-06,
"loss": 1.5677,
"step": 780
},
{
"epoch": 0.6248,
"grad_norm": 1.5898746252059937,
"learning_rate": 1.5447428344145565e-06,
"loss": 3.4637,
"step": 781
},
{
"epoch": 0.6256,
"grad_norm": 4.031108856201172,
"learning_rate": 1.538939441557048e-06,
"loss": 1.5085,
"step": 782
},
{
"epoch": 0.6264,
"grad_norm": 1.1129035949707031,
"learning_rate": 1.5331421192803565e-06,
"loss": 3.7525,
"step": 783
},
{
"epoch": 0.6272,
"grad_norm": 3.7480621337890625,
"learning_rate": 1.5273509042035172e-06,
"loss": 1.3526,
"step": 784
},
{
"epoch": 0.628,
"grad_norm": 1.4506335258483887,
"learning_rate": 1.521565832906994e-06,
"loss": 3.4543,
"step": 785
},
{
"epoch": 0.6288,
"grad_norm": 4.091665267944336,
"learning_rate": 1.515786941932441e-06,
"loss": 1.3925,
"step": 786
},
{
"epoch": 0.6296,
"grad_norm": 1.7259176969528198,
"learning_rate": 1.5100142677824752e-06,
"loss": 3.5212,
"step": 787
},
{
"epoch": 0.6304,
"grad_norm": 3.6364309787750244,
"learning_rate": 1.5042478469204437e-06,
"loss": 1.486,
"step": 788
},
{
"epoch": 0.6312,
"grad_norm": 1.0510691404342651,
"learning_rate": 1.4984877157701932e-06,
"loss": 3.5759,
"step": 789
},
{
"epoch": 0.632,
"grad_norm": 3.974539041519165,
"learning_rate": 1.4927339107158437e-06,
"loss": 1.3787,
"step": 790
},
{
"epoch": 0.6328,
"grad_norm": 1.5087684392929077,
"learning_rate": 1.486986468101555e-06,
"loss": 3.547,
"step": 791
},
{
"epoch": 0.6336,
"grad_norm": 3.6339049339294434,
"learning_rate": 1.481245424231298e-06,
"loss": 1.321,
"step": 792
},
{
"epoch": 0.6344,
"grad_norm": 1.1450809240341187,
"learning_rate": 1.4755108153686275e-06,
"loss": 3.6239,
"step": 793
},
{
"epoch": 0.6352,
"grad_norm": 3.5662426948547363,
"learning_rate": 1.4697826777364478e-06,
"loss": 1.2403,
"step": 794
},
{
"epoch": 0.636,
"grad_norm": 1.2532669305801392,
"learning_rate": 1.46406104751679e-06,
"loss": 3.5814,
"step": 795
},
{
"epoch": 0.6368,
"grad_norm": 3.5871071815490723,
"learning_rate": 1.45834596085058e-06,
"loss": 1.2413,
"step": 796
},
{
"epoch": 0.6376,
"grad_norm": 1.7455424070358276,
"learning_rate": 1.4526374538374133e-06,
"loss": 3.5806,
"step": 797
},
{
"epoch": 0.6384,
"grad_norm": 4.081576824188232,
"learning_rate": 1.4469355625353199e-06,
"loss": 1.314,
"step": 798
},
{
"epoch": 0.6392,
"grad_norm": 1.2774088382720947,
"learning_rate": 1.4412403229605453e-06,
"loss": 3.5766,
"step": 799
},
{
"epoch": 0.64,
"grad_norm": 4.024228572845459,
"learning_rate": 1.4355517710873184e-06,
"loss": 1.3179,
"step": 800
},
{
"epoch": 0.6408,
"grad_norm": 1.5069676637649536,
"learning_rate": 1.4298699428476236e-06,
"loss": 3.4628,
"step": 801
},
{
"epoch": 0.6416,
"grad_norm": 3.8722047805786133,
"learning_rate": 1.4241948741309783e-06,
"loss": 1.2991,
"step": 802
},
{
"epoch": 0.6424,
"grad_norm": 1.4869807958602905,
"learning_rate": 1.418526600784198e-06,
"loss": 3.5303,
"step": 803
},
{
"epoch": 0.6432,
"grad_norm": 4.096463680267334,
"learning_rate": 1.412865158611179e-06,
"loss": 1.4464,
"step": 804
},
{
"epoch": 0.644,
"grad_norm": 1.3232511281967163,
"learning_rate": 1.4072105833726685e-06,
"loss": 3.5599,
"step": 805
},
{
"epoch": 0.6448,
"grad_norm": 3.500465154647827,
"learning_rate": 1.401562910786034e-06,
"loss": 1.3568,
"step": 806
},
{
"epoch": 0.6456,
"grad_norm": 1.6436785459518433,
"learning_rate": 1.395922176525047e-06,
"loss": 3.5835,
"step": 807
},
{
"epoch": 0.6464,
"grad_norm": 3.5307986736297607,
"learning_rate": 1.3902884162196509e-06,
"loss": 1.3578,
"step": 808
},
{
"epoch": 0.6472,
"grad_norm": 1.2310173511505127,
"learning_rate": 1.384661665455736e-06,
"loss": 3.626,
"step": 809
},
{
"epoch": 0.648,
"grad_norm": 5.397148132324219,
"learning_rate": 1.3790419597749198e-06,
"loss": 1.3758,
"step": 810
},
{
"epoch": 0.6488,
"grad_norm": 1.2223182916641235,
"learning_rate": 1.373429334674317e-06,
"loss": 3.5392,
"step": 811
},
{
"epoch": 0.6496,
"grad_norm": 5.135192394256592,
"learning_rate": 1.3678238256063193e-06,
"loss": 1.27,
"step": 812
},
{
"epoch": 0.6504,
"grad_norm": 1.457159161567688,
"learning_rate": 1.3622254679783665e-06,
"loss": 3.5182,
"step": 813
},
{
"epoch": 0.6512,
"grad_norm": 3.729689359664917,
"learning_rate": 1.356634297152729e-06,
"loss": 1.219,
"step": 814
},
{
"epoch": 0.652,
"grad_norm": 1.7926121950149536,
"learning_rate": 1.3510503484462807e-06,
"loss": 3.4169,
"step": 815
},
{
"epoch": 0.6528,
"grad_norm": 3.46643328666687,
"learning_rate": 1.3454736571302761e-06,
"loss": 1.2486,
"step": 816
},
{
"epoch": 0.6536,
"grad_norm": 1.3711421489715576,
"learning_rate": 1.3399042584301298e-06,
"loss": 3.5197,
"step": 817
},
{
"epoch": 0.6544,
"grad_norm": 4.594119071960449,
"learning_rate": 1.334342187525189e-06,
"loss": 1.2484,
"step": 818
},
{
"epoch": 0.6552,
"grad_norm": 1.1788302659988403,
"learning_rate": 1.3287874795485168e-06,
"loss": 3.574,
"step": 819
},
{
"epoch": 0.656,
"grad_norm": 3.5496530532836914,
"learning_rate": 1.3232401695866686e-06,
"loss": 1.1791,
"step": 820
},
{
"epoch": 0.6568,
"grad_norm": 1.140120267868042,
"learning_rate": 1.3177002926794685e-06,
"loss": 3.6431,
"step": 821
},
{
"epoch": 0.6576,
"grad_norm": 4.5700554847717285,
"learning_rate": 1.312167883819791e-06,
"loss": 1.3331,
"step": 822
},
{
"epoch": 0.6584,
"grad_norm": 1.6417975425720215,
"learning_rate": 1.3066429779533352e-06,
"loss": 3.4451,
"step": 823
},
{
"epoch": 0.6592,
"grad_norm": 3.6675314903259277,
"learning_rate": 1.3011256099784103e-06,
"loss": 1.1985,
"step": 824
},
{
"epoch": 0.66,
"grad_norm": 0.9253246784210205,
"learning_rate": 1.2956158147457116e-06,
"loss": 3.6082,
"step": 825
},
{
"epoch": 0.6608,
"grad_norm": 4.173038482666016,
"learning_rate": 1.2901136270580994e-06,
"loss": 1.2908,
"step": 826
},
{
"epoch": 0.6616,
"grad_norm": 1.7744218111038208,
"learning_rate": 1.2846190816703836e-06,
"loss": 3.4493,
"step": 827
},
{
"epoch": 0.6624,
"grad_norm": 3.8822882175445557,
"learning_rate": 1.279132213289096e-06,
"loss": 1.5025,
"step": 828
},
{
"epoch": 0.6632,
"grad_norm": 1.4533785581588745,
"learning_rate": 1.273653056572282e-06,
"loss": 3.5351,
"step": 829
},
{
"epoch": 0.664,
"grad_norm": 3.9480459690093994,
"learning_rate": 1.2681816461292715e-06,
"loss": 1.3216,
"step": 830
},
{
"epoch": 0.6648,
"grad_norm": 1.3655693531036377,
"learning_rate": 1.2627180165204671e-06,
"loss": 3.5135,
"step": 831
},
{
"epoch": 0.6656,
"grad_norm": 3.7476413249969482,
"learning_rate": 1.257262202257124e-06,
"loss": 1.4918,
"step": 832
},
{
"epoch": 0.6664,
"grad_norm": 1.7849209308624268,
"learning_rate": 1.251814237801128e-06,
"loss": 3.4437,
"step": 833
},
{
"epoch": 0.6672,
"grad_norm": 4.042788982391357,
"learning_rate": 1.246374157564785e-06,
"loss": 1.1764,
"step": 834
},
{
"epoch": 0.668,
"grad_norm": 1.2156387567520142,
"learning_rate": 1.2409419959105981e-06,
"loss": 3.565,
"step": 835
},
{
"epoch": 0.6688,
"grad_norm": 3.900473117828369,
"learning_rate": 1.2355177871510538e-06,
"loss": 1.2951,
"step": 836
},
{
"epoch": 0.6696,
"grad_norm": 1.0474777221679688,
"learning_rate": 1.2301015655484006e-06,
"loss": 3.6051,
"step": 837
},
{
"epoch": 0.6704,
"grad_norm": 3.8230295181274414,
"learning_rate": 1.2246933653144386e-06,
"loss": 1.4542,
"step": 838
},
{
"epoch": 0.6712,
"grad_norm": 1.6013360023498535,
"learning_rate": 1.2192932206103e-06,
"loss": 3.4223,
"step": 839
},
{
"epoch": 0.672,
"grad_norm": 3.603398084640503,
"learning_rate": 1.2139011655462338e-06,
"loss": 1.1428,
"step": 840
},
{
"epoch": 0.6728,
"grad_norm": 0.9630873203277588,
"learning_rate": 1.208517234181391e-06,
"loss": 3.63,
"step": 841
},
{
"epoch": 0.6736,
"grad_norm": 3.746964931488037,
"learning_rate": 1.2031414605236066e-06,
"loss": 1.2324,
"step": 842
},
{
"epoch": 0.6744,
"grad_norm": 1.1261411905288696,
"learning_rate": 1.1977738785291894e-06,
"loss": 3.5977,
"step": 843
},
{
"epoch": 0.6752,
"grad_norm": 3.895467519760132,
"learning_rate": 1.1924145221027048e-06,
"loss": 1.1571,
"step": 844
},
{
"epoch": 0.676,
"grad_norm": 1.2304555177688599,
"learning_rate": 1.1870634250967606e-06,
"loss": 3.613,
"step": 845
},
{
"epoch": 0.6768,
"grad_norm": 3.7354040145874023,
"learning_rate": 1.1817206213117943e-06,
"loss": 1.4115,
"step": 846
},
{
"epoch": 0.6776,
"grad_norm": 1.3557534217834473,
"learning_rate": 1.1763861444958573e-06,
"loss": 3.5227,
"step": 847
},
{
"epoch": 0.6784,
"grad_norm": 8.678403854370117,
"learning_rate": 1.1710600283444048e-06,
"loss": 1.3812,
"step": 848
},
{
"epoch": 0.6792,
"grad_norm": 1.2234259843826294,
"learning_rate": 1.1657423065000811e-06,
"loss": 3.5525,
"step": 849
},
{
"epoch": 0.68,
"grad_norm": 4.474430084228516,
"learning_rate": 1.160433012552508e-06,
"loss": 1.5074,
"step": 850
},
{
"epoch": 0.6808,
"grad_norm": 1.9095535278320312,
"learning_rate": 1.1551321800380722e-06,
"loss": 3.3455,
"step": 851
},
{
"epoch": 0.6816,
"grad_norm": 4.141076564788818,
"learning_rate": 1.1498398424397106e-06,
"loss": 1.2947,
"step": 852
},
{
"epoch": 0.6824,
"grad_norm": 1.9714593887329102,
"learning_rate": 1.1445560331867054e-06,
"loss": 3.455,
"step": 853
},
{
"epoch": 0.6832,
"grad_norm": 4.287348747253418,
"learning_rate": 1.1392807856544682e-06,
"loss": 1.3707,
"step": 854
},
{
"epoch": 0.684,
"grad_norm": 1.3626141548156738,
"learning_rate": 1.1340141331643276e-06,
"loss": 3.4847,
"step": 855
},
{
"epoch": 0.6848,
"grad_norm": 4.172240734100342,
"learning_rate": 1.128756108983325e-06,
"loss": 1.1837,
"step": 856
},
{
"epoch": 0.6856,
"grad_norm": 1.6149402856826782,
"learning_rate": 1.123506746323997e-06,
"loss": 3.3876,
"step": 857
},
{
"epoch": 0.6864,
"grad_norm": 4.046041011810303,
"learning_rate": 1.1182660783441719e-06,
"loss": 1.199,
"step": 858
},
{
"epoch": 0.6872,
"grad_norm": 1.2951021194458008,
"learning_rate": 1.1130341381467569e-06,
"loss": 3.546,
"step": 859
},
{
"epoch": 0.688,
"grad_norm": 3.817901611328125,
"learning_rate": 1.1078109587795311e-06,
"loss": 1.2792,
"step": 860
},
{
"epoch": 0.6888,
"grad_norm": 1.45967435836792,
"learning_rate": 1.1025965732349318e-06,
"loss": 3.5619,
"step": 861
},
{
"epoch": 0.6896,
"grad_norm": 3.8560800552368164,
"learning_rate": 1.0973910144498534e-06,
"loss": 1.3367,
"step": 862
},
{
"epoch": 0.6904,
"grad_norm": 1.186650037765503,
"learning_rate": 1.0921943153054343e-06,
"loss": 3.5638,
"step": 863
},
{
"epoch": 0.6912,
"grad_norm": 3.8473381996154785,
"learning_rate": 1.0870065086268506e-06,
"loss": 1.3076,
"step": 864
},
{
"epoch": 0.692,
"grad_norm": 1.6394022703170776,
"learning_rate": 1.0818276271831094e-06,
"loss": 3.5127,
"step": 865
},
{
"epoch": 0.6928,
"grad_norm": 4.1624016761779785,
"learning_rate": 1.0766577036868395e-06,
"loss": 1.3827,
"step": 866
},
{
"epoch": 0.6936,
"grad_norm": 1.134089469909668,
"learning_rate": 1.0714967707940876e-06,
"loss": 3.5572,
"step": 867
},
{
"epoch": 0.6944,
"grad_norm": 4.057480335235596,
"learning_rate": 1.0663448611041114e-06,
"loss": 1.4129,
"step": 868
},
{
"epoch": 0.6952,
"grad_norm": 1.2894881963729858,
"learning_rate": 1.0612020071591722e-06,
"loss": 3.5994,
"step": 869
},
{
"epoch": 0.696,
"grad_norm": 3.5933890342712402,
"learning_rate": 1.0560682414443315e-06,
"loss": 1.1426,
"step": 870
},
{
"epoch": 0.6968,
"grad_norm": 1.4715263843536377,
"learning_rate": 1.0509435963872422e-06,
"loss": 3.5776,
"step": 871
},
{
"epoch": 0.6976,
"grad_norm": 3.6835391521453857,
"learning_rate": 1.0458281043579482e-06,
"loss": 1.3991,
"step": 872
},
{
"epoch": 0.6984,
"grad_norm": 1.2193199396133423,
"learning_rate": 1.0407217976686777e-06,
"loss": 3.5754,
"step": 873
},
{
"epoch": 0.6992,
"grad_norm": 3.6208441257476807,
"learning_rate": 1.0356247085736388e-06,
"loss": 1.2799,
"step": 874
},
{
"epoch": 0.7,
"grad_norm": 1.3012170791625977,
"learning_rate": 1.0305368692688175e-06,
"loss": 3.5576,
"step": 875
},
{
"epoch": 0.7008,
"grad_norm": 3.988499879837036,
"learning_rate": 1.0254583118917699e-06,
"loss": 1.4413,
"step": 876
},
{
"epoch": 0.7016,
"grad_norm": 1.3237192630767822,
"learning_rate": 1.020389068521426e-06,
"loss": 3.5586,
"step": 877
},
{
"epoch": 0.7024,
"grad_norm": 4.113298416137695,
"learning_rate": 1.0153291711778825e-06,
"loss": 1.4436,
"step": 878
},
{
"epoch": 0.7032,
"grad_norm": 1.1641186475753784,
"learning_rate": 1.0102786518221997e-06,
"loss": 3.5658,
"step": 879
},
{
"epoch": 0.704,
"grad_norm": 4.27529239654541,
"learning_rate": 1.0052375423562038e-06,
"loss": 1.3145,
"step": 880
},
{
"epoch": 0.7048,
"grad_norm": 1.370846152305603,
"learning_rate": 1.0002058746222807e-06,
"loss": 3.5536,
"step": 881
},
{
"epoch": 0.7056,
"grad_norm": 4.043067932128906,
"learning_rate": 9.951836804031795e-07,
"loss": 1.2685,
"step": 882
},
{
"epoch": 0.7064,
"grad_norm": 1.643572211265564,
"learning_rate": 9.90170991421808e-07,
"loss": 3.5677,
"step": 883
},
{
"epoch": 0.7072,
"grad_norm": 4.03674840927124,
"learning_rate": 9.851678393410343e-07,
"loss": 1.3122,
"step": 884
},
{
"epoch": 0.708,
"grad_norm": 1.0866400003433228,
"learning_rate": 9.801742557634872e-07,
"loss": 3.5932,
"step": 885
},
{
"epoch": 0.7088,
"grad_norm": 3.896414279937744,
"learning_rate": 9.751902722313527e-07,
"loss": 1.2974,
"step": 886
},
{
"epoch": 0.7096,
"grad_norm": 1.1581923961639404,
"learning_rate": 9.702159202261802e-07,
"loss": 3.5641,
"step": 887
},
{
"epoch": 0.7104,
"grad_norm": 3.8378193378448486,
"learning_rate": 9.65251231168681e-07,
"loss": 1.2477,
"step": 888
},
{
"epoch": 0.7112,
"grad_norm": 1.1178447008132935,
"learning_rate": 9.602962364185286e-07,
"loss": 3.5832,
"step": 889
},
{
"epoch": 0.712,
"grad_norm": 3.76153302192688,
"learning_rate": 9.553509672741646e-07,
"loss": 1.5284,
"step": 890
},
{
"epoch": 0.7128,
"grad_norm": 1.6611312627792358,
"learning_rate": 9.504154549725944e-07,
"loss": 3.4278,
"step": 891
},
{
"epoch": 0.7136,
"grad_norm": 3.821173906326294,
"learning_rate": 9.454897306891972e-07,
"loss": 1.3952,
"step": 892
},
{
"epoch": 0.7144,
"grad_norm": 0.9451780915260315,
"learning_rate": 9.405738255375243e-07,
"loss": 3.5839,
"step": 893
},
{
"epoch": 0.7152,
"grad_norm": 5.367844104766846,
"learning_rate": 9.356677705691058e-07,
"loss": 1.3163,
"step": 894
},
{
"epoch": 0.716,
"grad_norm": 1.4917246103286743,
"learning_rate": 9.307715967732492e-07,
"loss": 3.3808,
"step": 895
},
{
"epoch": 0.7168,
"grad_norm": 4.245250225067139,
"learning_rate": 9.258853350768499e-07,
"loss": 1.3849,
"step": 896
},
{
"epoch": 0.7176,
"grad_norm": 1.8379777669906616,
"learning_rate": 9.210090163441928e-07,
"loss": 3.5479,
"step": 897
},
{
"epoch": 0.7184,
"grad_norm": 3.840579032897949,
"learning_rate": 9.161426713767574e-07,
"loss": 1.3287,
"step": 898
},
{
"epoch": 0.7192,
"grad_norm": 1.2158552408218384,
"learning_rate": 9.112863309130235e-07,
"loss": 3.5524,
"step": 899
},
{
"epoch": 0.72,
"grad_norm": 4.019105434417725,
"learning_rate": 9.064400256282757e-07,
"loss": 1.2645,
"step": 900
},
{
"epoch": 0.7208,
"grad_norm": 1.4201416969299316,
"learning_rate": 9.01603786134413e-07,
"loss": 3.5722,
"step": 901
},
{
"epoch": 0.7216,
"grad_norm": 3.683457851409912,
"learning_rate": 8.967776429797529e-07,
"loss": 1.2652,
"step": 902
},
{
"epoch": 0.7224,
"grad_norm": 1.3120098114013672,
"learning_rate": 8.919616266488373e-07,
"loss": 3.5835,
"step": 903
},
{
"epoch": 0.7232,
"grad_norm": 3.85827898979187,
"learning_rate": 8.871557675622442e-07,
"loss": 1.407,
"step": 904
},
{
"epoch": 0.724,
"grad_norm": 1.2667253017425537,
"learning_rate": 8.823600960763901e-07,
"loss": 3.5396,
"step": 905
},
{
"epoch": 0.7248,
"grad_norm": 3.5598056316375732,
"learning_rate": 8.775746424833428e-07,
"loss": 1.1467,
"step": 906
},
{
"epoch": 0.7256,
"grad_norm": 1.2805604934692383,
"learning_rate": 8.727994370106288e-07,
"loss": 3.5316,
"step": 907
},
{
"epoch": 0.7264,
"grad_norm": 4.258754253387451,
"learning_rate": 8.680345098210408e-07,
"loss": 1.312,
"step": 908
},
{
"epoch": 0.7272,
"grad_norm": 1.3038127422332764,
"learning_rate": 8.632798910124493e-07,
"loss": 3.5995,
"step": 909
},
{
"epoch": 0.728,
"grad_norm": 3.3651838302612305,
"learning_rate": 8.585356106176093e-07,
"loss": 1.12,
"step": 910
},
{
"epoch": 0.7288,
"grad_norm": 1.9212744235992432,
"learning_rate": 8.538016986039751e-07,
"loss": 3.5292,
"step": 911
},
{
"epoch": 0.7296,
"grad_norm": 4.390267848968506,
"learning_rate": 8.49078184873508e-07,
"loss": 1.2082,
"step": 912
},
{
"epoch": 0.7304,
"grad_norm": 1.133646845817566,
"learning_rate": 8.443650992624877e-07,
"loss": 3.6091,
"step": 913
},
{
"epoch": 0.7312,
"grad_norm": 3.671508550643921,
"learning_rate": 8.396624715413251e-07,
"loss": 1.2595,
"step": 914
},
{
"epoch": 0.732,
"grad_norm": 1.238884687423706,
"learning_rate": 8.349703314143712e-07,
"loss": 3.516,
"step": 915
},
{
"epoch": 0.7328,
"grad_norm": 4.374630451202393,
"learning_rate": 8.302887085197342e-07,
"loss": 1.2724,
"step": 916
},
{
"epoch": 0.7336,
"grad_norm": 1.0681443214416504,
"learning_rate": 8.256176324290885e-07,
"loss": 3.5777,
"step": 917
},
{
"epoch": 0.7344,
"grad_norm": 4.399445056915283,
"learning_rate": 8.209571326474897e-07,
"loss": 1.5055,
"step": 918
},
{
"epoch": 0.7352,
"grad_norm": 1.302098035812378,
"learning_rate": 8.163072386131876e-07,
"loss": 3.5391,
"step": 919
},
{
"epoch": 0.736,
"grad_norm": 4.033039093017578,
"learning_rate": 8.116679796974389e-07,
"loss": 1.4171,
"step": 920
},
{
"epoch": 0.7368,
"grad_norm": 1.2380177974700928,
"learning_rate": 8.070393852043251e-07,
"loss": 3.5787,
"step": 921
},
{
"epoch": 0.7376,
"grad_norm": 4.127280235290527,
"learning_rate": 8.024214843705647e-07,
"loss": 1.4362,
"step": 922
},
{
"epoch": 0.7384,
"grad_norm": 1.448819875717163,
"learning_rate": 7.978143063653296e-07,
"loss": 3.5109,
"step": 923
},
{
"epoch": 0.7392,
"grad_norm": 4.252338886260986,
"learning_rate": 7.93217880290059e-07,
"loss": 1.2241,
"step": 924
},
{
"epoch": 0.74,
"grad_norm": 1.3917127847671509,
"learning_rate": 7.886322351782782e-07,
"loss": 3.5236,
"step": 925
},
{
"epoch": 0.7408,
"grad_norm": 3.9095723628997803,
"learning_rate": 7.840573999954154e-07,
"loss": 1.3039,
"step": 926
},
{
"epoch": 0.7416,
"grad_norm": 1.6759053468704224,
"learning_rate": 7.794934036386139e-07,
"loss": 3.5408,
"step": 927
},
{
"epoch": 0.7424,
"grad_norm": 3.9729490280151367,
"learning_rate": 7.749402749365573e-07,
"loss": 1.2951,
"step": 928
},
{
"epoch": 0.7432,
"grad_norm": 1.7310004234313965,
"learning_rate": 7.703980426492791e-07,
"loss": 3.4605,
"step": 929
},
{
"epoch": 0.744,
"grad_norm": 4.3605523109436035,
"learning_rate": 7.65866735467988e-07,
"loss": 1.2495,
"step": 930
},
{
"epoch": 0.7448,
"grad_norm": 1.055009365081787,
"learning_rate": 7.613463820148831e-07,
"loss": 3.5749,
"step": 931
},
{
"epoch": 0.7456,
"grad_norm": 4.379756450653076,
"learning_rate": 7.568370108429732e-07,
"loss": 1.3678,
"step": 932
},
{
"epoch": 0.7464,
"grad_norm": 1.133419156074524,
"learning_rate": 7.523386504358984e-07,
"loss": 3.6624,
"step": 933
},
{
"epoch": 0.7472,
"grad_norm": 3.2285141944885254,
"learning_rate": 7.478513292077463e-07,
"loss": 1.2785,
"step": 934
},
{
"epoch": 0.748,
"grad_norm": 1.2085245847702026,
"learning_rate": 7.433750755028774e-07,
"loss": 3.6372,
"step": 935
},
{
"epoch": 0.7488,
"grad_norm": 3.985098123550415,
"learning_rate": 7.389099175957426e-07,
"loss": 1.3853,
"step": 936
},
{
"epoch": 0.7496,
"grad_norm": 1.3521220684051514,
"learning_rate": 7.344558836907067e-07,
"loss": 3.4587,
"step": 937
},
{
"epoch": 0.7504,
"grad_norm": 3.7972023487091064,
"learning_rate": 7.300130019218688e-07,
"loss": 1.4041,
"step": 938
},
{
"epoch": 0.7512,
"grad_norm": 1.1607991456985474,
"learning_rate": 7.255813003528834e-07,
"loss": 3.5921,
"step": 939
},
{
"epoch": 0.752,
"grad_norm": 4.701716423034668,
"learning_rate": 7.211608069767867e-07,
"loss": 1.1838,
"step": 940
},
{
"epoch": 0.7528,
"grad_norm": 1.6962052583694458,
"learning_rate": 7.167515497158179e-07,
"loss": 3.4455,
"step": 941
},
{
"epoch": 0.7536,
"grad_norm": 3.769155502319336,
"learning_rate": 7.123535564212419e-07,
"loss": 1.417,
"step": 942
},
{
"epoch": 0.7544,
"grad_norm": 1.5282889604568481,
"learning_rate": 7.079668548731757e-07,
"loss": 3.4607,
"step": 943
},
{
"epoch": 0.7552,
"grad_norm": 4.213266372680664,
"learning_rate": 7.035914727804085e-07,
"loss": 1.1793,
"step": 944
},
{
"epoch": 0.756,
"grad_norm": 1.5362334251403809,
"learning_rate": 6.992274377802328e-07,
"loss": 3.5102,
"step": 945
},
{
"epoch": 0.7568,
"grad_norm": 3.7498528957366943,
"learning_rate": 6.94874777438265e-07,
"loss": 1.2506,
"step": 946
},
{
"epoch": 0.7576,
"grad_norm": 1.2717052698135376,
"learning_rate": 6.905335192482734e-07,
"loss": 3.5799,
"step": 947
},
{
"epoch": 0.7584,
"grad_norm": 4.157364368438721,
"learning_rate": 6.862036906320055e-07,
"loss": 1.3018,
"step": 948
},
{
"epoch": 0.7592,
"grad_norm": 1.7433124780654907,
"learning_rate": 6.818853189390104e-07,
"loss": 3.4984,
"step": 949
},
{
"epoch": 0.76,
"grad_norm": 4.441183567047119,
"learning_rate": 6.775784314464717e-07,
"loss": 1.4515,
"step": 950
},
{
"epoch": 0.7608,
"grad_norm": 1.48224675655365,
"learning_rate": 6.732830553590305e-07,
"loss": 3.5688,
"step": 951
},
{
"epoch": 0.7616,
"grad_norm": 3.9499704837799072,
"learning_rate": 6.689992178086174e-07,
"loss": 1.2271,
"step": 952
},
{
"epoch": 0.7624,
"grad_norm": 1.458235263824463,
"learning_rate": 6.647269458542793e-07,
"loss": 3.5244,
"step": 953
},
{
"epoch": 0.7632,
"grad_norm": 3.810727596282959,
"learning_rate": 6.604662664820063e-07,
"loss": 1.2276,
"step": 954
},
{
"epoch": 0.764,
"grad_norm": 1.6759514808654785,
"learning_rate": 6.562172066045655e-07,
"loss": 3.4945,
"step": 955
},
{
"epoch": 0.7648,
"grad_norm": 4.024814128875732,
"learning_rate": 6.519797930613289e-07,
"loss": 1.3065,
"step": 956
},
{
"epoch": 0.7656,
"grad_norm": 1.238553524017334,
"learning_rate": 6.477540526181036e-07,
"loss": 3.5006,
"step": 957
},
{
"epoch": 0.7664,
"grad_norm": 3.444575786590576,
"learning_rate": 6.435400119669618e-07,
"loss": 1.3996,
"step": 958
},
{
"epoch": 0.7672,
"grad_norm": 1.3021897077560425,
"learning_rate": 6.393376977260754e-07,
"loss": 3.5961,
"step": 959
},
{
"epoch": 0.768,
"grad_norm": 4.322812080383301,
"learning_rate": 6.351471364395448e-07,
"loss": 1.5874,
"step": 960
},
{
"epoch": 0.7688,
"grad_norm": 1.3130619525909424,
"learning_rate": 6.309683545772327e-07,
"loss": 3.5893,
"step": 961
},
{
"epoch": 0.7696,
"grad_norm": 4.154742240905762,
"learning_rate": 6.268013785345969e-07,
"loss": 1.5529,
"step": 962
},
{
"epoch": 0.7704,
"grad_norm": 1.2372699975967407,
"learning_rate": 6.226462346325221e-07,
"loss": 3.5887,
"step": 963
},
{
"epoch": 0.7712,
"grad_norm": 3.7366716861724854,
"learning_rate": 6.185029491171554e-07,
"loss": 1.3078,
"step": 964
},
{
"epoch": 0.772,
"grad_norm": 1.2591793537139893,
"learning_rate": 6.143715481597404e-07,
"loss": 3.5405,
"step": 965
},
{
"epoch": 0.7728,
"grad_norm": 3.966529369354248,
"learning_rate": 6.102520578564508e-07,
"loss": 1.2979,
"step": 966
},
{
"epoch": 0.7736,
"grad_norm": 1.7405962944030762,
"learning_rate": 6.061445042282271e-07,
"loss": 3.4681,
"step": 967
},
{
"epoch": 0.7744,
"grad_norm": 4.989678859710693,
"learning_rate": 6.02048913220609e-07,
"loss": 1.6273,
"step": 968
},
{
"epoch": 0.7752,
"grad_norm": 1.1819043159484863,
"learning_rate": 5.979653107035754e-07,
"loss": 3.5553,
"step": 969
},
{
"epoch": 0.776,
"grad_norm": 4.24968957901001,
"learning_rate": 5.9389372247138e-07,
"loss": 1.7848,
"step": 970
},
{
"epoch": 0.7768,
"grad_norm": 1.146349549293518,
"learning_rate": 5.898341742423866e-07,
"loss": 3.5557,
"step": 971
},
{
"epoch": 0.7776,
"grad_norm": 3.359968423843384,
"learning_rate": 5.857866916589089e-07,
"loss": 1.1097,
"step": 972
},
{
"epoch": 0.7784,
"grad_norm": 1.3294552564620972,
"learning_rate": 5.817513002870451e-07,
"loss": 3.5291,
"step": 973
},
{
"epoch": 0.7792,
"grad_norm": 3.7747585773468018,
"learning_rate": 5.777280256165218e-07,
"loss": 1.1422,
"step": 974
},
{
"epoch": 0.78,
"grad_norm": 1.3020869493484497,
"learning_rate": 5.737168930605272e-07,
"loss": 3.5797,
"step": 975
},
{
"epoch": 0.7808,
"grad_norm": 4.284913063049316,
"learning_rate": 5.697179279555551e-07,
"loss": 1.2182,
"step": 976
},
{
"epoch": 0.7816,
"grad_norm": 1.17784583568573,
"learning_rate": 5.657311555612433e-07,
"loss": 3.5849,
"step": 977
},
{
"epoch": 0.7824,
"grad_norm": 3.8503072261810303,
"learning_rate": 5.617566010602113e-07,
"loss": 1.1606,
"step": 978
},
{
"epoch": 0.7832,
"grad_norm": 1.4357177019119263,
"learning_rate": 5.577942895579064e-07,
"loss": 3.4606,
"step": 979
},
{
"epoch": 0.784,
"grad_norm": 4.020089626312256,
"learning_rate": 5.538442460824417e-07,
"loss": 1.2557,
"step": 980
},
{
"epoch": 0.7848,
"grad_norm": 1.3439040184020996,
"learning_rate": 5.499064955844383e-07,
"loss": 3.5545,
"step": 981
},
{
"epoch": 0.7856,
"grad_norm": 3.5121538639068604,
"learning_rate": 5.459810629368692e-07,
"loss": 1.1383,
"step": 982
},
{
"epoch": 0.7864,
"grad_norm": 1.4466603994369507,
"learning_rate": 5.420679729348993e-07,
"loss": 3.4426,
"step": 983
},
{
"epoch": 0.7872,
"grad_norm": 4.1092047691345215,
"learning_rate": 5.381672502957324e-07,
"loss": 1.3047,
"step": 984
},
{
"epoch": 0.788,
"grad_norm": 1.4652632474899292,
"learning_rate": 5.342789196584527e-07,
"loss": 3.4522,
"step": 985
},
{
"epoch": 0.7888,
"grad_norm": 4.341894626617432,
"learning_rate": 5.304030055838704e-07,
"loss": 1.5886,
"step": 986
},
{
"epoch": 0.7896,
"grad_norm": 1.5312821865081787,
"learning_rate": 5.26539532554364e-07,
"loss": 3.4746,
"step": 987
},
{
"epoch": 0.7904,
"grad_norm": 3.956395149230957,
"learning_rate": 5.226885249737292e-07,
"loss": 1.3278,
"step": 988
},
{
"epoch": 0.7912,
"grad_norm": 1.5505242347717285,
"learning_rate": 5.188500071670235e-07,
"loss": 3.4367,
"step": 989
},
{
"epoch": 0.792,
"grad_norm": 3.910429000854492,
"learning_rate": 5.150240033804116e-07,
"loss": 1.0932,
"step": 990
},
{
"epoch": 0.7928,
"grad_norm": 1.518563985824585,
"learning_rate": 5.112105377810128e-07,
"loss": 3.412,
"step": 991
},
{
"epoch": 0.7936,
"grad_norm": 3.3202965259552,
"learning_rate": 5.074096344567475e-07,
"loss": 1.1174,
"step": 992
},
{
"epoch": 0.7944,
"grad_norm": 1.5806505680084229,
"learning_rate": 5.036213174161877e-07,
"loss": 3.47,
"step": 993
},
{
"epoch": 0.7952,
"grad_norm": 6.9575324058532715,
"learning_rate": 4.998456105884025e-07,
"loss": 1.5321,
"step": 994
},
{
"epoch": 0.796,
"grad_norm": 1.1276708841323853,
"learning_rate": 4.960825378228082e-07,
"loss": 3.6015,
"step": 995
},
{
"epoch": 0.7968,
"grad_norm": 3.954547166824341,
"learning_rate": 4.923321228890184e-07,
"loss": 1.1861,
"step": 996
},
{
"epoch": 0.7976,
"grad_norm": 1.08054780960083,
"learning_rate": 4.885943894766909e-07,
"loss": 3.5029,
"step": 997
},
{
"epoch": 0.7984,
"grad_norm": 3.6978795528411865,
"learning_rate": 4.848693611953825e-07,
"loss": 1.3936,
"step": 998
},
{
"epoch": 0.7992,
"grad_norm": 1.0338634252548218,
"learning_rate": 4.811570615743952e-07,
"loss": 3.6014,
"step": 999
},
{
"epoch": 0.8,
"grad_norm": 4.1188507080078125,
"learning_rate": 4.774575140626317e-07,
"loss": 1.2529,
"step": 1000
},
{
"epoch": 0.8008,
"grad_norm": 1.9042516946792603,
"learning_rate": 4.7377074202844514e-07,
"loss": 3.4267,
"step": 1001
},
{
"epoch": 0.8016,
"grad_norm": 4.330513954162598,
"learning_rate": 4.700967687594901e-07,
"loss": 1.369,
"step": 1002
},
{
"epoch": 0.8024,
"grad_norm": 1.0320863723754883,
"learning_rate": 4.664356174625795e-07,
"loss": 3.5636,
"step": 1003
},
{
"epoch": 0.8032,
"grad_norm": 4.5047287940979,
"learning_rate": 4.6278731126353447e-07,
"loss": 1.3017,
"step": 1004
},
{
"epoch": 0.804,
"grad_norm": 1.59553062915802,
"learning_rate": 4.591518732070402e-07,
"loss": 3.5466,
"step": 1005
},
{
"epoch": 0.8048,
"grad_norm": 3.6305763721466064,
"learning_rate": 4.555293262564994e-07,
"loss": 1.3101,
"step": 1006
},
{
"epoch": 0.8056,
"grad_norm": 1.155205488204956,
"learning_rate": 4.5191969329388627e-07,
"loss": 3.5494,
"step": 1007
},
{
"epoch": 0.8064,
"grad_norm": 4.001699924468994,
"learning_rate": 4.483229971196054e-07,
"loss": 1.1268,
"step": 1008
},
{
"epoch": 0.8072,
"grad_norm": 1.1981041431427002,
"learning_rate": 4.447392604523443e-07,
"loss": 3.5732,
"step": 1009
},
{
"epoch": 0.808,
"grad_norm": 3.6024370193481445,
"learning_rate": 4.411685059289314e-07,
"loss": 1.1444,
"step": 1010
},
{
"epoch": 0.8088,
"grad_norm": 1.3383228778839111,
"learning_rate": 4.376107561041937e-07,
"loss": 3.5367,
"step": 1011
},
{
"epoch": 0.8096,
"grad_norm": 3.9421496391296387,
"learning_rate": 4.340660334508115e-07,
"loss": 1.3883,
"step": 1012
},
{
"epoch": 0.8104,
"grad_norm": 1.0924482345581055,
"learning_rate": 4.305343603591802e-07,
"loss": 3.5681,
"step": 1013
},
{
"epoch": 0.8112,
"grad_norm": 3.4752144813537598,
"learning_rate": 4.2701575913726644e-07,
"loss": 1.059,
"step": 1014
},
{
"epoch": 0.812,
"grad_norm": 1.952444314956665,
"learning_rate": 4.235102520104681e-07,
"loss": 3.5588,
"step": 1015
},
{
"epoch": 0.8128,
"grad_norm": 4.0423688888549805,
"learning_rate": 4.200178611214736e-07,
"loss": 1.1042,
"step": 1016
},
{
"epoch": 0.8136,
"grad_norm": 1.218482494354248,
"learning_rate": 4.165386085301212e-07,
"loss": 3.5486,
"step": 1017
},
{
"epoch": 0.8144,
"grad_norm": 4.175278663635254,
"learning_rate": 4.1307251621326124e-07,
"loss": 1.4889,
"step": 1018
},
{
"epoch": 0.8152,
"grad_norm": 2.6647427082061768,
"learning_rate": 4.096196060646168e-07,
"loss": 3.5716,
"step": 1019
},
{
"epoch": 0.816,
"grad_norm": 4.009509563446045,
"learning_rate": 4.061798998946459e-07,
"loss": 1.2765,
"step": 1020
},
{
"epoch": 0.8168,
"grad_norm": 1.1483063697814941,
"learning_rate": 4.0275341943040057e-07,
"loss": 3.6826,
"step": 1021
},
{
"epoch": 0.8176,
"grad_norm": 3.944807291030884,
"learning_rate": 3.9934018631539506e-07,
"loss": 1.2861,
"step": 1022
},
{
"epoch": 0.8184,
"grad_norm": 1.6391054391860962,
"learning_rate": 3.9594022210946355e-07,
"loss": 3.3965,
"step": 1023
},
{
"epoch": 0.8192,
"grad_norm": 3.9761102199554443,
"learning_rate": 3.925535482886286e-07,
"loss": 1.2771,
"step": 1024
},
{
"epoch": 0.82,
"grad_norm": 1.8166158199310303,
"learning_rate": 3.891801862449629e-07,
"loss": 3.481,
"step": 1025
},
{
"epoch": 0.8208,
"grad_norm": 3.909714460372925,
"learning_rate": 3.8582015728645366e-07,
"loss": 1.3296,
"step": 1026
},
{
"epoch": 0.8216,
"grad_norm": 1.1448289155960083,
"learning_rate": 3.8247348263687035e-07,
"loss": 3.5438,
"step": 1027
},
{
"epoch": 0.8224,
"grad_norm": 3.7021570205688477,
"learning_rate": 3.7914018343562896e-07,
"loss": 1.3568,
"step": 1028
},
{
"epoch": 0.8232,
"grad_norm": 1.1746755838394165,
"learning_rate": 3.75820280737659e-07,
"loss": 3.631,
"step": 1029
},
{
"epoch": 0.824,
"grad_norm": 4.372186660766602,
"learning_rate": 3.725137955132707e-07,
"loss": 1.5514,
"step": 1030
},
{
"epoch": 0.8248,
"grad_norm": 1.2693135738372803,
"learning_rate": 3.6922074864802095e-07,
"loss": 3.6151,
"step": 1031
},
{
"epoch": 0.8256,
"grad_norm": 4.060328483581543,
"learning_rate": 3.659411609425834e-07,
"loss": 1.2585,
"step": 1032
},
{
"epoch": 0.8264,
"grad_norm": 1.1194394826889038,
"learning_rate": 3.626750531126169e-07,
"loss": 3.5576,
"step": 1033
},
{
"epoch": 0.8272,
"grad_norm": 4.196378707885742,
"learning_rate": 3.594224457886336e-07,
"loss": 1.1795,
"step": 1034
},
{
"epoch": 0.828,
"grad_norm": 1.4582164287567139,
"learning_rate": 3.561833595158698e-07,
"loss": 3.4901,
"step": 1035
},
{
"epoch": 0.8288,
"grad_norm": 3.783414602279663,
"learning_rate": 3.529578147541532e-07,
"loss": 1.1758,
"step": 1036
},
{
"epoch": 0.8296,
"grad_norm": 1.4051135778427124,
"learning_rate": 3.4974583187777853e-07,
"loss": 3.4493,
"step": 1037
},
{
"epoch": 0.8304,
"grad_norm": 3.584596633911133,
"learning_rate": 3.4654743117537525e-07,
"loss": 1.2126,
"step": 1038
},
{
"epoch": 0.8312,
"grad_norm": 1.3267326354980469,
"learning_rate": 3.433626328497805e-07,
"loss": 3.6435,
"step": 1039
},
{
"epoch": 0.832,
"grad_norm": 4.257800579071045,
"learning_rate": 3.4019145701791186e-07,
"loss": 1.4825,
"step": 1040
},
{
"epoch": 0.8328,
"grad_norm": 1.1711785793304443,
"learning_rate": 3.370339237106385e-07,
"loss": 3.5212,
"step": 1041
},
{
"epoch": 0.8336,
"grad_norm": 4.394068717956543,
"learning_rate": 3.3389005287265713e-07,
"loss": 1.1283,
"step": 1042
},
{
"epoch": 0.8344,
"grad_norm": 1.297494888305664,
"learning_rate": 3.3075986436236494e-07,
"loss": 3.5152,
"step": 1043
},
{
"epoch": 0.8352,
"grad_norm": 3.9251017570495605,
"learning_rate": 3.2764337795173433e-07,
"loss": 1.2356,
"step": 1044
},
{
"epoch": 0.836,
"grad_norm": 1.0191597938537598,
"learning_rate": 3.245406133261858e-07,
"loss": 3.6092,
"step": 1045
},
{
"epoch": 0.8368,
"grad_norm": 4.02804708480835,
"learning_rate": 3.214515900844681e-07,
"loss": 1.2928,
"step": 1046
},
{
"epoch": 0.8376,
"grad_norm": 1.1345746517181396,
"learning_rate": 3.18376327738531e-07,
"loss": 3.5869,
"step": 1047
},
{
"epoch": 0.8384,
"grad_norm": 4.080638408660889,
"learning_rate": 3.15314845713402e-07,
"loss": 1.3423,
"step": 1048
},
{
"epoch": 0.8392,
"grad_norm": 1.3001468181610107,
"learning_rate": 3.122671633470664e-07,
"loss": 3.4875,
"step": 1049
},
{
"epoch": 0.84,
"grad_norm": 3.684081792831421,
"learning_rate": 3.092332998903416e-07,
"loss": 1.3089,
"step": 1050
},
{
"epoch": 0.8408,
"grad_norm": 1.3111592531204224,
"learning_rate": 3.0621327450675806e-07,
"loss": 3.5502,
"step": 1051
},
{
"epoch": 0.8416,
"grad_norm": 4.330699443817139,
"learning_rate": 3.0320710627243815e-07,
"loss": 1.4276,
"step": 1052
},
{
"epoch": 0.8424,
"grad_norm": 1.4837126731872559,
"learning_rate": 3.002148141759739e-07,
"loss": 3.5433,
"step": 1053
},
{
"epoch": 0.8432,
"grad_norm": 3.8255903720855713,
"learning_rate": 2.9723641711830896e-07,
"loss": 1.3503,
"step": 1054
},
{
"epoch": 0.844,
"grad_norm": 1.0839451551437378,
"learning_rate": 2.942719339126171e-07,
"loss": 3.659,
"step": 1055
},
{
"epoch": 0.8448,
"grad_norm": 4.035921573638916,
"learning_rate": 2.913213832841857e-07,
"loss": 1.3085,
"step": 1056
},
{
"epoch": 0.8456,
"grad_norm": 1.2930865287780762,
"learning_rate": 2.8838478387029605e-07,
"loss": 3.4512,
"step": 1057
},
{
"epoch": 0.8464,
"grad_norm": 3.7543997764587402,
"learning_rate": 2.854621542201064e-07,
"loss": 1.1318,
"step": 1058
},
{
"epoch": 0.8472,
"grad_norm": 1.1573505401611328,
"learning_rate": 2.8255351279453446e-07,
"loss": 3.5605,
"step": 1059
},
{
"epoch": 0.848,
"grad_norm": 3.8682708740234375,
"learning_rate": 2.796588779661388e-07,
"loss": 1.3628,
"step": 1060
},
{
"epoch": 0.8488,
"grad_norm": 2.039510726928711,
"learning_rate": 2.767782680190073e-07,
"loss": 3.5517,
"step": 1061
},
{
"epoch": 0.8496,
"grad_norm": 3.9016358852386475,
"learning_rate": 2.739117011486378e-07,
"loss": 1.1586,
"step": 1062
},
{
"epoch": 0.8504,
"grad_norm": 1.1205612421035767,
"learning_rate": 2.710591954618247e-07,
"loss": 3.5143,
"step": 1063
},
{
"epoch": 0.8512,
"grad_norm": 4.346203327178955,
"learning_rate": 2.6822076897654453e-07,
"loss": 1.3599,
"step": 1064
},
{
"epoch": 0.852,
"grad_norm": 1.4595547914505005,
"learning_rate": 2.653964396218406e-07,
"loss": 3.5174,
"step": 1065
},
{
"epoch": 0.8528,
"grad_norm": 3.893127918243408,
"learning_rate": 2.625862252377129e-07,
"loss": 1.1346,
"step": 1066
},
{
"epoch": 0.8536,
"grad_norm": 1.3180551528930664,
"learning_rate": 2.597901435750025e-07,
"loss": 3.4543,
"step": 1067
},
{
"epoch": 0.8544,
"grad_norm": 3.9734368324279785,
"learning_rate": 2.5700821229528164e-07,
"loss": 1.2548,
"step": 1068
},
{
"epoch": 0.8552,
"grad_norm": 1.505300521850586,
"learning_rate": 2.5424044897073895e-07,
"loss": 3.5335,
"step": 1069
},
{
"epoch": 0.856,
"grad_norm": 3.921257972717285,
"learning_rate": 2.514868710840723e-07,
"loss": 1.5256,
"step": 1070
},
{
"epoch": 0.8568,
"grad_norm": 1.551336407661438,
"learning_rate": 2.48747496028377e-07,
"loss": 3.3823,
"step": 1071
},
{
"epoch": 0.8576,
"grad_norm": 3.929121494293213,
"learning_rate": 2.460223411070337e-07,
"loss": 1.2628,
"step": 1072
},
{
"epoch": 0.8584,
"grad_norm": 1.1952719688415527,
"learning_rate": 2.4331142353360206e-07,
"loss": 3.4138,
"step": 1073
},
{
"epoch": 0.8592,
"grad_norm": 3.588552713394165,
"learning_rate": 2.406147604317119e-07,
"loss": 1.2508,
"step": 1074
},
{
"epoch": 0.86,
"grad_norm": 1.0674008131027222,
"learning_rate": 2.3793236883495164e-07,
"loss": 3.5885,
"step": 1075
},
{
"epoch": 0.8608,
"grad_norm": 3.9291443824768066,
"learning_rate": 2.3526426568676485e-07,
"loss": 1.5289,
"step": 1076
},
{
"epoch": 0.8616,
"grad_norm": 1.1263163089752197,
"learning_rate": 2.3261046784034154e-07,
"loss": 3.5685,
"step": 1077
},
{
"epoch": 0.8624,
"grad_norm": 3.7272915840148926,
"learning_rate": 2.299709920585108e-07,
"loss": 1.0725,
"step": 1078
},
{
"epoch": 0.8632,
"grad_norm": 1.9841383695602417,
"learning_rate": 2.2734585501363676e-07,
"loss": 3.4305,
"step": 1079
},
{
"epoch": 0.864,
"grad_norm": 3.725369691848755,
"learning_rate": 2.2473507328751086e-07,
"loss": 1.2885,
"step": 1080
},
{
"epoch": 0.8648,
"grad_norm": 1.2514499425888062,
"learning_rate": 2.2213866337125022e-07,
"loss": 3.6041,
"step": 1081
},
{
"epoch": 0.8656,
"grad_norm": 3.798311233520508,
"learning_rate": 2.1955664166519036e-07,
"loss": 1.3569,
"step": 1082
},
{
"epoch": 0.8664,
"grad_norm": 1.05547034740448,
"learning_rate": 2.1698902447878478e-07,
"loss": 3.6443,
"step": 1083
},
{
"epoch": 0.8672,
"grad_norm": 4.112440586090088,
"learning_rate": 2.1443582803049757e-07,
"loss": 1.3431,
"step": 1084
},
{
"epoch": 0.868,
"grad_norm": 1.1724605560302734,
"learning_rate": 2.118970684477062e-07,
"loss": 3.5914,
"step": 1085
},
{
"epoch": 0.8688,
"grad_norm": 3.977243423461914,
"learning_rate": 2.0937276176659553e-07,
"loss": 1.4519,
"step": 1086
},
{
"epoch": 0.8696,
"grad_norm": 1.413366436958313,
"learning_rate": 2.068629239320588e-07,
"loss": 3.5239,
"step": 1087
},
{
"epoch": 0.8704,
"grad_norm": 3.696100950241089,
"learning_rate": 2.043675707975959e-07,
"loss": 1.5434,
"step": 1088
},
{
"epoch": 0.8712,
"grad_norm": 1.1970295906066895,
"learning_rate": 2.0188671812521293e-07,
"loss": 3.4977,
"step": 1089
},
{
"epoch": 0.872,
"grad_norm": 4.029970169067383,
"learning_rate": 1.9942038158532407e-07,
"loss": 1.3306,
"step": 1090
},
{
"epoch": 0.8728,
"grad_norm": 1.2960518598556519,
"learning_rate": 1.9696857675665122e-07,
"loss": 3.5162,
"step": 1091
},
{
"epoch": 0.8736,
"grad_norm": 3.725883960723877,
"learning_rate": 1.9453131912612694e-07,
"loss": 1.4022,
"step": 1092
},
{
"epoch": 0.8744,
"grad_norm": 1.3842031955718994,
"learning_rate": 1.9210862408879373e-07,
"loss": 3.5151,
"step": 1093
},
{
"epoch": 0.8752,
"grad_norm": 3.8603460788726807,
"learning_rate": 1.8970050694771064e-07,
"loss": 1.2135,
"step": 1094
},
{
"epoch": 0.876,
"grad_norm": 1.2414811849594116,
"learning_rate": 1.8730698291385518e-07,
"loss": 3.5374,
"step": 1095
},
{
"epoch": 0.8768,
"grad_norm": 4.625464916229248,
"learning_rate": 1.8492806710602495e-07,
"loss": 1.3096,
"step": 1096
},
{
"epoch": 0.8776,
"grad_norm": 1.5665608644485474,
"learning_rate": 1.8256377455074526e-07,
"loss": 3.4397,
"step": 1097
},
{
"epoch": 0.8784,
"grad_norm": 3.919268846511841,
"learning_rate": 1.802141201821736e-07,
"loss": 1.376,
"step": 1098
},
{
"epoch": 0.8792,
"grad_norm": 1.4185221195220947,
"learning_rate": 1.7787911884200314e-07,
"loss": 3.6158,
"step": 1099
},
{
"epoch": 0.88,
"grad_norm": 4.121542930603027,
"learning_rate": 1.7555878527937164e-07,
"loss": 1.3549,
"step": 1100
},
{
"epoch": 0.8808,
"grad_norm": 1.706099033355713,
"learning_rate": 1.7325313415076705e-07,
"loss": 3.5284,
"step": 1101
},
{
"epoch": 0.8816,
"grad_norm": 4.369479656219482,
"learning_rate": 1.7096218001993514e-07,
"loss": 1.5352,
"step": 1102
},
{
"epoch": 0.8824,
"grad_norm": 1.2528761625289917,
"learning_rate": 1.686859373577876e-07,
"loss": 3.6018,
"step": 1103
},
{
"epoch": 0.8832,
"grad_norm": 3.7873117923736572,
"learning_rate": 1.6642442054230935e-07,
"loss": 1.1694,
"step": 1104
},
{
"epoch": 0.884,
"grad_norm": 1.2879388332366943,
"learning_rate": 1.6417764385846996e-07,
"loss": 3.4757,
"step": 1105
},
{
"epoch": 0.8848,
"grad_norm": 3.334120988845825,
"learning_rate": 1.6194562149813241e-07,
"loss": 0.8637,
"step": 1106
},
{
"epoch": 0.8856,
"grad_norm": 1.3120352029800415,
"learning_rate": 1.5972836755996286e-07,
"loss": 3.4815,
"step": 1107
},
{
"epoch": 0.8864,
"grad_norm": 3.6376547813415527,
"learning_rate": 1.5752589604934255e-07,
"loss": 1.2615,
"step": 1108
},
{
"epoch": 0.8872,
"grad_norm": 1.1396851539611816,
"learning_rate": 1.5533822087827805e-07,
"loss": 3.5342,
"step": 1109
},
{
"epoch": 0.888,
"grad_norm": 3.7635209560394287,
"learning_rate": 1.5316535586531483e-07,
"loss": 1.1877,
"step": 1110
},
{
"epoch": 0.8888,
"grad_norm": 1.371699571609497,
"learning_rate": 1.5100731473544932e-07,
"loss": 3.5637,
"step": 1111
},
{
"epoch": 0.8896,
"grad_norm": 3.8787107467651367,
"learning_rate": 1.4886411112004258e-07,
"loss": 1.3821,
"step": 1112
},
{
"epoch": 0.8904,
"grad_norm": 1.8077179193496704,
"learning_rate": 1.4673575855673278e-07,
"loss": 3.4341,
"step": 1113
},
{
"epoch": 0.8912,
"grad_norm": 4.23999547958374,
"learning_rate": 1.4462227048935185e-07,
"loss": 1.5234,
"step": 1114
},
{
"epoch": 0.892,
"grad_norm": 1.4485225677490234,
"learning_rate": 1.425236602678387e-07,
"loss": 3.4551,
"step": 1115
},
{
"epoch": 0.8928,
"grad_norm": 3.488999128341675,
"learning_rate": 1.4043994114815663e-07,
"loss": 1.1846,
"step": 1116
},
{
"epoch": 0.8936,
"grad_norm": 1.237518072128296,
"learning_rate": 1.38371126292208e-07,
"loss": 3.5263,
"step": 1117
},
{
"epoch": 0.8944,
"grad_norm": 3.7093005180358887,
"learning_rate": 1.3631722876775137e-07,
"loss": 1.3514,
"step": 1118
},
{
"epoch": 0.8952,
"grad_norm": 1.2599142789840698,
"learning_rate": 1.342782615483204e-07,
"loss": 3.528,
"step": 1119
},
{
"epoch": 0.896,
"grad_norm": 3.7309329509735107,
"learning_rate": 1.3225423751313942e-07,
"loss": 1.5911,
"step": 1120
},
{
"epoch": 0.8968,
"grad_norm": 1.202618956565857,
"learning_rate": 1.3024516944704495e-07,
"loss": 3.4832,
"step": 1121
},
{
"epoch": 0.8976,
"grad_norm": 4.492614269256592,
"learning_rate": 1.2825107004040272e-07,
"loss": 1.2915,
"step": 1122
},
{
"epoch": 0.8984,
"grad_norm": 1.1479798555374146,
"learning_rate": 1.262719518890279e-07,
"loss": 3.5571,
"step": 1123
},
{
"epoch": 0.8992,
"grad_norm": 4.050600528717041,
"learning_rate": 1.2430782749410676e-07,
"loss": 1.388,
"step": 1124
},
{
"epoch": 0.9,
"grad_norm": 1.292321801185608,
"learning_rate": 1.223587092621162e-07,
"loss": 3.5855,
"step": 1125
},
{
"epoch": 0.9008,
"grad_norm": 4.229612350463867,
"learning_rate": 1.204246095047465e-07,
"loss": 1.3577,
"step": 1126
},
{
"epoch": 0.9016,
"grad_norm": 1.274814248085022,
"learning_rate": 1.1850554043882329e-07,
"loss": 3.5057,
"step": 1127
},
{
"epoch": 0.9024,
"grad_norm": 3.170250654220581,
"learning_rate": 1.1660151418622923e-07,
"loss": 0.8845,
"step": 1128
},
{
"epoch": 0.9032,
"grad_norm": 1.3429255485534668,
"learning_rate": 1.1471254277382882e-07,
"loss": 3.5239,
"step": 1129
},
{
"epoch": 0.904,
"grad_norm": 3.8732850551605225,
"learning_rate": 1.1283863813339263e-07,
"loss": 1.4954,
"step": 1130
},
{
"epoch": 0.9048,
"grad_norm": 1.0475130081176758,
"learning_rate": 1.1097981210152042e-07,
"loss": 3.5743,
"step": 1131
},
{
"epoch": 0.9056,
"grad_norm": 4.163371562957764,
"learning_rate": 1.0913607641956842e-07,
"loss": 1.3211,
"step": 1132
},
{
"epoch": 0.9064,
"grad_norm": 1.1388672590255737,
"learning_rate": 1.0730744273357213e-07,
"loss": 3.6136,
"step": 1133
},
{
"epoch": 0.9072,
"grad_norm": 3.882986068725586,
"learning_rate": 1.0549392259417646e-07,
"loss": 1.1432,
"step": 1134
},
{
"epoch": 0.908,
"grad_norm": 1.1615536212921143,
"learning_rate": 1.0369552745656014e-07,
"loss": 3.6521,
"step": 1135
},
{
"epoch": 0.9088,
"grad_norm": 3.6023221015930176,
"learning_rate": 1.0191226868036419e-07,
"loss": 1.3323,
"step": 1136
},
{
"epoch": 0.9096,
"grad_norm": 1.2144973278045654,
"learning_rate": 1.0014415752962081e-07,
"loss": 3.5626,
"step": 1137
},
{
"epoch": 0.9104,
"grad_norm": 3.877840280532837,
"learning_rate": 9.839120517267986e-08,
"loss": 1.3083,
"step": 1138
},
{
"epoch": 0.9112,
"grad_norm": 1.4756907224655151,
"learning_rate": 9.665342268214167e-08,
"loss": 3.4514,
"step": 1139
},
{
"epoch": 0.912,
"grad_norm": 4.363102436065674,
"learning_rate": 9.493082103478519e-08,
"loss": 1.1601,
"step": 1140
},
{
"epoch": 0.9128,
"grad_norm": 1.2879115343093872,
"learning_rate": 9.322341111149852e-08,
"loss": 3.4346,
"step": 1141
},
{
"epoch": 0.9136,
"grad_norm": 4.510580539703369,
"learning_rate": 9.153120369721047e-08,
"loss": 1.3901,
"step": 1142
},
{
"epoch": 0.9144,
"grad_norm": 1.3555859327316284,
"learning_rate": 8.985420948082329e-08,
"loss": 3.4953,
"step": 1143
},
{
"epoch": 0.9152,
"grad_norm": 4.071751594543457,
"learning_rate": 8.819243905514308e-08,
"loss": 1.2933,
"step": 1144
},
{
"epoch": 0.916,
"grad_norm": 1.0624727010726929,
"learning_rate": 8.654590291681531e-08,
"loss": 3.6109,
"step": 1145
},
{
"epoch": 0.9168,
"grad_norm": 4.541050910949707,
"learning_rate": 8.491461146625774e-08,
"loss": 1.5013,
"step": 1146
},
{
"epoch": 0.9176,
"grad_norm": 1.036971926689148,
"learning_rate": 8.329857500759291e-08,
"loss": 3.5826,
"step": 1147
},
{
"epoch": 0.9184,
"grad_norm": 4.1964287757873535,
"learning_rate": 8.169780374858577e-08,
"loss": 1.4736,
"step": 1148
},
{
"epoch": 0.9192,
"grad_norm": 1.3899742364883423,
"learning_rate": 8.011230780057749e-08,
"loss": 3.4604,
"step": 1149
},
{
"epoch": 0.92,
"grad_norm": 3.7320985794067383,
"learning_rate": 7.854209717842231e-08,
"loss": 1.1507,
"step": 1150
},
{
"epoch": 0.9208,
"grad_norm": 1.4710829257965088,
"learning_rate": 7.698718180042392e-08,
"loss": 3.5542,
"step": 1151
},
{
"epoch": 0.9216,
"grad_norm": 3.88554048538208,
"learning_rate": 7.544757148827297e-08,
"loss": 1.0699,
"step": 1152
},
{
"epoch": 0.9224,
"grad_norm": 1.352371096611023,
"learning_rate": 7.392327596698474e-08,
"loss": 3.5077,
"step": 1153
},
{
"epoch": 0.9232,
"grad_norm": 3.7906062602996826,
"learning_rate": 7.24143048648382e-08,
"loss": 1.3162,
"step": 1154
},
{
"epoch": 0.924,
"grad_norm": 1.3275525569915771,
"learning_rate": 7.092066771331507e-08,
"loss": 3.516,
"step": 1155
},
{
"epoch": 0.9248,
"grad_norm": 3.684339761734009,
"learning_rate": 6.944237394703985e-08,
"loss": 1.0855,
"step": 1156
},
{
"epoch": 0.9256,
"grad_norm": 1.6030592918395996,
"learning_rate": 6.797943290371839e-08,
"loss": 3.3999,
"step": 1157
},
{
"epoch": 0.9264,
"grad_norm": 3.9943041801452637,
"learning_rate": 6.653185382408195e-08,
"loss": 1.3748,
"step": 1158
},
{
"epoch": 0.9272,
"grad_norm": 2.058311939239502,
"learning_rate": 6.509964585182688e-08,
"loss": 3.4637,
"step": 1159
},
{
"epoch": 0.928,
"grad_norm": 4.087345123291016,
"learning_rate": 6.368281803355692e-08,
"loss": 1.3247,
"step": 1160
},
{
"epoch": 0.9288,
"grad_norm": 1.4231693744659424,
"learning_rate": 6.228137931872713e-08,
"loss": 3.5084,
"step": 1161
},
{
"epoch": 0.9296,
"grad_norm": 3.276982545852661,
"learning_rate": 6.089533855958508e-08,
"loss": 1.0859,
"step": 1162
},
{
"epoch": 0.9304,
"grad_norm": 0.9627519249916077,
"learning_rate": 5.9524704511118305e-08,
"loss": 3.6085,
"step": 1163
},
{
"epoch": 0.9312,
"grad_norm": 4.000705242156982,
"learning_rate": 5.8169485830996134e-08,
"loss": 1.2021,
"step": 1164
},
{
"epoch": 0.932,
"grad_norm": 1.0772417783737183,
"learning_rate": 5.68296910795163e-08,
"loss": 3.5649,
"step": 1165
},
{
"epoch": 0.9328,
"grad_norm": 4.611580848693848,
"learning_rate": 5.550532871955061e-08,
"loss": 1.2716,
"step": 1166
},
{
"epoch": 0.9336,
"grad_norm": 1.6169544458389282,
"learning_rate": 5.419640711649188e-08,
"loss": 3.4921,
"step": 1167
},
{
"epoch": 0.9344,
"grad_norm": 3.6111767292022705,
"learning_rate": 5.290293453819956e-08,
"loss": 1.1447,
"step": 1168
},
{
"epoch": 0.9352,
"grad_norm": 1.527208924293518,
"learning_rate": 5.162491915495005e-08,
"loss": 3.5345,
"step": 1169
},
{
"epoch": 0.936,
"grad_norm": 3.3724429607391357,
"learning_rate": 5.036236903938285e-08,
"loss": 1.1051,
"step": 1170
},
{
"epoch": 0.9368,
"grad_norm": 1.2857189178466797,
"learning_rate": 4.911529216645089e-08,
"loss": 3.5927,
"step": 1171
},
{
"epoch": 0.9376,
"grad_norm": 3.823451519012451,
"learning_rate": 4.788369641336943e-08,
"loss": 1.2766,
"step": 1172
},
{
"epoch": 0.9384,
"grad_norm": 1.3951259851455688,
"learning_rate": 4.6667589559566405e-08,
"loss": 3.5188,
"step": 1173
},
{
"epoch": 0.9392,
"grad_norm": 4.200174331665039,
"learning_rate": 4.546697928663357e-08,
"loss": 1.4409,
"step": 1174
},
{
"epoch": 0.94,
"grad_norm": 1.4412181377410889,
"learning_rate": 4.428187317827848e-08,
"loss": 3.536,
"step": 1175
},
{
"epoch": 0.9408,
"grad_norm": 4.055942058563232,
"learning_rate": 4.311227872027479e-08,
"loss": 1.3862,
"step": 1176
},
{
"epoch": 0.9416,
"grad_norm": 1.1776350736618042,
"learning_rate": 4.1958203300417056e-08,
"loss": 3.6454,
"step": 1177
},
{
"epoch": 0.9424,
"grad_norm": 3.8492658138275146,
"learning_rate": 4.0819654208472947e-08,
"loss": 1.2609,
"step": 1178
},
{
"epoch": 0.9432,
"grad_norm": 1.2920982837677002,
"learning_rate": 3.969663863613721e-08,
"loss": 3.4813,
"step": 1179
},
{
"epoch": 0.944,
"grad_norm": 3.726270914077759,
"learning_rate": 3.8589163676986674e-08,
"loss": 1.3,
"step": 1180
},
{
"epoch": 0.9448,
"grad_norm": 1.0104079246520996,
"learning_rate": 3.749723632643476e-08,
"loss": 3.6193,
"step": 1181
},
{
"epoch": 0.9456,
"grad_norm": 3.768679618835449,
"learning_rate": 3.642086348168844e-08,
"loss": 1.2007,
"step": 1182
},
{
"epoch": 0.9464,
"grad_norm": 1.5914446115493774,
"learning_rate": 3.536005194170328e-08,
"loss": 3.4693,
"step": 1183
},
{
"epoch": 0.9472,
"grad_norm": 3.930814743041992,
"learning_rate": 3.431480840714152e-08,
"loss": 1.4124,
"step": 1184
},
{
"epoch": 0.948,
"grad_norm": 1.1689213514328003,
"learning_rate": 3.328513948032991e-08,
"loss": 3.5226,
"step": 1185
},
{
"epoch": 0.9488,
"grad_norm": 3.568666934967041,
"learning_rate": 3.227105166521638e-08,
"loss": 1.3847,
"step": 1186
},
{
"epoch": 0.9496,
"grad_norm": 1.2137675285339355,
"learning_rate": 3.127255136733093e-08,
"loss": 3.5211,
"step": 1187
},
{
"epoch": 0.9504,
"grad_norm": 4.159763336181641,
"learning_rate": 3.028964489374453e-08,
"loss": 1.3348,
"step": 1188
},
{
"epoch": 0.9512,
"grad_norm": 0.9644594788551331,
"learning_rate": 2.9322338453028066e-08,
"loss": 3.5866,
"step": 1189
},
{
"epoch": 0.952,
"grad_norm": 3.9226300716400146,
"learning_rate": 2.8370638155215125e-08,
"loss": 1.4359,
"step": 1190
},
{
"epoch": 0.9528,
"grad_norm": 1.1887046098709106,
"learning_rate": 2.7434550011761763e-08,
"loss": 3.578,
"step": 1191
},
{
"epoch": 0.9536,
"grad_norm": 3.7943222522735596,
"learning_rate": 2.6514079935509586e-08,
"loss": 1.2984,
"step": 1192
},
{
"epoch": 0.9544,
"grad_norm": 1.480806589126587,
"learning_rate": 2.560923374064772e-08,
"loss": 3.4495,
"step": 1193
},
{
"epoch": 0.9552,
"grad_norm": 3.667187213897705,
"learning_rate": 2.4720017142676745e-08,
"loss": 1.4821,
"step": 1194
},
{
"epoch": 0.956,
"grad_norm": 1.1104971170425415,
"learning_rate": 2.3846435758372034e-08,
"loss": 3.6191,
"step": 1195
},
{
"epoch": 0.9568,
"grad_norm": 3.9890453815460205,
"learning_rate": 2.2988495105748245e-08,
"loss": 1.2608,
"step": 1196
},
{
"epoch": 0.9576,
"grad_norm": 1.3386608362197876,
"learning_rate": 2.2146200604024614e-08,
"loss": 3.5502,
"step": 1197
},
{
"epoch": 0.9584,
"grad_norm": 3.8145041465759277,
"learning_rate": 2.131955757359111e-08,
"loss": 1.3914,
"step": 1198
},
{
"epoch": 0.9592,
"grad_norm": 1.692157506942749,
"learning_rate": 2.050857123597455e-08,
"loss": 3.5147,
"step": 1199
},
{
"epoch": 0.96,
"grad_norm": 3.8497886657714844,
"learning_rate": 1.9713246713805588e-08,
"loss": 1.2747,
"step": 1200
},
{
"epoch": 0.9608,
"grad_norm": 1.7304649353027344,
"learning_rate": 1.893358903078568e-08,
"loss": 3.4559,
"step": 1201
},
{
"epoch": 0.9616,
"grad_norm": 4.028602123260498,
"learning_rate": 1.8169603111656554e-08,
"loss": 1.2436,
"step": 1202
},
{
"epoch": 0.9624,
"grad_norm": 1.0460162162780762,
"learning_rate": 1.7421293782168837e-08,
"loss": 3.6491,
"step": 1203
},
{
"epoch": 0.9632,
"grad_norm": 4.187633514404297,
"learning_rate": 1.6688665769050704e-08,
"loss": 1.2076,
"step": 1204
},
{
"epoch": 0.964,
"grad_norm": 1.656624674797058,
"learning_rate": 1.5971723699979015e-08,
"loss": 3.5022,
"step": 1205
},
{
"epoch": 0.9648,
"grad_norm": 4.018679141998291,
"learning_rate": 1.5270472103549317e-08,
"loss": 1.4379,
"step": 1206
},
{
"epoch": 0.9656,
"grad_norm": 1.5885015726089478,
"learning_rate": 1.4584915409248113e-08,
"loss": 3.4547,
"step": 1207
},
{
"epoch": 0.9664,
"grad_norm": 3.9813663959503174,
"learning_rate": 1.3915057947423705e-08,
"loss": 1.3217,
"step": 1208
},
{
"epoch": 0.9672,
"grad_norm": 1.4755148887634277,
"learning_rate": 1.3260903949260107e-08,
"loss": 3.4995,
"step": 1209
},
{
"epoch": 0.968,
"grad_norm": 3.5924222469329834,
"learning_rate": 1.2622457546749567e-08,
"loss": 1.3469,
"step": 1210
},
{
"epoch": 0.9688,
"grad_norm": 1.0457367897033691,
"learning_rate": 1.1999722772666478e-08,
"loss": 3.5185,
"step": 1211
},
{
"epoch": 0.9696,
"grad_norm": 4.9514994621276855,
"learning_rate": 1.1392703560542118e-08,
"loss": 1.3577,
"step": 1212
},
{
"epoch": 0.9704,
"grad_norm": 1.328444004058838,
"learning_rate": 1.0801403744639672e-08,
"loss": 3.4504,
"step": 1213
},
{
"epoch": 0.9712,
"grad_norm": 3.700564384460449,
"learning_rate": 1.0225827059930082e-08,
"loss": 1.2764,
"step": 1214
},
{
"epoch": 0.972,
"grad_norm": 1.7747372388839722,
"learning_rate": 9.665977142068738e-09,
"loss": 3.4396,
"step": 1215
},
{
"epoch": 0.9728,
"grad_norm": 3.901719331741333,
"learning_rate": 9.121857527372157e-09,
"loss": 1.4179,
"step": 1216
},
{
"epoch": 0.9736,
"grad_norm": 1.1439679861068726,
"learning_rate": 8.59347165279495e-09,
"loss": 3.5297,
"step": 1217
},
{
"epoch": 0.9744,
"grad_norm": 4.542992115020752,
"learning_rate": 8.080822855909832e-09,
"loss": 1.4076,
"step": 1218
},
{
"epoch": 0.9752,
"grad_norm": 1.05239737033844,
"learning_rate": 7.583914374885426e-09,
"loss": 3.6203,
"step": 1219
},
{
"epoch": 0.976,
"grad_norm": 3.649535655975342,
"learning_rate": 7.102749348465166e-09,
"loss": 1.2697,
"step": 1220
},
{
"epoch": 0.9768,
"grad_norm": 1.6955548524856567,
"learning_rate": 6.6373308159495275e-09,
"loss": 3.4582,
"step": 1221
},
{
"epoch": 0.9776,
"grad_norm": 4.211562156677246,
"learning_rate": 6.1876617171743865e-09,
"loss": 1.3995,
"step": 1222
},
{
"epoch": 0.9784,
"grad_norm": 1.1870956420898438,
"learning_rate": 5.753744892494639e-09,
"loss": 3.5536,
"step": 1223
},
{
"epoch": 0.9792,
"grad_norm": 3.487827777862549,
"learning_rate": 5.335583082764495e-09,
"loss": 1.4411,
"step": 1224
},
{
"epoch": 0.98,
"grad_norm": 1.736832857131958,
"learning_rate": 4.933178929321103e-09,
"loss": 3.5151,
"step": 1225
},
{
"epoch": 0.9808,
"grad_norm": 3.914550304412842,
"learning_rate": 4.546534973968175e-09,
"loss": 1.2732,
"step": 1226
},
{
"epoch": 0.9816,
"grad_norm": 1.4647449254989624,
"learning_rate": 4.175653658958501e-09,
"loss": 3.3779,
"step": 1227
},
{
"epoch": 0.9824,
"grad_norm": 4.559305191040039,
"learning_rate": 3.820537326980622e-09,
"loss": 1.5739,
"step": 1228
},
{
"epoch": 0.9832,
"grad_norm": 1.1620067358016968,
"learning_rate": 3.481188221142184e-09,
"loss": 3.5552,
"step": 1229
},
{
"epoch": 0.984,
"grad_norm": 3.963010787963867,
"learning_rate": 3.1576084849563315e-09,
"loss": 1.3199,
"step": 1230
},
{
"epoch": 0.9848,
"grad_norm": 1.101914644241333,
"learning_rate": 2.849800162328664e-09,
"loss": 3.5772,
"step": 1231
},
{
"epoch": 0.9856,
"grad_norm": 3.9038467407226562,
"learning_rate": 2.557765197543638e-09,
"loss": 1.2684,
"step": 1232
},
{
"epoch": 0.9864,
"grad_norm": 1.2498347759246826,
"learning_rate": 2.2815054352531842e-09,
"loss": 3.6124,
"step": 1233
},
{
"epoch": 0.9872,
"grad_norm": 3.7474238872528076,
"learning_rate": 2.0210226204639414e-09,
"loss": 1.2981,
"step": 1234
},
{
"epoch": 0.988,
"grad_norm": 1.3778389692306519,
"learning_rate": 1.7763183985269882e-09,
"loss": 3.5426,
"step": 1235
},
{
"epoch": 0.9888,
"grad_norm": 3.6975715160369873,
"learning_rate": 1.5473943151270155e-09,
"loss": 1.3295,
"step": 1236
},
{
"epoch": 0.9896,
"grad_norm": 1.4429659843444824,
"learning_rate": 1.3342518162728913e-09,
"loss": 3.6067,
"step": 1237
},
{
"epoch": 0.9904,
"grad_norm": 3.43681263923645,
"learning_rate": 1.1368922482887789e-09,
"loss": 1.1235,
"step": 1238
},
{
"epoch": 0.9912,
"grad_norm": 1.3926042318344116,
"learning_rate": 9.553168578049776e-10,
"loss": 3.4841,
"step": 1239
},
{
"epoch": 0.992,
"grad_norm": 3.8875744342803955,
"learning_rate": 7.895267917501503e-10,
"loss": 1.3565,
"step": 1240
},
{
"epoch": 0.9928,
"grad_norm": 1.6624120473861694,
"learning_rate": 6.395230973443856e-10,
"loss": 3.4427,
"step": 1241
},
{
"epoch": 0.9936,
"grad_norm": 3.605576753616333,
"learning_rate": 5.053067220925356e-10,
"loss": 1.1553,
"step": 1242
},
{
"epoch": 0.9944,
"grad_norm": 1.560855507850647,
"learning_rate": 3.868785137786657e-10,
"loss": 3.4811,
"step": 1243
},
{
"epoch": 0.9952,
"grad_norm": 4.160490989685059,
"learning_rate": 2.842392204591149e-10,
"loss": 1.2979,
"step": 1244
},
{
"epoch": 0.996,
"grad_norm": 1.5523591041564941,
"learning_rate": 1.9738949045972068e-10,
"loss": 3.4412,
"step": 1245
},
{
"epoch": 0.9968,
"grad_norm": 4.556288719177246,
"learning_rate": 1.2632987237054527e-10,
"loss": 1.2008,
"step": 1246
},
{
"epoch": 0.9976,
"grad_norm": 1.2331137657165527,
"learning_rate": 7.106081504254514e-11,
"loss": 3.4326,
"step": 1247
},
{
"epoch": 0.9984,
"grad_norm": 4.683450222015381,
"learning_rate": 3.158266758562789e-11,
"loss": 1.5665,
"step": 1248
},
{
"epoch": 0.9992,
"grad_norm": 1.4326642751693726,
"learning_rate": 7.89567936476665e-12,
"loss": 3.5399,
"step": 1249
},
{
"epoch": 1.0,
"grad_norm": 3.5186572074890137,
"learning_rate": 0.0,
"loss": 1.4857,
"step": 1250
}
],
"logging_steps": 1,
"max_steps": 1250,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.22349105912873e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}