diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,100651 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9999652161814324,
+  "eval_steps": 500,
+  "global_step": 14374,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 6.956763713520471e-05,
+      "grad_norm": 16.375,
+      "learning_rate": 4.6296296296296296e-06,
+      "loss": 4.1097,
+      "step": 1
+    },
+    {
+      "epoch": 0.00013913527427040942,
+      "grad_norm": 30.125,
+      "learning_rate": 9.259259259259259e-06,
+      "loss": 4.3742,
+      "step": 2
+    },
+    {
+      "epoch": 0.0002087029114056141,
+      "grad_norm": 20.625,
+      "learning_rate": 1.3888888888888888e-05,
+      "loss": 3.9231,
+      "step": 3
+    },
+    {
+      "epoch": 0.00027827054854081884,
+      "grad_norm": 14.3125,
+      "learning_rate": 1.8518518518518518e-05,
+      "loss": 3.7877,
+      "step": 4
+    },
+    {
+      "epoch": 0.0003478381856760235,
+      "grad_norm": 20.0,
+      "learning_rate": 2.3148148148148147e-05,
+      "loss": 3.9694,
+      "step": 5
+    },
+    {
+      "epoch": 0.0004174058228112282,
+      "grad_norm": 13.75,
+      "learning_rate": 2.7777777777777776e-05,
+      "loss": 4.0547,
+      "step": 6
+    },
+    {
+      "epoch": 0.0004869734599464329,
+      "grad_norm": 17.875,
+      "learning_rate": 3.240740740740741e-05,
+      "loss": 3.9384,
+      "step": 7
+    },
+    {
+      "epoch": 0.0005565410970816377,
+      "grad_norm": 15.8125,
+      "learning_rate": 3.7037037037037037e-05,
+      "loss": 3.8156,
+      "step": 8
+    },
+    {
+      "epoch": 0.0006261087342168423,
+      "grad_norm": 11.625,
+      "learning_rate": 4.1666666666666665e-05,
+      "loss": 4.0875,
+      "step": 9
+    },
+    {
+      "epoch": 0.000695676371352047,
+      "grad_norm": 14.125,
+      "learning_rate": 4.6296296296296294e-05,
+      "loss": 3.9151,
+      "step": 10
+    },
+    {
+      "epoch": 0.0007652440084872517,
+      "grad_norm": 5.65625,
+      "learning_rate": 5.092592592592592e-05,
+      "loss": 3.8153,
+      "step": 11
+    },
+    {
+      "epoch": 0.0008348116456224564,
+      "grad_norm": 5.09375,
+      "learning_rate": 5.555555555555555e-05,
+      "loss": 3.9969,
+      "step": 12
+    },
+    {
+      "epoch": 0.0009043792827576611,
+      "grad_norm": 7.65625,
+      "learning_rate": 6.018518518518518e-05,
+      "loss": 3.6755,
+      "step": 13
+    },
+    {
+      "epoch": 0.0009739469198928658,
+      "grad_norm": 5.6875,
+      "learning_rate": 6.481481481481482e-05,
+      "loss": 3.8728,
+      "step": 14
+    },
+    {
+      "epoch": 0.0010435145570280705,
+      "grad_norm": 3.03125,
+      "learning_rate": 6.944444444444444e-05,
+      "loss": 3.8949,
+      "step": 15
+    },
+    {
+      "epoch": 0.0011130821941632753,
+      "grad_norm": 2.65625,
+      "learning_rate": 7.407407407407407e-05,
+      "loss": 4.2955,
+      "step": 16
+    },
+    {
+      "epoch": 0.00118264983129848,
+      "grad_norm": 3.546875,
+      "learning_rate": 7.87037037037037e-05,
+      "loss": 3.6512,
+      "step": 17
+    },
+    {
+      "epoch": 0.0012522174684336846,
+      "grad_norm": 2.765625,
+      "learning_rate": 8.333333333333333e-05,
+      "loss": 4.3409,
+      "step": 18
+    },
+    {
+      "epoch": 0.0013217851055688894,
+      "grad_norm": 2.546875,
+      "learning_rate": 8.796296296296297e-05,
+      "loss": 3.5914,
+      "step": 19
+    },
+    {
+      "epoch": 0.001391352742704094,
+      "grad_norm": 2.21875,
+      "learning_rate": 9.259259259259259e-05,
+      "loss": 4.1952,
+      "step": 20
+    },
+    {
+      "epoch": 0.0014609203798392988,
+      "grad_norm": 2.234375,
+      "learning_rate": 9.722222222222223e-05,
+      "loss": 3.7605,
+      "step": 21
+    },
+    {
+      "epoch": 0.0015304880169745034,
+      "grad_norm": 2.703125,
+      "learning_rate": 0.00010185185185185185,
+      "loss": 3.8877,
+      "step": 22
+    },
+    {
+      "epoch": 0.0016000556541097082,
+      "grad_norm": 1.6953125,
+      "learning_rate": 0.00010648148148148149,
+      "loss": 3.738,
+      "step": 23
+    },
+    {
+      "epoch": 0.0016696232912449128,
+      "grad_norm": 1.75,
+      "learning_rate": 0.0001111111111111111,
+      "loss": 3.7867,
+      "step": 24
+    },
+    {
+      "epoch": 0.0017391909283801176,
+      "grad_norm": 2.0,
+      "learning_rate": 0.00011574074074074075,
+      "loss": 3.6365,
+      "step": 25
+    },
+    {
+      "epoch": 0.0018087585655153222,
+      "grad_norm": 1.8984375,
+      "learning_rate": 0.00012037037037037036,
+      "loss": 4.0938,
+      "step": 26
+    },
+    {
+      "epoch": 0.001878326202650527,
+      "grad_norm": 2.25,
+      "learning_rate": 0.000125,
+      "loss": 3.5465,
+      "step": 27
+    },
+    {
+      "epoch": 0.0019478938397857316,
+      "grad_norm": 1.7578125,
+      "learning_rate": 0.00012962962962962963,
+      "loss": 4.0124,
+      "step": 28
+    },
+    {
+      "epoch": 0.0020174614769209362,
+      "grad_norm": 1.6015625,
+      "learning_rate": 0.0001342592592592593,
+      "loss": 3.7071,
+      "step": 29
+    },
+    {
+      "epoch": 0.002087029114056141,
+      "grad_norm": 2.140625,
+      "learning_rate": 0.0001388888888888889,
+      "loss": 4.0923,
+      "step": 30
+    },
+    {
+      "epoch": 0.002156596751191346,
+      "grad_norm": 1.9453125,
+      "learning_rate": 0.00014351851851851852,
+      "loss": 3.6059,
+      "step": 31
+    },
+    {
+      "epoch": 0.0022261643883265507,
+      "grad_norm": 1.9765625,
+      "learning_rate": 0.00014814814814814815,
+      "loss": 3.5063,
+      "step": 32
+    },
+    {
+      "epoch": 0.002295732025461755,
+      "grad_norm": 2.59375,
+      "learning_rate": 0.0001527777777777778,
+      "loss": 3.846,
+      "step": 33
+    },
+    {
+      "epoch": 0.00236529966259696,
+      "grad_norm": 1.9921875,
+      "learning_rate": 0.0001574074074074074,
+      "loss": 3.8888,
+      "step": 34
+    },
+    {
+      "epoch": 0.0024348672997321647,
+      "grad_norm": 3.046875,
+      "learning_rate": 0.00016203703703703703,
+      "loss": 3.7714,
+      "step": 35
+    },
+    {
+      "epoch": 0.002504434936867369,
+      "grad_norm": 2.1875,
+      "learning_rate": 0.00016666666666666666,
+      "loss": 3.7296,
+      "step": 36
+    },
+    {
+      "epoch": 0.002574002574002574,
+      "grad_norm": 2.484375,
+      "learning_rate": 0.00017129629629629632,
+      "loss": 3.9198,
+      "step": 37
+    },
+    {
+      "epoch": 0.0026435702111377787,
+      "grad_norm": 2.125,
+      "learning_rate": 0.00017592592592592595,
+      "loss": 3.6842,
+      "step": 38
+    },
+    {
+      "epoch": 0.0027131378482729836,
+      "grad_norm": 2.828125,
+      "learning_rate": 0.00018055555555555555,
+      "loss": 3.5604,
+      "step": 39
+    },
+    {
+      "epoch": 0.002782705485408188,
+      "grad_norm": 2.203125,
+      "learning_rate": 0.00018518518518518518,
+      "loss": 3.8453,
+      "step": 40
+    },
+    {
+      "epoch": 0.0028522731225433928,
+      "grad_norm": 2.515625,
+      "learning_rate": 0.00018981481481481483,
+      "loss": 3.5141,
+      "step": 41
+    },
+    {
+      "epoch": 0.0029218407596785976,
+      "grad_norm": 2.8125,
+      "learning_rate": 0.00019444444444444446,
+      "loss": 3.477,
+      "step": 42
+    },
+    {
+      "epoch": 0.0029914083968138024,
+      "grad_norm": 4.0,
+      "learning_rate": 0.00019907407407407406,
+      "loss": 3.822,
+      "step": 43
+    },
+    {
+      "epoch": 0.0030609760339490068,
+      "grad_norm": 2.09375,
+      "learning_rate": 0.0002037037037037037,
+      "loss": 3.9645,
+      "step": 44
+    },
+    {
+      "epoch": 0.0031305436710842116,
+      "grad_norm": 4.375,
+      "learning_rate": 0.00020833333333333335,
+      "loss": 3.3629,
+      "step": 45
+    },
+    {
+      "epoch": 0.0032001113082194164,
+      "grad_norm": 3.578125,
+      "learning_rate": 0.00021296296296296298,
+      "loss": 3.4721,
+      "step": 46
+    },
+    {
+      "epoch": 0.0032696789453546212,
+      "grad_norm": 4.03125,
+      "learning_rate": 0.0002175925925925926,
+      "loss": 3.2992,
+      "step": 47
+    },
+    {
+      "epoch": 0.0033392465824898256,
+      "grad_norm": 3.234375,
+      "learning_rate": 0.0002222222222222222,
+      "loss": 3.648,
+      "step": 48
+    },
+    {
+      "epoch": 0.0034088142196250304,
+      "grad_norm": 3.203125,
+      "learning_rate": 0.00022685185185185186,
+      "loss": 3.3534,
+      "step": 49
+    },
+    {
+      "epoch": 0.0034783818567602352,
+      "grad_norm": 3.203125,
+      "learning_rate": 0.0002314814814814815,
+      "loss": 3.8404,
+      "step": 50
+    },
+    {
+      "epoch": 0.0035479494938954396,
+      "grad_norm": 2.859375,
+      "learning_rate": 0.00023611111111111112,
+      "loss": 3.6755,
+      "step": 51
+    },
+    {
+      "epoch": 0.0036175171310306444,
+      "grad_norm": 3.25,
+      "learning_rate": 0.00024074074074074072,
+      "loss": 3.5372,
+      "step": 52
+    },
+    {
+      "epoch": 0.0036870847681658493,
+      "grad_norm": 2.953125,
+      "learning_rate": 0.0002453703703703704,
+      "loss": 3.6597,
+      "step": 53
+    },
+    {
+      "epoch": 0.003756652405301054,
+      "grad_norm": 3.53125,
+      "learning_rate": 0.00025,
+      "loss": 3.7115,
+      "step": 54
+    },
+    {
+      "epoch": 0.0038262200424362585,
+      "grad_norm": 3.9375,
+      "learning_rate": 0.00025462962962962966,
+      "loss": 3.8284,
+      "step": 55
+    },
+    {
+      "epoch": 0.0038957876795714633,
+      "grad_norm": 3.234375,
+      "learning_rate": 0.00025925925925925926,
+      "loss": 3.8952,
+      "step": 56
+    },
+    {
+      "epoch": 0.003965355316706668,
+      "grad_norm": 3.28125,
+      "learning_rate": 0.0002638888888888889,
+      "loss": 3.6997,
+      "step": 57
+    },
+    {
+      "epoch": 0.0040349229538418725,
+      "grad_norm": 7.03125,
+      "learning_rate": 0.0002685185185185186,
+      "loss": 3.8363,
+      "step": 58
+    },
+    {
+      "epoch": 0.004104490590977078,
+      "grad_norm": 1.9921875,
+      "learning_rate": 0.0002731481481481481,
+      "loss": 3.6792,
+      "step": 59
+    },
+    {
+      "epoch": 0.004174058228112282,
+      "grad_norm": 3.0,
+      "learning_rate": 0.0002777777777777778,
+      "loss": 3.6028,
+      "step": 60
+    },
+    {
+      "epoch": 0.0042436258652474865,
+      "grad_norm": 2.765625,
+      "learning_rate": 0.0002824074074074074,
+      "loss": 3.6588,
+      "step": 61
+    },
+    {
+      "epoch": 0.004313193502382692,
+      "grad_norm": 2.734375,
+      "learning_rate": 0.00028703703703703703,
+      "loss": 3.3777,
+      "step": 62
+    },
+    {
+      "epoch": 0.004382761139517896,
+      "grad_norm": 4.5,
+      "learning_rate": 0.0002916666666666667,
+      "loss": 3.3868,
+      "step": 63
+    },
+    {
+      "epoch": 0.004452328776653101,
+      "grad_norm": 9.5,
+      "learning_rate": 0.0002962962962962963,
+      "loss": 3.8549,
+      "step": 64
+    },
+    {
+      "epoch": 0.004521896413788306,
+      "grad_norm": 3.328125,
+      "learning_rate": 0.00030092592592592595,
+      "loss": 3.5282,
+      "step": 65
+    },
+    {
+      "epoch": 0.00459146405092351,
+      "grad_norm": 3.203125,
+      "learning_rate": 0.0003055555555555556,
+      "loss": 3.5236,
+      "step": 66
+    },
+    {
+      "epoch": 0.004661031688058715,
+      "grad_norm": 4.59375,
+      "learning_rate": 0.0003101851851851852,
+      "loss": 3.8009,
+      "step": 67
+    },
+    {
+      "epoch": 0.00473059932519392,
+      "grad_norm": 3.25,
+      "learning_rate": 0.0003148148148148148,
+      "loss": 3.4033,
+      "step": 68
+    },
+    {
+      "epoch": 0.004800166962329124,
+      "grad_norm": 4.96875,
+      "learning_rate": 0.0003194444444444444,
+      "loss": 3.2774,
+      "step": 69
+    },
+    {
+      "epoch": 0.004869734599464329,
+      "grad_norm": 3.265625,
+      "learning_rate": 0.00032407407407407406,
+      "loss": 3.7126,
+      "step": 70
+    },
+    {
+      "epoch": 0.004939302236599534,
+      "grad_norm": 3.625,
+      "learning_rate": 0.0003287037037037037,
+      "loss": 3.7453,
+      "step": 71
+    },
+    {
+      "epoch": 0.005008869873734738,
+      "grad_norm": 2.96875,
+      "learning_rate": 0.0003333333333333333,
+      "loss": 3.2229,
+      "step": 72
+    },
+    {
+      "epoch": 0.0050784375108699435,
+      "grad_norm": 4.875,
+      "learning_rate": 0.000337962962962963,
+      "loss": 3.7234,
+      "step": 73
+    },
+    {
+      "epoch": 0.005148005148005148,
+      "grad_norm": 3.71875,
+      "learning_rate": 0.00034259259259259263,
+      "loss": 3.7748,
+      "step": 74
+    },
+    {
+      "epoch": 0.005217572785140353,
+      "grad_norm": 2.25,
+      "learning_rate": 0.00034722222222222224,
+      "loss": 3.6813,
+      "step": 75
+    },
+    {
+      "epoch": 0.0052871404222755575,
+      "grad_norm": 3.3125,
+      "learning_rate": 0.0003518518518518519,
+      "loss": 3.793,
+      "step": 76
+    },
+    {
+      "epoch": 0.005356708059410762,
+      "grad_norm": 3.1875,
+      "learning_rate": 0.00035648148148148144,
+      "loss": 3.2565,
+      "step": 77
+    },
+    {
+      "epoch": 0.005426275696545967,
+      "grad_norm": 3.5,
+      "learning_rate": 0.0003611111111111111,
+      "loss": 3.6943,
+      "step": 78
+    },
+    {
+      "epoch": 0.0054958433336811715,
+      "grad_norm": 3.453125,
+      "learning_rate": 0.00036574074074074075,
+      "loss": 3.3554,
+      "step": 79
+    },
+    {
+      "epoch": 0.005565410970816376,
+      "grad_norm": 3.171875,
+      "learning_rate": 0.00037037037037037035,
+      "loss": 3.6748,
+      "step": 80
+    },
+    {
+      "epoch": 0.005634978607951581,
+      "grad_norm": 3.25,
+      "learning_rate": 0.000375,
+      "loss": 3.5064,
+      "step": 81
+    },
+    {
+      "epoch": 0.0057045462450867855,
+      "grad_norm": 2.65625,
+      "learning_rate": 0.00037962962962962966,
+      "loss": 3.9164,
+      "step": 82
+    },
+    {
+      "epoch": 0.005774113882221991,
+      "grad_norm": 3.84375,
+      "learning_rate": 0.00038425925925925927,
+      "loss": 3.4874,
+      "step": 83
+    },
+    {
+      "epoch": 0.005843681519357195,
+      "grad_norm": 3.40625,
+      "learning_rate": 0.0003888888888888889,
+      "loss": 3.6889,
+      "step": 84
+    },
+    {
+      "epoch": 0.0059132491564923995,
+      "grad_norm": 3.140625,
+      "learning_rate": 0.0003935185185185186,
+      "loss": 3.3944,
+      "step": 85
+    },
+    {
+      "epoch": 0.005982816793627605,
+      "grad_norm": 2.34375,
+      "learning_rate": 0.0003981481481481481,
+      "loss": 3.3187,
+      "step": 86
+    },
+    {
+      "epoch": 0.006052384430762809,
+      "grad_norm": 6.5,
+      "learning_rate": 0.0004027777777777778,
+      "loss": 3.6085,
+      "step": 87
+    },
+    {
+      "epoch": 0.0061219520678980135,
+      "grad_norm": 6.375,
+      "learning_rate": 0.0004074074074074074,
+      "loss": 3.3279,
+      "step": 88
+    },
+    {
+      "epoch": 0.006191519705033219,
+      "grad_norm": 4.28125,
+      "learning_rate": 0.00041203703703703704,
+      "loss": 3.6968,
+      "step": 89
+    },
+    {
+      "epoch": 0.006261087342168423,
+      "grad_norm": 5.0,
+      "learning_rate": 0.0004166666666666667,
+      "loss": 3.162,
+      "step": 90
+    },
+    {
+      "epoch": 0.006330654979303628,
+      "grad_norm": 3.296875,
+      "learning_rate": 0.0004212962962962963,
+      "loss": 3.106,
+      "step": 91
+    },
+    {
+      "epoch": 0.006400222616438833,
+      "grad_norm": 3.0625,
+      "learning_rate": 0.00042592592592592595,
+      "loss": 3.2511,
+      "step": 92
+    },
+    {
+      "epoch": 0.006469790253574037,
+      "grad_norm": 3.734375,
+      "learning_rate": 0.0004305555555555556,
+      "loss": 3.3395,
+      "step": 93
+    },
+    {
+      "epoch": 0.0065393578907092425,
+      "grad_norm": 3.5625,
+      "learning_rate": 0.0004351851851851852,
+      "loss": 3.9849,
+      "step": 94
+    },
+    {
+      "epoch": 0.006608925527844447,
+      "grad_norm": 3.34375,
+      "learning_rate": 0.0004398148148148148,
+      "loss": 3.7063,
+      "step": 95
+    },
+    {
+      "epoch": 0.006678493164979651,
+      "grad_norm": 3.671875,
+      "learning_rate": 0.0004444444444444444,
+      "loss": 3.643,
+      "step": 96
+    },
+    {
+      "epoch": 0.0067480608021148565,
+      "grad_norm": 3.671875,
+      "learning_rate": 0.00044907407407407407,
+      "loss": 3.7839,
+      "step": 97
+    },
+    {
+      "epoch": 0.006817628439250061,
+      "grad_norm": 2.9375,
+      "learning_rate": 0.0004537037037037037,
+      "loss": 3.3117,
+      "step": 98
+    },
+    {
+      "epoch": 0.006887196076385265,
+      "grad_norm": 4.0625,
+      "learning_rate": 0.0004583333333333333,
+      "loss": 3.2695,
+      "step": 99
+    },
+    {
+      "epoch": 0.0069567637135204705,
+      "grad_norm": 3.1875,
+      "learning_rate": 0.000462962962962963,
+      "loss": 4.1127,
+      "step": 100
+    },
+    {
+      "epoch": 0.007026331350655675,
+      "grad_norm": 7.0,
+      "learning_rate": 0.00046759259259259264,
+      "loss": 3.3675,
+      "step": 101
+    },
+    {
+      "epoch": 0.007095898987790879,
+      "grad_norm": 4.0625,
+      "learning_rate": 0.00047222222222222224,
+      "loss": 3.4207,
+      "step": 102
+    },
+    {
+      "epoch": 0.0071654666249260845,
+      "grad_norm": 4.125,
+      "learning_rate": 0.0004768518518518519,
+      "loss": 3.3211,
+      "step": 103
+    },
+    {
+      "epoch": 0.007235034262061289,
+      "grad_norm": 3.84375,
+      "learning_rate": 0.00048148148148148144,
+      "loss": 3.0541,
+      "step": 104
+    },
+    {
+      "epoch": 0.007304601899196494,
+      "grad_norm": 6.46875,
+      "learning_rate": 0.0004861111111111111,
+      "loss": 3.7767,
+      "step": 105
+    },
+    {
+      "epoch": 0.0073741695363316985,
+      "grad_norm": 6.21875,
+      "learning_rate": 0.0004907407407407408,
+      "loss": 3.6529,
+      "step": 106
+    },
+    {
+      "epoch": 0.007443737173466903,
+      "grad_norm": 5.03125,
+      "learning_rate": 0.0004953703703703704,
+      "loss": 3.3748,
+      "step": 107
+    },
+    {
+      "epoch": 0.007513304810602108,
+      "grad_norm": 3.3125,
+      "learning_rate": 0.0005,
+      "loss": 3.1601,
+      "step": 108
+    },
+    {
+      "epoch": 0.0075828724477373126,
+      "grad_norm": 4.53125,
+      "learning_rate": 0.0005046296296296296,
+      "loss": 3.583,
+      "step": 109
+    },
+    {
+      "epoch": 0.007652440084872517,
+      "grad_norm": 4.46875,
+      "learning_rate": 0.0005092592592592593,
+      "loss": 3.576,
+      "step": 110
+    },
+    {
+      "epoch": 0.007722007722007722,
+      "grad_norm": 3.140625,
+      "learning_rate": 0.0005138888888888888,
+      "loss": 3.2224,
+      "step": 111
+    },
+    {
+      "epoch": 0.007791575359142927,
+      "grad_norm": 3.84375,
+      "learning_rate": 0.0005185185185185185,
+      "loss": 3.5271,
+      "step": 112
+    },
+    {
+      "epoch": 0.007861142996278131,
+      "grad_norm": 5.6875,
+      "learning_rate": 0.0005231481481481481,
+      "loss": 3.5304,
+      "step": 113
+    },
+    {
+      "epoch": 0.007930710633413336,
+      "grad_norm": 3.609375,
+      "learning_rate": 0.0005277777777777778,
+      "loss": 3.4794,
+      "step": 114
+    },
+    {
+      "epoch": 0.008000278270548541,
+      "grad_norm": 2.921875,
+      "learning_rate": 0.0005324074074074074,
+      "loss": 3.7191,
+      "step": 115
+    },
+    {
+      "epoch": 0.008069845907683745,
+      "grad_norm": 3.328125,
+      "learning_rate": 0.0005370370370370371,
+      "loss": 3.5999,
+      "step": 116
+    },
+    {
+      "epoch": 0.00813941354481895,
+      "grad_norm": 3.828125,
+      "learning_rate": 0.0005416666666666666,
+      "loss": 3.5959,
+      "step": 117
+    },
+    {
+      "epoch": 0.008208981181954155,
+      "grad_norm": 2.484375,
+      "learning_rate": 0.0005462962962962962,
+      "loss": 3.5367,
+      "step": 118
+    },
+    {
+      "epoch": 0.008278548819089359,
+      "grad_norm": 2.96875,
+      "learning_rate": 0.000550925925925926,
+      "loss": 3.5518,
+      "step": 119
+    },
+    {
+      "epoch": 0.008348116456224564,
+      "grad_norm": 3.546875,
+      "learning_rate": 0.0005555555555555556,
+      "loss": 3.5799,
+      "step": 120
+    },
+    {
+      "epoch": 0.00841768409335977,
+      "grad_norm": 3.25,
+      "learning_rate": 0.0005601851851851853,
+      "loss": 3.4259,
+      "step": 121
+    },
+    {
+      "epoch": 0.008487251730494973,
+      "grad_norm": 3.078125,
+      "learning_rate": 0.0005648148148148148,
+      "loss": 3.1868,
+      "step": 122
+    },
+    {
+      "epoch": 0.008556819367630178,
+      "grad_norm": 2.609375,
+      "learning_rate": 0.0005694444444444445,
+      "loss": 3.6312,
+      "step": 123
+    },
+    {
+      "epoch": 0.008626387004765384,
+      "grad_norm": 3.421875,
+      "learning_rate": 0.0005740740740740741,
+      "loss": 3.5673,
+      "step": 124
+    },
+    {
+      "epoch": 0.008695954641900587,
+      "grad_norm": 3.5,
+      "learning_rate": 0.0005787037037037038,
+      "loss": 3.3845,
+      "step": 125
+    },
+    {
+      "epoch": 0.008765522279035792,
+      "grad_norm": 3.46875,
+      "learning_rate": 0.0005833333333333334,
+      "loss": 3.5458,
+      "step": 126
+    },
+    {
+      "epoch": 0.008835089916170998,
+      "grad_norm": 3.0,
+      "learning_rate": 0.0005879629629629629,
+      "loss": 3.3844,
+      "step": 127
+    },
+    {
+      "epoch": 0.008904657553306203,
+      "grad_norm": 2.921875,
+      "learning_rate": 0.0005925925925925926,
+      "loss": 3.4753,
+      "step": 128
+    },
+    {
+      "epoch": 0.008974225190441406,
+      "grad_norm": 2.75,
+      "learning_rate": 0.0005972222222222222,
+      "loss": 3.7892,
+      "step": 129
+    },
+    {
+      "epoch": 0.009043792827576612,
+      "grad_norm": 3.234375,
+      "learning_rate": 0.0006018518518518519,
+      "loss": 3.3437,
+      "step": 130
+    },
+    {
+      "epoch": 0.009113360464711817,
+      "grad_norm": 3.171875,
+      "learning_rate": 0.0006064814814814815,
+      "loss": 3.3362,
+      "step": 131
+    },
+    {
+      "epoch": 0.00918292810184702,
+      "grad_norm": 5.875,
+      "learning_rate": 0.0006111111111111112,
+      "loss": 3.806,
+      "step": 132
+    },
+    {
+      "epoch": 0.009252495738982226,
+      "grad_norm": 4.65625,
+      "learning_rate": 0.0006157407407407407,
+      "loss": 3.3642,
+      "step": 133
+    },
+    {
+      "epoch": 0.00932206337611743,
+      "grad_norm": 4.8125,
+      "learning_rate": 0.0006203703703703704,
+      "loss": 3.4821,
+      "step": 134
+    },
+    {
+      "epoch": 0.009391631013252634,
+      "grad_norm": 5.03125,
+      "learning_rate": 0.000625,
+      "loss": 3.5422,
+      "step": 135
+    },
+    {
+      "epoch": 0.00946119865038784,
+      "grad_norm": 3.796875,
+      "learning_rate": 0.0006296296296296296,
+      "loss": 3.7212,
+      "step": 136
+    },
+    {
+      "epoch": 0.009530766287523045,
+      "grad_norm": 3.234375,
+      "learning_rate": 0.0006342592592592593,
+      "loss": 3.7358,
+      "step": 137
+    },
+    {
+      "epoch": 0.009600333924658248,
+      "grad_norm": 3.125,
+      "learning_rate": 0.0006388888888888888,
+      "loss": 3.2124,
+      "step": 138
+    },
+    {
+      "epoch": 0.009669901561793454,
+      "grad_norm": 2.34375,
+      "learning_rate": 0.0006435185185185185,
+      "loss": 3.5355,
+      "step": 139
+    },
+    {
+      "epoch": 0.009739469198928659,
+      "grad_norm": 3.828125,
+      "learning_rate": 0.0006481481481481481,
+      "loss": 2.8285,
+      "step": 140
+    },
+    {
+      "epoch": 0.009809036836063862,
+      "grad_norm": 2.265625,
+      "learning_rate": 0.0006527777777777778,
+      "loss": 3.2107,
+      "step": 141
+    },
+    {
+      "epoch": 0.009878604473199068,
+      "grad_norm": 2.671875,
+      "learning_rate": 0.0006574074074074074,
+      "loss": 3.6405,
+      "step": 142
+    },
+    {
+      "epoch": 0.009948172110334273,
+      "grad_norm": 2.734375,
+      "learning_rate": 0.0006620370370370372,
+      "loss": 3.77,
+      "step": 143
+    },
+    {
+      "epoch": 0.010017739747469476,
+      "grad_norm": 5.5625,
+      "learning_rate": 0.0006666666666666666,
+      "loss": 3.5891,
+      "step": 144
+    },
+    {
+      "epoch": 0.010087307384604682,
+      "grad_norm": 4.1875,
+      "learning_rate": 0.0006712962962962962,
+      "loss": 3.6008,
+      "step": 145
+    },
+    {
+      "epoch": 0.010156875021739887,
+      "grad_norm": 2.4375,
+      "learning_rate": 0.000675925925925926,
+      "loss": 3.3318,
+      "step": 146
+    },
+    {
+      "epoch": 0.010226442658875092,
+      "grad_norm": 3.75,
+      "learning_rate": 0.0006805555555555556,
+      "loss": 3.635,
+      "step": 147
+    },
+    {
+      "epoch": 0.010296010296010296,
+      "grad_norm": 3.59375,
+      "learning_rate": 0.0006851851851851853,
+      "loss": 3.6206,
+      "step": 148
+    },
+    {
+      "epoch": 0.010365577933145501,
+      "grad_norm": 3.5625,
+      "learning_rate": 0.0006898148148148148,
+      "loss": 3.5804,
+      "step": 149
+    },
+    {
+      "epoch": 0.010435145570280706,
+      "grad_norm": 2.46875,
+      "learning_rate": 0.0006944444444444445,
+      "loss": 3.4559,
+      "step": 150
+    },
+    {
+      "epoch": 0.01050471320741591,
+      "grad_norm": 2.859375,
+      "learning_rate": 0.0006990740740740741,
+      "loss": 3.2063,
+      "step": 151
+    },
+    {
+      "epoch": 0.010574280844551115,
+      "grad_norm": 2.96875,
+      "learning_rate": 0.0007037037037037038,
+      "loss": 3.746,
+      "step": 152
+    },
+    {
+      "epoch": 0.01064384848168632,
+      "grad_norm": 5.75,
+      "learning_rate": 0.0007083333333333334,
+      "loss": 2.9674,
+      "step": 153
+    },
+    {
+      "epoch": 0.010713416118821524,
+      "grad_norm": 2.65625,
+      "learning_rate": 0.0007129629629629629,
+      "loss": 3.7912,
+      "step": 154
+    },
+    {
+      "epoch": 0.010782983755956729,
+      "grad_norm": 2.90625,
+      "learning_rate": 0.0007175925925925926,
+      "loss": 3.2516,
+      "step": 155
+    },
+    {
+      "epoch": 0.010852551393091934,
+      "grad_norm": 3.1875,
+      "learning_rate": 0.0007222222222222222,
+      "loss": 3.3805,
+      "step": 156
+    },
+    {
+      "epoch": 0.010922119030227138,
+      "grad_norm": 3.96875,
+      "learning_rate": 0.0007268518518518519,
+      "loss": 3.542,
+      "step": 157
+    },
+    {
+      "epoch": 0.010991686667362343,
+      "grad_norm": 3.328125,
+      "learning_rate": 0.0007314814814814815,
+      "loss": 3.4749,
+      "step": 158
+    },
+    {
+      "epoch": 0.011061254304497548,
+      "grad_norm": 2.28125,
+      "learning_rate": 0.0007361111111111112,
+      "loss": 3.5944,
+      "step": 159
+    },
+    {
+      "epoch": 0.011130821941632752,
+      "grad_norm": 3.25,
+      "learning_rate": 0.0007407407407407407,
+      "loss": 3.2803,
+      "step": 160
+    },
+    {
+      "epoch": 0.011200389578767957,
+      "grad_norm": 3.453125,
+      "learning_rate": 0.0007453703703703704,
+      "loss": 3.3337,
+      "step": 161
+    },
+    {
+      "epoch": 0.011269957215903162,
+      "grad_norm": 2.515625,
+      "learning_rate": 0.00075,
+      "loss": 3.5694,
+      "step": 162
+    },
+    {
+      "epoch": 0.011339524853038366,
+      "grad_norm": 3.921875,
+      "learning_rate": 0.0007546296296296296,
+      "loss": 3.3504,
+      "step": 163
+    },
+    {
+      "epoch": 0.011409092490173571,
+      "grad_norm": 4.65625,
+      "learning_rate": 0.0007592592592592593,
+      "loss": 3.3871,
+      "step": 164
+    },
+    {
+      "epoch": 0.011478660127308776,
+      "grad_norm": 3.703125,
+      "learning_rate": 0.0007638888888888888,
+      "loss": 3.5259,
+      "step": 165
+    },
+    {
+      "epoch": 0.011548227764443982,
+      "grad_norm": 2.796875,
+      "learning_rate": 0.0007685185185185185,
+      "loss": 3.2406,
+      "step": 166
+    },
+    {
+      "epoch": 0.011617795401579185,
+      "grad_norm": 4.03125,
+      "learning_rate": 0.0007731481481481481,
+      "loss": 3.4967,
+      "step": 167
+    },
+    {
+      "epoch": 0.01168736303871439,
+      "grad_norm": 2.90625,
+      "learning_rate": 0.0007777777777777778,
+      "loss": 3.6256,
+      "step": 168
+    },
+    {
+      "epoch": 0.011756930675849596,
+      "grad_norm": 2.21875,
+      "learning_rate": 0.0007824074074074074,
+      "loss": 3.4215,
+      "step": 169
+    },
+    {
+      "epoch": 0.011826498312984799,
+      "grad_norm": 2.796875,
+      "learning_rate": 0.0007870370370370372,
+      "loss": 3.2696,
+      "step": 170
+    },
+    {
+      "epoch": 0.011896065950120004,
+      "grad_norm": 3.109375,
+      "learning_rate": 0.0007916666666666666,
+      "loss": 3.34,
+      "step": 171
+    },
+    {
+      "epoch": 0.01196563358725521,
+      "grad_norm": 2.25,
+      "learning_rate": 0.0007962962962962962,
+      "loss": 3.4738,
+      "step": 172
+    },
+    {
+      "epoch": 0.012035201224390413,
+      "grad_norm": 2.484375,
+      "learning_rate": 0.000800925925925926,
+      "loss": 3.5333,
+      "step": 173
+    },
+    {
+      "epoch": 0.012104768861525618,
+      "grad_norm": 3.453125,
+      "learning_rate": 0.0008055555555555556,
+      "loss": 3.6269,
+      "step": 174
+    },
+    {
+      "epoch": 0.012174336498660824,
+      "grad_norm": 2.546875,
+      "learning_rate": 0.0008101851851851853,
+      "loss": 3.3796,
+      "step": 175
+    },
+    {
+      "epoch": 0.012243904135796027,
+      "grad_norm": 4.71875,
+      "learning_rate": 0.0008148148148148148,
+      "loss": 3.7743,
+      "step": 176
+    },
+    {
+      "epoch": 0.012313471772931232,
+      "grad_norm": 3.078125,
+      "learning_rate": 0.0008194444444444445,
+      "loss": 3.494,
+      "step": 177
+    },
+    {
+      "epoch": 0.012383039410066438,
+      "grad_norm": 3.328125,
+      "learning_rate": 0.0008240740740740741,
+      "loss": 3.4347,
+      "step": 178
+    },
+    {
+      "epoch": 0.012452607047201641,
+      "grad_norm": 3.015625,
+      "learning_rate": 0.0008287037037037038,
+      "loss": 3.0524,
+      "step": 179
+    },
+    {
+      "epoch": 0.012522174684336846,
+      "grad_norm": 2.046875,
+      "learning_rate": 0.0008333333333333334,
+      "loss": 3.4803,
+      "step": 180
+    },
+    {
+      "epoch": 0.012591742321472052,
+      "grad_norm": 3.234375,
+      "learning_rate": 0.0008379629629629629,
+      "loss": 3.4848,
+      "step": 181
+    },
+    {
+      "epoch": 0.012661309958607255,
+      "grad_norm": 3.28125,
+      "learning_rate": 0.0008425925925925926,
+      "loss": 3.5803,
+      "step": 182
+    },
+    {
+      "epoch": 0.01273087759574246,
+      "grad_norm": 3.015625,
+      "learning_rate": 0.0008472222222222222,
+      "loss": 3.1998,
+      "step": 183
+    },
+    {
+      "epoch": 0.012800445232877666,
+      "grad_norm": 2.78125,
+      "learning_rate": 0.0008518518518518519,
+      "loss": 4.02,
+      "step": 184
+    },
+    {
+      "epoch": 0.01287001287001287,
+      "grad_norm": 3.46875,
+      "learning_rate": 0.0008564814814814815,
+      "loss": 3.5389,
+      "step": 185
+    },
+    {
+      "epoch": 0.012939580507148074,
+      "grad_norm": 2.3125,
+      "learning_rate": 0.0008611111111111112,
+      "loss": 3.6485,
+      "step": 186
+    },
+    {
+      "epoch": 0.01300914814428328,
+      "grad_norm": 2.390625,
+      "learning_rate": 0.0008657407407407407,
+      "loss": 3.6143,
+      "step": 187
+    },
+    {
+      "epoch": 0.013078715781418485,
+      "grad_norm": 3.03125,
+      "learning_rate": 0.0008703703703703704,
+      "loss": 3.2083,
+      "step": 188
+    },
+    {
+      "epoch": 0.013148283418553688,
+      "grad_norm": 3.0625,
+      "learning_rate": 0.000875,
+      "loss": 3.7757,
+      "step": 189
+    },
+    {
+      "epoch": 0.013217851055688894,
+      "grad_norm": 3.0,
+      "learning_rate": 0.0008796296296296296,
+      "loss": 3.3781,
+      "step": 190
+    },
+    {
+      "epoch": 0.013287418692824099,
+      "grad_norm": 2.8125,
+      "learning_rate": 0.0008842592592592593,
+      "loss": 3.5871,
+      "step": 191
+    },
+    {
+      "epoch": 0.013356986329959302,
+      "grad_norm": 2.484375,
+      "learning_rate": 0.0008888888888888888,
+      "loss": 3.4771,
+      "step": 192
+    },
+    {
+      "epoch": 0.013426553967094508,
+      "grad_norm": 2.875,
+      "learning_rate": 0.0008935185185185185,
+      "loss": 3.2491,
+      "step": 193
+    },
+    {
+      "epoch": 0.013496121604229713,
+      "grad_norm": 7.21875,
+      "learning_rate": 0.0008981481481481481,
+      "loss": 3.3524,
+      "step": 194
+    },
+    {
+      "epoch": 0.013565689241364916,
+      "grad_norm": 3.15625,
+      "learning_rate": 0.0009027777777777778,
+      "loss": 3.4881,
+      "step": 195
+    },
+    {
+      "epoch": 0.013635256878500122,
+      "grad_norm": 2.921875,
+      "learning_rate": 0.0009074074074074074,
+      "loss": 3.3662,
+      "step": 196
+    },
+    {
+      "epoch": 0.013704824515635327,
+      "grad_norm": 3.015625,
+      "learning_rate": 0.0009120370370370372,
+      "loss": 3.647,
+      "step": 197
+    },
+    {
+      "epoch": 0.01377439215277053,
+      "grad_norm": 2.65625,
+      "learning_rate": 0.0009166666666666666,
+      "loss": 3.2356,
+      "step": 198
+    },
+    {
+      "epoch": 0.013843959789905736,
+      "grad_norm": 2.28125,
+      "learning_rate": 0.0009212962962962963,
+      "loss": 3.5328,
+      "step": 199
+    },
+    {
+      "epoch": 0.013913527427040941,
+      "grad_norm": 2.859375,
+      "learning_rate": 0.000925925925925926,
+      "loss": 3.576,
+      "step": 200
+    },
+    {
+      "epoch": 0.013983095064176145,
+      "grad_norm": 3.0625,
+      "learning_rate": 0.0009305555555555556,
+      "loss": 3.3183,
+      "step": 201
+    },
+    {
+      "epoch": 0.01405266270131135,
+      "grad_norm": 2.046875,
+      "learning_rate": 0.0009351851851851853,
+      "loss": 3.4818,
+      "step": 202
+    },
+    {
+      "epoch": 0.014122230338446555,
+      "grad_norm": 2.1875,
+      "learning_rate": 0.0009398148148148148,
+      "loss": 3.5219,
+      "step": 203
+    },
+    {
+      "epoch": 0.014191797975581759,
+      "grad_norm": 2.6875,
+      "learning_rate": 0.0009444444444444445,
+      "loss": 3.775,
+      "step": 204
+    },
+    {
+      "epoch": 0.014261365612716964,
+      "grad_norm": 2.296875,
+      "learning_rate": 0.0009490740740740741,
+      "loss": 3.4462,
+      "step": 205
+    },
+    {
+      "epoch": 0.014330933249852169,
+      "grad_norm": 2.359375,
+      "learning_rate": 0.0009537037037037038,
+      "loss": 3.4381,
+      "step": 206
+    },
+    {
+      "epoch": 0.014400500886987374,
+      "grad_norm": 2.875,
+      "learning_rate": 0.0009583333333333334,
+      "loss": 3.0672,
+      "step": 207
+    },
+    {
+      "epoch": 0.014470068524122578,
+      "grad_norm": 1.953125,
+      "learning_rate": 0.0009629629629629629,
+      "loss": 3.1428,
+      "step": 208
+    },
+    {
+      "epoch": 0.014539636161257783,
+      "grad_norm": 2.078125,
+      "learning_rate": 0.0009675925925925926,
+      "loss": 3.4844,
+      "step": 209
+    },
+    {
+      "epoch": 0.014609203798392988,
+      "grad_norm": 3.078125,
+      "learning_rate": 0.0009722222222222222,
+      "loss": 3.3521,
+      "step": 210
+    },
+    {
+      "epoch": 0.014678771435528192,
+      "grad_norm": 3.109375,
+      "learning_rate": 0.0009768518518518518,
+      "loss": 3.4093,
+      "step": 211
+    },
+    {
+      "epoch": 0.014748339072663397,
+      "grad_norm": 2.390625,
+      "learning_rate": 0.0009814814814814816,
+      "loss": 3.8329,
+      "step": 212
+    },
+    {
+      "epoch": 0.014817906709798602,
+      "grad_norm": 3.28125,
+      "learning_rate": 0.0009861111111111112,
+      "loss": 3.2119,
+      "step": 213
+    },
+    {
+      "epoch": 0.014887474346933806,
+      "grad_norm": 2.8125,
+      "learning_rate": 0.0009907407407407408,
+      "loss": 3.1916,
+      "step": 214
+    },
+    {
+      "epoch": 0.014957041984069011,
+      "grad_norm": 5.3125,
+      "learning_rate": 0.0009953703703703704,
+      "loss": 3.7224,
+      "step": 215
+    },
+    {
+      "epoch": 0.015026609621204216,
+      "grad_norm": 2.8125,
+      "learning_rate": 0.001,
+      "loss": 3.3947,
+      "step": 216
+    },
+    {
+      "epoch": 0.01509617725833942,
+      "grad_norm": 4.40625,
+      "learning_rate": 0.0010046296296296296,
+      "loss": 3.8063,
+      "step": 217
+    },
+    {
+      "epoch": 0.015165744895474625,
+      "grad_norm": 3.34375,
+      "learning_rate": 0.0010092592592592592,
+      "loss": 3.4035,
+      "step": 218
+    },
+    {
+      "epoch": 0.01523531253260983,
+      "grad_norm": 2.921875,
+      "learning_rate": 0.0010138888888888888,
+      "loss": 3.2933,
+      "step": 219
+    },
+    {
+      "epoch": 0.015304880169745034,
+      "grad_norm": 2.0625,
+      "learning_rate": 0.0010185185185185186,
+      "loss": 3.5955,
+      "step": 220
+    },
+    {
+      "epoch": 0.015374447806880239,
+      "grad_norm": 3.265625,
+      "learning_rate": 0.0010231481481481482,
+      "loss": 3.4307,
+      "step": 221
+    },
+    {
+      "epoch": 0.015444015444015444,
+      "grad_norm": 4.0625,
+      "learning_rate": 0.0010277777777777776,
+      "loss": 3.0483,
+      "step": 222
+    },
+    {
+      "epoch": 0.015513583081150648,
+      "grad_norm": 2.3125,
+      "learning_rate": 0.0010324074074074074,
+      "loss": 3.3634,
+      "step": 223
+    },
+    {
+      "epoch": 0.015583150718285853,
+      "grad_norm": 3.78125,
+      "learning_rate": 0.001037037037037037,
+      "loss": 3.6414,
+      "step": 224
+    },
+    {
+      "epoch": 0.015652718355421057,
+      "grad_norm": 2.6875,
+      "learning_rate": 0.0010416666666666669,
+      "loss": 3.4195,
+      "step": 225
+    },
+    {
+      "epoch": 0.015722285992556262,
+      "grad_norm": 1.75,
+      "learning_rate": 0.0010462962962962963,
+      "loss": 3.9841,
+      "step": 226
+    },
+    {
+      "epoch": 0.015791853629691467,
+      "grad_norm": 2.625,
+      "learning_rate": 0.0010509259259259259,
+      "loss": 3.6758,
+      "step": 227
+    },
+    {
+      "epoch": 0.015861421266826672,
+      "grad_norm": 2.53125,
+      "learning_rate": 0.0010555555555555557,
+      "loss": 3.6038,
+      "step": 228
+    },
+    {
+      "epoch": 0.015930988903961878,
+      "grad_norm": 2.1875,
+      "learning_rate": 0.001060185185185185,
+      "loss": 3.7388,
+      "step": 229
+    },
+    {
+      "epoch": 0.016000556541097083,
+      "grad_norm": 3.9375,
+      "learning_rate": 0.0010648148148148149,
+      "loss": 3.3876,
+      "step": 230
+    },
+    {
+      "epoch": 0.016070124178232285,
+      "grad_norm": 4.1875,
+      "learning_rate": 0.0010694444444444445,
+      "loss": 3.3911,
+      "step": 231
+    },
+    {
+      "epoch": 0.01613969181536749,
+      "grad_norm": 3.0,
+      "learning_rate": 0.0010740740740740743,
+      "loss": 3.7106,
+      "step": 232
+    },
+    {
+      "epoch": 0.016209259452502695,
+      "grad_norm": 4.28125,
+      "learning_rate": 0.0010787037037037037,
+      "loss": 3.7648,
+      "step": 233
+    },
+    {
+      "epoch": 0.0162788270896379,
+      "grad_norm": 2.28125,
+      "learning_rate": 0.0010833333333333333,
+      "loss": 3.1461,
+      "step": 234
+    },
+    {
+      "epoch": 0.016348394726773106,
+      "grad_norm": 2.359375,
+      "learning_rate": 0.001087962962962963,
+      "loss": 3.7154,
+      "step": 235
+    },
+    {
+      "epoch": 0.01641796236390831,
+      "grad_norm": 3.453125,
+      "learning_rate": 0.0010925925925925925,
+      "loss": 3.5562,
+      "step": 236
+    },
+    {
+      "epoch": 0.016487530001043516,
+      "grad_norm": 3.296875,
+      "learning_rate": 0.0010972222222222223,
+      "loss": 3.2213,
+      "step": 237
+    },
+    {
+      "epoch": 0.016557097638178718,
+      "grad_norm": 4.4375,
+      "learning_rate": 0.001101851851851852,
+      "loss": 3.7439,
+      "step": 238
+    },
+    {
+      "epoch": 0.016626665275313923,
+      "grad_norm": 2.265625,
+      "learning_rate": 0.0011064814814814815,
+      "loss": 3.4636,
+      "step": 239
+    },
+    {
+      "epoch": 0.01669623291244913,
+      "grad_norm": 2.84375,
+      "learning_rate": 0.0011111111111111111,
+      "loss": 3.4433,
+      "step": 240
+    },
+    {
+      "epoch": 0.016765800549584334,
+      "grad_norm": 3.59375,
+      "learning_rate": 0.0011157407407407407,
+      "loss": 3.5306,
+      "step": 241
+    },
+    {
+      "epoch": 0.01683536818671954,
+      "grad_norm": 4.6875,
+      "learning_rate": 0.0011203703703703705,
+      "loss": 3.5676,
+      "step": 242
+    },
+    {
+      "epoch": 0.016904935823854744,
+      "grad_norm": 3.296875,
+      "learning_rate": 0.0011250000000000001,
+      "loss": 2.8527,
+      "step": 243
+    },
+    {
+      "epoch": 0.016974503460989946,
+      "grad_norm": 1.78125,
+      "learning_rate": 0.0011296296296296295,
+      "loss": 3.7269,
+      "step": 244
+    },
+    {
+      "epoch": 0.01704407109812515,
+      "grad_norm": 2.21875,
+      "learning_rate": 0.0011342592592592593,
+      "loss": 3.1405,
+      "step": 245
+    },
+    {
+      "epoch": 0.017113638735260357,
+      "grad_norm": 2.09375,
+      "learning_rate": 0.001138888888888889,
+      "loss": 3.7365,
+      "step": 246
+    },
+    {
+      "epoch": 0.017183206372395562,
+      "grad_norm": 2.796875,
+      "learning_rate": 0.0011435185185185185,
+      "loss": 2.9634,
+      "step": 247
+    },
+    {
+      "epoch": 0.017252774009530767,
+      "grad_norm": 2.890625,
+      "learning_rate": 0.0011481481481481481,
+      "loss": 3.4298,
+      "step": 248
+    },
+    {
+      "epoch": 0.017322341646665972,
+      "grad_norm": 3.09375,
+      "learning_rate": 0.0011527777777777777,
+      "loss": 3.5184,
+      "step": 249
+    },
+    {
+      "epoch": 0.017391909283801174,
+      "grad_norm": 2.671875,
+      "learning_rate": 0.0011574074074074076,
+      "loss": 3.6976,
+      "step": 250
+    },
+    {
+      "epoch": 0.01746147692093638,
+      "grad_norm": 2.609375,
+      "learning_rate": 0.001162037037037037,
+      "loss": 3.2592,
+      "step": 251
+    },
+    {
+      "epoch": 0.017531044558071585,
+      "grad_norm": 2.328125,
+      "learning_rate": 0.0011666666666666668,
+      "loss": 3.491,
+      "step": 252
+    },
+    {
+      "epoch": 0.01760061219520679,
+      "grad_norm": 2.328125,
+      "learning_rate": 0.0011712962962962964,
+      "loss": 3.6034,
+      "step": 253
+    },
+    {
+      "epoch": 0.017670179832341995,
+      "grad_norm": 2.28125,
+      "learning_rate": 0.0011759259259259257,
+      "loss": 3.5464,
+      "step": 254
+    },
+    {
+      "epoch": 0.0177397474694772,
+      "grad_norm": 2.84375,
+      "learning_rate": 0.0011805555555555556,
+      "loss": 3.6768,
+      "step": 255
+    },
+    {
+      "epoch": 0.017809315106612406,
+      "grad_norm": 1.8359375,
+      "learning_rate": 0.0011851851851851852,
+      "loss": 3.2789,
+      "step": 256
+    },
+    {
+      "epoch": 0.017878882743747607,
+      "grad_norm": 2.703125,
+      "learning_rate": 0.001189814814814815,
+      "loss": 3.4833,
+      "step": 257
+    },
+    {
+      "epoch": 0.017948450380882813,
+      "grad_norm": 2.875,
+      "learning_rate": 0.0011944444444444444,
+      "loss": 3.3873,
+      "step": 258
+    },
+    {
+      "epoch": 0.018018018018018018,
+      "grad_norm": 3.28125,
+      "learning_rate": 0.0011990740740740742,
+      "loss": 3.7072,
+      "step": 259
+    },
+    {
+      "epoch": 0.018087585655153223,
+      "grad_norm": 2.765625,
+      "learning_rate": 0.0012037037037037038,
+      "loss": 2.9247,
+      "step": 260
+    },
+    {
+      "epoch": 0.01815715329228843,
+      "grad_norm": 2.609375,
+      "learning_rate": 0.0012083333333333332,
+      "loss": 3.7827,
+      "step": 261
+    },
+    {
+      "epoch": 0.018226720929423634,
+      "grad_norm": 2.875,
+      "learning_rate": 0.001212962962962963,
+      "loss": 3.6445,
+      "step": 262
+    },
+    {
+      "epoch": 0.018296288566558835,
+      "grad_norm": 3.1875,
+      "learning_rate": 0.0012175925925925926,
+      "loss": 3.3478,
+      "step": 263
+    },
+    {
+      "epoch": 0.01836585620369404,
+      "grad_norm": 228.0,
+      "learning_rate": 0.0012222222222222224,
+      "loss": 4.2626,
+      "step": 264
+    },
+    {
+      "epoch": 0.018435423840829246,
+      "grad_norm": 2.9375,
+      "learning_rate": 0.0012268518518518518,
+      "loss": 3.646,
+      "step": 265
+    },
+    {
+      "epoch": 0.01850499147796445,
+      "grad_norm": 2.8125,
+      "learning_rate": 0.0012314814814814814,
+      "loss": 3.4842,
+      "step": 266
+    },
+    {
+      "epoch": 0.018574559115099656,
+      "grad_norm": 2.765625,
+      "learning_rate": 0.0012361111111111112,
+      "loss": 3.6565,
+      "step": 267
+    },
+    {
+      "epoch": 0.01864412675223486,
+      "grad_norm": 3.109375,
+      "learning_rate": 0.0012407407407407408,
+      "loss": 3.2464,
+      "step": 268
+    },
+    {
+      "epoch": 0.018713694389370063,
+      "grad_norm": 2.203125,
+      "learning_rate": 0.0012453703703703704,
+      "loss": 3.9116,
+      "step": 269
+    },
+    {
+      "epoch": 0.01878326202650527,
+      "grad_norm": 4.15625,
+      "learning_rate": 0.00125,
+      "loss": 3.1034,
+      "step": 270
+    },
+    {
+      "epoch": 0.018852829663640474,
+      "grad_norm": 3.53125,
+      "learning_rate": 0.0012546296296296296,
+      "loss": 3.4661,
+      "step": 271
+    },
+    {
+      "epoch": 0.01892239730077568,
+      "grad_norm": 2.765625,
+      "learning_rate": 0.0012592592592592592,
+      "loss": 3.4787,
+      "step": 272
+    },
+    {
+      "epoch": 0.018991964937910884,
+      "grad_norm": 2.8125,
+      "learning_rate": 0.0012638888888888888,
+      "loss": 3.4316,
+      "step": 273
+    },
+    {
+      "epoch": 0.01906153257504609,
+      "grad_norm": 2.703125,
+      "learning_rate": 0.0012685185185185186,
+      "loss": 3.4963,
+      "step": 274
+    },
+    {
+      "epoch": 0.019131100212181295,
+      "grad_norm": 2.46875,
+      "learning_rate": 0.0012731481481481483,
+      "loss": 3.8052,
+      "step": 275
+    },
+    {
+      "epoch": 0.019200667849316497,
+      "grad_norm": 2.515625,
+      "learning_rate": 0.0012777777777777776,
+      "loss": 3.6617,
+      "step": 276
+    },
+    {
+      "epoch": 0.019270235486451702,
+      "grad_norm": 3.3125,
+      "learning_rate": 0.0012824074074074075,
+      "loss": 3.0172,
+      "step": 277
+    },
+    {
+      "epoch": 0.019339803123586907,
+      "grad_norm": 2.828125,
+      "learning_rate": 0.001287037037037037,
+      "loss": 3.6914,
+      "step": 278
+    },
+    {
+      "epoch": 0.019409370760722112,
+      "grad_norm": 2.484375,
+      "learning_rate": 0.0012916666666666669,
+      "loss": 3.7064,
+      "step": 279
+    },
+    {
+      "epoch": 0.019478938397857318,
+      "grad_norm": 2.53125,
+      "learning_rate": 0.0012962962962962963,
+      "loss": 3.1845,
+      "step": 280
+    },
+    {
+      "epoch": 0.019548506034992523,
+      "grad_norm": 2.375,
+      "learning_rate": 0.0013009259259259259,
+      "loss": 3.29,
+      "step": 281
+    },
+    {
+      "epoch": 0.019618073672127725,
+      "grad_norm": 2.421875,
+      "learning_rate": 0.0013055555555555557,
+      "loss": 3.5425,
+      "step": 282
+    },
+    {
+      "epoch": 0.01968764130926293,
+      "grad_norm": 2.703125,
+      "learning_rate": 0.001310185185185185,
+      "loss": 3.0504,
+      "step": 283
+    },
+    {
+      "epoch": 0.019757208946398135,
+      "grad_norm": 2.0,
+      "learning_rate": 0.0013148148148148149,
+      "loss": 3.6795,
+      "step": 284
+    },
+    {
+      "epoch": 0.01982677658353334,
+      "grad_norm": 3.3125,
+      "learning_rate": 0.0013194444444444445,
+      "loss": 3.4279,
+      "step": 285
+    },
+    {
+      "epoch": 0.019896344220668546,
+      "grad_norm": 2.53125,
+      "learning_rate": 0.0013240740740740743,
+      "loss": 3.3526,
+      "step": 286
+    },
+    {
+      "epoch": 0.01996591185780375,
+      "grad_norm": 3.21875,
+      "learning_rate": 0.0013287037037037037,
+      "loss": 3.5018,
+      "step": 287
+    },
+    {
+      "epoch": 0.020035479494938953,
+      "grad_norm": 2.453125,
+      "learning_rate": 0.0013333333333333333,
+      "loss": 3.1581,
+      "step": 288
+    },
+    {
+      "epoch": 0.020105047132074158,
+      "grad_norm": 2.84375,
+      "learning_rate": 0.001337962962962963,
+      "loss": 3.0851,
+      "step": 289
+    },
+    {
+      "epoch": 0.020174614769209363,
+      "grad_norm": 2.65625,
+      "learning_rate": 0.0013425925925925925,
+      "loss": 3.6954,
+      "step": 290
+    },
+    {
+      "epoch": 0.02024418240634457,
+      "grad_norm": 2.40625,
+      "learning_rate": 0.0013472222222222223,
+      "loss": 3.5156,
+      "step": 291
+    },
+    {
+      "epoch": 0.020313750043479774,
+      "grad_norm": 1.765625,
+      "learning_rate": 0.001351851851851852,
+      "loss": 3.3342,
+      "step": 292
+    },
+    {
+      "epoch": 0.02038331768061498,
+      "grad_norm": 2.015625,
+      "learning_rate": 0.0013564814814814815,
+      "loss": 3.4297,
+      "step": 293
+    },
+    {
+      "epoch": 0.020452885317750184,
+      "grad_norm": 2.59375,
+      "learning_rate": 0.0013611111111111111,
+      "loss": 3.461,
+      "step": 294
+    },
+    {
+      "epoch": 0.020522452954885386,
+      "grad_norm": 2.4375,
+      "learning_rate": 0.0013657407407407407,
+      "loss": 3.9659,
+      "step": 295
+    },
+    {
+      "epoch": 0.02059202059202059,
+      "grad_norm": 2.359375,
+      "learning_rate": 0.0013703703703703705,
+      "loss": 3.7741,
+      "step": 296
+    },
+    {
+      "epoch": 0.020661588229155797,
+      "grad_norm": 1.75,
+      "learning_rate": 0.001375,
+      "loss": 3.4807,
+      "step": 297
+    },
+    {
+      "epoch": 0.020731155866291002,
+      "grad_norm": 1.8125,
+      "learning_rate": 0.0013796296296296295,
+      "loss": 3.4547,
+      "step": 298
+    },
+    {
+      "epoch": 0.020800723503426207,
+      "grad_norm": 3.125,
+      "learning_rate": 0.0013842592592592593,
+      "loss": 3.5839,
+      "step": 299
+    },
+    {
+      "epoch": 0.020870291140561412,
+      "grad_norm": 2.328125,
+      "learning_rate": 0.001388888888888889,
+      "loss": 3.4602,
+      "step": 300
+    },
+    {
+      "epoch": 0.020939858777696614,
+      "grad_norm": 2.71875,
+      "learning_rate": 0.0013935185185185185,
+      "loss": 3.0079,
+      "step": 301
+    },
+    {
+      "epoch": 0.02100942641483182,
+      "grad_norm": 3.0,
+      "learning_rate": 0.0013981481481481481,
+      "loss": 3.4029,
+      "step": 302
+    },
+    {
+      "epoch": 0.021078994051967025,
+      "grad_norm": 2.890625,
+      "learning_rate": 0.0014027777777777777,
+      "loss": 3.3146,
+      "step": 303
+    },
+    {
+      "epoch": 0.02114856168910223,
+      "grad_norm": 2.46875,
+      "learning_rate": 0.0014074074074074076,
+      "loss": 3.468,
+      "step": 304
+    },
+    {
+      "epoch": 0.021218129326237435,
+      "grad_norm": 2.46875,
+      "learning_rate": 0.001412037037037037,
+      "loss": 3.3304,
+      "step": 305
+    },
+    {
+      "epoch": 0.02128769696337264,
+      "grad_norm": 1.640625,
+      "learning_rate": 0.0014166666666666668,
+      "loss": 3.6421,
+      "step": 306
+    },
+    {
+      "epoch": 0.021357264600507842,
+      "grad_norm": 3.0,
+      "learning_rate": 0.0014212962962962964,
+      "loss": 2.9303,
+      "step": 307
+    },
+    {
+      "epoch": 0.021426832237643047,
+      "grad_norm": 2.171875,
+      "learning_rate": 0.0014259259259259258,
+      "loss": 3.2054,
+      "step": 308
+    },
+    {
+      "epoch": 0.021496399874778253,
+      "grad_norm": 1.90625,
+      "learning_rate": 0.0014305555555555556,
+      "loss": 3.7044,
+      "step": 309
+    },
+    {
+      "epoch": 0.021565967511913458,
+      "grad_norm": 2.046875,
+      "learning_rate": 0.0014351851851851852,
+      "loss": 3.7959,
+      "step": 310
+    },
+    {
+      "epoch": 0.021635535149048663,
+      "grad_norm": 1.890625,
+      "learning_rate": 0.001439814814814815,
+      "loss": 3.2365,
+      "step": 311
+    },
+    {
+      "epoch": 0.02170510278618387,
+      "grad_norm": 2.328125,
+      "learning_rate": 0.0014444444444444444,
+      "loss": 3.5516,
+      "step": 312
+    },
+    {
+      "epoch": 0.021774670423319074,
+      "grad_norm": 2.421875,
+      "learning_rate": 0.0014490740740740742,
+      "loss": 3.3478,
+      "step": 313
+    },
+    {
+      "epoch": 0.021844238060454275,
+      "grad_norm": 3.625,
+      "learning_rate": 0.0014537037037037038,
+      "loss": 2.91,
+      "step": 314
+    },
+    {
+      "epoch": 0.02191380569758948,
+      "grad_norm": 2.8125,
+      "learning_rate": 0.0014583333333333332,
+      "loss": 3.3708,
+      "step": 315
+    },
+    {
+      "epoch": 0.021983373334724686,
+      "grad_norm": 3.265625,
+      "learning_rate": 0.001462962962962963,
+      "loss": 3.5365,
+      "step": 316
+    },
+    {
+      "epoch": 0.02205294097185989,
+      "grad_norm": 2.09375,
+      "learning_rate": 0.0014675925925925926,
+      "loss": 3.3751,
+      "step": 317
+    },
+    {
+      "epoch": 0.022122508608995096,
+      "grad_norm": 2.515625,
+      "learning_rate": 0.0014722222222222224,
+      "loss": 3.3161,
+      "step": 318
+    },
+    {
+      "epoch": 0.0221920762461303,
+      "grad_norm": 3.265625,
+      "learning_rate": 0.0014768518518518518,
+      "loss": 3.3045,
+      "step": 319
+    },
+    {
+      "epoch": 0.022261643883265504,
+      "grad_norm": 1.953125,
+      "learning_rate": 0.0014814814814814814,
+      "loss": 3.5489,
+      "step": 320
+    },
+    {
+      "epoch": 0.02233121152040071,
+      "grad_norm": 3.015625,
+      "learning_rate": 0.0014861111111111112,
+      "loss": 3.5659,
+      "step": 321
+    },
+    {
+      "epoch": 0.022400779157535914,
+      "grad_norm": 3.34375,
+      "learning_rate": 0.0014907407407407408,
+      "loss": 3.2205,
+      "step": 322
+    },
+    {
+      "epoch": 0.02247034679467112,
+      "grad_norm": 2.640625,
+      "learning_rate": 0.0014953703703703704,
+      "loss": 3.3073,
+      "step": 323
+    },
+    {
+      "epoch": 0.022539914431806325,
+      "grad_norm": 2.171875,
+      "learning_rate": 0.0015,
+      "loss": 3.6127,
+      "step": 324
+    },
+    {
+      "epoch": 0.02260948206894153,
+      "grad_norm": 2.703125,
+      "learning_rate": 0.0015046296296296296,
+      "loss": 3.2646,
+      "step": 325
+    },
+    {
+      "epoch": 0.02267904970607673,
+      "grad_norm": 2.203125,
+      "learning_rate": 0.0015092592592592592,
+      "loss": 3.4497,
+      "step": 326
+    },
+    {
+      "epoch": 0.022748617343211937,
+      "grad_norm": 2.796875,
+      "learning_rate": 0.0015138888888888888,
+      "loss": 3.2017,
+      "step": 327
+    },
+    {
+      "epoch": 0.022818184980347142,
+      "grad_norm": 2.5,
+      "learning_rate": 0.0015185185185185187,
+      "loss": 3.5135,
+      "step": 328
+    },
+    {
+      "epoch": 0.022887752617482347,
+      "grad_norm": 2.4375,
+      "learning_rate": 0.0015231481481481483,
+      "loss": 3.2238,
+      "step": 329
+    },
+    {
+      "epoch": 0.022957320254617553,
+      "grad_norm": 2.0,
+      "learning_rate": 0.0015277777777777776,
+      "loss": 2.8799,
+      "step": 330
+    },
+    {
+      "epoch": 0.023026887891752758,
+      "grad_norm": 2.703125,
+      "learning_rate": 0.0015324074074074075,
+      "loss": 3.4803,
+      "step": 331
+    },
+    {
+      "epoch": 0.023096455528887963,
+      "grad_norm": 2.4375,
+      "learning_rate": 0.001537037037037037,
+      "loss": 3.3901,
+      "step": 332
+    },
+    {
+      "epoch": 0.023166023166023165,
+      "grad_norm": 2.25,
+      "learning_rate": 0.0015416666666666669,
+      "loss": 3.2674,
+      "step": 333
+    },
+    {
+      "epoch": 0.02323559080315837,
+      "grad_norm": 2.09375,
+      "learning_rate": 0.0015462962962962963,
+      "loss": 3.6683,
+      "step": 334
+    },
+    {
+      "epoch": 0.023305158440293575,
+      "grad_norm": 3.0625,
+      "learning_rate": 0.0015509259259259259,
+      "loss": 3.4703,
+      "step": 335
+    },
+    {
+      "epoch": 0.02337472607742878,
+      "grad_norm": 1.65625,
+      "learning_rate": 0.0015555555555555557,
+      "loss": 3.2555,
+      "step": 336
+    },
+    {
+      "epoch": 0.023444293714563986,
+      "grad_norm": 2.21875,
+      "learning_rate": 0.001560185185185185,
+      "loss": 3.6211,
+      "step": 337
+    },
+    {
+      "epoch": 0.02351386135169919,
+      "grad_norm": 2.421875,
+      "learning_rate": 0.0015648148148148149,
+      "loss": 3.5455,
+      "step": 338
+    },
+    {
+      "epoch": 0.023583428988834393,
+      "grad_norm": 2.25,
+      "learning_rate": 0.0015694444444444445,
+      "loss": 3.6748,
+      "step": 339
+    },
+    {
+      "epoch": 0.023652996625969598,
+      "grad_norm": 1.546875,
+      "learning_rate": 0.0015740740740740743,
+      "loss": 3.453,
+      "step": 340
+    },
+    {
+      "epoch": 0.023722564263104803,
+      "grad_norm": 1.9921875,
+      "learning_rate": 0.0015787037037037037,
+      "loss": 3.3692,
+      "step": 341
+    },
+    {
+      "epoch": 0.02379213190024001,
+      "grad_norm": 2.53125,
+      "learning_rate": 0.0015833333333333333,
+      "loss": 2.9392,
+      "step": 342
+    },
+    {
+      "epoch": 0.023861699537375214,
+      "grad_norm": 1.96875,
+      "learning_rate": 0.0015879629629629631,
+      "loss": 3.5069,
+      "step": 343
+    },
+    {
+      "epoch": 0.02393126717451042,
+      "grad_norm": 1.59375,
+      "learning_rate": 0.0015925925925925925,
+      "loss": 3.5041,
+      "step": 344
+    },
+    {
+      "epoch": 0.02400083481164562,
+      "grad_norm": 2.25,
+      "learning_rate": 0.0015972222222222223,
+      "loss": 3.8914,
+      "step": 345
+    },
+    {
+      "epoch": 0.024070402448780826,
+      "grad_norm": 2.328125,
+      "learning_rate": 0.001601851851851852,
+      "loss": 3.3932,
+      "step": 346
+    },
+    {
+      "epoch": 0.02413997008591603,
+      "grad_norm": 2.609375,
+      "learning_rate": 0.0016064814814814815,
+      "loss": 3.2436,
+      "step": 347
+    },
+    {
+      "epoch": 0.024209537723051237,
+      "grad_norm": 2.5625,
+      "learning_rate": 0.0016111111111111111,
+      "loss": 3.4229,
+      "step": 348
+    },
+    {
+      "epoch": 0.024279105360186442,
+      "grad_norm": 2.109375,
+      "learning_rate": 0.0016157407407407407,
+      "loss": 3.5363,
+      "step": 349
+    },
+    {
+      "epoch": 0.024348672997321647,
+      "grad_norm": 3.71875,
+      "learning_rate": 0.0016203703703703705,
+      "loss": 3.1235,
+      "step": 350
+    },
+    {
+      "epoch": 0.02441824063445685,
+      "grad_norm": 3.140625,
+      "learning_rate": 0.0016250000000000001,
+      "loss": 3.0492,
+      "step": 351
+    },
+    {
+      "epoch": 0.024487808271592054,
+      "grad_norm": 2.09375,
+      "learning_rate": 0.0016296296296296295,
+      "loss": 3.3601,
+      "step": 352
+    },
+    {
+      "epoch": 0.02455737590872726,
+      "grad_norm": 2.03125,
+      "learning_rate": 0.0016342592592592593,
+      "loss": 4.0213,
+      "step": 353
+    },
+    {
+      "epoch": 0.024626943545862465,
+      "grad_norm": 6.1875,
+      "learning_rate": 0.001638888888888889,
+      "loss": 3.2648,
+      "step": 354
+    },
+    {
+      "epoch": 0.02469651118299767,
+      "grad_norm": 2.875,
+      "learning_rate": 0.0016435185185185185,
+      "loss": 3.5466,
+      "step": 355
+    },
+    {
+      "epoch": 0.024766078820132875,
+      "grad_norm": 2.125,
+      "learning_rate": 0.0016481481481481482,
+      "loss": 3.611,
+      "step": 356
+    },
+    {
+      "epoch": 0.02483564645726808,
+      "grad_norm": 2.515625,
+      "learning_rate": 0.0016527777777777778,
+      "loss": 3.8875,
+      "step": 357
+    },
+    {
+      "epoch": 0.024905214094403282,
+      "grad_norm": 3.03125,
+      "learning_rate": 0.0016574074074074076,
+      "loss": 3.0971,
+      "step": 358
+    },
+    {
+      "epoch": 0.024974781731538487,
+      "grad_norm": 2.078125,
+      "learning_rate": 0.001662037037037037,
+      "loss": 3.4832,
+      "step": 359
+    },
+    {
+      "epoch": 0.025044349368673693,
+      "grad_norm": 1.890625,
+      "learning_rate": 0.0016666666666666668,
+      "loss": 3.4558,
+      "step": 360
+    },
+    {
+      "epoch": 0.025113917005808898,
+      "grad_norm": 2.109375,
+      "learning_rate": 0.0016712962962962964,
+      "loss": 3.7369,
+      "step": 361
+    },
+    {
+      "epoch": 0.025183484642944103,
+      "grad_norm": 3.15625,
+      "learning_rate": 0.0016759259259259258,
+      "loss": 3.5278,
+      "step": 362
+    },
+    {
+      "epoch": 0.02525305228007931,
+      "grad_norm": 2.875,
+      "learning_rate": 0.0016805555555555556,
+      "loss": 3.575,
+      "step": 363
+    },
+    {
+      "epoch": 0.02532261991721451,
+      "grad_norm": 2.34375,
+      "learning_rate": 0.0016851851851851852,
+      "loss": 3.7479,
+      "step": 364
+    },
+    {
+      "epoch": 0.025392187554349716,
+      "grad_norm": 3.015625,
+      "learning_rate": 0.001689814814814815,
+      "loss": 3.2374,
+      "step": 365
+    },
+    {
+      "epoch": 0.02546175519148492,
+      "grad_norm": 2.765625,
+      "learning_rate": 0.0016944444444444444,
+      "loss": 3.479,
+      "step": 366
+    },
+    {
+      "epoch": 0.025531322828620126,
+      "grad_norm": 2.5625,
+      "learning_rate": 0.0016990740740740742,
+      "loss": 3.6505,
+      "step": 367
+    },
+    {
+      "epoch": 0.02560089046575533,
+      "grad_norm": 1.6953125,
+      "learning_rate": 0.0017037037037037038,
+      "loss": 3.5518,
+      "step": 368
+    },
+    {
+      "epoch": 0.025670458102890537,
+      "grad_norm": 1.8203125,
+      "learning_rate": 0.0017083333333333332,
+      "loss": 3.5959,
+      "step": 369
+    },
+    {
+      "epoch": 0.02574002574002574,
+      "grad_norm": 1.9140625,
+      "learning_rate": 0.001712962962962963,
+      "loss": 3.6952,
+      "step": 370
+    },
+    {
+      "epoch": 0.025809593377160944,
+      "grad_norm": 2.09375,
+      "learning_rate": 0.0017175925925925926,
+      "loss": 3.4196,
+      "step": 371
+    },
+    {
+      "epoch": 0.02587916101429615,
+      "grad_norm": 2.109375,
+      "learning_rate": 0.0017222222222222224,
+      "loss": 3.3174,
+      "step": 372
+    },
+    {
+      "epoch": 0.025948728651431354,
+      "grad_norm": 1.7734375,
+      "learning_rate": 0.0017268518518518518,
+      "loss": 3.5334,
+      "step": 373
+    },
+    {
+      "epoch": 0.02601829628856656,
+      "grad_norm": 2.515625,
+      "learning_rate": 0.0017314814814814814,
+      "loss": 3.6951,
+      "step": 374
+    },
+    {
+      "epoch": 0.026087863925701765,
+      "grad_norm": 2.125,
+      "learning_rate": 0.0017361111111111112,
+      "loss": 3.0378,
+      "step": 375
+    },
+    {
+      "epoch": 0.02615743156283697,
+      "grad_norm": 1.9921875,
+      "learning_rate": 0.0017407407407407408,
+      "loss": 3.1484,
+      "step": 376
+    },
+    {
+      "epoch": 0.02622699919997217,
+      "grad_norm": 2.640625,
+      "learning_rate": 0.0017453703703703704,
+      "loss": 3.3457,
+      "step": 377
+    },
+    {
+      "epoch": 0.026296566837107377,
+      "grad_norm": 1.96875,
+      "learning_rate": 0.00175,
+      "loss": 3.4347,
+      "step": 378
+    },
+    {
+      "epoch": 0.026366134474242582,
+      "grad_norm": 1.6328125,
+      "learning_rate": 0.0017546296296296296,
+      "loss": 3.3564,
+      "step": 379
+    },
+    {
+      "epoch": 0.026435702111377787,
+      "grad_norm": 3.046875,
+      "learning_rate": 0.0017592592592592592,
+      "loss": 3.223,
+      "step": 380
+    },
+    {
+      "epoch": 0.026505269748512993,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.0017638888888888888,
+      "loss": 3.593,
+      "step": 381
+    },
+    {
+      "epoch": 0.026574837385648198,
+      "grad_norm": 1.5234375,
+      "learning_rate": 0.0017685185185185187,
+      "loss": 3.6302,
+      "step": 382
+    },
+    {
+      "epoch": 0.0266444050227834,
+      "grad_norm": 1.875,
+      "learning_rate": 0.0017731481481481483,
+      "loss": 3.5909,
+      "step": 383
+    },
+    {
+      "epoch": 0.026713972659918605,
+      "grad_norm": 2.03125,
+      "learning_rate": 0.0017777777777777776,
+      "loss": 3.5074,
+      "step": 384
+    },
+    {
+      "epoch": 0.02678354029705381,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.0017824074074074075,
+      "loss": 3.8452,
+      "step": 385
+    },
+    {
+      "epoch": 0.026853107934189015,
+      "grad_norm": 1.640625,
+      "learning_rate": 0.001787037037037037,
+      "loss": 3.5292,
+      "step": 386
+    },
+    {
+      "epoch": 0.02692267557132422,
+      "grad_norm": 1.5546875,
+      "learning_rate": 0.0017916666666666669,
+      "loss": 3.3985,
+      "step": 387
+    },
+    {
+      "epoch": 0.026992243208459426,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.0017962962962962963,
+      "loss": 3.6987,
+      "step": 388
+    },
+    {
+      "epoch": 0.027061810845594628,
+      "grad_norm": 2.9375,
+      "learning_rate": 0.0018009259259259259,
+      "loss": 3.6334,
+      "step": 389
+    },
+    {
+      "epoch": 0.027131378482729833,
+      "grad_norm": 1.6015625,
+      "learning_rate": 0.0018055555555555557,
+      "loss": 3.3575,
+      "step": 390
+    },
+    {
+      "epoch": 0.027200946119865038,
+      "grad_norm": 1.953125,
+      "learning_rate": 0.001810185185185185,
+      "loss": 3.7009,
+      "step": 391
+    },
+    {
+      "epoch": 0.027270513757000243,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.001814814814814815,
+      "loss": 3.602,
+      "step": 392
+    },
+    {
+      "epoch": 0.02734008139413545,
+      "grad_norm": 1.6953125,
+      "learning_rate": 0.0018194444444444445,
+      "loss": 3.6099,
+      "step": 393
+    },
+    {
+      "epoch": 0.027409649031270654,
+      "grad_norm": 2.0625,
+      "learning_rate": 0.0018240740740740743,
+      "loss": 3.3514,
+      "step": 394
+    },
+    {
+      "epoch": 0.02747921666840586,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.0018287037037037037,
+      "loss": 3.6098,
+      "step": 395
+    },
+    {
+      "epoch": 0.02754878430554106,
+      "grad_norm": 2.03125,
+      "learning_rate": 0.0018333333333333333,
+      "loss": 3.5332,
+      "step": 396
+    },
+    {
+      "epoch": 0.027618351942676266,
+      "grad_norm": 2.875,
+      "learning_rate": 0.0018379629629629631,
+      "loss": 3.3918,
+      "step": 397
+    },
+    {
+      "epoch": 0.02768791957981147,
+      "grad_norm": 1.828125,
+      "learning_rate": 0.0018425925925925925,
+      "loss": 3.2549,
+      "step": 398
+    },
+    {
+      "epoch": 0.027757487216946677,
+      "grad_norm": 2.921875,
+      "learning_rate": 0.0018472222222222223,
+      "loss": 3.378,
+      "step": 399
+    },
+    {
+      "epoch": 0.027827054854081882,
+      "grad_norm": 1.7421875,
+      "learning_rate": 0.001851851851851852,
+      "loss": 3.3514,
+      "step": 400
+    },
+    {
+      "epoch": 0.027896622491217087,
+      "grad_norm": 1.875,
+      "learning_rate": 0.0018564814814814815,
+      "loss": 3.163,
+      "step": 401
+    },
+    {
+      "epoch": 0.02796619012835229,
+      "grad_norm": 1.5859375,
+      "learning_rate": 0.0018611111111111111,
+      "loss": 3.3832,
+      "step": 402
+    },
+    {
+      "epoch": 0.028035757765487494,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0018657407407407407,
+      "loss": 3.3657,
+      "step": 403
+    },
+    {
+      "epoch": 0.0281053254026227,
+      "grad_norm": 2.15625,
+      "learning_rate": 0.0018703703703703705,
+      "loss": 3.7129,
+      "step": 404
+    },
+    {
+      "epoch": 0.028174893039757905,
+      "grad_norm": 1.609375,
+      "learning_rate": 0.001875,
+      "loss": 3.3413,
+      "step": 405
+    },
+    {
+      "epoch": 0.02824446067689311,
+      "grad_norm": 2.796875,
+      "learning_rate": 0.0018796296296296295,
+      "loss": 3.5818,
+      "step": 406
+    },
+    {
+      "epoch": 0.028314028314028315,
+      "grad_norm": 2.015625,
+      "learning_rate": 0.0018842592592592594,
+      "loss": 3.2641,
+      "step": 407
+    },
+    {
+      "epoch": 0.028383595951163517,
+      "grad_norm": 2.3125,
+      "learning_rate": 0.001888888888888889,
+      "loss": 3.4429,
+      "step": 408
+    },
+    {
+      "epoch": 0.028453163588298722,
+      "grad_norm": 2.0,
+      "learning_rate": 0.0018935185185185186,
+      "loss": 3.2141,
+      "step": 409
+    },
+    {
+      "epoch": 0.028522731225433928,
+      "grad_norm": 1.8515625,
+      "learning_rate": 0.0018981481481481482,
+      "loss": 3.4574,
+      "step": 410
+    },
+    {
+      "epoch": 0.028592298862569133,
+      "grad_norm": 1.734375,
+      "learning_rate": 0.0019027777777777778,
+      "loss": 3.148,
+      "step": 411
+    },
+    {
+      "epoch": 0.028661866499704338,
+      "grad_norm": 1.9609375,
+      "learning_rate": 0.0019074074074074076,
+      "loss": 3.7982,
+      "step": 412
+    },
+    {
+      "epoch": 0.028731434136839543,
+      "grad_norm": 1.6171875,
+      "learning_rate": 0.001912037037037037,
+      "loss": 3.3567,
+      "step": 413
+    },
+    {
+      "epoch": 0.02880100177397475,
+      "grad_norm": 3.046875,
+      "learning_rate": 0.0019166666666666668,
+      "loss": 3.6453,
+      "step": 414
+    },
+    {
+      "epoch": 0.02887056941110995,
+      "grad_norm": 2.1875,
+      "learning_rate": 0.0019212962962962964,
+      "loss": 3.4031,
+      "step": 415
+    },
+    {
+      "epoch": 0.028940137048245156,
+      "grad_norm": 1.6796875,
+      "learning_rate": 0.0019259259259259258,
+      "loss": 3.6185,
+      "step": 416
+    },
+    {
+      "epoch": 0.02900970468538036,
+      "grad_norm": 1.8828125,
+      "learning_rate": 0.0019305555555555556,
+      "loss": 3.0648,
+      "step": 417
+    },
+    {
+      "epoch": 0.029079272322515566,
+      "grad_norm": 1.7734375,
+      "learning_rate": 0.0019351851851851852,
+      "loss": 3.3076,
+      "step": 418
+    },
+    {
+      "epoch": 0.02914883995965077,
+      "grad_norm": 2.0,
+      "learning_rate": 0.001939814814814815,
+      "loss": 3.5246,
+      "step": 419
+    },
+    {
+      "epoch": 0.029218407596785977,
+      "grad_norm": 2.921875,
+      "learning_rate": 0.0019444444444444444,
+      "loss": 3.6498,
+      "step": 420
+    },
+    {
+      "epoch": 0.02928797523392118,
+      "grad_norm": 2.828125,
+      "learning_rate": 0.0019490740740740742,
+      "loss": 3.2355,
+      "step": 421
+    },
+    {
+      "epoch": 0.029357542871056384,
+      "grad_norm": 2.8125,
+      "learning_rate": 0.0019537037037037036,
+      "loss": 3.9044,
+      "step": 422
+    },
+    {
+      "epoch": 0.02942711050819159,
+      "grad_norm": 2.390625,
+      "learning_rate": 0.001958333333333333,
+      "loss": 3.5943,
+      "step": 423
+    },
+    {
+      "epoch": 0.029496678145326794,
+      "grad_norm": 1.6640625,
+      "learning_rate": 0.0019629629629629632,
+      "loss": 3.7883,
+      "step": 424
+    },
+    {
+      "epoch": 0.029566245782462,
+      "grad_norm": 1.8203125,
+      "learning_rate": 0.0019675925925925924,
+      "loss": 3.5543,
+      "step": 425
+    },
+    {
+      "epoch": 0.029635813419597205,
+      "grad_norm": 1.5,
+      "learning_rate": 0.0019722222222222224,
+      "loss": 3.6355,
+      "step": 426
+    },
+    {
+      "epoch": 0.029705381056732406,
+      "grad_norm": 1.6796875,
+      "learning_rate": 0.001976851851851852,
+      "loss": 3.1634,
+      "step": 427
+    },
+    {
+      "epoch": 0.02977494869386761,
+      "grad_norm": 2.0625,
+      "learning_rate": 0.0019814814814814816,
+      "loss": 3.2555,
+      "step": 428
+    },
+    {
+      "epoch": 0.029844516331002817,
+      "grad_norm": 2.421875,
+      "learning_rate": 0.0019861111111111112,
+      "loss": 3.1424,
+      "step": 429
+    },
+    {
+      "epoch": 0.029914083968138022,
+      "grad_norm": 2.15625,
+      "learning_rate": 0.001990740740740741,
+      "loss": 3.5279,
+      "step": 430
+    },
+    {
+      "epoch": 0.029983651605273227,
+      "grad_norm": 3.203125,
+      "learning_rate": 0.0019953703703703704,
+      "loss": 3.0176,
+      "step": 431
+    },
+    {
+      "epoch": 0.030053219242408433,
+      "grad_norm": 3.03125,
+      "learning_rate": 0.002,
+      "loss": 3.1193,
+      "step": 432
+    },
+    {
+      "epoch": 0.030122786879543638,
+      "grad_norm": 2.828125,
+      "learning_rate": 0.00199999997461252,
+      "loss": 3.2754,
+      "step": 433
+    },
+    {
+      "epoch": 0.03019235451667884,
+      "grad_norm": 2.125,
+      "learning_rate": 0.0019999998984500823,
+      "loss": 3.5648,
+      "step": 434
+    },
+    {
+      "epoch": 0.030261922153814045,
+      "grad_norm": 2.125,
+      "learning_rate": 0.0019999997715126894,
+      "loss": 3.3541,
+      "step": 435
+    },
+    {
+      "epoch": 0.03033148979094925,
+      "grad_norm": 2.078125,
+      "learning_rate": 0.001999999593800349,
+      "loss": 3.3309,
+      "step": 436
+    },
+    {
+      "epoch": 0.030401057428084455,
+      "grad_norm": 2.890625,
+      "learning_rate": 0.001999999365313069,
+      "loss": 3.3605,
+      "step": 437
+    },
+    {
+      "epoch": 0.03047062506521966,
+      "grad_norm": 1.921875,
+      "learning_rate": 0.0019999990860508623,
+      "loss": 3.3191,
+      "step": 438
+    },
+    {
+      "epoch": 0.030540192702354866,
+      "grad_norm": 2.25,
+      "learning_rate": 0.001999998756013742,
+      "loss": 3.1285,
+      "step": 439
+    },
+    {
+      "epoch": 0.030609760339490068,
+      "grad_norm": 2.984375,
+      "learning_rate": 0.001999998375201725,
+      "loss": 3.0919,
+      "step": 440
+    },
+    {
+      "epoch": 0.030679327976625273,
+      "grad_norm": 1.6328125,
+      "learning_rate": 0.001999997943614831,
+      "loss": 3.4453,
+      "step": 441
+    },
+    {
+      "epoch": 0.030748895613760478,
+      "grad_norm": 1.78125,
+      "learning_rate": 0.001999997461253082,
+      "loss": 3.4458,
+      "step": 442
+    },
+    {
+      "epoch": 0.030818463250895684,
+      "grad_norm": 2.3125,
+      "learning_rate": 0.001999996928116502,
+      "loss": 3.7504,
+      "step": 443
+    },
+    {
+      "epoch": 0.03088803088803089,
+      "grad_norm": 1.6171875,
+      "learning_rate": 0.001999996344205119,
+      "loss": 3.333,
+      "step": 444
+    },
+    {
+      "epoch": 0.030957598525166094,
+      "grad_norm": 1.9921875,
+      "learning_rate": 0.0019999957095189615,
+      "loss": 3.4064,
+      "step": 445
+    },
+    {
+      "epoch": 0.031027166162301296,
+      "grad_norm": 1.875,
+      "learning_rate": 0.001999995024058062,
+      "loss": 3.095,
+      "step": 446
+    },
+    {
+      "epoch": 0.0310967337994365,
+      "grad_norm": 1.9609375,
+      "learning_rate": 0.001999994287822456,
+      "loss": 3.4324,
+      "step": 447
+    },
+    {
+      "epoch": 0.031166301436571706,
+      "grad_norm": 2.34375,
+      "learning_rate": 0.00199999350081218,
+      "loss": 2.9063,
+      "step": 448
+    },
+    {
+      "epoch": 0.03123586907370691,
+      "grad_norm": 1.6875,
+      "learning_rate": 0.001999992663027275,
+      "loss": 3.5043,
+      "step": 449
+    },
+    {
+      "epoch": 0.03130543671084211,
+      "grad_norm": 1.9140625,
+      "learning_rate": 0.0019999917744677824,
+      "loss": 3.4375,
+      "step": 450
+    },
+    {
+      "epoch": 0.03137500434797732,
+      "grad_norm": 2.5625,
+      "learning_rate": 0.001999990835133748,
+      "loss": 3.3257,
+      "step": 451
+    },
+    {
+      "epoch": 0.031444571985112524,
+      "grad_norm": 1.8515625,
+      "learning_rate": 0.001999989845025219,
+      "loss": 3.6367,
+      "step": 452
+    },
+    {
+      "epoch": 0.03151413962224773,
+      "grad_norm": 1.4765625,
+      "learning_rate": 0.001999988804142246,
+      "loss": 3.3612,
+      "step": 453
+    },
+    {
+      "epoch": 0.031583707259382934,
+      "grad_norm": 1.859375,
+      "learning_rate": 0.0019999877124848822,
+      "loss": 3.1522,
+      "step": 454
+    },
+    {
+      "epoch": 0.03165327489651814,
+      "grad_norm": 2.15625,
+      "learning_rate": 0.0019999865700531826,
+      "loss": 3.3392,
+      "step": 455
+    },
+    {
+      "epoch": 0.031722842533653345,
+      "grad_norm": 1.671875,
+      "learning_rate": 0.001999985376847205,
+      "loss": 3.2132,
+      "step": 456
+    },
+    {
+      "epoch": 0.03179241017078855,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.0019999841328670106,
+      "loss": 2.9862,
+      "step": 457
+    },
+    {
+      "epoch": 0.031861977807923755,
+      "grad_norm": 1.6171875,
+      "learning_rate": 0.001999982838112662,
+      "loss": 3.2235,
+      "step": 458
+    },
+    {
+      "epoch": 0.03193154544505896,
+      "grad_norm": 2.1875,
+      "learning_rate": 0.0019999814925842256,
+      "loss": 3.1465,
+      "step": 459
+    },
+    {
+      "epoch": 0.032001113082194166,
+      "grad_norm": 2.03125,
+      "learning_rate": 0.0019999800962817687,
+      "loss": 3.6032,
+      "step": 460
+    },
+    {
+      "epoch": 0.03207068071932937,
+      "grad_norm": 1.7109375,
+      "learning_rate": 0.0019999786492053634,
+      "loss": 3.2446,
+      "step": 461
+    },
+    {
+      "epoch": 0.03214024835646457,
+      "grad_norm": 2.140625,
+      "learning_rate": 0.001999977151355082,
+      "loss": 3.5569,
+      "step": 462
+    },
+    {
+      "epoch": 0.032209815993599775,
+      "grad_norm": 1.796875,
+      "learning_rate": 0.001999975602731001,
+      "loss": 3.2789,
+      "step": 463
+    },
+    {
+      "epoch": 0.03227938363073498,
+      "grad_norm": 1.7109375,
+      "learning_rate": 0.0019999740033332,
+      "loss": 3.1285,
+      "step": 464
+    },
+    {
+      "epoch": 0.032348951267870185,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.0019999723531617586,
+      "loss": 3.2197,
+      "step": 465
+    },
+    {
+      "epoch": 0.03241851890500539,
+      "grad_norm": 1.9375,
+      "learning_rate": 0.0019999706522167617,
+      "loss": 3.6819,
+      "step": 466
+    },
+    {
+      "epoch": 0.032488086542140596,
+      "grad_norm": 1.9296875,
+      "learning_rate": 0.0019999689004982953,
+      "loss": 3.6977,
+      "step": 467
+    },
+    {
+      "epoch": 0.0325576541792758,
+      "grad_norm": 1.6484375,
+      "learning_rate": 0.001999967098006448,
+      "loss": 3.4413,
+      "step": 468
+    },
+    {
+      "epoch": 0.032627221816411006,
+      "grad_norm": 1.953125,
+      "learning_rate": 0.0019999652447413117,
+      "loss": 3.381,
+      "step": 469
+    },
+    {
+      "epoch": 0.03269678945354621,
+      "grad_norm": 1.6875,
+      "learning_rate": 0.0019999633407029806,
+      "loss": 3.3125,
+      "step": 470
+    },
+    {
+      "epoch": 0.03276635709068142,
+      "grad_norm": 2.125,
+      "learning_rate": 0.0019999613858915515,
+      "loss": 3.4642,
+      "step": 471
+    },
+    {
+      "epoch": 0.03283592472781662,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.0019999593803071234,
+      "loss": 3.5275,
+      "step": 472
+    },
+    {
+      "epoch": 0.03290549236495183,
+      "grad_norm": 1.6484375,
+      "learning_rate": 0.0019999573239497977,
+      "loss": 3.3881,
+      "step": 473
+    },
+    {
+      "epoch": 0.03297506000208703,
+      "grad_norm": 2.0625,
+      "learning_rate": 0.0019999552168196797,
+      "loss": 3.446,
+      "step": 474
+    },
+    {
+      "epoch": 0.03304462763922223,
+      "grad_norm": 1.8359375,
+      "learning_rate": 0.0019999530589168753,
+      "loss": 3.4311,
+      "step": 475
+    },
+    {
+      "epoch": 0.033114195276357436,
+      "grad_norm": 2.484375,
+      "learning_rate": 0.001999950850241495,
+      "loss": 3.626,
+      "step": 476
+    },
+    {
+      "epoch": 0.03318376291349264,
+      "grad_norm": 1.890625,
+      "learning_rate": 0.001999948590793651,
+      "loss": 3.486,
+      "step": 477
+    },
+    {
+      "epoch": 0.033253330550627846,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.0019999462805734575,
+      "loss": 3.8674,
+      "step": 478
+    },
+    {
+      "epoch": 0.03332289818776305,
+      "grad_norm": 2.0,
+      "learning_rate": 0.0019999439195810317,
+      "loss": 3.5158,
+      "step": 479
+    },
+    {
+      "epoch": 0.03339246582489826,
+      "grad_norm": 2.015625,
+      "learning_rate": 0.0019999415078164945,
+      "loss": 3.1666,
+      "step": 480
+    },
+    {
+      "epoch": 0.03346203346203346,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.001999939045279967,
+      "loss": 3.6701,
+      "step": 481
+    },
+    {
+      "epoch": 0.03353160109916867,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.0019999365319715748,
+      "loss": 3.4038,
+      "step": 482
+    },
+    {
+      "epoch": 0.03360116873630387,
+      "grad_norm": 2.109375,
+      "learning_rate": 0.0019999339678914456,
+      "loss": 3.3869,
+      "step": 483
+    },
+    {
+      "epoch": 0.03367073637343908,
+      "grad_norm": 1.8671875,
+      "learning_rate": 0.00199993135303971,
+      "loss": 3.4404,
+      "step": 484
+    },
+    {
+      "epoch": 0.03374030401057428,
+      "grad_norm": 1.65625,
+      "learning_rate": 0.0019999286874165,
+      "loss": 3.2973,
+      "step": 485
+    },
+    {
+      "epoch": 0.03380987164770949,
+      "grad_norm": 2.171875,
+      "learning_rate": 0.0019999259710219513,
+      "loss": 3.4502,
+      "step": 486
+    },
+    {
+      "epoch": 0.03387943928484469,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.0019999232038562013,
+      "loss": 3.3338,
+      "step": 487
+    },
+    {
+      "epoch": 0.03394900692197989,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0019999203859193916,
+      "loss": 3.5657,
+      "step": 488
+    },
+    {
+      "epoch": 0.0340185745591151,
+      "grad_norm": 1.7578125,
+      "learning_rate": 0.0019999175172116645,
+      "loss": 3.6222,
+      "step": 489
+    },
+    {
+      "epoch": 0.0340881421962503,
+      "grad_norm": 2.203125,
+      "learning_rate": 0.0019999145977331657,
+      "loss": 3.5801,
+      "step": 490
+    },
+    {
+      "epoch": 0.03415770983338551,
+      "grad_norm": 2.203125,
+      "learning_rate": 0.001999911627484044,
+      "loss": 3.0592,
+      "step": 491
+    },
+    {
+      "epoch": 0.03422727747052071,
+      "grad_norm": 2.1875,
+      "learning_rate": 0.0019999086064644493,
+      "loss": 3.1636,
+      "step": 492
+    },
+    {
+      "epoch": 0.03429684510765592,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.0019999055346745357,
+      "loss": 3.2533,
+      "step": 493
+    },
+    {
+      "epoch": 0.034366412744791124,
+      "grad_norm": 1.859375,
+      "learning_rate": 0.0019999024121144585,
+      "loss": 2.8686,
+      "step": 494
+    },
+    {
+      "epoch": 0.03443598038192633,
+      "grad_norm": 2.203125,
+      "learning_rate": 0.001999899238784377,
+      "loss": 3.1926,
+      "step": 495
+    },
+    {
+      "epoch": 0.034505548019061534,
+      "grad_norm": 1.8359375,
+      "learning_rate": 0.0019998960146844526,
+      "loss": 3.454,
+      "step": 496
+    },
+    {
+      "epoch": 0.03457511565619674,
+      "grad_norm": 1.6875,
+      "learning_rate": 0.001999892739814848,
+      "loss": 3.9289,
+      "step": 497
+    },
+    {
+      "epoch": 0.034644683293331945,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.0019998894141757297,
+      "loss": 3.6297,
+      "step": 498
+    },
+    {
+      "epoch": 0.03471425093046715,
+      "grad_norm": 1.640625,
+      "learning_rate": 0.001999886037767267,
+      "loss": 3.2615,
+      "step": 499
+    },
+    {
+      "epoch": 0.03478381856760235,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0019998826105896306,
+      "loss": 3.6395,
+      "step": 500
+    },
+    {
+      "epoch": 0.03485338620473755,
+      "grad_norm": 1.6484375,
+      "learning_rate": 0.0019998791326429955,
+      "loss": 2.9141,
+      "step": 501
+    },
+    {
+      "epoch": 0.03492295384187276,
+      "grad_norm": 1.8828125,
+      "learning_rate": 0.001999875603927538,
+      "loss": 3.3799,
+      "step": 502
+    },
+    {
+      "epoch": 0.034992521479007964,
+      "grad_norm": 2.015625,
+      "learning_rate": 0.001999872024443437,
+      "loss": 3.7696,
+      "step": 503
+    },
+    {
+      "epoch": 0.03506208911614317,
+      "grad_norm": 1.7421875,
+      "learning_rate": 0.001999868394190874,
+      "loss": 3.5759,
+      "step": 504
+    },
+    {
+      "epoch": 0.035131656753278374,
+      "grad_norm": 1.8359375,
+      "learning_rate": 0.001999864713170034,
+      "loss": 3.5347,
+      "step": 505
+    },
+    {
+      "epoch": 0.03520122439041358,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.001999860981381103,
+      "loss": 3.4105,
+      "step": 506
+    },
+    {
+      "epoch": 0.035270792027548785,
+      "grad_norm": 1.6484375,
+      "learning_rate": 0.0019998571988242716,
+      "loss": 3.516,
+      "step": 507
+    },
+    {
+      "epoch": 0.03534035966468399,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.001999853365499731,
+      "loss": 3.7225,
+      "step": 508
+    },
+    {
+      "epoch": 0.035409927301819195,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.001999849481407676,
+      "loss": 3.3798,
+      "step": 509
+    },
+    {
+      "epoch": 0.0354794949389544,
+      "grad_norm": 1.734375,
+      "learning_rate": 0.0019998455465483045,
+      "loss": 3.2524,
+      "step": 510
+    },
+    {
+      "epoch": 0.035549062576089606,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.0019998415609218155,
+      "loss": 3.4873,
+      "step": 511
+    },
+    {
+      "epoch": 0.03561863021322481,
+      "grad_norm": 1.515625,
+      "learning_rate": 0.0019998375245284116,
+      "loss": 3.3883,
+      "step": 512
+    },
+    {
+      "epoch": 0.03568819785036001,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0019998334373682977,
+      "loss": 3.401,
+      "step": 513
+    },
+    {
+      "epoch": 0.035757765487495215,
+      "grad_norm": 1.5703125,
+      "learning_rate": 0.0019998292994416814,
+      "loss": 2.9872,
+      "step": 514
+    },
+    {
+      "epoch": 0.03582733312463042,
+      "grad_norm": 1.8828125,
+      "learning_rate": 0.0019998251107487728,
+      "loss": 3.3022,
+      "step": 515
+    },
+    {
+      "epoch": 0.035896900761765625,
+      "grad_norm": 1.953125,
+      "learning_rate": 0.0019998208712897845,
+      "loss": 3.3426,
+      "step": 516
+    },
+    {
+      "epoch": 0.03596646839890083,
+      "grad_norm": 2.265625,
+      "learning_rate": 0.0019998165810649316,
+      "loss": 3.2612,
+      "step": 517
+    },
+    {
+      "epoch": 0.036036036036036036,
+      "grad_norm": 1.8046875,
+      "learning_rate": 0.0019998122400744327,
+      "loss": 3.2155,
+      "step": 518
+    },
+    {
+      "epoch": 0.03610560367317124,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.001999807848318507,
+      "loss": 3.3988,
+      "step": 519
+    },
+    {
+      "epoch": 0.036175171310306446,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.001999803405797379,
+      "loss": 3.5409,
+      "step": 520
+    },
+    {
+      "epoch": 0.03624473894744165,
+      "grad_norm": 1.609375,
+      "learning_rate": 0.001999798912511273,
+      "loss": 3.8011,
+      "step": 521
+    },
+    {
+      "epoch": 0.03631430658457686,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0019997943684604176,
+      "loss": 3.0568,
+      "step": 522
+    },
+    {
+      "epoch": 0.03638387422171206,
+      "grad_norm": 1.6640625,
+      "learning_rate": 0.001999789773645043,
+      "loss": 3.2267,
+      "step": 523
+    },
+    {
+      "epoch": 0.03645344185884727,
+      "grad_norm": 1.5859375,
+      "learning_rate": 0.001999785128065384,
+      "loss": 3.0883,
+      "step": 524
+    },
+    {
+      "epoch": 0.036523009495982466,
+      "grad_norm": 1.546875,
+      "learning_rate": 0.001999780431721675,
+      "loss": 3.5484,
+      "step": 525
+    },
+    {
+      "epoch": 0.03659257713311767,
+      "grad_norm": 1.8203125,
+      "learning_rate": 0.0019997756846141545,
+      "loss": 3.5082,
+      "step": 526
+    },
+    {
+      "epoch": 0.036662144770252876,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0019997708867430645,
+      "loss": 3.5143,
+      "step": 527
+    },
+    {
+      "epoch": 0.03673171240738808,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.001999766038108648,
+      "loss": 3.2694,
+      "step": 528
+    },
+    {
+      "epoch": 0.03680128004452329,
+      "grad_norm": 1.9453125,
+      "learning_rate": 0.0019997611387111516,
+      "loss": 3.274,
+      "step": 529
+    },
+    {
+      "epoch": 0.03687084768165849,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.001999756188550823,
+      "loss": 3.415,
+      "step": 530
+    },
+    {
+      "epoch": 0.0369404153187937,
+      "grad_norm": 2.0625,
+      "learning_rate": 0.001999751187627915,
+      "loss": 3.8245,
+      "step": 531
+    },
+    {
+      "epoch": 0.0370099829559289,
+      "grad_norm": 2.203125,
+      "learning_rate": 0.0019997461359426805,
+      "loss": 3.3096,
+      "step": 532
+    },
+    {
+      "epoch": 0.03707955059306411,
+      "grad_norm": 2.625,
+      "learning_rate": 0.001999741033495376,
+      "loss": 3.1154,
+      "step": 533
+    },
+    {
+      "epoch": 0.03714911823019931,
+      "grad_norm": 2.46875,
+      "learning_rate": 0.0019997358802862617,
+      "loss": 3.028,
+      "step": 534
+    },
+    {
+      "epoch": 0.03721868586733452,
+      "grad_norm": 2.375,
+      "learning_rate": 0.0019997306763155976,
+      "loss": 3.3044,
+      "step": 535
+    },
+    {
+      "epoch": 0.03728825350446972,
+      "grad_norm": 1.7109375,
+      "learning_rate": 0.001999725421583649,
+      "loss": 3.5187,
+      "step": 536
+    },
+    {
+      "epoch": 0.03735782114160493,
+      "grad_norm": 1.75,
+      "learning_rate": 0.001999720116090683,
+      "loss": 3.1256,
+      "step": 537
+    },
+    {
+      "epoch": 0.03742738877874013,
+      "grad_norm": 1.828125,
+      "learning_rate": 0.001999714759836968,
+      "loss": 3.4639,
+      "step": 538
+    },
+    {
+      "epoch": 0.03749695641587533,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0019997093528227768,
+      "loss": 3.4131,
+      "step": 539
+    },
+    {
+      "epoch": 0.03756652405301054,
+      "grad_norm": 1.609375,
+      "learning_rate": 0.001999703895048383,
+      "loss": 3.2755,
+      "step": 540
+    },
+    {
+      "epoch": 0.03763609169014574,
+      "grad_norm": 1.703125,
+      "learning_rate": 0.0019996983865140645,
+      "loss": 3.0651,
+      "step": 541
+    },
+    {
+      "epoch": 0.03770565932728095,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.001999692827220101,
+      "loss": 3.7271,
+      "step": 542
+    },
+    {
+      "epoch": 0.03777522696441615,
+      "grad_norm": 1.4765625,
+      "learning_rate": 0.001999687217166774,
+      "loss": 2.8666,
+      "step": 543
+    },
+    {
+      "epoch": 0.03784479460155136,
+      "grad_norm": 2.609375,
+      "learning_rate": 0.0019996815563543694,
+      "loss": 3.3484,
+      "step": 544
+    },
+    {
+      "epoch": 0.037914362238686564,
+      "grad_norm": 1.9140625,
+      "learning_rate": 0.0019996758447831746,
+      "loss": 2.9785,
+      "step": 545
+    },
+    {
+      "epoch": 0.03798392987582177,
+      "grad_norm": 2.09375,
+      "learning_rate": 0.0019996700824534783,
+      "loss": 3.1998,
+      "step": 546
+    },
+    {
+      "epoch": 0.038053497512956974,
+      "grad_norm": 1.7734375,
+      "learning_rate": 0.001999664269365574,
+      "loss": 2.9826,
+      "step": 547
+    },
+    {
+      "epoch": 0.03812306515009218,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.001999658405519757,
+      "loss": 3.3673,
+      "step": 548
+    },
+    {
+      "epoch": 0.038192632787227385,
+      "grad_norm": 1.640625,
+      "learning_rate": 0.001999652490916325,
+      "loss": 3.7924,
+      "step": 549
+    },
+    {
+      "epoch": 0.03826220042436259,
+      "grad_norm": 1.578125,
+      "learning_rate": 0.001999646525555578,
+      "loss": 3.3943,
+      "step": 550
+    },
+    {
+      "epoch": 0.03833176806149779,
+      "grad_norm": 1.921875,
+      "learning_rate": 0.0019996405094378188,
+      "loss": 2.9449,
+      "step": 551
+    },
+    {
+      "epoch": 0.03840133569863299,
+      "grad_norm": 2.53125,
+      "learning_rate": 0.0019996344425633533,
+      "loss": 3.5443,
+      "step": 552
+    },
+    {
+      "epoch": 0.0384709033357682,
+      "grad_norm": 1.9765625,
+      "learning_rate": 0.0019996283249324896,
+      "loss": 3.2403,
+      "step": 553
+    },
+    {
+      "epoch": 0.038540470972903404,
+      "grad_norm": 1.515625,
+      "learning_rate": 0.0019996221565455378,
+      "loss": 3.493,
+      "step": 554
+    },
+    {
+      "epoch": 0.03861003861003861,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.0019996159374028113,
+      "loss": 3.4755,
+      "step": 555
+    },
+    {
+      "epoch": 0.038679606247173814,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.0019996096675046256,
+      "loss": 3.5693,
+      "step": 556
+    },
+    {
+      "epoch": 0.03874917388430902,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.0019996033468513003,
+      "loss": 3.2506,
+      "step": 557
+    },
+    {
+      "epoch": 0.038818741521444225,
+      "grad_norm": 1.75,
+      "learning_rate": 0.001999596975443155,
+      "loss": 3.5411,
+      "step": 558
+    },
+    {
+      "epoch": 0.03888830915857943,
+      "grad_norm": 1.7265625,
+      "learning_rate": 0.0019995905532805133,
+      "loss": 3.6029,
+      "step": 559
+    },
+    {
+      "epoch": 0.038957876795714635,
+      "grad_norm": 2.03125,
+      "learning_rate": 0.001999584080363702,
+      "loss": 3.3463,
+      "step": 560
+    },
+    {
+      "epoch": 0.03902744443284984,
+      "grad_norm": 3.0625,
+      "learning_rate": 0.001999577556693049,
+      "loss": 3.5366,
+      "step": 561
+    },
+    {
+      "epoch": 0.039097012069985046,
+      "grad_norm": 1.84375,
+      "learning_rate": 0.001999570982268886,
+      "loss": 3.3199,
+      "step": 562
+    },
+    {
+      "epoch": 0.039166579707120244,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.001999564357091547,
+      "loss": 3.5079,
+      "step": 563
+    },
+    {
+      "epoch": 0.03923614734425545,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.001999557681161368,
+      "loss": 3.5224,
+      "step": 564
+    },
+    {
+      "epoch": 0.039305714981390655,
+      "grad_norm": 1.59375,
+      "learning_rate": 0.001999550954478688,
+      "loss": 3.674,
+      "step": 565
+    },
+    {
+      "epoch": 0.03937528261852586,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0019995441770438486,
+      "loss": 3.3461,
+      "step": 566
+    },
+    {
+      "epoch": 0.039444850255661065,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.001999537348857194,
+      "loss": 3.3256,
+      "step": 567
+    },
+    {
+      "epoch": 0.03951441789279627,
+      "grad_norm": 2.078125,
+      "learning_rate": 0.0019995304699190713,
+      "loss": 3.0955,
+      "step": 568
+    },
+    {
+      "epoch": 0.039583985529931476,
+      "grad_norm": 1.96875,
+      "learning_rate": 0.0019995235402298288,
+      "loss": 3.6332,
+      "step": 569
+    },
+    {
+      "epoch": 0.03965355316706668,
+      "grad_norm": 1.71875,
+      "learning_rate": 0.0019995165597898193,
+      "loss": 3.2571,
+      "step": 570
+    },
+    {
+      "epoch": 0.039723120804201886,
+      "grad_norm": 1.640625,
+      "learning_rate": 0.0019995095285993965,
+      "loss": 2.8634,
+      "step": 571
+    },
+    {
+      "epoch": 0.03979268844133709,
+      "grad_norm": 1.6796875,
+      "learning_rate": 0.001999502446658918,
+      "loss": 3.444,
+      "step": 572
+    },
+    {
+      "epoch": 0.0398622560784723,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.001999495313968743,
+      "loss": 3.4595,
+      "step": 573
+    },
+    {
+      "epoch": 0.0399318237156075,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0019994881305292335,
+      "loss": 3.588,
+      "step": 574
+    },
+    {
+      "epoch": 0.04000139135274271,
+      "grad_norm": 2.1875,
+      "learning_rate": 0.0019994808963407548,
+      "loss": 3.1029,
+      "step": 575
+    },
+    {
+      "epoch": 0.040070958989877906,
+      "grad_norm": 1.6796875,
+      "learning_rate": 0.001999473611403674,
+      "loss": 3.4616,
+      "step": 576
+    },
+    {
+      "epoch": 0.04014052662701311,
+      "grad_norm": 2.40625,
+      "learning_rate": 0.001999466275718361,
+      "loss": 3.1576,
+      "step": 577
+    },
+    {
+      "epoch": 0.040210094264148316,
+      "grad_norm": 2.453125,
+      "learning_rate": 0.001999458889285188,
+      "loss": 3.1757,
+      "step": 578
+    },
+    {
+      "epoch": 0.04027966190128352,
+      "grad_norm": 1.6796875,
+      "learning_rate": 0.00199945145210453,
+      "loss": 2.8828,
+      "step": 579
+    },
+    {
+      "epoch": 0.04034922953841873,
+      "grad_norm": 1.578125,
+      "learning_rate": 0.0019994439641767654,
+      "loss": 3.1672,
+      "step": 580
+    },
+    {
+      "epoch": 0.04041879717555393,
+      "grad_norm": 1.6796875,
+      "learning_rate": 0.001999436425502274,
+      "loss": 3.35,
+      "step": 581
+    },
+    {
+      "epoch": 0.04048836481268914,
+      "grad_norm": 1.671875,
+      "learning_rate": 0.0019994288360814377,
+      "loss": 3.3505,
+      "step": 582
+    },
+    {
+      "epoch": 0.04055793244982434,
+      "grad_norm": 1.5,
+      "learning_rate": 0.001999421195914643,
+      "loss": 3.2588,
+      "step": 583
+    },
+    {
+      "epoch": 0.04062750008695955,
+      "grad_norm": 2.0,
+      "learning_rate": 0.0019994135050022776,
+      "loss": 3.6955,
+      "step": 584
+    },
+    {
+      "epoch": 0.04069706772409475,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.0019994057633447317,
+      "loss": 3.1572,
+      "step": 585
+    },
+    {
+      "epoch": 0.04076663536122996,
+      "grad_norm": 1.671875,
+      "learning_rate": 0.0019993979709423985,
+      "loss": 3.1665,
+      "step": 586
+    },
+    {
+      "epoch": 0.04083620299836516,
+      "grad_norm": 1.6796875,
+      "learning_rate": 0.0019993901277956735,
+      "loss": 3.7347,
+      "step": 587
+    },
+    {
+      "epoch": 0.04090577063550037,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0019993822339049554,
+      "loss": 3.4667,
+      "step": 588
+    },
+    {
+      "epoch": 0.04097533827263557,
+      "grad_norm": 1.5390625,
+      "learning_rate": 0.0019993742892706447,
+      "loss": 3.3394,
+      "step": 589
+    },
+    {
+      "epoch": 0.04104490590977077,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.001999366293893145,
+      "loss": 3.34,
+      "step": 590
+    },
+    {
+      "epoch": 0.04111447354690598,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0019993582477728614,
+      "loss": 3.2674,
+      "step": 591
+    },
+    {
+      "epoch": 0.04118404118404118,
+      "grad_norm": 1.640625,
+      "learning_rate": 0.0019993501509102036,
+      "loss": 3.2031,
+      "step": 592
+    },
+    {
+      "epoch": 0.04125360882117639,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.001999342003305582,
+      "loss": 3.6727,
+      "step": 593
+    },
+    {
+      "epoch": 0.04132317645831159,
+      "grad_norm": 1.5625,
+      "learning_rate": 0.0019993338049594106,
+      "loss": 2.8559,
+      "step": 594
+    },
+    {
+      "epoch": 0.0413927440954468,
+      "grad_norm": 1.5,
+      "learning_rate": 0.001999325555872106,
+      "loss": 3.2697,
+      "step": 595
+    },
+    {
+      "epoch": 0.041462311732582004,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.001999317256044086,
+      "loss": 3.6079,
+      "step": 596
+    },
+    {
+      "epoch": 0.04153187936971721,
+      "grad_norm": 1.5,
+      "learning_rate": 0.0019993089054757733,
+      "loss": 3.4023,
+      "step": 597
+    },
+    {
+      "epoch": 0.041601447006852414,
+      "grad_norm": 2.265625,
+      "learning_rate": 0.001999300504167591,
+      "loss": 3.3421,
+      "step": 598
+    },
+    {
+      "epoch": 0.04167101464398762,
+      "grad_norm": 1.6484375,
+      "learning_rate": 0.0019992920521199656,
+      "loss": 3.1932,
+      "step": 599
+    },
+    {
+      "epoch": 0.041740582281122825,
+      "grad_norm": 1.9453125,
+      "learning_rate": 0.001999283549333327,
+      "loss": 3.1098,
+      "step": 600
+    },
+    {
+      "epoch": 0.04181014991825802,
+      "grad_norm": 2.390625,
+      "learning_rate": 0.001999274995808106,
+      "loss": 3.0167,
+      "step": 601
+    },
+    {
+      "epoch": 0.04187971755539323,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.001999266391544738,
+      "loss": 3.2582,
+      "step": 602
+    },
+    {
+      "epoch": 0.041949285192528434,
+      "grad_norm": 1.9453125,
+      "learning_rate": 0.0019992577365436593,
+      "loss": 3.6975,
+      "step": 603
+    },
+    {
+      "epoch": 0.04201885282966364,
+      "grad_norm": 1.5703125,
+      "learning_rate": 0.001999249030805309,
+      "loss": 3.2861,
+      "step": 604
+    },
+    {
+      "epoch": 0.042088420466798844,
+      "grad_norm": 1.6796875,
+      "learning_rate": 0.00199924027433013,
+      "loss": 3.6903,
+      "step": 605
+    },
+    {
+      "epoch": 0.04215798810393405,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.0019992314671185662,
+      "loss": 3.3531,
+      "step": 606
+    },
+    {
+      "epoch": 0.042227555741069255,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.001999222609171065,
+      "loss": 3.5518,
+      "step": 607
+    },
+    {
+      "epoch": 0.04229712337820446,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.001999213700488076,
+      "loss": 3.0014,
+      "step": 608
+    },
+    {
+      "epoch": 0.042366691015339665,
+      "grad_norm": 1.6328125,
+      "learning_rate": 0.0019992047410700518,
+      "loss": 2.9034,
+      "step": 609
+    },
+    {
+      "epoch": 0.04243625865247487,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0019991957309174473,
+      "loss": 3.6726,
+      "step": 610
+    },
+    {
+      "epoch": 0.042505826289610076,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0019991866700307197,
+      "loss": 3.7199,
+      "step": 611
+    },
+    {
+      "epoch": 0.04257539392674528,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0019991775584103297,
+      "loss": 3.639,
+      "step": 612
+    },
+    {
+      "epoch": 0.042644961563880486,
+      "grad_norm": 1.546875,
+      "learning_rate": 0.001999168396056739,
+      "loss": 3.2102,
+      "step": 613
+    },
+    {
+      "epoch": 0.042714529201015684,
+      "grad_norm": 1.796875,
+      "learning_rate": 0.0019991591829704135,
+      "loss": 3.319,
+      "step": 614
+    },
+    {
+      "epoch": 0.04278409683815089,
+      "grad_norm": 1.5546875,
+      "learning_rate": 0.0019991499191518206,
+      "loss": 3.1985,
+      "step": 615
+    },
+    {
+      "epoch": 0.042853664475286095,
+      "grad_norm": 1.5234375,
+      "learning_rate": 0.0019991406046014314,
+      "loss": 3.4867,
+      "step": 616
+    },
+    {
+      "epoch": 0.0429232321124213,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.001999131239319718,
+      "loss": 3.304,
+      "step": 617
+    },
+    {
+      "epoch": 0.042992799749556505,
+      "grad_norm": 1.609375,
+      "learning_rate": 0.0019991218233071564,
+      "loss": 3.4094,
+      "step": 618
+    },
+    {
+      "epoch": 0.04306236738669171,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0019991123565642247,
+      "loss": 3.1638,
+      "step": 619
+    },
+    {
+      "epoch": 0.043131935023826916,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.001999102839091403,
+      "loss": 3.3121,
+      "step": 620
+    },
+    {
+      "epoch": 0.04320150266096212,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.001999093270889175,
+      "loss": 3.2227,
+      "step": 621
+    },
+    {
+      "epoch": 0.043271070298097326,
+      "grad_norm": 2.0625,
+      "learning_rate": 0.001999083651958027,
+      "loss": 3.4165,
+      "step": 622
+    },
+    {
+      "epoch": 0.04334063793523253,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0019990739822984467,
+      "loss": 3.5862,
+      "step": 623
+    },
+    {
+      "epoch": 0.04341020557236774,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.0019990642619109253,
+      "loss": 3.0626,
+      "step": 624
+    },
+    {
+      "epoch": 0.04347977320950294,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.001999054490795956,
+      "loss": 3.7579,
+      "step": 625
+    },
+    {
+      "epoch": 0.04354934084663815,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.001999044668954036,
+      "loss": 3.245,
+      "step": 626
+    },
+    {
+      "epoch": 0.043618908483773346,
+      "grad_norm": 1.984375,
+      "learning_rate": 0.0019990347963856625,
+      "loss": 3.6678,
+      "step": 627
+    },
+    {
+      "epoch": 0.04368847612090855,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.001999024873091338,
+      "loss": 3.2047,
+      "step": 628
+    },
+    {
+      "epoch": 0.043758043758043756,
+      "grad_norm": 1.84375,
+      "learning_rate": 0.0019990148990715654,
+      "loss": 3.3547,
+      "step": 629
+    },
+    {
+      "epoch": 0.04382761139517896,
+      "grad_norm": 1.671875,
+      "learning_rate": 0.001999004874326852,
+      "loss": 3.2467,
+      "step": 630
+    },
+    {
+      "epoch": 0.04389717903231417,
+      "grad_norm": 1.5703125,
+      "learning_rate": 0.001998994798857707,
+      "loss": 3.5803,
+      "step": 631
+    },
+    {
+      "epoch": 0.04396674666944937,
+      "grad_norm": 2.09375,
+      "learning_rate": 0.0019989846726646407,
+      "loss": 3.2881,
+      "step": 632
+    },
+    {
+      "epoch": 0.04403631430658458,
+      "grad_norm": 1.7578125,
+      "learning_rate": 0.001998974495748168,
+      "loss": 3.0706,
+      "step": 633
+    },
+    {
+      "epoch": 0.04410588194371978,
+      "grad_norm": 1.6484375,
+      "learning_rate": 0.0019989642681088058,
+      "loss": 3.4599,
+      "step": 634
+    },
+    {
+      "epoch": 0.04417544958085499,
+      "grad_norm": 1.6015625,
+      "learning_rate": 0.001998953989747073,
+      "loss": 3.3248,
+      "step": 635
+    },
+    {
+      "epoch": 0.04424501721799019,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.001998943660663492,
+      "loss": 3.4659,
+      "step": 636
+    },
+    {
+      "epoch": 0.0443145848551254,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.001998933280858587,
+      "loss": 3.6053,
+      "step": 637
+    },
+    {
+      "epoch": 0.0443841524922606,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0019989228503328846,
+      "loss": 3.4567,
+      "step": 638
+    },
+    {
+      "epoch": 0.0444537201293958,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.001998912369086915,
+      "loss": 2.8121,
+      "step": 639
+    },
+    {
+      "epoch": 0.04452328776653101,
+      "grad_norm": 1.5,
+      "learning_rate": 0.00199890183712121,
+      "loss": 3.4138,
+      "step": 640
+    },
+    {
+      "epoch": 0.04459285540366621,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.001998891254436305,
+      "loss": 3.1913,
+      "step": 641
+    },
+    {
+      "epoch": 0.04466242304080142,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0019988806210327367,
+      "loss": 3.3442,
+      "step": 642
+    },
+    {
+      "epoch": 0.04473199067793662,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.001998869936911045,
+      "loss": 3.4197,
+      "step": 643
+    },
+    {
+      "epoch": 0.04480155831507183,
+      "grad_norm": 1.765625,
+      "learning_rate": 0.0019988592020717725,
+      "loss": 3.5198,
+      "step": 644
+    },
+    {
+      "epoch": 0.04487112595220703,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.001998848416515465,
+      "loss": 3.3542,
+      "step": 645
+    },
+    {
+      "epoch": 0.04494069358934224,
+      "grad_norm": 1.59375,
+      "learning_rate": 0.001998837580242669,
+      "loss": 3.1037,
+      "step": 646
+    },
+    {
+      "epoch": 0.045010261226477444,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.001998826693253935,
+      "loss": 3.4104,
+      "step": 647
+    },
+    {
+      "epoch": 0.04507982886361265,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0019988157555498164,
+      "loss": 3.5321,
+      "step": 648
+    },
+    {
+      "epoch": 0.045149396500747854,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.001998804767130868,
+      "loss": 3.0773,
+      "step": 649
+    },
+    {
+      "epoch": 0.04521896413788306,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0019987937279976474,
+      "loss": 3.6503,
+      "step": 650
+    },
+    {
+      "epoch": 0.045288531775018265,
+      "grad_norm": 1.5625,
+      "learning_rate": 0.001998782638150716,
+      "loss": 3.2508,
+      "step": 651
+    },
+    {
+      "epoch": 0.04535809941215346,
+      "grad_norm": 1.671875,
+      "learning_rate": 0.001998771497590637,
+      "loss": 3.3105,
+      "step": 652
+    },
+    {
+      "epoch": 0.04542766704928867,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.001998760306317975,
+      "loss": 3.1075,
+      "step": 653
+    },
+    {
+      "epoch": 0.045497234686423874,
+      "grad_norm": 2.40625,
+      "learning_rate": 0.0019987490643332984,
+      "loss": 3.257,
+      "step": 654
+    },
+    {
+      "epoch": 0.04556680232355908,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.001998737771637179,
+      "loss": 3.4907,
+      "step": 655
+    },
+    {
+      "epoch": 0.045636369960694284,
+      "grad_norm": 1.5234375,
+      "learning_rate": 0.0019987264282301893,
+      "loss": 3.7234,
+      "step": 656
+    },
+    {
+      "epoch": 0.04570593759782949,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0019987150341129055,
+      "loss": 2.9309,
+      "step": 657
+    },
+    {
+      "epoch": 0.045775505234964695,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.001998703589285906,
+      "loss": 3.348,
+      "step": 658
+    },
+    {
+      "epoch": 0.0458450728720999,
+      "grad_norm": 1.5859375,
+      "learning_rate": 0.0019986920937497725,
+      "loss": 3.3921,
+      "step": 659
+    },
+    {
+      "epoch": 0.045914640509235105,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.001998680547505088,
+      "loss": 3.0439,
+      "step": 660
+    },
+    {
+      "epoch": 0.04598420814637031,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.001998668950552439,
+      "loss": 3.1959,
+      "step": 661
+    },
+    {
+      "epoch": 0.046053775783505516,
+      "grad_norm": 2.125,
+      "learning_rate": 0.0019986573028924143,
+      "loss": 3.4075,
+      "step": 662
+    },
+    {
+      "epoch": 0.04612334342064072,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.0019986456045256056,
+      "loss": 3.1542,
+      "step": 663
+    },
+    {
+      "epoch": 0.046192911057775926,
+      "grad_norm": 1.5,
+      "learning_rate": 0.001998633855452607,
+      "loss": 3.1908,
+      "step": 664
+    },
+    {
+      "epoch": 0.046262478694911124,
+      "grad_norm": 1.578125,
+      "learning_rate": 0.001998622055674014,
+      "loss": 3.5424,
+      "step": 665
+    },
+    {
+      "epoch": 0.04633204633204633,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0019986102051904268,
+      "loss": 3.1491,
+      "step": 666
+    },
+    {
+      "epoch": 0.046401613969181535,
+      "grad_norm": 1.5625,
+      "learning_rate": 0.0019985983040024468,
+      "loss": 3.4403,
+      "step": 667
+    },
+    {
+      "epoch": 0.04647118160631674,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.001998586352110678,
+      "loss": 3.3588,
+      "step": 668
+    },
+    {
+      "epoch": 0.046540749243451945,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0019985743495157275,
+      "loss": 3.7794,
+      "step": 669
+    },
+    {
+      "epoch": 0.04661031688058715,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0019985622962182046,
+      "loss": 3.3294,
+      "step": 670
+    },
+    {
+      "epoch": 0.046679884517722356,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.001998550192218722,
+      "loss": 3.1803,
+      "step": 671
+    },
+    {
+      "epoch": 0.04674945215485756,
+      "grad_norm": 1.6953125,
+      "learning_rate": 0.001998538037517893,
+      "loss": 3.0911,
+      "step": 672
+    },
+    {
+      "epoch": 0.046819019791992766,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0019985258321163356,
+      "loss": 2.7282,
+      "step": 673
+    },
+    {
+      "epoch": 0.04688858742912797,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.00199851357601467,
+      "loss": 2.9728,
+      "step": 674
+    },
+    {
+      "epoch": 0.04695815506626318,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.001998501269213517,
+      "loss": 3.3479,
+      "step": 675
+    },
+    {
+      "epoch": 0.04702772270339838,
+      "grad_norm": 1.515625,
+      "learning_rate": 0.001998488911713503,
+      "loss": 3.2668,
+      "step": 676
+    },
+    {
+      "epoch": 0.04709729034053358,
+      "grad_norm": 1.7109375,
+      "learning_rate": 0.0019984765035152546,
+      "loss": 3.5445,
+      "step": 677
+    },
+    {
+      "epoch": 0.047166857977668786,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.001998464044619402,
+      "loss": 3.2349,
+      "step": 678
+    },
+    {
+      "epoch": 0.04723642561480399,
+      "grad_norm": 2.515625,
+      "learning_rate": 0.0019984515350265778,
+      "loss": 3.1874,
+      "step": 679
+    },
+    {
+      "epoch": 0.047305993251939196,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0019984389747374175,
+      "loss": 3.4472,
+      "step": 680
+    },
+    {
+      "epoch": 0.0473755608890744,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0019984263637525587,
+      "loss": 3.7444,
+      "step": 681
+    },
+    {
+      "epoch": 0.04744512852620961,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.001998413702072641,
+      "loss": 3.2759,
+      "step": 682
+    },
+    {
+      "epoch": 0.04751469616334481,
+      "grad_norm": 1.6015625,
+      "learning_rate": 0.0019984009896983086,
+      "loss": 3.2497,
+      "step": 683
+    },
+    {
+      "epoch": 0.04758426380048002,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0019983882266302057,
+      "loss": 3.6491,
+      "step": 684
+    },
+    {
+      "epoch": 0.04765383143761522,
+      "grad_norm": 1.25,
+      "learning_rate": 0.001998375412868981,
+      "loss": 3.5994,
+      "step": 685
+    },
+    {
+      "epoch": 0.04772339907475043,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.001998362548415285,
+      "loss": 3.3485,
+      "step": 686
+    },
+    {
+      "epoch": 0.04779296671188563,
+      "grad_norm": 1.7578125,
+      "learning_rate": 0.001998349633269771,
+      "loss": 3.5639,
+      "step": 687
+    },
+    {
+      "epoch": 0.04786253434902084,
+      "grad_norm": 1.7265625,
+      "learning_rate": 0.0019983366674330948,
+      "loss": 3.4652,
+      "step": 688
+    },
+    {
+      "epoch": 0.047932101986156044,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0019983236509059144,
+      "loss": 3.4984,
+      "step": 689
+    },
+    {
+      "epoch": 0.04800166962329124,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.001998310583688891,
+      "loss": 3.1597,
+      "step": 690
+    },
+    {
+      "epoch": 0.04807123726042645,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.001998297465782688,
+      "loss": 3.3416,
+      "step": 691
+    },
+    {
+      "epoch": 0.04814080489756165,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.0019982842971879716,
+      "loss": 3.7584,
+      "step": 692
+    },
+    {
+      "epoch": 0.04821037253469686,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0019982710779054102,
+      "loss": 3.1876,
+      "step": 693
+    },
+    {
+      "epoch": 0.04827994017183206,
+      "grad_norm": 2.140625,
+      "learning_rate": 0.001998257807935675,
+      "loss": 3.5906,
+      "step": 694
+    },
+    {
+      "epoch": 0.04834950780896727,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.00199824448727944,
+      "loss": 3.512,
+      "step": 695
+    },
+    {
+      "epoch": 0.04841907544610247,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0019982311159373817,
+      "loss": 3.5001,
+      "step": 696
+    },
+    {
+      "epoch": 0.04848864308323768,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0019982176939101785,
+      "loss": 3.4267,
+      "step": 697
+    },
+    {
+      "epoch": 0.048558210720372884,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.001998204221198512,
+      "loss": 3.5476,
+      "step": 698
+    },
+    {
+      "epoch": 0.04862777835750809,
+      "grad_norm": 1.125,
+      "learning_rate": 0.001998190697803067,
+      "loss": 3.7046,
+      "step": 699
+    },
+    {
+      "epoch": 0.048697345994643294,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0019981771237245296,
+      "loss": 3.4709,
+      "step": 700
+    },
+    {
+      "epoch": 0.0487669136317785,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.0019981634989635886,
+      "loss": 3.6087,
+      "step": 701
+    },
+    {
+      "epoch": 0.0488364812689137,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0019981498235209366,
+      "loss": 3.3568,
+      "step": 702
+    },
+    {
+      "epoch": 0.0489060489060489,
+      "grad_norm": 2.09375,
+      "learning_rate": 0.0019981360973972675,
+      "loss": 3.1876,
+      "step": 703
+    },
+    {
+      "epoch": 0.04897561654318411,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.0019981223205932782,
+      "loss": 3.2481,
+      "step": 704
+    },
+    {
+      "epoch": 0.049045184180319314,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0019981084931096687,
+      "loss": 3.5898,
+      "step": 705
+    },
+    {
+      "epoch": 0.04911475181745452,
+      "grad_norm": 1.8828125,
+      "learning_rate": 0.0019980946149471403,
+      "loss": 3.5733,
+      "step": 706
+    },
+    {
+      "epoch": 0.049184319454589724,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.001998080686106399,
+      "loss": 3.2213,
+      "step": 707
+    },
+    {
+      "epoch": 0.04925388709172493,
+      "grad_norm": 1.7734375,
+      "learning_rate": 0.00199806670658815,
+      "loss": 2.8544,
+      "step": 708
+    },
+    {
+      "epoch": 0.049323454728860135,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.001998052676393105,
+      "loss": 3.8152,
+      "step": 709
+    },
+    {
+      "epoch": 0.04939302236599534,
+      "grad_norm": 1.6796875,
+      "learning_rate": 0.0019980385955219756,
+      "loss": 3.4783,
+      "step": 710
+    },
+    {
+      "epoch": 0.049462590003130545,
+      "grad_norm": 1.921875,
+      "learning_rate": 0.0019980244639754767,
+      "loss": 3.3807,
+      "step": 711
+    },
+    {
+      "epoch": 0.04953215764026575,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0019980102817543258,
+      "loss": 3.7659,
+      "step": 712
+    },
+    {
+      "epoch": 0.049601725277400956,
+      "grad_norm": 1.6953125,
+      "learning_rate": 0.001997996048859243,
+      "loss": 3.4794,
+      "step": 713
+    },
+    {
+      "epoch": 0.04967129291453616,
+      "grad_norm": 1.6015625,
+      "learning_rate": 0.0019979817652909515,
+      "loss": 3.496,
+      "step": 714
+    },
+    {
+      "epoch": 0.04974086055167136,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0019979674310501763,
+      "loss": 3.189,
+      "step": 715
+    },
+    {
+      "epoch": 0.049810428188806564,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0019979530461376447,
+      "loss": 3.2305,
+      "step": 716
+    },
+    {
+      "epoch": 0.04987999582594177,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.001997938610554087,
+      "loss": 3.175,
+      "step": 717
+    },
+    {
+      "epoch": 0.049949563463076975,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0019979241243002375,
+      "loss": 3.3976,
+      "step": 718
+    },
+    {
+      "epoch": 0.05001913110021218,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0019979095873768307,
+      "loss": 3.2898,
+      "step": 719
+    },
+    {
+      "epoch": 0.050088698737347385,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.0019978949997846046,
+      "loss": 3.3068,
+      "step": 720
+    },
+    {
+      "epoch": 0.05015826637448259,
+      "grad_norm": 1.6484375,
+      "learning_rate": 0.0019978803615243006,
+      "loss": 2.9773,
+      "step": 721
+    },
+    {
+      "epoch": 0.050227834011617796,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0019978656725966612,
+      "loss": 3.5996,
+      "step": 722
+    },
+    {
+      "epoch": 0.050297401648753,
+      "grad_norm": 1.6875,
+      "learning_rate": 0.0019978509330024325,
+      "loss": 2.9751,
+      "step": 723
+    },
+    {
+      "epoch": 0.050366969285888206,
+      "grad_norm": 1.6015625,
+      "learning_rate": 0.0019978361427423633,
+      "loss": 3.2896,
+      "step": 724
+    },
+    {
+      "epoch": 0.05043653692302341,
+      "grad_norm": 1.6875,
+      "learning_rate": 0.001997821301817204,
+      "loss": 3.4558,
+      "step": 725
+    },
+    {
+      "epoch": 0.05050610456015862,
+      "grad_norm": 1.6796875,
+      "learning_rate": 0.0019978064102277085,
+      "loss": 3.5634,
+      "step": 726
+    },
+    {
+      "epoch": 0.05057567219729382,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0019977914679746326,
+      "loss": 3.643,
+      "step": 727
+    },
+    {
+      "epoch": 0.05064523983442902,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0019977764750587356,
+      "loss": 3.5794,
+      "step": 728
+    },
+    {
+      "epoch": 0.050714807471564226,
+      "grad_norm": 1.6875,
+      "learning_rate": 0.001997761431480778,
+      "loss": 3.5215,
+      "step": 729
+    },
+    {
+      "epoch": 0.05078437510869943,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0019977463372415237,
+      "loss": 3.1712,
+      "step": 730
+    },
+    {
+      "epoch": 0.050853942745834636,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00199773119234174,
+      "loss": 3.6297,
+      "step": 731
+    },
+    {
+      "epoch": 0.05092351038296984,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.001997715996782195,
+      "loss": 3.3124,
+      "step": 732
+    },
+    {
+      "epoch": 0.05099307802010505,
+      "grad_norm": 1.5390625,
+      "learning_rate": 0.0019977007505636605,
+      "loss": 3.1167,
+      "step": 733
+    },
+    {
+      "epoch": 0.05106264565724025,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0019976854536869113,
+      "loss": 3.2124,
+      "step": 734
+    },
+    {
+      "epoch": 0.05113221329437546,
+      "grad_norm": 1.5625,
+      "learning_rate": 0.001997670106152723,
+      "loss": 3.2263,
+      "step": 735
+    },
+    {
+      "epoch": 0.05120178093151066,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.001997654707961875,
+      "loss": 3.3927,
+      "step": 736
+    },
+    {
+      "epoch": 0.05127134856864587,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0019976392591151497,
+      "loss": 3.4537,
+      "step": 737
+    },
+    {
+      "epoch": 0.05134091620578107,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0019976237596133315,
+      "loss": 3.4768,
+      "step": 738
+    },
+    {
+      "epoch": 0.05141048384291628,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0019976082094572073,
+      "loss": 3.4063,
+      "step": 739
+    },
+    {
+      "epoch": 0.05148005148005148,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.0019975926086475662,
+      "loss": 3.5262,
+      "step": 740
+    },
+    {
+      "epoch": 0.05154961911718668,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.001997576957185201,
+      "loss": 3.1927,
+      "step": 741
+    },
+    {
+      "epoch": 0.05161918675432189,
+      "grad_norm": 1.9453125,
+      "learning_rate": 0.0019975612550709055,
+      "loss": 3.4512,
+      "step": 742
+    },
+    {
+      "epoch": 0.05168875439145709,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.001997545502305478,
+      "loss": 3.3019,
+      "step": 743
+    },
+    {
+      "epoch": 0.0517583220285923,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.001997529698889718,
+      "loss": 2.9856,
+      "step": 744
+    },
+    {
+      "epoch": 0.0518278896657275,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0019975138448244272,
+      "loss": 2.9967,
+      "step": 745
+    },
+    {
+      "epoch": 0.05189745730286271,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.001997497940110412,
+      "loss": 3.3576,
+      "step": 746
+    },
+    {
+      "epoch": 0.05196702493999791,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0019974819847484787,
+      "loss": 3.4691,
+      "step": 747
+    },
+    {
+      "epoch": 0.05203659257713312,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.001997465978739438,
+      "loss": 3.7629,
+      "step": 748
+    },
+    {
+      "epoch": 0.052106160214268324,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0019974499220841023,
+      "loss": 3.1283,
+      "step": 749
+    },
+    {
+      "epoch": 0.05217572785140353,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.0019974338147832876,
+      "loss": 3.7169,
+      "step": 750
+    },
+    {
+      "epoch": 0.052245295488538734,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0019974176568378107,
+      "loss": 3.3305,
+      "step": 751
+    },
+    {
+      "epoch": 0.05231486312567394,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.001997401448248493,
+      "loss": 2.918,
+      "step": 752
+    },
+    {
+      "epoch": 0.05238443076280914,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.0019973851890161564,
+      "loss": 3.1841,
+      "step": 753
+    },
+    {
+      "epoch": 0.05245399839994434,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.001997368879141628,
+      "loss": 3.4353,
+      "step": 754
+    },
+    {
+      "epoch": 0.05252356603707955,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0019973525186257344,
+      "loss": 3.779,
+      "step": 755
+    },
+    {
+      "epoch": 0.052593133674214754,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0019973361074693066,
+      "loss": 3.8655,
+      "step": 756
+    },
+    {
+      "epoch": 0.05266270131134996,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.001997319645673179,
+      "loss": 3.0859,
+      "step": 757
+    },
+    {
+      "epoch": 0.052732268948485164,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.001997303133238186,
+      "loss": 3.3803,
+      "step": 758
+    },
+    {
+      "epoch": 0.05280183658562037,
+      "grad_norm": 1.25,
+      "learning_rate": 0.001997286570165167,
+      "loss": 3.1753,
+      "step": 759
+    },
+    {
+      "epoch": 0.052871404222755575,
+      "grad_norm": 1.515625,
+      "learning_rate": 0.0019972699564549624,
+      "loss": 3.5117,
+      "step": 760
+    },
+    {
+      "epoch": 0.05294097185989078,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0019972532921084165,
+      "loss": 3.3453,
+      "step": 761
+    },
+    {
+      "epoch": 0.053010539497025985,
+      "grad_norm": 1.6953125,
+      "learning_rate": 0.001997236577126375,
+      "loss": 3.1444,
+      "step": 762
+    },
+    {
+      "epoch": 0.05308010713416119,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.001997219811509686,
+      "loss": 2.9711,
+      "step": 763
+    },
+    {
+      "epoch": 0.053149674771296396,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0019972029952592014,
+      "loss": 3.7106,
+      "step": 764
+    },
+    {
+      "epoch": 0.0532192424084316,
+      "grad_norm": 1.5234375,
+      "learning_rate": 0.0019971861283757755,
+      "loss": 3.5073,
+      "step": 765
+    },
+    {
+      "epoch": 0.0532888100455668,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0019971692108602637,
+      "loss": 3.1031,
+      "step": 766
+    },
+    {
+      "epoch": 0.053358377682702005,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.001997152242713526,
+      "loss": 3.2373,
+      "step": 767
+    },
+    {
+      "epoch": 0.05342794531983721,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0019971352239364225,
+      "loss": 3.1924,
+      "step": 768
+    },
+    {
+      "epoch": 0.053497512956972415,
+      "grad_norm": 1.125,
+      "learning_rate": 0.001997118154529819,
+      "loss": 3.5255,
+      "step": 769
+    },
+    {
+      "epoch": 0.05356708059410762,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.001997101034494581,
+      "loss": 3.5165,
+      "step": 770
+    },
+    {
+      "epoch": 0.053636648231242826,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.001997083863831579,
+      "loss": 3.2297,
+      "step": 771
+    },
+    {
+      "epoch": 0.05370621586837803,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.0019970666425416835,
+      "loss": 3.3447,
+      "step": 772
+    },
+    {
+      "epoch": 0.053775783505513236,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0019970493706257695,
+      "loss": 3.6247,
+      "step": 773
+    },
+    {
+      "epoch": 0.05384535114264844,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.001997032048084714,
+      "loss": 3.2659,
+      "step": 774
+    },
+    {
+      "epoch": 0.05391491877978365,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0019970146749193965,
+      "loss": 3.3362,
+      "step": 775
+    },
+    {
+      "epoch": 0.05398448641691885,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0019969972511306995,
+      "loss": 3.8919,
+      "step": 776
+    },
+    {
+      "epoch": 0.05405405405405406,
+      "grad_norm": 1.6640625,
+      "learning_rate": 0.0019969797767195067,
+      "loss": 3.1955,
+      "step": 777
+    },
+    {
+      "epoch": 0.054123621691189255,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.0019969622516867063,
+      "loss": 3.4623,
+      "step": 778
+    },
+    {
+      "epoch": 0.05419318932832446,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.001996944676033188,
+      "loss": 3.5068,
+      "step": 779
+    },
+    {
+      "epoch": 0.054262756965459666,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0019969270497598437,
+      "loss": 3.8145,
+      "step": 780
+    },
+    {
+      "epoch": 0.05433232460259487,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.001996909372867569,
+      "loss": 3.3767,
+      "step": 781
+    },
+    {
+      "epoch": 0.054401892239730076,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.001996891645357261,
+      "loss": 3.5851,
+      "step": 782
+    },
+    {
+      "epoch": 0.05447145987686528,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0019968738672298198,
+      "loss": 3.4255,
+      "step": 783
+    },
+    {
+      "epoch": 0.05454102751400049,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0019968560384861487,
+      "loss": 3.5207,
+      "step": 784
+    },
+    {
+      "epoch": 0.05461059515113569,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.001996838159127152,
+      "loss": 3.5369,
+      "step": 785
+    },
+    {
+      "epoch": 0.0546801627882709,
+      "grad_norm": 1.5703125,
+      "learning_rate": 0.0019968202291537384,
+      "loss": 3.1211,
+      "step": 786
+    },
+    {
+      "epoch": 0.0547497304254061,
+      "grad_norm": 1.65625,
+      "learning_rate": 0.0019968022485668175,
+      "loss": 3.6356,
+      "step": 787
+    },
+    {
+      "epoch": 0.05481929806254131,
+      "grad_norm": 2.125,
+      "learning_rate": 0.0019967842173673027,
+      "loss": 3.6714,
+      "step": 788
+    },
+    {
+      "epoch": 0.05488886569967651,
+      "grad_norm": 1.5625,
+      "learning_rate": 0.00199676613555611,
+      "loss": 3.1509,
+      "step": 789
+    },
+    {
+      "epoch": 0.05495843333681172,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.0019967480031341566,
+      "loss": 3.2795,
+      "step": 790
+    },
+    {
+      "epoch": 0.05502800097394692,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0019967298201023637,
+      "loss": 3.2036,
+      "step": 791
+    },
+    {
+      "epoch": 0.05509756861108212,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0019967115864616544,
+      "loss": 3.2689,
+      "step": 792
+    },
+    {
+      "epoch": 0.05516713624821733,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0019966933022129542,
+      "loss": 3.3734,
+      "step": 793
+    },
+    {
+      "epoch": 0.05523670388535253,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.001996674967357192,
+      "loss": 2.7514,
+      "step": 794
+    },
+    {
+      "epoch": 0.05530627152248774,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.001996656581895299,
+      "loss": 3.4004,
+      "step": 795
+    },
+    {
+      "epoch": 0.05537583915962294,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0019966381458282082,
+      "loss": 3.0403,
+      "step": 796
+    },
+    {
+      "epoch": 0.05544540679675815,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0019966196591568557,
+      "loss": 3.2993,
+      "step": 797
+    },
+    {
+      "epoch": 0.05551497443389335,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.00199660112188218,
+      "loss": 3.3328,
+      "step": 798
+    },
+    {
+      "epoch": 0.05558454207102856,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.001996582534005123,
+      "loss": 3.5094,
+      "step": 799
+    },
+    {
+      "epoch": 0.055654109708163764,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0019965638955266275,
+      "loss": 3.4173,
+      "step": 800
+    },
+    {
+      "epoch": 0.05572367734529897,
+      "grad_norm": 1.75,
+      "learning_rate": 0.0019965452064476404,
+      "loss": 3.6744,
+      "step": 801
+    },
+    {
+      "epoch": 0.055793244982434174,
+      "grad_norm": 1.7109375,
+      "learning_rate": 0.0019965264667691114,
+      "loss": 3.2192,
+      "step": 802
+    },
+    {
+      "epoch": 0.05586281261956938,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.0019965076764919907,
+      "loss": 3.1377,
+      "step": 803
+    },
+    {
+      "epoch": 0.05593238025670458,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.001996488835617233,
+      "loss": 3.2694,
+      "step": 804
+    },
+    {
+      "epoch": 0.05600194789383978,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0019964699441457952,
+      "loss": 3.5418,
+      "step": 805
+    },
+    {
+      "epoch": 0.05607151553097499,
+      "grad_norm": 1.25,
+      "learning_rate": 0.001996451002078636,
+      "loss": 3.1088,
+      "step": 806
+    },
+    {
+      "epoch": 0.056141083168110194,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0019964320094167176,
+      "loss": 3.3395,
+      "step": 807
+    },
+    {
+      "epoch": 0.0562106508052454,
+      "grad_norm": 1.9296875,
+      "learning_rate": 0.001996412966161004,
+      "loss": 3.2299,
+      "step": 808
+    },
+    {
+      "epoch": 0.056280218442380604,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0019963938723124622,
+      "loss": 3.4256,
+      "step": 809
+    },
+    {
+      "epoch": 0.05634978607951581,
+      "grad_norm": 1.25,
+      "learning_rate": 0.001996374727872062,
+      "loss": 3.3638,
+      "step": 810
+    },
+    {
+      "epoch": 0.056419353716651015,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.001996355532840775,
+      "loss": 3.8451,
+      "step": 811
+    },
+    {
+      "epoch": 0.05648892135378622,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.001996336287219576,
+      "loss": 3.2477,
+      "step": 812
+    },
+    {
+      "epoch": 0.056558488990921425,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.0019963169910094426,
+      "loss": 3.607,
+      "step": 813
+    },
+    {
+      "epoch": 0.05662805662805663,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0019962976442113537,
+      "loss": 3.2908,
+      "step": 814
+    },
+    {
+      "epoch": 0.056697624265191836,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0019962782468262927,
+      "loss": 3.3164,
+      "step": 815
+    },
+    {
+      "epoch": 0.056767191902327034,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0019962587988552436,
+      "loss": 3.2608,
+      "step": 816
+    },
+    {
+      "epoch": 0.05683675953946224,
+      "grad_norm": 1.625,
+      "learning_rate": 0.0019962393002991943,
+      "loss": 2.8271,
+      "step": 817
+    },
+    {
+      "epoch": 0.056906327176597445,
+      "grad_norm": 1.7109375,
+      "learning_rate": 0.0019962197511591345,
+      "loss": 3.2975,
+      "step": 818
+    },
+    {
+      "epoch": 0.05697589481373265,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.0019962001514360573,
+      "loss": 3.4618,
+      "step": 819
+    },
+    {
+      "epoch": 0.057045462450867855,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0019961805011309577,
+      "loss": 3.2017,
+      "step": 820
+    },
+    {
+      "epoch": 0.05711503008800306,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.0019961608002448334,
+      "loss": 3.4054,
+      "step": 821
+    },
+    {
+      "epoch": 0.057184597725138266,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0019961410487786845,
+      "loss": 3.5605,
+      "step": 822
+    },
+    {
+      "epoch": 0.05725416536227347,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0019961212467335143,
+      "loss": 3.4814,
+      "step": 823
+    },
+    {
+      "epoch": 0.057323732999408676,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0019961013941103274,
+      "loss": 3.1413,
+      "step": 824
+    },
+    {
+      "epoch": 0.05739330063654388,
+      "grad_norm": 1.9921875,
+      "learning_rate": 0.001996081490910133,
+      "loss": 3.4585,
+      "step": 825
+    },
+    {
+      "epoch": 0.05746286827367909,
+      "grad_norm": 2.84375,
+      "learning_rate": 0.001996061537133941,
+      "loss": 3.4995,
+      "step": 826
+    },
+    {
+      "epoch": 0.05753243591081429,
+      "grad_norm": 1.5703125,
+      "learning_rate": 0.0019960415327827646,
+      "loss": 3.4805,
+      "step": 827
+    },
+    {
+      "epoch": 0.0576020035479495,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0019960214778576195,
+      "loss": 3.0566,
+      "step": 828
+    },
+    {
+      "epoch": 0.057671571185084695,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.001996001372359524,
+      "loss": 3.0971,
+      "step": 829
+    },
+    {
+      "epoch": 0.0577411388222199,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0019959812162894997,
+      "loss": 3.596,
+      "step": 830
+    },
+    {
+      "epoch": 0.057810706459355106,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.001995961009648569,
+      "loss": 3.5438,
+      "step": 831
+    },
+    {
+      "epoch": 0.05788027409649031,
+      "grad_norm": 0.75,
+      "learning_rate": 0.001995940752437758,
+      "loss": 3.7194,
+      "step": 832
+    },
+    {
+      "epoch": 0.057949841733625516,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0019959204446580955,
+      "loss": 3.187,
+      "step": 833
+    },
+    {
+      "epoch": 0.05801940937076072,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.001995900086310613,
+      "loss": 3.2581,
+      "step": 834
+    },
+    {
+      "epoch": 0.05808897700789593,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0019958796773963433,
+      "loss": 3.3424,
+      "step": 835
+    },
+    {
+      "epoch": 0.05815854464503113,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0019958592179163234,
+      "loss": 3.5114,
+      "step": 836
+    },
+    {
+      "epoch": 0.05822811228216634,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0019958387078715923,
+      "loss": 3.2554,
+      "step": 837
+    },
+    {
+      "epoch": 0.05829767991930154,
+      "grad_norm": 1.6796875,
+      "learning_rate": 0.0019958181472631907,
+      "loss": 3.3679,
+      "step": 838
+    },
+    {
+      "epoch": 0.05836724755643675,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.001995797536092163,
+      "loss": 3.3486,
+      "step": 839
+    },
+    {
+      "epoch": 0.05843681519357195,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.001995776874359555,
+      "loss": 3.0417,
+      "step": 840
+    },
+    {
+      "epoch": 0.05850638283070716,
+      "grad_norm": 1.5546875,
+      "learning_rate": 0.001995756162066417,
+      "loss": 3.4033,
+      "step": 841
+    },
+    {
+      "epoch": 0.05857595046784236,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0019957353992138003,
+      "loss": 3.1399,
+      "step": 842
+    },
+    {
+      "epoch": 0.05864551810497756,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0019957145858027587,
+      "loss": 3.3802,
+      "step": 843
+    },
+    {
+      "epoch": 0.05871508574211277,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.001995693721834349,
+      "loss": 3.0986,
+      "step": 844
+    },
+    {
+      "epoch": 0.05878465337924797,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.001995672807309631,
+      "loss": 3.2547,
+      "step": 845
+    },
+    {
+      "epoch": 0.05885422101638318,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.001995651842229666,
+      "loss": 3.1011,
+      "step": 846
+    },
+    {
+      "epoch": 0.05892378865351838,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0019956308265955194,
+      "loss": 3.4335,
+      "step": 847
+    },
+    {
+      "epoch": 0.05899335629065359,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0019956097604082574,
+      "loss": 2.7745,
+      "step": 848
+    },
+    {
+      "epoch": 0.059062923927788794,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.00199558864366895,
+      "loss": 3.4019,
+      "step": 849
+    },
+    {
+      "epoch": 0.059132491564924,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0019955674763786698,
+      "loss": 3.3095,
+      "step": 850
+    },
+    {
+      "epoch": 0.059202059202059204,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.001995546258538491,
+      "loss": 3.3396,
+      "step": 851
+    },
+    {
+      "epoch": 0.05927162683919441,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.001995524990149491,
+      "loss": 3.4062,
+      "step": 852
+    },
+    {
+      "epoch": 0.059341194476329615,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.00199550367121275,
+      "loss": 3.2541,
+      "step": 853
+    },
+    {
+      "epoch": 0.05941076211346481,
+      "grad_norm": 2.4375,
+      "learning_rate": 0.00199548230172935,
+      "loss": 3.0319,
+      "step": 854
+    },
+    {
+      "epoch": 0.05948032975060002,
+      "grad_norm": 1.8359375,
+      "learning_rate": 0.001995460881700377,
+      "loss": 3.1415,
+      "step": 855
+    },
+    {
+      "epoch": 0.05954989738773522,
+      "grad_norm": 1.375,
+      "learning_rate": 0.001995439411126917,
+      "loss": 3.4585,
+      "step": 856
+    },
+    {
+      "epoch": 0.05961946502487043,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0019954178900100615,
+      "loss": 3.2141,
+      "step": 857
+    },
+    {
+      "epoch": 0.059689032662005634,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.001995396318350903,
+      "loss": 3.3658,
+      "step": 858
+    },
+    {
+      "epoch": 0.05975860029914084,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0019953746961505364,
+      "loss": 3.5806,
+      "step": 859
+    },
+    {
+      "epoch": 0.059828167936276044,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.00199535302341006,
+      "loss": 3.4283,
+      "step": 860
+    },
+    {
+      "epoch": 0.05989773557341125,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0019953313001305735,
+      "loss": 3.2957,
+      "step": 861
+    },
+    {
+      "epoch": 0.059967303210546455,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.001995309526313181,
+      "loss": 3.1868,
+      "step": 862
+    },
+    {
+      "epoch": 0.06003687084768166,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.001995287701958987,
+      "loss": 3.5114,
+      "step": 863
+    },
+    {
+      "epoch": 0.060106438484816865,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.0019952658270691007,
+      "loss": 3.2758,
+      "step": 864
+    },
+    {
+      "epoch": 0.06017600612195207,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.001995243901644632,
+      "loss": 3.0768,
+      "step": 865
+    },
+    {
+      "epoch": 0.060245573759087276,
+      "grad_norm": 2.046875,
+      "learning_rate": 0.0019952219256866945,
+      "loss": 3.4132,
+      "step": 866
+    },
+    {
+      "epoch": 0.060315141396222474,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0019951998991964036,
+      "loss": 3.3159,
+      "step": 867
+    },
+    {
+      "epoch": 0.06038470903335768,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.001995177822174878,
+      "loss": 3.1059,
+      "step": 868
+    },
+    {
+      "epoch": 0.060454276670492885,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0019951556946232385,
+      "loss": 3.3609,
+      "step": 869
+    },
+    {
+      "epoch": 0.06052384430762809,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.001995133516542609,
+      "loss": 3.1245,
+      "step": 870
+    },
+    {
+      "epoch": 0.060593411944763295,
+      "grad_norm": 2.46875,
+      "learning_rate": 0.0019951112879341157,
+      "loss": 3.4607,
+      "step": 871
+    },
+    {
+      "epoch": 0.0606629795818985,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0019950890087988868,
+      "loss": 3.9611,
+      "step": 872
+    },
+    {
+      "epoch": 0.060732547219033706,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0019950666791380533,
+      "loss": 3.0612,
+      "step": 873
+    },
+    {
+      "epoch": 0.06080211485616891,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0019950442989527493,
+      "loss": 3.4439,
+      "step": 874
+    },
+    {
+      "epoch": 0.060871682493304116,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0019950218682441116,
+      "loss": 3.4221,
+      "step": 875
+    },
+    {
+      "epoch": 0.06094125013043932,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0019949993870132785,
+      "loss": 3.5011,
+      "step": 876
+    },
+    {
+      "epoch": 0.06101081776757453,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.001994976855261392,
+      "loss": 3.1799,
+      "step": 877
+    },
+    {
+      "epoch": 0.06108038540470973,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0019949542729895955,
+      "loss": 3.4673,
+      "step": 878
+    },
+    {
+      "epoch": 0.06114995304184494,
+      "grad_norm": 1.6015625,
+      "learning_rate": 0.001994931640199036,
+      "loss": 3.3861,
+      "step": 879
+    },
+    {
+      "epoch": 0.061219520678980135,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0019949089568908627,
+      "loss": 3.7525,
+      "step": 880
+    },
+    {
+      "epoch": 0.06128908831611534,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.001994886223066227,
+      "loss": 3.5964,
+      "step": 881
+    },
+    {
+      "epoch": 0.061358655953250546,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.001994863438726284,
+      "loss": 3.6425,
+      "step": 882
+    },
+    {
+      "epoch": 0.06142822359038575,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0019948406038721896,
+      "loss": 3.5512,
+      "step": 883
+    },
+    {
+      "epoch": 0.061497791227520957,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.001994817718505104,
+      "loss": 3.445,
+      "step": 884
+    },
+    {
+      "epoch": 0.06156735886465616,
+      "grad_norm": 1.6875,
+      "learning_rate": 0.001994794782626189,
+      "loss": 2.9935,
+      "step": 885
+    },
+    {
+      "epoch": 0.06163692650179137,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0019947717962366085,
+      "loss": 3.5828,
+      "step": 886
+    },
+    {
+      "epoch": 0.06170649413892657,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.001994748759337531,
+      "loss": 3.1945,
+      "step": 887
+    },
+    {
+      "epoch": 0.06177606177606178,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.001994725671930125,
+      "loss": 3.4912,
+      "step": 888
+    },
+    {
+      "epoch": 0.06184562941319698,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.001994702534015563,
+      "loss": 3.0837,
+      "step": 889
+    },
+    {
+      "epoch": 0.06191519705033219,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00199467934559502,
+      "loss": 3.4557,
+      "step": 890
+    },
+    {
+      "epoch": 0.06198476468746739,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.001994656106669674,
+      "loss": 3.282,
+      "step": 891
+    },
+    {
+      "epoch": 0.06205433232460259,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0019946328172407036,
+      "loss": 3.8112,
+      "step": 892
+    },
+    {
+      "epoch": 0.0621238999617378,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0019946094773092924,
+      "loss": 3.429,
+      "step": 893
+    },
+    {
+      "epoch": 0.062193467598873,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.001994586086876625,
+      "loss": 3.5757,
+      "step": 894
+    },
+    {
+      "epoch": 0.06226303523600821,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0019945626459438896,
+      "loss": 3.4571,
+      "step": 895
+    },
+    {
+      "epoch": 0.06233260287314341,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0019945391545122754,
+      "loss": 2.874,
+      "step": 896
+    },
+    {
+      "epoch": 0.06240217051027862,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.001994515612582976,
+      "loss": 2.7974,
+      "step": 897
+    },
+    {
+      "epoch": 0.06247173814741382,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0019944920201571867,
+      "loss": 3.2491,
+      "step": 898
+    },
+    {
+      "epoch": 0.06254130578454903,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0019944683772361053,
+      "loss": 3.3027,
+      "step": 899
+    },
+    {
+      "epoch": 0.06261087342168423,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.001994444683820932,
+      "loss": 3.3587,
+      "step": 900
+    },
+    {
+      "epoch": 0.06268044105881944,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.00199442093991287,
+      "loss": 3.7572,
+      "step": 901
+    },
+    {
+      "epoch": 0.06275000869595464,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.001994397145513125,
+      "loss": 3.4524,
+      "step": 902
+    },
+    {
+      "epoch": 0.06281957633308985,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0019943733006229053,
+      "loss": 3.2251,
+      "step": 903
+    },
+    {
+      "epoch": 0.06288914397022505,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.001994349405243421,
+      "loss": 3.4437,
+      "step": 904
+    },
+    {
+      "epoch": 0.06295871160736026,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.001994325459375886,
+      "loss": 3.3404,
+      "step": 905
+    },
+    {
+      "epoch": 0.06302827924449546,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.001994301463021516,
+      "loss": 3.4911,
+      "step": 906
+    },
+    {
+      "epoch": 0.06309784688163067,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.001994277416181529,
+      "loss": 3.2754,
+      "step": 907
+    },
+    {
+      "epoch": 0.06316741451876587,
+      "grad_norm": 1.5625,
+      "learning_rate": 0.001994253318857147,
+      "loss": 3.1639,
+      "step": 908
+    },
+    {
+      "epoch": 0.06323698215590108,
+      "grad_norm": 1.6015625,
+      "learning_rate": 0.001994229171049592,
+      "loss": 3.0842,
+      "step": 909
+    },
+    {
+      "epoch": 0.06330654979303628,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.001994204972760092,
+      "loss": 3.7311,
+      "step": 910
+    },
+    {
+      "epoch": 0.06337611743017148,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.001994180723989874,
+      "loss": 3.3897,
+      "step": 911
+    },
+    {
+      "epoch": 0.06344568506730669,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.00199415642474017,
+      "loss": 3.3411,
+      "step": 912
+    },
+    {
+      "epoch": 0.06351525270444189,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0019941320750122135,
+      "loss": 3.3479,
+      "step": 913
+    },
+    {
+      "epoch": 0.0635848203415771,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.0019941076748072415,
+      "loss": 3.6582,
+      "step": 914
+    },
+    {
+      "epoch": 0.0636543879787123,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.001994083224126492,
+      "loss": 3.0862,
+      "step": 915
+    },
+    {
+      "epoch": 0.06372395561584751,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.001994058722971207,
+      "loss": 3.3082,
+      "step": 916
+    },
+    {
+      "epoch": 0.06379352325298271,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0019940341713426306,
+      "loss": 3.7296,
+      "step": 917
+    },
+    {
+      "epoch": 0.06386309089011792,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.001994009569242009,
+      "loss": 3.6124,
+      "step": 918
+    },
+    {
+      "epoch": 0.06393265852725312,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.001993984916670592,
+      "loss": 3.1775,
+      "step": 919
+    },
+    {
+      "epoch": 0.06400222616438833,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.001993960213629631,
+      "loss": 2.9769,
+      "step": 920
+    },
+    {
+      "epoch": 0.06407179380152353,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0019939354601203802,
+      "loss": 3.117,
+      "step": 921
+    },
+    {
+      "epoch": 0.06414136143865874,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0019939106561440963,
+      "loss": 3.3155,
+      "step": 922
+    },
+    {
+      "epoch": 0.06421092907579394,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0019938858017020393,
+      "loss": 3.7916,
+      "step": 923
+    },
+    {
+      "epoch": 0.06428049671292914,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0019938608967954704,
+      "loss": 3.5828,
+      "step": 924
+    },
+    {
+      "epoch": 0.06435006435006435,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.001993835941425655,
+      "loss": 3.4896,
+      "step": 925
+    },
+    {
+      "epoch": 0.06441963198719955,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00199381093559386,
+      "loss": 3.454,
+      "step": 926
+    },
+    {
+      "epoch": 0.06448919962433476,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0019937858793013545,
+      "loss": 3.5434,
+      "step": 927
+    },
+    {
+      "epoch": 0.06455876726146996,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.001993760772549411,
+      "loss": 3.5979,
+      "step": 928
+    },
+    {
+      "epoch": 0.06462833489860517,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0019937356153393046,
+      "loss": 3.6209,
+      "step": 929
+    },
+    {
+      "epoch": 0.06469790253574037,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0019937104076723127,
+      "loss": 3.1411,
+      "step": 930
+    },
+    {
+      "epoch": 0.06476747017287558,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0019936851495497144,
+      "loss": 3.1043,
+      "step": 931
+    },
+    {
+      "epoch": 0.06483703781001078,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.001993659840972793,
+      "loss": 3.3624,
+      "step": 932
+    },
+    {
+      "epoch": 0.06490660544714599,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0019936344819428335,
+      "loss": 3.3843,
+      "step": 933
+    },
+    {
+      "epoch": 0.06497617308428119,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.001993609072461123,
+      "loss": 3.1549,
+      "step": 934
+    },
+    {
+      "epoch": 0.0650457407214164,
+      "grad_norm": 1.25,
+      "learning_rate": 0.001993583612528952,
+      "loss": 3.5149,
+      "step": 935
+    },
+    {
+      "epoch": 0.0651153083585516,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0019935581021476136,
+      "loss": 3.5026,
+      "step": 936
+    },
+    {
+      "epoch": 0.0651848759956868,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.001993532541318402,
+      "loss": 3.2724,
+      "step": 937
+    },
+    {
+      "epoch": 0.06525444363282201,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.001993506930042616,
+      "loss": 3.2721,
+      "step": 938
+    },
+    {
+      "epoch": 0.06532401126995721,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.001993481268321556,
+      "loss": 3.0339,
+      "step": 939
+    },
+    {
+      "epoch": 0.06539357890709242,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0019934555561565244,
+      "loss": 3.6624,
+      "step": 940
+    },
+    {
+      "epoch": 0.06546314654422762,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0019934297935488275,
+      "loss": 3.2563,
+      "step": 941
+    },
+    {
+      "epoch": 0.06553271418136283,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0019934039804997724,
+      "loss": 3.1389,
+      "step": 942
+    },
+    {
+      "epoch": 0.06560228181849803,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.0019933781170106703,
+      "loss": 3.7684,
+      "step": 943
+    },
+    {
+      "epoch": 0.06567184945563324,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0019933522030828347,
+      "loss": 3.2817,
+      "step": 944
+    },
+    {
+      "epoch": 0.06574141709276844,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.0019933262387175814,
+      "loss": 3.4684,
+      "step": 945
+    },
+    {
+      "epoch": 0.06581098472990365,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.001993300223916228,
+      "loss": 3.2135,
+      "step": 946
+    },
+    {
+      "epoch": 0.06588055236703885,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0019932741586800957,
+      "loss": 3.4436,
+      "step": 947
+    },
+    {
+      "epoch": 0.06595012000417406,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0019932480430105083,
+      "loss": 3.4002,
+      "step": 948
+    },
+    {
+      "epoch": 0.06601968764130926,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0019932218769087916,
+      "loss": 3.8864,
+      "step": 949
+    },
+    {
+      "epoch": 0.06608925527844446,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.001993195660376274,
+      "loss": 2.9565,
+      "step": 950
+    },
+    {
+      "epoch": 0.06615882291557967,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.001993169393414287,
+      "loss": 3.2403,
+      "step": 951
+    },
+    {
+      "epoch": 0.06622839055271487,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.001993143076024164,
+      "loss": 3.5741,
+      "step": 952
+    },
+    {
+      "epoch": 0.06629795818985008,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.001993116708207242,
+      "loss": 3.2992,
+      "step": 953
+    },
+    {
+      "epoch": 0.06636752582698528,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.001993090289964859,
+      "loss": 3.3092,
+      "step": 954
+    },
+    {
+      "epoch": 0.0664370934641205,
+      "grad_norm": 1.859375,
+      "learning_rate": 0.0019930638212983564,
+      "loss": 3.2502,
+      "step": 955
+    },
+    {
+      "epoch": 0.06650666110125569,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0019930373022090785,
+      "loss": 3.2321,
+      "step": 956
+    },
+    {
+      "epoch": 0.0665762287383909,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0019930107326983715,
+      "loss": 3.2501,
+      "step": 957
+    },
+    {
+      "epoch": 0.0666457963755261,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0019929841127675845,
+      "loss": 3.5014,
+      "step": 958
+    },
+    {
+      "epoch": 0.06671536401266132,
+      "grad_norm": 0.875,
+      "learning_rate": 0.0019929574424180697,
+      "loss": 3.1929,
+      "step": 959
+    },
+    {
+      "epoch": 0.06678493164979651,
+      "grad_norm": 1.625,
+      "learning_rate": 0.0019929307216511806,
+      "loss": 3.5224,
+      "step": 960
+    },
+    {
+      "epoch": 0.06685449928693173,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0019929039504682743,
+      "loss": 3.7231,
+      "step": 961
+    },
+    {
+      "epoch": 0.06692406692406692,
+      "grad_norm": 1.4765625,
+      "learning_rate": 0.0019928771288707098,
+      "loss": 3.4419,
+      "step": 962
+    },
+    {
+      "epoch": 0.06699363456120212,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0019928502568598494,
+      "loss": 3.5337,
+      "step": 963
+    },
+    {
+      "epoch": 0.06706320219833733,
+      "grad_norm": 1.6875,
+      "learning_rate": 0.0019928233344370574,
+      "loss": 3.2093,
+      "step": 964
+    },
+    {
+      "epoch": 0.06713276983547253,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0019927963616037003,
+      "loss": 3.0913,
+      "step": 965
+    },
+    {
+      "epoch": 0.06720233747260775,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.001992769338361148,
+      "loss": 3.4042,
+      "step": 966
+    },
+    {
+      "epoch": 0.06727190510974294,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.001992742264710773,
+      "loss": 3.6867,
+      "step": 967
+    },
+    {
+      "epoch": 0.06734147274687816,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.0019927151406539494,
+      "loss": 3.4302,
+      "step": 968
+    },
+    {
+      "epoch": 0.06741104038401335,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0019926879661920547,
+      "loss": 3.4974,
+      "step": 969
+    },
+    {
+      "epoch": 0.06748060802114857,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0019926607413264684,
+      "loss": 3.2897,
+      "step": 970
+    },
+    {
+      "epoch": 0.06755017565828376,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0019926334660585734,
+      "loss": 3.2806,
+      "step": 971
+    },
+    {
+      "epoch": 0.06761974329541898,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0019926061403897537,
+      "loss": 3.3603,
+      "step": 972
+    },
+    {
+      "epoch": 0.06768931093255418,
+      "grad_norm": 1.125,
+      "learning_rate": 0.001992578764321398,
+      "loss": 3.0188,
+      "step": 973
+    },
+    {
+      "epoch": 0.06775887856968937,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.001992551337854895,
+      "loss": 3.3528,
+      "step": 974
+    },
+    {
+      "epoch": 0.06782844620682459,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0019925238609916377,
+      "loss": 3.1065,
+      "step": 975
+    },
+    {
+      "epoch": 0.06789801384395978,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0019924963337330224,
+      "loss": 3.5713,
+      "step": 976
+    },
+    {
+      "epoch": 0.067967581481095,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.001992468756080445,
+      "loss": 3.1457,
+      "step": 977
+    },
+    {
+      "epoch": 0.0680371491182302,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.001992441128035307,
+      "loss": 3.346,
+      "step": 978
+    },
+    {
+      "epoch": 0.0681067167553654,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0019924134495990105,
+      "loss": 3.6706,
+      "step": 979
+    },
+    {
+      "epoch": 0.0681762843925006,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0019923857207729614,
+      "loss": 3.5294,
+      "step": 980
+    },
+    {
+      "epoch": 0.06824585202963582,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0019923579415585674,
+      "loss": 3.4409,
+      "step": 981
+    },
+    {
+      "epoch": 0.06831541966677102,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0019923301119572387,
+      "loss": 3.3162,
+      "step": 982
+    },
+    {
+      "epoch": 0.06838498730390623,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0019923022319703887,
+      "loss": 3.1688,
+      "step": 983
+    },
+    {
+      "epoch": 0.06845455494104143,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.001992274301599433,
+      "loss": 3.2179,
+      "step": 984
+    },
+    {
+      "epoch": 0.06852412257817664,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.0019922463208457896,
+      "loss": 3.6096,
+      "step": 985
+    },
+    {
+      "epoch": 0.06859369021531184,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.0019922182897108794,
+      "loss": 3.4342,
+      "step": 986
+    },
+    {
+      "epoch": 0.06866325785244703,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.001992190208196126,
+      "loss": 2.9741,
+      "step": 987
+    },
+    {
+      "epoch": 0.06873282548958225,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0019921620763029544,
+      "loss": 3.58,
+      "step": 988
+    },
+    {
+      "epoch": 0.06880239312671745,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0019921338940327936,
+      "loss": 3.4243,
+      "step": 989
+    },
+    {
+      "epoch": 0.06887196076385266,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.001992105661387074,
+      "loss": 3.3534,
+      "step": 990
+    },
+    {
+      "epoch": 0.06894152840098786,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00199207737836723,
+      "loss": 3.1215,
+      "step": 991
+    },
+    {
+      "epoch": 0.06901109603812307,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0019920490449746972,
+      "loss": 3.4135,
+      "step": 992
+    },
+    {
+      "epoch": 0.06908066367525827,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.001992020661210914,
+      "loss": 3.516,
+      "step": 993
+    },
+    {
+      "epoch": 0.06915023131239348,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0019919922270773215,
+      "loss": 2.9574,
+      "step": 994
+    },
+    {
+      "epoch": 0.06921979894952868,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.001991963742575364,
+      "loss": 3.4767,
+      "step": 995
+    },
+    {
+      "epoch": 0.06928936658666389,
+      "grad_norm": 1.5625,
+      "learning_rate": 0.0019919352077064872,
+      "loss": 3.7818,
+      "step": 996
+    },
+    {
+      "epoch": 0.06935893422379909,
+      "grad_norm": 1.4765625,
+      "learning_rate": 0.0019919066224721406,
+      "loss": 3.3807,
+      "step": 997
+    },
+    {
+      "epoch": 0.0694285018609343,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0019918779868737754,
+      "loss": 3.2762,
+      "step": 998
+    },
+    {
+      "epoch": 0.0694980694980695,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0019918493009128454,
+      "loss": 3.0363,
+      "step": 999
+    },
+    {
+      "epoch": 0.0695676371352047,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0019918205645908073,
+      "loss": 3.3782,
+      "step": 1000
+    },
+    {
+      "epoch": 0.06963720477233991,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0019917917779091196,
+      "loss": 3.4582,
+      "step": 1001
+    },
+    {
+      "epoch": 0.0697067724094751,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0019917629408692447,
+      "loss": 3.3803,
+      "step": 1002
+    },
+    {
+      "epoch": 0.06977634004661032,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0019917340534726467,
+      "loss": 3.3236,
+      "step": 1003
+    },
+    {
+      "epoch": 0.06984590768374552,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0019917051157207918,
+      "loss": 3.3535,
+      "step": 1004
+    },
+    {
+      "epoch": 0.06991547532088073,
+      "grad_norm": 0.77734375,
+      "learning_rate": 0.00199167612761515,
+      "loss": 3.5928,
+      "step": 1005
+    },
+    {
+      "epoch": 0.06998504295801593,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0019916470891571925,
+      "loss": 3.3844,
+      "step": 1006
+    },
+    {
+      "epoch": 0.07005461059515114,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0019916180003483946,
+      "loss": 3.1268,
+      "step": 1007
+    },
+    {
+      "epoch": 0.07012417823228634,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0019915888611902323,
+      "loss": 3.4423,
+      "step": 1008
+    },
+    {
+      "epoch": 0.07019374586942155,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.001991559671684186,
+      "loss": 2.9498,
+      "step": 1009
+    },
+    {
+      "epoch": 0.07026331350655675,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.001991530431831737,
+      "loss": 3.3612,
+      "step": 1010
+    },
+    {
+      "epoch": 0.07033288114369196,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0019915011416343706,
+      "loss": 3.0226,
+      "step": 1011
+    },
+    {
+      "epoch": 0.07040244878082716,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.001991471801093574,
+      "loss": 3.5298,
+      "step": 1012
+    },
+    {
+      "epoch": 0.07047201641796236,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.001991442410210836,
+      "loss": 3.2802,
+      "step": 1013
+    },
+    {
+      "epoch": 0.07054158405509757,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0019914129689876502,
+      "loss": 3.4393,
+      "step": 1014
+    },
+    {
+      "epoch": 0.07061115169223277,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0019913834774255112,
+      "loss": 3.0369,
+      "step": 1015
+    },
+    {
+      "epoch": 0.07068071932936798,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.001991353935525916,
+      "loss": 3.4006,
+      "step": 1016
+    },
+    {
+      "epoch": 0.07075028696650318,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.001991324343290364,
+      "loss": 3.2334,
+      "step": 1017
+    },
+    {
+      "epoch": 0.07081985460363839,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.001991294700720359,
+      "loss": 3.5597,
+      "step": 1018
+    },
+    {
+      "epoch": 0.07088942224077359,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.001991265007817406,
+      "loss": 3.2408,
+      "step": 1019
+    },
+    {
+      "epoch": 0.0709589898779088,
+      "grad_norm": 1.546875,
+      "learning_rate": 0.001991235264583012,
+      "loss": 3.2134,
+      "step": 1020
+    },
+    {
+      "epoch": 0.071028557515044,
+      "grad_norm": 2.0625,
+      "learning_rate": 0.001991205471018687,
+      "loss": 3.423,
+      "step": 1021
+    },
+    {
+      "epoch": 0.07109812515217921,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0019911756271259445,
+      "loss": 3.1471,
+      "step": 1022
+    },
+    {
+      "epoch": 0.07116769278931441,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0019911457329062996,
+      "loss": 3.37,
+      "step": 1023
+    },
+    {
+      "epoch": 0.07123726042644962,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0019911157883612703,
+      "loss": 3.5813,
+      "step": 1024
+    },
+    {
+      "epoch": 0.07130682806358482,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0019910857934923765,
+      "loss": 3.6288,
+      "step": 1025
+    },
+    {
+      "epoch": 0.07137639570072002,
+      "grad_norm": 1.703125,
+      "learning_rate": 0.001991055748301142,
+      "loss": 3.3881,
+      "step": 1026
+    },
+    {
+      "epoch": 0.07144596333785523,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.0019910256527890914,
+      "loss": 3.1126,
+      "step": 1027
+    },
+    {
+      "epoch": 0.07151553097499043,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0019909955069577533,
+      "loss": 3.278,
+      "step": 1028
+    },
+    {
+      "epoch": 0.07158509861212564,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.001990965310808659,
+      "loss": 3.2712,
+      "step": 1029
+    },
+    {
+      "epoch": 0.07165466624926084,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0019909350643433402,
+      "loss": 3.1583,
+      "step": 1030
+    },
+    {
+      "epoch": 0.07172423388639605,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0019909047675633344,
+      "loss": 3.6229,
+      "step": 1031
+    },
+    {
+      "epoch": 0.07179380152353125,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0019908744204701783,
+      "loss": 3.1285,
+      "step": 1032
+    },
+    {
+      "epoch": 0.07186336916066646,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.0019908440230654136,
+      "loss": 3.3274,
+      "step": 1033
+    },
+    {
+      "epoch": 0.07193293679780166,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.001990813575350584,
+      "loss": 3.6442,
+      "step": 1034
+    },
+    {
+      "epoch": 0.07200250443493687,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0019907830773272348,
+      "loss": 3.8308,
+      "step": 1035
+    },
+    {
+      "epoch": 0.07207207207207207,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.001990752528996915,
+      "loss": 3.1517,
+      "step": 1036
+    },
+    {
+      "epoch": 0.07214163970920728,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0019907219303611757,
+      "loss": 3.2052,
+      "step": 1037
+    },
+    {
+      "epoch": 0.07221120734634248,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.0019906912814215702,
+      "loss": 3.0487,
+      "step": 1038
+    },
+    {
+      "epoch": 0.07228077498347768,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0019906605821796547,
+      "loss": 3.6735,
+      "step": 1039
+    },
+    {
+      "epoch": 0.07235034262061289,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0019906298326369887,
+      "loss": 3.3629,
+      "step": 1040
+    },
+    {
+      "epoch": 0.07241991025774809,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.001990599032795132,
+      "loss": 3.1491,
+      "step": 1041
+    },
+    {
+      "epoch": 0.0724894778948833,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0019905681826556504,
+      "loss": 3.3664,
+      "step": 1042
+    },
+    {
+      "epoch": 0.0725590455320185,
+      "grad_norm": 0.7421875,
+      "learning_rate": 0.001990537282220109,
+      "loss": 3.2537,
+      "step": 1043
+    },
+    {
+      "epoch": 0.07262861316915371,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0019905063314900767,
+      "loss": 3.6151,
+      "step": 1044
+    },
+    {
+      "epoch": 0.07269818080628891,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0019904753304671257,
+      "loss": 3.6813,
+      "step": 1045
+    },
+    {
+      "epoch": 0.07276774844342412,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00199044427915283,
+      "loss": 3.1794,
+      "step": 1046
+    },
+    {
+      "epoch": 0.07283731608055932,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.0019904131775487655,
+      "loss": 3.7046,
+      "step": 1047
+    },
+    {
+      "epoch": 0.07290688371769453,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0019903820256565122,
+      "loss": 3.285,
+      "step": 1048
+    },
+    {
+      "epoch": 0.07297645135482973,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0019903508234776516,
+      "loss": 3.1065,
+      "step": 1049
+    },
+    {
+      "epoch": 0.07304601899196493,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.001990319571013768,
+      "loss": 3.3196,
+      "step": 1050
+    },
+    {
+      "epoch": 0.07311558662910014,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.001990288268266448,
+      "loss": 3.2582,
+      "step": 1051
+    },
+    {
+      "epoch": 0.07318515426623534,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0019902569152372806,
+      "loss": 3.3076,
+      "step": 1052
+    },
+    {
+      "epoch": 0.07325472190337055,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.001990225511927859,
+      "loss": 2.9895,
+      "step": 1053
+    },
+    {
+      "epoch": 0.07332428954050575,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.001990194058339777,
+      "loss": 2.9849,
+      "step": 1054
+    },
+    {
+      "epoch": 0.07339385717764096,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0019901625544746313,
+      "loss": 3.5619,
+      "step": 1055
+    },
+    {
+      "epoch": 0.07346342481477616,
+      "grad_norm": 0.875,
+      "learning_rate": 0.0019901310003340223,
+      "loss": 3.2445,
+      "step": 1056
+    },
+    {
+      "epoch": 0.07353299245191137,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.001990099395919552,
+      "loss": 3.4955,
+      "step": 1057
+    },
+    {
+      "epoch": 0.07360256008904657,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0019900677412328237,
+      "loss": 3.0674,
+      "step": 1058
+    },
+    {
+      "epoch": 0.07367212772618179,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0019900360362754468,
+      "loss": 3.2239,
+      "step": 1059
+    },
+    {
+      "epoch": 0.07374169536331698,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0019900042810490296,
+      "loss": 3.0941,
+      "step": 1060
+    },
+    {
+      "epoch": 0.0738112630004522,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0019899724755551855,
+      "loss": 3.154,
+      "step": 1061
+    },
+    {
+      "epoch": 0.0738808306375874,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0019899406197955286,
+      "loss": 3.3571,
+      "step": 1062
+    },
+    {
+      "epoch": 0.07395039827472259,
+      "grad_norm": 1.65625,
+      "learning_rate": 0.0019899087137716766,
+      "loss": 3.3969,
+      "step": 1063
+    },
+    {
+      "epoch": 0.0740199659118578,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.0019898767574852497,
+      "loss": 3.2255,
+      "step": 1064
+    },
+    {
+      "epoch": 0.074089533548993,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.0019898447509378706,
+      "loss": 3.3027,
+      "step": 1065
+    },
+    {
+      "epoch": 0.07415910118612822,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.001989812694131164,
+      "loss": 3.6268,
+      "step": 1066
+    },
+    {
+      "epoch": 0.07422866882326341,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.001989780587066758,
+      "loss": 3.3741,
+      "step": 1067
+    },
+    {
+      "epoch": 0.07429823646039863,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0019897484297462828,
+      "loss": 2.8676,
+      "step": 1068
+    },
+    {
+      "epoch": 0.07436780409753382,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0019897162221713706,
+      "loss": 3.2567,
+      "step": 1069
+    },
+    {
+      "epoch": 0.07443737173466904,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0019896839643436573,
+      "loss": 3.2005,
+      "step": 1070
+    },
+    {
+      "epoch": 0.07450693937180423,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.001989651656264781,
+      "loss": 3.1738,
+      "step": 1071
+    },
+    {
+      "epoch": 0.07457650700893945,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0019896192979363815,
+      "loss": 3.3275,
+      "step": 1072
+    },
+    {
+      "epoch": 0.07464607464607464,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0019895868893601023,
+      "loss": 3.5421,
+      "step": 1073
+    },
+    {
+      "epoch": 0.07471564228320986,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0019895544305375884,
+      "loss": 3.0984,
+      "step": 1074
+    },
+    {
+      "epoch": 0.07478520992034506,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0019895219214704886,
+      "loss": 3.1449,
+      "step": 1075
+    },
+    {
+      "epoch": 0.07485477755748025,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.001989489362160453,
+      "loss": 3.1699,
+      "step": 1076
+    },
+    {
+      "epoch": 0.07492434519461547,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0019894567526091353,
+      "loss": 3.2374,
+      "step": 1077
+    },
+    {
+      "epoch": 0.07499391283175066,
+      "grad_norm": 0.75390625,
+      "learning_rate": 0.0019894240928181907,
+      "loss": 3.3657,
+      "step": 1078
+    },
+    {
+      "epoch": 0.07506348046888588,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.0019893913827892773,
+      "loss": 3.5596,
+      "step": 1079
+    },
+    {
+      "epoch": 0.07513304810602107,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.001989358622524057,
+      "loss": 3.4357,
+      "step": 1080
+    },
+    {
+      "epoch": 0.07520261574315629,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0019893258120241924,
+      "loss": 3.1769,
+      "step": 1081
+    },
+    {
+      "epoch": 0.07527218338029149,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0019892929512913497,
+      "loss": 3.4845,
+      "step": 1082
+    },
+    {
+      "epoch": 0.0753417510174267,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.001989260040327197,
+      "loss": 3.3399,
+      "step": 1083
+    },
+    {
+      "epoch": 0.0754113186545619,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.001989227079133406,
+      "loss": 3.1975,
+      "step": 1084
+    },
+    {
+      "epoch": 0.07548088629169711,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00198919406771165,
+      "loss": 2.9935,
+      "step": 1085
+    },
+    {
+      "epoch": 0.0755504539288323,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.001989161006063605,
+      "loss": 3.0267,
+      "step": 1086
+    },
+    {
+      "epoch": 0.07562002156596752,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.0019891278941909503,
+      "loss": 3.2466,
+      "step": 1087
+    },
+    {
+      "epoch": 0.07568958920310272,
+      "grad_norm": 1.0,
+      "learning_rate": 0.001989094732095366,
+      "loss": 3.3007,
+      "step": 1088
+    },
+    {
+      "epoch": 0.07575915684023792,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.001989061519778537,
+      "loss": 3.4514,
+      "step": 1089
+    },
+    {
+      "epoch": 0.07582872447737313,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0019890282572421493,
+      "loss": 3.48,
+      "step": 1090
+    },
+    {
+      "epoch": 0.07589829211450833,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0019889949444878915,
+      "loss": 3.2318,
+      "step": 1091
+    },
+    {
+      "epoch": 0.07596785975164354,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.0019889615815174557,
+      "loss": 3.0235,
+      "step": 1092
+    },
+    {
+      "epoch": 0.07603742738877874,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.001988928168332535,
+      "loss": 3.2355,
+      "step": 1093
+    },
+    {
+      "epoch": 0.07610699502591395,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.0019888947049348273,
+      "loss": 3.3776,
+      "step": 1094
+    },
+    {
+      "epoch": 0.07617656266304915,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0019888611913260303,
+      "loss": 3.1374,
+      "step": 1095
+    },
+    {
+      "epoch": 0.07624613030018436,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0019888276275078463,
+      "loss": 3.3041,
+      "step": 1096
+    },
+    {
+      "epoch": 0.07631569793731956,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0019887940134819793,
+      "loss": 3.3709,
+      "step": 1097
+    },
+    {
+      "epoch": 0.07638526557445477,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0019887603492501366,
+      "loss": 3.4368,
+      "step": 1098
+    },
+    {
+      "epoch": 0.07645483321158997,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.0019887266348140266,
+      "loss": 3.4639,
+      "step": 1099
+    },
+    {
+      "epoch": 0.07652440084872518,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.001988692870175362,
+      "loss": 3.2643,
+      "step": 1100
+    },
+    {
+      "epoch": 0.07659396848586038,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0019886590553358564,
+      "loss": 3.0054,
+      "step": 1101
+    },
+    {
+      "epoch": 0.07666353612299558,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0019886251902972276,
+      "loss": 2.6731,
+      "step": 1102
+    },
+    {
+      "epoch": 0.07673310376013079,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.001988591275061195,
+      "loss": 3.034,
+      "step": 1103
+    },
+    {
+      "epoch": 0.07680267139726599,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0019885573096294793,
+      "loss": 3.5486,
+      "step": 1104
+    },
+    {
+      "epoch": 0.0768722390344012,
+      "grad_norm": 1.9375,
+      "learning_rate": 0.001988523294003807,
+      "loss": 3.0527,
+      "step": 1105
+    },
+    {
+      "epoch": 0.0769418066715364,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.001988489228185904,
+      "loss": 3.0049,
+      "step": 1106
+    },
+    {
+      "epoch": 0.07701137430867161,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0019884551121775004,
+      "loss": 3.3528,
+      "step": 1107
+    },
+    {
+      "epoch": 0.07708094194580681,
+      "grad_norm": 0.734375,
+      "learning_rate": 0.001988420945980328,
+      "loss": 3.3525,
+      "step": 1108
+    },
+    {
+      "epoch": 0.07715050958294202,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.001988386729596123,
+      "loss": 3.2421,
+      "step": 1109
+    },
+    {
+      "epoch": 0.07722007722007722,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.001988352463026621,
+      "loss": 3.6422,
+      "step": 1110
+    },
+    {
+      "epoch": 0.07728964485721243,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0019883181462735625,
+      "loss": 3.4884,
+      "step": 1111
+    },
+    {
+      "epoch": 0.07735921249434763,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0019882837793386903,
+      "loss": 3.4869,
+      "step": 1112
+    },
+    {
+      "epoch": 0.07742878013148284,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.001988249362223749,
+      "loss": 3.354,
+      "step": 1113
+    },
+    {
+      "epoch": 0.07749834776861804,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0019882148949304864,
+      "loss": 3.2511,
+      "step": 1114
+    },
+    {
+      "epoch": 0.07756791540575324,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.001988180377460652,
+      "loss": 3.2186,
+      "step": 1115
+    },
+    {
+      "epoch": 0.07763748304288845,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.001988145809815999,
+      "loss": 3.2695,
+      "step": 1116
+    },
+    {
+      "epoch": 0.07770705068002365,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0019881111919982826,
+      "loss": 3.7394,
+      "step": 1117
+    },
+    {
+      "epoch": 0.07777661831715886,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0019880765240092605,
+      "loss": 3.0813,
+      "step": 1118
+    },
+    {
+      "epoch": 0.07784618595429406,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0019880418058506925,
+      "loss": 3.2941,
+      "step": 1119
+    },
+    {
+      "epoch": 0.07791575359142927,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0019880070375243417,
+      "loss": 3.671,
+      "step": 1120
+    },
+    {
+      "epoch": 0.07798532122856447,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0019879722190319733,
+      "loss": 3.3528,
+      "step": 1121
+    },
+    {
+      "epoch": 0.07805488886569968,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0019879373503753554,
+      "loss": 3.386,
+      "step": 1122
+    },
+    {
+      "epoch": 0.07812445650283488,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0019879024315562583,
+      "loss": 2.9778,
+      "step": 1123
+    },
+    {
+      "epoch": 0.07819402413997009,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0019878674625764554,
+      "loss": 3.3305,
+      "step": 1124
+    },
+    {
+      "epoch": 0.07826359177710529,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.001987832443437722,
+      "loss": 2.9773,
+      "step": 1125
+    },
+    {
+      "epoch": 0.07833315941424049,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0019877973741418364,
+      "loss": 3.394,
+      "step": 1126
+    },
+    {
+      "epoch": 0.0784027270513757,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0019877622546905786,
+      "loss": 3.2623,
+      "step": 1127
+    },
+    {
+      "epoch": 0.0784722946885109,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.001987727085085732,
+      "loss": 3.1564,
+      "step": 1128
+    },
+    {
+      "epoch": 0.07854186232564611,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.001987691865329083,
+      "loss": 3.6401,
+      "step": 1129
+    },
+    {
+      "epoch": 0.07861142996278131,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0019876565954224192,
+      "loss": 3.0965,
+      "step": 1130
+    },
+    {
+      "epoch": 0.07868099759991652,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.001987621275367532,
+      "loss": 3.0531,
+      "step": 1131
+    },
+    {
+      "epoch": 0.07875056523705172,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0019875859051662137,
+      "loss": 2.9751,
+      "step": 1132
+    },
+    {
+      "epoch": 0.07882013287418693,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.0019875504848202614,
+      "loss": 3.5153,
+      "step": 1133
+    },
+    {
+      "epoch": 0.07888970051132213,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.001987515014331473,
+      "loss": 2.8853,
+      "step": 1134
+    },
+    {
+      "epoch": 0.07895926814845734,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0019874794937016498,
+      "loss": 3.1095,
+      "step": 1135
+    },
+    {
+      "epoch": 0.07902883578559254,
+      "grad_norm": 2.21875,
+      "learning_rate": 0.001987443922932595,
+      "loss": 3.3813,
+      "step": 1136
+    },
+    {
+      "epoch": 0.07909840342272775,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.001987408302026115,
+      "loss": 3.7459,
+      "step": 1137
+    },
+    {
+      "epoch": 0.07916797105986295,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.001987372630984018,
+      "loss": 3.0523,
+      "step": 1138
+    },
+    {
+      "epoch": 0.07923753869699815,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.001987336909808116,
+      "loss": 2.879,
+      "step": 1139
+    },
+    {
+      "epoch": 0.07930710633413336,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.001987301138500222,
+      "loss": 3.192,
+      "step": 1140
+    },
+    {
+      "epoch": 0.07937667397126856,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.001987265317062153,
+      "loss": 2.8798,
+      "step": 1141
+    },
+    {
+      "epoch": 0.07944624160840377,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0019872294454957268,
+      "loss": 2.8866,
+      "step": 1142
+    },
+    {
+      "epoch": 0.07951580924553897,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.001987193523802765,
+      "loss": 3.1609,
+      "step": 1143
+    },
+    {
+      "epoch": 0.07958537688267418,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.0019871575519850924,
+      "loss": 3.6688,
+      "step": 1144
+    },
+    {
+      "epoch": 0.07965494451980938,
+      "grad_norm": 2.453125,
+      "learning_rate": 0.0019871215300445353,
+      "loss": 3.64,
+      "step": 1145
+    },
+    {
+      "epoch": 0.0797245121569446,
+      "grad_norm": 2.296875,
+      "learning_rate": 0.001987085457982922,
+      "loss": 3.4658,
+      "step": 1146
+    },
+    {
+      "epoch": 0.07979407979407979,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0019870493358020843,
+      "loss": 3.5664,
+      "step": 1147
+    },
+    {
+      "epoch": 0.079863647431215,
+      "grad_norm": 0.6953125,
+      "learning_rate": 0.001987013163503857,
+      "loss": 3.882,
+      "step": 1148
+    },
+    {
+      "epoch": 0.0799332150683502,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0019869769410900753,
+      "loss": 3.1736,
+      "step": 1149
+    },
+    {
+      "epoch": 0.08000278270548541,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00198694066856258,
+      "loss": 2.935,
+      "step": 1150
+    },
+    {
+      "epoch": 0.08007235034262061,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.001986904345923212,
+      "loss": 3.3874,
+      "step": 1151
+    },
+    {
+      "epoch": 0.08014191797975581,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.001986867973173815,
+      "loss": 3.2382,
+      "step": 1152
+    },
+    {
+      "epoch": 0.08021148561689102,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.001986831550316237,
+      "loss": 3.1287,
+      "step": 1153
+    },
+    {
+      "epoch": 0.08028105325402622,
+      "grad_norm": 1.0,
+      "learning_rate": 0.001986795077352327,
+      "loss": 3.1296,
+      "step": 1154
+    },
+    {
+      "epoch": 0.08035062089116143,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0019867585542839373,
+      "loss": 3.1815,
+      "step": 1155
+    },
+    {
+      "epoch": 0.08042018852829663,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.001986721981112921,
+      "loss": 3.7003,
+      "step": 1156
+    },
+    {
+      "epoch": 0.08048975616543184,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.001986685357841136,
+      "loss": 3.2178,
+      "step": 1157
+    },
+    {
+      "epoch": 0.08055932380256704,
+      "grad_norm": 1.0,
+      "learning_rate": 0.001986648684470442,
+      "loss": 3.4522,
+      "step": 1158
+    },
+    {
+      "epoch": 0.08062889143970225,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.001986611961002701,
+      "loss": 3.4755,
+      "step": 1159
+    },
+    {
+      "epoch": 0.08069845907683745,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.001986575187439777,
+      "loss": 3.3797,
+      "step": 1160
+    },
+    {
+      "epoch": 0.08076802671397267,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.001986538363783538,
+      "loss": 3.1683,
+      "step": 1161
+    },
+    {
+      "epoch": 0.08083759435110786,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0019865014900358534,
+      "loss": 3.1062,
+      "step": 1162
+    },
+    {
+      "epoch": 0.08090716198824308,
+      "grad_norm": 0.70703125,
+      "learning_rate": 0.0019864645661985957,
+      "loss": 3.574,
+      "step": 1163
+    },
+    {
+      "epoch": 0.08097672962537827,
+      "grad_norm": 0.79296875,
+      "learning_rate": 0.0019864275922736397,
+      "loss": 3.4354,
+      "step": 1164
+    },
+    {
+      "epoch": 0.08104629726251347,
+      "grad_norm": 0.73828125,
+      "learning_rate": 0.001986390568262862,
+      "loss": 3.3539,
+      "step": 1165
+    },
+    {
+      "epoch": 0.08111586489964868,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0019863534941681428,
+      "loss": 3.5538,
+      "step": 1166
+    },
+    {
+      "epoch": 0.08118543253678388,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.001986316369991365,
+      "loss": 3.5096,
+      "step": 1167
+    },
+    {
+      "epoch": 0.0812550001739191,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.0019862791957344136,
+      "loss": 2.8434,
+      "step": 1168
+    },
+    {
+      "epoch": 0.0813245678110543,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0019862419713991756,
+      "loss": 3.2823,
+      "step": 1169
+    },
+    {
+      "epoch": 0.0813941354481895,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0019862046969875416,
+      "loss": 3.4693,
+      "step": 1170
+    },
+    {
+      "epoch": 0.0814637030853247,
+      "grad_norm": 0.875,
+      "learning_rate": 0.0019861673725014035,
+      "loss": 3.1002,
+      "step": 1171
+    },
+    {
+      "epoch": 0.08153327072245992,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0019861299979426574,
+      "loss": 3.5625,
+      "step": 1172
+    },
+    {
+      "epoch": 0.08160283835959511,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.0019860925733132004,
+      "loss": 3.1363,
+      "step": 1173
+    },
+    {
+      "epoch": 0.08167240599673033,
+      "grad_norm": 0.6796875,
+      "learning_rate": 0.0019860550986149322,
+      "loss": 3.7165,
+      "step": 1174
+    },
+    {
+      "epoch": 0.08174197363386553,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0019860175738497564,
+      "loss": 3.2552,
+      "step": 1175
+    },
+    {
+      "epoch": 0.08181154127100074,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0019859799990195786,
+      "loss": 3.5978,
+      "step": 1176
+    },
+    {
+      "epoch": 0.08188110890813594,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0019859423741263055,
+      "loss": 3.0328,
+      "step": 1177
+    },
+    {
+      "epoch": 0.08195067654527113,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0019859046991718486,
+      "loss": 3.2697,
+      "step": 1178
+    },
+    {
+      "epoch": 0.08202024418240635,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0019858669741581207,
+      "loss": 3.0997,
+      "step": 1179
+    },
+    {
+      "epoch": 0.08208981181954154,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0019858291990870365,
+      "loss": 3.2616,
+      "step": 1180
+    },
+    {
+      "epoch": 0.08215937945667676,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0019857913739605147,
+      "loss": 3.0133,
+      "step": 1181
+    },
+    {
+      "epoch": 0.08222894709381195,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.001985753498780475,
+      "loss": 3.3974,
+      "step": 1182
+    },
+    {
+      "epoch": 0.08229851473094717,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.001985715573548842,
+      "loss": 3.466,
+      "step": 1183
+    },
+    {
+      "epoch": 0.08236808236808237,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0019856775982675405,
+      "loss": 2.859,
+      "step": 1184
+    },
+    {
+      "epoch": 0.08243765000521758,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.0019856395729384983,
+      "loss": 3.4976,
+      "step": 1185
+    },
+    {
+      "epoch": 0.08250721764235278,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.001985601497563647,
+      "loss": 3.2808,
+      "step": 1186
+    },
+    {
+      "epoch": 0.08257678527948799,
+      "grad_norm": 6.3125,
+      "learning_rate": 0.001985563372144919,
+      "loss": 3.4329,
+      "step": 1187
+    },
+    {
+      "epoch": 0.08264635291662319,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.001985525196684251,
+      "loss": 3.2684,
+      "step": 1188
+    },
+    {
+      "epoch": 0.08271592055375838,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.001985486971183581,
+      "loss": 3.5413,
+      "step": 1189
+    },
+    {
+      "epoch": 0.0827854881908936,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00198544869564485,
+      "loss": 3.0702,
+      "step": 1190
+    },
+    {
+      "epoch": 0.0828550558280288,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00198541037007,
+      "loss": 2.9937,
+      "step": 1191
+    },
+    {
+      "epoch": 0.08292462346516401,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00198537199446098,
+      "loss": 3.5534,
+      "step": 1192
+    },
+    {
+      "epoch": 0.0829941911022992,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0019853335688197354,
+      "loss": 3.4087,
+      "step": 1193
+    },
+    {
+      "epoch": 0.08306375873943442,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0019852950931482194,
+      "loss": 3.1602,
+      "step": 1194
+    },
+    {
+      "epoch": 0.08313332637656962,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.0019852565674483846,
+      "loss": 3.3389,
+      "step": 1195
+    },
+    {
+      "epoch": 0.08320289401370483,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.001985217991722187,
+      "loss": 2.9335,
+      "step": 1196
+    },
+    {
+      "epoch": 0.08327246165084003,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.001985179365971586,
+      "loss": 3.0408,
+      "step": 1197
+    },
+    {
+      "epoch": 0.08334202928797524,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.0019851406901985427,
+      "loss": 3.4378,
+      "step": 1198
+    },
+    {
+      "epoch": 0.08341159692511044,
+      "grad_norm": 0.734375,
+      "learning_rate": 0.0019851019644050202,
+      "loss": 3.5636,
+      "step": 1199
+    },
+    {
+      "epoch": 0.08348116456224565,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0019850631885929854,
+      "loss": 2.891,
+      "step": 1200
+    },
+    {
+      "epoch": 0.08355073219938085,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.001985024362764407,
+      "loss": 3.2612,
+      "step": 1201
+    },
+    {
+      "epoch": 0.08362029983651605,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0019849854869212562,
+      "loss": 3.5886,
+      "step": 1202
+    },
+    {
+      "epoch": 0.08368986747365126,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0019849465610655074,
+      "loss": 3.4887,
+      "step": 1203
+    },
+    {
+      "epoch": 0.08375943511078646,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0019849075851991363,
+      "loss": 3.4052,
+      "step": 1204
+    },
+    {
+      "epoch": 0.08382900274792167,
+      "grad_norm": 0.75,
+      "learning_rate": 0.0019848685593241225,
+      "loss": 3.3308,
+      "step": 1205
+    },
+    {
+      "epoch": 0.08389857038505687,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0019848294834424476,
+      "loss": 3.395,
+      "step": 1206
+    },
+    {
+      "epoch": 0.08396813802219208,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.001984790357556095,
+      "loss": 3.2642,
+      "step": 1207
+    },
+    {
+      "epoch": 0.08403770565932728,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.001984751181667052,
+      "loss": 3.7784,
+      "step": 1208
+    },
+    {
+      "epoch": 0.08410727329646249,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0019847119557773072,
+      "loss": 3.5094,
+      "step": 1209
+    },
+    {
+      "epoch": 0.08417684093359769,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.001984672679888853,
+      "loss": 3.0859,
+      "step": 1210
+    },
+    {
+      "epoch": 0.0842464085707329,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0019846333540036835,
+      "loss": 3.2304,
+      "step": 1211
+    },
+    {
+      "epoch": 0.0843159762078681,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.001984593978123795,
+      "loss": 3.5656,
+      "step": 1212
+    },
+    {
+      "epoch": 0.08438554384500331,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.001984554552251186,
+      "loss": 3.2698,
+      "step": 1213
+    },
+    {
+      "epoch": 0.08445511148213851,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.0019845150763878605,
+      "loss": 3.5456,
+      "step": 1214
+    },
+    {
+      "epoch": 0.08452467911927371,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0019844755505358217,
+      "loss": 3.2847,
+      "step": 1215
+    },
+    {
+      "epoch": 0.08459424675640892,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.001984435974697076,
+      "loss": 3.2534,
+      "step": 1216
+    },
+    {
+      "epoch": 0.08466381439354412,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.001984396348873634,
+      "loss": 3.2247,
+      "step": 1217
+    },
+    {
+      "epoch": 0.08473338203067933,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0019843566730675067,
+      "loss": 3.5911,
+      "step": 1218
+    },
+    {
+      "epoch": 0.08480294966781453,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0019843169472807095,
+      "loss": 3.4861,
+      "step": 1219
+    },
+    {
+      "epoch": 0.08487251730494974,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0019842771715152586,
+      "loss": 3.0054,
+      "step": 1220
+    },
+    {
+      "epoch": 0.08494208494208494,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0019842373457731742,
+      "loss": 3.276,
+      "step": 1221
+    },
+    {
+      "epoch": 0.08501165257922015,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0019841974700564786,
+      "loss": 3.45,
+      "step": 1222
+    },
+    {
+      "epoch": 0.08508122021635535,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0019841575443671957,
+      "loss": 3.2529,
+      "step": 1223
+    },
+    {
+      "epoch": 0.08515078785349056,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0019841175687073534,
+      "loss": 3.2271,
+      "step": 1224
+    },
+    {
+      "epoch": 0.08522035549062576,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0019840775430789814,
+      "loss": 3.2621,
+      "step": 1225
+    },
+    {
+      "epoch": 0.08528992312776097,
+      "grad_norm": 0.8125,
+      "learning_rate": 0.001984037467484112,
+      "loss": 3.589,
+      "step": 1226
+    },
+    {
+      "epoch": 0.08535949076489617,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0019839973419247797,
+      "loss": 3.0352,
+      "step": 1227
+    },
+    {
+      "epoch": 0.08542905840203137,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.001983957166403022,
+      "loss": 3.4417,
+      "step": 1228
+    },
+    {
+      "epoch": 0.08549862603916658,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.001983916940920879,
+      "loss": 3.7552,
+      "step": 1229
+    },
+    {
+      "epoch": 0.08556819367630178,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.001983876665480393,
+      "loss": 3.4003,
+      "step": 1230
+    },
+    {
+      "epoch": 0.08563776131343699,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0019838363400836094,
+      "loss": 3.5857,
+      "step": 1231
+    },
+    {
+      "epoch": 0.08570732895057219,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.001983795964732575,
+      "loss": 3.5616,
+      "step": 1232
+    },
+    {
+      "epoch": 0.0857768965877074,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0019837555394293404,
+      "loss": 3.2636,
+      "step": 1233
+    },
+    {
+      "epoch": 0.0858464642248426,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0019837150641759576,
+      "loss": 3.4016,
+      "step": 1234
+    },
+    {
+      "epoch": 0.08591603186197781,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0019836745389744826,
+      "loss": 2.9563,
+      "step": 1235
+    },
+    {
+      "epoch": 0.08598559949911301,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.001983633963826972,
+      "loss": 3.4908,
+      "step": 1236
+    },
+    {
+      "epoch": 0.08605516713624822,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0019835933387354872,
+      "loss": 3.0906,
+      "step": 1237
+    },
+    {
+      "epoch": 0.08612473477338342,
+      "grad_norm": 0.8125,
+      "learning_rate": 0.0019835526637020902,
+      "loss": 3.4301,
+      "step": 1238
+    },
+    {
+      "epoch": 0.08619430241051863,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.0019835119387288463,
+      "loss": 3.3323,
+      "step": 1239
+    },
+    {
+      "epoch": 0.08626387004765383,
+      "grad_norm": 1.6484375,
+      "learning_rate": 0.0019834711638178236,
+      "loss": 3.3489,
+      "step": 1240
+    },
+    {
+      "epoch": 0.08633343768478903,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.001983430338971092,
+      "loss": 3.6983,
+      "step": 1241
+    },
+    {
+      "epoch": 0.08640300532192424,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.001983389464190725,
+      "loss": 3.4093,
+      "step": 1242
+    },
+    {
+      "epoch": 0.08647257295905944,
+      "grad_norm": 0.76953125,
+      "learning_rate": 0.0019833485394787974,
+      "loss": 3.1098,
+      "step": 1243
+    },
+    {
+      "epoch": 0.08654214059619465,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0019833075648373875,
+      "loss": 3.2846,
+      "step": 1244
+    },
+    {
+      "epoch": 0.08661170823332985,
+      "grad_norm": 0.73828125,
+      "learning_rate": 0.0019832665402685756,
+      "loss": 3.5138,
+      "step": 1245
+    },
+    {
+      "epoch": 0.08668127587046506,
+      "grad_norm": 0.7578125,
+      "learning_rate": 0.001983225465774445,
+      "loss": 3.6069,
+      "step": 1246
+    },
+    {
+      "epoch": 0.08675084350760026,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.001983184341357081,
+      "loss": 3.4257,
+      "step": 1247
+    },
+    {
+      "epoch": 0.08682041114473547,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0019831431670185714,
+      "loss": 3.2438,
+      "step": 1248
+    },
+    {
+      "epoch": 0.08688997878187067,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0019831019427610074,
+      "loss": 3.291,
+      "step": 1249
+    },
+    {
+      "epoch": 0.08695954641900588,
+      "grad_norm": 1.25,
+      "learning_rate": 0.001983060668586482,
+      "loss": 3.2301,
+      "step": 1250
+    },
+    {
+      "epoch": 0.08702911405614108,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.001983019344497091,
+      "loss": 3.4895,
+      "step": 1251
+    },
+    {
+      "epoch": 0.0870986816932763,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0019829779704949326,
+      "loss": 3.1253,
+      "step": 1252
+    },
+    {
+      "epoch": 0.08716824933041149,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0019829365465821066,
+      "loss": 3.1541,
+      "step": 1253
+    },
+    {
+      "epoch": 0.08723781696754669,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.001982895072760718,
+      "loss": 3.1458,
+      "step": 1254
+    },
+    {
+      "epoch": 0.0873073846046819,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.0019828535490328714,
+      "loss": 3.3,
+      "step": 1255
+    },
+    {
+      "epoch": 0.0873769522418171,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.0019828119754006757,
+      "loss": 3.5455,
+      "step": 1256
+    },
+    {
+      "epoch": 0.08744651987895231,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0019827703518662415,
+      "loss": 3.2653,
+      "step": 1257
+    },
+    {
+      "epoch": 0.08751608751608751,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0019827286784316824,
+      "loss": 3.3314,
+      "step": 1258
+    },
+    {
+      "epoch": 0.08758565515322272,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0019826869550991144,
+      "loss": 3.4117,
+      "step": 1259
+    },
+    {
+      "epoch": 0.08765522279035792,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0019826451818706556,
+      "loss": 3.286,
+      "step": 1260
+    },
+    {
+      "epoch": 0.08772479042749314,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.001982603358748428,
+      "loss": 3.2271,
+      "step": 1261
+    },
+    {
+      "epoch": 0.08779435806462833,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.001982561485734554,
+      "loss": 3.4454,
+      "step": 1262
+    },
+    {
+      "epoch": 0.08786392570176355,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0019825195628311604,
+      "loss": 3.2706,
+      "step": 1263
+    },
+    {
+      "epoch": 0.08793349333889874,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0019824775900403754,
+      "loss": 3.1977,
+      "step": 1264
+    },
+    {
+      "epoch": 0.08800306097603394,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.0019824355673643307,
+      "loss": 3.0425,
+      "step": 1265
+    },
+    {
+      "epoch": 0.08807262861316915,
+      "grad_norm": 0.6640625,
+      "learning_rate": 0.0019823934948051598,
+      "loss": 3.3867,
+      "step": 1266
+    },
+    {
+      "epoch": 0.08814219625030435,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.001982351372364999,
+      "loss": 3.6068,
+      "step": 1267
+    },
+    {
+      "epoch": 0.08821176388743956,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0019823092000459865,
+      "loss": 3.5567,
+      "step": 1268
+    },
+    {
+      "epoch": 0.08828133152457476,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.001982266977850264,
+      "loss": 3.138,
+      "step": 1269
+    },
+    {
+      "epoch": 0.08835089916170998,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.001982224705779976,
+      "loss": 2.9568,
+      "step": 1270
+    },
+    {
+      "epoch": 0.08842046679884517,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0019821823838372674,
+      "loss": 2.9139,
+      "step": 1271
+    },
+    {
+      "epoch": 0.08849003443598039,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0019821400120242885,
+      "loss": 3.6534,
+      "step": 1272
+    },
+    {
+      "epoch": 0.08855960207311558,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00198209759034319,
+      "loss": 3.1228,
+      "step": 1273
+    },
+    {
+      "epoch": 0.0886291697102508,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0019820551187961256,
+      "loss": 3.2362,
+      "step": 1274
+    },
+    {
+      "epoch": 0.088698737347386,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.001982012597385253,
+      "loss": 3.4493,
+      "step": 1275
+    },
+    {
+      "epoch": 0.0887683049845212,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0019819700261127296,
+      "loss": 3.139,
+      "step": 1276
+    },
+    {
+      "epoch": 0.0888378726216564,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0019819274049807183,
+      "loss": 3.2712,
+      "step": 1277
+    },
+    {
+      "epoch": 0.0889074402587916,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.001981884733991382,
+      "loss": 3.0725,
+      "step": 1278
+    },
+    {
+      "epoch": 0.08897700789592682,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0019818420131468887,
+      "loss": 3.3549,
+      "step": 1279
+    },
+    {
+      "epoch": 0.08904657553306201,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0019817992424494067,
+      "loss": 3.5994,
+      "step": 1280
+    },
+    {
+      "epoch": 0.08911614317019723,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.0019817564219011077,
+      "loss": 3.1747,
+      "step": 1281
+    },
+    {
+      "epoch": 0.08918571080733242,
+      "grad_norm": 3.953125,
+      "learning_rate": 0.001981713551504166,
+      "loss": 3.5693,
+      "step": 1282
+    },
+    {
+      "epoch": 0.08925527844446764,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0019816706312607586,
+      "loss": 3.6042,
+      "step": 1283
+    },
+    {
+      "epoch": 0.08932484608160284,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0019816276611730643,
+      "loss": 3.2726,
+      "step": 1284
+    },
+    {
+      "epoch": 0.08939441371873805,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.001981584641243265,
+      "loss": 3.1582,
+      "step": 1285
+    },
+    {
+      "epoch": 0.08946398135587325,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.001981541571473545,
+      "loss": 3.5227,
+      "step": 1286
+    },
+    {
+      "epoch": 0.08953354899300846,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.001981498451866092,
+      "loss": 3.4966,
+      "step": 1287
+    },
+    {
+      "epoch": 0.08960311663014366,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.001981455282423094,
+      "loss": 2.9587,
+      "step": 1288
+    },
+    {
+      "epoch": 0.08967268426727887,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0019814120631467444,
+      "loss": 3.4397,
+      "step": 1289
+    },
+    {
+      "epoch": 0.08974225190441407,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0019813687940392366,
+      "loss": 3.5678,
+      "step": 1290
+    },
+    {
+      "epoch": 0.08981181954154926,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.001981325475102768,
+      "loss": 3.3464,
+      "step": 1291
+    },
+    {
+      "epoch": 0.08988138717868448,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0019812821063395374,
+      "loss": 3.3282,
+      "step": 1292
+    },
+    {
+      "epoch": 0.08995095481581968,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.001981238687751748,
+      "loss": 3.6958,
+      "step": 1293
+    },
+    {
+      "epoch": 0.09002052245295489,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0019811952193416037,
+      "loss": 3.4398,
+      "step": 1294
+    },
+    {
+      "epoch": 0.09009009009009009,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.001981151701111312,
+      "loss": 3.1065,
+      "step": 1295
+    },
+    {
+      "epoch": 0.0901596577272253,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0019811081330630823,
+      "loss": 3.4621,
+      "step": 1296
+    },
+    {
+      "epoch": 0.0902292253643605,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0019810645151991262,
+      "loss": 3.3886,
+      "step": 1297
+    },
+    {
+      "epoch": 0.09029879300149571,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0019810208475216596,
+      "loss": 3.4419,
+      "step": 1298
+    },
+    {
+      "epoch": 0.0903683606386309,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0019809771300328986,
+      "loss": 3.3693,
+      "step": 1299
+    },
+    {
+      "epoch": 0.09043792827576612,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0019809333627350636,
+      "loss": 3.2262,
+      "step": 1300
+    },
+    {
+      "epoch": 0.09050749591290132,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.001980889545630377,
+      "loss": 3.2679,
+      "step": 1301
+    },
+    {
+      "epoch": 0.09057706355003653,
+      "grad_norm": 0.8125,
+      "learning_rate": 0.001980845678721063,
+      "loss": 3.2695,
+      "step": 1302
+    },
+    {
+      "epoch": 0.09064663118717173,
+      "grad_norm": 1.0,
+      "learning_rate": 0.001980801762009349,
+      "loss": 3.4142,
+      "step": 1303
+    },
+    {
+      "epoch": 0.09071619882430693,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0019807577954974657,
+      "loss": 3.063,
+      "step": 1304
+    },
+    {
+      "epoch": 0.09078576646144214,
+      "grad_norm": 0.640625,
+      "learning_rate": 0.001980713779187645,
+      "loss": 3.5843,
+      "step": 1305
+    },
+    {
+      "epoch": 0.09085533409857734,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.001980669713082121,
+      "loss": 3.2546,
+      "step": 1306
+    },
+    {
+      "epoch": 0.09092490173571255,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.0019806255971831326,
+      "loss": 3.5231,
+      "step": 1307
+    },
+    {
+      "epoch": 0.09099446937284775,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0019805814314929186,
+      "loss": 3.2505,
+      "step": 1308
+    },
+    {
+      "epoch": 0.09106403700998296,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0019805372160137226,
+      "loss": 3.1583,
+      "step": 1309
+    },
+    {
+      "epoch": 0.09113360464711816,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0019804929507477886,
+      "loss": 3.1289,
+      "step": 1310
+    },
+    {
+      "epoch": 0.09120317228425337,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0019804486356973646,
+      "loss": 3.1288,
+      "step": 1311
+    },
+    {
+      "epoch": 0.09127273992138857,
+      "grad_norm": 0.74609375,
+      "learning_rate": 0.001980404270864701,
+      "loss": 3.6134,
+      "step": 1312
+    },
+    {
+      "epoch": 0.09134230755852378,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.00198035985625205,
+      "loss": 3.2821,
+      "step": 1313
+    },
+    {
+      "epoch": 0.09141187519565898,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0019803153918616667,
+      "loss": 3.1245,
+      "step": 1314
+    },
+    {
+      "epoch": 0.09148144283279419,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.001980270877695809,
+      "loss": 2.9684,
+      "step": 1315
+    },
+    {
+      "epoch": 0.09155101046992939,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.001980226313756737,
+      "loss": 3.5429,
+      "step": 1316
+    },
+    {
+      "epoch": 0.09162057810706459,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.001980181700046714,
+      "loss": 3.2961,
+      "step": 1317
+    },
+    {
+      "epoch": 0.0916901457441998,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.001980137036568004,
+      "loss": 3.4082,
+      "step": 1318
+    },
+    {
+      "epoch": 0.091759713381335,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.001980092323322876,
+      "loss": 3.447,
+      "step": 1319
+    },
+    {
+      "epoch": 0.09182928101847021,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.0019800475603135997,
+      "loss": 3.417,
+      "step": 1320
+    },
+    {
+      "epoch": 0.09189884865560541,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.0019800027475424483,
+      "loss": 3.7718,
+      "step": 1321
+    },
+    {
+      "epoch": 0.09196841629274062,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0019799578850116972,
+      "loss": 2.9843,
+      "step": 1322
+    },
+    {
+      "epoch": 0.09203798392987582,
+      "grad_norm": 0.79296875,
+      "learning_rate": 0.0019799129727236233,
+      "loss": 3.3471,
+      "step": 1323
+    },
+    {
+      "epoch": 0.09210755156701103,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.001979868010680508,
+      "loss": 3.2991,
+      "step": 1324
+    },
+    {
+      "epoch": 0.09217711920414623,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0019798229988846347,
+      "loss": 3.3589,
+      "step": 1325
+    },
+    {
+      "epoch": 0.09224668684128144,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0019797779373382876,
+      "loss": 3.2222,
+      "step": 1326
+    },
+    {
+      "epoch": 0.09231625447841664,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.001979732826043755,
+      "loss": 3.3215,
+      "step": 1327
+    },
+    {
+      "epoch": 0.09238582211555185,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.0019796876650033284,
+      "loss": 3.4502,
+      "step": 1328
+    },
+    {
+      "epoch": 0.09245538975268705,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0019796424542192995,
+      "loss": 3.2011,
+      "step": 1329
+    },
+    {
+      "epoch": 0.09252495738982225,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.001979597193693965,
+      "loss": 3.5101,
+      "step": 1330
+    },
+    {
+      "epoch": 0.09259452502695746,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.001979551883429623,
+      "loss": 3.1729,
+      "step": 1331
+    },
+    {
+      "epoch": 0.09266409266409266,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.001979506523428573,
+      "loss": 3.4822,
+      "step": 1332
+    },
+    {
+      "epoch": 0.09273366030122787,
+      "grad_norm": 0.6875,
+      "learning_rate": 0.001979461113693119,
+      "loss": 3.6057,
+      "step": 1333
+    },
+    {
+      "epoch": 0.09280322793836307,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.001979415654225566,
+      "loss": 3.4683,
+      "step": 1334
+    },
+    {
+      "epoch": 0.09287279557549828,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.001979370145028223,
+      "loss": 3.489,
+      "step": 1335
+    },
+    {
+      "epoch": 0.09294236321263348,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.001979324586103401,
+      "loss": 3.2464,
+      "step": 1336
+    },
+    {
+      "epoch": 0.09301193084976869,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0019792789774534117,
+      "loss": 3.1344,
+      "step": 1337
+    },
+    {
+      "epoch": 0.09308149848690389,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0019792333190805727,
+      "loss": 3.333,
+      "step": 1338
+    },
+    {
+      "epoch": 0.0931510661240391,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.001979187610987201,
+      "loss": 3.5017,
+      "step": 1339
+    },
+    {
+      "epoch": 0.0932206337611743,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.0019791418531756176,
+      "loss": 3.2691,
+      "step": 1340
+    },
+    {
+      "epoch": 0.0932902013983095,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.001979096045648147,
+      "loss": 3.5223,
+      "step": 1341
+    },
+    {
+      "epoch": 0.09335976903544471,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.0019790501884071137,
+      "loss": 3.3475,
+      "step": 1342
+    },
+    {
+      "epoch": 0.09342933667257991,
+      "grad_norm": 0.75390625,
+      "learning_rate": 0.0019790042814548463,
+      "loss": 3.3571,
+      "step": 1343
+    },
+    {
+      "epoch": 0.09349890430971512,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0019789583247936766,
+      "loss": 3.123,
+      "step": 1344
+    },
+    {
+      "epoch": 0.09356847194685032,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0019789123184259373,
+      "loss": 3.2013,
+      "step": 1345
+    },
+    {
+      "epoch": 0.09363803958398553,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.001978866262353964,
+      "loss": 3.4691,
+      "step": 1346
+    },
+    {
+      "epoch": 0.09370760722112073,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0019788201565800966,
+      "loss": 3.2226,
+      "step": 1347
+    },
+    {
+      "epoch": 0.09377717485825594,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.001978774001106675,
+      "loss": 3.3479,
+      "step": 1348
+    },
+    {
+      "epoch": 0.09384674249539114,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.001978727795936043,
+      "loss": 3.2993,
+      "step": 1349
+    },
+    {
+      "epoch": 0.09391631013252635,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0019786815410705464,
+      "loss": 3.4622,
+      "step": 1350
+    },
+    {
+      "epoch": 0.09398587776966155,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0019786352365125347,
+      "loss": 3.5396,
+      "step": 1351
+    },
+    {
+      "epoch": 0.09405544540679676,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.001978588882264358,
+      "loss": 3.6279,
+      "step": 1352
+    },
+    {
+      "epoch": 0.09412501304393196,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00197854247832837,
+      "loss": 3.3464,
+      "step": 1353
+    },
+    {
+      "epoch": 0.09419458068106716,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0019784960247069276,
+      "loss": 3.1044,
+      "step": 1354
+    },
+    {
+      "epoch": 0.09426414831820237,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.001978449521402389,
+      "loss": 3.1176,
+      "step": 1355
+    },
+    {
+      "epoch": 0.09433371595533757,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0019784029684171154,
+      "loss": 3.3721,
+      "step": 1356
+    },
+    {
+      "epoch": 0.09440328359247278,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0019783563657534706,
+      "loss": 3.4134,
+      "step": 1357
+    },
+    {
+      "epoch": 0.09447285122960798,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.001978309713413821,
+      "loss": 3.091,
+      "step": 1358
+    },
+    {
+      "epoch": 0.0945424188667432,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0019782630114005347,
+      "loss": 3.1234,
+      "step": 1359
+    },
+    {
+      "epoch": 0.09461198650387839,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0019782162597159836,
+      "loss": 3.0553,
+      "step": 1360
+    },
+    {
+      "epoch": 0.0946815541410136,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0019781694583625416,
+      "loss": 3.4159,
+      "step": 1361
+    },
+    {
+      "epoch": 0.0947511217781488,
+      "grad_norm": 0.7421875,
+      "learning_rate": 0.0019781226073425848,
+      "loss": 3.199,
+      "step": 1362
+    },
+    {
+      "epoch": 0.09482068941528402,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0019780757066584923,
+      "loss": 3.4261,
+      "step": 1363
+    },
+    {
+      "epoch": 0.09489025705241921,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.001978028756312645,
+      "loss": 3.0732,
+      "step": 1364
+    },
+    {
+      "epoch": 0.09495982468955443,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.001977981756307427,
+      "loss": 3.3938,
+      "step": 1365
+    },
+    {
+      "epoch": 0.09502939232668962,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.001977934706645225,
+      "loss": 3.3882,
+      "step": 1366
+    },
+    {
+      "epoch": 0.09509895996382482,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.001977887607328428,
+      "loss": 2.9706,
+      "step": 1367
+    },
+    {
+      "epoch": 0.09516852760096003,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.001977840458359427,
+      "loss": 2.8586,
+      "step": 1368
+    },
+    {
+      "epoch": 0.09523809523809523,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.001977793259740616,
+      "loss": 3.1019,
+      "step": 1369
+    },
+    {
+      "epoch": 0.09530766287523044,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.001977746011474392,
+      "loss": 3.264,
+      "step": 1370
+    },
+    {
+      "epoch": 0.09537723051236564,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.001977698713563154,
+      "loss": 3.151,
+      "step": 1371
+    },
+    {
+      "epoch": 0.09544679814950086,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0019776513660093027,
+      "loss": 3.145,
+      "step": 1372
+    },
+    {
+      "epoch": 0.09551636578663605,
+      "grad_norm": 0.875,
+      "learning_rate": 0.001977603968815243,
+      "loss": 3.6223,
+      "step": 1373
+    },
+    {
+      "epoch": 0.09558593342377127,
+      "grad_norm": 0.8125,
+      "learning_rate": 0.001977556521983381,
+      "loss": 3.2582,
+      "step": 1374
+    },
+    {
+      "epoch": 0.09565550106090646,
+      "grad_norm": 1.546875,
+      "learning_rate": 0.0019775090255161262,
+      "loss": 3.3363,
+      "step": 1375
+    },
+    {
+      "epoch": 0.09572506869804168,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0019774614794158905,
+      "loss": 3.116,
+      "step": 1376
+    },
+    {
+      "epoch": 0.09579463633517687,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.0019774138836850873,
+      "loss": 3.6931,
+      "step": 1377
+    },
+    {
+      "epoch": 0.09586420397231209,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0019773662383261335,
+      "loss": 3.1336,
+      "step": 1378
+    },
+    {
+      "epoch": 0.09593377160944729,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0019773185433414487,
+      "loss": 3.2009,
+      "step": 1379
+    },
+    {
+      "epoch": 0.09600333924658248,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.001977270798733454,
+      "loss": 3.4753,
+      "step": 1380
+    },
+    {
+      "epoch": 0.0960729068837177,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0019772230045045744,
+      "loss": 3.1052,
+      "step": 1381
+    },
+    {
+      "epoch": 0.0961424745208529,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.001977175160657236,
+      "loss": 3.0842,
+      "step": 1382
+    },
+    {
+      "epoch": 0.0962120421579881,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.0019771272671938677,
+      "loss": 3.3699,
+      "step": 1383
+    },
+    {
+      "epoch": 0.0962816097951233,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.0019770793241169027,
+      "loss": 3.4106,
+      "step": 1384
+    },
+    {
+      "epoch": 0.09635117743225852,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.001977031331428774,
+      "loss": 3.2359,
+      "step": 1385
+    },
+    {
+      "epoch": 0.09642074506939372,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0019769832891319192,
+      "loss": 3.3751,
+      "step": 1386
+    },
+    {
+      "epoch": 0.09649031270652893,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.001976935197228777,
+      "loss": 3.5038,
+      "step": 1387
+    },
+    {
+      "epoch": 0.09655988034366413,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00197688705572179,
+      "loss": 3.3873,
+      "step": 1388
+    },
+    {
+      "epoch": 0.09662944798079934,
+      "grad_norm": 0.70703125,
+      "learning_rate": 0.0019768388646134016,
+      "loss": 3.3603,
+      "step": 1389
+    },
+    {
+      "epoch": 0.09669901561793454,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0019767906239060596,
+      "loss": 3.106,
+      "step": 1390
+    },
+    {
+      "epoch": 0.09676858325506975,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.001976742333602213,
+      "loss": 3.4672,
+      "step": 1391
+    },
+    {
+      "epoch": 0.09683815089220495,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0019766939937043144,
+      "loss": 3.1086,
+      "step": 1392
+    },
+    {
+      "epoch": 0.09690771852934014,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0019766456042148175,
+      "loss": 3.3279,
+      "step": 1393
+    },
+    {
+      "epoch": 0.09697728616647536,
+      "grad_norm": 0.6875,
+      "learning_rate": 0.001976597165136179,
+      "loss": 3.3536,
+      "step": 1394
+    },
+    {
+      "epoch": 0.09704685380361056,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.001976548676470859,
+      "loss": 3.0432,
+      "step": 1395
+    },
+    {
+      "epoch": 0.09711642144074577,
+      "grad_norm": 0.70703125,
+      "learning_rate": 0.0019765001382213198,
+      "loss": 3.5107,
+      "step": 1396
+    },
+    {
+      "epoch": 0.09718598907788097,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.001976451550390025,
+      "loss": 2.7966,
+      "step": 1397
+    },
+    {
+      "epoch": 0.09725555671501618,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0019764029129794424,
+      "loss": 3.3384,
+      "step": 1398
+    },
+    {
+      "epoch": 0.09732512435215138,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.001976354225992041,
+      "loss": 3.1992,
+      "step": 1399
+    },
+    {
+      "epoch": 0.09739469198928659,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.001976305489430294,
+      "loss": 3.2581,
+      "step": 1400
+    },
+    {
+      "epoch": 0.09746425962642179,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0019762567032966744,
+      "loss": 3.0971,
+      "step": 1401
+    },
+    {
+      "epoch": 0.097533827263557,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0019762078675936608,
+      "loss": 2.7413,
+      "step": 1402
+    },
+    {
+      "epoch": 0.0976033949006922,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.0019761589823237315,
+      "loss": 3.189,
+      "step": 1403
+    },
+    {
+      "epoch": 0.0976729625378274,
+      "grad_norm": 0.76953125,
+      "learning_rate": 0.0019761100474893693,
+      "loss": 3.2528,
+      "step": 1404
+    },
+    {
+      "epoch": 0.09774253017496261,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0019760610630930593,
+      "loss": 2.9884,
+      "step": 1405
+    },
+    {
+      "epoch": 0.0978120978120978,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0019760120291372877,
+      "loss": 3.2232,
+      "step": 1406
+    },
+    {
+      "epoch": 0.09788166544923302,
+      "grad_norm": 0.875,
+      "learning_rate": 0.001975962945624545,
+      "loss": 3.3636,
+      "step": 1407
+    },
+    {
+      "epoch": 0.09795123308636822,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0019759138125573232,
+      "loss": 3.3866,
+      "step": 1408
+    },
+    {
+      "epoch": 0.09802080072350343,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0019758646299381168,
+      "loss": 3.3013,
+      "step": 1409
+    },
+    {
+      "epoch": 0.09809036836063863,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0019758153977694234,
+      "loss": 3.3126,
+      "step": 1410
+    },
+    {
+      "epoch": 0.09815993599777384,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.001975766116053743,
+      "loss": 3.2412,
+      "step": 1411
+    },
+    {
+      "epoch": 0.09822950363490904,
+      "grad_norm": 0.61328125,
+      "learning_rate": 0.0019757167847935767,
+      "loss": 3.6484,
+      "step": 1412
+    },
+    {
+      "epoch": 0.09829907127204425,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00197566740399143,
+      "loss": 3.5801,
+      "step": 1413
+    },
+    {
+      "epoch": 0.09836863890917945,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0019756179736498108,
+      "loss": 3.2148,
+      "step": 1414
+    },
+    {
+      "epoch": 0.09843820654631466,
+      "grad_norm": 0.8203125,
+      "learning_rate": 0.001975568493771228,
+      "loss": 3.4016,
+      "step": 1415
+    },
+    {
+      "epoch": 0.09850777418344986,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.0019755189643581943,
+      "loss": 3.5892,
+      "step": 1416
+    },
+    {
+      "epoch": 0.09857734182058506,
+      "grad_norm": 0.8203125,
+      "learning_rate": 0.001975469385413225,
+      "loss": 3.1497,
+      "step": 1417
+    },
+    {
+      "epoch": 0.09864690945772027,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0019754197569388367,
+      "loss": 3.2365,
+      "step": 1418
+    },
+    {
+      "epoch": 0.09871647709485547,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00197537007893755,
+      "loss": 3.446,
+      "step": 1419
+    },
+    {
+      "epoch": 0.09878604473199068,
+      "grad_norm": 0.7421875,
+      "learning_rate": 0.001975320351411886,
+      "loss": 3.2635,
+      "step": 1420
+    },
+    {
+      "epoch": 0.09885561236912588,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0019752705743643715,
+      "loss": 3.2567,
+      "step": 1421
+    },
+    {
+      "epoch": 0.09892518000626109,
+      "grad_norm": 0.734375,
+      "learning_rate": 0.0019752207477975324,
+      "loss": 3.5977,
+      "step": 1422
+    },
+    {
+      "epoch": 0.09899474764339629,
+      "grad_norm": 0.875,
+      "learning_rate": 0.0019751708717138995,
+      "loss": 3.2144,
+      "step": 1423
+    },
+    {
+      "epoch": 0.0990643152805315,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0019751209461160045,
+      "loss": 3.4039,
+      "step": 1424
+    },
+    {
+      "epoch": 0.0991338829176667,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0019750709710063836,
+      "loss": 3.4512,
+      "step": 1425
+    },
+    {
+      "epoch": 0.09920345055480191,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.001975020946387573,
+      "loss": 3.3055,
+      "step": 1426
+    },
+    {
+      "epoch": 0.09927301819193711,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.001974970872262113,
+      "loss": 3.291,
+      "step": 1427
+    },
+    {
+      "epoch": 0.09934258582907232,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.001974920748632547,
+      "loss": 3.3701,
+      "step": 1428
+    },
+    {
+      "epoch": 0.09941215346620752,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0019748705755014188,
+      "loss": 3.3214,
+      "step": 1429
+    },
+    {
+      "epoch": 0.09948172110334272,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.001974820352871277,
+      "loss": 3.5112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.09955128874047793,
+      "grad_norm": 0.78125,
+      "learning_rate": 0.0019747700807446703,
+      "loss": 3.3854,
+      "step": 1431
+    },
+    {
+      "epoch": 0.09962085637761313,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0019747197591241526,
+      "loss": 3.2159,
+      "step": 1432
+    },
+    {
+      "epoch": 0.09969042401474834,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0019746693880122786,
+      "loss": 3.06,
+      "step": 1433
+    },
+    {
+      "epoch": 0.09975999165188354,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.001974618967411606,
+      "loss": 3.1673,
+      "step": 1434
+    },
+    {
+      "epoch": 0.09982955928901875,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.0019745684973246943,
+      "loss": 3.449,
+      "step": 1435
+    },
+    {
+      "epoch": 0.09989912692615395,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.0019745179777541063,
+      "loss": 3.5406,
+      "step": 1436
+    },
+    {
+      "epoch": 0.09996869456328916,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0019744674087024076,
+      "loss": 3.5287,
+      "step": 1437
+    },
+    {
+      "epoch": 0.10003826220042436,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0019744167901721657,
+      "loss": 3.2567,
+      "step": 1438
+    },
+    {
+      "epoch": 0.10010782983755957,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.0019743661221659505,
+      "loss": 3.0954,
+      "step": 1439
+    },
+    {
+      "epoch": 0.10017739747469477,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0019743154046863347,
+      "loss": 3.3116,
+      "step": 1440
+    },
+    {
+      "epoch": 0.10024696511182998,
+      "grad_norm": 0.77734375,
+      "learning_rate": 0.001974264637735894,
+      "loss": 3.5275,
+      "step": 1441
+    },
+    {
+      "epoch": 0.10031653274896518,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0019742138213172046,
+      "loss": 3.3459,
+      "step": 1442
+    },
+    {
+      "epoch": 0.10038610038610038,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0019741629554328485,
+      "loss": 3.5695,
+      "step": 1443
+    },
+    {
+      "epoch": 0.10045566802323559,
+      "grad_norm": 0.70703125,
+      "learning_rate": 0.0019741120400854077,
+      "loss": 3.4048,
+      "step": 1444
+    },
+    {
+      "epoch": 0.10052523566037079,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0019740610752774675,
+      "loss": 2.9705,
+      "step": 1445
+    },
+    {
+      "epoch": 0.100594803297506,
+      "grad_norm": 0.765625,
+      "learning_rate": 0.001974010061011615,
+      "loss": 3.5704,
+      "step": 1446
+    },
+    {
+      "epoch": 0.1006643709346412,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.0019739589972904417,
+      "loss": 3.2439,
+      "step": 1447
+    },
+    {
+      "epoch": 0.10073393857177641,
+      "grad_norm": 0.76953125,
+      "learning_rate": 0.001973907884116539,
+      "loss": 3.3124,
+      "step": 1448
+    },
+    {
+      "epoch": 0.10080350620891161,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.001973856721492503,
+      "loss": 3.5505,
+      "step": 1449
+    },
+    {
+      "epoch": 0.10087307384604682,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.001973805509420931,
+      "loss": 3.4526,
+      "step": 1450
+    },
+    {
+      "epoch": 0.10094264148318202,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0019737542479044243,
+      "loss": 3.2931,
+      "step": 1451
+    },
+    {
+      "epoch": 0.10101220912031723,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0019737029369455844,
+      "loss": 3.1317,
+      "step": 1452
+    },
+    {
+      "epoch": 0.10108177675745243,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0019736515765470175,
+      "loss": 3.0454,
+      "step": 1453
+    },
+    {
+      "epoch": 0.10115134439458764,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0019736001667113308,
+      "loss": 3.2824,
+      "step": 1454
+    },
+    {
+      "epoch": 0.10122091203172284,
+      "grad_norm": 0.7265625,
+      "learning_rate": 0.001973548707441135,
+      "loss": 3.1641,
+      "step": 1455
+    },
+    {
+      "epoch": 0.10129047966885804,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0019734971987390433,
+      "loss": 3.6603,
+      "step": 1456
+    },
+    {
+      "epoch": 0.10136004730599325,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.00197344564060767,
+      "loss": 3.3129,
+      "step": 1457
+    },
+    {
+      "epoch": 0.10142961494312845,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.001973394033049634,
+      "loss": 3.5636,
+      "step": 1458
+    },
+    {
+      "epoch": 0.10149918258026366,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.001973342376067555,
+      "loss": 3.5552,
+      "step": 1459
+    },
+    {
+      "epoch": 0.10156875021739886,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.001973290669664057,
+      "loss": 3.288,
+      "step": 1460
+    },
+    {
+      "epoch": 0.10163831785453407,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.0019732389138417635,
+      "loss": 3.0768,
+      "step": 1461
+    },
+    {
+      "epoch": 0.10170788549166927,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.001973187108603304,
+      "loss": 3.5058,
+      "step": 1462
+    },
+    {
+      "epoch": 0.10177745312880448,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.001973135253951308,
+      "loss": 3.5485,
+      "step": 1463
+    },
+    {
+      "epoch": 0.10184702076593968,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.001973083349888409,
+      "loss": 3.6063,
+      "step": 1464
+    },
+    {
+      "epoch": 0.1019165884030749,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.001973031396417242,
+      "loss": 3.6008,
+      "step": 1465
+    },
+    {
+      "epoch": 0.1019861560402101,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.001972979393540445,
+      "loss": 3.136,
+      "step": 1466
+    },
+    {
+      "epoch": 0.1020557236773453,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0019729273412606592,
+      "loss": 3.3279,
+      "step": 1467
+    },
+    {
+      "epoch": 0.1021252913144805,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0019728752395805267,
+      "loss": 3.2092,
+      "step": 1468
+    },
+    {
+      "epoch": 0.1021948589516157,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.001972823088502693,
+      "loss": 3.56,
+      "step": 1469
+    },
+    {
+      "epoch": 0.10226442658875091,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0019727708880298064,
+      "loss": 3.3485,
+      "step": 1470
+    },
+    {
+      "epoch": 0.10233399422588611,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.001972718638164517,
+      "loss": 3.2539,
+      "step": 1471
+    },
+    {
+      "epoch": 0.10240356186302133,
+      "grad_norm": 1.6328125,
+      "learning_rate": 0.0019726663389094783,
+      "loss": 3.4803,
+      "step": 1472
+    },
+    {
+      "epoch": 0.10247312950015652,
+      "grad_norm": 0.75,
+      "learning_rate": 0.0019726139902673454,
+      "loss": 3.5746,
+      "step": 1473
+    },
+    {
+      "epoch": 0.10254269713729174,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0019725615922407762,
+      "loss": 3.2785,
+      "step": 1474
+    },
+    {
+      "epoch": 0.10261226477442693,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.0019725091448324315,
+      "loss": 3.046,
+      "step": 1475
+    },
+    {
+      "epoch": 0.10268183241156215,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.0019724566480449745,
+      "loss": 3.5367,
+      "step": 1476
+    },
+    {
+      "epoch": 0.10275140004869734,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0019724041018810705,
+      "loss": 3.6516,
+      "step": 1477
+    },
+    {
+      "epoch": 0.10282096768583256,
+      "grad_norm": 0.765625,
+      "learning_rate": 0.001972351506343387,
+      "loss": 3.448,
+      "step": 1478
+    },
+    {
+      "epoch": 0.10289053532296775,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0019722988614345955,
+      "loss": 2.9195,
+      "step": 1479
+    },
+    {
+      "epoch": 0.10296010296010295,
+      "grad_norm": 0.734375,
+      "learning_rate": 0.0019722461671573682,
+      "loss": 3.3375,
+      "step": 1480
+    },
+    {
+      "epoch": 0.10302967059723817,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.0019721934235143817,
+      "loss": 3.1518,
+      "step": 1481
+    },
+    {
+      "epoch": 0.10309923823437336,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.001972140630508313,
+      "loss": 3.3298,
+      "step": 1482
+    },
+    {
+      "epoch": 0.10316880587150858,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0019720877881418426,
+      "loss": 3.4051,
+      "step": 1483
+    },
+    {
+      "epoch": 0.10323837350864377,
+      "grad_norm": 1.125,
+      "learning_rate": 0.001972034896417654,
+      "loss": 3.2962,
+      "step": 1484
+    },
+    {
+      "epoch": 0.10330794114577899,
+      "grad_norm": 0.75,
+      "learning_rate": 0.001971981955338433,
+      "loss": 3.2481,
+      "step": 1485
+    },
+    {
+      "epoch": 0.10337750878291418,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.001971928964906868,
+      "loss": 3.1045,
+      "step": 1486
+    },
+    {
+      "epoch": 0.1034470764200494,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.0019718759251256485,
+      "loss": 3.4065,
+      "step": 1487
+    },
+    {
+      "epoch": 0.1035166440571846,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.001971822835997468,
+      "loss": 3.0315,
+      "step": 1488
+    },
+    {
+      "epoch": 0.10358621169431981,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.001971769697525023,
+      "loss": 3.1924,
+      "step": 1489
+    },
+    {
+      "epoch": 0.103655779331455,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00197171650971101,
+      "loss": 3.1543,
+      "step": 1490
+    },
+    {
+      "epoch": 0.10372534696859022,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.001971663272558131,
+      "loss": 3.4982,
+      "step": 1491
+    },
+    {
+      "epoch": 0.10379491460572542,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.001971609986069088,
+      "loss": 3.4726,
+      "step": 1492
+    },
+    {
+      "epoch": 0.10386448224286061,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0019715566502465877,
+      "loss": 3.1342,
+      "step": 1493
+    },
+    {
+      "epoch": 0.10393404987999583,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.0019715032650933374,
+      "loss": 3.8033,
+      "step": 1494
+    },
+    {
+      "epoch": 0.10400361751713103,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.0019714498306120484,
+      "loss": 3.3207,
+      "step": 1495
+    },
+    {
+      "epoch": 0.10407318515426624,
+      "grad_norm": 0.65234375,
+      "learning_rate": 0.001971396346805433,
+      "loss": 3.5685,
+      "step": 1496
+    },
+    {
+      "epoch": 0.10414275279140144,
+      "grad_norm": 0.609375,
+      "learning_rate": 0.0019713428136762076,
+      "loss": 3.7468,
+      "step": 1497
+    },
+    {
+      "epoch": 0.10421232042853665,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00197128923122709,
+      "loss": 3.2513,
+      "step": 1498
+    },
+    {
+      "epoch": 0.10428188806567185,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0019712355994608013,
+      "loss": 3.2436,
+      "step": 1499
+    },
+    {
+      "epoch": 0.10435145570280706,
+      "grad_norm": 0.7734375,
+      "learning_rate": 0.0019711819183800636,
+      "loss": 3.3017,
+      "step": 1500
+    },
+    {
+      "epoch": 0.10442102333994226,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.0019711281879876037,
+      "loss": 2.9926,
+      "step": 1501
+    },
+    {
+      "epoch": 0.10449059097707747,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0019710744082861486,
+      "loss": 3.0992,
+      "step": 1502
+    },
+    {
+      "epoch": 0.10456015861421267,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0019710205792784303,
+      "loss": 3.4089,
+      "step": 1503
+    },
+    {
+      "epoch": 0.10462972625134788,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.001970966700967181,
+      "loss": 2.9906,
+      "step": 1504
+    },
+    {
+      "epoch": 0.10469929388848308,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0019709127733551365,
+      "loss": 3.3956,
+      "step": 1505
+    },
+    {
+      "epoch": 0.10476886152561828,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0019708587964450356,
+      "loss": 3.6315,
+      "step": 1506
+    },
+    {
+      "epoch": 0.10483842916275349,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0019708047702396182,
+      "loss": 3.4508,
+      "step": 1507
+    },
+    {
+      "epoch": 0.10490799679988869,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.001970750694741628,
+      "loss": 3.3902,
+      "step": 1508
+    },
+    {
+      "epoch": 0.1049775644370239,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.00197069656995381,
+      "loss": 3.3237,
+      "step": 1509
+    },
+    {
+      "epoch": 0.1050471320741591,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.001970642395878913,
+      "loss": 3.215,
+      "step": 1510
+    },
+    {
+      "epoch": 0.10511669971129431,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.001970588172519688,
+      "loss": 2.8556,
+      "step": 1511
+    },
+    {
+      "epoch": 0.10518626734842951,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.001970533899878887,
+      "loss": 3.1504,
+      "step": 1512
+    },
+    {
+      "epoch": 0.10525583498556472,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.0019704795779592666,
+      "loss": 3.3045,
+      "step": 1513
+    },
+    {
+      "epoch": 0.10532540262269992,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0019704252067635855,
+      "loss": 3.2279,
+      "step": 1514
+    },
+    {
+      "epoch": 0.10539497025983513,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.001970370786294603,
+      "loss": 2.9838,
+      "step": 1515
+    },
+    {
+      "epoch": 0.10546453789697033,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.0019703163165550835,
+      "loss": 3.3872,
+      "step": 1516
+    },
+    {
+      "epoch": 0.10553410553410554,
+      "grad_norm": 0.6796875,
+      "learning_rate": 0.0019702617975477918,
+      "loss": 3.2918,
+      "step": 1517
+    },
+    {
+      "epoch": 0.10560367317124074,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.001970207229275497,
+      "loss": 2.9289,
+      "step": 1518
+    },
+    {
+      "epoch": 0.10567324080837594,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.001970152611740969,
+      "loss": 3.4421,
+      "step": 1519
+    },
+    {
+      "epoch": 0.10574280844551115,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0019700979449469806,
+      "loss": 3.3153,
+      "step": 1520
+    },
+    {
+      "epoch": 0.10581237608264635,
+      "grad_norm": 0.58203125,
+      "learning_rate": 0.001970043228896309,
+      "loss": 3.7921,
+      "step": 1521
+    },
+    {
+      "epoch": 0.10588194371978156,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0019699884635917316,
+      "loss": 3.2177,
+      "step": 1522
+    },
+    {
+      "epoch": 0.10595151135691676,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.001969933649036029,
+      "loss": 3.3786,
+      "step": 1523
+    },
+    {
+      "epoch": 0.10602107899405197,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.0019698787852319845,
+      "loss": 3.679,
+      "step": 1524
+    },
+    {
+      "epoch": 0.10609064663118717,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.001969823872182384,
+      "loss": 2.9693,
+      "step": 1525
+    },
+    {
+      "epoch": 0.10616021426832238,
+      "grad_norm": 0.74609375,
+      "learning_rate": 0.0019697689098900155,
+      "loss": 3.2078,
+      "step": 1526
+    },
+    {
+      "epoch": 0.10622978190545758,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0019697138983576696,
+      "loss": 3.2731,
+      "step": 1527
+    },
+    {
+      "epoch": 0.10629934954259279,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.0019696588375881395,
+      "loss": 3.4905,
+      "step": 1528
+    },
+    {
+      "epoch": 0.10636891717972799,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.0019696037275842215,
+      "loss": 3.7366,
+      "step": 1529
+    },
+    {
+      "epoch": 0.1064384848168632,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.001969548568348713,
+      "loss": 3.1914,
+      "step": 1530
+    },
+    {
+      "epoch": 0.1065080524539984,
+      "grad_norm": 0.79296875,
+      "learning_rate": 0.0019694933598844153,
+      "loss": 3.5288,
+      "step": 1531
+    },
+    {
+      "epoch": 0.1065776200911336,
+      "grad_norm": 0.7265625,
+      "learning_rate": 0.0019694381021941316,
+      "loss": 3.1728,
+      "step": 1532
+    },
+    {
+      "epoch": 0.10664718772826881,
+      "grad_norm": 0.7265625,
+      "learning_rate": 0.0019693827952806673,
+      "loss": 3.5763,
+      "step": 1533
+    },
+    {
+      "epoch": 0.10671675536540401,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0019693274391468303,
+      "loss": 3.5042,
+      "step": 1534
+    },
+    {
+      "epoch": 0.10678632300253922,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.001969272033795432,
+      "loss": 2.8814,
+      "step": 1535
+    },
+    {
+      "epoch": 0.10685589063967442,
+      "grad_norm": 0.70703125,
+      "learning_rate": 0.0019692165792292854,
+      "loss": 3.2389,
+      "step": 1536
+    },
+    {
+      "epoch": 0.10692545827680963,
+      "grad_norm": 0.7109375,
+      "learning_rate": 0.001969161075451206,
+      "loss": 3.2213,
+      "step": 1537
+    },
+    {
+      "epoch": 0.10699502591394483,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.001969105522464012,
+      "loss": 3.6244,
+      "step": 1538
+    },
+    {
+      "epoch": 0.10706459355108004,
+      "grad_norm": 0.8125,
+      "learning_rate": 0.0019690499202705243,
+      "loss": 3.2784,
+      "step": 1539
+    },
+    {
+      "epoch": 0.10713416118821524,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.001968994268873566,
+      "loss": 3.08,
+      "step": 1540
+    },
+    {
+      "epoch": 0.10720372882535045,
+      "grad_norm": 0.8203125,
+      "learning_rate": 0.001968938568275963,
+      "loss": 2.9797,
+      "step": 1541
+    },
+    {
+      "epoch": 0.10727329646248565,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0019688828184805432,
+      "loss": 3.0292,
+      "step": 1542
+    },
+    {
+      "epoch": 0.10734286409962086,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0019688270194901376,
+      "loss": 2.9543,
+      "step": 1543
+    },
+    {
+      "epoch": 0.10741243173675606,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.001968771171307579,
+      "loss": 2.8975,
+      "step": 1544
+    },
+    {
+      "epoch": 0.10748199937389126,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.0019687152739357033,
+      "loss": 3.6984,
+      "step": 1545
+    },
+    {
+      "epoch": 0.10755156701102647,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0019686593273773485,
+      "loss": 3.5366,
+      "step": 1546
+    },
+    {
+      "epoch": 0.10762113464816167,
+      "grad_norm": 0.765625,
+      "learning_rate": 0.0019686033316353557,
+      "loss": 3.4072,
+      "step": 1547
+    },
+    {
+      "epoch": 0.10769070228529688,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.001968547286712568,
+      "loss": 3.4246,
+      "step": 1548
+    },
+    {
+      "epoch": 0.10776026992243208,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.0019684911926118307,
+      "loss": 3.1723,
+      "step": 1549
+    },
+    {
+      "epoch": 0.1078298375595673,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.001968435049335992,
+      "loss": 3.4788,
+      "step": 1550
+    },
+    {
+      "epoch": 0.10789940519670249,
+      "grad_norm": 0.75390625,
+      "learning_rate": 0.001968378856887903,
+      "loss": 3.4059,
+      "step": 1551
+    },
+    {
+      "epoch": 0.1079689728338377,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0019683226152704164,
+      "loss": 3.2771,
+      "step": 1552
+    },
+    {
+      "epoch": 0.1080385404709729,
+      "grad_norm": 1.125,
+      "learning_rate": 0.001968266324486389,
+      "loss": 3.1777,
+      "step": 1553
+    },
+    {
+      "epoch": 0.10810810810810811,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.001968209984538677,
+      "loss": 3.3148,
+      "step": 1554
+    },
+    {
+      "epoch": 0.10817767574524331,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0019681535954301425,
+      "loss": 3.3594,
+      "step": 1555
+    },
+    {
+      "epoch": 0.10824724338237851,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0019680971571636482,
+      "loss": 3.2512,
+      "step": 1556
+    },
+    {
+      "epoch": 0.10831681101951372,
+      "grad_norm": 0.62109375,
+      "learning_rate": 0.00196804066974206,
+      "loss": 3.4698,
+      "step": 1557
+    },
+    {
+      "epoch": 0.10838637865664892,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.001967984133168246,
+      "loss": 3.5944,
+      "step": 1558
+    },
+    {
+      "epoch": 0.10845594629378413,
+      "grad_norm": 0.6640625,
+      "learning_rate": 0.001967927547445076,
+      "loss": 3.4931,
+      "step": 1559
+    },
+    {
+      "epoch": 0.10852551393091933,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.001967870912575425,
+      "loss": 3.1851,
+      "step": 1560
+    },
+    {
+      "epoch": 0.10859508156805454,
+      "grad_norm": 0.71875,
+      "learning_rate": 0.0019678142285621666,
+      "loss": 3.4562,
+      "step": 1561
+    },
+    {
+      "epoch": 0.10866464920518974,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.00196775749540818,
+      "loss": 3.0386,
+      "step": 1562
+    },
+    {
+      "epoch": 0.10873421684232495,
+      "grad_norm": 1.75,
+      "learning_rate": 0.0019677007131163457,
+      "loss": 3.611,
+      "step": 1563
+    },
+    {
+      "epoch": 0.10880378447946015,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.001967643881689547,
+      "loss": 3.2333,
+      "step": 1564
+    },
+    {
+      "epoch": 0.10887335211659536,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0019675870011306687,
+      "loss": 3.495,
+      "step": 1565
+    },
+    {
+      "epoch": 0.10894291975373056,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0019675300714426004,
+      "loss": 3.0353,
+      "step": 1566
+    },
+    {
+      "epoch": 0.10901248739086578,
+      "grad_norm": 0.7421875,
+      "learning_rate": 0.001967473092628231,
+      "loss": 3.3026,
+      "step": 1567
+    },
+    {
+      "epoch": 0.10908205502800097,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.001967416064690455,
+      "loss": 3.4083,
+      "step": 1568
+    },
+    {
+      "epoch": 0.10915162266513617,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.001967358987632167,
+      "loss": 3.1005,
+      "step": 1569
+    },
+    {
+      "epoch": 0.10922119030227138,
+      "grad_norm": 0.8125,
+      "learning_rate": 0.001967301861456265,
+      "loss": 3.5104,
+      "step": 1570
+    },
+    {
+      "epoch": 0.10929075793940658,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.0019672446861656507,
+      "loss": 3.2699,
+      "step": 1571
+    },
+    {
+      "epoch": 0.1093603255765418,
+      "grad_norm": 0.6328125,
+      "learning_rate": 0.001967187461763226,
+      "loss": 3.2808,
+      "step": 1572
+    },
+    {
+      "epoch": 0.109429893213677,
+      "grad_norm": 0.79296875,
+      "learning_rate": 0.0019671301882518977,
+      "loss": 3.6146,
+      "step": 1573
+    },
+    {
+      "epoch": 0.1094994608508122,
+      "grad_norm": 0.73828125,
+      "learning_rate": 0.0019670728656345725,
+      "loss": 3.3131,
+      "step": 1574
+    },
+    {
+      "epoch": 0.1095690284879474,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0019670154939141616,
+      "loss": 3.522,
+      "step": 1575
+    },
+    {
+      "epoch": 0.10963859612508262,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0019669580730935785,
+      "loss": 3.5155,
+      "step": 1576
+    },
+    {
+      "epoch": 0.10970816376221781,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.001966900603175738,
+      "loss": 3.1026,
+      "step": 1577
+    },
+    {
+      "epoch": 0.10977773139935303,
+      "grad_norm": 0.72265625,
+      "learning_rate": 0.0019668430841635583,
+      "loss": 3.4919,
+      "step": 1578
+    },
+    {
+      "epoch": 0.10984729903648822,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0019667855160599604,
+      "loss": 3.2533,
+      "step": 1579
+    },
+    {
+      "epoch": 0.10991686667362344,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0019667278988678666,
+      "loss": 3.6415,
+      "step": 1580
+    },
+    {
+      "epoch": 0.10998643431075864,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.001966670232590203,
+      "loss": 3.059,
+      "step": 1581
+    },
+    {
+      "epoch": 0.11005600194789383,
+      "grad_norm": 0.7421875,
+      "learning_rate": 0.0019666125172298973,
+      "loss": 2.7372,
+      "step": 1582
+    },
+    {
+      "epoch": 0.11012556958502905,
+      "grad_norm": 0.73046875,
+      "learning_rate": 0.0019665547527898796,
+      "loss": 3.3197,
+      "step": 1583
+    },
+    {
+      "epoch": 0.11019513722216424,
+      "grad_norm": 0.6953125,
+      "learning_rate": 0.001966496939273084,
+      "loss": 2.9155,
+      "step": 1584
+    },
+    {
+      "epoch": 0.11026470485929946,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.001966439076682445,
+      "loss": 2.9173,
+      "step": 1585
+    },
+    {
+      "epoch": 0.11033427249643465,
+      "grad_norm": 0.640625,
+      "learning_rate": 0.001966381165020901,
+      "loss": 3.2683,
+      "step": 1586
+    },
+    {
+      "epoch": 0.11040384013356987,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0019663232042913923,
+      "loss": 3.6652,
+      "step": 1587
+    },
+    {
+      "epoch": 0.11047340777070506,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.0019662651944968622,
+      "loss": 3.0684,
+      "step": 1588
+    },
+    {
+      "epoch": 0.11054297540784028,
+      "grad_norm": 0.73046875,
+      "learning_rate": 0.0019662071356402557,
+      "loss": 3.464,
+      "step": 1589
+    },
+    {
+      "epoch": 0.11061254304497548,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0019661490277245205,
+      "loss": 3.1648,
+      "step": 1590
+    },
+    {
+      "epoch": 0.11068211068211069,
+      "grad_norm": 0.7578125,
+      "learning_rate": 0.001966090870752608,
+      "loss": 3.8514,
+      "step": 1591
+    },
+    {
+      "epoch": 0.11075167831924589,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00196603266472747,
+      "loss": 3.5695,
+      "step": 1592
+    },
+    {
+      "epoch": 0.1108212459563811,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.0019659744096520632,
+      "loss": 3.2381,
+      "step": 1593
+    },
+    {
+      "epoch": 0.1108908135935163,
+      "grad_norm": 0.7265625,
+      "learning_rate": 0.0019659161055293442,
+      "loss": 3.2566,
+      "step": 1594
+    },
+    {
+      "epoch": 0.1109603812306515,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.001965857752362274,
+      "loss": 3.4792,
+      "step": 1595
+    },
+    {
+      "epoch": 0.1110299488677867,
+      "grad_norm": 0.78125,
+      "learning_rate": 0.0019657993501538155,
+      "loss": 3.3937,
+      "step": 1596
+    },
+    {
+      "epoch": 0.1110995165049219,
+      "grad_norm": 0.7421875,
+      "learning_rate": 0.001965740898906934,
+      "loss": 3.25,
+      "step": 1597
+    },
+    {
+      "epoch": 0.11116908414205712,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.001965682398624597,
+      "loss": 3.3554,
+      "step": 1598
+    },
+    {
+      "epoch": 0.11123865177919232,
+      "grad_norm": 0.765625,
+      "learning_rate": 0.001965623849309776,
+      "loss": 3.114,
+      "step": 1599
+    },
+    {
+      "epoch": 0.11130821941632753,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0019655652509654423,
+      "loss": 3.368,
+      "step": 1600
+    },
+    {
+      "epoch": 0.11137778705346273,
+      "grad_norm": 0.7578125,
+      "learning_rate": 0.001965506603594572,
+      "loss": 3.2518,
+      "step": 1601
+    },
+    {
+      "epoch": 0.11144735469059794,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.001965447907200143,
+      "loss": 3.5058,
+      "step": 1602
+    },
+    {
+      "epoch": 0.11151692232773314,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0019653891617851357,
+      "loss": 3.2876,
+      "step": 1603
+    },
+    {
+      "epoch": 0.11158648996486835,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.001965330367352533,
+      "loss": 3.3141,
+      "step": 1604
+    },
+    {
+      "epoch": 0.11165605760200355,
+      "grad_norm": 0.8203125,
+      "learning_rate": 0.001965271523905319,
+      "loss": 3.6069,
+      "step": 1605
+    },
+    {
+      "epoch": 0.11172562523913876,
+      "grad_norm": 0.73828125,
+      "learning_rate": 0.001965212631446483,
+      "loss": 3.1281,
+      "step": 1606
+    },
+    {
+      "epoch": 0.11179519287627396,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0019651536899790143,
+      "loss": 2.982,
+      "step": 1607
+    },
+    {
+      "epoch": 0.11186476051340916,
+      "grad_norm": 0.79296875,
+      "learning_rate": 0.001965094699505906,
+      "loss": 3.2783,
+      "step": 1608
+    },
+    {
+      "epoch": 0.11193432815054437,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0019650356600301533,
+      "loss": 2.854,
+      "step": 1609
+    },
+    {
+      "epoch": 0.11200389578767957,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.001964976571554754,
+      "loss": 3.1584,
+      "step": 1610
+    },
+    {
+      "epoch": 0.11207346342481478,
+      "grad_norm": 0.765625,
+      "learning_rate": 0.001964917434082708,
+      "loss": 2.9245,
+      "step": 1611
+    },
+    {
+      "epoch": 0.11214303106194998,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.0019648582476170184,
+      "loss": 3.347,
+      "step": 1612
+    },
+    {
+      "epoch": 0.11221259869908519,
+      "grad_norm": 0.71484375,
+      "learning_rate": 0.0019647990121606906,
+      "loss": 3.3861,
+      "step": 1613
+    },
+    {
+      "epoch": 0.11228216633622039,
+      "grad_norm": 0.765625,
+      "learning_rate": 0.0019647397277167316,
+      "loss": 3.1979,
+      "step": 1614
+    },
+    {
+      "epoch": 0.1123517339733556,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.0019646803942881515,
+      "loss": 3.4472,
+      "step": 1615
+    },
+    {
+      "epoch": 0.1124213016104908,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0019646210118779636,
+      "loss": 3.1445,
+      "step": 1616
+    },
+    {
+      "epoch": 0.11249086924762601,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0019645615804891833,
+      "loss": 3.1762,
+      "step": 1617
+    },
+    {
+      "epoch": 0.11256043688476121,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.001964502100124827,
+      "loss": 3.0012,
+      "step": 1618
+    },
+    {
+      "epoch": 0.1126300045218964,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.001964442570787916,
+      "loss": 3.1677,
+      "step": 1619
+    },
+    {
+      "epoch": 0.11269957215903162,
+      "grad_norm": 0.71875,
+      "learning_rate": 0.001964382992481472,
+      "loss": 3.3573,
+      "step": 1620
+    },
+    {
+      "epoch": 0.11276913979616682,
+      "grad_norm": 0.734375,
+      "learning_rate": 0.0019643233652085206,
+      "loss": 3.5359,
+      "step": 1621
+    },
+    {
+      "epoch": 0.11283870743330203,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0019642636889720894,
+      "loss": 2.8434,
+      "step": 1622
+    },
+    {
+      "epoch": 0.11290827507043723,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.001964203963775208,
+      "loss": 3.3141,
+      "step": 1623
+    },
+    {
+      "epoch": 0.11297784270757244,
+      "grad_norm": 0.765625,
+      "learning_rate": 0.00196414418962091,
+      "loss": 3.6836,
+      "step": 1624
+    },
+    {
+      "epoch": 0.11304741034470764,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0019640843665122286,
+      "loss": 3.5512,
+      "step": 1625
+    },
+    {
+      "epoch": 0.11311697798184285,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0019640244944522036,
+      "loss": 2.9838,
+      "step": 1626
+    },
+    {
+      "epoch": 0.11318654561897805,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.001963964573443873,
+      "loss": 3.2729,
+      "step": 1627
+    },
+    {
+      "epoch": 0.11325611325611326,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00196390460349028,
+      "loss": 3.4352,
+      "step": 1628
+    },
+    {
+      "epoch": 0.11332568089324846,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0019638445845944702,
+      "loss": 3.5427,
+      "step": 1629
+    },
+    {
+      "epoch": 0.11339524853038367,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0019637845167594903,
+      "loss": 3.2987,
+      "step": 1630
+    },
+    {
+      "epoch": 0.11346481616751887,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0019637243999883905,
+      "loss": 3.2223,
+      "step": 1631
+    },
+    {
+      "epoch": 0.11353438380465407,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.001963664234284223,
+      "loss": 3.177,
+      "step": 1632
+    },
+    {
+      "epoch": 0.11360395144178928,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.0019636040196500436,
+      "loss": 3.2046,
+      "step": 1633
+    },
+    {
+      "epoch": 0.11367351907892448,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.0019635437560889084,
+      "loss": 3.163,
+      "step": 1634
+    },
+    {
+      "epoch": 0.11374308671605969,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.001963483443603878,
+      "loss": 3.1626,
+      "step": 1635
+    },
+    {
+      "epoch": 0.11381265435319489,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0019634230821980146,
+      "loss": 3.3732,
+      "step": 1636
+    },
+    {
+      "epoch": 0.1138822219903301,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.001963362671874383,
+      "loss": 3.3927,
+      "step": 1637
+    },
+    {
+      "epoch": 0.1139517896274653,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.0019633022126360512,
+      "loss": 3.0315,
+      "step": 1638
+    },
+    {
+      "epoch": 0.11402135726460051,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0019632417044860876,
+      "loss": 3.236,
+      "step": 1639
+    },
+    {
+      "epoch": 0.11409092490173571,
+      "grad_norm": 0.74609375,
+      "learning_rate": 0.001963181147427566,
+      "loss": 3.0346,
+      "step": 1640
+    },
+    {
+      "epoch": 0.11416049253887092,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0019631205414635602,
+      "loss": 3.0258,
+      "step": 1641
+    },
+    {
+      "epoch": 0.11423006017600612,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.001963059886597148,
+      "loss": 3.1226,
+      "step": 1642
+    },
+    {
+      "epoch": 0.11429962781314133,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.001962999182831409,
+      "loss": 3.1423,
+      "step": 1643
+    },
+    {
+      "epoch": 0.11436919545027653,
+      "grad_norm": 0.734375,
+      "learning_rate": 0.0019629384301694253,
+      "loss": 3.4149,
+      "step": 1644
+    },
+    {
+      "epoch": 0.11443876308741173,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0019628776286142813,
+      "loss": 3.0182,
+      "step": 1645
+    },
+    {
+      "epoch": 0.11450833072454694,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.001962816778169065,
+      "loss": 3.2926,
+      "step": 1646
+    },
+    {
+      "epoch": 0.11457789836168214,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0019627558788368657,
+      "loss": 3.243,
+      "step": 1647
+    },
+    {
+      "epoch": 0.11464746599881735,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.001962694930620775,
+      "loss": 3.2997,
+      "step": 1648
+    },
+    {
+      "epoch": 0.11471703363595255,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.001962633933523889,
+      "loss": 2.6841,
+      "step": 1649
+    },
+    {
+      "epoch": 0.11478660127308776,
+      "grad_norm": 0.5078125,
+      "learning_rate": 0.001962572887549303,
+      "loss": 3.6762,
+      "step": 1650
+    },
+    {
+      "epoch": 0.11485616891022296,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.001962511792700118,
+      "loss": 3.12,
+      "step": 1651
+    },
+    {
+      "epoch": 0.11492573654735817,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.001962450648979435,
+      "loss": 3.3223,
+      "step": 1652
+    },
+    {
+      "epoch": 0.11499530418449337,
+      "grad_norm": 0.734375,
+      "learning_rate": 0.0019623894563903597,
+      "loss": 3.5976,
+      "step": 1653
+    },
+    {
+      "epoch": 0.11506487182162858,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0019623282149359984,
+      "loss": 2.9595,
+      "step": 1654
+    },
+    {
+      "epoch": 0.11513443945876378,
+      "grad_norm": 0.703125,
+      "learning_rate": 0.001962266924619461,
+      "loss": 3.4546,
+      "step": 1655
+    },
+    {
+      "epoch": 0.115204007095899,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.001962205585443859,
+      "loss": 3.3012,
+      "step": 1656
+    },
+    {
+      "epoch": 0.11527357473303419,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0019621441974123077,
+      "loss": 3.1979,
+      "step": 1657
+    },
+    {
+      "epoch": 0.11534314237016939,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0019620827605279236,
+      "loss": 3.0767,
+      "step": 1658
+    },
+    {
+      "epoch": 0.1154127100073046,
+      "grad_norm": 0.78125,
+      "learning_rate": 0.001962021274793826,
+      "loss": 3.2852,
+      "step": 1659
+    },
+    {
+      "epoch": 0.1154822776444398,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.001961959740213137,
+      "loss": 3.336,
+      "step": 1660
+    },
+    {
+      "epoch": 0.11555184528157501,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.001961898156788981,
+      "loss": 2.9833,
+      "step": 1661
+    },
+    {
+      "epoch": 0.11562141291871021,
+      "grad_norm": 0.73828125,
+      "learning_rate": 0.001961836524524485,
+      "loss": 3.4473,
+      "step": 1662
+    },
+    {
+      "epoch": 0.11569098055584542,
+      "grad_norm": 0.6953125,
+      "learning_rate": 0.001961774843422778,
+      "loss": 3.2512,
+      "step": 1663
+    },
+    {
+      "epoch": 0.11576054819298062,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0019617131134869927,
+      "loss": 3.3328,
+      "step": 1664
+    },
+    {
+      "epoch": 0.11583011583011583,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0019616513347202624,
+      "loss": 3.4149,
+      "step": 1665
+    },
+    {
+      "epoch": 0.11589968346725103,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.001961589507125725,
+      "loss": 3.1448,
+      "step": 1666
+    },
+    {
+      "epoch": 0.11596925110438625,
+      "grad_norm": 0.765625,
+      "learning_rate": 0.0019615276307065185,
+      "loss": 3.1483,
+      "step": 1667
+    },
+    {
+      "epoch": 0.11603881874152144,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.0019614657054657855,
+      "loss": 3.2344,
+      "step": 1668
+    },
+    {
+      "epoch": 0.11610838637865666,
+      "grad_norm": 0.67578125,
+      "learning_rate": 0.0019614037314066705,
+      "loss": 3.5726,
+      "step": 1669
+    },
+    {
+      "epoch": 0.11617795401579185,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.0019613417085323193,
+      "loss": 3.5865,
+      "step": 1670
+    },
+    {
+      "epoch": 0.11624752165292705,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0019612796368458827,
+      "loss": 3.2813,
+      "step": 1671
+    },
+    {
+      "epoch": 0.11631708929006226,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.0019612175163505104,
+      "loss": 3.1217,
+      "step": 1672
+    },
+    {
+      "epoch": 0.11638665692719746,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.0019611553470493576,
+      "loss": 2.907,
+      "step": 1673
+    },
+    {
+      "epoch": 0.11645622456433267,
+      "grad_norm": 0.8203125,
+      "learning_rate": 0.0019610931289455813,
+      "loss": 3.2636,
+      "step": 1674
+    },
+    {
+      "epoch": 0.11652579220146787,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.0019610308620423397,
+      "loss": 3.4404,
+      "step": 1675
+    },
+    {
+      "epoch": 0.11659535983860309,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0019609685463427952,
+      "loss": 3.5559,
+      "step": 1676
+    },
+    {
+      "epoch": 0.11666492747573828,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0019609061818501115,
+      "loss": 3.477,
+      "step": 1677
+    },
+    {
+      "epoch": 0.1167344951128735,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.001960843768567455,
+      "loss": 3.286,
+      "step": 1678
+    },
+    {
+      "epoch": 0.1168040627500087,
+      "grad_norm": 0.70703125,
+      "learning_rate": 0.0019607813064979954,
+      "loss": 3.4537,
+      "step": 1679
+    },
+    {
+      "epoch": 0.1168736303871439,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0019607187956449034,
+      "loss": 3.4741,
+      "step": 1680
+    },
+    {
+      "epoch": 0.1169431980242791,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0019606562360113535,
+      "loss": 3.4188,
+      "step": 1681
+    },
+    {
+      "epoch": 0.11701276566141432,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.0019605936276005215,
+      "loss": 3.1084,
+      "step": 1682
+    },
+    {
+      "epoch": 0.11708233329854952,
+      "grad_norm": 0.60546875,
+      "learning_rate": 0.0019605309704155876,
+      "loss": 3.5886,
+      "step": 1683
+    },
+    {
+      "epoch": 0.11715190093568471,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.001960468264459732,
+      "loss": 3.1211,
+      "step": 1684
+    },
+    {
+      "epoch": 0.11722146857281993,
+      "grad_norm": 0.6796875,
+      "learning_rate": 0.0019604055097361393,
+      "loss": 3.3281,
+      "step": 1685
+    },
+    {
+      "epoch": 0.11729103620995512,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0019603427062479953,
+      "loss": 3.4789,
+      "step": 1686
+    },
+    {
+      "epoch": 0.11736060384709034,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.001960279853998489,
+      "loss": 3.2314,
+      "step": 1687
+    },
+    {
+      "epoch": 0.11743017148422553,
+      "grad_norm": 0.6953125,
+      "learning_rate": 0.0019602169529908124,
+      "loss": 3.8185,
+      "step": 1688
+    },
+    {
+      "epoch": 0.11749973912136075,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.001960154003228159,
+      "loss": 3.1382,
+      "step": 1689
+    },
+    {
+      "epoch": 0.11756930675849595,
+      "grad_norm": 0.79296875,
+      "learning_rate": 0.0019600910047137244,
+      "loss": 3.1299,
+      "step": 1690
+    },
+    {
+      "epoch": 0.11763887439563116,
+      "grad_norm": 0.71484375,
+      "learning_rate": 0.0019600279574507077,
+      "loss": 3.1588,
+      "step": 1691
+    },
+    {
+      "epoch": 0.11770844203276636,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.00195996486144231,
+      "loss": 3.1371,
+      "step": 1692
+    },
+    {
+      "epoch": 0.11777800966990157,
+      "grad_norm": 0.74609375,
+      "learning_rate": 0.001959901716691736,
+      "loss": 3.4681,
+      "step": 1693
+    },
+    {
+      "epoch": 0.11784757730703677,
+      "grad_norm": 0.67578125,
+      "learning_rate": 0.0019598385232021905,
+      "loss": 3.3351,
+      "step": 1694
+    },
+    {
+      "epoch": 0.11791714494417196,
+      "grad_norm": 0.69921875,
+      "learning_rate": 0.0019597752809768832,
+      "loss": 3.4628,
+      "step": 1695
+    },
+    {
+      "epoch": 0.11798671258130718,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.0019597119900190245,
+      "loss": 3.2477,
+      "step": 1696
+    },
+    {
+      "epoch": 0.11805628021844237,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.001959648650331828,
+      "loss": 3.3238,
+      "step": 1697
+    },
+    {
+      "epoch": 0.11812584785557759,
+      "grad_norm": 0.75,
+      "learning_rate": 0.00195958526191851,
+      "loss": 3.4188,
+      "step": 1698
+    },
+    {
+      "epoch": 0.11819541549271279,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00195952182478229,
+      "loss": 3.3663,
+      "step": 1699
+    },
+    {
+      "epoch": 0.118264983129848,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0019594583389263872,
+      "loss": 3.1903,
+      "step": 1700
+    },
+    {
+      "epoch": 0.1183345507669832,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.001959394804354026,
+      "loss": 3.4655,
+      "step": 1701
+    },
+    {
+      "epoch": 0.11840411840411841,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0019593312210684326,
+      "loss": 3.4838,
+      "step": 1702
+    },
+    {
+      "epoch": 0.1184736860412536,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.001959267589072835,
+      "loss": 3.4469,
+      "step": 1703
+    },
+    {
+      "epoch": 0.11854325367838882,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.0019592039083704644,
+      "loss": 3.0336,
+      "step": 1704
+    },
+    {
+      "epoch": 0.11861282131552402,
+      "grad_norm": 0.734375,
+      "learning_rate": 0.001959140178964554,
+      "loss": 3.3108,
+      "step": 1705
+    },
+    {
+      "epoch": 0.11868238895265923,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00195907640085834,
+      "loss": 3.0878,
+      "step": 1706
+    },
+    {
+      "epoch": 0.11875195658979443,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00195901257405506,
+      "loss": 3.0738,
+      "step": 1707
+    },
+    {
+      "epoch": 0.11882152422692963,
+      "grad_norm": 0.75390625,
+      "learning_rate": 0.0019589486985579557,
+      "loss": 3.4438,
+      "step": 1708
+    },
+    {
+      "epoch": 0.11889109186406484,
+      "grad_norm": 0.56640625,
+      "learning_rate": 0.00195888477437027,
+      "loss": 3.465,
+      "step": 1709
+    },
+    {
+      "epoch": 0.11896065950120004,
+      "grad_norm": 0.5703125,
+      "learning_rate": 0.001958820801495248,
+      "loss": 3.4594,
+      "step": 1710
+    },
+    {
+      "epoch": 0.11903022713833525,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.001958756779936139,
+      "loss": 3.4865,
+      "step": 1711
+    },
+    {
+      "epoch": 0.11909979477547045,
+      "grad_norm": 0.75,
+      "learning_rate": 0.0019586927096961935,
+      "loss": 3.2511,
+      "step": 1712
+    },
+    {
+      "epoch": 0.11916936241260566,
+      "grad_norm": 0.6875,
+      "learning_rate": 0.001958628590778664,
+      "loss": 3.3198,
+      "step": 1713
+    },
+    {
+      "epoch": 0.11923893004974086,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0019585644231868062,
+      "loss": 3.6062,
+      "step": 1714
+    },
+    {
+      "epoch": 0.11930849768687607,
+      "grad_norm": 0.7421875,
+      "learning_rate": 0.001958500206923879,
+      "loss": 3.5153,
+      "step": 1715
+    },
+    {
+      "epoch": 0.11937806532401127,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.001958435941993142,
+      "loss": 3.3921,
+      "step": 1716
+    },
+    {
+      "epoch": 0.11944763296114648,
+      "grad_norm": 0.7734375,
+      "learning_rate": 0.0019583716283978593,
+      "loss": 3.3137,
+      "step": 1717
+    },
+    {
+      "epoch": 0.11951720059828168,
+      "grad_norm": 0.8203125,
+      "learning_rate": 0.0019583072661412955,
+      "loss": 3.4524,
+      "step": 1718
+    },
+    {
+      "epoch": 0.11958676823541689,
+      "grad_norm": 0.62109375,
+      "learning_rate": 0.001958242855226719,
+      "loss": 3.3938,
+      "step": 1719
+    },
+    {
+      "epoch": 0.11965633587255209,
+      "grad_norm": 0.70703125,
+      "learning_rate": 0.0019581783956574006,
+      "loss": 3.168,
+      "step": 1720
+    },
+    {
+      "epoch": 0.11972590350968729,
+      "grad_norm": 0.55859375,
+      "learning_rate": 0.001958113887436612,
+      "loss": 3.6354,
+      "step": 1721
+    },
+    {
+      "epoch": 0.1197954711468225,
+      "grad_norm": 0.734375,
+      "learning_rate": 0.00195804933056763,
+      "loss": 3.1484,
+      "step": 1722
+    },
+    {
+      "epoch": 0.1198650387839577,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0019579847250537318,
+      "loss": 2.7735,
+      "step": 1723
+    },
+    {
+      "epoch": 0.11993460642109291,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.001957920070898198,
+      "loss": 2.9981,
+      "step": 1724
+    },
+    {
+      "epoch": 0.12000417405822811,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0019578553681043115,
+      "loss": 2.9309,
+      "step": 1725
+    },
+    {
+      "epoch": 0.12007374169536332,
+      "grad_norm": 0.71875,
+      "learning_rate": 0.001957790616675357,
+      "loss": 3.4824,
+      "step": 1726
+    },
+    {
+      "epoch": 0.12014330933249852,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.0019577258166146227,
+      "loss": 3.1838,
+      "step": 1727
+    },
+    {
+      "epoch": 0.12021287696963373,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.0019576609679253986,
+      "loss": 2.8742,
+      "step": 1728
+    },
+    {
+      "epoch": 0.12028244460676893,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.001957596070610978,
+      "loss": 3.5897,
+      "step": 1729
+    },
+    {
+      "epoch": 0.12035201224390414,
+      "grad_norm": 0.62109375,
+      "learning_rate": 0.001957531124674655,
+      "loss": 3.6366,
+      "step": 1730
+    },
+    {
+      "epoch": 0.12042157988103934,
+      "grad_norm": 0.74609375,
+      "learning_rate": 0.001957466130119728,
+      "loss": 3.0287,
+      "step": 1731
+    },
+    {
+      "epoch": 0.12049114751817455,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.0019574010869494968,
+      "loss": 3.2798,
+      "step": 1732
+    },
+    {
+      "epoch": 0.12056071515530975,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.0019573359951672643,
+      "loss": 3.4167,
+      "step": 1733
+    },
+    {
+      "epoch": 0.12063028279244495,
+      "grad_norm": 2.75,
+      "learning_rate": 0.001957270854776335,
+      "loss": 3.3749,
+      "step": 1734
+    },
+    {
+      "epoch": 0.12069985042958016,
+      "grad_norm": 0.7265625,
+      "learning_rate": 0.001957205665780017,
+      "loss": 3.7976,
+      "step": 1735
+    },
+    {
+      "epoch": 0.12076941806671536,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00195714042818162,
+      "loss": 3.2681,
+      "step": 1736
+    },
+    {
+      "epoch": 0.12083898570385057,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.001957075141984456,
+      "loss": 3.0821,
+      "step": 1737
+    },
+    {
+      "epoch": 0.12090855334098577,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0019570098071918407,
+      "loss": 3.477,
+      "step": 1738
+    },
+    {
+      "epoch": 0.12097812097812098,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.001956944423807091,
+      "loss": 3.3121,
+      "step": 1739
+    },
+    {
+      "epoch": 0.12104768861525618,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0019568789918335268,
+      "loss": 3.0542,
+      "step": 1740
+    },
+    {
+      "epoch": 0.12111725625239139,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0019568135112744698,
+      "loss": 3.432,
+      "step": 1741
+    },
+    {
+      "epoch": 0.12118682388952659,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0019567479821332463,
+      "loss": 2.9629,
+      "step": 1742
+    },
+    {
+      "epoch": 0.1212563915266618,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.001956682404413182,
+      "loss": 3.4069,
+      "step": 1743
+    },
+    {
+      "epoch": 0.121325959163797,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.0019566167781176077,
+      "loss": 3.2643,
+      "step": 1744
+    },
+    {
+      "epoch": 0.12139552680093221,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.001956551103249855,
+      "loss": 3.8109,
+      "step": 1745
+    },
+    {
+      "epoch": 0.12146509443806741,
+      "grad_norm": 0.70703125,
+      "learning_rate": 0.0019564853798132585,
+      "loss": 3.5165,
+      "step": 1746
+    },
+    {
+      "epoch": 0.12153466207520261,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0019564196078111556,
+      "loss": 3.4767,
+      "step": 1747
+    },
+    {
+      "epoch": 0.12160422971233782,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0019563537872468854,
+      "loss": 3.2814,
+      "step": 1748
+    },
+    {
+      "epoch": 0.12167379734947302,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0019562879181237907,
+      "loss": 3.188,
+      "step": 1749
+    },
+    {
+      "epoch": 0.12174336498660823,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0019562220004452156,
+      "loss": 3.3306,
+      "step": 1750
+    },
+    {
+      "epoch": 0.12181293262374343,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.001956156034214507,
+      "loss": 3.3211,
+      "step": 1751
+    },
+    {
+      "epoch": 0.12188250026087864,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0019560900194350137,
+      "loss": 3.0687,
+      "step": 1752
+    },
+    {
+      "epoch": 0.12195206789801384,
+      "grad_norm": 0.671875,
+      "learning_rate": 0.001956023956110089,
+      "loss": 3.1071,
+      "step": 1753
+    },
+    {
+      "epoch": 0.12202163553514905,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.0019559578442430864,
+      "loss": 3.1057,
+      "step": 1754
+    },
+    {
+      "epoch": 0.12209120317228425,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.0019558916838373626,
+      "loss": 3.3788,
+      "step": 1755
+    },
+    {
+      "epoch": 0.12216077080941946,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.0019558254748962773,
+      "loss": 3.0311,
+      "step": 1756
+    },
+    {
+      "epoch": 0.12223033844655466,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.001955759217423192,
+      "loss": 3.4973,
+      "step": 1757
+    },
+    {
+      "epoch": 0.12229990608368987,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.001955692911421471,
+      "loss": 3.0044,
+      "step": 1758
+    },
+    {
+      "epoch": 0.12236947372082507,
+      "grad_norm": 0.7578125,
+      "learning_rate": 0.0019556265568944813,
+      "loss": 2.8627,
+      "step": 1759
+    },
+    {
+      "epoch": 0.12243904135796027,
+      "grad_norm": 0.70703125,
+      "learning_rate": 0.0019555601538455915,
+      "loss": 3.5029,
+      "step": 1760
+    },
+    {
+      "epoch": 0.12250860899509548,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0019554937022781735,
+      "loss": 3.1952,
+      "step": 1761
+    },
+    {
+      "epoch": 0.12257817663223068,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0019554272021956014,
+      "loss": 3.7242,
+      "step": 1762
+    },
+    {
+      "epoch": 0.1226477442693659,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.001955360653601252,
+      "loss": 3.31,
+      "step": 1763
+    },
+    {
+      "epoch": 0.12271731190650109,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.0019552940564985036,
+      "loss": 3.3173,
+      "step": 1764
+    },
+    {
+      "epoch": 0.1227868795436363,
+      "grad_norm": 0.79296875,
+      "learning_rate": 0.001955227410890738,
+      "loss": 3.2139,
+      "step": 1765
+    },
+    {
+      "epoch": 0.1228564471807715,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.001955160716781339,
+      "loss": 3.0229,
+      "step": 1766
+    },
+    {
+      "epoch": 0.12292601481790671,
+      "grad_norm": 0.71484375,
+      "learning_rate": 0.0019550939741736937,
+      "loss": 3.0964,
+      "step": 1767
+    },
+    {
+      "epoch": 0.12299558245504191,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00195502718307119,
+      "loss": 3.3713,
+      "step": 1768
+    },
+    {
+      "epoch": 0.12306515009217713,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0019549603434772197,
+      "loss": 3.3651,
+      "step": 1769
+    },
+    {
+      "epoch": 0.12313471772931232,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.001954893455395177,
+      "loss": 2.9908,
+      "step": 1770
+    },
+    {
+      "epoch": 0.12320428536644752,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0019548265188284574,
+      "loss": 2.9317,
+      "step": 1771
+    },
+    {
+      "epoch": 0.12327385300358273,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0019547595337804594,
+      "loss": 2.8074,
+      "step": 1772
+    },
+    {
+      "epoch": 0.12334342064071793,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.001954692500254585,
+      "loss": 3.339,
+      "step": 1773
+    },
+    {
+      "epoch": 0.12341298827785314,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0019546254182542374,
+      "loss": 3.0529,
+      "step": 1774
+    },
+    {
+      "epoch": 0.12348255591498834,
+      "grad_norm": 0.78125,
+      "learning_rate": 0.001954558287782823,
+      "loss": 3.3201,
+      "step": 1775
+    },
+    {
+      "epoch": 0.12355212355212356,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0019544911088437496,
+      "loss": 3.211,
+      "step": 1776
+    },
+    {
+      "epoch": 0.12362169118925875,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0019544238814404287,
+      "loss": 3.4603,
+      "step": 1777
+    },
+    {
+      "epoch": 0.12369125882639397,
+      "grad_norm": 0.73828125,
+      "learning_rate": 0.0019543566055762744,
+      "loss": 3.3202,
+      "step": 1778
+    },
+    {
+      "epoch": 0.12376082646352916,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0019542892812547015,
+      "loss": 3.4099,
+      "step": 1779
+    },
+    {
+      "epoch": 0.12383039410066438,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0019542219084791286,
+      "loss": 3.6499,
+      "step": 1780
+    },
+    {
+      "epoch": 0.12389996173779957,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.001954154487252977,
+      "loss": 3.2814,
+      "step": 1781
+    },
+    {
+      "epoch": 0.12396952937493479,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.00195408701757967,
+      "loss": 3.1144,
+      "step": 1782
+    },
+    {
+      "epoch": 0.12403909701206998,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.001954019499462633,
+      "loss": 3.2145,
+      "step": 1783
+    },
+    {
+      "epoch": 0.12410866464920518,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.001953951932905295,
+      "loss": 2.9224,
+      "step": 1784
+    },
+    {
+      "epoch": 0.1241782322863404,
+      "grad_norm": 0.62890625,
+      "learning_rate": 0.0019538843179110854,
+      "loss": 3.5563,
+      "step": 1785
+    },
+    {
+      "epoch": 0.1242477999234756,
+      "grad_norm": 0.73828125,
+      "learning_rate": 0.0019538166544834385,
+      "loss": 3.7433,
+      "step": 1786
+    },
+    {
+      "epoch": 0.1243173675606108,
+      "grad_norm": 0.66796875,
+      "learning_rate": 0.0019537489426257894,
+      "loss": 3.5424,
+      "step": 1787
+    },
+    {
+      "epoch": 0.124386935197746,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.001953681182341576,
+      "loss": 3.1177,
+      "step": 1788
+    },
+    {
+      "epoch": 0.12445650283488122,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0019536133736342393,
+      "loss": 3.1545,
+      "step": 1789
+    },
+    {
+      "epoch": 0.12452607047201641,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.001953545516507222,
+      "loss": 3.2966,
+      "step": 1790
+    },
+    {
+      "epoch": 0.12459563810915163,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.00195347761096397,
+      "loss": 3.0389,
+      "step": 1791
+    },
+    {
+      "epoch": 0.12466520574628683,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0019534096570079304,
+      "loss": 3.2605,
+      "step": 1792
+    },
+    {
+      "epoch": 0.12473477338342204,
+      "grad_norm": 0.734375,
+      "learning_rate": 0.001953341654642554,
+      "loss": 3.5102,
+      "step": 1793
+    },
+    {
+      "epoch": 0.12480434102055724,
+      "grad_norm": 0.6875,
+      "learning_rate": 0.0019532736038712934,
+      "loss": 3.6672,
+      "step": 1794
+    },
+    {
+      "epoch": 0.12487390865769245,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0019532055046976044,
+      "loss": 3.2388,
+      "step": 1795
+    },
+    {
+      "epoch": 0.12494347629482765,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.001953137357124944,
+      "loss": 3.2163,
+      "step": 1796
+    },
+    {
+      "epoch": 0.12501304393196286,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.001953069161156773,
+      "loss": 3.3399,
+      "step": 1797
+    },
+    {
+      "epoch": 0.12508261156909806,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0019530009167965537,
+      "loss": 3.4525,
+      "step": 1798
+    },
+    {
+      "epoch": 0.12515217920623325,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.0019529326240477513,
+      "loss": 2.9544,
+      "step": 1799
+    },
+    {
+      "epoch": 0.12522174684336845,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0019528642829138338,
+      "loss": 3.4445,
+      "step": 1800
+    },
+    {
+      "epoch": 0.12529131448050368,
+      "grad_norm": 0.74609375,
+      "learning_rate": 0.0019527958933982703,
+      "loss": 3.1022,
+      "step": 1801
+    },
+    {
+      "epoch": 0.12536088211763888,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.001952727455504534,
+      "loss": 3.1783,
+      "step": 1802
+    },
+    {
+      "epoch": 0.12543044975477408,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.0019526589692360997,
+      "loss": 3.3324,
+      "step": 1803
+    },
+    {
+      "epoch": 0.12550001739190927,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0019525904345964445,
+      "loss": 3.2979,
+      "step": 1804
+    },
+    {
+      "epoch": 0.1255695850290445,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.0019525218515890487,
+      "loss": 3.2694,
+      "step": 1805
+    },
+    {
+      "epoch": 0.1256391526661797,
+      "grad_norm": 0.703125,
+      "learning_rate": 0.0019524532202173938,
+      "loss": 3.6337,
+      "step": 1806
+    },
+    {
+      "epoch": 0.1257087203033149,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0019523845404849655,
+      "loss": 3.0394,
+      "step": 1807
+    },
+    {
+      "epoch": 0.1257782879404501,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.0019523158123952507,
+      "loss": 3.2475,
+      "step": 1808
+    },
+    {
+      "epoch": 0.1258478555775853,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.001952247035951739,
+      "loss": 3.5361,
+      "step": 1809
+    },
+    {
+      "epoch": 0.12591742321472052,
+      "grad_norm": 0.75,
+      "learning_rate": 0.0019521782111579223,
+      "loss": 3.2208,
+      "step": 1810
+    },
+    {
+      "epoch": 0.12598699085185572,
+      "grad_norm": 0.74609375,
+      "learning_rate": 0.0019521093380172954,
+      "loss": 3.0137,
+      "step": 1811
+    },
+    {
+      "epoch": 0.12605655848899092,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0019520404165333555,
+      "loss": 3.4475,
+      "step": 1812
+    },
+    {
+      "epoch": 0.12612612612612611,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0019519714467096016,
+      "loss": 3.3839,
+      "step": 1813
+    },
+    {
+      "epoch": 0.12619569376326134,
+      "grad_norm": 0.65625,
+      "learning_rate": 0.0019519024285495359,
+      "loss": 3.4224,
+      "step": 1814
+    },
+    {
+      "epoch": 0.12626526140039654,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.0019518333620566631,
+      "loss": 3.4469,
+      "step": 1815
+    },
+    {
+      "epoch": 0.12633482903753174,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0019517642472344895,
+      "loss": 3.385,
+      "step": 1816
+    },
+    {
+      "epoch": 0.12640439667466694,
+      "grad_norm": 0.77734375,
+      "learning_rate": 0.0019516950840865249,
+      "loss": 3.0788,
+      "step": 1817
+    },
+    {
+      "epoch": 0.12647396431180216,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0019516258726162807,
+      "loss": 2.922,
+      "step": 1818
+    },
+    {
+      "epoch": 0.12654353194893736,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0019515566128272713,
+      "loss": 3.7069,
+      "step": 1819
+    },
+    {
+      "epoch": 0.12661309958607256,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.0019514873047230133,
+      "loss": 3.5041,
+      "step": 1820
+    },
+    {
+      "epoch": 0.12668266722320776,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0019514179483070258,
+      "loss": 3.3119,
+      "step": 1821
+    },
+    {
+      "epoch": 0.12675223486034295,
+      "grad_norm": 0.76171875,
+      "learning_rate": 0.0019513485435828303,
+      "loss": 3.5025,
+      "step": 1822
+    },
+    {
+      "epoch": 0.12682180249747818,
+      "grad_norm": 0.765625,
+      "learning_rate": 0.001951279090553951,
+      "loss": 3.1403,
+      "step": 1823
+    },
+    {
+      "epoch": 0.12689137013461338,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.0019512095892239144,
+      "loss": 3.0786,
+      "step": 1824
+    },
+    {
+      "epoch": 0.12696093777174858,
+      "grad_norm": 0.7421875,
+      "learning_rate": 0.001951140039596249,
+      "loss": 3.2466,
+      "step": 1825
+    },
+    {
+      "epoch": 0.12703050540888378,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.001951070441674487,
+      "loss": 3.1296,
+      "step": 1826
+    },
+    {
+      "epoch": 0.127100073046019,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.0019510007954621612,
+      "loss": 3.5657,
+      "step": 1827
+    },
+    {
+      "epoch": 0.1271696406831542,
+      "grad_norm": 0.77734375,
+      "learning_rate": 0.0019509311009628086,
+      "loss": 3.0529,
+      "step": 1828
+    },
+    {
+      "epoch": 0.1272392083202894,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.0019508613581799676,
+      "loss": 3.0305,
+      "step": 1829
+    },
+    {
+      "epoch": 0.1273087759574246,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0019507915671171797,
+      "loss": 3.1983,
+      "step": 1830
+    },
+    {
+      "epoch": 0.12737834359455982,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.0019507217277779884,
+      "loss": 3.1657,
+      "step": 1831
+    },
+    {
+      "epoch": 0.12744791123169502,
+      "grad_norm": 0.66015625,
+      "learning_rate": 0.0019506518401659397,
+      "loss": 3.5715,
+      "step": 1832
+    },
+    {
+      "epoch": 0.12751747886883022,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0019505819042845822,
+      "loss": 3.2889,
+      "step": 1833
+    },
+    {
+      "epoch": 0.12758704650596542,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.001950511920137467,
+      "loss": 3.3527,
+      "step": 1834
+    },
+    {
+      "epoch": 0.12765661414310062,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.001950441887728147,
+      "loss": 3.7539,
+      "step": 1835
+    },
+    {
+      "epoch": 0.12772618178023584,
+      "grad_norm": 0.78125,
+      "learning_rate": 0.0019503718070601791,
+      "loss": 3.2217,
+      "step": 1836
+    },
+    {
+      "epoch": 0.12779574941737104,
+      "grad_norm": 0.78125,
+      "learning_rate": 0.0019503016781371209,
+      "loss": 3.275,
+      "step": 1837
+    },
+    {
+      "epoch": 0.12786531705450624,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0019502315009625331,
+      "loss": 3.4357,
+      "step": 1838
+    },
+    {
+      "epoch": 0.12793488469164144,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.0019501612755399795,
+      "loss": 3.3004,
+      "step": 1839
+    },
+    {
+      "epoch": 0.12800445232877666,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0019500910018730253,
+      "loss": 3.1225,
+      "step": 1840
+    },
+    {
+      "epoch": 0.12807401996591186,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0019500206799652386,
+      "loss": 3.3034,
+      "step": 1841
+    },
+    {
+      "epoch": 0.12814358760304706,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0019499503098201908,
+      "loss": 3.1774,
+      "step": 1842
+    },
+    {
+      "epoch": 0.12821315524018226,
+      "grad_norm": 0.79296875,
+      "learning_rate": 0.001949879891441454,
+      "loss": 2.9553,
+      "step": 1843
+    },
+    {
+      "epoch": 0.12828272287731748,
+      "grad_norm": 0.73046875,
+      "learning_rate": 0.0019498094248326043,
+      "loss": 3.0137,
+      "step": 1844
+    },
+    {
+      "epoch": 0.12835229051445268,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.0019497389099972192,
+      "loss": 3.063,
+      "step": 1845
+    },
+    {
+      "epoch": 0.12842185815158788,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0019496683469388794,
+      "loss": 3.5581,
+      "step": 1846
+    },
+    {
+      "epoch": 0.12849142578872308,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0019495977356611674,
+      "loss": 3.3502,
+      "step": 1847
+    },
+    {
+      "epoch": 0.12856099342585828,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.001949527076167669,
+      "loss": 3.3218,
+      "step": 1848
+    },
+    {
+      "epoch": 0.1286305610629935,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.0019494563684619715,
+      "loss": 3.4957,
+      "step": 1849
+    },
+    {
+      "epoch": 0.1287001287001287,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0019493856125476652,
+      "loss": 3.7629,
+      "step": 1850
+    },
+    {
+      "epoch": 0.1287696963372639,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0019493148084283427,
+      "loss": 3.0125,
+      "step": 1851
+    },
+    {
+      "epoch": 0.1288392639743991,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0019492439561075994,
+      "loss": 3.1882,
+      "step": 1852
+    },
+    {
+      "epoch": 0.12890883161153432,
+      "grad_norm": 0.74609375,
+      "learning_rate": 0.0019491730555890323,
+      "loss": 3.1547,
+      "step": 1853
+    },
+    {
+      "epoch": 0.12897839924866952,
+      "grad_norm": 0.8125,
+      "learning_rate": 0.0019491021068762417,
+      "loss": 3.2827,
+      "step": 1854
+    },
+    {
+      "epoch": 0.12904796688580472,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00194903110997283,
+      "loss": 3.4886,
+      "step": 1855
+    },
+    {
+      "epoch": 0.12911753452293992,
+      "grad_norm": 0.73828125,
+      "learning_rate": 0.001948960064882402,
+      "loss": 3.4038,
+      "step": 1856
+    },
+    {
+      "epoch": 0.12918710216007515,
+      "grad_norm": 0.76953125,
+      "learning_rate": 0.0019488889716085648,
+      "loss": 3.119,
+      "step": 1857
+    },
+    {
+      "epoch": 0.12925666979721034,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.0019488178301549286,
+      "loss": 3.3197,
+      "step": 1858
+    },
+    {
+      "epoch": 0.12932623743434554,
+      "grad_norm": 0.73828125,
+      "learning_rate": 0.0019487466405251053,
+      "loss": 3.5417,
+      "step": 1859
+    },
+    {
+      "epoch": 0.12939580507148074,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0019486754027227098,
+      "loss": 3.174,
+      "step": 1860
+    },
+    {
+      "epoch": 0.12946537270861594,
+      "grad_norm": 0.76953125,
+      "learning_rate": 0.0019486041167513588,
+      "loss": 3.2131,
+      "step": 1861
+    },
+    {
+      "epoch": 0.12953494034575117,
+      "grad_norm": 0.72265625,
+      "learning_rate": 0.0019485327826146723,
+      "loss": 3.4978,
+      "step": 1862
+    },
+    {
+      "epoch": 0.12960450798288636,
+      "grad_norm": 0.67578125,
+      "learning_rate": 0.0019484614003162717,
+      "loss": 3.408,
+      "step": 1863
+    },
+    {
+      "epoch": 0.12967407562002156,
+      "grad_norm": 0.6953125,
+      "learning_rate": 0.0019483899698597821,
+      "loss": 3.3175,
+      "step": 1864
+    },
+    {
+      "epoch": 0.12974364325715676,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0019483184912488301,
+      "loss": 2.8108,
+      "step": 1865
+    },
+    {
+      "epoch": 0.12981321089429199,
+      "grad_norm": 0.6875,
+      "learning_rate": 0.001948246964487045,
+      "loss": 3.226,
+      "step": 1866
+    },
+    {
+      "epoch": 0.12988277853142718,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.0019481753895780583,
+      "loss": 3.2306,
+      "step": 1867
+    },
+    {
+      "epoch": 0.12995234616856238,
+      "grad_norm": 0.7265625,
+      "learning_rate": 0.0019481037665255046,
+      "loss": 3.292,
+      "step": 1868
+    },
+    {
+      "epoch": 0.13002191380569758,
+      "grad_norm": 0.7265625,
+      "learning_rate": 0.0019480320953330205,
+      "loss": 3.6482,
+      "step": 1869
+    },
+    {
+      "epoch": 0.1300914814428328,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.0019479603760042448,
+      "loss": 3.6202,
+      "step": 1870
+    },
+    {
+      "epoch": 0.130161049079968,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0019478886085428195,
+      "loss": 2.8485,
+      "step": 1871
+    },
+    {
+      "epoch": 0.1302306167171032,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0019478167929523884,
+      "loss": 3.5329,
+      "step": 1872
+    },
+    {
+      "epoch": 0.1303001843542384,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.0019477449292365978,
+      "loss": 3.2885,
+      "step": 1873
+    },
+    {
+      "epoch": 0.1303697519913736,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.001947673017399097,
+      "loss": 3.0415,
+      "step": 1874
+    },
+    {
+      "epoch": 0.13043931962850883,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.0019476010574435364,
+      "loss": 3.4042,
+      "step": 1875
+    },
+    {
+      "epoch": 0.13050888726564402,
+      "grad_norm": 0.66796875,
+      "learning_rate": 0.001947529049373571,
+      "loss": 3.4087,
+      "step": 1876
+    },
+    {
+      "epoch": 0.13057845490277922,
+      "grad_norm": 0.7265625,
+      "learning_rate": 0.0019474569931928558,
+      "loss": 3.4739,
+      "step": 1877
+    },
+    {
+      "epoch": 0.13064802253991442,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0019473848889050504,
+      "loss": 3.1786,
+      "step": 1878
+    },
+    {
+      "epoch": 0.13071759017704965,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.0019473127365138155,
+      "loss": 3.2386,
+      "step": 1879
+    },
+    {
+      "epoch": 0.13078715781418485,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.0019472405360228145,
+      "loss": 3.5112,
+      "step": 1880
+    },
+    {
+      "epoch": 0.13085672545132004,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0019471682874357135,
+      "loss": 3.318,
+      "step": 1881
+    },
+    {
+      "epoch": 0.13092629308845524,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.0019470959907561811,
+      "loss": 3.6096,
+      "step": 1882
+    },
+    {
+      "epoch": 0.13099586072559047,
+      "grad_norm": 0.875,
+      "learning_rate": 0.0019470236459878877,
+      "loss": 2.9816,
+      "step": 1883
+    },
+    {
+      "epoch": 0.13106542836272567,
+      "grad_norm": 0.78125,
+      "learning_rate": 0.0019469512531345072,
+      "loss": 2.9925,
+      "step": 1884
+    },
+    {
+      "epoch": 0.13113499599986086,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.001946878812199715,
+      "loss": 3.3357,
+      "step": 1885
+    },
+    {
+      "epoch": 0.13120456363699606,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.0019468063231871896,
+      "loss": 3.3221,
+      "step": 1886
+    },
+    {
+      "epoch": 0.13127413127413126,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.001946733786100611,
+      "loss": 3.0446,
+      "step": 1887
+    },
+    {
+      "epoch": 0.1313436989112665,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0019466612009436627,
+      "loss": 3.4744,
+      "step": 1888
+    },
+    {
+      "epoch": 0.13141326654840169,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.00194658856772003,
+      "loss": 3.3944,
+      "step": 1889
+    },
+    {
+      "epoch": 0.13148283418553688,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.001946515886433401,
+      "loss": 3.1389,
+      "step": 1890
+    },
+    {
+      "epoch": 0.13155240182267208,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0019464431570874665,
+      "loss": 3.3595,
+      "step": 1891
+    },
+    {
+      "epoch": 0.1316219694598073,
+      "grad_norm": 0.76171875,
+      "learning_rate": 0.0019463703796859188,
+      "loss": 3.3208,
+      "step": 1892
+    },
+    {
+      "epoch": 0.1316915370969425,
+      "grad_norm": 0.69921875,
+      "learning_rate": 0.001946297554232453,
+      "loss": 3.5098,
+      "step": 1893
+    },
+    {
+      "epoch": 0.1317611047340777,
+      "grad_norm": 0.8125,
+      "learning_rate": 0.0019462246807307672,
+      "loss": 3.2446,
+      "step": 1894
+    },
+    {
+      "epoch": 0.1318306723712129,
+      "grad_norm": 0.6640625,
+      "learning_rate": 0.0019461517591845615,
+      "loss": 3.3563,
+      "step": 1895
+    },
+    {
+      "epoch": 0.13190024000834813,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.0019460787895975386,
+      "loss": 3.5702,
+      "step": 1896
+    },
+    {
+      "epoch": 0.13196980764548333,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.001946005771973403,
+      "loss": 3.3963,
+      "step": 1897
+    },
+    {
+      "epoch": 0.13203937528261853,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0019459327063158628,
+      "loss": 3.2952,
+      "step": 1898
+    },
+    {
+      "epoch": 0.13210894291975372,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0019458595926286272,
+      "loss": 3.0189,
+      "step": 1899
+    },
+    {
+      "epoch": 0.13217851055688892,
+      "grad_norm": 0.59375,
+      "learning_rate": 0.0019457864309154094,
+      "loss": 3.6055,
+      "step": 1900
+    },
+    {
+      "epoch": 0.13224807819402415,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.0019457132211799235,
+      "loss": 2.9506,
+      "step": 1901
+    },
+    {
+      "epoch": 0.13231764583115935,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0019456399634258871,
+      "loss": 3.2754,
+      "step": 1902
+    },
+    {
+      "epoch": 0.13238721346829455,
+      "grad_norm": 0.8125,
+      "learning_rate": 0.00194556665765702,
+      "loss": 3.2409,
+      "step": 1903
+    },
+    {
+      "epoch": 0.13245678110542974,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0019454933038770435,
+      "loss": 3.1072,
+      "step": 1904
+    },
+    {
+      "epoch": 0.13252634874256497,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.001945419902089683,
+      "loss": 2.9871,
+      "step": 1905
+    },
+    {
+      "epoch": 0.13259591637970017,
+      "grad_norm": 0.72265625,
+      "learning_rate": 0.001945346452298665,
+      "loss": 3.0467,
+      "step": 1906
+    },
+    {
+      "epoch": 0.13266548401683537,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.0019452729545077192,
+      "loss": 3.4325,
+      "step": 1907
+    },
+    {
+      "epoch": 0.13273505165397056,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.001945199408720577,
+      "loss": 3.1134,
+      "step": 1908
+    },
+    {
+      "epoch": 0.1328046192911058,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0019451258149409735,
+      "loss": 3.4843,
+      "step": 1909
+    },
+    {
+      "epoch": 0.132874186928241,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.0019450521731726447,
+      "loss": 3.384,
+      "step": 1910
+    },
+    {
+      "epoch": 0.1329437545653762,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0019449784834193297,
+      "loss": 3.2625,
+      "step": 1911
+    },
+    {
+      "epoch": 0.13301332220251139,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.0019449047456847706,
+      "loss": 3.519,
+      "step": 1912
+    },
+    {
+      "epoch": 0.13308288983964658,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0019448309599727112,
+      "loss": 3.3985,
+      "step": 1913
+    },
+    {
+      "epoch": 0.1331524574767818,
+      "grad_norm": 0.72265625,
+      "learning_rate": 0.001944757126286898,
+      "loss": 3.4797,
+      "step": 1914
+    },
+    {
+      "epoch": 0.133222025113917,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0019446832446310793,
+      "loss": 3.2945,
+      "step": 1915
+    },
+    {
+      "epoch": 0.1332915927510522,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0019446093150090075,
+      "loss": 3.3423,
+      "step": 1916
+    },
+    {
+      "epoch": 0.1333611603881874,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.0019445353374244359,
+      "loss": 3.6269,
+      "step": 1917
+    },
+    {
+      "epoch": 0.13343072802532263,
+      "grad_norm": 0.7421875,
+      "learning_rate": 0.0019444613118811205,
+      "loss": 3.362,
+      "step": 1918
+    },
+    {
+      "epoch": 0.13350029566245783,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0019443872383828203,
+      "loss": 3.3338,
+      "step": 1919
+    },
+    {
+      "epoch": 0.13356986329959303,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.0019443131169332962,
+      "loss": 3.3511,
+      "step": 1920
+    },
+    {
+      "epoch": 0.13363943093672823,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0019442389475363116,
+      "loss": 3.5065,
+      "step": 1921
+    },
+    {
+      "epoch": 0.13370899857386345,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0019441647301956324,
+      "loss": 3.3673,
+      "step": 1922
+    },
+    {
+      "epoch": 0.13377856621099865,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0019440904649150276,
+      "loss": 3.0948,
+      "step": 1923
+    },
+    {
+      "epoch": 0.13384813384813385,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0019440161516982668,
+      "loss": 3.6924,
+      "step": 1924
+    },
+    {
+      "epoch": 0.13391770148526905,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.0019439417905491247,
+      "loss": 3.0385,
+      "step": 1925
+    },
+    {
+      "epoch": 0.13398726912240425,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0019438673814713761,
+      "loss": 3.1771,
+      "step": 1926
+    },
+    {
+      "epoch": 0.13405683675953947,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.001943792924468799,
+      "loss": 3.7504,
+      "step": 1927
+    },
+    {
+      "epoch": 0.13412640439667467,
+      "grad_norm": 0.875,
+      "learning_rate": 0.0019437184195451747,
+      "loss": 3.4519,
+      "step": 1928
+    },
+    {
+      "epoch": 0.13419597203380987,
+      "grad_norm": 0.78125,
+      "learning_rate": 0.0019436438667042855,
+      "loss": 3.4553,
+      "step": 1929
+    },
+    {
+      "epoch": 0.13426553967094507,
+      "grad_norm": 0.7734375,
+      "learning_rate": 0.0019435692659499173,
+      "loss": 3.4122,
+      "step": 1930
+    },
+    {
+      "epoch": 0.1343351073080803,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.0019434946172858577,
+      "loss": 3.4642,
+      "step": 1931
+    },
+    {
+      "epoch": 0.1344046749452155,
+      "grad_norm": 0.69140625,
+      "learning_rate": 0.0019434199207158968,
+      "loss": 3.3712,
+      "step": 1932
+    },
+    {
+      "epoch": 0.1344742425823507,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.0019433451762438274,
+      "loss": 3.2983,
+      "step": 1933
+    },
+    {
+      "epoch": 0.1345438102194859,
+      "grad_norm": 0.77734375,
+      "learning_rate": 0.0019432703838734452,
+      "loss": 3.5689,
+      "step": 1934
+    },
+    {
+      "epoch": 0.1346133778566211,
+      "grad_norm": 0.6875,
+      "learning_rate": 0.0019431955436085468,
+      "loss": 3.0071,
+      "step": 1935
+    },
+    {
+      "epoch": 0.1346829454937563,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.0019431206554529333,
+      "loss": 3.5376,
+      "step": 1936
+    },
+    {
+      "epoch": 0.1347525131308915,
+      "grad_norm": 0.7265625,
+      "learning_rate": 0.0019430457194104063,
+      "loss": 3.5035,
+      "step": 1937
+    },
+    {
+      "epoch": 0.1348220807680267,
+      "grad_norm": 0.64453125,
+      "learning_rate": 0.0019429707354847712,
+      "loss": 3.1914,
+      "step": 1938
+    },
+    {
+      "epoch": 0.1348916484051619,
+      "grad_norm": 0.6484375,
+      "learning_rate": 0.0019428957036798347,
+      "loss": 3.9197,
+      "step": 1939
+    },
+    {
+      "epoch": 0.13496121604229713,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.001942820623999407,
+      "loss": 3.2672,
+      "step": 1940
+    },
+    {
+      "epoch": 0.13503078367943233,
+      "grad_norm": 0.8125,
+      "learning_rate": 0.0019427454964473006,
+      "loss": 3.0891,
+      "step": 1941
+    },
+    {
+      "epoch": 0.13510035131656753,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0019426703210273294,
+      "loss": 2.8527,
+      "step": 1942
+    },
+    {
+      "epoch": 0.13516991895370273,
+      "grad_norm": 0.78125,
+      "learning_rate": 0.0019425950977433105,
+      "loss": 2.8761,
+      "step": 1943
+    },
+    {
+      "epoch": 0.13523948659083795,
+      "grad_norm": 0.73828125,
+      "learning_rate": 0.0019425198265990637,
+      "loss": 3.564,
+      "step": 1944
+    },
+    {
+      "epoch": 0.13530905422797315,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.001942444507598411,
+      "loss": 3.5785,
+      "step": 1945
+    },
+    {
+      "epoch": 0.13537862186510835,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.0019423691407451761,
+      "loss": 3.4398,
+      "step": 1946
+    },
+    {
+      "epoch": 0.13544818950224355,
+      "grad_norm": 0.8125,
+      "learning_rate": 0.0019422937260431864,
+      "loss": 3.2611,
+      "step": 1947
+    },
+    {
+      "epoch": 0.13551775713937875,
+      "grad_norm": 0.65234375,
+      "learning_rate": 0.0019422182634962707,
+      "loss": 3.4193,
+      "step": 1948
+    },
+    {
+      "epoch": 0.13558732477651397,
+      "grad_norm": 0.76171875,
+      "learning_rate": 0.0019421427531082606,
+      "loss": 3.2257,
+      "step": 1949
+    },
+    {
+      "epoch": 0.13565689241364917,
+      "grad_norm": 0.71484375,
+      "learning_rate": 0.0019420671948829904,
+      "loss": 3.2713,
+      "step": 1950
+    },
+    {
+      "epoch": 0.13572646005078437,
+      "grad_norm": 0.7421875,
+      "learning_rate": 0.0019419915888242963,
+      "loss": 3.2238,
+      "step": 1951
+    },
+    {
+      "epoch": 0.13579602768791957,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0019419159349360173,
+      "loss": 3.5063,
+      "step": 1952
+    },
+    {
+      "epoch": 0.1358655953250548,
+      "grad_norm": 0.62109375,
+      "learning_rate": 0.0019418402332219951,
+      "loss": 3.765,
+      "step": 1953
+    },
+    {
+      "epoch": 0.13593516296219,
+      "grad_norm": 0.64453125,
+      "learning_rate": 0.0019417644836860727,
+      "loss": 3.1648,
+      "step": 1954
+    },
+    {
+      "epoch": 0.1360047305993252,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.0019416886863320968,
+      "loss": 3.2006,
+      "step": 1955
+    },
+    {
+      "epoch": 0.1360742982364604,
+      "grad_norm": 0.703125,
+      "learning_rate": 0.001941612841163916,
+      "loss": 3.1752,
+      "step": 1956
+    },
+    {
+      "epoch": 0.13614386587359562,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0019415369481853811,
+      "loss": 3.1891,
+      "step": 1957
+    },
+    {
+      "epoch": 0.1362134335107308,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0019414610074003455,
+      "loss": 3.3473,
+      "step": 1958
+    },
+    {
+      "epoch": 0.136283001147866,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.001941385018812665,
+      "loss": 3.1638,
+      "step": 1959
+    },
+    {
+      "epoch": 0.1363525687850012,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0019413089824261989,
+      "loss": 3.2753,
+      "step": 1960
+    },
+    {
+      "epoch": 0.1364221364221364,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0019412328982448069,
+      "loss": 3.0721,
+      "step": 1961
+    },
+    {
+      "epoch": 0.13649170405927163,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0019411567662723523,
+      "loss": 3.5696,
+      "step": 1962
+    },
+    {
+      "epoch": 0.13656127169640683,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.001941080586512701,
+      "loss": 3.1816,
+      "step": 1963
+    },
+    {
+      "epoch": 0.13663083933354203,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.001941004358969721,
+      "loss": 3.2473,
+      "step": 1964
+    },
+    {
+      "epoch": 0.13670040697067723,
+      "grad_norm": 0.77734375,
+      "learning_rate": 0.0019409280836472829,
+      "loss": 3.4571,
+      "step": 1965
+    },
+    {
+      "epoch": 0.13676997460781246,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.0019408517605492592,
+      "loss": 3.5023,
+      "step": 1966
+    },
+    {
+      "epoch": 0.13683954224494765,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.001940775389679525,
+      "loss": 3.7084,
+      "step": 1967
+    },
+    {
+      "epoch": 0.13690910988208285,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0019406989710419587,
+      "loss": 3.5293,
+      "step": 1968
+    },
+    {
+      "epoch": 0.13697867751921805,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00194062250464044,
+      "loss": 3.2683,
+      "step": 1969
+    },
+    {
+      "epoch": 0.13704824515635328,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.0019405459904788516,
+      "loss": 3.4587,
+      "step": 1970
+    },
+    {
+      "epoch": 0.13711781279348847,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0019404694285610783,
+      "loss": 2.7896,
+      "step": 1971
+    },
+    {
+      "epoch": 0.13718738043062367,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.0019403928188910082,
+      "loss": 3.2769,
+      "step": 1972
+    },
+    {
+      "epoch": 0.13725694806775887,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.00194031616147253,
+      "loss": 3.0715,
+      "step": 1973
+    },
+    {
+      "epoch": 0.13732651570489407,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0019402394563095373,
+      "loss": 3.2322,
+      "step": 1974
+    },
+    {
+      "epoch": 0.1373960833420293,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.001940162703405924,
+      "loss": 3.543,
+      "step": 1975
+    },
+    {
+      "epoch": 0.1374656509791645,
+      "grad_norm": 0.79296875,
+      "learning_rate": 0.0019400859027655876,
+      "loss": 3.3874,
+      "step": 1976
+    },
+    {
+      "epoch": 0.1375352186162997,
+      "grad_norm": 0.73046875,
+      "learning_rate": 0.0019400090543924271,
+      "loss": 3.2165,
+      "step": 1977
+    },
+    {
+      "epoch": 0.1376047862534349,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.0019399321582903451,
+      "loss": 2.9563,
+      "step": 1978
+    },
+    {
+      "epoch": 0.13767435389057012,
+      "grad_norm": 0.71484375,
+      "learning_rate": 0.0019398552144632454,
+      "loss": 3.2973,
+      "step": 1979
+    },
+    {
+      "epoch": 0.13774392152770532,
+      "grad_norm": 0.7265625,
+      "learning_rate": 0.0019397782229150355,
+      "loss": 3.3801,
+      "step": 1980
+    },
+    {
+      "epoch": 0.1378134891648405,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.001939701183649624,
+      "loss": 3.2854,
+      "step": 1981
+    },
+    {
+      "epoch": 0.1378830568019757,
+      "grad_norm": 0.75390625,
+      "learning_rate": 0.0019396240966709226,
+      "loss": 3.5824,
+      "step": 1982
+    },
+    {
+      "epoch": 0.13795262443911094,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.001939546961982846,
+      "loss": 3.2963,
+      "step": 1983
+    },
+    {
+      "epoch": 0.13802219207624614,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.0019394697795893103,
+      "loss": 3.2933,
+      "step": 1984
+    },
+    {
+      "epoch": 0.13809175971338133,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0019393925494942345,
+      "loss": 3.187,
+      "step": 1985
+    },
+    {
+      "epoch": 0.13816132735051653,
+      "grad_norm": 0.68359375,
+      "learning_rate": 0.0019393152717015396,
+      "loss": 3.6,
+      "step": 1986
+    },
+    {
+      "epoch": 0.13823089498765173,
+      "grad_norm": 0.76171875,
+      "learning_rate": 0.0019392379462151502,
+      "loss": 3.9195,
+      "step": 1987
+    },
+    {
+      "epoch": 0.13830046262478696,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.0019391605730389916,
+      "loss": 3.3988,
+      "step": 1988
+    },
+    {
+      "epoch": 0.13837003026192216,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0019390831521769929,
+      "loss": 3.3932,
+      "step": 1989
+    },
+    {
+      "epoch": 0.13843959789905735,
+      "grad_norm": 0.7734375,
+      "learning_rate": 0.0019390056836330852,
+      "loss": 3.1819,
+      "step": 1990
+    },
+    {
+      "epoch": 0.13850916553619255,
+      "grad_norm": 0.77734375,
+      "learning_rate": 0.0019389281674112018,
+      "loss": 3.409,
+      "step": 1991
+    },
+    {
+      "epoch": 0.13857873317332778,
+      "grad_norm": 0.75390625,
+      "learning_rate": 0.0019388506035152785,
+      "loss": 3.4284,
+      "step": 1992
+    },
+    {
+      "epoch": 0.13864830081046298,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0019387729919492541,
+      "loss": 3.3189,
+      "step": 1993
+    },
+    {
+      "epoch": 0.13871786844759817,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0019386953327170684,
+      "loss": 3.356,
+      "step": 1994
+    },
+    {
+      "epoch": 0.13878743608473337,
+      "grad_norm": 0.765625,
+      "learning_rate": 0.0019386176258226653,
+      "loss": 3.1132,
+      "step": 1995
+    },
+    {
+      "epoch": 0.1388570037218686,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.00193853987126999,
+      "loss": 3.3558,
+      "step": 1996
+    },
+    {
+      "epoch": 0.1389265713590038,
+      "grad_norm": 0.7578125,
+      "learning_rate": 0.0019384620690629907,
+      "loss": 3.2451,
+      "step": 1997
+    },
+    {
+      "epoch": 0.138996138996139,
+      "grad_norm": 0.60546875,
+      "learning_rate": 0.001938384219205618,
+      "loss": 3.4747,
+      "step": 1998
+    },
+    {
+      "epoch": 0.1390657066332742,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.0019383063217018241,
+      "loss": 2.8982,
+      "step": 1999
+    },
+    {
+      "epoch": 0.1391352742704094,
+      "grad_norm": 0.74609375,
+      "learning_rate": 0.0019382283765555651,
+      "loss": 2.9246,
+      "step": 2000
+    },
+    {
+      "epoch": 0.13920484190754462,
+      "grad_norm": 0.79296875,
+      "learning_rate": 0.0019381503837707977,
+      "loss": 3.1143,
+      "step": 2001
+    },
+    {
+      "epoch": 0.13927440954467982,
+      "grad_norm": 0.77734375,
+      "learning_rate": 0.0019380723433514823,
+      "loss": 3.2632,
+      "step": 2002
+    },
+    {
+      "epoch": 0.13934397718181502,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.001937994255301582,
+      "loss": 3.4156,
+      "step": 2003
+    },
+    {
+      "epoch": 0.1394135448189502,
+      "grad_norm": 0.59375,
+      "learning_rate": 0.001937916119625061,
+      "loss": 3.0436,
+      "step": 2004
+    },
+    {
+      "epoch": 0.13948311245608544,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.001937837936325887,
+      "loss": 3.1214,
+      "step": 2005
+    },
+    {
+      "epoch": 0.13955268009322064,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.0019377597054080296,
+      "loss": 3.0121,
+      "step": 2006
+    },
+    {
+      "epoch": 0.13962224773035584,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0019376814268754609,
+      "loss": 3.0581,
+      "step": 2007
+    },
+    {
+      "epoch": 0.13969181536749103,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0019376031007321557,
+      "loss": 3.3915,
+      "step": 2008
+    },
+    {
+      "epoch": 0.13976138300462626,
+      "grad_norm": 0.77734375,
+      "learning_rate": 0.001937524726982091,
+      "loss": 3.26,
+      "step": 2009
+    },
+    {
+      "epoch": 0.13983095064176146,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0019374463056292459,
+      "loss": 3.6236,
+      "step": 2010
+    },
+    {
+      "epoch": 0.13990051827889666,
+      "grad_norm": 0.71875,
+      "learning_rate": 0.0019373678366776028,
+      "loss": 3.1375,
+      "step": 2011
+    },
+    {
+      "epoch": 0.13997008591603186,
+      "grad_norm": 0.70703125,
+      "learning_rate": 0.0019372893201311454,
+      "loss": 3.4221,
+      "step": 2012
+    },
+    {
+      "epoch": 0.14003965355316705,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0019372107559938608,
+      "loss": 3.3759,
+      "step": 2013
+    },
+    {
+      "epoch": 0.14010922119030228,
+      "grad_norm": 0.875,
+      "learning_rate": 0.001937132144269738,
+      "loss": 3.4701,
+      "step": 2014
+    },
+    {
+      "epoch": 0.14017878882743748,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0019370534849627679,
+      "loss": 3.1729,
+      "step": 2015
+    },
+    {
+      "epoch": 0.14024835646457268,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0019369747780769453,
+      "loss": 2.7963,
+      "step": 2016
+    },
+    {
+      "epoch": 0.14031792410170787,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.0019368960236162663,
+      "loss": 3.4503,
+      "step": 2017
+    },
+    {
+      "epoch": 0.1403874917388431,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0019368172215847293,
+      "loss": 3.3126,
+      "step": 2018
+    },
+    {
+      "epoch": 0.1404570593759783,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.0019367383719863355,
+      "loss": 3.8492,
+      "step": 2019
+    },
+    {
+      "epoch": 0.1405266270131135,
+      "grad_norm": 0.69921875,
+      "learning_rate": 0.0019366594748250893,
+      "loss": 3.3776,
+      "step": 2020
+    },
+    {
+      "epoch": 0.1405961946502487,
+      "grad_norm": 0.69921875,
+      "learning_rate": 0.0019365805301049955,
+      "loss": 3.2232,
+      "step": 2021
+    },
+    {
+      "epoch": 0.14066576228738392,
+      "grad_norm": 0.73046875,
+      "learning_rate": 0.0019365015378300632,
+      "loss": 3.2879,
+      "step": 2022
+    },
+    {
+      "epoch": 0.14073532992451912,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0019364224980043033,
+      "loss": 3.2998,
+      "step": 2023
+    },
+    {
+      "epoch": 0.14080489756165432,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0019363434106317288,
+      "loss": 3.2886,
+      "step": 2024
+    },
+    {
+      "epoch": 0.14087446519878952,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0019362642757163556,
+      "loss": 3.2717,
+      "step": 2025
+    },
+    {
+      "epoch": 0.14094403283592472,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0019361850932622011,
+      "loss": 3.119,
+      "step": 2026
+    },
+    {
+      "epoch": 0.14101360047305994,
+      "grad_norm": 0.7421875,
+      "learning_rate": 0.0019361058632732867,
+      "loss": 3.2256,
+      "step": 2027
+    },
+    {
+      "epoch": 0.14108316811019514,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.001936026585753635,
+      "loss": 2.9657,
+      "step": 2028
+    },
+    {
+      "epoch": 0.14115273574733034,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.001935947260707271,
+      "loss": 2.9658,
+      "step": 2029
+    },
+    {
+      "epoch": 0.14122230338446554,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.0019358678881382227,
+      "loss": 3.5397,
+      "step": 2030
+    },
+    {
+      "epoch": 0.14129187102160076,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0019357884680505197,
+      "loss": 3.3371,
+      "step": 2031
+    },
+    {
+      "epoch": 0.14136143865873596,
+      "grad_norm": 0.7109375,
+      "learning_rate": 0.0019357090004481954,
+      "loss": 3.7608,
+      "step": 2032
+    },
+    {
+      "epoch": 0.14143100629587116,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.0019356294853352845,
+      "loss": 3.2105,
+      "step": 2033
+    },
+    {
+      "epoch": 0.14150057393300636,
+      "grad_norm": 0.73046875,
+      "learning_rate": 0.0019355499227158243,
+      "loss": 3.3079,
+      "step": 2034
+    },
+    {
+      "epoch": 0.14157014157014158,
+      "grad_norm": 0.66015625,
+      "learning_rate": 0.0019354703125938543,
+      "loss": 3.2313,
+      "step": 2035
+    },
+    {
+      "epoch": 0.14163970920727678,
+      "grad_norm": 0.7578125,
+      "learning_rate": 0.001935390654973417,
+      "loss": 3.4517,
+      "step": 2036
+    },
+    {
+      "epoch": 0.14170927684441198,
+      "grad_norm": 0.7578125,
+      "learning_rate": 0.001935310949858557,
+      "loss": 2.969,
+      "step": 2037
+    },
+    {
+      "epoch": 0.14177884448154718,
+      "grad_norm": 0.72265625,
+      "learning_rate": 0.0019352311972533212,
+      "loss": 3.5515,
+      "step": 2038
+    },
+    {
+      "epoch": 0.14184841211868238,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.0019351513971617594,
+      "loss": 3.2527,
+      "step": 2039
+    },
+    {
+      "epoch": 0.1419179797558176,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.001935071549587923,
+      "loss": 3.3396,
+      "step": 2040
+    },
+    {
+      "epoch": 0.1419875473929528,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.001934991654535866,
+      "loss": 3.0555,
+      "step": 2041
+    },
+    {
+      "epoch": 0.142057115030088,
+      "grad_norm": 0.76171875,
+      "learning_rate": 0.001934911712009646,
+      "loss": 3.5917,
+      "step": 2042
+    },
+    {
+      "epoch": 0.1421266826672232,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0019348317220133217,
+      "loss": 3.4887,
+      "step": 2043
+    },
+    {
+      "epoch": 0.14219625030435842,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.001934751684550954,
+      "loss": 3.6032,
+      "step": 2044
+    },
+    {
+      "epoch": 0.14226581794149362,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0019346715996266073,
+      "loss": 3.1755,
+      "step": 2045
+    },
+    {
+      "epoch": 0.14233538557862882,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0019345914672443483,
+      "loss": 3.165,
+      "step": 2046
+    },
+    {
+      "epoch": 0.14240495321576402,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0019345112874082449,
+      "loss": 2.8039,
+      "step": 2047
+    },
+    {
+      "epoch": 0.14247452085289924,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.0019344310601223686,
+      "loss": 2.9524,
+      "step": 2048
+    },
+    {
+      "epoch": 0.14254408849003444,
+      "grad_norm": 0.69921875,
+      "learning_rate": 0.001934350785390793,
+      "loss": 3.5818,
+      "step": 2049
+    },
+    {
+      "epoch": 0.14261365612716964,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.0019342704632175944,
+      "loss": 3.5404,
+      "step": 2050
+    },
+    {
+      "epoch": 0.14268322376430484,
+      "grad_norm": 0.76171875,
+      "learning_rate": 0.0019341900936068503,
+      "loss": 3.0195,
+      "step": 2051
+    },
+    {
+      "epoch": 0.14275279140144004,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.001934109676562642,
+      "loss": 3.166,
+      "step": 2052
+    },
+    {
+      "epoch": 0.14282235903857526,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0019340292120890524,
+      "loss": 3.1962,
+      "step": 2053
+    },
+    {
+      "epoch": 0.14289192667571046,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0019339487001901676,
+      "loss": 3.3675,
+      "step": 2054
+    },
+    {
+      "epoch": 0.14296149431284566,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.0019338681408700752,
+      "loss": 3.3433,
+      "step": 2055
+    },
+    {
+      "epoch": 0.14303106194998086,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0019337875341328655,
+      "loss": 3.031,
+      "step": 2056
+    },
+    {
+      "epoch": 0.14310062958711608,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0019337068799826316,
+      "loss": 3.0238,
+      "step": 2057
+    },
+    {
+      "epoch": 0.14317019722425128,
+      "grad_norm": 0.7421875,
+      "learning_rate": 0.0019336261784234684,
+      "loss": 3.3712,
+      "step": 2058
+    },
+    {
+      "epoch": 0.14323976486138648,
+      "grad_norm": 0.765625,
+      "learning_rate": 0.001933545429459474,
+      "loss": 3.4989,
+      "step": 2059
+    },
+    {
+      "epoch": 0.14330933249852168,
+      "grad_norm": 0.74609375,
+      "learning_rate": 0.0019334646330947476,
+      "loss": 3.2556,
+      "step": 2060
+    },
+    {
+      "epoch": 0.1433789001356569,
+      "grad_norm": 0.875,
+      "learning_rate": 0.0019333837893333926,
+      "loss": 3.0214,
+      "step": 2061
+    },
+    {
+      "epoch": 0.1434484677727921,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0019333028981795132,
+      "loss": 3.1768,
+      "step": 2062
+    },
+    {
+      "epoch": 0.1435180354099273,
+      "grad_norm": 0.765625,
+      "learning_rate": 0.0019332219596372166,
+      "loss": 3.4896,
+      "step": 2063
+    },
+    {
+      "epoch": 0.1435876030470625,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0019331409737106129,
+      "loss": 2.6882,
+      "step": 2064
+    },
+    {
+      "epoch": 0.1436571706841977,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.001933059940403814,
+      "loss": 3.2502,
+      "step": 2065
+    },
+    {
+      "epoch": 0.14372673832133293,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0019329788597209343,
+      "loss": 3.2372,
+      "step": 2066
+    },
+    {
+      "epoch": 0.14379630595846812,
+      "grad_norm": 0.8125,
+      "learning_rate": 0.0019328977316660906,
+      "loss": 3.2938,
+      "step": 2067
+    },
+    {
+      "epoch": 0.14386587359560332,
+      "grad_norm": 0.7734375,
+      "learning_rate": 0.0019328165562434024,
+      "loss": 3.1811,
+      "step": 2068
+    },
+    {
+      "epoch": 0.14393544123273852,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.001932735333456991,
+      "loss": 3.4409,
+      "step": 2069
+    },
+    {
+      "epoch": 0.14400500886987375,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0019326540633109808,
+      "loss": 3.1112,
+      "step": 2070
+    },
+    {
+      "epoch": 0.14407457650700894,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0019325727458094982,
+      "loss": 3.1347,
+      "step": 2071
+    },
+    {
+      "epoch": 0.14414414414414414,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.0019324913809566717,
+      "loss": 3.1182,
+      "step": 2072
+    },
+    {
+      "epoch": 0.14421371178127934,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0019324099687566335,
+      "loss": 3.2554,
+      "step": 2073
+    },
+    {
+      "epoch": 0.14428327941841457,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0019323285092135167,
+      "loss": 2.8964,
+      "step": 2074
+    },
+    {
+      "epoch": 0.14435284705554977,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0019322470023314573,
+      "loss": 3.049,
+      "step": 2075
+    },
+    {
+      "epoch": 0.14442241469268496,
+      "grad_norm": 0.74609375,
+      "learning_rate": 0.001932165448114594,
+      "loss": 2.8512,
+      "step": 2076
+    },
+    {
+      "epoch": 0.14449198232982016,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.0019320838465670678,
+      "loss": 3.2504,
+      "step": 2077
+    },
+    {
+      "epoch": 0.14456154996695536,
+      "grad_norm": 0.67578125,
+      "learning_rate": 0.001932002197693022,
+      "loss": 3.0084,
+      "step": 2078
+    },
+    {
+      "epoch": 0.1446311176040906,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0019319205014966022,
+      "loss": 3.415,
+      "step": 2079
+    },
+    {
+      "epoch": 0.14470068524122578,
+      "grad_norm": 0.66015625,
+      "learning_rate": 0.0019318387579819562,
+      "loss": 3.2434,
+      "step": 2080
+    },
+    {
+      "epoch": 0.14477025287836098,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0019317569671532353,
+      "loss": 3.4969,
+      "step": 2081
+    },
+    {
+      "epoch": 0.14483982051549618,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0019316751290145923,
+      "loss": 2.9212,
+      "step": 2082
+    },
+    {
+      "epoch": 0.1449093881526314,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.0019315932435701817,
+      "loss": 3.4919,
+      "step": 2083
+    },
+    {
+      "epoch": 0.1449789557897666,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.0019315113108241617,
+      "loss": 3.3674,
+      "step": 2084
+    },
+    {
+      "epoch": 0.1450485234269018,
+      "grad_norm": 0.7421875,
+      "learning_rate": 0.0019314293307806927,
+      "loss": 3.6306,
+      "step": 2085
+    },
+    {
+      "epoch": 0.145118091064037,
+      "grad_norm": 0.73828125,
+      "learning_rate": 0.0019313473034439372,
+      "loss": 3.1649,
+      "step": 2086
+    },
+    {
+      "epoch": 0.1451876587011722,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00193126522881806,
+      "loss": 3.0952,
+      "step": 2087
+    },
+    {
+      "epoch": 0.14525722633830743,
+      "grad_norm": 0.63671875,
+      "learning_rate": 0.0019311831069072278,
+      "loss": 3.3009,
+      "step": 2088
+    },
+    {
+      "epoch": 0.14532679397544263,
+      "grad_norm": 0.65625,
+      "learning_rate": 0.0019311009377156116,
+      "loss": 2.86,
+      "step": 2089
+    },
+    {
+      "epoch": 0.14539636161257782,
+      "grad_norm": 0.57421875,
+      "learning_rate": 0.0019310187212473826,
+      "loss": 3.2769,
+      "step": 2090
+    },
+    {
+      "epoch": 0.14546592924971302,
+      "grad_norm": 0.68359375,
+      "learning_rate": 0.0019309364575067157,
+      "loss": 3.2831,
+      "step": 2091
+    },
+    {
+      "epoch": 0.14553549688684825,
+      "grad_norm": 0.7109375,
+      "learning_rate": 0.0019308541464977877,
+      "loss": 3.105,
+      "step": 2092
+    },
+    {
+      "epoch": 0.14560506452398345,
+      "grad_norm": 0.74609375,
+      "learning_rate": 0.001930771788224778,
+      "loss": 3.2699,
+      "step": 2093
+    },
+    {
+      "epoch": 0.14567463216111864,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.0019306893826918684,
+      "loss": 3.2145,
+      "step": 2094
+    },
+    {
+      "epoch": 0.14574419979825384,
+      "grad_norm": 0.56640625,
+      "learning_rate": 0.001930606929903243,
+      "loss": 3.4146,
+      "step": 2095
+    },
+    {
+      "epoch": 0.14581376743538907,
+      "grad_norm": 0.75,
+      "learning_rate": 0.001930524429863088,
+      "loss": 3.2052,
+      "step": 2096
+    },
+    {
+      "epoch": 0.14588333507252427,
+      "grad_norm": 0.65234375,
+      "learning_rate": 0.0019304418825755929,
+      "loss": 3.7529,
+      "step": 2097
+    },
+    {
+      "epoch": 0.14595290270965947,
+      "grad_norm": 0.73046875,
+      "learning_rate": 0.0019303592880449488,
+      "loss": 3.1949,
+      "step": 2098
+    },
+    {
+      "epoch": 0.14602247034679466,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0019302766462753493,
+      "loss": 2.8965,
+      "step": 2099
+    },
+    {
+      "epoch": 0.14609203798392986,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.0019301939572709907,
+      "loss": 3.0809,
+      "step": 2100
+    },
+    {
+      "epoch": 0.1461616056210651,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.0019301112210360714,
+      "loss": 3.1178,
+      "step": 2101
+    },
+    {
+      "epoch": 0.1462311732582003,
+      "grad_norm": 0.8203125,
+      "learning_rate": 0.0019300284375747925,
+      "loss": 3.6246,
+      "step": 2102
+    },
+    {
+      "epoch": 0.14630074089533548,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.0019299456068913572,
+      "loss": 3.0863,
+      "step": 2103
+    },
+    {
+      "epoch": 0.14637030853247068,
+      "grad_norm": 0.77734375,
+      "learning_rate": 0.0019298627289899715,
+      "loss": 3.1766,
+      "step": 2104
+    },
+    {
+      "epoch": 0.1464398761696059,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.001929779803874843,
+      "loss": 3.4152,
+      "step": 2105
+    },
+    {
+      "epoch": 0.1465094438067411,
+      "grad_norm": 0.75,
+      "learning_rate": 0.0019296968315501823,
+      "loss": 3.3496,
+      "step": 2106
+    },
+    {
+      "epoch": 0.1465790114438763,
+      "grad_norm": 0.7734375,
+      "learning_rate": 0.001929613812020203,
+      "loss": 3.267,
+      "step": 2107
+    },
+    {
+      "epoch": 0.1466485790810115,
+      "grad_norm": 0.8203125,
+      "learning_rate": 0.0019295307452891195,
+      "loss": 3.1405,
+      "step": 2108
+    },
+    {
+      "epoch": 0.14671814671814673,
+      "grad_norm": 0.73828125,
+      "learning_rate": 0.0019294476313611501,
+      "loss": 3.1015,
+      "step": 2109
+    },
+    {
+      "epoch": 0.14678771435528193,
+      "grad_norm": 0.734375,
+      "learning_rate": 0.0019293644702405147,
+      "loss": 2.8852,
+      "step": 2110
+    },
+    {
+      "epoch": 0.14685728199241713,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.001929281261931436,
+      "loss": 3.3098,
+      "step": 2111
+    },
+    {
+      "epoch": 0.14692684962955233,
+      "grad_norm": 0.72265625,
+      "learning_rate": 0.0019291980064381385,
+      "loss": 3.1474,
+      "step": 2112
+    },
+    {
+      "epoch": 0.14699641726668752,
+      "grad_norm": 0.72265625,
+      "learning_rate": 0.00192911470376485,
+      "loss": 3.6916,
+      "step": 2113
+    },
+    {
+      "epoch": 0.14706598490382275,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.0019290313539158,
+      "loss": 3.4483,
+      "step": 2114
+    },
+    {
+      "epoch": 0.14713555254095795,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0019289479568952203,
+      "loss": 3.3741,
+      "step": 2115
+    },
+    {
+      "epoch": 0.14720512017809315,
+      "grad_norm": 0.75390625,
+      "learning_rate": 0.0019288645127073455,
+      "loss": 2.9652,
+      "step": 2116
+    },
+    {
+      "epoch": 0.14727468781522834,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.0019287810213564126,
+      "loss": 2.9294,
+      "step": 2117
+    },
+    {
+      "epoch": 0.14734425545236357,
+      "grad_norm": 0.70703125,
+      "learning_rate": 0.001928697482846661,
+      "loss": 3.3925,
+      "step": 2118
+    },
+    {
+      "epoch": 0.14741382308949877,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.001928613897182332,
+      "loss": 3.0946,
+      "step": 2119
+    },
+    {
+      "epoch": 0.14748339072663397,
+      "grad_norm": 0.59765625,
+      "learning_rate": 0.00192853026436767,
+      "loss": 3.5871,
+      "step": 2120
+    },
+    {
+      "epoch": 0.14755295836376917,
+      "grad_norm": 0.64453125,
+      "learning_rate": 0.0019284465844069212,
+      "loss": 3.4433,
+      "step": 2121
+    },
+    {
+      "epoch": 0.1476225260009044,
+      "grad_norm": 0.78125,
+      "learning_rate": 0.0019283628573043348,
+      "loss": 3.4745,
+      "step": 2122
+    },
+    {
+      "epoch": 0.1476920936380396,
+      "grad_norm": 0.6484375,
+      "learning_rate": 0.0019282790830641616,
+      "loss": 3.5395,
+      "step": 2123
+    },
+    {
+      "epoch": 0.1477616612751748,
+      "grad_norm": 0.68359375,
+      "learning_rate": 0.0019281952616906554,
+      "loss": 2.7807,
+      "step": 2124
+    },
+    {
+      "epoch": 0.14783122891231,
+      "grad_norm": 0.6015625,
+      "learning_rate": 0.0019281113931880727,
+      "loss": 3.6789,
+      "step": 2125
+    },
+    {
+      "epoch": 0.14790079654944518,
+      "grad_norm": 0.64453125,
+      "learning_rate": 0.001928027477560671,
+      "loss": 3.056,
+      "step": 2126
+    },
+    {
+      "epoch": 0.1479703641865804,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.0019279435148127117,
+      "loss": 3.503,
+      "step": 2127
+    },
+    {
+      "epoch": 0.1480399318237156,
+      "grad_norm": 0.703125,
+      "learning_rate": 0.001927859504948458,
+      "loss": 3.5267,
+      "step": 2128
+    },
+    {
+      "epoch": 0.1481094994608508,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.0019277754479721755,
+      "loss": 3.436,
+      "step": 2129
+    },
+    {
+      "epoch": 0.148179067097986,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.0019276913438881316,
+      "loss": 3.2678,
+      "step": 2130
+    },
+    {
+      "epoch": 0.14824863473512123,
+      "grad_norm": 0.71484375,
+      "learning_rate": 0.0019276071927005977,
+      "loss": 3.2497,
+      "step": 2131
+    },
+    {
+      "epoch": 0.14831820237225643,
+      "grad_norm": 0.765625,
+      "learning_rate": 0.0019275229944138456,
+      "loss": 3.058,
+      "step": 2132
+    },
+    {
+      "epoch": 0.14838777000939163,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0019274387490321515,
+      "loss": 3.205,
+      "step": 2133
+    },
+    {
+      "epoch": 0.14845733764652683,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.0019273544565597918,
+      "loss": 2.9491,
+      "step": 2134
+    },
+    {
+      "epoch": 0.14852690528366205,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0019272701170010471,
+      "loss": 3.448,
+      "step": 2135
+    },
+    {
+      "epoch": 0.14859647292079725,
+      "grad_norm": 0.7578125,
+      "learning_rate": 0.0019271857303602,
+      "loss": 3.424,
+      "step": 2136
+    },
+    {
+      "epoch": 0.14866604055793245,
+      "grad_norm": 0.875,
+      "learning_rate": 0.0019271012966415345,
+      "loss": 3.2642,
+      "step": 2137
+    },
+    {
+      "epoch": 0.14873560819506765,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.001927016815849338,
+      "loss": 3.8212,
+      "step": 2138
+    },
+    {
+      "epoch": 0.14880517583220285,
+      "grad_norm": 0.875,
+      "learning_rate": 0.0019269322879879006,
+      "loss": 3.1188,
+      "step": 2139
+    },
+    {
+      "epoch": 0.14887474346933807,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0019268477130615135,
+      "loss": 3.5071,
+      "step": 2140
+    },
+    {
+      "epoch": 0.14894431110647327,
+      "grad_norm": 0.74609375,
+      "learning_rate": 0.0019267630910744708,
+      "loss": 3.1713,
+      "step": 2141
+    },
+    {
+      "epoch": 0.14901387874360847,
+      "grad_norm": 0.7109375,
+      "learning_rate": 0.00192667842203107,
+      "loss": 3.2658,
+      "step": 2142
+    },
+    {
+      "epoch": 0.14908344638074367,
+      "grad_norm": 0.625,
+      "learning_rate": 0.0019265937059356095,
+      "loss": 3.1708,
+      "step": 2143
+    },
+    {
+      "epoch": 0.1491530140178789,
+      "grad_norm": 0.6484375,
+      "learning_rate": 0.0019265089427923914,
+      "loss": 3.24,
+      "step": 2144
+    },
+    {
+      "epoch": 0.1492225816550141,
+      "grad_norm": 0.67578125,
+      "learning_rate": 0.0019264241326057189,
+      "loss": 3.2901,
+      "step": 2145
+    },
+    {
+      "epoch": 0.1492921492921493,
+      "grad_norm": 0.74609375,
+      "learning_rate": 0.0019263392753798981,
+      "loss": 3.1596,
+      "step": 2146
+    },
+    {
+      "epoch": 0.1493617169292845,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0019262543711192385,
+      "loss": 2.9588,
+      "step": 2147
+    },
+    {
+      "epoch": 0.14943128456641971,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.0019261694198280503,
+      "loss": 2.9698,
+      "step": 2148
+    },
+    {
+      "epoch": 0.1495008522035549,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.0019260844215106471,
+      "loss": 3.4912,
+      "step": 2149
+    },
+    {
+      "epoch": 0.1495704198406901,
+      "grad_norm": 0.67578125,
+      "learning_rate": 0.0019259993761713452,
+      "loss": 3.1958,
+      "step": 2150
+    },
+    {
+      "epoch": 0.1496399874778253,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.0019259142838144623,
+      "loss": 2.9556,
+      "step": 2151
+    },
+    {
+      "epoch": 0.1497095551149605,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.0019258291444443187,
+      "loss": 3.0389,
+      "step": 2152
+    },
+    {
+      "epoch": 0.14977912275209573,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0019257439580652378,
+      "loss": 3.3051,
+      "step": 2153
+    },
+    {
+      "epoch": 0.14984869038923093,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.0019256587246815448,
+      "loss": 3.0117,
+      "step": 2154
+    },
+    {
+      "epoch": 0.14991825802636613,
+      "grad_norm": 0.6328125,
+      "learning_rate": 0.0019255734442975676,
+      "loss": 3.5847,
+      "step": 2155
+    },
+    {
+      "epoch": 0.14998782566350133,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.001925488116917636,
+      "loss": 3.4234,
+      "step": 2156
+    },
+    {
+      "epoch": 0.15005739330063655,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.0019254027425460827,
+      "loss": 3.4137,
+      "step": 2157
+    },
+    {
+      "epoch": 0.15012696093777175,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.0019253173211872423,
+      "loss": 3.4854,
+      "step": 2158
+    },
+    {
+      "epoch": 0.15019652857490695,
+      "grad_norm": 0.65625,
+      "learning_rate": 0.0019252318528454526,
+      "loss": 3.1512,
+      "step": 2159
+    },
+    {
+      "epoch": 0.15026609621204215,
+      "grad_norm": 0.72265625,
+      "learning_rate": 0.0019251463375250526,
+      "loss": 3.7507,
+      "step": 2160
+    },
+    {
+      "epoch": 0.15033566384917738,
+      "grad_norm": 0.71875,
+      "learning_rate": 0.001925060775230385,
+      "loss": 3.488,
+      "step": 2161
+    },
+    {
+      "epoch": 0.15040523148631257,
+      "grad_norm": 0.6953125,
+      "learning_rate": 0.0019249751659657934,
+      "loss": 3.3587,
+      "step": 2162
+    },
+    {
+      "epoch": 0.15047479912344777,
+      "grad_norm": 0.8203125,
+      "learning_rate": 0.0019248895097356256,
+      "loss": 3.428,
+      "step": 2163
+    },
+    {
+      "epoch": 0.15054436676058297,
+      "grad_norm": 0.76953125,
+      "learning_rate": 0.00192480380654423,
+      "loss": 3.1657,
+      "step": 2164
+    },
+    {
+      "epoch": 0.15061393439771817,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.0019247180563959586,
+      "loss": 3.5087,
+      "step": 2165
+    },
+    {
+      "epoch": 0.1506835020348534,
+      "grad_norm": 0.7734375,
+      "learning_rate": 0.0019246322592951653,
+      "loss": 3.457,
+      "step": 2166
+    },
+    {
+      "epoch": 0.1507530696719886,
+      "grad_norm": 0.75390625,
+      "learning_rate": 0.0019245464152462062,
+      "loss": 3.4768,
+      "step": 2167
+    },
+    {
+      "epoch": 0.1508226373091238,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0019244605242534402,
+      "loss": 3.0841,
+      "step": 2168
+    },
+    {
+      "epoch": 0.150892204946259,
+      "grad_norm": 0.76171875,
+      "learning_rate": 0.0019243745863212283,
+      "loss": 3.1466,
+      "step": 2169
+    },
+    {
+      "epoch": 0.15096177258339422,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.0019242886014539343,
+      "loss": 2.7856,
+      "step": 2170
+    },
+    {
+      "epoch": 0.15103134022052941,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.0019242025696559239,
+      "loss": 3.379,
+      "step": 2171
+    },
+    {
+      "epoch": 0.1511009078576646,
+      "grad_norm": 0.68359375,
+      "learning_rate": 0.0019241164909315652,
+      "loss": 3.4753,
+      "step": 2172
+    },
+    {
+      "epoch": 0.1511704754947998,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.001924030365285229,
+      "loss": 2.9882,
+      "step": 2173
+    },
+    {
+      "epoch": 0.15124004313193504,
+      "grad_norm": 0.71875,
+      "learning_rate": 0.0019239441927212885,
+      "loss": 3.2017,
+      "step": 2174
+    },
+    {
+      "epoch": 0.15130961076907024,
+      "grad_norm": 0.8125,
+      "learning_rate": 0.0019238579732441185,
+      "loss": 2.9532,
+      "step": 2175
+    },
+    {
+      "epoch": 0.15137917840620543,
+      "grad_norm": 0.75,
+      "learning_rate": 0.0019237717068580973,
+      "loss": 3.4523,
+      "step": 2176
+    },
+    {
+      "epoch": 0.15144874604334063,
+      "grad_norm": 0.6171875,
+      "learning_rate": 0.0019236853935676052,
+      "loss": 3.2862,
+      "step": 2177
+    },
+    {
+      "epoch": 0.15151831368047583,
+      "grad_norm": 0.62890625,
+      "learning_rate": 0.0019235990333770247,
+      "loss": 3.4693,
+      "step": 2178
+    },
+    {
+      "epoch": 0.15158788131761106,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.0019235126262907402,
+      "loss": 3.1924,
+      "step": 2179
+    },
+    {
+      "epoch": 0.15165744895474625,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0019234261723131395,
+      "loss": 3.4629,
+      "step": 2180
+    },
+    {
+      "epoch": 0.15172701659188145,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0019233396714486122,
+      "loss": 3.4659,
+      "step": 2181
+    },
+    {
+      "epoch": 0.15179658422901665,
+      "grad_norm": 0.71484375,
+      "learning_rate": 0.0019232531237015503,
+      "loss": 3.5634,
+      "step": 2182
+    },
+    {
+      "epoch": 0.15186615186615188,
+      "grad_norm": 0.8203125,
+      "learning_rate": 0.0019231665290763485,
+      "loss": 3.2782,
+      "step": 2183
+    },
+    {
+      "epoch": 0.15193571950328708,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.0019230798875774031,
+      "loss": 2.896,
+      "step": 2184
+    },
+    {
+      "epoch": 0.15200528714042227,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.001922993199209114,
+      "loss": 3.9471,
+      "step": 2185
+    },
+    {
+      "epoch": 0.15207485477755747,
+      "grad_norm": 0.765625,
+      "learning_rate": 0.0019229064639758825,
+      "loss": 3.2744,
+      "step": 2186
+    },
+    {
+      "epoch": 0.1521444224146927,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0019228196818821127,
+      "loss": 3.12,
+      "step": 2187
+    },
+    {
+      "epoch": 0.1522139900518279,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.0019227328529322102,
+      "loss": 3.2227,
+      "step": 2188
+    },
+    {
+      "epoch": 0.1522835576889631,
+      "grad_norm": 0.734375,
+      "learning_rate": 0.001922645977130585,
+      "loss": 3.2465,
+      "step": 2189
+    },
+    {
+      "epoch": 0.1523531253260983,
+      "grad_norm": 0.78125,
+      "learning_rate": 0.0019225590544816472,
+      "loss": 2.951,
+      "step": 2190
+    },
+    {
+      "epoch": 0.1524226929632335,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.0019224720849898107,
+      "loss": 3.3373,
+      "step": 2191
+    },
+    {
+      "epoch": 0.15249226060036872,
+      "grad_norm": 0.72265625,
+      "learning_rate": 0.0019223850686594913,
+      "loss": 3.667,
+      "step": 2192
+    },
+    {
+      "epoch": 0.15256182823750392,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0019222980054951072,
+      "loss": 3.4597,
+      "step": 2193
+    },
+    {
+      "epoch": 0.15263139587463911,
+      "grad_norm": 0.7421875,
+      "learning_rate": 0.0019222108955010793,
+      "loss": 3.0378,
+      "step": 2194
+    },
+    {
+      "epoch": 0.1527009635117743,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.0019221237386818305,
+      "loss": 3.2428,
+      "step": 2195
+    },
+    {
+      "epoch": 0.15277053114890954,
+      "grad_norm": 0.72265625,
+      "learning_rate": 0.0019220365350417858,
+      "loss": 3.5437,
+      "step": 2196
+    },
+    {
+      "epoch": 0.15284009878604474,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0019219492845853733,
+      "loss": 3.1585,
+      "step": 2197
+    },
+    {
+      "epoch": 0.15290966642317994,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.0019218619873170232,
+      "loss": 3.2917,
+      "step": 2198
+    },
+    {
+      "epoch": 0.15297923406031513,
+      "grad_norm": 0.73046875,
+      "learning_rate": 0.001921774643241168,
+      "loss": 3.4092,
+      "step": 2199
+    },
+    {
+      "epoch": 0.15304880169745036,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.0019216872523622427,
+      "loss": 3.2245,
+      "step": 2200
+    },
+    {
+      "epoch": 0.15311836933458556,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0019215998146846838,
+      "loss": 2.9539,
+      "step": 2201
+    },
+    {
+      "epoch": 0.15318793697172076,
+      "grad_norm": 0.77734375,
+      "learning_rate": 0.001921512330212932,
+      "loss": 3.0401,
+      "step": 2202
+    },
+    {
+      "epoch": 0.15325750460885595,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0019214247989514286,
+      "loss": 3.1097,
+      "step": 2203
+    },
+    {
+      "epoch": 0.15332707224599115,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.0019213372209046183,
+      "loss": 3.3438,
+      "step": 2204
+    },
+    {
+      "epoch": 0.15339663988312638,
+      "grad_norm": 0.7109375,
+      "learning_rate": 0.0019212495960769479,
+      "loss": 3.4192,
+      "step": 2205
+    },
+    {
+      "epoch": 0.15346620752026158,
+      "grad_norm": 0.7734375,
+      "learning_rate": 0.001921161924472866,
+      "loss": 3.0318,
+      "step": 2206
+    },
+    {
+      "epoch": 0.15353577515739678,
+      "grad_norm": 0.74609375,
+      "learning_rate": 0.001921074206096825,
+      "loss": 3.2601,
+      "step": 2207
+    },
+    {
+      "epoch": 0.15360534279453197,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0019209864409532784,
+      "loss": 3.0052,
+      "step": 2208
+    },
+    {
+      "epoch": 0.1536749104316672,
+      "grad_norm": 0.6953125,
+      "learning_rate": 0.0019208986290466822,
+      "loss": 3.2566,
+      "step": 2209
+    },
+    {
+      "epoch": 0.1537444780688024,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.0019208107703814954,
+      "loss": 3.2904,
+      "step": 2210
+    },
+    {
+      "epoch": 0.1538140457059376,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.001920722864962179,
+      "loss": 3.2598,
+      "step": 2211
+    },
+    {
+      "epoch": 0.1538836133430728,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0019206349127931963,
+      "loss": 2.8552,
+      "step": 2212
+    },
+    {
+      "epoch": 0.15395318098020802,
+      "grad_norm": 0.734375,
+      "learning_rate": 0.001920546913879013,
+      "loss": 3.4522,
+      "step": 2213
+    },
+    {
+      "epoch": 0.15402274861734322,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0019204588682240973,
+      "loss": 3.4462,
+      "step": 2214
+    },
+    {
+      "epoch": 0.15409231625447842,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.0019203707758329198,
+      "loss": 3.5985,
+      "step": 2215
+    },
+    {
+      "epoch": 0.15416188389161362,
+      "grad_norm": 0.7734375,
+      "learning_rate": 0.0019202826367099534,
+      "loss": 3.1834,
+      "step": 2216
+    },
+    {
+      "epoch": 0.15423145152874881,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.0019201944508596732,
+      "loss": 3.5128,
+      "step": 2217
+    },
+    {
+      "epoch": 0.15430101916588404,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.0019201062182865566,
+      "loss": 3.4152,
+      "step": 2218
+    },
+    {
+      "epoch": 0.15437058680301924,
+      "grad_norm": 0.7421875,
+      "learning_rate": 0.0019200179389950842,
+      "loss": 3.2574,
+      "step": 2219
+    },
+    {
+      "epoch": 0.15444015444015444,
+      "grad_norm": 0.62109375,
+      "learning_rate": 0.001919929612989738,
+      "loss": 3.3873,
+      "step": 2220
+    },
+    {
+      "epoch": 0.15450972207728964,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.001919841240275003,
+      "loss": 3.3195,
+      "step": 2221
+    },
+    {
+      "epoch": 0.15457928971442486,
+      "grad_norm": 0.7421875,
+      "learning_rate": 0.0019197528208553661,
+      "loss": 3.0571,
+      "step": 2222
+    },
+    {
+      "epoch": 0.15464885735156006,
+      "grad_norm": 0.7109375,
+      "learning_rate": 0.0019196643547353168,
+      "loss": 3.526,
+      "step": 2223
+    },
+    {
+      "epoch": 0.15471842498869526,
+      "grad_norm": 0.73828125,
+      "learning_rate": 0.001919575841919347,
+      "loss": 3.4711,
+      "step": 2224
+    },
+    {
+      "epoch": 0.15478799262583046,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.001919487282411951,
+      "loss": 3.3619,
+      "step": 2225
+    },
+    {
+      "epoch": 0.15485756026296568,
+      "grad_norm": 0.640625,
+      "learning_rate": 0.0019193986762176252,
+      "loss": 3.0812,
+      "step": 2226
+    },
+    {
+      "epoch": 0.15492712790010088,
+      "grad_norm": 0.6953125,
+      "learning_rate": 0.0019193100233408692,
+      "loss": 3.1839,
+      "step": 2227
+    },
+    {
+      "epoch": 0.15499669553723608,
+      "grad_norm": 0.76171875,
+      "learning_rate": 0.0019192213237861834,
+      "loss": 3.0777,
+      "step": 2228
+    },
+    {
+      "epoch": 0.15506626317437128,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.0019191325775580722,
+      "loss": 2.9409,
+      "step": 2229
+    },
+    {
+      "epoch": 0.15513583081150648,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0019190437846610413,
+      "loss": 3.1332,
+      "step": 2230
+    },
+    {
+      "epoch": 0.1552053984486417,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0019189549450995996,
+      "loss": 3.3082,
+      "step": 2231
+    },
+    {
+      "epoch": 0.1552749660857769,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0019188660588782573,
+      "loss": 3.2494,
+      "step": 2232
+    },
+    {
+      "epoch": 0.1553445337229121,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0019187771260015284,
+      "loss": 3.215,
+      "step": 2233
+    },
+    {
+      "epoch": 0.1554141013600473,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.0019186881464739278,
+      "loss": 3.6842,
+      "step": 2234
+    },
+    {
+      "epoch": 0.15548366899718252,
+      "grad_norm": 0.72265625,
+      "learning_rate": 0.0019185991202999738,
+      "loss": 3.232,
+      "step": 2235
+    },
+    {
+      "epoch": 0.15555323663431772,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0019185100474841863,
+      "loss": 3.0475,
+      "step": 2236
+    },
+    {
+      "epoch": 0.15562280427145292,
+      "grad_norm": 0.734375,
+      "learning_rate": 0.0019184209280310883,
+      "loss": 3.2183,
+      "step": 2237
+    },
+    {
+      "epoch": 0.15569237190858812,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.001918331761945205,
+      "loss": 3.4003,
+      "step": 2238
+    },
+    {
+      "epoch": 0.15576193954572332,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.0019182425492310633,
+      "loss": 3.3352,
+      "step": 2239
+    },
+    {
+      "epoch": 0.15583150718285854,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0019181532898931934,
+      "loss": 3.2472,
+      "step": 2240
+    },
+    {
+      "epoch": 0.15590107481999374,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.001918063983936127,
+      "loss": 3.2951,
+      "step": 2241
+    },
+    {
+      "epoch": 0.15597064245712894,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0019179746313643992,
+      "loss": 3.3719,
+      "step": 2242
+    },
+    {
+      "epoch": 0.15604021009426414,
+      "grad_norm": 0.78125,
+      "learning_rate": 0.0019178852321825464,
+      "loss": 3.3772,
+      "step": 2243
+    },
+    {
+      "epoch": 0.15610977773139936,
+      "grad_norm": 0.7578125,
+      "learning_rate": 0.001917795786395108,
+      "loss": 3.1354,
+      "step": 2244
+    },
+    {
+      "epoch": 0.15617934536853456,
+      "grad_norm": 0.6953125,
+      "learning_rate": 0.0019177062940066256,
+      "loss": 3.3926,
+      "step": 2245
+    },
+    {
+      "epoch": 0.15624891300566976,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.0019176167550216433,
+      "loss": 3.2926,
+      "step": 2246
+    },
+    {
+      "epoch": 0.15631848064280496,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.0019175271694447072,
+      "loss": 3.3812,
+      "step": 2247
+    },
+    {
+      "epoch": 0.15638804827994018,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0019174375372803662,
+      "loss": 3.2116,
+      "step": 2248
+    },
+    {
+      "epoch": 0.15645761591707538,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.0019173478585331712,
+      "loss": 2.9926,
+      "step": 2249
+    },
+    {
+      "epoch": 0.15652718355421058,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0019172581332076756,
+      "loss": 3.1369,
+      "step": 2250
+    },
+    {
+      "epoch": 0.15659675119134578,
+      "grad_norm": 0.79296875,
+      "learning_rate": 0.0019171683613084353,
+      "loss": 3.4026,
+      "step": 2251
+    },
+    {
+      "epoch": 0.15666631882848098,
+      "grad_norm": 0.73046875,
+      "learning_rate": 0.0019170785428400086,
+      "loss": 3.0608,
+      "step": 2252
+    },
+    {
+      "epoch": 0.1567358864656162,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.001916988677806956,
+      "loss": 3.22,
+      "step": 2253
+    },
+    {
+      "epoch": 0.1568054541027514,
+      "grad_norm": 0.65625,
+      "learning_rate": 0.0019168987662138402,
+      "loss": 3.0833,
+      "step": 2254
+    },
+    {
+      "epoch": 0.1568750217398866,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.0019168088080652268,
+      "loss": 2.9146,
+      "step": 2255
+    },
+    {
+      "epoch": 0.1569445893770218,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0019167188033656828,
+      "loss": 3.3521,
+      "step": 2256
+    },
+    {
+      "epoch": 0.15701415701415702,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.0019166287521197786,
+      "loss": 3.179,
+      "step": 2257
+    },
+    {
+      "epoch": 0.15708372465129222,
+      "grad_norm": 0.6328125,
+      "learning_rate": 0.0019165386543320867,
+      "loss": 3.3823,
+      "step": 2258
+    },
+    {
+      "epoch": 0.15715329228842742,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.0019164485100071817,
+      "loss": 3.5961,
+      "step": 2259
+    },
+    {
+      "epoch": 0.15722285992556262,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.0019163583191496407,
+      "loss": 3.0851,
+      "step": 2260
+    },
+    {
+      "epoch": 0.15729242756269785,
+      "grad_norm": 0.77734375,
+      "learning_rate": 0.0019162680817640429,
+      "loss": 3.232,
+      "step": 2261
+    },
+    {
+      "epoch": 0.15736199519983304,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.00191617779785497,
+      "loss": 3.3365,
+      "step": 2262
+    },
+    {
+      "epoch": 0.15743156283696824,
+      "grad_norm": 0.6640625,
+      "learning_rate": 0.0019160874674270067,
+      "loss": 3.7762,
+      "step": 2263
+    },
+    {
+      "epoch": 0.15750113047410344,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.0019159970904847393,
+      "loss": 3.0307,
+      "step": 2264
+    },
+    {
+      "epoch": 0.15757069811123864,
+      "grad_norm": 0.76171875,
+      "learning_rate": 0.0019159066670327563,
+      "loss": 3.2305,
+      "step": 2265
+    },
+    {
+      "epoch": 0.15764026574837386,
+      "grad_norm": 0.7109375,
+      "learning_rate": 0.0019158161970756493,
+      "loss": 3.0286,
+      "step": 2266
+    },
+    {
+      "epoch": 0.15770983338550906,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.001915725680618012,
+      "loss": 3.1607,
+      "step": 2267
+    },
+    {
+      "epoch": 0.15777940102264426,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0019156351176644404,
+      "loss": 3.2419,
+      "step": 2268
+    },
+    {
+      "epoch": 0.15784896865977946,
+      "grad_norm": 0.65625,
+      "learning_rate": 0.0019155445082195324,
+      "loss": 3.6515,
+      "step": 2269
+    },
+    {
+      "epoch": 0.15791853629691469,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.001915453852287889,
+      "loss": 3.5183,
+      "step": 2270
+    },
+    {
+      "epoch": 0.15798810393404988,
+      "grad_norm": 0.7421875,
+      "learning_rate": 0.0019153631498741133,
+      "loss": 3.5219,
+      "step": 2271
+    },
+    {
+      "epoch": 0.15805767157118508,
+      "grad_norm": 0.78125,
+      "learning_rate": 0.0019152724009828105,
+      "loss": 3.2711,
+      "step": 2272
+    },
+    {
+      "epoch": 0.15812723920832028,
+      "grad_norm": 0.73046875,
+      "learning_rate": 0.0019151816056185887,
+      "loss": 3.2095,
+      "step": 2273
+    },
+    {
+      "epoch": 0.1581968068454555,
+      "grad_norm": 0.734375,
+      "learning_rate": 0.0019150907637860576,
+      "loss": 3.4705,
+      "step": 2274
+    },
+    {
+      "epoch": 0.1582663744825907,
+      "grad_norm": 0.75,
+      "learning_rate": 0.0019149998754898298,
+      "loss": 3.5706,
+      "step": 2275
+    },
+    {
+      "epoch": 0.1583359421197259,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0019149089407345206,
+      "loss": 3.1592,
+      "step": 2276
+    },
+    {
+      "epoch": 0.1584055097568611,
+      "grad_norm": 0.8203125,
+      "learning_rate": 0.0019148179595247468,
+      "loss": 3.2951,
+      "step": 2277
+    },
+    {
+      "epoch": 0.1584750773939963,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0019147269318651279,
+      "loss": 3.4067,
+      "step": 2278
+    },
+    {
+      "epoch": 0.15854464503113153,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.001914635857760286,
+      "loss": 3.2653,
+      "step": 2279
+    },
+    {
+      "epoch": 0.15861421266826672,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.0019145447372148454,
+      "loss": 3.1743,
+      "step": 2280
+    },
+    {
+      "epoch": 0.15868378030540192,
+      "grad_norm": 0.671875,
+      "learning_rate": 0.0019144535702334327,
+      "loss": 3.3068,
+      "step": 2281
+    },
+    {
+      "epoch": 0.15875334794253712,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.001914362356820677,
+      "loss": 3.6477,
+      "step": 2282
+    },
+    {
+      "epoch": 0.15882291557967235,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.0019142710969812092,
+      "loss": 3.1305,
+      "step": 2283
+    },
+    {
+      "epoch": 0.15889248321680755,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0019141797907196638,
+      "loss": 2.9882,
+      "step": 2284
+    },
+    {
+      "epoch": 0.15896205085394274,
+      "grad_norm": 0.6796875,
+      "learning_rate": 0.0019140884380406762,
+      "loss": 3.4047,
+      "step": 2285
+    },
+    {
+      "epoch": 0.15903161849107794,
+      "grad_norm": 0.71484375,
+      "learning_rate": 0.001913997038948885,
+      "loss": 3.4563,
+      "step": 2286
+    },
+    {
+      "epoch": 0.15910118612821317,
+      "grad_norm": 0.71484375,
+      "learning_rate": 0.001913905593448931,
+      "loss": 3.0906,
+      "step": 2287
+    },
+    {
+      "epoch": 0.15917075376534837,
+      "grad_norm": 0.69140625,
+      "learning_rate": 0.0019138141015454578,
+      "loss": 3.483,
+      "step": 2288
+    },
+    {
+      "epoch": 0.15924032140248356,
+      "grad_norm": 0.8125,
+      "learning_rate": 0.00191372256324311,
+      "loss": 3.0954,
+      "step": 2289
+    },
+    {
+      "epoch": 0.15930988903961876,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0019136309785465363,
+      "loss": 3.2783,
+      "step": 2290
+    },
+    {
+      "epoch": 0.15937945667675396,
+      "grad_norm": 0.671875,
+      "learning_rate": 0.0019135393474603863,
+      "loss": 3.4251,
+      "step": 2291
+    },
+    {
+      "epoch": 0.1594490243138892,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0019134476699893131,
+      "loss": 3.1533,
+      "step": 2292
+    },
+    {
+      "epoch": 0.15951859195102439,
+      "grad_norm": 0.76171875,
+      "learning_rate": 0.0019133559461379708,
+      "loss": 3.1875,
+      "step": 2293
+    },
+    {
+      "epoch": 0.15958815958815958,
+      "grad_norm": 0.7578125,
+      "learning_rate": 0.0019132641759110175,
+      "loss": 3.1194,
+      "step": 2294
+    },
+    {
+      "epoch": 0.15965772722529478,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.001913172359313113,
+      "loss": 3.3485,
+      "step": 2295
+    },
+    {
+      "epoch": 0.15972729486243,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0019130804963489183,
+      "loss": 3.5899,
+      "step": 2296
+    },
+    {
+      "epoch": 0.1597968624995652,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0019129885870230983,
+      "loss": 2.8317,
+      "step": 2297
+    },
+    {
+      "epoch": 0.1598664301367004,
+      "grad_norm": 0.6953125,
+      "learning_rate": 0.0019128966313403197,
+      "loss": 3.3308,
+      "step": 2298
+    },
+    {
+      "epoch": 0.1599359977738356,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.0019128046293052515,
+      "loss": 3.3317,
+      "step": 2299
+    },
+    {
+      "epoch": 0.16000556541097083,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.0019127125809225653,
+      "loss": 3.7295,
+      "step": 2300
+    },
+    {
+      "epoch": 0.16007513304810603,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0019126204861969344,
+      "loss": 3.281,
+      "step": 2301
+    },
+    {
+      "epoch": 0.16014470068524123,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.0019125283451330354,
+      "loss": 3.1455,
+      "step": 2302
+    },
+    {
+      "epoch": 0.16021426832237642,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0019124361577355462,
+      "loss": 2.7583,
+      "step": 2303
+    },
+    {
+      "epoch": 0.16028383595951162,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0019123439240091482,
+      "loss": 3.3957,
+      "step": 2304
+    },
+    {
+      "epoch": 0.16035340359664685,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.0019122516439585243,
+      "loss": 3.1651,
+      "step": 2305
+    },
+    {
+      "epoch": 0.16042297123378205,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0019121593175883596,
+      "loss": 3.4217,
+      "step": 2306
+    },
+    {
+      "epoch": 0.16049253887091725,
+      "grad_norm": 0.77734375,
+      "learning_rate": 0.0019120669449033429,
+      "loss": 3.1833,
+      "step": 2307
+    },
+    {
+      "epoch": 0.16056210650805244,
+      "grad_norm": 0.77734375,
+      "learning_rate": 0.0019119745259081635,
+      "loss": 3.5411,
+      "step": 2308
+    },
+    {
+      "epoch": 0.16063167414518767,
+      "grad_norm": 0.7265625,
+      "learning_rate": 0.0019118820606075146,
+      "loss": 3.6817,
+      "step": 2309
+    },
+    {
+      "epoch": 0.16070124178232287,
+      "grad_norm": 0.70703125,
+      "learning_rate": 0.001911789549006091,
+      "loss": 3.2903,
+      "step": 2310
+    },
+    {
+      "epoch": 0.16077080941945807,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.0019116969911085896,
+      "loss": 2.9979,
+      "step": 2311
+    },
+    {
+      "epoch": 0.16084037705659326,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0019116043869197102,
+      "loss": 3.6423,
+      "step": 2312
+    },
+    {
+      "epoch": 0.1609099446937285,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.0019115117364441553,
+      "loss": 3.4296,
+      "step": 2313
+    },
+    {
+      "epoch": 0.1609795123308637,
+      "grad_norm": 0.70703125,
+      "learning_rate": 0.0019114190396866283,
+      "loss": 3.6313,
+      "step": 2314
+    },
+    {
+      "epoch": 0.1610490799679989,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.0019113262966518369,
+      "loss": 3.1692,
+      "step": 2315
+    },
+    {
+      "epoch": 0.16111864760513409,
+      "grad_norm": 0.875,
+      "learning_rate": 0.0019112335073444891,
+      "loss": 3.4536,
+      "step": 2316
+    },
+    {
+      "epoch": 0.16118821524226928,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0019111406717692966,
+      "loss": 2.7128,
+      "step": 2317
+    },
+    {
+      "epoch": 0.1612577828794045,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0019110477899309739,
+      "loss": 3.1569,
+      "step": 2318
+    },
+    {
+      "epoch": 0.1613273505165397,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.001910954861834236,
+      "loss": 3.2842,
+      "step": 2319
+    },
+    {
+      "epoch": 0.1613969181536749,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.001910861887483802,
+      "loss": 3.1674,
+      "step": 2320
+    },
+    {
+      "epoch": 0.1614664857908101,
+      "grad_norm": 0.6953125,
+      "learning_rate": 0.0019107688668843924,
+      "loss": 3.3832,
+      "step": 2321
+    },
+    {
+      "epoch": 0.16153605342794533,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00191067580004073,
+      "loss": 2.9273,
+      "step": 2322
+    },
+    {
+      "epoch": 0.16160562106508053,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.001910582686957541,
+      "loss": 3.4529,
+      "step": 2323
+    },
+    {
+      "epoch": 0.16167518870221573,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.001910489527639553,
+      "loss": 3.2073,
+      "step": 2324
+    },
+    {
+      "epoch": 0.16174475633935093,
+      "grad_norm": 0.75390625,
+      "learning_rate": 0.0019103963220914958,
+      "loss": 3.587,
+      "step": 2325
+    },
+    {
+      "epoch": 0.16181432397648615,
+      "grad_norm": 0.8203125,
+      "learning_rate": 0.001910303070318102,
+      "loss": 2.9228,
+      "step": 2326
+    },
+    {
+      "epoch": 0.16188389161362135,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0019102097723241065,
+      "loss": 3.3639,
+      "step": 2327
+    },
+    {
+      "epoch": 0.16195345925075655,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.0019101164281142466,
+      "loss": 3.4723,
+      "step": 2328
+    },
+    {
+      "epoch": 0.16202302688789175,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.0019100230376932618,
+      "loss": 3.236,
+      "step": 2329
+    },
+    {
+      "epoch": 0.16209259452502695,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.001909929601065894,
+      "loss": 3.2439,
+      "step": 2330
+    },
+    {
+      "epoch": 0.16216216216216217,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.0019098361182368878,
+      "loss": 3.2361,
+      "step": 2331
+    },
+    {
+      "epoch": 0.16223172979929737,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0019097425892109889,
+      "loss": 3.3246,
+      "step": 2332
+    },
+    {
+      "epoch": 0.16230129743643257,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.0019096490139929472,
+      "loss": 2.9247,
+      "step": 2333
+    },
+    {
+      "epoch": 0.16237086507356777,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0019095553925875133,
+      "loss": 3.1494,
+      "step": 2334
+    },
+    {
+      "epoch": 0.162440432710703,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.001909461724999441,
+      "loss": 3.2959,
+      "step": 2335
+    },
+    {
+      "epoch": 0.1625100003478382,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0019093680112334864,
+      "loss": 3.0625,
+      "step": 2336
+    },
+    {
+      "epoch": 0.1625795679849734,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.001909274251294408,
+      "loss": 3.179,
+      "step": 2337
+    },
+    {
+      "epoch": 0.1626491356221086,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.001909180445186966,
+      "loss": 3.1252,
+      "step": 2338
+    },
+    {
+      "epoch": 0.1627187032592438,
+      "grad_norm": 0.71484375,
+      "learning_rate": 0.0019090865929159233,
+      "loss": 3.6039,
+      "step": 2339
+    },
+    {
+      "epoch": 0.162788270896379,
+      "grad_norm": 0.77734375,
+      "learning_rate": 0.0019089926944860461,
+      "loss": 3.4938,
+      "step": 2340
+    },
+    {
+      "epoch": 0.1628578385335142,
+      "grad_norm": 0.59375,
+      "learning_rate": 0.0019088987499021012,
+      "loss": 3.6166,
+      "step": 2341
+    },
+    {
+      "epoch": 0.1629274061706494,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.001908804759168859,
+      "loss": 3.4761,
+      "step": 2342
+    },
+    {
+      "epoch": 0.1629969738077846,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.001908710722291092,
+      "loss": 3.2296,
+      "step": 2343
+    },
+    {
+      "epoch": 0.16306654144491983,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.0019086166392735745,
+      "loss": 3.1014,
+      "step": 2344
+    },
+    {
+      "epoch": 0.16313610908205503,
+      "grad_norm": 0.68359375,
+      "learning_rate": 0.001908522510121084,
+      "loss": 3.6301,
+      "step": 2345
+    },
+    {
+      "epoch": 0.16320567671919023,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.0019084283348383994,
+      "loss": 2.8503,
+      "step": 2346
+    },
+    {
+      "epoch": 0.16327524435632543,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.0019083341134303034,
+      "loss": 3.1469,
+      "step": 2347
+    },
+    {
+      "epoch": 0.16334481199346065,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.001908239845901579,
+      "loss": 3.2092,
+      "step": 2348
+    },
+    {
+      "epoch": 0.16341437963059585,
+      "grad_norm": 0.6953125,
+      "learning_rate": 0.0019081455322570134,
+      "loss": 3.4614,
+      "step": 2349
+    },
+    {
+      "epoch": 0.16348394726773105,
+      "grad_norm": 0.7109375,
+      "learning_rate": 0.001908051172501395,
+      "loss": 2.7877,
+      "step": 2350
+    },
+    {
+      "epoch": 0.16355351490486625,
+      "grad_norm": 0.703125,
+      "learning_rate": 0.0019079567666395146,
+      "loss": 3.1249,
+      "step": 2351
+    },
+    {
+      "epoch": 0.16362308254200147,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.0019078623146761662,
+      "loss": 3.3158,
+      "step": 2352
+    },
+    {
+      "epoch": 0.16369265017913667,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.0019077678166161457,
+      "loss": 3.1093,
+      "step": 2353
+    },
+    {
+      "epoch": 0.16376221781627187,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0019076732724642507,
+      "loss": 3.0221,
+      "step": 2354
+    },
+    {
+      "epoch": 0.16383178545340707,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0019075786822252822,
+      "loss": 3.3588,
+      "step": 2355
+    },
+    {
+      "epoch": 0.16390135309054227,
+      "grad_norm": 0.5546875,
+      "learning_rate": 0.0019074840459040426,
+      "loss": 3.5697,
+      "step": 2356
+    },
+    {
+      "epoch": 0.1639709207276775,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0019073893635053372,
+      "loss": 3.0052,
+      "step": 2357
+    },
+    {
+      "epoch": 0.1640404883648127,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0019072946350339732,
+      "loss": 2.9758,
+      "step": 2358
+    },
+    {
+      "epoch": 0.1641100560019479,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0019071998604947612,
+      "loss": 2.8526,
+      "step": 2359
+    },
+    {
+      "epoch": 0.1641796236390831,
+      "grad_norm": 0.76953125,
+      "learning_rate": 0.0019071050398925128,
+      "loss": 3.6262,
+      "step": 2360
+    },
+    {
+      "epoch": 0.16424919127621831,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0019070101732320426,
+      "loss": 3.246,
+      "step": 2361
+    },
+    {
+      "epoch": 0.1643187589133535,
+      "grad_norm": 0.6484375,
+      "learning_rate": 0.0019069152605181673,
+      "loss": 3.2814,
+      "step": 2362
+    },
+    {
+      "epoch": 0.1643883265504887,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.0019068203017557064,
+      "loss": 3.2555,
+      "step": 2363
+    },
+    {
+      "epoch": 0.1644578941876239,
+      "grad_norm": 0.73046875,
+      "learning_rate": 0.0019067252969494812,
+      "loss": 2.951,
+      "step": 2364
+    },
+    {
+      "epoch": 0.16452746182475914,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.0019066302461043158,
+      "loss": 3.3416,
+      "step": 2365
+    },
+    {
+      "epoch": 0.16459702946189433,
+      "grad_norm": 0.7265625,
+      "learning_rate": 0.0019065351492250362,
+      "loss": 3.3133,
+      "step": 2366
+    },
+    {
+      "epoch": 0.16466659709902953,
+      "grad_norm": 0.6796875,
+      "learning_rate": 0.0019064400063164711,
+      "loss": 3.5457,
+      "step": 2367
+    },
+    {
+      "epoch": 0.16473616473616473,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.001906344817383451,
+      "loss": 3.1784,
+      "step": 2368
+    },
+    {
+      "epoch": 0.16480573237329993,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.0019062495824308098,
+      "loss": 3.2771,
+      "step": 2369
+    },
+    {
+      "epoch": 0.16487530001043516,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0019061543014633822,
+      "loss": 2.851,
+      "step": 2370
+    },
+    {
+      "epoch": 0.16494486764757035,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0019060589744860068,
+      "loss": 3.1726,
+      "step": 2371
+    },
+    {
+      "epoch": 0.16501443528470555,
+      "grad_norm": 0.77734375,
+      "learning_rate": 0.0019059636015035235,
+      "loss": 3.5959,
+      "step": 2372
+    },
+    {
+      "epoch": 0.16508400292184075,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0019058681825207748,
+      "loss": 3.3155,
+      "step": 2373
+    },
+    {
+      "epoch": 0.16515357055897598,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.001905772717542606,
+      "loss": 3.2452,
+      "step": 2374
+    },
+    {
+      "epoch": 0.16522313819611117,
+      "grad_norm": 0.77734375,
+      "learning_rate": 0.0019056772065738636,
+      "loss": 3.0752,
+      "step": 2375
+    },
+    {
+      "epoch": 0.16529270583324637,
+      "grad_norm": 0.70703125,
+      "learning_rate": 0.0019055816496193981,
+      "loss": 3.1546,
+      "step": 2376
+    },
+    {
+      "epoch": 0.16536227347038157,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0019054860466840606,
+      "loss": 3.1118,
+      "step": 2377
+    },
+    {
+      "epoch": 0.16543184110751677,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0019053903977727057,
+      "loss": 3.4787,
+      "step": 2378
+    },
+    {
+      "epoch": 0.165501408744652,
+      "grad_norm": 0.72265625,
+      "learning_rate": 0.0019052947028901897,
+      "loss": 3.3528,
+      "step": 2379
+    },
+    {
+      "epoch": 0.1655709763817872,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.0019051989620413718,
+      "loss": 3.4341,
+      "step": 2380
+    },
+    {
+      "epoch": 0.1656405440189224,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.0019051031752311135,
+      "loss": 3.4025,
+      "step": 2381
+    },
+    {
+      "epoch": 0.1657101116560576,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0019050073424642779,
+      "loss": 3.6356,
+      "step": 2382
+    },
+    {
+      "epoch": 0.16577967929319282,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.0019049114637457306,
+      "loss": 3.1271,
+      "step": 2383
+    },
+    {
+      "epoch": 0.16584924693032801,
+      "grad_norm": 0.75390625,
+      "learning_rate": 0.0019048155390803405,
+      "loss": 3.3048,
+      "step": 2384
+    },
+    {
+      "epoch": 0.1659188145674632,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0019047195684729781,
+      "loss": 3.3535,
+      "step": 2385
+    },
+    {
+      "epoch": 0.1659883822045984,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.001904623551928516,
+      "loss": 3.0647,
+      "step": 2386
+    },
+    {
+      "epoch": 0.16605794984173364,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0019045274894518296,
+      "loss": 3.1102,
+      "step": 2387
+    },
+    {
+      "epoch": 0.16612751747886884,
+      "grad_norm": 0.71484375,
+      "learning_rate": 0.0019044313810477964,
+      "loss": 3.2833,
+      "step": 2388
+    },
+    {
+      "epoch": 0.16619708511600403,
+      "grad_norm": 0.6875,
+      "learning_rate": 0.0019043352267212965,
+      "loss": 3.1556,
+      "step": 2389
+    },
+    {
+      "epoch": 0.16626665275313923,
+      "grad_norm": 0.6953125,
+      "learning_rate": 0.0019042390264772118,
+      "loss": 3.3388,
+      "step": 2390
+    },
+    {
+      "epoch": 0.16633622039027443,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.001904142780320427,
+      "loss": 2.995,
+      "step": 2391
+    },
+    {
+      "epoch": 0.16640578802740966,
+      "grad_norm": 0.63671875,
+      "learning_rate": 0.0019040464882558292,
+      "loss": 3.6169,
+      "step": 2392
+    },
+    {
+      "epoch": 0.16647535566454486,
+      "grad_norm": 0.77734375,
+      "learning_rate": 0.0019039501502883071,
+      "loss": 2.7984,
+      "step": 2393
+    },
+    {
+      "epoch": 0.16654492330168005,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.001903853766422753,
+      "loss": 3.3614,
+      "step": 2394
+    },
+    {
+      "epoch": 0.16661449093881525,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0019037573366640604,
+      "loss": 3.0099,
+      "step": 2395
+    },
+    {
+      "epoch": 0.16668405857595048,
+      "grad_norm": 0.65234375,
+      "learning_rate": 0.0019036608610171256,
+      "loss": 3.6974,
+      "step": 2396
+    },
+    {
+      "epoch": 0.16675362621308568,
+      "grad_norm": 0.72265625,
+      "learning_rate": 0.0019035643394868468,
+      "loss": 3.5691,
+      "step": 2397
+    },
+    {
+      "epoch": 0.16682319385022087,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.001903467772078125,
+      "loss": 3.4569,
+      "step": 2398
+    },
+    {
+      "epoch": 0.16689276148735607,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.0019033711587958639,
+      "loss": 2.9937,
+      "step": 2399
+    },
+    {
+      "epoch": 0.1669623291244913,
+      "grad_norm": 0.69140625,
+      "learning_rate": 0.0019032744996449688,
+      "loss": 3.3972,
+      "step": 2400
+    },
+    {
+      "epoch": 0.1670318967616265,
+      "grad_norm": 0.76171875,
+      "learning_rate": 0.001903177794630347,
+      "loss": 3.1435,
+      "step": 2401
+    },
+    {
+      "epoch": 0.1671014643987617,
+      "grad_norm": 0.8125,
+      "learning_rate": 0.0019030810437569096,
+      "loss": 3.33,
+      "step": 2402
+    },
+    {
+      "epoch": 0.1671710320358969,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0019029842470295682,
+      "loss": 3.0786,
+      "step": 2403
+    },
+    {
+      "epoch": 0.1672405996730321,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.0019028874044532383,
+      "loss": 3.3754,
+      "step": 2404
+    },
+    {
+      "epoch": 0.16731016731016732,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.001902790516032837,
+      "loss": 3.0439,
+      "step": 2405
+    },
+    {
+      "epoch": 0.16737973494730252,
+      "grad_norm": 0.69921875,
+      "learning_rate": 0.0019026935817732836,
+      "loss": 3.9029,
+      "step": 2406
+    },
+    {
+      "epoch": 0.16744930258443771,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0019025966016795,
+      "loss": 3.1789,
+      "step": 2407
+    },
+    {
+      "epoch": 0.1675188702215729,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0019024995757564102,
+      "loss": 3.7364,
+      "step": 2408
+    },
+    {
+      "epoch": 0.16758843785870814,
+      "grad_norm": 0.73828125,
+      "learning_rate": 0.0019024025040089412,
+      "loss": 3.3665,
+      "step": 2409
+    },
+    {
+      "epoch": 0.16765800549584334,
+      "grad_norm": 0.8203125,
+      "learning_rate": 0.0019023053864420216,
+      "loss": 3.232,
+      "step": 2410
+    },
+    {
+      "epoch": 0.16772757313297854,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.0019022082230605822,
+      "loss": 3.2489,
+      "step": 2411
+    },
+    {
+      "epoch": 0.16779714077011373,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.0019021110138695567,
+      "loss": 3.1511,
+      "step": 2412
+    },
+    {
+      "epoch": 0.16786670840724896,
+      "grad_norm": 0.78125,
+      "learning_rate": 0.0019020137588738808,
+      "loss": 3.2653,
+      "step": 2413
+    },
+    {
+      "epoch": 0.16793627604438416,
+      "grad_norm": 0.76953125,
+      "learning_rate": 0.001901916458078493,
+      "loss": 3.3701,
+      "step": 2414
+    },
+    {
+      "epoch": 0.16800584368151936,
+      "grad_norm": 0.74609375,
+      "learning_rate": 0.0019018191114883332,
+      "loss": 3.3645,
+      "step": 2415
+    },
+    {
+      "epoch": 0.16807541131865456,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0019017217191083446,
+      "loss": 2.9964,
+      "step": 2416
+    },
+    {
+      "epoch": 0.16814497895578975,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.001901624280943472,
+      "loss": 3.108,
+      "step": 2417
+    },
+    {
+      "epoch": 0.16821454659292498,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.001901526796998663,
+      "loss": 3.2885,
+      "step": 2418
+    },
+    {
+      "epoch": 0.16828411423006018,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0019014292672788673,
+      "loss": 3.3568,
+      "step": 2419
+    },
+    {
+      "epoch": 0.16835368186719538,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0019013316917890369,
+      "loss": 3.131,
+      "step": 2420
+    },
+    {
+      "epoch": 0.16842324950433057,
+      "grad_norm": 0.72265625,
+      "learning_rate": 0.0019012340705341262,
+      "loss": 3.459,
+      "step": 2421
+    },
+    {
+      "epoch": 0.1684928171414658,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.001901136403519092,
+      "loss": 3.1376,
+      "step": 2422
+    },
+    {
+      "epoch": 0.168562384778601,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.0019010386907488933,
+      "loss": 2.9253,
+      "step": 2423
+    },
+    {
+      "epoch": 0.1686319524157362,
+      "grad_norm": 0.74609375,
+      "learning_rate": 0.0019009409322284915,
+      "loss": 3.3112,
+      "step": 2424
+    },
+    {
+      "epoch": 0.1687015200528714,
+      "grad_norm": 0.76171875,
+      "learning_rate": 0.00190084312796285,
+      "loss": 3.2567,
+      "step": 2425
+    },
+    {
+      "epoch": 0.16877108769000662,
+      "grad_norm": 0.74609375,
+      "learning_rate": 0.0019007452779569354,
+      "loss": 3.2253,
+      "step": 2426
+    },
+    {
+      "epoch": 0.16884065532714182,
+      "grad_norm": 0.671875,
+      "learning_rate": 0.0019006473822157153,
+      "loss": 3.2455,
+      "step": 2427
+    },
+    {
+      "epoch": 0.16891022296427702,
+      "grad_norm": 0.59765625,
+      "learning_rate": 0.001900549440744161,
+      "loss": 3.3872,
+      "step": 2428
+    },
+    {
+      "epoch": 0.16897979060141222,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.001900451453547245,
+      "loss": 3.3497,
+      "step": 2429
+    },
+    {
+      "epoch": 0.16904935823854741,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.001900353420629943,
+      "loss": 3.0166,
+      "step": 2430
+    },
+    {
+      "epoch": 0.16911892587568264,
+      "grad_norm": 0.72265625,
+      "learning_rate": 0.0019002553419972324,
+      "loss": 3.2642,
+      "step": 2431
+    },
+    {
+      "epoch": 0.16918849351281784,
+      "grad_norm": 0.79296875,
+      "learning_rate": 0.001900157217654093,
+      "loss": 3.4829,
+      "step": 2432
+    },
+    {
+      "epoch": 0.16925806114995304,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0019000590476055076,
+      "loss": 3.4229,
+      "step": 2433
+    },
+    {
+      "epoch": 0.16932762878708824,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.00189996083185646,
+      "loss": 3.0214,
+      "step": 2434
+    },
+    {
+      "epoch": 0.16939719642422346,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0018998625704119377,
+      "loss": 3.3381,
+      "step": 2435
+    },
+    {
+      "epoch": 0.16946676406135866,
+      "grad_norm": 0.75390625,
+      "learning_rate": 0.0018997642632769297,
+      "loss": 3.724,
+      "step": 2436
+    },
+    {
+      "epoch": 0.16953633169849386,
+      "grad_norm": 0.6953125,
+      "learning_rate": 0.0018996659104564273,
+      "loss": 3.2562,
+      "step": 2437
+    },
+    {
+      "epoch": 0.16960589933562906,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.001899567511955425,
+      "loss": 2.8605,
+      "step": 2438
+    },
+    {
+      "epoch": 0.16967546697276428,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.0018994690677789183,
+      "loss": 3.4274,
+      "step": 2439
+    },
+    {
+      "epoch": 0.16974503460989948,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.0018993705779319062,
+      "loss": 3.4065,
+      "step": 2440
+    },
+    {
+      "epoch": 0.16981460224703468,
+      "grad_norm": 0.7265625,
+      "learning_rate": 0.0018992720424193892,
+      "loss": 3.3679,
+      "step": 2441
+    },
+    {
+      "epoch": 0.16988416988416988,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0018991734612463706,
+      "loss": 3.1656,
+      "step": 2442
+    },
+    {
+      "epoch": 0.16995373752130508,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.001899074834417856,
+      "loss": 2.9411,
+      "step": 2443
+    },
+    {
+      "epoch": 0.1700233051584403,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.0018989761619388527,
+      "loss": 3.1267,
+      "step": 2444
+    },
+    {
+      "epoch": 0.1700928727955755,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0018988774438143713,
+      "loss": 3.4626,
+      "step": 2445
+    },
+    {
+      "epoch": 0.1701624404327107,
+      "grad_norm": 0.6875,
+      "learning_rate": 0.0018987786800494235,
+      "loss": 2.9381,
+      "step": 2446
+    },
+    {
+      "epoch": 0.1702320080698459,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.001898679870649025,
+      "loss": 3.3637,
+      "step": 2447
+    },
+    {
+      "epoch": 0.17030157570698112,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0018985810156181922,
+      "loss": 3.1177,
+      "step": 2448
+    },
+    {
+      "epoch": 0.17037114334411632,
+      "grad_norm": 0.71484375,
+      "learning_rate": 0.0018984821149619444,
+      "loss": 3.3731,
+      "step": 2449
+    },
+    {
+      "epoch": 0.17044071098125152,
+      "grad_norm": 0.69921875,
+      "learning_rate": 0.001898383168685304,
+      "loss": 3.191,
+      "step": 2450
+    },
+    {
+      "epoch": 0.17051027861838672,
+      "grad_norm": 0.59375,
+      "learning_rate": 0.001898284176793294,
+      "loss": 3.696,
+      "step": 2451
+    },
+    {
+      "epoch": 0.17057984625552194,
+      "grad_norm": 0.6796875,
+      "learning_rate": 0.0018981851392909413,
+      "loss": 3.4107,
+      "step": 2452
+    },
+    {
+      "epoch": 0.17064941389265714,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0018980860561832746,
+      "loss": 3.6169,
+      "step": 2453
+    },
+    {
+      "epoch": 0.17071898152979234,
+      "grad_norm": 0.76171875,
+      "learning_rate": 0.0018979869274753246,
+      "loss": 3.0298,
+      "step": 2454
+    },
+    {
+      "epoch": 0.17078854916692754,
+      "grad_norm": 0.765625,
+      "learning_rate": 0.0018978877531721245,
+      "loss": 3.2814,
+      "step": 2455
+    },
+    {
+      "epoch": 0.17085811680406274,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00189778853327871,
+      "loss": 2.9714,
+      "step": 2456
+    },
+    {
+      "epoch": 0.17092768444119796,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.001897689267800119,
+      "loss": 3.0664,
+      "step": 2457
+    },
+    {
+      "epoch": 0.17099725207833316,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0018975899567413915,
+      "loss": 2.9897,
+      "step": 2458
+    },
+    {
+      "epoch": 0.17106681971546836,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0018974906001075706,
+      "loss": 3.3436,
+      "step": 2459
+    },
+    {
+      "epoch": 0.17113638735260356,
+      "grad_norm": 0.734375,
+      "learning_rate": 0.0018973911979037004,
+      "loss": 3.2716,
+      "step": 2460
+    },
+    {
+      "epoch": 0.17120595498973878,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.0018972917501348283,
+      "loss": 3.4302,
+      "step": 2461
+    },
+    {
+      "epoch": 0.17127552262687398,
+      "grad_norm": 0.765625,
+      "learning_rate": 0.001897192256806004,
+      "loss": 3.034,
+      "step": 2462
+    },
+    {
+      "epoch": 0.17134509026400918,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.001897092717922279,
+      "loss": 3.1561,
+      "step": 2463
+    },
+    {
+      "epoch": 0.17141465790114438,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0018969931334887073,
+      "loss": 3.3995,
+      "step": 2464
+    },
+    {
+      "epoch": 0.1714842255382796,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0018968935035103458,
+      "loss": 3.1383,
+      "step": 2465
+    },
+    {
+      "epoch": 0.1715537931754148,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0018967938279922528,
+      "loss": 3.1904,
+      "step": 2466
+    },
+    {
+      "epoch": 0.17162336081255,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.0018966941069394894,
+      "loss": 3.2457,
+      "step": 2467
+    },
+    {
+      "epoch": 0.1716929284496852,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.001896594340357119,
+      "loss": 3.4366,
+      "step": 2468
+    },
+    {
+      "epoch": 0.1717624960868204,
+      "grad_norm": 0.75390625,
+      "learning_rate": 0.001896494528250207,
+      "loss": 3.1295,
+      "step": 2469
+    },
+    {
+      "epoch": 0.17183206372395562,
+      "grad_norm": 0.7109375,
+      "learning_rate": 0.0018963946706238213,
+      "loss": 3.4161,
+      "step": 2470
+    },
+    {
+      "epoch": 0.17190163136109082,
+      "grad_norm": 0.71484375,
+      "learning_rate": 0.0018962947674830324,
+      "loss": 3.6559,
+      "step": 2471
+    },
+    {
+      "epoch": 0.17197119899822602,
+      "grad_norm": 0.75,
+      "learning_rate": 0.0018961948188329133,
+      "loss": 3.0453,
+      "step": 2472
+    },
+    {
+      "epoch": 0.17204076663536122,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.0018960948246785382,
+      "loss": 3.4069,
+      "step": 2473
+    },
+    {
+      "epoch": 0.17211033427249645,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.0018959947850249845,
+      "loss": 3.2656,
+      "step": 2474
+    },
+    {
+      "epoch": 0.17217990190963164,
+      "grad_norm": 0.7265625,
+      "learning_rate": 0.0018958946998773318,
+      "loss": 3.3958,
+      "step": 2475
+    },
+    {
+      "epoch": 0.17224946954676684,
+      "grad_norm": 0.69140625,
+      "learning_rate": 0.0018957945692406621,
+      "loss": 3.2935,
+      "step": 2476
+    },
+    {
+      "epoch": 0.17231903718390204,
+      "grad_norm": 0.59375,
+      "learning_rate": 0.0018956943931200591,
+      "loss": 3.5589,
+      "step": 2477
+    },
+    {
+      "epoch": 0.17238860482103727,
+      "grad_norm": 0.78125,
+      "learning_rate": 0.0018955941715206096,
+      "loss": 2.9786,
+      "step": 2478
+    },
+    {
+      "epoch": 0.17245817245817247,
+      "grad_norm": 0.76171875,
+      "learning_rate": 0.001895493904447402,
+      "loss": 3.2443,
+      "step": 2479
+    },
+    {
+      "epoch": 0.17252774009530766,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0018953935919055276,
+      "loss": 3.1005,
+      "step": 2480
+    },
+    {
+      "epoch": 0.17259730773244286,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00189529323390008,
+      "loss": 3.1678,
+      "step": 2481
+    },
+    {
+      "epoch": 0.17266687536957806,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.0018951928304361543,
+      "loss": 2.9976,
+      "step": 2482
+    },
+    {
+      "epoch": 0.17273644300671329,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.001895092381518849,
+      "loss": 3.3298,
+      "step": 2483
+    },
+    {
+      "epoch": 0.17280601064384848,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0018949918871532638,
+      "loss": 3.0909,
+      "step": 2484
+    },
+    {
+      "epoch": 0.17287557828098368,
+      "grad_norm": 0.609375,
+      "learning_rate": 0.001894891347344502,
+      "loss": 3.2494,
+      "step": 2485
+    },
+    {
+      "epoch": 0.17294514591811888,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.001894790762097668,
+      "loss": 3.1686,
+      "step": 2486
+    },
+    {
+      "epoch": 0.1730147135552541,
+      "grad_norm": 0.65625,
+      "learning_rate": 0.0018946901314178693,
+      "loss": 3.6562,
+      "step": 2487
+    },
+    {
+      "epoch": 0.1730842811923893,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.0018945894553102152,
+      "loss": 3.07,
+      "step": 2488
+    },
+    {
+      "epoch": 0.1731538488295245,
+      "grad_norm": 0.8203125,
+      "learning_rate": 0.0018944887337798177,
+      "loss": 2.9494,
+      "step": 2489
+    },
+    {
+      "epoch": 0.1732234164666597,
+      "grad_norm": 0.76953125,
+      "learning_rate": 0.0018943879668317906,
+      "loss": 3.03,
+      "step": 2490
+    },
+    {
+      "epoch": 0.17329298410379493,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.0018942871544712508,
+      "loss": 3.5412,
+      "step": 2491
+    },
+    {
+      "epoch": 0.17336255174093013,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.001894186296703317,
+      "loss": 3.2025,
+      "step": 2492
+    },
+    {
+      "epoch": 0.17343211937806532,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.00189408539353311,
+      "loss": 2.7529,
+      "step": 2493
+    },
+    {
+      "epoch": 0.17350168701520052,
+      "grad_norm": 0.78125,
+      "learning_rate": 0.001893984444965753,
+      "loss": 3.3874,
+      "step": 2494
+    },
+    {
+      "epoch": 0.17357125465233572,
+      "grad_norm": 0.7265625,
+      "learning_rate": 0.001893883451006372,
+      "loss": 3.3101,
+      "step": 2495
+    },
+    {
+      "epoch": 0.17364082228947095,
+      "grad_norm": 0.65234375,
+      "learning_rate": 0.001893782411660095,
+      "loss": 3.3576,
+      "step": 2496
+    },
+    {
+      "epoch": 0.17371038992660615,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.001893681326932052,
+      "loss": 3.1649,
+      "step": 2497
+    },
+    {
+      "epoch": 0.17377995756374134,
+      "grad_norm": 0.7578125,
+      "learning_rate": 0.0018935801968273758,
+      "loss": 2.8563,
+      "step": 2498
+    },
+    {
+      "epoch": 0.17384952520087654,
+      "grad_norm": 0.671875,
+      "learning_rate": 0.0018934790213512014,
+      "loss": 3.3319,
+      "step": 2499
+    },
+    {
+      "epoch": 0.17391909283801177,
+      "grad_norm": 0.765625,
+      "learning_rate": 0.0018933778005086653,
+      "loss": 3.5107,
+      "step": 2500
+    },
+    {
+      "epoch": 0.17398866047514697,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.0018932765343049076,
+      "loss": 3.4601,
+      "step": 2501
+    },
+    {
+      "epoch": 0.17405822811228217,
+      "grad_norm": 0.78125,
+      "learning_rate": 0.0018931752227450702,
+      "loss": 3.3166,
+      "step": 2502
+    },
+    {
+      "epoch": 0.17412779574941736,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.0018930738658342965,
+      "loss": 3.2444,
+      "step": 2503
+    },
+    {
+      "epoch": 0.1741973633865526,
+      "grad_norm": 0.61328125,
+      "learning_rate": 0.0018929724635777336,
+      "loss": 3.1286,
+      "step": 2504
+    },
+    {
+      "epoch": 0.1742669310236878,
+      "grad_norm": 0.79296875,
+      "learning_rate": 0.00189287101598053,
+      "loss": 3.2518,
+      "step": 2505
+    },
+    {
+      "epoch": 0.17433649866082299,
+      "grad_norm": 0.7265625,
+      "learning_rate": 0.0018927695230478365,
+      "loss": 3.1074,
+      "step": 2506
+    },
+    {
+      "epoch": 0.17440606629795818,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.0018926679847848064,
+      "loss": 3.2652,
+      "step": 2507
+    },
+    {
+      "epoch": 0.17447563393509338,
+      "grad_norm": 0.66796875,
+      "learning_rate": 0.0018925664011965955,
+      "loss": 3.2207,
+      "step": 2508
+    },
+    {
+      "epoch": 0.1745452015722286,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0018924647722883617,
+      "loss": 2.8783,
+      "step": 2509
+    },
+    {
+      "epoch": 0.1746147692093638,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0018923630980652649,
+      "loss": 3.0064,
+      "step": 2510
+    },
+    {
+      "epoch": 0.174684336846499,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.001892261378532468,
+      "loss": 3.5639,
+      "step": 2511
+    },
+    {
+      "epoch": 0.1747539044836342,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.0018921596136951355,
+      "loss": 3.1965,
+      "step": 2512
+    },
+    {
+      "epoch": 0.17482347212076943,
+      "grad_norm": 0.71875,
+      "learning_rate": 0.0018920578035584348,
+      "loss": 2.857,
+      "step": 2513
+    },
+    {
+      "epoch": 0.17489303975790463,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.001891955948127535,
+      "loss": 2.7816,
+      "step": 2514
+    },
+    {
+      "epoch": 0.17496260739503983,
+      "grad_norm": 0.8125,
+      "learning_rate": 0.0018918540474076081,
+      "loss": 3.2822,
+      "step": 2515
+    },
+    {
+      "epoch": 0.17503217503217502,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0018917521014038278,
+      "loss": 3.0852,
+      "step": 2516
+    },
+    {
+      "epoch": 0.17510174266931022,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0018916501101213705,
+      "loss": 3.4133,
+      "step": 2517
+    },
+    {
+      "epoch": 0.17517131030644545,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.001891548073565415,
+      "loss": 3.0457,
+      "step": 2518
+    },
+    {
+      "epoch": 0.17524087794358065,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.0018914459917411422,
+      "loss": 2.9106,
+      "step": 2519
+    },
+    {
+      "epoch": 0.17531044558071585,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.0018913438646537349,
+      "loss": 3.2036,
+      "step": 2520
+    },
+    {
+      "epoch": 0.17538001321785104,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.0018912416923083791,
+      "loss": 2.8513,
+      "step": 2521
+    },
+    {
+      "epoch": 0.17544958085498627,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0018911394747102622,
+      "loss": 3.2675,
+      "step": 2522
+    },
+    {
+      "epoch": 0.17551914849212147,
+      "grad_norm": 0.77734375,
+      "learning_rate": 0.0018910372118645742,
+      "loss": 3.3492,
+      "step": 2523
+    },
+    {
+      "epoch": 0.17558871612925667,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.001890934903776508,
+      "loss": 3.0853,
+      "step": 2524
+    },
+    {
+      "epoch": 0.17565828376639187,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.001890832550451258,
+      "loss": 3.0819,
+      "step": 2525
+    },
+    {
+      "epoch": 0.1757278514035271,
+      "grad_norm": 0.8125,
+      "learning_rate": 0.0018907301518940214,
+      "loss": 3.1941,
+      "step": 2526
+    },
+    {
+      "epoch": 0.1757974190406623,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.0018906277081099973,
+      "loss": 3.2225,
+      "step": 2527
+    },
+    {
+      "epoch": 0.1758669866777975,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0018905252191043869,
+      "loss": 3.2995,
+      "step": 2528
+    },
+    {
+      "epoch": 0.17593655431493269,
+      "grad_norm": 0.8203125,
+      "learning_rate": 0.0018904226848823948,
+      "loss": 2.9885,
+      "step": 2529
+    },
+    {
+      "epoch": 0.17600612195206788,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.0018903201054492266,
+      "loss": 3.4344,
+      "step": 2530
+    },
+    {
+      "epoch": 0.1760756895892031,
+      "grad_norm": 0.79296875,
+      "learning_rate": 0.0018902174808100912,
+      "loss": 3.1054,
+      "step": 2531
+    },
+    {
+      "epoch": 0.1761452572263383,
+      "grad_norm": 0.68359375,
+      "learning_rate": 0.0018901148109701988,
+      "loss": 3.2626,
+      "step": 2532
+    },
+    {
+      "epoch": 0.1762148248634735,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.0018900120959347633,
+      "loss": 3.1817,
+      "step": 2533
+    },
+    {
+      "epoch": 0.1762843925006087,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0018899093357089992,
+      "loss": 3.0,
+      "step": 2534
+    },
+    {
+      "epoch": 0.17635396013774393,
+      "grad_norm": 0.71875,
+      "learning_rate": 0.0018898065302981246,
+      "loss": 3.1899,
+      "step": 2535
+    },
+    {
+      "epoch": 0.17642352777487913,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.0018897036797073594,
+      "loss": 2.9696,
+      "step": 2536
+    },
+    {
+      "epoch": 0.17649309541201433,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.0018896007839419259,
+      "loss": 2.9419,
+      "step": 2537
+    },
+    {
+      "epoch": 0.17656266304914953,
+      "grad_norm": 0.70703125,
+      "learning_rate": 0.0018894978430070482,
+      "loss": 3.2079,
+      "step": 2538
+    },
+    {
+      "epoch": 0.17663223068628475,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0018893948569079536,
+      "loss": 3.162,
+      "step": 2539
+    },
+    {
+      "epoch": 0.17670179832341995,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.001889291825649871,
+      "loss": 3.4439,
+      "step": 2540
+    },
+    {
+      "epoch": 0.17677136596055515,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.001889188749238032,
+      "loss": 3.2802,
+      "step": 2541
+    },
+    {
+      "epoch": 0.17684093359769035,
+      "grad_norm": 0.609375,
+      "learning_rate": 0.00188908562767767,
+      "loss": 3.414,
+      "step": 2542
+    },
+    {
+      "epoch": 0.17691050123482555,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.001888982460974021,
+      "loss": 2.7324,
+      "step": 2543
+    },
+    {
+      "epoch": 0.17698006887196077,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.001888879249132324,
+      "loss": 3.433,
+      "step": 2544
+    },
+    {
+      "epoch": 0.17704963650909597,
+      "grad_norm": 0.79296875,
+      "learning_rate": 0.0018887759921578184,
+      "loss": 3.1548,
+      "step": 2545
+    },
+    {
+      "epoch": 0.17711920414623117,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.001888672690055748,
+      "loss": 3.1417,
+      "step": 2546
+    },
+    {
+      "epoch": 0.17718877178336637,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.0018885693428313576,
+      "loss": 3.1294,
+      "step": 2547
+    },
+    {
+      "epoch": 0.1772583394205016,
+      "grad_norm": 0.734375,
+      "learning_rate": 0.0018884659504898947,
+      "loss": 3.7066,
+      "step": 2548
+    },
+    {
+      "epoch": 0.1773279070576368,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.001888362513036609,
+      "loss": 3.0523,
+      "step": 2549
+    },
+    {
+      "epoch": 0.177397474694772,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0018882590304767526,
+      "loss": 2.8411,
+      "step": 2550
+    },
+    {
+      "epoch": 0.1774670423319072,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0018881555028155796,
+      "loss": 3.0223,
+      "step": 2551
+    },
+    {
+      "epoch": 0.1775366099690424,
+      "grad_norm": 0.7421875,
+      "learning_rate": 0.0018880519300583471,
+      "loss": 3.3749,
+      "step": 2552
+    },
+    {
+      "epoch": 0.1776061776061776,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.0018879483122103136,
+      "loss": 2.9869,
+      "step": 2553
+    },
+    {
+      "epoch": 0.1776757452433128,
+      "grad_norm": 0.77734375,
+      "learning_rate": 0.0018878446492767403,
+      "loss": 3.4166,
+      "step": 2554
+    },
+    {
+      "epoch": 0.177745312880448,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.0018877409412628907,
+      "loss": 3.3083,
+      "step": 2555
+    },
+    {
+      "epoch": 0.1778148805175832,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.0018876371881740308,
+      "loss": 2.9763,
+      "step": 2556
+    },
+    {
+      "epoch": 0.17788444815471843,
+      "grad_norm": 0.67578125,
+      "learning_rate": 0.0018875333900154289,
+      "loss": 3.5804,
+      "step": 2557
+    },
+    {
+      "epoch": 0.17795401579185363,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0018874295467923544,
+      "loss": 2.897,
+      "step": 2558
+    },
+    {
+      "epoch": 0.17802358342898883,
+      "grad_norm": 0.7265625,
+      "learning_rate": 0.0018873256585100807,
+      "loss": 3.2606,
+      "step": 2559
+    },
+    {
+      "epoch": 0.17809315106612403,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0018872217251738824,
+      "loss": 3.6405,
+      "step": 2560
+    },
+    {
+      "epoch": 0.17816271870325925,
+      "grad_norm": 0.7734375,
+      "learning_rate": 0.0018871177467890369,
+      "loss": 3.1806,
+      "step": 2561
+    },
+    {
+      "epoch": 0.17823228634039445,
+      "grad_norm": 0.75,
+      "learning_rate": 0.0018870137233608236,
+      "loss": 3.3116,
+      "step": 2562
+    },
+    {
+      "epoch": 0.17830185397752965,
+      "grad_norm": 0.765625,
+      "learning_rate": 0.0018869096548945242,
+      "loss": 3.1411,
+      "step": 2563
+    },
+    {
+      "epoch": 0.17837142161466485,
+      "grad_norm": 0.703125,
+      "learning_rate": 0.0018868055413954231,
+      "loss": 3.2375,
+      "step": 2564
+    },
+    {
+      "epoch": 0.17844098925180008,
+      "grad_norm": 0.72265625,
+      "learning_rate": 0.0018867013828688065,
+      "loss": 3.5075,
+      "step": 2565
+    },
+    {
+      "epoch": 0.17851055688893527,
+      "grad_norm": 0.7265625,
+      "learning_rate": 0.0018865971793199626,
+      "loss": 3.1211,
+      "step": 2566
+    },
+    {
+      "epoch": 0.17858012452607047,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.001886492930754183,
+      "loss": 3.4119,
+      "step": 2567
+    },
+    {
+      "epoch": 0.17864969216320567,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.0018863886371767605,
+      "loss": 3.0101,
+      "step": 2568
+    },
+    {
+      "epoch": 0.17871925980034087,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.0018862842985929906,
+      "loss": 3.0612,
+      "step": 2569
+    },
+    {
+      "epoch": 0.1787888274374761,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0018861799150081719,
+      "loss": 3.1752,
+      "step": 2570
+    },
+    {
+      "epoch": 0.1788583950746113,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0018860754864276031,
+      "loss": 3.3085,
+      "step": 2571
+    },
+    {
+      "epoch": 0.1789279627117465,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0018859710128565875,
+      "loss": 3.4382,
+      "step": 2572
+    },
+    {
+      "epoch": 0.1789975303488817,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0018858664943004295,
+      "loss": 3.635,
+      "step": 2573
+    },
+    {
+      "epoch": 0.17906709798601692,
+      "grad_norm": 0.72265625,
+      "learning_rate": 0.001885761930764436,
+      "loss": 3.3867,
+      "step": 2574
+    },
+    {
+      "epoch": 0.1791366656231521,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.0018856573222539163,
+      "loss": 3.0955,
+      "step": 2575
+    },
+    {
+      "epoch": 0.1792062332602873,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0018855526687741816,
+      "loss": 3.4234,
+      "step": 2576
+    },
+    {
+      "epoch": 0.1792758008974225,
+      "grad_norm": 0.75,
+      "learning_rate": 0.001885447970330546,
+      "loss": 2.7572,
+      "step": 2577
+    },
+    {
+      "epoch": 0.17934536853455774,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0018853432269283254,
+      "loss": 3.193,
+      "step": 2578
+    },
+    {
+      "epoch": 0.17941493617169293,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.0018852384385728382,
+      "loss": 3.1121,
+      "step": 2579
+    },
+    {
+      "epoch": 0.17948450380882813,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.0018851336052694051,
+      "loss": 3.2991,
+      "step": 2580
+    },
+    {
+      "epoch": 0.17955407144596333,
+      "grad_norm": 0.7578125,
+      "learning_rate": 0.0018850287270233488,
+      "loss": 3.3188,
+      "step": 2581
+    },
+    {
+      "epoch": 0.17962363908309853,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.001884923803839995,
+      "loss": 3.432,
+      "step": 2582
+    },
+    {
+      "epoch": 0.17969320672023376,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0018848188357246706,
+      "loss": 3.5215,
+      "step": 2583
+    },
+    {
+      "epoch": 0.17976277435736895,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0018847138226827053,
+      "loss": 3.0869,
+      "step": 2584
+    },
+    {
+      "epoch": 0.17983234199450415,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0018846087647194315,
+      "loss": 3.4616,
+      "step": 2585
+    },
+    {
+      "epoch": 0.17990190963163935,
+      "grad_norm": 0.70703125,
+      "learning_rate": 0.0018845036618401834,
+      "loss": 3.6504,
+      "step": 2586
+    },
+    {
+      "epoch": 0.17997147726877458,
+      "grad_norm": 0.77734375,
+      "learning_rate": 0.0018843985140502976,
+      "loss": 2.9132,
+      "step": 2587
+    },
+    {
+      "epoch": 0.18004104490590978,
+      "grad_norm": 0.7578125,
+      "learning_rate": 0.001884293321355113,
+      "loss": 3.5805,
+      "step": 2588
+    },
+    {
+      "epoch": 0.18011061254304497,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0018841880837599705,
+      "loss": 3.0303,
+      "step": 2589
+    },
+    {
+      "epoch": 0.18018018018018017,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.001884082801270214,
+      "loss": 3.5147,
+      "step": 2590
+    },
+    {
+      "epoch": 0.1802497478173154,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0018839774738911889,
+      "loss": 3.0765,
+      "step": 2591
+    },
+    {
+      "epoch": 0.1803193154544506,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.0018838721016282433,
+      "loss": 3.494,
+      "step": 2592
+    },
+    {
+      "epoch": 0.1803888830915858,
+      "grad_norm": 0.7578125,
+      "learning_rate": 0.0018837666844867273,
+      "loss": 3.1105,
+      "step": 2593
+    },
+    {
+      "epoch": 0.180458450728721,
+      "grad_norm": 0.7734375,
+      "learning_rate": 0.0018836612224719938,
+      "loss": 3.0279,
+      "step": 2594
+    },
+    {
+      "epoch": 0.1805280183658562,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.001883555715589397,
+      "loss": 3.5961,
+      "step": 2595
+    },
+    {
+      "epoch": 0.18059758600299142,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.0018834501638442947,
+      "loss": 3.3071,
+      "step": 2596
+    },
+    {
+      "epoch": 0.18066715364012662,
+      "grad_norm": 0.734375,
+      "learning_rate": 0.001883344567242046,
+      "loss": 3.5241,
+      "step": 2597
+    },
+    {
+      "epoch": 0.1807367212772618,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0018832389257880124,
+      "loss": 3.1847,
+      "step": 2598
+    },
+    {
+      "epoch": 0.180806288914397,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.0018831332394875582,
+      "loss": 3.5169,
+      "step": 2599
+    },
+    {
+      "epoch": 0.18087585655153224,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0018830275083460493,
+      "loss": 3.0003,
+      "step": 2600
+    },
+    {
+      "epoch": 0.18094542418866744,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0018829217323688544,
+      "loss": 3.1013,
+      "step": 2601
+    },
+    {
+      "epoch": 0.18101499182580263,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0018828159115613441,
+      "loss": 3.0551,
+      "step": 2602
+    },
+    {
+      "epoch": 0.18108455946293783,
+      "grad_norm": 0.77734375,
+      "learning_rate": 0.0018827100459288914,
+      "loss": 3.3201,
+      "step": 2603
+    },
+    {
+      "epoch": 0.18115412710007306,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.001882604135476872,
+      "loss": 3.3826,
+      "step": 2604
+    },
+    {
+      "epoch": 0.18122369473720826,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.0018824981802106633,
+      "loss": 3.7203,
+      "step": 2605
+    },
+    {
+      "epoch": 0.18129326237434346,
+      "grad_norm": 0.73046875,
+      "learning_rate": 0.001882392180135645,
+      "loss": 3.1752,
+      "step": 2606
+    },
+    {
+      "epoch": 0.18136283001147865,
+      "grad_norm": 0.8203125,
+      "learning_rate": 0.0018822861352571995,
+      "loss": 3.4479,
+      "step": 2607
+    },
+    {
+      "epoch": 0.18143239764861385,
+      "grad_norm": 0.7421875,
+      "learning_rate": 0.0018821800455807109,
+      "loss": 3.347,
+      "step": 2608
+    },
+    {
+      "epoch": 0.18150196528574908,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.001882073911111566,
+      "loss": 3.342,
+      "step": 2609
+    },
+    {
+      "epoch": 0.18157153292288428,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.0018819677318551542,
+      "loss": 3.4264,
+      "step": 2610
+    },
+    {
+      "epoch": 0.18164110056001948,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.0018818615078168661,
+      "loss": 3.2509,
+      "step": 2611
+    },
+    {
+      "epoch": 0.18171066819715467,
+      "grad_norm": 0.69140625,
+      "learning_rate": 0.0018817552390020958,
+      "loss": 3.0668,
+      "step": 2612
+    },
+    {
+      "epoch": 0.1817802358342899,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0018816489254162387,
+      "loss": 3.3121,
+      "step": 2613
+    },
+    {
+      "epoch": 0.1818498034714251,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.001881542567064693,
+      "loss": 3.1481,
+      "step": 2614
+    },
+    {
+      "epoch": 0.1819193711085603,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.0018814361639528593,
+      "loss": 3.3161,
+      "step": 2615
+    },
+    {
+      "epoch": 0.1819889387456955,
+      "grad_norm": 0.65625,
+      "learning_rate": 0.0018813297160861398,
+      "loss": 2.8992,
+      "step": 2616
+    },
+    {
+      "epoch": 0.18205850638283072,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0018812232234699394,
+      "loss": 3.1757,
+      "step": 2617
+    },
+    {
+      "epoch": 0.18212807401996592,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0018811166861096656,
+      "loss": 2.7238,
+      "step": 2618
+    },
+    {
+      "epoch": 0.18219764165710112,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0018810101040107276,
+      "loss": 3.3991,
+      "step": 2619
+    },
+    {
+      "epoch": 0.18226720929423632,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.001880903477178537,
+      "loss": 2.9947,
+      "step": 2620
+    },
+    {
+      "epoch": 0.1823367769313715,
+      "grad_norm": 0.75390625,
+      "learning_rate": 0.001880796805618508,
+      "loss": 3.1391,
+      "step": 2621
+    },
+    {
+      "epoch": 0.18240634456850674,
+      "grad_norm": 0.70703125,
+      "learning_rate": 0.0018806900893360567,
+      "loss": 3.1868,
+      "step": 2622
+    },
+    {
+      "epoch": 0.18247591220564194,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.001880583328336602,
+      "loss": 3.2925,
+      "step": 2623
+    },
+    {
+      "epoch": 0.18254547984277714,
+      "grad_norm": 0.75,
+      "learning_rate": 0.001880476522625564,
+      "loss": 3.3639,
+      "step": 2624
+    },
+    {
+      "epoch": 0.18261504747991233,
+      "grad_norm": 0.8203125,
+      "learning_rate": 0.0018803696722083662,
+      "loss": 2.7638,
+      "step": 2625
+    },
+    {
+      "epoch": 0.18268461511704756,
+      "grad_norm": 0.72265625,
+      "learning_rate": 0.0018802627770904338,
+      "loss": 3.1189,
+      "step": 2626
+    },
+    {
+      "epoch": 0.18275418275418276,
+      "grad_norm": 0.63671875,
+      "learning_rate": 0.0018801558372771945,
+      "loss": 3.4456,
+      "step": 2627
+    },
+    {
+      "epoch": 0.18282375039131796,
+      "grad_norm": 0.734375,
+      "learning_rate": 0.0018800488527740782,
+      "loss": 3.0031,
+      "step": 2628
+    },
+    {
+      "epoch": 0.18289331802845316,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.001879941823586517,
+      "loss": 3.2647,
+      "step": 2629
+    },
+    {
+      "epoch": 0.18296288566558838,
+      "grad_norm": 0.71484375,
+      "learning_rate": 0.001879834749719945,
+      "loss": 3.4376,
+      "step": 2630
+    },
+    {
+      "epoch": 0.18303245330272358,
+      "grad_norm": 0.6796875,
+      "learning_rate": 0.001879727631179799,
+      "loss": 2.9715,
+      "step": 2631
+    },
+    {
+      "epoch": 0.18310202093985878,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.0018796204679715183,
+      "loss": 3.1558,
+      "step": 2632
+    },
+    {
+      "epoch": 0.18317158857699398,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.0018795132601005435,
+      "loss": 3.399,
+      "step": 2633
+    },
+    {
+      "epoch": 0.18324115621412917,
+      "grad_norm": 0.71875,
+      "learning_rate": 0.0018794060075723188,
+      "loss": 3.0927,
+      "step": 2634
+    },
+    {
+      "epoch": 0.1833107238512644,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.0018792987103922894,
+      "loss": 3.4046,
+      "step": 2635
+    },
+    {
+      "epoch": 0.1833802914883996,
+      "grad_norm": 0.72265625,
+      "learning_rate": 0.0018791913685659036,
+      "loss": 3.0692,
+      "step": 2636
+    },
+    {
+      "epoch": 0.1834498591255348,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0018790839820986113,
+      "loss": 3.1099,
+      "step": 2637
+    },
+    {
+      "epoch": 0.18351942676267,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.0018789765509958656,
+      "loss": 2.9657,
+      "step": 2638
+    },
+    {
+      "epoch": 0.18358899439980522,
+      "grad_norm": 0.734375,
+      "learning_rate": 0.001878869075263121,
+      "loss": 3.4141,
+      "step": 2639
+    },
+    {
+      "epoch": 0.18365856203694042,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.0018787615549058347,
+      "loss": 2.8735,
+      "step": 2640
+    },
+    {
+      "epoch": 0.18372812967407562,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0018786539899294655,
+      "loss": 3.509,
+      "step": 2641
+    },
+    {
+      "epoch": 0.18379769731121082,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.0018785463803394757,
+      "loss": 3.4174,
+      "step": 2642
+    },
+    {
+      "epoch": 0.18386726494834604,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.001878438726141329,
+      "loss": 2.6337,
+      "step": 2643
+    },
+    {
+      "epoch": 0.18393683258548124,
+      "grad_norm": 0.74609375,
+      "learning_rate": 0.001878331027340491,
+      "loss": 3.4049,
+      "step": 2644
+    },
+    {
+      "epoch": 0.18400640022261644,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0018782232839424308,
+      "loss": 3.0697,
+      "step": 2645
+    },
+    {
+      "epoch": 0.18407596785975164,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.001878115495952619,
+      "loss": 3.241,
+      "step": 2646
+    },
+    {
+      "epoch": 0.18414553549688684,
+      "grad_norm": 0.70703125,
+      "learning_rate": 0.001878007663376528,
+      "loss": 3.2673,
+      "step": 2647
+    },
+    {
+      "epoch": 0.18421510313402206,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.0018778997862196338,
+      "loss": 3.3684,
+      "step": 2648
+    },
+    {
+      "epoch": 0.18428467077115726,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.001877791864487413,
+      "loss": 2.8385,
+      "step": 2649
+    },
+    {
+      "epoch": 0.18435423840829246,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.001877683898185346,
+      "loss": 3.0731,
+      "step": 2650
+    },
+    {
+      "epoch": 0.18442380604542766,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.0018775758873189143,
+      "loss": 3.0177,
+      "step": 2651
+    },
+    {
+      "epoch": 0.18449337368256288,
+      "grad_norm": 0.875,
+      "learning_rate": 0.0018774678318936025,
+      "loss": 3.1539,
+      "step": 2652
+    },
+    {
+      "epoch": 0.18456294131969808,
+      "grad_norm": 0.7421875,
+      "learning_rate": 0.0018773597319148968,
+      "loss": 2.9013,
+      "step": 2653
+    },
+    {
+      "epoch": 0.18463250895683328,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0018772515873882864,
+      "loss": 3.1443,
+      "step": 2654
+    },
+    {
+      "epoch": 0.18470207659396848,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0018771433983192619,
+      "loss": 3.1884,
+      "step": 2655
+    },
+    {
+      "epoch": 0.1847716442311037,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.0018770351647133165,
+      "loss": 3.4777,
+      "step": 2656
+    },
+    {
+      "epoch": 0.1848412118682389,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0018769268865759467,
+      "loss": 3.2733,
+      "step": 2657
+    },
+    {
+      "epoch": 0.1849107795053741,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.001876818563912649,
+      "loss": 3.135,
+      "step": 2658
+    },
+    {
+      "epoch": 0.1849803471425093,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0018767101967289244,
+      "loss": 3.3096,
+      "step": 2659
+    },
+    {
+      "epoch": 0.1850499147796445,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0018766017850302748,
+      "loss": 3.4159,
+      "step": 2660
+    },
+    {
+      "epoch": 0.18511948241677972,
+      "grad_norm": 0.71875,
+      "learning_rate": 0.001876493328822205,
+      "loss": 3.3289,
+      "step": 2661
+    },
+    {
+      "epoch": 0.18518905005391492,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.0018763848281102221,
+      "loss": 3.3328,
+      "step": 2662
+    },
+    {
+      "epoch": 0.18525861769105012,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.0018762762828998345,
+      "loss": 3.0924,
+      "step": 2663
+    },
+    {
+      "epoch": 0.18532818532818532,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.0018761676931965542,
+      "loss": 2.9847,
+      "step": 2664
+    },
+    {
+      "epoch": 0.18539775296532054,
+      "grad_norm": 0.640625,
+      "learning_rate": 0.0018760590590058946,
+      "loss": 3.0087,
+      "step": 2665
+    },
+    {
+      "epoch": 0.18546732060245574,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0018759503803333717,
+      "loss": 2.853,
+      "step": 2666
+    },
+    {
+      "epoch": 0.18553688823959094,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0018758416571845037,
+      "loss": 2.8915,
+      "step": 2667
+    },
+    {
+      "epoch": 0.18560645587672614,
+      "grad_norm": 0.73828125,
+      "learning_rate": 0.0018757328895648109,
+      "loss": 3.0704,
+      "step": 2668
+    },
+    {
+      "epoch": 0.18567602351386134,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0018756240774798157,
+      "loss": 3.1353,
+      "step": 2669
+    },
+    {
+      "epoch": 0.18574559115099656,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0018755152209350436,
+      "loss": 3.1826,
+      "step": 2670
+    },
+    {
+      "epoch": 0.18581515878813176,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0018754063199360217,
+      "loss": 3.1387,
+      "step": 2671
+    },
+    {
+      "epoch": 0.18588472642526696,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0018752973744882789,
+      "loss": 2.8912,
+      "step": 2672
+    },
+    {
+      "epoch": 0.18595429406240216,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.001875188384597347,
+      "loss": 3.3143,
+      "step": 2673
+    },
+    {
+      "epoch": 0.18602386169953739,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0018750793502687606,
+      "loss": 3.4538,
+      "step": 2674
+    },
+    {
+      "epoch": 0.18609342933667258,
+      "grad_norm": 0.74609375,
+      "learning_rate": 0.0018749702715080557,
+      "loss": 3.6592,
+      "step": 2675
+    },
+    {
+      "epoch": 0.18616299697380778,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.0018748611483207704,
+      "loss": 3.3338,
+      "step": 2676
+    },
+    {
+      "epoch": 0.18623256461094298,
+      "grad_norm": 0.76171875,
+      "learning_rate": 0.0018747519807124453,
+      "loss": 2.9657,
+      "step": 2677
+    },
+    {
+      "epoch": 0.1863021322480782,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.001874642768688624,
+      "loss": 3.3772,
+      "step": 2678
+    },
+    {
+      "epoch": 0.1863716998852134,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0018745335122548514,
+      "loss": 2.5748,
+      "step": 2679
+    },
+    {
+      "epoch": 0.1864412675223486,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0018744242114166752,
+      "loss": 3.4695,
+      "step": 2680
+    },
+    {
+      "epoch": 0.1865108351594838,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0018743148661796447,
+      "loss": 3.6006,
+      "step": 2681
+    },
+    {
+      "epoch": 0.186580402796619,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0018742054765493125,
+      "loss": 2.9741,
+      "step": 2682
+    },
+    {
+      "epoch": 0.18664997043375423,
+      "grad_norm": 0.7578125,
+      "learning_rate": 0.001874096042531232,
+      "loss": 3.1929,
+      "step": 2683
+    },
+    {
+      "epoch": 0.18671953807088942,
+      "grad_norm": 0.671875,
+      "learning_rate": 0.0018739865641309605,
+      "loss": 3.0261,
+      "step": 2684
+    },
+    {
+      "epoch": 0.18678910570802462,
+      "grad_norm": 0.73046875,
+      "learning_rate": 0.0018738770413540566,
+      "loss": 3.3793,
+      "step": 2685
+    },
+    {
+      "epoch": 0.18685867334515982,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.001873767474206081,
+      "loss": 2.8978,
+      "step": 2686
+    },
+    {
+      "epoch": 0.18692824098229505,
+      "grad_norm": 0.76953125,
+      "learning_rate": 0.0018736578626925976,
+      "loss": 3.3339,
+      "step": 2687
+    },
+    {
+      "epoch": 0.18699780861943024,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0018735482068191712,
+      "loss": 2.9924,
+      "step": 2688
+    },
+    {
+      "epoch": 0.18706737625656544,
+      "grad_norm": 0.75,
+      "learning_rate": 0.0018734385065913698,
+      "loss": 3.4697,
+      "step": 2689
+    },
+    {
+      "epoch": 0.18713694389370064,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.0018733287620147634,
+      "loss": 3.0865,
+      "step": 2690
+    },
+    {
+      "epoch": 0.18720651153083587,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.0018732189730949246,
+      "loss": 3.5094,
+      "step": 2691
+    },
+    {
+      "epoch": 0.18727607916797107,
+      "grad_norm": 0.71875,
+      "learning_rate": 0.0018731091398374276,
+      "loss": 3.3612,
+      "step": 2692
+    },
+    {
+      "epoch": 0.18734564680510626,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.0018729992622478493,
+      "loss": 3.5381,
+      "step": 2693
+    },
+    {
+      "epoch": 0.18741521444224146,
+      "grad_norm": 0.73828125,
+      "learning_rate": 0.0018728893403317686,
+      "loss": 3.3153,
+      "step": 2694
+    },
+    {
+      "epoch": 0.18748478207937666,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.0018727793740947669,
+      "loss": 3.1899,
+      "step": 2695
+    },
+    {
+      "epoch": 0.1875543497165119,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.001872669363542428,
+      "loss": 3.0942,
+      "step": 2696
+    },
+    {
+      "epoch": 0.18762391735364709,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.0018725593086803371,
+      "loss": 3.0085,
+      "step": 2697
+    },
+    {
+      "epoch": 0.18769348499078228,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0018724492095140825,
+      "loss": 3.2013,
+      "step": 2698
+    },
+    {
+      "epoch": 0.18776305262791748,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0018723390660492548,
+      "loss": 2.9254,
+      "step": 2699
+    },
+    {
+      "epoch": 0.1878326202650527,
+      "grad_norm": 1.9921875,
+      "learning_rate": 0.001872228878291446,
+      "loss": 3.2803,
+      "step": 2700
+    },
+    {
+      "epoch": 0.1879021879021879,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0018721186462462513,
+      "loss": 2.8896,
+      "step": 2701
+    },
+    {
+      "epoch": 0.1879717555393231,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.0018720083699192674,
+      "loss": 3.1164,
+      "step": 2702
+    },
+    {
+      "epoch": 0.1880413231764583,
+      "grad_norm": 0.71875,
+      "learning_rate": 0.0018718980493160938,
+      "loss": 3.4225,
+      "step": 2703
+    },
+    {
+      "epoch": 0.18811089081359353,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.0018717876844423318,
+      "loss": 3.3561,
+      "step": 2704
+    },
+    {
+      "epoch": 0.18818045845072873,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0018716772753035852,
+      "loss": 2.6811,
+      "step": 2705
+    },
+    {
+      "epoch": 0.18825002608786393,
+      "grad_norm": 0.6953125,
+      "learning_rate": 0.0018715668219054606,
+      "loss": 3.4476,
+      "step": 2706
+    },
+    {
+      "epoch": 0.18831959372499912,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0018714563242535657,
+      "loss": 3.7711,
+      "step": 2707
+    },
+    {
+      "epoch": 0.18838916136213432,
+      "grad_norm": 0.74609375,
+      "learning_rate": 0.0018713457823535107,
+      "loss": 3.125,
+      "step": 2708
+    },
+    {
+      "epoch": 0.18845872899926955,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.001871235196210909,
+      "loss": 3.4208,
+      "step": 2709
+    },
+    {
+      "epoch": 0.18852829663640475,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.0018711245658313755,
+      "loss": 2.6746,
+      "step": 2710
+    },
+    {
+      "epoch": 0.18859786427353994,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0018710138912205274,
+      "loss": 3.2908,
+      "step": 2711
+    },
+    {
+      "epoch": 0.18866743191067514,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.0018709031723839842,
+      "loss": 3.5754,
+      "step": 2712
+    },
+    {
+      "epoch": 0.18873699954781037,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0018707924093273674,
+      "loss": 3.3013,
+      "step": 2713
+    },
+    {
+      "epoch": 0.18880656718494557,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0018706816020563012,
+      "loss": 3.2028,
+      "step": 2714
+    },
+    {
+      "epoch": 0.18887613482208077,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0018705707505764116,
+      "loss": 3.3403,
+      "step": 2715
+    },
+    {
+      "epoch": 0.18894570245921596,
+      "grad_norm": 0.72265625,
+      "learning_rate": 0.0018704598548933277,
+      "loss": 3.3962,
+      "step": 2716
+    },
+    {
+      "epoch": 0.1890152700963512,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0018703489150126793,
+      "loss": 2.7822,
+      "step": 2717
+    },
+    {
+      "epoch": 0.1890848377334864,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0018702379309401005,
+      "loss": 3.3276,
+      "step": 2718
+    },
+    {
+      "epoch": 0.1891544053706216,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0018701269026812253,
+      "loss": 3.452,
+      "step": 2719
+    },
+    {
+      "epoch": 0.18922397300775678,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0018700158302416923,
+      "loss": 3.1536,
+      "step": 2720
+    },
+    {
+      "epoch": 0.18929354064489198,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.0018699047136271402,
+      "loss": 2.8866,
+      "step": 2721
+    },
+    {
+      "epoch": 0.1893631082820272,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0018697935528432118,
+      "loss": 3.1771,
+      "step": 2722
+    },
+    {
+      "epoch": 0.1894326759191624,
+      "grad_norm": 0.79296875,
+      "learning_rate": 0.0018696823478955502,
+      "loss": 3.6915,
+      "step": 2723
+    },
+    {
+      "epoch": 0.1895022435562976,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0018695710987898032,
+      "loss": 3.3638,
+      "step": 2724
+    },
+    {
+      "epoch": 0.1895718111934328,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0018694598055316184,
+      "loss": 3.0669,
+      "step": 2725
+    },
+    {
+      "epoch": 0.18964137883056803,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0018693484681266473,
+      "loss": 3.0863,
+      "step": 2726
+    },
+    {
+      "epoch": 0.18971094646770323,
+      "grad_norm": 0.703125,
+      "learning_rate": 0.0018692370865805426,
+      "loss": 3.2591,
+      "step": 2727
+    },
+    {
+      "epoch": 0.18978051410483843,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00186912566089896,
+      "loss": 3.1086,
+      "step": 2728
+    },
+    {
+      "epoch": 0.18985008174197363,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.001869014191087557,
+      "loss": 2.9723,
+      "step": 2729
+    },
+    {
+      "epoch": 0.18991964937910885,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.0018689026771519937,
+      "loss": 3.1802,
+      "step": 2730
+    },
+    {
+      "epoch": 0.18998921701624405,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.001868791119097932,
+      "loss": 3.3697,
+      "step": 2731
+    },
+    {
+      "epoch": 0.19005878465337925,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.001868679516931036,
+      "loss": 3.5163,
+      "step": 2732
+    },
+    {
+      "epoch": 0.19012835229051445,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.001868567870656973,
+      "loss": 3.3499,
+      "step": 2733
+    },
+    {
+      "epoch": 0.19019791992764964,
+      "grad_norm": 0.8203125,
+      "learning_rate": 0.0018684561802814112,
+      "loss": 3.3363,
+      "step": 2734
+    },
+    {
+      "epoch": 0.19026748756478487,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0018683444458100222,
+      "loss": 2.8722,
+      "step": 2735
+    },
+    {
+      "epoch": 0.19033705520192007,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0018682326672484785,
+      "loss": 3.327,
+      "step": 2736
+    },
+    {
+      "epoch": 0.19040662283905527,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0018681208446024566,
+      "loss": 3.1799,
+      "step": 2737
+    },
+    {
+      "epoch": 0.19047619047619047,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.001868008977877634,
+      "loss": 3.1104,
+      "step": 2738
+    },
+    {
+      "epoch": 0.1905457581133257,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.0018678970670796902,
+      "loss": 3.0156,
+      "step": 2739
+    },
+    {
+      "epoch": 0.1906153257504609,
+      "grad_norm": 0.7734375,
+      "learning_rate": 0.0018677851122143082,
+      "loss": 3.1659,
+      "step": 2740
+    },
+    {
+      "epoch": 0.1906848933875961,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.0018676731132871718,
+      "loss": 3.354,
+      "step": 2741
+    },
+    {
+      "epoch": 0.1907544610247313,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0018675610703039682,
+      "loss": 3.3231,
+      "step": 2742
+    },
+    {
+      "epoch": 0.1908240286618665,
+      "grad_norm": 0.875,
+      "learning_rate": 0.0018674489832703864,
+      "loss": 3.2926,
+      "step": 2743
+    },
+    {
+      "epoch": 0.1908935962990017,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0018673368521921177,
+      "loss": 3.3228,
+      "step": 2744
+    },
+    {
+      "epoch": 0.1909631639361369,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.001867224677074855,
+      "loss": 2.9053,
+      "step": 2745
+    },
+    {
+      "epoch": 0.1910327315732721,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0018671124579242944,
+      "loss": 3.0507,
+      "step": 2746
+    },
+    {
+      "epoch": 0.1911022992104073,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0018670001947461339,
+      "loss": 3.5,
+      "step": 2747
+    },
+    {
+      "epoch": 0.19117186684754253,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0018668878875460733,
+      "loss": 3.6022,
+      "step": 2748
+    },
+    {
+      "epoch": 0.19124143448467773,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0018667755363298154,
+      "loss": 3.3031,
+      "step": 2749
+    },
+    {
+      "epoch": 0.19131100212181293,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0018666631411030645,
+      "loss": 3.531,
+      "step": 2750
+    },
+    {
+      "epoch": 0.19138056975894813,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0018665507018715277,
+      "loss": 3.1135,
+      "step": 2751
+    },
+    {
+      "epoch": 0.19145013739608335,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.001866438218640914,
+      "loss": 3.5832,
+      "step": 2752
+    },
+    {
+      "epoch": 0.19151970503321855,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0018663256914169346,
+      "loss": 2.8053,
+      "step": 2753
+    },
+    {
+      "epoch": 0.19158927267035375,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.0018662131202053032,
+      "loss": 3.1533,
+      "step": 2754
+    },
+    {
+      "epoch": 0.19165884030748895,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0018661005050117359,
+      "loss": 3.1857,
+      "step": 2755
+    },
+    {
+      "epoch": 0.19172840794462417,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0018659878458419498,
+      "loss": 3.2337,
+      "step": 2756
+    },
+    {
+      "epoch": 0.19179797558175937,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0018658751427016664,
+      "loss": 2.8843,
+      "step": 2757
+    },
+    {
+      "epoch": 0.19186754321889457,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0018657623955966075,
+      "loss": 3.1811,
+      "step": 2758
+    },
+    {
+      "epoch": 0.19193711085602977,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0018656496045324977,
+      "loss": 3.4058,
+      "step": 2759
+    },
+    {
+      "epoch": 0.19200667849316497,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0018655367695150642,
+      "loss": 3.3727,
+      "step": 2760
+    },
+    {
+      "epoch": 0.1920762461303002,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0018654238905500362,
+      "loss": 3.2887,
+      "step": 2761
+    },
+    {
+      "epoch": 0.1921458137674354,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0018653109676431453,
+      "loss": 3.2733,
+      "step": 2762
+    },
+    {
+      "epoch": 0.1922153814045706,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0018651980008001247,
+      "loss": 3.2227,
+      "step": 2763
+    },
+    {
+      "epoch": 0.1922849490417058,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.001865084990026711,
+      "loss": 2.544,
+      "step": 2764
+    },
+    {
+      "epoch": 0.19235451667884101,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.0018649719353286411,
+      "loss": 2.9591,
+      "step": 2765
+    },
+    {
+      "epoch": 0.1924240843159762,
+      "grad_norm": 0.78125,
+      "learning_rate": 0.0018648588367116568,
+      "loss": 3.5255,
+      "step": 2766
+    },
+    {
+      "epoch": 0.1924936519531114,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0018647456941814995,
+      "loss": 3.4794,
+      "step": 2767
+    },
+    {
+      "epoch": 0.1925632195902466,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0018646325077439148,
+      "loss": 3.0202,
+      "step": 2768
+    },
+    {
+      "epoch": 0.19263278722738184,
+      "grad_norm": 0.78125,
+      "learning_rate": 0.0018645192774046492,
+      "loss": 3.567,
+      "step": 2769
+    },
+    {
+      "epoch": 0.19270235486451703,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.0018644060031694522,
+      "loss": 3.051,
+      "step": 2770
+    },
+    {
+      "epoch": 0.19277192250165223,
+      "grad_norm": 0.875,
+      "learning_rate": 0.0018642926850440755,
+      "loss": 2.9566,
+      "step": 2771
+    },
+    {
+      "epoch": 0.19284149013878743,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0018641793230342726,
+      "loss": 3.2604,
+      "step": 2772
+    },
+    {
+      "epoch": 0.19291105777592263,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.0018640659171457992,
+      "loss": 3.246,
+      "step": 2773
+    },
+    {
+      "epoch": 0.19298062541305785,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0018639524673844143,
+      "loss": 3.4788,
+      "step": 2774
+    },
+    {
+      "epoch": 0.19305019305019305,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.001863838973755877,
+      "loss": 2.9149,
+      "step": 2775
+    },
+    {
+      "epoch": 0.19311976068732825,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.001863725436265951,
+      "loss": 2.9041,
+      "step": 2776
+    },
+    {
+      "epoch": 0.19318932832446345,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0018636118549204008,
+      "loss": 3.3209,
+      "step": 2777
+    },
+    {
+      "epoch": 0.19325889596159868,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0018634982297249937,
+      "loss": 3.6583,
+      "step": 2778
+    },
+    {
+      "epoch": 0.19332846359873387,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.001863384560685499,
+      "loss": 3.1681,
+      "step": 2779
+    },
+    {
+      "epoch": 0.19339803123586907,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0018632708478076875,
+      "loss": 3.3337,
+      "step": 2780
+    },
+    {
+      "epoch": 0.19346759887300427,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0018631570910973342,
+      "loss": 3.3942,
+      "step": 2781
+    },
+    {
+      "epoch": 0.1935371665101395,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.001863043290560214,
+      "loss": 3.2764,
+      "step": 2782
+    },
+    {
+      "epoch": 0.1936067341472747,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0018629294462021058,
+      "loss": 3.1924,
+      "step": 2783
+    },
+    {
+      "epoch": 0.1936763017844099,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0018628155580287897,
+      "loss": 3.2434,
+      "step": 2784
+    },
+    {
+      "epoch": 0.1937458694215451,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0018627016260460486,
+      "loss": 3.2823,
+      "step": 2785
+    },
+    {
+      "epoch": 0.1938154370586803,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.001862587650259667,
+      "loss": 3.1277,
+      "step": 2786
+    },
+    {
+      "epoch": 0.19388500469581552,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0018624736306754324,
+      "loss": 3.0137,
+      "step": 2787
+    },
+    {
+      "epoch": 0.19395457233295071,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0018623595672991342,
+      "loss": 3.1154,
+      "step": 2788
+    },
+    {
+      "epoch": 0.1940241399700859,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0018622454601365636,
+      "loss": 3.1293,
+      "step": 2789
+    },
+    {
+      "epoch": 0.1940937076072211,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0018621313091935145,
+      "loss": 2.8241,
+      "step": 2790
+    },
+    {
+      "epoch": 0.19416327524435634,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0018620171144757833,
+      "loss": 3.5697,
+      "step": 2791
+    },
+    {
+      "epoch": 0.19423284288149154,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0018619028759891676,
+      "loss": 2.8754,
+      "step": 2792
+    },
+    {
+      "epoch": 0.19430241051862673,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0018617885937394685,
+      "loss": 2.8006,
+      "step": 2793
+    },
+    {
+      "epoch": 0.19437197815576193,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.001861674267732488,
+      "loss": 2.903,
+      "step": 2794
+    },
+    {
+      "epoch": 0.19444154579289716,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0018615598979740318,
+      "loss": 3.1794,
+      "step": 2795
+    },
+    {
+      "epoch": 0.19451111343003236,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0018614454844699062,
+      "loss": 3.3262,
+      "step": 2796
+    },
+    {
+      "epoch": 0.19458068106716755,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0018613310272259209,
+      "loss": 2.9255,
+      "step": 2797
+    },
+    {
+      "epoch": 0.19465024870430275,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0018612165262478875,
+      "loss": 3.1538,
+      "step": 2798
+    },
+    {
+      "epoch": 0.19471981634143795,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0018611019815416197,
+      "loss": 2.8641,
+      "step": 2799
+    },
+    {
+      "epoch": 0.19478938397857318,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0018609873931129338,
+      "loss": 2.8544,
+      "step": 2800
+    },
+    {
+      "epoch": 0.19485895161570838,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0018608727609676476,
+      "loss": 3.2558,
+      "step": 2801
+    },
+    {
+      "epoch": 0.19492851925284357,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.0018607580851115817,
+      "loss": 3.1328,
+      "step": 2802
+    },
+    {
+      "epoch": 0.19499808688997877,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.0018606433655505587,
+      "loss": 2.9437,
+      "step": 2803
+    },
+    {
+      "epoch": 0.195067654527114,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.0018605286022904037,
+      "loss": 2.9838,
+      "step": 2804
+    },
+    {
+      "epoch": 0.1951372221642492,
+      "grad_norm": 2.203125,
+      "learning_rate": 0.0018604137953369439,
+      "loss": 3.0965,
+      "step": 2805
+    },
+    {
+      "epoch": 0.1952067898013844,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0018602989446960079,
+      "loss": 3.2953,
+      "step": 2806
+    },
+    {
+      "epoch": 0.1952763574385196,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.001860184050373428,
+      "loss": 3.4891,
+      "step": 2807
+    },
+    {
+      "epoch": 0.1953459250756548,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0018600691123750374,
+      "loss": 3.2762,
+      "step": 2808
+    },
+    {
+      "epoch": 0.19541549271279002,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0018599541307066727,
+      "loss": 2.7164,
+      "step": 2809
+    },
+    {
+      "epoch": 0.19548506034992522,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0018598391053741717,
+      "loss": 3.2189,
+      "step": 2810
+    },
+    {
+      "epoch": 0.19555462798706041,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0018597240363833745,
+      "loss": 3.0099,
+      "step": 2811
+    },
+    {
+      "epoch": 0.1956241956241956,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0018596089237401245,
+      "loss": 2.7148,
+      "step": 2812
+    },
+    {
+      "epoch": 0.19569376326133084,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0018594937674502657,
+      "loss": 3.1438,
+      "step": 2813
+    },
+    {
+      "epoch": 0.19576333089846604,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.001859378567519646,
+      "loss": 3.3735,
+      "step": 2814
+    },
+    {
+      "epoch": 0.19583289853560124,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0018592633239541136,
+      "loss": 3.291,
+      "step": 2815
+    },
+    {
+      "epoch": 0.19590246617273643,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0018591480367595213,
+      "loss": 3.312,
+      "step": 2816
+    },
+    {
+      "epoch": 0.19597203380987166,
+      "grad_norm": 1.5234375,
+      "learning_rate": 0.0018590327059417216,
+      "loss": 3.276,
+      "step": 2817
+    },
+    {
+      "epoch": 0.19604160144700686,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0018589173315065712,
+      "loss": 2.7598,
+      "step": 2818
+    },
+    {
+      "epoch": 0.19611116908414206,
+      "grad_norm": 1.6796875,
+      "learning_rate": 0.001858801913459928,
+      "loss": 3.0982,
+      "step": 2819
+    },
+    {
+      "epoch": 0.19618073672127725,
+      "grad_norm": 1.6796875,
+      "learning_rate": 0.0018586864518076523,
+      "loss": 3.0716,
+      "step": 2820
+    },
+    {
+      "epoch": 0.19625030435841245,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.0018585709465556066,
+      "loss": 3.7368,
+      "step": 2821
+    },
+    {
+      "epoch": 0.19631987199554768,
+      "grad_norm": 1.6484375,
+      "learning_rate": 0.0018584553977096557,
+      "loss": 3.1684,
+      "step": 2822
+    },
+    {
+      "epoch": 0.19638943963268288,
+      "grad_norm": 1.828125,
+      "learning_rate": 0.0018583398052756665,
+      "loss": 2.9681,
+      "step": 2823
+    },
+    {
+      "epoch": 0.19645900726981808,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.0018582241692595089,
+      "loss": 3.0652,
+      "step": 2824
+    },
+    {
+      "epoch": 0.19652857490695327,
+      "grad_norm": 1.7265625,
+      "learning_rate": 0.0018581084896670532,
+      "loss": 2.9162,
+      "step": 2825
+    },
+    {
+      "epoch": 0.1965981425440885,
+      "grad_norm": 1.9765625,
+      "learning_rate": 0.0018579927665041739,
+      "loss": 3.1073,
+      "step": 2826
+    },
+    {
+      "epoch": 0.1966677101812237,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.0018578769997767465,
+      "loss": 2.6965,
+      "step": 2827
+    },
+    {
+      "epoch": 0.1967372778183589,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.001857761189490649,
+      "loss": 2.9426,
+      "step": 2828
+    },
+    {
+      "epoch": 0.1968068454554941,
+      "grad_norm": 1.7109375,
+      "learning_rate": 0.0018576453356517618,
+      "loss": 3.1776,
+      "step": 2829
+    },
+    {
+      "epoch": 0.19687641309262932,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.001857529438265967,
+      "loss": 2.8095,
+      "step": 2830
+    },
+    {
+      "epoch": 0.19694598072976452,
+      "grad_norm": 1.8046875,
+      "learning_rate": 0.0018574134973391497,
+      "loss": 3.233,
+      "step": 2831
+    },
+    {
+      "epoch": 0.19701554836689972,
+      "grad_norm": 1.6796875,
+      "learning_rate": 0.001857297512877197,
+      "loss": 3.2599,
+      "step": 2832
+    },
+    {
+      "epoch": 0.19708511600403492,
+      "grad_norm": 1.5234375,
+      "learning_rate": 0.0018571814848859973,
+      "loss": 2.7753,
+      "step": 2833
+    },
+    {
+      "epoch": 0.19715468364117011,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.0018570654133714425,
+      "loss": 2.755,
+      "step": 2834
+    },
+    {
+      "epoch": 0.19722425127830534,
+      "grad_norm": 2.15625,
+      "learning_rate": 0.001856949298339426,
+      "loss": 2.9832,
+      "step": 2835
+    },
+    {
+      "epoch": 0.19729381891544054,
+      "grad_norm": 2.078125,
+      "learning_rate": 0.0018568331397958435,
+      "loss": 3.0149,
+      "step": 2836
+    },
+    {
+      "epoch": 0.19736338655257574,
+      "grad_norm": 1.765625,
+      "learning_rate": 0.0018567169377465928,
+      "loss": 3.0255,
+      "step": 2837
+    },
+    {
+      "epoch": 0.19743295418971094,
+      "grad_norm": 1.578125,
+      "learning_rate": 0.0018566006921975741,
+      "loss": 2.6039,
+      "step": 2838
+    },
+    {
+      "epoch": 0.19750252182684616,
+      "grad_norm": 1.71875,
+      "learning_rate": 0.0018564844031546902,
+      "loss": 2.7991,
+      "step": 2839
+    },
+    {
+      "epoch": 0.19757208946398136,
+      "grad_norm": 1.7734375,
+      "learning_rate": 0.001856368070623845,
+      "loss": 2.8238,
+      "step": 2840
+    },
+    {
+      "epoch": 0.19764165710111656,
+      "grad_norm": 2.25,
+      "learning_rate": 0.0018562516946109455,
+      "loss": 2.7847,
+      "step": 2841
+    },
+    {
+      "epoch": 0.19771122473825176,
+      "grad_norm": 1.625,
+      "learning_rate": 0.001856135275121901,
+      "loss": 2.5704,
+      "step": 2842
+    },
+    {
+      "epoch": 0.19778079237538698,
+      "grad_norm": 4.5625,
+      "learning_rate": 0.0018560188121626224,
+      "loss": 3.1919,
+      "step": 2843
+    },
+    {
+      "epoch": 0.19785036001252218,
+      "grad_norm": 2.078125,
+      "learning_rate": 0.0018559023057390235,
+      "loss": 2.3178,
+      "step": 2844
+    },
+    {
+      "epoch": 0.19791992764965738,
+      "grad_norm": 2.25,
+      "learning_rate": 0.001855785755857019,
+      "loss": 2.9565,
+      "step": 2845
+    },
+    {
+      "epoch": 0.19798949528679258,
+      "grad_norm": 2.125,
+      "learning_rate": 0.0018556691625225277,
+      "loss": 2.532,
+      "step": 2846
+    },
+    {
+      "epoch": 0.19805906292392778,
+      "grad_norm": 3.484375,
+      "learning_rate": 0.001855552525741469,
+      "loss": 2.7136,
+      "step": 2847
+    },
+    {
+      "epoch": 0.198128630561063,
+      "grad_norm": 2.09375,
+      "learning_rate": 0.0018554358455197652,
+      "loss": 2.5548,
+      "step": 2848
+    },
+    {
+      "epoch": 0.1981981981981982,
+      "grad_norm": 2.015625,
+      "learning_rate": 0.0018553191218633415,
+      "loss": 2.5284,
+      "step": 2849
+    },
+    {
+      "epoch": 0.1982677658353334,
+      "grad_norm": 2.34375,
+      "learning_rate": 0.0018552023547781231,
+      "loss": 2.6736,
+      "step": 2850
+    },
+    {
+      "epoch": 0.1983373334724686,
+      "grad_norm": 2.390625,
+      "learning_rate": 0.0018550855442700403,
+      "loss": 2.8147,
+      "step": 2851
+    },
+    {
+      "epoch": 0.19840690110960382,
+      "grad_norm": 2.65625,
+      "learning_rate": 0.0018549686903450234,
+      "loss": 2.7503,
+      "step": 2852
+    },
+    {
+      "epoch": 0.19847646874673902,
+      "grad_norm": 2.484375,
+      "learning_rate": 0.0018548517930090057,
+      "loss": 2.8713,
+      "step": 2853
+    },
+    {
+      "epoch": 0.19854603638387422,
+      "grad_norm": 2.4375,
+      "learning_rate": 0.0018547348522679225,
+      "loss": 2.7856,
+      "step": 2854
+    },
+    {
+      "epoch": 0.19861560402100942,
+      "grad_norm": 2.40625,
+      "learning_rate": 0.0018546178681277119,
+      "loss": 2.6505,
+      "step": 2855
+    },
+    {
+      "epoch": 0.19868517165814464,
+      "grad_norm": 2.34375,
+      "learning_rate": 0.0018545008405943136,
+      "loss": 2.6597,
+      "step": 2856
+    },
+    {
+      "epoch": 0.19875473929527984,
+      "grad_norm": 2.78125,
+      "learning_rate": 0.0018543837696736694,
+      "loss": 2.4495,
+      "step": 2857
+    },
+    {
+      "epoch": 0.19882430693241504,
+      "grad_norm": 2.390625,
+      "learning_rate": 0.001854266655371724,
+      "loss": 2.5478,
+      "step": 2858
+    },
+    {
+      "epoch": 0.19889387456955024,
+      "grad_norm": 2.9375,
+      "learning_rate": 0.0018541494976944235,
+      "loss": 2.8368,
+      "step": 2859
+    },
+    {
+      "epoch": 0.19896344220668544,
+      "grad_norm": 2.8125,
+      "learning_rate": 0.0018540322966477168,
+      "loss": 2.3955,
+      "step": 2860
+    },
+    {
+      "epoch": 0.19903300984382066,
+      "grad_norm": 3.015625,
+      "learning_rate": 0.001853915052237555,
+      "loss": 2.4895,
+      "step": 2861
+    },
+    {
+      "epoch": 0.19910257748095586,
+      "grad_norm": 2.640625,
+      "learning_rate": 0.0018537977644698907,
+      "loss": 2.7793,
+      "step": 2862
+    },
+    {
+      "epoch": 0.19917214511809106,
+      "grad_norm": 2.34375,
+      "learning_rate": 0.0018536804333506793,
+      "loss": 2.3713,
+      "step": 2863
+    },
+    {
+      "epoch": 0.19924171275522626,
+      "grad_norm": 2.578125,
+      "learning_rate": 0.0018535630588858783,
+      "loss": 2.6354,
+      "step": 2864
+    },
+    {
+      "epoch": 0.19931128039236148,
+      "grad_norm": 3.390625,
+      "learning_rate": 0.0018534456410814473,
+      "loss": 2.3463,
+      "step": 2865
+    },
+    {
+      "epoch": 0.19938084802949668,
+      "grad_norm": 2.859375,
+      "learning_rate": 0.0018533281799433489,
+      "loss": 2.2692,
+      "step": 2866
+    },
+    {
+      "epoch": 0.19945041566663188,
+      "grad_norm": 2.90625,
+      "learning_rate": 0.0018532106754775462,
+      "loss": 2.1026,
+      "step": 2867
+    },
+    {
+      "epoch": 0.19951998330376708,
+      "grad_norm": 2.140625,
+      "learning_rate": 0.001853093127690006,
+      "loss": 2.2183,
+      "step": 2868
+    },
+    {
+      "epoch": 0.1995895509409023,
+      "grad_norm": 2.921875,
+      "learning_rate": 0.0018529755365866967,
+      "loss": 2.5848,
+      "step": 2869
+    },
+    {
+      "epoch": 0.1996591185780375,
+      "grad_norm": 3.203125,
+      "learning_rate": 0.001852857902173589,
+      "loss": 2.3433,
+      "step": 2870
+    },
+    {
+      "epoch": 0.1997286862151727,
+      "grad_norm": 2.5625,
+      "learning_rate": 0.0018527402244566554,
+      "loss": 2.0217,
+      "step": 2871
+    },
+    {
+      "epoch": 0.1997982538523079,
+      "grad_norm": 2.578125,
+      "learning_rate": 0.0018526225034418715,
+      "loss": 2.2611,
+      "step": 2872
+    },
+    {
+      "epoch": 0.1998678214894431,
+      "grad_norm": 2.78125,
+      "learning_rate": 0.0018525047391352144,
+      "loss": 2.3769,
+      "step": 2873
+    },
+    {
+      "epoch": 0.19993738912657832,
+      "grad_norm": 2.3125,
+      "learning_rate": 0.001852386931542664,
+      "loss": 2.2041,
+      "step": 2874
+    },
+    {
+      "epoch": 0.20000695676371352,
+      "grad_norm": 2.90625,
+      "learning_rate": 0.0018522690806702013,
+      "loss": 2.1939,
+      "step": 2875
+    },
+    {
+      "epoch": 0.20007652440084872,
+      "grad_norm": 2.671875,
+      "learning_rate": 0.0018521511865238103,
+      "loss": 2.1469,
+      "step": 2876
+    },
+    {
+      "epoch": 0.20014609203798392,
+      "grad_norm": 2.515625,
+      "learning_rate": 0.0018520332491094775,
+      "loss": 2.3859,
+      "step": 2877
+    },
+    {
+      "epoch": 0.20021565967511915,
+      "grad_norm": 2.609375,
+      "learning_rate": 0.0018519152684331906,
+      "loss": 2.2566,
+      "step": 2878
+    },
+    {
+      "epoch": 0.20028522731225434,
+      "grad_norm": 2.46875,
+      "learning_rate": 0.0018517972445009404,
+      "loss": 2.1774,
+      "step": 2879
+    },
+    {
+      "epoch": 0.20035479494938954,
+      "grad_norm": 2.328125,
+      "learning_rate": 0.0018516791773187196,
+      "loss": 1.9746,
+      "step": 2880
+    },
+    {
+      "epoch": 0.20042436258652474,
+      "grad_norm": 2.375,
+      "learning_rate": 0.0018515610668925228,
+      "loss": 2.3208,
+      "step": 2881
+    },
+    {
+      "epoch": 0.20049393022365997,
+      "grad_norm": 2.6875,
+      "learning_rate": 0.0018514429132283476,
+      "loss": 2.1074,
+      "step": 2882
+    },
+    {
+      "epoch": 0.20056349786079516,
+      "grad_norm": 2.3125,
+      "learning_rate": 0.0018513247163321925,
+      "loss": 2.1219,
+      "step": 2883
+    },
+    {
+      "epoch": 0.20063306549793036,
+      "grad_norm": 2.953125,
+      "learning_rate": 0.0018512064762100594,
+      "loss": 2.3607,
+      "step": 2884
+    },
+    {
+      "epoch": 0.20070263313506556,
+      "grad_norm": 2.203125,
+      "learning_rate": 0.0018510881928679517,
+      "loss": 2.1561,
+      "step": 2885
+    },
+    {
+      "epoch": 0.20077220077220076,
+      "grad_norm": 2.140625,
+      "learning_rate": 0.0018509698663118754,
+      "loss": 1.8336,
+      "step": 2886
+    },
+    {
+      "epoch": 0.20084176840933599,
+      "grad_norm": 2.359375,
+      "learning_rate": 0.0018508514965478384,
+      "loss": 2.1067,
+      "step": 2887
+    },
+    {
+      "epoch": 0.20091133604647118,
+      "grad_norm": 2.3125,
+      "learning_rate": 0.0018507330835818513,
+      "loss": 2.0895,
+      "step": 2888
+    },
+    {
+      "epoch": 0.20098090368360638,
+      "grad_norm": 2.953125,
+      "learning_rate": 0.0018506146274199261,
+      "loss": 1.8984,
+      "step": 2889
+    },
+    {
+      "epoch": 0.20105047132074158,
+      "grad_norm": 2.40625,
+      "learning_rate": 0.0018504961280680777,
+      "loss": 1.9448,
+      "step": 2890
+    },
+    {
+      "epoch": 0.2011200389578768,
+      "grad_norm": 2.25,
+      "learning_rate": 0.0018503775855323226,
+      "loss": 1.7361,
+      "step": 2891
+    },
+    {
+      "epoch": 0.201189606595012,
+      "grad_norm": 2.40625,
+      "learning_rate": 0.00185025899981868,
+      "loss": 1.8027,
+      "step": 2892
+    },
+    {
+      "epoch": 0.2012591742321472,
+      "grad_norm": 3.03125,
+      "learning_rate": 0.0018501403709331706,
+      "loss": 2.1839,
+      "step": 2893
+    },
+    {
+      "epoch": 0.2013287418692824,
+      "grad_norm": 2.671875,
+      "learning_rate": 0.0018500216988818186,
+      "loss": 1.9038,
+      "step": 2894
+    },
+    {
+      "epoch": 0.20139830950641763,
+      "grad_norm": 2.96875,
+      "learning_rate": 0.0018499029836706491,
+      "loss": 1.8661,
+      "step": 2895
+    },
+    {
+      "epoch": 0.20146787714355283,
+      "grad_norm": 2.421875,
+      "learning_rate": 0.0018497842253056898,
+      "loss": 1.9367,
+      "step": 2896
+    },
+    {
+      "epoch": 0.20153744478068802,
+      "grad_norm": 3.34375,
+      "learning_rate": 0.0018496654237929709,
+      "loss": 2.0319,
+      "step": 2897
+    },
+    {
+      "epoch": 0.20160701241782322,
+      "grad_norm": 2.8125,
+      "learning_rate": 0.0018495465791385243,
+      "loss": 1.7812,
+      "step": 2898
+    },
+    {
+      "epoch": 0.20167658005495842,
+      "grad_norm": 2.625,
+      "learning_rate": 0.0018494276913483846,
+      "loss": 2.0751,
+      "step": 2899
+    },
+    {
+      "epoch": 0.20174614769209365,
+      "grad_norm": 4.34375,
+      "learning_rate": 0.0018493087604285882,
+      "loss": 2.0154,
+      "step": 2900
+    },
+    {
+      "epoch": 0.20181571532922885,
+      "grad_norm": 3.125,
+      "learning_rate": 0.0018491897863851733,
+      "loss": 2.1483,
+      "step": 2901
+    },
+    {
+      "epoch": 0.20188528296636404,
+      "grad_norm": 2.59375,
+      "learning_rate": 0.001849070769224182,
+      "loss": 1.8735,
+      "step": 2902
+    },
+    {
+      "epoch": 0.20195485060349924,
+      "grad_norm": 2.4375,
+      "learning_rate": 0.001848951708951656,
+      "loss": 1.9557,
+      "step": 2903
+    },
+    {
+      "epoch": 0.20202441824063447,
+      "grad_norm": 2.765625,
+      "learning_rate": 0.0018488326055736417,
+      "loss": 1.9589,
+      "step": 2904
+    },
+    {
+      "epoch": 0.20209398587776967,
+      "grad_norm": 2.4375,
+      "learning_rate": 0.001848713459096186,
+      "loss": 1.814,
+      "step": 2905
+    },
+    {
+      "epoch": 0.20216355351490486,
+      "grad_norm": 2.140625,
+      "learning_rate": 0.0018485942695253387,
+      "loss": 2.0086,
+      "step": 2906
+    },
+    {
+      "epoch": 0.20223312115204006,
+      "grad_norm": 2.125,
+      "learning_rate": 0.0018484750368671515,
+      "loss": 1.9076,
+      "step": 2907
+    },
+    {
+      "epoch": 0.2023026887891753,
+      "grad_norm": 2.15625,
+      "learning_rate": 0.0018483557611276788,
+      "loss": 1.9679,
+      "step": 2908
+    },
+    {
+      "epoch": 0.2023722564263105,
+      "grad_norm": 6.5625,
+      "learning_rate": 0.0018482364423129762,
+      "loss": 1.9789,
+      "step": 2909
+    },
+    {
+      "epoch": 0.20244182406344569,
+      "grad_norm": 2.1875,
+      "learning_rate": 0.0018481170804291029,
+      "loss": 1.7068,
+      "step": 2910
+    },
+    {
+      "epoch": 0.20251139170058088,
+      "grad_norm": 2.90625,
+      "learning_rate": 0.0018479976754821187,
+      "loss": 1.8457,
+      "step": 2911
+    },
+    {
+      "epoch": 0.20258095933771608,
+      "grad_norm": 1.9921875,
+      "learning_rate": 0.001847878227478087,
+      "loss": 1.8104,
+      "step": 2912
+    },
+    {
+      "epoch": 0.2026505269748513,
+      "grad_norm": 2.484375,
+      "learning_rate": 0.0018477587364230726,
+      "loss": 2.1387,
+      "step": 2913
+    },
+    {
+      "epoch": 0.2027200946119865,
+      "grad_norm": 2.328125,
+      "learning_rate": 0.0018476392023231423,
+      "loss": 1.9466,
+      "step": 2914
+    },
+    {
+      "epoch": 0.2027896622491217,
+      "grad_norm": 2.015625,
+      "learning_rate": 0.001847519625184366,
+      "loss": 1.7601,
+      "step": 2915
+    },
+    {
+      "epoch": 0.2028592298862569,
+      "grad_norm": 2.0625,
+      "learning_rate": 0.0018474000050128147,
+      "loss": 1.6999,
+      "step": 2916
+    },
+    {
+      "epoch": 0.20292879752339213,
+      "grad_norm": 3.03125,
+      "learning_rate": 0.0018472803418145625,
+      "loss": 1.6827,
+      "step": 2917
+    },
+    {
+      "epoch": 0.20299836516052733,
+      "grad_norm": 2.015625,
+      "learning_rate": 0.001847160635595685,
+      "loss": 1.7657,
+      "step": 2918
+    },
+    {
+      "epoch": 0.20306793279766253,
+      "grad_norm": 2.265625,
+      "learning_rate": 0.0018470408863622608,
+      "loss": 1.7335,
+      "step": 2919
+    },
+    {
+      "epoch": 0.20313750043479772,
+      "grad_norm": 2.078125,
+      "learning_rate": 0.0018469210941203698,
+      "loss": 1.7775,
+      "step": 2920
+    },
+    {
+      "epoch": 0.20320706807193295,
+      "grad_norm": 2.765625,
+      "learning_rate": 0.001846801258876094,
+      "loss": 1.6758,
+      "step": 2921
+    },
+    {
+      "epoch": 0.20327663570906815,
+      "grad_norm": 2.296875,
+      "learning_rate": 0.0018466813806355187,
+      "loss": 1.8042,
+      "step": 2922
+    },
+    {
+      "epoch": 0.20334620334620335,
+      "grad_norm": 2.0625,
+      "learning_rate": 0.0018465614594047307,
+      "loss": 1.8612,
+      "step": 2923
+    },
+    {
+      "epoch": 0.20341577098333855,
+      "grad_norm": 2.0625,
+      "learning_rate": 0.0018464414951898185,
+      "loss": 1.7777,
+      "step": 2924
+    },
+    {
+      "epoch": 0.20348533862047374,
+      "grad_norm": 2.53125,
+      "learning_rate": 0.0018463214879968735,
+      "loss": 1.7603,
+      "step": 2925
+    },
+    {
+      "epoch": 0.20355490625760897,
+      "grad_norm": 2.0625,
+      "learning_rate": 0.0018462014378319892,
+      "loss": 1.7786,
+      "step": 2926
+    },
+    {
+      "epoch": 0.20362447389474417,
+      "grad_norm": 2.09375,
+      "learning_rate": 0.0018460813447012613,
+      "loss": 1.666,
+      "step": 2927
+    },
+    {
+      "epoch": 0.20369404153187937,
+      "grad_norm": 2.46875,
+      "learning_rate": 0.0018459612086107868,
+      "loss": 1.8724,
+      "step": 2928
+    },
+    {
+      "epoch": 0.20376360916901456,
+      "grad_norm": 4.3125,
+      "learning_rate": 0.0018458410295666664,
+      "loss": 2.3202,
+      "step": 2929
+    },
+    {
+      "epoch": 0.2038331768061498,
+      "grad_norm": 2.484375,
+      "learning_rate": 0.0018457208075750018,
+      "loss": 1.7222,
+      "step": 2930
+    },
+    {
+      "epoch": 0.203902744443285,
+      "grad_norm": 2.078125,
+      "learning_rate": 0.0018456005426418973,
+      "loss": 1.6514,
+      "step": 2931
+    },
+    {
+      "epoch": 0.2039723120804202,
+      "grad_norm": 2.078125,
+      "learning_rate": 0.0018454802347734596,
+      "loss": 1.9026,
+      "step": 2932
+    },
+    {
+      "epoch": 0.20404187971755539,
+      "grad_norm": 2.4375,
+      "learning_rate": 0.0018453598839757968,
+      "loss": 1.6522,
+      "step": 2933
+    },
+    {
+      "epoch": 0.2041114473546906,
+      "grad_norm": 2.03125,
+      "learning_rate": 0.0018452394902550202,
+      "loss": 1.6696,
+      "step": 2934
+    },
+    {
+      "epoch": 0.2041810149918258,
+      "grad_norm": 2.046875,
+      "learning_rate": 0.0018451190536172427,
+      "loss": 1.6172,
+      "step": 2935
+    },
+    {
+      "epoch": 0.204250582628961,
+      "grad_norm": 4.5,
+      "learning_rate": 0.0018449985740685794,
+      "loss": 1.675,
+      "step": 2936
+    },
+    {
+      "epoch": 0.2043201502660962,
+      "grad_norm": 2.125,
+      "learning_rate": 0.0018448780516151474,
+      "loss": 1.7131,
+      "step": 2937
+    },
+    {
+      "epoch": 0.2043897179032314,
+      "grad_norm": 2.28125,
+      "learning_rate": 0.0018447574862630663,
+      "loss": 1.7751,
+      "step": 2938
+    },
+    {
+      "epoch": 0.20445928554036663,
+      "grad_norm": 1.9453125,
+      "learning_rate": 0.0018446368780184583,
+      "loss": 1.5138,
+      "step": 2939
+    },
+    {
+      "epoch": 0.20452885317750183,
+      "grad_norm": 2.140625,
+      "learning_rate": 0.0018445162268874466,
+      "loss": 1.7987,
+      "step": 2940
+    },
+    {
+      "epoch": 0.20459842081463703,
+      "grad_norm": 2.0625,
+      "learning_rate": 0.0018443955328761579,
+      "loss": 1.6765,
+      "step": 2941
+    },
+    {
+      "epoch": 0.20466798845177223,
+      "grad_norm": 2.0,
+      "learning_rate": 0.00184427479599072,
+      "loss": 1.8651,
+      "step": 2942
+    },
+    {
+      "epoch": 0.20473755608890745,
+      "grad_norm": 3.046875,
+      "learning_rate": 0.0018441540162372632,
+      "loss": 1.6984,
+      "step": 2943
+    },
+    {
+      "epoch": 0.20480712372604265,
+      "grad_norm": 2.65625,
+      "learning_rate": 0.0018440331936219207,
+      "loss": 1.554,
+      "step": 2944
+    },
+    {
+      "epoch": 0.20487669136317785,
+      "grad_norm": 2.625,
+      "learning_rate": 0.0018439123281508265,
+      "loss": 1.8372,
+      "step": 2945
+    },
+    {
+      "epoch": 0.20494625900031305,
+      "grad_norm": 1.875,
+      "learning_rate": 0.0018437914198301182,
+      "loss": 1.6942,
+      "step": 2946
+    },
+    {
+      "epoch": 0.20501582663744825,
+      "grad_norm": 1.890625,
+      "learning_rate": 0.0018436704686659346,
+      "loss": 1.6305,
+      "step": 2947
+    },
+    {
+      "epoch": 0.20508539427458347,
+      "grad_norm": 2.375,
+      "learning_rate": 0.0018435494746644168,
+      "loss": 1.6892,
+      "step": 2948
+    },
+    {
+      "epoch": 0.20515496191171867,
+      "grad_norm": 2.03125,
+      "learning_rate": 0.0018434284378317086,
+      "loss": 1.6875,
+      "step": 2949
+    },
+    {
+      "epoch": 0.20522452954885387,
+      "grad_norm": 2.28125,
+      "learning_rate": 0.0018433073581739555,
+      "loss": 1.799,
+      "step": 2950
+    },
+    {
+      "epoch": 0.20529409718598907,
+      "grad_norm": 2.09375,
+      "learning_rate": 0.0018431862356973056,
+      "loss": 1.9487,
+      "step": 2951
+    },
+    {
+      "epoch": 0.2053636648231243,
+      "grad_norm": 1.9609375,
+      "learning_rate": 0.001843065070407908,
+      "loss": 1.5475,
+      "step": 2952
+    },
+    {
+      "epoch": 0.2054332324602595,
+      "grad_norm": 2.0,
+      "learning_rate": 0.0018429438623119162,
+      "loss": 1.4665,
+      "step": 2953
+    },
+    {
+      "epoch": 0.2055028000973947,
+      "grad_norm": 1.9296875,
+      "learning_rate": 0.0018428226114154832,
+      "loss": 1.5239,
+      "step": 2954
+    },
+    {
+      "epoch": 0.2055723677345299,
+      "grad_norm": 2.015625,
+      "learning_rate": 0.0018427013177247664,
+      "loss": 1.602,
+      "step": 2955
+    },
+    {
+      "epoch": 0.2056419353716651,
+      "grad_norm": 1.7109375,
+      "learning_rate": 0.0018425799812459244,
+      "loss": 1.4334,
+      "step": 2956
+    },
+    {
+      "epoch": 0.2057115030088003,
+      "grad_norm": 2.40625,
+      "learning_rate": 0.0018424586019851175,
+      "loss": 1.825,
+      "step": 2957
+    },
+    {
+      "epoch": 0.2057810706459355,
+      "grad_norm": 2.796875,
+      "learning_rate": 0.0018423371799485095,
+      "loss": 1.6815,
+      "step": 2958
+    },
+    {
+      "epoch": 0.2058506382830707,
+      "grad_norm": 1.828125,
+      "learning_rate": 0.001842215715142265,
+      "loss": 1.6816,
+      "step": 2959
+    },
+    {
+      "epoch": 0.2059202059202059,
+      "grad_norm": 1.9609375,
+      "learning_rate": 0.0018420942075725514,
+      "loss": 1.4627,
+      "step": 2960
+    },
+    {
+      "epoch": 0.20598977355734113,
+      "grad_norm": 2.375,
+      "learning_rate": 0.0018419726572455387,
+      "loss": 1.7165,
+      "step": 2961
+    },
+    {
+      "epoch": 0.20605934119447633,
+      "grad_norm": 2.21875,
+      "learning_rate": 0.0018418510641673982,
+      "loss": 1.6336,
+      "step": 2962
+    },
+    {
+      "epoch": 0.20612890883161153,
+      "grad_norm": 1.640625,
+      "learning_rate": 0.001841729428344304,
+      "loss": 1.2457,
+      "step": 2963
+    },
+    {
+      "epoch": 0.20619847646874673,
+      "grad_norm": 1.6796875,
+      "learning_rate": 0.001841607749782432,
+      "loss": 1.6383,
+      "step": 2964
+    },
+    {
+      "epoch": 0.20626804410588195,
+      "grad_norm": 1.703125,
+      "learning_rate": 0.0018414860284879603,
+      "loss": 1.4405,
+      "step": 2965
+    },
+    {
+      "epoch": 0.20633761174301715,
+      "grad_norm": 2.0,
+      "learning_rate": 0.0018413642644670696,
+      "loss": 1.8695,
+      "step": 2966
+    },
+    {
+      "epoch": 0.20640717938015235,
+      "grad_norm": 1.5703125,
+      "learning_rate": 0.0018412424577259423,
+      "loss": 1.6998,
+      "step": 2967
+    },
+    {
+      "epoch": 0.20647674701728755,
+      "grad_norm": 1.7578125,
+      "learning_rate": 0.0018411206082707633,
+      "loss": 1.4037,
+      "step": 2968
+    },
+    {
+      "epoch": 0.20654631465442277,
+      "grad_norm": 2.3125,
+      "learning_rate": 0.0018409987161077193,
+      "loss": 1.4348,
+      "step": 2969
+    },
+    {
+      "epoch": 0.20661588229155797,
+      "grad_norm": 1.8828125,
+      "learning_rate": 0.0018408767812429993,
+      "loss": 1.6154,
+      "step": 2970
+    },
+    {
+      "epoch": 0.20668544992869317,
+      "grad_norm": 1.8046875,
+      "learning_rate": 0.001840754803682795,
+      "loss": 1.5229,
+      "step": 2971
+    },
+    {
+      "epoch": 0.20675501756582837,
+      "grad_norm": 1.9296875,
+      "learning_rate": 0.0018406327834332994,
+      "loss": 1.7966,
+      "step": 2972
+    },
+    {
+      "epoch": 0.20682458520296357,
+      "grad_norm": 1.984375,
+      "learning_rate": 0.0018405107205007082,
+      "loss": 1.7795,
+      "step": 2973
+    },
+    {
+      "epoch": 0.2068941528400988,
+      "grad_norm": 1.7109375,
+      "learning_rate": 0.0018403886148912188,
+      "loss": 1.6348,
+      "step": 2974
+    },
+    {
+      "epoch": 0.206963720477234,
+      "grad_norm": 2.03125,
+      "learning_rate": 0.0018402664666110316,
+      "loss": 1.429,
+      "step": 2975
+    },
+    {
+      "epoch": 0.2070332881143692,
+      "grad_norm": 1.8046875,
+      "learning_rate": 0.0018401442756663484,
+      "loss": 1.4652,
+      "step": 2976
+    },
+    {
+      "epoch": 0.2071028557515044,
+      "grad_norm": 1.7578125,
+      "learning_rate": 0.0018400220420633736,
+      "loss": 1.3845,
+      "step": 2977
+    },
+    {
+      "epoch": 0.20717242338863961,
+      "grad_norm": 1.9609375,
+      "learning_rate": 0.0018398997658083136,
+      "loss": 1.7564,
+      "step": 2978
+    },
+    {
+      "epoch": 0.2072419910257748,
+      "grad_norm": 2.046875,
+      "learning_rate": 0.0018397774469073767,
+      "loss": 1.7363,
+      "step": 2979
+    },
+    {
+      "epoch": 0.20731155866291,
+      "grad_norm": 1.640625,
+      "learning_rate": 0.0018396550853667741,
+      "loss": 1.3616,
+      "step": 2980
+    },
+    {
+      "epoch": 0.2073811263000452,
+      "grad_norm": 1.6953125,
+      "learning_rate": 0.0018395326811927182,
+      "loss": 1.2341,
+      "step": 2981
+    },
+    {
+      "epoch": 0.20745069393718044,
+      "grad_norm": 1.6015625,
+      "learning_rate": 0.0018394102343914245,
+      "loss": 1.4868,
+      "step": 2982
+    },
+    {
+      "epoch": 0.20752026157431563,
+      "grad_norm": 1.921875,
+      "learning_rate": 0.0018392877449691098,
+      "loss": 1.5404,
+      "step": 2983
+    },
+    {
+      "epoch": 0.20758982921145083,
+      "grad_norm": 2.96875,
+      "learning_rate": 0.0018391652129319941,
+      "loss": 1.5061,
+      "step": 2984
+    },
+    {
+      "epoch": 0.20765939684858603,
+      "grad_norm": 1.671875,
+      "learning_rate": 0.001839042638286298,
+      "loss": 1.4491,
+      "step": 2985
+    },
+    {
+      "epoch": 0.20772896448572123,
+      "grad_norm": 2.046875,
+      "learning_rate": 0.0018389200210382464,
+      "loss": 1.4995,
+      "step": 2986
+    },
+    {
+      "epoch": 0.20779853212285646,
+      "grad_norm": 1.8828125,
+      "learning_rate": 0.0018387973611940645,
+      "loss": 1.7329,
+      "step": 2987
+    },
+    {
+      "epoch": 0.20786809975999165,
+      "grad_norm": 2.3125,
+      "learning_rate": 0.0018386746587599804,
+      "loss": 1.326,
+      "step": 2988
+    },
+    {
+      "epoch": 0.20793766739712685,
+      "grad_norm": 1.7421875,
+      "learning_rate": 0.001838551913742224,
+      "loss": 1.3855,
+      "step": 2989
+    },
+    {
+      "epoch": 0.20800723503426205,
+      "grad_norm": 1.6484375,
+      "learning_rate": 0.0018384291261470285,
+      "loss": 1.4359,
+      "step": 2990
+    },
+    {
+      "epoch": 0.20807680267139728,
+      "grad_norm": 2.015625,
+      "learning_rate": 0.0018383062959806279,
+      "loss": 1.6186,
+      "step": 2991
+    },
+    {
+      "epoch": 0.20814637030853247,
+      "grad_norm": 1.78125,
+      "learning_rate": 0.0018381834232492587,
+      "loss": 1.6195,
+      "step": 2992
+    },
+    {
+      "epoch": 0.20821593794566767,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.0018380605079591603,
+      "loss": 1.2774,
+      "step": 2993
+    },
+    {
+      "epoch": 0.20828550558280287,
+      "grad_norm": 1.875,
+      "learning_rate": 0.0018379375501165734,
+      "loss": 1.4691,
+      "step": 2994
+    },
+    {
+      "epoch": 0.2083550732199381,
+      "grad_norm": 1.6796875,
+      "learning_rate": 0.0018378145497277409,
+      "loss": 1.3312,
+      "step": 2995
+    },
+    {
+      "epoch": 0.2084246408570733,
+      "grad_norm": 1.640625,
+      "learning_rate": 0.001837691506798909,
+      "loss": 1.5309,
+      "step": 2996
+    },
+    {
+      "epoch": 0.2084942084942085,
+      "grad_norm": 1.8671875,
+      "learning_rate": 0.0018375684213363243,
+      "loss": 1.5467,
+      "step": 2997
+    },
+    {
+      "epoch": 0.2085637761313437,
+      "grad_norm": 1.7265625,
+      "learning_rate": 0.001837445293346237,
+      "loss": 1.6222,
+      "step": 2998
+    },
+    {
+      "epoch": 0.2086333437684789,
+      "grad_norm": 1.59375,
+      "learning_rate": 0.0018373221228348987,
+      "loss": 1.4877,
+      "step": 2999
+    },
+    {
+      "epoch": 0.20870291140561412,
+      "grad_norm": 2.421875,
+      "learning_rate": 0.0018371989098085633,
+      "loss": 1.4715,
+      "step": 3000
+    },
+    {
+      "epoch": 0.20877247904274931,
+      "grad_norm": 1.9609375,
+      "learning_rate": 0.0018370756542734872,
+      "loss": 1.6579,
+      "step": 3001
+    },
+    {
+      "epoch": 0.2088420466798845,
+      "grad_norm": 2.203125,
+      "learning_rate": 0.0018369523562359285,
+      "loss": 1.6065,
+      "step": 3002
+    },
+    {
+      "epoch": 0.2089116143170197,
+      "grad_norm": 1.75,
+      "learning_rate": 0.0018368290157021474,
+      "loss": 1.3236,
+      "step": 3003
+    },
+    {
+      "epoch": 0.20898118195415494,
+      "grad_norm": 1.640625,
+      "learning_rate": 0.0018367056326784074,
+      "loss": 1.462,
+      "step": 3004
+    },
+    {
+      "epoch": 0.20905074959129014,
+      "grad_norm": 1.796875,
+      "learning_rate": 0.0018365822071709724,
+      "loss": 1.4751,
+      "step": 3005
+    },
+    {
+      "epoch": 0.20912031722842533,
+      "grad_norm": 1.8125,
+      "learning_rate": 0.0018364587391861095,
+      "loss": 1.385,
+      "step": 3006
+    },
+    {
+      "epoch": 0.20918988486556053,
+      "grad_norm": 1.71875,
+      "learning_rate": 0.0018363352287300877,
+      "loss": 1.3311,
+      "step": 3007
+    },
+    {
+      "epoch": 0.20925945250269576,
+      "grad_norm": 2.15625,
+      "learning_rate": 0.001836211675809179,
+      "loss": 1.6793,
+      "step": 3008
+    },
+    {
+      "epoch": 0.20932902013983096,
+      "grad_norm": 1.65625,
+      "learning_rate": 0.001836088080429656,
+      "loss": 1.4785,
+      "step": 3009
+    },
+    {
+      "epoch": 0.20939858777696616,
+      "grad_norm": 1.796875,
+      "learning_rate": 0.0018359644425977942,
+      "loss": 1.5472,
+      "step": 3010
+    },
+    {
+      "epoch": 0.20946815541410135,
+      "grad_norm": 1.96875,
+      "learning_rate": 0.0018358407623198718,
+      "loss": 1.3917,
+      "step": 3011
+    },
+    {
+      "epoch": 0.20953772305123655,
+      "grad_norm": 1.796875,
+      "learning_rate": 0.0018357170396021685,
+      "loss": 1.3381,
+      "step": 3012
+    },
+    {
+      "epoch": 0.20960729068837178,
+      "grad_norm": 1.6640625,
+      "learning_rate": 0.0018355932744509662,
+      "loss": 1.3268,
+      "step": 3013
+    },
+    {
+      "epoch": 0.20967685832550698,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.0018354694668725488,
+      "loss": 1.5767,
+      "step": 3014
+    },
+    {
+      "epoch": 0.20974642596264217,
+      "grad_norm": 1.546875,
+      "learning_rate": 0.0018353456168732035,
+      "loss": 1.5286,
+      "step": 3015
+    },
+    {
+      "epoch": 0.20981599359977737,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.001835221724459218,
+      "loss": 1.3449,
+      "step": 3016
+    },
+    {
+      "epoch": 0.2098855612369126,
+      "grad_norm": 2.03125,
+      "learning_rate": 0.0018350977896368832,
+      "loss": 1.4017,
+      "step": 3017
+    },
+    {
+      "epoch": 0.2099551288740478,
+      "grad_norm": 1.875,
+      "learning_rate": 0.0018349738124124918,
+      "loss": 1.4264,
+      "step": 3018
+    },
+    {
+      "epoch": 0.210024696511183,
+      "grad_norm": 1.84375,
+      "learning_rate": 0.0018348497927923387,
+      "loss": 1.4162,
+      "step": 3019
+    },
+    {
+      "epoch": 0.2100942641483182,
+      "grad_norm": 1.96875,
+      "learning_rate": 0.0018347257307827212,
+      "loss": 1.407,
+      "step": 3020
+    },
+    {
+      "epoch": 0.21016383178545342,
+      "grad_norm": 1.8984375,
+      "learning_rate": 0.0018346016263899383,
+      "loss": 1.4882,
+      "step": 3021
+    },
+    {
+      "epoch": 0.21023339942258862,
+      "grad_norm": 1.7734375,
+      "learning_rate": 0.0018344774796202916,
+      "loss": 1.4429,
+      "step": 3022
+    },
+    {
+      "epoch": 0.21030296705972382,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0018343532904800846,
+      "loss": 1.3768,
+      "step": 3023
+    },
+    {
+      "epoch": 0.21037253469685901,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.0018342290589756227,
+      "loss": 1.2807,
+      "step": 3024
+    },
+    {
+      "epoch": 0.2104421023339942,
+      "grad_norm": 1.890625,
+      "learning_rate": 0.0018341047851132143,
+      "loss": 1.4446,
+      "step": 3025
+    },
+    {
+      "epoch": 0.21051166997112944,
+      "grad_norm": 2.28125,
+      "learning_rate": 0.001833980468899169,
+      "loss": 1.412,
+      "step": 3026
+    },
+    {
+      "epoch": 0.21058123760826464,
+      "grad_norm": 1.6015625,
+      "learning_rate": 0.001833856110339799,
+      "loss": 1.3339,
+      "step": 3027
+    },
+    {
+      "epoch": 0.21065080524539984,
+      "grad_norm": 1.59375,
+      "learning_rate": 0.0018337317094414187,
+      "loss": 1.5386,
+      "step": 3028
+    },
+    {
+      "epoch": 0.21072037288253503,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0018336072662103447,
+      "loss": 1.5191,
+      "step": 3029
+    },
+    {
+      "epoch": 0.21078994051967026,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.0018334827806528954,
+      "loss": 1.5986,
+      "step": 3030
+    },
+    {
+      "epoch": 0.21085950815680546,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.0018333582527753913,
+      "loss": 1.4021,
+      "step": 3031
+    },
+    {
+      "epoch": 0.21092907579394066,
+      "grad_norm": 1.6953125,
+      "learning_rate": 0.0018332336825841557,
+      "loss": 1.5484,
+      "step": 3032
+    },
+    {
+      "epoch": 0.21099864343107586,
+      "grad_norm": 1.5390625,
+      "learning_rate": 0.0018331090700855134,
+      "loss": 1.4599,
+      "step": 3033
+    },
+    {
+      "epoch": 0.21106821106821108,
+      "grad_norm": 1.6796875,
+      "learning_rate": 0.001832984415285792,
+      "loss": 1.5886,
+      "step": 3034
+    },
+    {
+      "epoch": 0.21113777870534628,
+      "grad_norm": 1.6171875,
+      "learning_rate": 0.0018328597181913203,
+      "loss": 1.5175,
+      "step": 3035
+    },
+    {
+      "epoch": 0.21120734634248148,
+      "grad_norm": 1.859375,
+      "learning_rate": 0.0018327349788084303,
+      "loss": 1.4476,
+      "step": 3036
+    },
+    {
+      "epoch": 0.21127691397961668,
+      "grad_norm": 1.71875,
+      "learning_rate": 0.001832610197143455,
+      "loss": 1.539,
+      "step": 3037
+    },
+    {
+      "epoch": 0.21134648161675187,
+      "grad_norm": 1.5546875,
+      "learning_rate": 0.0018324853732027307,
+      "loss": 1.6018,
+      "step": 3038
+    },
+    {
+      "epoch": 0.2114160492538871,
+      "grad_norm": 1.546875,
+      "learning_rate": 0.0018323605069925954,
+      "loss": 1.7061,
+      "step": 3039
+    },
+    {
+      "epoch": 0.2114856168910223,
+      "grad_norm": 1.78125,
+      "learning_rate": 0.0018322355985193885,
+      "loss": 1.5131,
+      "step": 3040
+    },
+    {
+      "epoch": 0.2115551845281575,
+      "grad_norm": 1.5,
+      "learning_rate": 0.001832110647789453,
+      "loss": 1.5458,
+      "step": 3041
+    },
+    {
+      "epoch": 0.2116247521652927,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.0018319856548091329,
+      "loss": 1.3315,
+      "step": 3042
+    },
+    {
+      "epoch": 0.21169431980242792,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.0018318606195847748,
+      "loss": 1.2364,
+      "step": 3043
+    },
+    {
+      "epoch": 0.21176388743956312,
+      "grad_norm": 1.546875,
+      "learning_rate": 0.0018317355421227273,
+      "loss": 1.3256,
+      "step": 3044
+    },
+    {
+      "epoch": 0.21183345507669832,
+      "grad_norm": 1.7265625,
+      "learning_rate": 0.0018316104224293413,
+      "loss": 1.2375,
+      "step": 3045
+    },
+    {
+      "epoch": 0.21190302271383352,
+      "grad_norm": 1.75,
+      "learning_rate": 0.0018314852605109695,
+      "loss": 1.2515,
+      "step": 3046
+    },
+    {
+      "epoch": 0.21197259035096874,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.0018313600563739673,
+      "loss": 1.3807,
+      "step": 3047
+    },
+    {
+      "epoch": 0.21204215798810394,
+      "grad_norm": 1.6171875,
+      "learning_rate": 0.0018312348100246918,
+      "loss": 1.3903,
+      "step": 3048
+    },
+    {
+      "epoch": 0.21211172562523914,
+      "grad_norm": 2.1875,
+      "learning_rate": 0.0018311095214695024,
+      "loss": 1.9212,
+      "step": 3049
+    },
+    {
+      "epoch": 0.21218129326237434,
+      "grad_norm": 1.78125,
+      "learning_rate": 0.0018309841907147605,
+      "loss": 1.1514,
+      "step": 3050
+    },
+    {
+      "epoch": 0.21225086089950954,
+      "grad_norm": 1.5,
+      "learning_rate": 0.00183085881776683,
+      "loss": 1.3666,
+      "step": 3051
+    },
+    {
+      "epoch": 0.21232042853664476,
+      "grad_norm": 1.8125,
+      "learning_rate": 0.0018307334026320765,
+      "loss": 1.6806,
+      "step": 3052
+    },
+    {
+      "epoch": 0.21238999617377996,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.0018306079453168681,
+      "loss": 1.5053,
+      "step": 3053
+    },
+    {
+      "epoch": 0.21245956381091516,
+      "grad_norm": 1.734375,
+      "learning_rate": 0.0018304824458275745,
+      "loss": 1.2886,
+      "step": 3054
+    },
+    {
+      "epoch": 0.21252913144805036,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0018303569041705685,
+      "loss": 1.4875,
+      "step": 3055
+    },
+    {
+      "epoch": 0.21259869908518558,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0018302313203522242,
+      "loss": 1.5887,
+      "step": 3056
+    },
+    {
+      "epoch": 0.21266826672232078,
+      "grad_norm": 1.59375,
+      "learning_rate": 0.0018301056943789181,
+      "loss": 1.4743,
+      "step": 3057
+    },
+    {
+      "epoch": 0.21273783435945598,
+      "grad_norm": 2.03125,
+      "learning_rate": 0.001829980026257029,
+      "loss": 1.2388,
+      "step": 3058
+    },
+    {
+      "epoch": 0.21280740199659118,
+      "grad_norm": 1.78125,
+      "learning_rate": 0.0018298543159929372,
+      "loss": 1.6247,
+      "step": 3059
+    },
+    {
+      "epoch": 0.2128769696337264,
+      "grad_norm": 1.7734375,
+      "learning_rate": 0.0018297285635930265,
+      "loss": 1.4666,
+      "step": 3060
+    },
+    {
+      "epoch": 0.2129465372708616,
+      "grad_norm": 1.5390625,
+      "learning_rate": 0.0018296027690636813,
+      "loss": 1.4244,
+      "step": 3061
+    },
+    {
+      "epoch": 0.2130161049079968,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.001829476932411289,
+      "loss": 1.3429,
+      "step": 3062
+    },
+    {
+      "epoch": 0.213085672545132,
+      "grad_norm": 1.96875,
+      "learning_rate": 0.0018293510536422388,
+      "loss": 1.1101,
+      "step": 3063
+    },
+    {
+      "epoch": 0.2131552401822672,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0018292251327629225,
+      "loss": 1.2017,
+      "step": 3064
+    },
+    {
+      "epoch": 0.21322480781940242,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0018290991697797335,
+      "loss": 1.3601,
+      "step": 3065
+    },
+    {
+      "epoch": 0.21329437545653762,
+      "grad_norm": 2.015625,
+      "learning_rate": 0.0018289731646990678,
+      "loss": 1.2416,
+      "step": 3066
+    },
+    {
+      "epoch": 0.21336394309367282,
+      "grad_norm": 1.8125,
+      "learning_rate": 0.0018288471175273227,
+      "loss": 1.4516,
+      "step": 3067
+    },
+    {
+      "epoch": 0.21343351073080802,
+      "grad_norm": 1.8359375,
+      "learning_rate": 0.001828721028270899,
+      "loss": 1.245,
+      "step": 3068
+    },
+    {
+      "epoch": 0.21350307836794324,
+      "grad_norm": 2.1875,
+      "learning_rate": 0.0018285948969361985,
+      "loss": 1.4108,
+      "step": 3069
+    },
+    {
+      "epoch": 0.21357264600507844,
+      "grad_norm": 1.5703125,
+      "learning_rate": 0.0018284687235296255,
+      "loss": 1.1665,
+      "step": 3070
+    },
+    {
+      "epoch": 0.21364221364221364,
+      "grad_norm": 1.9296875,
+      "learning_rate": 0.0018283425080575866,
+      "loss": 1.4434,
+      "step": 3071
+    },
+    {
+      "epoch": 0.21371178127934884,
+      "grad_norm": 1.5390625,
+      "learning_rate": 0.00182821625052649,
+      "loss": 1.3012,
+      "step": 3072
+    },
+    {
+      "epoch": 0.21378134891648407,
+      "grad_norm": 2.046875,
+      "learning_rate": 0.001828089950942747,
+      "loss": 1.5307,
+      "step": 3073
+    },
+    {
+      "epoch": 0.21385091655361926,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.0018279636093127705,
+      "loss": 1.18,
+      "step": 3074
+    },
+    {
+      "epoch": 0.21392048419075446,
+      "grad_norm": 1.5234375,
+      "learning_rate": 0.0018278372256429747,
+      "loss": 1.4393,
+      "step": 3075
+    },
+    {
+      "epoch": 0.21399005182788966,
+      "grad_norm": 1.625,
+      "learning_rate": 0.0018277107999397774,
+      "loss": 1.3494,
+      "step": 3076
+    },
+    {
+      "epoch": 0.21405961946502486,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.0018275843322095974,
+      "loss": 1.3145,
+      "step": 3077
+    },
+    {
+      "epoch": 0.21412918710216008,
+      "grad_norm": 1.578125,
+      "learning_rate": 0.0018274578224588564,
+      "loss": 1.6444,
+      "step": 3078
+    },
+    {
+      "epoch": 0.21419875473929528,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.001827331270693978,
+      "loss": 1.2429,
+      "step": 3079
+    },
+    {
+      "epoch": 0.21426832237643048,
+      "grad_norm": 1.6640625,
+      "learning_rate": 0.0018272046769213879,
+      "loss": 1.5588,
+      "step": 3080
+    },
+    {
+      "epoch": 0.21433789001356568,
+      "grad_norm": 1.8203125,
+      "learning_rate": 0.0018270780411475133,
+      "loss": 1.5714,
+      "step": 3081
+    },
+    {
+      "epoch": 0.2144074576507009,
+      "grad_norm": 1.6875,
+      "learning_rate": 0.0018269513633787848,
+      "loss": 1.2007,
+      "step": 3082
+    },
+    {
+      "epoch": 0.2144770252878361,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.0018268246436216342,
+      "loss": 1.2613,
+      "step": 3083
+    },
+    {
+      "epoch": 0.2145465929249713,
+      "grad_norm": 1.5,
+      "learning_rate": 0.0018266978818824958,
+      "loss": 1.6095,
+      "step": 3084
+    },
+    {
+      "epoch": 0.2146161605621065,
+      "grad_norm": 1.5859375,
+      "learning_rate": 0.0018265710781678055,
+      "loss": 1.3277,
+      "step": 3085
+    },
+    {
+      "epoch": 0.21468572819924173,
+      "grad_norm": 1.65625,
+      "learning_rate": 0.0018264442324840025,
+      "loss": 1.4692,
+      "step": 3086
+    },
+    {
+      "epoch": 0.21475529583637692,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.0018263173448375266,
+      "loss": 1.3238,
+      "step": 3087
+    },
+    {
+      "epoch": 0.21482486347351212,
+      "grad_norm": 1.5390625,
+      "learning_rate": 0.0018261904152348212,
+      "loss": 1.2368,
+      "step": 3088
+    },
+    {
+      "epoch": 0.21489443111064732,
+      "grad_norm": 1.71875,
+      "learning_rate": 0.0018260634436823304,
+      "loss": 1.4299,
+      "step": 3089
+    },
+    {
+      "epoch": 0.21496399874778252,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.001825936430186502,
+      "loss": 1.3339,
+      "step": 3090
+    },
+    {
+      "epoch": 0.21503356638491775,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0018258093747537845,
+      "loss": 1.3982,
+      "step": 3091
+    },
+    {
+      "epoch": 0.21510313402205294,
+      "grad_norm": 2.25,
+      "learning_rate": 0.0018256822773906297,
+      "loss": 1.451,
+      "step": 3092
+    },
+    {
+      "epoch": 0.21517270165918814,
+      "grad_norm": 1.8203125,
+      "learning_rate": 0.00182555513810349,
+      "loss": 1.56,
+      "step": 3093
+    },
+    {
+      "epoch": 0.21524226929632334,
+      "grad_norm": 1.703125,
+      "learning_rate": 0.0018254279568988218,
+      "loss": 1.1239,
+      "step": 3094
+    },
+    {
+      "epoch": 0.21531183693345857,
+      "grad_norm": 1.5,
+      "learning_rate": 0.0018253007337830824,
+      "loss": 1.2984,
+      "step": 3095
+    },
+    {
+      "epoch": 0.21538140457059377,
+      "grad_norm": 1.75,
+      "learning_rate": 0.0018251734687627318,
+      "loss": 1.0488,
+      "step": 3096
+    },
+    {
+      "epoch": 0.21545097220772896,
+      "grad_norm": 1.765625,
+      "learning_rate": 0.0018250461618442312,
+      "loss": 1.0971,
+      "step": 3097
+    },
+    {
+      "epoch": 0.21552053984486416,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.0018249188130340453,
+      "loss": 1.3812,
+      "step": 3098
+    },
+    {
+      "epoch": 0.21559010748199936,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0018247914223386398,
+      "loss": 1.4449,
+      "step": 3099
+    },
+    {
+      "epoch": 0.2156596751191346,
+      "grad_norm": 1.5,
+      "learning_rate": 0.0018246639897644835,
+      "loss": 1.3833,
+      "step": 3100
+    },
+    {
+      "epoch": 0.21572924275626978,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.001824536515318046,
+      "loss": 1.4897,
+      "step": 3101
+    },
+    {
+      "epoch": 0.21579881039340498,
+      "grad_norm": 2.21875,
+      "learning_rate": 0.0018244089990058004,
+      "loss": 1.369,
+      "step": 3102
+    },
+    {
+      "epoch": 0.21586837803054018,
+      "grad_norm": 2.15625,
+      "learning_rate": 0.0018242814408342212,
+      "loss": 1.5356,
+      "step": 3103
+    },
+    {
+      "epoch": 0.2159379456676754,
+      "grad_norm": 1.734375,
+      "learning_rate": 0.0018241538408097849,
+      "loss": 1.2144,
+      "step": 3104
+    },
+    {
+      "epoch": 0.2160075133048106,
+      "grad_norm": 1.375,
+      "learning_rate": 0.001824026198938971,
+      "loss": 1.2871,
+      "step": 3105
+    },
+    {
+      "epoch": 0.2160770809419458,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0018238985152282598,
+      "loss": 1.4326,
+      "step": 3106
+    },
+    {
+      "epoch": 0.216146648579081,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.0018237707896841347,
+      "loss": 1.3786,
+      "step": 3107
+    },
+    {
+      "epoch": 0.21621621621621623,
+      "grad_norm": 2.015625,
+      "learning_rate": 0.0018236430223130813,
+      "loss": 1.4567,
+      "step": 3108
+    },
+    {
+      "epoch": 0.21628578385335143,
+      "grad_norm": 1.5703125,
+      "learning_rate": 0.0018235152131215867,
+      "loss": 1.397,
+      "step": 3109
+    },
+    {
+      "epoch": 0.21635535149048662,
+      "grad_norm": 2.03125,
+      "learning_rate": 0.0018233873621161401,
+      "loss": 1.3467,
+      "step": 3110
+    },
+    {
+      "epoch": 0.21642491912762182,
+      "grad_norm": 2.0625,
+      "learning_rate": 0.0018232594693032337,
+      "loss": 1.4307,
+      "step": 3111
+    },
+    {
+      "epoch": 0.21649448676475702,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.001823131534689361,
+      "loss": 1.3591,
+      "step": 3112
+    },
+    {
+      "epoch": 0.21656405440189225,
+      "grad_norm": 1.5546875,
+      "learning_rate": 0.0018230035582810175,
+      "loss": 1.3185,
+      "step": 3113
+    },
+    {
+      "epoch": 0.21663362203902745,
+      "grad_norm": 1.5703125,
+      "learning_rate": 0.0018228755400847016,
+      "loss": 1.3026,
+      "step": 3114
+    },
+    {
+      "epoch": 0.21670318967616264,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0018227474801069136,
+      "loss": 1.2597,
+      "step": 3115
+    },
+    {
+      "epoch": 0.21677275731329784,
+      "grad_norm": 1.6484375,
+      "learning_rate": 0.0018226193783541557,
+      "loss": 1.2826,
+      "step": 3116
+    },
+    {
+      "epoch": 0.21684232495043307,
+      "grad_norm": 1.765625,
+      "learning_rate": 0.0018224912348329316,
+      "loss": 1.2702,
+      "step": 3117
+    },
+    {
+      "epoch": 0.21691189258756827,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.0018223630495497484,
+      "loss": 1.1387,
+      "step": 3118
+    },
+    {
+      "epoch": 0.21698146022470347,
+      "grad_norm": 1.4765625,
+      "learning_rate": 0.001822234822511115,
+      "loss": 1.3643,
+      "step": 3119
+    },
+    {
+      "epoch": 0.21705102786183866,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0018221065537235412,
+      "loss": 1.0964,
+      "step": 3120
+    },
+    {
+      "epoch": 0.2171205954989739,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0018219782431935405,
+      "loss": 1.242,
+      "step": 3121
+    },
+    {
+      "epoch": 0.2171901631361091,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.0018218498909276276,
+      "loss": 1.3961,
+      "step": 3122
+    },
+    {
+      "epoch": 0.2172597307732443,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.0018217214969323198,
+      "loss": 1.3259,
+      "step": 3123
+    },
+    {
+      "epoch": 0.21732929841037948,
+      "grad_norm": 1.6796875,
+      "learning_rate": 0.001821593061214136,
+      "loss": 1.1908,
+      "step": 3124
+    },
+    {
+      "epoch": 0.21739886604751468,
+      "grad_norm": 1.875,
+      "learning_rate": 0.0018214645837795979,
+      "loss": 1.5525,
+      "step": 3125
+    },
+    {
+      "epoch": 0.2174684336846499,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.0018213360646352286,
+      "loss": 1.1041,
+      "step": 3126
+    },
+    {
+      "epoch": 0.2175380013217851,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.0018212075037875538,
+      "loss": 1.4062,
+      "step": 3127
+    },
+    {
+      "epoch": 0.2176075689589203,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.001821078901243101,
+      "loss": 1.3016,
+      "step": 3128
+    },
+    {
+      "epoch": 0.2176771365960555,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.0018209502570084005,
+      "loss": 1.2827,
+      "step": 3129
+    },
+    {
+      "epoch": 0.21774670423319073,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.0018208215710899833,
+      "loss": 1.5377,
+      "step": 3130
+    },
+    {
+      "epoch": 0.21781627187032593,
+      "grad_norm": 1.5703125,
+      "learning_rate": 0.0018206928434943846,
+      "loss": 1.3466,
+      "step": 3131
+    },
+    {
+      "epoch": 0.21788583950746113,
+      "grad_norm": 1.6484375,
+      "learning_rate": 0.0018205640742281397,
+      "loss": 1.2992,
+      "step": 3132
+    },
+    {
+      "epoch": 0.21795540714459632,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.001820435263297787,
+      "loss": 1.181,
+      "step": 3133
+    },
+    {
+      "epoch": 0.21802497478173155,
+      "grad_norm": 1.6328125,
+      "learning_rate": 0.0018203064107098666,
+      "loss": 1.4374,
+      "step": 3134
+    },
+    {
+      "epoch": 0.21809454241886675,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.0018201775164709219,
+      "loss": 1.1472,
+      "step": 3135
+    },
+    {
+      "epoch": 0.21816411005600195,
+      "grad_norm": 1.625,
+      "learning_rate": 0.0018200485805874962,
+      "loss": 1.2548,
+      "step": 3136
+    },
+    {
+      "epoch": 0.21823367769313715,
+      "grad_norm": 1.515625,
+      "learning_rate": 0.0018199196030661375,
+      "loss": 1.1976,
+      "step": 3137
+    },
+    {
+      "epoch": 0.21830324533027234,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.001819790583913394,
+      "loss": 1.3096,
+      "step": 3138
+    },
+    {
+      "epoch": 0.21837281296740757,
+      "grad_norm": 1.609375,
+      "learning_rate": 0.0018196615231358165,
+      "loss": 1.3699,
+      "step": 3139
+    },
+    {
+      "epoch": 0.21844238060454277,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0018195324207399587,
+      "loss": 1.2893,
+      "step": 3140
+    },
+    {
+      "epoch": 0.21851194824167797,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0018194032767323747,
+      "loss": 1.2761,
+      "step": 3141
+    },
+    {
+      "epoch": 0.21858151587881317,
+      "grad_norm": 1.515625,
+      "learning_rate": 0.0018192740911196225,
+      "loss": 1.1881,
+      "step": 3142
+    },
+    {
+      "epoch": 0.2186510835159484,
+      "grad_norm": 1.375,
+      "learning_rate": 0.001819144863908262,
+      "loss": 1.2316,
+      "step": 3143
+    },
+    {
+      "epoch": 0.2187206511530836,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.0018190155951048534,
+      "loss": 1.3229,
+      "step": 3144
+    },
+    {
+      "epoch": 0.2187902187902188,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0018188862847159616,
+      "loss": 1.2341,
+      "step": 3145
+    },
+    {
+      "epoch": 0.218859786427354,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0018187569327481512,
+      "loss": 1.1094,
+      "step": 3146
+    },
+    {
+      "epoch": 0.2189293540644892,
+      "grad_norm": 1.7578125,
+      "learning_rate": 0.001818627539207991,
+      "loss": 1.2033,
+      "step": 3147
+    },
+    {
+      "epoch": 0.2189989217016244,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.0018184981041020505,
+      "loss": 1.0848,
+      "step": 3148
+    },
+    {
+      "epoch": 0.2190684893387596,
+      "grad_norm": 2.265625,
+      "learning_rate": 0.0018183686274369016,
+      "loss": 1.4267,
+      "step": 3149
+    },
+    {
+      "epoch": 0.2191380569758948,
+      "grad_norm": 1.5859375,
+      "learning_rate": 0.001818239109219119,
+      "loss": 1.2773,
+      "step": 3150
+    },
+    {
+      "epoch": 0.21920762461303,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0018181095494552784,
+      "loss": 1.3228,
+      "step": 3151
+    },
+    {
+      "epoch": 0.21927719225016523,
+      "grad_norm": 1.515625,
+      "learning_rate": 0.0018179799481519586,
+      "loss": 1.2327,
+      "step": 3152
+    },
+    {
+      "epoch": 0.21934675988730043,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.00181785030531574,
+      "loss": 1.3893,
+      "step": 3153
+    },
+    {
+      "epoch": 0.21941632752443563,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.001817720620953205,
+      "loss": 1.3026,
+      "step": 3154
+    },
+    {
+      "epoch": 0.21948589516157083,
+      "grad_norm": 1.515625,
+      "learning_rate": 0.0018175908950709384,
+      "loss": 1.1513,
+      "step": 3155
+    },
+    {
+      "epoch": 0.21955546279870605,
+      "grad_norm": 1.5,
+      "learning_rate": 0.0018174611276755273,
+      "loss": 1.239,
+      "step": 3156
+    },
+    {
+      "epoch": 0.21962503043584125,
+      "grad_norm": 1.6640625,
+      "learning_rate": 0.0018173313187735602,
+      "loss": 1.1086,
+      "step": 3157
+    },
+    {
+      "epoch": 0.21969459807297645,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.0018172014683716287,
+      "loss": 1.2177,
+      "step": 3158
+    },
+    {
+      "epoch": 0.21976416571011165,
+      "grad_norm": 1.59375,
+      "learning_rate": 0.0018170715764763254,
+      "loss": 1.1821,
+      "step": 3159
+    },
+    {
+      "epoch": 0.21983373334724687,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.0018169416430942461,
+      "loss": 1.7046,
+      "step": 3160
+    },
+    {
+      "epoch": 0.21990330098438207,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.0018168116682319875,
+      "loss": 1.3001,
+      "step": 3161
+    },
+    {
+      "epoch": 0.21997286862151727,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0018166816518961498,
+      "loss": 1.2037,
+      "step": 3162
+    },
+    {
+      "epoch": 0.22004243625865247,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.001816551594093334,
+      "loss": 1.4909,
+      "step": 3163
+    },
+    {
+      "epoch": 0.22011200389578767,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.001816421494830144,
+      "loss": 1.2482,
+      "step": 3164
+    },
+    {
+      "epoch": 0.2201815715329229,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.0018162913541131856,
+      "loss": 1.5315,
+      "step": 3165
+    },
+    {
+      "epoch": 0.2202511391700581,
+      "grad_norm": 1.625,
+      "learning_rate": 0.0018161611719490663,
+      "loss": 1.2521,
+      "step": 3166
+    },
+    {
+      "epoch": 0.2203207068071933,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.0018160309483443969,
+      "loss": 1.2491,
+      "step": 3167
+    },
+    {
+      "epoch": 0.2203902744443285,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.001815900683305789,
+      "loss": 1.3487,
+      "step": 3168
+    },
+    {
+      "epoch": 0.2204598420814637,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.0018157703768398566,
+      "loss": 1.2102,
+      "step": 3169
+    },
+    {
+      "epoch": 0.2205294097185989,
+      "grad_norm": 1.5703125,
+      "learning_rate": 0.0018156400289532164,
+      "loss": 1.2524,
+      "step": 3170
+    },
+    {
+      "epoch": 0.2205989773557341,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0018155096396524867,
+      "loss": 1.3116,
+      "step": 3171
+    },
+    {
+      "epoch": 0.2206685449928693,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0018153792089442879,
+      "loss": 1.3005,
+      "step": 3172
+    },
+    {
+      "epoch": 0.22073811263000453,
+      "grad_norm": 1.5859375,
+      "learning_rate": 0.0018152487368352426,
+      "loss": 1.3617,
+      "step": 3173
+    },
+    {
+      "epoch": 0.22080768026713973,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0018151182233319756,
+      "loss": 1.3319,
+      "step": 3174
+    },
+    {
+      "epoch": 0.22087724790427493,
+      "grad_norm": 1.25,
+      "learning_rate": 0.001814987668441114,
+      "loss": 1.1319,
+      "step": 3175
+    },
+    {
+      "epoch": 0.22094681554141013,
+      "grad_norm": 1.546875,
+      "learning_rate": 0.0018148570721692862,
+      "loss": 1.1426,
+      "step": 3176
+    },
+    {
+      "epoch": 0.22101638317854533,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0018147264345231234,
+      "loss": 1.5089,
+      "step": 3177
+    },
+    {
+      "epoch": 0.22108595081568055,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.001814595755509259,
+      "loss": 1.2042,
+      "step": 3178
+    },
+    {
+      "epoch": 0.22115551845281575,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0018144650351343277,
+      "loss": 1.3103,
+      "step": 3179
+    },
+    {
+      "epoch": 0.22122508608995095,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0018143342734049672,
+      "loss": 1.256,
+      "step": 3180
+    },
+    {
+      "epoch": 0.22129465372708615,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.0018142034703278172,
+      "loss": 1.4062,
+      "step": 3181
+    },
+    {
+      "epoch": 0.22136422136422138,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.0018140726259095186,
+      "loss": 1.5177,
+      "step": 3182
+    },
+    {
+      "epoch": 0.22143378900135657,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.0018139417401567153,
+      "loss": 1.3101,
+      "step": 3183
+    },
+    {
+      "epoch": 0.22150335663849177,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0018138108130760528,
+      "loss": 1.2181,
+      "step": 3184
+    },
+    {
+      "epoch": 0.22157292427562697,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.0018136798446741797,
+      "loss": 1.1762,
+      "step": 3185
+    },
+    {
+      "epoch": 0.2216424919127622,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.001813548834957745,
+      "loss": 1.3214,
+      "step": 3186
+    },
+    {
+      "epoch": 0.2217120595498974,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0018134177839334007,
+      "loss": 1.332,
+      "step": 3187
+    },
+    {
+      "epoch": 0.2217816271870326,
+      "grad_norm": 1.671875,
+      "learning_rate": 0.0018132866916078017,
+      "loss": 1.5935,
+      "step": 3188
+    },
+    {
+      "epoch": 0.2218511948241678,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0018131555579876037,
+      "loss": 1.1784,
+      "step": 3189
+    },
+    {
+      "epoch": 0.221920762461303,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.001813024383079465,
+      "loss": 1.1413,
+      "step": 3190
+    },
+    {
+      "epoch": 0.22199033009843822,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0018128931668900462,
+      "loss": 1.2292,
+      "step": 3191
+    },
+    {
+      "epoch": 0.2220598977355734,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0018127619094260095,
+      "loss": 1.0415,
+      "step": 3192
+    },
+    {
+      "epoch": 0.2221294653727086,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.0018126306106940198,
+      "loss": 1.2261,
+      "step": 3193
+    },
+    {
+      "epoch": 0.2221990330098438,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0018124992707007435,
+      "loss": 0.8691,
+      "step": 3194
+    },
+    {
+      "epoch": 0.22226860064697904,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0018123678894528498,
+      "loss": 1.3883,
+      "step": 3195
+    },
+    {
+      "epoch": 0.22233816828411423,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.0018122364669570091,
+      "loss": 1.3116,
+      "step": 3196
+    },
+    {
+      "epoch": 0.22240773592124943,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0018121050032198945,
+      "loss": 1.032,
+      "step": 3197
+    },
+    {
+      "epoch": 0.22247730355838463,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.0018119734982481814,
+      "loss": 1.2981,
+      "step": 3198
+    },
+    {
+      "epoch": 0.22254687119551986,
+      "grad_norm": 1.5625,
+      "learning_rate": 0.0018118419520485466,
+      "loss": 1.3269,
+      "step": 3199
+    },
+    {
+      "epoch": 0.22261643883265506,
+      "grad_norm": 1.71875,
+      "learning_rate": 0.0018117103646276692,
+      "loss": 1.2838,
+      "step": 3200
+    },
+    {
+      "epoch": 0.22268600646979025,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.001811578735992231,
+      "loss": 1.3026,
+      "step": 3201
+    },
+    {
+      "epoch": 0.22275557410692545,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0018114470661489154,
+      "loss": 1.264,
+      "step": 3202
+    },
+    {
+      "epoch": 0.22282514174406065,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.0018113153551044077,
+      "loss": 1.1016,
+      "step": 3203
+    },
+    {
+      "epoch": 0.22289470938119588,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.0018111836028653957,
+      "loss": 1.1228,
+      "step": 3204
+    },
+    {
+      "epoch": 0.22296427701833108,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0018110518094385686,
+      "loss": 1.4613,
+      "step": 3205
+    },
+    {
+      "epoch": 0.22303384465546627,
+      "grad_norm": 1.6484375,
+      "learning_rate": 0.001810919974830619,
+      "loss": 1.4146,
+      "step": 3206
+    },
+    {
+      "epoch": 0.22310341229260147,
+      "grad_norm": 1.5,
+      "learning_rate": 0.0018107880990482403,
+      "loss": 1.2999,
+      "step": 3207
+    },
+    {
+      "epoch": 0.2231729799297367,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0018106561820981286,
+      "loss": 1.2888,
+      "step": 3208
+    },
+    {
+      "epoch": 0.2232425475668719,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0018105242239869822,
+      "loss": 1.3011,
+      "step": 3209
+    },
+    {
+      "epoch": 0.2233121152040071,
+      "grad_norm": 1.890625,
+      "learning_rate": 0.0018103922247215008,
+      "loss": 1.2069,
+      "step": 3210
+    },
+    {
+      "epoch": 0.2233816828411423,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0018102601843083869,
+      "loss": 1.0903,
+      "step": 3211
+    },
+    {
+      "epoch": 0.22345125047827752,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0018101281027543448,
+      "loss": 1.044,
+      "step": 3212
+    },
+    {
+      "epoch": 0.22352081811541272,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0018099959800660812,
+      "loss": 1.3568,
+      "step": 3213
+    },
+    {
+      "epoch": 0.22359038575254792,
+      "grad_norm": 1.578125,
+      "learning_rate": 0.0018098638162503042,
+      "loss": 1.4231,
+      "step": 3214
+    },
+    {
+      "epoch": 0.2236599533896831,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.001809731611313725,
+      "loss": 1.0716,
+      "step": 3215
+    },
+    {
+      "epoch": 0.2237295210268183,
+      "grad_norm": 1.578125,
+      "learning_rate": 0.0018095993652630555,
+      "loss": 1.3694,
+      "step": 3216
+    },
+    {
+      "epoch": 0.22379908866395354,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.001809467078105011,
+      "loss": 1.1203,
+      "step": 3217
+    },
+    {
+      "epoch": 0.22386865630108874,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0018093347498463086,
+      "loss": 1.2607,
+      "step": 3218
+    },
+    {
+      "epoch": 0.22393822393822393,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0018092023804936667,
+      "loss": 1.3865,
+      "step": 3219
+    },
+    {
+      "epoch": 0.22400779157535913,
+      "grad_norm": 2.28125,
+      "learning_rate": 0.0018090699700538068,
+      "loss": 1.1439,
+      "step": 3220
+    },
+    {
+      "epoch": 0.22407735921249436,
+      "grad_norm": 1.6328125,
+      "learning_rate": 0.0018089375185334515,
+      "loss": 1.3804,
+      "step": 3221
+    },
+    {
+      "epoch": 0.22414692684962956,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0018088050259393268,
+      "loss": 1.1861,
+      "step": 3222
+    },
+    {
+      "epoch": 0.22421649448676476,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0018086724922781593,
+      "loss": 1.2895,
+      "step": 3223
+    },
+    {
+      "epoch": 0.22428606212389995,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0018085399175566783,
+      "loss": 0.9787,
+      "step": 3224
+    },
+    {
+      "epoch": 0.22435562976103518,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0018084073017816161,
+      "loss": 1.5001,
+      "step": 3225
+    },
+    {
+      "epoch": 0.22442519739817038,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0018082746449597056,
+      "loss": 1.0478,
+      "step": 3226
+    },
+    {
+      "epoch": 0.22449476503530558,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0018081419470976827,
+      "loss": 1.1353,
+      "step": 3227
+    },
+    {
+      "epoch": 0.22456433267244078,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.001808009208202285,
+      "loss": 1.3408,
+      "step": 3228
+    },
+    {
+      "epoch": 0.22463390030957597,
+      "grad_norm": 1.8671875,
+      "learning_rate": 0.0018078764282802526,
+      "loss": 1.3925,
+      "step": 3229
+    },
+    {
+      "epoch": 0.2247034679467112,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.001807743607338327,
+      "loss": 1.2309,
+      "step": 3230
+    },
+    {
+      "epoch": 0.2247730355838464,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.0018076107453832524,
+      "loss": 1.3426,
+      "step": 3231
+    },
+    {
+      "epoch": 0.2248426032209816,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.0018074778424217745,
+      "loss": 1.1903,
+      "step": 3232
+    },
+    {
+      "epoch": 0.2249121708581168,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0018073448984606423,
+      "loss": 1.2461,
+      "step": 3233
+    },
+    {
+      "epoch": 0.22498173849525202,
+      "grad_norm": 1.6875,
+      "learning_rate": 0.0018072119135066052,
+      "loss": 1.4229,
+      "step": 3234
+    },
+    {
+      "epoch": 0.22505130613238722,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0018070788875664157,
+      "loss": 1.1639,
+      "step": 3235
+    },
+    {
+      "epoch": 0.22512087376952242,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0018069458206468284,
+      "loss": 1.0116,
+      "step": 3236
+    },
+    {
+      "epoch": 0.22519044140665762,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0018068127127545998,
+      "loss": 1.3706,
+      "step": 3237
+    },
+    {
+      "epoch": 0.2252600090437928,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.0018066795638964877,
+      "loss": 1.1156,
+      "step": 3238
+    },
+    {
+      "epoch": 0.22532957668092804,
+      "grad_norm": 1.4765625,
+      "learning_rate": 0.001806546374079254,
+      "loss": 1.3318,
+      "step": 3239
+    },
+    {
+      "epoch": 0.22539914431806324,
+      "grad_norm": 1.578125,
+      "learning_rate": 0.0018064131433096601,
+      "loss": 1.2423,
+      "step": 3240
+    },
+    {
+      "epoch": 0.22546871195519844,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0018062798715944718,
+      "loss": 1.0989,
+      "step": 3241
+    },
+    {
+      "epoch": 0.22553827959233363,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0018061465589404556,
+      "loss": 1.5471,
+      "step": 3242
+    },
+    {
+      "epoch": 0.22560784722946886,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.0018060132053543804,
+      "loss": 1.1482,
+      "step": 3243
+    },
+    {
+      "epoch": 0.22567741486660406,
+      "grad_norm": 1.515625,
+      "learning_rate": 0.0018058798108430167,
+      "loss": 1.229,
+      "step": 3244
+    },
+    {
+      "epoch": 0.22574698250373926,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.001805746375413139,
+      "loss": 1.4308,
+      "step": 3245
+    },
+    {
+      "epoch": 0.22581655014087446,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.001805612899071521,
+      "loss": 1.007,
+      "step": 3246
+    },
+    {
+      "epoch": 0.22588611777800968,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0018054793818249406,
+      "loss": 1.3335,
+      "step": 3247
+    },
+    {
+      "epoch": 0.22595568541514488,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0018053458236801773,
+      "loss": 1.1016,
+      "step": 3248
+    },
+    {
+      "epoch": 0.22602525305228008,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.0018052122246440124,
+      "loss": 1.1453,
+      "step": 3249
+    },
+    {
+      "epoch": 0.22609482068941528,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0018050785847232294,
+      "loss": 1.2315,
+      "step": 3250
+    },
+    {
+      "epoch": 0.22616438832655048,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0018049449039246133,
+      "loss": 1.1245,
+      "step": 3251
+    },
+    {
+      "epoch": 0.2262339559636857,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0018048111822549524,
+      "loss": 1.0022,
+      "step": 3252
+    },
+    {
+      "epoch": 0.2263035236008209,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.0018046774197210365,
+      "loss": 1.274,
+      "step": 3253
+    },
+    {
+      "epoch": 0.2263730912379561,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.0018045436163296566,
+      "loss": 1.3615,
+      "step": 3254
+    },
+    {
+      "epoch": 0.2264426588750913,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0018044097720876077,
+      "loss": 1.3341,
+      "step": 3255
+    },
+    {
+      "epoch": 0.22651222651222652,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0018042758870016847,
+      "loss": 1.013,
+      "step": 3256
+    },
+    {
+      "epoch": 0.22658179414936172,
+      "grad_norm": 1.8359375,
+      "learning_rate": 0.001804141961078686,
+      "loss": 1.3983,
+      "step": 3257
+    },
+    {
+      "epoch": 0.22665136178649692,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0018040079943254118,
+      "loss": 1.1396,
+      "step": 3258
+    },
+    {
+      "epoch": 0.22672092942363212,
+      "grad_norm": 1.6953125,
+      "learning_rate": 0.001803873986748664,
+      "loss": 1.4228,
+      "step": 3259
+    },
+    {
+      "epoch": 0.22679049706076734,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0018037399383552472,
+      "loss": 1.3769,
+      "step": 3260
+    },
+    {
+      "epoch": 0.22686006469790254,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.001803605849151967,
+      "loss": 1.754,
+      "step": 3261
+    },
+    {
+      "epoch": 0.22692963233503774,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0018034717191456327,
+      "loss": 1.2549,
+      "step": 3262
+    },
+    {
+      "epoch": 0.22699919997217294,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0018033375483430542,
+      "loss": 1.3371,
+      "step": 3263
+    },
+    {
+      "epoch": 0.22706876760930814,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.0018032033367510443,
+      "loss": 1.3409,
+      "step": 3264
+    },
+    {
+      "epoch": 0.22713833524644336,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0018030690843764173,
+      "loss": 1.0501,
+      "step": 3265
+    },
+    {
+      "epoch": 0.22720790288357856,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0018029347912259896,
+      "loss": 1.2673,
+      "step": 3266
+    },
+    {
+      "epoch": 0.22727747052071376,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0018028004573065806,
+      "loss": 1.1292,
+      "step": 3267
+    },
+    {
+      "epoch": 0.22734703815784896,
+      "grad_norm": 2.4375,
+      "learning_rate": 0.0018026660826250106,
+      "loss": 1.181,
+      "step": 3268
+    },
+    {
+      "epoch": 0.22741660579498418,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0018025316671881032,
+      "loss": 1.2582,
+      "step": 3269
+    },
+    {
+      "epoch": 0.22748617343211938,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0018023972110026822,
+      "loss": 1.1066,
+      "step": 3270
+    },
+    {
+      "epoch": 0.22755574106925458,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0018022627140755754,
+      "loss": 1.1172,
+      "step": 3271
+    },
+    {
+      "epoch": 0.22762530870638978,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0018021281764136119,
+      "loss": 1.0191,
+      "step": 3272
+    },
+    {
+      "epoch": 0.227694876343525,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0018019935980236224,
+      "loss": 1.1673,
+      "step": 3273
+    },
+    {
+      "epoch": 0.2277644439806602,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.0018018589789124404,
+      "loss": 1.2633,
+      "step": 3274
+    },
+    {
+      "epoch": 0.2278340116177954,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.001801724319086901,
+      "loss": 1.1363,
+      "step": 3275
+    },
+    {
+      "epoch": 0.2279035792549306,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0018015896185538418,
+      "loss": 1.1201,
+      "step": 3276
+    },
+    {
+      "epoch": 0.2279731468920658,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.001801454877320102,
+      "loss": 1.094,
+      "step": 3277
+    },
+    {
+      "epoch": 0.22804271452920102,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0018013200953925232,
+      "loss": 1.1318,
+      "step": 3278
+    },
+    {
+      "epoch": 0.22811228216633622,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.001801185272777949,
+      "loss": 1.079,
+      "step": 3279
+    },
+    {
+      "epoch": 0.22818184980347142,
+      "grad_norm": 1.6953125,
+      "learning_rate": 0.001801050409483225,
+      "loss": 1.4306,
+      "step": 3280
+    },
+    {
+      "epoch": 0.22825141744060662,
+      "grad_norm": 1.5,
+      "learning_rate": 0.0018009155055151984,
+      "loss": 1.4664,
+      "step": 3281
+    },
+    {
+      "epoch": 0.22832098507774184,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0018007805608807198,
+      "loss": 1.2103,
+      "step": 3282
+    },
+    {
+      "epoch": 0.22839055271487704,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0018006455755866404,
+      "loss": 1.1655,
+      "step": 3283
+    },
+    {
+      "epoch": 0.22846012035201224,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0018005105496398139,
+      "loss": 1.2991,
+      "step": 3284
+    },
+    {
+      "epoch": 0.22852968798914744,
+      "grad_norm": 1.6640625,
+      "learning_rate": 0.0018003754830470968,
+      "loss": 1.04,
+      "step": 3285
+    },
+    {
+      "epoch": 0.22859925562628267,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.001800240375815347,
+      "loss": 1.297,
+      "step": 3286
+    },
+    {
+      "epoch": 0.22866882326341786,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0018001052279514242,
+      "loss": 1.0963,
+      "step": 3287
+    },
+    {
+      "epoch": 0.22873839090055306,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.001799970039462191,
+      "loss": 0.9655,
+      "step": 3288
+    },
+    {
+      "epoch": 0.22880795853768826,
+      "grad_norm": 1.7109375,
+      "learning_rate": 0.0017998348103545113,
+      "loss": 1.0901,
+      "step": 3289
+    },
+    {
+      "epoch": 0.22887752617482346,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0017996995406352513,
+      "loss": 1.0801,
+      "step": 3290
+    },
+    {
+      "epoch": 0.22894709381195869,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0017995642303112794,
+      "loss": 1.3627,
+      "step": 3291
+    },
+    {
+      "epoch": 0.22901666144909388,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.001799428879389466,
+      "loss": 1.0702,
+      "step": 3292
+    },
+    {
+      "epoch": 0.22908622908622908,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0017992934878766835,
+      "loss": 1.3215,
+      "step": 3293
+    },
+    {
+      "epoch": 0.22915579672336428,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0017991580557798065,
+      "loss": 1.1234,
+      "step": 3294
+    },
+    {
+      "epoch": 0.2292253643604995,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0017990225831057114,
+      "loss": 1.1273,
+      "step": 3295
+    },
+    {
+      "epoch": 0.2292949319976347,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.001798887069861277,
+      "loss": 1.0268,
+      "step": 3296
+    },
+    {
+      "epoch": 0.2293644996347699,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0017987515160533837,
+      "loss": 1.0267,
+      "step": 3297
+    },
+    {
+      "epoch": 0.2294340672719051,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0017986159216889146,
+      "loss": 0.9197,
+      "step": 3298
+    },
+    {
+      "epoch": 0.22950363490904033,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.0017984802867747542,
+      "loss": 1.3159,
+      "step": 3299
+    },
+    {
+      "epoch": 0.22957320254617553,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.0017983446113177895,
+      "loss": 1.6719,
+      "step": 3300
+    },
+    {
+      "epoch": 0.22964277018331072,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0017982088953249096,
+      "loss": 1.0058,
+      "step": 3301
+    },
+    {
+      "epoch": 0.22971233782044592,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0017980731388030052,
+      "loss": 1.0851,
+      "step": 3302
+    },
+    {
+      "epoch": 0.22978190545758112,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0017979373417589693,
+      "loss": 0.8804,
+      "step": 3303
+    },
+    {
+      "epoch": 0.22985147309471635,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0017978015041996969,
+      "loss": 1.0491,
+      "step": 3304
+    },
+    {
+      "epoch": 0.22992104073185154,
+      "grad_norm": 1.5234375,
+      "learning_rate": 0.0017976656261320856,
+      "loss": 1.2712,
+      "step": 3305
+    },
+    {
+      "epoch": 0.22999060836898674,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0017975297075630342,
+      "loss": 1.0895,
+      "step": 3306
+    },
+    {
+      "epoch": 0.23006017600612194,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.0017973937484994443,
+      "loss": 1.0422,
+      "step": 3307
+    },
+    {
+      "epoch": 0.23012974364325717,
+      "grad_norm": 1.5703125,
+      "learning_rate": 0.0017972577489482188,
+      "loss": 0.9921,
+      "step": 3308
+    },
+    {
+      "epoch": 0.23019931128039237,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0017971217089162632,
+      "loss": 0.9485,
+      "step": 3309
+    },
+    {
+      "epoch": 0.23026887891752756,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0017969856284104854,
+      "loss": 1.1615,
+      "step": 3310
+    },
+    {
+      "epoch": 0.23033844655466276,
+      "grad_norm": 1.25,
+      "learning_rate": 0.001796849507437794,
+      "loss": 1.1171,
+      "step": 3311
+    },
+    {
+      "epoch": 0.230408014191798,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0017967133460051014,
+      "loss": 1.1948,
+      "step": 3312
+    },
+    {
+      "epoch": 0.2304775818289332,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.0017965771441193206,
+      "loss": 1.2799,
+      "step": 3313
+    },
+    {
+      "epoch": 0.23054714946606839,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0017964409017873675,
+      "loss": 1.0338,
+      "step": 3314
+    },
+    {
+      "epoch": 0.23061671710320358,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0017963046190161598,
+      "loss": 1.2925,
+      "step": 3315
+    },
+    {
+      "epoch": 0.23068628474033878,
+      "grad_norm": 1.5625,
+      "learning_rate": 0.0017961682958126174,
+      "loss": 1.0702,
+      "step": 3316
+    },
+    {
+      "epoch": 0.230755852377474,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0017960319321836619,
+      "loss": 1.0485,
+      "step": 3317
+    },
+    {
+      "epoch": 0.2308254200146092,
+      "grad_norm": 1.375,
+      "learning_rate": 0.001795895528136217,
+      "loss": 1.0255,
+      "step": 3318
+    },
+    {
+      "epoch": 0.2308949876517444,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0017957590836772091,
+      "loss": 1.1028,
+      "step": 3319
+    },
+    {
+      "epoch": 0.2309645552888796,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0017956225988135653,
+      "loss": 1.2513,
+      "step": 3320
+    },
+    {
+      "epoch": 0.23103412292601483,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0017954860735522166,
+      "loss": 1.1436,
+      "step": 3321
+    },
+    {
+      "epoch": 0.23110369056315003,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0017953495079000945,
+      "loss": 1.1775,
+      "step": 3322
+    },
+    {
+      "epoch": 0.23117325820028523,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0017952129018641333,
+      "loss": 1.1371,
+      "step": 3323
+    },
+    {
+      "epoch": 0.23124282583742042,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.001795076255451269,
+      "loss": 1.3008,
+      "step": 3324
+    },
+    {
+      "epoch": 0.23131239347455565,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.00179493956866844,
+      "loss": 1.0581,
+      "step": 3325
+    },
+    {
+      "epoch": 0.23138196111169085,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0017948028415225865,
+      "loss": 1.0475,
+      "step": 3326
+    },
+    {
+      "epoch": 0.23145152874882605,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0017946660740206508,
+      "loss": 1.3079,
+      "step": 3327
+    },
+    {
+      "epoch": 0.23152109638596124,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0017945292661695773,
+      "loss": 1.2221,
+      "step": 3328
+    },
+    {
+      "epoch": 0.23159066402309644,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0017943924179763125,
+      "loss": 1.0878,
+      "step": 3329
+    },
+    {
+      "epoch": 0.23166023166023167,
+      "grad_norm": 1.5234375,
+      "learning_rate": 0.0017942555294478044,
+      "loss": 1.4133,
+      "step": 3330
+    },
+    {
+      "epoch": 0.23172979929736687,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0017941186005910042,
+      "loss": 1.1608,
+      "step": 3331
+    },
+    {
+      "epoch": 0.23179936693450207,
+      "grad_norm": 1.25,
+      "learning_rate": 0.001793981631412864,
+      "loss": 1.3104,
+      "step": 3332
+    },
+    {
+      "epoch": 0.23186893457163726,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0017938446219203385,
+      "loss": 1.2514,
+      "step": 3333
+    },
+    {
+      "epoch": 0.2319385022087725,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0017937075721203843,
+      "loss": 1.1533,
+      "step": 3334
+    },
+    {
+      "epoch": 0.2320080698459077,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0017935704820199604,
+      "loss": 1.4404,
+      "step": 3335
+    },
+    {
+      "epoch": 0.2320776374830429,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0017934333516260272,
+      "loss": 1.0626,
+      "step": 3336
+    },
+    {
+      "epoch": 0.23214720512017809,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0017932961809455476,
+      "loss": 1.2449,
+      "step": 3337
+    },
+    {
+      "epoch": 0.2322167727573133,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0017931589699854865,
+      "loss": 1.0743,
+      "step": 3338
+    },
+    {
+      "epoch": 0.2322863403944485,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0017930217187528106,
+      "loss": 1.2008,
+      "step": 3339
+    },
+    {
+      "epoch": 0.2323559080315837,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0017928844272544892,
+      "loss": 1.1208,
+      "step": 3340
+    },
+    {
+      "epoch": 0.2324254756687189,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0017927470954974924,
+      "loss": 1.2209,
+      "step": 3341
+    },
+    {
+      "epoch": 0.2324950433058541,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0017926097234887944,
+      "loss": 1.5304,
+      "step": 3342
+    },
+    {
+      "epoch": 0.23256461094298933,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0017924723112353695,
+      "loss": 0.9315,
+      "step": 3343
+    },
+    {
+      "epoch": 0.23263417858012453,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0017923348587441951,
+      "loss": 1.0834,
+      "step": 3344
+    },
+    {
+      "epoch": 0.23270374621725973,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0017921973660222502,
+      "loss": 1.0801,
+      "step": 3345
+    },
+    {
+      "epoch": 0.23277331385439493,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.001792059833076516,
+      "loss": 0.9739,
+      "step": 3346
+    },
+    {
+      "epoch": 0.23284288149153015,
+      "grad_norm": 1.5546875,
+      "learning_rate": 0.0017919222599139758,
+      "loss": 1.2927,
+      "step": 3347
+    },
+    {
+      "epoch": 0.23291244912866535,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0017917846465416148,
+      "loss": 0.9077,
+      "step": 3348
+    },
+    {
+      "epoch": 0.23298201676580055,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00179164699296642,
+      "loss": 0.7696,
+      "step": 3349
+    },
+    {
+      "epoch": 0.23305158440293575,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0017915092991953815,
+      "loss": 1.3586,
+      "step": 3350
+    },
+    {
+      "epoch": 0.23312115204007097,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0017913715652354903,
+      "loss": 1.1076,
+      "step": 3351
+    },
+    {
+      "epoch": 0.23319071967720617,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0017912337910937395,
+      "loss": 1.0509,
+      "step": 3352
+    },
+    {
+      "epoch": 0.23326028731434137,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0017910959767771253,
+      "loss": 1.5079,
+      "step": 3353
+    },
+    {
+      "epoch": 0.23332985495147657,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0017909581222926446,
+      "loss": 0.9356,
+      "step": 3354
+    },
+    {
+      "epoch": 0.23339942258861177,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.001790820227647297,
+      "loss": 1.0451,
+      "step": 3355
+    },
+    {
+      "epoch": 0.233468990225747,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0017906822928480848,
+      "loss": 1.0699,
+      "step": 3356
+    },
+    {
+      "epoch": 0.2335385578628822,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0017905443179020107,
+      "loss": 1.1171,
+      "step": 3357
+    },
+    {
+      "epoch": 0.2336081255000174,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0017904063028160806,
+      "loss": 1.2825,
+      "step": 3358
+    },
+    {
+      "epoch": 0.2336776931371526,
+      "grad_norm": 1.25,
+      "learning_rate": 0.001790268247597303,
+      "loss": 1.0923,
+      "step": 3359
+    },
+    {
+      "epoch": 0.2337472607742878,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0017901301522526864,
+      "loss": 1.2474,
+      "step": 3360
+    },
+    {
+      "epoch": 0.233816828411423,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0017899920167892436,
+      "loss": 1.1623,
+      "step": 3361
+    },
+    {
+      "epoch": 0.2338863960485582,
+      "grad_norm": 1.890625,
+      "learning_rate": 0.001789853841213988,
+      "loss": 1.3545,
+      "step": 3362
+    },
+    {
+      "epoch": 0.2339559636856934,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0017897156255339355,
+      "loss": 1.0498,
+      "step": 3363
+    },
+    {
+      "epoch": 0.23402553132282863,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0017895773697561039,
+      "loss": 1.158,
+      "step": 3364
+    },
+    {
+      "epoch": 0.23409509895996383,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0017894390738875132,
+      "loss": 0.9316,
+      "step": 3365
+    },
+    {
+      "epoch": 0.23416466659709903,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0017893007379351854,
+      "loss": 0.9714,
+      "step": 3366
+    },
+    {
+      "epoch": 0.23423423423423423,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0017891623619061445,
+      "loss": 1.3723,
+      "step": 3367
+    },
+    {
+      "epoch": 0.23430380187136943,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0017890239458074166,
+      "loss": 1.2839,
+      "step": 3368
+    },
+    {
+      "epoch": 0.23437336950850465,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0017888854896460297,
+      "loss": 1.397,
+      "step": 3369
+    },
+    {
+      "epoch": 0.23444293714563985,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0017887469934290139,
+      "loss": 1.3389,
+      "step": 3370
+    },
+    {
+      "epoch": 0.23451250478277505,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0017886084571634014,
+      "loss": 1.0145,
+      "step": 3371
+    },
+    {
+      "epoch": 0.23458207241991025,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0017884698808562263,
+      "loss": 1.1883,
+      "step": 3372
+    },
+    {
+      "epoch": 0.23465164005704547,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0017883312645145249,
+      "loss": 1.115,
+      "step": 3373
+    },
+    {
+      "epoch": 0.23472120769418067,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0017881926081453354,
+      "loss": 1.1842,
+      "step": 3374
+    },
+    {
+      "epoch": 0.23479077533131587,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0017880539117556978,
+      "loss": 0.9997,
+      "step": 3375
+    },
+    {
+      "epoch": 0.23486034296845107,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0017879151753526549,
+      "loss": 0.9747,
+      "step": 3376
+    },
+    {
+      "epoch": 0.2349299106055863,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0017877763989432504,
+      "loss": 1.0838,
+      "step": 3377
+    },
+    {
+      "epoch": 0.2349994782427215,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0017876375825345314,
+      "loss": 1.1765,
+      "step": 3378
+    },
+    {
+      "epoch": 0.2350690458798567,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.001787498726133546,
+      "loss": 1.1738,
+      "step": 3379
+    },
+    {
+      "epoch": 0.2351386135169919,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0017873598297473445,
+      "loss": 1.1137,
+      "step": 3380
+    },
+    {
+      "epoch": 0.2352081811541271,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0017872208933829793,
+      "loss": 1.1858,
+      "step": 3381
+    },
+    {
+      "epoch": 0.23527774879126231,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0017870819170475053,
+      "loss": 1.1267,
+      "step": 3382
+    },
+    {
+      "epoch": 0.2353473164283975,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0017869429007479783,
+      "loss": 1.1509,
+      "step": 3383
+    },
+    {
+      "epoch": 0.2354168840655327,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0017868038444914577,
+      "loss": 1.2707,
+      "step": 3384
+    },
+    {
+      "epoch": 0.2354864517026679,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0017866647482850033,
+      "loss": 1.1556,
+      "step": 3385
+    },
+    {
+      "epoch": 0.23555601933980314,
+      "grad_norm": 1.5546875,
+      "learning_rate": 0.0017865256121356783,
+      "loss": 1.5673,
+      "step": 3386
+    },
+    {
+      "epoch": 0.23562558697693833,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.001786386436050547,
+      "loss": 1.1818,
+      "step": 3387
+    },
+    {
+      "epoch": 0.23569515461407353,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0017862472200366763,
+      "loss": 1.1394,
+      "step": 3388
+    },
+    {
+      "epoch": 0.23576472225120873,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0017861079641011345,
+      "loss": 0.9994,
+      "step": 3389
+    },
+    {
+      "epoch": 0.23583428988834393,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0017859686682509927,
+      "loss": 1.1313,
+      "step": 3390
+    },
+    {
+      "epoch": 0.23590385752547915,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.0017858293324933237,
+      "loss": 1.2673,
+      "step": 3391
+    },
+    {
+      "epoch": 0.23597342516261435,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.0017856899568352018,
+      "loss": 1.2771,
+      "step": 3392
+    },
+    {
+      "epoch": 0.23604299279974955,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0017855505412837044,
+      "loss": 1.0559,
+      "step": 3393
+    },
+    {
+      "epoch": 0.23611256043688475,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0017854110858459094,
+      "loss": 1.2803,
+      "step": 3394
+    },
+    {
+      "epoch": 0.23618212807401998,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0017852715905288985,
+      "loss": 1.2212,
+      "step": 3395
+    },
+    {
+      "epoch": 0.23625169571115517,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0017851320553397545,
+      "loss": 1.2095,
+      "step": 3396
+    },
+    {
+      "epoch": 0.23632126334829037,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.001784992480285562,
+      "loss": 1.1702,
+      "step": 3397
+    },
+    {
+      "epoch": 0.23639083098542557,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.0017848528653734079,
+      "loss": 1.1098,
+      "step": 3398
+    },
+    {
+      "epoch": 0.2364603986225608,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0017847132106103812,
+      "loss": 1.2043,
+      "step": 3399
+    },
+    {
+      "epoch": 0.236529966259696,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0017845735160035732,
+      "loss": 1.3687,
+      "step": 3400
+    },
+    {
+      "epoch": 0.2365995338968312,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0017844337815600762,
+      "loss": 1.0551,
+      "step": 3401
+    },
+    {
+      "epoch": 0.2366691015339664,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0017842940072869858,
+      "loss": 1.0612,
+      "step": 3402
+    },
+    {
+      "epoch": 0.2367386691711016,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.001784154193191399,
+      "loss": 1.1619,
+      "step": 3403
+    },
+    {
+      "epoch": 0.23680823680823682,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0017840143392804145,
+      "loss": 1.0387,
+      "step": 3404
+    },
+    {
+      "epoch": 0.23687780444537201,
+      "grad_norm": 1.6171875,
+      "learning_rate": 0.0017838744455611337,
+      "loss": 1.0793,
+      "step": 3405
+    },
+    {
+      "epoch": 0.2369473720825072,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0017837345120406596,
+      "loss": 1.1494,
+      "step": 3406
+    },
+    {
+      "epoch": 0.2370169397196424,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.001783594538726097,
+      "loss": 0.9968,
+      "step": 3407
+    },
+    {
+      "epoch": 0.23708650735677764,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0017834545256245535,
+      "loss": 0.9704,
+      "step": 3408
+    },
+    {
+      "epoch": 0.23715607499391284,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0017833144727431383,
+      "loss": 1.1628,
+      "step": 3409
+    },
+    {
+      "epoch": 0.23722564263104803,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0017831743800889623,
+      "loss": 1.0879,
+      "step": 3410
+    },
+    {
+      "epoch": 0.23729521026818323,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0017830342476691386,
+      "loss": 1.3391,
+      "step": 3411
+    },
+    {
+      "epoch": 0.23736477790531846,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0017828940754907828,
+      "loss": 1.1479,
+      "step": 3412
+    },
+    {
+      "epoch": 0.23743434554245366,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0017827538635610117,
+      "loss": 1.2333,
+      "step": 3413
+    },
+    {
+      "epoch": 0.23750391317958885,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0017826136118869447,
+      "loss": 1.0714,
+      "step": 3414
+    },
+    {
+      "epoch": 0.23757348081672405,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0017824733204757034,
+      "loss": 1.0966,
+      "step": 3415
+    },
+    {
+      "epoch": 0.23764304845385925,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0017823329893344106,
+      "loss": 0.9634,
+      "step": 3416
+    },
+    {
+      "epoch": 0.23771261609099448,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0017821926184701923,
+      "loss": 0.8942,
+      "step": 3417
+    },
+    {
+      "epoch": 0.23778218372812968,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.001782052207890175,
+      "loss": 1.1385,
+      "step": 3418
+    },
+    {
+      "epoch": 0.23785175136526487,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0017819117576014884,
+      "loss": 0.9763,
+      "step": 3419
+    },
+    {
+      "epoch": 0.23792131900240007,
+      "grad_norm": 1.125,
+      "learning_rate": 0.001781771267611264,
+      "loss": 1.2327,
+      "step": 3420
+    },
+    {
+      "epoch": 0.2379908866395353,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.0017816307379266351,
+      "loss": 1.1842,
+      "step": 3421
+    },
+    {
+      "epoch": 0.2380604542766705,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0017814901685547372,
+      "loss": 1.172,
+      "step": 3422
+    },
+    {
+      "epoch": 0.2381300219138057,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0017813495595027072,
+      "loss": 1.2356,
+      "step": 3423
+    },
+    {
+      "epoch": 0.2381995895509409,
+      "grad_norm": 1.640625,
+      "learning_rate": 0.0017812089107776847,
+      "loss": 1.1543,
+      "step": 3424
+    },
+    {
+      "epoch": 0.23826915718807612,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0017810682223868117,
+      "loss": 1.3024,
+      "step": 3425
+    },
+    {
+      "epoch": 0.23833872482521132,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0017809274943372312,
+      "loss": 1.1413,
+      "step": 3426
+    },
+    {
+      "epoch": 0.23840829246234652,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0017807867266360887,
+      "loss": 1.1717,
+      "step": 3427
+    },
+    {
+      "epoch": 0.23847786009948171,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.0017806459192905315,
+      "loss": 1.2329,
+      "step": 3428
+    },
+    {
+      "epoch": 0.2385474277366169,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.0017805050723077095,
+      "loss": 1.3393,
+      "step": 3429
+    },
+    {
+      "epoch": 0.23861699537375214,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0017803641856947738,
+      "loss": 1.1232,
+      "step": 3430
+    },
+    {
+      "epoch": 0.23868656301088734,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0017802232594588778,
+      "loss": 1.2795,
+      "step": 3431
+    },
+    {
+      "epoch": 0.23875613064802254,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.001780082293607178,
+      "loss": 1.2462,
+      "step": 3432
+    },
+    {
+      "epoch": 0.23882569828515773,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.0017799412881468306,
+      "loss": 1.0268,
+      "step": 3433
+    },
+    {
+      "epoch": 0.23889526592229296,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.001779800243084996,
+      "loss": 1.2895,
+      "step": 3434
+    },
+    {
+      "epoch": 0.23896483355942816,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0017796591584288356,
+      "loss": 1.1936,
+      "step": 3435
+    },
+    {
+      "epoch": 0.23903440119656336,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.001779518034185513,
+      "loss": 1.4595,
+      "step": 3436
+    },
+    {
+      "epoch": 0.23910396883369855,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0017793768703621936,
+      "loss": 1.2653,
+      "step": 3437
+    },
+    {
+      "epoch": 0.23917353647083378,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.001779235666966045,
+      "loss": 0.8807,
+      "step": 3438
+    },
+    {
+      "epoch": 0.23924310410796898,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.0017790944240042368,
+      "loss": 1.3037,
+      "step": 3439
+    },
+    {
+      "epoch": 0.23931267174510418,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0017789531414839409,
+      "loss": 1.1275,
+      "step": 3440
+    },
+    {
+      "epoch": 0.23938223938223938,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0017788118194123307,
+      "loss": 1.1571,
+      "step": 3441
+    },
+    {
+      "epoch": 0.23945180701937457,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.0017786704577965814,
+      "loss": 1.0926,
+      "step": 3442
+    },
+    {
+      "epoch": 0.2395213746565098,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0017785290566438717,
+      "loss": 1.2125,
+      "step": 3443
+    },
+    {
+      "epoch": 0.239590942293645,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0017783876159613802,
+      "loss": 1.1799,
+      "step": 3444
+    },
+    {
+      "epoch": 0.2396605099307802,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0017782461357562886,
+      "loss": 1.2677,
+      "step": 3445
+    },
+    {
+      "epoch": 0.2397300775679154,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0017781046160357814,
+      "loss": 1.1287,
+      "step": 3446
+    },
+    {
+      "epoch": 0.23979964520505062,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0017779630568070435,
+      "loss": 1.292,
+      "step": 3447
+    },
+    {
+      "epoch": 0.23986921284218582,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0017778214580772627,
+      "loss": 1.0825,
+      "step": 3448
+    },
+    {
+      "epoch": 0.23993878047932102,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.001777679819853629,
+      "loss": 1.2443,
+      "step": 3449
+    },
+    {
+      "epoch": 0.24000834811645622,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.001777538142143334,
+      "loss": 1.2154,
+      "step": 3450
+    },
+    {
+      "epoch": 0.24007791575359144,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.001777396424953571,
+      "loss": 1.3522,
+      "step": 3451
+    },
+    {
+      "epoch": 0.24014748339072664,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.0017772546682915359,
+      "loss": 1.2646,
+      "step": 3452
+    },
+    {
+      "epoch": 0.24021705102786184,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0017771128721644264,
+      "loss": 0.9994,
+      "step": 3453
+    },
+    {
+      "epoch": 0.24028661866499704,
+      "grad_norm": 1.375,
+      "learning_rate": 0.001776971036579442,
+      "loss": 0.911,
+      "step": 3454
+    },
+    {
+      "epoch": 0.24035618630213224,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0017768291615437848,
+      "loss": 1.0556,
+      "step": 3455
+    },
+    {
+      "epoch": 0.24042575393926746,
+      "grad_norm": 1.5546875,
+      "learning_rate": 0.0017766872470646583,
+      "loss": 1.2429,
+      "step": 3456
+    },
+    {
+      "epoch": 0.24049532157640266,
+      "grad_norm": 1.5546875,
+      "learning_rate": 0.0017765452931492681,
+      "loss": 1.3328,
+      "step": 3457
+    },
+    {
+      "epoch": 0.24056488921353786,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.001776403299804822,
+      "loss": 1.1474,
+      "step": 3458
+    },
+    {
+      "epoch": 0.24063445685067306,
+      "grad_norm": 1.6015625,
+      "learning_rate": 0.0017762612670385299,
+      "loss": 1.0391,
+      "step": 3459
+    },
+    {
+      "epoch": 0.24070402448780828,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.001776119194857603,
+      "loss": 1.2632,
+      "step": 3460
+    },
+    {
+      "epoch": 0.24077359212494348,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0017759770832692556,
+      "loss": 1.2716,
+      "step": 3461
+    },
+    {
+      "epoch": 0.24084315976207868,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.001775834932280703,
+      "loss": 0.9257,
+      "step": 3462
+    },
+    {
+      "epoch": 0.24091272739921388,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.001775692741899163,
+      "loss": 1.1569,
+      "step": 3463
+    },
+    {
+      "epoch": 0.2409822950363491,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0017755505121318552,
+      "loss": 1.3196,
+      "step": 3464
+    },
+    {
+      "epoch": 0.2410518626734843,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0017754082429860018,
+      "loss": 1.1923,
+      "step": 3465
+    },
+    {
+      "epoch": 0.2411214303106195,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.001775265934468826,
+      "loss": 1.182,
+      "step": 3466
+    },
+    {
+      "epoch": 0.2411909979477547,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0017751235865875537,
+      "loss": 1.1103,
+      "step": 3467
+    },
+    {
+      "epoch": 0.2412605655848899,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0017749811993494125,
+      "loss": 1.133,
+      "step": 3468
+    },
+    {
+      "epoch": 0.24133013322202512,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0017748387727616322,
+      "loss": 1.1061,
+      "step": 3469
+    },
+    {
+      "epoch": 0.24139970085916032,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0017746963068314447,
+      "loss": 1.079,
+      "step": 3470
+    },
+    {
+      "epoch": 0.24146926849629552,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0017745538015660834,
+      "loss": 1.24,
+      "step": 3471
+    },
+    {
+      "epoch": 0.24153883613343072,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0017744112569727838,
+      "loss": 1.0469,
+      "step": 3472
+    },
+    {
+      "epoch": 0.24160840377056594,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0017742686730587841,
+      "loss": 1.1172,
+      "step": 3473
+    },
+    {
+      "epoch": 0.24167797140770114,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.001774126049831324,
+      "loss": 1.0453,
+      "step": 3474
+    },
+    {
+      "epoch": 0.24174753904483634,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0017739833872976447,
+      "loss": 0.9883,
+      "step": 3475
+    },
+    {
+      "epoch": 0.24181710668197154,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.0017738406854649902,
+      "loss": 1.3942,
+      "step": 3476
+    },
+    {
+      "epoch": 0.24188667431910676,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0017736979443406062,
+      "loss": 0.8944,
+      "step": 3477
+    },
+    {
+      "epoch": 0.24195624195624196,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0017735551639317402,
+      "loss": 1.2261,
+      "step": 3478
+    },
+    {
+      "epoch": 0.24202580959337716,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0017734123442456422,
+      "loss": 1.1733,
+      "step": 3479
+    },
+    {
+      "epoch": 0.24209537723051236,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0017732694852895636,
+      "loss": 1.0871,
+      "step": 3480
+    },
+    {
+      "epoch": 0.24216494486764756,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.001773126587070758,
+      "loss": 1.0592,
+      "step": 3481
+    },
+    {
+      "epoch": 0.24223451250478278,
+      "grad_norm": 1.5625,
+      "learning_rate": 0.001772983649596481,
+      "loss": 1.2381,
+      "step": 3482
+    },
+    {
+      "epoch": 0.24230408014191798,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.0017728406728739908,
+      "loss": 0.8463,
+      "step": 3483
+    },
+    {
+      "epoch": 0.24237364777905318,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0017726976569105463,
+      "loss": 1.07,
+      "step": 3484
+    },
+    {
+      "epoch": 0.24244321541618838,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0017725546017134098,
+      "loss": 1.1426,
+      "step": 3485
+    },
+    {
+      "epoch": 0.2425127830533236,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0017724115072898442,
+      "loss": 1.222,
+      "step": 3486
+    },
+    {
+      "epoch": 0.2425823506904588,
+      "grad_norm": 1.4765625,
+      "learning_rate": 0.0017722683736471159,
+      "loss": 1.0897,
+      "step": 3487
+    },
+    {
+      "epoch": 0.242651918327594,
+      "grad_norm": 1.515625,
+      "learning_rate": 0.001772125200792492,
+      "loss": 0.8916,
+      "step": 3488
+    },
+    {
+      "epoch": 0.2427214859647292,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0017719819887332417,
+      "loss": 1.1342,
+      "step": 3489
+    },
+    {
+      "epoch": 0.24279105360186443,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0017718387374766379,
+      "loss": 0.9353,
+      "step": 3490
+    },
+    {
+      "epoch": 0.24286062123899962,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.001771695447029953,
+      "loss": 1.2812,
+      "step": 3491
+    },
+    {
+      "epoch": 0.24293018887613482,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0017715521174004624,
+      "loss": 1.2196,
+      "step": 3492
+    },
+    {
+      "epoch": 0.24299975651327002,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0017714087485954449,
+      "loss": 1.0235,
+      "step": 3493
+    },
+    {
+      "epoch": 0.24306932415040522,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.001771265340622179,
+      "loss": 0.9502,
+      "step": 3494
+    },
+    {
+      "epoch": 0.24313889178754045,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0017711218934879467,
+      "loss": 0.9577,
+      "step": 3495
+    },
+    {
+      "epoch": 0.24320845942467564,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0017709784072000314,
+      "loss": 1.001,
+      "step": 3496
+    },
+    {
+      "epoch": 0.24327802706181084,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0017708348817657188,
+      "loss": 1.1289,
+      "step": 3497
+    },
+    {
+      "epoch": 0.24334759469894604,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0017706913171922959,
+      "loss": 1.1366,
+      "step": 3498
+    },
+    {
+      "epoch": 0.24341716233608127,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0017705477134870526,
+      "loss": 1.0603,
+      "step": 3499
+    },
+    {
+      "epoch": 0.24348672997321646,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.00177040407065728,
+      "loss": 1.1888,
+      "step": 3500
+    },
+    {
+      "epoch": 0.24355629761035166,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0017702603887102721,
+      "loss": 0.9626,
+      "step": 3501
+    },
+    {
+      "epoch": 0.24362586524748686,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.001770116667653324,
+      "loss": 1.2676,
+      "step": 3502
+    },
+    {
+      "epoch": 0.2436954328846221,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0017699729074937332,
+      "loss": 1.0256,
+      "step": 3503
+    },
+    {
+      "epoch": 0.24376500052175729,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.001769829108238799,
+      "loss": 1.3075,
+      "step": 3504
+    },
+    {
+      "epoch": 0.24383456815889248,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.001769685269895823,
+      "loss": 1.1977,
+      "step": 3505
+    },
+    {
+      "epoch": 0.24390413579602768,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0017695413924721088,
+      "loss": 1.1401,
+      "step": 3506
+    },
+    {
+      "epoch": 0.24397370343316288,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0017693974759749609,
+      "loss": 1.0515,
+      "step": 3507
+    },
+    {
+      "epoch": 0.2440432710702981,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0017692535204116876,
+      "loss": 1.1446,
+      "step": 3508
+    },
+    {
+      "epoch": 0.2441128387074333,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0017691095257895977,
+      "loss": 1.0242,
+      "step": 3509
+    },
+    {
+      "epoch": 0.2441824063445685,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0017689654921160028,
+      "loss": 1.2903,
+      "step": 3510
+    },
+    {
+      "epoch": 0.2442519739817037,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0017688214193982159,
+      "loss": 0.9139,
+      "step": 3511
+    },
+    {
+      "epoch": 0.24432154161883893,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0017686773076435527,
+      "loss": 0.9588,
+      "step": 3512
+    },
+    {
+      "epoch": 0.24439110925597413,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.00176853315685933,
+      "loss": 1.2142,
+      "step": 3513
+    },
+    {
+      "epoch": 0.24446067689310932,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0017683889670528675,
+      "loss": 1.0694,
+      "step": 3514
+    },
+    {
+      "epoch": 0.24453024453024452,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0017682447382314861,
+      "loss": 1.073,
+      "step": 3515
+    },
+    {
+      "epoch": 0.24459981216737975,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0017681004704025091,
+      "loss": 0.9615,
+      "step": 3516
+    },
+    {
+      "epoch": 0.24466937980451495,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.001767956163573262,
+      "loss": 0.8559,
+      "step": 3517
+    },
+    {
+      "epoch": 0.24473894744165015,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0017678118177510713,
+      "loss": 1.0005,
+      "step": 3518
+    },
+    {
+      "epoch": 0.24480851507878534,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0017676674329432669,
+      "loss": 1.0934,
+      "step": 3519
+    },
+    {
+      "epoch": 0.24487808271592054,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0017675230091571791,
+      "loss": 1.0709,
+      "step": 3520
+    },
+    {
+      "epoch": 0.24494765035305577,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.001767378546400142,
+      "loss": 1.1684,
+      "step": 3521
+    },
+    {
+      "epoch": 0.24501721799019097,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00176723404467949,
+      "loss": 1.1248,
+      "step": 3522
+    },
+    {
+      "epoch": 0.24508678562732616,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0017670895040025605,
+      "loss": 1.1665,
+      "step": 3523
+    },
+    {
+      "epoch": 0.24515635326446136,
+      "grad_norm": 1.6484375,
+      "learning_rate": 0.0017669449243766923,
+      "loss": 1.1104,
+      "step": 3524
+    },
+    {
+      "epoch": 0.2452259209015966,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0017668003058092263,
+      "loss": 0.9688,
+      "step": 3525
+    },
+    {
+      "epoch": 0.2452954885387318,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.001766655648307506,
+      "loss": 1.1997,
+      "step": 3526
+    },
+    {
+      "epoch": 0.24536505617586699,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.001766510951878876,
+      "loss": 0.9125,
+      "step": 3527
+    },
+    {
+      "epoch": 0.24543462381300218,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0017663662165306833,
+      "loss": 1.3179,
+      "step": 3528
+    },
+    {
+      "epoch": 0.24550419145013738,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.0017662214422702772,
+      "loss": 1.1226,
+      "step": 3529
+    },
+    {
+      "epoch": 0.2455737590872726,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0017660766291050082,
+      "loss": 0.8704,
+      "step": 3530
+    },
+    {
+      "epoch": 0.2456433267244078,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.001765931777042229,
+      "loss": 1.2727,
+      "step": 3531
+    },
+    {
+      "epoch": 0.245712894361543,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.001765786886089295,
+      "loss": 0.854,
+      "step": 3532
+    },
+    {
+      "epoch": 0.2457824619986782,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0017656419562535625,
+      "loss": 1.1171,
+      "step": 3533
+    },
+    {
+      "epoch": 0.24585202963581343,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.001765496987542391,
+      "loss": 0.929,
+      "step": 3534
+    },
+    {
+      "epoch": 0.24592159727294863,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0017653519799631407,
+      "loss": 1.2473,
+      "step": 3535
+    },
+    {
+      "epoch": 0.24599116491008383,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0017652069335231744,
+      "loss": 1.2321,
+      "step": 3536
+    },
+    {
+      "epoch": 0.24606073254721902,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.001765061848229857,
+      "loss": 1.0668,
+      "step": 3537
+    },
+    {
+      "epoch": 0.24613030018435425,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0017649167240905554,
+      "loss": 1.1731,
+      "step": 3538
+    },
+    {
+      "epoch": 0.24619986782148945,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0017647715611126375,
+      "loss": 0.8925,
+      "step": 3539
+    },
+    {
+      "epoch": 0.24626943545862465,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0017646263593034748,
+      "loss": 1.1732,
+      "step": 3540
+    },
+    {
+      "epoch": 0.24633900309575985,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0017644811186704396,
+      "loss": 1.1054,
+      "step": 3541
+    },
+    {
+      "epoch": 0.24640857073289504,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0017643358392209062,
+      "loss": 1.2514,
+      "step": 3542
+    },
+    {
+      "epoch": 0.24647813837003027,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0017641905209622518,
+      "loss": 0.8752,
+      "step": 3543
+    },
+    {
+      "epoch": 0.24654770600716547,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0017640451639018542,
+      "loss": 1.1269,
+      "step": 3544
+    },
+    {
+      "epoch": 0.24661727364430067,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0017638997680470944,
+      "loss": 1.1035,
+      "step": 3545
+    },
+    {
+      "epoch": 0.24668684128143586,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.001763754333405355,
+      "loss": 1.3187,
+      "step": 3546
+    },
+    {
+      "epoch": 0.2467564089185711,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0017636088599840196,
+      "loss": 1.2283,
+      "step": 3547
+    },
+    {
+      "epoch": 0.2468259765557063,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0017634633477904755,
+      "loss": 1.1916,
+      "step": 3548
+    },
+    {
+      "epoch": 0.2468955441928415,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0017633177968321109,
+      "loss": 0.9575,
+      "step": 3549
+    },
+    {
+      "epoch": 0.24696511182997669,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0017631722071163156,
+      "loss": 0.8847,
+      "step": 3550
+    },
+    {
+      "epoch": 0.2470346794671119,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0017630265786504824,
+      "loss": 0.888,
+      "step": 3551
+    },
+    {
+      "epoch": 0.2471042471042471,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0017628809114420057,
+      "loss": 0.951,
+      "step": 3552
+    },
+    {
+      "epoch": 0.2471738147413823,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0017627352054982812,
+      "loss": 1.0284,
+      "step": 3553
+    },
+    {
+      "epoch": 0.2472433823785175,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0017625894608267077,
+      "loss": 1.2441,
+      "step": 3554
+    },
+    {
+      "epoch": 0.2473129500156527,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.001762443677434685,
+      "loss": 1.1451,
+      "step": 3555
+    },
+    {
+      "epoch": 0.24738251765278793,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0017622978553296154,
+      "loss": 1.1002,
+      "step": 3556
+    },
+    {
+      "epoch": 0.24745208528992313,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0017621519945189028,
+      "loss": 1.0644,
+      "step": 3557
+    },
+    {
+      "epoch": 0.24752165292705833,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0017620060950099537,
+      "loss": 0.9999,
+      "step": 3558
+    },
+    {
+      "epoch": 0.24759122056419353,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0017618601568101758,
+      "loss": 1.3499,
+      "step": 3559
+    },
+    {
+      "epoch": 0.24766078820132875,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.001761714179926979,
+      "loss": 0.9083,
+      "step": 3560
+    },
+    {
+      "epoch": 0.24773035583846395,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.001761568164367776,
+      "loss": 0.9532,
+      "step": 3561
+    },
+    {
+      "epoch": 0.24779992347559915,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.0017614221101399797,
+      "loss": 1.064,
+      "step": 3562
+    },
+    {
+      "epoch": 0.24786949111273435,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.0017612760172510066,
+      "loss": 1.0452,
+      "step": 3563
+    },
+    {
+      "epoch": 0.24793905874986957,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.0017611298857082745,
+      "loss": 0.9763,
+      "step": 3564
+    },
+    {
+      "epoch": 0.24800862638700477,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0017609837155192032,
+      "loss": 0.8372,
+      "step": 3565
+    },
+    {
+      "epoch": 0.24807819402413997,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0017608375066912143,
+      "loss": 1.0637,
+      "step": 3566
+    },
+    {
+      "epoch": 0.24814776166127517,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0017606912592317322,
+      "loss": 1.0154,
+      "step": 3567
+    },
+    {
+      "epoch": 0.24821732929841037,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0017605449731481816,
+      "loss": 1.1124,
+      "step": 3568
+    },
+    {
+      "epoch": 0.2482868969355456,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.001760398648447991,
+      "loss": 1.1471,
+      "step": 3569
+    },
+    {
+      "epoch": 0.2483564645726808,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0017602522851385895,
+      "loss": 1.1284,
+      "step": 3570
+    },
+    {
+      "epoch": 0.248426032209816,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.001760105883227409,
+      "loss": 1.1458,
+      "step": 3571
+    },
+    {
+      "epoch": 0.2484955998469512,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.001759959442721883,
+      "loss": 0.9884,
+      "step": 3572
+    },
+    {
+      "epoch": 0.2485651674840864,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.001759812963629447,
+      "loss": 1.043,
+      "step": 3573
+    },
+    {
+      "epoch": 0.2486347351212216,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0017596664459575385,
+      "loss": 1.0459,
+      "step": 3574
+    },
+    {
+      "epoch": 0.2487043027583568,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0017595198897135968,
+      "loss": 1.1725,
+      "step": 3575
+    },
+    {
+      "epoch": 0.248773870395492,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0017593732949050633,
+      "loss": 1.0612,
+      "step": 3576
+    },
+    {
+      "epoch": 0.24884343803262723,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0017592266615393815,
+      "loss": 0.7711,
+      "step": 3577
+    },
+    {
+      "epoch": 0.24891300566976243,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0017590799896239969,
+      "loss": 1.2447,
+      "step": 3578
+    },
+    {
+      "epoch": 0.24898257330689763,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.001758933279166356,
+      "loss": 0.9262,
+      "step": 3579
+    },
+    {
+      "epoch": 0.24905214094403283,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0017587865301739085,
+      "loss": 1.075,
+      "step": 3580
+    },
+    {
+      "epoch": 0.24912170858116803,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.001758639742654106,
+      "loss": 1.109,
+      "step": 3581
+    },
+    {
+      "epoch": 0.24919127621830325,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0017584929166144009,
+      "loss": 1.0506,
+      "step": 3582
+    },
+    {
+      "epoch": 0.24926084385543845,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0017583460520622482,
+      "loss": 1.1858,
+      "step": 3583
+    },
+    {
+      "epoch": 0.24933041149257365,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.001758199149005106,
+      "loss": 1.0795,
+      "step": 3584
+    },
+    {
+      "epoch": 0.24939997912970885,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0017580522074504324,
+      "loss": 1.1447,
+      "step": 3585
+    },
+    {
+      "epoch": 0.24946954676684407,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.0017579052274056884,
+      "loss": 0.9276,
+      "step": 3586
+    },
+    {
+      "epoch": 0.24953911440397927,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0017577582088783373,
+      "loss": 1.0903,
+      "step": 3587
+    },
+    {
+      "epoch": 0.24960868204111447,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0017576111518758436,
+      "loss": 0.9308,
+      "step": 3588
+    },
+    {
+      "epoch": 0.24967824967824967,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.001757464056405674,
+      "loss": 0.8891,
+      "step": 3589
+    },
+    {
+      "epoch": 0.2497478173153849,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.0017573169224752978,
+      "loss": 1.0226,
+      "step": 3590
+    },
+    {
+      "epoch": 0.2498173849525201,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0017571697500921857,
+      "loss": 1.2338,
+      "step": 3591
+    },
+    {
+      "epoch": 0.2498869525896553,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0017570225392638098,
+      "loss": 0.9897,
+      "step": 3592
+    },
+    {
+      "epoch": 0.2499565202267905,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.001756875289997645,
+      "loss": 1.0784,
+      "step": 3593
+    },
+    {
+      "epoch": 0.2500260878639257,
+      "grad_norm": 1.609375,
+      "learning_rate": 0.001756728002301168,
+      "loss": 1.2579,
+      "step": 3594
+    },
+    {
+      "epoch": 0.2500956555010609,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0017565806761818572,
+      "loss": 1.1706,
+      "step": 3595
+    },
+    {
+      "epoch": 0.2501652231381961,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.001756433311647193,
+      "loss": 1.167,
+      "step": 3596
+    },
+    {
+      "epoch": 0.25023479077533134,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0017562859087046584,
+      "loss": 1.1437,
+      "step": 3597
+    },
+    {
+      "epoch": 0.2503043584124665,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.001756138467361737,
+      "loss": 0.9237,
+      "step": 3598
+    },
+    {
+      "epoch": 0.25037392604960174,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0017559909876259155,
+      "loss": 1.0883,
+      "step": 3599
+    },
+    {
+      "epoch": 0.2504434936867369,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0017558434695046819,
+      "loss": 1.121,
+      "step": 3600
+    },
+    {
+      "epoch": 0.25051306132387213,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0017556959130055267,
+      "loss": 1.0369,
+      "step": 3601
+    },
+    {
+      "epoch": 0.25058262896100736,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.001755548318135942,
+      "loss": 1.0228,
+      "step": 3602
+    },
+    {
+      "epoch": 0.25065219659814253,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0017554006849034222,
+      "loss": 0.9694,
+      "step": 3603
+    },
+    {
+      "epoch": 0.25072176423527776,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0017552530133154631,
+      "loss": 1.0473,
+      "step": 3604
+    },
+    {
+      "epoch": 0.2507913318724129,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0017551053033795627,
+      "loss": 1.1891,
+      "step": 3605
+    },
+    {
+      "epoch": 0.25086089950954815,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.001754957555103221,
+      "loss": 0.9631,
+      "step": 3606
+    },
+    {
+      "epoch": 0.2509304671466834,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00175480976849394,
+      "loss": 0.9455,
+      "step": 3607
+    },
+    {
+      "epoch": 0.25100003478381855,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0017546619435592232,
+      "loss": 1.0606,
+      "step": 3608
+    },
+    {
+      "epoch": 0.2510696024209538,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.001754514080306577,
+      "loss": 1.0355,
+      "step": 3609
+    },
+    {
+      "epoch": 0.251139170058089,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.0017543661787435085,
+      "loss": 0.9412,
+      "step": 3610
+    },
+    {
+      "epoch": 0.25120873769522417,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0017542182388775279,
+      "loss": 1.322,
+      "step": 3611
+    },
+    {
+      "epoch": 0.2512783053323594,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0017540702607161467,
+      "loss": 0.9471,
+      "step": 3612
+    },
+    {
+      "epoch": 0.25134787296949457,
+      "grad_norm": 1.546875,
+      "learning_rate": 0.0017539222442668784,
+      "loss": 1.0245,
+      "step": 3613
+    },
+    {
+      "epoch": 0.2514174406066298,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0017537741895372388,
+      "loss": 1.2477,
+      "step": 3614
+    },
+    {
+      "epoch": 0.251487008243765,
+      "grad_norm": 1.6953125,
+      "learning_rate": 0.0017536260965347447,
+      "loss": 1.1399,
+      "step": 3615
+    },
+    {
+      "epoch": 0.2515565758809002,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0017534779652669163,
+      "loss": 1.2683,
+      "step": 3616
+    },
+    {
+      "epoch": 0.2516261435180354,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0017533297957412746,
+      "loss": 0.993,
+      "step": 3617
+    },
+    {
+      "epoch": 0.2516957111551706,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0017531815879653432,
+      "loss": 1.0794,
+      "step": 3618
+    },
+    {
+      "epoch": 0.2517652787923058,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0017530333419466468,
+      "loss": 1.1167,
+      "step": 3619
+    },
+    {
+      "epoch": 0.25183484642944104,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0017528850576927128,
+      "loss": 1.032,
+      "step": 3620
+    },
+    {
+      "epoch": 0.2519044140665762,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.0017527367352110704,
+      "loss": 1.1043,
+      "step": 3621
+    },
+    {
+      "epoch": 0.25197398170371144,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0017525883745092509,
+      "loss": 1.0413,
+      "step": 3622
+    },
+    {
+      "epoch": 0.25204354934084666,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0017524399755947865,
+      "loss": 1.0989,
+      "step": 3623
+    },
+    {
+      "epoch": 0.25211311697798183,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0017522915384752134,
+      "loss": 0.9765,
+      "step": 3624
+    },
+    {
+      "epoch": 0.25218268461511706,
+      "grad_norm": 1.6640625,
+      "learning_rate": 0.0017521430631580674,
+      "loss": 1.2161,
+      "step": 3625
+    },
+    {
+      "epoch": 0.25225225225225223,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.001751994549650888,
+      "loss": 1.4329,
+      "step": 3626
+    },
+    {
+      "epoch": 0.25232181988938746,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0017518459979612155,
+      "loss": 1.2085,
+      "step": 3627
+    },
+    {
+      "epoch": 0.2523913875265227,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.001751697408096593,
+      "loss": 1.2017,
+      "step": 3628
+    },
+    {
+      "epoch": 0.25246095516365785,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0017515487800645647,
+      "loss": 0.9696,
+      "step": 3629
+    },
+    {
+      "epoch": 0.2525305228007931,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0017514001138726775,
+      "loss": 1.0484,
+      "step": 3630
+    },
+    {
+      "epoch": 0.25260009043792825,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00175125140952848,
+      "loss": 1.0938,
+      "step": 3631
+    },
+    {
+      "epoch": 0.2526696580750635,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0017511026670395222,
+      "loss": 1.158,
+      "step": 3632
+    },
+    {
+      "epoch": 0.2527392257121987,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0017509538864133574,
+      "loss": 0.862,
+      "step": 3633
+    },
+    {
+      "epoch": 0.25280879334933387,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0017508050676575389,
+      "loss": 0.8049,
+      "step": 3634
+    },
+    {
+      "epoch": 0.2528783609864691,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0017506562107796233,
+      "loss": 0.9967,
+      "step": 3635
+    },
+    {
+      "epoch": 0.2529479286236043,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.001750507315787169,
+      "loss": 1.1872,
+      "step": 3636
+    },
+    {
+      "epoch": 0.2530174962607395,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.0017503583826877364,
+      "loss": 1.0083,
+      "step": 3637
+    },
+    {
+      "epoch": 0.2530870638978747,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.001750209411488887,
+      "loss": 1.1361,
+      "step": 3638
+    },
+    {
+      "epoch": 0.2531566315350099,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0017500604021981848,
+      "loss": 1.0634,
+      "step": 3639
+    },
+    {
+      "epoch": 0.2532261991721451,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0017499113548231963,
+      "loss": 0.7385,
+      "step": 3640
+    },
+    {
+      "epoch": 0.25329576680928034,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0017497622693714886,
+      "loss": 1.0432,
+      "step": 3641
+    },
+    {
+      "epoch": 0.2533653344464155,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.001749613145850632,
+      "loss": 0.9483,
+      "step": 3642
+    },
+    {
+      "epoch": 0.25343490208355074,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0017494639842681986,
+      "loss": 1.1768,
+      "step": 3643
+    },
+    {
+      "epoch": 0.2535044697206859,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0017493147846317613,
+      "loss": 1.2575,
+      "step": 3644
+    },
+    {
+      "epoch": 0.25357403735782114,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0017491655469488963,
+      "loss": 1.0326,
+      "step": 3645
+    },
+    {
+      "epoch": 0.25364360499495636,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.0017490162712271808,
+      "loss": 1.0368,
+      "step": 3646
+    },
+    {
+      "epoch": 0.25371317263209153,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0017488669574741943,
+      "loss": 0.9942,
+      "step": 3647
+    },
+    {
+      "epoch": 0.25378274026922676,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0017487176056975185,
+      "loss": 1.1022,
+      "step": 3648
+    },
+    {
+      "epoch": 0.253852307906362,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.001748568215904736,
+      "loss": 1.1188,
+      "step": 3649
+    },
+    {
+      "epoch": 0.25392187554349716,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.001748418788103433,
+      "loss": 1.158,
+      "step": 3650
+    },
+    {
+      "epoch": 0.2539914431806324,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0017482693223011961,
+      "loss": 1.0782,
+      "step": 3651
+    },
+    {
+      "epoch": 0.25406101081776755,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0017481198185056146,
+      "loss": 0.964,
+      "step": 3652
+    },
+    {
+      "epoch": 0.2541305784549028,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0017479702767242795,
+      "loss": 1.0249,
+      "step": 3653
+    },
+    {
+      "epoch": 0.254200146092038,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.001747820696964784,
+      "loss": 1.0348,
+      "step": 3654
+    },
+    {
+      "epoch": 0.2542697137291732,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0017476710792347226,
+      "loss": 1.1565,
+      "step": 3655
+    },
+    {
+      "epoch": 0.2543392813663084,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0017475214235416923,
+      "loss": 1.1294,
+      "step": 3656
+    },
+    {
+      "epoch": 0.25440884900344357,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0017473717298932918,
+      "loss": 0.8729,
+      "step": 3657
+    },
+    {
+      "epoch": 0.2544784166405788,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0017472219982971222,
+      "loss": 0.9156,
+      "step": 3658
+    },
+    {
+      "epoch": 0.254547984277714,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0017470722287607856,
+      "loss": 1.1243,
+      "step": 3659
+    },
+    {
+      "epoch": 0.2546175519148492,
+      "grad_norm": 1.375,
+      "learning_rate": 0.001746922421291887,
+      "loss": 1.0525,
+      "step": 3660
+    },
+    {
+      "epoch": 0.2546871195519844,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0017467725758980323,
+      "loss": 1.0949,
+      "step": 3661
+    },
+    {
+      "epoch": 0.25475668718911965,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0017466226925868305,
+      "loss": 0.918,
+      "step": 3662
+    },
+    {
+      "epoch": 0.2548262548262548,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0017464727713658915,
+      "loss": 1.2199,
+      "step": 3663
+    },
+    {
+      "epoch": 0.25489582246339004,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0017463228122428275,
+      "loss": 1.2338,
+      "step": 3664
+    },
+    {
+      "epoch": 0.2549653901005252,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0017461728152252528,
+      "loss": 0.93,
+      "step": 3665
+    },
+    {
+      "epoch": 0.25503495773766044,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0017460227803207838,
+      "loss": 1.1836,
+      "step": 3666
+    },
+    {
+      "epoch": 0.25510452537479567,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0017458727075370382,
+      "loss": 0.9921,
+      "step": 3667
+    },
+    {
+      "epoch": 0.25517409301193084,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.001745722596881636,
+      "loss": 1.1594,
+      "step": 3668
+    },
+    {
+      "epoch": 0.25524366064906606,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0017455724483621989,
+      "loss": 1.2506,
+      "step": 3669
+    },
+    {
+      "epoch": 0.25531322828620123,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.001745422261986351,
+      "loss": 1.1569,
+      "step": 3670
+    },
+    {
+      "epoch": 0.25538279592333646,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0017452720377617178,
+      "loss": 0.8423,
+      "step": 3671
+    },
+    {
+      "epoch": 0.2554523635604717,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0017451217756959268,
+      "loss": 1.1653,
+      "step": 3672
+    },
+    {
+      "epoch": 0.25552193119760686,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.001744971475796608,
+      "loss": 0.7858,
+      "step": 3673
+    },
+    {
+      "epoch": 0.2555914988347421,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0017448211380713923,
+      "loss": 1.1199,
+      "step": 3674
+    },
+    {
+      "epoch": 0.2556610664718773,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0017446707625279135,
+      "loss": 0.9626,
+      "step": 3675
+    },
+    {
+      "epoch": 0.2557306341090125,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.001744520349173807,
+      "loss": 0.9203,
+      "step": 3676
+    },
+    {
+      "epoch": 0.2558002017461477,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0017443698980167096,
+      "loss": 0.8413,
+      "step": 3677
+    },
+    {
+      "epoch": 0.2558697693832829,
+      "grad_norm": 1.671875,
+      "learning_rate": 0.0017442194090642607,
+      "loss": 1.3561,
+      "step": 3678
+    },
+    {
+      "epoch": 0.2559393370204181,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0017440688823241012,
+      "loss": 0.9417,
+      "step": 3679
+    },
+    {
+      "epoch": 0.2560089046575533,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0017439183178038747,
+      "loss": 0.8635,
+      "step": 3680
+    },
+    {
+      "epoch": 0.2560784722946885,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0017437677155112252,
+      "loss": 1.0922,
+      "step": 3681
+    },
+    {
+      "epoch": 0.2561480399318237,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0017436170754538001,
+      "loss": 1.0156,
+      "step": 3682
+    },
+    {
+      "epoch": 0.2562176075689589,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0017434663976392483,
+      "loss": 1.2135,
+      "step": 3683
+    },
+    {
+      "epoch": 0.2562871752060941,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.00174331568207522,
+      "loss": 1.1355,
+      "step": 3684
+    },
+    {
+      "epoch": 0.25635674284322935,
+      "grad_norm": 1.5390625,
+      "learning_rate": 0.0017431649287693678,
+      "loss": 1.1804,
+      "step": 3685
+    },
+    {
+      "epoch": 0.2564263104803645,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0017430141377293466,
+      "loss": 1.0658,
+      "step": 3686
+    },
+    {
+      "epoch": 0.25649587811749974,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0017428633089628122,
+      "loss": 1.0869,
+      "step": 3687
+    },
+    {
+      "epoch": 0.25656544575463497,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0017427124424774236,
+      "loss": 0.7553,
+      "step": 3688
+    },
+    {
+      "epoch": 0.25663501339177014,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0017425615382808406,
+      "loss": 1.0089,
+      "step": 3689
+    },
+    {
+      "epoch": 0.25670458102890537,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0017424105963807252,
+      "loss": 1.1103,
+      "step": 3690
+    },
+    {
+      "epoch": 0.25677414866604054,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0017422596167847421,
+      "loss": 0.9985,
+      "step": 3691
+    },
+    {
+      "epoch": 0.25684371630317576,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.001742108599500557,
+      "loss": 1.1166,
+      "step": 3692
+    },
+    {
+      "epoch": 0.256913283940311,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0017419575445358376,
+      "loss": 0.9415,
+      "step": 3693
+    },
+    {
+      "epoch": 0.25698285157744616,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0017418064518982539,
+      "loss": 0.9779,
+      "step": 3694
+    },
+    {
+      "epoch": 0.2570524192145814,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0017416553215954774,
+      "loss": 1.0644,
+      "step": 3695
+    },
+    {
+      "epoch": 0.25712198685171656,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0017415041536351819,
+      "loss": 0.8074,
+      "step": 3696
+    },
+    {
+      "epoch": 0.2571915544888518,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.001741352948025043,
+      "loss": 1.1636,
+      "step": 3697
+    },
+    {
+      "epoch": 0.257261122125987,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.001741201704772738,
+      "loss": 1.1348,
+      "step": 3698
+    },
+    {
+      "epoch": 0.2573306897631222,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0017410504238859464,
+      "loss": 0.9444,
+      "step": 3699
+    },
+    {
+      "epoch": 0.2574002574002574,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0017408991053723495,
+      "loss": 0.9938,
+      "step": 3700
+    },
+    {
+      "epoch": 0.25746982503739263,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0017407477492396306,
+      "loss": 1.0754,
+      "step": 3701
+    },
+    {
+      "epoch": 0.2575393926745278,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0017405963554954745,
+      "loss": 0.981,
+      "step": 3702
+    },
+    {
+      "epoch": 0.257608960311663,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0017404449241475682,
+      "loss": 1.3793,
+      "step": 3703
+    },
+    {
+      "epoch": 0.2576785279487982,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0017402934552036007,
+      "loss": 1.051,
+      "step": 3704
+    },
+    {
+      "epoch": 0.2577480955859334,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0017401419486712632,
+      "loss": 1.3719,
+      "step": 3705
+    },
+    {
+      "epoch": 0.25781766322306865,
+      "grad_norm": 1.59375,
+      "learning_rate": 0.001739990404558248,
+      "loss": 1.1657,
+      "step": 3706
+    },
+    {
+      "epoch": 0.2578872308602038,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00173983882287225,
+      "loss": 1.097,
+      "step": 3707
+    },
+    {
+      "epoch": 0.25795679849733905,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0017396872036209655,
+      "loss": 0.9704,
+      "step": 3708
+    },
+    {
+      "epoch": 0.2580263661344742,
+      "grad_norm": 1.125,
+      "learning_rate": 0.001739535546812093,
+      "loss": 1.1431,
+      "step": 3709
+    },
+    {
+      "epoch": 0.25809593377160944,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0017393838524533333,
+      "loss": 0.9777,
+      "step": 3710
+    },
+    {
+      "epoch": 0.25816550140874467,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.001739232120552388,
+      "loss": 1.1802,
+      "step": 3711
+    },
+    {
+      "epoch": 0.25823506904587984,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0017390803511169617,
+      "loss": 1.0882,
+      "step": 3712
+    },
+    {
+      "epoch": 0.25830463668301507,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.0017389285441547606,
+      "loss": 1.0451,
+      "step": 3713
+    },
+    {
+      "epoch": 0.2583742043201503,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0017387766996734924,
+      "loss": 1.0731,
+      "step": 3714
+    },
+    {
+      "epoch": 0.25844377195728546,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0017386248176808673,
+      "loss": 1.101,
+      "step": 3715
+    },
+    {
+      "epoch": 0.2585133395944207,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0017384728981845966,
+      "loss": 1.0954,
+      "step": 3716
+    },
+    {
+      "epoch": 0.25858290723155586,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0017383209411923944,
+      "loss": 1.0016,
+      "step": 3717
+    },
+    {
+      "epoch": 0.2586524748686911,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0017381689467119764,
+      "loss": 1.1121,
+      "step": 3718
+    },
+    {
+      "epoch": 0.2587220425058263,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0017380169147510594,
+      "loss": 1.2915,
+      "step": 3719
+    },
+    {
+      "epoch": 0.2587916101429615,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0017378648453173638,
+      "loss": 1.2362,
+      "step": 3720
+    },
+    {
+      "epoch": 0.2588611777800967,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0017377127384186105,
+      "loss": 1.1448,
+      "step": 3721
+    },
+    {
+      "epoch": 0.2589307454172319,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0017375605940625225,
+      "loss": 1.2008,
+      "step": 3722
+    },
+    {
+      "epoch": 0.2590003130543671,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.001737408412256825,
+      "loss": 1.0659,
+      "step": 3723
+    },
+    {
+      "epoch": 0.25906988069150233,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0017372561930092455,
+      "loss": 1.1304,
+      "step": 3724
+    },
+    {
+      "epoch": 0.2591394483286375,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0017371039363275123,
+      "loss": 0.945,
+      "step": 3725
+    },
+    {
+      "epoch": 0.2592090159657727,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0017369516422193567,
+      "loss": 1.0949,
+      "step": 3726
+    },
+    {
+      "epoch": 0.25927858360290795,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.001736799310692511,
+      "loss": 1.1905,
+      "step": 3727
+    },
+    {
+      "epoch": 0.2593481512400431,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0017366469417547101,
+      "loss": 1.1927,
+      "step": 3728
+    },
+    {
+      "epoch": 0.25941771887717835,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.0017364945354136907,
+      "loss": 1.1854,
+      "step": 3729
+    },
+    {
+      "epoch": 0.2594872865143135,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0017363420916771909,
+      "loss": 0.9458,
+      "step": 3730
+    },
+    {
+      "epoch": 0.25955685415144875,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0017361896105529508,
+      "loss": 1.3304,
+      "step": 3731
+    },
+    {
+      "epoch": 0.25962642178858397,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0017360370920487134,
+      "loss": 0.9451,
+      "step": 3732
+    },
+    {
+      "epoch": 0.25969598942571914,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0017358845361722221,
+      "loss": 0.6876,
+      "step": 3733
+    },
+    {
+      "epoch": 0.25976555706285437,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0017357319429312232,
+      "loss": 1.0501,
+      "step": 3734
+    },
+    {
+      "epoch": 0.25983512469998954,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0017355793123334648,
+      "loss": 0.8675,
+      "step": 3735
+    },
+    {
+      "epoch": 0.25990469233712477,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0017354266443866961,
+      "loss": 1.1924,
+      "step": 3736
+    },
+    {
+      "epoch": 0.25997425997426,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0017352739390986696,
+      "loss": 0.9601,
+      "step": 3737
+    },
+    {
+      "epoch": 0.26004382761139516,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0017351211964771384,
+      "loss": 1.0359,
+      "step": 3738
+    },
+    {
+      "epoch": 0.2601133952485304,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0017349684165298583,
+      "loss": 1.1129,
+      "step": 3739
+    },
+    {
+      "epoch": 0.2601829628856656,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0017348155992645863,
+      "loss": 1.0371,
+      "step": 3740
+    },
+    {
+      "epoch": 0.2602525305228008,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.001734662744689082,
+      "loss": 1.1541,
+      "step": 3741
+    },
+    {
+      "epoch": 0.260322098159936,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0017345098528111062,
+      "loss": 1.1502,
+      "step": 3742
+    },
+    {
+      "epoch": 0.2603916657970712,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0017343569236384227,
+      "loss": 1.1974,
+      "step": 3743
+    },
+    {
+      "epoch": 0.2604612334342064,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.001734203957178796,
+      "loss": 0.9742,
+      "step": 3744
+    },
+    {
+      "epoch": 0.26053080107134163,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0017340509534399928,
+      "loss": 1.1656,
+      "step": 3745
+    },
+    {
+      "epoch": 0.2606003687084768,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0017338979124297822,
+      "loss": 1.1089,
+      "step": 3746
+    },
+    {
+      "epoch": 0.26066993634561203,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0017337448341559348,
+      "loss": 0.8266,
+      "step": 3747
+    },
+    {
+      "epoch": 0.2607395039827472,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.001733591718626223,
+      "loss": 0.8495,
+      "step": 3748
+    },
+    {
+      "epoch": 0.2608090716198824,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0017334385658484212,
+      "loss": 1.2051,
+      "step": 3749
+    },
+    {
+      "epoch": 0.26087863925701765,
+      "grad_norm": 1.4765625,
+      "learning_rate": 0.0017332853758303059,
+      "loss": 1.307,
+      "step": 3750
+    },
+    {
+      "epoch": 0.2609482068941528,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0017331321485796554,
+      "loss": 1.1944,
+      "step": 3751
+    },
+    {
+      "epoch": 0.26101777453128805,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0017329788841042495,
+      "loss": 0.9252,
+      "step": 3752
+    },
+    {
+      "epoch": 0.2610873421684233,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0017328255824118704,
+      "loss": 0.9735,
+      "step": 3753
+    },
+    {
+      "epoch": 0.26115690980555845,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.001732672243510302,
+      "loss": 1.2298,
+      "step": 3754
+    },
+    {
+      "epoch": 0.26122647744269367,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00173251886740733,
+      "loss": 1.1572,
+      "step": 3755
+    },
+    {
+      "epoch": 0.26129604507982884,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0017323654541107419,
+      "loss": 0.9327,
+      "step": 3756
+    },
+    {
+      "epoch": 0.26136561271696407,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0017322120036283276,
+      "loss": 0.8942,
+      "step": 3757
+    },
+    {
+      "epoch": 0.2614351803540993,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0017320585159678783,
+      "loss": 0.9664,
+      "step": 3758
+    },
+    {
+      "epoch": 0.26150474799123447,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0017319049911371876,
+      "loss": 1.1331,
+      "step": 3759
+    },
+    {
+      "epoch": 0.2615743156283697,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00173175142914405,
+      "loss": 0.9512,
+      "step": 3760
+    },
+    {
+      "epoch": 0.26164388326550486,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0017315978299962636,
+      "loss": 0.7905,
+      "step": 3761
+    },
+    {
+      "epoch": 0.2617134509026401,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.001731444193701627,
+      "loss": 1.2855,
+      "step": 3762
+    },
+    {
+      "epoch": 0.2617830185397753,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0017312905202679408,
+      "loss": 1.0728,
+      "step": 3763
+    },
+    {
+      "epoch": 0.2618525861769105,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.001731136809703008,
+      "loss": 1.2534,
+      "step": 3764
+    },
+    {
+      "epoch": 0.2619221538140457,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0017309830620146332,
+      "loss": 0.7704,
+      "step": 3765
+    },
+    {
+      "epoch": 0.26199172145118094,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0017308292772106229,
+      "loss": 1.0189,
+      "step": 3766
+    },
+    {
+      "epoch": 0.2620612890883161,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0017306754552987855,
+      "loss": 1.1517,
+      "step": 3767
+    },
+    {
+      "epoch": 0.26213085672545133,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0017305215962869313,
+      "loss": 1.0454,
+      "step": 3768
+    },
+    {
+      "epoch": 0.2622004243625865,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0017303677001828729,
+      "loss": 0.8192,
+      "step": 3769
+    },
+    {
+      "epoch": 0.26226999199972173,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.0017302137669944235,
+      "loss": 1.2484,
+      "step": 3770
+    },
+    {
+      "epoch": 0.26233955963685696,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0017300597967294,
+      "loss": 0.9947,
+      "step": 3771
+    },
+    {
+      "epoch": 0.2624091272739921,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0017299057893956195,
+      "loss": 1.0141,
+      "step": 3772
+    },
+    {
+      "epoch": 0.26247869491112735,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0017297517450009022,
+      "loss": 1.3139,
+      "step": 3773
+    },
+    {
+      "epoch": 0.2625482625482625,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0017295976635530695,
+      "loss": 1.2733,
+      "step": 3774
+    },
+    {
+      "epoch": 0.26261783018539775,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.001729443545059945,
+      "loss": 0.9708,
+      "step": 3775
+    },
+    {
+      "epoch": 0.262687397822533,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0017292893895293538,
+      "loss": 1.1274,
+      "step": 3776
+    },
+    {
+      "epoch": 0.26275696545966815,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0017291351969691232,
+      "loss": 0.9228,
+      "step": 3777
+    },
+    {
+      "epoch": 0.26282653309680337,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0017289809673870825,
+      "loss": 1.2935,
+      "step": 3778
+    },
+    {
+      "epoch": 0.2628961007339386,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0017288267007910627,
+      "loss": 0.8132,
+      "step": 3779
+    },
+    {
+      "epoch": 0.26296566837107377,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0017286723971888965,
+      "loss": 0.9086,
+      "step": 3780
+    },
+    {
+      "epoch": 0.263035236008209,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0017285180565884187,
+      "loss": 1.0525,
+      "step": 3781
+    },
+    {
+      "epoch": 0.26310480364534417,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0017283636789974662,
+      "loss": 0.8169,
+      "step": 3782
+    },
+    {
+      "epoch": 0.2631743712824794,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.001728209264423877,
+      "loss": 0.8526,
+      "step": 3783
+    },
+    {
+      "epoch": 0.2632439389196146,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.001728054812875492,
+      "loss": 0.965,
+      "step": 3784
+    },
+    {
+      "epoch": 0.2633135065567498,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0017279003243601532,
+      "loss": 0.8445,
+      "step": 3785
+    },
+    {
+      "epoch": 0.263383074193885,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.001727745798885705,
+      "loss": 1.0094,
+      "step": 3786
+    },
+    {
+      "epoch": 0.2634526418310202,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0017275912364599928,
+      "loss": 1.0176,
+      "step": 3787
+    },
+    {
+      "epoch": 0.2635222094681554,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0017274366370908655,
+      "loss": 0.941,
+      "step": 3788
+    },
+    {
+      "epoch": 0.26359177710529064,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0017272820007861718,
+      "loss": 1.2155,
+      "step": 3789
+    },
+    {
+      "epoch": 0.2636613447424258,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0017271273275537642,
+      "loss": 0.8125,
+      "step": 3790
+    },
+    {
+      "epoch": 0.26373091237956103,
+      "grad_norm": 1.515625,
+      "learning_rate": 0.0017269726174014956,
+      "loss": 0.8297,
+      "step": 3791
+    },
+    {
+      "epoch": 0.26380048001669626,
+      "grad_norm": 1.609375,
+      "learning_rate": 0.001726817870337222,
+      "loss": 1.0904,
+      "step": 3792
+    },
+    {
+      "epoch": 0.26387004765383143,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.0017266630863688004,
+      "loss": 1.1033,
+      "step": 3793
+    },
+    {
+      "epoch": 0.26393961529096666,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0017265082655040897,
+      "loss": 0.8511,
+      "step": 3794
+    },
+    {
+      "epoch": 0.2640091829281018,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0017263534077509514,
+      "loss": 1.1242,
+      "step": 3795
+    },
+    {
+      "epoch": 0.26407875056523705,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0017261985131172479,
+      "loss": 0.952,
+      "step": 3796
+    },
+    {
+      "epoch": 0.2641483182023723,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0017260435816108446,
+      "loss": 0.9841,
+      "step": 3797
+    },
+    {
+      "epoch": 0.26421788583950745,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0017258886132396074,
+      "loss": 0.8861,
+      "step": 3798
+    },
+    {
+      "epoch": 0.2642874534766427,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0017257336080114052,
+      "loss": 1.0207,
+      "step": 3799
+    },
+    {
+      "epoch": 0.26435702111377785,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0017255785659341086,
+      "loss": 0.9993,
+      "step": 3800
+    },
+    {
+      "epoch": 0.26442658875091307,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0017254234870155893,
+      "loss": 0.749,
+      "step": 3801
+    },
+    {
+      "epoch": 0.2644961563880483,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0017252683712637219,
+      "loss": 1.1282,
+      "step": 3802
+    },
+    {
+      "epoch": 0.26456572402518347,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0017251132186863823,
+      "loss": 1.0138,
+      "step": 3803
+    },
+    {
+      "epoch": 0.2646352916623187,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.001724958029291448,
+      "loss": 0.9922,
+      "step": 3804
+    },
+    {
+      "epoch": 0.2647048592994539,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0017248028030867992,
+      "loss": 1.4224,
+      "step": 3805
+    },
+    {
+      "epoch": 0.2647744269365891,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0017246475400803174,
+      "loss": 0.8931,
+      "step": 3806
+    },
+    {
+      "epoch": 0.2648439945737243,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.001724492240279886,
+      "loss": 0.9911,
+      "step": 3807
+    },
+    {
+      "epoch": 0.2649135622108595,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0017243369036933904,
+      "loss": 1.0472,
+      "step": 3808
+    },
+    {
+      "epoch": 0.2649831298479947,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0017241815303287176,
+      "loss": 0.9658,
+      "step": 3809
+    },
+    {
+      "epoch": 0.26505269748512994,
+      "grad_norm": 1.25,
+      "learning_rate": 0.001724026120193757,
+      "loss": 1.0667,
+      "step": 3810
+    },
+    {
+      "epoch": 0.2651222651222651,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0017238706732963993,
+      "loss": 0.9033,
+      "step": 3811
+    },
+    {
+      "epoch": 0.26519183275940034,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0017237151896445373,
+      "loss": 1.1824,
+      "step": 3812
+    },
+    {
+      "epoch": 0.2652614003965355,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.001723559669246066,
+      "loss": 0.9525,
+      "step": 3813
+    },
+    {
+      "epoch": 0.26533096803367073,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0017234041121088814,
+      "loss": 0.8767,
+      "step": 3814
+    },
+    {
+      "epoch": 0.26540053567080596,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0017232485182408824,
+      "loss": 1.0737,
+      "step": 3815
+    },
+    {
+      "epoch": 0.26547010330794113,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.001723092887649969,
+      "loss": 1.0869,
+      "step": 3816
+    },
+    {
+      "epoch": 0.26553967094507636,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0017229372203440435,
+      "loss": 1.1493,
+      "step": 3817
+    },
+    {
+      "epoch": 0.2656092385822116,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.00172278151633101,
+      "loss": 1.3631,
+      "step": 3818
+    },
+    {
+      "epoch": 0.26567880621934675,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.001722625775618774,
+      "loss": 1.1792,
+      "step": 3819
+    },
+    {
+      "epoch": 0.265748373856482,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0017224699982152432,
+      "loss": 0.8632,
+      "step": 3820
+    },
+    {
+      "epoch": 0.26581794149361715,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0017223141841283276,
+      "loss": 1.177,
+      "step": 3821
+    },
+    {
+      "epoch": 0.2658875091307524,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0017221583333659385,
+      "loss": 0.9559,
+      "step": 3822
+    },
+    {
+      "epoch": 0.2659570767678876,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0017220024459359893,
+      "loss": 1.1445,
+      "step": 3823
+    },
+    {
+      "epoch": 0.26602664440502277,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0017218465218463948,
+      "loss": 1.3609,
+      "step": 3824
+    },
+    {
+      "epoch": 0.266096212042158,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0017216905611050725,
+      "loss": 1.3015,
+      "step": 3825
+    },
+    {
+      "epoch": 0.26616577967929317,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0017215345637199412,
+      "loss": 1.1312,
+      "step": 3826
+    },
+    {
+      "epoch": 0.2662353473164284,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0017213785296989212,
+      "loss": 1.1181,
+      "step": 3827
+    },
+    {
+      "epoch": 0.2663049149535636,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0017212224590499358,
+      "loss": 1.0499,
+      "step": 3828
+    },
+    {
+      "epoch": 0.2663744825906988,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.001721066351780909,
+      "loss": 1.0541,
+      "step": 3829
+    },
+    {
+      "epoch": 0.266444050227834,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0017209102078997673,
+      "loss": 0.9934,
+      "step": 3830
+    },
+    {
+      "epoch": 0.26651361786496924,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0017207540274144387,
+      "loss": 1.0731,
+      "step": 3831
+    },
+    {
+      "epoch": 0.2665831855021044,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0017205978103328537,
+      "loss": 0.9135,
+      "step": 3832
+    },
+    {
+      "epoch": 0.26665275313923964,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.001720441556662944,
+      "loss": 1.1729,
+      "step": 3833
+    },
+    {
+      "epoch": 0.2667223207763748,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0017202852664126432,
+      "loss": 0.9803,
+      "step": 3834
+    },
+    {
+      "epoch": 0.26679188841351004,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.001720128939589887,
+      "loss": 0.8769,
+      "step": 3835
+    },
+    {
+      "epoch": 0.26686145605064526,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0017199725762026136,
+      "loss": 0.8438,
+      "step": 3836
+    },
+    {
+      "epoch": 0.26693102368778043,
+      "grad_norm": 1.25,
+      "learning_rate": 0.001719816176258761,
+      "loss": 1.1329,
+      "step": 3837
+    },
+    {
+      "epoch": 0.26700059132491566,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0017196597397662714,
+      "loss": 1.0036,
+      "step": 3838
+    },
+    {
+      "epoch": 0.26707015896205083,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0017195032667330875,
+      "loss": 0.8832,
+      "step": 3839
+    },
+    {
+      "epoch": 0.26713972659918606,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0017193467571671541,
+      "loss": 0.9158,
+      "step": 3840
+    },
+    {
+      "epoch": 0.2672092942363213,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0017191902110764183,
+      "loss": 0.8351,
+      "step": 3841
+    },
+    {
+      "epoch": 0.26727886187345645,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0017190336284688289,
+      "loss": 1.0573,
+      "step": 3842
+    },
+    {
+      "epoch": 0.2673484295105917,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.001718877009352336,
+      "loss": 1.0967,
+      "step": 3843
+    },
+    {
+      "epoch": 0.2674179971477269,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0017187203537348914,
+      "loss": 0.8831,
+      "step": 3844
+    },
+    {
+      "epoch": 0.2674875647848621,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0017185636616244503,
+      "loss": 0.8737,
+      "step": 3845
+    },
+    {
+      "epoch": 0.2675571324219973,
+      "grad_norm": 1.5390625,
+      "learning_rate": 0.0017184069330289681,
+      "loss": 1.2432,
+      "step": 3846
+    },
+    {
+      "epoch": 0.26762670005913247,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0017182501679564029,
+      "loss": 1.2132,
+      "step": 3847
+    },
+    {
+      "epoch": 0.2676962676962677,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.0017180933664147147,
+      "loss": 0.7472,
+      "step": 3848
+    },
+    {
+      "epoch": 0.2677658353334029,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0017179365284118644,
+      "loss": 0.9909,
+      "step": 3849
+    },
+    {
+      "epoch": 0.2678354029705381,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0017177796539558162,
+      "loss": 1.064,
+      "step": 3850
+    },
+    {
+      "epoch": 0.2679049706076733,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.0017176227430545348,
+      "loss": 0.9202,
+      "step": 3851
+    },
+    {
+      "epoch": 0.2679745382448085,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0017174657957159875,
+      "loss": 0.865,
+      "step": 3852
+    },
+    {
+      "epoch": 0.2680441058819437,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.001717308811948144,
+      "loss": 1.0918,
+      "step": 3853
+    },
+    {
+      "epoch": 0.26811367351907894,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0017171517917589738,
+      "loss": 1.0774,
+      "step": 3854
+    },
+    {
+      "epoch": 0.2681832411562141,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0017169947351564508,
+      "loss": 1.165,
+      "step": 3855
+    },
+    {
+      "epoch": 0.26825280879334934,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0017168376421485489,
+      "loss": 1.0177,
+      "step": 3856
+    },
+    {
+      "epoch": 0.26832237643048457,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.0017166805127432447,
+      "loss": 0.879,
+      "step": 3857
+    },
+    {
+      "epoch": 0.26839194406761974,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0017165233469485163,
+      "loss": 1.1177,
+      "step": 3858
+    },
+    {
+      "epoch": 0.26846151170475496,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.001716366144772344,
+      "loss": 1.1613,
+      "step": 3859
+    },
+    {
+      "epoch": 0.26853107934189013,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0017162089062227096,
+      "loss": 0.8865,
+      "step": 3860
+    },
+    {
+      "epoch": 0.26860064697902536,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0017160516313075968,
+      "loss": 1.176,
+      "step": 3861
+    },
+    {
+      "epoch": 0.2686702146161606,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0017158943200349915,
+      "loss": 1.0508,
+      "step": 3862
+    },
+    {
+      "epoch": 0.26873978225329576,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0017157369724128812,
+      "loss": 0.884,
+      "step": 3863
+    },
+    {
+      "epoch": 0.268809349890431,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0017155795884492547,
+      "loss": 1.3607,
+      "step": 3864
+    },
+    {
+      "epoch": 0.26887891752756615,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0017154221681521034,
+      "loss": 0.9544,
+      "step": 3865
+    },
+    {
+      "epoch": 0.2689484851647014,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0017152647115294204,
+      "loss": 1.084,
+      "step": 3866
+    },
+    {
+      "epoch": 0.2690180528018366,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0017151072185892008,
+      "loss": 1.0318,
+      "step": 3867
+    },
+    {
+      "epoch": 0.2690876204389718,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.001714949689339441,
+      "loss": 1.0706,
+      "step": 3868
+    },
+    {
+      "epoch": 0.269157188076107,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0017147921237881394,
+      "loss": 0.9735,
+      "step": 3869
+    },
+    {
+      "epoch": 0.2692267557132422,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0017146345219432966,
+      "loss": 0.8209,
+      "step": 3870
+    },
+    {
+      "epoch": 0.2692963233503774,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0017144768838129147,
+      "loss": 0.8533,
+      "step": 3871
+    },
+    {
+      "epoch": 0.2693658909875126,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0017143192094049985,
+      "loss": 1.0087,
+      "step": 3872
+    },
+    {
+      "epoch": 0.2694354586246478,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0017141614987275526,
+      "loss": 0.8013,
+      "step": 3873
+    },
+    {
+      "epoch": 0.269505026261783,
+      "grad_norm": 1.5859375,
+      "learning_rate": 0.0017140037517885856,
+      "loss": 1.1604,
+      "step": 3874
+    },
+    {
+      "epoch": 0.26957459389891825,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.001713845968596107,
+      "loss": 1.0919,
+      "step": 3875
+    },
+    {
+      "epoch": 0.2696441615360534,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.0017136881491581284,
+      "loss": 0.9652,
+      "step": 3876
+    },
+    {
+      "epoch": 0.26971372917318864,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0017135302934826627,
+      "loss": 0.9007,
+      "step": 3877
+    },
+    {
+      "epoch": 0.2697832968103238,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.001713372401577725,
+      "loss": 1.0811,
+      "step": 3878
+    },
+    {
+      "epoch": 0.26985286444745904,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0017132144734513324,
+      "loss": 1.2171,
+      "step": 3879
+    },
+    {
+      "epoch": 0.26992243208459427,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0017130565091115037,
+      "loss": 0.9081,
+      "step": 3880
+    },
+    {
+      "epoch": 0.26999199972172944,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0017128985085662599,
+      "loss": 1.0543,
+      "step": 3881
+    },
+    {
+      "epoch": 0.27006156735886466,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0017127404718236226,
+      "loss": 0.9564,
+      "step": 3882
+    },
+    {
+      "epoch": 0.27013113499599983,
+      "grad_norm": 1.125,
+      "learning_rate": 0.001712582398891617,
+      "loss": 0.9315,
+      "step": 3883
+    },
+    {
+      "epoch": 0.27020070263313506,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.0017124242897782684,
+      "loss": 1.186,
+      "step": 3884
+    },
+    {
+      "epoch": 0.2702702702702703,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0017122661444916058,
+      "loss": 0.9827,
+      "step": 3885
+    },
+    {
+      "epoch": 0.27033983790740546,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0017121079630396583,
+      "loss": 0.95,
+      "step": 3886
+    },
+    {
+      "epoch": 0.2704094055445407,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0017119497454304575,
+      "loss": 1.0094,
+      "step": 3887
+    },
+    {
+      "epoch": 0.2704789731816759,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0017117914916720373,
+      "loss": 1.2051,
+      "step": 3888
+    },
+    {
+      "epoch": 0.2705485408188111,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.001711633201772433,
+      "loss": 1.0079,
+      "step": 3889
+    },
+    {
+      "epoch": 0.2706181084559463,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0017114748757396812,
+      "loss": 1.0215,
+      "step": 3890
+    },
+    {
+      "epoch": 0.2706876760930815,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0017113165135818217,
+      "loss": 0.8982,
+      "step": 3891
+    },
+    {
+      "epoch": 0.2707572437302167,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0017111581153068948,
+      "loss": 0.8377,
+      "step": 3892
+    },
+    {
+      "epoch": 0.2708268113673519,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0017109996809229434,
+      "loss": 0.9026,
+      "step": 3893
+    },
+    {
+      "epoch": 0.2708963790044871,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0017108412104380117,
+      "loss": 1.0206,
+      "step": 3894
+    },
+    {
+      "epoch": 0.2709659466416223,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0017106827038601464,
+      "loss": 1.1209,
+      "step": 3895
+    },
+    {
+      "epoch": 0.2710355142787575,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0017105241611973954,
+      "loss": 1.0785,
+      "step": 3896
+    },
+    {
+      "epoch": 0.2711050819158927,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.001710365582457809,
+      "loss": 1.0814,
+      "step": 3897
+    },
+    {
+      "epoch": 0.27117464955302795,
+      "grad_norm": 1.8828125,
+      "learning_rate": 0.0017102069676494386,
+      "loss": 1.086,
+      "step": 3898
+    },
+    {
+      "epoch": 0.2712442171901631,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0017100483167803381,
+      "loss": 0.8824,
+      "step": 3899
+    },
+    {
+      "epoch": 0.27131378482729834,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0017098896298585631,
+      "loss": 0.927,
+      "step": 3900
+    },
+    {
+      "epoch": 0.27138335246443357,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0017097309068921708,
+      "loss": 1.0531,
+      "step": 3901
+    },
+    {
+      "epoch": 0.27145292010156874,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.00170957214788922,
+      "loss": 1.1599,
+      "step": 3902
+    },
+    {
+      "epoch": 0.27152248773870397,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0017094133528577724,
+      "loss": 1.1276,
+      "step": 3903
+    },
+    {
+      "epoch": 0.27159205537583914,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0017092545218058905,
+      "loss": 0.8651,
+      "step": 3904
+    },
+    {
+      "epoch": 0.27166162301297436,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0017090956547416388,
+      "loss": 1.0644,
+      "step": 3905
+    },
+    {
+      "epoch": 0.2717311906501096,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.001708936751673084,
+      "loss": 1.1555,
+      "step": 3906
+    },
+    {
+      "epoch": 0.27180075828724476,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.001708777812608294,
+      "loss": 1.024,
+      "step": 3907
+    },
+    {
+      "epoch": 0.27187032592438,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0017086188375553394,
+      "loss": 1.188,
+      "step": 3908
+    },
+    {
+      "epoch": 0.27193989356151516,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0017084598265222919,
+      "loss": 0.8403,
+      "step": 3909
+    },
+    {
+      "epoch": 0.2720094611986504,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.0017083007795172251,
+      "loss": 1.0971,
+      "step": 3910
+    },
+    {
+      "epoch": 0.2720790288357856,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.001708141696548215,
+      "loss": 1.1551,
+      "step": 3911
+    },
+    {
+      "epoch": 0.2721485964729208,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.001707982577623339,
+      "loss": 1.1604,
+      "step": 3912
+    },
+    {
+      "epoch": 0.272218164110056,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0017078234227506756,
+      "loss": 0.9916,
+      "step": 3913
+    },
+    {
+      "epoch": 0.27228773174719123,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0017076642319383071,
+      "loss": 0.8347,
+      "step": 3914
+    },
+    {
+      "epoch": 0.2723572993843264,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0017075050051943155,
+      "loss": 0.8493,
+      "step": 3915
+    },
+    {
+      "epoch": 0.2724268670214616,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.001707345742526786,
+      "loss": 1.1102,
+      "step": 3916
+    },
+    {
+      "epoch": 0.2724964346585968,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.001707186443943805,
+      "loss": 0.7921,
+      "step": 3917
+    },
+    {
+      "epoch": 0.272566002295732,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0017070271094534607,
+      "loss": 0.9222,
+      "step": 3918
+    },
+    {
+      "epoch": 0.27263556993286725,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0017068677390638435,
+      "loss": 1.2166,
+      "step": 3919
+    },
+    {
+      "epoch": 0.2727051375700024,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0017067083327830454,
+      "loss": 0.8154,
+      "step": 3920
+    },
+    {
+      "epoch": 0.27277470520713765,
+      "grad_norm": 1.609375,
+      "learning_rate": 0.0017065488906191602,
+      "loss": 1.3034,
+      "step": 3921
+    },
+    {
+      "epoch": 0.2728442728442728,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0017063894125802835,
+      "loss": 1.0304,
+      "step": 3922
+    },
+    {
+      "epoch": 0.27291384048140804,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0017062298986745131,
+      "loss": 0.8899,
+      "step": 3923
+    },
+    {
+      "epoch": 0.27298340811854327,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.001706070348909948,
+      "loss": 0.951,
+      "step": 3924
+    },
+    {
+      "epoch": 0.27305297575567844,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0017059107632946895,
+      "loss": 1.087,
+      "step": 3925
+    },
+    {
+      "epoch": 0.27312254339281367,
+      "grad_norm": 1.7265625,
+      "learning_rate": 0.0017057511418368408,
+      "loss": 1.0523,
+      "step": 3926
+    },
+    {
+      "epoch": 0.2731921110299489,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0017055914845445059,
+      "loss": 0.9484,
+      "step": 3927
+    },
+    {
+      "epoch": 0.27326167866708406,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.001705431791425792,
+      "loss": 1.2164,
+      "step": 3928
+    },
+    {
+      "epoch": 0.2733312463042193,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0017052720624888074,
+      "loss": 0.9607,
+      "step": 3929
+    },
+    {
+      "epoch": 0.27340081394135446,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0017051122977416622,
+      "loss": 0.9486,
+      "step": 3930
+    },
+    {
+      "epoch": 0.2734703815784897,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.0017049524971924686,
+      "loss": 0.8929,
+      "step": 3931
+    },
+    {
+      "epoch": 0.2735399492156249,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0017047926608493404,
+      "loss": 0.9276,
+      "step": 3932
+    },
+    {
+      "epoch": 0.2736095168527601,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0017046327887203937,
+      "loss": 1.1083,
+      "step": 3933
+    },
+    {
+      "epoch": 0.2736790844898953,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0017044728808137451,
+      "loss": 1.0824,
+      "step": 3934
+    },
+    {
+      "epoch": 0.2737486521270305,
+      "grad_norm": 1.6875,
+      "learning_rate": 0.0017043129371375147,
+      "loss": 1.3346,
+      "step": 3935
+    },
+    {
+      "epoch": 0.2738182197641657,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0017041529576998229,
+      "loss": 0.9817,
+      "step": 3936
+    },
+    {
+      "epoch": 0.27388778740130093,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0017039929425087938,
+      "loss": 0.8586,
+      "step": 3937
+    },
+    {
+      "epoch": 0.2739573550384361,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0017038328915725508,
+      "loss": 1.3273,
+      "step": 3938
+    },
+    {
+      "epoch": 0.2740269226755713,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0017036728048992215,
+      "loss": 1.0559,
+      "step": 3939
+    },
+    {
+      "epoch": 0.27409649031270655,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0017035126824969339,
+      "loss": 1.0056,
+      "step": 3940
+    },
+    {
+      "epoch": 0.2741660579498417,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0017033525243738182,
+      "loss": 1.1017,
+      "step": 3941
+    },
+    {
+      "epoch": 0.27423562558697695,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0017031923305380063,
+      "loss": 1.0774,
+      "step": 3942
+    },
+    {
+      "epoch": 0.2743051932241121,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.001703032100997633,
+      "loss": 1.1158,
+      "step": 3943
+    },
+    {
+      "epoch": 0.27437476086124735,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0017028718357608322,
+      "loss": 0.9334,
+      "step": 3944
+    },
+    {
+      "epoch": 0.2744443284983826,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0017027115348357427,
+      "loss": 1.0335,
+      "step": 3945
+    },
+    {
+      "epoch": 0.27451389613551774,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0017025511982305033,
+      "loss": 0.9501,
+      "step": 3946
+    },
+    {
+      "epoch": 0.27458346377265297,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.0017023908259532552,
+      "loss": 1.0681,
+      "step": 3947
+    },
+    {
+      "epoch": 0.27465303140978814,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0017022304180121415,
+      "loss": 1.0523,
+      "step": 3948
+    },
+    {
+      "epoch": 0.27472259904692337,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0017020699744153065,
+      "loss": 1.0081,
+      "step": 3949
+    },
+    {
+      "epoch": 0.2747921666840586,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0017019094951708968,
+      "loss": 0.8158,
+      "step": 3950
+    },
+    {
+      "epoch": 0.27486173432119376,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0017017489802870606,
+      "loss": 1.0486,
+      "step": 3951
+    },
+    {
+      "epoch": 0.274931301958329,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.001701588429771949,
+      "loss": 1.0314,
+      "step": 3952
+    },
+    {
+      "epoch": 0.2750008695954642,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0017014278436337125,
+      "loss": 0.8683,
+      "step": 3953
+    },
+    {
+      "epoch": 0.2750704372325994,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.001701267221880506,
+      "loss": 0.7564,
+      "step": 3954
+    },
+    {
+      "epoch": 0.2751400048697346,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0017011065645204844,
+      "loss": 1.0051,
+      "step": 3955
+    },
+    {
+      "epoch": 0.2752095725068698,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0017009458715618053,
+      "loss": 0.9184,
+      "step": 3956
+    },
+    {
+      "epoch": 0.275279140144005,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0017007851430126278,
+      "loss": 0.9399,
+      "step": 3957
+    },
+    {
+      "epoch": 0.27534870778114023,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.001700624378881113,
+      "loss": 0.9812,
+      "step": 3958
+    },
+    {
+      "epoch": 0.2754182754182754,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0017004635791754237,
+      "loss": 0.985,
+      "step": 3959
+    },
+    {
+      "epoch": 0.27548784305541063,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0017003027439037245,
+      "loss": 0.8774,
+      "step": 3960
+    },
+    {
+      "epoch": 0.2755574106925458,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0017001418730741818,
+      "loss": 0.94,
+      "step": 3961
+    },
+    {
+      "epoch": 0.275626978329681,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0016999809666949637,
+      "loss": 0.8023,
+      "step": 3962
+    },
+    {
+      "epoch": 0.27569654596681625,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0016998200247742403,
+      "loss": 1.077,
+      "step": 3963
+    },
+    {
+      "epoch": 0.2757661136039514,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0016996590473201834,
+      "loss": 0.8863,
+      "step": 3964
+    },
+    {
+      "epoch": 0.27583568124108665,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0016994980343409665,
+      "loss": 1.0414,
+      "step": 3965
+    },
+    {
+      "epoch": 0.2759052488782219,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.001699336985844765,
+      "loss": 0.7603,
+      "step": 3966
+    },
+    {
+      "epoch": 0.27597481651535705,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0016991759018397568,
+      "loss": 0.9134,
+      "step": 3967
+    },
+    {
+      "epoch": 0.2760443841524923,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.00169901478233412,
+      "loss": 1.108,
+      "step": 3968
+    },
+    {
+      "epoch": 0.27611395178962744,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.001698853627336036,
+      "loss": 1.099,
+      "step": 3969
+    },
+    {
+      "epoch": 0.27618351942676267,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0016986924368536872,
+      "loss": 0.981,
+      "step": 3970
+    },
+    {
+      "epoch": 0.2762530870638979,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0016985312108952582,
+      "loss": 0.9792,
+      "step": 3971
+    },
+    {
+      "epoch": 0.27632265470103307,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.001698369949468935,
+      "loss": 0.9086,
+      "step": 3972
+    },
+    {
+      "epoch": 0.2763922223381683,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0016982086525829062,
+      "loss": 1.0202,
+      "step": 3973
+    },
+    {
+      "epoch": 0.27646178997530346,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0016980473202453609,
+      "loss": 1.1561,
+      "step": 3974
+    },
+    {
+      "epoch": 0.2765313576124387,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0016978859524644913,
+      "loss": 0.9192,
+      "step": 3975
+    },
+    {
+      "epoch": 0.2766009252495739,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0016977245492484905,
+      "loss": 1.0597,
+      "step": 3976
+    },
+    {
+      "epoch": 0.2766704928867091,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0016975631106055538,
+      "loss": 0.8171,
+      "step": 3977
+    },
+    {
+      "epoch": 0.2767400605238443,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0016974016365438787,
+      "loss": 0.9362,
+      "step": 3978
+    },
+    {
+      "epoch": 0.27680962816097954,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0016972401270716633,
+      "loss": 1.1577,
+      "step": 3979
+    },
+    {
+      "epoch": 0.2768791957981147,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0016970785821971087,
+      "loss": 0.908,
+      "step": 3980
+    },
+    {
+      "epoch": 0.27694876343524993,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0016969170019284173,
+      "loss": 0.9814,
+      "step": 3981
+    },
+    {
+      "epoch": 0.2770183310723851,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.001696755386273793,
+      "loss": 0.9795,
+      "step": 3982
+    },
+    {
+      "epoch": 0.27708789870952033,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0016965937352414425,
+      "loss": 1.0025,
+      "step": 3983
+    },
+    {
+      "epoch": 0.27715746634665556,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.001696432048839573,
+      "loss": 1.045,
+      "step": 3984
+    },
+    {
+      "epoch": 0.2772270339837907,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0016962703270763941,
+      "loss": 0.9926,
+      "step": 3985
+    },
+    {
+      "epoch": 0.27729660162092595,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.001696108569960118,
+      "loss": 0.9154,
+      "step": 3986
+    },
+    {
+      "epoch": 0.2773661692580611,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.001695946777498957,
+      "loss": 0.8187,
+      "step": 3987
+    },
+    {
+      "epoch": 0.27743573689519635,
+      "grad_norm": 1.65625,
+      "learning_rate": 0.0016957849497011264,
+      "loss": 1.3706,
+      "step": 3988
+    },
+    {
+      "epoch": 0.2775053045323316,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0016956230865748433,
+      "loss": 1.0324,
+      "step": 3989
+    },
+    {
+      "epoch": 0.27757487216946675,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.001695461188128326,
+      "loss": 0.7268,
+      "step": 3990
+    },
+    {
+      "epoch": 0.277644439806602,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.001695299254369795,
+      "loss": 0.9714,
+      "step": 3991
+    },
+    {
+      "epoch": 0.2777140074437372,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0016951372853074723,
+      "loss": 0.8341,
+      "step": 3992
+    },
+    {
+      "epoch": 0.27778357508087237,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.001694975280949582,
+      "loss": 1.0248,
+      "step": 3993
+    },
+    {
+      "epoch": 0.2778531427180076,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00169481324130435,
+      "loss": 1.0205,
+      "step": 3994
+    },
+    {
+      "epoch": 0.27792271035514277,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0016946511663800035,
+      "loss": 0.965,
+      "step": 3995
+    },
+    {
+      "epoch": 0.277992277992278,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0016944890561847723,
+      "loss": 0.9839,
+      "step": 3996
+    },
+    {
+      "epoch": 0.2780618456294132,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0016943269107268873,
+      "loss": 0.8865,
+      "step": 3997
+    },
+    {
+      "epoch": 0.2781314132665484,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0016941647300145813,
+      "loss": 0.9313,
+      "step": 3998
+    },
+    {
+      "epoch": 0.2782009809036836,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0016940025140560894,
+      "loss": 0.9325,
+      "step": 3999
+    },
+    {
+      "epoch": 0.2782705485408188,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0016938402628596477,
+      "loss": 0.8353,
+      "step": 4000
+    },
+    {
+      "epoch": 0.278340116177954,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0016936779764334946,
+      "loss": 0.9596,
+      "step": 4001
+    },
+    {
+      "epoch": 0.27840968381508924,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00169351565478587,
+      "loss": 1.1228,
+      "step": 4002
+    },
+    {
+      "epoch": 0.2784792514522244,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0016933532979250166,
+      "loss": 1.0892,
+      "step": 4003
+    },
+    {
+      "epoch": 0.27854881908935963,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0016931909058591772,
+      "loss": 0.811,
+      "step": 4004
+    },
+    {
+      "epoch": 0.27861838672649486,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0016930284785965975,
+      "loss": 0.9714,
+      "step": 4005
+    },
+    {
+      "epoch": 0.27868795436363003,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.001692866016145525,
+      "loss": 0.9037,
+      "step": 4006
+    },
+    {
+      "epoch": 0.27875752200076526,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0016927035185142084,
+      "loss": 1.0398,
+      "step": 4007
+    },
+    {
+      "epoch": 0.2788270896379004,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0016925409857108985,
+      "loss": 0.9304,
+      "step": 4008
+    },
+    {
+      "epoch": 0.27889665727503565,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0016923784177438482,
+      "loss": 0.962,
+      "step": 4009
+    },
+    {
+      "epoch": 0.2789662249121709,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0016922158146213113,
+      "loss": 0.9493,
+      "step": 4010
+    },
+    {
+      "epoch": 0.27903579254930605,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0016920531763515447,
+      "loss": 0.8456,
+      "step": 4011
+    },
+    {
+      "epoch": 0.2791053601864413,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.001691890502942806,
+      "loss": 0.7361,
+      "step": 4012
+    },
+    {
+      "epoch": 0.27917492782357645,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0016917277944033548,
+      "loss": 1.0424,
+      "step": 4013
+    },
+    {
+      "epoch": 0.2792444954607117,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.001691565050741453,
+      "loss": 0.9785,
+      "step": 4014
+    },
+    {
+      "epoch": 0.2793140630978469,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0016914022719653637,
+      "loss": 0.8496,
+      "step": 4015
+    },
+    {
+      "epoch": 0.27938363073498207,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0016912394580833516,
+      "loss": 0.9109,
+      "step": 4016
+    },
+    {
+      "epoch": 0.2794531983721173,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.0016910766091036843,
+      "loss": 1.1295,
+      "step": 4017
+    },
+    {
+      "epoch": 0.2795227660092525,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0016909137250346298,
+      "loss": 0.9687,
+      "step": 4018
+    },
+    {
+      "epoch": 0.2795923336463877,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0016907508058844588,
+      "loss": 0.9258,
+      "step": 4019
+    },
+    {
+      "epoch": 0.2796619012835229,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0016905878516614437,
+      "loss": 1.2742,
+      "step": 4020
+    },
+    {
+      "epoch": 0.2797314689206581,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0016904248623738584,
+      "loss": 0.7806,
+      "step": 4021
+    },
+    {
+      "epoch": 0.2798010365577933,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0016902618380299783,
+      "loss": 1.0143,
+      "step": 4022
+    },
+    {
+      "epoch": 0.27987060419492854,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.0016900987786380812,
+      "loss": 0.8619,
+      "step": 4023
+    },
+    {
+      "epoch": 0.2799401718320637,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0016899356842064468,
+      "loss": 0.9301,
+      "step": 4024
+    },
+    {
+      "epoch": 0.28000973946919894,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0016897725547433556,
+      "loss": 0.9563,
+      "step": 4025
+    },
+    {
+      "epoch": 0.2800793071063341,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.001689609390257091,
+      "loss": 1.0522,
+      "step": 4026
+    },
+    {
+      "epoch": 0.28014887474346933,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0016894461907559374,
+      "loss": 1.0169,
+      "step": 4027
+    },
+    {
+      "epoch": 0.28021844238060456,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.001689282956248181,
+      "loss": 1.1033,
+      "step": 4028
+    },
+    {
+      "epoch": 0.28028801001773973,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0016891196867421109,
+      "loss": 0.791,
+      "step": 4029
+    },
+    {
+      "epoch": 0.28035757765487496,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0016889563822460158,
+      "loss": 1.0634,
+      "step": 4030
+    },
+    {
+      "epoch": 0.2804271452920102,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.001688793042768189,
+      "loss": 1.2211,
+      "step": 4031
+    },
+    {
+      "epoch": 0.28049671292914535,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0016886296683169227,
+      "loss": 1.0032,
+      "step": 4032
+    },
+    {
+      "epoch": 0.2805662805662806,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.001688466258900513,
+      "loss": 1.1016,
+      "step": 4033
+    },
+    {
+      "epoch": 0.28063584820341575,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0016883028145272567,
+      "loss": 0.8212,
+      "step": 4034
+    },
+    {
+      "epoch": 0.280705415840551,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0016881393352054528,
+      "loss": 1.1214,
+      "step": 4035
+    },
+    {
+      "epoch": 0.2807749834776862,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0016879758209434022,
+      "loss": 0.9552,
+      "step": 4036
+    },
+    {
+      "epoch": 0.2808445511148214,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0016878122717494067,
+      "loss": 0.7728,
+      "step": 4037
+    },
+    {
+      "epoch": 0.2809141187519566,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0016876486876317711,
+      "loss": 1.2132,
+      "step": 4038
+    },
+    {
+      "epoch": 0.28098368638909177,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.001687485068598801,
+      "loss": 1.1734,
+      "step": 4039
+    },
+    {
+      "epoch": 0.281053254026227,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0016873214146588046,
+      "loss": 0.7552,
+      "step": 4040
+    },
+    {
+      "epoch": 0.2811228216633622,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0016871577258200908,
+      "loss": 0.9213,
+      "step": 4041
+    },
+    {
+      "epoch": 0.2811923893004974,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0016869940020909713,
+      "loss": 0.8805,
+      "step": 4042
+    },
+    {
+      "epoch": 0.2812619569376326,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0016868302434797592,
+      "loss": 0.9129,
+      "step": 4043
+    },
+    {
+      "epoch": 0.28133152457476784,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0016866664499947687,
+      "loss": 0.9635,
+      "step": 4044
+    },
+    {
+      "epoch": 0.281401092211903,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0016865026216443177,
+      "loss": 1.1198,
+      "step": 4045
+    },
+    {
+      "epoch": 0.28147065984903824,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0016863387584367233,
+      "loss": 0.8736,
+      "step": 4046
+    },
+    {
+      "epoch": 0.2815402274861734,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0016861748603803062,
+      "loss": 1.2039,
+      "step": 4047
+    },
+    {
+      "epoch": 0.28160979512330864,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0016860109274833884,
+      "loss": 1.3537,
+      "step": 4048
+    },
+    {
+      "epoch": 0.28167936276044386,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0016858469597542936,
+      "loss": 0.9092,
+      "step": 4049
+    },
+    {
+      "epoch": 0.28174893039757903,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0016856829572013468,
+      "loss": 1.0306,
+      "step": 4050
+    },
+    {
+      "epoch": 0.28181849803471426,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0016855189198328757,
+      "loss": 0.9224,
+      "step": 4051
+    },
+    {
+      "epoch": 0.28188806567184943,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0016853548476572092,
+      "loss": 0.9718,
+      "step": 4052
+    },
+    {
+      "epoch": 0.28195763330898466,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0016851907406826776,
+      "loss": 1.1605,
+      "step": 4053
+    },
+    {
+      "epoch": 0.2820272009461199,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.001685026598917614,
+      "loss": 1.1018,
+      "step": 4054
+    },
+    {
+      "epoch": 0.28209676858325505,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0016848624223703527,
+      "loss": 0.8536,
+      "step": 4055
+    },
+    {
+      "epoch": 0.2821663362203903,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0016846982110492292,
+      "loss": 0.7116,
+      "step": 4056
+    },
+    {
+      "epoch": 0.2822359038575255,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0016845339649625818,
+      "loss": 0.7503,
+      "step": 4057
+    },
+    {
+      "epoch": 0.2823054714946607,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0016843696841187504,
+      "loss": 0.9039,
+      "step": 4058
+    },
+    {
+      "epoch": 0.2823750391317959,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0016842053685260754,
+      "loss": 0.9546,
+      "step": 4059
+    },
+    {
+      "epoch": 0.2824446067689311,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0016840410181929006,
+      "loss": 1.0785,
+      "step": 4060
+    },
+    {
+      "epoch": 0.2825141744060663,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.001683876633127571,
+      "loss": 0.7847,
+      "step": 4061
+    },
+    {
+      "epoch": 0.2825837420432015,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0016837122133384326,
+      "loss": 1.0703,
+      "step": 4062
+    },
+    {
+      "epoch": 0.2826533096803367,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.001683547758833834,
+      "loss": 0.9288,
+      "step": 4063
+    },
+    {
+      "epoch": 0.2827228773174719,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0016833832696221262,
+      "loss": 1.0972,
+      "step": 4064
+    },
+    {
+      "epoch": 0.2827924449546071,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.00168321874571166,
+      "loss": 1.1096,
+      "step": 4065
+    },
+    {
+      "epoch": 0.2828620125917423,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0016830541871107893,
+      "loss": 1.0317,
+      "step": 4066
+    },
+    {
+      "epoch": 0.28293158022887754,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0016828895938278703,
+      "loss": 0.7199,
+      "step": 4067
+    },
+    {
+      "epoch": 0.2830011478660127,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0016827249658712597,
+      "loss": 0.9799,
+      "step": 4068
+    },
+    {
+      "epoch": 0.28307071550314794,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0016825603032493163,
+      "loss": 1.0341,
+      "step": 4069
+    },
+    {
+      "epoch": 0.28314028314028317,
+      "grad_norm": 1.5546875,
+      "learning_rate": 0.0016823956059704012,
+      "loss": 0.9213,
+      "step": 4070
+    },
+    {
+      "epoch": 0.28320985077741834,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.001682230874042877,
+      "loss": 1.188,
+      "step": 4071
+    },
+    {
+      "epoch": 0.28327941841455356,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0016820661074751074,
+      "loss": 1.047,
+      "step": 4072
+    },
+    {
+      "epoch": 0.28334898605168873,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0016819013062754587,
+      "loss": 0.8298,
+      "step": 4073
+    },
+    {
+      "epoch": 0.28341855368882396,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0016817364704522987,
+      "loss": 0.9393,
+      "step": 4074
+    },
+    {
+      "epoch": 0.2834881213259592,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0016815716000139972,
+      "loss": 1.2058,
+      "step": 4075
+    },
+    {
+      "epoch": 0.28355768896309436,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0016814066949689252,
+      "loss": 1.0323,
+      "step": 4076
+    },
+    {
+      "epoch": 0.2836272566002296,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0016812417553254556,
+      "loss": 1.2022,
+      "step": 4077
+    },
+    {
+      "epoch": 0.28369682423736475,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0016810767810919633,
+      "loss": 0.9194,
+      "step": 4078
+    },
+    {
+      "epoch": 0.2837663918745,
+      "grad_norm": 1.125,
+      "learning_rate": 0.001680911772276825,
+      "loss": 0.8541,
+      "step": 4079
+    },
+    {
+      "epoch": 0.2838359595116352,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0016807467288884191,
+      "loss": 0.9392,
+      "step": 4080
+    },
+    {
+      "epoch": 0.2839055271487704,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0016805816509351255,
+      "loss": 1.1768,
+      "step": 4081
+    },
+    {
+      "epoch": 0.2839750947859056,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.001680416538425326,
+      "loss": 1.1473,
+      "step": 4082
+    },
+    {
+      "epoch": 0.28404466242304083,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0016802513913674042,
+      "loss": 0.9821,
+      "step": 4083
+    },
+    {
+      "epoch": 0.284114230060176,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0016800862097697453,
+      "loss": 0.9646,
+      "step": 4084
+    },
+    {
+      "epoch": 0.2841837976973112,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0016799209936407369,
+      "loss": 1.1557,
+      "step": 4085
+    },
+    {
+      "epoch": 0.2842533653344464,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0016797557429887673,
+      "loss": 0.8796,
+      "step": 4086
+    },
+    {
+      "epoch": 0.2843229329715816,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0016795904578222275,
+      "loss": 0.7061,
+      "step": 4087
+    },
+    {
+      "epoch": 0.28439250060871685,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0016794251381495094,
+      "loss": 1.0073,
+      "step": 4088
+    },
+    {
+      "epoch": 0.284462068245852,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0016792597839790074,
+      "loss": 0.8033,
+      "step": 4089
+    },
+    {
+      "epoch": 0.28453163588298724,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0016790943953191174,
+      "loss": 1.1437,
+      "step": 4090
+    },
+    {
+      "epoch": 0.2846012035201224,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0016789289721782367,
+      "loss": 0.9835,
+      "step": 4091
+    },
+    {
+      "epoch": 0.28467077115725764,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0016787635145647651,
+      "loss": 1.0082,
+      "step": 4092
+    },
+    {
+      "epoch": 0.28474033879439287,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.0016785980224871032,
+      "loss": 0.8653,
+      "step": 4093
+    },
+    {
+      "epoch": 0.28480990643152804,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0016784324959536541,
+      "loss": 0.6891,
+      "step": 4094
+    },
+    {
+      "epoch": 0.28487947406866326,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0016782669349728226,
+      "loss": 1.1953,
+      "step": 4095
+    },
+    {
+      "epoch": 0.2849490417057985,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0016781013395530148,
+      "loss": 0.8658,
+      "step": 4096
+    },
+    {
+      "epoch": 0.28501860934293366,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0016779357097026389,
+      "loss": 0.8288,
+      "step": 4097
+    },
+    {
+      "epoch": 0.2850881769800689,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0016777700454301046,
+      "loss": 0.7104,
+      "step": 4098
+    },
+    {
+      "epoch": 0.28515774461720406,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0016776043467438236,
+      "loss": 0.9844,
+      "step": 4099
+    },
+    {
+      "epoch": 0.2852273122543393,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0016774386136522092,
+      "loss": 0.9912,
+      "step": 4100
+    },
+    {
+      "epoch": 0.2852968798914745,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0016772728461636767,
+      "loss": 0.9511,
+      "step": 4101
+    },
+    {
+      "epoch": 0.2853664475286097,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0016771070442866427,
+      "loss": 0.8666,
+      "step": 4102
+    },
+    {
+      "epoch": 0.2854360151657449,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.001676941208029526,
+      "loss": 1.2521,
+      "step": 4103
+    },
+    {
+      "epoch": 0.2855055828028801,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0016767753374007466,
+      "loss": 1.1681,
+      "step": 4104
+    },
+    {
+      "epoch": 0.2855751504400153,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.001676609432408727,
+      "loss": 0.8885,
+      "step": 4105
+    },
+    {
+      "epoch": 0.28564471807715053,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.001676443493061891,
+      "loss": 0.9408,
+      "step": 4106
+    },
+    {
+      "epoch": 0.2857142857142857,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0016762775193686632,
+      "loss": 1.078,
+      "step": 4107
+    },
+    {
+      "epoch": 0.2857838533514209,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.001676111511337472,
+      "loss": 1.1927,
+      "step": 4108
+    },
+    {
+      "epoch": 0.28585342098855615,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0016759454689767464,
+      "loss": 1.0382,
+      "step": 4109
+    },
+    {
+      "epoch": 0.2859229886256913,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.0016757793922949165,
+      "loss": 0.9997,
+      "step": 4110
+    },
+    {
+      "epoch": 0.28599255626282655,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.0016756132813004153,
+      "loss": 1.2973,
+      "step": 4111
+    },
+    {
+      "epoch": 0.2860621238999617,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0016754471360016772,
+      "loss": 1.0318,
+      "step": 4112
+    },
+    {
+      "epoch": 0.28613169153709694,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.001675280956407138,
+      "loss": 0.8628,
+      "step": 4113
+    },
+    {
+      "epoch": 0.28620125917423217,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0016751147425252354,
+      "loss": 0.9316,
+      "step": 4114
+    },
+    {
+      "epoch": 0.28627082681136734,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.001674948494364409,
+      "loss": 0.772,
+      "step": 4115
+    },
+    {
+      "epoch": 0.28634039444850257,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.0016747822119331003,
+      "loss": 1.4195,
+      "step": 4116
+    },
+    {
+      "epoch": 0.28640996208563774,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0016746158952397519,
+      "loss": 1.1103,
+      "step": 4117
+    },
+    {
+      "epoch": 0.28647952972277296,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0016744495442928085,
+      "loss": 0.9768,
+      "step": 4118
+    },
+    {
+      "epoch": 0.2865490973599082,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0016742831591007171,
+      "loss": 1.1117,
+      "step": 4119
+    },
+    {
+      "epoch": 0.28661866499704336,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.001674116739671925,
+      "loss": 0.9725,
+      "step": 4120
+    },
+    {
+      "epoch": 0.2866882326341786,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.001673950286014883,
+      "loss": 1.0254,
+      "step": 4121
+    },
+    {
+      "epoch": 0.2867578002713138,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.001673783798138042,
+      "loss": 1.1598,
+      "step": 4122
+    },
+    {
+      "epoch": 0.286827367908449,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0016736172760498564,
+      "loss": 1.0397,
+      "step": 4123
+    },
+    {
+      "epoch": 0.2868969355455842,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0016734507197587807,
+      "loss": 1.1071,
+      "step": 4124
+    },
+    {
+      "epoch": 0.2869665031827194,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.001673284129273272,
+      "loss": 1.0425,
+      "step": 4125
+    },
+    {
+      "epoch": 0.2870360708198546,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0016731175046017883,
+      "loss": 0.944,
+      "step": 4126
+    },
+    {
+      "epoch": 0.28710563845698983,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0016729508457527908,
+      "loss": 0.878,
+      "step": 4127
+    },
+    {
+      "epoch": 0.287175206094125,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0016727841527347414,
+      "loss": 0.9369,
+      "step": 4128
+    },
+    {
+      "epoch": 0.28724477373126023,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0016726174255561035,
+      "loss": 0.896,
+      "step": 4129
+    },
+    {
+      "epoch": 0.2873143413683954,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0016724506642253432,
+      "loss": 0.885,
+      "step": 4130
+    },
+    {
+      "epoch": 0.2873839090055306,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0016722838687509276,
+      "loss": 1.1057,
+      "step": 4131
+    },
+    {
+      "epoch": 0.28745347664266585,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0016721170391413257,
+      "loss": 0.7977,
+      "step": 4132
+    },
+    {
+      "epoch": 0.287523044279801,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0016719501754050082,
+      "loss": 0.8088,
+      "step": 4133
+    },
+    {
+      "epoch": 0.28759261191693625,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0016717832775504475,
+      "loss": 0.9313,
+      "step": 4134
+    },
+    {
+      "epoch": 0.2876621795540715,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0016716163455861182,
+      "loss": 0.9587,
+      "step": 4135
+    },
+    {
+      "epoch": 0.28773174719120664,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0016714493795204962,
+      "loss": 1.2193,
+      "step": 4136
+    },
+    {
+      "epoch": 0.28780131482834187,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0016712823793620588,
+      "loss": 1.0766,
+      "step": 4137
+    },
+    {
+      "epoch": 0.28787088246547704,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.001671115345119286,
+      "loss": 0.92,
+      "step": 4138
+    },
+    {
+      "epoch": 0.28794045010261227,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0016709482768006584,
+      "loss": 0.821,
+      "step": 4139
+    },
+    {
+      "epoch": 0.2880100177397475,
+      "grad_norm": 1.125,
+      "learning_rate": 0.001670781174414659,
+      "loss": 1.1517,
+      "step": 4140
+    },
+    {
+      "epoch": 0.28807958537688266,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0016706140379697727,
+      "loss": 0.9332,
+      "step": 4141
+    },
+    {
+      "epoch": 0.2881491530140179,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.001670446867474486,
+      "loss": 0.9166,
+      "step": 4142
+    },
+    {
+      "epoch": 0.28821872065115306,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0016702796629372862,
+      "loss": 0.9084,
+      "step": 4143
+    },
+    {
+      "epoch": 0.2882882882882883,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0016701124243666636,
+      "loss": 0.6126,
+      "step": 4144
+    },
+    {
+      "epoch": 0.2883578559254235,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0016699451517711102,
+      "loss": 0.9703,
+      "step": 4145
+    },
+    {
+      "epoch": 0.2884274235625587,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0016697778451591184,
+      "loss": 0.8664,
+      "step": 4146
+    },
+    {
+      "epoch": 0.2884969911996939,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0016696105045391836,
+      "loss": 0.614,
+      "step": 4147
+    },
+    {
+      "epoch": 0.28856655883682913,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0016694431299198024,
+      "loss": 1.1663,
+      "step": 4148
+    },
+    {
+      "epoch": 0.2886361264739643,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0016692757213094733,
+      "loss": 1.2911,
+      "step": 4149
+    },
+    {
+      "epoch": 0.28870569411109953,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0016691082787166967,
+      "loss": 0.8398,
+      "step": 4150
+    },
+    {
+      "epoch": 0.2887752617482347,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.001668940802149974,
+      "loss": 0.9779,
+      "step": 4151
+    },
+    {
+      "epoch": 0.28884482938536993,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0016687732916178092,
+      "loss": 1.0343,
+      "step": 4152
+    },
+    {
+      "epoch": 0.28891439702250515,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0016686057471287077,
+      "loss": 1.0199,
+      "step": 4153
+    },
+    {
+      "epoch": 0.2889839646596403,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0016684381686911762,
+      "loss": 0.8642,
+      "step": 4154
+    },
+    {
+      "epoch": 0.28905353229677555,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0016682705563137237,
+      "loss": 0.954,
+      "step": 4155
+    },
+    {
+      "epoch": 0.2891230999339107,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0016681029100048606,
+      "loss": 1.064,
+      "step": 4156
+    },
+    {
+      "epoch": 0.28919266757104595,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0016679352297730991,
+      "loss": 1.0502,
+      "step": 4157
+    },
+    {
+      "epoch": 0.2892622352081812,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0016677675156269536,
+      "loss": 1.0124,
+      "step": 4158
+    },
+    {
+      "epoch": 0.28933180284531634,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0016675997675749392,
+      "loss": 1.1415,
+      "step": 4159
+    },
+    {
+      "epoch": 0.28940137048245157,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0016674319856255738,
+      "loss": 0.9647,
+      "step": 4160
+    },
+    {
+      "epoch": 0.2894709381195868,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0016672641697873761,
+      "loss": 0.9417,
+      "step": 4161
+    },
+    {
+      "epoch": 0.28954050575672197,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0016670963200688669,
+      "loss": 0.8899,
+      "step": 4162
+    },
+    {
+      "epoch": 0.2896100733938572,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0016669284364785692,
+      "loss": 1.1638,
+      "step": 4163
+    },
+    {
+      "epoch": 0.28967964103099236,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0016667605190250072,
+      "loss": 1.2574,
+      "step": 4164
+    },
+    {
+      "epoch": 0.2897492086681276,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0016665925677167067,
+      "loss": 0.9816,
+      "step": 4165
+    },
+    {
+      "epoch": 0.2898187763052628,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0016664245825621954,
+      "loss": 0.9461,
+      "step": 4166
+    },
+    {
+      "epoch": 0.289888343942398,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0016662565635700028,
+      "loss": 0.8763,
+      "step": 4167
+    },
+    {
+      "epoch": 0.2899579115795332,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0016660885107486606,
+      "loss": 0.8769,
+      "step": 4168
+    },
+    {
+      "epoch": 0.2900274792166684,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0016659204241067003,
+      "loss": 1.0802,
+      "step": 4169
+    },
+    {
+      "epoch": 0.2900970468538036,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.001665752303652658,
+      "loss": 0.9512,
+      "step": 4170
+    },
+    {
+      "epoch": 0.29016661449093883,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.001665584149395069,
+      "loss": 0.9257,
+      "step": 4171
+    },
+    {
+      "epoch": 0.290236182128074,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0016654159613424717,
+      "loss": 0.9806,
+      "step": 4172
+    },
+    {
+      "epoch": 0.29030574976520923,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0016652477395034061,
+      "loss": 1.2123,
+      "step": 4173
+    },
+    {
+      "epoch": 0.2903753174023444,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0016650794838864132,
+      "loss": 1.0177,
+      "step": 4174
+    },
+    {
+      "epoch": 0.29044488503947963,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0016649111945000363,
+      "loss": 0.8756,
+      "step": 4175
+    },
+    {
+      "epoch": 0.29051445267661485,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0016647428713528205,
+      "loss": 0.9949,
+      "step": 4176
+    },
+    {
+      "epoch": 0.29058402031375,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0016645745144533122,
+      "loss": 1.2037,
+      "step": 4177
+    },
+    {
+      "epoch": 0.29065358795088525,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0016644061238100596,
+      "loss": 0.8776,
+      "step": 4178
+    },
+    {
+      "epoch": 0.2907231555880205,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0016642376994316132,
+      "loss": 1.0536,
+      "step": 4179
+    },
+    {
+      "epoch": 0.29079272322515565,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.001664069241326524,
+      "loss": 1.0989,
+      "step": 4180
+    },
+    {
+      "epoch": 0.2908622908622909,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.0016639007495033462,
+      "loss": 1.1989,
+      "step": 4181
+    },
+    {
+      "epoch": 0.29093185849942604,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0016637322239706348,
+      "loss": 0.9185,
+      "step": 4182
+    },
+    {
+      "epoch": 0.29100142613656127,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0016635636647369463,
+      "loss": 0.8248,
+      "step": 4183
+    },
+    {
+      "epoch": 0.2910709937736965,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0016633950718108394,
+      "loss": 0.8318,
+      "step": 4184
+    },
+    {
+      "epoch": 0.29114056141083167,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0016632264452008747,
+      "loss": 0.8321,
+      "step": 4185
+    },
+    {
+      "epoch": 0.2912101290479669,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0016630577849156142,
+      "loss": 1.1485,
+      "step": 4186
+    },
+    {
+      "epoch": 0.29127969668510206,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.001662889090963621,
+      "loss": 1.0078,
+      "step": 4187
+    },
+    {
+      "epoch": 0.2913492643222373,
+      "grad_norm": 1.125,
+      "learning_rate": 0.001662720363353461,
+      "loss": 1.0449,
+      "step": 4188
+    },
+    {
+      "epoch": 0.2914188319593725,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0016625516020937015,
+      "loss": 0.8293,
+      "step": 4189
+    },
+    {
+      "epoch": 0.2914883995965077,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0016623828071929113,
+      "loss": 0.7138,
+      "step": 4190
+    },
+    {
+      "epoch": 0.2915579672336429,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0016622139786596603,
+      "loss": 0.9308,
+      "step": 4191
+    },
+    {
+      "epoch": 0.29162753487077814,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0016620451165025218,
+      "loss": 1.4084,
+      "step": 4192
+    },
+    {
+      "epoch": 0.2916971025079133,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.001661876220730069,
+      "loss": 0.8537,
+      "step": 4193
+    },
+    {
+      "epoch": 0.29176667014504853,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.001661707291350878,
+      "loss": 0.8307,
+      "step": 4194
+    },
+    {
+      "epoch": 0.2918362377821837,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.0016615383283735256,
+      "loss": 0.797,
+      "step": 4195
+    },
+    {
+      "epoch": 0.29190580541931893,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0016613693318065917,
+      "loss": 0.8068,
+      "step": 4196
+    },
+    {
+      "epoch": 0.29197537305645416,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0016612003016586562,
+      "loss": 1.0044,
+      "step": 4197
+    },
+    {
+      "epoch": 0.2920449406935893,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0016610312379383028,
+      "loss": 0.9487,
+      "step": 4198
+    },
+    {
+      "epoch": 0.29211450833072455,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0016608621406541144,
+      "loss": 0.7799,
+      "step": 4199
+    },
+    {
+      "epoch": 0.2921840759678597,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0016606930098146777,
+      "loss": 1.0704,
+      "step": 4200
+    },
+    {
+      "epoch": 0.29225364360499495,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0016605238454285801,
+      "loss": 1.0219,
+      "step": 4201
+    },
+    {
+      "epoch": 0.2923232112421302,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.001660354647504411,
+      "loss": 1.1202,
+      "step": 4202
+    },
+    {
+      "epoch": 0.29239277887926535,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0016601854160507613,
+      "loss": 0.8611,
+      "step": 4203
+    },
+    {
+      "epoch": 0.2924623465164006,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0016600161510762232,
+      "loss": 0.8448,
+      "step": 4204
+    },
+    {
+      "epoch": 0.2925319141535358,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0016598468525893923,
+      "loss": 0.7808,
+      "step": 4205
+    },
+    {
+      "epoch": 0.29260148179067097,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.001659677520598864,
+      "loss": 1.0908,
+      "step": 4206
+    },
+    {
+      "epoch": 0.2926710494278062,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0016595081551132364,
+      "loss": 0.6729,
+      "step": 4207
+    },
+    {
+      "epoch": 0.29274061706494137,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0016593387561411085,
+      "loss": 1.0274,
+      "step": 4208
+    },
+    {
+      "epoch": 0.2928101847020766,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0016591693236910818,
+      "loss": 1.0735,
+      "step": 4209
+    },
+    {
+      "epoch": 0.2928797523392118,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0016589998577717596,
+      "loss": 0.8966,
+      "step": 4210
+    },
+    {
+      "epoch": 0.292949319976347,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0016588303583917462,
+      "loss": 1.036,
+      "step": 4211
+    },
+    {
+      "epoch": 0.2930188876134822,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0016586608255596477,
+      "loss": 1.0781,
+      "step": 4212
+    },
+    {
+      "epoch": 0.2930884552506174,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0016584912592840727,
+      "loss": 1.0266,
+      "step": 4213
+    },
+    {
+      "epoch": 0.2931580228877526,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0016583216595736304,
+      "loss": 0.8677,
+      "step": 4214
+    },
+    {
+      "epoch": 0.29322759052488784,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0016581520264369325,
+      "loss": 0.9807,
+      "step": 4215
+    },
+    {
+      "epoch": 0.293297158162023,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.001657982359882592,
+      "loss": 0.7129,
+      "step": 4216
+    },
+    {
+      "epoch": 0.29336672579915823,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.0016578126599192237,
+      "loss": 1.0546,
+      "step": 4217
+    },
+    {
+      "epoch": 0.29343629343629346,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.001657642926555444,
+      "loss": 0.8812,
+      "step": 4218
+    },
+    {
+      "epoch": 0.29350586107342863,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0016574731597998715,
+      "loss": 0.9671,
+      "step": 4219
+    },
+    {
+      "epoch": 0.29357542871056386,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.001657303359661126,
+      "loss": 1.1712,
+      "step": 4220
+    },
+    {
+      "epoch": 0.293644996347699,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.001657133526147829,
+      "loss": 1.011,
+      "step": 4221
+    },
+    {
+      "epoch": 0.29371456398483425,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0016569636592686033,
+      "loss": 1.0849,
+      "step": 4222
+    },
+    {
+      "epoch": 0.2937841316219695,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0016567937590320745,
+      "loss": 1.1929,
+      "step": 4223
+    },
+    {
+      "epoch": 0.29385369925910465,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0016566238254468691,
+      "loss": 0.9215,
+      "step": 4224
+    },
+    {
+      "epoch": 0.2939232668962399,
+      "grad_norm": 1.5703125,
+      "learning_rate": 0.0016564538585216153,
+      "loss": 1.2911,
+      "step": 4225
+    },
+    {
+      "epoch": 0.29399283453337505,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0016562838582649439,
+      "loss": 0.9647,
+      "step": 4226
+    },
+    {
+      "epoch": 0.2940624021705103,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0016561138246854853,
+      "loss": 1.0962,
+      "step": 4227
+    },
+    {
+      "epoch": 0.2941319698076455,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0016559437577918744,
+      "loss": 1.0047,
+      "step": 4228
+    },
+    {
+      "epoch": 0.29420153744478067,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0016557736575927454,
+      "loss": 0.8579,
+      "step": 4229
+    },
+    {
+      "epoch": 0.2942711050819159,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0016556035240967355,
+      "loss": 0.9159,
+      "step": 4230
+    },
+    {
+      "epoch": 0.2943406727190511,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0016554333573124832,
+      "loss": 0.8801,
+      "step": 4231
+    },
+    {
+      "epoch": 0.2944102403561863,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0016552631572486283,
+      "loss": 0.8854,
+      "step": 4232
+    },
+    {
+      "epoch": 0.2944798079933215,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0016550929239138132,
+      "loss": 0.8029,
+      "step": 4233
+    },
+    {
+      "epoch": 0.2945493756304567,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0016549226573166816,
+      "loss": 0.7103,
+      "step": 4234
+    },
+    {
+      "epoch": 0.2946189432675919,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0016547523574658783,
+      "loss": 0.858,
+      "step": 4235
+    },
+    {
+      "epoch": 0.29468851090472714,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0016545820243700504,
+      "loss": 0.8784,
+      "step": 4236
+    },
+    {
+      "epoch": 0.2947580785418623,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.001654411658037847,
+      "loss": 0.8463,
+      "step": 4237
+    },
+    {
+      "epoch": 0.29482764617899754,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0016542412584779175,
+      "loss": 0.9727,
+      "step": 4238
+    },
+    {
+      "epoch": 0.2948972138161327,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.001654070825698915,
+      "loss": 0.9683,
+      "step": 4239
+    },
+    {
+      "epoch": 0.29496678145326793,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0016539003597094927,
+      "loss": 1.2157,
+      "step": 4240
+    },
+    {
+      "epoch": 0.29503634909040316,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0016537298605183058,
+      "loss": 1.0825,
+      "step": 4241
+    },
+    {
+      "epoch": 0.29510591672753833,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0016535593281340117,
+      "loss": 1.0039,
+      "step": 4242
+    },
+    {
+      "epoch": 0.29517548436467356,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0016533887625652692,
+      "loss": 0.8681,
+      "step": 4243
+    },
+    {
+      "epoch": 0.2952450520018088,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0016532181638207386,
+      "loss": 0.7514,
+      "step": 4244
+    },
+    {
+      "epoch": 0.29531461963894395,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.001653047531909082,
+      "loss": 1.2113,
+      "step": 4245
+    },
+    {
+      "epoch": 0.2953841872760792,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0016528768668389636,
+      "loss": 1.1541,
+      "step": 4246
+    },
+    {
+      "epoch": 0.29545375491321435,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0016527061686190485,
+      "loss": 1.0614,
+      "step": 4247
+    },
+    {
+      "epoch": 0.2955233225503496,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.001652535437258004,
+      "loss": 0.9572,
+      "step": 4248
+    },
+    {
+      "epoch": 0.2955928901874848,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.0016523646727644992,
+      "loss": 0.9534,
+      "step": 4249
+    },
+    {
+      "epoch": 0.29566245782462,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.001652193875147204,
+      "loss": 1.1222,
+      "step": 4250
+    },
+    {
+      "epoch": 0.2957320254617552,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0016520230444147916,
+      "loss": 1.1133,
+      "step": 4251
+    },
+    {
+      "epoch": 0.29580159309889037,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0016518521805759352,
+      "loss": 1.1543,
+      "step": 4252
+    },
+    {
+      "epoch": 0.2958711607360256,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.001651681283639311,
+      "loss": 0.9325,
+      "step": 4253
+    },
+    {
+      "epoch": 0.2959407283731608,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0016515103536135956,
+      "loss": 1.1222,
+      "step": 4254
+    },
+    {
+      "epoch": 0.296010296010296,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0016513393905074683,
+      "loss": 0.9288,
+      "step": 4255
+    },
+    {
+      "epoch": 0.2960798636474312,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00165116839432961,
+      "loss": 0.8688,
+      "step": 4256
+    },
+    {
+      "epoch": 0.29614943128456644,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0016509973650887023,
+      "loss": 0.9634,
+      "step": 4257
+    },
+    {
+      "epoch": 0.2962189989217016,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0016508263027934303,
+      "loss": 0.9483,
+      "step": 4258
+    },
+    {
+      "epoch": 0.29628856655883684,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0016506552074524784,
+      "loss": 1.0677,
+      "step": 4259
+    },
+    {
+      "epoch": 0.296358134195972,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.001650484079074535,
+      "loss": 0.8843,
+      "step": 4260
+    },
+    {
+      "epoch": 0.29642770183310724,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0016503129176682887,
+      "loss": 1.0493,
+      "step": 4261
+    },
+    {
+      "epoch": 0.29649726947024246,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.00165014172324243,
+      "loss": 1.1126,
+      "step": 4262
+    },
+    {
+      "epoch": 0.29656683710737763,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0016499704958056521,
+      "loss": 1.1817,
+      "step": 4263
+    },
+    {
+      "epoch": 0.29663640474451286,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.001649799235366648,
+      "loss": 0.7771,
+      "step": 4264
+    },
+    {
+      "epoch": 0.29670597238164803,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0016496279419341143,
+      "loss": 1.1077,
+      "step": 4265
+    },
+    {
+      "epoch": 0.29677554001878326,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.001649456615516748,
+      "loss": 1.017,
+      "step": 4266
+    },
+    {
+      "epoch": 0.2968451076559185,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0016492852561232482,
+      "loss": 1.0378,
+      "step": 4267
+    },
+    {
+      "epoch": 0.29691467529305365,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0016491138637623156,
+      "loss": 0.8971,
+      "step": 4268
+    },
+    {
+      "epoch": 0.2969842429301889,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0016489424384426529,
+      "loss": 1.1585,
+      "step": 4269
+    },
+    {
+      "epoch": 0.2970538105673241,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.001648770980172964,
+      "loss": 0.9772,
+      "step": 4270
+    },
+    {
+      "epoch": 0.2971233782044593,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0016485994889619549,
+      "loss": 0.9343,
+      "step": 4271
+    },
+    {
+      "epoch": 0.2971929458415945,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0016484279648183331,
+      "loss": 0.8485,
+      "step": 4272
+    },
+    {
+      "epoch": 0.2972625134787297,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0016482564077508074,
+      "loss": 0.9716,
+      "step": 4273
+    },
+    {
+      "epoch": 0.2973320811158649,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0016480848177680887,
+      "loss": 0.9989,
+      "step": 4274
+    },
+    {
+      "epoch": 0.2974016487530001,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0016479131948788895,
+      "loss": 0.735,
+      "step": 4275
+    },
+    {
+      "epoch": 0.2974712163901353,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.001647741539091924,
+      "loss": 1.0061,
+      "step": 4276
+    },
+    {
+      "epoch": 0.2975407840272705,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0016475698504159083,
+      "loss": 0.7155,
+      "step": 4277
+    },
+    {
+      "epoch": 0.2976103516644057,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.0016473981288595589,
+      "loss": 1.0526,
+      "step": 4278
+    },
+    {
+      "epoch": 0.2976799193015409,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0016472263744315963,
+      "loss": 0.819,
+      "step": 4279
+    },
+    {
+      "epoch": 0.29774948693867614,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0016470545871407405,
+      "loss": 1.3974,
+      "step": 4280
+    },
+    {
+      "epoch": 0.2978190545758113,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0016468827669957142,
+      "loss": 0.959,
+      "step": 4281
+    },
+    {
+      "epoch": 0.29788862221294654,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0016467109140052415,
+      "loss": 1.0633,
+      "step": 4282
+    },
+    {
+      "epoch": 0.29795818985008177,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.001646539028178048,
+      "loss": 0.8363,
+      "step": 4283
+    },
+    {
+      "epoch": 0.29802775748721694,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0016463671095228618,
+      "loss": 0.9048,
+      "step": 4284
+    },
+    {
+      "epoch": 0.29809732512435216,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0016461951580484116,
+      "loss": 1.3229,
+      "step": 4285
+    },
+    {
+      "epoch": 0.29816689276148733,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0016460231737634283,
+      "loss": 1.0129,
+      "step": 4286
+    },
+    {
+      "epoch": 0.29823646039862256,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0016458511566766446,
+      "loss": 0.7665,
+      "step": 4287
+    },
+    {
+      "epoch": 0.2983060280357578,
+      "grad_norm": 1.6015625,
+      "learning_rate": 0.0016456791067967942,
+      "loss": 1.1054,
+      "step": 4288
+    },
+    {
+      "epoch": 0.29837559567289296,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0016455070241326133,
+      "loss": 1.0423,
+      "step": 4289
+    },
+    {
+      "epoch": 0.2984451633100282,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0016453349086928395,
+      "loss": 0.8518,
+      "step": 4290
+    },
+    {
+      "epoch": 0.29851473094716335,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0016451627604862115,
+      "loss": 0.7496,
+      "step": 4291
+    },
+    {
+      "epoch": 0.2985842985842986,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0016449905795214706,
+      "loss": 1.0012,
+      "step": 4292
+    },
+    {
+      "epoch": 0.2986538662214338,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.001644818365807359,
+      "loss": 1.0817,
+      "step": 4293
+    },
+    {
+      "epoch": 0.298723433858569,
+      "grad_norm": 1.546875,
+      "learning_rate": 0.001644646119352621,
+      "loss": 1.3537,
+      "step": 4294
+    },
+    {
+      "epoch": 0.2987930014957042,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0016444738401660021,
+      "loss": 0.8799,
+      "step": 4295
+    },
+    {
+      "epoch": 0.29886256913283943,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0016443015282562499,
+      "loss": 0.9406,
+      "step": 4296
+    },
+    {
+      "epoch": 0.2989321367699746,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0016441291836321139,
+      "loss": 0.9568,
+      "step": 4297
+    },
+    {
+      "epoch": 0.2990017044071098,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0016439568063023446,
+      "loss": 0.8902,
+      "step": 4298
+    },
+    {
+      "epoch": 0.299071272044245,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0016437843962756942,
+      "loss": 0.9022,
+      "step": 4299
+    },
+    {
+      "epoch": 0.2991408396813802,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0016436119535609176,
+      "loss": 0.96,
+      "step": 4300
+    },
+    {
+      "epoch": 0.29921040731851545,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0016434394781667696,
+      "loss": 1.0838,
+      "step": 4301
+    },
+    {
+      "epoch": 0.2992799749556506,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0016432669701020083,
+      "loss": 1.1138,
+      "step": 4302
+    },
+    {
+      "epoch": 0.29934954259278584,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0016430944293753921,
+      "loss": 0.8477,
+      "step": 4303
+    },
+    {
+      "epoch": 0.299419110229921,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0016429218559956826,
+      "loss": 0.8323,
+      "step": 4304
+    },
+    {
+      "epoch": 0.29948867786705624,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.001642749249971642,
+      "loss": 0.9095,
+      "step": 4305
+    },
+    {
+      "epoch": 0.29955824550419147,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0016425766113120337,
+      "loss": 0.7988,
+      "step": 4306
+    },
+    {
+      "epoch": 0.29962781314132664,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0016424039400256244,
+      "loss": 0.8935,
+      "step": 4307
+    },
+    {
+      "epoch": 0.29969738077846186,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0016422312361211806,
+      "loss": 1.1378,
+      "step": 4308
+    },
+    {
+      "epoch": 0.2997669484155971,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.001642058499607472,
+      "loss": 1.0826,
+      "step": 4309
+    },
+    {
+      "epoch": 0.29983651605273226,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.0016418857304932686,
+      "loss": 0.682,
+      "step": 4310
+    },
+    {
+      "epoch": 0.2999060836898675,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0016417129287873435,
+      "loss": 0.8288,
+      "step": 4311
+    },
+    {
+      "epoch": 0.29997565132700266,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0016415400944984702,
+      "loss": 1.1236,
+      "step": 4312
+    },
+    {
+      "epoch": 0.3000452189641379,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0016413672276354245,
+      "loss": 0.8868,
+      "step": 4313
+    },
+    {
+      "epoch": 0.3001147866012731,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0016411943282069838,
+      "loss": 0.6118,
+      "step": 4314
+    },
+    {
+      "epoch": 0.3001843542384083,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.001641021396221927,
+      "loss": 0.8889,
+      "step": 4315
+    },
+    {
+      "epoch": 0.3002539218755435,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.0016408484316890347,
+      "loss": 0.7158,
+      "step": 4316
+    },
+    {
+      "epoch": 0.3003234895126787,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.001640675434617089,
+      "loss": 1.0108,
+      "step": 4317
+    },
+    {
+      "epoch": 0.3003930571498139,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.001640502405014874,
+      "loss": 0.9027,
+      "step": 4318
+    },
+    {
+      "epoch": 0.30046262478694913,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0016403293428911754,
+      "loss": 0.9381,
+      "step": 4319
+    },
+    {
+      "epoch": 0.3005321924240843,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.00164015624825478,
+      "loss": 1.1626,
+      "step": 4320
+    },
+    {
+      "epoch": 0.3006017600612195,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0016399831211144772,
+      "loss": 1.0997,
+      "step": 4321
+    },
+    {
+      "epoch": 0.30067132769835475,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.001639809961479057,
+      "loss": 1.1818,
+      "step": 4322
+    },
+    {
+      "epoch": 0.3007408953354899,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0016396367693573119,
+      "loss": 1.1089,
+      "step": 4323
+    },
+    {
+      "epoch": 0.30081046297262515,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0016394635447580358,
+      "loss": 1.216,
+      "step": 4324
+    },
+    {
+      "epoch": 0.3008800306097603,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0016392902876900242,
+      "loss": 1.1119,
+      "step": 4325
+    },
+    {
+      "epoch": 0.30094959824689554,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.001639116998162074,
+      "loss": 1.2958,
+      "step": 4326
+    },
+    {
+      "epoch": 0.30101916588403077,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0016389436761829836,
+      "loss": 0.8656,
+      "step": 4327
+    },
+    {
+      "epoch": 0.30108873352116594,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0016387703217615541,
+      "loss": 0.9134,
+      "step": 4328
+    },
+    {
+      "epoch": 0.30115830115830117,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0016385969349065875,
+      "loss": 0.7285,
+      "step": 4329
+    },
+    {
+      "epoch": 0.30122786879543634,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.001638423515626887,
+      "loss": 1.0429,
+      "step": 4330
+    },
+    {
+      "epoch": 0.30129743643257156,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0016382500639312582,
+      "loss": 1.1032,
+      "step": 4331
+    },
+    {
+      "epoch": 0.3013670040697068,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0016380765798285086,
+      "loss": 0.9027,
+      "step": 4332
+    },
+    {
+      "epoch": 0.30143657170684196,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0016379030633274462,
+      "loss": 0.8908,
+      "step": 4333
+    },
+    {
+      "epoch": 0.3015061393439772,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0016377295144368816,
+      "loss": 0.9926,
+      "step": 4334
+    },
+    {
+      "epoch": 0.3015757069811124,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0016375559331656265,
+      "loss": 0.8325,
+      "step": 4335
+    },
+    {
+      "epoch": 0.3016452746182476,
+      "grad_norm": 1.6796875,
+      "learning_rate": 0.0016373823195224943,
+      "loss": 0.8746,
+      "step": 4336
+    },
+    {
+      "epoch": 0.3017148422553828,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0016372086735163011,
+      "loss": 0.9151,
+      "step": 4337
+    },
+    {
+      "epoch": 0.301784409892518,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0016370349951558632,
+      "loss": 0.8559,
+      "step": 4338
+    },
+    {
+      "epoch": 0.3018539775296532,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.001636861284449999,
+      "loss": 1.004,
+      "step": 4339
+    },
+    {
+      "epoch": 0.30192354516678843,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0016366875414075288,
+      "loss": 0.8747,
+      "step": 4340
+    },
+    {
+      "epoch": 0.3019931128039236,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.0016365137660372744,
+      "loss": 0.9914,
+      "step": 4341
+    },
+    {
+      "epoch": 0.30206268044105883,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0016363399583480592,
+      "loss": 0.9239,
+      "step": 4342
+    },
+    {
+      "epoch": 0.302132248078194,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0016361661183487085,
+      "loss": 0.8221,
+      "step": 4343
+    },
+    {
+      "epoch": 0.3022018157153292,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0016359922460480484,
+      "loss": 0.7647,
+      "step": 4344
+    },
+    {
+      "epoch": 0.30227138335246445,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.001635818341454908,
+      "loss": 1.116,
+      "step": 4345
+    },
+    {
+      "epoch": 0.3023409509895996,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.001635644404578117,
+      "loss": 1.0775,
+      "step": 4346
+    },
+    {
+      "epoch": 0.30241051862673485,
+      "grad_norm": 0.67578125,
+      "learning_rate": 0.0016354704354265071,
+      "loss": 0.7777,
+      "step": 4347
+    },
+    {
+      "epoch": 0.3024800862638701,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0016352964340089113,
+      "loss": 0.9641,
+      "step": 4348
+    },
+    {
+      "epoch": 0.30254965390100524,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0016351224003341644,
+      "loss": 1.0605,
+      "step": 4349
+    },
+    {
+      "epoch": 0.30261922153814047,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0016349483344111038,
+      "loss": 0.6386,
+      "step": 4350
+    },
+    {
+      "epoch": 0.30268878917527564,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0016347742362485672,
+      "loss": 0.9836,
+      "step": 4351
+    },
+    {
+      "epoch": 0.30275835681241087,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0016346001058553938,
+      "loss": 0.9887,
+      "step": 4352
+    },
+    {
+      "epoch": 0.3028279244495461,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.001634425943240426,
+      "loss": 1.0901,
+      "step": 4353
+    },
+    {
+      "epoch": 0.30289749208668126,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0016342517484125069,
+      "loss": 1.0028,
+      "step": 4354
+    },
+    {
+      "epoch": 0.3029670597238165,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0016340775213804803,
+      "loss": 0.976,
+      "step": 4355
+    },
+    {
+      "epoch": 0.30303662736095166,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0016339032621531936,
+      "loss": 0.8511,
+      "step": 4356
+    },
+    {
+      "epoch": 0.3031061949980869,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0016337289707394939,
+      "loss": 0.7611,
+      "step": 4357
+    },
+    {
+      "epoch": 0.3031757626352221,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0016335546471482317,
+      "loss": 0.7739,
+      "step": 4358
+    },
+    {
+      "epoch": 0.3032453302723573,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0016333802913882573,
+      "loss": 0.8467,
+      "step": 4359
+    },
+    {
+      "epoch": 0.3033148979094925,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0016332059034684248,
+      "loss": 0.9224,
+      "step": 4360
+    },
+    {
+      "epoch": 0.30338446554662774,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.001633031483397588,
+      "loss": 0.8146,
+      "step": 4361
+    },
+    {
+      "epoch": 0.3034540331837629,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0016328570311846032,
+      "loss": 1.1802,
+      "step": 4362
+    },
+    {
+      "epoch": 0.30352360082089813,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.001632682546838328,
+      "loss": 1.055,
+      "step": 4363
+    },
+    {
+      "epoch": 0.3035931684580333,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0016325080303676218,
+      "loss": 1.0319,
+      "step": 4364
+    },
+    {
+      "epoch": 0.30366273609516853,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0016323334817813465,
+      "loss": 0.8394,
+      "step": 4365
+    },
+    {
+      "epoch": 0.30373230373230375,
+      "grad_norm": 1.125,
+      "learning_rate": 0.001632158901088364,
+      "loss": 1.1423,
+      "step": 4366
+    },
+    {
+      "epoch": 0.3038018713694389,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0016319842882975386,
+      "loss": 1.1017,
+      "step": 4367
+    },
+    {
+      "epoch": 0.30387143900657415,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0016318096434177365,
+      "loss": 1.069,
+      "step": 4368
+    },
+    {
+      "epoch": 0.3039410066437093,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0016316349664578253,
+      "loss": 1.0585,
+      "step": 4369
+    },
+    {
+      "epoch": 0.30401057428084455,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0016314602574266743,
+      "loss": 0.893,
+      "step": 4370
+    },
+    {
+      "epoch": 0.3040801419179798,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0016312855163331543,
+      "loss": 1.1369,
+      "step": 4371
+    },
+    {
+      "epoch": 0.30414970955511494,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0016311107431861377,
+      "loss": 0.8239,
+      "step": 4372
+    },
+    {
+      "epoch": 0.30421927719225017,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.001630935937994498,
+      "loss": 0.9099,
+      "step": 4373
+    },
+    {
+      "epoch": 0.3042888448293854,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0016307611007671122,
+      "loss": 0.8812,
+      "step": 4374
+    },
+    {
+      "epoch": 0.30435841246652057,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0016305862315128565,
+      "loss": 1.1323,
+      "step": 4375
+    },
+    {
+      "epoch": 0.3044279801036558,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.001630411330240611,
+      "loss": 0.9536,
+      "step": 4376
+    },
+    {
+      "epoch": 0.30449754774079096,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.0016302363969592551,
+      "loss": 1.147,
+      "step": 4377
+    },
+    {
+      "epoch": 0.3045671153779262,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0016300614316776718,
+      "loss": 1.0402,
+      "step": 4378
+    },
+    {
+      "epoch": 0.3046366830150614,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0016298864344047447,
+      "loss": 1.0486,
+      "step": 4379
+    },
+    {
+      "epoch": 0.3047062506521966,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0016297114051493592,
+      "loss": 0.9422,
+      "step": 4380
+    },
+    {
+      "epoch": 0.3047758182893318,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0016295363439204028,
+      "loss": 1.076,
+      "step": 4381
+    },
+    {
+      "epoch": 0.304845385926467,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0016293612507267637,
+      "loss": 0.9279,
+      "step": 4382
+    },
+    {
+      "epoch": 0.3049149535636022,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0016291861255773325,
+      "loss": 0.9118,
+      "step": 4383
+    },
+    {
+      "epoch": 0.30498452120073744,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0016290109684810013,
+      "loss": 0.8204,
+      "step": 4384
+    },
+    {
+      "epoch": 0.3050540888378726,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0016288357794466638,
+      "loss": 1.0852,
+      "step": 4385
+    },
+    {
+      "epoch": 0.30512365647500783,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0016286605584832144,
+      "loss": 0.9775,
+      "step": 4386
+    },
+    {
+      "epoch": 0.30519322411214306,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.001628485305599551,
+      "loss": 0.8048,
+      "step": 4387
+    },
+    {
+      "epoch": 0.30526279174927823,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0016283100208045714,
+      "loss": 0.9148,
+      "step": 4388
+    },
+    {
+      "epoch": 0.30533235938641345,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0016281347041071758,
+      "loss": 1.0056,
+      "step": 4389
+    },
+    {
+      "epoch": 0.3054019270235486,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.0016279593555162662,
+      "loss": 0.9933,
+      "step": 4390
+    },
+    {
+      "epoch": 0.30547149466068385,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0016277839750407455,
+      "loss": 0.9661,
+      "step": 4391
+    },
+    {
+      "epoch": 0.3055410622978191,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.001627608562689519,
+      "loss": 1.0881,
+      "step": 4392
+    },
+    {
+      "epoch": 0.30561062993495425,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0016274331184714928,
+      "loss": 0.8765,
+      "step": 4393
+    },
+    {
+      "epoch": 0.3056801975720895,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0016272576423955753,
+      "loss": 0.859,
+      "step": 4394
+    },
+    {
+      "epoch": 0.30574976520922464,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0016270821344706765,
+      "loss": 0.8187,
+      "step": 4395
+    },
+    {
+      "epoch": 0.30581933284635987,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0016269065947057079,
+      "loss": 1.2265,
+      "step": 4396
+    },
+    {
+      "epoch": 0.3058889004834951,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0016267310231095817,
+      "loss": 1.0202,
+      "step": 4397
+    },
+    {
+      "epoch": 0.30595846812063027,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0016265554196912137,
+      "loss": 0.7479,
+      "step": 4398
+    },
+    {
+      "epoch": 0.3060280357577655,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.001626379784459519,
+      "loss": 0.9621,
+      "step": 4399
+    },
+    {
+      "epoch": 0.3060976033949007,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0016262041174234163,
+      "loss": 0.9615,
+      "step": 4400
+    },
+    {
+      "epoch": 0.3061671710320359,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.001626028418591825,
+      "loss": 1.0641,
+      "step": 4401
+    },
+    {
+      "epoch": 0.3062367386691711,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0016258526879736658,
+      "loss": 0.8778,
+      "step": 4402
+    },
+    {
+      "epoch": 0.3063063063063063,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0016256769255778615,
+      "loss": 0.8062,
+      "step": 4403
+    },
+    {
+      "epoch": 0.3063758739434415,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.001625501131413337,
+      "loss": 0.783,
+      "step": 4404
+    },
+    {
+      "epoch": 0.30644544158057674,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0016253253054890173,
+      "loss": 1.1104,
+      "step": 4405
+    },
+    {
+      "epoch": 0.3065150092177119,
+      "grad_norm": 1.5390625,
+      "learning_rate": 0.001625149447813831,
+      "loss": 0.8862,
+      "step": 4406
+    },
+    {
+      "epoch": 0.30658457685484714,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.001624973558396706,
+      "loss": 0.992,
+      "step": 4407
+    },
+    {
+      "epoch": 0.3066541444919823,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0016247976372465744,
+      "loss": 0.7775,
+      "step": 4408
+    },
+    {
+      "epoch": 0.30672371212911753,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.001624621684372368,
+      "loss": 0.8106,
+      "step": 4409
+    },
+    {
+      "epoch": 0.30679327976625276,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0016244456997830203,
+      "loss": 1.0442,
+      "step": 4410
+    },
+    {
+      "epoch": 0.30686284740338793,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.001624269683487468,
+      "loss": 1.1166,
+      "step": 4411
+    },
+    {
+      "epoch": 0.30693241504052315,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0016240936354946474,
+      "loss": 0.896,
+      "step": 4412
+    },
+    {
+      "epoch": 0.3070019826776584,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0016239175558134976,
+      "loss": 1.0908,
+      "step": 4413
+    },
+    {
+      "epoch": 0.30707155031479355,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0016237414444529592,
+      "loss": 1.1792,
+      "step": 4414
+    },
+    {
+      "epoch": 0.3071411179519288,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0016235653014219742,
+      "loss": 0.9716,
+      "step": 4415
+    },
+    {
+      "epoch": 0.30721068558906395,
+      "grad_norm": 1.8125,
+      "learning_rate": 0.001623389126729486,
+      "loss": 0.9999,
+      "step": 4416
+    },
+    {
+      "epoch": 0.3072802532261992,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0016232129203844403,
+      "loss": 1.0203,
+      "step": 4417
+    },
+    {
+      "epoch": 0.3073498208633344,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0016230366823957836,
+      "loss": 1.0839,
+      "step": 4418
+    },
+    {
+      "epoch": 0.30741938850046957,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.0016228604127724645,
+      "loss": 0.8569,
+      "step": 4419
+    },
+    {
+      "epoch": 0.3074889561376048,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0016226841115234332,
+      "loss": 1.134,
+      "step": 4420
+    },
+    {
+      "epoch": 0.30755852377473997,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0016225077786576412,
+      "loss": 0.795,
+      "step": 4421
+    },
+    {
+      "epoch": 0.3076280914118752,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0016223314141840417,
+      "loss": 1.0879,
+      "step": 4422
+    },
+    {
+      "epoch": 0.3076976590490104,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0016221550181115898,
+      "loss": 0.8837,
+      "step": 4423
+    },
+    {
+      "epoch": 0.3077672266861456,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0016219785904492423,
+      "loss": 1.3526,
+      "step": 4424
+    },
+    {
+      "epoch": 0.3078367943232808,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.001621802131205957,
+      "loss": 0.8536,
+      "step": 4425
+    },
+    {
+      "epoch": 0.30790636196041604,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0016216256403906932,
+      "loss": 0.9965,
+      "step": 4426
+    },
+    {
+      "epoch": 0.3079759295975512,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0016214491180124128,
+      "loss": 1.1338,
+      "step": 4427
+    },
+    {
+      "epoch": 0.30804549723468644,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.0016212725640800784,
+      "loss": 0.9712,
+      "step": 4428
+    },
+    {
+      "epoch": 0.3081150648718216,
+      "grad_norm": 1.25,
+      "learning_rate": 0.001621095978602655,
+      "loss": 1.1477,
+      "step": 4429
+    },
+    {
+      "epoch": 0.30818463250895684,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0016209193615891078,
+      "loss": 0.8307,
+      "step": 4430
+    },
+    {
+      "epoch": 0.30825420014609206,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0016207427130484056,
+      "loss": 1.0563,
+      "step": 4431
+    },
+    {
+      "epoch": 0.30832376778322723,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.001620566032989517,
+      "loss": 1.1016,
+      "step": 4432
+    },
+    {
+      "epoch": 0.30839333542036246,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.001620389321421413,
+      "loss": 1.1264,
+      "step": 4433
+    },
+    {
+      "epoch": 0.30846290305749763,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0016202125783530666,
+      "loss": 1.1547,
+      "step": 4434
+    },
+    {
+      "epoch": 0.30853247069463285,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0016200358037934512,
+      "loss": 1.0316,
+      "step": 4435
+    },
+    {
+      "epoch": 0.3086020383317681,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0016198589977515431,
+      "loss": 0.9454,
+      "step": 4436
+    },
+    {
+      "epoch": 0.30867160596890325,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0016196821602363193,
+      "loss": 0.9787,
+      "step": 4437
+    },
+    {
+      "epoch": 0.3087411736060385,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.001619505291256759,
+      "loss": 0.9956,
+      "step": 4438
+    },
+    {
+      "epoch": 0.3088107412431737,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0016193283908218423,
+      "loss": 1.0377,
+      "step": 4439
+    },
+    {
+      "epoch": 0.3088803088803089,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.001619151458940552,
+      "loss": 1.0214,
+      "step": 4440
+    },
+    {
+      "epoch": 0.3089498765174441,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.001618974495621871,
+      "loss": 0.7111,
+      "step": 4441
+    },
+    {
+      "epoch": 0.30901944415457927,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.001618797500874785,
+      "loss": 1.1319,
+      "step": 4442
+    },
+    {
+      "epoch": 0.3090890117917145,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.001618620474708281,
+      "loss": 1.0498,
+      "step": 4443
+    },
+    {
+      "epoch": 0.3091585794288497,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0016184434171313473,
+      "loss": 1.1306,
+      "step": 4444
+    },
+    {
+      "epoch": 0.3092281470659849,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.001618266328152974,
+      "loss": 0.8588,
+      "step": 4445
+    },
+    {
+      "epoch": 0.3092977147031201,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0016180892077821529,
+      "loss": 0.9119,
+      "step": 4446
+    },
+    {
+      "epoch": 0.3093672823402553,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0016179120560278772,
+      "loss": 0.8275,
+      "step": 4447
+    },
+    {
+      "epoch": 0.3094368499773905,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0016177348728991419,
+      "loss": 1.0409,
+      "step": 4448
+    },
+    {
+      "epoch": 0.30950641761452574,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0016175576584049431,
+      "loss": 0.9388,
+      "step": 4449
+    },
+    {
+      "epoch": 0.3095759852516609,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0016173804125542797,
+      "loss": 0.8171,
+      "step": 4450
+    },
+    {
+      "epoch": 0.30964555288879614,
+      "grad_norm": 1.625,
+      "learning_rate": 0.0016172031353561503,
+      "loss": 1.3405,
+      "step": 4451
+    },
+    {
+      "epoch": 0.30971512052593136,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0016170258268195568,
+      "loss": 1.105,
+      "step": 4452
+    },
+    {
+      "epoch": 0.30978468816306653,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0016168484869535015,
+      "loss": 0.5778,
+      "step": 4453
+    },
+    {
+      "epoch": 0.30985425580020176,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0016166711157669898,
+      "loss": 0.752,
+      "step": 4454
+    },
+    {
+      "epoch": 0.30992382343733693,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0016164937132690266,
+      "loss": 1.0001,
+      "step": 4455
+    },
+    {
+      "epoch": 0.30999339107447216,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0016163162794686201,
+      "loss": 1.0588,
+      "step": 4456
+    },
+    {
+      "epoch": 0.3100629587116074,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0016161388143747797,
+      "loss": 0.8943,
+      "step": 4457
+    },
+    {
+      "epoch": 0.31013252634874255,
+      "grad_norm": 0.875,
+      "learning_rate": 0.0016159613179965156,
+      "loss": 0.9216,
+      "step": 4458
+    },
+    {
+      "epoch": 0.3102020939858778,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0016157837903428404,
+      "loss": 0.8055,
+      "step": 4459
+    },
+    {
+      "epoch": 0.31027166162301295,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0016156062314227682,
+      "loss": 0.8656,
+      "step": 4460
+    },
+    {
+      "epoch": 0.3103412292601482,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0016154286412453144,
+      "loss": 1.1291,
+      "step": 4461
+    },
+    {
+      "epoch": 0.3104107968972834,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0016152510198194966,
+      "loss": 0.8579,
+      "step": 4462
+    },
+    {
+      "epoch": 0.3104803645344186,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0016150733671543324,
+      "loss": 0.821,
+      "step": 4463
+    },
+    {
+      "epoch": 0.3105499321715538,
+      "grad_norm": 0.875,
+      "learning_rate": 0.0016148956832588435,
+      "loss": 0.8403,
+      "step": 4464
+    },
+    {
+      "epoch": 0.31061949980868897,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0016147179681420506,
+      "loss": 0.9634,
+      "step": 4465
+    },
+    {
+      "epoch": 0.3106890674458242,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.001614540221812978,
+      "loss": 1.0434,
+      "step": 4466
+    },
+    {
+      "epoch": 0.3107586350829594,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00161436244428065,
+      "loss": 0.9125,
+      "step": 4467
+    },
+    {
+      "epoch": 0.3108282027200946,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0016141846355540942,
+      "loss": 1.0275,
+      "step": 4468
+    },
+    {
+      "epoch": 0.3108977703572298,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0016140067956423381,
+      "loss": 0.7801,
+      "step": 4469
+    },
+    {
+      "epoch": 0.31096733799436505,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0016138289245544116,
+      "loss": 0.8483,
+      "step": 4470
+    },
+    {
+      "epoch": 0.3110369056315002,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0016136510222993464,
+      "loss": 1.0084,
+      "step": 4471
+    },
+    {
+      "epoch": 0.31110647326863544,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0016134730888861754,
+      "loss": 0.9062,
+      "step": 4472
+    },
+    {
+      "epoch": 0.3111760409057706,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0016132951243239331,
+      "loss": 0.9388,
+      "step": 4473
+    },
+    {
+      "epoch": 0.31124560854290584,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0016131171286216555,
+      "loss": 0.7988,
+      "step": 4474
+    },
+    {
+      "epoch": 0.31131517618004106,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0016129391017883803,
+      "loss": 0.9827,
+      "step": 4475
+    },
+    {
+      "epoch": 0.31138474381717623,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0016127610438331473,
+      "loss": 0.7951,
+      "step": 4476
+    },
+    {
+      "epoch": 0.31145431145431146,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0016125829547649967,
+      "loss": 0.9847,
+      "step": 4477
+    },
+    {
+      "epoch": 0.31152387909144663,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0016124048345929716,
+      "loss": 0.7996,
+      "step": 4478
+    },
+    {
+      "epoch": 0.31159344672858186,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0016122266833261158,
+      "loss": 1.0155,
+      "step": 4479
+    },
+    {
+      "epoch": 0.3116630143657171,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0016120485009734743,
+      "loss": 0.9282,
+      "step": 4480
+    },
+    {
+      "epoch": 0.31173258200285225,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0016118702875440954,
+      "loss": 0.9679,
+      "step": 4481
+    },
+    {
+      "epoch": 0.3118021496399875,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0016116920430470272,
+      "loss": 0.7549,
+      "step": 4482
+    },
+    {
+      "epoch": 0.3118717172771227,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0016115137674913202,
+      "loss": 1.0342,
+      "step": 4483
+    },
+    {
+      "epoch": 0.3119412849142579,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0016113354608860264,
+      "loss": 0.7318,
+      "step": 4484
+    },
+    {
+      "epoch": 0.3120108525513931,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0016111571232401993,
+      "loss": 0.868,
+      "step": 4485
+    },
+    {
+      "epoch": 0.3120804201885283,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0016109787545628938,
+      "loss": 0.9422,
+      "step": 4486
+    },
+    {
+      "epoch": 0.3121499878256635,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.001610800354863167,
+      "loss": 0.8755,
+      "step": 4487
+    },
+    {
+      "epoch": 0.3122195554627987,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0016106219241500766,
+      "loss": 1.0344,
+      "step": 4488
+    },
+    {
+      "epoch": 0.3122891230999339,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0016104434624326825,
+      "loss": 0.8669,
+      "step": 4489
+    },
+    {
+      "epoch": 0.3123586907370691,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0016102649697200464,
+      "loss": 0.961,
+      "step": 4490
+    },
+    {
+      "epoch": 0.3124282583742043,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.001610086446021231,
+      "loss": 0.9783,
+      "step": 4491
+    },
+    {
+      "epoch": 0.3124978260113395,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0016099078913453014,
+      "loss": 0.6967,
+      "step": 4492
+    },
+    {
+      "epoch": 0.31256739364847475,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0016097293057013226,
+      "loss": 0.9887,
+      "step": 4493
+    },
+    {
+      "epoch": 0.3126369612856099,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0016095506890983634,
+      "loss": 1.0331,
+      "step": 4494
+    },
+    {
+      "epoch": 0.31270652892274514,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0016093720415454925,
+      "loss": 0.8642,
+      "step": 4495
+    },
+    {
+      "epoch": 0.31277609655988037,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0016091933630517806,
+      "loss": 0.8122,
+      "step": 4496
+    },
+    {
+      "epoch": 0.31284566419701554,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0016090146536263002,
+      "loss": 0.8992,
+      "step": 4497
+    },
+    {
+      "epoch": 0.31291523183415076,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0016088359132781253,
+      "loss": 0.8872,
+      "step": 4498
+    },
+    {
+      "epoch": 0.31298479947128593,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.0016086571420163322,
+      "loss": 1.1528,
+      "step": 4499
+    },
+    {
+      "epoch": 0.31305436710842116,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0016084783398499964,
+      "loss": 0.897,
+      "step": 4500
+    },
+    {
+      "epoch": 0.3131239347455564,
+      "grad_norm": 2.578125,
+      "learning_rate": 0.0016082995067881979,
+      "loss": 1.0002,
+      "step": 4501
+    },
+    {
+      "epoch": 0.31319350238269156,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0016081206428400165,
+      "loss": 1.1553,
+      "step": 4502
+    },
+    {
+      "epoch": 0.3132630700198268,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0016079417480145339,
+      "loss": 0.8074,
+      "step": 4503
+    },
+    {
+      "epoch": 0.31333263765696195,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0016077628223208338,
+      "loss": 1.249,
+      "step": 4504
+    },
+    {
+      "epoch": 0.3134022052940972,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0016075838657680004,
+      "loss": 1.1904,
+      "step": 4505
+    },
+    {
+      "epoch": 0.3134717729312324,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0016074048783651213,
+      "loss": 0.8828,
+      "step": 4506
+    },
+    {
+      "epoch": 0.3135413405683676,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.0016072258601212838,
+      "loss": 0.8629,
+      "step": 4507
+    },
+    {
+      "epoch": 0.3136109082055028,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.001607046811045578,
+      "loss": 0.9527,
+      "step": 4508
+    },
+    {
+      "epoch": 0.31368047584263803,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0016068677311470948,
+      "loss": 0.9314,
+      "step": 4509
+    },
+    {
+      "epoch": 0.3137500434797732,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0016066886204349267,
+      "loss": 0.7579,
+      "step": 4510
+    },
+    {
+      "epoch": 0.3138196111169084,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0016065094789181687,
+      "loss": 0.7607,
+      "step": 4511
+    },
+    {
+      "epoch": 0.3138891787540436,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0016063303066059162,
+      "loss": 0.7219,
+      "step": 4512
+    },
+    {
+      "epoch": 0.3139587463911788,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.001606151103507267,
+      "loss": 1.1225,
+      "step": 4513
+    },
+    {
+      "epoch": 0.31402831402831405,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0016059718696313202,
+      "loss": 0.7852,
+      "step": 4514
+    },
+    {
+      "epoch": 0.3140978816654492,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.001605792604987176,
+      "loss": 1.0759,
+      "step": 4515
+    },
+    {
+      "epoch": 0.31416744930258445,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0016056133095839365,
+      "loss": 1.0242,
+      "step": 4516
+    },
+    {
+      "epoch": 0.3142370169397196,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0016054339834307059,
+      "loss": 1.0011,
+      "step": 4517
+    },
+    {
+      "epoch": 0.31430658457685484,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0016052546265365893,
+      "loss": 1.2239,
+      "step": 4518
+    },
+    {
+      "epoch": 0.31437615221399007,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0016050752389106934,
+      "loss": 1.11,
+      "step": 4519
+    },
+    {
+      "epoch": 0.31444571985112524,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0016048958205621268,
+      "loss": 0.8507,
+      "step": 4520
+    },
+    {
+      "epoch": 0.31451528748826046,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0016047163714999991,
+      "loss": 0.9852,
+      "step": 4521
+    },
+    {
+      "epoch": 0.3145848551253957,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.001604536891733422,
+      "loss": 0.9424,
+      "step": 4522
+    },
+    {
+      "epoch": 0.31465442276253086,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0016043573812715086,
+      "loss": 1.0454,
+      "step": 4523
+    },
+    {
+      "epoch": 0.3147239903996661,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.001604177840123374,
+      "loss": 1.1769,
+      "step": 4524
+    },
+    {
+      "epoch": 0.31479355803680126,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0016039982682981336,
+      "loss": 0.9743,
+      "step": 4525
+    },
+    {
+      "epoch": 0.3148631256739365,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0016038186658049055,
+      "loss": 0.9401,
+      "step": 4526
+    },
+    {
+      "epoch": 0.3149326933110717,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0016036390326528093,
+      "loss": 0.9003,
+      "step": 4527
+    },
+    {
+      "epoch": 0.3150022609482069,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0016034593688509654,
+      "loss": 0.9977,
+      "step": 4528
+    },
+    {
+      "epoch": 0.3150718285853421,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0016032796744084963,
+      "loss": 0.9259,
+      "step": 4529
+    },
+    {
+      "epoch": 0.3151413962224773,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.0016030999493345261,
+      "loss": 0.7179,
+      "step": 4530
+    },
+    {
+      "epoch": 0.3152109638596125,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0016029201936381804,
+      "loss": 0.9177,
+      "step": 4531
+    },
+    {
+      "epoch": 0.31528053149674773,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0016027404073285863,
+      "loss": 0.7988,
+      "step": 4532
+    },
+    {
+      "epoch": 0.3153500991338829,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0016025605904148726,
+      "loss": 0.8419,
+      "step": 4533
+    },
+    {
+      "epoch": 0.3154196667710181,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0016023807429061687,
+      "loss": 1.1772,
+      "step": 4534
+    },
+    {
+      "epoch": 0.31548923440815335,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0016022008648116071,
+      "loss": 1.1462,
+      "step": 4535
+    },
+    {
+      "epoch": 0.3155588020452885,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0016020209561403212,
+      "loss": 0.8713,
+      "step": 4536
+    },
+    {
+      "epoch": 0.31562836968242375,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.001601841016901445,
+      "loss": 0.9731,
+      "step": 4537
+    },
+    {
+      "epoch": 0.3156979373195589,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.001601661047104116,
+      "loss": 1.0064,
+      "step": 4538
+    },
+    {
+      "epoch": 0.31576750495669414,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0016014810467574712,
+      "loss": 0.7273,
+      "step": 4539
+    },
+    {
+      "epoch": 0.31583707259382937,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.001601301015870651,
+      "loss": 0.9267,
+      "step": 4540
+    },
+    {
+      "epoch": 0.31590664023096454,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.0016011209544527956,
+      "loss": 1.2038,
+      "step": 4541
+    },
+    {
+      "epoch": 0.31597620786809977,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.001600940862513048,
+      "loss": 0.8266,
+      "step": 4542
+    },
+    {
+      "epoch": 0.31604577550523494,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0016007607400605527,
+      "loss": 0.8463,
+      "step": 4543
+    },
+    {
+      "epoch": 0.31611534314237016,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0016005805871044548,
+      "loss": 0.9369,
+      "step": 4544
+    },
+    {
+      "epoch": 0.3161849107795054,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0016004004036539018,
+      "loss": 0.9409,
+      "step": 4545
+    },
+    {
+      "epoch": 0.31625447841664056,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0016002201897180426,
+      "loss": 0.8067,
+      "step": 4546
+    },
+    {
+      "epoch": 0.3163240460537758,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0016000399453060276,
+      "loss": 1.0236,
+      "step": 4547
+    },
+    {
+      "epoch": 0.316393613690911,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0015998596704270085,
+      "loss": 0.8579,
+      "step": 4548
+    },
+    {
+      "epoch": 0.3164631813280462,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.001599679365090139,
+      "loss": 1.0777,
+      "step": 4549
+    },
+    {
+      "epoch": 0.3165327489651814,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0015994990293045738,
+      "loss": 0.911,
+      "step": 4550
+    },
+    {
+      "epoch": 0.3166023166023166,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0015993186630794698,
+      "loss": 0.8788,
+      "step": 4551
+    },
+    {
+      "epoch": 0.3166718842394518,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0015991382664239846,
+      "loss": 1.1761,
+      "step": 4552
+    },
+    {
+      "epoch": 0.31674145187658703,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0015989578393472783,
+      "loss": 1.0325,
+      "step": 4553
+    },
+    {
+      "epoch": 0.3168110195137222,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0015987773818585118,
+      "loss": 0.8483,
+      "step": 4554
+    },
+    {
+      "epoch": 0.31688058715085743,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.001598596893966848,
+      "loss": 0.7744,
+      "step": 4555
+    },
+    {
+      "epoch": 0.3169501547879926,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0015984163756814509,
+      "loss": 0.9896,
+      "step": 4556
+    },
+    {
+      "epoch": 0.3170197224251278,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.0015982358270114868,
+      "loss": 0.804,
+      "step": 4557
+    },
+    {
+      "epoch": 0.31708929006226305,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0015980552479661224,
+      "loss": 0.8124,
+      "step": 4558
+    },
+    {
+      "epoch": 0.3171588576993982,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0015978746385545272,
+      "loss": 1.0928,
+      "step": 4559
+    },
+    {
+      "epoch": 0.31722842533653345,
+      "grad_norm": 1.0,
+      "learning_rate": 0.001597693998785871,
+      "loss": 0.744,
+      "step": 4560
+    },
+    {
+      "epoch": 0.3172979929736687,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0015975133286693266,
+      "loss": 0.7445,
+      "step": 4561
+    },
+    {
+      "epoch": 0.31736756061080384,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0015973326282140668,
+      "loss": 0.9578,
+      "step": 4562
+    },
+    {
+      "epoch": 0.31743712824793907,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.001597151897429267,
+      "loss": 0.9887,
+      "step": 4563
+    },
+    {
+      "epoch": 0.31750669588507424,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0015969711363241035,
+      "loss": 0.7945,
+      "step": 4564
+    },
+    {
+      "epoch": 0.31757626352220947,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0015967903449077548,
+      "loss": 0.9663,
+      "step": 4565
+    },
+    {
+      "epoch": 0.3176458311593447,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0015966095231894006,
+      "loss": 0.9744,
+      "step": 4566
+    },
+    {
+      "epoch": 0.31771539879647986,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.001596428671178222,
+      "loss": 0.8752,
+      "step": 4567
+    },
+    {
+      "epoch": 0.3177849664336151,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0015962477888834012,
+      "loss": 0.8157,
+      "step": 4568
+    },
+    {
+      "epoch": 0.31785453407075026,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0015960668763141234,
+      "loss": 1.0015,
+      "step": 4569
+    },
+    {
+      "epoch": 0.3179241017078855,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.001595885933479574,
+      "loss": 0.7865,
+      "step": 4570
+    },
+    {
+      "epoch": 0.3179936693450207,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0015957049603889401,
+      "loss": 0.8637,
+      "step": 4571
+    },
+    {
+      "epoch": 0.3180632369821559,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0015955239570514112,
+      "loss": 0.9391,
+      "step": 4572
+    },
+    {
+      "epoch": 0.3181328046192911,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0015953429234761773,
+      "loss": 1.0095,
+      "step": 4573
+    },
+    {
+      "epoch": 0.31820237225642634,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0015951618596724306,
+      "loss": 0.8901,
+      "step": 4574
+    },
+    {
+      "epoch": 0.3182719398935615,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0015949807656493644,
+      "loss": 0.9012,
+      "step": 4575
+    },
+    {
+      "epoch": 0.31834150753069673,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.001594799641416174,
+      "loss": 1.0944,
+      "step": 4576
+    },
+    {
+      "epoch": 0.3184110751678319,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0015946184869820557,
+      "loss": 0.9183,
+      "step": 4577
+    },
+    {
+      "epoch": 0.31848064280496713,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0015944373023562075,
+      "loss": 0.808,
+      "step": 4578
+    },
+    {
+      "epoch": 0.31855021044210236,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0015942560875478295,
+      "loss": 0.9312,
+      "step": 4579
+    },
+    {
+      "epoch": 0.3186197780792375,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0015940748425661226,
+      "loss": 0.7663,
+      "step": 4580
+    },
+    {
+      "epoch": 0.31868934571637275,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0015938935674202897,
+      "loss": 0.8847,
+      "step": 4581
+    },
+    {
+      "epoch": 0.3187589133535079,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0015937122621195348,
+      "loss": 0.9319,
+      "step": 4582
+    },
+    {
+      "epoch": 0.31882848099064315,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0015935309266730635,
+      "loss": 0.9495,
+      "step": 4583
+    },
+    {
+      "epoch": 0.3188980486277784,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0015933495610900839,
+      "loss": 0.8488,
+      "step": 4584
+    },
+    {
+      "epoch": 0.31896761626491354,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.001593168165379804,
+      "loss": 0.9934,
+      "step": 4585
+    },
+    {
+      "epoch": 0.31903718390204877,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0015929867395514344,
+      "loss": 1.0,
+      "step": 4586
+    },
+    {
+      "epoch": 0.319106751539184,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0015928052836141871,
+      "loss": 0.8157,
+      "step": 4587
+    },
+    {
+      "epoch": 0.31917631917631917,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0015926237975772755,
+      "loss": 0.7274,
+      "step": 4588
+    },
+    {
+      "epoch": 0.3192458868134544,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0015924422814499145,
+      "loss": 0.8182,
+      "step": 4589
+    },
+    {
+      "epoch": 0.31931545445058956,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0015922607352413204,
+      "loss": 0.8247,
+      "step": 4590
+    },
+    {
+      "epoch": 0.3193850220877248,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0015920791589607115,
+      "loss": 0.8285,
+      "step": 4591
+    },
+    {
+      "epoch": 0.31945458972486,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0015918975526173073,
+      "loss": 1.0798,
+      "step": 4592
+    },
+    {
+      "epoch": 0.3195241573619952,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0015917159162203284,
+      "loss": 0.912,
+      "step": 4593
+    },
+    {
+      "epoch": 0.3195937249991304,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0015915342497789982,
+      "loss": 0.8307,
+      "step": 4594
+    },
+    {
+      "epoch": 0.3196632926362656,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0015913525533025402,
+      "loss": 0.8298,
+      "step": 4595
+    },
+    {
+      "epoch": 0.3197328602734008,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0015911708268001802,
+      "loss": 0.8014,
+      "step": 4596
+    },
+    {
+      "epoch": 0.31980242791053604,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0015909890702811452,
+      "loss": 0.8576,
+      "step": 4597
+    },
+    {
+      "epoch": 0.3198719955476712,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0015908072837546642,
+      "loss": 1.0694,
+      "step": 4598
+    },
+    {
+      "epoch": 0.31994156318480643,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.001590625467229967,
+      "loss": 1.1575,
+      "step": 4599
+    },
+    {
+      "epoch": 0.32001113082194166,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0015904436207162856,
+      "loss": 0.6767,
+      "step": 4600
+    },
+    {
+      "epoch": 0.32008069845907683,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0015902617442228532,
+      "loss": 1.1852,
+      "step": 4601
+    },
+    {
+      "epoch": 0.32015026609621206,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0015900798377589047,
+      "loss": 1.0463,
+      "step": 4602
+    },
+    {
+      "epoch": 0.3202198337333472,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0015898979013336764,
+      "loss": 1.0375,
+      "step": 4603
+    },
+    {
+      "epoch": 0.32028940137048245,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0015897159349564057,
+      "loss": 1.0322,
+      "step": 4604
+    },
+    {
+      "epoch": 0.3203589690076177,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0015895339386363322,
+      "loss": 1.1247,
+      "step": 4605
+    },
+    {
+      "epoch": 0.32042853664475285,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0015893519123826969,
+      "loss": 0.9166,
+      "step": 4606
+    },
+    {
+      "epoch": 0.3204981042818881,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0015891698562047422,
+      "loss": 1.0436,
+      "step": 4607
+    },
+    {
+      "epoch": 0.32056767191902324,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0015889877701117114,
+      "loss": 0.8429,
+      "step": 4608
+    },
+    {
+      "epoch": 0.32063723955615847,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0015888056541128505,
+      "loss": 0.9104,
+      "step": 4609
+    },
+    {
+      "epoch": 0.3207068071932937,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0015886235082174065,
+      "loss": 1.0487,
+      "step": 4610
+    },
+    {
+      "epoch": 0.32077637483042887,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0015884413324346275,
+      "loss": 0.9555,
+      "step": 4611
+    },
+    {
+      "epoch": 0.3208459424675641,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0015882591267737639,
+      "loss": 0.7592,
+      "step": 4612
+    },
+    {
+      "epoch": 0.3209155101046993,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.001588076891244066,
+      "loss": 0.7827,
+      "step": 4613
+    },
+    {
+      "epoch": 0.3209850777418345,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0015878946258547889,
+      "loss": 0.8938,
+      "step": 4614
+    },
+    {
+      "epoch": 0.3210546453789697,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0015877123306151848,
+      "loss": 0.8164,
+      "step": 4615
+    },
+    {
+      "epoch": 0.3211242130161049,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0015875300055345114,
+      "loss": 0.929,
+      "step": 4616
+    },
+    {
+      "epoch": 0.3211937806532401,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.001587347650622026,
+      "loss": 1.0975,
+      "step": 4617
+    },
+    {
+      "epoch": 0.32126334829037534,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0015871652658869869,
+      "loss": 0.9612,
+      "step": 4618
+    },
+    {
+      "epoch": 0.3213329159275105,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.001586982851338655,
+      "loss": 0.818,
+      "step": 4619
+    },
+    {
+      "epoch": 0.32140248356464574,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.001586800406986293,
+      "loss": 0.8031,
+      "step": 4620
+    },
+    {
+      "epoch": 0.3214720512017809,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0015866179328391636,
+      "loss": 1.0245,
+      "step": 4621
+    },
+    {
+      "epoch": 0.32154161883891613,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0015864354289065324,
+      "loss": 0.7114,
+      "step": 4622
+    },
+    {
+      "epoch": 0.32161118647605136,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.001586252895197666,
+      "loss": 0.848,
+      "step": 4623
+    },
+    {
+      "epoch": 0.32168075411318653,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0015860703317218325,
+      "loss": 0.9736,
+      "step": 4624
+    },
+    {
+      "epoch": 0.32175032175032175,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0015858877384883018,
+      "loss": 0.899,
+      "step": 4625
+    },
+    {
+      "epoch": 0.321819889387457,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.001585705115506345,
+      "loss": 0.912,
+      "step": 4626
+    },
+    {
+      "epoch": 0.32188945702459215,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.001585522462785234,
+      "loss": 1.0388,
+      "step": 4627
+    },
+    {
+      "epoch": 0.3219590246617274,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.001585339780334244,
+      "loss": 0.8447,
+      "step": 4628
+    },
+    {
+      "epoch": 0.32202859229886255,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0015851570681626502,
+      "loss": 0.9812,
+      "step": 4629
+    },
+    {
+      "epoch": 0.3220981599359978,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0015849743262797299,
+      "loss": 0.8619,
+      "step": 4630
+    },
+    {
+      "epoch": 0.322167727573133,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0015847915546947618,
+      "loss": 0.8297,
+      "step": 4631
+    },
+    {
+      "epoch": 0.32223729521026817,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.001584608753417026,
+      "loss": 1.0828,
+      "step": 4632
+    },
+    {
+      "epoch": 0.3223068628474034,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0015844259224558044,
+      "loss": 0.9393,
+      "step": 4633
+    },
+    {
+      "epoch": 0.32237643048453857,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0015842430618203803,
+      "loss": 1.045,
+      "step": 4634
+    },
+    {
+      "epoch": 0.3224459981216738,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0015840601715200382,
+      "loss": 0.9415,
+      "step": 4635
+    },
+    {
+      "epoch": 0.322515565758809,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0015838772515640645,
+      "loss": 1.0985,
+      "step": 4636
+    },
+    {
+      "epoch": 0.3225851333959442,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.0015836943019617467,
+      "loss": 0.8123,
+      "step": 4637
+    },
+    {
+      "epoch": 0.3226547010330794,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0015835113227223748,
+      "loss": 0.5864,
+      "step": 4638
+    },
+    {
+      "epoch": 0.32272426867021464,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0015833283138552386,
+      "loss": 1.0993,
+      "step": 4639
+    },
+    {
+      "epoch": 0.3227938363073498,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0015831452753696312,
+      "loss": 1.0382,
+      "step": 4640
+    },
+    {
+      "epoch": 0.32286340394448504,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0015829622072748455,
+      "loss": 0.9563,
+      "step": 4641
+    },
+    {
+      "epoch": 0.3229329715816202,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0015827791095801777,
+      "loss": 0.7008,
+      "step": 4642
+    },
+    {
+      "epoch": 0.32300253921875544,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.001582595982294924,
+      "loss": 0.9963,
+      "step": 4643
+    },
+    {
+      "epoch": 0.32307210685589066,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0015824128254283828,
+      "loss": 1.0364,
+      "step": 4644
+    },
+    {
+      "epoch": 0.32314167449302583,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0015822296389898538,
+      "loss": 1.0407,
+      "step": 4645
+    },
+    {
+      "epoch": 0.32321124213016106,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0015820464229886384,
+      "loss": 0.7409,
+      "step": 4646
+    },
+    {
+      "epoch": 0.32328080976729623,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0015818631774340394,
+      "loss": 0.9008,
+      "step": 4647
+    },
+    {
+      "epoch": 0.32335037740443145,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0015816799023353613,
+      "loss": 0.9936,
+      "step": 4648
+    },
+    {
+      "epoch": 0.3234199450415667,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0015814965977019094,
+      "loss": 1.1239,
+      "step": 4649
+    },
+    {
+      "epoch": 0.32348951267870185,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0015813132635429912,
+      "loss": 1.0262,
+      "step": 4650
+    },
+    {
+      "epoch": 0.3235590803158371,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0015811298998679156,
+      "loss": 0.6669,
+      "step": 4651
+    },
+    {
+      "epoch": 0.3236286479529723,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0015809465066859928,
+      "loss": 1.0144,
+      "step": 4652
+    },
+    {
+      "epoch": 0.3236982155901075,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.0015807630840065346,
+      "loss": 0.7395,
+      "step": 4653
+    },
+    {
+      "epoch": 0.3237677832272427,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0015805796318388544,
+      "loss": 0.9497,
+      "step": 4654
+    },
+    {
+      "epoch": 0.32383735086437787,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0015803961501922666,
+      "loss": 1.0169,
+      "step": 4655
+    },
+    {
+      "epoch": 0.3239069185015131,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0015802126390760875,
+      "loss": 1.2319,
+      "step": 4656
+    },
+    {
+      "epoch": 0.3239764861386483,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0015800290984996355,
+      "loss": 0.7076,
+      "step": 4657
+    },
+    {
+      "epoch": 0.3240460537757835,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0015798455284722294,
+      "loss": 1.0573,
+      "step": 4658
+    },
+    {
+      "epoch": 0.3241156214129187,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.0015796619290031897,
+      "loss": 1.17,
+      "step": 4659
+    },
+    {
+      "epoch": 0.3241851890500539,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.001579478300101839,
+      "loss": 1.0127,
+      "step": 4660
+    },
+    {
+      "epoch": 0.3242547566871891,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0015792946417775013,
+      "loss": 0.927,
+      "step": 4661
+    },
+    {
+      "epoch": 0.32432432432432434,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0015791109540395014,
+      "loss": 1.0321,
+      "step": 4662
+    },
+    {
+      "epoch": 0.3243938919614595,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0015789272368971663,
+      "loss": 0.758,
+      "step": 4663
+    },
+    {
+      "epoch": 0.32446345959859474,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.001578743490359824,
+      "loss": 1.1223,
+      "step": 4664
+    },
+    {
+      "epoch": 0.32453302723572997,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0015785597144368042,
+      "loss": 0.9884,
+      "step": 4665
+    },
+    {
+      "epoch": 0.32460259487286514,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0015783759091374386,
+      "loss": 1.0554,
+      "step": 4666
+    },
+    {
+      "epoch": 0.32467216251000036,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0015781920744710593,
+      "loss": 0.7594,
+      "step": 4667
+    },
+    {
+      "epoch": 0.32474173014713553,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0015780082104470009,
+      "loss": 0.9857,
+      "step": 4668
+    },
+    {
+      "epoch": 0.32481129778427076,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0015778243170745988,
+      "loss": 0.815,
+      "step": 4669
+    },
+    {
+      "epoch": 0.324880865421406,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0015776403943631905,
+      "loss": 1.0044,
+      "step": 4670
+    },
+    {
+      "epoch": 0.32495043305854115,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0015774564423221143,
+      "loss": 1.1304,
+      "step": 4671
+    },
+    {
+      "epoch": 0.3250200006956764,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.0015772724609607108,
+      "loss": 0.9644,
+      "step": 4672
+    },
+    {
+      "epoch": 0.32508956833281155,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.001577088450288321,
+      "loss": 0.7194,
+      "step": 4673
+    },
+    {
+      "epoch": 0.3251591359699468,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.001576904410314289,
+      "loss": 0.8332,
+      "step": 4674
+    },
+    {
+      "epoch": 0.325228703607082,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0015767203410479587,
+      "loss": 0.9187,
+      "step": 4675
+    },
+    {
+      "epoch": 0.3252982712442172,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.001576536242498676,
+      "loss": 1.1336,
+      "step": 4676
+    },
+    {
+      "epoch": 0.3253678388813524,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.0015763521146757893,
+      "loss": 0.6083,
+      "step": 4677
+    },
+    {
+      "epoch": 0.3254374065184876,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.001576167957588647,
+      "loss": 0.9975,
+      "step": 4678
+    },
+    {
+      "epoch": 0.3255069741556228,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.0015759837712465998,
+      "loss": 1.0539,
+      "step": 4679
+    },
+    {
+      "epoch": 0.325576541792758,
+      "grad_norm": 1.5625,
+      "learning_rate": 0.001575799555659,
+      "loss": 1.2816,
+      "step": 4680
+    },
+    {
+      "epoch": 0.3256461094298932,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0015756153108352012,
+      "loss": 1.0234,
+      "step": 4681
+    },
+    {
+      "epoch": 0.3257156770670284,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0015754310367845582,
+      "loss": 1.026,
+      "step": 4682
+    },
+    {
+      "epoch": 0.32578524470416365,
+      "grad_norm": 1.609375,
+      "learning_rate": 0.001575246733516427,
+      "loss": 1.1533,
+      "step": 4683
+    },
+    {
+      "epoch": 0.3258548123412988,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.001575062401040167,
+      "loss": 0.6291,
+      "step": 4684
+    },
+    {
+      "epoch": 0.32592437997843404,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.001574878039365136,
+      "loss": 0.7976,
+      "step": 4685
+    },
+    {
+      "epoch": 0.3259939476155692,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0015746936485006961,
+      "loss": 0.9032,
+      "step": 4686
+    },
+    {
+      "epoch": 0.32606351525270444,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0015745092284562094,
+      "loss": 0.8683,
+      "step": 4687
+    },
+    {
+      "epoch": 0.32613308288983966,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00157432477924104,
+      "loss": 0.9426,
+      "step": 4688
+    },
+    {
+      "epoch": 0.32620265052697484,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.001574140300864553,
+      "loss": 0.8228,
+      "step": 4689
+    },
+    {
+      "epoch": 0.32627221816411006,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0015739557933361153,
+      "loss": 0.9683,
+      "step": 4690
+    },
+    {
+      "epoch": 0.3263417858012453,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0015737712566650955,
+      "loss": 1.0398,
+      "step": 4691
+    },
+    {
+      "epoch": 0.32641135343838046,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0015735866908608632,
+      "loss": 1.0982,
+      "step": 4692
+    },
+    {
+      "epoch": 0.3264809210755157,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00157340209593279,
+      "loss": 0.8874,
+      "step": 4693
+    },
+    {
+      "epoch": 0.32655048871265085,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.001573217471890248,
+      "loss": 1.1152,
+      "step": 4694
+    },
+    {
+      "epoch": 0.3266200563497861,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.0015730328187426126,
+      "loss": 1.0359,
+      "step": 4695
+    },
+    {
+      "epoch": 0.3266896239869213,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0015728481364992587,
+      "loss": 0.8886,
+      "step": 4696
+    },
+    {
+      "epoch": 0.3267591916240565,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.001572663425169564,
+      "loss": 0.7187,
+      "step": 4697
+    },
+    {
+      "epoch": 0.3268287592611917,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0015724786847629067,
+      "loss": 0.9773,
+      "step": 4698
+    },
+    {
+      "epoch": 0.3268983268983269,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0015722939152886676,
+      "loss": 0.7418,
+      "step": 4699
+    },
+    {
+      "epoch": 0.3269678945354621,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0015721091167562279,
+      "loss": 0.9311,
+      "step": 4700
+    },
+    {
+      "epoch": 0.3270374621725973,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0015719242891749708,
+      "loss": 0.8655,
+      "step": 4701
+    },
+    {
+      "epoch": 0.3271070298097325,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0015717394325542814,
+      "loss": 0.5923,
+      "step": 4702
+    },
+    {
+      "epoch": 0.3271765974468677,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0015715545469035448,
+      "loss": 0.7129,
+      "step": 4703
+    },
+    {
+      "epoch": 0.32724616508400295,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0015713696322321496,
+      "loss": 1.1562,
+      "step": 4704
+    },
+    {
+      "epoch": 0.3273157327211381,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0015711846885494843,
+      "loss": 1.1207,
+      "step": 4705
+    },
+    {
+      "epoch": 0.32738530035827335,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0015709997158649394,
+      "loss": 0.846,
+      "step": 4706
+    },
+    {
+      "epoch": 0.3274548679954085,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.001570814714187907,
+      "loss": 0.8207,
+      "step": 4707
+    },
+    {
+      "epoch": 0.32752443563254374,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0015706296835277804,
+      "loss": 0.7765,
+      "step": 4708
+    },
+    {
+      "epoch": 0.32759400326967897,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.001570444623893955,
+      "loss": 0.7479,
+      "step": 4709
+    },
+    {
+      "epoch": 0.32766357090681414,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0015702595352958266,
+      "loss": 1.097,
+      "step": 4710
+    },
+    {
+      "epoch": 0.32773313854394936,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0015700744177427933,
+      "loss": 0.7712,
+      "step": 4711
+    },
+    {
+      "epoch": 0.32780270618108454,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0015698892712442546,
+      "loss": 0.8461,
+      "step": 4712
+    },
+    {
+      "epoch": 0.32787227381821976,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.0015697040958096112,
+      "loss": 1.0414,
+      "step": 4713
+    },
+    {
+      "epoch": 0.327941841455355,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0015695188914482655,
+      "loss": 0.8578,
+      "step": 4714
+    },
+    {
+      "epoch": 0.32801140909249016,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0015693336581696204,
+      "loss": 0.996,
+      "step": 4715
+    },
+    {
+      "epoch": 0.3280809767296254,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0015691483959830825,
+      "loss": 0.8574,
+      "step": 4716
+    },
+    {
+      "epoch": 0.3281505443667606,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0015689631048980575,
+      "loss": 0.9643,
+      "step": 4717
+    },
+    {
+      "epoch": 0.3282201120038958,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0015687777849239537,
+      "loss": 0.6624,
+      "step": 4718
+    },
+    {
+      "epoch": 0.328289679641031,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.001568592436070181,
+      "loss": 0.7277,
+      "step": 4719
+    },
+    {
+      "epoch": 0.3283592472781662,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0015684070583461504,
+      "loss": 0.8696,
+      "step": 4720
+    },
+    {
+      "epoch": 0.3284288149153014,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0015682216517612741,
+      "loss": 0.8684,
+      "step": 4721
+    },
+    {
+      "epoch": 0.32849838255243663,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0015680362163249665,
+      "loss": 0.8038,
+      "step": 4722
+    },
+    {
+      "epoch": 0.3285679501895718,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.001567850752046643,
+      "loss": 0.8145,
+      "step": 4723
+    },
+    {
+      "epoch": 0.328637517826707,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0015676652589357203,
+      "loss": 1.1258,
+      "step": 4724
+    },
+    {
+      "epoch": 0.3287070854638422,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0015674797370016172,
+      "loss": 0.9553,
+      "step": 4725
+    },
+    {
+      "epoch": 0.3287766531009774,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0015672941862537534,
+      "loss": 1.1712,
+      "step": 4726
+    },
+    {
+      "epoch": 0.32884622073811265,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0015671086067015501,
+      "loss": 1.0842,
+      "step": 4727
+    },
+    {
+      "epoch": 0.3289157883752478,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0015669229983544303,
+      "loss": 0.854,
+      "step": 4728
+    },
+    {
+      "epoch": 0.32898535601238305,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0015667373612218176,
+      "loss": 0.9932,
+      "step": 4729
+    },
+    {
+      "epoch": 0.32905492364951827,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.001566551695313139,
+      "loss": 1.0276,
+      "step": 4730
+    },
+    {
+      "epoch": 0.32912449128665344,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0015663660006378203,
+      "loss": 0.9543,
+      "step": 4731
+    },
+    {
+      "epoch": 0.32919405892378867,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0015661802772052914,
+      "loss": 0.8605,
+      "step": 4732
+    },
+    {
+      "epoch": 0.32926362656092384,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0015659945250249814,
+      "loss": 1.184,
+      "step": 4733
+    },
+    {
+      "epoch": 0.32933319419805906,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0015658087441063225,
+      "loss": 0.7605,
+      "step": 4734
+    },
+    {
+      "epoch": 0.3294027618351943,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0015656229344587472,
+      "loss": 0.9397,
+      "step": 4735
+    },
+    {
+      "epoch": 0.32947232947232946,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0015654370960916904,
+      "loss": 0.6376,
+      "step": 4736
+    },
+    {
+      "epoch": 0.3295418971094647,
+      "grad_norm": 1.6015625,
+      "learning_rate": 0.001565251229014588,
+      "loss": 0.9573,
+      "step": 4737
+    },
+    {
+      "epoch": 0.32961146474659986,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.001565065333236877,
+      "loss": 0.8613,
+      "step": 4738
+    },
+    {
+      "epoch": 0.3296810323837351,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0015648794087679968,
+      "loss": 0.7105,
+      "step": 4739
+    },
+    {
+      "epoch": 0.3297506000208703,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0015646934556173872,
+      "loss": 1.0448,
+      "step": 4740
+    },
+    {
+      "epoch": 0.3298201676580055,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0015645074737944897,
+      "loss": 0.9482,
+      "step": 4741
+    },
+    {
+      "epoch": 0.3298897352951407,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0015643214633087488,
+      "loss": 0.9157,
+      "step": 4742
+    },
+    {
+      "epoch": 0.32995930293227593,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0015641354241696082,
+      "loss": 0.8553,
+      "step": 4743
+    },
+    {
+      "epoch": 0.3300288705694111,
+      "grad_norm": 1.5625,
+      "learning_rate": 0.001563949356386514,
+      "loss": 0.8695,
+      "step": 4744
+    },
+    {
+      "epoch": 0.33009843820654633,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0015637632599689141,
+      "loss": 1.2751,
+      "step": 4745
+    },
+    {
+      "epoch": 0.3301680058436815,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0015635771349262577,
+      "loss": 1.0886,
+      "step": 4746
+    },
+    {
+      "epoch": 0.3302375734808167,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0015633909812679948,
+      "loss": 0.7407,
+      "step": 4747
+    },
+    {
+      "epoch": 0.33030714111795195,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0015632047990035774,
+      "loss": 1.0269,
+      "step": 4748
+    },
+    {
+      "epoch": 0.3303767087550871,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0015630185881424592,
+      "loss": 1.027,
+      "step": 4749
+    },
+    {
+      "epoch": 0.33044627639222235,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0015628323486940952,
+      "loss": 0.6675,
+      "step": 4750
+    },
+    {
+      "epoch": 0.3305158440293575,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0015626460806679413,
+      "loss": 0.764,
+      "step": 4751
+    },
+    {
+      "epoch": 0.33058541166649275,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0015624597840734552,
+      "loss": 1.0752,
+      "step": 4752
+    },
+    {
+      "epoch": 0.33065497930362797,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0015622734589200962,
+      "loss": 0.9472,
+      "step": 4753
+    },
+    {
+      "epoch": 0.33072454694076314,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.001562087105217325,
+      "loss": 1.0138,
+      "step": 4754
+    },
+    {
+      "epoch": 0.33079411457789837,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0015619007229746038,
+      "loss": 0.9164,
+      "step": 4755
+    },
+    {
+      "epoch": 0.33086368221503354,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0015617143122013963,
+      "loss": 0.8213,
+      "step": 4756
+    },
+    {
+      "epoch": 0.33093324985216876,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.001561527872907167,
+      "loss": 0.8078,
+      "step": 4757
+    },
+    {
+      "epoch": 0.331002817489304,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0015613414051013827,
+      "loss": 1.1892,
+      "step": 4758
+    },
+    {
+      "epoch": 0.33107238512643916,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0015611549087935115,
+      "loss": 0.8594,
+      "step": 4759
+    },
+    {
+      "epoch": 0.3311419527635744,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0015609683839930223,
+      "loss": 0.8934,
+      "step": 4760
+    },
+    {
+      "epoch": 0.3312115204007096,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0015607818307093856,
+      "loss": 0.7721,
+      "step": 4761
+    },
+    {
+      "epoch": 0.3312810880378448,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0015605952489520748,
+      "loss": 0.9527,
+      "step": 4762
+    },
+    {
+      "epoch": 0.33135065567498,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0015604086387305625,
+      "loss": 0.9401,
+      "step": 4763
+    },
+    {
+      "epoch": 0.3314202233121152,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0015602220000543242,
+      "loss": 0.8029,
+      "step": 4764
+    },
+    {
+      "epoch": 0.3314897909492504,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0015600353329328364,
+      "loss": 0.8822,
+      "step": 4765
+    },
+    {
+      "epoch": 0.33155935858638563,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0015598486373755774,
+      "loss": 1.0305,
+      "step": 4766
+    },
+    {
+      "epoch": 0.3316289262235208,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0015596619133920262,
+      "loss": 0.9862,
+      "step": 4767
+    },
+    {
+      "epoch": 0.33169849386065603,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.0015594751609916643,
+      "loss": 0.9982,
+      "step": 4768
+    },
+    {
+      "epoch": 0.3317680614977912,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0015592883801839733,
+      "loss": 0.7449,
+      "step": 4769
+    },
+    {
+      "epoch": 0.3318376291349264,
+      "grad_norm": 1.71875,
+      "learning_rate": 0.0015591015709784375,
+      "loss": 0.6865,
+      "step": 4770
+    },
+    {
+      "epoch": 0.33190719677206165,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.001558914733384542,
+      "loss": 0.9245,
+      "step": 4771
+    },
+    {
+      "epoch": 0.3319767644091968,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0015587278674117735,
+      "loss": 0.7876,
+      "step": 4772
+    },
+    {
+      "epoch": 0.33204633204633205,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.00155854097306962,
+      "loss": 0.8075,
+      "step": 4773
+    },
+    {
+      "epoch": 0.3321158996834673,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0015583540503675715,
+      "loss": 1.0019,
+      "step": 4774
+    },
+    {
+      "epoch": 0.33218546732060245,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0015581670993151183,
+      "loss": 0.9934,
+      "step": 4775
+    },
+    {
+      "epoch": 0.33225503495773767,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0015579801199217533,
+      "loss": 0.9749,
+      "step": 4776
+    },
+    {
+      "epoch": 0.33232460259487284,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0015577931121969703,
+      "loss": 0.7169,
+      "step": 4777
+    },
+    {
+      "epoch": 0.33239417023200807,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0015576060761502643,
+      "loss": 0.9887,
+      "step": 4778
+    },
+    {
+      "epoch": 0.3324637378691433,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0015574190117911325,
+      "loss": 1.1363,
+      "step": 4779
+    },
+    {
+      "epoch": 0.33253330550627846,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0015572319191290726,
+      "loss": 0.9078,
+      "step": 4780
+    },
+    {
+      "epoch": 0.3326028731434137,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.001557044798173585,
+      "loss": 0.8433,
+      "step": 4781
+    },
+    {
+      "epoch": 0.33267244078054886,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0015568576489341699,
+      "loss": 0.9377,
+      "step": 4782
+    },
+    {
+      "epoch": 0.3327420084176841,
+      "grad_norm": 0.875,
+      "learning_rate": 0.00155667047142033,
+      "loss": 0.775,
+      "step": 4783
+    },
+    {
+      "epoch": 0.3328115760548193,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0015564832656415697,
+      "loss": 1.1054,
+      "step": 4784
+    },
+    {
+      "epoch": 0.3328811436919545,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0015562960316073938,
+      "loss": 0.8957,
+      "step": 4785
+    },
+    {
+      "epoch": 0.3329507113290897,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0015561087693273098,
+      "loss": 0.6607,
+      "step": 4786
+    },
+    {
+      "epoch": 0.33302027896622494,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.001555921478810825,
+      "loss": 0.8963,
+      "step": 4787
+    },
+    {
+      "epoch": 0.3330898466033601,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00155573416006745,
+      "loss": 0.9921,
+      "step": 4788
+    },
+    {
+      "epoch": 0.33315941424049533,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.001555546813106695,
+      "loss": 1.1758,
+      "step": 4789
+    },
+    {
+      "epoch": 0.3332289818776305,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.0015553594379380733,
+      "loss": 1.159,
+      "step": 4790
+    },
+    {
+      "epoch": 0.33329854951476573,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0015551720345710987,
+      "loss": 1.0012,
+      "step": 4791
+    },
+    {
+      "epoch": 0.33336811715190096,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0015549846030152858,
+      "loss": 0.9196,
+      "step": 4792
+    },
+    {
+      "epoch": 0.3334376847890361,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0015547971432801528,
+      "loss": 0.9451,
+      "step": 4793
+    },
+    {
+      "epoch": 0.33350725242617135,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.001554609655375217,
+      "loss": 0.8466,
+      "step": 4794
+    },
+    {
+      "epoch": 0.3335768200633065,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0015544221393099984,
+      "loss": 0.7053,
+      "step": 4795
+    },
+    {
+      "epoch": 0.33364638770044175,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0015542345950940177,
+      "loss": 0.8763,
+      "step": 4796
+    },
+    {
+      "epoch": 0.333715955337577,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0015540470227367984,
+      "loss": 1.0129,
+      "step": 4797
+    },
+    {
+      "epoch": 0.33378552297471215,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0015538594222478635,
+      "loss": 0.9119,
+      "step": 4798
+    },
+    {
+      "epoch": 0.33385509061184737,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.001553671793636739,
+      "loss": 0.9252,
+      "step": 4799
+    },
+    {
+      "epoch": 0.3339246582489826,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0015534841369129514,
+      "loss": 0.9074,
+      "step": 4800
+    },
+    {
+      "epoch": 0.33399422588611777,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.001553296452086029,
+      "loss": 0.8542,
+      "step": 4801
+    },
+    {
+      "epoch": 0.334063793523253,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0015531087391655017,
+      "loss": 1.1449,
+      "step": 4802
+    },
+    {
+      "epoch": 0.33413336116038816,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0015529209981609005,
+      "loss": 0.9562,
+      "step": 4803
+    },
+    {
+      "epoch": 0.3342029287975234,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.001552733229081758,
+      "loss": 0.9129,
+      "step": 4804
+    },
+    {
+      "epoch": 0.3342724964346586,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0015525454319376079,
+      "loss": 1.0567,
+      "step": 4805
+    },
+    {
+      "epoch": 0.3343420640717938,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.0015523576067379861,
+      "loss": 1.0974,
+      "step": 4806
+    },
+    {
+      "epoch": 0.334411631708929,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.001552169753492429,
+      "loss": 0.9468,
+      "step": 4807
+    },
+    {
+      "epoch": 0.3344811993460642,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0015519818722104747,
+      "loss": 0.7054,
+      "step": 4808
+    },
+    {
+      "epoch": 0.3345507669831994,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0015517939629016634,
+      "loss": 1.3383,
+      "step": 4809
+    },
+    {
+      "epoch": 0.33462033462033464,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.001551606025575536,
+      "loss": 0.8769,
+      "step": 4810
+    },
+    {
+      "epoch": 0.3346899022574698,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0015514180602416348,
+      "loss": 0.8681,
+      "step": 4811
+    },
+    {
+      "epoch": 0.33475946989460503,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0015512300669095036,
+      "loss": 0.9219,
+      "step": 4812
+    },
+    {
+      "epoch": 0.33482903753174026,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0015510420455886885,
+      "loss": 1.2348,
+      "step": 4813
+    },
+    {
+      "epoch": 0.33489860516887543,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0015508539962887356,
+      "loss": 0.6768,
+      "step": 4814
+    },
+    {
+      "epoch": 0.33496817280601066,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.001550665919019193,
+      "loss": 0.8907,
+      "step": 4815
+    },
+    {
+      "epoch": 0.3350377404431458,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0015504778137896108,
+      "loss": 0.93,
+      "step": 4816
+    },
+    {
+      "epoch": 0.33510730808028105,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0015502896806095397,
+      "loss": 1.2813,
+      "step": 4817
+    },
+    {
+      "epoch": 0.3351768757174163,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0015501015194885326,
+      "loss": 1.2485,
+      "step": 4818
+    },
+    {
+      "epoch": 0.33524644335455145,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0015499133304361426,
+      "loss": 0.9347,
+      "step": 4819
+    },
+    {
+      "epoch": 0.3353160109916867,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.001549725113461926,
+      "loss": 0.8417,
+      "step": 4820
+    },
+    {
+      "epoch": 0.33538557862882185,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0015495368685754386,
+      "loss": 0.765,
+      "step": 4821
+    },
+    {
+      "epoch": 0.33545514626595707,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0015493485957862388,
+      "loss": 1.0378,
+      "step": 4822
+    },
+    {
+      "epoch": 0.3355247139030923,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0015491602951038866,
+      "loss": 0.891,
+      "step": 4823
+    },
+    {
+      "epoch": 0.33559428154022747,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0015489719665379422,
+      "loss": 1.2746,
+      "step": 4824
+    },
+    {
+      "epoch": 0.3356638491773627,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0015487836100979686,
+      "loss": 1.024,
+      "step": 4825
+    },
+    {
+      "epoch": 0.3357334168144979,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0015485952257935293,
+      "loss": 0.8937,
+      "step": 4826
+    },
+    {
+      "epoch": 0.3358029844516331,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0015484068136341898,
+      "loss": 1.1165,
+      "step": 4827
+    },
+    {
+      "epoch": 0.3358725520887683,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.001548218373629516,
+      "loss": 0.8166,
+      "step": 4828
+    },
+    {
+      "epoch": 0.3359421197259035,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0015480299057890768,
+      "loss": 0.7577,
+      "step": 4829
+    },
+    {
+      "epoch": 0.3360116873630387,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0015478414101224409,
+      "loss": 1.0613,
+      "step": 4830
+    },
+    {
+      "epoch": 0.33608125500017394,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0015476528866391797,
+      "loss": 0.8719,
+      "step": 4831
+    },
+    {
+      "epoch": 0.3361508226373091,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0015474643353488653,
+      "loss": 0.876,
+      "step": 4832
+    },
+    {
+      "epoch": 0.33622039027444434,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0015472757562610714,
+      "loss": 0.6715,
+      "step": 4833
+    },
+    {
+      "epoch": 0.3362899579115795,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0015470871493853734,
+      "loss": 0.7938,
+      "step": 4834
+    },
+    {
+      "epoch": 0.33635952554871473,
+      "grad_norm": 0.74609375,
+      "learning_rate": 0.0015468985147313468,
+      "loss": 0.7811,
+      "step": 4835
+    },
+    {
+      "epoch": 0.33642909318584996,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0015467098523085706,
+      "loss": 0.9007,
+      "step": 4836
+    },
+    {
+      "epoch": 0.33649866082298513,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0015465211621266237,
+      "loss": 0.8734,
+      "step": 4837
+    },
+    {
+      "epoch": 0.33656822846012036,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0015463324441950868,
+      "loss": 0.8147,
+      "step": 4838
+    },
+    {
+      "epoch": 0.3366377960972556,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0015461436985235422,
+      "loss": 1.0236,
+      "step": 4839
+    },
+    {
+      "epoch": 0.33670736373439075,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0015459549251215733,
+      "loss": 0.9286,
+      "step": 4840
+    },
+    {
+      "epoch": 0.336776931371526,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.001545766123998765,
+      "loss": 0.6887,
+      "step": 4841
+    },
+    {
+      "epoch": 0.33684649900866115,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.001545577295164704,
+      "loss": 0.8163,
+      "step": 4842
+    },
+    {
+      "epoch": 0.3369160666457964,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0015453884386289775,
+      "loss": 0.9204,
+      "step": 4843
+    },
+    {
+      "epoch": 0.3369856342829316,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0015451995544011755,
+      "loss": 0.9626,
+      "step": 4844
+    },
+    {
+      "epoch": 0.33705520192006677,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0015450106424908876,
+      "loss": 0.8108,
+      "step": 4845
+    },
+    {
+      "epoch": 0.337124769557202,
+      "grad_norm": 1.125,
+      "learning_rate": 0.001544821702907707,
+      "loss": 1.0259,
+      "step": 4846
+    },
+    {
+      "epoch": 0.33719433719433717,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.001544632735661226,
+      "loss": 0.906,
+      "step": 4847
+    },
+    {
+      "epoch": 0.3372639048314724,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.00154444374076104,
+      "loss": 1.0432,
+      "step": 4848
+    },
+    {
+      "epoch": 0.3373334724686076,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0015442547182167449,
+      "loss": 0.8352,
+      "step": 4849
+    },
+    {
+      "epoch": 0.3374030401057428,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0015440656680379386,
+      "loss": 0.8683,
+      "step": 4850
+    },
+    {
+      "epoch": 0.337472607742878,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0015438765902342198,
+      "loss": 0.7643,
+      "step": 4851
+    },
+    {
+      "epoch": 0.33754217538001324,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0015436874848151893,
+      "loss": 0.9901,
+      "step": 4852
+    },
+    {
+      "epoch": 0.3376117430171484,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0015434983517904485,
+      "loss": 0.7876,
+      "step": 4853
+    },
+    {
+      "epoch": 0.33768131065428364,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0015433091911696009,
+      "loss": 1.1143,
+      "step": 4854
+    },
+    {
+      "epoch": 0.3377508782914188,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0015431200029622511,
+      "loss": 1.0496,
+      "step": 4855
+    },
+    {
+      "epoch": 0.33782044592855404,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.001542930787178005,
+      "loss": 0.9421,
+      "step": 4856
+    },
+    {
+      "epoch": 0.33789001356568926,
+      "grad_norm": 0.875,
+      "learning_rate": 0.0015427415438264702,
+      "loss": 0.6855,
+      "step": 4857
+    },
+    {
+      "epoch": 0.33795958120282443,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0015425522729172552,
+      "loss": 0.639,
+      "step": 4858
+    },
+    {
+      "epoch": 0.33802914883995966,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0015423629744599709,
+      "loss": 0.854,
+      "step": 4859
+    },
+    {
+      "epoch": 0.33809871647709483,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.001542173648464228,
+      "loss": 0.9457,
+      "step": 4860
+    },
+    {
+      "epoch": 0.33816828411423006,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0015419842949396404,
+      "loss": 1.1196,
+      "step": 4861
+    },
+    {
+      "epoch": 0.3382378517513653,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.0015417949138958218,
+      "loss": 0.901,
+      "step": 4862
+    },
+    {
+      "epoch": 0.33830741938850045,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0015416055053423885,
+      "loss": 1.0023,
+      "step": 4863
+    },
+    {
+      "epoch": 0.3383769870256357,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0015414160692889575,
+      "loss": 0.8722,
+      "step": 4864
+    },
+    {
+      "epoch": 0.3384465546627709,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0015412266057451471,
+      "loss": 0.8777,
+      "step": 4865
+    },
+    {
+      "epoch": 0.3385161222999061,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.001541037114720578,
+      "loss": 0.7736,
+      "step": 4866
+    },
+    {
+      "epoch": 0.3385856899370413,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.001540847596224871,
+      "loss": 0.8343,
+      "step": 4867
+    },
+    {
+      "epoch": 0.33865525757417647,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0015406580502676497,
+      "loss": 0.7841,
+      "step": 4868
+    },
+    {
+      "epoch": 0.3387248252113117,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0015404684768585374,
+      "loss": 0.9431,
+      "step": 4869
+    },
+    {
+      "epoch": 0.3387943928484469,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0015402788760071598,
+      "loss": 0.9309,
+      "step": 4870
+    },
+    {
+      "epoch": 0.3388639604855821,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0015400892477231442,
+      "loss": 1.0093,
+      "step": 4871
+    },
+    {
+      "epoch": 0.3389335281227173,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.001539899592016119,
+      "loss": 0.8758,
+      "step": 4872
+    },
+    {
+      "epoch": 0.3390030957598525,
+      "grad_norm": 1.5,
+      "learning_rate": 0.0015397099088957137,
+      "loss": 1.0624,
+      "step": 4873
+    },
+    {
+      "epoch": 0.3390726633969877,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0015395201983715594,
+      "loss": 0.6798,
+      "step": 4874
+    },
+    {
+      "epoch": 0.33914223103412294,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.001539330460453289,
+      "loss": 1.0312,
+      "step": 4875
+    },
+    {
+      "epoch": 0.3392117986712581,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0015391406951505361,
+      "loss": 0.9884,
+      "step": 4876
+    },
+    {
+      "epoch": 0.33928136630839334,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0015389509024729365,
+      "loss": 0.9795,
+      "step": 4877
+    },
+    {
+      "epoch": 0.33935093394552857,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0015387610824301263,
+      "loss": 0.9417,
+      "step": 4878
+    },
+    {
+      "epoch": 0.33942050158266374,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.001538571235031744,
+      "loss": 0.7121,
+      "step": 4879
+    },
+    {
+      "epoch": 0.33949006921979896,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0015383813602874291,
+      "loss": 0.8359,
+      "step": 4880
+    },
+    {
+      "epoch": 0.33955963685693413,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0015381914582068223,
+      "loss": 1.0214,
+      "step": 4881
+    },
+    {
+      "epoch": 0.33962920449406936,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0015380015287995655,
+      "loss": 0.8943,
+      "step": 4882
+    },
+    {
+      "epoch": 0.3396987721312046,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0015378115720753032,
+      "loss": 0.7631,
+      "step": 4883
+    },
+    {
+      "epoch": 0.33976833976833976,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00153762158804368,
+      "loss": 0.9479,
+      "step": 4884
+    },
+    {
+      "epoch": 0.339837907405475,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0015374315767143422,
+      "loss": 1.1355,
+      "step": 4885
+    },
+    {
+      "epoch": 0.33990747504261015,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.001537241538096938,
+      "loss": 0.9332,
+      "step": 4886
+    },
+    {
+      "epoch": 0.3399770426797454,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0015370514722011163,
+      "loss": 0.7488,
+      "step": 4887
+    },
+    {
+      "epoch": 0.3400466103168806,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.001536861379036528,
+      "loss": 0.9931,
+      "step": 4888
+    },
+    {
+      "epoch": 0.3401161779540158,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0015366712586128246,
+      "loss": 0.9932,
+      "step": 4889
+    },
+    {
+      "epoch": 0.340185745591151,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00153648111093966,
+      "loss": 0.7979,
+      "step": 4890
+    },
+    {
+      "epoch": 0.3402553132282862,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0015362909360266883,
+      "loss": 1.0964,
+      "step": 4891
+    },
+    {
+      "epoch": 0.3403248808654214,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0015361007338835662,
+      "loss": 0.938,
+      "step": 4892
+    },
+    {
+      "epoch": 0.3403944485025566,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0015359105045199511,
+      "loss": 0.8411,
+      "step": 4893
+    },
+    {
+      "epoch": 0.3404640161396918,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0015357202479455016,
+      "loss": 1.1284,
+      "step": 4894
+    },
+    {
+      "epoch": 0.340533583776827,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.001535529964169878,
+      "loss": 1.0382,
+      "step": 4895
+    },
+    {
+      "epoch": 0.34060315141396225,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0015353396532027423,
+      "loss": 0.8875,
+      "step": 4896
+    },
+    {
+      "epoch": 0.3406727190510974,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.001535149315053757,
+      "loss": 1.2211,
+      "step": 4897
+    },
+    {
+      "epoch": 0.34074228668823264,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0015349589497325872,
+      "loss": 0.9812,
+      "step": 4898
+    },
+    {
+      "epoch": 0.3408118543253678,
+      "grad_norm": 1.375,
+      "learning_rate": 0.001534768557248898,
+      "loss": 0.8877,
+      "step": 4899
+    },
+    {
+      "epoch": 0.34088142196250304,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0015345781376123573,
+      "loss": 0.8944,
+      "step": 4900
+    },
+    {
+      "epoch": 0.34095098959963827,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.001534387690832633,
+      "loss": 0.8149,
+      "step": 4901
+    },
+    {
+      "epoch": 0.34102055723677344,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0015341972169193952,
+      "loss": 0.6687,
+      "step": 4902
+    },
+    {
+      "epoch": 0.34109012487390866,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0015340067158823155,
+      "loss": 0.8705,
+      "step": 4903
+    },
+    {
+      "epoch": 0.3411596925110439,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.001533816187731066,
+      "loss": 0.8809,
+      "step": 4904
+    },
+    {
+      "epoch": 0.34122926014817906,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0015336256324753215,
+      "loss": 0.868,
+      "step": 4905
+    },
+    {
+      "epoch": 0.3412988277853143,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0015334350501247569,
+      "loss": 0.7088,
+      "step": 4906
+    },
+    {
+      "epoch": 0.34136839542244946,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.001533244440689049,
+      "loss": 0.904,
+      "step": 4907
+    },
+    {
+      "epoch": 0.3414379630595847,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0015330538041778766,
+      "loss": 1.0991,
+      "step": 4908
+    },
+    {
+      "epoch": 0.3415075306967199,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0015328631406009183,
+      "loss": 0.9877,
+      "step": 4909
+    },
+    {
+      "epoch": 0.3415770983338551,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.001532672449967856,
+      "loss": 0.7401,
+      "step": 4910
+    },
+    {
+      "epoch": 0.3416466659709903,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0015324817322883715,
+      "loss": 0.6729,
+      "step": 4911
+    },
+    {
+      "epoch": 0.3417162336081255,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0015322909875721481,
+      "loss": 0.925,
+      "step": 4912
+    },
+    {
+      "epoch": 0.3417858012452607,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.001532100215828872,
+      "loss": 0.9261,
+      "step": 4913
+    },
+    {
+      "epoch": 0.3418553688823959,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0015319094170682282,
+      "loss": 0.7656,
+      "step": 4914
+    },
+    {
+      "epoch": 0.3419249365195311,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0015317185912999056,
+      "loss": 0.8597,
+      "step": 4915
+    },
+    {
+      "epoch": 0.3419945041566663,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.001531527738533593,
+      "loss": 0.6822,
+      "step": 4916
+    },
+    {
+      "epoch": 0.34206407179380155,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.001531336858778981,
+      "loss": 1.1433,
+      "step": 4917
+    },
+    {
+      "epoch": 0.3421336394309367,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0015311459520457613,
+      "loss": 0.9465,
+      "step": 4918
+    },
+    {
+      "epoch": 0.34220320706807195,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0015309550183436273,
+      "loss": 0.5912,
+      "step": 4919
+    },
+    {
+      "epoch": 0.3422727747052071,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0015307640576822737,
+      "loss": 0.9429,
+      "step": 4920
+    },
+    {
+      "epoch": 0.34234234234234234,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0015305730700713965,
+      "loss": 0.7966,
+      "step": 4921
+    },
+    {
+      "epoch": 0.34241190997947757,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0015303820555206931,
+      "loss": 0.9875,
+      "step": 4922
+    },
+    {
+      "epoch": 0.34248147761661274,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0015301910140398623,
+      "loss": 1.0872,
+      "step": 4923
+    },
+    {
+      "epoch": 0.34255104525374797,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.001529999945638604,
+      "loss": 0.9393,
+      "step": 4924
+    },
+    {
+      "epoch": 0.34262061289088314,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00152980885032662,
+      "loss": 0.9258,
+      "step": 4925
+    },
+    {
+      "epoch": 0.34269018052801836,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.001529617728113613,
+      "loss": 0.9441,
+      "step": 4926
+    },
+    {
+      "epoch": 0.3427597481651536,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0015294265790092873,
+      "loss": 0.8605,
+      "step": 4927
+    },
+    {
+      "epoch": 0.34282931580228876,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0015292354030233483,
+      "loss": 0.7356,
+      "step": 4928
+    },
+    {
+      "epoch": 0.342898883439424,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0015290442001655031,
+      "loss": 1.0271,
+      "step": 4929
+    },
+    {
+      "epoch": 0.3429684510765592,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0015288529704454601,
+      "loss": 1.0625,
+      "step": 4930
+    },
+    {
+      "epoch": 0.3430380187136944,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0015286617138729288,
+      "loss": 0.9363,
+      "step": 4931
+    },
+    {
+      "epoch": 0.3431075863508296,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0015284704304576204,
+      "loss": 0.9087,
+      "step": 4932
+    },
+    {
+      "epoch": 0.3431771539879648,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0015282791202092475,
+      "loss": 1.0606,
+      "step": 4933
+    },
+    {
+      "epoch": 0.3432467216251,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.001528087783137523,
+      "loss": 0.9597,
+      "step": 4934
+    },
+    {
+      "epoch": 0.34331628926223523,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0015278964192521629,
+      "loss": 1.043,
+      "step": 4935
+    },
+    {
+      "epoch": 0.3433858568993704,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0015277050285628835,
+      "loss": 0.7973,
+      "step": 4936
+    },
+    {
+      "epoch": 0.3434554245365056,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0015275136110794027,
+      "loss": 0.9741,
+      "step": 4937
+    },
+    {
+      "epoch": 0.3435249921736408,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0015273221668114392,
+      "loss": 0.8344,
+      "step": 4938
+    },
+    {
+      "epoch": 0.343594559810776,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0015271306957687142,
+      "loss": 0.991,
+      "step": 4939
+    },
+    {
+      "epoch": 0.34366412744791125,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.001526939197960949,
+      "loss": 0.6336,
+      "step": 4940
+    },
+    {
+      "epoch": 0.3437336950850464,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.001526747673397868,
+      "loss": 0.8836,
+      "step": 4941
+    },
+    {
+      "epoch": 0.34380326272218165,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.0015265561220891948,
+      "loss": 0.9956,
+      "step": 4942
+    },
+    {
+      "epoch": 0.3438728303593169,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0015263645440446558,
+      "loss": 0.7178,
+      "step": 4943
+    },
+    {
+      "epoch": 0.34394239799645204,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0015261729392739786,
+      "loss": 0.7417,
+      "step": 4944
+    },
+    {
+      "epoch": 0.34401196563358727,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.001525981307786891,
+      "loss": 0.6659,
+      "step": 4945
+    },
+    {
+      "epoch": 0.34408153327072244,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0015257896495931244,
+      "loss": 1.1262,
+      "step": 4946
+    },
+    {
+      "epoch": 0.34415110090785767,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.001525597964702409,
+      "loss": 1.0506,
+      "step": 4947
+    },
+    {
+      "epoch": 0.3442206685449929,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.0015254062531244786,
+      "loss": 1.0611,
+      "step": 4948
+    },
+    {
+      "epoch": 0.34429023618212806,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0015252145148690666,
+      "loss": 0.8728,
+      "step": 4949
+    },
+    {
+      "epoch": 0.3443598038192633,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.0015250227499459088,
+      "loss": 0.8899,
+      "step": 4950
+    },
+    {
+      "epoch": 0.34442937145639846,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0015248309583647424,
+      "loss": 0.9292,
+      "step": 4951
+    },
+    {
+      "epoch": 0.3444989390935337,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0015246391401353052,
+      "loss": 0.7855,
+      "step": 4952
+    },
+    {
+      "epoch": 0.3445685067306689,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.0015244472952673368,
+      "loss": 0.8608,
+      "step": 4953
+    },
+    {
+      "epoch": 0.3446380743678041,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0015242554237705778,
+      "loss": 0.6868,
+      "step": 4954
+    },
+    {
+      "epoch": 0.3447076420049393,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0015240635256547712,
+      "loss": 0.8707,
+      "step": 4955
+    },
+    {
+      "epoch": 0.34477720964207453,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00152387160092966,
+      "loss": 0.968,
+      "step": 4956
+    },
+    {
+      "epoch": 0.3448467772792097,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0015236796496049898,
+      "loss": 0.8383,
+      "step": 4957
+    },
+    {
+      "epoch": 0.34491634491634493,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0015234876716905062,
+      "loss": 0.8117,
+      "step": 4958
+    },
+    {
+      "epoch": 0.3449859125534801,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0015232956671959574,
+      "loss": 0.9149,
+      "step": 4959
+    },
+    {
+      "epoch": 0.3450554801906153,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.001523103636131092,
+      "loss": 0.8582,
+      "step": 4960
+    },
+    {
+      "epoch": 0.34512504782775055,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.001522911578505661,
+      "loss": 0.7958,
+      "step": 4961
+    },
+    {
+      "epoch": 0.3451946154648857,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0015227194943294154,
+      "loss": 0.6385,
+      "step": 4962
+    },
+    {
+      "epoch": 0.34526418310202095,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0015225273836121085,
+      "loss": 0.8825,
+      "step": 4963
+    },
+    {
+      "epoch": 0.3453337507391561,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.001522335246363495,
+      "loss": 0.8911,
+      "step": 4964
+    },
+    {
+      "epoch": 0.34540331837629135,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0015221430825933305,
+      "loss": 0.7903,
+      "step": 4965
+    },
+    {
+      "epoch": 0.34547288601342657,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.001521950892311372,
+      "loss": 0.844,
+      "step": 4966
+    },
+    {
+      "epoch": 0.34554245365056174,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0015217586755273778,
+      "loss": 0.9461,
+      "step": 4967
+    },
+    {
+      "epoch": 0.34561202128769697,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.001521566432251108,
+      "loss": 0.8895,
+      "step": 4968
+    },
+    {
+      "epoch": 0.3456815889248322,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0015213741624923239,
+      "loss": 0.6103,
+      "step": 4969
+    },
+    {
+      "epoch": 0.34575115656196737,
+      "grad_norm": 1.5234375,
+      "learning_rate": 0.0015211818662607872,
+      "loss": 0.8747,
+      "step": 4970
+    },
+    {
+      "epoch": 0.3458207241991026,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.001520989543566263,
+      "loss": 0.7802,
+      "step": 4971
+    },
+    {
+      "epoch": 0.34589029183623776,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0015207971944185155,
+      "loss": 0.8913,
+      "step": 4972
+    },
+    {
+      "epoch": 0.345959859473373,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0015206048188273113,
+      "loss": 0.9981,
+      "step": 4973
+    },
+    {
+      "epoch": 0.3460294271105082,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0015204124168024184,
+      "loss": 0.9906,
+      "step": 4974
+    },
+    {
+      "epoch": 0.3460989947476434,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0015202199883536064,
+      "loss": 0.807,
+      "step": 4975
+    },
+    {
+      "epoch": 0.3461685623847786,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.0015200275334906453,
+      "loss": 1.1194,
+      "step": 4976
+    },
+    {
+      "epoch": 0.3462381300219138,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0015198350522233068,
+      "loss": 0.9465,
+      "step": 4977
+    },
+    {
+      "epoch": 0.346307697659049,
+      "grad_norm": 1.25,
+      "learning_rate": 0.001519642544561365,
+      "loss": 1.0371,
+      "step": 4978
+    },
+    {
+      "epoch": 0.34637726529618423,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0015194500105145936,
+      "loss": 0.8513,
+      "step": 4979
+    },
+    {
+      "epoch": 0.3464468329333194,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.0015192574500927695,
+      "loss": 0.8837,
+      "step": 4980
+    },
+    {
+      "epoch": 0.34651640057045463,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.001519064863305669,
+      "loss": 0.7,
+      "step": 4981
+    },
+    {
+      "epoch": 0.34658596820758986,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0015188722501630711,
+      "loss": 0.8776,
+      "step": 4982
+    },
+    {
+      "epoch": 0.346655535844725,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.0015186796106747553,
+      "loss": 0.7257,
+      "step": 4983
+    },
+    {
+      "epoch": 0.34672510348186025,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0015184869448505035,
+      "loss": 0.911,
+      "step": 4984
+    },
+    {
+      "epoch": 0.3467946711189954,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0015182942527000982,
+      "loss": 0.5819,
+      "step": 4985
+    },
+    {
+      "epoch": 0.34686423875613065,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0015181015342333227,
+      "loss": 0.7151,
+      "step": 4986
+    },
+    {
+      "epoch": 0.3469338063932659,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.001517908789459963,
+      "loss": 0.9317,
+      "step": 4987
+    },
+    {
+      "epoch": 0.34700337403040105,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.0015177160183898054,
+      "loss": 0.7153,
+      "step": 4988
+    },
+    {
+      "epoch": 0.34707294166753627,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0015175232210326377,
+      "loss": 0.8403,
+      "step": 4989
+    },
+    {
+      "epoch": 0.34714250930467144,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0015173303973982498,
+      "loss": 0.8124,
+      "step": 4990
+    },
+    {
+      "epoch": 0.34721207694180667,
+      "grad_norm": 1.640625,
+      "learning_rate": 0.0015171375474964312,
+      "loss": 0.7931,
+      "step": 4991
+    },
+    {
+      "epoch": 0.3472816445789419,
+      "grad_norm": 1.25,
+      "learning_rate": 0.001516944671336975,
+      "loss": 1.0845,
+      "step": 4992
+    },
+    {
+      "epoch": 0.34735121221607707,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0015167517689296734,
+      "loss": 0.8126,
+      "step": 4993
+    },
+    {
+      "epoch": 0.3474207798532123,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0015165588402843225,
+      "loss": 1.018,
+      "step": 4994
+    },
+    {
+      "epoch": 0.3474903474903475,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.0015163658854107165,
+      "loss": 0.8798,
+      "step": 4995
+    },
+    {
+      "epoch": 0.3475599151274827,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0015161729043186541,
+      "loss": 0.988,
+      "step": 4996
+    },
+    {
+      "epoch": 0.3476294827646179,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.001515979897017933,
+      "loss": 0.8558,
+      "step": 4997
+    },
+    {
+      "epoch": 0.3476990504017531,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0015157868635183537,
+      "loss": 1.0063,
+      "step": 4998
+    },
+    {
+      "epoch": 0.3477686180388883,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.001515593803829717,
+      "loss": 1.202,
+      "step": 4999
+    },
+    {
+      "epoch": 0.34783818567602354,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.0015154007179618257,
+      "loss": 1.1614,
+      "step": 5000
+    },
+    {
+      "epoch": 0.3479077533131587,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0015152076059244842,
+      "loss": 0.6836,
+      "step": 5001
+    },
+    {
+      "epoch": 0.34797732095029393,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0015150144677274966,
+      "loss": 0.8049,
+      "step": 5002
+    },
+    {
+      "epoch": 0.3480468885874291,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0015148213033806708,
+      "loss": 1.0787,
+      "step": 5003
+    },
+    {
+      "epoch": 0.34811645622456433,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.001514628112893814,
+      "loss": 0.781,
+      "step": 5004
+    },
+    {
+      "epoch": 0.34818602386169956,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0015144348962767352,
+      "loss": 0.9506,
+      "step": 5005
+    },
+    {
+      "epoch": 0.3482555914988347,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0015142416535392457,
+      "loss": 0.8873,
+      "step": 5006
+    },
+    {
+      "epoch": 0.34832515913596995,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0015140483846911566,
+      "loss": 0.879,
+      "step": 5007
+    },
+    {
+      "epoch": 0.3483947267731052,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.001513855089742282,
+      "loss": 0.969,
+      "step": 5008
+    },
+    {
+      "epoch": 0.34846429441024035,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0015136617687024354,
+      "loss": 0.9758,
+      "step": 5009
+    },
+    {
+      "epoch": 0.3485338620473756,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0015134684215814338,
+      "loss": 0.7726,
+      "step": 5010
+    },
+    {
+      "epoch": 0.34860342968451075,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0015132750483890934,
+      "loss": 0.853,
+      "step": 5011
+    },
+    {
+      "epoch": 0.34867299732164597,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0015130816491352333,
+      "loss": 0.879,
+      "step": 5012
+    },
+    {
+      "epoch": 0.3487425649587812,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0015128882238296733,
+      "loss": 0.9025,
+      "step": 5013
+    },
+    {
+      "epoch": 0.34881213259591637,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0015126947724822342,
+      "loss": 0.7474,
+      "step": 5014
+    },
+    {
+      "epoch": 0.3488817002330516,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.001512501295102739,
+      "loss": 0.5989,
+      "step": 5015
+    },
+    {
+      "epoch": 0.34895126787018677,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0015123077917010108,
+      "loss": 1.0787,
+      "step": 5016
+    },
+    {
+      "epoch": 0.349020835507322,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0015121142622868758,
+      "loss": 0.8695,
+      "step": 5017
+    },
+    {
+      "epoch": 0.3490904031444572,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0015119207068701593,
+      "loss": 0.9261,
+      "step": 5018
+    },
+    {
+      "epoch": 0.3491599707815924,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0015117271254606898,
+      "loss": 0.8961,
+      "step": 5019
+    },
+    {
+      "epoch": 0.3492295384187276,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0015115335180682964,
+      "loss": 0.6799,
+      "step": 5020
+    },
+    {
+      "epoch": 0.34929910605586284,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0015113398847028086,
+      "loss": 0.7744,
+      "step": 5021
+    },
+    {
+      "epoch": 0.349368673692998,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0015111462253740594,
+      "loss": 1.0222,
+      "step": 5022
+    },
+    {
+      "epoch": 0.34943824133013324,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0015109525400918806,
+      "loss": 0.8946,
+      "step": 5023
+    },
+    {
+      "epoch": 0.3495078089672684,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0015107588288661078,
+      "loss": 0.9303,
+      "step": 5024
+    },
+    {
+      "epoch": 0.34957737660440363,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0015105650917065759,
+      "loss": 0.7738,
+      "step": 5025
+    },
+    {
+      "epoch": 0.34964694424153886,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0015103713286231221,
+      "loss": 0.9991,
+      "step": 5026
+    },
+    {
+      "epoch": 0.34971651187867403,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0015101775396255848,
+      "loss": 0.7473,
+      "step": 5027
+    },
+    {
+      "epoch": 0.34978607951580926,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0015099837247238032,
+      "loss": 0.908,
+      "step": 5028
+    },
+    {
+      "epoch": 0.3498556471529444,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0015097898839276191,
+      "loss": 1.0346,
+      "step": 5029
+    },
+    {
+      "epoch": 0.34992521479007965,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0015095960172468736,
+      "loss": 0.7678,
+      "step": 5030
+    },
+    {
+      "epoch": 0.3499947824272149,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0015094021246914117,
+      "loss": 0.6541,
+      "step": 5031
+    },
+    {
+      "epoch": 0.35006435006435005,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0015092082062710766,
+      "loss": 0.829,
+      "step": 5032
+    },
+    {
+      "epoch": 0.3501339177014853,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0015090142619957158,
+      "loss": 1.0688,
+      "step": 5033
+    },
+    {
+      "epoch": 0.35020348533862045,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0015088202918751763,
+      "loss": 1.0056,
+      "step": 5034
+    },
+    {
+      "epoch": 0.35027305297575567,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0015086262959193074,
+      "loss": 1.0583,
+      "step": 5035
+    },
+    {
+      "epoch": 0.3503426206128909,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0015084322741379585,
+      "loss": 0.9359,
+      "step": 5036
+    },
+    {
+      "epoch": 0.35041218825002607,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0015082382265409811,
+      "loss": 0.8333,
+      "step": 5037
+    },
+    {
+      "epoch": 0.3504817558871613,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.001508044153138229,
+      "loss": 0.7506,
+      "step": 5038
+    },
+    {
+      "epoch": 0.3505513235242965,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.001507850053939555,
+      "loss": 0.7149,
+      "step": 5039
+    },
+    {
+      "epoch": 0.3506208911614317,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0015076559289548153,
+      "loss": 0.9226,
+      "step": 5040
+    },
+    {
+      "epoch": 0.3506904587985669,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.001507461778193866,
+      "loss": 0.8957,
+      "step": 5041
+    },
+    {
+      "epoch": 0.3507600264357021,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0015072676016665656,
+      "loss": 0.8803,
+      "step": 5042
+    },
+    {
+      "epoch": 0.3508295940728373,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0015070733993827732,
+      "loss": 0.781,
+      "step": 5043
+    },
+    {
+      "epoch": 0.35089916170997254,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0015068791713523492,
+      "loss": 1.0205,
+      "step": 5044
+    },
+    {
+      "epoch": 0.3509687293471077,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0015066849175851562,
+      "loss": 0.8004,
+      "step": 5045
+    },
+    {
+      "epoch": 0.35103829698424294,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0015064906380910566,
+      "loss": 0.9252,
+      "step": 5046
+    },
+    {
+      "epoch": 0.3511078646213781,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0015062963328799155,
+      "loss": 0.9361,
+      "step": 5047
+    },
+    {
+      "epoch": 0.35117743225851333,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0015061020019615982,
+      "loss": 0.7734,
+      "step": 5048
+    },
+    {
+      "epoch": 0.35124699989564856,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0015059076453459727,
+      "loss": 0.9989,
+      "step": 5049
+    },
+    {
+      "epoch": 0.35131656753278373,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0015057132630429066,
+      "loss": 0.9318,
+      "step": 5050
+    },
+    {
+      "epoch": 0.35138613516991896,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00150551885506227,
+      "loss": 0.9111,
+      "step": 5051
+    },
+    {
+      "epoch": 0.3514557028070542,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0015053244214139343,
+      "loss": 0.8417,
+      "step": 5052
+    },
+    {
+      "epoch": 0.35152527044418935,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.001505129962107771,
+      "loss": 1.3842,
+      "step": 5053
+    },
+    {
+      "epoch": 0.3515948380813246,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0015049354771536545,
+      "loss": 0.8825,
+      "step": 5054
+    },
+    {
+      "epoch": 0.35166440571845975,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0015047409665614594,
+      "loss": 0.9776,
+      "step": 5055
+    },
+    {
+      "epoch": 0.351733973355595,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0015045464303410623,
+      "loss": 0.9288,
+      "step": 5056
+    },
+    {
+      "epoch": 0.3518035409927302,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.0015043518685023403,
+      "loss": 0.6324,
+      "step": 5057
+    },
+    {
+      "epoch": 0.35187310862986537,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.0015041572810551727,
+      "loss": 0.7409,
+      "step": 5058
+    },
+    {
+      "epoch": 0.3519426762670006,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0015039626680094398,
+      "loss": 0.8132,
+      "step": 5059
+    },
+    {
+      "epoch": 0.35201224390413577,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0015037680293750223,
+      "loss": 1.0056,
+      "step": 5060
+    },
+    {
+      "epoch": 0.352081811541271,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0015035733651618038,
+      "loss": 0.8673,
+      "step": 5061
+    },
+    {
+      "epoch": 0.3521513791784062,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0015033786753796676,
+      "loss": 0.8713,
+      "step": 5062
+    },
+    {
+      "epoch": 0.3522209468155414,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0015031839600385,
+      "loss": 0.89,
+      "step": 5063
+    },
+    {
+      "epoch": 0.3522905144526766,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0015029892191481867,
+      "loss": 0.9799,
+      "step": 5064
+    },
+    {
+      "epoch": 0.35236008208981184,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.001502794452718616,
+      "loss": 0.58,
+      "step": 5065
+    },
+    {
+      "epoch": 0.352429649726947,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0015025996607596777,
+      "loss": 0.9915,
+      "step": 5066
+    },
+    {
+      "epoch": 0.35249921736408224,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.001502404843281262,
+      "loss": 1.0942,
+      "step": 5067
+    },
+    {
+      "epoch": 0.3525687850012174,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0015022100002932606,
+      "loss": 0.8471,
+      "step": 5068
+    },
+    {
+      "epoch": 0.35263835263835264,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0015020151318055662,
+      "loss": 0.7711,
+      "step": 5069
+    },
+    {
+      "epoch": 0.35270792027548786,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0015018202378280746,
+      "loss": 0.8061,
+      "step": 5070
+    },
+    {
+      "epoch": 0.35277748791262303,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0015016253183706798,
+      "loss": 0.8691,
+      "step": 5071
+    },
+    {
+      "epoch": 0.35284705554975826,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.00150143037344328,
+      "loss": 1.1209,
+      "step": 5072
+    },
+    {
+      "epoch": 0.35291662318689343,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0015012354030557735,
+      "loss": 1.1614,
+      "step": 5073
+    },
+    {
+      "epoch": 0.35298619082402866,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0015010404072180595,
+      "loss": 0.9,
+      "step": 5074
+    },
+    {
+      "epoch": 0.3530557584611639,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.001500845385940039,
+      "loss": 0.8916,
+      "step": 5075
+    },
+    {
+      "epoch": 0.35312532609829905,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0015006503392316142,
+      "loss": 0.8861,
+      "step": 5076
+    },
+    {
+      "epoch": 0.3531948937354343,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.001500455267102689,
+      "loss": 0.9147,
+      "step": 5077
+    },
+    {
+      "epoch": 0.3532644613725695,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0015002601695631673,
+      "loss": 0.9116,
+      "step": 5078
+    },
+    {
+      "epoch": 0.3533340290097047,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.001500065046622956,
+      "loss": 0.7114,
+      "step": 5079
+    },
+    {
+      "epoch": 0.3534035966468399,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0014998698982919621,
+      "loss": 1.2384,
+      "step": 5080
+    },
+    {
+      "epoch": 0.35347316428397507,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0014996747245800942,
+      "loss": 0.9718,
+      "step": 5081
+    },
+    {
+      "epoch": 0.3535427319211103,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0014994795254972622,
+      "loss": 0.9054,
+      "step": 5082
+    },
+    {
+      "epoch": 0.3536122995582455,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0014992843010533776,
+      "loss": 0.7544,
+      "step": 5083
+    },
+    {
+      "epoch": 0.3536818671953807,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0014990890512583534,
+      "loss": 1.0095,
+      "step": 5084
+    },
+    {
+      "epoch": 0.3537514348325159,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0014988937761221018,
+      "loss": 0.9885,
+      "step": 5085
+    },
+    {
+      "epoch": 0.3538210024696511,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0014986984756545393,
+      "loss": 1.0343,
+      "step": 5086
+    },
+    {
+      "epoch": 0.3538905701067863,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.0014985031498655817,
+      "loss": 0.7228,
+      "step": 5087
+    },
+    {
+      "epoch": 0.35396013774392154,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.001498307798765147,
+      "loss": 1.2578,
+      "step": 5088
+    },
+    {
+      "epoch": 0.3540297053810567,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0014981124223631538,
+      "loss": 0.9989,
+      "step": 5089
+    },
+    {
+      "epoch": 0.35409927301819194,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0014979170206695226,
+      "loss": 0.8049,
+      "step": 5090
+    },
+    {
+      "epoch": 0.35416884065532717,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0014977215936941746,
+      "loss": 0.9737,
+      "step": 5091
+    },
+    {
+      "epoch": 0.35423840829246234,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0014975261414470328,
+      "loss": 0.8074,
+      "step": 5092
+    },
+    {
+      "epoch": 0.35430797592959756,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0014973306639380214,
+      "loss": 0.7584,
+      "step": 5093
+    },
+    {
+      "epoch": 0.35437754356673273,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0014971351611770653,
+      "loss": 0.9073,
+      "step": 5094
+    },
+    {
+      "epoch": 0.35444711120386796,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0014969396331740916,
+      "loss": 0.9608,
+      "step": 5095
+    },
+    {
+      "epoch": 0.3545166788410032,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0014967440799390284,
+      "loss": 1.0383,
+      "step": 5096
+    },
+    {
+      "epoch": 0.35458624647813836,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0014965485014818043,
+      "loss": 0.8309,
+      "step": 5097
+    },
+    {
+      "epoch": 0.3546558141152736,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0014963528978123501,
+      "loss": 0.6906,
+      "step": 5098
+    },
+    {
+      "epoch": 0.35472538175240875,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0014961572689405976,
+      "loss": 0.7935,
+      "step": 5099
+    },
+    {
+      "epoch": 0.354794949389544,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0014959616148764799,
+      "loss": 0.7032,
+      "step": 5100
+    },
+    {
+      "epoch": 0.3548645170266792,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.001495765935629931,
+      "loss": 0.8586,
+      "step": 5101
+    },
+    {
+      "epoch": 0.3549340846638144,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0014955702312108867,
+      "loss": 0.8868,
+      "step": 5102
+    },
+    {
+      "epoch": 0.3550036523009496,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0014953745016292844,
+      "loss": 0.9855,
+      "step": 5103
+    },
+    {
+      "epoch": 0.3550732199380848,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0014951787468950612,
+      "loss": 0.9177,
+      "step": 5104
+    },
+    {
+      "epoch": 0.35514278757522,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0014949829670181573,
+      "loss": 0.6573,
+      "step": 5105
+    },
+    {
+      "epoch": 0.3552123552123552,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0014947871620085134,
+      "loss": 0.8794,
+      "step": 5106
+    },
+    {
+      "epoch": 0.3552819228494904,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0014945913318760715,
+      "loss": 0.8961,
+      "step": 5107
+    },
+    {
+      "epoch": 0.3553514904866256,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0014943954766307743,
+      "loss": 0.8715,
+      "step": 5108
+    },
+    {
+      "epoch": 0.35542105812376085,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0014941995962825668,
+      "loss": 0.9822,
+      "step": 5109
+    },
+    {
+      "epoch": 0.355490625760896,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.0014940036908413948,
+      "loss": 0.9308,
+      "step": 5110
+    },
+    {
+      "epoch": 0.35556019339803124,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0014938077603172052,
+      "loss": 0.6205,
+      "step": 5111
+    },
+    {
+      "epoch": 0.3556297610351664,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0014936118047199467,
+      "loss": 0.8679,
+      "step": 5112
+    },
+    {
+      "epoch": 0.35569932867230164,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0014934158240595687,
+      "loss": 0.913,
+      "step": 5113
+    },
+    {
+      "epoch": 0.35576889630943687,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0014932198183460223,
+      "loss": 0.7827,
+      "step": 5114
+    },
+    {
+      "epoch": 0.35583846394657204,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0014930237875892594,
+      "loss": 0.7733,
+      "step": 5115
+    },
+    {
+      "epoch": 0.35590803158370726,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0014928277317992338,
+      "loss": 0.686,
+      "step": 5116
+    },
+    {
+      "epoch": 0.3559775992208425,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0014926316509858996,
+      "loss": 0.8916,
+      "step": 5117
+    },
+    {
+      "epoch": 0.35604716685797766,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0014924355451592134,
+      "loss": 0.9492,
+      "step": 5118
+    },
+    {
+      "epoch": 0.3561167344951129,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.0014922394143291322,
+      "loss": 0.9,
+      "step": 5119
+    },
+    {
+      "epoch": 0.35618630213224806,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0014920432585056147,
+      "loss": 0.9832,
+      "step": 5120
+    },
+    {
+      "epoch": 0.3562558697693833,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.001491847077698621,
+      "loss": 0.7843,
+      "step": 5121
+    },
+    {
+      "epoch": 0.3563254374065185,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.001491650871918111,
+      "loss": 0.829,
+      "step": 5122
+    },
+    {
+      "epoch": 0.3563950050436537,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0014914546411740487,
+      "loss": 0.9153,
+      "step": 5123
+    },
+    {
+      "epoch": 0.3564645726807889,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.001491258385476396,
+      "loss": 0.8968,
+      "step": 5124
+    },
+    {
+      "epoch": 0.3565341403179241,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.001491062104835119,
+      "loss": 0.9404,
+      "step": 5125
+    },
+    {
+      "epoch": 0.3566037079550593,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.0014908657992601833,
+      "loss": 1.2725,
+      "step": 5126
+    },
+    {
+      "epoch": 0.3566732755921945,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0014906694687615567,
+      "loss": 0.7339,
+      "step": 5127
+    },
+    {
+      "epoch": 0.3567428432293297,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.0014904731133492076,
+      "loss": 0.813,
+      "step": 5128
+    },
+    {
+      "epoch": 0.3568124108664649,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.001490276733033106,
+      "loss": 0.8772,
+      "step": 5129
+    },
+    {
+      "epoch": 0.35688197850360015,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0014900803278232227,
+      "loss": 0.7159,
+      "step": 5130
+    },
+    {
+      "epoch": 0.3569515461407353,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0014898838977295311,
+      "loss": 0.8305,
+      "step": 5131
+    },
+    {
+      "epoch": 0.35702111377787055,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0014896874427620039,
+      "loss": 0.8337,
+      "step": 5132
+    },
+    {
+      "epoch": 0.3570906814150057,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0014894909629306168,
+      "loss": 0.759,
+      "step": 5133
+    },
+    {
+      "epoch": 0.35716024905214094,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.001489294458245346,
+      "loss": 1.0684,
+      "step": 5134
+    },
+    {
+      "epoch": 0.35722981668927617,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0014890979287161684,
+      "loss": 0.8084,
+      "step": 5135
+    },
+    {
+      "epoch": 0.35729938432641134,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0014889013743530632,
+      "loss": 0.8792,
+      "step": 5136
+    },
+    {
+      "epoch": 0.35736895196354657,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.001488704795166011,
+      "loss": 0.8146,
+      "step": 5137
+    },
+    {
+      "epoch": 0.35743851960068174,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.001488508191164992,
+      "loss": 0.7602,
+      "step": 5138
+    },
+    {
+      "epoch": 0.35750808723781696,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0014883115623599897,
+      "loss": 1.1143,
+      "step": 5139
+    },
+    {
+      "epoch": 0.3575776548749522,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0014881149087609873,
+      "loss": 0.8189,
+      "step": 5140
+    },
+    {
+      "epoch": 0.35764722251208736,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0014879182303779701,
+      "loss": 0.8648,
+      "step": 5141
+    },
+    {
+      "epoch": 0.3577167901492226,
+      "grad_norm": 1.546875,
+      "learning_rate": 0.0014877215272209245,
+      "loss": 1.0972,
+      "step": 5142
+    },
+    {
+      "epoch": 0.3577863577863578,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0014875247992998382,
+      "loss": 0.8679,
+      "step": 5143
+    },
+    {
+      "epoch": 0.357855925423493,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0014873280466247,
+      "loss": 0.907,
+      "step": 5144
+    },
+    {
+      "epoch": 0.3579254930606282,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0014871312692054995,
+      "loss": 0.8558,
+      "step": 5145
+    },
+    {
+      "epoch": 0.3579950606977634,
+      "grad_norm": 0.78125,
+      "learning_rate": 0.0014869344670522286,
+      "loss": 0.607,
+      "step": 5146
+    },
+    {
+      "epoch": 0.3580646283348986,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00148673764017488,
+      "loss": 0.8488,
+      "step": 5147
+    },
+    {
+      "epoch": 0.35813419597203383,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0014865407885834472,
+      "loss": 0.8936,
+      "step": 5148
+    },
+    {
+      "epoch": 0.358203763609169,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0014863439122879253,
+      "loss": 0.8855,
+      "step": 5149
+    },
+    {
+      "epoch": 0.3582733312463042,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0014861470112983116,
+      "loss": 0.8521,
+      "step": 5150
+    },
+    {
+      "epoch": 0.3583428988834394,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0014859500856246024,
+      "loss": 1.1338,
+      "step": 5151
+    },
+    {
+      "epoch": 0.3584124665205746,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0014857531352767972,
+      "loss": 1.0693,
+      "step": 5152
+    },
+    {
+      "epoch": 0.35848203415770985,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0014855561602648965,
+      "loss": 0.9105,
+      "step": 5153
+    },
+    {
+      "epoch": 0.358551601794845,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0014853591605989013,
+      "loss": 0.9043,
+      "step": 5154
+    },
+    {
+      "epoch": 0.35862116943198025,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0014851621362888142,
+      "loss": 0.9153,
+      "step": 5155
+    },
+    {
+      "epoch": 0.3586907370691155,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.001484965087344639,
+      "loss": 1.0623,
+      "step": 5156
+    },
+    {
+      "epoch": 0.35876030470625064,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0014847680137763815,
+      "loss": 1.1121,
+      "step": 5157
+    },
+    {
+      "epoch": 0.35882987234338587,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0014845709155940474,
+      "loss": 0.8297,
+      "step": 5158
+    },
+    {
+      "epoch": 0.35889943998052104,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.0014843737928076448,
+      "loss": 0.9511,
+      "step": 5159
+    },
+    {
+      "epoch": 0.35896900761765627,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0014841766454271824,
+      "loss": 1.129,
+      "step": 5160
+    },
+    {
+      "epoch": 0.3590385752547915,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0014839794734626704,
+      "loss": 0.8025,
+      "step": 5161
+    },
+    {
+      "epoch": 0.35910814289192666,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.00148378227692412,
+      "loss": 0.8423,
+      "step": 5162
+    },
+    {
+      "epoch": 0.3591777105290619,
+      "grad_norm": 1.125,
+      "learning_rate": 0.001483585055821544,
+      "loss": 1.051,
+      "step": 5163
+    },
+    {
+      "epoch": 0.35924727816619706,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0014833878101649565,
+      "loss": 0.9794,
+      "step": 5164
+    },
+    {
+      "epoch": 0.3593168458033323,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0014831905399643724,
+      "loss": 0.9215,
+      "step": 5165
+    },
+    {
+      "epoch": 0.3593864134404675,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.001482993245229808,
+      "loss": 0.7982,
+      "step": 5166
+    },
+    {
+      "epoch": 0.3594559810776027,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0014827959259712813,
+      "loss": 0.9635,
+      "step": 5167
+    },
+    {
+      "epoch": 0.3595255487147379,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0014825985821988108,
+      "loss": 1.0276,
+      "step": 5168
+    },
+    {
+      "epoch": 0.35959511635187313,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.001482401213922417,
+      "loss": 0.9149,
+      "step": 5169
+    },
+    {
+      "epoch": 0.3596646839890083,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0014822038211521208,
+      "loss": 1.3285,
+      "step": 5170
+    },
+    {
+      "epoch": 0.35973425162614353,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0014820064038979452,
+      "loss": 0.7815,
+      "step": 5171
+    },
+    {
+      "epoch": 0.3598038192632787,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0014818089621699139,
+      "loss": 0.799,
+      "step": 5172
+    },
+    {
+      "epoch": 0.3598733869004139,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0014816114959780517,
+      "loss": 1.0051,
+      "step": 5173
+    },
+    {
+      "epoch": 0.35994295453754915,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0014814140053323855,
+      "loss": 0.687,
+      "step": 5174
+    },
+    {
+      "epoch": 0.3600125221746843,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0014812164902429426,
+      "loss": 0.8509,
+      "step": 5175
+    },
+    {
+      "epoch": 0.36008208981181955,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.0014810189507197518,
+      "loss": 0.561,
+      "step": 5176
+    },
+    {
+      "epoch": 0.3601516574489547,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0014808213867728434,
+      "loss": 0.8071,
+      "step": 5177
+    },
+    {
+      "epoch": 0.36022122508608995,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0014806237984122481,
+      "loss": 0.6641,
+      "step": 5178
+    },
+    {
+      "epoch": 0.3602907927232252,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.001480426185647999,
+      "loss": 0.7432,
+      "step": 5179
+    },
+    {
+      "epoch": 0.36036036036036034,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0014802285484901297,
+      "loss": 0.8742,
+      "step": 5180
+    },
+    {
+      "epoch": 0.36042992799749557,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0014800308869486753,
+      "loss": 0.886,
+      "step": 5181
+    },
+    {
+      "epoch": 0.3604994956346308,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0014798332010336722,
+      "loss": 0.9504,
+      "step": 5182
+    },
+    {
+      "epoch": 0.36056906327176597,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0014796354907551574,
+      "loss": 0.8872,
+      "step": 5183
+    },
+    {
+      "epoch": 0.3606386309089012,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.00147943775612317,
+      "loss": 1.0139,
+      "step": 5184
+    },
+    {
+      "epoch": 0.36070819854603636,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00147923999714775,
+      "loss": 0.8477,
+      "step": 5185
+    },
+    {
+      "epoch": 0.3607777661831716,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0014790422138389384,
+      "loss": 0.9271,
+      "step": 5186
+    },
+    {
+      "epoch": 0.3608473338203068,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0014788444062067776,
+      "loss": 0.8271,
+      "step": 5187
+    },
+    {
+      "epoch": 0.360916901457442,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0014786465742613116,
+      "loss": 1.1401,
+      "step": 5188
+    },
+    {
+      "epoch": 0.3609864690945772,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.001478448718012585,
+      "loss": 0.8429,
+      "step": 5189
+    },
+    {
+      "epoch": 0.3610560367317124,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.001478250837470644,
+      "loss": 0.9735,
+      "step": 5190
+    },
+    {
+      "epoch": 0.3611256043688476,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0014780529326455362,
+      "loss": 0.8389,
+      "step": 5191
+    },
+    {
+      "epoch": 0.36119517200598283,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.00147785500354731,
+      "loss": 0.831,
+      "step": 5192
+    },
+    {
+      "epoch": 0.361264739643118,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0014776570501860153,
+      "loss": 1.0879,
+      "step": 5193
+    },
+    {
+      "epoch": 0.36133430728025323,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.0014774590725717032,
+      "loss": 0.8871,
+      "step": 5194
+    },
+    {
+      "epoch": 0.36140387491738846,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0014772610707144257,
+      "loss": 0.9803,
+      "step": 5195
+    },
+    {
+      "epoch": 0.3614734425545236,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.001477063044624237,
+      "loss": 0.8462,
+      "step": 5196
+    },
+    {
+      "epoch": 0.36154301019165885,
+      "grad_norm": 1.546875,
+      "learning_rate": 0.0014768649943111911,
+      "loss": 1.0829,
+      "step": 5197
+    },
+    {
+      "epoch": 0.361612577828794,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0014766669197853446,
+      "loss": 0.7897,
+      "step": 5198
+    },
+    {
+      "epoch": 0.36168214546592925,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0014764688210567546,
+      "loss": 0.7425,
+      "step": 5199
+    },
+    {
+      "epoch": 0.3617517131030645,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0014762706981354791,
+      "loss": 0.9892,
+      "step": 5200
+    },
+    {
+      "epoch": 0.36182128074019965,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0014760725510315784,
+      "loss": 0.8421,
+      "step": 5201
+    },
+    {
+      "epoch": 0.3618908483773349,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.001475874379755113,
+      "loss": 0.8271,
+      "step": 5202
+    },
+    {
+      "epoch": 0.36196041601447004,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0014756761843161452,
+      "loss": 1.192,
+      "step": 5203
+    },
+    {
+      "epoch": 0.36202998365160527,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.0014754779647247385,
+      "loss": 0.6577,
+      "step": 5204
+    },
+    {
+      "epoch": 0.3620995512887405,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0014752797209909572,
+      "loss": 0.7588,
+      "step": 5205
+    },
+    {
+      "epoch": 0.36216911892587567,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0014750814531248673,
+      "loss": 0.7133,
+      "step": 5206
+    },
+    {
+      "epoch": 0.3622386865630109,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.001474883161136536,
+      "loss": 0.9246,
+      "step": 5207
+    },
+    {
+      "epoch": 0.3623082542001461,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.001474684845036031,
+      "loss": 0.6941,
+      "step": 5208
+    },
+    {
+      "epoch": 0.3623778218372813,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0014744865048334221,
+      "loss": 0.777,
+      "step": 5209
+    },
+    {
+      "epoch": 0.3624473894744165,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0014742881405387803,
+      "loss": 0.7638,
+      "step": 5210
+    },
+    {
+      "epoch": 0.3625169571115517,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0014740897521621772,
+      "loss": 0.8489,
+      "step": 5211
+    },
+    {
+      "epoch": 0.3625865247486869,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0014738913397136862,
+      "loss": 0.8281,
+      "step": 5212
+    },
+    {
+      "epoch": 0.36265609238582214,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0014736929032033816,
+      "loss": 0.8747,
+      "step": 5213
+    },
+    {
+      "epoch": 0.3627256600229573,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0014734944426413388,
+      "loss": 0.7183,
+      "step": 5214
+    },
+    {
+      "epoch": 0.36279522766009253,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.001473295958037635,
+      "loss": 0.9431,
+      "step": 5215
+    },
+    {
+      "epoch": 0.3628647952972277,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0014730974494023478,
+      "loss": 0.9157,
+      "step": 5216
+    },
+    {
+      "epoch": 0.36293436293436293,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.001472898916745557,
+      "loss": 0.9453,
+      "step": 5217
+    },
+    {
+      "epoch": 0.36300393057149816,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0014727003600773425,
+      "loss": 0.6536,
+      "step": 5218
+    },
+    {
+      "epoch": 0.3630734982086333,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0014725017794077863,
+      "loss": 0.9153,
+      "step": 5219
+    },
+    {
+      "epoch": 0.36314306584576855,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0014723031747469713,
+      "loss": 0.8485,
+      "step": 5220
+    },
+    {
+      "epoch": 0.3632126334829038,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.001472104546104982,
+      "loss": 0.7691,
+      "step": 5221
+    },
+    {
+      "epoch": 0.36328220112003895,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0014719058934919034,
+      "loss": 0.9719,
+      "step": 5222
+    },
+    {
+      "epoch": 0.3633517687571742,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0014717072169178219,
+      "loss": 0.9064,
+      "step": 5223
+    },
+    {
+      "epoch": 0.36342133639430935,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0014715085163928255,
+      "loss": 0.9613,
+      "step": 5224
+    },
+    {
+      "epoch": 0.3634909040314446,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0014713097919270032,
+      "loss": 1.0228,
+      "step": 5225
+    },
+    {
+      "epoch": 0.3635604716685798,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0014711110435304455,
+      "loss": 0.9179,
+      "step": 5226
+    },
+    {
+      "epoch": 0.36363003930571497,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0014709122712132433,
+      "loss": 0.7914,
+      "step": 5227
+    },
+    {
+      "epoch": 0.3636996069428502,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0014707134749854898,
+      "loss": 0.7456,
+      "step": 5228
+    },
+    {
+      "epoch": 0.36376917457998537,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0014705146548572782,
+      "loss": 1.0304,
+      "step": 5229
+    },
+    {
+      "epoch": 0.3638387422171206,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0014703158108387044,
+      "loss": 1.0316,
+      "step": 5230
+    },
+    {
+      "epoch": 0.3639083098542558,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0014701169429398643,
+      "loss": 0.8354,
+      "step": 5231
+    },
+    {
+      "epoch": 0.363977877491391,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0014699180511708553,
+      "loss": 0.7559,
+      "step": 5232
+    },
+    {
+      "epoch": 0.3640474451285262,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.0014697191355417761,
+      "loss": 1.1107,
+      "step": 5233
+    },
+    {
+      "epoch": 0.36411701276566144,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0014695201960627266,
+      "loss": 0.8169,
+      "step": 5234
+    },
+    {
+      "epoch": 0.3641865804027966,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0014693212327438086,
+      "loss": 0.6953,
+      "step": 5235
+    },
+    {
+      "epoch": 0.36425614803993184,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0014691222455951235,
+      "loss": 0.9619,
+      "step": 5236
+    },
+    {
+      "epoch": 0.364325715677067,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0014689232346267755,
+      "loss": 0.9999,
+      "step": 5237
+    },
+    {
+      "epoch": 0.36439528331420223,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0014687241998488695,
+      "loss": 1.0531,
+      "step": 5238
+    },
+    {
+      "epoch": 0.36446485095133746,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0014685251412715106,
+      "loss": 0.7005,
+      "step": 5239
+    },
+    {
+      "epoch": 0.36453441858847263,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0014683260589048069,
+      "loss": 0.7436,
+      "step": 5240
+    },
+    {
+      "epoch": 0.36460398622560786,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0014681269527588663,
+      "loss": 1.1294,
+      "step": 5241
+    },
+    {
+      "epoch": 0.364673553862743,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.001467927822843799,
+      "loss": 0.7699,
+      "step": 5242
+    },
+    {
+      "epoch": 0.36474312149987825,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0014677286691697146,
+      "loss": 0.827,
+      "step": 5243
+    },
+    {
+      "epoch": 0.3648126891370135,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0014675294917467269,
+      "loss": 0.8862,
+      "step": 5244
+    },
+    {
+      "epoch": 0.36488225677414865,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0014673302905849476,
+      "loss": 0.9959,
+      "step": 5245
+    },
+    {
+      "epoch": 0.3649518244112839,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0014671310656944915,
+      "loss": 0.7926,
+      "step": 5246
+    },
+    {
+      "epoch": 0.3650213920484191,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0014669318170854747,
+      "loss": 0.8058,
+      "step": 5247
+    },
+    {
+      "epoch": 0.3650909596855543,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0014667325447680136,
+      "loss": 0.6047,
+      "step": 5248
+    },
+    {
+      "epoch": 0.3651605273226895,
+      "grad_norm": 1.671875,
+      "learning_rate": 0.0014665332487522262,
+      "loss": 1.2125,
+      "step": 5249
+    },
+    {
+      "epoch": 0.36523009495982467,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.001466333929048232,
+      "loss": 1.0838,
+      "step": 5250
+    },
+    {
+      "epoch": 0.3652996625969599,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0014661345856661517,
+      "loss": 0.7864,
+      "step": 5251
+    },
+    {
+      "epoch": 0.3653692302340951,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0014659352186161064,
+      "loss": 1.1664,
+      "step": 5252
+    },
+    {
+      "epoch": 0.3654387978712303,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0014657358279082193,
+      "loss": 0.9814,
+      "step": 5253
+    },
+    {
+      "epoch": 0.3655083655083655,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0014655364135526142,
+      "loss": 0.9766,
+      "step": 5254
+    },
+    {
+      "epoch": 0.3655779331455007,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0014653369755594165,
+      "loss": 0.8501,
+      "step": 5255
+    },
+    {
+      "epoch": 0.3656475007826359,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.001465137513938753,
+      "loss": 0.9529,
+      "step": 5256
+    },
+    {
+      "epoch": 0.36571706841977114,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0014649380287007504,
+      "loss": 0.8705,
+      "step": 5257
+    },
+    {
+      "epoch": 0.3657866360569063,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0014647385198555388,
+      "loss": 0.909,
+      "step": 5258
+    },
+    {
+      "epoch": 0.36585620369404154,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.001464538987413247,
+      "loss": 1.0303,
+      "step": 5259
+    },
+    {
+      "epoch": 0.36592577133117676,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0014643394313840076,
+      "loss": 1.0606,
+      "step": 5260
+    },
+    {
+      "epoch": 0.36599533896831193,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0014641398517779517,
+      "loss": 0.8019,
+      "step": 5261
+    },
+    {
+      "epoch": 0.36606490660544716,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.0014639402486052138,
+      "loss": 0.9951,
+      "step": 5262
+    },
+    {
+      "epoch": 0.36613447424258233,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0014637406218759284,
+      "loss": 0.9101,
+      "step": 5263
+    },
+    {
+      "epoch": 0.36620404187971756,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0014635409716002314,
+      "loss": 0.8821,
+      "step": 5264
+    },
+    {
+      "epoch": 0.3662736095168528,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.001463341297788261,
+      "loss": 0.753,
+      "step": 5265
+    },
+    {
+      "epoch": 0.36634317715398795,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0014631416004501543,
+      "loss": 0.7014,
+      "step": 5266
+    },
+    {
+      "epoch": 0.3664127447911232,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0014629418795960517,
+      "loss": 1.1205,
+      "step": 5267
+    },
+    {
+      "epoch": 0.36648231242825835,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.001462742135236094,
+      "loss": 0.8285,
+      "step": 5268
+    },
+    {
+      "epoch": 0.3665518800653936,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.001462542367380423,
+      "loss": 0.9696,
+      "step": 5269
+    },
+    {
+      "epoch": 0.3666214477025288,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.001462342576039182,
+      "loss": 1.135,
+      "step": 5270
+    },
+    {
+      "epoch": 0.366691015339664,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0014621427612225154,
+      "loss": 0.8164,
+      "step": 5271
+    },
+    {
+      "epoch": 0.3667605829767992,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0014619429229405685,
+      "loss": 0.8362,
+      "step": 5272
+    },
+    {
+      "epoch": 0.3668301506139344,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0014617430612034884,
+      "loss": 0.6728,
+      "step": 5273
+    },
+    {
+      "epoch": 0.3668997182510696,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0014615431760214232,
+      "loss": 0.7946,
+      "step": 5274
+    },
+    {
+      "epoch": 0.3669692858882048,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0014613432674045216,
+      "loss": 0.9709,
+      "step": 5275
+    },
+    {
+      "epoch": 0.36703885352534,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0014611433353629347,
+      "loss": 0.8764,
+      "step": 5276
+    },
+    {
+      "epoch": 0.3671084211624752,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0014609433799068132,
+      "loss": 0.7207,
+      "step": 5277
+    },
+    {
+      "epoch": 0.36717798879961044,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0014607434010463103,
+      "loss": 0.8309,
+      "step": 5278
+    },
+    {
+      "epoch": 0.3672475564367456,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0014605433987915797,
+      "loss": 0.7943,
+      "step": 5279
+    },
+    {
+      "epoch": 0.36731712407388084,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.0014603433731527767,
+      "loss": 1.1244,
+      "step": 5280
+    },
+    {
+      "epoch": 0.367386691711016,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0014601433241400576,
+      "loss": 0.9917,
+      "step": 5281
+    },
+    {
+      "epoch": 0.36745625934815124,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0014599432517635796,
+      "loss": 0.9159,
+      "step": 5282
+    },
+    {
+      "epoch": 0.36752582698528646,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0014597431560335018,
+      "loss": 0.8739,
+      "step": 5283
+    },
+    {
+      "epoch": 0.36759539462242163,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0014595430369599837,
+      "loss": 1.2074,
+      "step": 5284
+    },
+    {
+      "epoch": 0.36766496225955686,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0014593428945531863,
+      "loss": 0.9927,
+      "step": 5285
+    },
+    {
+      "epoch": 0.3677345298966921,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0014591427288232722,
+      "loss": 0.8677,
+      "step": 5286
+    },
+    {
+      "epoch": 0.36780409753382726,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0014589425397804044,
+      "loss": 1.0091,
+      "step": 5287
+    },
+    {
+      "epoch": 0.3678736651709625,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0014587423274347478,
+      "loss": 1.0127,
+      "step": 5288
+    },
+    {
+      "epoch": 0.36794323280809765,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0014585420917964677,
+      "loss": 1.1267,
+      "step": 5289
+    },
+    {
+      "epoch": 0.3680128004452329,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.001458341832875732,
+      "loss": 0.7491,
+      "step": 5290
+    },
+    {
+      "epoch": 0.3680823680823681,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0014581415506827078,
+      "loss": 0.7425,
+      "step": 5291
+    },
+    {
+      "epoch": 0.3681519357195033,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0014579412452275654,
+      "loss": 0.721,
+      "step": 5292
+    },
+    {
+      "epoch": 0.3682215033566385,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0014577409165204742,
+      "loss": 0.6941,
+      "step": 5293
+    },
+    {
+      "epoch": 0.3682910709937737,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0014575405645716065,
+      "loss": 1.0086,
+      "step": 5294
+    },
+    {
+      "epoch": 0.3683606386309089,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0014573401893911353,
+      "loss": 0.7428,
+      "step": 5295
+    },
+    {
+      "epoch": 0.3684302062680441,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0014571397909892343,
+      "loss": 0.7044,
+      "step": 5296
+    },
+    {
+      "epoch": 0.3684997739051793,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.001456939369376079,
+      "loss": 0.9907,
+      "step": 5297
+    },
+    {
+      "epoch": 0.3685693415423145,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0014567389245618454,
+      "loss": 0.6828,
+      "step": 5298
+    },
+    {
+      "epoch": 0.36863890917944975,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.001456538456556712,
+      "loss": 1.0632,
+      "step": 5299
+    },
+    {
+      "epoch": 0.3687084768165849,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0014563379653708562,
+      "loss": 0.7712,
+      "step": 5300
+    },
+    {
+      "epoch": 0.36877804445372014,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0014561374510144588,
+      "loss": 1.0056,
+      "step": 5301
+    },
+    {
+      "epoch": 0.3688476120908553,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.001455936913497701,
+      "loss": 0.8905,
+      "step": 5302
+    },
+    {
+      "epoch": 0.36891717972799054,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0014557363528307646,
+      "loss": 0.9535,
+      "step": 5303
+    },
+    {
+      "epoch": 0.36898674736512577,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0014555357690238333,
+      "loss": 0.8259,
+      "step": 5304
+    },
+    {
+      "epoch": 0.36905631500226094,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0014553351620870917,
+      "loss": 0.7287,
+      "step": 5305
+    },
+    {
+      "epoch": 0.36912588263939616,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.001455134532030726,
+      "loss": 1.0635,
+      "step": 5306
+    },
+    {
+      "epoch": 0.36919545027653133,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0014549338788649223,
+      "loss": 1.118,
+      "step": 5307
+    },
+    {
+      "epoch": 0.36926501791366656,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0014547332025998693,
+      "loss": 0.7552,
+      "step": 5308
+    },
+    {
+      "epoch": 0.3693345855508018,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0014545325032457566,
+      "loss": 0.8597,
+      "step": 5309
+    },
+    {
+      "epoch": 0.36940415318793696,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0014543317808127741,
+      "loss": 0.9539,
+      "step": 5310
+    },
+    {
+      "epoch": 0.3694737208250722,
+      "grad_norm": 1.125,
+      "learning_rate": 0.001454131035311114,
+      "loss": 0.8004,
+      "step": 5311
+    },
+    {
+      "epoch": 0.3695432884622074,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.001453930266750969,
+      "loss": 0.9075,
+      "step": 5312
+    },
+    {
+      "epoch": 0.3696128560993426,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.001453729475142533,
+      "loss": 0.7649,
+      "step": 5313
+    },
+    {
+      "epoch": 0.3696824237364778,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0014535286604960007,
+      "loss": 0.8814,
+      "step": 5314
+    },
+    {
+      "epoch": 0.369751991373613,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0014533278228215697,
+      "loss": 0.6539,
+      "step": 5315
+    },
+    {
+      "epoch": 0.3698215590107482,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0014531269621294366,
+      "loss": 0.967,
+      "step": 5316
+    },
+    {
+      "epoch": 0.36989112664788343,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0014529260784297998,
+      "loss": 0.7468,
+      "step": 5317
+    },
+    {
+      "epoch": 0.3699606942850186,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0014527251717328603,
+      "loss": 0.7324,
+      "step": 5318
+    },
+    {
+      "epoch": 0.3700302619221538,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0014525242420488178,
+      "loss": 0.7423,
+      "step": 5319
+    },
+    {
+      "epoch": 0.370099829559289,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.001452323289387876,
+      "loss": 0.6566,
+      "step": 5320
+    },
+    {
+      "epoch": 0.3701693971964242,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0014521223137602367,
+      "loss": 0.976,
+      "step": 5321
+    },
+    {
+      "epoch": 0.37023896483355945,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.0014519213151761056,
+      "loss": 0.8218,
+      "step": 5322
+    },
+    {
+      "epoch": 0.3703085324706946,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0014517202936456877,
+      "loss": 0.9544,
+      "step": 5323
+    },
+    {
+      "epoch": 0.37037810010782984,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0014515192491791904,
+      "loss": 0.987,
+      "step": 5324
+    },
+    {
+      "epoch": 0.370447667744965,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0014513181817868215,
+      "loss": 1.1131,
+      "step": 5325
+    },
+    {
+      "epoch": 0.37051723538210024,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0014511170914787899,
+      "loss": 0.7968,
+      "step": 5326
+    },
+    {
+      "epoch": 0.37058680301923547,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0014509159782653063,
+      "loss": 1.0672,
+      "step": 5327
+    },
+    {
+      "epoch": 0.37065637065637064,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.001450714842156582,
+      "loss": 0.7135,
+      "step": 5328
+    },
+    {
+      "epoch": 0.37072593829350586,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.00145051368316283,
+      "loss": 0.7866,
+      "step": 5329
+    },
+    {
+      "epoch": 0.3707955059306411,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0014503125012942637,
+      "loss": 0.9565,
+      "step": 5330
+    },
+    {
+      "epoch": 0.37086507356777626,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0014501112965610986,
+      "loss": 0.9084,
+      "step": 5331
+    },
+    {
+      "epoch": 0.3709346412049115,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0014499100689735504,
+      "loss": 1.0043,
+      "step": 5332
+    },
+    {
+      "epoch": 0.37100420884204666,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0014497088185418364,
+      "loss": 0.8889,
+      "step": 5333
+    },
+    {
+      "epoch": 0.3710737764791819,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0014495075452761758,
+      "loss": 0.9213,
+      "step": 5334
+    },
+    {
+      "epoch": 0.3711433441163171,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0014493062491867871,
+      "loss": 0.8642,
+      "step": 5335
+    },
+    {
+      "epoch": 0.3712129117534523,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0014491049302838923,
+      "loss": 0.7673,
+      "step": 5336
+    },
+    {
+      "epoch": 0.3712824793905875,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0014489035885777125,
+      "loss": 0.9089,
+      "step": 5337
+    },
+    {
+      "epoch": 0.3713520470277227,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0014487022240784713,
+      "loss": 0.8892,
+      "step": 5338
+    },
+    {
+      "epoch": 0.3714216146648579,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0014485008367963927,
+      "loss": 0.8305,
+      "step": 5339
+    },
+    {
+      "epoch": 0.37149118230199313,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0014482994267417022,
+      "loss": 1.1563,
+      "step": 5340
+    },
+    {
+      "epoch": 0.3715607499391283,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0014480979939246266,
+      "loss": 0.7855,
+      "step": 5341
+    },
+    {
+      "epoch": 0.3716303175762635,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.001447896538355393,
+      "loss": 0.8052,
+      "step": 5342
+    },
+    {
+      "epoch": 0.37169988521339875,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0014476950600442315,
+      "loss": 0.8226,
+      "step": 5343
+    },
+    {
+      "epoch": 0.3717694528505339,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0014474935590013704,
+      "loss": 0.7776,
+      "step": 5344
+    },
+    {
+      "epoch": 0.37183902048766915,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0014472920352370426,
+      "loss": 0.7737,
+      "step": 5345
+    },
+    {
+      "epoch": 0.3719085881248043,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0014470904887614795,
+      "loss": 0.8102,
+      "step": 5346
+    },
+    {
+      "epoch": 0.37197815576193954,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.001446888919584915,
+      "loss": 0.8435,
+      "step": 5347
+    },
+    {
+      "epoch": 0.37204772339907477,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0014466873277175839,
+      "loss": 0.832,
+      "step": 5348
+    },
+    {
+      "epoch": 0.37211729103620994,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0014464857131697214,
+      "loss": 0.8213,
+      "step": 5349
+    },
+    {
+      "epoch": 0.37218685867334517,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.001446284075951565,
+      "loss": 0.6133,
+      "step": 5350
+    },
+    {
+      "epoch": 0.37225642631048034,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0014460824160733524,
+      "loss": 0.9,
+      "step": 5351
+    },
+    {
+      "epoch": 0.37232599394761556,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0014458807335453235,
+      "loss": 0.6688,
+      "step": 5352
+    },
+    {
+      "epoch": 0.3723955615847508,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0014456790283777182,
+      "loss": 0.7935,
+      "step": 5353
+    },
+    {
+      "epoch": 0.37246512922188596,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.001445477300580778,
+      "loss": 0.8331,
+      "step": 5354
+    },
+    {
+      "epoch": 0.3725346968590212,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.001445275550164746,
+      "loss": 1.0071,
+      "step": 5355
+    },
+    {
+      "epoch": 0.3726042644961564,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0014450737771398662,
+      "loss": 0.6313,
+      "step": 5356
+    },
+    {
+      "epoch": 0.3726738321332916,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0014448719815163833,
+      "loss": 0.9666,
+      "step": 5357
+    },
+    {
+      "epoch": 0.3727433997704268,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0014446701633045432,
+      "loss": 0.989,
+      "step": 5358
+    },
+    {
+      "epoch": 0.372812967407562,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0014444683225145938,
+      "loss": 0.795,
+      "step": 5359
+    },
+    {
+      "epoch": 0.3728825350446972,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.001444266459156783,
+      "loss": 0.9144,
+      "step": 5360
+    },
+    {
+      "epoch": 0.37295210268183243,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0014440645732413607,
+      "loss": 1.0063,
+      "step": 5361
+    },
+    {
+      "epoch": 0.3730216703189676,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0014438626647785779,
+      "loss": 0.8353,
+      "step": 5362
+    },
+    {
+      "epoch": 0.37309123795610283,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0014436607337786859,
+      "loss": 0.707,
+      "step": 5363
+    },
+    {
+      "epoch": 0.373160805593238,
+      "grad_norm": 1.59375,
+      "learning_rate": 0.0014434587802519383,
+      "loss": 1.0309,
+      "step": 5364
+    },
+    {
+      "epoch": 0.3732303732303732,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0014432568042085886,
+      "loss": 0.9463,
+      "step": 5365
+    },
+    {
+      "epoch": 0.37329994086750845,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.001443054805658893,
+      "loss": 0.8642,
+      "step": 5366
+    },
+    {
+      "epoch": 0.3733695085046436,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0014428527846131072,
+      "loss": 0.9723,
+      "step": 5367
+    },
+    {
+      "epoch": 0.37343907614177885,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.0014426507410814895,
+      "loss": 0.9318,
+      "step": 5368
+    },
+    {
+      "epoch": 0.3735086437789141,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.001442448675074298,
+      "loss": 0.8876,
+      "step": 5369
+    },
+    {
+      "epoch": 0.37357821141604924,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.001442246586601793,
+      "loss": 0.8524,
+      "step": 5370
+    },
+    {
+      "epoch": 0.37364777905318447,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.0014420444756742354,
+      "loss": 0.6487,
+      "step": 5371
+    },
+    {
+      "epoch": 0.37371734669031964,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0014418423423018876,
+      "loss": 0.7525,
+      "step": 5372
+    },
+    {
+      "epoch": 0.37378691432745487,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.001441640186495013,
+      "loss": 1.0137,
+      "step": 5373
+    },
+    {
+      "epoch": 0.3738564819645901,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0014414380082638748,
+      "loss": 0.9516,
+      "step": 5374
+    },
+    {
+      "epoch": 0.37392604960172526,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0014412358076187402,
+      "loss": 0.7149,
+      "step": 5375
+    },
+    {
+      "epoch": 0.3739956172388605,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.001441033584569875,
+      "loss": 0.9754,
+      "step": 5376
+    },
+    {
+      "epoch": 0.37406518487599566,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0014408313391275475,
+      "loss": 0.6876,
+      "step": 5377
+    },
+    {
+      "epoch": 0.3741347525131309,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0014406290713020265,
+      "loss": 0.8897,
+      "step": 5378
+    },
+    {
+      "epoch": 0.3742043201502661,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0014404267811035823,
+      "loss": 1.1495,
+      "step": 5379
+    },
+    {
+      "epoch": 0.3742738877874013,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0014402244685424862,
+      "loss": 0.9556,
+      "step": 5380
+    },
+    {
+      "epoch": 0.3743434554245365,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00144002213362901,
+      "loss": 0.975,
+      "step": 5381
+    },
+    {
+      "epoch": 0.37441302306167173,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0014398197763734282,
+      "loss": 0.9633,
+      "step": 5382
+    },
+    {
+      "epoch": 0.3744825906988069,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0014396173967860149,
+      "loss": 0.9289,
+      "step": 5383
+    },
+    {
+      "epoch": 0.37455215833594213,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.001439414994877046,
+      "loss": 0.7651,
+      "step": 5384
+    },
+    {
+      "epoch": 0.3746217259730773,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0014392125706567981,
+      "loss": 0.7602,
+      "step": 5385
+    },
+    {
+      "epoch": 0.37469129361021253,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0014390101241355503,
+      "loss": 0.5503,
+      "step": 5386
+    },
+    {
+      "epoch": 0.37476086124734775,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0014388076553235808,
+      "loss": 0.9548,
+      "step": 5387
+    },
+    {
+      "epoch": 0.3748304288844829,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0014386051642311705,
+      "loss": 0.8298,
+      "step": 5388
+    },
+    {
+      "epoch": 0.37489999652161815,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0014384026508686006,
+      "loss": 0.9172,
+      "step": 5389
+    },
+    {
+      "epoch": 0.3749695641587533,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0014382001152461537,
+      "loss": 0.8015,
+      "step": 5390
+    },
+    {
+      "epoch": 0.37503913179588855,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0014379975573741135,
+      "loss": 0.8311,
+      "step": 5391
+    },
+    {
+      "epoch": 0.3751086994330238,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.0014377949772627651,
+      "loss": 0.703,
+      "step": 5392
+    },
+    {
+      "epoch": 0.37517826707015894,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0014375923749223947,
+      "loss": 0.8893,
+      "step": 5393
+    },
+    {
+      "epoch": 0.37524783470729417,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.001437389750363289,
+      "loss": 0.7522,
+      "step": 5394
+    },
+    {
+      "epoch": 0.3753174023444294,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0014371871035957363,
+      "loss": 0.9608,
+      "step": 5395
+    },
+    {
+      "epoch": 0.37538696998156457,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0014369844346300265,
+      "loss": 0.9194,
+      "step": 5396
+    },
+    {
+      "epoch": 0.3754565376186998,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.001436781743476449,
+      "loss": 0.9093,
+      "step": 5397
+    },
+    {
+      "epoch": 0.37552610525583496,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0014365790301452963,
+      "loss": 0.8586,
+      "step": 5398
+    },
+    {
+      "epoch": 0.3755956728929702,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.001436376294646861,
+      "loss": 0.9752,
+      "step": 5399
+    },
+    {
+      "epoch": 0.3756652405301054,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.001436173536991437,
+      "loss": 0.8677,
+      "step": 5400
+    },
+    {
+      "epoch": 0.3757348081672406,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0014359707571893194,
+      "loss": 0.8918,
+      "step": 5401
+    },
+    {
+      "epoch": 0.3758043758043758,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0014357679552508041,
+      "loss": 0.7776,
+      "step": 5402
+    },
+    {
+      "epoch": 0.375873943441511,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0014355651311861886,
+      "loss": 0.8775,
+      "step": 5403
+    },
+    {
+      "epoch": 0.3759435110786462,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0014353622850057709,
+      "loss": 1.008,
+      "step": 5404
+    },
+    {
+      "epoch": 0.37601307871578143,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0014351594167198508,
+      "loss": 0.8674,
+      "step": 5405
+    },
+    {
+      "epoch": 0.3760826463529166,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.001434956526338729,
+      "loss": 0.831,
+      "step": 5406
+    },
+    {
+      "epoch": 0.37615221399005183,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.001434753613872707,
+      "loss": 0.9915,
+      "step": 5407
+    },
+    {
+      "epoch": 0.37622178162718706,
+      "grad_norm": 1.59375,
+      "learning_rate": 0.001434550679332088,
+      "loss": 1.0004,
+      "step": 5408
+    },
+    {
+      "epoch": 0.37629134926432223,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0014343477227271757,
+      "loss": 0.7761,
+      "step": 5409
+    },
+    {
+      "epoch": 0.37636091690145745,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0014341447440682754,
+      "loss": 0.9584,
+      "step": 5410
+    },
+    {
+      "epoch": 0.3764304845385926,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.001433941743365693,
+      "loss": 0.7237,
+      "step": 5411
+    },
+    {
+      "epoch": 0.37650005217572785,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0014337387206297364,
+      "loss": 0.6604,
+      "step": 5412
+    },
+    {
+      "epoch": 0.3765696198128631,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0014335356758707137,
+      "loss": 0.8516,
+      "step": 5413
+    },
+    {
+      "epoch": 0.37663918744999825,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0014333326090989345,
+      "loss": 0.85,
+      "step": 5414
+    },
+    {
+      "epoch": 0.3767087550871335,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0014331295203247095,
+      "loss": 0.8663,
+      "step": 5415
+    },
+    {
+      "epoch": 0.37677832272426864,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0014329264095583505,
+      "loss": 0.7427,
+      "step": 5416
+    },
+    {
+      "epoch": 0.37684789036140387,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0014327232768101708,
+      "loss": 0.9801,
+      "step": 5417
+    },
+    {
+      "epoch": 0.3769174579985391,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.001432520122090484,
+      "loss": 1.0844,
+      "step": 5418
+    },
+    {
+      "epoch": 0.37698702563567427,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0014323169454096057,
+      "loss": 1.1017,
+      "step": 5419
+    },
+    {
+      "epoch": 0.3770565932728095,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0014321137467778518,
+      "loss": 0.9933,
+      "step": 5420
+    },
+    {
+      "epoch": 0.3771261609099447,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0014319105262055399,
+      "loss": 0.974,
+      "step": 5421
+    },
+    {
+      "epoch": 0.3771957285470799,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0014317072837029883,
+      "loss": 0.8098,
+      "step": 5422
+    },
+    {
+      "epoch": 0.3772652961842151,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.001431504019280517,
+      "loss": 0.7295,
+      "step": 5423
+    },
+    {
+      "epoch": 0.3773348638213503,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0014313007329484462,
+      "loss": 0.6766,
+      "step": 5424
+    },
+    {
+      "epoch": 0.3774044314584855,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0014310974247170984,
+      "loss": 0.9285,
+      "step": 5425
+    },
+    {
+      "epoch": 0.37747399909562074,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0014308940945967964,
+      "loss": 0.9051,
+      "step": 5426
+    },
+    {
+      "epoch": 0.3775435667327559,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.001430690742597864,
+      "loss": 0.9952,
+      "step": 5427
+    },
+    {
+      "epoch": 0.37761313436989113,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0014304873687306264,
+      "loss": 1.0094,
+      "step": 5428
+    },
+    {
+      "epoch": 0.3776827020070263,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.00143028397300541,
+      "loss": 0.8287,
+      "step": 5429
+    },
+    {
+      "epoch": 0.37775226964416153,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0014300805554325424,
+      "loss": 0.6493,
+      "step": 5430
+    },
+    {
+      "epoch": 0.37782183728129676,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.001429877116022352,
+      "loss": 0.8438,
+      "step": 5431
+    },
+    {
+      "epoch": 0.37789140491843193,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0014296736547851684,
+      "loss": 0.8506,
+      "step": 5432
+    },
+    {
+      "epoch": 0.37796097255556715,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.001429470171731322,
+      "loss": 0.889,
+      "step": 5433
+    },
+    {
+      "epoch": 0.3780305401927024,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0014292666668711453,
+      "loss": 0.8362,
+      "step": 5434
+    },
+    {
+      "epoch": 0.37810010782983755,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0014290631402149709,
+      "loss": 0.7322,
+      "step": 5435
+    },
+    {
+      "epoch": 0.3781696754669728,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0014288595917731329,
+      "loss": 0.9391,
+      "step": 5436
+    },
+    {
+      "epoch": 0.37823924310410795,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0014286560215559664,
+      "loss": 0.7884,
+      "step": 5437
+    },
+    {
+      "epoch": 0.3783088107412432,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0014284524295738075,
+      "loss": 0.8923,
+      "step": 5438
+    },
+    {
+      "epoch": 0.3783783783783784,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.001428248815836994,
+      "loss": 0.8666,
+      "step": 5439
+    },
+    {
+      "epoch": 0.37844794601551357,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.001428045180355864,
+      "loss": 0.9711,
+      "step": 5440
+    },
+    {
+      "epoch": 0.3785175136526488,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0014278415231407575,
+      "loss": 0.5071,
+      "step": 5441
+    },
+    {
+      "epoch": 0.37858708128978397,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0014276378442020148,
+      "loss": 0.8519,
+      "step": 5442
+    },
+    {
+      "epoch": 0.3786566489269192,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0014274341435499779,
+      "loss": 1.1103,
+      "step": 5443
+    },
+    {
+      "epoch": 0.3787262165640544,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0014272304211949895,
+      "loss": 1.0963,
+      "step": 5444
+    },
+    {
+      "epoch": 0.3787957842011896,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0014270266771473938,
+      "loss": 0.914,
+      "step": 5445
+    },
+    {
+      "epoch": 0.3788653518383248,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0014268229114175357,
+      "loss": 0.9931,
+      "step": 5446
+    },
+    {
+      "epoch": 0.37893491947546004,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0014266191240157617,
+      "loss": 0.775,
+      "step": 5447
+    },
+    {
+      "epoch": 0.3790044871125952,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0014264153149524189,
+      "loss": 0.9089,
+      "step": 5448
+    },
+    {
+      "epoch": 0.37907405474973044,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0014262114842378555,
+      "loss": 0.7882,
+      "step": 5449
+    },
+    {
+      "epoch": 0.3791436223868656,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0014260076318824211,
+      "loss": 0.9675,
+      "step": 5450
+    },
+    {
+      "epoch": 0.37921319002400083,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0014258037578964667,
+      "loss": 0.8656,
+      "step": 5451
+    },
+    {
+      "epoch": 0.37928275766113606,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0014255998622903433,
+      "loss": 0.8968,
+      "step": 5452
+    },
+    {
+      "epoch": 0.37935232529827123,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0014253959450744045,
+      "loss": 0.8417,
+      "step": 5453
+    },
+    {
+      "epoch": 0.37942189293540646,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0014251920062590036,
+      "loss": 1.1655,
+      "step": 5454
+    },
+    {
+      "epoch": 0.37949146057254163,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0014249880458544956,
+      "loss": 0.9787,
+      "step": 5455
+    },
+    {
+      "epoch": 0.37956102820967685,
+      "grad_norm": 0.8125,
+      "learning_rate": 0.001424784063871237,
+      "loss": 0.4948,
+      "step": 5456
+    },
+    {
+      "epoch": 0.3796305958468121,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0014245800603195846,
+      "loss": 0.6736,
+      "step": 5457
+    },
+    {
+      "epoch": 0.37970016348394725,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0014243760352098968,
+      "loss": 0.9282,
+      "step": 5458
+    },
+    {
+      "epoch": 0.3797697311210825,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.001424171988552533,
+      "loss": 0.7448,
+      "step": 5459
+    },
+    {
+      "epoch": 0.3798392987582177,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0014239679203578532,
+      "loss": 0.9497,
+      "step": 5460
+    },
+    {
+      "epoch": 0.3799088663953529,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00142376383063622,
+      "loss": 1.0226,
+      "step": 5461
+    },
+    {
+      "epoch": 0.3799784340324881,
+      "grad_norm": 0.75390625,
+      "learning_rate": 0.001423559719397995,
+      "loss": 1.1281,
+      "step": 5462
+    },
+    {
+      "epoch": 0.38004800166962327,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.0014233555866535424,
+      "loss": 0.8493,
+      "step": 5463
+    },
+    {
+      "epoch": 0.3801175693067585,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0014231514324132269,
+      "loss": 0.9384,
+      "step": 5464
+    },
+    {
+      "epoch": 0.3801871369438937,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0014229472566874147,
+      "loss": 0.6505,
+      "step": 5465
+    },
+    {
+      "epoch": 0.3802567045810289,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.0014227430594864726,
+      "loss": 0.7257,
+      "step": 5466
+    },
+    {
+      "epoch": 0.3803262722181641,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0014225388408207684,
+      "loss": 1.0635,
+      "step": 5467
+    },
+    {
+      "epoch": 0.3803958398552993,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.001422334600700672,
+      "loss": 0.6463,
+      "step": 5468
+    },
+    {
+      "epoch": 0.3804654074924345,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0014221303391365532,
+      "loss": 1.1023,
+      "step": 5469
+    },
+    {
+      "epoch": 0.38053497512956974,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0014219260561387835,
+      "loss": 0.7375,
+      "step": 5470
+    },
+    {
+      "epoch": 0.3806045427667049,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0014217217517177353,
+      "loss": 0.8715,
+      "step": 5471
+    },
+    {
+      "epoch": 0.38067411040384014,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.001421517425883782,
+      "loss": 0.6879,
+      "step": 5472
+    },
+    {
+      "epoch": 0.38074367804097536,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0014213130786472985,
+      "loss": 0.891,
+      "step": 5473
+    },
+    {
+      "epoch": 0.38081324567811053,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0014211087100186605,
+      "loss": 0.9219,
+      "step": 5474
+    },
+    {
+      "epoch": 0.38088281331524576,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.001420904320008245,
+      "loss": 0.9867,
+      "step": 5475
+    },
+    {
+      "epoch": 0.38095238095238093,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0014206999086264292,
+      "loss": 1.0281,
+      "step": 5476
+    },
+    {
+      "epoch": 0.38102194858951616,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0014204954758835929,
+      "loss": 0.8644,
+      "step": 5477
+    },
+    {
+      "epoch": 0.3810915162266514,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.0014202910217901155,
+      "loss": 0.914,
+      "step": 5478
+    },
+    {
+      "epoch": 0.38116108386378655,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0014200865463563786,
+      "loss": 0.8536,
+      "step": 5479
+    },
+    {
+      "epoch": 0.3812306515009218,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0014198820495927643,
+      "loss": 1.0379,
+      "step": 5480
+    },
+    {
+      "epoch": 0.38130021913805695,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0014196775315096558,
+      "loss": 0.879,
+      "step": 5481
+    },
+    {
+      "epoch": 0.3813697867751922,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0014194729921174374,
+      "loss": 0.9843,
+      "step": 5482
+    },
+    {
+      "epoch": 0.3814393544123274,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0014192684314264952,
+      "loss": 0.9935,
+      "step": 5483
+    },
+    {
+      "epoch": 0.3815089220494626,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.001419063849447215,
+      "loss": 0.9397,
+      "step": 5484
+    },
+    {
+      "epoch": 0.3815784896865978,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.0014188592461899848,
+      "loss": 0.958,
+      "step": 5485
+    },
+    {
+      "epoch": 0.381648057323733,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0014186546216651932,
+      "loss": 0.9617,
+      "step": 5486
+    },
+    {
+      "epoch": 0.3817176249608682,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0014184499758832304,
+      "loss": 0.8839,
+      "step": 5487
+    },
+    {
+      "epoch": 0.3817871925980034,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0014182453088544867,
+      "loss": 1.0621,
+      "step": 5488
+    },
+    {
+      "epoch": 0.3818567602351386,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0014180406205893546,
+      "loss": 1.0318,
+      "step": 5489
+    },
+    {
+      "epoch": 0.3819263278722738,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0014178359110982265,
+      "loss": 0.7853,
+      "step": 5490
+    },
+    {
+      "epoch": 0.38199589550940904,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0014176311803914972,
+      "loss": 0.9092,
+      "step": 5491
+    },
+    {
+      "epoch": 0.3820654631465442,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0014174264284795614,
+      "loss": 0.8226,
+      "step": 5492
+    },
+    {
+      "epoch": 0.38213503078367944,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0014172216553728152,
+      "loss": 0.8707,
+      "step": 5493
+    },
+    {
+      "epoch": 0.3822045984208146,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.001417016861081657,
+      "loss": 0.9307,
+      "step": 5494
+    },
+    {
+      "epoch": 0.38227416605794984,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.001416812045616484,
+      "loss": 0.5553,
+      "step": 5495
+    },
+    {
+      "epoch": 0.38234373369508506,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0014166072089876968,
+      "loss": 0.9959,
+      "step": 5496
+    },
+    {
+      "epoch": 0.38241330133222023,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.001416402351205695,
+      "loss": 0.7066,
+      "step": 5497
+    },
+    {
+      "epoch": 0.38248286896935546,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0014161974722808803,
+      "loss": 0.7988,
+      "step": 5498
+    },
+    {
+      "epoch": 0.3825524366064907,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.001415992572223656,
+      "loss": 1.0347,
+      "step": 5499
+    },
+    {
+      "epoch": 0.38262200424362586,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0014157876510444256,
+      "loss": 0.923,
+      "step": 5500
+    },
+    {
+      "epoch": 0.3826915718807611,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0014155827087535943,
+      "loss": 0.6956,
+      "step": 5501
+    },
+    {
+      "epoch": 0.38276113951789625,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0014153777453615678,
+      "loss": 0.7743,
+      "step": 5502
+    },
+    {
+      "epoch": 0.3828307071550315,
+      "grad_norm": 1.8046875,
+      "learning_rate": 0.0014151727608787525,
+      "loss": 1.0726,
+      "step": 5503
+    },
+    {
+      "epoch": 0.3829002747921667,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0014149677553155575,
+      "loss": 0.8967,
+      "step": 5504
+    },
+    {
+      "epoch": 0.3829698424293019,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0014147627286823915,
+      "loss": 0.9015,
+      "step": 5505
+    },
+    {
+      "epoch": 0.3830394100664371,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0014145576809896643,
+      "loss": 0.7625,
+      "step": 5506
+    },
+    {
+      "epoch": 0.3831089777035723,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0014143526122477879,
+      "loss": 0.8154,
+      "step": 5507
+    },
+    {
+      "epoch": 0.3831785453407075,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0014141475224671743,
+      "loss": 0.9021,
+      "step": 5508
+    },
+    {
+      "epoch": 0.3832481129778427,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0014139424116582364,
+      "loss": 1.1319,
+      "step": 5509
+    },
+    {
+      "epoch": 0.3833176806149779,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.00141373727983139,
+      "loss": 1.0978,
+      "step": 5510
+    },
+    {
+      "epoch": 0.3833872482521131,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0014135321269970497,
+      "loss": 0.9746,
+      "step": 5511
+    },
+    {
+      "epoch": 0.38345681588924835,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0014133269531656323,
+      "loss": 0.7934,
+      "step": 5512
+    },
+    {
+      "epoch": 0.3835263835263835,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0014131217583475558,
+      "loss": 0.7595,
+      "step": 5513
+    },
+    {
+      "epoch": 0.38359595116351874,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0014129165425532384,
+      "loss": 1.0819,
+      "step": 5514
+    },
+    {
+      "epoch": 0.3836655188006539,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0014127113057931003,
+      "loss": 0.8422,
+      "step": 5515
+    },
+    {
+      "epoch": 0.38373508643778914,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.001412506048077562,
+      "loss": 1.0966,
+      "step": 5516
+    },
+    {
+      "epoch": 0.38380465407492437,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.0014123007694170461,
+      "loss": 1.1284,
+      "step": 5517
+    },
+    {
+      "epoch": 0.38387422171205954,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0014120954698219755,
+      "loss": 0.7266,
+      "step": 5518
+    },
+    {
+      "epoch": 0.38394378934919476,
+      "grad_norm": 0.7578125,
+      "learning_rate": 0.0014118901493027738,
+      "loss": 0.5555,
+      "step": 5519
+    },
+    {
+      "epoch": 0.38401335698632993,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0014116848078698663,
+      "loss": 0.8231,
+      "step": 5520
+    },
+    {
+      "epoch": 0.38408292462346516,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0014114794455336794,
+      "loss": 0.8969,
+      "step": 5521
+    },
+    {
+      "epoch": 0.3841524922606004,
+      "grad_norm": 1.4765625,
+      "learning_rate": 0.0014112740623046403,
+      "loss": 1.1414,
+      "step": 5522
+    },
+    {
+      "epoch": 0.38422205989773556,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0014110686581931772,
+      "loss": 0.823,
+      "step": 5523
+    },
+    {
+      "epoch": 0.3842916275348708,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0014108632332097198,
+      "loss": 0.8634,
+      "step": 5524
+    },
+    {
+      "epoch": 0.384361195172006,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0014106577873646982,
+      "loss": 0.9133,
+      "step": 5525
+    },
+    {
+      "epoch": 0.3844307628091412,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.001410452320668544,
+      "loss": 0.9964,
+      "step": 5526
+    },
+    {
+      "epoch": 0.3845003304462764,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0014102468331316897,
+      "loss": 1.0287,
+      "step": 5527
+    },
+    {
+      "epoch": 0.3845698980834116,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.001410041324764569,
+      "loss": 0.8902,
+      "step": 5528
+    },
+    {
+      "epoch": 0.3846394657205468,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0014098357955776167,
+      "loss": 1.0256,
+      "step": 5529
+    },
+    {
+      "epoch": 0.38470903335768203,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0014096302455812683,
+      "loss": 0.858,
+      "step": 5530
+    },
+    {
+      "epoch": 0.3847786009948172,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0014094246747859609,
+      "loss": 0.997,
+      "step": 5531
+    },
+    {
+      "epoch": 0.3848481686319524,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0014092190832021318,
+      "loss": 0.7877,
+      "step": 5532
+    },
+    {
+      "epoch": 0.3849177362690876,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.001409013470840221,
+      "loss": 1.1309,
+      "step": 5533
+    },
+    {
+      "epoch": 0.3849873039062228,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0014088078377106673,
+      "loss": 0.7741,
+      "step": 5534
+    },
+    {
+      "epoch": 0.38505687154335805,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.001408602183823912,
+      "loss": 1.0019,
+      "step": 5535
+    },
+    {
+      "epoch": 0.3851264391804932,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.0014083965091903974,
+      "loss": 0.8954,
+      "step": 5536
+    },
+    {
+      "epoch": 0.38519600681762844,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0014081908138205664,
+      "loss": 0.8182,
+      "step": 5537
+    },
+    {
+      "epoch": 0.38526557445476367,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0014079850977248638,
+      "loss": 1.0547,
+      "step": 5538
+    },
+    {
+      "epoch": 0.38533514209189884,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0014077793609137336,
+      "loss": 1.1989,
+      "step": 5539
+    },
+    {
+      "epoch": 0.38540470972903407,
+      "grad_norm": 0.7734375,
+      "learning_rate": 0.0014075736033976236,
+      "loss": 0.7847,
+      "step": 5540
+    },
+    {
+      "epoch": 0.38547427736616924,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.00140736782518698,
+      "loss": 1.0591,
+      "step": 5541
+    },
+    {
+      "epoch": 0.38554384500330446,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0014071620262922516,
+      "loss": 0.8005,
+      "step": 5542
+    },
+    {
+      "epoch": 0.3856134126404397,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0014069562067238874,
+      "loss": 0.8348,
+      "step": 5543
+    },
+    {
+      "epoch": 0.38568298027757486,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0014067503664923387,
+      "loss": 0.655,
+      "step": 5544
+    },
+    {
+      "epoch": 0.3857525479147101,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0014065445056080563,
+      "loss": 0.8986,
+      "step": 5545
+    },
+    {
+      "epoch": 0.38582211555184526,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.001406338624081493,
+      "loss": 0.7869,
+      "step": 5546
+    },
+    {
+      "epoch": 0.3858916831889805,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0014061327219231025,
+      "loss": 0.9504,
+      "step": 5547
+    },
+    {
+      "epoch": 0.3859612508261157,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0014059267991433394,
+      "loss": 0.9495,
+      "step": 5548
+    },
+    {
+      "epoch": 0.3860308184632509,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.00140572085575266,
+      "loss": 0.7159,
+      "step": 5549
+    },
+    {
+      "epoch": 0.3861003861003861,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00140551489176152,
+      "loss": 0.8652,
+      "step": 5550
+    },
+    {
+      "epoch": 0.38616995373752133,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0014053089071803778,
+      "loss": 0.919,
+      "step": 5551
+    },
+    {
+      "epoch": 0.3862395213746565,
+      "grad_norm": 1.5546875,
+      "learning_rate": 0.001405102902019692,
+      "loss": 0.7285,
+      "step": 5552
+    },
+    {
+      "epoch": 0.38630908901179173,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.001404896876289923,
+      "loss": 1.0848,
+      "step": 5553
+    },
+    {
+      "epoch": 0.3863786566489269,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0014046908300015316,
+      "loss": 0.8936,
+      "step": 5554
+    },
+    {
+      "epoch": 0.3864482242860621,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0014044847631649792,
+      "loss": 0.9112,
+      "step": 5555
+    },
+    {
+      "epoch": 0.38651779192319735,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0014042786757907297,
+      "loss": 0.8496,
+      "step": 5556
+    },
+    {
+      "epoch": 0.3865873595603325,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0014040725678892466,
+      "loss": 0.8048,
+      "step": 5557
+    },
+    {
+      "epoch": 0.38665692719746775,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0014038664394709953,
+      "loss": 1.0822,
+      "step": 5558
+    },
+    {
+      "epoch": 0.3867264948346029,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.0014036602905464414,
+      "loss": 0.8533,
+      "step": 5559
+    },
+    {
+      "epoch": 0.38679606247173814,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0014034541211260527,
+      "loss": 0.7703,
+      "step": 5560
+    },
+    {
+      "epoch": 0.38686563010887337,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0014032479312202977,
+      "loss": 0.6636,
+      "step": 5561
+    },
+    {
+      "epoch": 0.38693519774600854,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.001403041720839645,
+      "loss": 0.8638,
+      "step": 5562
+    },
+    {
+      "epoch": 0.38700476538314377,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0014028354899945652,
+      "loss": 0.8446,
+      "step": 5563
+    },
+    {
+      "epoch": 0.387074333020279,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0014026292386955296,
+      "loss": 0.724,
+      "step": 5564
+    },
+    {
+      "epoch": 0.38714390065741416,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0014024229669530109,
+      "loss": 1.009,
+      "step": 5565
+    },
+    {
+      "epoch": 0.3872134682945494,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0014022166747774821,
+      "loss": 0.6755,
+      "step": 5566
+    },
+    {
+      "epoch": 0.38728303593168456,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0014020103621794177,
+      "loss": 0.6224,
+      "step": 5567
+    },
+    {
+      "epoch": 0.3873526035688198,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.001401804029169294,
+      "loss": 0.9592,
+      "step": 5568
+    },
+    {
+      "epoch": 0.387422171205955,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.001401597675757586,
+      "loss": 0.6962,
+      "step": 5569
+    },
+    {
+      "epoch": 0.3874917388430902,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0014013913019547731,
+      "loss": 0.9164,
+      "step": 5570
+    },
+    {
+      "epoch": 0.3875613064802254,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0014011849077713325,
+      "loss": 0.8144,
+      "step": 5571
+    },
+    {
+      "epoch": 0.3876308741173606,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.0014009784932177446,
+      "loss": 1.1767,
+      "step": 5572
+    },
+    {
+      "epoch": 0.3877004417544958,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0014007720583044901,
+      "loss": 0.8022,
+      "step": 5573
+    },
+    {
+      "epoch": 0.38777000939163103,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0014005656030420502,
+      "loss": 0.9998,
+      "step": 5574
+    },
+    {
+      "epoch": 0.3878395770287662,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0014003591274409084,
+      "loss": 0.8166,
+      "step": 5575
+    },
+    {
+      "epoch": 0.38790914466590143,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.0014001526315115475,
+      "loss": 0.8817,
+      "step": 5576
+    },
+    {
+      "epoch": 0.38797871230303665,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0013999461152644536,
+      "loss": 0.8195,
+      "step": 5577
+    },
+    {
+      "epoch": 0.3880482799401718,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.001399739578710111,
+      "loss": 1.1256,
+      "step": 5578
+    },
+    {
+      "epoch": 0.38811784757730705,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0013995330218590082,
+      "loss": 0.744,
+      "step": 5579
+    },
+    {
+      "epoch": 0.3881874152144422,
+      "grad_norm": 1.5703125,
+      "learning_rate": 0.0013993264447216317,
+      "loss": 1.0807,
+      "step": 5580
+    },
+    {
+      "epoch": 0.38825698285157745,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.001399119847308471,
+      "loss": 1.1585,
+      "step": 5581
+    },
+    {
+      "epoch": 0.3883265504887127,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0013989132296300172,
+      "loss": 0.9526,
+      "step": 5582
+    },
+    {
+      "epoch": 0.38839611812584784,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0013987065916967595,
+      "loss": 0.7696,
+      "step": 5583
+    },
+    {
+      "epoch": 0.38846568576298307,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0013984999335191909,
+      "loss": 1.1156,
+      "step": 5584
+    },
+    {
+      "epoch": 0.38853525340011824,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0013982932551078041,
+      "loss": 0.9445,
+      "step": 5585
+    },
+    {
+      "epoch": 0.38860482103725347,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0013980865564730935,
+      "loss": 0.777,
+      "step": 5586
+    },
+    {
+      "epoch": 0.3886743886743887,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0013978798376255536,
+      "loss": 0.9426,
+      "step": 5587
+    },
+    {
+      "epoch": 0.38874395631152386,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0013976730985756818,
+      "loss": 0.8967,
+      "step": 5588
+    },
+    {
+      "epoch": 0.3888135239486591,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0013974663393339739,
+      "loss": 1.0173,
+      "step": 5589
+    },
+    {
+      "epoch": 0.3888830915857943,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0013972595599109287,
+      "loss": 0.864,
+      "step": 5590
+    },
+    {
+      "epoch": 0.3889526592229295,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.0013970527603170458,
+      "loss": 0.7937,
+      "step": 5591
+    },
+    {
+      "epoch": 0.3890222268600647,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0013968459405628247,
+      "loss": 0.7681,
+      "step": 5592
+    },
+    {
+      "epoch": 0.3890917944971999,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.001396639100658767,
+      "loss": 0.7632,
+      "step": 5593
+    },
+    {
+      "epoch": 0.3891613621343351,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.001396432240615375,
+      "loss": 0.955,
+      "step": 5594
+    },
+    {
+      "epoch": 0.38923092977147034,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0013962253604431524,
+      "loss": 0.7488,
+      "step": 5595
+    },
+    {
+      "epoch": 0.3893004974086055,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0013960184601526024,
+      "loss": 0.8588,
+      "step": 5596
+    },
+    {
+      "epoch": 0.38937006504574073,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0013958115397542314,
+      "loss": 0.7897,
+      "step": 5597
+    },
+    {
+      "epoch": 0.3894396326828759,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0013956045992585457,
+      "loss": 0.7896,
+      "step": 5598
+    },
+    {
+      "epoch": 0.38950920032001113,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.001395397638676052,
+      "loss": 0.8627,
+      "step": 5599
+    },
+    {
+      "epoch": 0.38957876795714635,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0013951906580172595,
+      "loss": 0.908,
+      "step": 5600
+    },
+    {
+      "epoch": 0.3896483355942815,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0013949836572926771,
+      "loss": 1.0089,
+      "step": 5601
+    },
+    {
+      "epoch": 0.38971790323141675,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0013947766365128157,
+      "loss": 0.8768,
+      "step": 5602
+    },
+    {
+      "epoch": 0.389787470868552,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.001394569595688186,
+      "loss": 1.0241,
+      "step": 5603
+    },
+    {
+      "epoch": 0.38985703850568715,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0013943625348293014,
+      "loss": 0.9728,
+      "step": 5604
+    },
+    {
+      "epoch": 0.3899266061428224,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0013941554539466752,
+      "loss": 0.8221,
+      "step": 5605
+    },
+    {
+      "epoch": 0.38999617377995754,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0013939483530508213,
+      "loss": 1.0314,
+      "step": 5606
+    },
+    {
+      "epoch": 0.39006574141709277,
+      "grad_norm": 1.78125,
+      "learning_rate": 0.001393741232152256,
+      "loss": 1.1567,
+      "step": 5607
+    },
+    {
+      "epoch": 0.390135309054228,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0013935340912614954,
+      "loss": 0.7966,
+      "step": 5608
+    },
+    {
+      "epoch": 0.39020487669136317,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0013933269303890575,
+      "loss": 1.0969,
+      "step": 5609
+    },
+    {
+      "epoch": 0.3902744443284984,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00139311974954546,
+      "loss": 0.8874,
+      "step": 5610
+    },
+    {
+      "epoch": 0.39034401196563356,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0013929125487412233,
+      "loss": 0.7099,
+      "step": 5611
+    },
+    {
+      "epoch": 0.3904135796027688,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0013927053279868683,
+      "loss": 0.9192,
+      "step": 5612
+    },
+    {
+      "epoch": 0.390483147239904,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0013924980872929153,
+      "loss": 1.0022,
+      "step": 5613
+    },
+    {
+      "epoch": 0.3905527148770392,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0013922908266698884,
+      "loss": 0.9358,
+      "step": 5614
+    },
+    {
+      "epoch": 0.3906222825141744,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00139208354612831,
+      "loss": 0.8857,
+      "step": 5615
+    },
+    {
+      "epoch": 0.3906918501513096,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0013918762456787061,
+      "loss": 0.7615,
+      "step": 5616
+    },
+    {
+      "epoch": 0.3907614177884448,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0013916689253316013,
+      "loss": 0.769,
+      "step": 5617
+    },
+    {
+      "epoch": 0.39083098542558004,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0013914615850975226,
+      "loss": 0.9046,
+      "step": 5618
+    },
+    {
+      "epoch": 0.3909005530627152,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.0013912542249869978,
+      "loss": 0.9263,
+      "step": 5619
+    },
+    {
+      "epoch": 0.39097012069985043,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0013910468450105556,
+      "loss": 1.1956,
+      "step": 5620
+    },
+    {
+      "epoch": 0.39103968833698566,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0013908394451787255,
+      "loss": 0.8908,
+      "step": 5621
+    },
+    {
+      "epoch": 0.39110925597412083,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0013906320255020384,
+      "loss": 0.7833,
+      "step": 5622
+    },
+    {
+      "epoch": 0.39117882361125605,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.001390424585991026,
+      "loss": 1.0066,
+      "step": 5623
+    },
+    {
+      "epoch": 0.3912483912483912,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.001390217126656221,
+      "loss": 0.7541,
+      "step": 5624
+    },
+    {
+      "epoch": 0.39131795888552645,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0013900096475081571,
+      "loss": 0.8519,
+      "step": 5625
+    },
+    {
+      "epoch": 0.3913875265226617,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0013898021485573688,
+      "loss": 0.6573,
+      "step": 5626
+    },
+    {
+      "epoch": 0.39145709415979685,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0013895946298143923,
+      "loss": 0.8274,
+      "step": 5627
+    },
+    {
+      "epoch": 0.3915266617969321,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0013893870912897648,
+      "loss": 0.9457,
+      "step": 5628
+    },
+    {
+      "epoch": 0.39159622943406724,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.001389179532994023,
+      "loss": 0.8788,
+      "step": 5629
+    },
+    {
+      "epoch": 0.39166579707120247,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0013889719549377063,
+      "loss": 0.9339,
+      "step": 5630
+    },
+    {
+      "epoch": 0.3917353647083377,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0013887643571313538,
+      "loss": 0.8774,
+      "step": 5631
+    },
+    {
+      "epoch": 0.39180493234547287,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0013885567395855072,
+      "loss": 0.7744,
+      "step": 5632
+    },
+    {
+      "epoch": 0.3918744999826081,
+      "grad_norm": 0.875,
+      "learning_rate": 0.0013883491023107075,
+      "loss": 0.846,
+      "step": 5633
+    },
+    {
+      "epoch": 0.3919440676197433,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.001388141445317498,
+      "loss": 0.775,
+      "step": 5634
+    },
+    {
+      "epoch": 0.3920136352568785,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0013879337686164223,
+      "loss": 0.7387,
+      "step": 5635
+    },
+    {
+      "epoch": 0.3920832028940137,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0013877260722180253,
+      "loss": 0.8283,
+      "step": 5636
+    },
+    {
+      "epoch": 0.3921527705311489,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0013875183561328527,
+      "loss": 0.9993,
+      "step": 5637
+    },
+    {
+      "epoch": 0.3922223381682841,
+      "grad_norm": 1.25,
+      "learning_rate": 0.001387310620371451,
+      "loss": 1.0272,
+      "step": 5638
+    },
+    {
+      "epoch": 0.39229190580541934,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0013871028649443682,
+      "loss": 0.8058,
+      "step": 5639
+    },
+    {
+      "epoch": 0.3923614734425545,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.001386895089862153,
+      "loss": 0.7725,
+      "step": 5640
+    },
+    {
+      "epoch": 0.39243104107968974,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.0013866872951353553,
+      "loss": 0.952,
+      "step": 5641
+    },
+    {
+      "epoch": 0.3925006087168249,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0013864794807745258,
+      "loss": 0.7073,
+      "step": 5642
+    },
+    {
+      "epoch": 0.39257017635396013,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.0013862716467902163,
+      "loss": 0.7186,
+      "step": 5643
+    },
+    {
+      "epoch": 0.39263974399109536,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0013860637931929797,
+      "loss": 1.0442,
+      "step": 5644
+    },
+    {
+      "epoch": 0.39270931162823053,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0013858559199933693,
+      "loss": 0.9521,
+      "step": 5645
+    },
+    {
+      "epoch": 0.39277887926536575,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.0013856480272019405,
+      "loss": 0.933,
+      "step": 5646
+    },
+    {
+      "epoch": 0.392848446902501,
+      "grad_norm": 1.4765625,
+      "learning_rate": 0.001385440114829248,
+      "loss": 0.6352,
+      "step": 5647
+    },
+    {
+      "epoch": 0.39291801453963615,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0013852321828858498,
+      "loss": 1.0762,
+      "step": 5648
+    },
+    {
+      "epoch": 0.3929875821767714,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.001385024231382303,
+      "loss": 0.7916,
+      "step": 5649
+    },
+    {
+      "epoch": 0.39305714981390655,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.001384816260329166,
+      "loss": 0.8968,
+      "step": 5650
+    },
+    {
+      "epoch": 0.3931267174510418,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0013846082697369995,
+      "loss": 0.7204,
+      "step": 5651
+    },
+    {
+      "epoch": 0.393196285088177,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0013844002596163634,
+      "loss": 1.3161,
+      "step": 5652
+    },
+    {
+      "epoch": 0.39326585272531217,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0013841922299778198,
+      "loss": 0.7805,
+      "step": 5653
+    },
+    {
+      "epoch": 0.3933354203624474,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0013839841808319306,
+      "loss": 0.6772,
+      "step": 5654
+    },
+    {
+      "epoch": 0.39340498799958257,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0013837761121892607,
+      "loss": 0.6694,
+      "step": 5655
+    },
+    {
+      "epoch": 0.3934745556367178,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.001383568024060374,
+      "loss": 0.795,
+      "step": 5656
+    },
+    {
+      "epoch": 0.393544123273853,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.0013833599164558366,
+      "loss": 0.8358,
+      "step": 5657
+    },
+    {
+      "epoch": 0.3936136909109882,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0013831517893862146,
+      "loss": 0.7724,
+      "step": 5658
+    },
+    {
+      "epoch": 0.3936832585481234,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.001382943642862076,
+      "loss": 0.7712,
+      "step": 5659
+    },
+    {
+      "epoch": 0.39375282618525864,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00138273547689399,
+      "loss": 0.7206,
+      "step": 5660
+    },
+    {
+      "epoch": 0.3938223938223938,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.001382527291492525,
+      "loss": 0.7366,
+      "step": 5661
+    },
+    {
+      "epoch": 0.39389196145952904,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0013823190866682526,
+      "loss": 0.7126,
+      "step": 5662
+    },
+    {
+      "epoch": 0.3939615290966642,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0013821108624317434,
+      "loss": 0.8909,
+      "step": 5663
+    },
+    {
+      "epoch": 0.39403109673379944,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.0013819026187935708,
+      "loss": 0.7864,
+      "step": 5664
+    },
+    {
+      "epoch": 0.39410066437093466,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0013816943557643081,
+      "loss": 0.8352,
+      "step": 5665
+    },
+    {
+      "epoch": 0.39417023200806983,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0013814860733545303,
+      "loss": 0.9868,
+      "step": 5666
+    },
+    {
+      "epoch": 0.39423979964520506,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.0013812777715748125,
+      "loss": 0.8523,
+      "step": 5667
+    },
+    {
+      "epoch": 0.39430936728234023,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0013810694504357308,
+      "loss": 0.7038,
+      "step": 5668
+    },
+    {
+      "epoch": 0.39437893491947545,
+      "grad_norm": 1.546875,
+      "learning_rate": 0.0013808611099478637,
+      "loss": 1.205,
+      "step": 5669
+    },
+    {
+      "epoch": 0.3944485025566107,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0013806527501217885,
+      "loss": 0.7109,
+      "step": 5670
+    },
+    {
+      "epoch": 0.39451807019374585,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0013804443709680857,
+      "loss": 0.6902,
+      "step": 5671
+    },
+    {
+      "epoch": 0.3945876378308811,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.001380235972497335,
+      "loss": 0.7888,
+      "step": 5672
+    },
+    {
+      "epoch": 0.3946572054680163,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0013800275547201184,
+      "loss": 0.8962,
+      "step": 5673
+    },
+    {
+      "epoch": 0.3947267731051515,
+      "grad_norm": 1.125,
+      "learning_rate": 0.001379819117647018,
+      "loss": 0.8865,
+      "step": 5674
+    },
+    {
+      "epoch": 0.3947963407422867,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0013796106612886173,
+      "loss": 0.7484,
+      "step": 5675
+    },
+    {
+      "epoch": 0.39486590837942187,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0013794021856555008,
+      "loss": 0.735,
+      "step": 5676
+    },
+    {
+      "epoch": 0.3949354760165571,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0013791936907582532,
+      "loss": 0.7644,
+      "step": 5677
+    },
+    {
+      "epoch": 0.3950050436536923,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0013789851766074614,
+      "loss": 0.8093,
+      "step": 5678
+    },
+    {
+      "epoch": 0.3950746112908275,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0013787766432137127,
+      "loss": 0.8287,
+      "step": 5679
+    },
+    {
+      "epoch": 0.3951441789279627,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.001378568090587595,
+      "loss": 0.8146,
+      "step": 5680
+    },
+    {
+      "epoch": 0.3952137465650979,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.001378359518739698,
+      "loss": 0.6935,
+      "step": 5681
+    },
+    {
+      "epoch": 0.3952833142022331,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0013781509276806117,
+      "loss": 1.1332,
+      "step": 5682
+    },
+    {
+      "epoch": 0.39535288183936834,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.001377942317420927,
+      "loss": 0.907,
+      "step": 5683
+    },
+    {
+      "epoch": 0.3954224494765035,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0013777336879712367,
+      "loss": 0.8557,
+      "step": 5684
+    },
+    {
+      "epoch": 0.39549201711363874,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0013775250393421336,
+      "loss": 0.7406,
+      "step": 5685
+    },
+    {
+      "epoch": 0.39556158475077396,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0013773163715442118,
+      "loss": 0.7507,
+      "step": 5686
+    },
+    {
+      "epoch": 0.39563115238790914,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0013771076845880668,
+      "loss": 0.9068,
+      "step": 5687
+    },
+    {
+      "epoch": 0.39570072002504436,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0013768989784842941,
+      "loss": 0.8867,
+      "step": 5688
+    },
+    {
+      "epoch": 0.39577028766217953,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.001376690253243491,
+      "loss": 0.9804,
+      "step": 5689
+    },
+    {
+      "epoch": 0.39583985529931476,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0013764815088762553,
+      "loss": 1.0954,
+      "step": 5690
+    },
+    {
+      "epoch": 0.39590942293645,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0013762727453931862,
+      "loss": 0.7848,
+      "step": 5691
+    },
+    {
+      "epoch": 0.39597899057358515,
+      "grad_norm": 1.5234375,
+      "learning_rate": 0.0013760639628048838,
+      "loss": 0.6177,
+      "step": 5692
+    },
+    {
+      "epoch": 0.3960485582107204,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.001375855161121949,
+      "loss": 0.8519,
+      "step": 5693
+    },
+    {
+      "epoch": 0.39611812584785555,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0013756463403549835,
+      "loss": 0.671,
+      "step": 5694
+    },
+    {
+      "epoch": 0.3961876934849908,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00137543750051459,
+      "loss": 0.8336,
+      "step": 5695
+    },
+    {
+      "epoch": 0.396257261122126,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0013752286416113728,
+      "loss": 0.9169,
+      "step": 5696
+    },
+    {
+      "epoch": 0.3963268287592612,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0013750197636559363,
+      "loss": 0.8185,
+      "step": 5697
+    },
+    {
+      "epoch": 0.3963963963963964,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0013748108666588865,
+      "loss": 1.1148,
+      "step": 5698
+    },
+    {
+      "epoch": 0.3964659640335316,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0013746019506308302,
+      "loss": 0.903,
+      "step": 5699
+    },
+    {
+      "epoch": 0.3965355316706668,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.001374393015582375,
+      "loss": 0.9774,
+      "step": 5700
+    },
+    {
+      "epoch": 0.396605099307802,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.0013741840615241294,
+      "loss": 1.2874,
+      "step": 5701
+    },
+    {
+      "epoch": 0.3966746669449372,
+      "grad_norm": 1.4765625,
+      "learning_rate": 0.001373975088466703,
+      "loss": 0.7905,
+      "step": 5702
+    },
+    {
+      "epoch": 0.3967442345820724,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0013737660964207071,
+      "loss": 0.8377,
+      "step": 5703
+    },
+    {
+      "epoch": 0.39681380221920765,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0013735570853967522,
+      "loss": 0.9901,
+      "step": 5704
+    },
+    {
+      "epoch": 0.3968833698563428,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0013733480554054519,
+      "loss": 0.8395,
+      "step": 5705
+    },
+    {
+      "epoch": 0.39695293749347804,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.0013731390064574188,
+      "loss": 0.8949,
+      "step": 5706
+    },
+    {
+      "epoch": 0.3970225051306132,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0013729299385632676,
+      "loss": 0.6607,
+      "step": 5707
+    },
+    {
+      "epoch": 0.39709207276774844,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.001372720851733614,
+      "loss": 0.9082,
+      "step": 5708
+    },
+    {
+      "epoch": 0.39716164040488366,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0013725117459790744,
+      "loss": 1.0577,
+      "step": 5709
+    },
+    {
+      "epoch": 0.39723120804201884,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0013723026213102658,
+      "loss": 0.6586,
+      "step": 5710
+    },
+    {
+      "epoch": 0.39730077567915406,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.0013720934777378064,
+      "loss": 0.8832,
+      "step": 5711
+    },
+    {
+      "epoch": 0.3973703433162893,
+      "grad_norm": 1.25,
+      "learning_rate": 0.001371884315272316,
+      "loss": 0.9176,
+      "step": 5712
+    },
+    {
+      "epoch": 0.39743991095342446,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0013716751339244145,
+      "loss": 0.9529,
+      "step": 5713
+    },
+    {
+      "epoch": 0.3975094785905597,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0013714659337047228,
+      "loss": 0.9422,
+      "step": 5714
+    },
+    {
+      "epoch": 0.39757904622769485,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0013712567146238635,
+      "loss": 0.8914,
+      "step": 5715
+    },
+    {
+      "epoch": 0.3976486138648301,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0013710474766924596,
+      "loss": 0.9733,
+      "step": 5716
+    },
+    {
+      "epoch": 0.3977181815019653,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0013708382199211348,
+      "loss": 0.9387,
+      "step": 5717
+    },
+    {
+      "epoch": 0.3977877491391005,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0013706289443205146,
+      "loss": 0.9794,
+      "step": 5718
+    },
+    {
+      "epoch": 0.3978573167762357,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0013704196499012247,
+      "loss": 0.8667,
+      "step": 5719
+    },
+    {
+      "epoch": 0.3979268844133709,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0013702103366738919,
+      "loss": 0.8185,
+      "step": 5720
+    },
+    {
+      "epoch": 0.3979964520505061,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0013700010046491442,
+      "loss": 1.0267,
+      "step": 5721
+    },
+    {
+      "epoch": 0.3980660196876413,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0013697916538376106,
+      "loss": 0.9751,
+      "step": 5722
+    },
+    {
+      "epoch": 0.3981355873247765,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0013695822842499203,
+      "loss": 0.6295,
+      "step": 5723
+    },
+    {
+      "epoch": 0.3982051549619117,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.001369372895896705,
+      "loss": 1.0598,
+      "step": 5724
+    },
+    {
+      "epoch": 0.39827472259904695,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0013691634887885954,
+      "loss": 0.8897,
+      "step": 5725
+    },
+    {
+      "epoch": 0.3983442902361821,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0013689540629362247,
+      "loss": 0.6013,
+      "step": 5726
+    },
+    {
+      "epoch": 0.39841385787331735,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0013687446183502264,
+      "loss": 0.8765,
+      "step": 5727
+    },
+    {
+      "epoch": 0.3984834255104525,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.001368535155041235,
+      "loss": 0.8201,
+      "step": 5728
+    },
+    {
+      "epoch": 0.39855299314758774,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0013683256730198858,
+      "loss": 0.6924,
+      "step": 5729
+    },
+    {
+      "epoch": 0.39862256078472297,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0013681161722968157,
+      "loss": 0.9522,
+      "step": 5730
+    },
+    {
+      "epoch": 0.39869212842185814,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.0013679066528826617,
+      "loss": 0.7778,
+      "step": 5731
+    },
+    {
+      "epoch": 0.39876169605899336,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.001367697114788062,
+      "loss": 1.1964,
+      "step": 5732
+    },
+    {
+      "epoch": 0.39883126369612854,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0013674875580236563,
+      "loss": 0.7515,
+      "step": 5733
+    },
+    {
+      "epoch": 0.39890083133326376,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.001367277982600085,
+      "loss": 0.6181,
+      "step": 5734
+    },
+    {
+      "epoch": 0.398970398970399,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0013670683885279886,
+      "loss": 0.9628,
+      "step": 5735
+    },
+    {
+      "epoch": 0.39903996660753416,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0013668587758180095,
+      "loss": 1.1417,
+      "step": 5736
+    },
+    {
+      "epoch": 0.3991095342446694,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0013666491444807912,
+      "loss": 0.9542,
+      "step": 5737
+    },
+    {
+      "epoch": 0.3991791018818046,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0013664394945269774,
+      "loss": 0.8635,
+      "step": 5738
+    },
+    {
+      "epoch": 0.3992486695189398,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.0013662298259672129,
+      "loss": 0.8736,
+      "step": 5739
+    },
+    {
+      "epoch": 0.399318237156075,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0013660201388121438,
+      "loss": 0.8686,
+      "step": 5740
+    },
+    {
+      "epoch": 0.3993878047932102,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.0013658104330724168,
+      "loss": 1.0662,
+      "step": 5741
+    },
+    {
+      "epoch": 0.3994573724303454,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00136560070875868,
+      "loss": 0.6878,
+      "step": 5742
+    },
+    {
+      "epoch": 0.39952694006748063,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.001365390965881582,
+      "loss": 0.9694,
+      "step": 5743
+    },
+    {
+      "epoch": 0.3995965077046158,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0013651812044517722,
+      "loss": 0.8046,
+      "step": 5744
+    },
+    {
+      "epoch": 0.399666075341751,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0013649714244799017,
+      "loss": 0.8767,
+      "step": 5745
+    },
+    {
+      "epoch": 0.3997356429788862,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0013647616259766218,
+      "loss": 0.809,
+      "step": 5746
+    },
+    {
+      "epoch": 0.3998052106160214,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.001364551808952585,
+      "loss": 0.96,
+      "step": 5747
+    },
+    {
+      "epoch": 0.39987477825315665,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.001364341973418445,
+      "loss": 0.6902,
+      "step": 5748
+    },
+    {
+      "epoch": 0.3999443458902918,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0013641321193848558,
+      "loss": 0.8762,
+      "step": 5749
+    },
+    {
+      "epoch": 0.40001391352742705,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0013639222468624732,
+      "loss": 0.7761,
+      "step": 5750
+    },
+    {
+      "epoch": 0.40008348116456227,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0013637123558619532,
+      "loss": 0.6942,
+      "step": 5751
+    },
+    {
+      "epoch": 0.40015304880169744,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0013635024463939528,
+      "loss": 0.7257,
+      "step": 5752
+    },
+    {
+      "epoch": 0.40022261643883267,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0013632925184691304,
+      "loss": 0.9273,
+      "step": 5753
+    },
+    {
+      "epoch": 0.40029218407596784,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.001363082572098145,
+      "loss": 1.032,
+      "step": 5754
+    },
+    {
+      "epoch": 0.40036175171310306,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0013628726072916568,
+      "loss": 0.7682,
+      "step": 5755
+    },
+    {
+      "epoch": 0.4004313193502383,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0013626626240603266,
+      "loss": 0.8368,
+      "step": 5756
+    },
+    {
+      "epoch": 0.40050088698737346,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0013624526224148162,
+      "loss": 0.9481,
+      "step": 5757
+    },
+    {
+      "epoch": 0.4005704546245087,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0013622426023657886,
+      "loss": 0.9786,
+      "step": 5758
+    },
+    {
+      "epoch": 0.40064002226164386,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.0013620325639239076,
+      "loss": 0.852,
+      "step": 5759
+    },
+    {
+      "epoch": 0.4007095898987791,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0013618225070998375,
+      "loss": 0.9142,
+      "step": 5760
+    },
+    {
+      "epoch": 0.4007791575359143,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0013616124319042445,
+      "loss": 0.8978,
+      "step": 5761
+    },
+    {
+      "epoch": 0.4008487251730495,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0013614023383477947,
+      "loss": 1.0515,
+      "step": 5762
+    },
+    {
+      "epoch": 0.4009182928101847,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0013611922264411558,
+      "loss": 0.9575,
+      "step": 5763
+    },
+    {
+      "epoch": 0.40098786044731993,
+      "grad_norm": 1.609375,
+      "learning_rate": 0.0013609820961949961,
+      "loss": 1.222,
+      "step": 5764
+    },
+    {
+      "epoch": 0.4010574280844551,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0013607719476199853,
+      "loss": 0.9202,
+      "step": 5765
+    },
+    {
+      "epoch": 0.40112699572159033,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0013605617807267933,
+      "loss": 1.0188,
+      "step": 5766
+    },
+    {
+      "epoch": 0.4011965633587255,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0013603515955260912,
+      "loss": 0.8969,
+      "step": 5767
+    },
+    {
+      "epoch": 0.4012661309958607,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0013601413920285516,
+      "loss": 0.6961,
+      "step": 5768
+    },
+    {
+      "epoch": 0.40133569863299595,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0013599311702448473,
+      "loss": 0.7371,
+      "step": 5769
+    },
+    {
+      "epoch": 0.4014052662701311,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0013597209301856525,
+      "loss": 0.7546,
+      "step": 5770
+    },
+    {
+      "epoch": 0.40147483390726635,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0013595106718616418,
+      "loss": 0.9842,
+      "step": 5771
+    },
+    {
+      "epoch": 0.4015444015444015,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0013593003952834914,
+      "loss": 0.9441,
+      "step": 5772
+    },
+    {
+      "epoch": 0.40161396918153675,
+      "grad_norm": 1.5859375,
+      "learning_rate": 0.0013590901004618776,
+      "loss": 0.6869,
+      "step": 5773
+    },
+    {
+      "epoch": 0.40168353681867197,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0013588797874074792,
+      "loss": 0.751,
+      "step": 5774
+    },
+    {
+      "epoch": 0.40175310445580714,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0013586694561309736,
+      "loss": 1.1005,
+      "step": 5775
+    },
+    {
+      "epoch": 0.40182267209294237,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0013584591066430408,
+      "loss": 1.0155,
+      "step": 5776
+    },
+    {
+      "epoch": 0.4018922397300776,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0013582487389543615,
+      "loss": 1.0699,
+      "step": 5777
+    },
+    {
+      "epoch": 0.40196180736721276,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.001358038353075617,
+      "loss": 0.9256,
+      "step": 5778
+    },
+    {
+      "epoch": 0.402031375004348,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0013578279490174892,
+      "loss": 0.9979,
+      "step": 5779
+    },
+    {
+      "epoch": 0.40210094264148316,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0013576175267906619,
+      "loss": 0.8082,
+      "step": 5780
+    },
+    {
+      "epoch": 0.4021705102786184,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0013574070864058193,
+      "loss": 0.7615,
+      "step": 5781
+    },
+    {
+      "epoch": 0.4022400779157536,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.001357196627873646,
+      "loss": 1.0908,
+      "step": 5782
+    },
+    {
+      "epoch": 0.4023096455528888,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0013569861512048285,
+      "loss": 0.712,
+      "step": 5783
+    },
+    {
+      "epoch": 0.402379213190024,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0013567756564100537,
+      "loss": 0.9485,
+      "step": 5784
+    },
+    {
+      "epoch": 0.4024487808271592,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0013565651435000093,
+      "loss": 0.986,
+      "step": 5785
+    },
+    {
+      "epoch": 0.4025183484642944,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.001356354612485384,
+      "loss": 0.8165,
+      "step": 5786
+    },
+    {
+      "epoch": 0.40258791610142963,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0013561440633768679,
+      "loss": 1.107,
+      "step": 5787
+    },
+    {
+      "epoch": 0.4026574837385648,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.001355933496185151,
+      "loss": 0.9454,
+      "step": 5788
+    },
+    {
+      "epoch": 0.40272705137570003,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0013557229109209252,
+      "loss": 0.877,
+      "step": 5789
+    },
+    {
+      "epoch": 0.40279661901283526,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0013555123075948835,
+      "loss": 0.8719,
+      "step": 5790
+    },
+    {
+      "epoch": 0.4028661866499704,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0013553016862177182,
+      "loss": 0.871,
+      "step": 5791
+    },
+    {
+      "epoch": 0.40293575428710565,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0013550910468001244,
+      "loss": 0.9059,
+      "step": 5792
+    },
+    {
+      "epoch": 0.4030053219242408,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0013548803893527971,
+      "loss": 0.8285,
+      "step": 5793
+    },
+    {
+      "epoch": 0.40307488956137605,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.0013546697138864321,
+      "loss": 1.0612,
+      "step": 5794
+    },
+    {
+      "epoch": 0.4031444571985113,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.001354459020411727,
+      "loss": 0.7523,
+      "step": 5795
+    },
+    {
+      "epoch": 0.40321402483564645,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0013542483089393788,
+      "loss": 0.9797,
+      "step": 5796
+    },
+    {
+      "epoch": 0.40328359247278167,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0013540375794800876,
+      "loss": 0.8473,
+      "step": 5797
+    },
+    {
+      "epoch": 0.40335316010991684,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0013538268320445526,
+      "loss": 0.9444,
+      "step": 5798
+    },
+    {
+      "epoch": 0.40342272774705207,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0013536160666434746,
+      "loss": 0.7539,
+      "step": 5799
+    },
+    {
+      "epoch": 0.4034922953841873,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0013534052832875547,
+      "loss": 0.7358,
+      "step": 5800
+    },
+    {
+      "epoch": 0.40356186302132246,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.001353194481987496,
+      "loss": 0.6306,
+      "step": 5801
+    },
+    {
+      "epoch": 0.4036314306584577,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0013529836627540015,
+      "loss": 0.7535,
+      "step": 5802
+    },
+    {
+      "epoch": 0.4037009982955929,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.0013527728255977758,
+      "loss": 1.0439,
+      "step": 5803
+    },
+    {
+      "epoch": 0.4037705659327281,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0013525619705295245,
+      "loss": 0.6658,
+      "step": 5804
+    },
+    {
+      "epoch": 0.4038401335698633,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.001352351097559953,
+      "loss": 0.8662,
+      "step": 5805
+    },
+    {
+      "epoch": 0.4039097012069985,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0013521402066997692,
+      "loss": 0.8351,
+      "step": 5806
+    },
+    {
+      "epoch": 0.4039792688441337,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0013519292979596801,
+      "loss": 0.9935,
+      "step": 5807
+    },
+    {
+      "epoch": 0.40404883648126894,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0013517183713503955,
+      "loss": 0.7469,
+      "step": 5808
+    },
+    {
+      "epoch": 0.4041184041184041,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0013515074268826246,
+      "loss": 1.154,
+      "step": 5809
+    },
+    {
+      "epoch": 0.40418797175553933,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0013512964645670783,
+      "loss": 0.5069,
+      "step": 5810
+    },
+    {
+      "epoch": 0.4042575393926745,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0013510854844144685,
+      "loss": 0.8358,
+      "step": 5811
+    },
+    {
+      "epoch": 0.40432710702980973,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0013508744864355066,
+      "loss": 0.8857,
+      "step": 5812
+    },
+    {
+      "epoch": 0.40439667466694496,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0013506634706409078,
+      "loss": 1.0108,
+      "step": 5813
+    },
+    {
+      "epoch": 0.4044662423040801,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0013504524370413849,
+      "loss": 0.8618,
+      "step": 5814
+    },
+    {
+      "epoch": 0.40453580994121535,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0013502413856476539,
+      "loss": 0.8774,
+      "step": 5815
+    },
+    {
+      "epoch": 0.4046053775783506,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0013500303164704305,
+      "loss": 0.8373,
+      "step": 5816
+    },
+    {
+      "epoch": 0.40467494521548575,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.0013498192295204317,
+      "loss": 1.0593,
+      "step": 5817
+    },
+    {
+      "epoch": 0.404744512852621,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.001349608124808376,
+      "loss": 0.7007,
+      "step": 5818
+    },
+    {
+      "epoch": 0.40481408048975615,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0013493970023449814,
+      "loss": 0.7079,
+      "step": 5819
+    },
+    {
+      "epoch": 0.40488364812689137,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0013491858621409688,
+      "loss": 0.8407,
+      "step": 5820
+    },
+    {
+      "epoch": 0.4049532157640266,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.0013489747042070576,
+      "loss": 0.6917,
+      "step": 5821
+    },
+    {
+      "epoch": 0.40502278340116177,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0013487635285539703,
+      "loss": 0.9607,
+      "step": 5822
+    },
+    {
+      "epoch": 0.405092351038297,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0013485523351924288,
+      "loss": 0.9759,
+      "step": 5823
+    },
+    {
+      "epoch": 0.40516191867543216,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0013483411241331565,
+      "loss": 1.0282,
+      "step": 5824
+    },
+    {
+      "epoch": 0.4052314863125674,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0013481298953868777,
+      "loss": 0.9879,
+      "step": 5825
+    },
+    {
+      "epoch": 0.4053010539497026,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0013479186489643172,
+      "loss": 0.962,
+      "step": 5826
+    },
+    {
+      "epoch": 0.4053706215868378,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0013477073848762017,
+      "loss": 1.0321,
+      "step": 5827
+    },
+    {
+      "epoch": 0.405440189223973,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0013474961031332575,
+      "loss": 0.972,
+      "step": 5828
+    },
+    {
+      "epoch": 0.40550975686110824,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0013472848037462133,
+      "loss": 0.8074,
+      "step": 5829
+    },
+    {
+      "epoch": 0.4055793244982434,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0013470734867257967,
+      "loss": 0.9583,
+      "step": 5830
+    },
+    {
+      "epoch": 0.40564889213537864,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.001346862152082738,
+      "loss": 0.72,
+      "step": 5831
+    },
+    {
+      "epoch": 0.4057184597725138,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0013466507998277674,
+      "loss": 0.8751,
+      "step": 5832
+    },
+    {
+      "epoch": 0.40578802740964903,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0013464394299716163,
+      "loss": 0.647,
+      "step": 5833
+    },
+    {
+      "epoch": 0.40585759504678426,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0013462280425250175,
+      "loss": 0.8034,
+      "step": 5834
+    },
+    {
+      "epoch": 0.40592716268391943,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0013460166374987036,
+      "loss": 0.7786,
+      "step": 5835
+    },
+    {
+      "epoch": 0.40599673032105466,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.001345805214903409,
+      "loss": 0.6817,
+      "step": 5836
+    },
+    {
+      "epoch": 0.4060662979581898,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.0013455937747498686,
+      "loss": 0.8713,
+      "step": 5837
+    },
+    {
+      "epoch": 0.40613586559532505,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0013453823170488182,
+      "loss": 0.8209,
+      "step": 5838
+    },
+    {
+      "epoch": 0.4062054332324603,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.0013451708418109945,
+      "loss": 0.7567,
+      "step": 5839
+    },
+    {
+      "epoch": 0.40627500086959545,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.0013449593490471351,
+      "loss": 1.2558,
+      "step": 5840
+    },
+    {
+      "epoch": 0.4063445685067307,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.001344747838767979,
+      "loss": 0.9353,
+      "step": 5841
+    },
+    {
+      "epoch": 0.4064141361438659,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.001344536310984265,
+      "loss": 0.8507,
+      "step": 5842
+    },
+    {
+      "epoch": 0.40648370378100107,
+      "grad_norm": 0.875,
+      "learning_rate": 0.0013443247657067342,
+      "loss": 0.828,
+      "step": 5843
+    },
+    {
+      "epoch": 0.4065532714181363,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0013441132029461268,
+      "loss": 0.844,
+      "step": 5844
+    },
+    {
+      "epoch": 0.40662283905527147,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0013439016227131857,
+      "loss": 0.7064,
+      "step": 5845
+    },
+    {
+      "epoch": 0.4066924066924067,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0013436900250186536,
+      "loss": 0.6783,
+      "step": 5846
+    },
+    {
+      "epoch": 0.4067619743295419,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0013434784098732742,
+      "loss": 0.8343,
+      "step": 5847
+    },
+    {
+      "epoch": 0.4068315419666771,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0013432667772877926,
+      "loss": 0.8044,
+      "step": 5848
+    },
+    {
+      "epoch": 0.4069011096038123,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0013430551272729538,
+      "loss": 0.8037,
+      "step": 5849
+    },
+    {
+      "epoch": 0.4069706772409475,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0013428434598395055,
+      "loss": 0.7308,
+      "step": 5850
+    },
+    {
+      "epoch": 0.4070402448780827,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.0013426317749981936,
+      "loss": 1.059,
+      "step": 5851
+    },
+    {
+      "epoch": 0.40710981251521794,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0013424200727597678,
+      "loss": 0.7579,
+      "step": 5852
+    },
+    {
+      "epoch": 0.4071793801523531,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0013422083531349762,
+      "loss": 0.7347,
+      "step": 5853
+    },
+    {
+      "epoch": 0.40724894778948834,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0013419966161345694,
+      "loss": 0.9378,
+      "step": 5854
+    },
+    {
+      "epoch": 0.40731851542662356,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0013417848617692984,
+      "loss": 0.7056,
+      "step": 5855
+    },
+    {
+      "epoch": 0.40738808306375873,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0013415730900499146,
+      "loss": 0.6704,
+      "step": 5856
+    },
+    {
+      "epoch": 0.40745765070089396,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0013413613009871713,
+      "loss": 0.8853,
+      "step": 5857
+    },
+    {
+      "epoch": 0.40752721833802913,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.001341149494591821,
+      "loss": 0.7417,
+      "step": 5858
+    },
+    {
+      "epoch": 0.40759678597516436,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0013409376708746197,
+      "loss": 0.7576,
+      "step": 5859
+    },
+    {
+      "epoch": 0.4076663536122996,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0013407258298463215,
+      "loss": 0.8548,
+      "step": 5860
+    },
+    {
+      "epoch": 0.40773592124943475,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0013405139715176833,
+      "loss": 0.9098,
+      "step": 5861
+    },
+    {
+      "epoch": 0.40780548888657,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0013403020958994616,
+      "loss": 0.7366,
+      "step": 5862
+    },
+    {
+      "epoch": 0.40787505652370515,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0013400902030024147,
+      "loss": 0.7845,
+      "step": 5863
+    },
+    {
+      "epoch": 0.4079446241608404,
+      "grad_norm": 1.515625,
+      "learning_rate": 0.0013398782928373018,
+      "loss": 1.0852,
+      "step": 5864
+    },
+    {
+      "epoch": 0.4080141917979756,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0013396663654148822,
+      "loss": 1.0065,
+      "step": 5865
+    },
+    {
+      "epoch": 0.40808375943511077,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0013394544207459167,
+      "loss": 0.7465,
+      "step": 5866
+    },
+    {
+      "epoch": 0.408153327072246,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0013392424588411665,
+      "loss": 0.871,
+      "step": 5867
+    },
+    {
+      "epoch": 0.4082228947093812,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.0013390304797113943,
+      "loss": 0.708,
+      "step": 5868
+    },
+    {
+      "epoch": 0.4082924623465164,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0013388184833673631,
+      "loss": 0.6567,
+      "step": 5869
+    },
+    {
+      "epoch": 0.4083620299836516,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.001338606469819837,
+      "loss": 0.7794,
+      "step": 5870
+    },
+    {
+      "epoch": 0.4084315976207868,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.0013383944390795812,
+      "loss": 0.899,
+      "step": 5871
+    },
+    {
+      "epoch": 0.408501165257922,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.001338182391157361,
+      "loss": 0.8562,
+      "step": 5872
+    },
+    {
+      "epoch": 0.40857073289505724,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0013379703260639442,
+      "loss": 0.9008,
+      "step": 5873
+    },
+    {
+      "epoch": 0.4086403005321924,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0013377582438100972,
+      "loss": 0.8773,
+      "step": 5874
+    },
+    {
+      "epoch": 0.40870986816932764,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0013375461444065896,
+      "loss": 0.7937,
+      "step": 5875
+    },
+    {
+      "epoch": 0.4087794358064628,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.0013373340278641894,
+      "loss": 0.7907,
+      "step": 5876
+    },
+    {
+      "epoch": 0.40884900344359804,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0013371218941936683,
+      "loss": 0.8613,
+      "step": 5877
+    },
+    {
+      "epoch": 0.40891857108073326,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0013369097434057964,
+      "loss": 0.7615,
+      "step": 5878
+    },
+    {
+      "epoch": 0.40898813871786843,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0013366975755113456,
+      "loss": 0.9024,
+      "step": 5879
+    },
+    {
+      "epoch": 0.40905770635500366,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0013364853905210893,
+      "loss": 0.7378,
+      "step": 5880
+    },
+    {
+      "epoch": 0.4091272739921389,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0013362731884458006,
+      "loss": 0.7813,
+      "step": 5881
+    },
+    {
+      "epoch": 0.40919684162927406,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0013360609692962546,
+      "loss": 1.0286,
+      "step": 5882
+    },
+    {
+      "epoch": 0.4092664092664093,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.001335848733083226,
+      "loss": 0.7492,
+      "step": 5883
+    },
+    {
+      "epoch": 0.40933597690354445,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.001335636479817492,
+      "loss": 0.8455,
+      "step": 5884
+    },
+    {
+      "epoch": 0.4094055445406797,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0013354242095098294,
+      "loss": 0.9679,
+      "step": 5885
+    },
+    {
+      "epoch": 0.4094751121778149,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0013352119221710158,
+      "loss": 0.9595,
+      "step": 5886
+    },
+    {
+      "epoch": 0.4095446798149501,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0013349996178118305,
+      "loss": 0.8342,
+      "step": 5887
+    },
+    {
+      "epoch": 0.4096142474520853,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0013347872964430527,
+      "loss": 0.7592,
+      "step": 5888
+    },
+    {
+      "epoch": 0.40968381508922047,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0013345749580754643,
+      "loss": 0.9402,
+      "step": 5889
+    },
+    {
+      "epoch": 0.4097533827263557,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0013343626027198451,
+      "loss": 0.8261,
+      "step": 5890
+    },
+    {
+      "epoch": 0.4098229503634909,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0013341502303869787,
+      "loss": 0.8737,
+      "step": 5891
+    },
+    {
+      "epoch": 0.4098925180006261,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0013339378410876478,
+      "loss": 0.8406,
+      "step": 5892
+    },
+    {
+      "epoch": 0.4099620856377613,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0013337254348326363,
+      "loss": 0.8002,
+      "step": 5893
+    },
+    {
+      "epoch": 0.4100316532748965,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0013335130116327296,
+      "loss": 0.7979,
+      "step": 5894
+    },
+    {
+      "epoch": 0.4101012209120317,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0013333005714987127,
+      "loss": 0.9424,
+      "step": 5895
+    },
+    {
+      "epoch": 0.41017078854916694,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0013330881144413733,
+      "loss": 0.9167,
+      "step": 5896
+    },
+    {
+      "epoch": 0.4102403561863021,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0013328756404714982,
+      "loss": 1.0018,
+      "step": 5897
+    },
+    {
+      "epoch": 0.41030992382343734,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0013326631495998759,
+      "loss": 0.8047,
+      "step": 5898
+    },
+    {
+      "epoch": 0.41037949146057257,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0013324506418372953,
+      "loss": 0.918,
+      "step": 5899
+    },
+    {
+      "epoch": 0.41044905909770774,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.001332238117194547,
+      "loss": 0.9236,
+      "step": 5900
+    },
+    {
+      "epoch": 0.41051862673484296,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.001332025575682422,
+      "loss": 0.9684,
+      "step": 5901
+    },
+    {
+      "epoch": 0.41058819437197813,
+      "grad_norm": 1.6484375,
+      "learning_rate": 0.0013318130173117111,
+      "loss": 1.0373,
+      "step": 5902
+    },
+    {
+      "epoch": 0.41065776200911336,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0013316004420932085,
+      "loss": 0.8822,
+      "step": 5903
+    },
+    {
+      "epoch": 0.4107273296462486,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.001331387850037706,
+      "loss": 0.7408,
+      "step": 5904
+    },
+    {
+      "epoch": 0.41079689728338376,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0013311752411559994,
+      "loss": 0.8438,
+      "step": 5905
+    },
+    {
+      "epoch": 0.410866464920519,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.001330962615458883,
+      "loss": 0.8473,
+      "step": 5906
+    },
+    {
+      "epoch": 0.41093603255765415,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0013307499729571532,
+      "loss": 0.784,
+      "step": 5907
+    },
+    {
+      "epoch": 0.4110056001947894,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.001330537313661607,
+      "loss": 1.2242,
+      "step": 5908
+    },
+    {
+      "epoch": 0.4110751678319246,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.001330324637583042,
+      "loss": 0.7399,
+      "step": 5909
+    },
+    {
+      "epoch": 0.4111447354690598,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.001330111944732257,
+      "loss": 0.7732,
+      "step": 5910
+    },
+    {
+      "epoch": 0.411214303106195,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0013298992351200509,
+      "loss": 1.0661,
+      "step": 5911
+    },
+    {
+      "epoch": 0.4112838707433302,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.001329686508757225,
+      "loss": 0.6823,
+      "step": 5912
+    },
+    {
+      "epoch": 0.4113534383804654,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0013294737656545795,
+      "loss": 0.8745,
+      "step": 5913
+    },
+    {
+      "epoch": 0.4114230060176006,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0013292610058229168,
+      "loss": 0.7367,
+      "step": 5914
+    },
+    {
+      "epoch": 0.4114925736547358,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.0013290482292730402,
+      "loss": 0.9285,
+      "step": 5915
+    },
+    {
+      "epoch": 0.411562141291871,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0013288354360157528,
+      "loss": 0.7864,
+      "step": 5916
+    },
+    {
+      "epoch": 0.41163170892900625,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0013286226260618597,
+      "loss": 0.8476,
+      "step": 5917
+    },
+    {
+      "epoch": 0.4117012765661414,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0013284097994221656,
+      "loss": 0.7926,
+      "step": 5918
+    },
+    {
+      "epoch": 0.41177084420327664,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0013281969561074775,
+      "loss": 1.0299,
+      "step": 5919
+    },
+    {
+      "epoch": 0.4118404118404118,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.001327984096128602,
+      "loss": 0.9616,
+      "step": 5920
+    },
+    {
+      "epoch": 0.41190997947754704,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0013277712194963475,
+      "loss": 0.9856,
+      "step": 5921
+    },
+    {
+      "epoch": 0.41197954711468227,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0013275583262215224,
+      "loss": 1.2096,
+      "step": 5922
+    },
+    {
+      "epoch": 0.41204911475181744,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0013273454163149365,
+      "loss": 0.8468,
+      "step": 5923
+    },
+    {
+      "epoch": 0.41211868238895266,
+      "grad_norm": 1.5625,
+      "learning_rate": 0.0013271324897874007,
+      "loss": 1.0125,
+      "step": 5924
+    },
+    {
+      "epoch": 0.4121882500260879,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0013269195466497252,
+      "loss": 0.728,
+      "step": 5925
+    },
+    {
+      "epoch": 0.41225781766322306,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0013267065869127235,
+      "loss": 1.0771,
+      "step": 5926
+    },
+    {
+      "epoch": 0.4123273853003583,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0013264936105872077,
+      "loss": 0.6939,
+      "step": 5927
+    },
+    {
+      "epoch": 0.41239695293749346,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.001326280617683992,
+      "loss": 0.8318,
+      "step": 5928
+    },
+    {
+      "epoch": 0.4124665205746287,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0013260676082138914,
+      "loss": 0.8941,
+      "step": 5929
+    },
+    {
+      "epoch": 0.4125360882117639,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.001325854582187721,
+      "loss": 0.8441,
+      "step": 5930
+    },
+    {
+      "epoch": 0.4126056558488991,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0013256415396162976,
+      "loss": 0.9476,
+      "step": 5931
+    },
+    {
+      "epoch": 0.4126752234860343,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0013254284805104377,
+      "loss": 0.8693,
+      "step": 5932
+    },
+    {
+      "epoch": 0.4127447911231695,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.0013252154048809604,
+      "loss": 1.0098,
+      "step": 5933
+    },
+    {
+      "epoch": 0.4128143587603047,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0013250023127386835,
+      "loss": 0.5385,
+      "step": 5934
+    },
+    {
+      "epoch": 0.4128839263974399,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0013247892040944276,
+      "loss": 0.7623,
+      "step": 5935
+    },
+    {
+      "epoch": 0.4129534940345751,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.001324576078959013,
+      "loss": 0.6963,
+      "step": 5936
+    },
+    {
+      "epoch": 0.4130230616717103,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0013243629373432609,
+      "loss": 0.827,
+      "step": 5937
+    },
+    {
+      "epoch": 0.41309262930884555,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0013241497792579938,
+      "loss": 0.8957,
+      "step": 5938
+    },
+    {
+      "epoch": 0.4131621969459807,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0013239366047140347,
+      "loss": 0.8064,
+      "step": 5939
+    },
+    {
+      "epoch": 0.41323176458311595,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.001323723413722208,
+      "loss": 0.8238,
+      "step": 5940
+    },
+    {
+      "epoch": 0.4133013322202511,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0013235102062933372,
+      "loss": 0.8791,
+      "step": 5941
+    },
+    {
+      "epoch": 0.41337089985738634,
+      "grad_norm": 1.4765625,
+      "learning_rate": 0.0013232969824382497,
+      "loss": 0.7921,
+      "step": 5942
+    },
+    {
+      "epoch": 0.41344046749452157,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0013230837421677702,
+      "loss": 0.8456,
+      "step": 5943
+    },
+    {
+      "epoch": 0.41351003513165674,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0013228704854927268,
+      "loss": 0.8593,
+      "step": 5944
+    },
+    {
+      "epoch": 0.41357960276879197,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.001322657212423948,
+      "loss": 0.8399,
+      "step": 5945
+    },
+    {
+      "epoch": 0.41364917040592714,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.001322443922972262,
+      "loss": 0.9466,
+      "step": 5946
+    },
+    {
+      "epoch": 0.41371873804306236,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.001322230617148499,
+      "loss": 0.8536,
+      "step": 5947
+    },
+    {
+      "epoch": 0.4137883056801976,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0013220172949634892,
+      "loss": 0.8135,
+      "step": 5948
+    },
+    {
+      "epoch": 0.41385787331733276,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0013218039564280647,
+      "loss": 0.7584,
+      "step": 5949
+    },
+    {
+      "epoch": 0.413927440954468,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0013215906015530568,
+      "loss": 0.7815,
+      "step": 5950
+    },
+    {
+      "epoch": 0.4139970085916032,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.001321377230349299,
+      "loss": 1.0563,
+      "step": 5951
+    },
+    {
+      "epoch": 0.4140665762287384,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0013211638428276256,
+      "loss": 0.962,
+      "step": 5952
+    },
+    {
+      "epoch": 0.4141361438658736,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0013209504389988709,
+      "loss": 0.7863,
+      "step": 5953
+    },
+    {
+      "epoch": 0.4142057115030088,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0013207370188738708,
+      "loss": 0.9184,
+      "step": 5954
+    },
+    {
+      "epoch": 0.414275279140144,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0013205235824634615,
+      "loss": 0.8869,
+      "step": 5955
+    },
+    {
+      "epoch": 0.41434484677727923,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0013203101297784804,
+      "loss": 0.786,
+      "step": 5956
+    },
+    {
+      "epoch": 0.4144144144144144,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0013200966608297648,
+      "loss": 0.9513,
+      "step": 5957
+    },
+    {
+      "epoch": 0.4144839820515496,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0013198831756281546,
+      "loss": 1.0198,
+      "step": 5958
+    },
+    {
+      "epoch": 0.4145535496886848,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.001319669674184489,
+      "loss": 0.6842,
+      "step": 5959
+    },
+    {
+      "epoch": 0.41462311732582,
+      "grad_norm": 1.5,
+      "learning_rate": 0.0013194561565096085,
+      "loss": 1.0767,
+      "step": 5960
+    },
+    {
+      "epoch": 0.41469268496295525,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.0013192426226143548,
+      "loss": 0.9905,
+      "step": 5961
+    },
+    {
+      "epoch": 0.4147622526000904,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0013190290725095695,
+      "loss": 0.8643,
+      "step": 5962
+    },
+    {
+      "epoch": 0.41483182023722565,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0013188155062060962,
+      "loss": 0.6039,
+      "step": 5963
+    },
+    {
+      "epoch": 0.41490138787436087,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0013186019237147785,
+      "loss": 0.8251,
+      "step": 5964
+    },
+    {
+      "epoch": 0.41497095551149604,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0013183883250464606,
+      "loss": 0.8622,
+      "step": 5965
+    },
+    {
+      "epoch": 0.41504052314863127,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0013181747102119887,
+      "loss": 0.8189,
+      "step": 5966
+    },
+    {
+      "epoch": 0.41511009078576644,
+      "grad_norm": 1.4765625,
+      "learning_rate": 0.0013179610792222085,
+      "loss": 0.936,
+      "step": 5967
+    },
+    {
+      "epoch": 0.41517965842290167,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0013177474320879674,
+      "loss": 0.857,
+      "step": 5968
+    },
+    {
+      "epoch": 0.4152492260600369,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0013175337688201135,
+      "loss": 0.9093,
+      "step": 5969
+    },
+    {
+      "epoch": 0.41531879369717206,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.001317320089429495,
+      "loss": 0.8407,
+      "step": 5970
+    },
+    {
+      "epoch": 0.4153883613343073,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.001317106393926962,
+      "loss": 0.7999,
+      "step": 5971
+    },
+    {
+      "epoch": 0.41545792897144246,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0013168926823233645,
+      "loss": 0.9324,
+      "step": 5972
+    },
+    {
+      "epoch": 0.4155274966085777,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.001316678954629554,
+      "loss": 1.0212,
+      "step": 5973
+    },
+    {
+      "epoch": 0.4155970642457129,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0013164652108563822,
+      "loss": 0.9504,
+      "step": 5974
+    },
+    {
+      "epoch": 0.4156666318828481,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0013162514510147022,
+      "loss": 0.8767,
+      "step": 5975
+    },
+    {
+      "epoch": 0.4157361995199833,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0013160376751153674,
+      "loss": 0.7471,
+      "step": 5976
+    },
+    {
+      "epoch": 0.41580576715711853,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0013158238831692324,
+      "loss": 0.9097,
+      "step": 5977
+    },
+    {
+      "epoch": 0.4158753347942537,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0013156100751871528,
+      "loss": 1.1735,
+      "step": 5978
+    },
+    {
+      "epoch": 0.41594490243138893,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0013153962511799843,
+      "loss": 0.8713,
+      "step": 5979
+    },
+    {
+      "epoch": 0.4160144700685241,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0013151824111585836,
+      "loss": 1.0516,
+      "step": 5980
+    },
+    {
+      "epoch": 0.4160840377056593,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0013149685551338086,
+      "loss": 0.796,
+      "step": 5981
+    },
+    {
+      "epoch": 0.41615360534279455,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0013147546831165182,
+      "loss": 0.9837,
+      "step": 5982
+    },
+    {
+      "epoch": 0.4162231729799297,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0013145407951175717,
+      "loss": 0.751,
+      "step": 5983
+    },
+    {
+      "epoch": 0.41629274061706495,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0013143268911478287,
+      "loss": 0.7369,
+      "step": 5984
+    },
+    {
+      "epoch": 0.4163623082542001,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0013141129712181505,
+      "loss": 0.9013,
+      "step": 5985
+    },
+    {
+      "epoch": 0.41643187589133535,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0013138990353393988,
+      "loss": 1.1253,
+      "step": 5986
+    },
+    {
+      "epoch": 0.41650144352847057,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0013136850835224366,
+      "loss": 0.6227,
+      "step": 5987
+    },
+    {
+      "epoch": 0.41657101116560574,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0013134711157781268,
+      "loss": 0.6232,
+      "step": 5988
+    },
+    {
+      "epoch": 0.41664057880274097,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0013132571321173337,
+      "loss": 0.7733,
+      "step": 5989
+    },
+    {
+      "epoch": 0.4167101464398762,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0013130431325509221,
+      "loss": 0.9784,
+      "step": 5990
+    },
+    {
+      "epoch": 0.41677971407701137,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0013128291170897584,
+      "loss": 0.8261,
+      "step": 5991
+    },
+    {
+      "epoch": 0.4168492817141466,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0013126150857447087,
+      "loss": 0.8843,
+      "step": 5992
+    },
+    {
+      "epoch": 0.41691884935128176,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.001312401038526641,
+      "loss": 0.7726,
+      "step": 5993
+    },
+    {
+      "epoch": 0.416988416988417,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0013121869754464228,
+      "loss": 0.9126,
+      "step": 5994
+    },
+    {
+      "epoch": 0.4170579846255522,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0013119728965149237,
+      "loss": 1.1141,
+      "step": 5995
+    },
+    {
+      "epoch": 0.4171275522626874,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0013117588017430134,
+      "loss": 0.9249,
+      "step": 5996
+    },
+    {
+      "epoch": 0.4171971198998226,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0013115446911415626,
+      "loss": 0.773,
+      "step": 5997
+    },
+    {
+      "epoch": 0.4172666875369578,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0013113305647214424,
+      "loss": 0.7567,
+      "step": 5998
+    },
+    {
+      "epoch": 0.417336255174093,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0013111164224935256,
+      "loss": 0.8103,
+      "step": 5999
+    },
+    {
+      "epoch": 0.41740582281122823,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.001310902264468685,
+      "loss": 0.6928,
+      "step": 6000
+    },
+    {
+      "epoch": 0.4174753904483634,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0013106880906577944,
+      "loss": 0.6852,
+      "step": 6001
+    },
+    {
+      "epoch": 0.41754495808549863,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0013104739010717287,
+      "loss": 0.8544,
+      "step": 6002
+    },
+    {
+      "epoch": 0.41761452572263386,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0013102596957213631,
+      "loss": 0.8348,
+      "step": 6003
+    },
+    {
+      "epoch": 0.417684093359769,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0013100454746175739,
+      "loss": 0.8061,
+      "step": 6004
+    },
+    {
+      "epoch": 0.41775366099690425,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0013098312377712383,
+      "loss": 0.7722,
+      "step": 6005
+    },
+    {
+      "epoch": 0.4178232286340394,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0013096169851932338,
+      "loss": 1.0377,
+      "step": 6006
+    },
+    {
+      "epoch": 0.41789279627117465,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0013094027168944397,
+      "loss": 0.8373,
+      "step": 6007
+    },
+    {
+      "epoch": 0.4179623639083099,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.001309188432885735,
+      "loss": 1.1225,
+      "step": 6008
+    },
+    {
+      "epoch": 0.41803193154544505,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0013089741331780004,
+      "loss": 0.9004,
+      "step": 6009
+    },
+    {
+      "epoch": 0.41810149918258027,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0013087598177821166,
+      "loss": 0.8053,
+      "step": 6010
+    },
+    {
+      "epoch": 0.41817106681971544,
+      "grad_norm": 1.5,
+      "learning_rate": 0.0013085454867089652,
+      "loss": 0.8599,
+      "step": 6011
+    },
+    {
+      "epoch": 0.41824063445685067,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0013083311399694293,
+      "loss": 0.7888,
+      "step": 6012
+    },
+    {
+      "epoch": 0.4183102020939859,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0013081167775743925,
+      "loss": 0.814,
+      "step": 6013
+    },
+    {
+      "epoch": 0.41837976973112107,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0013079023995347385,
+      "loss": 0.9164,
+      "step": 6014
+    },
+    {
+      "epoch": 0.4184493373682563,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0013076880058613524,
+      "loss": 0.8386,
+      "step": 6015
+    },
+    {
+      "epoch": 0.4185189050053915,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0013074735965651206,
+      "loss": 0.7152,
+      "step": 6016
+    },
+    {
+      "epoch": 0.4185884726425267,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0013072591716569294,
+      "loss": 0.6224,
+      "step": 6017
+    },
+    {
+      "epoch": 0.4186580402796619,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.001307044731147666,
+      "loss": 0.902,
+      "step": 6018
+    },
+    {
+      "epoch": 0.4187276079167971,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0013068302750482185,
+      "loss": 0.7997,
+      "step": 6019
+    },
+    {
+      "epoch": 0.4187971755539323,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0013066158033694763,
+      "loss": 0.9071,
+      "step": 6020
+    },
+    {
+      "epoch": 0.41886674319106754,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0013064013161223293,
+      "loss": 0.913,
+      "step": 6021
+    },
+    {
+      "epoch": 0.4189363108282027,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0013061868133176678,
+      "loss": 0.9266,
+      "step": 6022
+    },
+    {
+      "epoch": 0.41900587846533793,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.001305972294966383,
+      "loss": 0.9229,
+      "step": 6023
+    },
+    {
+      "epoch": 0.4190754461024731,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0013057577610793673,
+      "loss": 0.8808,
+      "step": 6024
+    },
+    {
+      "epoch": 0.41914501373960833,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.001305543211667514,
+      "loss": 0.8094,
+      "step": 6025
+    },
+    {
+      "epoch": 0.41921458137674356,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.001305328646741716,
+      "loss": 0.8917,
+      "step": 6026
+    },
+    {
+      "epoch": 0.4192841490138787,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0013051140663128686,
+      "loss": 0.8215,
+      "step": 6027
+    },
+    {
+      "epoch": 0.41935371665101395,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0013048994703918667,
+      "loss": 0.869,
+      "step": 6028
+    },
+    {
+      "epoch": 0.4194232842881492,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0013046848589896066,
+      "loss": 0.9581,
+      "step": 6029
+    },
+    {
+      "epoch": 0.41949285192528435,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0013044702321169848,
+      "loss": 0.7993,
+      "step": 6030
+    },
+    {
+      "epoch": 0.4195624195624196,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0013042555897848996,
+      "loss": 0.9372,
+      "step": 6031
+    },
+    {
+      "epoch": 0.41963198719955475,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0013040409320042488,
+      "loss": 0.7678,
+      "step": 6032
+    },
+    {
+      "epoch": 0.41970155483668997,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0013038262587859323,
+      "loss": 0.9445,
+      "step": 6033
+    },
+    {
+      "epoch": 0.4197711224738252,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.0013036115701408493,
+      "loss": 0.631,
+      "step": 6034
+    },
+    {
+      "epoch": 0.41984069011096037,
+      "grad_norm": 1.5234375,
+      "learning_rate": 0.0013033968660799014,
+      "loss": 1.0625,
+      "step": 6035
+    },
+    {
+      "epoch": 0.4199102577480956,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00130318214661399,
+      "loss": 0.6111,
+      "step": 6036
+    },
+    {
+      "epoch": 0.41997982538523077,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.001302967411754017,
+      "loss": 0.8909,
+      "step": 6037
+    },
+    {
+      "epoch": 0.420049393022366,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0013027526615108863,
+      "loss": 0.8946,
+      "step": 6038
+    },
+    {
+      "epoch": 0.4201189606595012,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.001302537895895501,
+      "loss": 0.8895,
+      "step": 6039
+    },
+    {
+      "epoch": 0.4201885282966364,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0013023231149187663,
+      "loss": 0.9964,
+      "step": 6040
+    },
+    {
+      "epoch": 0.4202580959337716,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.0013021083185915882,
+      "loss": 1.0044,
+      "step": 6041
+    },
+    {
+      "epoch": 0.42032766357090684,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0013018935069248718,
+      "loss": 0.7365,
+      "step": 6042
+    },
+    {
+      "epoch": 0.420397231208042,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.0013016786799295251,
+      "loss": 0.7353,
+      "step": 6043
+    },
+    {
+      "epoch": 0.42046679884517724,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0013014638376164555,
+      "loss": 0.7773,
+      "step": 6044
+    },
+    {
+      "epoch": 0.4205363664823124,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0013012489799965716,
+      "loss": 1.0401,
+      "step": 6045
+    },
+    {
+      "epoch": 0.42060593411944763,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.001301034107080783,
+      "loss": 0.9157,
+      "step": 6046
+    },
+    {
+      "epoch": 0.42067550175658286,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00130081921888,
+      "loss": 0.7776,
+      "step": 6047
+    },
+    {
+      "epoch": 0.42074506939371803,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0013006043154051331,
+      "loss": 0.9943,
+      "step": 6048
+    },
+    {
+      "epoch": 0.42081463703085326,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0013003893966670942,
+      "loss": 1.0388,
+      "step": 6049
+    },
+    {
+      "epoch": 0.4208842046679884,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0013001744626767958,
+      "loss": 0.7578,
+      "step": 6050
+    },
+    {
+      "epoch": 0.42095377230512365,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0012999595134451512,
+      "loss": 0.9979,
+      "step": 6051
+    },
+    {
+      "epoch": 0.4210233399422589,
+      "grad_norm": 1.84375,
+      "learning_rate": 0.0012997445489830745,
+      "loss": 0.855,
+      "step": 6052
+    },
+    {
+      "epoch": 0.42109290757939405,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0012995295693014803,
+      "loss": 1.2088,
+      "step": 6053
+    },
+    {
+      "epoch": 0.4211624752165293,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0012993145744112844,
+      "loss": 0.7431,
+      "step": 6054
+    },
+    {
+      "epoch": 0.4212320428536645,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.001299099564323403,
+      "loss": 1.0207,
+      "step": 6055
+    },
+    {
+      "epoch": 0.42130161049079967,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0012988845390487533,
+      "loss": 0.898,
+      "step": 6056
+    },
+    {
+      "epoch": 0.4213711781279349,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0012986694985982533,
+      "loss": 1.0087,
+      "step": 6057
+    },
+    {
+      "epoch": 0.42144074576507007,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0012984544429828215,
+      "loss": 1.1273,
+      "step": 6058
+    },
+    {
+      "epoch": 0.4215103134022053,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0012982393722133774,
+      "loss": 0.7767,
+      "step": 6059
+    },
+    {
+      "epoch": 0.4215798810393405,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0012980242863008412,
+      "loss": 0.6377,
+      "step": 6060
+    },
+    {
+      "epoch": 0.4216494486764757,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.001297809185256134,
+      "loss": 1.0128,
+      "step": 6061
+    },
+    {
+      "epoch": 0.4217190163136109,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0012975940690901772,
+      "loss": 0.7818,
+      "step": 6062
+    },
+    {
+      "epoch": 0.4217885839507461,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0012973789378138939,
+      "loss": 0.6688,
+      "step": 6063
+    },
+    {
+      "epoch": 0.4218581515878813,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.001297163791438207,
+      "loss": 0.8949,
+      "step": 6064
+    },
+    {
+      "epoch": 0.42192771922501654,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0012969486299740402,
+      "loss": 0.7757,
+      "step": 6065
+    },
+    {
+      "epoch": 0.4219972868621517,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.001296733453432319,
+      "loss": 0.871,
+      "step": 6066
+    },
+    {
+      "epoch": 0.42206685449928694,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0012965182618239685,
+      "loss": 0.8648,
+      "step": 6067
+    },
+    {
+      "epoch": 0.42213642213642216,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0012963030551599154,
+      "loss": 0.9459,
+      "step": 6068
+    },
+    {
+      "epoch": 0.42220598977355733,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0012960878334510864,
+      "loss": 0.5526,
+      "step": 6069
+    },
+    {
+      "epoch": 0.42227555741069256,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00129587259670841,
+      "loss": 0.9363,
+      "step": 6070
+    },
+    {
+      "epoch": 0.42234512504782773,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.001295657344942814,
+      "loss": 1.0081,
+      "step": 6071
+    },
+    {
+      "epoch": 0.42241469268496296,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0012954420781652288,
+      "loss": 1.0101,
+      "step": 6072
+    },
+    {
+      "epoch": 0.4224842603220982,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0012952267963865839,
+      "loss": 1.1445,
+      "step": 6073
+    },
+    {
+      "epoch": 0.42255382795923335,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00129501149961781,
+      "loss": 0.9124,
+      "step": 6074
+    },
+    {
+      "epoch": 0.4226233955963686,
+      "grad_norm": 1.25,
+      "learning_rate": 0.001294796187869839,
+      "loss": 0.8919,
+      "step": 6075
+    },
+    {
+      "epoch": 0.42269296323350375,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0012945808611536038,
+      "loss": 0.5887,
+      "step": 6076
+    },
+    {
+      "epoch": 0.422762530870639,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.0012943655194800371,
+      "loss": 0.8486,
+      "step": 6077
+    },
+    {
+      "epoch": 0.4228320985077742,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0012941501628600733,
+      "loss": 0.9304,
+      "step": 6078
+    },
+    {
+      "epoch": 0.42290166614490937,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0012939347913046466,
+      "loss": 0.6868,
+      "step": 6079
+    },
+    {
+      "epoch": 0.4229712337820446,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.001293719404824693,
+      "loss": 0.7462,
+      "step": 6080
+    },
+    {
+      "epoch": 0.4230408014191798,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0012935040034311482,
+      "loss": 0.8727,
+      "step": 6081
+    },
+    {
+      "epoch": 0.423110369056315,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0012932885871349497,
+      "loss": 1.006,
+      "step": 6082
+    },
+    {
+      "epoch": 0.4231799366934502,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0012930731559470346,
+      "loss": 0.6387,
+      "step": 6083
+    },
+    {
+      "epoch": 0.4232495043305854,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0012928577098783422,
+      "loss": 0.9473,
+      "step": 6084
+    },
+    {
+      "epoch": 0.4233190719677206,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0012926422489398114,
+      "loss": 0.8836,
+      "step": 6085
+    },
+    {
+      "epoch": 0.42338863960485584,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0012924267731423823,
+      "loss": 0.8869,
+      "step": 6086
+    },
+    {
+      "epoch": 0.423458207241991,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0012922112824969953,
+      "loss": 0.7571,
+      "step": 6087
+    },
+    {
+      "epoch": 0.42352777487912624,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0012919957770145924,
+      "loss": 0.7119,
+      "step": 6088
+    },
+    {
+      "epoch": 0.4235973425162614,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.001291780256706116,
+      "loss": 0.6646,
+      "step": 6089
+    },
+    {
+      "epoch": 0.42366691015339664,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0012915647215825082,
+      "loss": 0.7336,
+      "step": 6090
+    },
+    {
+      "epoch": 0.42373647779053186,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.001291349171654714,
+      "loss": 0.8615,
+      "step": 6091
+    },
+    {
+      "epoch": 0.42380604542766703,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.001291133606933677,
+      "loss": 0.8484,
+      "step": 6092
+    },
+    {
+      "epoch": 0.42387561306480226,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0012909180274303432,
+      "loss": 0.8087,
+      "step": 6093
+    },
+    {
+      "epoch": 0.4239451807019375,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.001290702433155658,
+      "loss": 0.6936,
+      "step": 6094
+    },
+    {
+      "epoch": 0.42401474833907266,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0012904868241205686,
+      "loss": 0.8051,
+      "step": 6095
+    },
+    {
+      "epoch": 0.4240843159762079,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0012902712003360227,
+      "loss": 0.7881,
+      "step": 6096
+    },
+    {
+      "epoch": 0.42415388361334305,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.001290055561812968,
+      "loss": 0.8346,
+      "step": 6097
+    },
+    {
+      "epoch": 0.4242234512504783,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0012898399085623537,
+      "loss": 0.9084,
+      "step": 6098
+    },
+    {
+      "epoch": 0.4242930188876135,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00128962424059513,
+      "loss": 0.6617,
+      "step": 6099
+    },
+    {
+      "epoch": 0.4243625865247487,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0012894085579222472,
+      "loss": 0.8488,
+      "step": 6100
+    },
+    {
+      "epoch": 0.4244321541618839,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0012891928605546564,
+      "loss": 0.7682,
+      "step": 6101
+    },
+    {
+      "epoch": 0.42450172179901907,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00128897714850331,
+      "loss": 0.8503,
+      "step": 6102
+    },
+    {
+      "epoch": 0.4245712894361543,
+      "grad_norm": 3.171875,
+      "learning_rate": 0.0012887614217791605,
+      "loss": 0.905,
+      "step": 6103
+    },
+    {
+      "epoch": 0.4246408570732895,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0012885456803931614,
+      "loss": 0.9455,
+      "step": 6104
+    },
+    {
+      "epoch": 0.4247104247104247,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0012883299243562673,
+      "loss": 1.0008,
+      "step": 6105
+    },
+    {
+      "epoch": 0.4247799923475599,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0012881141536794322,
+      "loss": 0.8103,
+      "step": 6106
+    },
+    {
+      "epoch": 0.42484955998469515,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.001287898368373613,
+      "loss": 0.9041,
+      "step": 6107
+    },
+    {
+      "epoch": 0.4249191276218303,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.0012876825684497658,
+      "loss": 0.8348,
+      "step": 6108
+    },
+    {
+      "epoch": 0.42498869525896554,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.001287466753918848,
+      "loss": 1.0223,
+      "step": 6109
+    },
+    {
+      "epoch": 0.4250582628961007,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0012872509247918173,
+      "loss": 0.8618,
+      "step": 6110
+    },
+    {
+      "epoch": 0.42512783053323594,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0012870350810796323,
+      "loss": 0.8407,
+      "step": 6111
+    },
+    {
+      "epoch": 0.42519739817037117,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0012868192227932526,
+      "loss": 0.864,
+      "step": 6112
+    },
+    {
+      "epoch": 0.42526696580750634,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0012866033499436384,
+      "loss": 0.7074,
+      "step": 6113
+    },
+    {
+      "epoch": 0.42533653344464156,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0012863874625417514,
+      "loss": 0.919,
+      "step": 6114
+    },
+    {
+      "epoch": 0.42540610108177673,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0012861715605985515,
+      "loss": 0.8278,
+      "step": 6115
+    },
+    {
+      "epoch": 0.42547566871891196,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0012859556441250032,
+      "loss": 1.0292,
+      "step": 6116
+    },
+    {
+      "epoch": 0.4255452363560472,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0012857397131320677,
+      "loss": 0.803,
+      "step": 6117
+    },
+    {
+      "epoch": 0.42561480399318236,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0012855237676307103,
+      "loss": 0.8109,
+      "step": 6118
+    },
+    {
+      "epoch": 0.4256843716303176,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0012853078076318952,
+      "loss": 1.0028,
+      "step": 6119
+    },
+    {
+      "epoch": 0.4257539392674528,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0012850918331465872,
+      "loss": 0.9196,
+      "step": 6120
+    },
+    {
+      "epoch": 0.425823506904588,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0012848758441857534,
+      "loss": 0.9724,
+      "step": 6121
+    },
+    {
+      "epoch": 0.4258930745417232,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.0012846598407603596,
+      "loss": 0.7952,
+      "step": 6122
+    },
+    {
+      "epoch": 0.4259626421788584,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0012844438228813745,
+      "loss": 0.8355,
+      "step": 6123
+    },
+    {
+      "epoch": 0.4260322098159936,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0012842277905597652,
+      "loss": 0.6901,
+      "step": 6124
+    },
+    {
+      "epoch": 0.4261017774531288,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0012840117438065017,
+      "loss": 0.9057,
+      "step": 6125
+    },
+    {
+      "epoch": 0.426171345090264,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0012837956826325532,
+      "loss": 0.7786,
+      "step": 6126
+    },
+    {
+      "epoch": 0.4262409127273992,
+      "grad_norm": 1.515625,
+      "learning_rate": 0.0012835796070488903,
+      "loss": 1.1199,
+      "step": 6127
+    },
+    {
+      "epoch": 0.4263104803645344,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0012833635170664845,
+      "loss": 0.9589,
+      "step": 6128
+    },
+    {
+      "epoch": 0.4263800480016696,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0012831474126963074,
+      "loss": 0.7652,
+      "step": 6129
+    },
+    {
+      "epoch": 0.42644961563880485,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.001282931293949332,
+      "loss": 0.8077,
+      "step": 6130
+    },
+    {
+      "epoch": 0.42651918327594,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0012827151608365312,
+      "loss": 0.9046,
+      "step": 6131
+    },
+    {
+      "epoch": 0.42658875091307524,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0012824990133688803,
+      "loss": 0.9335,
+      "step": 6132
+    },
+    {
+      "epoch": 0.42665831855021047,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.0012822828515573527,
+      "loss": 0.6665,
+      "step": 6133
+    },
+    {
+      "epoch": 0.42672788618734564,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0012820666754129251,
+      "loss": 0.791,
+      "step": 6134
+    },
+    {
+      "epoch": 0.42679745382448087,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.001281850484946573,
+      "loss": 0.7337,
+      "step": 6135
+    },
+    {
+      "epoch": 0.42686702146161604,
+      "grad_norm": 1.0,
+      "learning_rate": 0.001281634280169274,
+      "loss": 0.867,
+      "step": 6136
+    },
+    {
+      "epoch": 0.42693658909875126,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.0012814180610920063,
+      "loss": 0.7578,
+      "step": 6137
+    },
+    {
+      "epoch": 0.4270061567358865,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0012812018277257474,
+      "loss": 0.8261,
+      "step": 6138
+    },
+    {
+      "epoch": 0.42707572437302166,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0012809855800814773,
+      "loss": 0.9987,
+      "step": 6139
+    },
+    {
+      "epoch": 0.4271452920101569,
+      "grad_norm": 1.5390625,
+      "learning_rate": 0.0012807693181701757,
+      "loss": 1.1341,
+      "step": 6140
+    },
+    {
+      "epoch": 0.42721485964729206,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0012805530420028233,
+      "loss": 1.0794,
+      "step": 6141
+    },
+    {
+      "epoch": 0.4272844272844273,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0012803367515904017,
+      "loss": 0.838,
+      "step": 6142
+    },
+    {
+      "epoch": 0.4273539949215625,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.0012801204469438923,
+      "loss": 1.0831,
+      "step": 6143
+    },
+    {
+      "epoch": 0.4274235625586977,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.001279904128074279,
+      "loss": 1.0262,
+      "step": 6144
+    },
+    {
+      "epoch": 0.4274931301958329,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0012796877949925445,
+      "loss": 0.8847,
+      "step": 6145
+    },
+    {
+      "epoch": 0.42756269783296813,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0012794714477096741,
+      "loss": 0.96,
+      "step": 6146
+    },
+    {
+      "epoch": 0.4276322654701033,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.0012792550862366517,
+      "loss": 1.0317,
+      "step": 6147
+    },
+    {
+      "epoch": 0.4277018331072385,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0012790387105844638,
+      "loss": 0.8646,
+      "step": 6148
+    },
+    {
+      "epoch": 0.4277714007443737,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0012788223207640963,
+      "loss": 0.8444,
+      "step": 6149
+    },
+    {
+      "epoch": 0.4278409683815089,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0012786059167865372,
+      "loss": 0.7417,
+      "step": 6150
+    },
+    {
+      "epoch": 0.42791053601864415,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.0012783894986627738,
+      "loss": 0.686,
+      "step": 6151
+    },
+    {
+      "epoch": 0.4279801036557793,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0012781730664037944,
+      "loss": 0.7532,
+      "step": 6152
+    },
+    {
+      "epoch": 0.42804967129291455,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0012779566200205894,
+      "loss": 0.8931,
+      "step": 6153
+    },
+    {
+      "epoch": 0.4281192389300497,
+      "grad_norm": 1.6171875,
+      "learning_rate": 0.0012777401595241479,
+      "loss": 0.7948,
+      "step": 6154
+    },
+    {
+      "epoch": 0.42818880656718494,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0012775236849254612,
+      "loss": 0.9641,
+      "step": 6155
+    },
+    {
+      "epoch": 0.42825837420432017,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0012773071962355203,
+      "loss": 0.8638,
+      "step": 6156
+    },
+    {
+      "epoch": 0.42832794184145534,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.001277090693465318,
+      "loss": 0.8303,
+      "step": 6157
+    },
+    {
+      "epoch": 0.42839750947859057,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.001276874176625847,
+      "loss": 1.1343,
+      "step": 6158
+    },
+    {
+      "epoch": 0.4284670771157258,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0012766576457281006,
+      "loss": 0.7364,
+      "step": 6159
+    },
+    {
+      "epoch": 0.42853664475286096,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0012764411007830736,
+      "loss": 0.8541,
+      "step": 6160
+    },
+    {
+      "epoch": 0.4286062123899962,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0012762245418017606,
+      "loss": 1.0099,
+      "step": 6161
+    },
+    {
+      "epoch": 0.42867578002713136,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.001276007968795158,
+      "loss": 0.5945,
+      "step": 6162
+    },
+    {
+      "epoch": 0.4287453476642666,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0012757913817742614,
+      "loss": 0.8322,
+      "step": 6163
+    },
+    {
+      "epoch": 0.4288149153014018,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.001275574780750069,
+      "loss": 0.9469,
+      "step": 6164
+    },
+    {
+      "epoch": 0.428884482938537,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0012753581657335782,
+      "loss": 0.9341,
+      "step": 6165
+    },
+    {
+      "epoch": 0.4289540505756722,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0012751415367357876,
+      "loss": 0.7707,
+      "step": 6166
+    },
+    {
+      "epoch": 0.4290236182128074,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0012749248937676968,
+      "loss": 0.7381,
+      "step": 6167
+    },
+    {
+      "epoch": 0.4290931858499426,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0012747082368403048,
+      "loss": 0.8526,
+      "step": 6168
+    },
+    {
+      "epoch": 0.42916275348707783,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0012744915659646141,
+      "loss": 1.0997,
+      "step": 6169
+    },
+    {
+      "epoch": 0.429232321124213,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0012742748811516247,
+      "loss": 0.9804,
+      "step": 6170
+    },
+    {
+      "epoch": 0.4293018887613482,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0012740581824123396,
+      "loss": 0.8845,
+      "step": 6171
+    },
+    {
+      "epoch": 0.42937145639848345,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0012738414697577609,
+      "loss": 0.9485,
+      "step": 6172
+    },
+    {
+      "epoch": 0.4294410240356186,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.001273624743198893,
+      "loss": 0.6952,
+      "step": 6173
+    },
+    {
+      "epoch": 0.42951059167275385,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0012734080027467399,
+      "loss": 0.7642,
+      "step": 6174
+    },
+    {
+      "epoch": 0.429580159309889,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.001273191248412306,
+      "loss": 0.6815,
+      "step": 6175
+    },
+    {
+      "epoch": 0.42964972694702425,
+      "grad_norm": 1.0,
+      "learning_rate": 0.001272974480206598,
+      "loss": 0.8678,
+      "step": 6176
+    },
+    {
+      "epoch": 0.4297192945841595,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0012727576981406215,
+      "loss": 1.0667,
+      "step": 6177
+    },
+    {
+      "epoch": 0.42978886222129464,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0012725409022253842,
+      "loss": 0.9059,
+      "step": 6178
+    },
+    {
+      "epoch": 0.42985842985842987,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.001272324092471893,
+      "loss": 0.77,
+      "step": 6179
+    },
+    {
+      "epoch": 0.42992799749556504,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0012721072688911576,
+      "loss": 0.8317,
+      "step": 6180
+    },
+    {
+      "epoch": 0.42999756513270027,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0012718904314941866,
+      "loss": 0.5463,
+      "step": 6181
+    },
+    {
+      "epoch": 0.4300671327698355,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0012716735802919894,
+      "loss": 0.7145,
+      "step": 6182
+    },
+    {
+      "epoch": 0.43013670040697066,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0012714567152955776,
+      "loss": 0.9652,
+      "step": 6183
+    },
+    {
+      "epoch": 0.4302062680441059,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0012712398365159617,
+      "loss": 0.9703,
+      "step": 6184
+    },
+    {
+      "epoch": 0.43027583568124106,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0012710229439641544,
+      "loss": 0.8048,
+      "step": 6185
+    },
+    {
+      "epoch": 0.4303454033183763,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0012708060376511677,
+      "loss": 0.8256,
+      "step": 6186
+    },
+    {
+      "epoch": 0.4304149709555115,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0012705891175880156,
+      "loss": 1.0206,
+      "step": 6187
+    },
+    {
+      "epoch": 0.4304845385926467,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0012703721837857118,
+      "loss": 0.6895,
+      "step": 6188
+    },
+    {
+      "epoch": 0.4305541062297819,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0012701552362552714,
+      "loss": 0.7009,
+      "step": 6189
+    },
+    {
+      "epoch": 0.43062367386691713,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0012699382750077102,
+      "loss": 0.7006,
+      "step": 6190
+    },
+    {
+      "epoch": 0.4306932415040523,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0012697213000540434,
+      "loss": 0.9176,
+      "step": 6191
+    },
+    {
+      "epoch": 0.43076280914118753,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0012695043114052886,
+      "loss": 0.7925,
+      "step": 6192
+    },
+    {
+      "epoch": 0.4308323767783227,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0012692873090724632,
+      "loss": 0.7769,
+      "step": 6193
+    },
+    {
+      "epoch": 0.4309019444154579,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.001269070293066586,
+      "loss": 1.0246,
+      "step": 6194
+    },
+    {
+      "epoch": 0.43097151205259315,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.001268853263398675,
+      "loss": 0.9114,
+      "step": 6195
+    },
+    {
+      "epoch": 0.4310410796897283,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0012686362200797507,
+      "loss": 0.6292,
+      "step": 6196
+    },
+    {
+      "epoch": 0.43111064732686355,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0012684191631208333,
+      "loss": 0.8646,
+      "step": 6197
+    },
+    {
+      "epoch": 0.4311802149639987,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0012682020925329433,
+      "loss": 0.9575,
+      "step": 6198
+    },
+    {
+      "epoch": 0.43124978260113395,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0012679850083271034,
+      "loss": 0.9149,
+      "step": 6199
+    },
+    {
+      "epoch": 0.4313193502382692,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0012677679105143349,
+      "loss": 0.9711,
+      "step": 6200
+    },
+    {
+      "epoch": 0.43138891787540434,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.0012675507991056622,
+      "loss": 0.644,
+      "step": 6201
+    },
+    {
+      "epoch": 0.43145848551253957,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.001267333674112108,
+      "loss": 0.706,
+      "step": 6202
+    },
+    {
+      "epoch": 0.4315280531496748,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0012671165355446973,
+      "loss": 0.7567,
+      "step": 6203
+    },
+    {
+      "epoch": 0.43159762078680997,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0012668993834144555,
+      "loss": 0.9832,
+      "step": 6204
+    },
+    {
+      "epoch": 0.4316671884239452,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0012666822177324082,
+      "loss": 0.8723,
+      "step": 6205
+    },
+    {
+      "epoch": 0.43173675606108036,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0012664650385095825,
+      "loss": 0.8979,
+      "step": 6206
+    },
+    {
+      "epoch": 0.4318063236982156,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.0012662478457570044,
+      "loss": 0.7803,
+      "step": 6207
+    },
+    {
+      "epoch": 0.4318758913353508,
+      "grad_norm": 1.765625,
+      "learning_rate": 0.0012660306394857033,
+      "loss": 1.0101,
+      "step": 6208
+    },
+    {
+      "epoch": 0.431945458972486,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.0012658134197067069,
+      "loss": 0.778,
+      "step": 6209
+    },
+    {
+      "epoch": 0.4320150266096212,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.001265596186431045,
+      "loss": 0.8064,
+      "step": 6210
+    },
+    {
+      "epoch": 0.4320845942467564,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0012653789396697476,
+      "loss": 0.9607,
+      "step": 6211
+    },
+    {
+      "epoch": 0.4321541618838916,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0012651616794338448,
+      "loss": 0.9648,
+      "step": 6212
+    },
+    {
+      "epoch": 0.43222372952102683,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0012649444057343691,
+      "loss": 0.7897,
+      "step": 6213
+    },
+    {
+      "epoch": 0.432293297158162,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0012647271185823512,
+      "loss": 0.9157,
+      "step": 6214
+    },
+    {
+      "epoch": 0.43236286479529723,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.001264509817988825,
+      "loss": 0.5547,
+      "step": 6215
+    },
+    {
+      "epoch": 0.43243243243243246,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0012642925039648232,
+      "loss": 0.7265,
+      "step": 6216
+    },
+    {
+      "epoch": 0.4325020000695676,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0012640751765213803,
+      "loss": 1.0178,
+      "step": 6217
+    },
+    {
+      "epoch": 0.43257156770670285,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.001263857835669531,
+      "loss": 0.9097,
+      "step": 6218
+    },
+    {
+      "epoch": 0.432641135343838,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0012636404814203106,
+      "loss": 0.6847,
+      "step": 6219
+    },
+    {
+      "epoch": 0.43271070298097325,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0012634231137847556,
+      "loss": 1.0189,
+      "step": 6220
+    },
+    {
+      "epoch": 0.4327802706181085,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.0012632057327739026,
+      "loss": 0.8217,
+      "step": 6221
+    },
+    {
+      "epoch": 0.43284983825524365,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0012629883383987893,
+      "loss": 0.7257,
+      "step": 6222
+    },
+    {
+      "epoch": 0.4329194058923789,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0012627709306704533,
+      "loss": 0.8535,
+      "step": 6223
+    },
+    {
+      "epoch": 0.43298897352951404,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0012625535095999341,
+      "loss": 1.0538,
+      "step": 6224
+    },
+    {
+      "epoch": 0.43305854116664927,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0012623360751982712,
+      "loss": 1.0261,
+      "step": 6225
+    },
+    {
+      "epoch": 0.4331281088037845,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.0012621186274765044,
+      "loss": 0.8567,
+      "step": 6226
+    },
+    {
+      "epoch": 0.43319767644091967,
+      "grad_norm": 1.6328125,
+      "learning_rate": 0.001261901166445675,
+      "loss": 1.0199,
+      "step": 6227
+    },
+    {
+      "epoch": 0.4332672440780549,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0012616836921168243,
+      "loss": 0.8066,
+      "step": 6228
+    },
+    {
+      "epoch": 0.4333368117151901,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0012614662045009953,
+      "loss": 0.7862,
+      "step": 6229
+    },
+    {
+      "epoch": 0.4334063793523253,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0012612487036092297,
+      "loss": 0.8326,
+      "step": 6230
+    },
+    {
+      "epoch": 0.4334759469894605,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0012610311894525718,
+      "loss": 0.8385,
+      "step": 6231
+    },
+    {
+      "epoch": 0.4335455146265957,
+      "grad_norm": 1.25,
+      "learning_rate": 0.001260813662042066,
+      "loss": 0.9319,
+      "step": 6232
+    },
+    {
+      "epoch": 0.4336150822637309,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.001260596121388757,
+      "loss": 0.7254,
+      "step": 6233
+    },
+    {
+      "epoch": 0.43368464990086614,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0012603785675036905,
+      "loss": 0.8268,
+      "step": 6234
+    },
+    {
+      "epoch": 0.4337542175380013,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0012601610003979125,
+      "loss": 0.9525,
+      "step": 6235
+    },
+    {
+      "epoch": 0.43382378517513653,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0012599434200824705,
+      "loss": 0.7402,
+      "step": 6236
+    },
+    {
+      "epoch": 0.4338933528122717,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0012597258265684118,
+      "loss": 0.8009,
+      "step": 6237
+    },
+    {
+      "epoch": 0.43396292044940693,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0012595082198667846,
+      "loss": 0.8976,
+      "step": 6238
+    },
+    {
+      "epoch": 0.43403248808654216,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.001259290599988638,
+      "loss": 0.8552,
+      "step": 6239
+    },
+    {
+      "epoch": 0.4341020557236773,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0012590729669450219,
+      "loss": 1.1349,
+      "step": 6240
+    },
+    {
+      "epoch": 0.43417162336081255,
+      "grad_norm": 1.640625,
+      "learning_rate": 0.001258855320746986,
+      "loss": 0.9319,
+      "step": 6241
+    },
+    {
+      "epoch": 0.4342411909979478,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.001258637661405582,
+      "loss": 1.2469,
+      "step": 6242
+    },
+    {
+      "epoch": 0.43431075863508295,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.0012584199889318609,
+      "loss": 0.7951,
+      "step": 6243
+    },
+    {
+      "epoch": 0.4343803262722182,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0012582023033368755,
+      "loss": 0.5938,
+      "step": 6244
+    },
+    {
+      "epoch": 0.43444989390935335,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0012579846046316782,
+      "loss": 0.9024,
+      "step": 6245
+    },
+    {
+      "epoch": 0.4345194615464886,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0012577668928273234,
+      "loss": 1.0139,
+      "step": 6246
+    },
+    {
+      "epoch": 0.4345890291836238,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.001257549167934865,
+      "loss": 0.9458,
+      "step": 6247
+    },
+    {
+      "epoch": 0.43465859682075897,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0012573314299653578,
+      "loss": 0.8125,
+      "step": 6248
+    },
+    {
+      "epoch": 0.4347281644578942,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0012571136789298579,
+      "loss": 0.8901,
+      "step": 6249
+    },
+    {
+      "epoch": 0.43479773209502937,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0012568959148394213,
+      "loss": 0.6321,
+      "step": 6250
+    },
+    {
+      "epoch": 0.4348672997321646,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0012566781377051047,
+      "loss": 0.934,
+      "step": 6251
+    },
+    {
+      "epoch": 0.4349368673692998,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0012564603475379663,
+      "loss": 0.6629,
+      "step": 6252
+    },
+    {
+      "epoch": 0.435006435006435,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.001256242544349064,
+      "loss": 0.9645,
+      "step": 6253
+    },
+    {
+      "epoch": 0.4350760026435702,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0012560247281494569,
+      "loss": 0.8213,
+      "step": 6254
+    },
+    {
+      "epoch": 0.43514557028070544,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0012558068989502044,
+      "loss": 0.9691,
+      "step": 6255
+    },
+    {
+      "epoch": 0.4352151379178406,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0012555890567623668,
+      "loss": 0.9168,
+      "step": 6256
+    },
+    {
+      "epoch": 0.43528470555497584,
+      "grad_norm": 1.6484375,
+      "learning_rate": 0.0012553712015970055,
+      "loss": 0.9164,
+      "step": 6257
+    },
+    {
+      "epoch": 0.435354273192111,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0012551533334651816,
+      "loss": 0.7233,
+      "step": 6258
+    },
+    {
+      "epoch": 0.43542384082924623,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0012549354523779578,
+      "loss": 0.6932,
+      "step": 6259
+    },
+    {
+      "epoch": 0.43549340846638146,
+      "grad_norm": 1.75,
+      "learning_rate": 0.0012547175583463963,
+      "loss": 1.0742,
+      "step": 6260
+    },
+    {
+      "epoch": 0.43556297610351663,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0012544996513815614,
+      "loss": 0.9574,
+      "step": 6261
+    },
+    {
+      "epoch": 0.43563254374065186,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0012542817314945168,
+      "loss": 0.8548,
+      "step": 6262
+    },
+    {
+      "epoch": 0.435702111377787,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0012540637986963275,
+      "loss": 0.8449,
+      "step": 6263
+    },
+    {
+      "epoch": 0.43577167901492225,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.001253845852998059,
+      "loss": 0.9364,
+      "step": 6264
+    },
+    {
+      "epoch": 0.4358412466520575,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0012536278944107776,
+      "loss": 0.8772,
+      "step": 6265
+    },
+    {
+      "epoch": 0.43591081428919265,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0012534099229455505,
+      "loss": 1.127,
+      "step": 6266
+    },
+    {
+      "epoch": 0.4359803819263279,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0012531919386134444,
+      "loss": 0.7243,
+      "step": 6267
+    },
+    {
+      "epoch": 0.4360499495634631,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.001252973941425528,
+      "loss": 1.2287,
+      "step": 6268
+    },
+    {
+      "epoch": 0.4361195172005983,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0012527559313928699,
+      "loss": 0.7078,
+      "step": 6269
+    },
+    {
+      "epoch": 0.4361890848377335,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0012525379085265393,
+      "loss": 0.8298,
+      "step": 6270
+    },
+    {
+      "epoch": 0.43625865247486867,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0012523198728376069,
+      "loss": 0.9078,
+      "step": 6271
+    },
+    {
+      "epoch": 0.4363282201120039,
+      "grad_norm": 1.0,
+      "learning_rate": 0.001252101824337143,
+      "loss": 0.898,
+      "step": 6272
+    },
+    {
+      "epoch": 0.4363977877491391,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0012518837630362194,
+      "loss": 1.1462,
+      "step": 6273
+    },
+    {
+      "epoch": 0.4364673553862743,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0012516656889459078,
+      "loss": 0.8964,
+      "step": 6274
+    },
+    {
+      "epoch": 0.4365369230234095,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0012514476020772808,
+      "loss": 0.9142,
+      "step": 6275
+    },
+    {
+      "epoch": 0.4366064906605447,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.001251229502441412,
+      "loss": 1.0183,
+      "step": 6276
+    },
+    {
+      "epoch": 0.4366760582976799,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0012510113900493756,
+      "loss": 0.7949,
+      "step": 6277
+    },
+    {
+      "epoch": 0.43674562593481514,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0012507932649122458,
+      "loss": 0.9381,
+      "step": 6278
+    },
+    {
+      "epoch": 0.4368151935719503,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0012505751270410982,
+      "loss": 0.9689,
+      "step": 6279
+    },
+    {
+      "epoch": 0.43688476120908554,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0012503569764470085,
+      "loss": 0.8015,
+      "step": 6280
+    },
+    {
+      "epoch": 0.43695432884622076,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0012501388131410537,
+      "loss": 0.6995,
+      "step": 6281
+    },
+    {
+      "epoch": 0.43702389648335593,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0012499206371343104,
+      "loss": 0.7622,
+      "step": 6282
+    },
+    {
+      "epoch": 0.43709346412049116,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.001249702448437857,
+      "loss": 0.8139,
+      "step": 6283
+    },
+    {
+      "epoch": 0.43716303175762633,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0012494842470627719,
+      "loss": 0.914,
+      "step": 6284
+    },
+    {
+      "epoch": 0.43723259939476156,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0012492660330201341,
+      "loss": 0.7262,
+      "step": 6285
+    },
+    {
+      "epoch": 0.4373021670318968,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0012490478063210237,
+      "loss": 1.0207,
+      "step": 6286
+    },
+    {
+      "epoch": 0.43737173466903195,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.001248829566976521,
+      "loss": 0.8092,
+      "step": 6287
+    },
+    {
+      "epoch": 0.4374413023061672,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.001248611314997707,
+      "loss": 0.6535,
+      "step": 6288
+    },
+    {
+      "epoch": 0.43751086994330235,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0012483930503956635,
+      "loss": 0.9891,
+      "step": 6289
+    },
+    {
+      "epoch": 0.4375804375804376,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.001248174773181473,
+      "loss": 0.9172,
+      "step": 6290
+    },
+    {
+      "epoch": 0.4376500052175728,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0012479564833662185,
+      "loss": 0.6461,
+      "step": 6291
+    },
+    {
+      "epoch": 0.437719572854708,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0012477381809609834,
+      "loss": 0.7959,
+      "step": 6292
+    },
+    {
+      "epoch": 0.4377891404918432,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0012475198659768522,
+      "loss": 0.8386,
+      "step": 6293
+    },
+    {
+      "epoch": 0.4378587081289784,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0012473015384249096,
+      "loss": 0.8491,
+      "step": 6294
+    },
+    {
+      "epoch": 0.4379282757661136,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0012470831983162416,
+      "loss": 0.9507,
+      "step": 6295
+    },
+    {
+      "epoch": 0.4379978434032488,
+      "grad_norm": 1.6328125,
+      "learning_rate": 0.001246864845661934,
+      "loss": 1.2016,
+      "step": 6296
+    },
+    {
+      "epoch": 0.438067411040384,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.001246646480473074,
+      "loss": 0.8521,
+      "step": 6297
+    },
+    {
+      "epoch": 0.4381369786775192,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0012464281027607489,
+      "loss": 0.7937,
+      "step": 6298
+    },
+    {
+      "epoch": 0.43820654631465444,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0012462097125360467,
+      "loss": 0.7912,
+      "step": 6299
+    },
+    {
+      "epoch": 0.4382761139517896,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0012459913098100566,
+      "loss": 0.8754,
+      "step": 6300
+    },
+    {
+      "epoch": 0.43834568158892484,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0012457728945938673,
+      "loss": 0.6109,
+      "step": 6301
+    },
+    {
+      "epoch": 0.43841524922606,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0012455544668985693,
+      "loss": 0.8394,
+      "step": 6302
+    },
+    {
+      "epoch": 0.43848481686319524,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0012453360267352534,
+      "loss": 1.0656,
+      "step": 6303
+    },
+    {
+      "epoch": 0.43855438450033046,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0012451175741150105,
+      "loss": 0.8723,
+      "step": 6304
+    },
+    {
+      "epoch": 0.43862395213746563,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0012448991090489325,
+      "loss": 0.6628,
+      "step": 6305
+    },
+    {
+      "epoch": 0.43869351977460086,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0012446806315481124,
+      "loss": 0.7501,
+      "step": 6306
+    },
+    {
+      "epoch": 0.4387630874117361,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0012444621416236427,
+      "loss": 1.0503,
+      "step": 6307
+    },
+    {
+      "epoch": 0.43883265504887126,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0012442436392866181,
+      "loss": 0.6441,
+      "step": 6308
+    },
+    {
+      "epoch": 0.4389022226860065,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0012440251245481324,
+      "loss": 1.0388,
+      "step": 6309
+    },
+    {
+      "epoch": 0.43897179032314165,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0012438065974192808,
+      "loss": 0.7253,
+      "step": 6310
+    },
+    {
+      "epoch": 0.4390413579602769,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.001243588057911159,
+      "loss": 0.8736,
+      "step": 6311
+    },
+    {
+      "epoch": 0.4391109255974121,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0012433695060348636,
+      "loss": 0.9115,
+      "step": 6312
+    },
+    {
+      "epoch": 0.4391804932345473,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.0012431509418014913,
+      "loss": 0.8458,
+      "step": 6313
+    },
+    {
+      "epoch": 0.4392500608716825,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0012429323652221396,
+      "loss": 0.9502,
+      "step": 6314
+    },
+    {
+      "epoch": 0.43931962850881767,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.001242713776307907,
+      "loss": 1.1474,
+      "step": 6315
+    },
+    {
+      "epoch": 0.4393891961459529,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.001242495175069892,
+      "loss": 0.8861,
+      "step": 6316
+    },
+    {
+      "epoch": 0.4394587637830881,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0012422765615191947,
+      "loss": 0.7672,
+      "step": 6317
+    },
+    {
+      "epoch": 0.4395283314202233,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0012420579356669144,
+      "loss": 0.7132,
+      "step": 6318
+    },
+    {
+      "epoch": 0.4395978990573585,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0012418392975241522,
+      "loss": 1.0153,
+      "step": 6319
+    },
+    {
+      "epoch": 0.43966746669449375,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0012416206471020095,
+      "loss": 0.7832,
+      "step": 6320
+    },
+    {
+      "epoch": 0.4397370343316289,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0012414019844115883,
+      "loss": 1.0355,
+      "step": 6321
+    },
+    {
+      "epoch": 0.43980660196876414,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.001241183309463991,
+      "loss": 0.8978,
+      "step": 6322
+    },
+    {
+      "epoch": 0.4398761696058993,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.001240964622270321,
+      "loss": 0.8164,
+      "step": 6323
+    },
+    {
+      "epoch": 0.43994573724303454,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0012407459228416819,
+      "loss": 1.0299,
+      "step": 6324
+    },
+    {
+      "epoch": 0.44001530488016977,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0012405272111891783,
+      "loss": 0.7,
+      "step": 6325
+    },
+    {
+      "epoch": 0.44008487251730494,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0012403084873239152,
+      "loss": 0.7141,
+      "step": 6326
+    },
+    {
+      "epoch": 0.44015444015444016,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0012400897512569987,
+      "loss": 1.0925,
+      "step": 6327
+    },
+    {
+      "epoch": 0.44022400779157533,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0012398710029995345,
+      "loss": 1.0427,
+      "step": 6328
+    },
+    {
+      "epoch": 0.44029357542871056,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.0012396522425626299,
+      "loss": 0.8871,
+      "step": 6329
+    },
+    {
+      "epoch": 0.4403631430658458,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.001239433469957392,
+      "loss": 0.9091,
+      "step": 6330
+    },
+    {
+      "epoch": 0.44043271070298096,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0012392146851949296,
+      "loss": 0.6989,
+      "step": 6331
+    },
+    {
+      "epoch": 0.4405022783401162,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0012389958882863515,
+      "loss": 0.5548,
+      "step": 6332
+    },
+    {
+      "epoch": 0.4405718459772514,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0012387770792427664,
+      "loss": 0.8693,
+      "step": 6333
+    },
+    {
+      "epoch": 0.4406414136143866,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.001238558258075285,
+      "loss": 0.7976,
+      "step": 6334
+    },
+    {
+      "epoch": 0.4407109812515218,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0012383394247950175,
+      "loss": 0.7941,
+      "step": 6335
+    },
+    {
+      "epoch": 0.440780548888657,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0012381205794130754,
+      "loss": 0.7462,
+      "step": 6336
+    },
+    {
+      "epoch": 0.4408501165257922,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0012379017219405705,
+      "loss": 0.7067,
+      "step": 6337
+    },
+    {
+      "epoch": 0.4409196841629274,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0012376828523886151,
+      "loss": 0.948,
+      "step": 6338
+    },
+    {
+      "epoch": 0.4409892518000626,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.0012374639707683228,
+      "loss": 0.7899,
+      "step": 6339
+    },
+    {
+      "epoch": 0.4410588194371978,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0012372450770908067,
+      "loss": 0.8701,
+      "step": 6340
+    },
+    {
+      "epoch": 0.441128387074333,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0012370261713671817,
+      "loss": 0.8288,
+      "step": 6341
+    },
+    {
+      "epoch": 0.4411979547114682,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.001236807253608562,
+      "loss": 0.8467,
+      "step": 6342
+    },
+    {
+      "epoch": 0.44126752234860345,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.001236588323826064,
+      "loss": 0.934,
+      "step": 6343
+    },
+    {
+      "epoch": 0.4413370899857386,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0012363693820308032,
+      "loss": 0.7287,
+      "step": 6344
+    },
+    {
+      "epoch": 0.44140665762287384,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0012361504282338964,
+      "loss": 0.7829,
+      "step": 6345
+    },
+    {
+      "epoch": 0.44147622526000907,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0012359314624464616,
+      "loss": 0.9451,
+      "step": 6346
+    },
+    {
+      "epoch": 0.44154579289714424,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.001235712484679616,
+      "loss": 0.7119,
+      "step": 6347
+    },
+    {
+      "epoch": 0.44161536053427947,
+      "grad_norm": 1.4765625,
+      "learning_rate": 0.0012354934949444785,
+      "loss": 1.2297,
+      "step": 6348
+    },
+    {
+      "epoch": 0.44168492817141464,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.001235274493252168,
+      "loss": 0.8412,
+      "step": 6349
+    },
+    {
+      "epoch": 0.44175449580854986,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0012350554796138051,
+      "loss": 0.7168,
+      "step": 6350
+    },
+    {
+      "epoch": 0.4418240634456851,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0012348364540405096,
+      "loss": 0.8528,
+      "step": 6351
+    },
+    {
+      "epoch": 0.44189363108282026,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0012346174165434026,
+      "loss": 0.7164,
+      "step": 6352
+    },
+    {
+      "epoch": 0.4419631987199555,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0012343983671336057,
+      "loss": 1.1093,
+      "step": 6353
+    },
+    {
+      "epoch": 0.44203276635709066,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0012341793058222412,
+      "loss": 0.6447,
+      "step": 6354
+    },
+    {
+      "epoch": 0.4421023339942259,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.001233960232620432,
+      "loss": 0.7405,
+      "step": 6355
+    },
+    {
+      "epoch": 0.4421719016313611,
+      "grad_norm": 1.0,
+      "learning_rate": 0.001233741147539301,
+      "loss": 0.7226,
+      "step": 6356
+    },
+    {
+      "epoch": 0.4422414692684963,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.001233522050589973,
+      "loss": 0.7298,
+      "step": 6357
+    },
+    {
+      "epoch": 0.4423110369056315,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0012333029417835725,
+      "loss": 0.8299,
+      "step": 6358
+    },
+    {
+      "epoch": 0.44238060454276673,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.0012330838211312243,
+      "loss": 0.6317,
+      "step": 6359
+    },
+    {
+      "epoch": 0.4424501721799019,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0012328646886440547,
+      "loss": 0.9076,
+      "step": 6360
+    },
+    {
+      "epoch": 0.4425197398170371,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0012326455443331897,
+      "loss": 0.98,
+      "step": 6361
+    },
+    {
+      "epoch": 0.4425893074541723,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0012324263882097567,
+      "loss": 0.8287,
+      "step": 6362
+    },
+    {
+      "epoch": 0.4426588750913075,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0012322072202848831,
+      "loss": 0.9169,
+      "step": 6363
+    },
+    {
+      "epoch": 0.44272844272844275,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0012319880405696974,
+      "loss": 1.0304,
+      "step": 6364
+    },
+    {
+      "epoch": 0.4427980103655779,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0012317688490753281,
+      "loss": 0.7497,
+      "step": 6365
+    },
+    {
+      "epoch": 0.44286757800271315,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0012315496458129053,
+      "loss": 0.8016,
+      "step": 6366
+    },
+    {
+      "epoch": 0.4429371456398483,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0012313304307935583,
+      "loss": 1.0096,
+      "step": 6367
+    },
+    {
+      "epoch": 0.44300671327698354,
+      "grad_norm": 1.25,
+      "learning_rate": 0.001231111204028418,
+      "loss": 0.7129,
+      "step": 6368
+    },
+    {
+      "epoch": 0.44307628091411877,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0012308919655286154,
+      "loss": 0.6975,
+      "step": 6369
+    },
+    {
+      "epoch": 0.44314584855125394,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.001230672715305283,
+      "loss": 0.9218,
+      "step": 6370
+    },
+    {
+      "epoch": 0.44321541618838917,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0012304534533695527,
+      "loss": 0.9669,
+      "step": 6371
+    },
+    {
+      "epoch": 0.4432849838255244,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0012302341797325572,
+      "loss": 0.7856,
+      "step": 6372
+    },
+    {
+      "epoch": 0.44335455146265956,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.001230014894405431,
+      "loss": 0.7618,
+      "step": 6373
+    },
+    {
+      "epoch": 0.4434241190997948,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0012297955973993076,
+      "loss": 0.8446,
+      "step": 6374
+    },
+    {
+      "epoch": 0.44349368673692996,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.001229576288725322,
+      "loss": 0.8408,
+      "step": 6375
+    },
+    {
+      "epoch": 0.4435632543740652,
+      "grad_norm": 1.75,
+      "learning_rate": 0.00122935696839461,
+      "loss": 0.9296,
+      "step": 6376
+    },
+    {
+      "epoch": 0.4436328220112004,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0012291376364183069,
+      "loss": 0.6171,
+      "step": 6377
+    },
+    {
+      "epoch": 0.4437023896483356,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0012289182928075495,
+      "loss": 0.7637,
+      "step": 6378
+    },
+    {
+      "epoch": 0.4437719572854708,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0012286989375734749,
+      "loss": 0.697,
+      "step": 6379
+    },
+    {
+      "epoch": 0.443841524922606,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0012284795707272213,
+      "loss": 0.7791,
+      "step": 6380
+    },
+    {
+      "epoch": 0.4439110925597412,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0012282601922799263,
+      "loss": 0.9094,
+      "step": 6381
+    },
+    {
+      "epoch": 0.44398066019687643,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0012280408022427298,
+      "loss": 0.7983,
+      "step": 6382
+    },
+    {
+      "epoch": 0.4440502278340116,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0012278214006267705,
+      "loss": 0.872,
+      "step": 6383
+    },
+    {
+      "epoch": 0.4441197954711468,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0012276019874431887,
+      "loss": 0.7685,
+      "step": 6384
+    },
+    {
+      "epoch": 0.44418936310828205,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0012273825627031254,
+      "loss": 0.8077,
+      "step": 6385
+    },
+    {
+      "epoch": 0.4442589307454172,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0012271631264177212,
+      "loss": 0.7381,
+      "step": 6386
+    },
+    {
+      "epoch": 0.44432849838255245,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.001226943678598119,
+      "loss": 0.7848,
+      "step": 6387
+    },
+    {
+      "epoch": 0.4443980660196876,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0012267242192554601,
+      "loss": 0.9076,
+      "step": 6388
+    },
+    {
+      "epoch": 0.44446763365682285,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0012265047484008886,
+      "loss": 0.9831,
+      "step": 6389
+    },
+    {
+      "epoch": 0.4445372012939581,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.0012262852660455477,
+      "loss": 1.0663,
+      "step": 6390
+    },
+    {
+      "epoch": 0.44460676893109324,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0012260657722005812,
+      "loss": 0.8143,
+      "step": 6391
+    },
+    {
+      "epoch": 0.44467633656822847,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0012258462668771344,
+      "loss": 0.6318,
+      "step": 6392
+    },
+    {
+      "epoch": 0.44474590420536364,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0012256267500863522,
+      "loss": 0.7823,
+      "step": 6393
+    },
+    {
+      "epoch": 0.44481547184249887,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0012254072218393815,
+      "loss": 0.8409,
+      "step": 6394
+    },
+    {
+      "epoch": 0.4448850394796341,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0012251876821473676,
+      "loss": 0.9326,
+      "step": 6395
+    },
+    {
+      "epoch": 0.44495460711676926,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.001224968131021459,
+      "loss": 0.8727,
+      "step": 6396
+    },
+    {
+      "epoch": 0.4450241747539045,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0012247485684728017,
+      "loss": 0.6485,
+      "step": 6397
+    },
+    {
+      "epoch": 0.4450937423910397,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0012245289945125458,
+      "loss": 0.8116,
+      "step": 6398
+    },
+    {
+      "epoch": 0.4451633100281749,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0012243094091518387,
+      "loss": 0.5843,
+      "step": 6399
+    },
+    {
+      "epoch": 0.4452328776653101,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.0012240898124018303,
+      "loss": 0.6947,
+      "step": 6400
+    },
+    {
+      "epoch": 0.4453024453024453,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.001223870204273671,
+      "loss": 0.9957,
+      "step": 6401
+    },
+    {
+      "epoch": 0.4453720129395805,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0012236505847785112,
+      "loss": 0.902,
+      "step": 6402
+    },
+    {
+      "epoch": 0.44544158057671573,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0012234309539275018,
+      "loss": 0.857,
+      "step": 6403
+    },
+    {
+      "epoch": 0.4455111482138509,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.0012232113117317948,
+      "loss": 0.7336,
+      "step": 6404
+    },
+    {
+      "epoch": 0.44558071585098613,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0012229916582025427,
+      "loss": 1.2174,
+      "step": 6405
+    },
+    {
+      "epoch": 0.4456502834881213,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0012227719933508977,
+      "loss": 0.8288,
+      "step": 6406
+    },
+    {
+      "epoch": 0.4457198511252565,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.001222552317188014,
+      "loss": 0.8039,
+      "step": 6407
+    },
+    {
+      "epoch": 0.44578941876239175,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0012223326297250453,
+      "loss": 1.055,
+      "step": 6408
+    },
+    {
+      "epoch": 0.4458589863995269,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0012221129309731463,
+      "loss": 0.8263,
+      "step": 6409
+    },
+    {
+      "epoch": 0.44592855403666215,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0012218932209434722,
+      "loss": 0.8756,
+      "step": 6410
+    },
+    {
+      "epoch": 0.4459981216737974,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0012216734996471788,
+      "loss": 0.8042,
+      "step": 6411
+    },
+    {
+      "epoch": 0.44606768931093255,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0012214537670954225,
+      "loss": 0.8315,
+      "step": 6412
+    },
+    {
+      "epoch": 0.4461372569480678,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0012212340232993597,
+      "loss": 0.8429,
+      "step": 6413
+    },
+    {
+      "epoch": 0.44620682458520294,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0012210142682701488,
+      "loss": 0.9472,
+      "step": 6414
+    },
+    {
+      "epoch": 0.44627639222233817,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0012207945020189473,
+      "loss": 0.8691,
+      "step": 6415
+    },
+    {
+      "epoch": 0.4463459598594734,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0012205747245569135,
+      "loss": 0.7448,
+      "step": 6416
+    },
+    {
+      "epoch": 0.44641552749660857,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0012203549358952076,
+      "loss": 1.11,
+      "step": 6417
+    },
+    {
+      "epoch": 0.4464850951337438,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.001220135136044988,
+      "loss": 0.8819,
+      "step": 6418
+    },
+    {
+      "epoch": 0.44655466277087896,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0012199153250174162,
+      "loss": 0.8179,
+      "step": 6419
+    },
+    {
+      "epoch": 0.4466242304080142,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0012196955028236523,
+      "loss": 0.9889,
+      "step": 6420
+    },
+    {
+      "epoch": 0.4466937980451494,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0012194756694748586,
+      "loss": 0.8049,
+      "step": 6421
+    },
+    {
+      "epoch": 0.4467633656822846,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0012192558249821963,
+      "loss": 0.9424,
+      "step": 6422
+    },
+    {
+      "epoch": 0.4468329333194198,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0012190359693568284,
+      "loss": 0.9192,
+      "step": 6423
+    },
+    {
+      "epoch": 0.44690250095655504,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0012188161026099183,
+      "loss": 0.9867,
+      "step": 6424
+    },
+    {
+      "epoch": 0.4469720685936902,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0012185962247526288,
+      "loss": 0.8561,
+      "step": 6425
+    },
+    {
+      "epoch": 0.44704163623082543,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0012183763357961252,
+      "loss": 0.8188,
+      "step": 6426
+    },
+    {
+      "epoch": 0.4471112038679606,
+      "grad_norm": 1.25,
+      "learning_rate": 0.001218156435751572,
+      "loss": 0.7588,
+      "step": 6427
+    },
+    {
+      "epoch": 0.44718077150509583,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0012179365246301347,
+      "loss": 0.8457,
+      "step": 6428
+    },
+    {
+      "epoch": 0.44725033914223106,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0012177166024429787,
+      "loss": 0.9289,
+      "step": 6429
+    },
+    {
+      "epoch": 0.4473199067793662,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0012174966692012712,
+      "loss": 0.6494,
+      "step": 6430
+    },
+    {
+      "epoch": 0.44738947441650145,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0012172767249161796,
+      "loss": 0.8693,
+      "step": 6431
+    },
+    {
+      "epoch": 0.4474590420536366,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0012170567695988703,
+      "loss": 0.8021,
+      "step": 6432
+    },
+    {
+      "epoch": 0.44752860969077185,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0012168368032605128,
+      "loss": 0.7772,
+      "step": 6433
+    },
+    {
+      "epoch": 0.4475981773279071,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.001216616825912275,
+      "loss": 0.9562,
+      "step": 6434
+    },
+    {
+      "epoch": 0.44766774496504225,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.001216396837565327,
+      "loss": 0.9181,
+      "step": 6435
+    },
+    {
+      "epoch": 0.4477373126021775,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.001216176838230838,
+      "loss": 0.7663,
+      "step": 6436
+    },
+    {
+      "epoch": 0.4478068802393127,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.001215956827919979,
+      "loss": 0.7702,
+      "step": 6437
+    },
+    {
+      "epoch": 0.44787644787644787,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0012157368066439207,
+      "loss": 0.7952,
+      "step": 6438
+    },
+    {
+      "epoch": 0.4479460155135831,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0012155167744138345,
+      "loss": 0.5708,
+      "step": 6439
+    },
+    {
+      "epoch": 0.44801558315071827,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0012152967312408932,
+      "loss": 1.1361,
+      "step": 6440
+    },
+    {
+      "epoch": 0.4480851507878535,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0012150766771362688,
+      "loss": 0.9889,
+      "step": 6441
+    },
+    {
+      "epoch": 0.4481547184249887,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0012148566121111348,
+      "loss": 0.617,
+      "step": 6442
+    },
+    {
+      "epoch": 0.4482242860621239,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.001214636536176665,
+      "loss": 1.0036,
+      "step": 6443
+    },
+    {
+      "epoch": 0.4482938536992591,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.001214416449344034,
+      "loss": 0.8777,
+      "step": 6444
+    },
+    {
+      "epoch": 0.4483634213363943,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.001214196351624416,
+      "loss": 0.9089,
+      "step": 6445
+    },
+    {
+      "epoch": 0.4484329889735295,
+      "grad_norm": 1.90625,
+      "learning_rate": 0.0012139762430289872,
+      "loss": 1.0769,
+      "step": 6446
+    },
+    {
+      "epoch": 0.44850255661066474,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0012137561235689234,
+      "loss": 0.8059,
+      "step": 6447
+    },
+    {
+      "epoch": 0.4485721242477999,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0012135359932554006,
+      "loss": 0.6083,
+      "step": 6448
+    },
+    {
+      "epoch": 0.44864169188493513,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.001213315852099597,
+      "loss": 0.6459,
+      "step": 6449
+    },
+    {
+      "epoch": 0.44871125952207036,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.001213095700112689,
+      "loss": 1.041,
+      "step": 6450
+    },
+    {
+      "epoch": 0.44878082715920553,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.001212875537305856,
+      "loss": 0.8277,
+      "step": 6451
+    },
+    {
+      "epoch": 0.44885039479634076,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0012126553636902758,
+      "loss": 0.7094,
+      "step": 6452
+    },
+    {
+      "epoch": 0.4489199624334759,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.001212435179277128,
+      "loss": 0.9776,
+      "step": 6453
+    },
+    {
+      "epoch": 0.44898953007061115,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.0012122149840775932,
+      "loss": 0.9729,
+      "step": 6454
+    },
+    {
+      "epoch": 0.4490590977077464,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0012119947781028503,
+      "loss": 0.7491,
+      "step": 6455
+    },
+    {
+      "epoch": 0.44912866534488155,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0012117745613640816,
+      "loss": 0.8354,
+      "step": 6456
+    },
+    {
+      "epoch": 0.4491982329820168,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.001211554333872468,
+      "loss": 0.9233,
+      "step": 6457
+    },
+    {
+      "epoch": 0.44926780061915195,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0012113340956391916,
+      "loss": 0.6918,
+      "step": 6458
+    },
+    {
+      "epoch": 0.4493373682562872,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.001211113846675435,
+      "loss": 0.7891,
+      "step": 6459
+    },
+    {
+      "epoch": 0.4494069358934224,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0012108935869923813,
+      "loss": 0.679,
+      "step": 6460
+    },
+    {
+      "epoch": 0.44947650353055757,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.0012106733166012144,
+      "loss": 0.6628,
+      "step": 6461
+    },
+    {
+      "epoch": 0.4495460711676928,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0012104530355131183,
+      "loss": 0.7719,
+      "step": 6462
+    },
+    {
+      "epoch": 0.449615638804828,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.001210232743739278,
+      "loss": 0.8797,
+      "step": 6463
+    },
+    {
+      "epoch": 0.4496852064419632,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.001210012441290878,
+      "loss": 1.2471,
+      "step": 6464
+    },
+    {
+      "epoch": 0.4497547740790984,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0012097921281791057,
+      "loss": 0.57,
+      "step": 6465
+    },
+    {
+      "epoch": 0.4498243417162336,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0012095718044151458,
+      "loss": 0.7033,
+      "step": 6466
+    },
+    {
+      "epoch": 0.4498939093533688,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0012093514700101864,
+      "loss": 0.9363,
+      "step": 6467
+    },
+    {
+      "epoch": 0.44996347699050404,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0012091311249754144,
+      "loss": 0.9462,
+      "step": 6468
+    },
+    {
+      "epoch": 0.4500330446276392,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.001208910769322018,
+      "loss": 1.047,
+      "step": 6469
+    },
+    {
+      "epoch": 0.45010261226477444,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0012086904030611859,
+      "loss": 0.7955,
+      "step": 6470
+    },
+    {
+      "epoch": 0.4501721799019096,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0012084700262041067,
+      "loss": 0.7809,
+      "step": 6471
+    },
+    {
+      "epoch": 0.45024174753904483,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0012082496387619706,
+      "loss": 0.9051,
+      "step": 6472
+    },
+    {
+      "epoch": 0.45031131517618006,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0012080292407459672,
+      "loss": 0.6128,
+      "step": 6473
+    },
+    {
+      "epoch": 0.45038088281331523,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.0012078088321672874,
+      "loss": 0.6237,
+      "step": 6474
+    },
+    {
+      "epoch": 0.45045045045045046,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.001207588413037123,
+      "loss": 0.9595,
+      "step": 6475
+    },
+    {
+      "epoch": 0.4505200180875856,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.001207367983366665,
+      "loss": 0.9554,
+      "step": 6476
+    },
+    {
+      "epoch": 0.45058958572472085,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0012071475431671066,
+      "loss": 0.8974,
+      "step": 6477
+    },
+    {
+      "epoch": 0.4506591533618561,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0012069270924496393,
+      "loss": 0.7054,
+      "step": 6478
+    },
+    {
+      "epoch": 0.45072872099899125,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0012067066312254579,
+      "loss": 0.6205,
+      "step": 6479
+    },
+    {
+      "epoch": 0.4507982886361265,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0012064861595057548,
+      "loss": 1.0292,
+      "step": 6480
+    },
+    {
+      "epoch": 0.4508678562732617,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.001206265677301726,
+      "loss": 0.7173,
+      "step": 6481
+    },
+    {
+      "epoch": 0.4509374239103969,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0012060451846245654,
+      "loss": 0.8283,
+      "step": 6482
+    },
+    {
+      "epoch": 0.4510069915475321,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.001205824681485469,
+      "loss": 0.9758,
+      "step": 6483
+    },
+    {
+      "epoch": 0.45107655918466727,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0012056041678956326,
+      "loss": 0.7103,
+      "step": 6484
+    },
+    {
+      "epoch": 0.4511461268218025,
+      "grad_norm": 1.25,
+      "learning_rate": 0.001205383643866253,
+      "loss": 0.904,
+      "step": 6485
+    },
+    {
+      "epoch": 0.4512156944589377,
+      "grad_norm": 0.8203125,
+      "learning_rate": 0.0012051631094085274,
+      "loss": 0.7688,
+      "step": 6486
+    },
+    {
+      "epoch": 0.4512852620960729,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.0012049425645336528,
+      "loss": 0.7455,
+      "step": 6487
+    },
+    {
+      "epoch": 0.4513548297332081,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0012047220092528282,
+      "loss": 0.7169,
+      "step": 6488
+    },
+    {
+      "epoch": 0.4514243973703433,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0012045014435772513,
+      "loss": 0.9323,
+      "step": 6489
+    },
+    {
+      "epoch": 0.4514939650074785,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.001204280867518122,
+      "loss": 0.9188,
+      "step": 6490
+    },
+    {
+      "epoch": 0.45156353264461374,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.0012040602810866401,
+      "loss": 0.9239,
+      "step": 6491
+    },
+    {
+      "epoch": 0.4516331002817489,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0012038396842940055,
+      "loss": 0.9611,
+      "step": 6492
+    },
+    {
+      "epoch": 0.45170266791888414,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0012036190771514195,
+      "loss": 0.9009,
+      "step": 6493
+    },
+    {
+      "epoch": 0.45177223555601936,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0012033984596700827,
+      "loss": 0.6225,
+      "step": 6494
+    },
+    {
+      "epoch": 0.45184180319315453,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0012031778318611977,
+      "loss": 0.8611,
+      "step": 6495
+    },
+    {
+      "epoch": 0.45191137083028976,
+      "grad_norm": 0.77734375,
+      "learning_rate": 0.001202957193735966,
+      "loss": 0.7015,
+      "step": 6496
+    },
+    {
+      "epoch": 0.45198093846742493,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.001202736545305591,
+      "loss": 0.8262,
+      "step": 6497
+    },
+    {
+      "epoch": 0.45205050610456016,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0012025158865812764,
+      "loss": 0.8527,
+      "step": 6498
+    },
+    {
+      "epoch": 0.4521200737416954,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.001202295217574226,
+      "loss": 0.7397,
+      "step": 6499
+    },
+    {
+      "epoch": 0.45218964137883055,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0012020745382956438,
+      "loss": 0.9292,
+      "step": 6500
+    },
+    {
+      "epoch": 0.4522592090159658,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.001201853848756735,
+      "loss": 0.7857,
+      "step": 6501
+    },
+    {
+      "epoch": 0.45232877665310095,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0012016331489687056,
+      "loss": 0.8355,
+      "step": 6502
+    },
+    {
+      "epoch": 0.4523983442902362,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0012014124389427606,
+      "loss": 0.9161,
+      "step": 6503
+    },
+    {
+      "epoch": 0.4524679119273714,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0012011917186901075,
+      "loss": 0.9408,
+      "step": 6504
+    },
+    {
+      "epoch": 0.4525374795645066,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0012009709882219528,
+      "loss": 0.9124,
+      "step": 6505
+    },
+    {
+      "epoch": 0.4526070472016418,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0012007502475495048,
+      "loss": 0.8368,
+      "step": 6506
+    },
+    {
+      "epoch": 0.452676614838777,
+      "grad_norm": 1.578125,
+      "learning_rate": 0.0012005294966839703,
+      "loss": 0.9683,
+      "step": 6507
+    },
+    {
+      "epoch": 0.4527461824759122,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.0012003087356365595,
+      "loss": 0.7981,
+      "step": 6508
+    },
+    {
+      "epoch": 0.4528157501130474,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0012000879644184803,
+      "loss": 0.6341,
+      "step": 6509
+    },
+    {
+      "epoch": 0.4528853177501826,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0011998671830409427,
+      "loss": 0.6034,
+      "step": 6510
+    },
+    {
+      "epoch": 0.4529548853873178,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0011996463915151573,
+      "loss": 0.6165,
+      "step": 6511
+    },
+    {
+      "epoch": 0.45302445302445304,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0011994255898523341,
+      "loss": 0.9816,
+      "step": 6512
+    },
+    {
+      "epoch": 0.4530940206615882,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0011992047780636848,
+      "loss": 0.7594,
+      "step": 6513
+    },
+    {
+      "epoch": 0.45316358829872344,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0011989839561604208,
+      "loss": 0.7464,
+      "step": 6514
+    },
+    {
+      "epoch": 0.4532331559358586,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.0011987631241537546,
+      "loss": 0.883,
+      "step": 6515
+    },
+    {
+      "epoch": 0.45330272357299384,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0011985422820548989,
+      "loss": 0.776,
+      "step": 6516
+    },
+    {
+      "epoch": 0.45337229121012906,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.0011983214298750663,
+      "loss": 0.9282,
+      "step": 6517
+    },
+    {
+      "epoch": 0.45344185884726423,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0011981005676254717,
+      "loss": 0.7438,
+      "step": 6518
+    },
+    {
+      "epoch": 0.45351142648439946,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0011978796953173285,
+      "loss": 0.9153,
+      "step": 6519
+    },
+    {
+      "epoch": 0.4535809941215347,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.001197658812961852,
+      "loss": 0.7996,
+      "step": 6520
+    },
+    {
+      "epoch": 0.45365056175866986,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.001197437920570257,
+      "loss": 0.7393,
+      "step": 6521
+    },
+    {
+      "epoch": 0.4537201293958051,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0011972170181537595,
+      "loss": 0.8687,
+      "step": 6522
+    },
+    {
+      "epoch": 0.45378969703294025,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.001196996105723576,
+      "loss": 0.7682,
+      "step": 6523
+    },
+    {
+      "epoch": 0.4538592646700755,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0011967751832909232,
+      "loss": 0.7974,
+      "step": 6524
+    },
+    {
+      "epoch": 0.4539288323072107,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0011965542508670188,
+      "loss": 1.0709,
+      "step": 6525
+    },
+    {
+      "epoch": 0.4539983999443459,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0011963333084630797,
+      "loss": 0.8347,
+      "step": 6526
+    },
+    {
+      "epoch": 0.4540679675814811,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0011961123560903248,
+      "loss": 0.9493,
+      "step": 6527
+    },
+    {
+      "epoch": 0.4541375352186163,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0011958913937599731,
+      "loss": 0.9302,
+      "step": 6528
+    },
+    {
+      "epoch": 0.4542071028557515,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.001195670421483244,
+      "loss": 0.8061,
+      "step": 6529
+    },
+    {
+      "epoch": 0.4542766704928867,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0011954494392713566,
+      "loss": 0.8252,
+      "step": 6530
+    },
+    {
+      "epoch": 0.4543462381300219,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0011952284471355324,
+      "loss": 0.9535,
+      "step": 6531
+    },
+    {
+      "epoch": 0.4544158057671571,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0011950074450869912,
+      "loss": 0.7242,
+      "step": 6532
+    },
+    {
+      "epoch": 0.45448537340429235,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.001194786433136955,
+      "loss": 1.0569,
+      "step": 6533
+    },
+    {
+      "epoch": 0.4545549410414275,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0011945654112966457,
+      "loss": 0.9092,
+      "step": 6534
+    },
+    {
+      "epoch": 0.45462450867856274,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0011943443795772854,
+      "loss": 0.7104,
+      "step": 6535
+    },
+    {
+      "epoch": 0.4546940763156979,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0011941233379900971,
+      "loss": 0.8083,
+      "step": 6536
+    },
+    {
+      "epoch": 0.45476364395283314,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.001193902286546304,
+      "loss": 0.7365,
+      "step": 6537
+    },
+    {
+      "epoch": 0.45483321158996837,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0011936812252571303,
+      "loss": 0.8173,
+      "step": 6538
+    },
+    {
+      "epoch": 0.45490277922710354,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0011934601541338003,
+      "loss": 0.8049,
+      "step": 6539
+    },
+    {
+      "epoch": 0.45497234686423876,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.0011932390731875385,
+      "loss": 0.983,
+      "step": 6540
+    },
+    {
+      "epoch": 0.45504191450137393,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0011930179824295706,
+      "loss": 0.9033,
+      "step": 6541
+    },
+    {
+      "epoch": 0.45511148213850916,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0011927968818711227,
+      "loss": 0.9472,
+      "step": 6542
+    },
+    {
+      "epoch": 0.4551810497756444,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0011925757715234204,
+      "loss": 0.5104,
+      "step": 6543
+    },
+    {
+      "epoch": 0.45525061741277956,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0011923546513976915,
+      "loss": 1.0062,
+      "step": 6544
+    },
+    {
+      "epoch": 0.4553201850499148,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.001192133521505163,
+      "loss": 0.8641,
+      "step": 6545
+    },
+    {
+      "epoch": 0.45538975268705,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.0011919123818570625,
+      "loss": 0.911,
+      "step": 6546
+    },
+    {
+      "epoch": 0.4554593203241852,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0011916912324646184,
+      "loss": 0.7561,
+      "step": 6547
+    },
+    {
+      "epoch": 0.4555288879613204,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.00119147007333906,
+      "loss": 0.9323,
+      "step": 6548
+    },
+    {
+      "epoch": 0.4555984555984556,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0011912489044916164,
+      "loss": 0.6791,
+      "step": 6549
+    },
+    {
+      "epoch": 0.4556680232355908,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0011910277259335172,
+      "loss": 1.0677,
+      "step": 6550
+    },
+    {
+      "epoch": 0.45573759087272603,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.001190806537675993,
+      "loss": 0.9121,
+      "step": 6551
+    },
+    {
+      "epoch": 0.4558071585098612,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0011905853397302746,
+      "loss": 0.9033,
+      "step": 6552
+    },
+    {
+      "epoch": 0.4558767261469964,
+      "grad_norm": 1.375,
+      "learning_rate": 0.001190364132107593,
+      "loss": 0.9323,
+      "step": 6553
+    },
+    {
+      "epoch": 0.4559462937841316,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0011901429148191806,
+      "loss": 0.6688,
+      "step": 6554
+    },
+    {
+      "epoch": 0.4560158614212668,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0011899216878762692,
+      "loss": 0.7977,
+      "step": 6555
+    },
+    {
+      "epoch": 0.45608542905840205,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.001189700451290092,
+      "loss": 0.8965,
+      "step": 6556
+    },
+    {
+      "epoch": 0.4561549966955372,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0011894792050718818,
+      "loss": 0.7538,
+      "step": 6557
+    },
+    {
+      "epoch": 0.45622456433267244,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.0011892579492328728,
+      "loss": 0.8227,
+      "step": 6558
+    },
+    {
+      "epoch": 0.45629413196980767,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.001189036683784299,
+      "loss": 1.0174,
+      "step": 6559
+    },
+    {
+      "epoch": 0.45636369960694284,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.001188815408737395,
+      "loss": 0.7564,
+      "step": 6560
+    },
+    {
+      "epoch": 0.45643326724407807,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0011885941241033967,
+      "loss": 0.72,
+      "step": 6561
+    },
+    {
+      "epoch": 0.45650283488121324,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.001188372829893539,
+      "loss": 0.7294,
+      "step": 6562
+    },
+    {
+      "epoch": 0.45657240251834846,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0011881515261190586,
+      "loss": 0.8049,
+      "step": 6563
+    },
+    {
+      "epoch": 0.4566419701554837,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.001187930212791192,
+      "loss": 0.8503,
+      "step": 6564
+    },
+    {
+      "epoch": 0.45671153779261886,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0011877088899211762,
+      "loss": 0.7304,
+      "step": 6565
+    },
+    {
+      "epoch": 0.4567811054297541,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0011874875575202495,
+      "loss": 0.6712,
+      "step": 6566
+    },
+    {
+      "epoch": 0.45685067306688926,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0011872662155996494,
+      "loss": 0.7354,
+      "step": 6567
+    },
+    {
+      "epoch": 0.4569202407040245,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0011870448641706148,
+      "loss": 0.8842,
+      "step": 6568
+    },
+    {
+      "epoch": 0.4569898083411597,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0011868235032443848,
+      "loss": 0.8922,
+      "step": 6569
+    },
+    {
+      "epoch": 0.4570593759782949,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.001186602132832199,
+      "loss": 0.9433,
+      "step": 6570
+    },
+    {
+      "epoch": 0.4571289436154301,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0011863807529452974,
+      "loss": 1.005,
+      "step": 6571
+    },
+    {
+      "epoch": 0.45719851125256533,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0011861593635949207,
+      "loss": 0.7681,
+      "step": 6572
+    },
+    {
+      "epoch": 0.4572680788897005,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0011859379647923096,
+      "loss": 0.8305,
+      "step": 6573
+    },
+    {
+      "epoch": 0.45733764652683573,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.001185716556548706,
+      "loss": 1.1047,
+      "step": 6574
+    },
+    {
+      "epoch": 0.4574072141639709,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.001185495138875352,
+      "loss": 0.7204,
+      "step": 6575
+    },
+    {
+      "epoch": 0.4574767818011061,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0011852737117834893,
+      "loss": 0.8898,
+      "step": 6576
+    },
+    {
+      "epoch": 0.45754634943824135,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0011850522752843615,
+      "loss": 0.945,
+      "step": 6577
+    },
+    {
+      "epoch": 0.4576159170753765,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.001184830829389212,
+      "loss": 0.9251,
+      "step": 6578
+    },
+    {
+      "epoch": 0.45768548471251175,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0011846093741092847,
+      "loss": 0.874,
+      "step": 6579
+    },
+    {
+      "epoch": 0.4577550523496469,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0011843879094558239,
+      "loss": 0.9574,
+      "step": 6580
+    },
+    {
+      "epoch": 0.45782461998678214,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0011841664354400741,
+      "loss": 0.9693,
+      "step": 6581
+    },
+    {
+      "epoch": 0.45789418762391737,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0011839449520732812,
+      "loss": 0.9989,
+      "step": 6582
+    },
+    {
+      "epoch": 0.45796375526105254,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0011837234593666908,
+      "loss": 0.7398,
+      "step": 6583
+    },
+    {
+      "epoch": 0.45803332289818777,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0011835019573315493,
+      "loss": 0.7772,
+      "step": 6584
+    },
+    {
+      "epoch": 0.458102890535323,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0011832804459791031,
+      "loss": 0.7726,
+      "step": 6585
+    },
+    {
+      "epoch": 0.45817245817245816,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0011830589253205997,
+      "loss": 0.9433,
+      "step": 6586
+    },
+    {
+      "epoch": 0.4582420258095934,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0011828373953672868,
+      "loss": 0.765,
+      "step": 6587
+    },
+    {
+      "epoch": 0.45831159344672856,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0011826158561304126,
+      "loss": 0.7674,
+      "step": 6588
+    },
+    {
+      "epoch": 0.4583811610838638,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0011823943076212256,
+      "loss": 1.1343,
+      "step": 6589
+    },
+    {
+      "epoch": 0.458450728720999,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.001182172749850975,
+      "loss": 1.2282,
+      "step": 6590
+    },
+    {
+      "epoch": 0.4585202963581342,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0011819511828309102,
+      "loss": 0.9132,
+      "step": 6591
+    },
+    {
+      "epoch": 0.4585898639952694,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0011817296065722816,
+      "loss": 0.9758,
+      "step": 6592
+    },
+    {
+      "epoch": 0.4586594316324046,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0011815080210863397,
+      "loss": 0.8898,
+      "step": 6593
+    },
+    {
+      "epoch": 0.4587289992695398,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0011812864263843353,
+      "loss": 0.6673,
+      "step": 6594
+    },
+    {
+      "epoch": 0.45879856690667503,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0011810648224775198,
+      "loss": 0.727,
+      "step": 6595
+    },
+    {
+      "epoch": 0.4588681345438102,
+      "grad_norm": 0.8203125,
+      "learning_rate": 0.0011808432093771454,
+      "loss": 0.7471,
+      "step": 6596
+    },
+    {
+      "epoch": 0.45893770218094543,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0011806215870944642,
+      "loss": 0.7002,
+      "step": 6597
+    },
+    {
+      "epoch": 0.45900726981808065,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0011803999556407293,
+      "loss": 0.8018,
+      "step": 6598
+    },
+    {
+      "epoch": 0.4590768374552158,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.0011801783150271934,
+      "loss": 0.6078,
+      "step": 6599
+    },
+    {
+      "epoch": 0.45914640509235105,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0011799566652651117,
+      "loss": 0.8592,
+      "step": 6600
+    },
+    {
+      "epoch": 0.4592159727294862,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.001179735006365737,
+      "loss": 0.8635,
+      "step": 6601
+    },
+    {
+      "epoch": 0.45928554036662145,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.001179513338340325,
+      "loss": 0.7542,
+      "step": 6602
+    },
+    {
+      "epoch": 0.4593551080037567,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0011792916612001303,
+      "loss": 0.7602,
+      "step": 6603
+    },
+    {
+      "epoch": 0.45942467564089184,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0011790699749564086,
+      "loss": 0.7673,
+      "step": 6604
+    },
+    {
+      "epoch": 0.45949424327802707,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0011788482796204164,
+      "loss": 0.6943,
+      "step": 6605
+    },
+    {
+      "epoch": 0.45956381091516224,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0011786265752034098,
+      "loss": 0.7012,
+      "step": 6606
+    },
+    {
+      "epoch": 0.45963337855229747,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0011784048617166463,
+      "loss": 0.9853,
+      "step": 6607
+    },
+    {
+      "epoch": 0.4597029461894327,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.001178183139171383,
+      "loss": 0.7788,
+      "step": 6608
+    },
+    {
+      "epoch": 0.45977251382656786,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0011779614075788781,
+      "loss": 0.5682,
+      "step": 6609
+    },
+    {
+      "epoch": 0.4598420814637031,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0011777396669503898,
+      "loss": 0.6482,
+      "step": 6610
+    },
+    {
+      "epoch": 0.4599116491008383,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0011775179172971771,
+      "loss": 0.7693,
+      "step": 6611
+    },
+    {
+      "epoch": 0.4599812167379735,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0011772961586304993,
+      "loss": 0.7675,
+      "step": 6612
+    },
+    {
+      "epoch": 0.4600507843751087,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0011770743909616161,
+      "loss": 0.7296,
+      "step": 6613
+    },
+    {
+      "epoch": 0.4601203520122439,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0011768526143017882,
+      "loss": 0.9234,
+      "step": 6614
+    },
+    {
+      "epoch": 0.4601899196493791,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0011766308286622756,
+      "loss": 0.8889,
+      "step": 6615
+    },
+    {
+      "epoch": 0.46025948728651433,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.00117640903405434,
+      "loss": 0.756,
+      "step": 6616
+    },
+    {
+      "epoch": 0.4603290549236495,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0011761872304892427,
+      "loss": 0.6366,
+      "step": 6617
+    },
+    {
+      "epoch": 0.46039862256078473,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.001175965417978246,
+      "loss": 1.0019,
+      "step": 6618
+    },
+    {
+      "epoch": 0.4604681901979199,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0011757435965326123,
+      "loss": 0.7696,
+      "step": 6619
+    },
+    {
+      "epoch": 0.46053775783505513,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0011755217661636047,
+      "loss": 0.7882,
+      "step": 6620
+    },
+    {
+      "epoch": 0.46060732547219035,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0011752999268824862,
+      "loss": 0.8832,
+      "step": 6621
+    },
+    {
+      "epoch": 0.4606768931093255,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.001175078078700521,
+      "loss": 1.199,
+      "step": 6622
+    },
+    {
+      "epoch": 0.46074646074646075,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0011748562216289738,
+      "loss": 0.9617,
+      "step": 6623
+    },
+    {
+      "epoch": 0.460816028383596,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0011746343556791085,
+      "loss": 0.8008,
+      "step": 6624
+    },
+    {
+      "epoch": 0.46088559602073115,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.001174412480862191,
+      "loss": 0.6703,
+      "step": 6625
+    },
+    {
+      "epoch": 0.4609551636578664,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0011741905971894872,
+      "loss": 0.9244,
+      "step": 6626
+    },
+    {
+      "epoch": 0.46102473129500154,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0011739687046722627,
+      "loss": 0.6949,
+      "step": 6627
+    },
+    {
+      "epoch": 0.46109429893213677,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.001173746803321784,
+      "loss": 0.998,
+      "step": 6628
+    },
+    {
+      "epoch": 0.461163866569272,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0011735248931493184,
+      "loss": 0.9241,
+      "step": 6629
+    },
+    {
+      "epoch": 0.46123343420640717,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0011733029741661336,
+      "loss": 1.089,
+      "step": 6630
+    },
+    {
+      "epoch": 0.4613030018435424,
+      "grad_norm": 1.515625,
+      "learning_rate": 0.0011730810463834972,
+      "loss": 0.7909,
+      "step": 6631
+    },
+    {
+      "epoch": 0.46137256948067756,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0011728591098126775,
+      "loss": 0.6703,
+      "step": 6632
+    },
+    {
+      "epoch": 0.4614421371178128,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0011726371644649436,
+      "loss": 0.8211,
+      "step": 6633
+    },
+    {
+      "epoch": 0.461511704754948,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0011724152103515647,
+      "loss": 0.9068,
+      "step": 6634
+    },
+    {
+      "epoch": 0.4615812723920832,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0011721932474838103,
+      "loss": 0.813,
+      "step": 6635
+    },
+    {
+      "epoch": 0.4616508400292184,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0011719712758729505,
+      "loss": 0.7287,
+      "step": 6636
+    },
+    {
+      "epoch": 0.46172040766635364,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0011717492955302569,
+      "loss": 0.9651,
+      "step": 6637
+    },
+    {
+      "epoch": 0.4617899753034888,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0011715273064669988,
+      "loss": 0.7242,
+      "step": 6638
+    },
+    {
+      "epoch": 0.46185954294062403,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0011713053086944494,
+      "loss": 1.2099,
+      "step": 6639
+    },
+    {
+      "epoch": 0.4619291105777592,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0011710833022238797,
+      "loss": 0.6752,
+      "step": 6640
+    },
+    {
+      "epoch": 0.46199867821489443,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.001170861287066562,
+      "loss": 0.9003,
+      "step": 6641
+    },
+    {
+      "epoch": 0.46206824585202966,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0011706392632337694,
+      "loss": 0.8056,
+      "step": 6642
+    },
+    {
+      "epoch": 0.46213781348916483,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0011704172307367754,
+      "loss": 0.697,
+      "step": 6643
+    },
+    {
+      "epoch": 0.46220738112630005,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.001170195189586853,
+      "loss": 0.7523,
+      "step": 6644
+    },
+    {
+      "epoch": 0.4622769487634352,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0011699731397952766,
+      "loss": 1.0126,
+      "step": 6645
+    },
+    {
+      "epoch": 0.46234651640057045,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0011697510813733214,
+      "loss": 0.869,
+      "step": 6646
+    },
+    {
+      "epoch": 0.4624160840377057,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0011695290143322616,
+      "loss": 0.8879,
+      "step": 6647
+    },
+    {
+      "epoch": 0.46248565167484085,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.001169306938683373,
+      "loss": 0.8482,
+      "step": 6648
+    },
+    {
+      "epoch": 0.4625552193119761,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0011690848544379316,
+      "loss": 0.6539,
+      "step": 6649
+    },
+    {
+      "epoch": 0.4626247869491113,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0011688627616072132,
+      "loss": 0.732,
+      "step": 6650
+    },
+    {
+      "epoch": 0.46269435458624647,
+      "grad_norm": 1.4765625,
+      "learning_rate": 0.001168640660202495,
+      "loss": 1.0946,
+      "step": 6651
+    },
+    {
+      "epoch": 0.4627639222233817,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.001168418550235054,
+      "loss": 0.8689,
+      "step": 6652
+    },
+    {
+      "epoch": 0.46283348986051687,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0011681964317161685,
+      "loss": 0.9217,
+      "step": 6653
+    },
+    {
+      "epoch": 0.4629030574976521,
+      "grad_norm": 1.78125,
+      "learning_rate": 0.001167974304657115,
+      "loss": 0.93,
+      "step": 6654
+    },
+    {
+      "epoch": 0.4629726251347873,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.001167752169069174,
+      "loss": 0.8417,
+      "step": 6655
+    },
+    {
+      "epoch": 0.4630421927719225,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0011675300249636227,
+      "loss": 0.8598,
+      "step": 6656
+    },
+    {
+      "epoch": 0.4631117604090577,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0011673078723517414,
+      "loss": 0.9799,
+      "step": 6657
+    },
+    {
+      "epoch": 0.4631813280461929,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0011670857112448094,
+      "loss": 0.8178,
+      "step": 6658
+    },
+    {
+      "epoch": 0.4632508956833281,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0011668635416541072,
+      "loss": 0.9696,
+      "step": 6659
+    },
+    {
+      "epoch": 0.46332046332046334,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0011666413635909156,
+      "loss": 0.8456,
+      "step": 6660
+    },
+    {
+      "epoch": 0.4633900309575985,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.0011664191770665154,
+      "loss": 0.6578,
+      "step": 6661
+    },
+    {
+      "epoch": 0.46345959859473373,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0011661969820921884,
+      "loss": 0.742,
+      "step": 6662
+    },
+    {
+      "epoch": 0.46352916623186896,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0011659747786792161,
+      "loss": 0.8278,
+      "step": 6663
+    },
+    {
+      "epoch": 0.46359873386900413,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0011657525668388813,
+      "loss": 0.7629,
+      "step": 6664
+    },
+    {
+      "epoch": 0.46366830150613936,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0011655303465824664,
+      "loss": 0.637,
+      "step": 6665
+    },
+    {
+      "epoch": 0.46373786914327453,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0011653081179212549,
+      "loss": 0.7785,
+      "step": 6666
+    },
+    {
+      "epoch": 0.46380743678040975,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0011650858808665303,
+      "loss": 0.9898,
+      "step": 6667
+    },
+    {
+      "epoch": 0.463877004417545,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0011648636354295767,
+      "loss": 0.7137,
+      "step": 6668
+    },
+    {
+      "epoch": 0.46394657205468015,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.0011646413816216792,
+      "loss": 0.5219,
+      "step": 6669
+    },
+    {
+      "epoch": 0.4640161396918154,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.0011644191194541216,
+      "loss": 0.6708,
+      "step": 6670
+    },
+    {
+      "epoch": 0.46408570732895055,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0011641968489381903,
+      "loss": 0.7185,
+      "step": 6671
+    },
+    {
+      "epoch": 0.4641552749660858,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0011639745700851702,
+      "loss": 0.755,
+      "step": 6672
+    },
+    {
+      "epoch": 0.464224842603221,
+      "grad_norm": 1.5703125,
+      "learning_rate": 0.001163752282906348,
+      "loss": 0.7932,
+      "step": 6673
+    },
+    {
+      "epoch": 0.46429441024035617,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.0011635299874130107,
+      "loss": 0.9895,
+      "step": 6674
+    },
+    {
+      "epoch": 0.4643639778774914,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0011633076836164444,
+      "loss": 1.0151,
+      "step": 6675
+    },
+    {
+      "epoch": 0.4644335455146266,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0011630853715279374,
+      "loss": 1.0777,
+      "step": 6676
+    },
+    {
+      "epoch": 0.4645031131517618,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0011628630511587767,
+      "loss": 0.8239,
+      "step": 6677
+    },
+    {
+      "epoch": 0.464572680788897,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.001162640722520252,
+      "loss": 0.9758,
+      "step": 6678
+    },
+    {
+      "epoch": 0.4646422484260322,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0011624183856236505,
+      "loss": 1.0418,
+      "step": 6679
+    },
+    {
+      "epoch": 0.4647118160631674,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0011621960404802623,
+      "loss": 0.8334,
+      "step": 6680
+    },
+    {
+      "epoch": 0.46478138370030264,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0011619736871013766,
+      "loss": 0.764,
+      "step": 6681
+    },
+    {
+      "epoch": 0.4648509513374378,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.0011617513254982834,
+      "loss": 1.0208,
+      "step": 6682
+    },
+    {
+      "epoch": 0.46492051897457304,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0011615289556822735,
+      "loss": 0.8085,
+      "step": 6683
+    },
+    {
+      "epoch": 0.4649900866117082,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.001161306577664637,
+      "loss": 0.8879,
+      "step": 6684
+    },
+    {
+      "epoch": 0.46505965424884343,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0011610841914566658,
+      "loss": 0.9558,
+      "step": 6685
+    },
+    {
+      "epoch": 0.46512922188597866,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.0011608617970696512,
+      "loss": 0.6572,
+      "step": 6686
+    },
+    {
+      "epoch": 0.46519878952311383,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0011606393945148854,
+      "loss": 1.0288,
+      "step": 6687
+    },
+    {
+      "epoch": 0.46526835716024906,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0011604169838036608,
+      "loss": 0.9335,
+      "step": 6688
+    },
+    {
+      "epoch": 0.4653379247973843,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00116019456494727,
+      "loss": 0.8959,
+      "step": 6689
+    },
+    {
+      "epoch": 0.46540749243451945,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.0011599721379570071,
+      "loss": 0.6395,
+      "step": 6690
+    },
+    {
+      "epoch": 0.4654770600716547,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.001159749702844165,
+      "loss": 1.0481,
+      "step": 6691
+    },
+    {
+      "epoch": 0.46554662770878985,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0011595272596200386,
+      "loss": 0.792,
+      "step": 6692
+    },
+    {
+      "epoch": 0.4656161953459251,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0011593048082959216,
+      "loss": 0.714,
+      "step": 6693
+    },
+    {
+      "epoch": 0.4656857629830603,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00115908234888311,
+      "loss": 0.845,
+      "step": 6694
+    },
+    {
+      "epoch": 0.4657553306201955,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0011588598813928978,
+      "loss": 0.6841,
+      "step": 6695
+    },
+    {
+      "epoch": 0.4658248982573307,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.001158637405836582,
+      "loss": 0.7866,
+      "step": 6696
+    },
+    {
+      "epoch": 0.46589446589446587,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0011584149222254583,
+      "loss": 0.7629,
+      "step": 6697
+    },
+    {
+      "epoch": 0.4659640335316011,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0011581924305708229,
+      "loss": 0.6811,
+      "step": 6698
+    },
+    {
+      "epoch": 0.4660336011687363,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0011579699308839739,
+      "loss": 0.9732,
+      "step": 6699
+    },
+    {
+      "epoch": 0.4661031688058715,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0011577474231762076,
+      "loss": 0.7477,
+      "step": 6700
+    },
+    {
+      "epoch": 0.4661727364430067,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0011575249074588223,
+      "loss": 0.8375,
+      "step": 6701
+    },
+    {
+      "epoch": 0.46624230408014194,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0011573023837431163,
+      "loss": 0.6758,
+      "step": 6702
+    },
+    {
+      "epoch": 0.4663118717172771,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0011570798520403878,
+      "loss": 0.6657,
+      "step": 6703
+    },
+    {
+      "epoch": 0.46638143935441234,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0011568573123619367,
+      "loss": 0.7631,
+      "step": 6704
+    },
+    {
+      "epoch": 0.4664510069915475,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0011566347647190614,
+      "loss": 0.5423,
+      "step": 6705
+    },
+    {
+      "epoch": 0.46652057462868274,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0011564122091230627,
+      "loss": 0.7526,
+      "step": 6706
+    },
+    {
+      "epoch": 0.46659014226581796,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.00115618964558524,
+      "loss": 1.0364,
+      "step": 6707
+    },
+    {
+      "epoch": 0.46665970990295313,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0011559670741168946,
+      "loss": 0.9639,
+      "step": 6708
+    },
+    {
+      "epoch": 0.46672927754008836,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.001155744494729327,
+      "loss": 0.8695,
+      "step": 6709
+    },
+    {
+      "epoch": 0.46679884517722353,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0011555219074338393,
+      "loss": 0.9251,
+      "step": 6710
+    },
+    {
+      "epoch": 0.46686841281435876,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.001155299312241733,
+      "loss": 0.8074,
+      "step": 6711
+    },
+    {
+      "epoch": 0.466937980451494,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.00115507670916431,
+      "loss": 0.8851,
+      "step": 6712
+    },
+    {
+      "epoch": 0.46700754808862915,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.001154854098212874,
+      "loss": 0.7742,
+      "step": 6713
+    },
+    {
+      "epoch": 0.4670771157257644,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0011546314793987268,
+      "loss": 0.7902,
+      "step": 6714
+    },
+    {
+      "epoch": 0.4671466833628996,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.001154408852733173,
+      "loss": 1.073,
+      "step": 6715
+    },
+    {
+      "epoch": 0.4672162510000348,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0011541862182275155,
+      "loss": 0.844,
+      "step": 6716
+    },
+    {
+      "epoch": 0.46728581863717,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0011539635758930592,
+      "loss": 0.6413,
+      "step": 6717
+    },
+    {
+      "epoch": 0.4673553862743052,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0011537409257411084,
+      "loss": 0.8827,
+      "step": 6718
+    },
+    {
+      "epoch": 0.4674249539114404,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0011535182677829684,
+      "loss": 0.7699,
+      "step": 6719
+    },
+    {
+      "epoch": 0.4674945215485756,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0011532956020299447,
+      "loss": 0.8396,
+      "step": 6720
+    },
+    {
+      "epoch": 0.4675640891857108,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0011530729284933428,
+      "loss": 0.8124,
+      "step": 6721
+    },
+    {
+      "epoch": 0.467633656822846,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0011528502471844693,
+      "loss": 0.6586,
+      "step": 6722
+    },
+    {
+      "epoch": 0.4677032244599812,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0011526275581146303,
+      "loss": 1.1657,
+      "step": 6723
+    },
+    {
+      "epoch": 0.4677727920971164,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0011524048612951336,
+      "loss": 0.7377,
+      "step": 6724
+    },
+    {
+      "epoch": 0.46784235973425164,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0011521821567372862,
+      "loss": 0.7954,
+      "step": 6725
+    },
+    {
+      "epoch": 0.4679119273713868,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0011519594444523956,
+      "loss": 0.8808,
+      "step": 6726
+    },
+    {
+      "epoch": 0.46798149500852204,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.001151736724451771,
+      "loss": 0.5918,
+      "step": 6727
+    },
+    {
+      "epoch": 0.46805106264565727,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0011515139967467195,
+      "loss": 0.9816,
+      "step": 6728
+    },
+    {
+      "epoch": 0.46812063028279244,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0011512912613485516,
+      "loss": 0.96,
+      "step": 6729
+    },
+    {
+      "epoch": 0.46819019791992766,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0011510685182685755,
+      "loss": 1.0691,
+      "step": 6730
+    },
+    {
+      "epoch": 0.46825976555706283,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.001150845767518102,
+      "loss": 0.7501,
+      "step": 6731
+    },
+    {
+      "epoch": 0.46832933319419806,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.0011506230091084403,
+      "loss": 0.6844,
+      "step": 6732
+    },
+    {
+      "epoch": 0.4683989008313333,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.0011504002430509014,
+      "loss": 0.7496,
+      "step": 6733
+    },
+    {
+      "epoch": 0.46846846846846846,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0011501774693567968,
+      "loss": 0.9753,
+      "step": 6734
+    },
+    {
+      "epoch": 0.4685380361056037,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0011499546880374366,
+      "loss": 0.9054,
+      "step": 6735
+    },
+    {
+      "epoch": 0.46860760374273885,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0011497318991041336,
+      "loss": 0.7376,
+      "step": 6736
+    },
+    {
+      "epoch": 0.4686771713798741,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.001149509102568199,
+      "loss": 0.6246,
+      "step": 6737
+    },
+    {
+      "epoch": 0.4687467390170093,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0011492862984409464,
+      "loss": 0.8417,
+      "step": 6738
+    },
+    {
+      "epoch": 0.4688163066541445,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0011490634867336875,
+      "loss": 1.0912,
+      "step": 6739
+    },
+    {
+      "epoch": 0.4688858742912797,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0011488406674577364,
+      "loss": 0.8998,
+      "step": 6740
+    },
+    {
+      "epoch": 0.46895544192841493,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.001148617840624406,
+      "loss": 1.0196,
+      "step": 6741
+    },
+    {
+      "epoch": 0.4690250095655501,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0011483950062450112,
+      "loss": 0.981,
+      "step": 6742
+    },
+    {
+      "epoch": 0.4690945772026853,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.001148172164330866,
+      "loss": 0.7669,
+      "step": 6743
+    },
+    {
+      "epoch": 0.4691641448398205,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0011479493148932847,
+      "loss": 0.8624,
+      "step": 6744
+    },
+    {
+      "epoch": 0.4692337124769557,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0011477264579435834,
+      "loss": 0.6625,
+      "step": 6745
+    },
+    {
+      "epoch": 0.46930328011409095,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0011475035934930768,
+      "loss": 0.9669,
+      "step": 6746
+    },
+    {
+      "epoch": 0.4693728477512261,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0011472807215530813,
+      "loss": 0.7936,
+      "step": 6747
+    },
+    {
+      "epoch": 0.46944241538836134,
+      "grad_norm": 1.0,
+      "learning_rate": 0.001147057842134913,
+      "loss": 0.7275,
+      "step": 6748
+    },
+    {
+      "epoch": 0.4695119830254965,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0011468349552498887,
+      "loss": 0.7721,
+      "step": 6749
+    },
+    {
+      "epoch": 0.46958155066263174,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0011466120609093257,
+      "loss": 0.9905,
+      "step": 6750
+    },
+    {
+      "epoch": 0.46965111829976697,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.001146389159124541,
+      "loss": 0.9612,
+      "step": 6751
+    },
+    {
+      "epoch": 0.46972068593690214,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0011461662499068527,
+      "loss": 0.8758,
+      "step": 6752
+    },
+    {
+      "epoch": 0.46979025357403736,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.001145943333267579,
+      "loss": 0.7701,
+      "step": 6753
+    },
+    {
+      "epoch": 0.4698598212111726,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0011457204092180384,
+      "loss": 0.8275,
+      "step": 6754
+    },
+    {
+      "epoch": 0.46992938884830776,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00114549747776955,
+      "loss": 0.6869,
+      "step": 6755
+    },
+    {
+      "epoch": 0.469998956485443,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.001145274538933433,
+      "loss": 0.848,
+      "step": 6756
+    },
+    {
+      "epoch": 0.47006852412257816,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0011450515927210073,
+      "loss": 0.8047,
+      "step": 6757
+    },
+    {
+      "epoch": 0.4701380917597134,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0011448286391435925,
+      "loss": 0.7052,
+      "step": 6758
+    },
+    {
+      "epoch": 0.4702076593968486,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0011446056782125097,
+      "loss": 0.7012,
+      "step": 6759
+    },
+    {
+      "epoch": 0.4702772270339838,
+      "grad_norm": 1.5,
+      "learning_rate": 0.0011443827099390793,
+      "loss": 0.8022,
+      "step": 6760
+    },
+    {
+      "epoch": 0.470346794671119,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.001144159734334623,
+      "loss": 0.8481,
+      "step": 6761
+    },
+    {
+      "epoch": 0.4704163623082542,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0011439367514104613,
+      "loss": 0.6478,
+      "step": 6762
+    },
+    {
+      "epoch": 0.4704859299453894,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0011437137611779171,
+      "loss": 0.6574,
+      "step": 6763
+    },
+    {
+      "epoch": 0.47055549758252463,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0011434907636483126,
+      "loss": 0.8813,
+      "step": 6764
+    },
+    {
+      "epoch": 0.4706250652196598,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0011432677588329703,
+      "loss": 0.8534,
+      "step": 6765
+    },
+    {
+      "epoch": 0.470694632856795,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0011430447467432137,
+      "loss": 1.1378,
+      "step": 6766
+    },
+    {
+      "epoch": 0.4707642004939302,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0011428217273903654,
+      "loss": 0.7197,
+      "step": 6767
+    },
+    {
+      "epoch": 0.4708337681310654,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0011425987007857498,
+      "loss": 1.1085,
+      "step": 6768
+    },
+    {
+      "epoch": 0.47090333576820065,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0011423756669406908,
+      "loss": 0.6024,
+      "step": 6769
+    },
+    {
+      "epoch": 0.4709729034053358,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0011421526258665131,
+      "loss": 0.6622,
+      "step": 6770
+    },
+    {
+      "epoch": 0.47104247104247104,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0011419295775745417,
+      "loss": 0.8794,
+      "step": 6771
+    },
+    {
+      "epoch": 0.47111203867960627,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.001141706522076102,
+      "loss": 0.9826,
+      "step": 6772
+    },
+    {
+      "epoch": 0.47118160631674144,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0011414834593825188,
+      "loss": 0.9709,
+      "step": 6773
+    },
+    {
+      "epoch": 0.47125117395387667,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.001141260389505119,
+      "loss": 0.937,
+      "step": 6774
+    },
+    {
+      "epoch": 0.47132074159101184,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0011410373124552287,
+      "loss": 0.962,
+      "step": 6775
+    },
+    {
+      "epoch": 0.47139030922814706,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.001140814228244174,
+      "loss": 0.797,
+      "step": 6776
+    },
+    {
+      "epoch": 0.4714598768652823,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0011405911368832832,
+      "loss": 0.8643,
+      "step": 6777
+    },
+    {
+      "epoch": 0.47152944450241746,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0011403680383838828,
+      "loss": 0.7165,
+      "step": 6778
+    },
+    {
+      "epoch": 0.4715990121395527,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0011401449327573007,
+      "loss": 0.9037,
+      "step": 6779
+    },
+    {
+      "epoch": 0.47166857977668786,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0011399218200148658,
+      "loss": 0.726,
+      "step": 6780
+    },
+    {
+      "epoch": 0.4717381474138231,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.0011396987001679058,
+      "loss": 1.0062,
+      "step": 6781
+    },
+    {
+      "epoch": 0.4718077150509583,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0011394755732277502,
+      "loss": 0.9328,
+      "step": 6782
+    },
+    {
+      "epoch": 0.4718772826880935,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0011392524392057277,
+      "loss": 0.6841,
+      "step": 6783
+    },
+    {
+      "epoch": 0.4719468503252287,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0011390292981131682,
+      "loss": 0.7211,
+      "step": 6784
+    },
+    {
+      "epoch": 0.47201641796236393,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.001138806149961402,
+      "loss": 0.8626,
+      "step": 6785
+    },
+    {
+      "epoch": 0.4720859855994991,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.001138582994761759,
+      "loss": 0.5986,
+      "step": 6786
+    },
+    {
+      "epoch": 0.47215555323663433,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.00113835983252557,
+      "loss": 1.2783,
+      "step": 6787
+    },
+    {
+      "epoch": 0.4722251208737695,
+      "grad_norm": 1.5546875,
+      "learning_rate": 0.0011381366632641661,
+      "loss": 1.1105,
+      "step": 6788
+    },
+    {
+      "epoch": 0.4722946885109047,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0011379134869888789,
+      "loss": 0.8647,
+      "step": 6789
+    },
+    {
+      "epoch": 0.47236425614803995,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0011376903037110396,
+      "loss": 0.8945,
+      "step": 6790
+    },
+    {
+      "epoch": 0.4724338237851751,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0011374671134419807,
+      "loss": 0.6122,
+      "step": 6791
+    },
+    {
+      "epoch": 0.47250339142231035,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.001137243916193035,
+      "loss": 0.8256,
+      "step": 6792
+    },
+    {
+      "epoch": 0.4725729590594455,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0011370207119755346,
+      "loss": 0.7419,
+      "step": 6793
+    },
+    {
+      "epoch": 0.47264252669658074,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0011367975008008133,
+      "loss": 0.6636,
+      "step": 6794
+    },
+    {
+      "epoch": 0.47271209433371597,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.0011365742826802046,
+      "loss": 0.5432,
+      "step": 6795
+    },
+    {
+      "epoch": 0.47278166197085114,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.001136351057625042,
+      "loss": 0.9745,
+      "step": 6796
+    },
+    {
+      "epoch": 0.47285122960798637,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.00113612782564666,
+      "loss": 0.9833,
+      "step": 6797
+    },
+    {
+      "epoch": 0.4729207972451216,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0011359045867563933,
+      "loss": 0.8484,
+      "step": 6798
+    },
+    {
+      "epoch": 0.47299036488225676,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0011356813409655764,
+      "loss": 0.7557,
+      "step": 6799
+    },
+    {
+      "epoch": 0.473059932519392,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0011354580882855449,
+      "loss": 0.949,
+      "step": 6800
+    },
+    {
+      "epoch": 0.47312950015652716,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0011352348287276346,
+      "loss": 0.8126,
+      "step": 6801
+    },
+    {
+      "epoch": 0.4731990677936624,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0011350115623031815,
+      "loss": 0.6654,
+      "step": 6802
+    },
+    {
+      "epoch": 0.4732686354307976,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.0011347882890235216,
+      "loss": 0.7928,
+      "step": 6803
+    },
+    {
+      "epoch": 0.4733382030679328,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0011345650088999918,
+      "loss": 0.8891,
+      "step": 6804
+    },
+    {
+      "epoch": 0.473407770705068,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0011343417219439292,
+      "loss": 0.6199,
+      "step": 6805
+    },
+    {
+      "epoch": 0.4734773383422032,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0011341184281666705,
+      "loss": 0.6123,
+      "step": 6806
+    },
+    {
+      "epoch": 0.4735469059793384,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0011338951275795546,
+      "loss": 0.9712,
+      "step": 6807
+    },
+    {
+      "epoch": 0.47361647361647363,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0011336718201939186,
+      "loss": 0.5731,
+      "step": 6808
+    },
+    {
+      "epoch": 0.4736860412536088,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0011334485060211018,
+      "loss": 0.932,
+      "step": 6809
+    },
+    {
+      "epoch": 0.47375560889074403,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0011332251850724423,
+      "loss": 1.0456,
+      "step": 6810
+    },
+    {
+      "epoch": 0.47382517652787925,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0011330018573592793,
+      "loss": 0.8988,
+      "step": 6811
+    },
+    {
+      "epoch": 0.4738947441650144,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.0011327785228929525,
+      "loss": 0.8061,
+      "step": 6812
+    },
+    {
+      "epoch": 0.47396431180214965,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0011325551816848015,
+      "loss": 0.7523,
+      "step": 6813
+    },
+    {
+      "epoch": 0.4740338794392848,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0011323318337461666,
+      "loss": 0.9292,
+      "step": 6814
+    },
+    {
+      "epoch": 0.47410344707642005,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.001132108479088388,
+      "loss": 0.6064,
+      "step": 6815
+    },
+    {
+      "epoch": 0.4741730147135553,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.001131885117722807,
+      "loss": 0.7639,
+      "step": 6816
+    },
+    {
+      "epoch": 0.47424258235069044,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0011316617496607642,
+      "loss": 0.783,
+      "step": 6817
+    },
+    {
+      "epoch": 0.47431214998782567,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.0011314383749136015,
+      "loss": 0.6997,
+      "step": 6818
+    },
+    {
+      "epoch": 0.47438171762496084,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0011312149934926605,
+      "loss": 0.6055,
+      "step": 6819
+    },
+    {
+      "epoch": 0.47445128526209607,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0011309916054092835,
+      "loss": 0.9023,
+      "step": 6820
+    },
+    {
+      "epoch": 0.4745208528992313,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0011307682106748132,
+      "loss": 0.8706,
+      "step": 6821
+    },
+    {
+      "epoch": 0.47459042053636646,
+      "grad_norm": 1.125,
+      "learning_rate": 0.001130544809300592,
+      "loss": 0.8186,
+      "step": 6822
+    },
+    {
+      "epoch": 0.4746599881735017,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0011303214012979637,
+      "loss": 0.9062,
+      "step": 6823
+    },
+    {
+      "epoch": 0.4747295558106369,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0011300979866782715,
+      "loss": 0.8621,
+      "step": 6824
+    },
+    {
+      "epoch": 0.4747991234477721,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0011298745654528591,
+      "loss": 0.9045,
+      "step": 6825
+    },
+    {
+      "epoch": 0.4748686910849073,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.001129651137633071,
+      "loss": 0.9506,
+      "step": 6826
+    },
+    {
+      "epoch": 0.4749382587220425,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0011294277032302513,
+      "loss": 0.8287,
+      "step": 6827
+    },
+    {
+      "epoch": 0.4750078263591777,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0011292042622557457,
+      "loss": 1.0259,
+      "step": 6828
+    },
+    {
+      "epoch": 0.47507739399631294,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0011289808147208987,
+      "loss": 0.895,
+      "step": 6829
+    },
+    {
+      "epoch": 0.4751469616334481,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0011287573606370558,
+      "loss": 0.9821,
+      "step": 6830
+    },
+    {
+      "epoch": 0.47521652927058333,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0011285339000155635,
+      "loss": 0.8931,
+      "step": 6831
+    },
+    {
+      "epoch": 0.4752860969077185,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0011283104328677674,
+      "loss": 0.815,
+      "step": 6832
+    },
+    {
+      "epoch": 0.47535566454485373,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.001128086959205014,
+      "loss": 0.7367,
+      "step": 6833
+    },
+    {
+      "epoch": 0.47542523218198895,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0011278634790386508,
+      "loss": 0.8959,
+      "step": 6834
+    },
+    {
+      "epoch": 0.4754947998191241,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0011276399923800245,
+      "loss": 0.9691,
+      "step": 6835
+    },
+    {
+      "epoch": 0.47556436745625935,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0011274164992404827,
+      "loss": 0.734,
+      "step": 6836
+    },
+    {
+      "epoch": 0.4756339350933946,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0011271929996313735,
+      "loss": 1.028,
+      "step": 6837
+    },
+    {
+      "epoch": 0.47570350273052975,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0011269694935640447,
+      "loss": 0.7971,
+      "step": 6838
+    },
+    {
+      "epoch": 0.475773070367665,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0011267459810498448,
+      "loss": 0.7243,
+      "step": 6839
+    },
+    {
+      "epoch": 0.47584263800480014,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0011265224621001232,
+      "loss": 0.8573,
+      "step": 6840
+    },
+    {
+      "epoch": 0.47591220564193537,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0011262989367262285,
+      "loss": 0.782,
+      "step": 6841
+    },
+    {
+      "epoch": 0.4759817732790706,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0011260754049395103,
+      "loss": 0.9528,
+      "step": 6842
+    },
+    {
+      "epoch": 0.47605134091620577,
+      "grad_norm": 1.5,
+      "learning_rate": 0.0011258518667513187,
+      "loss": 1.1033,
+      "step": 6843
+    },
+    {
+      "epoch": 0.476120908553341,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0011256283221730036,
+      "loss": 0.894,
+      "step": 6844
+    },
+    {
+      "epoch": 0.47619047619047616,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0011254047712159156,
+      "loss": 0.7271,
+      "step": 6845
+    },
+    {
+      "epoch": 0.4762600438276114,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0011251812138914053,
+      "loss": 0.905,
+      "step": 6846
+    },
+    {
+      "epoch": 0.4763296114647466,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0011249576502108238,
+      "loss": 0.7876,
+      "step": 6847
+    },
+    {
+      "epoch": 0.4763991791018818,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0011247340801855228,
+      "loss": 0.838,
+      "step": 6848
+    },
+    {
+      "epoch": 0.476468746739017,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.001124510503826854,
+      "loss": 0.8328,
+      "step": 6849
+    },
+    {
+      "epoch": 0.47653831437615224,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.001124286921146169,
+      "loss": 0.9428,
+      "step": 6850
+    },
+    {
+      "epoch": 0.4766078820132874,
+      "grad_norm": 1.25,
+      "learning_rate": 0.001124063332154821,
+      "loss": 1.0431,
+      "step": 6851
+    },
+    {
+      "epoch": 0.47667744965042264,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.001123839736864162,
+      "loss": 0.607,
+      "step": 6852
+    },
+    {
+      "epoch": 0.4767470172875578,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0011236161352855456,
+      "loss": 0.7733,
+      "step": 6853
+    },
+    {
+      "epoch": 0.47681658492469303,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0011233925274303249,
+      "loss": 0.7836,
+      "step": 6854
+    },
+    {
+      "epoch": 0.47688615256182826,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0011231689133098537,
+      "loss": 0.9309,
+      "step": 6855
+    },
+    {
+      "epoch": 0.47695572019896343,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0011229452929354857,
+      "loss": 0.9256,
+      "step": 6856
+    },
+    {
+      "epoch": 0.47702528783609865,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0011227216663185755,
+      "loss": 0.7528,
+      "step": 6857
+    },
+    {
+      "epoch": 0.4770948554732338,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0011224980334704777,
+      "loss": 0.586,
+      "step": 6858
+    },
+    {
+      "epoch": 0.47716442311036905,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.001122274394402547,
+      "loss": 0.821,
+      "step": 6859
+    },
+    {
+      "epoch": 0.4772339907475043,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.001122050749126139,
+      "loss": 1.0315,
+      "step": 6860
+    },
+    {
+      "epoch": 0.47730355838463945,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0011218270976526092,
+      "loss": 0.8908,
+      "step": 6861
+    },
+    {
+      "epoch": 0.4773731260217747,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0011216034399933134,
+      "loss": 0.8264,
+      "step": 6862
+    },
+    {
+      "epoch": 0.4774426936589099,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0011213797761596078,
+      "loss": 0.8753,
+      "step": 6863
+    },
+    {
+      "epoch": 0.47751226129604507,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.001121156106162849,
+      "loss": 1.1162,
+      "step": 6864
+    },
+    {
+      "epoch": 0.4775818289331803,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0011209324300143937,
+      "loss": 0.6004,
+      "step": 6865
+    },
+    {
+      "epoch": 0.47765139657031547,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0011207087477255993,
+      "loss": 0.7431,
+      "step": 6866
+    },
+    {
+      "epoch": 0.4777209642074507,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.001120485059307823,
+      "loss": 1.0819,
+      "step": 6867
+    },
+    {
+      "epoch": 0.4777905318445859,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0011202613647724228,
+      "loss": 1.0026,
+      "step": 6868
+    },
+    {
+      "epoch": 0.4778600994817211,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0011200376641307564,
+      "loss": 0.8103,
+      "step": 6869
+    },
+    {
+      "epoch": 0.4779296671188563,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0011198139573941827,
+      "loss": 0.8203,
+      "step": 6870
+    },
+    {
+      "epoch": 0.4779992347559915,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.00111959024457406,
+      "loss": 0.9181,
+      "step": 6871
+    },
+    {
+      "epoch": 0.4780688023931267,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0011193665256817476,
+      "loss": 0.8363,
+      "step": 6872
+    },
+    {
+      "epoch": 0.47813837003026194,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.0011191428007286046,
+      "loss": 0.7834,
+      "step": 6873
+    },
+    {
+      "epoch": 0.4782079376673971,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0011189190697259907,
+      "loss": 0.9514,
+      "step": 6874
+    },
+    {
+      "epoch": 0.47827750530453234,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.001118695332685266,
+      "loss": 1.0461,
+      "step": 6875
+    },
+    {
+      "epoch": 0.47834707294166756,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0011184715896177901,
+      "loss": 0.9652,
+      "step": 6876
+    },
+    {
+      "epoch": 0.47841664057880273,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.0011182478405349246,
+      "loss": 0.688,
+      "step": 6877
+    },
+    {
+      "epoch": 0.47848620821593796,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0011180240854480295,
+      "loss": 0.5792,
+      "step": 6878
+    },
+    {
+      "epoch": 0.47855577585307313,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0011178003243684663,
+      "loss": 0.8728,
+      "step": 6879
+    },
+    {
+      "epoch": 0.47862534349020835,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0011175765573075962,
+      "loss": 0.7156,
+      "step": 6880
+    },
+    {
+      "epoch": 0.4786949111273436,
+      "grad_norm": 0.79296875,
+      "learning_rate": 0.0011173527842767812,
+      "loss": 0.7903,
+      "step": 6881
+    },
+    {
+      "epoch": 0.47876447876447875,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0011171290052873835,
+      "loss": 0.734,
+      "step": 6882
+    },
+    {
+      "epoch": 0.478834046401614,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0011169052203507653,
+      "loss": 0.8546,
+      "step": 6883
+    },
+    {
+      "epoch": 0.47890361403874915,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.001116681429478289,
+      "loss": 0.7843,
+      "step": 6884
+    },
+    {
+      "epoch": 0.4789731816758844,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.001116457632681318,
+      "loss": 0.8304,
+      "step": 6885
+    },
+    {
+      "epoch": 0.4790427493130196,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0011162338299712153,
+      "loss": 0.6309,
+      "step": 6886
+    },
+    {
+      "epoch": 0.47911231695015477,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0011160100213593448,
+      "loss": 1.0741,
+      "step": 6887
+    },
+    {
+      "epoch": 0.47918188458729,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0011157862068570698,
+      "loss": 0.9332,
+      "step": 6888
+    },
+    {
+      "epoch": 0.4792514522244252,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0011155623864757551,
+      "loss": 0.8916,
+      "step": 6889
+    },
+    {
+      "epoch": 0.4793210198615604,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0011153385602267647,
+      "loss": 0.98,
+      "step": 6890
+    },
+    {
+      "epoch": 0.4793905874986956,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.0011151147281214637,
+      "loss": 1.0965,
+      "step": 6891
+    },
+    {
+      "epoch": 0.4794601551358308,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0011148908901712172,
+      "loss": 0.7172,
+      "step": 6892
+    },
+    {
+      "epoch": 0.479529722772966,
+      "grad_norm": 1.25,
+      "learning_rate": 0.00111466704638739,
+      "loss": 0.6087,
+      "step": 6893
+    },
+    {
+      "epoch": 0.47959929041010124,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0011144431967813485,
+      "loss": 0.8725,
+      "step": 6894
+    },
+    {
+      "epoch": 0.4796688580472364,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0011142193413644576,
+      "loss": 0.6584,
+      "step": 6895
+    },
+    {
+      "epoch": 0.47973842568437164,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0011139954801480851,
+      "loss": 1.0956,
+      "step": 6896
+    },
+    {
+      "epoch": 0.4798079933215068,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0011137716131435964,
+      "loss": 0.6688,
+      "step": 6897
+    },
+    {
+      "epoch": 0.47987756095864204,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0011135477403623585,
+      "loss": 0.9466,
+      "step": 6898
+    },
+    {
+      "epoch": 0.47994712859577726,
+      "grad_norm": 1.25,
+      "learning_rate": 0.001113323861815739,
+      "loss": 0.6891,
+      "step": 6899
+    },
+    {
+      "epoch": 0.48001669623291243,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0011130999775151047,
+      "loss": 0.705,
+      "step": 6900
+    },
+    {
+      "epoch": 0.48008626387004766,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0011128760874718237,
+      "loss": 0.8579,
+      "step": 6901
+    },
+    {
+      "epoch": 0.4801558315071829,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0011126521916972637,
+      "loss": 0.8625,
+      "step": 6902
+    },
+    {
+      "epoch": 0.48022539914431805,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0011124282902027938,
+      "loss": 0.9706,
+      "step": 6903
+    },
+    {
+      "epoch": 0.4802949667814533,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0011122043829997815,
+      "loss": 0.706,
+      "step": 6904
+    },
+    {
+      "epoch": 0.48036453441858845,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0011119804700995964,
+      "loss": 1.1041,
+      "step": 6905
+    },
+    {
+      "epoch": 0.4804341020557237,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0011117565515136071,
+      "loss": 0.6875,
+      "step": 6906
+    },
+    {
+      "epoch": 0.4805036696928589,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0011115326272531838,
+      "loss": 0.8057,
+      "step": 6907
+    },
+    {
+      "epoch": 0.4805732373299941,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0011113086973296958,
+      "loss": 0.8843,
+      "step": 6908
+    },
+    {
+      "epoch": 0.4806428049671293,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0011110847617545128,
+      "loss": 0.6403,
+      "step": 6909
+    },
+    {
+      "epoch": 0.48071237260426447,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.001110860820539006,
+      "loss": 0.7008,
+      "step": 6910
+    },
+    {
+      "epoch": 0.4807819402413997,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0011106368736945452,
+      "loss": 0.6692,
+      "step": 6911
+    },
+    {
+      "epoch": 0.4808515078785349,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.001110412921232502,
+      "loss": 1.0763,
+      "step": 6912
+    },
+    {
+      "epoch": 0.4809210755156701,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.001110188963164247,
+      "loss": 0.75,
+      "step": 6913
+    },
+    {
+      "epoch": 0.4809906431528053,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0011099649995011515,
+      "loss": 0.8156,
+      "step": 6914
+    },
+    {
+      "epoch": 0.48106021078994055,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0011097410302545881,
+      "loss": 0.8426,
+      "step": 6915
+    },
+    {
+      "epoch": 0.4811297784270757,
+      "grad_norm": 0.875,
+      "learning_rate": 0.001109517055435928,
+      "loss": 0.7825,
+      "step": 6916
+    },
+    {
+      "epoch": 0.48119934606421094,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.001109293075056544,
+      "loss": 0.8922,
+      "step": 6917
+    },
+    {
+      "epoch": 0.4812689137013461,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.001109069089127808,
+      "loss": 0.9127,
+      "step": 6918
+    },
+    {
+      "epoch": 0.48133848133848134,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0011088450976610943,
+      "loss": 0.7225,
+      "step": 6919
+    },
+    {
+      "epoch": 0.48140804897561656,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.0011086211006677744,
+      "loss": 0.8415,
+      "step": 6920
+    },
+    {
+      "epoch": 0.48147761661275174,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.0011083970981592228,
+      "loss": 1.0372,
+      "step": 6921
+    },
+    {
+      "epoch": 0.48154718424988696,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.001108173090146813,
+      "loss": 0.9047,
+      "step": 6922
+    },
+    {
+      "epoch": 0.48161675188702213,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.001107949076641919,
+      "loss": 0.791,
+      "step": 6923
+    },
+    {
+      "epoch": 0.48168631952415736,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0011077250576559145,
+      "loss": 0.8391,
+      "step": 6924
+    },
+    {
+      "epoch": 0.4817558871612926,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.001107501033200175,
+      "loss": 0.5883,
+      "step": 6925
+    },
+    {
+      "epoch": 0.48182545479842775,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0011072770032860748,
+      "loss": 0.8007,
+      "step": 6926
+    },
+    {
+      "epoch": 0.481895022435563,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.0011070529679249887,
+      "loss": 0.6742,
+      "step": 6927
+    },
+    {
+      "epoch": 0.4819645900726982,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0011068289271282932,
+      "loss": 0.8524,
+      "step": 6928
+    },
+    {
+      "epoch": 0.4820341577098334,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0011066048809073629,
+      "loss": 0.9279,
+      "step": 6929
+    },
+    {
+      "epoch": 0.4821037253469686,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.001106380829273574,
+      "loss": 0.7533,
+      "step": 6930
+    },
+    {
+      "epoch": 0.4821732929841038,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0011061567722383029,
+      "loss": 1.0112,
+      "step": 6931
+    },
+    {
+      "epoch": 0.482242860621239,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0011059327098129255,
+      "loss": 0.7589,
+      "step": 6932
+    },
+    {
+      "epoch": 0.4823124282583742,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0011057086420088195,
+      "loss": 0.7081,
+      "step": 6933
+    },
+    {
+      "epoch": 0.4823819958955094,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0011054845688373614,
+      "loss": 0.7104,
+      "step": 6934
+    },
+    {
+      "epoch": 0.4824515635326446,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0011052604903099286,
+      "loss": 0.9116,
+      "step": 6935
+    },
+    {
+      "epoch": 0.4825211311697798,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0011050364064378985,
+      "loss": 1.0161,
+      "step": 6936
+    },
+    {
+      "epoch": 0.482590698806915,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0011048123172326494,
+      "loss": 0.6631,
+      "step": 6937
+    },
+    {
+      "epoch": 0.48266026644405025,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.001104588222705559,
+      "loss": 0.8084,
+      "step": 6938
+    },
+    {
+      "epoch": 0.4827298340811854,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0011043641228680055,
+      "loss": 0.7227,
+      "step": 6939
+    },
+    {
+      "epoch": 0.48279940171832064,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0011041400177313682,
+      "loss": 0.9966,
+      "step": 6940
+    },
+    {
+      "epoch": 0.48286896935545587,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0011039159073070258,
+      "loss": 0.9344,
+      "step": 6941
+    },
+    {
+      "epoch": 0.48293853699259104,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0011036917916063572,
+      "loss": 0.6069,
+      "step": 6942
+    },
+    {
+      "epoch": 0.48300810462972626,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0011034676706407423,
+      "loss": 1.0173,
+      "step": 6943
+    },
+    {
+      "epoch": 0.48307767226686144,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0011032435444215602,
+      "loss": 0.8079,
+      "step": 6944
+    },
+    {
+      "epoch": 0.48314723990399666,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0011030194129601917,
+      "loss": 0.899,
+      "step": 6945
+    },
+    {
+      "epoch": 0.4832168075411319,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0011027952762680162,
+      "loss": 0.8863,
+      "step": 6946
+    },
+    {
+      "epoch": 0.48328637517826706,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.001102571134356415,
+      "loss": 0.8912,
+      "step": 6947
+    },
+    {
+      "epoch": 0.4833559428154023,
+      "grad_norm": 0.8203125,
+      "learning_rate": 0.0011023469872367686,
+      "loss": 0.6625,
+      "step": 6948
+    },
+    {
+      "epoch": 0.48342551045253745,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0011021228349204582,
+      "loss": 0.9037,
+      "step": 6949
+    },
+    {
+      "epoch": 0.4834950780896727,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0011018986774188645,
+      "loss": 0.8848,
+      "step": 6950
+    },
+    {
+      "epoch": 0.4835646457268079,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0011016745147433703,
+      "loss": 0.7961,
+      "step": 6951
+    },
+    {
+      "epoch": 0.4836342133639431,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0011014503469053563,
+      "loss": 0.9383,
+      "step": 6952
+    },
+    {
+      "epoch": 0.4837037810010783,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0011012261739162049,
+      "loss": 0.7179,
+      "step": 6953
+    },
+    {
+      "epoch": 0.48377334863821353,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0011010019957872989,
+      "loss": 1.0111,
+      "step": 6954
+    },
+    {
+      "epoch": 0.4838429162753487,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00110077781253002,
+      "loss": 0.8226,
+      "step": 6955
+    },
+    {
+      "epoch": 0.4839124839124839,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0011005536241557525,
+      "loss": 0.8878,
+      "step": 6956
+    },
+    {
+      "epoch": 0.4839820515496191,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0011003294306758781,
+      "loss": 1.0296,
+      "step": 6957
+    },
+    {
+      "epoch": 0.4840516191867543,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0011001052321017817,
+      "loss": 0.8059,
+      "step": 6958
+    },
+    {
+      "epoch": 0.48412118682388955,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.001099881028444846,
+      "loss": 0.8771,
+      "step": 6959
+    },
+    {
+      "epoch": 0.4841907544610247,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0010996568197164547,
+      "loss": 1.0343,
+      "step": 6960
+    },
+    {
+      "epoch": 0.48426032209815995,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0010994326059279927,
+      "loss": 0.6113,
+      "step": 6961
+    },
+    {
+      "epoch": 0.4843298897352951,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0010992083870908437,
+      "loss": 0.8849,
+      "step": 6962
+    },
+    {
+      "epoch": 0.48439945737243034,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0010989841632163934,
+      "loss": 0.8569,
+      "step": 6963
+    },
+    {
+      "epoch": 0.48446902500956557,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.001098759934316026,
+      "loss": 0.8583,
+      "step": 6964
+    },
+    {
+      "epoch": 0.48453859264670074,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0010985357004011272,
+      "loss": 0.8247,
+      "step": 6965
+    },
+    {
+      "epoch": 0.48460816028383596,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0010983114614830816,
+      "loss": 0.636,
+      "step": 6966
+    },
+    {
+      "epoch": 0.4846777279209712,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0010980872175732762,
+      "loss": 0.7528,
+      "step": 6967
+    },
+    {
+      "epoch": 0.48474729555810636,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.0010978629686830958,
+      "loss": 0.874,
+      "step": 6968
+    },
+    {
+      "epoch": 0.4848168631952416,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.001097638714823927,
+      "loss": 1.0633,
+      "step": 6969
+    },
+    {
+      "epoch": 0.48488643083237676,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0010974144560071568,
+      "loss": 0.7056,
+      "step": 6970
+    },
+    {
+      "epoch": 0.484955998469512,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0010971901922441712,
+      "loss": 0.8113,
+      "step": 6971
+    },
+    {
+      "epoch": 0.4850255661066472,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.001096965923546358,
+      "loss": 0.9971,
+      "step": 6972
+    },
+    {
+      "epoch": 0.4850951337437824,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0010967416499251034,
+      "loss": 0.9382,
+      "step": 6973
+    },
+    {
+      "epoch": 0.4851647013809176,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0010965173713917958,
+      "loss": 0.6524,
+      "step": 6974
+    },
+    {
+      "epoch": 0.4852342690180528,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0010962930879578226,
+      "loss": 1.0498,
+      "step": 6975
+    },
+    {
+      "epoch": 0.485303836655188,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0010960687996345712,
+      "loss": 1.0655,
+      "step": 6976
+    },
+    {
+      "epoch": 0.48537340429232323,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.0010958445064334311,
+      "loss": 0.8055,
+      "step": 6977
+    },
+    {
+      "epoch": 0.4854429719294584,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.0010956202083657893,
+      "loss": 0.9072,
+      "step": 6978
+    },
+    {
+      "epoch": 0.4855125395665936,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.001095395905443036,
+      "loss": 0.8098,
+      "step": 6979
+    },
+    {
+      "epoch": 0.48558210720372885,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0010951715976765589,
+      "loss": 0.7313,
+      "step": 6980
+    },
+    {
+      "epoch": 0.485651674840864,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0010949472850777483,
+      "loss": 0.8856,
+      "step": 6981
+    },
+    {
+      "epoch": 0.48572124247799925,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0010947229676579926,
+      "loss": 0.7672,
+      "step": 6982
+    },
+    {
+      "epoch": 0.4857908101151344,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0010944986454286822,
+      "loss": 0.69,
+      "step": 6983
+    },
+    {
+      "epoch": 0.48586037775226965,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0010942743184012072,
+      "loss": 0.9117,
+      "step": 6984
+    },
+    {
+      "epoch": 0.48592994538940487,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.001094049986586957,
+      "loss": 0.9458,
+      "step": 6985
+    },
+    {
+      "epoch": 0.48599951302654004,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0010938256499973232,
+      "loss": 0.9272,
+      "step": 6986
+    },
+    {
+      "epoch": 0.48606908066367527,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.001093601308643695,
+      "loss": 0.8044,
+      "step": 6987
+    },
+    {
+      "epoch": 0.48613864830081044,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.001093376962537465,
+      "loss": 0.6457,
+      "step": 6988
+    },
+    {
+      "epoch": 0.48620821593794566,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0010931526116900229,
+      "loss": 0.827,
+      "step": 6989
+    },
+    {
+      "epoch": 0.4862777835750809,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0010929282561127607,
+      "loss": 0.9153,
+      "step": 6990
+    },
+    {
+      "epoch": 0.48634735121221606,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0010927038958170703,
+      "loss": 0.8547,
+      "step": 6991
+    },
+    {
+      "epoch": 0.4864169188493513,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0010924795308143432,
+      "loss": 0.9355,
+      "step": 6992
+    },
+    {
+      "epoch": 0.4864864864864865,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.0010922551611159716,
+      "loss": 0.6463,
+      "step": 6993
+    },
+    {
+      "epoch": 0.4865560541236217,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0010920307867333479,
+      "loss": 0.8102,
+      "step": 6994
+    },
+    {
+      "epoch": 0.4866256217607569,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.001091806407677865,
+      "loss": 0.5896,
+      "step": 6995
+    },
+    {
+      "epoch": 0.4866951893978921,
+      "grad_norm": 1.125,
+      "learning_rate": 0.001091582023960915,
+      "loss": 0.8311,
+      "step": 6996
+    },
+    {
+      "epoch": 0.4867647570350273,
+      "grad_norm": 1.25,
+      "learning_rate": 0.001091357635593892,
+      "loss": 1.1586,
+      "step": 6997
+    },
+    {
+      "epoch": 0.48683432467216253,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0010911332425881885,
+      "loss": 0.8111,
+      "step": 6998
+    },
+    {
+      "epoch": 0.4869038923092977,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.001090908844955198,
+      "loss": 0.9535,
+      "step": 6999
+    },
+    {
+      "epoch": 0.48697345994643293,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.001090684442706315,
+      "loss": 0.806,
+      "step": 7000
+    },
+    {
+      "epoch": 0.4870430275835681,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0010904600358529327,
+      "loss": 0.7193,
+      "step": 7001
+    },
+    {
+      "epoch": 0.4871125952207033,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0010902356244064462,
+      "loss": 1.1411,
+      "step": 7002
+    },
+    {
+      "epoch": 0.48718216285783855,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.001090011208378249,
+      "loss": 0.7308,
+      "step": 7003
+    },
+    {
+      "epoch": 0.4872517304949737,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0010897867877797368,
+      "loss": 0.6471,
+      "step": 7004
+    },
+    {
+      "epoch": 0.48732129813210895,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0010895623626223034,
+      "loss": 0.7874,
+      "step": 7005
+    },
+    {
+      "epoch": 0.4873908657692442,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0010893379329173453,
+      "loss": 0.7763,
+      "step": 7006
+    },
+    {
+      "epoch": 0.48746043340637935,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0010891134986762572,
+      "loss": 0.9469,
+      "step": 7007
+    },
+    {
+      "epoch": 0.48753000104351457,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0010888890599104345,
+      "loss": 0.896,
+      "step": 7008
+    },
+    {
+      "epoch": 0.48759956868064974,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0010886646166312736,
+      "loss": 0.9828,
+      "step": 7009
+    },
+    {
+      "epoch": 0.48766913631778497,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0010884401688501702,
+      "loss": 0.6623,
+      "step": 7010
+    },
+    {
+      "epoch": 0.4877387039549202,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.001088215716578521,
+      "loss": 0.6458,
+      "step": 7011
+    },
+    {
+      "epoch": 0.48780827159205536,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.001087991259827722,
+      "loss": 0.9172,
+      "step": 7012
+    },
+    {
+      "epoch": 0.4878778392291906,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0010877667986091705,
+      "loss": 0.8783,
+      "step": 7013
+    },
+    {
+      "epoch": 0.48794740686632576,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0010875423329342634,
+      "loss": 0.5569,
+      "step": 7014
+    },
+    {
+      "epoch": 0.488016974503461,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.001087317862814398,
+      "loss": 0.7734,
+      "step": 7015
+    },
+    {
+      "epoch": 0.4880865421405962,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0010870933882609717,
+      "loss": 0.9205,
+      "step": 7016
+    },
+    {
+      "epoch": 0.4881561097777314,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0010868689092853817,
+      "loss": 0.9377,
+      "step": 7017
+    },
+    {
+      "epoch": 0.4882256774148666,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0010866444258990269,
+      "loss": 0.9297,
+      "step": 7018
+    },
+    {
+      "epoch": 0.48829524505200184,
+      "grad_norm": 1.7265625,
+      "learning_rate": 0.0010864199381133044,
+      "loss": 0.9733,
+      "step": 7019
+    },
+    {
+      "epoch": 0.488364812689137,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0010861954459396132,
+      "loss": 0.798,
+      "step": 7020
+    },
+    {
+      "epoch": 0.48843438032627223,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.0010859709493893518,
+      "loss": 0.9854,
+      "step": 7021
+    },
+    {
+      "epoch": 0.4885039479634074,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0010857464484739189,
+      "loss": 0.8518,
+      "step": 7022
+    },
+    {
+      "epoch": 0.48857351560054263,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0010855219432047137,
+      "loss": 0.9385,
+      "step": 7023
+    },
+    {
+      "epoch": 0.48864308323767786,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0010852974335931347,
+      "loss": 0.8966,
+      "step": 7024
+    },
+    {
+      "epoch": 0.488712650874813,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0010850729196505825,
+      "loss": 0.7525,
+      "step": 7025
+    },
+    {
+      "epoch": 0.48878221851194825,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.001084848401388456,
+      "loss": 0.7952,
+      "step": 7026
+    },
+    {
+      "epoch": 0.4888517861490834,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0010846238788181552,
+      "loss": 0.7759,
+      "step": 7027
+    },
+    {
+      "epoch": 0.48892135378621865,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0010843993519510807,
+      "loss": 0.9408,
+      "step": 7028
+    },
+    {
+      "epoch": 0.4889909214233539,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0010841748207986324,
+      "loss": 0.6235,
+      "step": 7029
+    },
+    {
+      "epoch": 0.48906048906048905,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.001083950285372211,
+      "loss": 1.0157,
+      "step": 7030
+    },
+    {
+      "epoch": 0.48913005669762427,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.0010837257456832172,
+      "loss": 0.9633,
+      "step": 7031
+    },
+    {
+      "epoch": 0.4891996243347595,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0010835012017430521,
+      "loss": 0.7513,
+      "step": 7032
+    },
+    {
+      "epoch": 0.48926919197189467,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0010832766535631166,
+      "loss": 0.9007,
+      "step": 7033
+    },
+    {
+      "epoch": 0.4893387596090299,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.001083052101154813,
+      "loss": 0.7177,
+      "step": 7034
+    },
+    {
+      "epoch": 0.48940832724616506,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0010828275445295414,
+      "loss": 0.8104,
+      "step": 7035
+    },
+    {
+      "epoch": 0.4894778948833003,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0010826029836987052,
+      "loss": 0.8287,
+      "step": 7036
+    },
+    {
+      "epoch": 0.4895474625204355,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.0010823784186737059,
+      "loss": 1.0064,
+      "step": 7037
+    },
+    {
+      "epoch": 0.4896170301575707,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.0010821538494659453,
+      "loss": 0.6387,
+      "step": 7038
+    },
+    {
+      "epoch": 0.4896865977947059,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.001081929276086827,
+      "loss": 0.8335,
+      "step": 7039
+    },
+    {
+      "epoch": 0.4897561654318411,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0010817046985477522,
+      "loss": 0.8973,
+      "step": 7040
+    },
+    {
+      "epoch": 0.4898257330689763,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0010814801168601252,
+      "loss": 0.957,
+      "step": 7041
+    },
+    {
+      "epoch": 0.48989530070611154,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.001081255531035348,
+      "loss": 0.7339,
+      "step": 7042
+    },
+    {
+      "epoch": 0.4899648683432467,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0010810309410848248,
+      "loss": 0.8312,
+      "step": 7043
+    },
+    {
+      "epoch": 0.49003443598038193,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.001080806347019959,
+      "loss": 0.7359,
+      "step": 7044
+    },
+    {
+      "epoch": 0.4901040036175171,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.001080581748852154,
+      "loss": 0.9715,
+      "step": 7045
+    },
+    {
+      "epoch": 0.49017357125465233,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0010803571465928142,
+      "loss": 0.7558,
+      "step": 7046
+    },
+    {
+      "epoch": 0.49024313889178756,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0010801325402533433,
+      "loss": 0.8256,
+      "step": 7047
+    },
+    {
+      "epoch": 0.4903127065289227,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.001079907929845146,
+      "loss": 0.9817,
+      "step": 7048
+    },
+    {
+      "epoch": 0.49038227416605795,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0010796833153796266,
+      "loss": 1.0321,
+      "step": 7049
+    },
+    {
+      "epoch": 0.4904518418031932,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00107945869686819,
+      "loss": 0.6911,
+      "step": 7050
+    },
+    {
+      "epoch": 0.49052140944032835,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0010792340743222418,
+      "loss": 1.0358,
+      "step": 7051
+    },
+    {
+      "epoch": 0.4905909770774636,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0010790094477531862,
+      "loss": 0.7645,
+      "step": 7052
+    },
+    {
+      "epoch": 0.49066054471459875,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0010787848171724293,
+      "loss": 0.8814,
+      "step": 7053
+    },
+    {
+      "epoch": 0.49073011235173397,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0010785601825913764,
+      "loss": 0.9904,
+      "step": 7054
+    },
+    {
+      "epoch": 0.4907996799888692,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0010783355440214335,
+      "loss": 0.9773,
+      "step": 7055
+    },
+    {
+      "epoch": 0.49086924762600437,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0010781109014740063,
+      "loss": 0.7507,
+      "step": 7056
+    },
+    {
+      "epoch": 0.4909388152631396,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0010778862549605016,
+      "loss": 0.8323,
+      "step": 7057
+    },
+    {
+      "epoch": 0.49100838290027476,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.001077661604492325,
+      "loss": 0.9502,
+      "step": 7058
+    },
+    {
+      "epoch": 0.49107795053741,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0010774369500808837,
+      "loss": 0.8934,
+      "step": 7059
+    },
+    {
+      "epoch": 0.4911475181745452,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0010772122917375845,
+      "loss": 0.6439,
+      "step": 7060
+    },
+    {
+      "epoch": 0.4912170858116804,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0010769876294738343,
+      "loss": 0.9425,
+      "step": 7061
+    },
+    {
+      "epoch": 0.4912866534488156,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0010767629633010407,
+      "loss": 0.7707,
+      "step": 7062
+    },
+    {
+      "epoch": 0.49135622108595084,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.00107653829323061,
+      "loss": 0.6929,
+      "step": 7063
+    },
+    {
+      "epoch": 0.491425788723086,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0010763136192739509,
+      "loss": 0.8058,
+      "step": 7064
+    },
+    {
+      "epoch": 0.49149535636022124,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0010760889414424709,
+      "loss": 0.8957,
+      "step": 7065
+    },
+    {
+      "epoch": 0.4915649239973564,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0010758642597475778,
+      "loss": 0.738,
+      "step": 7066
+    },
+    {
+      "epoch": 0.49163449163449163,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0010756395742006803,
+      "loss": 0.6443,
+      "step": 7067
+    },
+    {
+      "epoch": 0.49170405927162686,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0010754148848131863,
+      "loss": 0.7924,
+      "step": 7068
+    },
+    {
+      "epoch": 0.49177362690876203,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0010751901915965045,
+      "loss": 0.7784,
+      "step": 7069
+    },
+    {
+      "epoch": 0.49184319454589726,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0010749654945620437,
+      "loss": 0.95,
+      "step": 7070
+    },
+    {
+      "epoch": 0.4919127621830324,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0010747407937212133,
+      "loss": 0.7802,
+      "step": 7071
+    },
+    {
+      "epoch": 0.49198232982016765,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0010745160890854215,
+      "loss": 0.7512,
+      "step": 7072
+    },
+    {
+      "epoch": 0.4920518974573029,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0010742913806660787,
+      "loss": 0.9372,
+      "step": 7073
+    },
+    {
+      "epoch": 0.49212146509443805,
+      "grad_norm": 0.71484375,
+      "learning_rate": 0.001074066668474594,
+      "loss": 0.677,
+      "step": 7074
+    },
+    {
+      "epoch": 0.4921910327315733,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0010738419525223772,
+      "loss": 0.9019,
+      "step": 7075
+    },
+    {
+      "epoch": 0.4922606003687085,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.0010736172328208381,
+      "loss": 0.8827,
+      "step": 7076
+    },
+    {
+      "epoch": 0.49233016800584367,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0010733925093813872,
+      "loss": 0.7767,
+      "step": 7077
+    },
+    {
+      "epoch": 0.4923997356429789,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0010731677822154349,
+      "loss": 0.7247,
+      "step": 7078
+    },
+    {
+      "epoch": 0.49246930328011407,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0010729430513343908,
+      "loss": 0.8941,
+      "step": 7079
+    },
+    {
+      "epoch": 0.4925388709172493,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0010727183167496663,
+      "loss": 0.899,
+      "step": 7080
+    },
+    {
+      "epoch": 0.4926084385543845,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0010724935784726724,
+      "loss": 0.5359,
+      "step": 7081
+    },
+    {
+      "epoch": 0.4926780061915197,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0010722688365148198,
+      "loss": 0.9204,
+      "step": 7082
+    },
+    {
+      "epoch": 0.4927475738286549,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0010720440908875202,
+      "loss": 1.1598,
+      "step": 7083
+    },
+    {
+      "epoch": 0.4928171414657901,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0010718193416021846,
+      "loss": 0.7563,
+      "step": 7084
+    },
+    {
+      "epoch": 0.4928867091029253,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0010715945886702247,
+      "loss": 0.9402,
+      "step": 7085
+    },
+    {
+      "epoch": 0.49295627674006054,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0010713698321030527,
+      "loss": 0.8206,
+      "step": 7086
+    },
+    {
+      "epoch": 0.4930258443771957,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0010711450719120804,
+      "loss": 0.8456,
+      "step": 7087
+    },
+    {
+      "epoch": 0.49309541201433094,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0010709203081087197,
+      "loss": 0.6072,
+      "step": 7088
+    },
+    {
+      "epoch": 0.49316497965146616,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0010706955407043834,
+      "loss": 0.8227,
+      "step": 7089
+    },
+    {
+      "epoch": 0.49323454728860133,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.001070470769710484,
+      "loss": 0.9223,
+      "step": 7090
+    },
+    {
+      "epoch": 0.49330411492573656,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0010702459951384337,
+      "loss": 0.8665,
+      "step": 7091
+    },
+    {
+      "epoch": 0.49337368256287173,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0010700212169996461,
+      "loss": 0.7868,
+      "step": 7092
+    },
+    {
+      "epoch": 0.49344325020000696,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.001069796435305534,
+      "loss": 0.9867,
+      "step": 7093
+    },
+    {
+      "epoch": 0.4935128178371422,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0010695716500675107,
+      "loss": 0.8527,
+      "step": 7094
+    },
+    {
+      "epoch": 0.49358238547427735,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0010693468612969898,
+      "loss": 0.9523,
+      "step": 7095
+    },
+    {
+      "epoch": 0.4936519531114126,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0010691220690053846,
+      "loss": 0.8617,
+      "step": 7096
+    },
+    {
+      "epoch": 0.49372152074854775,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0010688972732041094,
+      "loss": 0.5449,
+      "step": 7097
+    },
+    {
+      "epoch": 0.493791088385683,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0010686724739045776,
+      "loss": 0.5827,
+      "step": 7098
+    },
+    {
+      "epoch": 0.4938606560228182,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0010684476711182041,
+      "loss": 0.8263,
+      "step": 7099
+    },
+    {
+      "epoch": 0.49393022365995337,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0010682228648564026,
+      "loss": 0.7761,
+      "step": 7100
+    },
+    {
+      "epoch": 0.4939997912970886,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.001067998055130588,
+      "loss": 0.8542,
+      "step": 7101
+    },
+    {
+      "epoch": 0.4940693589342238,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0010677732419521748,
+      "loss": 0.8381,
+      "step": 7102
+    },
+    {
+      "epoch": 0.494138926571359,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.001067548425332578,
+      "loss": 0.7612,
+      "step": 7103
+    },
+    {
+      "epoch": 0.4942084942084942,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0010673236052832127,
+      "loss": 1.0282,
+      "step": 7104
+    },
+    {
+      "epoch": 0.4942780618456294,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0010670987818154941,
+      "loss": 0.9268,
+      "step": 7105
+    },
+    {
+      "epoch": 0.4943476294827646,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0010668739549408372,
+      "loss": 0.8165,
+      "step": 7106
+    },
+    {
+      "epoch": 0.49441719711989984,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0010666491246706584,
+      "loss": 0.8969,
+      "step": 7107
+    },
+    {
+      "epoch": 0.494486764757035,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0010664242910163727,
+      "loss": 0.8662,
+      "step": 7108
+    },
+    {
+      "epoch": 0.49455633239417024,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0010661994539893965,
+      "loss": 0.8012,
+      "step": 7109
+    },
+    {
+      "epoch": 0.4946259000313054,
+      "grad_norm": 1.765625,
+      "learning_rate": 0.0010659746136011457,
+      "loss": 1.2093,
+      "step": 7110
+    },
+    {
+      "epoch": 0.49469546766844064,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0010657497698630363,
+      "loss": 0.8276,
+      "step": 7111
+    },
+    {
+      "epoch": 0.49476503530557586,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0010655249227864852,
+      "loss": 0.7684,
+      "step": 7112
+    },
+    {
+      "epoch": 0.49483460294271103,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0010653000723829086,
+      "loss": 0.9557,
+      "step": 7113
+    },
+    {
+      "epoch": 0.49490417057984626,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0010650752186637238,
+      "loss": 0.6632,
+      "step": 7114
+    },
+    {
+      "epoch": 0.4949737382169815,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.001064850361640347,
+      "loss": 0.9204,
+      "step": 7115
+    },
+    {
+      "epoch": 0.49504330585411666,
+      "grad_norm": 1.6796875,
+      "learning_rate": 0.0010646255013241962,
+      "loss": 0.8105,
+      "step": 7116
+    },
+    {
+      "epoch": 0.4951128734912519,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0010644006377266877,
+      "loss": 0.9422,
+      "step": 7117
+    },
+    {
+      "epoch": 0.49518244112838705,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.0010641757708592396,
+      "loss": 0.9256,
+      "step": 7118
+    },
+    {
+      "epoch": 0.4952520087655223,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.0010639509007332694,
+      "loss": 0.9179,
+      "step": 7119
+    },
+    {
+      "epoch": 0.4953215764026575,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0010637260273601947,
+      "loss": 0.7678,
+      "step": 7120
+    },
+    {
+      "epoch": 0.4953911440397927,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0010635011507514336,
+      "loss": 0.824,
+      "step": 7121
+    },
+    {
+      "epoch": 0.4954607116769279,
+      "grad_norm": 1.4765625,
+      "learning_rate": 0.001063276270918404,
+      "loss": 0.8793,
+      "step": 7122
+    },
+    {
+      "epoch": 0.49553027931406307,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0010630513878725244,
+      "loss": 0.8169,
+      "step": 7123
+    },
+    {
+      "epoch": 0.4955998469511983,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0010628265016252132,
+      "loss": 0.7137,
+      "step": 7124
+    },
+    {
+      "epoch": 0.4956694145883335,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.0010626016121878887,
+      "loss": 1.0796,
+      "step": 7125
+    },
+    {
+      "epoch": 0.4957389822254687,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00106237671957197,
+      "loss": 0.7397,
+      "step": 7126
+    },
+    {
+      "epoch": 0.4958085498626039,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.0010621518237888762,
+      "loss": 0.7724,
+      "step": 7127
+    },
+    {
+      "epoch": 0.49587811749973915,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0010619269248500257,
+      "loss": 0.8617,
+      "step": 7128
+    },
+    {
+      "epoch": 0.4959476851368743,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.001061702022766838,
+      "loss": 0.9586,
+      "step": 7129
+    },
+    {
+      "epoch": 0.49601725277400954,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0010614771175507327,
+      "loss": 0.8047,
+      "step": 7130
+    },
+    {
+      "epoch": 0.4960868204111447,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0010612522092131294,
+      "loss": 0.9814,
+      "step": 7131
+    },
+    {
+      "epoch": 0.49615638804827994,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.0010610272977654475,
+      "loss": 0.7223,
+      "step": 7132
+    },
+    {
+      "epoch": 0.49622595568541517,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0010608023832191069,
+      "loss": 0.82,
+      "step": 7133
+    },
+    {
+      "epoch": 0.49629552332255034,
+      "grad_norm": 1.796875,
+      "learning_rate": 0.0010605774655855279,
+      "loss": 1.0853,
+      "step": 7134
+    },
+    {
+      "epoch": 0.49636509095968556,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0010603525448761304,
+      "loss": 0.7723,
+      "step": 7135
+    },
+    {
+      "epoch": 0.49643465859682073,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.001060127621102335,
+      "loss": 0.8135,
+      "step": 7136
+    },
+    {
+      "epoch": 0.49650422623395596,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.001059902694275562,
+      "loss": 0.8008,
+      "step": 7137
+    },
+    {
+      "epoch": 0.4965737938710912,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0010596777644072321,
+      "loss": 0.7991,
+      "step": 7138
+    },
+    {
+      "epoch": 0.49664336150822636,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0010594528315087664,
+      "loss": 0.7725,
+      "step": 7139
+    },
+    {
+      "epoch": 0.4967129291453616,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0010592278955915853,
+      "loss": 0.9814,
+      "step": 7140
+    },
+    {
+      "epoch": 0.4967824967824968,
+      "grad_norm": 1.5546875,
+      "learning_rate": 0.0010590029566671102,
+      "loss": 0.994,
+      "step": 7141
+    },
+    {
+      "epoch": 0.496852064419632,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0010587780147467624,
+      "loss": 0.8298,
+      "step": 7142
+    },
+    {
+      "epoch": 0.4969216320567672,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0010585530698419634,
+      "loss": 0.8363,
+      "step": 7143
+    },
+    {
+      "epoch": 0.4969911996939024,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0010583281219641346,
+      "loss": 0.807,
+      "step": 7144
+    },
+    {
+      "epoch": 0.4970607673310376,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0010581031711246977,
+      "loss": 0.9414,
+      "step": 7145
+    },
+    {
+      "epoch": 0.4971303349681728,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0010578782173350746,
+      "loss": 0.6897,
+      "step": 7146
+    },
+    {
+      "epoch": 0.497199902605308,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0010576532606066873,
+      "loss": 0.9442,
+      "step": 7147
+    },
+    {
+      "epoch": 0.4972694702424432,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.001057428300950958,
+      "loss": 0.8061,
+      "step": 7148
+    },
+    {
+      "epoch": 0.4973390378795784,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0010572033383793092,
+      "loss": 0.794,
+      "step": 7149
+    },
+    {
+      "epoch": 0.4974086055167136,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0010569783729031633,
+      "loss": 0.8179,
+      "step": 7150
+    },
+    {
+      "epoch": 0.49747817315384885,
+      "grad_norm": 1.640625,
+      "learning_rate": 0.0010567534045339425,
+      "loss": 0.9908,
+      "step": 7151
+    },
+    {
+      "epoch": 0.497547740790984,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00105652843328307,
+      "loss": 0.8828,
+      "step": 7152
+    },
+    {
+      "epoch": 0.49761730842811924,
+      "grad_norm": 1.75,
+      "learning_rate": 0.0010563034591619686,
+      "loss": 0.8516,
+      "step": 7153
+    },
+    {
+      "epoch": 0.49768687606525447,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0010560784821820614,
+      "loss": 0.816,
+      "step": 7154
+    },
+    {
+      "epoch": 0.49775644370238964,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0010558535023547715,
+      "loss": 0.7183,
+      "step": 7155
+    },
+    {
+      "epoch": 0.49782601133952487,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0010556285196915223,
+      "loss": 0.7265,
+      "step": 7156
+    },
+    {
+      "epoch": 0.49789557897666004,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0010554035342037371,
+      "loss": 0.7026,
+      "step": 7157
+    },
+    {
+      "epoch": 0.49796514661379526,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0010551785459028398,
+      "loss": 0.7944,
+      "step": 7158
+    },
+    {
+      "epoch": 0.4980347142509305,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.001054953554800254,
+      "loss": 0.7208,
+      "step": 7159
+    },
+    {
+      "epoch": 0.49810428188806566,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0010547285609074039,
+      "loss": 0.9822,
+      "step": 7160
+    },
+    {
+      "epoch": 0.4981738495252009,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.001054503564235713,
+      "loss": 0.7046,
+      "step": 7161
+    },
+    {
+      "epoch": 0.49824341716233606,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.001054278564796606,
+      "loss": 0.8731,
+      "step": 7162
+    },
+    {
+      "epoch": 0.4983129847994713,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0010540535626015072,
+      "loss": 0.7237,
+      "step": 7163
+    },
+    {
+      "epoch": 0.4983825524366065,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0010538285576618407,
+      "loss": 0.7564,
+      "step": 7164
+    },
+    {
+      "epoch": 0.4984521200737417,
+      "grad_norm": 0.75,
+      "learning_rate": 0.0010536035499890315,
+      "loss": 0.5736,
+      "step": 7165
+    },
+    {
+      "epoch": 0.4985216877108769,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.001053378539594504,
+      "loss": 0.864,
+      "step": 7166
+    },
+    {
+      "epoch": 0.49859125534801213,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0010531535264896837,
+      "loss": 0.8084,
+      "step": 7167
+    },
+    {
+      "epoch": 0.4986608229851473,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0010529285106859949,
+      "loss": 0.6282,
+      "step": 7168
+    },
+    {
+      "epoch": 0.4987303906222825,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0010527034921948633,
+      "loss": 0.7866,
+      "step": 7169
+    },
+    {
+      "epoch": 0.4987999582594177,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0010524784710277137,
+      "loss": 0.7971,
+      "step": 7170
+    },
+    {
+      "epoch": 0.4988695258965529,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0010522534471959723,
+      "loss": 0.8345,
+      "step": 7171
+    },
+    {
+      "epoch": 0.49893909353368815,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.001052028420711064,
+      "loss": 0.8096,
+      "step": 7172
+    },
+    {
+      "epoch": 0.4990086611708233,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0010518033915844147,
+      "loss": 0.7644,
+      "step": 7173
+    },
+    {
+      "epoch": 0.49907822880795855,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0010515783598274502,
+      "loss": 0.9254,
+      "step": 7174
+    },
+    {
+      "epoch": 0.4991477964450937,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0010513533254515965,
+      "loss": 0.8688,
+      "step": 7175
+    },
+    {
+      "epoch": 0.49921736408222894,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0010511282884682802,
+      "loss": 0.8079,
+      "step": 7176
+    },
+    {
+      "epoch": 0.49928693171936417,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.001050903248888927,
+      "loss": 0.695,
+      "step": 7177
+    },
+    {
+      "epoch": 0.49935649935649934,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.001050678206724963,
+      "loss": 0.7909,
+      "step": 7178
+    },
+    {
+      "epoch": 0.49942606699363457,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0010504531619878155,
+      "loss": 0.9214,
+      "step": 7179
+    },
+    {
+      "epoch": 0.4994956346307698,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0010502281146889108,
+      "loss": 0.9654,
+      "step": 7180
+    },
+    {
+      "epoch": 0.49956520226790496,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0010500030648396752,
+      "loss": 0.8288,
+      "step": 7181
+    },
+    {
+      "epoch": 0.4996347699050402,
+      "grad_norm": 1.609375,
+      "learning_rate": 0.0010497780124515362,
+      "loss": 0.875,
+      "step": 7182
+    },
+    {
+      "epoch": 0.49970433754217536,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0010495529575359208,
+      "loss": 0.8785,
+      "step": 7183
+    },
+    {
+      "epoch": 0.4997739051793106,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0010493279001042557,
+      "loss": 0.7318,
+      "step": 7184
+    },
+    {
+      "epoch": 0.4998434728164458,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0010491028401679687,
+      "loss": 0.5975,
+      "step": 7185
+    },
+    {
+      "epoch": 0.499913040453581,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.0010488777777384868,
+      "loss": 0.9336,
+      "step": 7186
+    },
+    {
+      "epoch": 0.4999826080907162,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0010486527128272377,
+      "loss": 0.8408,
+      "step": 7187
+    },
+    {
+      "epoch": 0.5000521757278514,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0010484276454456492,
+      "loss": 0.938,
+      "step": 7188
+    },
+    {
+      "epoch": 0.5001217433649866,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.001048202575605149,
+      "loss": 0.7174,
+      "step": 7189
+    },
+    {
+      "epoch": 0.5001913110021218,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0010479775033171647,
+      "loss": 0.8057,
+      "step": 7190
+    },
+    {
+      "epoch": 0.5002608786392571,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0010477524285931246,
+      "loss": 0.7395,
+      "step": 7191
+    },
+    {
+      "epoch": 0.5003304462763922,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0010475273514444574,
+      "loss": 0.8219,
+      "step": 7192
+    },
+    {
+      "epoch": 0.5004000139135274,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0010473022718825904,
+      "loss": 0.9962,
+      "step": 7193
+    },
+    {
+      "epoch": 0.5004695815506627,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0010470771899189525,
+      "loss": 0.9283,
+      "step": 7194
+    },
+    {
+      "epoch": 0.5005391491877978,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0010468521055649722,
+      "loss": 1.09,
+      "step": 7195
+    },
+    {
+      "epoch": 0.500608716824933,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0010466270188320783,
+      "loss": 0.6323,
+      "step": 7196
+    },
+    {
+      "epoch": 0.5006782844620682,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0010464019297316992,
+      "loss": 0.9214,
+      "step": 7197
+    },
+    {
+      "epoch": 0.5007478520992035,
+      "grad_norm": 1.59375,
+      "learning_rate": 0.0010461768382752639,
+      "loss": 1.1907,
+      "step": 7198
+    },
+    {
+      "epoch": 0.5008174197363386,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.001045951744474202,
+      "loss": 0.9154,
+      "step": 7199
+    },
+    {
+      "epoch": 0.5008869873734738,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0010457266483399417,
+      "loss": 0.623,
+      "step": 7200
+    },
+    {
+      "epoch": 0.5009565550106091,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0010455015498839126,
+      "loss": 0.9534,
+      "step": 7201
+    },
+    {
+      "epoch": 0.5010261226477443,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0010452764491175443,
+      "loss": 0.7288,
+      "step": 7202
+    },
+    {
+      "epoch": 0.5010956902848794,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0010450513460522662,
+      "loss": 0.8695,
+      "step": 7203
+    },
+    {
+      "epoch": 0.5011652579220147,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0010448262406995076,
+      "loss": 1.0398,
+      "step": 7204
+    },
+    {
+      "epoch": 0.5012348255591499,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0010446011330706986,
+      "loss": 0.6585,
+      "step": 7205
+    },
+    {
+      "epoch": 0.5013043931962851,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.001044376023177269,
+      "loss": 0.9344,
+      "step": 7206
+    },
+    {
+      "epoch": 0.5013739608334203,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0010441509110306483,
+      "loss": 0.8191,
+      "step": 7207
+    },
+    {
+      "epoch": 0.5014435284705555,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0010439257966422674,
+      "loss": 0.8409,
+      "step": 7208
+    },
+    {
+      "epoch": 0.5015130961076907,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0010437006800235553,
+      "loss": 0.6652,
+      "step": 7209
+    },
+    {
+      "epoch": 0.5015826637448259,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0010434755611859435,
+      "loss": 1.1023,
+      "step": 7210
+    },
+    {
+      "epoch": 0.5016522313819611,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0010432504401408614,
+      "loss": 1.0829,
+      "step": 7211
+    },
+    {
+      "epoch": 0.5017217990190963,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.00104302531689974,
+      "loss": 0.69,
+      "step": 7212
+    },
+    {
+      "epoch": 0.5017913666562315,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0010428001914740102,
+      "loss": 0.6944,
+      "step": 7213
+    },
+    {
+      "epoch": 0.5018609342933668,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0010425750638751018,
+      "loss": 0.6269,
+      "step": 7214
+    },
+    {
+      "epoch": 0.5019305019305019,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.001042349934114447,
+      "loss": 0.8376,
+      "step": 7215
+    },
+    {
+      "epoch": 0.5020000695676371,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0010421248022034755,
+      "loss": 0.688,
+      "step": 7216
+    },
+    {
+      "epoch": 0.5020696372047724,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.001041899668153619,
+      "loss": 0.9963,
+      "step": 7217
+    },
+    {
+      "epoch": 0.5021392048419075,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0010416745319763085,
+      "loss": 0.8619,
+      "step": 7218
+    },
+    {
+      "epoch": 0.5022087724790427,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0010414493936829754,
+      "loss": 0.8277,
+      "step": 7219
+    },
+    {
+      "epoch": 0.502278340116178,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.001041224253285051,
+      "loss": 0.698,
+      "step": 7220
+    },
+    {
+      "epoch": 0.5023479077533132,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0010409991107939668,
+      "loss": 0.9984,
+      "step": 7221
+    },
+    {
+      "epoch": 0.5024174753904483,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0010407739662211546,
+      "loss": 0.9493,
+      "step": 7222
+    },
+    {
+      "epoch": 0.5024870430275835,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0010405488195780455,
+      "loss": 1.0066,
+      "step": 7223
+    },
+    {
+      "epoch": 0.5025566106647188,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0010403236708760723,
+      "loss": 0.8073,
+      "step": 7224
+    },
+    {
+      "epoch": 0.502626178301854,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0010400985201266656,
+      "loss": 1.0504,
+      "step": 7225
+    },
+    {
+      "epoch": 0.5026957459389891,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0010398733673412583,
+      "loss": 0.8966,
+      "step": 7226
+    },
+    {
+      "epoch": 0.5027653135761244,
+      "grad_norm": 1.25,
+      "learning_rate": 0.001039648212531283,
+      "loss": 0.8191,
+      "step": 7227
+    },
+    {
+      "epoch": 0.5028348812132596,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0010394230557081708,
+      "loss": 0.8162,
+      "step": 7228
+    },
+    {
+      "epoch": 0.5029044488503948,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0010391978968833549,
+      "loss": 0.8613,
+      "step": 7229
+    },
+    {
+      "epoch": 0.50297401648753,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.0010389727360682669,
+      "loss": 0.4959,
+      "step": 7230
+    },
+    {
+      "epoch": 0.5030435841246652,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0010387475732743401,
+      "loss": 0.8968,
+      "step": 7231
+    },
+    {
+      "epoch": 0.5031131517618004,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0010385224085130067,
+      "loss": 0.7869,
+      "step": 7232
+    },
+    {
+      "epoch": 0.5031827193989357,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0010382972417956997,
+      "loss": 0.9447,
+      "step": 7233
+    },
+    {
+      "epoch": 0.5032522870360708,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0010380720731338517,
+      "loss": 0.6995,
+      "step": 7234
+    },
+    {
+      "epoch": 0.503321854673206,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0010378469025388954,
+      "loss": 1.0611,
+      "step": 7235
+    },
+    {
+      "epoch": 0.5033914223103412,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0010376217300222647,
+      "loss": 0.8676,
+      "step": 7236
+    },
+    {
+      "epoch": 0.5034609899474765,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0010373965555953919,
+      "loss": 0.751,
+      "step": 7237
+    },
+    {
+      "epoch": 0.5035305575846116,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0010371713792697108,
+      "loss": 0.7965,
+      "step": 7238
+    },
+    {
+      "epoch": 0.5036001252217468,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.001036946201056654,
+      "loss": 0.8837,
+      "step": 7239
+    },
+    {
+      "epoch": 0.5036696928588821,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0010367210209676556,
+      "loss": 0.9095,
+      "step": 7240
+    },
+    {
+      "epoch": 0.5037392604960172,
+      "grad_norm": 2.171875,
+      "learning_rate": 0.0010364958390141489,
+      "loss": 0.6597,
+      "step": 7241
+    },
+    {
+      "epoch": 0.5038088281331524,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0010362706552075672,
+      "loss": 0.645,
+      "step": 7242
+    },
+    {
+      "epoch": 0.5038783957702877,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0010360454695593447,
+      "loss": 0.9205,
+      "step": 7243
+    },
+    {
+      "epoch": 0.5039479634074229,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0010358202820809146,
+      "loss": 1.0243,
+      "step": 7244
+    },
+    {
+      "epoch": 0.504017531044558,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0010355950927837115,
+      "loss": 0.6335,
+      "step": 7245
+    },
+    {
+      "epoch": 0.5040870986816933,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0010353699016791684,
+      "loss": 0.7884,
+      "step": 7246
+    },
+    {
+      "epoch": 0.5041566663188285,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0010351447087787206,
+      "loss": 0.9097,
+      "step": 7247
+    },
+    {
+      "epoch": 0.5042262339559637,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0010349195140938016,
+      "loss": 0.7768,
+      "step": 7248
+    },
+    {
+      "epoch": 0.5042958015930988,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0010346943176358452,
+      "loss": 1.0664,
+      "step": 7249
+    },
+    {
+      "epoch": 0.5043653692302341,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.0010344691194162866,
+      "loss": 0.9543,
+      "step": 7250
+    },
+    {
+      "epoch": 0.5044349368673693,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00103424391944656,
+      "loss": 0.9053,
+      "step": 7251
+    },
+    {
+      "epoch": 0.5045045045045045,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0010340187177380995,
+      "loss": 0.8547,
+      "step": 7252
+    },
+    {
+      "epoch": 0.5045740721416397,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0010337935143023397,
+      "loss": 0.558,
+      "step": 7253
+    },
+    {
+      "epoch": 0.5046436397787749,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0010335683091507162,
+      "loss": 0.7452,
+      "step": 7254
+    },
+    {
+      "epoch": 0.5047132074159101,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.001033343102294663,
+      "loss": 0.5924,
+      "step": 7255
+    },
+    {
+      "epoch": 0.5047827750530454,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0010331178937456147,
+      "loss": 0.9584,
+      "step": 7256
+    },
+    {
+      "epoch": 0.5048523426901805,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0010328926835150073,
+      "loss": 0.9278,
+      "step": 7257
+    },
+    {
+      "epoch": 0.5049219103273157,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0010326674716142745,
+      "loss": 0.9773,
+      "step": 7258
+    },
+    {
+      "epoch": 0.504991477964451,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0010324422580548528,
+      "loss": 1.0611,
+      "step": 7259
+    },
+    {
+      "epoch": 0.5050610456015862,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0010322170428481764,
+      "loss": 0.7346,
+      "step": 7260
+    },
+    {
+      "epoch": 0.5051306132387213,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0010319918260056813,
+      "loss": 0.993,
+      "step": 7261
+    },
+    {
+      "epoch": 0.5052001808758565,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.001031766607538802,
+      "loss": 0.9258,
+      "step": 7262
+    },
+    {
+      "epoch": 0.5052697485129918,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0010315413874589748,
+      "loss": 0.6496,
+      "step": 7263
+    },
+    {
+      "epoch": 0.505339316150127,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0010313161657776351,
+      "loss": 0.669,
+      "step": 7264
+    },
+    {
+      "epoch": 0.5054088837872621,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0010310909425062177,
+      "loss": 0.9657,
+      "step": 7265
+    },
+    {
+      "epoch": 0.5054784514243974,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.0010308657176561597,
+      "loss": 1.0466,
+      "step": 7266
+    },
+    {
+      "epoch": 0.5055480190615326,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0010306404912388957,
+      "loss": 0.9254,
+      "step": 7267
+    },
+    {
+      "epoch": 0.5056175866986677,
+      "grad_norm": 0.8125,
+      "learning_rate": 0.0010304152632658623,
+      "loss": 0.5554,
+      "step": 7268
+    },
+    {
+      "epoch": 0.505687154335803,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0010301900337484947,
+      "loss": 1.0306,
+      "step": 7269
+    },
+    {
+      "epoch": 0.5057567219729382,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0010299648026982297,
+      "loss": 0.9134,
+      "step": 7270
+    },
+    {
+      "epoch": 0.5058262896100734,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.001029739570126503,
+      "loss": 1.012,
+      "step": 7271
+    },
+    {
+      "epoch": 0.5058958572472086,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0010295143360447507,
+      "loss": 0.717,
+      "step": 7272
+    },
+    {
+      "epoch": 0.5059654248843438,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0010292891004644094,
+      "loss": 0.9266,
+      "step": 7273
+    },
+    {
+      "epoch": 0.506034992521479,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.001029063863396915,
+      "loss": 0.4913,
+      "step": 7274
+    },
+    {
+      "epoch": 0.5061045601586142,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.0010288386248537042,
+      "loss": 0.5788,
+      "step": 7275
+    },
+    {
+      "epoch": 0.5061741277957494,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0010286133848462131,
+      "loss": 0.8754,
+      "step": 7276
+    },
+    {
+      "epoch": 0.5062436954328846,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0010283881433858792,
+      "loss": 0.7639,
+      "step": 7277
+    },
+    {
+      "epoch": 0.5063132630700198,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0010281629004841378,
+      "loss": 0.7923,
+      "step": 7278
+    },
+    {
+      "epoch": 0.5063828307071551,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0010279376561524265,
+      "loss": 0.8669,
+      "step": 7279
+    },
+    {
+      "epoch": 0.5064523983442902,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0010277124104021821,
+      "loss": 0.8788,
+      "step": 7280
+    },
+    {
+      "epoch": 0.5065219659814254,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0010274871632448407,
+      "loss": 0.7445,
+      "step": 7281
+    },
+    {
+      "epoch": 0.5065915336185607,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0010272619146918403,
+      "loss": 0.9489,
+      "step": 7282
+    },
+    {
+      "epoch": 0.5066611012556959,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0010270366647546166,
+      "loss": 0.8435,
+      "step": 7283
+    },
+    {
+      "epoch": 0.506730668892831,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.001026811413444608,
+      "loss": 1.0238,
+      "step": 7284
+    },
+    {
+      "epoch": 0.5068002365299663,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0010265861607732503,
+      "loss": 0.7611,
+      "step": 7285
+    },
+    {
+      "epoch": 0.5068698041671015,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0010263609067519817,
+      "loss": 0.8724,
+      "step": 7286
+    },
+    {
+      "epoch": 0.5069393718042366,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0010261356513922393,
+      "loss": 0.9458,
+      "step": 7287
+    },
+    {
+      "epoch": 0.5070089394413718,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.00102591039470546,
+      "loss": 0.7484,
+      "step": 7288
+    },
+    {
+      "epoch": 0.5070785070785071,
+      "grad_norm": 1.6015625,
+      "learning_rate": 0.0010256851367030817,
+      "loss": 1.2545,
+      "step": 7289
+    },
+    {
+      "epoch": 0.5071480747156423,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.001025459877396541,
+      "loss": 0.9903,
+      "step": 7290
+    },
+    {
+      "epoch": 0.5072176423527774,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.001025234616797277,
+      "loss": 0.8205,
+      "step": 7291
+    },
+    {
+      "epoch": 0.5072872099899127,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0010250093549167257,
+      "loss": 0.8465,
+      "step": 7292
+    },
+    {
+      "epoch": 0.5073567776270479,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0010247840917663254,
+      "loss": 0.8993,
+      "step": 7293
+    },
+    {
+      "epoch": 0.5074263452641831,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0010245588273575142,
+      "loss": 0.9857,
+      "step": 7294
+    },
+    {
+      "epoch": 0.5074959129013183,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.001024333561701729,
+      "loss": 0.6849,
+      "step": 7295
+    },
+    {
+      "epoch": 0.5075654805384535,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.001024108294810409,
+      "loss": 0.9671,
+      "step": 7296
+    },
+    {
+      "epoch": 0.5076350481755887,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0010238830266949906,
+      "loss": 0.9284,
+      "step": 7297
+    },
+    {
+      "epoch": 0.507704615812724,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.0010236577573669128,
+      "loss": 0.6884,
+      "step": 7298
+    },
+    {
+      "epoch": 0.5077741834498591,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.001023432486837613,
+      "loss": 0.7001,
+      "step": 7299
+    },
+    {
+      "epoch": 0.5078437510869943,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00102320721511853,
+      "loss": 0.9619,
+      "step": 7300
+    },
+    {
+      "epoch": 0.5079133187241295,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0010229819422211016,
+      "loss": 0.9597,
+      "step": 7301
+    },
+    {
+      "epoch": 0.5079828863612648,
+      "grad_norm": 1.6796875,
+      "learning_rate": 0.0010227566681567657,
+      "loss": 1.0313,
+      "step": 7302
+    },
+    {
+      "epoch": 0.5080524539983999,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0010225313929369613,
+      "loss": 0.7011,
+      "step": 7303
+    },
+    {
+      "epoch": 0.5081220216355351,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0010223061165731257,
+      "loss": 0.8618,
+      "step": 7304
+    },
+    {
+      "epoch": 0.5081915892726704,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0010220808390766986,
+      "loss": 0.7311,
+      "step": 7305
+    },
+    {
+      "epoch": 0.5082611569098056,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0010218555604591174,
+      "loss": 0.6457,
+      "step": 7306
+    },
+    {
+      "epoch": 0.5083307245469407,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0010216302807318214,
+      "loss": 1.0473,
+      "step": 7307
+    },
+    {
+      "epoch": 0.508400292184076,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0010214049999062481,
+      "loss": 0.7673,
+      "step": 7308
+    },
+    {
+      "epoch": 0.5084698598212112,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0010211797179938374,
+      "loss": 0.6966,
+      "step": 7309
+    },
+    {
+      "epoch": 0.5085394274583463,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.0010209544350060272,
+      "loss": 0.6101,
+      "step": 7310
+    },
+    {
+      "epoch": 0.5086089950954816,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0010207291509542562,
+      "loss": 0.6792,
+      "step": 7311
+    },
+    {
+      "epoch": 0.5086785627326168,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0010205038658499636,
+      "loss": 0.6202,
+      "step": 7312
+    },
+    {
+      "epoch": 0.508748130369752,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0010202785797045878,
+      "loss": 0.5167,
+      "step": 7313
+    },
+    {
+      "epoch": 0.5088176980068871,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0010200532925295684,
+      "loss": 0.8961,
+      "step": 7314
+    },
+    {
+      "epoch": 0.5088872656440224,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.0010198280043363435,
+      "loss": 0.8841,
+      "step": 7315
+    },
+    {
+      "epoch": 0.5089568332811576,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0010196027151363526,
+      "loss": 0.8483,
+      "step": 7316
+    },
+    {
+      "epoch": 0.5090264009182928,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0010193774249410345,
+      "loss": 0.6354,
+      "step": 7317
+    },
+    {
+      "epoch": 0.509095968555428,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0010191521337618286,
+      "loss": 0.8367,
+      "step": 7318
+    },
+    {
+      "epoch": 0.5091655361925632,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.001018926841610174,
+      "loss": 0.6624,
+      "step": 7319
+    },
+    {
+      "epoch": 0.5092351038296984,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0010187015484975095,
+      "loss": 0.7719,
+      "step": 7320
+    },
+    {
+      "epoch": 0.5093046714668337,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.001018476254435275,
+      "loss": 0.6867,
+      "step": 7321
+    },
+    {
+      "epoch": 0.5093742391039688,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.001018250959434909,
+      "loss": 1.0151,
+      "step": 7322
+    },
+    {
+      "epoch": 0.509443806741104,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0010180256635078514,
+      "loss": 0.6282,
+      "step": 7323
+    },
+    {
+      "epoch": 0.5095133743782393,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0010178003666655416,
+      "loss": 0.895,
+      "step": 7324
+    },
+    {
+      "epoch": 0.5095829420153745,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0010175750689194187,
+      "loss": 0.7366,
+      "step": 7325
+    },
+    {
+      "epoch": 0.5096525096525096,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0010173497702809225,
+      "loss": 0.8923,
+      "step": 7326
+    },
+    {
+      "epoch": 0.5097220772896448,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0010171244707614924,
+      "loss": 0.9015,
+      "step": 7327
+    },
+    {
+      "epoch": 0.5097916449267801,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0010168991703725682,
+      "loss": 0.8422,
+      "step": 7328
+    },
+    {
+      "epoch": 0.5098612125639153,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.001016673869125589,
+      "loss": 0.8664,
+      "step": 7329
+    },
+    {
+      "epoch": 0.5099307802010504,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0010164485670319948,
+      "loss": 0.854,
+      "step": 7330
+    },
+    {
+      "epoch": 0.5100003478381857,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0010162232641032253,
+      "loss": 0.6463,
+      "step": 7331
+    },
+    {
+      "epoch": 0.5100699154753209,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0010159979603507204,
+      "loss": 0.9051,
+      "step": 7332
+    },
+    {
+      "epoch": 0.510139483112456,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.0010157726557859196,
+      "loss": 0.7682,
+      "step": 7333
+    },
+    {
+      "epoch": 0.5102090507495913,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0010155473504202626,
+      "loss": 0.7717,
+      "step": 7334
+    },
+    {
+      "epoch": 0.5102786183867265,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.00101532204426519,
+      "loss": 0.63,
+      "step": 7335
+    },
+    {
+      "epoch": 0.5103481860238617,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0010150967373321407,
+      "loss": 0.8076,
+      "step": 7336
+    },
+    {
+      "epoch": 0.510417753660997,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.0010148714296325553,
+      "loss": 0.7188,
+      "step": 7337
+    },
+    {
+      "epoch": 0.5104873212981321,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0010146461211778738,
+      "loss": 0.9532,
+      "step": 7338
+    },
+    {
+      "epoch": 0.5105568889352673,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0010144208119795362,
+      "loss": 0.9941,
+      "step": 7339
+    },
+    {
+      "epoch": 0.5106264565724025,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.0010141955020489823,
+      "loss": 0.6594,
+      "step": 7340
+    },
+    {
+      "epoch": 0.5106960242095377,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0010139701913976524,
+      "loss": 0.893,
+      "step": 7341
+    },
+    {
+      "epoch": 0.5107655918466729,
+      "grad_norm": 1.578125,
+      "learning_rate": 0.0010137448800369869,
+      "loss": 0.6618,
+      "step": 7342
+    },
+    {
+      "epoch": 0.5108351594838081,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.001013519567978425,
+      "loss": 0.6087,
+      "step": 7343
+    },
+    {
+      "epoch": 0.5109047271209434,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.0010132942552334078,
+      "loss": 0.9438,
+      "step": 7344
+    },
+    {
+      "epoch": 0.5109742947580785,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0010130689418133755,
+      "loss": 0.6784,
+      "step": 7345
+    },
+    {
+      "epoch": 0.5110438623952137,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0010128436277297684,
+      "loss": 0.6899,
+      "step": 7346
+    },
+    {
+      "epoch": 0.511113430032349,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0010126183129940264,
+      "loss": 1.0467,
+      "step": 7347
+    },
+    {
+      "epoch": 0.5111829976694842,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0010123929976175899,
+      "loss": 0.8388,
+      "step": 7348
+    },
+    {
+      "epoch": 0.5112525653066193,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0010121676816118997,
+      "loss": 0.7295,
+      "step": 7349
+    },
+    {
+      "epoch": 0.5113221329437546,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.0010119423649883957,
+      "loss": 0.6243,
+      "step": 7350
+    },
+    {
+      "epoch": 0.5113917005808898,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.001011717047758519,
+      "loss": 0.7465,
+      "step": 7351
+    },
+    {
+      "epoch": 0.511461268218025,
+      "grad_norm": 0.75390625,
+      "learning_rate": 0.0010114917299337092,
+      "loss": 0.7375,
+      "step": 7352
+    },
+    {
+      "epoch": 0.5115308358551601,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0010112664115254075,
+      "loss": 0.7938,
+      "step": 7353
+    },
+    {
+      "epoch": 0.5116004034922954,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0010110410925450542,
+      "loss": 0.8754,
+      "step": 7354
+    },
+    {
+      "epoch": 0.5116699711294306,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.00101081577300409,
+      "loss": 1.0421,
+      "step": 7355
+    },
+    {
+      "epoch": 0.5117395387665657,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.001010590452913955,
+      "loss": 0.9681,
+      "step": 7356
+    },
+    {
+      "epoch": 0.511809106403701,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0010103651322860905,
+      "loss": 0.8657,
+      "step": 7357
+    },
+    {
+      "epoch": 0.5118786740408362,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.0010101398111319372,
+      "loss": 0.9783,
+      "step": 7358
+    },
+    {
+      "epoch": 0.5119482416779714,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0010099144894629346,
+      "loss": 0.6622,
+      "step": 7359
+    },
+    {
+      "epoch": 0.5120178093151067,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0010096891672905246,
+      "loss": 0.9833,
+      "step": 7360
+    },
+    {
+      "epoch": 0.5120873769522418,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.0010094638446261474,
+      "loss": 0.8881,
+      "step": 7361
+    },
+    {
+      "epoch": 0.512156944589377,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0010092385214812438,
+      "loss": 0.5359,
+      "step": 7362
+    },
+    {
+      "epoch": 0.5122265122265123,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.001009013197867255,
+      "loss": 0.9023,
+      "step": 7363
+    },
+    {
+      "epoch": 0.5122960798636474,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.001008787873795621,
+      "loss": 0.4285,
+      "step": 7364
+    },
+    {
+      "epoch": 0.5123656475007826,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0010085625492777834,
+      "loss": 0.8085,
+      "step": 7365
+    },
+    {
+      "epoch": 0.5124352151379178,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0010083372243251828,
+      "loss": 1.0129,
+      "step": 7366
+    },
+    {
+      "epoch": 0.5125047827750531,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0010081118989492598,
+      "loss": 0.6876,
+      "step": 7367
+    },
+    {
+      "epoch": 0.5125743504121882,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.0010078865731614553,
+      "loss": 1.2099,
+      "step": 7368
+    },
+    {
+      "epoch": 0.5126439180493234,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0010076612469732105,
+      "loss": 1.0681,
+      "step": 7369
+    },
+    {
+      "epoch": 0.5127134856864587,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.0010074359203959661,
+      "loss": 0.7904,
+      "step": 7370
+    },
+    {
+      "epoch": 0.5127830533235939,
+      "grad_norm": 1.5703125,
+      "learning_rate": 0.0010072105934411633,
+      "loss": 0.9934,
+      "step": 7371
+    },
+    {
+      "epoch": 0.512852620960729,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0010069852661202428,
+      "loss": 0.8781,
+      "step": 7372
+    },
+    {
+      "epoch": 0.5129221885978643,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0010067599384446456,
+      "loss": 0.884,
+      "step": 7373
+    },
+    {
+      "epoch": 0.5129917562349995,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.001006534610425813,
+      "loss": 0.6635,
+      "step": 7374
+    },
+    {
+      "epoch": 0.5130613238721347,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0010063092820751858,
+      "loss": 0.9075,
+      "step": 7375
+    },
+    {
+      "epoch": 0.5131308915092699,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.001006083953404205,
+      "loss": 0.6741,
+      "step": 7376
+    },
+    {
+      "epoch": 0.5132004591464051,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0010058586244243118,
+      "loss": 0.6751,
+      "step": 7377
+    },
+    {
+      "epoch": 0.5132700267835403,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.0010056332951469472,
+      "loss": 0.6612,
+      "step": 7378
+    },
+    {
+      "epoch": 0.5133395944206754,
+      "grad_norm": 1.0,
+      "learning_rate": 0.001005407965583552,
+      "loss": 0.9126,
+      "step": 7379
+    },
+    {
+      "epoch": 0.5134091620578107,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0010051826357455678,
+      "loss": 0.676,
+      "step": 7380
+    },
+    {
+      "epoch": 0.5134787296949459,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0010049573056444354,
+      "loss": 0.6661,
+      "step": 7381
+    },
+    {
+      "epoch": 0.5135482973320811,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.001004731975291596,
+      "loss": 0.7886,
+      "step": 7382
+    },
+    {
+      "epoch": 0.5136178649692164,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0010045066446984908,
+      "loss": 0.8526,
+      "step": 7383
+    },
+    {
+      "epoch": 0.5136874326063515,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.0010042813138765607,
+      "loss": 1.2588,
+      "step": 7384
+    },
+    {
+      "epoch": 0.5137570002434867,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.0010040559828372474,
+      "loss": 0.7593,
+      "step": 7385
+    },
+    {
+      "epoch": 0.513826567880622,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0010038306515919916,
+      "loss": 0.6358,
+      "step": 7386
+    },
+    {
+      "epoch": 0.5138961355177571,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0010036053201522347,
+      "loss": 0.7824,
+      "step": 7387
+    },
+    {
+      "epoch": 0.5139657031548923,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0010033799885294174,
+      "loss": 1.0388,
+      "step": 7388
+    },
+    {
+      "epoch": 0.5140352707920276,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0010031546567349815,
+      "loss": 0.8488,
+      "step": 7389
+    },
+    {
+      "epoch": 0.5141048384291628,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0010029293247803685,
+      "loss": 0.6219,
+      "step": 7390
+    },
+    {
+      "epoch": 0.5141744060662979,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0010027039926770187,
+      "loss": 0.5945,
+      "step": 7391
+    },
+    {
+      "epoch": 0.5142439737034331,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0010024786604363738,
+      "loss": 0.8943,
+      "step": 7392
+    },
+    {
+      "epoch": 0.5143135413405684,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0010022533280698751,
+      "loss": 0.8981,
+      "step": 7393
+    },
+    {
+      "epoch": 0.5143831089777036,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0010020279955889637,
+      "loss": 0.7547,
+      "step": 7394
+    },
+    {
+      "epoch": 0.5144526766148387,
+      "grad_norm": 1.125,
+      "learning_rate": 0.001001802663005081,
+      "loss": 0.6436,
+      "step": 7395
+    },
+    {
+      "epoch": 0.514522244251974,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0010015773303296682,
+      "loss": 0.6235,
+      "step": 7396
+    },
+    {
+      "epoch": 0.5145918118891092,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0010013519975741662,
+      "loss": 0.7985,
+      "step": 7397
+    },
+    {
+      "epoch": 0.5146613795262444,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.001001126664750017,
+      "loss": 0.9846,
+      "step": 7398
+    },
+    {
+      "epoch": 0.5147309471633796,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0010009013318686612,
+      "loss": 0.8467,
+      "step": 7399
+    },
+    {
+      "epoch": 0.5148005148005148,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0010006759989415403,
+      "loss": 0.8558,
+      "step": 7400
+    },
+    {
+      "epoch": 0.51487008243765,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0010004506659800959,
+      "loss": 0.7214,
+      "step": 7401
+    },
+    {
+      "epoch": 0.5149396500747853,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0010002253329957685,
+      "loss": 1.0984,
+      "step": 7402
+    },
+    {
+      "epoch": 0.5150092177119204,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.001,
+      "loss": 0.95,
+      "step": 7403
+    },
+    {
+      "epoch": 0.5150787853490556,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0009997746670042315,
+      "loss": 0.7794,
+      "step": 7404
+    },
+    {
+      "epoch": 0.5151483529861908,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0009995493340199042,
+      "loss": 0.8167,
+      "step": 7405
+    },
+    {
+      "epoch": 0.515217920623326,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0009993240010584597,
+      "loss": 0.7491,
+      "step": 7406
+    },
+    {
+      "epoch": 0.5152874882604612,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0009990986681313388,
+      "loss": 1.0262,
+      "step": 7407
+    },
+    {
+      "epoch": 0.5153570558975964,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0009988733352499833,
+      "loss": 0.6585,
+      "step": 7408
+    },
+    {
+      "epoch": 0.5154266235347317,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0009986480024258338,
+      "loss": 0.925,
+      "step": 7409
+    },
+    {
+      "epoch": 0.5154961911718668,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.000998422669670332,
+      "loss": 0.8088,
+      "step": 7410
+    },
+    {
+      "epoch": 0.515565758809002,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.000998197336994919,
+      "loss": 0.565,
+      "step": 7411
+    },
+    {
+      "epoch": 0.5156353264461373,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.0009979720044110362,
+      "loss": 0.9264,
+      "step": 7412
+    },
+    {
+      "epoch": 0.5157048940832725,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0009977466719301251,
+      "loss": 0.8953,
+      "step": 7413
+    },
+    {
+      "epoch": 0.5157744617204076,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0009975213395636263,
+      "loss": 0.769,
+      "step": 7414
+    },
+    {
+      "epoch": 0.5158440293575429,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0009972960073229818,
+      "loss": 0.8064,
+      "step": 7415
+    },
+    {
+      "epoch": 0.5159135969946781,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0009970706752196316,
+      "loss": 0.9126,
+      "step": 7416
+    },
+    {
+      "epoch": 0.5159831646318133,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0009968453432650185,
+      "loss": 0.9174,
+      "step": 7417
+    },
+    {
+      "epoch": 0.5160527322689484,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0009966200114705827,
+      "loss": 0.8217,
+      "step": 7418
+    },
+    {
+      "epoch": 0.5161222999060837,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0009963946798477654,
+      "loss": 0.7123,
+      "step": 7419
+    },
+    {
+      "epoch": 0.5161918675432189,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0009961693484080087,
+      "loss": 1.0308,
+      "step": 7420
+    },
+    {
+      "epoch": 0.5162614351803541,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.000995944017162753,
+      "loss": 0.8344,
+      "step": 7421
+    },
+    {
+      "epoch": 0.5163310028174893,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0009957186861234396,
+      "loss": 0.6491,
+      "step": 7422
+    },
+    {
+      "epoch": 0.5164005704546245,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0009954933553015092,
+      "loss": 0.8135,
+      "step": 7423
+    },
+    {
+      "epoch": 0.5164701380917597,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0009952680247084043,
+      "loss": 0.8808,
+      "step": 7424
+    },
+    {
+      "epoch": 0.516539705728895,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0009950426943555648,
+      "loss": 0.6941,
+      "step": 7425
+    },
+    {
+      "epoch": 0.5166092733660301,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0009948173642544322,
+      "loss": 0.7446,
+      "step": 7426
+    },
+    {
+      "epoch": 0.5166788410031653,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.000994592034416448,
+      "loss": 0.6119,
+      "step": 7427
+    },
+    {
+      "epoch": 0.5167484086403006,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.000994366704853053,
+      "loss": 0.7891,
+      "step": 7428
+    },
+    {
+      "epoch": 0.5168179762774358,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0009941413755756886,
+      "loss": 0.6071,
+      "step": 7429
+    },
+    {
+      "epoch": 0.5168875439145709,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.000993916046595795,
+      "loss": 0.751,
+      "step": 7430
+    },
+    {
+      "epoch": 0.5169571115517061,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.0009936907179248144,
+      "loss": 0.7383,
+      "step": 7431
+    },
+    {
+      "epoch": 0.5170266791888414,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0009934653895741872,
+      "loss": 0.7809,
+      "step": 7432
+    },
+    {
+      "epoch": 0.5170962468259765,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0009932400615553542,
+      "loss": 0.9368,
+      "step": 7433
+    },
+    {
+      "epoch": 0.5171658144631117,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.0009930147338797573,
+      "loss": 0.6948,
+      "step": 7434
+    },
+    {
+      "epoch": 0.517235382100247,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.000992789406558837,
+      "loss": 0.8302,
+      "step": 7435
+    },
+    {
+      "epoch": 0.5173049497373822,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0009925640796040341,
+      "loss": 0.8594,
+      "step": 7436
+    },
+    {
+      "epoch": 0.5173745173745173,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0009923387530267895,
+      "loss": 0.8291,
+      "step": 7437
+    },
+    {
+      "epoch": 0.5174440850116526,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.000992113426838545,
+      "loss": 0.9406,
+      "step": 7438
+    },
+    {
+      "epoch": 0.5175136526487878,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0009918881010507405,
+      "loss": 0.8503,
+      "step": 7439
+    },
+    {
+      "epoch": 0.517583220285923,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0009916627756748173,
+      "loss": 0.7722,
+      "step": 7440
+    },
+    {
+      "epoch": 0.5176527879230582,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0009914374507222167,
+      "loss": 1.0453,
+      "step": 7441
+    },
+    {
+      "epoch": 0.5177223555601934,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.000991212126204379,
+      "loss": 0.8305,
+      "step": 7442
+    },
+    {
+      "epoch": 0.5177919231973286,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0009909868021327451,
+      "loss": 0.9021,
+      "step": 7443
+    },
+    {
+      "epoch": 0.5178614908344638,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.000990761478518756,
+      "loss": 0.8837,
+      "step": 7444
+    },
+    {
+      "epoch": 0.517931058471599,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0009905361553738529,
+      "loss": 0.6102,
+      "step": 7445
+    },
+    {
+      "epoch": 0.5180006261087342,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0009903108327094757,
+      "loss": 0.8095,
+      "step": 7446
+    },
+    {
+      "epoch": 0.5180701937458694,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0009900855105370657,
+      "loss": 0.8938,
+      "step": 7447
+    },
+    {
+      "epoch": 0.5181397613830047,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.000989860188868063,
+      "loss": 0.9159,
+      "step": 7448
+    },
+    {
+      "epoch": 0.5182093290201398,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0009896348677139095,
+      "loss": 0.6995,
+      "step": 7449
+    },
+    {
+      "epoch": 0.518278896657275,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.000989409547086045,
+      "loss": 0.779,
+      "step": 7450
+    },
+    {
+      "epoch": 0.5183484642944103,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.00098918422699591,
+      "loss": 0.9001,
+      "step": 7451
+    },
+    {
+      "epoch": 0.5184180319315455,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0009889589074549459,
+      "loss": 1.0165,
+      "step": 7452
+    },
+    {
+      "epoch": 0.5184875995686806,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0009887335884745925,
+      "loss": 0.8979,
+      "step": 7453
+    },
+    {
+      "epoch": 0.5185571672058159,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.000988508270066291,
+      "loss": 0.9494,
+      "step": 7454
+    },
+    {
+      "epoch": 0.5186267348429511,
+      "grad_norm": 1.125,
+      "learning_rate": 0.000988282952241481,
+      "loss": 0.9349,
+      "step": 7455
+    },
+    {
+      "epoch": 0.5186963024800862,
+      "grad_norm": 1.6171875,
+      "learning_rate": 0.0009880576350116044,
+      "loss": 0.9685,
+      "step": 7456
+    },
+    {
+      "epoch": 0.5187658701172214,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0009878323183881005,
+      "loss": 1.3639,
+      "step": 7457
+    },
+    {
+      "epoch": 0.5188354377543567,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0009876070023824102,
+      "loss": 0.8812,
+      "step": 7458
+    },
+    {
+      "epoch": 0.5189050053914919,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0009873816870059739,
+      "loss": 0.9826,
+      "step": 7459
+    },
+    {
+      "epoch": 0.518974573028627,
+      "grad_norm": 1.5390625,
+      "learning_rate": 0.0009871563722702319,
+      "loss": 0.8891,
+      "step": 7460
+    },
+    {
+      "epoch": 0.5190441406657623,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0009869310581866247,
+      "loss": 0.8919,
+      "step": 7461
+    },
+    {
+      "epoch": 0.5191137083028975,
+      "grad_norm": 1.125,
+      "learning_rate": 0.000986705744766592,
+      "loss": 0.8724,
+      "step": 7462
+    },
+    {
+      "epoch": 0.5191832759400327,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.000986480432021575,
+      "loss": 0.6715,
+      "step": 7463
+    },
+    {
+      "epoch": 0.5192528435771679,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0009862551199630136,
+      "loss": 0.8011,
+      "step": 7464
+    },
+    {
+      "epoch": 0.5193224112143031,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0009860298086023474,
+      "loss": 0.8478,
+      "step": 7465
+    },
+    {
+      "epoch": 0.5193919788514383,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0009858044979510177,
+      "loss": 0.7981,
+      "step": 7466
+    },
+    {
+      "epoch": 0.5194615464885736,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0009855791880204639,
+      "loss": 0.9811,
+      "step": 7467
+    },
+    {
+      "epoch": 0.5195311141257087,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0009853538788221262,
+      "loss": 0.9317,
+      "step": 7468
+    },
+    {
+      "epoch": 0.5196006817628439,
+      "grad_norm": 1.9609375,
+      "learning_rate": 0.0009851285703674445,
+      "loss": 1.1164,
+      "step": 7469
+    },
+    {
+      "epoch": 0.5196702493999791,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0009849032626678595,
+      "loss": 0.7334,
+      "step": 7470
+    },
+    {
+      "epoch": 0.5197398170371144,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.0009846779557348103,
+      "loss": 0.5433,
+      "step": 7471
+    },
+    {
+      "epoch": 0.5198093846742495,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0009844526495797372,
+      "loss": 0.8945,
+      "step": 7472
+    },
+    {
+      "epoch": 0.5198789523113847,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0009842273442140807,
+      "loss": 0.7181,
+      "step": 7473
+    },
+    {
+      "epoch": 0.51994851994852,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0009840020396492798,
+      "loss": 0.9351,
+      "step": 7474
+    },
+    {
+      "epoch": 0.5200180875856552,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.000983776735896775,
+      "loss": 0.648,
+      "step": 7475
+    },
+    {
+      "epoch": 0.5200876552227903,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0009835514329680052,
+      "loss": 0.8321,
+      "step": 7476
+    },
+    {
+      "epoch": 0.5201572228599256,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0009833261308744112,
+      "loss": 0.8325,
+      "step": 7477
+    },
+    {
+      "epoch": 0.5202267904970608,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0009831008296274323,
+      "loss": 0.8579,
+      "step": 7478
+    },
+    {
+      "epoch": 0.520296358134196,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0009828755292385076,
+      "loss": 1.0932,
+      "step": 7479
+    },
+    {
+      "epoch": 0.5203659257713312,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0009826502297190776,
+      "loss": 0.8288,
+      "step": 7480
+    },
+    {
+      "epoch": 0.5204354934084664,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0009824249310805815,
+      "loss": 0.9406,
+      "step": 7481
+    },
+    {
+      "epoch": 0.5205050610456016,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0009821996333344587,
+      "loss": 1.028,
+      "step": 7482
+    },
+    {
+      "epoch": 0.5205746286827367,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0009819743364921484,
+      "loss": 0.7697,
+      "step": 7483
+    },
+    {
+      "epoch": 0.520644196319872,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.000981749040565091,
+      "loss": 0.7308,
+      "step": 7484
+    },
+    {
+      "epoch": 0.5207137639570072,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0009815237455647254,
+      "loss": 0.8124,
+      "step": 7485
+    },
+    {
+      "epoch": 0.5207833315941424,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0009812984515024904,
+      "loss": 0.5755,
+      "step": 7486
+    },
+    {
+      "epoch": 0.5208528992312776,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.000981073158389826,
+      "loss": 0.9276,
+      "step": 7487
+    },
+    {
+      "epoch": 0.5209224668684128,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.0009808478662381714,
+      "loss": 0.6658,
+      "step": 7488
+    },
+    {
+      "epoch": 0.520992034505548,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0009806225750589655,
+      "loss": 0.8713,
+      "step": 7489
+    },
+    {
+      "epoch": 0.5210616021426833,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.0009803972848636473,
+      "loss": 0.7039,
+      "step": 7490
+    },
+    {
+      "epoch": 0.5211311697798184,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0009801719956636567,
+      "loss": 0.9872,
+      "step": 7491
+    },
+    {
+      "epoch": 0.5212007374169536,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0009799467074704318,
+      "loss": 0.6726,
+      "step": 7492
+    },
+    {
+      "epoch": 0.5212703050540889,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.000979721420295412,
+      "loss": 0.7117,
+      "step": 7493
+    },
+    {
+      "epoch": 0.5213398726912241,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.0009794961341500364,
+      "loss": 0.7386,
+      "step": 7494
+    },
+    {
+      "epoch": 0.5214094403283592,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0009792708490457438,
+      "loss": 1.0354,
+      "step": 7495
+    },
+    {
+      "epoch": 0.5214790079654944,
+      "grad_norm": 0.875,
+      "learning_rate": 0.000979045564993973,
+      "loss": 0.6982,
+      "step": 7496
+    },
+    {
+      "epoch": 0.5215485756026297,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0009788202820061626,
+      "loss": 0.8027,
+      "step": 7497
+    },
+    {
+      "epoch": 0.5216181432397649,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.000978595000093752,
+      "loss": 0.8026,
+      "step": 7498
+    },
+    {
+      "epoch": 0.5216877108769,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.000978369719268179,
+      "loss": 0.7331,
+      "step": 7499
+    },
+    {
+      "epoch": 0.5217572785140353,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0009781444395408824,
+      "loss": 1.0517,
+      "step": 7500
+    },
+    {
+      "epoch": 0.5218268461511705,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0009779191609233014,
+      "loss": 0.7587,
+      "step": 7501
+    },
+    {
+      "epoch": 0.5218964137883056,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0009776938834268744,
+      "loss": 1.1129,
+      "step": 7502
+    },
+    {
+      "epoch": 0.5219659814254409,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0009774686070630392,
+      "loss": 0.605,
+      "step": 7503
+    },
+    {
+      "epoch": 0.5220355490625761,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0009772433318432341,
+      "loss": 0.6579,
+      "step": 7504
+    },
+    {
+      "epoch": 0.5221051166997113,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0009770180577788987,
+      "loss": 0.7113,
+      "step": 7505
+    },
+    {
+      "epoch": 0.5221746843368466,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0009767927848814701,
+      "loss": 1.2031,
+      "step": 7506
+    },
+    {
+      "epoch": 0.5222442519739817,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0009765675131623867,
+      "loss": 0.8527,
+      "step": 7507
+    },
+    {
+      "epoch": 0.5223138196111169,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0009763422426330873,
+      "loss": 0.9367,
+      "step": 7508
+    },
+    {
+      "epoch": 0.5223833872482521,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0009761169733050096,
+      "loss": 0.8686,
+      "step": 7509
+    },
+    {
+      "epoch": 0.5224529548853873,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0009758917051895915,
+      "loss": 0.5321,
+      "step": 7510
+    },
+    {
+      "epoch": 0.5225225225225225,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0009756664382982708,
+      "loss": 0.8484,
+      "step": 7511
+    },
+    {
+      "epoch": 0.5225920901596577,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0009754411726424861,
+      "loss": 0.7063,
+      "step": 7512
+    },
+    {
+      "epoch": 0.522661657796793,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0009752159082336747,
+      "loss": 0.825,
+      "step": 7513
+    },
+    {
+      "epoch": 0.5227312254339281,
+      "grad_norm": 0.8203125,
+      "learning_rate": 0.0009749906450832744,
+      "loss": 0.6291,
+      "step": 7514
+    },
+    {
+      "epoch": 0.5228007930710633,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0009747653832027232,
+      "loss": 0.7437,
+      "step": 7515
+    },
+    {
+      "epoch": 0.5228703607081986,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0009745401226034589,
+      "loss": 0.7518,
+      "step": 7516
+    },
+    {
+      "epoch": 0.5229399283453338,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0009743148632969186,
+      "loss": 0.8273,
+      "step": 7517
+    },
+    {
+      "epoch": 0.5230094959824689,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.00097408960529454,
+      "loss": 0.7889,
+      "step": 7518
+    },
+    {
+      "epoch": 0.5230790636196042,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0009738643486077608,
+      "loss": 0.7051,
+      "step": 7519
+    },
+    {
+      "epoch": 0.5231486312567394,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0009736390932480183,
+      "loss": 0.7936,
+      "step": 7520
+    },
+    {
+      "epoch": 0.5232181988938746,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0009734138392267497,
+      "loss": 0.9106,
+      "step": 7521
+    },
+    {
+      "epoch": 0.5232877665310097,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0009731885865553922,
+      "loss": 0.7028,
+      "step": 7522
+    },
+    {
+      "epoch": 0.523357334168145,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0009729633352453835,
+      "loss": 0.7533,
+      "step": 7523
+    },
+    {
+      "epoch": 0.5234269018052802,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0009727380853081601,
+      "loss": 0.8453,
+      "step": 7524
+    },
+    {
+      "epoch": 0.5234964694424153,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0009725128367551592,
+      "loss": 0.6497,
+      "step": 7525
+    },
+    {
+      "epoch": 0.5235660370795506,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.000972287589597818,
+      "loss": 0.9773,
+      "step": 7526
+    },
+    {
+      "epoch": 0.5236356047166858,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0009720623438475737,
+      "loss": 0.7903,
+      "step": 7527
+    },
+    {
+      "epoch": 0.523705172353821,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0009718370995158623,
+      "loss": 0.5697,
+      "step": 7528
+    },
+    {
+      "epoch": 0.5237747399909563,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.000971611856614121,
+      "loss": 0.7219,
+      "step": 7529
+    },
+    {
+      "epoch": 0.5238443076280914,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.0009713866151537869,
+      "loss": 0.7004,
+      "step": 7530
+    },
+    {
+      "epoch": 0.5239138752652266,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0009711613751462961,
+      "loss": 0.7109,
+      "step": 7531
+    },
+    {
+      "epoch": 0.5239834429023619,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.000970936136603085,
+      "loss": 0.6683,
+      "step": 7532
+    },
+    {
+      "epoch": 0.524053010539497,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0009707108995355907,
+      "loss": 0.8626,
+      "step": 7533
+    },
+    {
+      "epoch": 0.5241225781766322,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0009704856639552495,
+      "loss": 0.7987,
+      "step": 7534
+    },
+    {
+      "epoch": 0.5241921458137674,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0009702604298734973,
+      "loss": 0.8395,
+      "step": 7535
+    },
+    {
+      "epoch": 0.5242617134509027,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0009700351973017704,
+      "loss": 0.7056,
+      "step": 7536
+    },
+    {
+      "epoch": 0.5243312810880378,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0009698099662515054,
+      "loss": 0.7448,
+      "step": 7537
+    },
+    {
+      "epoch": 0.524400848725173,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.000969584736734138,
+      "loss": 0.8765,
+      "step": 7538
+    },
+    {
+      "epoch": 0.5244704163623083,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0009693595087611042,
+      "loss": 0.6362,
+      "step": 7539
+    },
+    {
+      "epoch": 0.5245399839994435,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0009691342823438403,
+      "loss": 0.8538,
+      "step": 7540
+    },
+    {
+      "epoch": 0.5246095516365786,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0009689090574937823,
+      "loss": 0.6046,
+      "step": 7541
+    },
+    {
+      "epoch": 0.5246791192737139,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0009686838342223654,
+      "loss": 0.8277,
+      "step": 7542
+    },
+    {
+      "epoch": 0.5247486869108491,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0009684586125410252,
+      "loss": 0.9067,
+      "step": 7543
+    },
+    {
+      "epoch": 0.5248182545479843,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0009682333924611983,
+      "loss": 0.5895,
+      "step": 7544
+    },
+    {
+      "epoch": 0.5248878221851195,
+      "grad_norm": 0.70703125,
+      "learning_rate": 0.0009680081739943192,
+      "loss": 0.6634,
+      "step": 7545
+    },
+    {
+      "epoch": 0.5249573898222547,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0009677829571518237,
+      "loss": 0.684,
+      "step": 7546
+    },
+    {
+      "epoch": 0.5250269574593899,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0009675577419451473,
+      "loss": 0.8417,
+      "step": 7547
+    },
+    {
+      "epoch": 0.525096525096525,
+      "grad_norm": 1.6171875,
+      "learning_rate": 0.0009673325283857256,
+      "loss": 0.9912,
+      "step": 7548
+    },
+    {
+      "epoch": 0.5251660927336603,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0009671073164849932,
+      "loss": 0.9259,
+      "step": 7549
+    },
+    {
+      "epoch": 0.5252356603707955,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0009668821062543852,
+      "loss": 0.6655,
+      "step": 7550
+    },
+    {
+      "epoch": 0.5253052280079307,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0009666568977053371,
+      "loss": 1.0478,
+      "step": 7551
+    },
+    {
+      "epoch": 0.525374795645066,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.000966431690849284,
+      "loss": 0.6806,
+      "step": 7552
+    },
+    {
+      "epoch": 0.5254443632822011,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0009662064856976601,
+      "loss": 0.8352,
+      "step": 7553
+    },
+    {
+      "epoch": 0.5255139309193363,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0009659812822619007,
+      "loss": 0.7596,
+      "step": 7554
+    },
+    {
+      "epoch": 0.5255834985564716,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0009657560805534405,
+      "loss": 0.9609,
+      "step": 7555
+    },
+    {
+      "epoch": 0.5256530661936067,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.0009655308805837135,
+      "loss": 1.0609,
+      "step": 7556
+    },
+    {
+      "epoch": 0.5257226338307419,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.0009653056823641546,
+      "loss": 0.7534,
+      "step": 7557
+    },
+    {
+      "epoch": 0.5257922014678772,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0009650804859061985,
+      "loss": 0.8217,
+      "step": 7558
+    },
+    {
+      "epoch": 0.5258617691050124,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0009648552912212795,
+      "loss": 0.8723,
+      "step": 7559
+    },
+    {
+      "epoch": 0.5259313367421475,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0009646300983208314,
+      "loss": 0.7671,
+      "step": 7560
+    },
+    {
+      "epoch": 0.5260009043792827,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0009644049072162887,
+      "loss": 0.8247,
+      "step": 7561
+    },
+    {
+      "epoch": 0.526070472016418,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0009641797179190856,
+      "loss": 0.937,
+      "step": 7562
+    },
+    {
+      "epoch": 0.5261400396535532,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0009639545304406557,
+      "loss": 0.9648,
+      "step": 7563
+    },
+    {
+      "epoch": 0.5262096072906883,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0009637293447924329,
+      "loss": 0.9806,
+      "step": 7564
+    },
+    {
+      "epoch": 0.5262791749278236,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0009635041609858513,
+      "loss": 1.0049,
+      "step": 7565
+    },
+    {
+      "epoch": 0.5263487425649588,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0009632789790323446,
+      "loss": 0.6768,
+      "step": 7566
+    },
+    {
+      "epoch": 0.526418310202094,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.000963053798943346,
+      "loss": 0.7095,
+      "step": 7567
+    },
+    {
+      "epoch": 0.5264878778392292,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.0009628286207302893,
+      "loss": 0.8472,
+      "step": 7568
+    },
+    {
+      "epoch": 0.5265574454763644,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0009626034444046082,
+      "loss": 1.0541,
+      "step": 7569
+    },
+    {
+      "epoch": 0.5266270131134996,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0009623782699777354,
+      "loss": 0.7968,
+      "step": 7570
+    },
+    {
+      "epoch": 0.5266965807506349,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0009621530974611044,
+      "loss": 0.6698,
+      "step": 7571
+    },
+    {
+      "epoch": 0.52676614838777,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0009619279268661484,
+      "loss": 1.0106,
+      "step": 7572
+    },
+    {
+      "epoch": 0.5268357160249052,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0009617027582043006,
+      "loss": 1.1062,
+      "step": 7573
+    },
+    {
+      "epoch": 0.5269052836620404,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0009614775914869934,
+      "loss": 0.7848,
+      "step": 7574
+    },
+    {
+      "epoch": 0.5269748512991757,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00096125242672566,
+      "loss": 0.9292,
+      "step": 7575
+    },
+    {
+      "epoch": 0.5270444189363108,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0009610272639317334,
+      "loss": 0.8652,
+      "step": 7576
+    },
+    {
+      "epoch": 0.527113986573446,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0009608021031166456,
+      "loss": 1.0807,
+      "step": 7577
+    },
+    {
+      "epoch": 0.5271835542105813,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0009605769442918293,
+      "loss": 0.9035,
+      "step": 7578
+    },
+    {
+      "epoch": 0.5272531218477164,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0009603517874687172,
+      "loss": 0.6871,
+      "step": 7579
+    },
+    {
+      "epoch": 0.5273226894848516,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0009601266326587416,
+      "loss": 0.9676,
+      "step": 7580
+    },
+    {
+      "epoch": 0.5273922571219869,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0009599014798733344,
+      "loss": 0.8892,
+      "step": 7581
+    },
+    {
+      "epoch": 0.5274618247591221,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.0009596763291239281,
+      "loss": 0.6882,
+      "step": 7582
+    },
+    {
+      "epoch": 0.5275313923962572,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0009594511804219548,
+      "loss": 0.8788,
+      "step": 7583
+    },
+    {
+      "epoch": 0.5276009600333925,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0009592260337788459,
+      "loss": 1.0275,
+      "step": 7584
+    },
+    {
+      "epoch": 0.5276705276705277,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0009590008892060332,
+      "loss": 0.7101,
+      "step": 7585
+    },
+    {
+      "epoch": 0.5277400953076629,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.000958775746714949,
+      "loss": 0.7943,
+      "step": 7586
+    },
+    {
+      "epoch": 0.527809662944798,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0009585506063170249,
+      "loss": 0.9239,
+      "step": 7587
+    },
+    {
+      "epoch": 0.5278792305819333,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0009583254680236915,
+      "loss": 0.6241,
+      "step": 7588
+    },
+    {
+      "epoch": 0.5279487982190685,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.000958100331846381,
+      "loss": 0.8665,
+      "step": 7589
+    },
+    {
+      "epoch": 0.5280183658562037,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0009578751977965246,
+      "loss": 0.6914,
+      "step": 7590
+    },
+    {
+      "epoch": 0.5280879334933389,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0009576500658855535,
+      "loss": 0.7779,
+      "step": 7591
+    },
+    {
+      "epoch": 0.5281575011304741,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0009574249361248981,
+      "loss": 0.9939,
+      "step": 7592
+    },
+    {
+      "epoch": 0.5282270687676093,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0009571998085259901,
+      "loss": 0.7421,
+      "step": 7593
+    },
+    {
+      "epoch": 0.5282966364047446,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0009569746831002603,
+      "loss": 0.7919,
+      "step": 7594
+    },
+    {
+      "epoch": 0.5283662040418797,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0009567495598591387,
+      "loss": 0.5853,
+      "step": 7595
+    },
+    {
+      "epoch": 0.5284357716790149,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0009565244388140569,
+      "loss": 0.7668,
+      "step": 7596
+    },
+    {
+      "epoch": 0.5285053393161502,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0009562993199764447,
+      "loss": 0.7021,
+      "step": 7597
+    },
+    {
+      "epoch": 0.5285749069532854,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0009560742033577332,
+      "loss": 0.7556,
+      "step": 7598
+    },
+    {
+      "epoch": 0.5286444745904205,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0009558490889693518,
+      "loss": 0.8799,
+      "step": 7599
+    },
+    {
+      "epoch": 0.5287140422275557,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0009556239768227312,
+      "loss": 0.9766,
+      "step": 7600
+    },
+    {
+      "epoch": 0.528783609864691,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0009553988669293017,
+      "loss": 0.7397,
+      "step": 7601
+    },
+    {
+      "epoch": 0.5288531775018261,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0009551737593004926,
+      "loss": 0.6558,
+      "step": 7602
+    },
+    {
+      "epoch": 0.5289227451389613,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.000954948653947734,
+      "loss": 0.8443,
+      "step": 7603
+    },
+    {
+      "epoch": 0.5289923127760966,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0009547235508824557,
+      "loss": 0.6416,
+      "step": 7604
+    },
+    {
+      "epoch": 0.5290618804132318,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0009544984501160878,
+      "loss": 0.9741,
+      "step": 7605
+    },
+    {
+      "epoch": 0.5291314480503669,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0009542733516600586,
+      "loss": 0.9167,
+      "step": 7606
+    },
+    {
+      "epoch": 0.5292010156875022,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0009540482555257983,
+      "loss": 1.1045,
+      "step": 7607
+    },
+    {
+      "epoch": 0.5292705833246374,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0009538231617247363,
+      "loss": 1.1215,
+      "step": 7608
+    },
+    {
+      "epoch": 0.5293401509617726,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0009535980702683011,
+      "loss": 0.7904,
+      "step": 7609
+    },
+    {
+      "epoch": 0.5294097185989078,
+      "grad_norm": 0.74609375,
+      "learning_rate": 0.0009533729811679219,
+      "loss": 0.5266,
+      "step": 7610
+    },
+    {
+      "epoch": 0.529479286236043,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0009531478944350278,
+      "loss": 0.9714,
+      "step": 7611
+    },
+    {
+      "epoch": 0.5295488538731782,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0009529228100810479,
+      "loss": 0.7552,
+      "step": 7612
+    },
+    {
+      "epoch": 0.5296184215103134,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0009526977281174098,
+      "loss": 0.7659,
+      "step": 7613
+    },
+    {
+      "epoch": 0.5296879891474486,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0009524726485555428,
+      "loss": 0.7575,
+      "step": 7614
+    },
+    {
+      "epoch": 0.5297575567845838,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0009522475714068754,
+      "loss": 0.9411,
+      "step": 7615
+    },
+    {
+      "epoch": 0.529827124421719,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0009520224966828356,
+      "loss": 0.9846,
+      "step": 7616
+    },
+    {
+      "epoch": 0.5298966920588543,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0009517974243948512,
+      "loss": 0.7321,
+      "step": 7617
+    },
+    {
+      "epoch": 0.5299662596959894,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0009515723545543509,
+      "loss": 0.7617,
+      "step": 7618
+    },
+    {
+      "epoch": 0.5300358273331246,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0009513472871727625,
+      "loss": 0.8585,
+      "step": 7619
+    },
+    {
+      "epoch": 0.5301053949702599,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0009511222222615133,
+      "loss": 0.772,
+      "step": 7620
+    },
+    {
+      "epoch": 0.530174962607395,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0009508971598320315,
+      "loss": 1.0156,
+      "step": 7621
+    },
+    {
+      "epoch": 0.5302445302445302,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0009506720998957443,
+      "loss": 0.8706,
+      "step": 7622
+    },
+    {
+      "epoch": 0.5303140978816655,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0009504470424640797,
+      "loss": 0.7963,
+      "step": 7623
+    },
+    {
+      "epoch": 0.5303836655188007,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0009502219875484639,
+      "loss": 0.7009,
+      "step": 7624
+    },
+    {
+      "epoch": 0.5304532331559358,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.0009499969351603248,
+      "loss": 0.8547,
+      "step": 7625
+    },
+    {
+      "epoch": 0.530522800793071,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.0009497718853110897,
+      "loss": 1.025,
+      "step": 7626
+    },
+    {
+      "epoch": 0.5305923684302063,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0009495468380121846,
+      "loss": 0.8212,
+      "step": 7627
+    },
+    {
+      "epoch": 0.5306619360673415,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.000949321793275037,
+      "loss": 1.0669,
+      "step": 7628
+    },
+    {
+      "epoch": 0.5307315037044766,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0009490967511110733,
+      "loss": 1.0157,
+      "step": 7629
+    },
+    {
+      "epoch": 0.5308010713416119,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0009488717115317202,
+      "loss": 0.9015,
+      "step": 7630
+    },
+    {
+      "epoch": 0.5308706389787471,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0009486466745484034,
+      "loss": 0.859,
+      "step": 7631
+    },
+    {
+      "epoch": 0.5309402066158823,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0009484216401725498,
+      "loss": 0.6554,
+      "step": 7632
+    },
+    {
+      "epoch": 0.5310097742530175,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0009481966084155857,
+      "loss": 0.8968,
+      "step": 7633
+    },
+    {
+      "epoch": 0.5310793418901527,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.0009479715792889363,
+      "loss": 1.0967,
+      "step": 7634
+    },
+    {
+      "epoch": 0.5311489095272879,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.000947746552804028,
+      "loss": 0.7383,
+      "step": 7635
+    },
+    {
+      "epoch": 0.5312184771644232,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0009475215289722864,
+      "loss": 0.7721,
+      "step": 7636
+    },
+    {
+      "epoch": 0.5312880448015583,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0009472965078051372,
+      "loss": 0.7587,
+      "step": 7637
+    },
+    {
+      "epoch": 0.5313576124386935,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0009470714893140053,
+      "loss": 0.7739,
+      "step": 7638
+    },
+    {
+      "epoch": 0.5314271800758287,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0009468464735103166,
+      "loss": 0.949,
+      "step": 7639
+    },
+    {
+      "epoch": 0.531496747712964,
+      "grad_norm": 1.4765625,
+      "learning_rate": 0.0009466214604054962,
+      "loss": 0.8655,
+      "step": 7640
+    },
+    {
+      "epoch": 0.5315663153500991,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0009463964500109685,
+      "loss": 0.8699,
+      "step": 7641
+    },
+    {
+      "epoch": 0.5316358829872343,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0009461714423381595,
+      "loss": 0.931,
+      "step": 7642
+    },
+    {
+      "epoch": 0.5317054506243696,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0009459464373984931,
+      "loss": 1.0457,
+      "step": 7643
+    },
+    {
+      "epoch": 0.5317750182615048,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0009457214352033943,
+      "loss": 0.8442,
+      "step": 7644
+    },
+    {
+      "epoch": 0.5318445858986399,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0009454964357642872,
+      "loss": 0.699,
+      "step": 7645
+    },
+    {
+      "epoch": 0.5319141535357752,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0009452714390925964,
+      "loss": 0.8539,
+      "step": 7646
+    },
+    {
+      "epoch": 0.5319837211729104,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0009450464451997463,
+      "loss": 0.8525,
+      "step": 7647
+    },
+    {
+      "epoch": 0.5320532888100455,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0009448214540971601,
+      "loss": 1.0032,
+      "step": 7648
+    },
+    {
+      "epoch": 0.5321228564471808,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.000944596465796263,
+      "loss": 0.8292,
+      "step": 7649
+    },
+    {
+      "epoch": 0.532192424084316,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.0009443714803084779,
+      "loss": 0.5984,
+      "step": 7650
+    },
+    {
+      "epoch": 0.5322619917214512,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0009441464976452288,
+      "loss": 0.7322,
+      "step": 7651
+    },
+    {
+      "epoch": 0.5323315593585863,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0009439215178179388,
+      "loss": 0.8294,
+      "step": 7652
+    },
+    {
+      "epoch": 0.5324011269957216,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0009436965408380314,
+      "loss": 0.8577,
+      "step": 7653
+    },
+    {
+      "epoch": 0.5324706946328568,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.0009434715667169303,
+      "loss": 0.6898,
+      "step": 7654
+    },
+    {
+      "epoch": 0.532540262269992,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0009432465954660574,
+      "loss": 0.9053,
+      "step": 7655
+    },
+    {
+      "epoch": 0.5326098299071272,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0009430216270968371,
+      "loss": 0.673,
+      "step": 7656
+    },
+    {
+      "epoch": 0.5326793975442624,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.0009427966616206909,
+      "loss": 0.8517,
+      "step": 7657
+    },
+    {
+      "epoch": 0.5327489651813976,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0009425716990490423,
+      "loss": 0.8912,
+      "step": 7658
+    },
+    {
+      "epoch": 0.5328185328185329,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.0009423467393933128,
+      "loss": 0.7184,
+      "step": 7659
+    },
+    {
+      "epoch": 0.532888100455668,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0009421217826649257,
+      "loss": 0.9315,
+      "step": 7660
+    },
+    {
+      "epoch": 0.5329576680928032,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0009418968288753026,
+      "loss": 0.7263,
+      "step": 7661
+    },
+    {
+      "epoch": 0.5330272357299385,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0009416718780358654,
+      "loss": 0.7388,
+      "step": 7662
+    },
+    {
+      "epoch": 0.5330968033670737,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0009414469301580368,
+      "loss": 0.9644,
+      "step": 7663
+    },
+    {
+      "epoch": 0.5331663710042088,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.0009412219852532376,
+      "loss": 0.9148,
+      "step": 7664
+    },
+    {
+      "epoch": 0.533235938641344,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0009409970433328902,
+      "loss": 0.777,
+      "step": 7665
+    },
+    {
+      "epoch": 0.5333055062784793,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0009407721044084148,
+      "loss": 0.8419,
+      "step": 7666
+    },
+    {
+      "epoch": 0.5333750739156145,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0009405471684912338,
+      "loss": 0.9105,
+      "step": 7667
+    },
+    {
+      "epoch": 0.5334446415527496,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0009403222355927679,
+      "loss": 0.8591,
+      "step": 7668
+    },
+    {
+      "epoch": 0.5335142091898849,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0009400973057244378,
+      "loss": 0.7017,
+      "step": 7669
+    },
+    {
+      "epoch": 0.5335837768270201,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0009398723788976651,
+      "loss": 0.9882,
+      "step": 7670
+    },
+    {
+      "epoch": 0.5336533444641552,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.0009396474551238696,
+      "loss": 0.6953,
+      "step": 7671
+    },
+    {
+      "epoch": 0.5337229121012905,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0009394225344144725,
+      "loss": 0.9536,
+      "step": 7672
+    },
+    {
+      "epoch": 0.5337924797384257,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.000939197616780893,
+      "loss": 0.6909,
+      "step": 7673
+    },
+    {
+      "epoch": 0.5338620473755609,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0009389727022345528,
+      "loss": 0.9209,
+      "step": 7674
+    },
+    {
+      "epoch": 0.5339316150126961,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0009387477907868709,
+      "loss": 0.8242,
+      "step": 7675
+    },
+    {
+      "epoch": 0.5340011826498313,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0009385228824492672,
+      "loss": 1.1437,
+      "step": 7676
+    },
+    {
+      "epoch": 0.5340707502869665,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.0009382979772331622,
+      "loss": 0.8774,
+      "step": 7677
+    },
+    {
+      "epoch": 0.5341403179241017,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0009380730751499747,
+      "loss": 0.8611,
+      "step": 7678
+    },
+    {
+      "epoch": 0.5342098855612369,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.0009378481762111244,
+      "loss": 0.8717,
+      "step": 7679
+    },
+    {
+      "epoch": 0.5342794531983721,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0009376232804280298,
+      "loss": 0.8216,
+      "step": 7680
+    },
+    {
+      "epoch": 0.5343490208355073,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0009373983878121113,
+      "loss": 0.8085,
+      "step": 7681
+    },
+    {
+      "epoch": 0.5344185884726426,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.000937173498374787,
+      "loss": 0.8056,
+      "step": 7682
+    },
+    {
+      "epoch": 0.5344881561097777,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0009369486121274759,
+      "loss": 0.8503,
+      "step": 7683
+    },
+    {
+      "epoch": 0.5345577237469129,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0009367237290815961,
+      "loss": 0.8109,
+      "step": 7684
+    },
+    {
+      "epoch": 0.5346272913840482,
+      "grad_norm": 0.78125,
+      "learning_rate": 0.0009364988492485667,
+      "loss": 0.5855,
+      "step": 7685
+    },
+    {
+      "epoch": 0.5346968590211834,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0009362739726398058,
+      "loss": 1.041,
+      "step": 7686
+    },
+    {
+      "epoch": 0.5347664266583185,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0009360490992667306,
+      "loss": 0.8213,
+      "step": 7687
+    },
+    {
+      "epoch": 0.5348359942954538,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0009358242291407604,
+      "loss": 1.0044,
+      "step": 7688
+    },
+    {
+      "epoch": 0.534905561932589,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0009355993622733124,
+      "loss": 0.6749,
+      "step": 7689
+    },
+    {
+      "epoch": 0.5349751295697242,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0009353744986758044,
+      "loss": 0.7504,
+      "step": 7690
+    },
+    {
+      "epoch": 0.5350446972068593,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.000935149638359653,
+      "loss": 0.9699,
+      "step": 7691
+    },
+    {
+      "epoch": 0.5351142648439946,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.0009349247813362764,
+      "loss": 0.7765,
+      "step": 7692
+    },
+    {
+      "epoch": 0.5351838324811298,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0009346999276170914,
+      "loss": 1.1291,
+      "step": 7693
+    },
+    {
+      "epoch": 0.5352534001182649,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0009344750772135148,
+      "loss": 0.8869,
+      "step": 7694
+    },
+    {
+      "epoch": 0.5353229677554002,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0009342502301369637,
+      "loss": 0.8624,
+      "step": 7695
+    },
+    {
+      "epoch": 0.5353925353925354,
+      "grad_norm": 0.765625,
+      "learning_rate": 0.0009340253863988545,
+      "loss": 0.6484,
+      "step": 7696
+    },
+    {
+      "epoch": 0.5354621030296706,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.000933800546010604,
+      "loss": 0.7179,
+      "step": 7697
+    },
+    {
+      "epoch": 0.5355316706668058,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0009335757089836274,
+      "loss": 0.6628,
+      "step": 7698
+    },
+    {
+      "epoch": 0.535601238303941,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0009333508753293418,
+      "loss": 0.9152,
+      "step": 7699
+    },
+    {
+      "epoch": 0.5356708059410762,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0009331260450591627,
+      "loss": 1.0213,
+      "step": 7700
+    },
+    {
+      "epoch": 0.5357403735782115,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0009329012181845059,
+      "loss": 0.7104,
+      "step": 7701
+    },
+    {
+      "epoch": 0.5358099412153466,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0009326763947167875,
+      "loss": 0.9936,
+      "step": 7702
+    },
+    {
+      "epoch": 0.5358795088524818,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0009324515746674221,
+      "loss": 0.8168,
+      "step": 7703
+    },
+    {
+      "epoch": 0.535949076489617,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0009322267580478255,
+      "loss": 0.7997,
+      "step": 7704
+    },
+    {
+      "epoch": 0.5360186441267523,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0009320019448694121,
+      "loss": 0.8153,
+      "step": 7705
+    },
+    {
+      "epoch": 0.5360882117638874,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.0009317771351435975,
+      "loss": 1.037,
+      "step": 7706
+    },
+    {
+      "epoch": 0.5361577794010226,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0009315523288817961,
+      "loss": 0.7164,
+      "step": 7707
+    },
+    {
+      "epoch": 0.5362273470381579,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0009313275260954221,
+      "loss": 0.8784,
+      "step": 7708
+    },
+    {
+      "epoch": 0.5362969146752931,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0009311027267958908,
+      "loss": 0.6885,
+      "step": 7709
+    },
+    {
+      "epoch": 0.5363664823124282,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.0009308779309946155,
+      "loss": 0.5003,
+      "step": 7710
+    },
+    {
+      "epoch": 0.5364360499495635,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0009306531387030106,
+      "loss": 0.8354,
+      "step": 7711
+    },
+    {
+      "epoch": 0.5365056175866987,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0009304283499324892,
+      "loss": 0.869,
+      "step": 7712
+    },
+    {
+      "epoch": 0.5365751852238339,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0009302035646944661,
+      "loss": 0.8773,
+      "step": 7713
+    },
+    {
+      "epoch": 0.5366447528609691,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.000929978783000354,
+      "loss": 0.6557,
+      "step": 7714
+    },
+    {
+      "epoch": 0.5367143204981043,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0009297540048615661,
+      "loss": 0.6123,
+      "step": 7715
+    },
+    {
+      "epoch": 0.5367838881352395,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0009295292302895163,
+      "loss": 0.9011,
+      "step": 7716
+    },
+    {
+      "epoch": 0.5368534557723746,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0009293044592956167,
+      "loss": 0.7213,
+      "step": 7717
+    },
+    {
+      "epoch": 0.5369230234095099,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0009290796918912806,
+      "loss": 0.7583,
+      "step": 7718
+    },
+    {
+      "epoch": 0.5369925910466451,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0009288549280879196,
+      "loss": 1.0099,
+      "step": 7719
+    },
+    {
+      "epoch": 0.5370621586837803,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0009286301678969474,
+      "loss": 0.8666,
+      "step": 7720
+    },
+    {
+      "epoch": 0.5371317263209155,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0009284054113297753,
+      "loss": 0.6667,
+      "step": 7721
+    },
+    {
+      "epoch": 0.5372012939580507,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0009281806583978155,
+      "loss": 0.8413,
+      "step": 7722
+    },
+    {
+      "epoch": 0.5372708615951859,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00092795590911248,
+      "loss": 0.7475,
+      "step": 7723
+    },
+    {
+      "epoch": 0.5373404292323212,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0009277311634851803,
+      "loss": 1.0075,
+      "step": 7724
+    },
+    {
+      "epoch": 0.5374099968694563,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0009275064215273278,
+      "loss": 0.7693,
+      "step": 7725
+    },
+    {
+      "epoch": 0.5374795645065915,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0009272816832503335,
+      "loss": 0.8396,
+      "step": 7726
+    },
+    {
+      "epoch": 0.5375491321437268,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0009270569486656095,
+      "loss": 0.9725,
+      "step": 7727
+    },
+    {
+      "epoch": 0.537618699780862,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0009268322177845656,
+      "loss": 0.9665,
+      "step": 7728
+    },
+    {
+      "epoch": 0.5376882674179971,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.0009266074906186125,
+      "loss": 0.9539,
+      "step": 7729
+    },
+    {
+      "epoch": 0.5377578350551323,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0009263827671791619,
+      "loss": 0.9783,
+      "step": 7730
+    },
+    {
+      "epoch": 0.5378274026922676,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0009261580474776229,
+      "loss": 0.9241,
+      "step": 7731
+    },
+    {
+      "epoch": 0.5378969703294028,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0009259333315254062,
+      "loss": 0.6221,
+      "step": 7732
+    },
+    {
+      "epoch": 0.5379665379665379,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0009257086193339212,
+      "loss": 0.9105,
+      "step": 7733
+    },
+    {
+      "epoch": 0.5380361056036732,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0009254839109145785,
+      "loss": 0.8122,
+      "step": 7734
+    },
+    {
+      "epoch": 0.5381056732408084,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0009252592062787871,
+      "loss": 0.9319,
+      "step": 7735
+    },
+    {
+      "epoch": 0.5381752408779436,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0009250345054379562,
+      "loss": 0.6494,
+      "step": 7736
+    },
+    {
+      "epoch": 0.5382448085150788,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0009248098084034957,
+      "loss": 0.9644,
+      "step": 7737
+    },
+    {
+      "epoch": 0.538314376152214,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.000924585115186814,
+      "loss": 0.87,
+      "step": 7738
+    },
+    {
+      "epoch": 0.5383839437893492,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0009243604257993199,
+      "loss": 0.8682,
+      "step": 7739
+    },
+    {
+      "epoch": 0.5384535114264845,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0009241357402524219,
+      "loss": 0.822,
+      "step": 7740
+    },
+    {
+      "epoch": 0.5385230790636196,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0009239110585575292,
+      "loss": 0.666,
+      "step": 7741
+    },
+    {
+      "epoch": 0.5385926467007548,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0009236863807260493,
+      "loss": 0.8514,
+      "step": 7742
+    },
+    {
+      "epoch": 0.53866221433789,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0009234617067693899,
+      "loss": 0.7054,
+      "step": 7743
+    },
+    {
+      "epoch": 0.5387317819750252,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.0009232370366989596,
+      "loss": 0.7784,
+      "step": 7744
+    },
+    {
+      "epoch": 0.5388013496121604,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0009230123705261657,
+      "loss": 0.8373,
+      "step": 7745
+    },
+    {
+      "epoch": 0.5388709172492956,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.0009227877082624155,
+      "loss": 0.9933,
+      "step": 7746
+    },
+    {
+      "epoch": 0.5389404848864309,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0009225630499191161,
+      "loss": 1.0417,
+      "step": 7747
+    },
+    {
+      "epoch": 0.539010052523566,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0009223383955076752,
+      "loss": 0.7612,
+      "step": 7748
+    },
+    {
+      "epoch": 0.5390796201607012,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0009221137450394987,
+      "loss": 0.7888,
+      "step": 7749
+    },
+    {
+      "epoch": 0.5391491877978365,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0009218890985259935,
+      "loss": 0.9712,
+      "step": 7750
+    },
+    {
+      "epoch": 0.5392187554349717,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0009216644559785665,
+      "loss": 0.9912,
+      "step": 7751
+    },
+    {
+      "epoch": 0.5392883230721068,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0009214398174086238,
+      "loss": 0.6542,
+      "step": 7752
+    },
+    {
+      "epoch": 0.5393578907092421,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0009212151828275709,
+      "loss": 0.8197,
+      "step": 7753
+    },
+    {
+      "epoch": 0.5394274583463773,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0009209905522468137,
+      "loss": 0.9404,
+      "step": 7754
+    },
+    {
+      "epoch": 0.5394970259835125,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.0009207659256777586,
+      "loss": 0.64,
+      "step": 7755
+    },
+    {
+      "epoch": 0.5395665936206476,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00092054130313181,
+      "loss": 0.6587,
+      "step": 7756
+    },
+    {
+      "epoch": 0.5396361612577829,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0009203166846203739,
+      "loss": 0.8482,
+      "step": 7757
+    },
+    {
+      "epoch": 0.5397057288949181,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0009200920701548541,
+      "loss": 0.8016,
+      "step": 7758
+    },
+    {
+      "epoch": 0.5397752965320533,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.000919867459746657,
+      "loss": 0.7776,
+      "step": 7759
+    },
+    {
+      "epoch": 0.5398448641691885,
+      "grad_norm": 1.5546875,
+      "learning_rate": 0.0009196428534071861,
+      "loss": 1.1175,
+      "step": 7760
+    },
+    {
+      "epoch": 0.5399144318063237,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.000919418251147846,
+      "loss": 0.7893,
+      "step": 7761
+    },
+    {
+      "epoch": 0.5399839994434589,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0009191936529800412,
+      "loss": 0.6078,
+      "step": 7762
+    },
+    {
+      "epoch": 0.5400535670805942,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.0009189690589151752,
+      "loss": 0.689,
+      "step": 7763
+    },
+    {
+      "epoch": 0.5401231347177293,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0009187444689646521,
+      "loss": 0.5132,
+      "step": 7764
+    },
+    {
+      "epoch": 0.5401927023548645,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.000918519883139875,
+      "loss": 0.778,
+      "step": 7765
+    },
+    {
+      "epoch": 0.5402622699919997,
+      "grad_norm": 1.59375,
+      "learning_rate": 0.000918295301452248,
+      "loss": 1.479,
+      "step": 7766
+    },
+    {
+      "epoch": 0.540331837629135,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0009180707239131735,
+      "loss": 0.9314,
+      "step": 7767
+    },
+    {
+      "epoch": 0.5404014052662701,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0009178461505340546,
+      "loss": 0.8695,
+      "step": 7768
+    },
+    {
+      "epoch": 0.5404709729034053,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0009176215813262944,
+      "loss": 0.8224,
+      "step": 7769
+    },
+    {
+      "epoch": 0.5405405405405406,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0009173970163012949,
+      "loss": 0.6287,
+      "step": 7770
+    },
+    {
+      "epoch": 0.5406101081776757,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0009171724554704586,
+      "loss": 1.0836,
+      "step": 7771
+    },
+    {
+      "epoch": 0.5406796758148109,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0009169478988451873,
+      "loss": 0.8674,
+      "step": 7772
+    },
+    {
+      "epoch": 0.5407492434519462,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0009167233464368835,
+      "loss": 0.6791,
+      "step": 7773
+    },
+    {
+      "epoch": 0.5408188110890814,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0009164987982569481,
+      "loss": 0.7213,
+      "step": 7774
+    },
+    {
+      "epoch": 0.5408883787262165,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0009162742543167828,
+      "loss": 0.6782,
+      "step": 7775
+    },
+    {
+      "epoch": 0.5409579463633518,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.000916049714627789,
+      "loss": 0.7157,
+      "step": 7776
+    },
+    {
+      "epoch": 0.541027514000487,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0009158251792013677,
+      "loss": 0.8124,
+      "step": 7777
+    },
+    {
+      "epoch": 0.5410970816376222,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0009156006480489196,
+      "loss": 0.642,
+      "step": 7778
+    },
+    {
+      "epoch": 0.5411666492747573,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0009153761211818447,
+      "loss": 0.5892,
+      "step": 7779
+    },
+    {
+      "epoch": 0.5412362169118926,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0009151515986115442,
+      "loss": 0.6871,
+      "step": 7780
+    },
+    {
+      "epoch": 0.5413057845490278,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0009149270803494178,
+      "loss": 0.9627,
+      "step": 7781
+    },
+    {
+      "epoch": 0.541375352186163,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0009147025664068652,
+      "loss": 0.8162,
+      "step": 7782
+    },
+    {
+      "epoch": 0.5414449198232982,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0009144780567952866,
+      "loss": 0.6913,
+      "step": 7783
+    },
+    {
+      "epoch": 0.5415144874604334,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0009142535515260814,
+      "loss": 0.9963,
+      "step": 7784
+    },
+    {
+      "epoch": 0.5415840550975686,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0009140290506106485,
+      "loss": 1.0329,
+      "step": 7785
+    },
+    {
+      "epoch": 0.5416536227347039,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0009138045540603868,
+      "loss": 0.8808,
+      "step": 7786
+    },
+    {
+      "epoch": 0.541723190371839,
+      "grad_norm": 0.7578125,
+      "learning_rate": 0.0009135800618866957,
+      "loss": 0.6811,
+      "step": 7787
+    },
+    {
+      "epoch": 0.5417927580089742,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0009133555741009735,
+      "loss": 0.8626,
+      "step": 7788
+    },
+    {
+      "epoch": 0.5418623256461095,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0009131310907146181,
+      "loss": 0.8456,
+      "step": 7789
+    },
+    {
+      "epoch": 0.5419318932832446,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0009129066117390284,
+      "loss": 0.7679,
+      "step": 7790
+    },
+    {
+      "epoch": 0.5420014609203798,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0009126821371856021,
+      "loss": 0.7074,
+      "step": 7791
+    },
+    {
+      "epoch": 0.542071028557515,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0009124576670657366,
+      "loss": 0.8705,
+      "step": 7792
+    },
+    {
+      "epoch": 0.5421405961946503,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0009122332013908293,
+      "loss": 0.8058,
+      "step": 7793
+    },
+    {
+      "epoch": 0.5422101638317854,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0009120087401722782,
+      "loss": 0.7583,
+      "step": 7794
+    },
+    {
+      "epoch": 0.5422797314689206,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0009117842834214793,
+      "loss": 0.781,
+      "step": 7795
+    },
+    {
+      "epoch": 0.5423492991060559,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0009115598311498299,
+      "loss": 0.8502,
+      "step": 7796
+    },
+    {
+      "epoch": 0.5424188667431911,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0009113353833687266,
+      "loss": 0.7159,
+      "step": 7797
+    },
+    {
+      "epoch": 0.5424884343803262,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0009111109400895659,
+      "loss": 0.5067,
+      "step": 7798
+    },
+    {
+      "epoch": 0.5425580020174615,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0009108865013237433,
+      "loss": 0.7159,
+      "step": 7799
+    },
+    {
+      "epoch": 0.5426275696545967,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0009106620670826548,
+      "loss": 0.6929,
+      "step": 7800
+    },
+    {
+      "epoch": 0.5426971372917319,
+      "grad_norm": 1.90625,
+      "learning_rate": 0.0009104376373776967,
+      "loss": 0.7217,
+      "step": 7801
+    },
+    {
+      "epoch": 0.5427667049288671,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0009102132122202638,
+      "loss": 0.8091,
+      "step": 7802
+    },
+    {
+      "epoch": 0.5428362725660023,
+      "grad_norm": 1.6171875,
+      "learning_rate": 0.000909988791621751,
+      "loss": 0.9184,
+      "step": 7803
+    },
+    {
+      "epoch": 0.5429058402031375,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0009097643755935541,
+      "loss": 0.9267,
+      "step": 7804
+    },
+    {
+      "epoch": 0.5429754078402727,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0009095399641470675,
+      "loss": 0.8922,
+      "step": 7805
+    },
+    {
+      "epoch": 0.5430449754774079,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0009093155572936854,
+      "loss": 0.886,
+      "step": 7806
+    },
+    {
+      "epoch": 0.5431145431145431,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.000909091155044802,
+      "loss": 0.8428,
+      "step": 7807
+    },
+    {
+      "epoch": 0.5431841107516783,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0009088667574118119,
+      "loss": 0.8639,
+      "step": 7808
+    },
+    {
+      "epoch": 0.5432536783888136,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0009086423644061083,
+      "loss": 0.7444,
+      "step": 7809
+    },
+    {
+      "epoch": 0.5433232460259487,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0009084179760390849,
+      "loss": 0.7876,
+      "step": 7810
+    },
+    {
+      "epoch": 0.5433928136630839,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0009081935923221352,
+      "loss": 0.4759,
+      "step": 7811
+    },
+    {
+      "epoch": 0.5434623813002192,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0009079692132666523,
+      "loss": 0.6489,
+      "step": 7812
+    },
+    {
+      "epoch": 0.5435319489373543,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0009077448388840286,
+      "loss": 0.6377,
+      "step": 7813
+    },
+    {
+      "epoch": 0.5436015165744895,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0009075204691856569,
+      "loss": 0.9348,
+      "step": 7814
+    },
+    {
+      "epoch": 0.5436710842116248,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0009072961041829299,
+      "loss": 0.7295,
+      "step": 7815
+    },
+    {
+      "epoch": 0.54374065184876,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0009070717438872395,
+      "loss": 0.6259,
+      "step": 7816
+    },
+    {
+      "epoch": 0.5438102194858951,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0009068473883099773,
+      "loss": 0.7253,
+      "step": 7817
+    },
+    {
+      "epoch": 0.5438797871230303,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0009066230374625353,
+      "loss": 0.6697,
+      "step": 7818
+    },
+    {
+      "epoch": 0.5439493547601656,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.000906398691356305,
+      "loss": 0.6115,
+      "step": 7819
+    },
+    {
+      "epoch": 0.5440189223973008,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0009061743500026773,
+      "loss": 0.9162,
+      "step": 7820
+    },
+    {
+      "epoch": 0.5440884900344359,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0009059500134130428,
+      "loss": 1.1168,
+      "step": 7821
+    },
+    {
+      "epoch": 0.5441580576715712,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0009057256815987928,
+      "loss": 0.9183,
+      "step": 7822
+    },
+    {
+      "epoch": 0.5442276253087064,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0009055013545713179,
+      "loss": 0.9001,
+      "step": 7823
+    },
+    {
+      "epoch": 0.5442971929458416,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0009052770323420074,
+      "loss": 0.9196,
+      "step": 7824
+    },
+    {
+      "epoch": 0.5443667605829768,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.000905052714922252,
+      "loss": 0.8726,
+      "step": 7825
+    },
+    {
+      "epoch": 0.544436328220112,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0009048284023234413,
+      "loss": 0.558,
+      "step": 7826
+    },
+    {
+      "epoch": 0.5445058958572472,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0009046040945569644,
+      "loss": 0.915,
+      "step": 7827
+    },
+    {
+      "epoch": 0.5445754634943825,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0009043797916342106,
+      "loss": 0.9056,
+      "step": 7828
+    },
+    {
+      "epoch": 0.5446450311315176,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0009041554935665691,
+      "loss": 0.8979,
+      "step": 7829
+    },
+    {
+      "epoch": 0.5447145987686528,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.000903931200365429,
+      "loss": 0.8202,
+      "step": 7830
+    },
+    {
+      "epoch": 0.544784166405788,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0009037069120421777,
+      "loss": 0.8047,
+      "step": 7831
+    },
+    {
+      "epoch": 0.5448537340429233,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0009034826286082043,
+      "loss": 0.6387,
+      "step": 7832
+    },
+    {
+      "epoch": 0.5449233016800584,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0009032583500748968,
+      "loss": 0.8866,
+      "step": 7833
+    },
+    {
+      "epoch": 0.5449928693171936,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0009030340764536424,
+      "loss": 0.995,
+      "step": 7834
+    },
+    {
+      "epoch": 0.5450624369543289,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0009028098077558287,
+      "loss": 0.76,
+      "step": 7835
+    },
+    {
+      "epoch": 0.545132004591464,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0009025855439928433,
+      "loss": 0.7401,
+      "step": 7836
+    },
+    {
+      "epoch": 0.5452015722285992,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0009023612851760731,
+      "loss": 0.8346,
+      "step": 7837
+    },
+    {
+      "epoch": 0.5452711398657345,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0009021370313169046,
+      "loss": 0.596,
+      "step": 7838
+    },
+    {
+      "epoch": 0.5453407075028697,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0009019127824267242,
+      "loss": 0.6525,
+      "step": 7839
+    },
+    {
+      "epoch": 0.5454102751400048,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0009016885385169185,
+      "loss": 1.0252,
+      "step": 7840
+    },
+    {
+      "epoch": 0.5454798427771401,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0009014642995988733,
+      "loss": 0.801,
+      "step": 7841
+    },
+    {
+      "epoch": 0.5455494104142753,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.000901240065683974,
+      "loss": 0.5813,
+      "step": 7842
+    },
+    {
+      "epoch": 0.5456189780514105,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0009010158367836066,
+      "loss": 0.8475,
+      "step": 7843
+    },
+    {
+      "epoch": 0.5456885456885456,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0009007916129091563,
+      "loss": 0.8293,
+      "step": 7844
+    },
+    {
+      "epoch": 0.5457581133256809,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0009005673940720077,
+      "loss": 0.765,
+      "step": 7845
+    },
+    {
+      "epoch": 0.5458276809628161,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0009003431802835454,
+      "loss": 0.6574,
+      "step": 7846
+    },
+    {
+      "epoch": 0.5458972485999513,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0009001189715551544,
+      "loss": 1.1601,
+      "step": 7847
+    },
+    {
+      "epoch": 0.5459668162370865,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0008998947678982187,
+      "loss": 0.5843,
+      "step": 7848
+    },
+    {
+      "epoch": 0.5460363838742217,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0008996705693241216,
+      "loss": 0.723,
+      "step": 7849
+    },
+    {
+      "epoch": 0.5461059515113569,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0008994463758442476,
+      "loss": 0.848,
+      "step": 7850
+    },
+    {
+      "epoch": 0.5461755191484922,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0008992221874699801,
+      "loss": 1.0163,
+      "step": 7851
+    },
+    {
+      "epoch": 0.5462450867856273,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0008989980042127016,
+      "loss": 0.6162,
+      "step": 7852
+    },
+    {
+      "epoch": 0.5463146544227625,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0008987738260837952,
+      "loss": 0.8456,
+      "step": 7853
+    },
+    {
+      "epoch": 0.5463842220598978,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.000898549653094644,
+      "loss": 0.8359,
+      "step": 7854
+    },
+    {
+      "epoch": 0.546453789697033,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0008983254852566303,
+      "loss": 0.8284,
+      "step": 7855
+    },
+    {
+      "epoch": 0.5465233573341681,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.0008981013225811354,
+      "loss": 0.6926,
+      "step": 7856
+    },
+    {
+      "epoch": 0.5465929249713033,
+      "grad_norm": 1.4765625,
+      "learning_rate": 0.000897877165079542,
+      "loss": 0.8208,
+      "step": 7857
+    },
+    {
+      "epoch": 0.5466624926084386,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0008976530127632317,
+      "loss": 0.7933,
+      "step": 7858
+    },
+    {
+      "epoch": 0.5467320602455737,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0008974288656435852,
+      "loss": 0.7539,
+      "step": 7859
+    },
+    {
+      "epoch": 0.5468016278827089,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0008972047237319838,
+      "loss": 0.7873,
+      "step": 7860
+    },
+    {
+      "epoch": 0.5468711955198442,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0008969805870398086,
+      "loss": 0.8811,
+      "step": 7861
+    },
+    {
+      "epoch": 0.5469407631569794,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0008967564555784401,
+      "loss": 0.9331,
+      "step": 7862
+    },
+    {
+      "epoch": 0.5470103307941145,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.000896532329359258,
+      "loss": 0.8192,
+      "step": 7863
+    },
+    {
+      "epoch": 0.5470798984312498,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0008963082083936429,
+      "loss": 0.9081,
+      "step": 7864
+    },
+    {
+      "epoch": 0.547149466068385,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0008960840926929745,
+      "loss": 1.0103,
+      "step": 7865
+    },
+    {
+      "epoch": 0.5472190337055202,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0008958599822686319,
+      "loss": 0.8191,
+      "step": 7866
+    },
+    {
+      "epoch": 0.5472886013426554,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0008956358771319943,
+      "loss": 0.755,
+      "step": 7867
+    },
+    {
+      "epoch": 0.5473581689797906,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0008954117772944412,
+      "loss": 0.8025,
+      "step": 7868
+    },
+    {
+      "epoch": 0.5474277366169258,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.000895187682767351,
+      "loss": 0.7683,
+      "step": 7869
+    },
+    {
+      "epoch": 0.547497304254061,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0008949635935621014,
+      "loss": 0.9424,
+      "step": 7870
+    },
+    {
+      "epoch": 0.5475668718911962,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0008947395096900715,
+      "loss": 0.9196,
+      "step": 7871
+    },
+    {
+      "epoch": 0.5476364395283314,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0008945154311626389,
+      "loss": 0.7907,
+      "step": 7872
+    },
+    {
+      "epoch": 0.5477060071654666,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0008942913579911808,
+      "loss": 0.8883,
+      "step": 7873
+    },
+    {
+      "epoch": 0.5477755748026019,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0008940672901870745,
+      "loss": 0.7396,
+      "step": 7874
+    },
+    {
+      "epoch": 0.547845142439737,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0008938432277616975,
+      "loss": 0.7292,
+      "step": 7875
+    },
+    {
+      "epoch": 0.5479147100768722,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0008936191707264265,
+      "loss": 0.814,
+      "step": 7876
+    },
+    {
+      "epoch": 0.5479842777140075,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0008933951190926374,
+      "loss": 0.8076,
+      "step": 7877
+    },
+    {
+      "epoch": 0.5480538453511427,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.000893171072871707,
+      "loss": 0.8851,
+      "step": 7878
+    },
+    {
+      "epoch": 0.5481234129882778,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0008929470320750114,
+      "loss": 1.0926,
+      "step": 7879
+    },
+    {
+      "epoch": 0.5481929806254131,
+      "grad_norm": 1.515625,
+      "learning_rate": 0.0008927229967139256,
+      "loss": 1.0434,
+      "step": 7880
+    },
+    {
+      "epoch": 0.5482625482625483,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0008924989667998251,
+      "loss": 0.9311,
+      "step": 7881
+    },
+    {
+      "epoch": 0.5483321158996834,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0008922749423440854,
+      "loss": 0.8518,
+      "step": 7882
+    },
+    {
+      "epoch": 0.5484016835368186,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0008920509233580814,
+      "loss": 0.8492,
+      "step": 7883
+    },
+    {
+      "epoch": 0.5484712511739539,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0008918269098531871,
+      "loss": 0.8881,
+      "step": 7884
+    },
+    {
+      "epoch": 0.5485408188110891,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0008916029018407772,
+      "loss": 0.8287,
+      "step": 7885
+    },
+    {
+      "epoch": 0.5486103864482242,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0008913788993322256,
+      "loss": 0.7321,
+      "step": 7886
+    },
+    {
+      "epoch": 0.5486799540853595,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0008911549023389063,
+      "loss": 0.9308,
+      "step": 7887
+    },
+    {
+      "epoch": 0.5487495217224947,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0008909309108721918,
+      "loss": 0.6023,
+      "step": 7888
+    },
+    {
+      "epoch": 0.5488190893596299,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.0008907069249434563,
+      "loss": 0.9637,
+      "step": 7889
+    },
+    {
+      "epoch": 0.5488886569967651,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0008904829445640724,
+      "loss": 0.7305,
+      "step": 7890
+    },
+    {
+      "epoch": 0.5489582246339003,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0008902589697454122,
+      "loss": 0.9317,
+      "step": 7891
+    },
+    {
+      "epoch": 0.5490277922710355,
+      "grad_norm": 0.875,
+      "learning_rate": 0.0008900350004988484,
+      "loss": 0.66,
+      "step": 7892
+    },
+    {
+      "epoch": 0.5490973599081708,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0008898110368357533,
+      "loss": 0.9249,
+      "step": 7893
+    },
+    {
+      "epoch": 0.5491669275453059,
+      "grad_norm": 0.73828125,
+      "learning_rate": 0.0008895870787674984,
+      "loss": 0.6496,
+      "step": 7894
+    },
+    {
+      "epoch": 0.5492364951824411,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0008893631263054547,
+      "loss": 0.9521,
+      "step": 7895
+    },
+    {
+      "epoch": 0.5493060628195763,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0008891391794609941,
+      "loss": 0.9686,
+      "step": 7896
+    },
+    {
+      "epoch": 0.5493756304567116,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0008889152382454872,
+      "loss": 0.7132,
+      "step": 7897
+    },
+    {
+      "epoch": 0.5494451980938467,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0008886913026703042,
+      "loss": 0.5568,
+      "step": 7898
+    },
+    {
+      "epoch": 0.5495147657309819,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0008884673727468164,
+      "loss": 0.8192,
+      "step": 7899
+    },
+    {
+      "epoch": 0.5495843333681172,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0008882434484863928,
+      "loss": 0.7144,
+      "step": 7900
+    },
+    {
+      "epoch": 0.5496539010052524,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.000888019529900404,
+      "loss": 0.9487,
+      "step": 7901
+    },
+    {
+      "epoch": 0.5497234686423875,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0008877956170002186,
+      "loss": 0.5608,
+      "step": 7902
+    },
+    {
+      "epoch": 0.5497930362795228,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0008875717097972064,
+      "loss": 0.7444,
+      "step": 7903
+    },
+    {
+      "epoch": 0.549862603916658,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0008873478083027364,
+      "loss": 0.5599,
+      "step": 7904
+    },
+    {
+      "epoch": 0.5499321715537931,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0008871239125281761,
+      "loss": 0.7114,
+      "step": 7905
+    },
+    {
+      "epoch": 0.5500017391909284,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0008869000224848954,
+      "loss": 0.9034,
+      "step": 7906
+    },
+    {
+      "epoch": 0.5500713068280636,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.0008866761381842612,
+      "loss": 0.7924,
+      "step": 7907
+    },
+    {
+      "epoch": 0.5501408744651988,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0008864522596376416,
+      "loss": 0.585,
+      "step": 7908
+    },
+    {
+      "epoch": 0.5502104421023339,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0008862283868564038,
+      "loss": 0.9106,
+      "step": 7909
+    },
+    {
+      "epoch": 0.5502800097394692,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.000886004519851915,
+      "loss": 0.8078,
+      "step": 7910
+    },
+    {
+      "epoch": 0.5503495773766044,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0008857806586355423,
+      "loss": 0.7807,
+      "step": 7911
+    },
+    {
+      "epoch": 0.5504191450137396,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0008855568032186517,
+      "loss": 0.6597,
+      "step": 7912
+    },
+    {
+      "epoch": 0.5504887126508748,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0008853329536126102,
+      "loss": 1.0829,
+      "step": 7913
+    },
+    {
+      "epoch": 0.55055828028801,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0008851091098287831,
+      "loss": 0.8936,
+      "step": 7914
+    },
+    {
+      "epoch": 0.5506278479251452,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0008848852718785366,
+      "loss": 0.7169,
+      "step": 7915
+    },
+    {
+      "epoch": 0.5506974155622805,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.0008846614397732354,
+      "loss": 0.6306,
+      "step": 7916
+    },
+    {
+      "epoch": 0.5507669831994156,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0008844376135242451,
+      "loss": 0.7502,
+      "step": 7917
+    },
+    {
+      "epoch": 0.5508365508365508,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0008842137931429303,
+      "loss": 0.8823,
+      "step": 7918
+    },
+    {
+      "epoch": 0.5509061184736861,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0008839899786406558,
+      "loss": 0.8811,
+      "step": 7919
+    },
+    {
+      "epoch": 0.5509756861108213,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0008837661700287849,
+      "loss": 0.9098,
+      "step": 7920
+    },
+    {
+      "epoch": 0.5510452537479564,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.0008835423673186822,
+      "loss": 0.805,
+      "step": 7921
+    },
+    {
+      "epoch": 0.5511148213850916,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0008833185705217114,
+      "loss": 0.8195,
+      "step": 7922
+    },
+    {
+      "epoch": 0.5511843890222269,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.000883094779649235,
+      "loss": 0.9827,
+      "step": 7923
+    },
+    {
+      "epoch": 0.551253956659362,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0008828709947126166,
+      "loss": 0.648,
+      "step": 7924
+    },
+    {
+      "epoch": 0.5513235242964972,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0008826472157232188,
+      "loss": 0.8791,
+      "step": 7925
+    },
+    {
+      "epoch": 0.5513930919336325,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0008824234426924041,
+      "loss": 0.9768,
+      "step": 7926
+    },
+    {
+      "epoch": 0.5514626595707677,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0008821996756315341,
+      "loss": 0.6314,
+      "step": 7927
+    },
+    {
+      "epoch": 0.5515322272079028,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0008819759145519707,
+      "loss": 0.7367,
+      "step": 7928
+    },
+    {
+      "epoch": 0.5516017948450381,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0008817521594650759,
+      "loss": 0.8871,
+      "step": 7929
+    },
+    {
+      "epoch": 0.5516713624821733,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0008815284103822097,
+      "loss": 0.9596,
+      "step": 7930
+    },
+    {
+      "epoch": 0.5517409301193085,
+      "grad_norm": 1.5234375,
+      "learning_rate": 0.0008813046673147344,
+      "loss": 0.9696,
+      "step": 7931
+    },
+    {
+      "epoch": 0.5518104977564438,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0008810809302740095,
+      "loss": 0.9666,
+      "step": 7932
+    },
+    {
+      "epoch": 0.5518800653935789,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0008808571992713958,
+      "loss": 0.8094,
+      "step": 7933
+    },
+    {
+      "epoch": 0.5519496330307141,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0008806334743182526,
+      "loss": 0.8582,
+      "step": 7934
+    },
+    {
+      "epoch": 0.5520192006678493,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0008804097554259402,
+      "loss": 1.0506,
+      "step": 7935
+    },
+    {
+      "epoch": 0.5520887683049845,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0008801860426058177,
+      "loss": 0.9523,
+      "step": 7936
+    },
+    {
+      "epoch": 0.5521583359421197,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0008799623358692434,
+      "loss": 1.1695,
+      "step": 7937
+    },
+    {
+      "epoch": 0.5522279035792549,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0008797386352275775,
+      "loss": 0.9633,
+      "step": 7938
+    },
+    {
+      "epoch": 0.5522974712163902,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0008795149406921772,
+      "loss": 0.7446,
+      "step": 7939
+    },
+    {
+      "epoch": 0.5523670388535253,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0008792912522744011,
+      "loss": 0.9321,
+      "step": 7940
+    },
+    {
+      "epoch": 0.5524366064906605,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.0008790675699856064,
+      "loss": 0.8671,
+      "step": 7941
+    },
+    {
+      "epoch": 0.5525061741277958,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0008788438938371512,
+      "loss": 0.7517,
+      "step": 7942
+    },
+    {
+      "epoch": 0.552575741764931,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0008786202238403926,
+      "loss": 0.6515,
+      "step": 7943
+    },
+    {
+      "epoch": 0.5526453094020661,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0008783965600066866,
+      "loss": 0.7545,
+      "step": 7944
+    },
+    {
+      "epoch": 0.5527148770392014,
+      "grad_norm": 1.125,
+      "learning_rate": 0.000878172902347391,
+      "loss": 0.7001,
+      "step": 7945
+    },
+    {
+      "epoch": 0.5527844446763366,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0008779492508738611,
+      "loss": 0.9313,
+      "step": 7946
+    },
+    {
+      "epoch": 0.5528540123134718,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0008777256055974533,
+      "loss": 0.82,
+      "step": 7947
+    },
+    {
+      "epoch": 0.5529235799506069,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0008775019665295225,
+      "loss": 0.8616,
+      "step": 7948
+    },
+    {
+      "epoch": 0.5529931475877422,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0008772783336814246,
+      "loss": 0.8757,
+      "step": 7949
+    },
+    {
+      "epoch": 0.5530627152248774,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0008770547070645145,
+      "loss": 0.9859,
+      "step": 7950
+    },
+    {
+      "epoch": 0.5531322828620125,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.0008768310866901463,
+      "loss": 0.7146,
+      "step": 7951
+    },
+    {
+      "epoch": 0.5532018504991478,
+      "grad_norm": 0.875,
+      "learning_rate": 0.0008766074725696752,
+      "loss": 0.7746,
+      "step": 7952
+    },
+    {
+      "epoch": 0.553271418136283,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0008763838647144544,
+      "loss": 0.822,
+      "step": 7953
+    },
+    {
+      "epoch": 0.5533409857734182,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0008761602631358382,
+      "loss": 0.692,
+      "step": 7954
+    },
+    {
+      "epoch": 0.5534105534105535,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0008759366678451792,
+      "loss": 0.6741,
+      "step": 7955
+    },
+    {
+      "epoch": 0.5534801210476886,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0008757130788538311,
+      "loss": 0.6768,
+      "step": 7956
+    },
+    {
+      "epoch": 0.5535496886848238,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0008754894961731463,
+      "loss": 0.63,
+      "step": 7957
+    },
+    {
+      "epoch": 0.5536192563219591,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0008752659198144773,
+      "loss": 0.7849,
+      "step": 7958
+    },
+    {
+      "epoch": 0.5536888239590942,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0008750423497891764,
+      "loss": 0.8418,
+      "step": 7959
+    },
+    {
+      "epoch": 0.5537583915962294,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.000874818786108595,
+      "loss": 0.6265,
+      "step": 7960
+    },
+    {
+      "epoch": 0.5538279592333646,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0008745952287840849,
+      "loss": 0.9863,
+      "step": 7961
+    },
+    {
+      "epoch": 0.5538975268704999,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0008743716778269966,
+      "loss": 0.7077,
+      "step": 7962
+    },
+    {
+      "epoch": 0.553967094507635,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0008741481332486813,
+      "loss": 0.9039,
+      "step": 7963
+    },
+    {
+      "epoch": 0.5540366621447702,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0008739245950604897,
+      "loss": 0.6563,
+      "step": 7964
+    },
+    {
+      "epoch": 0.5541062297819055,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0008737010632737714,
+      "loss": 0.9993,
+      "step": 7965
+    },
+    {
+      "epoch": 0.5541757974190407,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0008734775378998771,
+      "loss": 0.6714,
+      "step": 7966
+    },
+    {
+      "epoch": 0.5542453650561758,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.0008732540189501552,
+      "loss": 1.1504,
+      "step": 7967
+    },
+    {
+      "epoch": 0.5543149326933111,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0008730305064359558,
+      "loss": 0.7834,
+      "step": 7968
+    },
+    {
+      "epoch": 0.5543845003304463,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0008728070003686266,
+      "loss": 0.7097,
+      "step": 7969
+    },
+    {
+      "epoch": 0.5544540679675815,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0008725835007595174,
+      "loss": 0.7942,
+      "step": 7970
+    },
+    {
+      "epoch": 0.5545236356047167,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0008723600076199757,
+      "loss": 0.8154,
+      "step": 7971
+    },
+    {
+      "epoch": 0.5545932032418519,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0008721365209613491,
+      "loss": 0.8025,
+      "step": 7972
+    },
+    {
+      "epoch": 0.5546627708789871,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.000871913040794986,
+      "loss": 0.6833,
+      "step": 7973
+    },
+    {
+      "epoch": 0.5547323385161222,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0008716895671322329,
+      "loss": 0.9714,
+      "step": 7974
+    },
+    {
+      "epoch": 0.5548019061532575,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0008714660999844371,
+      "loss": 1.101,
+      "step": 7975
+    },
+    {
+      "epoch": 0.5548714737903927,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0008712426393629441,
+      "loss": 0.8226,
+      "step": 7976
+    },
+    {
+      "epoch": 0.5549410414275279,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0008710191852791016,
+      "loss": 0.8727,
+      "step": 7977
+    },
+    {
+      "epoch": 0.5550106090646632,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0008707957377442546,
+      "loss": 0.8996,
+      "step": 7978
+    },
+    {
+      "epoch": 0.5550801767017983,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0008705722967697484,
+      "loss": 0.8693,
+      "step": 7979
+    },
+    {
+      "epoch": 0.5551497443389335,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0008703488623669293,
+      "loss": 0.768,
+      "step": 7980
+    },
+    {
+      "epoch": 0.5552193119760688,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0008701254345471411,
+      "loss": 0.9451,
+      "step": 7981
+    },
+    {
+      "epoch": 0.555288879613204,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.000869902013321729,
+      "loss": 0.6398,
+      "step": 7982
+    },
+    {
+      "epoch": 0.5553584472503391,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0008696785987020362,
+      "loss": 0.932,
+      "step": 7983
+    },
+    {
+      "epoch": 0.5554280148874744,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0008694551906994081,
+      "loss": 0.7843,
+      "step": 7984
+    },
+    {
+      "epoch": 0.5554975825246096,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.000869231789325187,
+      "loss": 0.8724,
+      "step": 7985
+    },
+    {
+      "epoch": 0.5555671501617447,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0008690083945907163,
+      "loss": 0.8195,
+      "step": 7986
+    },
+    {
+      "epoch": 0.5556367177988799,
+      "grad_norm": 1.6484375,
+      "learning_rate": 0.0008687850065073398,
+      "loss": 0.9577,
+      "step": 7987
+    },
+    {
+      "epoch": 0.5557062854360152,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.0008685616250863988,
+      "loss": 0.5295,
+      "step": 7988
+    },
+    {
+      "epoch": 0.5557758530731504,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0008683382503392361,
+      "loss": 0.613,
+      "step": 7989
+    },
+    {
+      "epoch": 0.5558454207102855,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0008681148822771932,
+      "loss": 0.8027,
+      "step": 7990
+    },
+    {
+      "epoch": 0.5559149883474208,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0008678915209116121,
+      "loss": 0.7651,
+      "step": 7991
+    },
+    {
+      "epoch": 0.555984555984556,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0008676681662538335,
+      "loss": 0.9081,
+      "step": 7992
+    },
+    {
+      "epoch": 0.5560541236216912,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0008674448183151988,
+      "loss": 0.9049,
+      "step": 7993
+    },
+    {
+      "epoch": 0.5561236912588264,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0008672214771070477,
+      "loss": 0.6433,
+      "step": 7994
+    },
+    {
+      "epoch": 0.5561932588959616,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0008669981426407208,
+      "loss": 0.6013,
+      "step": 7995
+    },
+    {
+      "epoch": 0.5562628265330968,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0008667748149275578,
+      "loss": 0.7064,
+      "step": 7996
+    },
+    {
+      "epoch": 0.5563323941702321,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0008665514939788981,
+      "loss": 0.799,
+      "step": 7997
+    },
+    {
+      "epoch": 0.5564019618073672,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0008663281798060814,
+      "loss": 0.9057,
+      "step": 7998
+    },
+    {
+      "epoch": 0.5564715294445024,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0008661048724204457,
+      "loss": 0.8069,
+      "step": 7999
+    },
+    {
+      "epoch": 0.5565410970816376,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0008658815718333298,
+      "loss": 0.9088,
+      "step": 8000
+    },
+    {
+      "epoch": 0.5566106647187729,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0008656582780560712,
+      "loss": 0.7117,
+      "step": 8001
+    },
+    {
+      "epoch": 0.556680232355908,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0008654349911000086,
+      "loss": 0.6986,
+      "step": 8002
+    },
+    {
+      "epoch": 0.5567497999930432,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0008652117109764787,
+      "loss": 0.7315,
+      "step": 8003
+    },
+    {
+      "epoch": 0.5568193676301785,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.0008649884376968186,
+      "loss": 0.6983,
+      "step": 8004
+    },
+    {
+      "epoch": 0.5568889352673136,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0008647651712723654,
+      "loss": 0.5391,
+      "step": 8005
+    },
+    {
+      "epoch": 0.5569585029044488,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.000864541911714455,
+      "loss": 0.9969,
+      "step": 8006
+    },
+    {
+      "epoch": 0.5570280705415841,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0008643186590344239,
+      "loss": 0.6933,
+      "step": 8007
+    },
+    {
+      "epoch": 0.5570976381787193,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.0008640954132436067,
+      "loss": 0.7267,
+      "step": 8008
+    },
+    {
+      "epoch": 0.5571672058158544,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0008638721743533402,
+      "loss": 0.7014,
+      "step": 8009
+    },
+    {
+      "epoch": 0.5572367734529897,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0008636489423749581,
+      "loss": 0.8283,
+      "step": 8010
+    },
+    {
+      "epoch": 0.5573063410901249,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0008634257173197954,
+      "loss": 1.0623,
+      "step": 8011
+    },
+    {
+      "epoch": 0.5573759087272601,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0008632024991991867,
+      "loss": 0.8609,
+      "step": 8012
+    },
+    {
+      "epoch": 0.5574454763643952,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0008629792880244653,
+      "loss": 0.8745,
+      "step": 8013
+    },
+    {
+      "epoch": 0.5575150440015305,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0008627560838069655,
+      "loss": 0.7606,
+      "step": 8014
+    },
+    {
+      "epoch": 0.5575846116386657,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0008625328865580191,
+      "loss": 0.7748,
+      "step": 8015
+    },
+    {
+      "epoch": 0.5576541792758009,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0008623096962889606,
+      "loss": 0.9064,
+      "step": 8016
+    },
+    {
+      "epoch": 0.5577237469129361,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0008620865130111215,
+      "loss": 0.8291,
+      "step": 8017
+    },
+    {
+      "epoch": 0.5577933145500713,
+      "grad_norm": 1.6953125,
+      "learning_rate": 0.0008618633367358339,
+      "loss": 1.0929,
+      "step": 8018
+    },
+    {
+      "epoch": 0.5578628821872065,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0008616401674744303,
+      "loss": 0.7828,
+      "step": 8019
+    },
+    {
+      "epoch": 0.5579324498243418,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0008614170052382413,
+      "loss": 0.9277,
+      "step": 8020
+    },
+    {
+      "epoch": 0.5580020174614769,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0008611938500385983,
+      "loss": 0.9666,
+      "step": 8021
+    },
+    {
+      "epoch": 0.5580715850986121,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0008609707018868317,
+      "loss": 0.4326,
+      "step": 8022
+    },
+    {
+      "epoch": 0.5581411527357474,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0008607475607942725,
+      "loss": 0.5866,
+      "step": 8023
+    },
+    {
+      "epoch": 0.5582107203728826,
+      "grad_norm": 1.640625,
+      "learning_rate": 0.0008605244267722502,
+      "loss": 0.9919,
+      "step": 8024
+    },
+    {
+      "epoch": 0.5582802880100177,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0008603012998320941,
+      "loss": 0.8603,
+      "step": 8025
+    },
+    {
+      "epoch": 0.5583498556471529,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0008600781799851344,
+      "loss": 0.8032,
+      "step": 8026
+    },
+    {
+      "epoch": 0.5584194232842882,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0008598550672426993,
+      "loss": 1.0323,
+      "step": 8027
+    },
+    {
+      "epoch": 0.5584889909214233,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0008596319616161175,
+      "loss": 0.6604,
+      "step": 8028
+    },
+    {
+      "epoch": 0.5585585585585585,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0008594088631167169,
+      "loss": 0.8073,
+      "step": 8029
+    },
+    {
+      "epoch": 0.5586281261956938,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0008591857717558261,
+      "loss": 0.8021,
+      "step": 8030
+    },
+    {
+      "epoch": 0.558697693832829,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0008589626875447717,
+      "loss": 0.8974,
+      "step": 8031
+    },
+    {
+      "epoch": 0.5587672614699641,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0008587396104948811,
+      "loss": 0.8931,
+      "step": 8032
+    },
+    {
+      "epoch": 0.5588368291070994,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0008585165406174813,
+      "loss": 0.894,
+      "step": 8033
+    },
+    {
+      "epoch": 0.5589063967442346,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0008582934779238985,
+      "loss": 0.6545,
+      "step": 8034
+    },
+    {
+      "epoch": 0.5589759643813698,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0008580704224254583,
+      "loss": 0.7657,
+      "step": 8035
+    },
+    {
+      "epoch": 0.559045532018505,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0008578473741334867,
+      "loss": 0.7614,
+      "step": 8036
+    },
+    {
+      "epoch": 0.5591150996556402,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0008576243330593093,
+      "loss": 0.8003,
+      "step": 8037
+    },
+    {
+      "epoch": 0.5591846672927754,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0008574012992142504,
+      "loss": 1.0983,
+      "step": 8038
+    },
+    {
+      "epoch": 0.5592542349299106,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0008571782726096346,
+      "loss": 1.0023,
+      "step": 8039
+    },
+    {
+      "epoch": 0.5593238025670458,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0008569552532567865,
+      "loss": 0.8936,
+      "step": 8040
+    },
+    {
+      "epoch": 0.559393370204181,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0008567322411670297,
+      "loss": 0.7366,
+      "step": 8041
+    },
+    {
+      "epoch": 0.5594629378413162,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0008565092363516876,
+      "loss": 0.7664,
+      "step": 8042
+    },
+    {
+      "epoch": 0.5595325054784515,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0008562862388220828,
+      "loss": 0.7319,
+      "step": 8043
+    },
+    {
+      "epoch": 0.5596020731155866,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.000856063248589539,
+      "loss": 0.8718,
+      "step": 8044
+    },
+    {
+      "epoch": 0.5596716407527218,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0008558402656653777,
+      "loss": 0.8213,
+      "step": 8045
+    },
+    {
+      "epoch": 0.5597412083898571,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.0008556172900609207,
+      "loss": 0.7882,
+      "step": 8046
+    },
+    {
+      "epoch": 0.5598107760269923,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0008553943217874903,
+      "loss": 0.768,
+      "step": 8047
+    },
+    {
+      "epoch": 0.5598803436641274,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0008551713608564075,
+      "loss": 1.0058,
+      "step": 8048
+    },
+    {
+      "epoch": 0.5599499113012627,
+      "grad_norm": 1.0,
+      "learning_rate": 0.000854948407278993,
+      "loss": 0.8091,
+      "step": 8049
+    },
+    {
+      "epoch": 0.5600194789383979,
+      "grad_norm": 1.125,
+      "learning_rate": 0.000854725461066567,
+      "loss": 0.9949,
+      "step": 8050
+    },
+    {
+      "epoch": 0.560089046575533,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0008545025222304501,
+      "loss": 0.7258,
+      "step": 8051
+    },
+    {
+      "epoch": 0.5601586142126682,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0008542795907819618,
+      "loss": 0.6841,
+      "step": 8052
+    },
+    {
+      "epoch": 0.5602281818498035,
+      "grad_norm": 1.125,
+      "learning_rate": 0.000854056666732421,
+      "loss": 0.6304,
+      "step": 8053
+    },
+    {
+      "epoch": 0.5602977494869387,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.0008538337500931472,
+      "loss": 0.6101,
+      "step": 8054
+    },
+    {
+      "epoch": 0.5603673171240738,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0008536108408754593,
+      "loss": 1.0405,
+      "step": 8055
+    },
+    {
+      "epoch": 0.5604368847612091,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0008533879390906747,
+      "loss": 0.9695,
+      "step": 8056
+    },
+    {
+      "epoch": 0.5605064523983443,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0008531650447501114,
+      "loss": 1.005,
+      "step": 8057
+    },
+    {
+      "epoch": 0.5605760200354795,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0008529421578650873,
+      "loss": 0.8173,
+      "step": 8058
+    },
+    {
+      "epoch": 0.5606455876726147,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0008527192784469191,
+      "loss": 0.9021,
+      "step": 8059
+    },
+    {
+      "epoch": 0.5607151553097499,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0008524964065069234,
+      "loss": 0.756,
+      "step": 8060
+    },
+    {
+      "epoch": 0.5607847229468851,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0008522735420564169,
+      "loss": 0.8285,
+      "step": 8061
+    },
+    {
+      "epoch": 0.5608542905840204,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0008520506851067154,
+      "loss": 0.9753,
+      "step": 8062
+    },
+    {
+      "epoch": 0.5609238582211555,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0008518278356691344,
+      "loss": 0.8214,
+      "step": 8063
+    },
+    {
+      "epoch": 0.5609934258582907,
+      "grad_norm": 1.59375,
+      "learning_rate": 0.0008516049937549888,
+      "loss": 0.8445,
+      "step": 8064
+    },
+    {
+      "epoch": 0.5610629934954259,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0008513821593755939,
+      "loss": 1.078,
+      "step": 8065
+    },
+    {
+      "epoch": 0.5611325611325612,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0008511593325422639,
+      "loss": 0.7691,
+      "step": 8066
+    },
+    {
+      "epoch": 0.5612021287696963,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0008509365132663124,
+      "loss": 1.0448,
+      "step": 8067
+    },
+    {
+      "epoch": 0.5612716964068315,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0008507137015590537,
+      "loss": 0.5083,
+      "step": 8068
+    },
+    {
+      "epoch": 0.5613412640439668,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0008504908974318009,
+      "loss": 0.884,
+      "step": 8069
+    },
+    {
+      "epoch": 0.561410831681102,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0008502681008958667,
+      "loss": 0.9151,
+      "step": 8070
+    },
+    {
+      "epoch": 0.5614803993182371,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0008500453119625633,
+      "loss": 0.807,
+      "step": 8071
+    },
+    {
+      "epoch": 0.5615499669553724,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0008498225306432034,
+      "loss": 0.8662,
+      "step": 8072
+    },
+    {
+      "epoch": 0.5616195345925076,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0008495997569490986,
+      "loss": 0.872,
+      "step": 8073
+    },
+    {
+      "epoch": 0.5616891022296427,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0008493769908915599,
+      "loss": 0.8244,
+      "step": 8074
+    },
+    {
+      "epoch": 0.561758669866778,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0008491542324818982,
+      "loss": 0.9894,
+      "step": 8075
+    },
+    {
+      "epoch": 0.5618282375039132,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0008489314817314246,
+      "loss": 0.8056,
+      "step": 8076
+    },
+    {
+      "epoch": 0.5618978051410484,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0008487087386514488,
+      "loss": 0.7569,
+      "step": 8077
+    },
+    {
+      "epoch": 0.5619673727781835,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0008484860032532804,
+      "loss": 0.9542,
+      "step": 8078
+    },
+    {
+      "epoch": 0.5620369404153188,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0008482632755482293,
+      "loss": 0.8073,
+      "step": 8079
+    },
+    {
+      "epoch": 0.562106508052454,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0008480405555476045,
+      "loss": 0.8613,
+      "step": 8080
+    },
+    {
+      "epoch": 0.5621760756895892,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0008478178432627142,
+      "loss": 0.9116,
+      "step": 8081
+    },
+    {
+      "epoch": 0.5622456433267244,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0008475951387048664,
+      "loss": 0.7866,
+      "step": 8082
+    },
+    {
+      "epoch": 0.5623152109638596,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0008473724418853698,
+      "loss": 0.8013,
+      "step": 8083
+    },
+    {
+      "epoch": 0.5623847786009948,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0008471497528155311,
+      "loss": 0.8295,
+      "step": 8084
+    },
+    {
+      "epoch": 0.5624543462381301,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0008469270715066573,
+      "loss": 0.9016,
+      "step": 8085
+    },
+    {
+      "epoch": 0.5625239138752652,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0008467043979700554,
+      "loss": 0.6481,
+      "step": 8086
+    },
+    {
+      "epoch": 0.5625934815124004,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0008464817322170319,
+      "loss": 0.8493,
+      "step": 8087
+    },
+    {
+      "epoch": 0.5626630491495357,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0008462590742588918,
+      "loss": 0.7085,
+      "step": 8088
+    },
+    {
+      "epoch": 0.5627326167866709,
+      "grad_norm": 1.125,
+      "learning_rate": 0.000846036424106941,
+      "loss": 0.8964,
+      "step": 8089
+    },
+    {
+      "epoch": 0.562802184423806,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0008458137817724848,
+      "loss": 0.9103,
+      "step": 8090
+    },
+    {
+      "epoch": 0.5628717520609412,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0008455911472668276,
+      "loss": 0.8857,
+      "step": 8091
+    },
+    {
+      "epoch": 0.5629413196980765,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0008453685206012732,
+      "loss": 0.8084,
+      "step": 8092
+    },
+    {
+      "epoch": 0.5630108873352117,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0008451459017871263,
+      "loss": 0.7816,
+      "step": 8093
+    },
+    {
+      "epoch": 0.5630804549723468,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.0008449232908356901,
+      "loss": 0.6622,
+      "step": 8094
+    },
+    {
+      "epoch": 0.5631500226094821,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0008447006877582674,
+      "loss": 0.8106,
+      "step": 8095
+    },
+    {
+      "epoch": 0.5632195902466173,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0008444780925661609,
+      "loss": 0.9192,
+      "step": 8096
+    },
+    {
+      "epoch": 0.5632891578837524,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0008442555052706732,
+      "loss": 0.8015,
+      "step": 8097
+    },
+    {
+      "epoch": 0.5633587255208877,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0008440329258831057,
+      "loss": 0.557,
+      "step": 8098
+    },
+    {
+      "epoch": 0.5634282931580229,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0008438103544147601,
+      "loss": 0.8574,
+      "step": 8099
+    },
+    {
+      "epoch": 0.5634978607951581,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0008435877908769375,
+      "loss": 0.6947,
+      "step": 8100
+    },
+    {
+      "epoch": 0.5635674284322933,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.0008433652352809388,
+      "loss": 1.0825,
+      "step": 8101
+    },
+    {
+      "epoch": 0.5636369960694285,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0008431426876380636,
+      "loss": 0.9564,
+      "step": 8102
+    },
+    {
+      "epoch": 0.5637065637065637,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.000842920147959612,
+      "loss": 1.0568,
+      "step": 8103
+    },
+    {
+      "epoch": 0.5637761313436989,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0008426976162568837,
+      "loss": 0.7129,
+      "step": 8104
+    },
+    {
+      "epoch": 0.5638456989808341,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0008424750925411779,
+      "loss": 1.0349,
+      "step": 8105
+    },
+    {
+      "epoch": 0.5639152666179693,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0008422525768237925,
+      "loss": 0.6118,
+      "step": 8106
+    },
+    {
+      "epoch": 0.5639848342551045,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0008420300691160263,
+      "loss": 0.7699,
+      "step": 8107
+    },
+    {
+      "epoch": 0.5640544018922398,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0008418075694291772,
+      "loss": 1.049,
+      "step": 8108
+    },
+    {
+      "epoch": 0.5641239695293749,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0008415850777745421,
+      "loss": 0.7753,
+      "step": 8109
+    },
+    {
+      "epoch": 0.5641935371665101,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.0008413625941634181,
+      "loss": 0.7612,
+      "step": 8110
+    },
+    {
+      "epoch": 0.5642631048036454,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0008411401186071022,
+      "loss": 0.7208,
+      "step": 8111
+    },
+    {
+      "epoch": 0.5643326724407806,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0008409176511168906,
+      "loss": 0.76,
+      "step": 8112
+    },
+    {
+      "epoch": 0.5644022400779157,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0008406951917040784,
+      "loss": 0.6868,
+      "step": 8113
+    },
+    {
+      "epoch": 0.564471807715051,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0008404727403799614,
+      "loss": 0.8213,
+      "step": 8114
+    },
+    {
+      "epoch": 0.5645413753521862,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0008402502971558352,
+      "loss": 0.7647,
+      "step": 8115
+    },
+    {
+      "epoch": 0.5646109429893214,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0008400278620429932,
+      "loss": 0.7873,
+      "step": 8116
+    },
+    {
+      "epoch": 0.5646805106264565,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0008398054350527298,
+      "loss": 0.9852,
+      "step": 8117
+    },
+    {
+      "epoch": 0.5647500782635918,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0008395830161963394,
+      "loss": 0.7469,
+      "step": 8118
+    },
+    {
+      "epoch": 0.564819645900727,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.000839360605485115,
+      "loss": 1.0719,
+      "step": 8119
+    },
+    {
+      "epoch": 0.5648892135378621,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.000839138202930349,
+      "loss": 0.7244,
+      "step": 8120
+    },
+    {
+      "epoch": 0.5649587811749974,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0008389158085433343,
+      "loss": 1.0414,
+      "step": 8121
+    },
+    {
+      "epoch": 0.5650283488121326,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0008386934223353632,
+      "loss": 0.9417,
+      "step": 8122
+    },
+    {
+      "epoch": 0.5650979164492678,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0008384710443177269,
+      "loss": 0.8437,
+      "step": 8123
+    },
+    {
+      "epoch": 0.565167484086403,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0008382486745017166,
+      "loss": 0.8869,
+      "step": 8124
+    },
+    {
+      "epoch": 0.5652370517235382,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0008380263128986235,
+      "loss": 1.0772,
+      "step": 8125
+    },
+    {
+      "epoch": 0.5653066193606734,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.000837803959519738,
+      "loss": 0.5437,
+      "step": 8126
+    },
+    {
+      "epoch": 0.5653761869978087,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0008375816143763495,
+      "loss": 0.6332,
+      "step": 8127
+    },
+    {
+      "epoch": 0.5654457546349438,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0008373592774797482,
+      "loss": 0.8213,
+      "step": 8128
+    },
+    {
+      "epoch": 0.565515322272079,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0008371369488412233,
+      "loss": 0.8402,
+      "step": 8129
+    },
+    {
+      "epoch": 0.5655848899092142,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.000836914628472063,
+      "loss": 0.6205,
+      "step": 8130
+    },
+    {
+      "epoch": 0.5656544575463495,
+      "grad_norm": 1.5625,
+      "learning_rate": 0.0008366923163835556,
+      "loss": 0.9671,
+      "step": 8131
+    },
+    {
+      "epoch": 0.5657240251834846,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0008364700125869895,
+      "loss": 0.5901,
+      "step": 8132
+    },
+    {
+      "epoch": 0.5657935928206198,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.000836247717093652,
+      "loss": 0.9082,
+      "step": 8133
+    },
+    {
+      "epoch": 0.5658631604577551,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.0008360254299148298,
+      "loss": 0.98,
+      "step": 8134
+    },
+    {
+      "epoch": 0.5659327280948903,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0008358031510618099,
+      "loss": 0.8122,
+      "step": 8135
+    },
+    {
+      "epoch": 0.5660022957320254,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0008355808805458786,
+      "loss": 0.7831,
+      "step": 8136
+    },
+    {
+      "epoch": 0.5660718633691607,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0008353586183783212,
+      "loss": 0.8124,
+      "step": 8137
+    },
+    {
+      "epoch": 0.5661414310062959,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0008351363645704231,
+      "loss": 0.7187,
+      "step": 8138
+    },
+    {
+      "epoch": 0.566210998643431,
+      "grad_norm": 1.890625,
+      "learning_rate": 0.0008349141191334697,
+      "loss": 0.5892,
+      "step": 8139
+    },
+    {
+      "epoch": 0.5662805662805663,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0008346918820787455,
+      "loss": 0.7405,
+      "step": 8140
+    },
+    {
+      "epoch": 0.5663501339177015,
+      "grad_norm": 1.5546875,
+      "learning_rate": 0.0008344696534175337,
+      "loss": 0.7435,
+      "step": 8141
+    },
+    {
+      "epoch": 0.5664197015548367,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0008342474331611189,
+      "loss": 0.726,
+      "step": 8142
+    },
+    {
+      "epoch": 0.5664892691919718,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0008340252213207839,
+      "loss": 0.6987,
+      "step": 8143
+    },
+    {
+      "epoch": 0.5665588368291071,
+      "grad_norm": 0.734375,
+      "learning_rate": 0.000833803017907812,
+      "loss": 0.6203,
+      "step": 8144
+    },
+    {
+      "epoch": 0.5666284044662423,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0008335808229334846,
+      "loss": 0.7581,
+      "step": 8145
+    },
+    {
+      "epoch": 0.5666979721033775,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0008333586364090844,
+      "loss": 0.9403,
+      "step": 8146
+    },
+    {
+      "epoch": 0.5667675397405127,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0008331364583458929,
+      "loss": 0.7753,
+      "step": 8147
+    },
+    {
+      "epoch": 0.5668371073776479,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0008329142887551908,
+      "loss": 0.9515,
+      "step": 8148
+    },
+    {
+      "epoch": 0.5669066750147831,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0008326921276482588,
+      "loss": 1.0615,
+      "step": 8149
+    },
+    {
+      "epoch": 0.5669762426519184,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0008324699750363774,
+      "loss": 0.702,
+      "step": 8150
+    },
+    {
+      "epoch": 0.5670458102890535,
+      "grad_norm": 1.609375,
+      "learning_rate": 0.0008322478309308266,
+      "loss": 0.8821,
+      "step": 8151
+    },
+    {
+      "epoch": 0.5671153779261887,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0008320256953428849,
+      "loss": 0.7465,
+      "step": 8152
+    },
+    {
+      "epoch": 0.567184945563324,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0008318035682838319,
+      "loss": 0.9299,
+      "step": 8153
+    },
+    {
+      "epoch": 0.5672545132004592,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0008315814497649461,
+      "loss": 0.8122,
+      "step": 8154
+    },
+    {
+      "epoch": 0.5673240808375943,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0008313593397975052,
+      "loss": 0.9976,
+      "step": 8155
+    },
+    {
+      "epoch": 0.5673936484747295,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0008311372383927869,
+      "loss": 0.743,
+      "step": 8156
+    },
+    {
+      "epoch": 0.5674632161118648,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0008309151455620687,
+      "loss": 0.9827,
+      "step": 8157
+    },
+    {
+      "epoch": 0.567532783749,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0008306930613166272,
+      "loss": 0.9007,
+      "step": 8158
+    },
+    {
+      "epoch": 0.5676023513861351,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0008304709856677384,
+      "loss": 0.952,
+      "step": 8159
+    },
+    {
+      "epoch": 0.5676719190232704,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.0008302489186266788,
+      "loss": 0.575,
+      "step": 8160
+    },
+    {
+      "epoch": 0.5677414866604056,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0008300268602047235,
+      "loss": 0.7995,
+      "step": 8161
+    },
+    {
+      "epoch": 0.5678110542975408,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0008298048104131474,
+      "loss": 0.5987,
+      "step": 8162
+    },
+    {
+      "epoch": 0.567880621934676,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.0008295827692632249,
+      "loss": 0.6065,
+      "step": 8163
+    },
+    {
+      "epoch": 0.5679501895718112,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.0008293607367662306,
+      "loss": 0.7019,
+      "step": 8164
+    },
+    {
+      "epoch": 0.5680197572089464,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0008291387129334383,
+      "loss": 0.4507,
+      "step": 8165
+    },
+    {
+      "epoch": 0.5680893248460817,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0008289166977761205,
+      "loss": 0.8238,
+      "step": 8166
+    },
+    {
+      "epoch": 0.5681588924832168,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0008286946913055506,
+      "loss": 0.7219,
+      "step": 8167
+    },
+    {
+      "epoch": 0.568228460120352,
+      "grad_norm": 1.7265625,
+      "learning_rate": 0.0008284726935330011,
+      "loss": 0.7893,
+      "step": 8168
+    },
+    {
+      "epoch": 0.5682980277574872,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0008282507044697436,
+      "loss": 0.5606,
+      "step": 8169
+    },
+    {
+      "epoch": 0.5683675953946224,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0008280287241270492,
+      "loss": 0.9674,
+      "step": 8170
+    },
+    {
+      "epoch": 0.5684371630317576,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0008278067525161897,
+      "loss": 0.8512,
+      "step": 8171
+    },
+    {
+      "epoch": 0.5685067306688928,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0008275847896484356,
+      "loss": 0.8297,
+      "step": 8172
+    },
+    {
+      "epoch": 0.5685762983060281,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0008273628355350564,
+      "loss": 0.8624,
+      "step": 8173
+    },
+    {
+      "epoch": 0.5686458659431632,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0008271408901873225,
+      "loss": 0.7673,
+      "step": 8174
+    },
+    {
+      "epoch": 0.5687154335802984,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.000826918953616503,
+      "loss": 0.7336,
+      "step": 8175
+    },
+    {
+      "epoch": 0.5687850012174337,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0008266970258338668,
+      "loss": 0.6666,
+      "step": 8176
+    },
+    {
+      "epoch": 0.5688545688545689,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0008264751068506816,
+      "loss": 0.9051,
+      "step": 8177
+    },
+    {
+      "epoch": 0.568924136491704,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0008262531966782161,
+      "loss": 0.6647,
+      "step": 8178
+    },
+    {
+      "epoch": 0.5689937041288393,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0008260312953277378,
+      "loss": 0.7353,
+      "step": 8179
+    },
+    {
+      "epoch": 0.5690632717659745,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.000825809402810513,
+      "loss": 0.9339,
+      "step": 8180
+    },
+    {
+      "epoch": 0.5691328394031097,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0008255875191378089,
+      "loss": 0.7516,
+      "step": 8181
+    },
+    {
+      "epoch": 0.5692024070402448,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0008253656443208915,
+      "loss": 0.8792,
+      "step": 8182
+    },
+    {
+      "epoch": 0.5692719746773801,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.0008251437783710267,
+      "loss": 0.9131,
+      "step": 8183
+    },
+    {
+      "epoch": 0.5693415423145153,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.000824921921299479,
+      "loss": 0.8456,
+      "step": 8184
+    },
+    {
+      "epoch": 0.5694111099516505,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0008247000731175139,
+      "loss": 0.668,
+      "step": 8185
+    },
+    {
+      "epoch": 0.5694806775887857,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0008244782338363959,
+      "loss": 1.0285,
+      "step": 8186
+    },
+    {
+      "epoch": 0.5695502452259209,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0008242564034673879,
+      "loss": 0.876,
+      "step": 8187
+    },
+    {
+      "epoch": 0.5696198128630561,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.0008240345820217541,
+      "loss": 0.5813,
+      "step": 8188
+    },
+    {
+      "epoch": 0.5696893805001914,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0008238127695107574,
+      "loss": 0.7834,
+      "step": 8189
+    },
+    {
+      "epoch": 0.5697589481373265,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0008235909659456604,
+      "loss": 0.8144,
+      "step": 8190
+    },
+    {
+      "epoch": 0.5698285157744617,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0008233691713377245,
+      "loss": 0.6035,
+      "step": 8191
+    },
+    {
+      "epoch": 0.569898083411597,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0008231473856982121,
+      "loss": 1.0587,
+      "step": 8192
+    },
+    {
+      "epoch": 0.5699676510487321,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0008229256090383841,
+      "loss": 0.7681,
+      "step": 8193
+    },
+    {
+      "epoch": 0.5700372186858673,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.0008227038413695007,
+      "loss": 0.9053,
+      "step": 8194
+    },
+    {
+      "epoch": 0.5701067863230025,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0008224820827028231,
+      "loss": 0.9224,
+      "step": 8195
+    },
+    {
+      "epoch": 0.5701763539601378,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0008222603330496105,
+      "loss": 0.7537,
+      "step": 8196
+    },
+    {
+      "epoch": 0.5702459215972729,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0008220385924211224,
+      "loss": 0.6989,
+      "step": 8197
+    },
+    {
+      "epoch": 0.5703154892344081,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0008218168608286172,
+      "loss": 0.7965,
+      "step": 8198
+    },
+    {
+      "epoch": 0.5703850568715434,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.000821595138283354,
+      "loss": 0.7993,
+      "step": 8199
+    },
+    {
+      "epoch": 0.5704546245086786,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0008213734247965905,
+      "loss": 0.8572,
+      "step": 8200
+    },
+    {
+      "epoch": 0.5705241921458137,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0008211517203795837,
+      "loss": 0.7204,
+      "step": 8201
+    },
+    {
+      "epoch": 0.570593759782949,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.0008209300250435915,
+      "loss": 0.7224,
+      "step": 8202
+    },
+    {
+      "epoch": 0.5706633274200842,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00082070833879987,
+      "loss": 0.7471,
+      "step": 8203
+    },
+    {
+      "epoch": 0.5707328950572194,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0008204866616596754,
+      "loss": 0.9107,
+      "step": 8204
+    },
+    {
+      "epoch": 0.5708024626943546,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0008202649936342631,
+      "loss": 0.8132,
+      "step": 8205
+    },
+    {
+      "epoch": 0.5708720303314898,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0008200433347348886,
+      "loss": 0.8231,
+      "step": 8206
+    },
+    {
+      "epoch": 0.570941597968625,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0008198216849728068,
+      "loss": 0.7626,
+      "step": 8207
+    },
+    {
+      "epoch": 0.5710111656057602,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0008196000443592708,
+      "loss": 0.8453,
+      "step": 8208
+    },
+    {
+      "epoch": 0.5710807332428954,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0008193784129055362,
+      "loss": 0.9886,
+      "step": 8209
+    },
+    {
+      "epoch": 0.5711503008800306,
+      "grad_norm": 1.0,
+      "learning_rate": 0.000819156790622855,
+      "loss": 0.7544,
+      "step": 8210
+    },
+    {
+      "epoch": 0.5712198685171658,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0008189351775224807,
+      "loss": 1.0287,
+      "step": 8211
+    },
+    {
+      "epoch": 0.5712894361543011,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.000818713573615665,
+      "loss": 0.9659,
+      "step": 8212
+    },
+    {
+      "epoch": 0.5713590037914362,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0008184919789136606,
+      "loss": 0.7124,
+      "step": 8213
+    },
+    {
+      "epoch": 0.5714285714285714,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0008182703934277184,
+      "loss": 0.831,
+      "step": 8214
+    },
+    {
+      "epoch": 0.5714981390657067,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0008180488171690896,
+      "loss": 0.7193,
+      "step": 8215
+    },
+    {
+      "epoch": 0.5715677067028418,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0008178272501490252,
+      "loss": 0.838,
+      "step": 8216
+    },
+    {
+      "epoch": 0.571637274339977,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.0008176056923787747,
+      "loss": 0.7169,
+      "step": 8217
+    },
+    {
+      "epoch": 0.5717068419771123,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0008173841438695879,
+      "loss": 0.7327,
+      "step": 8218
+    },
+    {
+      "epoch": 0.5717764096142475,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0008171626046327134,
+      "loss": 0.7963,
+      "step": 8219
+    },
+    {
+      "epoch": 0.5718459772513826,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0008169410746794005,
+      "loss": 0.9955,
+      "step": 8220
+    },
+    {
+      "epoch": 0.5719155448885178,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.000816719554020897,
+      "loss": 0.8668,
+      "step": 8221
+    },
+    {
+      "epoch": 0.5719851125256531,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0008164980426684507,
+      "loss": 1.036,
+      "step": 8222
+    },
+    {
+      "epoch": 0.5720546801627883,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0008162765406333093,
+      "loss": 0.9154,
+      "step": 8223
+    },
+    {
+      "epoch": 0.5721242477999234,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0008160550479267188,
+      "loss": 0.6411,
+      "step": 8224
+    },
+    {
+      "epoch": 0.5721938154370587,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0008158335645599262,
+      "loss": 0.7755,
+      "step": 8225
+    },
+    {
+      "epoch": 0.5722633830741939,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0008156120905441762,
+      "loss": 0.941,
+      "step": 8226
+    },
+    {
+      "epoch": 0.5723329507113291,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0008153906258907155,
+      "loss": 0.8045,
+      "step": 8227
+    },
+    {
+      "epoch": 0.5724025183484643,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.000815169170610788,
+      "loss": 0.7237,
+      "step": 8228
+    },
+    {
+      "epoch": 0.5724720859855995,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0008149477247156387,
+      "loss": 0.8402,
+      "step": 8229
+    },
+    {
+      "epoch": 0.5725416536227347,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0008147262882165109,
+      "loss": 0.7556,
+      "step": 8230
+    },
+    {
+      "epoch": 0.57261122125987,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0008145048611246484,
+      "loss": 0.8533,
+      "step": 8231
+    },
+    {
+      "epoch": 0.5726807888970051,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0008142834434512943,
+      "loss": 0.7094,
+      "step": 8232
+    },
+    {
+      "epoch": 0.5727503565341403,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0008140620352076903,
+      "loss": 0.6124,
+      "step": 8233
+    },
+    {
+      "epoch": 0.5728199241712755,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0008138406364050796,
+      "loss": 0.805,
+      "step": 8234
+    },
+    {
+      "epoch": 0.5728894918084108,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0008136192470547027,
+      "loss": 0.6793,
+      "step": 8235
+    },
+    {
+      "epoch": 0.5729590594455459,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0008133978671678013,
+      "loss": 1.1407,
+      "step": 8236
+    },
+    {
+      "epoch": 0.5730286270826811,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0008131764967556154,
+      "loss": 0.6932,
+      "step": 8237
+    },
+    {
+      "epoch": 0.5730981947198164,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0008129551358293853,
+      "loss": 0.6433,
+      "step": 8238
+    },
+    {
+      "epoch": 0.5731677623569515,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0008127337844003509,
+      "loss": 0.9065,
+      "step": 8239
+    },
+    {
+      "epoch": 0.5732373299940867,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.0008125124424797506,
+      "loss": 0.8267,
+      "step": 8240
+    },
+    {
+      "epoch": 0.573306897631222,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0008122911100788238,
+      "loss": 0.9824,
+      "step": 8241
+    },
+    {
+      "epoch": 0.5733764652683572,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0008120697872088083,
+      "loss": 0.729,
+      "step": 8242
+    },
+    {
+      "epoch": 0.5734460329054923,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.000811848473880942,
+      "loss": 0.7309,
+      "step": 8243
+    },
+    {
+      "epoch": 0.5735156005426276,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0008116271701064612,
+      "loss": 0.8049,
+      "step": 8244
+    },
+    {
+      "epoch": 0.5735851681797628,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0008114058758966037,
+      "loss": 1.1414,
+      "step": 8245
+    },
+    {
+      "epoch": 0.573654735816898,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.000811184591262605,
+      "loss": 0.9099,
+      "step": 8246
+    },
+    {
+      "epoch": 0.5737243034540331,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.000810963316215701,
+      "loss": 0.9187,
+      "step": 8247
+    },
+    {
+      "epoch": 0.5737938710911684,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0008107420507671275,
+      "loss": 0.707,
+      "step": 8248
+    },
+    {
+      "epoch": 0.5738634387283036,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.0008105207949281184,
+      "loss": 0.9479,
+      "step": 8249
+    },
+    {
+      "epoch": 0.5739330063654388,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0008102995487099085,
+      "loss": 0.6448,
+      "step": 8250
+    },
+    {
+      "epoch": 0.574002574002574,
+      "grad_norm": 1.859375,
+      "learning_rate": 0.0008100783121237308,
+      "loss": 0.7715,
+      "step": 8251
+    },
+    {
+      "epoch": 0.5740721416397092,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.0008098570851808194,
+      "loss": 1.0675,
+      "step": 8252
+    },
+    {
+      "epoch": 0.5741417092768444,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.000809635867892407,
+      "loss": 0.5275,
+      "step": 8253
+    },
+    {
+      "epoch": 0.5742112769139797,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0008094146602697254,
+      "loss": 0.8431,
+      "step": 8254
+    },
+    {
+      "epoch": 0.5742808445511148,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0008091934623240071,
+      "loss": 0.5103,
+      "step": 8255
+    },
+    {
+      "epoch": 0.57435041218825,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.000808972274066483,
+      "loss": 0.8274,
+      "step": 8256
+    },
+    {
+      "epoch": 0.5744199798253853,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0008087510955083841,
+      "loss": 0.7134,
+      "step": 8257
+    },
+    {
+      "epoch": 0.5744895474625205,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00080852992666094,
+      "loss": 0.9269,
+      "step": 8258
+    },
+    {
+      "epoch": 0.5745591150996556,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0008083087675353816,
+      "loss": 0.7844,
+      "step": 8259
+    },
+    {
+      "epoch": 0.5746286827367908,
+      "grad_norm": 0.765625,
+      "learning_rate": 0.0008080876181429377,
+      "loss": 0.4759,
+      "step": 8260
+    },
+    {
+      "epoch": 0.5746982503739261,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.000807866478494837,
+      "loss": 0.9786,
+      "step": 8261
+    },
+    {
+      "epoch": 0.5747678180110612,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0008076453486023087,
+      "loss": 0.7311,
+      "step": 8262
+    },
+    {
+      "epoch": 0.5748373856481964,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0008074242284765796,
+      "loss": 0.6925,
+      "step": 8263
+    },
+    {
+      "epoch": 0.5749069532853317,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.0008072031181288779,
+      "loss": 0.7691,
+      "step": 8264
+    },
+    {
+      "epoch": 0.5749765209224669,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0008069820175704293,
+      "loss": 0.8411,
+      "step": 8265
+    },
+    {
+      "epoch": 0.575046088559602,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0008067609268124617,
+      "loss": 0.5944,
+      "step": 8266
+    },
+    {
+      "epoch": 0.5751156561967373,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0008065398458662001,
+      "loss": 1.019,
+      "step": 8267
+    },
+    {
+      "epoch": 0.5751852238338725,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0008063187747428698,
+      "loss": 0.8377,
+      "step": 8268
+    },
+    {
+      "epoch": 0.5752547914710077,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0008060977134536961,
+      "loss": 0.6803,
+      "step": 8269
+    },
+    {
+      "epoch": 0.575324359108143,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0008058766620099031,
+      "loss": 0.7064,
+      "step": 8270
+    },
+    {
+      "epoch": 0.5753939267452781,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.000805655620422715,
+      "loss": 0.9263,
+      "step": 8271
+    },
+    {
+      "epoch": 0.5754634943824133,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.0008054345887033542,
+      "loss": 0.8418,
+      "step": 8272
+    },
+    {
+      "epoch": 0.5755330620195485,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.000805213566863045,
+      "loss": 0.7247,
+      "step": 8273
+    },
+    {
+      "epoch": 0.5756026296566837,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0008049925549130089,
+      "loss": 0.8379,
+      "step": 8274
+    },
+    {
+      "epoch": 0.5756721972938189,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0008047715528644677,
+      "loss": 0.8124,
+      "step": 8275
+    },
+    {
+      "epoch": 0.5757417649309541,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0008045505607286434,
+      "loss": 0.7437,
+      "step": 8276
+    },
+    {
+      "epoch": 0.5758113325680894,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0008043295785167563,
+      "loss": 0.7772,
+      "step": 8277
+    },
+    {
+      "epoch": 0.5758809002052245,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.000804108606240027,
+      "loss": 0.5885,
+      "step": 8278
+    },
+    {
+      "epoch": 0.5759504678423597,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.000803887643909675,
+      "loss": 0.7219,
+      "step": 8279
+    },
+    {
+      "epoch": 0.576020035479495,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0008036666915369205,
+      "loss": 0.6723,
+      "step": 8280
+    },
+    {
+      "epoch": 0.5760896031166302,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0008034457491329816,
+      "loss": 0.5599,
+      "step": 8281
+    },
+    {
+      "epoch": 0.5761591707537653,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0008032248167090765,
+      "loss": 1.0868,
+      "step": 8282
+    },
+    {
+      "epoch": 0.5762287383909006,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0008030038942764239,
+      "loss": 0.9291,
+      "step": 8283
+    },
+    {
+      "epoch": 0.5762983060280358,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.0008027829818462405,
+      "loss": 0.8722,
+      "step": 8284
+    },
+    {
+      "epoch": 0.576367873665171,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0008025620794297431,
+      "loss": 0.8945,
+      "step": 8285
+    },
+    {
+      "epoch": 0.5764374413023061,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.000802341187038148,
+      "loss": 0.783,
+      "step": 8286
+    },
+    {
+      "epoch": 0.5765070089394414,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0008021203046826716,
+      "loss": 1.0315,
+      "step": 8287
+    },
+    {
+      "epoch": 0.5765765765765766,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0008018994323745284,
+      "loss": 0.8479,
+      "step": 8288
+    },
+    {
+      "epoch": 0.5766461442137117,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0008016785701249334,
+      "loss": 0.8675,
+      "step": 8289
+    },
+    {
+      "epoch": 0.576715711850847,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0008014577179451015,
+      "loss": 0.718,
+      "step": 8290
+    },
+    {
+      "epoch": 0.5767852794879822,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0008012368758462456,
+      "loss": 0.8693,
+      "step": 8291
+    },
+    {
+      "epoch": 0.5768548471251174,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0008010160438395794,
+      "loss": 0.7722,
+      "step": 8292
+    },
+    {
+      "epoch": 0.5769244147622526,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0008007952219363152,
+      "loss": 0.6732,
+      "step": 8293
+    },
+    {
+      "epoch": 0.5769939823993878,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0008005744101476661,
+      "loss": 0.6754,
+      "step": 8294
+    },
+    {
+      "epoch": 0.577063550036523,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0008003536084848431,
+      "loss": 0.8421,
+      "step": 8295
+    },
+    {
+      "epoch": 0.5771331176736583,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0008001328169590571,
+      "loss": 0.9898,
+      "step": 8296
+    },
+    {
+      "epoch": 0.5772026853107934,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0007999120355815197,
+      "loss": 0.9763,
+      "step": 8297
+    },
+    {
+      "epoch": 0.5772722529479286,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0007996912643634409,
+      "loss": 0.7011,
+      "step": 8298
+    },
+    {
+      "epoch": 0.5773418205850638,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0007994705033160296,
+      "loss": 0.9451,
+      "step": 8299
+    },
+    {
+      "epoch": 0.5774113882221991,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0007992497524504954,
+      "loss": 1.1057,
+      "step": 8300
+    },
+    {
+      "epoch": 0.5774809558593342,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0007990290117780472,
+      "loss": 0.9615,
+      "step": 8301
+    },
+    {
+      "epoch": 0.5775505234964694,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0007988082813098927,
+      "loss": 1.1864,
+      "step": 8302
+    },
+    {
+      "epoch": 0.5776200911336047,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.0007985875610572393,
+      "loss": 0.6633,
+      "step": 8303
+    },
+    {
+      "epoch": 0.5776896587707399,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0007983668510312947,
+      "loss": 0.543,
+      "step": 8304
+    },
+    {
+      "epoch": 0.577759226407875,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0007981461512432652,
+      "loss": 0.4702,
+      "step": 8305
+    },
+    {
+      "epoch": 0.5778287940450103,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0007979254617043565,
+      "loss": 1.0051,
+      "step": 8306
+    },
+    {
+      "epoch": 0.5778983616821455,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0007977047824257741,
+      "loss": 0.9822,
+      "step": 8307
+    },
+    {
+      "epoch": 0.5779679293192806,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0007974841134187236,
+      "loss": 0.8998,
+      "step": 8308
+    },
+    {
+      "epoch": 0.5780374969564159,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.000797263454694409,
+      "loss": 0.7201,
+      "step": 8309
+    },
+    {
+      "epoch": 0.5781070645935511,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0007970428062640345,
+      "loss": 0.6195,
+      "step": 8310
+    },
+    {
+      "epoch": 0.5781766322306863,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0007968221681388026,
+      "loss": 0.9949,
+      "step": 8311
+    },
+    {
+      "epoch": 0.5782461998678214,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0007966015403299175,
+      "loss": 0.9154,
+      "step": 8312
+    },
+    {
+      "epoch": 0.5783157675049567,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0007963809228485807,
+      "loss": 0.7892,
+      "step": 8313
+    },
+    {
+      "epoch": 0.5783853351420919,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0007961603157059943,
+      "loss": 0.6663,
+      "step": 8314
+    },
+    {
+      "epoch": 0.5784549027792271,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.00079593971891336,
+      "loss": 0.5892,
+      "step": 8315
+    },
+    {
+      "epoch": 0.5785244704163623,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0007957191324818781,
+      "loss": 0.8504,
+      "step": 8316
+    },
+    {
+      "epoch": 0.5785940380534975,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0007954985564227489,
+      "loss": 0.7952,
+      "step": 8317
+    },
+    {
+      "epoch": 0.5786636056906327,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.000795277990747172,
+      "loss": 0.8688,
+      "step": 8318
+    },
+    {
+      "epoch": 0.578733173327768,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.0007950574354663474,
+      "loss": 1.0818,
+      "step": 8319
+    },
+    {
+      "epoch": 0.5788027409649031,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0007948368905914729,
+      "loss": 0.8581,
+      "step": 8320
+    },
+    {
+      "epoch": 0.5788723086020383,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0007946163561337468,
+      "loss": 0.6678,
+      "step": 8321
+    },
+    {
+      "epoch": 0.5789418762391736,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0007943958321043674,
+      "loss": 0.8144,
+      "step": 8322
+    },
+    {
+      "epoch": 0.5790114438763088,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0007941753185145312,
+      "loss": 0.7094,
+      "step": 8323
+    },
+    {
+      "epoch": 0.5790810115134439,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0007939548153754347,
+      "loss": 0.7981,
+      "step": 8324
+    },
+    {
+      "epoch": 0.5791505791505791,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0007937343226982741,
+      "loss": 0.8511,
+      "step": 8325
+    },
+    {
+      "epoch": 0.5792201467877144,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0007935138404942452,
+      "loss": 0.8673,
+      "step": 8326
+    },
+    {
+      "epoch": 0.5792897144248496,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0007932933687745426,
+      "loss": 0.8772,
+      "step": 8327
+    },
+    {
+      "epoch": 0.5793592820619847,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0007930729075503606,
+      "loss": 0.6199,
+      "step": 8328
+    },
+    {
+      "epoch": 0.57942884969912,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0007928524568328936,
+      "loss": 0.8623,
+      "step": 8329
+    },
+    {
+      "epoch": 0.5794984173362552,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0007926320166333349,
+      "loss": 1.0028,
+      "step": 8330
+    },
+    {
+      "epoch": 0.5795679849733903,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0007924115869628771,
+      "loss": 0.7935,
+      "step": 8331
+    },
+    {
+      "epoch": 0.5796375526105256,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0007921911678327123,
+      "loss": 0.9247,
+      "step": 8332
+    },
+    {
+      "epoch": 0.5797071202476608,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0007919707592540329,
+      "loss": 0.8797,
+      "step": 8333
+    },
+    {
+      "epoch": 0.579776687884796,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0007917503612380298,
+      "loss": 0.8182,
+      "step": 8334
+    },
+    {
+      "epoch": 0.5798462555219313,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0007915299737958933,
+      "loss": 0.8666,
+      "step": 8335
+    },
+    {
+      "epoch": 0.5799158231590664,
+      "grad_norm": 1.6953125,
+      "learning_rate": 0.0007913095969388143,
+      "loss": 0.9228,
+      "step": 8336
+    },
+    {
+      "epoch": 0.5799853907962016,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.0007910892306779822,
+      "loss": 0.9564,
+      "step": 8337
+    },
+    {
+      "epoch": 0.5800549584333368,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0007908688750245858,
+      "loss": 0.8895,
+      "step": 8338
+    },
+    {
+      "epoch": 0.580124526070472,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.0007906485299898137,
+      "loss": 0.8285,
+      "step": 8339
+    },
+    {
+      "epoch": 0.5801940937076072,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0007904281955848543,
+      "loss": 0.8271,
+      "step": 8340
+    },
+    {
+      "epoch": 0.5802636613447424,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0007902078718208947,
+      "loss": 0.6401,
+      "step": 8341
+    },
+    {
+      "epoch": 0.5803332289818777,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0007899875587091216,
+      "loss": 0.8646,
+      "step": 8342
+    },
+    {
+      "epoch": 0.5804027966190128,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0007897672562607221,
+      "loss": 0.8125,
+      "step": 8343
+    },
+    {
+      "epoch": 0.580472364256148,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0007895469644868819,
+      "loss": 0.7378,
+      "step": 8344
+    },
+    {
+      "epoch": 0.5805419318932833,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0007893266833987857,
+      "loss": 1.0015,
+      "step": 8345
+    },
+    {
+      "epoch": 0.5806114995304185,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0007891064130076187,
+      "loss": 0.6859,
+      "step": 8346
+    },
+    {
+      "epoch": 0.5806810671675536,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.0007888861533245652,
+      "loss": 0.9137,
+      "step": 8347
+    },
+    {
+      "epoch": 0.5807506348046888,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0007886659043608086,
+      "loss": 0.912,
+      "step": 8348
+    },
+    {
+      "epoch": 0.5808202024418241,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0007884456661275321,
+      "loss": 0.8555,
+      "step": 8349
+    },
+    {
+      "epoch": 0.5808897700789593,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0007882254386359184,
+      "loss": 1.044,
+      "step": 8350
+    },
+    {
+      "epoch": 0.5809593377160944,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0007880052218971499,
+      "loss": 0.5766,
+      "step": 8351
+    },
+    {
+      "epoch": 0.5810289053532297,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0007877850159224073,
+      "loss": 0.6393,
+      "step": 8352
+    },
+    {
+      "epoch": 0.5810984729903649,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0007875648207228719,
+      "loss": 0.8741,
+      "step": 8353
+    },
+    {
+      "epoch": 0.5811680406275,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.0007873446363097246,
+      "loss": 0.7753,
+      "step": 8354
+    },
+    {
+      "epoch": 0.5812376082646353,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0007871244626941444,
+      "loss": 0.9385,
+      "step": 8355
+    },
+    {
+      "epoch": 0.5813071759017705,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0007869042998873108,
+      "loss": 0.9722,
+      "step": 8356
+    },
+    {
+      "epoch": 0.5813767435389057,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0007866841479004032,
+      "loss": 1.0156,
+      "step": 8357
+    },
+    {
+      "epoch": 0.581446311176041,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0007864640067445994,
+      "loss": 1.0329,
+      "step": 8358
+    },
+    {
+      "epoch": 0.5815158788131761,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.0007862438764310769,
+      "loss": 0.8019,
+      "step": 8359
+    },
+    {
+      "epoch": 0.5815854464503113,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0007860237569710127,
+      "loss": 0.8743,
+      "step": 8360
+    },
+    {
+      "epoch": 0.5816550140874465,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0007858036483755842,
+      "loss": 0.5349,
+      "step": 8361
+    },
+    {
+      "epoch": 0.5817245817245817,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0007855835506559663,
+      "loss": 0.9931,
+      "step": 8362
+    },
+    {
+      "epoch": 0.5817941493617169,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0007853634638233349,
+      "loss": 0.8545,
+      "step": 8363
+    },
+    {
+      "epoch": 0.5818637169988521,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0007851433878888652,
+      "loss": 0.899,
+      "step": 8364
+    },
+    {
+      "epoch": 0.5819332846359874,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0007849233228637315,
+      "loss": 0.7024,
+      "step": 8365
+    },
+    {
+      "epoch": 0.5820028522731225,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0007847032687591072,
+      "loss": 0.8722,
+      "step": 8366
+    },
+    {
+      "epoch": 0.5820724199102577,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0007844832255861654,
+      "loss": 1.013,
+      "step": 8367
+    },
+    {
+      "epoch": 0.582141987547393,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.0007842631933560794,
+      "loss": 0.7286,
+      "step": 8368
+    },
+    {
+      "epoch": 0.5822115551845282,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0007840431720800212,
+      "loss": 0.8038,
+      "step": 8369
+    },
+    {
+      "epoch": 0.5822811228216633,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.000783823161769162,
+      "loss": 0.8539,
+      "step": 8370
+    },
+    {
+      "epoch": 0.5823506904587986,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.0007836031624346731,
+      "loss": 0.7812,
+      "step": 8371
+    },
+    {
+      "epoch": 0.5824202580959338,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.000783383174087725,
+      "loss": 0.901,
+      "step": 8372
+    },
+    {
+      "epoch": 0.582489825733069,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0007831631967394876,
+      "loss": 0.7741,
+      "step": 8373
+    },
+    {
+      "epoch": 0.5825593933702041,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0007829432304011297,
+      "loss": 0.886,
+      "step": 8374
+    },
+    {
+      "epoch": 0.5826289610073394,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0007827232750838207,
+      "loss": 0.8501,
+      "step": 8375
+    },
+    {
+      "epoch": 0.5826985286444746,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0007825033307987289,
+      "loss": 0.9736,
+      "step": 8376
+    },
+    {
+      "epoch": 0.5827680962816097,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0007822833975570213,
+      "loss": 0.9093,
+      "step": 8377
+    },
+    {
+      "epoch": 0.582837663918745,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0007820634753698656,
+      "loss": 1.0385,
+      "step": 8378
+    },
+    {
+      "epoch": 0.5829072315558802,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0007818435642484283,
+      "loss": 0.8872,
+      "step": 8379
+    },
+    {
+      "epoch": 0.5829767991930154,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.000781623664203875,
+      "loss": 0.7607,
+      "step": 8380
+    },
+    {
+      "epoch": 0.5830463668301507,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0007814037752473711,
+      "loss": 0.6788,
+      "step": 8381
+    },
+    {
+      "epoch": 0.5831159344672858,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.000781183897390082,
+      "loss": 0.7802,
+      "step": 8382
+    },
+    {
+      "epoch": 0.583185502104421,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0007809640306431718,
+      "loss": 0.699,
+      "step": 8383
+    },
+    {
+      "epoch": 0.5832550697415563,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.000780744175017804,
+      "loss": 0.688,
+      "step": 8384
+    },
+    {
+      "epoch": 0.5833246373786914,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0007805243305251415,
+      "loss": 0.8424,
+      "step": 8385
+    },
+    {
+      "epoch": 0.5833942050158266,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.0007803044971763477,
+      "loss": 0.9309,
+      "step": 8386
+    },
+    {
+      "epoch": 0.5834637726529618,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0007800846749825842,
+      "loss": 0.7958,
+      "step": 8387
+    },
+    {
+      "epoch": 0.5835333402900971,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.000779864863955012,
+      "loss": 0.7098,
+      "step": 8388
+    },
+    {
+      "epoch": 0.5836029079272322,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0007796450641047928,
+      "loss": 0.9335,
+      "step": 8389
+    },
+    {
+      "epoch": 0.5836724755643674,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0007794252754430866,
+      "loss": 1.0127,
+      "step": 8390
+    },
+    {
+      "epoch": 0.5837420432015027,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0007792054979810531,
+      "loss": 0.8231,
+      "step": 8391
+    },
+    {
+      "epoch": 0.5838116108386379,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0007789857317298512,
+      "loss": 0.829,
+      "step": 8392
+    },
+    {
+      "epoch": 0.583881178475773,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0007787659767006403,
+      "loss": 0.9883,
+      "step": 8393
+    },
+    {
+      "epoch": 0.5839507461129083,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0007785462329045779,
+      "loss": 0.8823,
+      "step": 8394
+    },
+    {
+      "epoch": 0.5840203137500435,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0007783265003528212,
+      "loss": 0.6567,
+      "step": 8395
+    },
+    {
+      "epoch": 0.5840898813871787,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0007781067790565278,
+      "loss": 0.6483,
+      "step": 8396
+    },
+    {
+      "epoch": 0.5841594490243139,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.000777887069026854,
+      "loss": 0.8813,
+      "step": 8397
+    },
+    {
+      "epoch": 0.5842290166614491,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.000777667370274955,
+      "loss": 0.9875,
+      "step": 8398
+    },
+    {
+      "epoch": 0.5842985842985843,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0007774476828119861,
+      "loss": 0.554,
+      "step": 8399
+    },
+    {
+      "epoch": 0.5843681519357194,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0007772280066491024,
+      "loss": 1.1132,
+      "step": 8400
+    },
+    {
+      "epoch": 0.5844377195728547,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0007770083417974578,
+      "loss": 0.9085,
+      "step": 8401
+    },
+    {
+      "epoch": 0.5845072872099899,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0007767886882682053,
+      "loss": 0.6959,
+      "step": 8402
+    },
+    {
+      "epoch": 0.5845768548471251,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0007765690460724982,
+      "loss": 0.5727,
+      "step": 8403
+    },
+    {
+      "epoch": 0.5846464224842604,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0007763494152214892,
+      "loss": 0.8546,
+      "step": 8404
+    },
+    {
+      "epoch": 0.5847159901213955,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0007761297957263291,
+      "loss": 0.8025,
+      "step": 8405
+    },
+    {
+      "epoch": 0.5847855577585307,
+      "grad_norm": 1.640625,
+      "learning_rate": 0.0007759101875981695,
+      "loss": 1.1985,
+      "step": 8406
+    },
+    {
+      "epoch": 0.584855125395666,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.0007756905908481615,
+      "loss": 0.8312,
+      "step": 8407
+    },
+    {
+      "epoch": 0.5849246930328011,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0007754710054874548,
+      "loss": 0.8647,
+      "step": 8408
+    },
+    {
+      "epoch": 0.5849942606699363,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0007752514315271981,
+      "loss": 0.8611,
+      "step": 8409
+    },
+    {
+      "epoch": 0.5850638283070716,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0007750318689785413,
+      "loss": 0.7379,
+      "step": 8410
+    },
+    {
+      "epoch": 0.5851333959442068,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0007748123178526324,
+      "loss": 0.6662,
+      "step": 8411
+    },
+    {
+      "epoch": 0.5852029635813419,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.0007745927781606188,
+      "loss": 1.269,
+      "step": 8412
+    },
+    {
+      "epoch": 0.5852725312184771,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0007743732499136476,
+      "loss": 0.7875,
+      "step": 8413
+    },
+    {
+      "epoch": 0.5853420988556124,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0007741537331228657,
+      "loss": 0.6835,
+      "step": 8414
+    },
+    {
+      "epoch": 0.5854116664927476,
+      "grad_norm": 1.25,
+      "learning_rate": 0.000773934227799419,
+      "loss": 0.877,
+      "step": 8415
+    },
+    {
+      "epoch": 0.5854812341298827,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0007737147339544526,
+      "loss": 0.9002,
+      "step": 8416
+    },
+    {
+      "epoch": 0.585550801767018,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0007734952515991114,
+      "loss": 0.918,
+      "step": 8417
+    },
+    {
+      "epoch": 0.5856203694041532,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00077327578074454,
+      "loss": 0.9253,
+      "step": 8418
+    },
+    {
+      "epoch": 0.5856899370412884,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0007730563214018814,
+      "loss": 1.0449,
+      "step": 8419
+    },
+    {
+      "epoch": 0.5857595046784236,
+      "grad_norm": 1.5,
+      "learning_rate": 0.0007728368735822787,
+      "loss": 0.6915,
+      "step": 8420
+    },
+    {
+      "epoch": 0.5858290723155588,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0007726174372968748,
+      "loss": 1.0598,
+      "step": 8421
+    },
+    {
+      "epoch": 0.585898639952694,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.0007723980125568116,
+      "loss": 0.7353,
+      "step": 8422
+    },
+    {
+      "epoch": 0.5859682075898293,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0007721785993732296,
+      "loss": 0.8202,
+      "step": 8423
+    },
+    {
+      "epoch": 0.5860377752269644,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0007719591977572704,
+      "loss": 0.7018,
+      "step": 8424
+    },
+    {
+      "epoch": 0.5861073428640996,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0007717398077200738,
+      "loss": 0.6593,
+      "step": 8425
+    },
+    {
+      "epoch": 0.5861769105012348,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0007715204292727791,
+      "loss": 0.962,
+      "step": 8426
+    },
+    {
+      "epoch": 0.58624647813837,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0007713010624265251,
+      "loss": 0.7747,
+      "step": 8427
+    },
+    {
+      "epoch": 0.5863160457755052,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0007710817071924507,
+      "loss": 0.7734,
+      "step": 8428
+    },
+    {
+      "epoch": 0.5863856134126404,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0007708623635816936,
+      "loss": 0.7227,
+      "step": 8429
+    },
+    {
+      "epoch": 0.5864551810497757,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0007706430316053903,
+      "loss": 0.841,
+      "step": 8430
+    },
+    {
+      "epoch": 0.5865247486869108,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0007704237112746779,
+      "loss": 0.5351,
+      "step": 8431
+    },
+    {
+      "epoch": 0.586594316324046,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0007702044026006927,
+      "loss": 0.6932,
+      "step": 8432
+    },
+    {
+      "epoch": 0.5866638839611813,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0007699851055945693,
+      "loss": 0.8046,
+      "step": 8433
+    },
+    {
+      "epoch": 0.5867334515983165,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0007697658202674427,
+      "loss": 0.8083,
+      "step": 8434
+    },
+    {
+      "epoch": 0.5868030192354516,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0007695465466304476,
+      "loss": 0.7808,
+      "step": 8435
+    },
+    {
+      "epoch": 0.5868725868725869,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0007693272846947173,
+      "loss": 1.0373,
+      "step": 8436
+    },
+    {
+      "epoch": 0.5869421545097221,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0007691080344713845,
+      "loss": 0.923,
+      "step": 8437
+    },
+    {
+      "epoch": 0.5870117221468573,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0007688887959715823,
+      "loss": 0.8264,
+      "step": 8438
+    },
+    {
+      "epoch": 0.5870812897839924,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0007686695692064419,
+      "loss": 0.9924,
+      "step": 8439
+    },
+    {
+      "epoch": 0.5871508574211277,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0007684503541870952,
+      "loss": 0.7213,
+      "step": 8440
+    },
+    {
+      "epoch": 0.5872204250582629,
+      "grad_norm": 1.625,
+      "learning_rate": 0.0007682311509246719,
+      "loss": 0.9749,
+      "step": 8441
+    },
+    {
+      "epoch": 0.587289992695398,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.0007680119594303028,
+      "loss": 0.9688,
+      "step": 8442
+    },
+    {
+      "epoch": 0.5873595603325333,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0007677927797151172,
+      "loss": 0.7979,
+      "step": 8443
+    },
+    {
+      "epoch": 0.5874291279696685,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0007675736117902435,
+      "loss": 0.8144,
+      "step": 8444
+    },
+    {
+      "epoch": 0.5874986956068037,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0007673544556668104,
+      "loss": 0.937,
+      "step": 8445
+    },
+    {
+      "epoch": 0.587568263243939,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0007671353113559455,
+      "loss": 0.8635,
+      "step": 8446
+    },
+    {
+      "epoch": 0.5876378308810741,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.000766916178868776,
+      "loss": 0.7995,
+      "step": 8447
+    },
+    {
+      "epoch": 0.5877073985182093,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0007666970582164277,
+      "loss": 1.0202,
+      "step": 8448
+    },
+    {
+      "epoch": 0.5877769661553446,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0007664779494100269,
+      "loss": 0.8607,
+      "step": 8449
+    },
+    {
+      "epoch": 0.5878465337924798,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0007662588524606992,
+      "loss": 0.8812,
+      "step": 8450
+    },
+    {
+      "epoch": 0.5879161014296149,
+      "grad_norm": 1.5390625,
+      "learning_rate": 0.000766039767379568,
+      "loss": 0.9578,
+      "step": 8451
+    },
+    {
+      "epoch": 0.5879856690667501,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0007658206941777591,
+      "loss": 0.663,
+      "step": 8452
+    },
+    {
+      "epoch": 0.5880552367038854,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0007656016328663944,
+      "loss": 0.7829,
+      "step": 8453
+    },
+    {
+      "epoch": 0.5881248043410205,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0007653825834565977,
+      "loss": 0.6719,
+      "step": 8454
+    },
+    {
+      "epoch": 0.5881943719781557,
+      "grad_norm": 0.7734375,
+      "learning_rate": 0.0007651635459594905,
+      "loss": 0.6089,
+      "step": 8455
+    },
+    {
+      "epoch": 0.588263939615291,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.000764944520386195,
+      "loss": 0.8261,
+      "step": 8456
+    },
+    {
+      "epoch": 0.5883335072524262,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0007647255067478321,
+      "loss": 0.7335,
+      "step": 8457
+    },
+    {
+      "epoch": 0.5884030748895613,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0007645065050555216,
+      "loss": 0.6148,
+      "step": 8458
+    },
+    {
+      "epoch": 0.5884726425266966,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0007642875153203843,
+      "loss": 1.1661,
+      "step": 8459
+    },
+    {
+      "epoch": 0.5885422101638318,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0007640685375535388,
+      "loss": 0.7649,
+      "step": 8460
+    },
+    {
+      "epoch": 0.588611777800967,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0007638495717661038,
+      "loss": 0.7276,
+      "step": 8461
+    },
+    {
+      "epoch": 0.5886813454381022,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0007636306179691969,
+      "loss": 0.8225,
+      "step": 8462
+    },
+    {
+      "epoch": 0.5887509130752374,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0007634116761739362,
+      "loss": 0.6969,
+      "step": 8463
+    },
+    {
+      "epoch": 0.5888204807123726,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0007631927463914382,
+      "loss": 0.7609,
+      "step": 8464
+    },
+    {
+      "epoch": 0.5888900483495078,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0007629738286328187,
+      "loss": 0.6378,
+      "step": 8465
+    },
+    {
+      "epoch": 0.588959615986643,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0007627549229091932,
+      "loss": 0.9368,
+      "step": 8466
+    },
+    {
+      "epoch": 0.5890291836237782,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0007625360292316773,
+      "loss": 0.7146,
+      "step": 8467
+    },
+    {
+      "epoch": 0.5890987512609134,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.000762317147611385,
+      "loss": 0.8827,
+      "step": 8468
+    },
+    {
+      "epoch": 0.5891683188980487,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0007620982780594297,
+      "loss": 0.7071,
+      "step": 8469
+    },
+    {
+      "epoch": 0.5892378865351838,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0007618794205869247,
+      "loss": 0.8249,
+      "step": 8470
+    },
+    {
+      "epoch": 0.589307454172319,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0007616605752049827,
+      "loss": 0.5886,
+      "step": 8471
+    },
+    {
+      "epoch": 0.5893770218094543,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0007614417419247155,
+      "loss": 0.6516,
+      "step": 8472
+    },
+    {
+      "epoch": 0.5894465894465895,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0007612229207572337,
+      "loss": 0.7519,
+      "step": 8473
+    },
+    {
+      "epoch": 0.5895161570837246,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0007610041117136488,
+      "loss": 0.6568,
+      "step": 8474
+    },
+    {
+      "epoch": 0.5895857247208599,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0007607853148050706,
+      "loss": 0.7215,
+      "step": 8475
+    },
+    {
+      "epoch": 0.5896552923579951,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.000760566530042608,
+      "loss": 0.7886,
+      "step": 8476
+    },
+    {
+      "epoch": 0.5897248599951302,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0007603477574373705,
+      "loss": 0.7763,
+      "step": 8477
+    },
+    {
+      "epoch": 0.5897944276322654,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0007601289970004658,
+      "loss": 0.641,
+      "step": 8478
+    },
+    {
+      "epoch": 0.5898639952694007,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.0007599102487430018,
+      "loss": 0.6329,
+      "step": 8479
+    },
+    {
+      "epoch": 0.5899335629065359,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0007596915126760848,
+      "loss": 0.9425,
+      "step": 8480
+    },
+    {
+      "epoch": 0.590003130543671,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0007594727888108219,
+      "loss": 1.0997,
+      "step": 8481
+    },
+    {
+      "epoch": 0.5900726981808063,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0007592540771583185,
+      "loss": 0.8175,
+      "step": 8482
+    },
+    {
+      "epoch": 0.5901422658179415,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0007590353777296793,
+      "loss": 0.8869,
+      "step": 8483
+    },
+    {
+      "epoch": 0.5902118334550767,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0007588166905360091,
+      "loss": 0.7391,
+      "step": 8484
+    },
+    {
+      "epoch": 0.5902814010922119,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0007585980155884118,
+      "loss": 0.9174,
+      "step": 8485
+    },
+    {
+      "epoch": 0.5903509687293471,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0007583793528979908,
+      "loss": 0.8217,
+      "step": 8486
+    },
+    {
+      "epoch": 0.5904205363664823,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0007581607024758479,
+      "loss": 0.8131,
+      "step": 8487
+    },
+    {
+      "epoch": 0.5904901040036176,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0007579420643330858,
+      "loss": 0.6882,
+      "step": 8488
+    },
+    {
+      "epoch": 0.5905596716407527,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0007577234384808058,
+      "loss": 0.7639,
+      "step": 8489
+    },
+    {
+      "epoch": 0.5906292392778879,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0007575048249301078,
+      "loss": 0.5981,
+      "step": 8490
+    },
+    {
+      "epoch": 0.5906988069150231,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0007572862236920932,
+      "loss": 0.677,
+      "step": 8491
+    },
+    {
+      "epoch": 0.5907683745521584,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0007570676347778605,
+      "loss": 0.6852,
+      "step": 8492
+    },
+    {
+      "epoch": 0.5908379421892935,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0007568490581985091,
+      "loss": 0.8176,
+      "step": 8493
+    },
+    {
+      "epoch": 0.5909075098264287,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0007566304939651366,
+      "loss": 1.033,
+      "step": 8494
+    },
+    {
+      "epoch": 0.590977077463564,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0007564119420888411,
+      "loss": 0.9066,
+      "step": 8495
+    },
+    {
+      "epoch": 0.5910466451006992,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0007561934025807196,
+      "loss": 0.5497,
+      "step": 8496
+    },
+    {
+      "epoch": 0.5911162127378343,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.0007559748754518677,
+      "loss": 0.6161,
+      "step": 8497
+    },
+    {
+      "epoch": 0.5911857803749696,
+      "grad_norm": 1.125,
+      "learning_rate": 0.000755756360713382,
+      "loss": 0.746,
+      "step": 8498
+    },
+    {
+      "epoch": 0.5912553480121048,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0007555378583763572,
+      "loss": 1.0243,
+      "step": 8499
+    },
+    {
+      "epoch": 0.59132491564924,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0007553193684518881,
+      "loss": 0.9379,
+      "step": 8500
+    },
+    {
+      "epoch": 0.5913944832863752,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0007551008909510676,
+      "loss": 0.7344,
+      "step": 8501
+    },
+    {
+      "epoch": 0.5914640509235104,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0007548824258849898,
+      "loss": 0.6471,
+      "step": 8502
+    },
+    {
+      "epoch": 0.5915336185606456,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0007546639732647468,
+      "loss": 1.029,
+      "step": 8503
+    },
+    {
+      "epoch": 0.5916031861977807,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0007544455331014305,
+      "loss": 0.6613,
+      "step": 8504
+    },
+    {
+      "epoch": 0.591672753834916,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0007542271054061328,
+      "loss": 0.9418,
+      "step": 8505
+    },
+    {
+      "epoch": 0.5917423214720512,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0007540086901899436,
+      "loss": 0.8629,
+      "step": 8506
+    },
+    {
+      "epoch": 0.5918118891091864,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0007537902874639535,
+      "loss": 0.8829,
+      "step": 8507
+    },
+    {
+      "epoch": 0.5918814567463216,
+      "grad_norm": 1.8515625,
+      "learning_rate": 0.0007535718972392512,
+      "loss": 0.9109,
+      "step": 8508
+    },
+    {
+      "epoch": 0.5919510243834568,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.0007533535195269262,
+      "loss": 0.847,
+      "step": 8509
+    },
+    {
+      "epoch": 0.592020592020592,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.000753135154338066,
+      "loss": 0.7576,
+      "step": 8510
+    },
+    {
+      "epoch": 0.5920901596577273,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0007529168016837584,
+      "loss": 0.5772,
+      "step": 8511
+    },
+    {
+      "epoch": 0.5921597272948624,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0007526984615750904,
+      "loss": 1.1715,
+      "step": 8512
+    },
+    {
+      "epoch": 0.5922292949319976,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0007524801340231481,
+      "loss": 1.0781,
+      "step": 8513
+    },
+    {
+      "epoch": 0.5922988625691329,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0007522618190390171,
+      "loss": 0.8278,
+      "step": 8514
+    },
+    {
+      "epoch": 0.5923684302062681,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0007520435166337817,
+      "loss": 0.6436,
+      "step": 8515
+    },
+    {
+      "epoch": 0.5924379978434032,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0007518252268185272,
+      "loss": 0.7351,
+      "step": 8516
+    },
+    {
+      "epoch": 0.5925075654805384,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0007516069496043365,
+      "loss": 0.913,
+      "step": 8517
+    },
+    {
+      "epoch": 0.5925771331176737,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0007513886850022928,
+      "loss": 1.0798,
+      "step": 8518
+    },
+    {
+      "epoch": 0.5926467007548089,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.0007511704330234791,
+      "loss": 0.6147,
+      "step": 8519
+    },
+    {
+      "epoch": 0.592716268391944,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0007509521936789763,
+      "loss": 0.6515,
+      "step": 8520
+    },
+    {
+      "epoch": 0.5927858360290793,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.000750733966979866,
+      "loss": 0.8566,
+      "step": 8521
+    },
+    {
+      "epoch": 0.5928554036662145,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.000750515752937228,
+      "loss": 1.1151,
+      "step": 8522
+    },
+    {
+      "epoch": 0.5929249713033496,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0007502975515621431,
+      "loss": 0.803,
+      "step": 8523
+    },
+    {
+      "epoch": 0.5929945389404849,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0007500793628656897,
+      "loss": 0.9002,
+      "step": 8524
+    },
+    {
+      "epoch": 0.5930641065776201,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0007498611868589464,
+      "loss": 0.8477,
+      "step": 8525
+    },
+    {
+      "epoch": 0.5931336742147553,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0007496430235529916,
+      "loss": 0.8164,
+      "step": 8526
+    },
+    {
+      "epoch": 0.5932032418518906,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.000749424872958902,
+      "loss": 0.6571,
+      "step": 8527
+    },
+    {
+      "epoch": 0.5932728094890257,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0007492067350877546,
+      "loss": 0.7534,
+      "step": 8528
+    },
+    {
+      "epoch": 0.5933423771261609,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0007489886099506244,
+      "loss": 0.8796,
+      "step": 8529
+    },
+    {
+      "epoch": 0.5934119447632961,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.000748770497558588,
+      "loss": 0.8716,
+      "step": 8530
+    },
+    {
+      "epoch": 0.5934815124004313,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0007485523979227194,
+      "loss": 0.6666,
+      "step": 8531
+    },
+    {
+      "epoch": 0.5935510800375665,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0007483343110540923,
+      "loss": 0.6986,
+      "step": 8532
+    },
+    {
+      "epoch": 0.5936206476747017,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0007481162369637808,
+      "loss": 0.6915,
+      "step": 8533
+    },
+    {
+      "epoch": 0.593690215311837,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0007478981756628571,
+      "loss": 1.1911,
+      "step": 8534
+    },
+    {
+      "epoch": 0.5937597829489721,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0007476801271623934,
+      "loss": 0.7592,
+      "step": 8535
+    },
+    {
+      "epoch": 0.5938293505861073,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0007474620914734606,
+      "loss": 0.7574,
+      "step": 8536
+    },
+    {
+      "epoch": 0.5938989182232426,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0007472440686071305,
+      "loss": 0.6431,
+      "step": 8537
+    },
+    {
+      "epoch": 0.5939684858603778,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0007470260585744722,
+      "loss": 0.8533,
+      "step": 8538
+    },
+    {
+      "epoch": 0.5940380534975129,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.000746808061386556,
+      "loss": 0.9223,
+      "step": 8539
+    },
+    {
+      "epoch": 0.5941076211346482,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0007465900770544498,
+      "loss": 0.7645,
+      "step": 8540
+    },
+    {
+      "epoch": 0.5941771887717834,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0007463721055892223,
+      "loss": 0.7114,
+      "step": 8541
+    },
+    {
+      "epoch": 0.5942467564089186,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0007461541470019411,
+      "loss": 0.9832,
+      "step": 8542
+    },
+    {
+      "epoch": 0.5943163240460537,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0007459362013036725,
+      "loss": 0.7882,
+      "step": 8543
+    },
+    {
+      "epoch": 0.594385891683189,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0007457182685054834,
+      "loss": 0.7626,
+      "step": 8544
+    },
+    {
+      "epoch": 0.5944554593203242,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0007455003486184389,
+      "loss": 0.5522,
+      "step": 8545
+    },
+    {
+      "epoch": 0.5945250269574593,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.0007452824416536039,
+      "loss": 1.0521,
+      "step": 8546
+    },
+    {
+      "epoch": 0.5945945945945946,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0007450645476220424,
+      "loss": 0.904,
+      "step": 8547
+    },
+    {
+      "epoch": 0.5946641622317298,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0007448466665348184,
+      "loss": 0.9891,
+      "step": 8548
+    },
+    {
+      "epoch": 0.594733729868865,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0007446287984029944,
+      "loss": 0.6378,
+      "step": 8549
+    },
+    {
+      "epoch": 0.5948032975060003,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0007444109432376329,
+      "loss": 1.0443,
+      "step": 8550
+    },
+    {
+      "epoch": 0.5948728651431354,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0007441931010497958,
+      "loss": 0.768,
+      "step": 8551
+    },
+    {
+      "epoch": 0.5949424327802706,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0007439752718505435,
+      "loss": 0.7619,
+      "step": 8552
+    },
+    {
+      "epoch": 0.5950120004174059,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0007437574556509365,
+      "loss": 0.9104,
+      "step": 8553
+    },
+    {
+      "epoch": 0.595081568054541,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0007435396524620338,
+      "loss": 0.7708,
+      "step": 8554
+    },
+    {
+      "epoch": 0.5951511356916762,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0007433218622948956,
+      "loss": 0.7666,
+      "step": 8555
+    },
+    {
+      "epoch": 0.5952207033288114,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0007431040851605791,
+      "loss": 0.8577,
+      "step": 8556
+    },
+    {
+      "epoch": 0.5952902709659467,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0007428863210701422,
+      "loss": 0.7772,
+      "step": 8557
+    },
+    {
+      "epoch": 0.5953598386030818,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0007426685700346422,
+      "loss": 0.6987,
+      "step": 8558
+    },
+    {
+      "epoch": 0.595429406240217,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0007424508320651352,
+      "loss": 0.7198,
+      "step": 8559
+    },
+    {
+      "epoch": 0.5954989738773523,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0007422331071726769,
+      "loss": 0.5827,
+      "step": 8560
+    },
+    {
+      "epoch": 0.5955685415144875,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0007420153953683215,
+      "loss": 0.7368,
+      "step": 8561
+    },
+    {
+      "epoch": 0.5956381091516226,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0007417976966631249,
+      "loss": 0.7991,
+      "step": 8562
+    },
+    {
+      "epoch": 0.5957076767887579,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0007415800110681392,
+      "loss": 0.87,
+      "step": 8563
+    },
+    {
+      "epoch": 0.5957772444258931,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0007413623385944182,
+      "loss": 0.7575,
+      "step": 8564
+    },
+    {
+      "epoch": 0.5958468120630283,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0007411446792530141,
+      "loss": 0.8132,
+      "step": 8565
+    },
+    {
+      "epoch": 0.5959163797001635,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.0007409270330549784,
+      "loss": 0.8191,
+      "step": 8566
+    },
+    {
+      "epoch": 0.5959859473372987,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0007407094000113623,
+      "loss": 0.7915,
+      "step": 8567
+    },
+    {
+      "epoch": 0.5960555149744339,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0007404917801332154,
+      "loss": 0.8721,
+      "step": 8568
+    },
+    {
+      "epoch": 0.596125082611569,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0007402741734315885,
+      "loss": 0.7433,
+      "step": 8569
+    },
+    {
+      "epoch": 0.5961946502487043,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0007400565799175296,
+      "loss": 0.517,
+      "step": 8570
+    },
+    {
+      "epoch": 0.5962642178858395,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0007398389996020873,
+      "loss": 0.7589,
+      "step": 8571
+    },
+    {
+      "epoch": 0.5963337855229747,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0007396214324963098,
+      "loss": 0.8574,
+      "step": 8572
+    },
+    {
+      "epoch": 0.59640335316011,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0007394038786112431,
+      "loss": 0.6718,
+      "step": 8573
+    },
+    {
+      "epoch": 0.5964729207972451,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.000739186337957934,
+      "loss": 0.796,
+      "step": 8574
+    },
+    {
+      "epoch": 0.5965424884343803,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0007389688105474279,
+      "loss": 0.9065,
+      "step": 8575
+    },
+    {
+      "epoch": 0.5966120560715156,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0007387512963907704,
+      "loss": 0.7815,
+      "step": 8576
+    },
+    {
+      "epoch": 0.5966816237086507,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.000738533795499005,
+      "loss": 0.7542,
+      "step": 8577
+    },
+    {
+      "epoch": 0.5967511913457859,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0007383163078831754,
+      "loss": 0.832,
+      "step": 8578
+    },
+    {
+      "epoch": 0.5968207589829212,
+      "grad_norm": 1.4765625,
+      "learning_rate": 0.000738098833554325,
+      "loss": 0.9228,
+      "step": 8579
+    },
+    {
+      "epoch": 0.5968903266200564,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0007378813725234958,
+      "loss": 0.784,
+      "step": 8580
+    },
+    {
+      "epoch": 0.5969598942571915,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0007376639248017291,
+      "loss": 0.5069,
+      "step": 8581
+    },
+    {
+      "epoch": 0.5970294618943267,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0007374464904000658,
+      "loss": 0.6602,
+      "step": 8582
+    },
+    {
+      "epoch": 0.597099029531462,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0007372290693295469,
+      "loss": 0.8191,
+      "step": 8583
+    },
+    {
+      "epoch": 0.5971685971685972,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0007370116616012112,
+      "loss": 0.8624,
+      "step": 8584
+    },
+    {
+      "epoch": 0.5972381648057323,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0007367942672260974,
+      "loss": 0.9326,
+      "step": 8585
+    },
+    {
+      "epoch": 0.5973077324428676,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0007365768862152447,
+      "loss": 0.9081,
+      "step": 8586
+    },
+    {
+      "epoch": 0.5973773000800028,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0007363595185796895,
+      "loss": 0.837,
+      "step": 8587
+    },
+    {
+      "epoch": 0.597446867717138,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0007361421643304692,
+      "loss": 1.1247,
+      "step": 8588
+    },
+    {
+      "epoch": 0.5975164353542732,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0007359248234786198,
+      "loss": 0.7298,
+      "step": 8589
+    },
+    {
+      "epoch": 0.5975860029914084,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0007357074960351771,
+      "loss": 0.8085,
+      "step": 8590
+    },
+    {
+      "epoch": 0.5976555706285436,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0007354901820111753,
+      "loss": 0.9944,
+      "step": 8591
+    },
+    {
+      "epoch": 0.5977251382656789,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0007352728814176489,
+      "loss": 0.8763,
+      "step": 8592
+    },
+    {
+      "epoch": 0.597794705902814,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0007350555942656311,
+      "loss": 0.9391,
+      "step": 8593
+    },
+    {
+      "epoch": 0.5978642735399492,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0007348383205661552,
+      "loss": 0.8523,
+      "step": 8594
+    },
+    {
+      "epoch": 0.5979338411770844,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0007346210603302528,
+      "loss": 0.8748,
+      "step": 8595
+    },
+    {
+      "epoch": 0.5980034088142197,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.000734403813568955,
+      "loss": 0.8543,
+      "step": 8596
+    },
+    {
+      "epoch": 0.5980729764513548,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0007341865802932932,
+      "loss": 0.812,
+      "step": 8597
+    },
+    {
+      "epoch": 0.59814254408849,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0007339693605142969,
+      "loss": 0.7481,
+      "step": 8598
+    },
+    {
+      "epoch": 0.5982121117256253,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0007337521542429955,
+      "loss": 0.6395,
+      "step": 8599
+    },
+    {
+      "epoch": 0.5982816793627604,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0007335349614904179,
+      "loss": 0.7155,
+      "step": 8600
+    },
+    {
+      "epoch": 0.5983512469998956,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0007333177822675918,
+      "loss": 0.8966,
+      "step": 8601
+    },
+    {
+      "epoch": 0.5984208146370309,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0007331006165855448,
+      "loss": 0.7555,
+      "step": 8602
+    },
+    {
+      "epoch": 0.5984903822741661,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0007328834644553026,
+      "loss": 0.8706,
+      "step": 8603
+    },
+    {
+      "epoch": 0.5985599499113012,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0007326663258878923,
+      "loss": 0.8422,
+      "step": 8604
+    },
+    {
+      "epoch": 0.5986295175484365,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0007324492008943382,
+      "loss": 0.9008,
+      "step": 8605
+    },
+    {
+      "epoch": 0.5986990851855717,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.000732232089485665,
+      "loss": 0.7004,
+      "step": 8606
+    },
+    {
+      "epoch": 0.5987686528227069,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0007320149916728969,
+      "loss": 0.6937,
+      "step": 8607
+    },
+    {
+      "epoch": 0.598838220459842,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0007317979074670569,
+      "loss": 0.8367,
+      "step": 8608
+    },
+    {
+      "epoch": 0.5989077880969773,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0007315808368791671,
+      "loss": 0.8127,
+      "step": 8609
+    },
+    {
+      "epoch": 0.5989773557341125,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0007313637799202493,
+      "loss": 0.8375,
+      "step": 8610
+    },
+    {
+      "epoch": 0.5990469233712477,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0007311467366013251,
+      "loss": 0.7361,
+      "step": 8611
+    },
+    {
+      "epoch": 0.5991164910083829,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0007309297069334143,
+      "loss": 0.7678,
+      "step": 8612
+    },
+    {
+      "epoch": 0.5991860586455181,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0007307126909275365,
+      "loss": 0.8631,
+      "step": 8613
+    },
+    {
+      "epoch": 0.5992556262826533,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0007304956885947114,
+      "loss": 1.013,
+      "step": 8614
+    },
+    {
+      "epoch": 0.5993251939197886,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0007302786999459569,
+      "loss": 0.6568,
+      "step": 8615
+    },
+    {
+      "epoch": 0.5993947615569237,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0007300617249922903,
+      "loss": 0.8126,
+      "step": 8616
+    },
+    {
+      "epoch": 0.5994643291940589,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0007298447637447284,
+      "loss": 0.5406,
+      "step": 8617
+    },
+    {
+      "epoch": 0.5995338968311942,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0007296278162142882,
+      "loss": 0.6937,
+      "step": 8618
+    },
+    {
+      "epoch": 0.5996034644683294,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0007294108824119846,
+      "loss": 0.8836,
+      "step": 8619
+    },
+    {
+      "epoch": 0.5996730321054645,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0007291939623488324,
+      "loss": 1.1188,
+      "step": 8620
+    },
+    {
+      "epoch": 0.5997425997425997,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0007289770560358458,
+      "loss": 0.8996,
+      "step": 8621
+    },
+    {
+      "epoch": 0.599812167379735,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0007287601634840384,
+      "loss": 0.8006,
+      "step": 8622
+    },
+    {
+      "epoch": 0.5998817350168701,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0007285432847044227,
+      "loss": 0.7099,
+      "step": 8623
+    },
+    {
+      "epoch": 0.5999513026540053,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0007283264197080106,
+      "loss": 0.9949,
+      "step": 8624
+    },
+    {
+      "epoch": 0.6000208702911406,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0007281095685058137,
+      "loss": 1.1107,
+      "step": 8625
+    },
+    {
+      "epoch": 0.6000904379282758,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0007278927311088426,
+      "loss": 0.8135,
+      "step": 8626
+    },
+    {
+      "epoch": 0.6001600055654109,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0007276759075281069,
+      "loss": 0.905,
+      "step": 8627
+    },
+    {
+      "epoch": 0.6002295732025462,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0007274590977746161,
+      "loss": 0.868,
+      "step": 8628
+    },
+    {
+      "epoch": 0.6002991408396814,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0007272423018593787,
+      "loss": 0.7799,
+      "step": 8629
+    },
+    {
+      "epoch": 0.6003687084768166,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0007270255197934024,
+      "loss": 0.8145,
+      "step": 8630
+    },
+    {
+      "epoch": 0.6004382761139518,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0007268087515876939,
+      "loss": 0.8468,
+      "step": 8631
+    },
+    {
+      "epoch": 0.600507843751087,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0007265919972532603,
+      "loss": 0.5068,
+      "step": 8632
+    },
+    {
+      "epoch": 0.6005774113882222,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0007263752568011073,
+      "loss": 0.8017,
+      "step": 8633
+    },
+    {
+      "epoch": 0.6006469790253574,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0007261585302422392,
+      "loss": 0.7295,
+      "step": 8634
+    },
+    {
+      "epoch": 0.6007165466624926,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0007259418175876607,
+      "loss": 0.9748,
+      "step": 8635
+    },
+    {
+      "epoch": 0.6007861142996278,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0007257251188483756,
+      "loss": 0.9806,
+      "step": 8636
+    },
+    {
+      "epoch": 0.600855681936763,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0007255084340353862,
+      "loss": 0.6462,
+      "step": 8637
+    },
+    {
+      "epoch": 0.6009252495738983,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0007252917631596949,
+      "loss": 0.8728,
+      "step": 8638
+    },
+    {
+      "epoch": 0.6009948172110334,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.0007250751062323036,
+      "loss": 1.0304,
+      "step": 8639
+    },
+    {
+      "epoch": 0.6010643848481686,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0007248584632642127,
+      "loss": 0.8004,
+      "step": 8640
+    },
+    {
+      "epoch": 0.6011339524853039,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.000724641834266422,
+      "loss": 0.6333,
+      "step": 8641
+    },
+    {
+      "epoch": 0.601203520122439,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.000724425219249931,
+      "loss": 0.8913,
+      "step": 8642
+    },
+    {
+      "epoch": 0.6012730877595742,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0007242086182257386,
+      "loss": 0.9301,
+      "step": 8643
+    },
+    {
+      "epoch": 0.6013426553967095,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0007239920312048423,
+      "loss": 0.7183,
+      "step": 8644
+    },
+    {
+      "epoch": 0.6014122230338447,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0007237754581982394,
+      "loss": 0.7616,
+      "step": 8645
+    },
+    {
+      "epoch": 0.6014817906709798,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0007235588992169265,
+      "loss": 0.6698,
+      "step": 8646
+    },
+    {
+      "epoch": 0.601551358308115,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.0007233423542718997,
+      "loss": 0.8227,
+      "step": 8647
+    },
+    {
+      "epoch": 0.6016209259452503,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0007231258233741533,
+      "loss": 0.8004,
+      "step": 8648
+    },
+    {
+      "epoch": 0.6016904935823855,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0007229093065346818,
+      "loss": 0.9062,
+      "step": 8649
+    },
+    {
+      "epoch": 0.6017600612195206,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0007226928037644798,
+      "loss": 0.9083,
+      "step": 8650
+    },
+    {
+      "epoch": 0.6018296288566559,
+      "grad_norm": 1.125,
+      "learning_rate": 0.000722476315074539,
+      "loss": 0.6559,
+      "step": 8651
+    },
+    {
+      "epoch": 0.6018991964937911,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.000722259840475852,
+      "loss": 0.7909,
+      "step": 8652
+    },
+    {
+      "epoch": 0.6019687641309263,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0007220433799794106,
+      "loss": 0.6338,
+      "step": 8653
+    },
+    {
+      "epoch": 0.6020383317680615,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0007218269335962055,
+      "loss": 0.97,
+      "step": 8654
+    },
+    {
+      "epoch": 0.6021078994051967,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0007216105013372266,
+      "loss": 0.8516,
+      "step": 8655
+    },
+    {
+      "epoch": 0.6021774670423319,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0007213940832134629,
+      "loss": 0.913,
+      "step": 8656
+    },
+    {
+      "epoch": 0.6022470346794672,
+      "grad_norm": 1.546875,
+      "learning_rate": 0.0007211776792359038,
+      "loss": 0.9543,
+      "step": 8657
+    },
+    {
+      "epoch": 0.6023166023166023,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0007209612894155367,
+      "loss": 0.8336,
+      "step": 8658
+    },
+    {
+      "epoch": 0.6023861699537375,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0007207449137633483,
+      "loss": 0.9429,
+      "step": 8659
+    },
+    {
+      "epoch": 0.6024557375908727,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0007205285522903262,
+      "loss": 0.7073,
+      "step": 8660
+    },
+    {
+      "epoch": 0.602525305228008,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.0007203122050074556,
+      "loss": 0.468,
+      "step": 8661
+    },
+    {
+      "epoch": 0.6025948728651431,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0007200958719257213,
+      "loss": 0.9161,
+      "step": 8662
+    },
+    {
+      "epoch": 0.6026644405022783,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0007198795530561077,
+      "loss": 0.9724,
+      "step": 8663
+    },
+    {
+      "epoch": 0.6027340081394136,
+      "grad_norm": 1.6171875,
+      "learning_rate": 0.0007196632484095986,
+      "loss": 0.6824,
+      "step": 8664
+    },
+    {
+      "epoch": 0.6028035757765488,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0007194469579971769,
+      "loss": 0.8068,
+      "step": 8665
+    },
+    {
+      "epoch": 0.6028731434136839,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0007192306818298244,
+      "loss": 0.8191,
+      "step": 8666
+    },
+    {
+      "epoch": 0.6029427110508192,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0007190144199185227,
+      "loss": 0.8178,
+      "step": 8667
+    },
+    {
+      "epoch": 0.6030122786879544,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0007187981722742527,
+      "loss": 0.6776,
+      "step": 8668
+    },
+    {
+      "epoch": 0.6030818463250895,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0007185819389079939,
+      "loss": 0.8442,
+      "step": 8669
+    },
+    {
+      "epoch": 0.6031514139622248,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0007183657198307258,
+      "loss": 0.6645,
+      "step": 8670
+    },
+    {
+      "epoch": 0.60322098159936,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.000718149515053427,
+      "loss": 0.7999,
+      "step": 8671
+    },
+    {
+      "epoch": 0.6032905492364952,
+      "grad_norm": 1.65625,
+      "learning_rate": 0.0007179333245870753,
+      "loss": 0.7067,
+      "step": 8672
+    },
+    {
+      "epoch": 0.6033601168736303,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0007177171484426474,
+      "loss": 0.6579,
+      "step": 8673
+    },
+    {
+      "epoch": 0.6034296845107656,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0007175009866311199,
+      "loss": 0.7907,
+      "step": 8674
+    },
+    {
+      "epoch": 0.6034992521479008,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0007172848391634687,
+      "loss": 0.8546,
+      "step": 8675
+    },
+    {
+      "epoch": 0.603568819785036,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0007170687060506682,
+      "loss": 0.9074,
+      "step": 8676
+    },
+    {
+      "epoch": 0.6036383874221712,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0007168525873036926,
+      "loss": 0.8058,
+      "step": 8677
+    },
+    {
+      "epoch": 0.6037079550593064,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0007166364829335155,
+      "loss": 0.8768,
+      "step": 8678
+    },
+    {
+      "epoch": 0.6037775226964416,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.00071642039295111,
+      "loss": 0.8745,
+      "step": 8679
+    },
+    {
+      "epoch": 0.6038470903335769,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0007162043173674468,
+      "loss": 0.7824,
+      "step": 8680
+    },
+    {
+      "epoch": 0.603916657970712,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0007159882561934984,
+      "loss": 0.9104,
+      "step": 8681
+    },
+    {
+      "epoch": 0.6039862256078472,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0007157722094402351,
+      "loss": 0.6902,
+      "step": 8682
+    },
+    {
+      "epoch": 0.6040557932449825,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0007155561771186259,
+      "loss": 0.839,
+      "step": 8683
+    },
+    {
+      "epoch": 0.6041253608821177,
+      "grad_norm": 1.6875,
+      "learning_rate": 0.0007153401592396402,
+      "loss": 0.7838,
+      "step": 8684
+    },
+    {
+      "epoch": 0.6041949285192528,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0007151241558142467,
+      "loss": 0.8241,
+      "step": 8685
+    },
+    {
+      "epoch": 0.604264496156388,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.000714908166853413,
+      "loss": 0.745,
+      "step": 8686
+    },
+    {
+      "epoch": 0.6043340637935233,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0007146921923681051,
+      "loss": 0.7767,
+      "step": 8687
+    },
+    {
+      "epoch": 0.6044036314306585,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0007144762323692897,
+      "loss": 0.7174,
+      "step": 8688
+    },
+    {
+      "epoch": 0.6044731990677936,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.0007142602868679324,
+      "loss": 0.6602,
+      "step": 8689
+    },
+    {
+      "epoch": 0.6045427667049289,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0007140443558749974,
+      "loss": 0.6507,
+      "step": 8690
+    },
+    {
+      "epoch": 0.6046123343420641,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0007138284394014483,
+      "loss": 0.79,
+      "step": 8691
+    },
+    {
+      "epoch": 0.6046819019791992,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.000713612537458249,
+      "loss": 0.7754,
+      "step": 8692
+    },
+    {
+      "epoch": 0.6047514696163345,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0007133966500563615,
+      "loss": 0.9146,
+      "step": 8693
+    },
+    {
+      "epoch": 0.6048210372534697,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0007131807772067473,
+      "loss": 0.6198,
+      "step": 8694
+    },
+    {
+      "epoch": 0.6048906048906049,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0007129649189203677,
+      "loss": 0.7704,
+      "step": 8695
+    },
+    {
+      "epoch": 0.6049601725277401,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0007127490752081829,
+      "loss": 0.7555,
+      "step": 8696
+    },
+    {
+      "epoch": 0.6050297401648753,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0007125332460811522,
+      "loss": 0.9109,
+      "step": 8697
+    },
+    {
+      "epoch": 0.6050993078020105,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0007123174315502341,
+      "loss": 0.7943,
+      "step": 8698
+    },
+    {
+      "epoch": 0.6051688754391457,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0007121016316263869,
+      "loss": 0.7806,
+      "step": 8699
+    },
+    {
+      "epoch": 0.6052384430762809,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.000711885846320568,
+      "loss": 0.7104,
+      "step": 8700
+    },
+    {
+      "epoch": 0.6053080107134161,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0007116700756437333,
+      "loss": 0.7789,
+      "step": 8701
+    },
+    {
+      "epoch": 0.6053775783505513,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.0007114543196068389,
+      "loss": 0.7917,
+      "step": 8702
+    },
+    {
+      "epoch": 0.6054471459876866,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0007112385782208397,
+      "loss": 0.8465,
+      "step": 8703
+    },
+    {
+      "epoch": 0.6055167136248217,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0007110228514966903,
+      "loss": 1.0022,
+      "step": 8704
+    },
+    {
+      "epoch": 0.6055862812619569,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0007108071394453436,
+      "loss": 0.5816,
+      "step": 8705
+    },
+    {
+      "epoch": 0.6056558488990922,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0007105914420777529,
+      "loss": 0.6953,
+      "step": 8706
+    },
+    {
+      "epoch": 0.6057254165362274,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0007103757594048703,
+      "loss": 0.6059,
+      "step": 8707
+    },
+    {
+      "epoch": 0.6057949841733625,
+      "grad_norm": 1.765625,
+      "learning_rate": 0.0007101600914376465,
+      "loss": 0.8081,
+      "step": 8708
+    },
+    {
+      "epoch": 0.6058645518104978,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0007099444381870322,
+      "loss": 0.8674,
+      "step": 8709
+    },
+    {
+      "epoch": 0.605934119447633,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0007097287996639776,
+      "loss": 0.7671,
+      "step": 8710
+    },
+    {
+      "epoch": 0.6060036870847682,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0007095131758794317,
+      "loss": 0.6164,
+      "step": 8711
+    },
+    {
+      "epoch": 0.6060732547219033,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0007092975668443421,
+      "loss": 0.9779,
+      "step": 8712
+    },
+    {
+      "epoch": 0.6061428223590386,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.000709081972569657,
+      "loss": 0.7917,
+      "step": 8713
+    },
+    {
+      "epoch": 0.6062123899961738,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0007088663930663232,
+      "loss": 0.967,
+      "step": 8714
+    },
+    {
+      "epoch": 0.6062819576333089,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0007086508283452864,
+      "loss": 0.6636,
+      "step": 8715
+    },
+    {
+      "epoch": 0.6063515252704442,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0007084352784174917,
+      "loss": 0.8809,
+      "step": 8716
+    },
+    {
+      "epoch": 0.6064210929075794,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0007082197432938844,
+      "loss": 0.8034,
+      "step": 8717
+    },
+    {
+      "epoch": 0.6064906605447146,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0007080042229854077,
+      "loss": 0.5958,
+      "step": 8718
+    },
+    {
+      "epoch": 0.6065602281818498,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0007077887175030047,
+      "loss": 0.7012,
+      "step": 8719
+    },
+    {
+      "epoch": 0.606629795818985,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.000707573226857618,
+      "loss": 0.7884,
+      "step": 8720
+    },
+    {
+      "epoch": 0.6066993634561202,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0007073577510601889,
+      "loss": 0.6782,
+      "step": 8721
+    },
+    {
+      "epoch": 0.6067689310932555,
+      "grad_norm": 2.328125,
+      "learning_rate": 0.0007071422901216579,
+      "loss": 0.8939,
+      "step": 8722
+    },
+    {
+      "epoch": 0.6068384987303906,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0007069268440529654,
+      "loss": 0.5865,
+      "step": 8723
+    },
+    {
+      "epoch": 0.6069080663675258,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0007067114128650506,
+      "loss": 0.6065,
+      "step": 8724
+    },
+    {
+      "epoch": 0.606977634004661,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0007064959965688522,
+      "loss": 0.6358,
+      "step": 8725
+    },
+    {
+      "epoch": 0.6070472016417963,
+      "grad_norm": 1.4765625,
+      "learning_rate": 0.0007062805951753073,
+      "loss": 1.11,
+      "step": 8726
+    },
+    {
+      "epoch": 0.6071167692789314,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0007060652086953534,
+      "loss": 0.9128,
+      "step": 8727
+    },
+    {
+      "epoch": 0.6071863369160666,
+      "grad_norm": 0.78125,
+      "learning_rate": 0.0007058498371399269,
+      "loss": 0.5899,
+      "step": 8728
+    },
+    {
+      "epoch": 0.6072559045532019,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.000705634480519963,
+      "loss": 0.6378,
+      "step": 8729
+    },
+    {
+      "epoch": 0.6073254721903371,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0007054191388463962,
+      "loss": 0.958,
+      "step": 8730
+    },
+    {
+      "epoch": 0.6073950398274722,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0007052038121301609,
+      "loss": 0.9299,
+      "step": 8731
+    },
+    {
+      "epoch": 0.6074646074646075,
+      "grad_norm": 1.5703125,
+      "learning_rate": 0.0007049885003821905,
+      "loss": 1.0552,
+      "step": 8732
+    },
+    {
+      "epoch": 0.6075341751017427,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0007047732036134165,
+      "loss": 0.6015,
+      "step": 8733
+    },
+    {
+      "epoch": 0.6076037427388779,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0007045579218347712,
+      "loss": 0.7952,
+      "step": 8734
+    },
+    {
+      "epoch": 0.6076733103760131,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0007043426550571858,
+      "loss": 1.0644,
+      "step": 8735
+    },
+    {
+      "epoch": 0.6077428780131483,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0007041274032915903,
+      "loss": 0.7886,
+      "step": 8736
+    },
+    {
+      "epoch": 0.6078124456502835,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0007039121665489134,
+      "loss": 1.0829,
+      "step": 8737
+    },
+    {
+      "epoch": 0.6078820132874186,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0007036969448400847,
+      "loss": 0.9535,
+      "step": 8738
+    },
+    {
+      "epoch": 0.6079515809245539,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0007034817381760317,
+      "loss": 0.8826,
+      "step": 8739
+    },
+    {
+      "epoch": 0.6080211485616891,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0007032665465676812,
+      "loss": 0.6847,
+      "step": 8740
+    },
+    {
+      "epoch": 0.6080907161988243,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00070305137002596,
+      "loss": 0.9334,
+      "step": 8741
+    },
+    {
+      "epoch": 0.6081602838359595,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.0007028362085617935,
+      "loss": 0.7175,
+      "step": 8742
+    },
+    {
+      "epoch": 0.6082298514730947,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0007026210621861066,
+      "loss": 0.8694,
+      "step": 8743
+    },
+    {
+      "epoch": 0.6082994191102299,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0007024059309098229,
+      "loss": 0.9104,
+      "step": 8744
+    },
+    {
+      "epoch": 0.6083689867473652,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0007021908147438662,
+      "loss": 0.8699,
+      "step": 8745
+    },
+    {
+      "epoch": 0.6084385543845003,
+      "grad_norm": 1.5234375,
+      "learning_rate": 0.0007019757136991591,
+      "loss": 0.6986,
+      "step": 8746
+    },
+    {
+      "epoch": 0.6085081220216355,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0007017606277866225,
+      "loss": 0.7712,
+      "step": 8747
+    },
+    {
+      "epoch": 0.6085776896587708,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0007015455570171787,
+      "loss": 1.0519,
+      "step": 8748
+    },
+    {
+      "epoch": 0.608647257295906,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0007013305014017468,
+      "loss": 0.8171,
+      "step": 8749
+    },
+    {
+      "epoch": 0.6087168249330411,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.000701115460951247,
+      "loss": 0.8163,
+      "step": 8750
+    },
+    {
+      "epoch": 0.6087863925701763,
+      "grad_norm": 1.5859375,
+      "learning_rate": 0.0007009004356765971,
+      "loss": 0.8739,
+      "step": 8751
+    },
+    {
+      "epoch": 0.6088559602073116,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0007006854255887157,
+      "loss": 0.7692,
+      "step": 8752
+    },
+    {
+      "epoch": 0.6089255278444468,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0007004704306985201,
+      "loss": 0.9266,
+      "step": 8753
+    },
+    {
+      "epoch": 0.6089950954815819,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0007002554510169254,
+      "loss": 0.7674,
+      "step": 8754
+    },
+    {
+      "epoch": 0.6090646631187172,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0007000404865548489,
+      "loss": 0.8297,
+      "step": 8755
+    },
+    {
+      "epoch": 0.6091342307558524,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0006998255373232043,
+      "loss": 0.7031,
+      "step": 8756
+    },
+    {
+      "epoch": 0.6092037983929875,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0006996106033329061,
+      "loss": 0.7895,
+      "step": 8757
+    },
+    {
+      "epoch": 0.6092733660301228,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.000699395684594867,
+      "loss": 0.7787,
+      "step": 8758
+    },
+    {
+      "epoch": 0.609342933667258,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0006991807811200002,
+      "loss": 0.8288,
+      "step": 8759
+    },
+    {
+      "epoch": 0.6094125013043932,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0006989658929192171,
+      "loss": 0.9346,
+      "step": 8760
+    },
+    {
+      "epoch": 0.6094820689415285,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0006987510200034281,
+      "loss": 0.8066,
+      "step": 8761
+    },
+    {
+      "epoch": 0.6095516365786636,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0006985361623835447,
+      "loss": 0.6906,
+      "step": 8762
+    },
+    {
+      "epoch": 0.6096212042157988,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.000698321320070475,
+      "loss": 0.7455,
+      "step": 8763
+    },
+    {
+      "epoch": 0.609690771852934,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0006981064930751285,
+      "loss": 0.8661,
+      "step": 8764
+    },
+    {
+      "epoch": 0.6097603394900692,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0006978916814084121,
+      "loss": 0.7087,
+      "step": 8765
+    },
+    {
+      "epoch": 0.6098299071272044,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.0006976768850812336,
+      "loss": 0.6889,
+      "step": 8766
+    },
+    {
+      "epoch": 0.6098994747643396,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.000697462104104499,
+      "loss": 0.8307,
+      "step": 8767
+    },
+    {
+      "epoch": 0.6099690424014749,
+      "grad_norm": 1.515625,
+      "learning_rate": 0.0006972473384891138,
+      "loss": 0.9023,
+      "step": 8768
+    },
+    {
+      "epoch": 0.61003861003861,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0006970325882459832,
+      "loss": 0.7498,
+      "step": 8769
+    },
+    {
+      "epoch": 0.6101081776757452,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0006968178533860103,
+      "loss": 0.4831,
+      "step": 8770
+    },
+    {
+      "epoch": 0.6101777453128805,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0006966031339200989,
+      "loss": 0.664,
+      "step": 8771
+    },
+    {
+      "epoch": 0.6102473129500157,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0006963884298591507,
+      "loss": 0.8983,
+      "step": 8772
+    },
+    {
+      "epoch": 0.6103168805871508,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0006961737412140681,
+      "loss": 0.8764,
+      "step": 8773
+    },
+    {
+      "epoch": 0.6103864482242861,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0006959590679957513,
+      "loss": 0.8993,
+      "step": 8774
+    },
+    {
+      "epoch": 0.6104560158614213,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.0006957444102151009,
+      "loss": 0.7429,
+      "step": 8775
+    },
+    {
+      "epoch": 0.6105255834985565,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0006955297678830153,
+      "loss": 0.8008,
+      "step": 8776
+    },
+    {
+      "epoch": 0.6105951511356916,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0006953151410103937,
+      "loss": 0.8046,
+      "step": 8777
+    },
+    {
+      "epoch": 0.6106647187728269,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0006951005296081336,
+      "loss": 0.7889,
+      "step": 8778
+    },
+    {
+      "epoch": 0.6107342864099621,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0006948859336871314,
+      "loss": 0.8505,
+      "step": 8779
+    },
+    {
+      "epoch": 0.6108038540470972,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0006946713532582841,
+      "loss": 0.6689,
+      "step": 8780
+    },
+    {
+      "epoch": 0.6108734216842325,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0006944567883324863,
+      "loss": 0.8853,
+      "step": 8781
+    },
+    {
+      "epoch": 0.6109429893213677,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0006942422389206329,
+      "loss": 0.8171,
+      "step": 8782
+    },
+    {
+      "epoch": 0.6110125569585029,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0006940277050336172,
+      "loss": 1.0236,
+      "step": 8783
+    },
+    {
+      "epoch": 0.6110821245956382,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0006938131866823324,
+      "loss": 0.8976,
+      "step": 8784
+    },
+    {
+      "epoch": 0.6111516922327733,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0006935986838776711,
+      "loss": 0.8862,
+      "step": 8785
+    },
+    {
+      "epoch": 0.6112212598699085,
+      "grad_norm": 1.859375,
+      "learning_rate": 0.0006933841966305234,
+      "loss": 1.2325,
+      "step": 8786
+    },
+    {
+      "epoch": 0.6112908275070438,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0006931697249517816,
+      "loss": 0.8163,
+      "step": 8787
+    },
+    {
+      "epoch": 0.611360395144179,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0006929552688523344,
+      "loss": 0.9309,
+      "step": 8788
+    },
+    {
+      "epoch": 0.6114299627813141,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0006927408283430712,
+      "loss": 0.8401,
+      "step": 8789
+    },
+    {
+      "epoch": 0.6114995304184493,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0006925264034348795,
+      "loss": 0.7555,
+      "step": 8790
+    },
+    {
+      "epoch": 0.6115690980555846,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0006923119941386475,
+      "loss": 0.7396,
+      "step": 8791
+    },
+    {
+      "epoch": 0.6116386656927197,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.000692097600465262,
+      "loss": 0.6304,
+      "step": 8792
+    },
+    {
+      "epoch": 0.6117082333298549,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0006918832224256076,
+      "loss": 0.962,
+      "step": 8793
+    },
+    {
+      "epoch": 0.6117778009669902,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0006916688600305707,
+      "loss": 0.9029,
+      "step": 8794
+    },
+    {
+      "epoch": 0.6118473686041254,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0006914545132910348,
+      "loss": 0.8357,
+      "step": 8795
+    },
+    {
+      "epoch": 0.6119169362412605,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0006912401822178839,
+      "loss": 0.945,
+      "step": 8796
+    },
+    {
+      "epoch": 0.6119865038783958,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0006910258668219998,
+      "loss": 0.7049,
+      "step": 8797
+    },
+    {
+      "epoch": 0.612056071515531,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.000690811567114265,
+      "loss": 0.715,
+      "step": 8798
+    },
+    {
+      "epoch": 0.6121256391526662,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0006905972831055604,
+      "loss": 0.7373,
+      "step": 8799
+    },
+    {
+      "epoch": 0.6121952067898014,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.000690383014806766,
+      "loss": 0.9297,
+      "step": 8800
+    },
+    {
+      "epoch": 0.6122647744269366,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.000690168762228762,
+      "loss": 0.8317,
+      "step": 8801
+    },
+    {
+      "epoch": 0.6123343420640718,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0006899545253824265,
+      "loss": 0.6979,
+      "step": 8802
+    },
+    {
+      "epoch": 0.612403909701207,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0006897403042786374,
+      "loss": 0.9181,
+      "step": 8803
+    },
+    {
+      "epoch": 0.6124734773383422,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.0006895260989282717,
+      "loss": 0.7358,
+      "step": 8804
+    },
+    {
+      "epoch": 0.6125430449754774,
+      "grad_norm": 1.5234375,
+      "learning_rate": 0.0006893119093422058,
+      "loss": 0.7538,
+      "step": 8805
+    },
+    {
+      "epoch": 0.6126126126126126,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0006890977355313152,
+      "loss": 0.8676,
+      "step": 8806
+    },
+    {
+      "epoch": 0.6126821802497479,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0006888835775064743,
+      "loss": 0.5304,
+      "step": 8807
+    },
+    {
+      "epoch": 0.612751747886883,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0006886694352785576,
+      "loss": 0.5638,
+      "step": 8808
+    },
+    {
+      "epoch": 0.6128213155240182,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0006884553088584376,
+      "loss": 0.6907,
+      "step": 8809
+    },
+    {
+      "epoch": 0.6128908831611535,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.0006882411982569869,
+      "loss": 1.1476,
+      "step": 8810
+    },
+    {
+      "epoch": 0.6129604507982886,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0006880271034850763,
+      "loss": 0.7789,
+      "step": 8811
+    },
+    {
+      "epoch": 0.6130300184354238,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0006878130245535772,
+      "loss": 0.9118,
+      "step": 8812
+    },
+    {
+      "epoch": 0.6130995860725591,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0006875989614733592,
+      "loss": 0.9038,
+      "step": 8813
+    },
+    {
+      "epoch": 0.6131691537096943,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.000687384914255291,
+      "loss": 0.7975,
+      "step": 8814
+    },
+    {
+      "epoch": 0.6132387213468294,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0006871708829102417,
+      "loss": 0.9838,
+      "step": 8815
+    },
+    {
+      "epoch": 0.6133082889839646,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.000686956867449078,
+      "loss": 0.7255,
+      "step": 8816
+    },
+    {
+      "epoch": 0.6133778566210999,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0006867428678826668,
+      "loss": 0.8206,
+      "step": 8817
+    },
+    {
+      "epoch": 0.6134474242582351,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0006865288842218733,
+      "loss": 0.6303,
+      "step": 8818
+    },
+    {
+      "epoch": 0.6135169918953702,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0006863149164775637,
+      "loss": 0.5099,
+      "step": 8819
+    },
+    {
+      "epoch": 0.6135865595325055,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0006861009646606012,
+      "loss": 0.706,
+      "step": 8820
+    },
+    {
+      "epoch": 0.6136561271696407,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0006858870287818494,
+      "loss": 0.7001,
+      "step": 8821
+    },
+    {
+      "epoch": 0.6137256948067759,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0006856731088521715,
+      "loss": 0.6741,
+      "step": 8822
+    },
+    {
+      "epoch": 0.6137952624439111,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0006854592048824286,
+      "loss": 0.8331,
+      "step": 8823
+    },
+    {
+      "epoch": 0.6138648300810463,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.000685245316883482,
+      "loss": 0.7475,
+      "step": 8824
+    },
+    {
+      "epoch": 0.6139343977181815,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0006850314448661912,
+      "loss": 0.6212,
+      "step": 8825
+    },
+    {
+      "epoch": 0.6140039653553168,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0006848175888414166,
+      "loss": 0.9171,
+      "step": 8826
+    },
+    {
+      "epoch": 0.6140735329924519,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0006846037488200161,
+      "loss": 0.9713,
+      "step": 8827
+    },
+    {
+      "epoch": 0.6141431006295871,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0006843899248128473,
+      "loss": 0.7993,
+      "step": 8828
+    },
+    {
+      "epoch": 0.6142126682667223,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0006841761168307676,
+      "loss": 0.8893,
+      "step": 8829
+    },
+    {
+      "epoch": 0.6142822359038576,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0006839623248846327,
+      "loss": 0.8157,
+      "step": 8830
+    },
+    {
+      "epoch": 0.6143518035409927,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0006837485489852983,
+      "loss": 0.8176,
+      "step": 8831
+    },
+    {
+      "epoch": 0.6144213711781279,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0006835347891436178,
+      "loss": 1.0037,
+      "step": 8832
+    },
+    {
+      "epoch": 0.6144909388152632,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0006833210453704463,
+      "loss": 0.9301,
+      "step": 8833
+    },
+    {
+      "epoch": 0.6145605064523983,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0006831073176766356,
+      "loss": 0.9012,
+      "step": 8834
+    },
+    {
+      "epoch": 0.6146300740895335,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.000682893606073038,
+      "loss": 0.8061,
+      "step": 8835
+    },
+    {
+      "epoch": 0.6146996417266688,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.000682679910570505,
+      "loss": 0.9522,
+      "step": 8836
+    },
+    {
+      "epoch": 0.614769209363804,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0006824662311798867,
+      "loss": 0.7811,
+      "step": 8837
+    },
+    {
+      "epoch": 0.6148387770009391,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0006822525679120326,
+      "loss": 1.0802,
+      "step": 8838
+    },
+    {
+      "epoch": 0.6149083446380744,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0006820389207777914,
+      "loss": 1.1608,
+      "step": 8839
+    },
+    {
+      "epoch": 0.6149779122752096,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0006818252897880115,
+      "loss": 0.5822,
+      "step": 8840
+    },
+    {
+      "epoch": 0.6150474799123448,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0006816116749535395,
+      "loss": 0.684,
+      "step": 8841
+    },
+    {
+      "epoch": 0.6151170475494799,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0006813980762852217,
+      "loss": 0.6828,
+      "step": 8842
+    },
+    {
+      "epoch": 0.6151866151866152,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.000681184493793904,
+      "loss": 0.9205,
+      "step": 8843
+    },
+    {
+      "epoch": 0.6152561828237504,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0006809709274904305,
+      "loss": 0.7127,
+      "step": 8844
+    },
+    {
+      "epoch": 0.6153257504608856,
+      "grad_norm": 1.578125,
+      "learning_rate": 0.0006807573773856455,
+      "loss": 0.9358,
+      "step": 8845
+    },
+    {
+      "epoch": 0.6153953180980208,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.0006805438434903915,
+      "loss": 0.8105,
+      "step": 8846
+    },
+    {
+      "epoch": 0.615464885735156,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0006803303258155111,
+      "loss": 0.9114,
+      "step": 8847
+    },
+    {
+      "epoch": 0.6155344533722912,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.0006801168243718457,
+      "loss": 0.9992,
+      "step": 8848
+    },
+    {
+      "epoch": 0.6156040210094265,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0006799033391702351,
+      "loss": 0.8461,
+      "step": 8849
+    },
+    {
+      "epoch": 0.6156735886465616,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0006796898702215199,
+      "loss": 0.6119,
+      "step": 8850
+    },
+    {
+      "epoch": 0.6157431562836968,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0006794764175365387,
+      "loss": 0.8069,
+      "step": 8851
+    },
+    {
+      "epoch": 0.6158127239208321,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0006792629811261293,
+      "loss": 0.7707,
+      "step": 8852
+    },
+    {
+      "epoch": 0.6158822915579673,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0006790495610011289,
+      "loss": 0.5123,
+      "step": 8853
+    },
+    {
+      "epoch": 0.6159518591951024,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0006788361571723744,
+      "loss": 0.7792,
+      "step": 8854
+    },
+    {
+      "epoch": 0.6160214268322376,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0006786227696507011,
+      "loss": 1.1437,
+      "step": 8855
+    },
+    {
+      "epoch": 0.6160909944693729,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0006784093984469437,
+      "loss": 0.7059,
+      "step": 8856
+    },
+    {
+      "epoch": 0.616160562106508,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0006781960435719355,
+      "loss": 0.6805,
+      "step": 8857
+    },
+    {
+      "epoch": 0.6162301297436432,
+      "grad_norm": 0.7578125,
+      "learning_rate": 0.0006779827050365109,
+      "loss": 0.5224,
+      "step": 8858
+    },
+    {
+      "epoch": 0.6162996973807785,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0006777693828515012,
+      "loss": 0.715,
+      "step": 8859
+    },
+    {
+      "epoch": 0.6163692650179137,
+      "grad_norm": 1.6171875,
+      "learning_rate": 0.0006775560770277378,
+      "loss": 0.6267,
+      "step": 8860
+    },
+    {
+      "epoch": 0.6164388326550488,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0006773427875760521,
+      "loss": 0.6734,
+      "step": 8861
+    },
+    {
+      "epoch": 0.6165084002921841,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0006771295145072731,
+      "loss": 0.7742,
+      "step": 8862
+    },
+    {
+      "epoch": 0.6165779679293193,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0006769162578322301,
+      "loss": 0.9444,
+      "step": 8863
+    },
+    {
+      "epoch": 0.6166475355664545,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0006767030175617505,
+      "loss": 0.7557,
+      "step": 8864
+    },
+    {
+      "epoch": 0.6167171032035897,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0006764897937066627,
+      "loss": 0.7259,
+      "step": 8865
+    },
+    {
+      "epoch": 0.6167866708407249,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0006762765862777924,
+      "loss": 0.9274,
+      "step": 8866
+    },
+    {
+      "epoch": 0.6168562384778601,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0006760633952859652,
+      "loss": 0.86,
+      "step": 8867
+    },
+    {
+      "epoch": 0.6169258061149953,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0006758502207420065,
+      "loss": 0.939,
+      "step": 8868
+    },
+    {
+      "epoch": 0.6169953737521305,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.0006756370626567394,
+      "loss": 0.8592,
+      "step": 8869
+    },
+    {
+      "epoch": 0.6170649413892657,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0006754239210409874,
+      "loss": 0.803,
+      "step": 8870
+    },
+    {
+      "epoch": 0.6171345090264009,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0006752107959055724,
+      "loss": 0.7991,
+      "step": 8871
+    },
+    {
+      "epoch": 0.6172040766635362,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0006749976872613166,
+      "loss": 0.8255,
+      "step": 8872
+    },
+    {
+      "epoch": 0.6172736443006713,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.00067478459511904,
+      "loss": 0.8896,
+      "step": 8873
+    },
+    {
+      "epoch": 0.6173432119378065,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0006745715194895622,
+      "loss": 0.7888,
+      "step": 8874
+    },
+    {
+      "epoch": 0.6174127795749418,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0006743584603837027,
+      "loss": 0.7649,
+      "step": 8875
+    },
+    {
+      "epoch": 0.617482347212077,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.000674145417812279,
+      "loss": 0.8674,
+      "step": 8876
+    },
+    {
+      "epoch": 0.6175519148492121,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0006739323917861087,
+      "loss": 0.7114,
+      "step": 8877
+    },
+    {
+      "epoch": 0.6176214824863474,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0006737193823160077,
+      "loss": 0.8836,
+      "step": 8878
+    },
+    {
+      "epoch": 0.6176910501234826,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0006735063894127924,
+      "loss": 0.6029,
+      "step": 8879
+    },
+    {
+      "epoch": 0.6177606177606177,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0006732934130872768,
+      "loss": 0.7293,
+      "step": 8880
+    },
+    {
+      "epoch": 0.6178301853977529,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0006730804533502747,
+      "loss": 0.7565,
+      "step": 8881
+    },
+    {
+      "epoch": 0.6178997530348882,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0006728675102125997,
+      "loss": 0.8098,
+      "step": 8882
+    },
+    {
+      "epoch": 0.6179693206720234,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0006726545836850636,
+      "loss": 0.7885,
+      "step": 8883
+    },
+    {
+      "epoch": 0.6180388883091585,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0006724416737784777,
+      "loss": 0.8904,
+      "step": 8884
+    },
+    {
+      "epoch": 0.6181084559462938,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0006722287805036525,
+      "loss": 0.9974,
+      "step": 8885
+    },
+    {
+      "epoch": 0.618178023583429,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0006720159038713981,
+      "loss": 0.681,
+      "step": 8886
+    },
+    {
+      "epoch": 0.6182475912205642,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0006718030438925227,
+      "loss": 0.6932,
+      "step": 8887
+    },
+    {
+      "epoch": 0.6183171588576994,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0006715902005778343,
+      "loss": 0.7525,
+      "step": 8888
+    },
+    {
+      "epoch": 0.6183867264948346,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0006713773739381403,
+      "loss": 0.6183,
+      "step": 8889
+    },
+    {
+      "epoch": 0.6184562941319698,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0006711645639842474,
+      "loss": 0.8117,
+      "step": 8890
+    },
+    {
+      "epoch": 0.6185258617691051,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00067095177072696,
+      "loss": 0.736,
+      "step": 8891
+    },
+    {
+      "epoch": 0.6185954294062402,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.0006707389941770829,
+      "loss": 0.9317,
+      "step": 8892
+    },
+    {
+      "epoch": 0.6186649970433754,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0006705262343454208,
+      "loss": 0.8924,
+      "step": 8893
+    },
+    {
+      "epoch": 0.6187345646805106,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0006703134912427754,
+      "loss": 0.6364,
+      "step": 8894
+    },
+    {
+      "epoch": 0.6188041323176459,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0006701007648799491,
+      "loss": 0.779,
+      "step": 8895
+    },
+    {
+      "epoch": 0.618873699954781,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0006698880552677432,
+      "loss": 0.6551,
+      "step": 8896
+    },
+    {
+      "epoch": 0.6189432675919162,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0006696753624169582,
+      "loss": 0.9548,
+      "step": 8897
+    },
+    {
+      "epoch": 0.6190128352290515,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0006694626863383932,
+      "loss": 0.7592,
+      "step": 8898
+    },
+    {
+      "epoch": 0.6190824028661867,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0006692500270428467,
+      "loss": 1.0177,
+      "step": 8899
+    },
+    {
+      "epoch": 0.6191519705033218,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0006690373845411173,
+      "loss": 0.8578,
+      "step": 8900
+    },
+    {
+      "epoch": 0.6192215381404571,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0006688247588440008,
+      "loss": 0.7239,
+      "step": 8901
+    },
+    {
+      "epoch": 0.6192911057775923,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.000668612149962294,
+      "loss": 0.6681,
+      "step": 8902
+    },
+    {
+      "epoch": 0.6193606734147274,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0006683995579067918,
+      "loss": 0.8568,
+      "step": 8903
+    },
+    {
+      "epoch": 0.6194302410518627,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0006681869826882889,
+      "loss": 0.6597,
+      "step": 8904
+    },
+    {
+      "epoch": 0.6194998086889979,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0006679744243175785,
+      "loss": 0.76,
+      "step": 8905
+    },
+    {
+      "epoch": 0.6195693763261331,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.000667761882805453,
+      "loss": 1.2157,
+      "step": 8906
+    },
+    {
+      "epoch": 0.6196389439632682,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0006675493581627049,
+      "loss": 0.7298,
+      "step": 8907
+    },
+    {
+      "epoch": 0.6197085116004035,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0006673368504001245,
+      "loss": 0.8582,
+      "step": 8908
+    },
+    {
+      "epoch": 0.6197780792375387,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.000667124359528502,
+      "loss": 1.0202,
+      "step": 8909
+    },
+    {
+      "epoch": 0.6198476468746739,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0006669118855586267,
+      "loss": 0.8024,
+      "step": 8910
+    },
+    {
+      "epoch": 0.6199172145118091,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0006666994285012873,
+      "loss": 0.8253,
+      "step": 8911
+    },
+    {
+      "epoch": 0.6199867821489443,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0006664869883672708,
+      "loss": 0.8428,
+      "step": 8912
+    },
+    {
+      "epoch": 0.6200563497860795,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.0006662745651673638,
+      "loss": 0.8329,
+      "step": 8913
+    },
+    {
+      "epoch": 0.6201259174232148,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0006660621589123526,
+      "loss": 0.7488,
+      "step": 8914
+    },
+    {
+      "epoch": 0.6201954850603499,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0006658497696130216,
+      "loss": 0.7186,
+      "step": 8915
+    },
+    {
+      "epoch": 0.6202650526974851,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.0006656373972801548,
+      "loss": 0.81,
+      "step": 8916
+    },
+    {
+      "epoch": 0.6203346203346203,
+      "grad_norm": 0.6484375,
+      "learning_rate": 0.000665425041924536,
+      "loss": 0.5148,
+      "step": 8917
+    },
+    {
+      "epoch": 0.6204041879717556,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0006652127035569473,
+      "loss": 0.8071,
+      "step": 8918
+    },
+    {
+      "epoch": 0.6204737556088907,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0006650003821881698,
+      "loss": 0.6649,
+      "step": 8919
+    },
+    {
+      "epoch": 0.6205433232460259,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0006647880778289843,
+      "loss": 0.9216,
+      "step": 8920
+    },
+    {
+      "epoch": 0.6206128908831612,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0006645757904901708,
+      "loss": 0.7633,
+      "step": 8921
+    },
+    {
+      "epoch": 0.6206824585202964,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0006643635201825081,
+      "loss": 0.5849,
+      "step": 8922
+    },
+    {
+      "epoch": 0.6207520261574315,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0006641512669167737,
+      "loss": 0.8264,
+      "step": 8923
+    },
+    {
+      "epoch": 0.6208215937945668,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0006639390307037456,
+      "loss": 0.8133,
+      "step": 8924
+    },
+    {
+      "epoch": 0.620891161431702,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0006637268115541997,
+      "loss": 0.6556,
+      "step": 8925
+    },
+    {
+      "epoch": 0.6209607290688371,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0006635146094789111,
+      "loss": 0.7816,
+      "step": 8926
+    },
+    {
+      "epoch": 0.6210302967059724,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0006633024244886546,
+      "loss": 0.9437,
+      "step": 8927
+    },
+    {
+      "epoch": 0.6210998643431076,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.0006630902565942039,
+      "loss": 0.6375,
+      "step": 8928
+    },
+    {
+      "epoch": 0.6211694319802428,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.000662878105806332,
+      "loss": 0.8066,
+      "step": 8929
+    },
+    {
+      "epoch": 0.6212389996173779,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0006626659721358103,
+      "loss": 0.7373,
+      "step": 8930
+    },
+    {
+      "epoch": 0.6213085672545132,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0006624538555934105,
+      "loss": 0.8738,
+      "step": 8931
+    },
+    {
+      "epoch": 0.6213781348916484,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0006622417561899028,
+      "loss": 0.9945,
+      "step": 8932
+    },
+    {
+      "epoch": 0.6214477025287836,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.0006620296739360561,
+      "loss": 0.6965,
+      "step": 8933
+    },
+    {
+      "epoch": 0.6215172701659188,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.0006618176088426388,
+      "loss": 0.6123,
+      "step": 8934
+    },
+    {
+      "epoch": 0.621586837803054,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0006616055609204191,
+      "loss": 0.6154,
+      "step": 8935
+    },
+    {
+      "epoch": 0.6216564054401892,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0006613935301801633,
+      "loss": 0.6419,
+      "step": 8936
+    },
+    {
+      "epoch": 0.6217259730773245,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0006611815166326373,
+      "loss": 0.6474,
+      "step": 8937
+    },
+    {
+      "epoch": 0.6217955407144596,
+      "grad_norm": 1.6796875,
+      "learning_rate": 0.0006609695202886059,
+      "loss": 0.8672,
+      "step": 8938
+    },
+    {
+      "epoch": 0.6218651083515948,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0006607575411588338,
+      "loss": 0.7448,
+      "step": 8939
+    },
+    {
+      "epoch": 0.6219346759887301,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.0006605455792540837,
+      "loss": 0.6578,
+      "step": 8940
+    },
+    {
+      "epoch": 0.6220042436258653,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0006603336345851179,
+      "loss": 0.9665,
+      "step": 8941
+    },
+    {
+      "epoch": 0.6220738112630004,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0006601217071626981,
+      "loss": 0.7588,
+      "step": 8942
+    },
+    {
+      "epoch": 0.6221433789001356,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.0006599097969975853,
+      "loss": 0.9968,
+      "step": 8943
+    },
+    {
+      "epoch": 0.6222129465372709,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0006596979041005387,
+      "loss": 0.8707,
+      "step": 8944
+    },
+    {
+      "epoch": 0.622282514174406,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.000659486028482317,
+      "loss": 1.1097,
+      "step": 8945
+    },
+    {
+      "epoch": 0.6223520818115412,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0006592741701536789,
+      "loss": 0.6772,
+      "step": 8946
+    },
+    {
+      "epoch": 0.6224216494486765,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0006590623291253807,
+      "loss": 0.8832,
+      "step": 8947
+    },
+    {
+      "epoch": 0.6224912170858117,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0006588505054081788,
+      "loss": 0.826,
+      "step": 8948
+    },
+    {
+      "epoch": 0.6225607847229468,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.000658638699012829,
+      "loss": 0.6682,
+      "step": 8949
+    },
+    {
+      "epoch": 0.6226303523600821,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0006584269099500857,
+      "loss": 0.7514,
+      "step": 8950
+    },
+    {
+      "epoch": 0.6226999199972173,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.000658215138230702,
+      "loss": 0.7864,
+      "step": 8951
+    },
+    {
+      "epoch": 0.6227694876343525,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0006580033838654305,
+      "loss": 0.585,
+      "step": 8952
+    },
+    {
+      "epoch": 0.6228390552714878,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0006577916468650238,
+      "loss": 0.9401,
+      "step": 8953
+    },
+    {
+      "epoch": 0.6229086229086229,
+      "grad_norm": 1.4765625,
+      "learning_rate": 0.0006575799272402326,
+      "loss": 1.1381,
+      "step": 8954
+    },
+    {
+      "epoch": 0.6229781905457581,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0006573682250018062,
+      "loss": 0.8025,
+      "step": 8955
+    },
+    {
+      "epoch": 0.6230477581828933,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0006571565401604948,
+      "loss": 0.8385,
+      "step": 8956
+    },
+    {
+      "epoch": 0.6231173258200285,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0006569448727270462,
+      "loss": 0.7887,
+      "step": 8957
+    },
+    {
+      "epoch": 0.6231868934571637,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0006567332227122078,
+      "loss": 1.0143,
+      "step": 8958
+    },
+    {
+      "epoch": 0.6232564610942989,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0006565215901267259,
+      "loss": 0.6021,
+      "step": 8959
+    },
+    {
+      "epoch": 0.6233260287314342,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0006563099749813466,
+      "loss": 0.7746,
+      "step": 8960
+    },
+    {
+      "epoch": 0.6233955963685693,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0006560983772868146,
+      "loss": 0.9201,
+      "step": 8961
+    },
+    {
+      "epoch": 0.6234651640057045,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0006558867970538733,
+      "loss": 0.7459,
+      "step": 8962
+    },
+    {
+      "epoch": 0.6235347316428398,
+      "grad_norm": 1.125,
+      "learning_rate": 0.000655675234293266,
+      "loss": 0.7632,
+      "step": 8963
+    },
+    {
+      "epoch": 0.623604299279975,
+      "grad_norm": 1.5,
+      "learning_rate": 0.0006554636890157352,
+      "loss": 0.6866,
+      "step": 8964
+    },
+    {
+      "epoch": 0.6236738669171101,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0006552521612320214,
+      "loss": 0.6747,
+      "step": 8965
+    },
+    {
+      "epoch": 0.6237434345542454,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0006550406509528649,
+      "loss": 0.8112,
+      "step": 8966
+    },
+    {
+      "epoch": 0.6238130021913806,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0006548291581890057,
+      "loss": 0.8227,
+      "step": 8967
+    },
+    {
+      "epoch": 0.6238825698285158,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0006546176829511823,
+      "loss": 0.6668,
+      "step": 8968
+    },
+    {
+      "epoch": 0.6239521374656509,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0006544062252501317,
+      "loss": 0.7372,
+      "step": 8969
+    },
+    {
+      "epoch": 0.6240217051027862,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0006541947850965911,
+      "loss": 0.9286,
+      "step": 8970
+    },
+    {
+      "epoch": 0.6240912727399214,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0006539833625012968,
+      "loss": 1.1129,
+      "step": 8971
+    },
+    {
+      "epoch": 0.6241608403770565,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0006537719574749828,
+      "loss": 0.6248,
+      "step": 8972
+    },
+    {
+      "epoch": 0.6242304080141918,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0006535605700283836,
+      "loss": 0.6675,
+      "step": 8973
+    },
+    {
+      "epoch": 0.624299975651327,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0006533492001722327,
+      "loss": 0.8368,
+      "step": 8974
+    },
+    {
+      "epoch": 0.6243695432884622,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0006531378479172624,
+      "loss": 0.6613,
+      "step": 8975
+    },
+    {
+      "epoch": 0.6244391109255975,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0006529265132742035,
+      "loss": 0.78,
+      "step": 8976
+    },
+    {
+      "epoch": 0.6245086785627326,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.000652715196253787,
+      "loss": 0.8276,
+      "step": 8977
+    },
+    {
+      "epoch": 0.6245782461998678,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0006525038968667425,
+      "loss": 0.8103,
+      "step": 8978
+    },
+    {
+      "epoch": 0.6246478138370031,
+      "grad_norm": 1.5625,
+      "learning_rate": 0.0006522926151237984,
+      "loss": 0.7788,
+      "step": 8979
+    },
+    {
+      "epoch": 0.6247173814741382,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0006520813510356827,
+      "loss": 0.6742,
+      "step": 8980
+    },
+    {
+      "epoch": 0.6247869491112734,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0006518701046131226,
+      "loss": 1.0233,
+      "step": 8981
+    },
+    {
+      "epoch": 0.6248565167484086,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.000651658875866844,
+      "loss": 0.7299,
+      "step": 8982
+    },
+    {
+      "epoch": 0.6249260843855439,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.0006514476648075714,
+      "loss": 0.906,
+      "step": 8983
+    },
+    {
+      "epoch": 0.624995652022679,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0006512364714460297,
+      "loss": 0.8938,
+      "step": 8984
+    },
+    {
+      "epoch": 0.6250652196598142,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0006510252957929426,
+      "loss": 1.0569,
+      "step": 8985
+    },
+    {
+      "epoch": 0.6251347872969495,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0006508141378590316,
+      "loss": 0.7806,
+      "step": 8986
+    },
+    {
+      "epoch": 0.6252043549340847,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0006506029976550184,
+      "loss": 0.7813,
+      "step": 8987
+    },
+    {
+      "epoch": 0.6252739225712198,
+      "grad_norm": 1.578125,
+      "learning_rate": 0.0006503918751916241,
+      "loss": 0.8226,
+      "step": 8988
+    },
+    {
+      "epoch": 0.6253434902083551,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0006501807704795686,
+      "loss": 0.8797,
+      "step": 8989
+    },
+    {
+      "epoch": 0.6254130578454903,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0006499696835295698,
+      "loss": 0.8448,
+      "step": 8990
+    },
+    {
+      "epoch": 0.6254826254826255,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.0006497586143523464,
+      "loss": 0.8622,
+      "step": 8991
+    },
+    {
+      "epoch": 0.6255521931197607,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0006495475629586153,
+      "loss": 0.8413,
+      "step": 8992
+    },
+    {
+      "epoch": 0.6256217607568959,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0006493365293590927,
+      "loss": 0.9827,
+      "step": 8993
+    },
+    {
+      "epoch": 0.6256913283940311,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0006491255135644931,
+      "loss": 0.6953,
+      "step": 8994
+    },
+    {
+      "epoch": 0.6257608960311662,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0006489145155855318,
+      "loss": 0.5838,
+      "step": 8995
+    },
+    {
+      "epoch": 0.6258304636683015,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.000648703535432922,
+      "loss": 0.783,
+      "step": 8996
+    },
+    {
+      "epoch": 0.6259000313054367,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0006484925731173755,
+      "loss": 0.8399,
+      "step": 8997
+    },
+    {
+      "epoch": 0.6259695989425719,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0006482816286496046,
+      "loss": 0.8525,
+      "step": 8998
+    },
+    {
+      "epoch": 0.6260391665797072,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0006480707020403198,
+      "loss": 0.8645,
+      "step": 8999
+    },
+    {
+      "epoch": 0.6261087342168423,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.0006478597933002313,
+      "loss": 0.7281,
+      "step": 9000
+    },
+    {
+      "epoch": 0.6261783018539775,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.000647648902440047,
+      "loss": 0.7644,
+      "step": 9001
+    },
+    {
+      "epoch": 0.6262478694911128,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0006474380294704756,
+      "loss": 0.7728,
+      "step": 9002
+    },
+    {
+      "epoch": 0.626317437128248,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0006472271744022243,
+      "loss": 0.5117,
+      "step": 9003
+    },
+    {
+      "epoch": 0.6263870047653831,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.0006470163372459984,
+      "loss": 0.6666,
+      "step": 9004
+    },
+    {
+      "epoch": 0.6264565724025184,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0006468055180125043,
+      "loss": 0.8523,
+      "step": 9005
+    },
+    {
+      "epoch": 0.6265261400396536,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0006465947167124455,
+      "loss": 0.7207,
+      "step": 9006
+    },
+    {
+      "epoch": 0.6265957076767887,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.000646383933356526,
+      "loss": 0.5216,
+      "step": 9007
+    },
+    {
+      "epoch": 0.6266652753139239,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0006461731679554476,
+      "loss": 0.8078,
+      "step": 9008
+    },
+    {
+      "epoch": 0.6267348429510592,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0006459624205199124,
+      "loss": 0.7409,
+      "step": 9009
+    },
+    {
+      "epoch": 0.6268044105881944,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0006457516910606213,
+      "loss": 1.0532,
+      "step": 9010
+    },
+    {
+      "epoch": 0.6268739782253295,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0006455409795882737,
+      "loss": 0.6999,
+      "step": 9011
+    },
+    {
+      "epoch": 0.6269435458624648,
+      "grad_norm": 1.5,
+      "learning_rate": 0.0006453302861135681,
+      "loss": 1.0903,
+      "step": 9012
+    },
+    {
+      "epoch": 0.6270131134996,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0006451196106472031,
+      "loss": 0.9208,
+      "step": 9013
+    },
+    {
+      "epoch": 0.6270826811367352,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0006449089531998759,
+      "loss": 0.5971,
+      "step": 9014
+    },
+    {
+      "epoch": 0.6271522487738704,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0006446983137822818,
+      "loss": 1.0601,
+      "step": 9015
+    },
+    {
+      "epoch": 0.6272218164110056,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0006444876924051168,
+      "loss": 0.8429,
+      "step": 9016
+    },
+    {
+      "epoch": 0.6272913840481408,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0006442770890790749,
+      "loss": 0.7286,
+      "step": 9017
+    },
+    {
+      "epoch": 0.6273609516852761,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0006440665038148493,
+      "loss": 0.9192,
+      "step": 9018
+    },
+    {
+      "epoch": 0.6274305193224112,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0006438559366231325,
+      "loss": 1.0337,
+      "step": 9019
+    },
+    {
+      "epoch": 0.6275000869595464,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0006436453875146161,
+      "loss": 0.8791,
+      "step": 9020
+    },
+    {
+      "epoch": 0.6275696545966816,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0006434348564999911,
+      "loss": 0.5871,
+      "step": 9021
+    },
+    {
+      "epoch": 0.6276392222338169,
+      "grad_norm": 1.6796875,
+      "learning_rate": 0.0006432243435899465,
+      "loss": 0.9247,
+      "step": 9022
+    },
+    {
+      "epoch": 0.627708789870952,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0006430138487951715,
+      "loss": 0.7651,
+      "step": 9023
+    },
+    {
+      "epoch": 0.6277783575080872,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0006428033721263541,
+      "loss": 0.9575,
+      "step": 9024
+    },
+    {
+      "epoch": 0.6278479251452225,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0006425929135941813,
+      "loss": 0.7282,
+      "step": 9025
+    },
+    {
+      "epoch": 0.6279174927823576,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.0006423824732093383,
+      "loss": 1.0565,
+      "step": 9026
+    },
+    {
+      "epoch": 0.6279870604194928,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0006421720509825111,
+      "loss": 0.8532,
+      "step": 9027
+    },
+    {
+      "epoch": 0.6280566280566281,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0006419616469243837,
+      "loss": 0.81,
+      "step": 9028
+    },
+    {
+      "epoch": 0.6281261956937633,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0006417512610456389,
+      "loss": 0.9244,
+      "step": 9029
+    },
+    {
+      "epoch": 0.6281957633308984,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0006415408933569593,
+      "loss": 0.8321,
+      "step": 9030
+    },
+    {
+      "epoch": 0.6282653309680337,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0006413305438690267,
+      "loss": 0.8649,
+      "step": 9031
+    },
+    {
+      "epoch": 0.6283348986051689,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0006411202125925213,
+      "loss": 1.0341,
+      "step": 9032
+    },
+    {
+      "epoch": 0.6284044662423041,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.0006409098995381222,
+      "loss": 0.9063,
+      "step": 9033
+    },
+    {
+      "epoch": 0.6284740338794392,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.0006406996047165086,
+      "loss": 0.7885,
+      "step": 9034
+    },
+    {
+      "epoch": 0.6285436015165745,
+      "grad_norm": 1.5625,
+      "learning_rate": 0.0006404893281383583,
+      "loss": 1.0488,
+      "step": 9035
+    },
+    {
+      "epoch": 0.6286131691537097,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0006402790698143477,
+      "loss": 0.8029,
+      "step": 9036
+    },
+    {
+      "epoch": 0.6286827367908449,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0006400688297551526,
+      "loss": 0.7888,
+      "step": 9037
+    },
+    {
+      "epoch": 0.6287523044279801,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0006398586079714485,
+      "loss": 0.903,
+      "step": 9038
+    },
+    {
+      "epoch": 0.6288218720651153,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.000639648404473909,
+      "loss": 0.8528,
+      "step": 9039
+    },
+    {
+      "epoch": 0.6288914397022505,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0006394382192732069,
+      "loss": 0.8498,
+      "step": 9040
+    },
+    {
+      "epoch": 0.6289610073393858,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0006392280523800149,
+      "loss": 0.7965,
+      "step": 9041
+    },
+    {
+      "epoch": 0.6290305749765209,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0006390179038050041,
+      "loss": 0.688,
+      "step": 9042
+    },
+    {
+      "epoch": 0.6291001426136561,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.0006388077735588441,
+      "loss": 0.6369,
+      "step": 9043
+    },
+    {
+      "epoch": 0.6291697102507914,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0006385976616522054,
+      "loss": 0.9437,
+      "step": 9044
+    },
+    {
+      "epoch": 0.6292392778879266,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0006383875680957557,
+      "loss": 0.8621,
+      "step": 9045
+    },
+    {
+      "epoch": 0.6293088455250617,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0006381774929001628,
+      "loss": 0.683,
+      "step": 9046
+    },
+    {
+      "epoch": 0.6293784131621969,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0006379674360760927,
+      "loss": 0.7988,
+      "step": 9047
+    },
+    {
+      "epoch": 0.6294479807993322,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0006377573976342114,
+      "loss": 0.7488,
+      "step": 9048
+    },
+    {
+      "epoch": 0.6295175484364673,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0006375473775851841,
+      "loss": 0.7922,
+      "step": 9049
+    },
+    {
+      "epoch": 0.6295871160736025,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0006373373759396735,
+      "loss": 0.8314,
+      "step": 9050
+    },
+    {
+      "epoch": 0.6296566837107378,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0006371273927083434,
+      "loss": 0.765,
+      "step": 9051
+    },
+    {
+      "epoch": 0.629726251347873,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.000636917427901855,
+      "loss": 0.9509,
+      "step": 9052
+    },
+    {
+      "epoch": 0.6297958189850081,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00063670748153087,
+      "loss": 0.6709,
+      "step": 9053
+    },
+    {
+      "epoch": 0.6298653866221434,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0006364975536060475,
+      "loss": 0.8492,
+      "step": 9054
+    },
+    {
+      "epoch": 0.6299349542592786,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0006362876441380471,
+      "loss": 0.787,
+      "step": 9055
+    },
+    {
+      "epoch": 0.6300045218964138,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.000636077753137527,
+      "loss": 0.5293,
+      "step": 9056
+    },
+    {
+      "epoch": 0.630074089533549,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.000635867880615144,
+      "loss": 0.4938,
+      "step": 9057
+    },
+    {
+      "epoch": 0.6301436571706842,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0006356580265815551,
+      "loss": 0.9754,
+      "step": 9058
+    },
+    {
+      "epoch": 0.6302132248078194,
+      "grad_norm": 1.125,
+      "learning_rate": 0.000635448191047415,
+      "loss": 0.6962,
+      "step": 9059
+    },
+    {
+      "epoch": 0.6302827924449546,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0006352383740233784,
+      "loss": 0.5832,
+      "step": 9060
+    },
+    {
+      "epoch": 0.6303523600820898,
+      "grad_norm": 1.515625,
+      "learning_rate": 0.0006350285755200984,
+      "loss": 0.6581,
+      "step": 9061
+    },
+    {
+      "epoch": 0.630421927719225,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0006348187955482279,
+      "loss": 0.7414,
+      "step": 9062
+    },
+    {
+      "epoch": 0.6304914953563602,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0006346090341184183,
+      "loss": 0.7638,
+      "step": 9063
+    },
+    {
+      "epoch": 0.6305610629934955,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.00063439929124132,
+      "loss": 0.9622,
+      "step": 9064
+    },
+    {
+      "epoch": 0.6306306306306306,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0006341895669275834,
+      "loss": 0.7825,
+      "step": 9065
+    },
+    {
+      "epoch": 0.6307001982677658,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0006339798611878565,
+      "loss": 0.6645,
+      "step": 9066
+    },
+    {
+      "epoch": 0.6307697659049011,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0006337701740327876,
+      "loss": 0.5933,
+      "step": 9067
+    },
+    {
+      "epoch": 0.6308393335420363,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.000633560505473023,
+      "loss": 0.6615,
+      "step": 9068
+    },
+    {
+      "epoch": 0.6309089011791714,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0006333508555192089,
+      "loss": 0.6311,
+      "step": 9069
+    },
+    {
+      "epoch": 0.6309784688163067,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0006331412241819905,
+      "loss": 0.7053,
+      "step": 9070
+    },
+    {
+      "epoch": 0.6310480364534419,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0006329316114720114,
+      "loss": 0.9294,
+      "step": 9071
+    },
+    {
+      "epoch": 0.631117604090577,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0006327220173999153,
+      "loss": 1.0703,
+      "step": 9072
+    },
+    {
+      "epoch": 0.6311871717277122,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0006325124419763438,
+      "loss": 0.8098,
+      "step": 9073
+    },
+    {
+      "epoch": 0.6312567393648475,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0006323028852119383,
+      "loss": 0.8282,
+      "step": 9074
+    },
+    {
+      "epoch": 0.6313263070019827,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0006320933471173385,
+      "loss": 0.8094,
+      "step": 9075
+    },
+    {
+      "epoch": 0.6313958746391178,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0006318838277031845,
+      "loss": 0.8589,
+      "step": 9076
+    },
+    {
+      "epoch": 0.6314654422762531,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0006316743269801142,
+      "loss": 0.907,
+      "step": 9077
+    },
+    {
+      "epoch": 0.6315350099133883,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0006314648449587649,
+      "loss": 0.7442,
+      "step": 9078
+    },
+    {
+      "epoch": 0.6316045775505235,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0006312553816497737,
+      "loss": 0.8458,
+      "step": 9079
+    },
+    {
+      "epoch": 0.6316741451876587,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0006310459370637754,
+      "loss": 0.9374,
+      "step": 9080
+    },
+    {
+      "epoch": 0.6317437128247939,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.000630836511211405,
+      "loss": 0.828,
+      "step": 9081
+    },
+    {
+      "epoch": 0.6318132804619291,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.000630627104103295,
+      "loss": 0.879,
+      "step": 9082
+    },
+    {
+      "epoch": 0.6318828480990644,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.0006304177157500796,
+      "loss": 0.6049,
+      "step": 9083
+    },
+    {
+      "epoch": 0.6319524157361995,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0006302083461623896,
+      "loss": 0.6668,
+      "step": 9084
+    },
+    {
+      "epoch": 0.6320219833733347,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0006299989953508558,
+      "loss": 0.8314,
+      "step": 9085
+    },
+    {
+      "epoch": 0.6320915510104699,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0006297896633261083,
+      "loss": 0.6113,
+      "step": 9086
+    },
+    {
+      "epoch": 0.6321611186476052,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0006295803500987755,
+      "loss": 0.7645,
+      "step": 9087
+    },
+    {
+      "epoch": 0.6322306862847403,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0006293710556794859,
+      "loss": 0.9322,
+      "step": 9088
+    },
+    {
+      "epoch": 0.6323002539218755,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.000629161780078865,
+      "loss": 0.8756,
+      "step": 9089
+    },
+    {
+      "epoch": 0.6323698215590108,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0006289525233075406,
+      "loss": 0.8356,
+      "step": 9090
+    },
+    {
+      "epoch": 0.632439389196146,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0006287432853761365,
+      "loss": 0.6584,
+      "step": 9091
+    },
+    {
+      "epoch": 0.6325089568332811,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0006285340662952775,
+      "loss": 0.9389,
+      "step": 9092
+    },
+    {
+      "epoch": 0.6325785244704164,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0006283248660755858,
+      "loss": 1.0067,
+      "step": 9093
+    },
+    {
+      "epoch": 0.6326480921075516,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0006281156847276841,
+      "loss": 0.8184,
+      "step": 9094
+    },
+    {
+      "epoch": 0.6327176597446867,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0006279065222621936,
+      "loss": 0.982,
+      "step": 9095
+    },
+    {
+      "epoch": 0.632787227381822,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0006276973786897342,
+      "loss": 1.0089,
+      "step": 9096
+    },
+    {
+      "epoch": 0.6328567950189572,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.0006274882540209258,
+      "loss": 0.598,
+      "step": 9097
+    },
+    {
+      "epoch": 0.6329263626560924,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0006272791482663859,
+      "loss": 0.6582,
+      "step": 9098
+    },
+    {
+      "epoch": 0.6329959302932275,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0006270700614367326,
+      "loss": 1.0057,
+      "step": 9099
+    },
+    {
+      "epoch": 0.6330654979303628,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0006268609935425815,
+      "loss": 1.0524,
+      "step": 9100
+    },
+    {
+      "epoch": 0.633135065567498,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0006266519445945484,
+      "loss": 0.8258,
+      "step": 9101
+    },
+    {
+      "epoch": 0.6332046332046332,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0006264429146032478,
+      "loss": 0.8512,
+      "step": 9102
+    },
+    {
+      "epoch": 0.6332742008417684,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.000626233903579293,
+      "loss": 0.8977,
+      "step": 9103
+    },
+    {
+      "epoch": 0.6333437684789036,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.000626024911533297,
+      "loss": 0.678,
+      "step": 9104
+    },
+    {
+      "epoch": 0.6334133361160388,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0006258159384758709,
+      "loss": 0.6439,
+      "step": 9105
+    },
+    {
+      "epoch": 0.6334829037531741,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0006256069844176256,
+      "loss": 0.702,
+      "step": 9106
+    },
+    {
+      "epoch": 0.6335524713903092,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0006253980493691698,
+      "loss": 0.9091,
+      "step": 9107
+    },
+    {
+      "epoch": 0.6336220390274444,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0006251891333411136,
+      "loss": 0.9381,
+      "step": 9108
+    },
+    {
+      "epoch": 0.6336916066645797,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0006249802363440638,
+      "loss": 0.816,
+      "step": 9109
+    },
+    {
+      "epoch": 0.6337611743017149,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0006247713583886272,
+      "loss": 0.9566,
+      "step": 9110
+    },
+    {
+      "epoch": 0.63383074193885,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0006245624994854102,
+      "loss": 0.8885,
+      "step": 9111
+    },
+    {
+      "epoch": 0.6339003095759852,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0006243536596450168,
+      "loss": 0.7782,
+      "step": 9112
+    },
+    {
+      "epoch": 0.6339698772131205,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.0006241448388780514,
+      "loss": 0.9671,
+      "step": 9113
+    },
+    {
+      "epoch": 0.6340394448502557,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0006239360371951161,
+      "loss": 0.7471,
+      "step": 9114
+    },
+    {
+      "epoch": 0.6341090124873908,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.0006237272546068137,
+      "loss": 0.8383,
+      "step": 9115
+    },
+    {
+      "epoch": 0.6341785801245261,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0006235184911237449,
+      "loss": 0.7559,
+      "step": 9116
+    },
+    {
+      "epoch": 0.6342481477616613,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.0006233097467565092,
+      "loss": 0.8646,
+      "step": 9117
+    },
+    {
+      "epoch": 0.6343177153987964,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0006231010215157062,
+      "loss": 0.8285,
+      "step": 9118
+    },
+    {
+      "epoch": 0.6343872830359317,
+      "grad_norm": 0.78125,
+      "learning_rate": 0.0006228923154119334,
+      "loss": 0.6399,
+      "step": 9119
+    },
+    {
+      "epoch": 0.6344568506730669,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0006226836284557885,
+      "loss": 0.8933,
+      "step": 9120
+    },
+    {
+      "epoch": 0.6345264183102021,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0006224749606578662,
+      "loss": 0.64,
+      "step": 9121
+    },
+    {
+      "epoch": 0.6345959859473373,
+      "grad_norm": 1.5234375,
+      "learning_rate": 0.0006222663120287633,
+      "loss": 0.8596,
+      "step": 9122
+    },
+    {
+      "epoch": 0.6346655535844725,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.0006220576825790729,
+      "loss": 0.9377,
+      "step": 9123
+    },
+    {
+      "epoch": 0.6347351212216077,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0006218490723193884,
+      "loss": 1.0686,
+      "step": 9124
+    },
+    {
+      "epoch": 0.6348046888587429,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.0006216404812603021,
+      "loss": 0.7806,
+      "step": 9125
+    },
+    {
+      "epoch": 0.6348742564958781,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0006214319094124051,
+      "loss": 0.7259,
+      "step": 9126
+    },
+    {
+      "epoch": 0.6349438241330133,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0006212233567862875,
+      "loss": 1.2013,
+      "step": 9127
+    },
+    {
+      "epoch": 0.6350133917701485,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0006210148233925385,
+      "loss": 0.6091,
+      "step": 9128
+    },
+    {
+      "epoch": 0.6350829594072838,
+      "grad_norm": 1.125,
+      "learning_rate": 0.000620806309241747,
+      "loss": 0.8251,
+      "step": 9129
+    },
+    {
+      "epoch": 0.6351525270444189,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0006205978143444996,
+      "loss": 0.7176,
+      "step": 9130
+    },
+    {
+      "epoch": 0.6352220946815541,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0006203893387113826,
+      "loss": 0.8539,
+      "step": 9131
+    },
+    {
+      "epoch": 0.6352916623186894,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0006201808823529819,
+      "loss": 0.7645,
+      "step": 9132
+    },
+    {
+      "epoch": 0.6353612299558246,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0006199724452798816,
+      "loss": 0.8681,
+      "step": 9133
+    },
+    {
+      "epoch": 0.6354307975929597,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.000619764027502665,
+      "loss": 0.9698,
+      "step": 9134
+    },
+    {
+      "epoch": 0.635500365230095,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0006195556290319143,
+      "loss": 0.9378,
+      "step": 9135
+    },
+    {
+      "epoch": 0.6355699328672302,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0006193472498782116,
+      "loss": 0.7968,
+      "step": 9136
+    },
+    {
+      "epoch": 0.6356395005043654,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.0006191388900521368,
+      "loss": 0.8968,
+      "step": 9137
+    },
+    {
+      "epoch": 0.6357090681415005,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.000618930549564269,
+      "loss": 0.8122,
+      "step": 9138
+    },
+    {
+      "epoch": 0.6357786357786358,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.0006187222284251879,
+      "loss": 0.7596,
+      "step": 9139
+    },
+    {
+      "epoch": 0.635848203415771,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0006185139266454698,
+      "loss": 0.5965,
+      "step": 9140
+    },
+    {
+      "epoch": 0.6359177710529061,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0006183056442356918,
+      "loss": 1.0042,
+      "step": 9141
+    },
+    {
+      "epoch": 0.6359873386900414,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0006180973812064291,
+      "loss": 0.8028,
+      "step": 9142
+    },
+    {
+      "epoch": 0.6360569063271766,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.000617889137568257,
+      "loss": 0.9763,
+      "step": 9143
+    },
+    {
+      "epoch": 0.6361264739643118,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.000617680913331748,
+      "loss": 0.5821,
+      "step": 9144
+    },
+    {
+      "epoch": 0.636196041601447,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0006174727085074751,
+      "loss": 0.637,
+      "step": 9145
+    },
+    {
+      "epoch": 0.6362656092385822,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0006172645231060103,
+      "loss": 0.9573,
+      "step": 9146
+    },
+    {
+      "epoch": 0.6363351768757174,
+      "grad_norm": 1.5234375,
+      "learning_rate": 0.000617056357137924,
+      "loss": 0.8373,
+      "step": 9147
+    },
+    {
+      "epoch": 0.6364047445128527,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0006168482106137854,
+      "loss": 0.7554,
+      "step": 9148
+    },
+    {
+      "epoch": 0.6364743121499878,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0006166400835441635,
+      "loss": 0.6937,
+      "step": 9149
+    },
+    {
+      "epoch": 0.636543879787123,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0006164319759396261,
+      "loss": 0.6661,
+      "step": 9150
+    },
+    {
+      "epoch": 0.6366134474242582,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0006162238878107394,
+      "loss": 0.7754,
+      "step": 9151
+    },
+    {
+      "epoch": 0.6366830150613935,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0006160158191680691,
+      "loss": 0.79,
+      "step": 9152
+    },
+    {
+      "epoch": 0.6367525826985286,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0006158077700221805,
+      "loss": 0.6313,
+      "step": 9153
+    },
+    {
+      "epoch": 0.6368221503356638,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0006155997403836369,
+      "loss": 0.8359,
+      "step": 9154
+    },
+    {
+      "epoch": 0.6368917179727991,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0006153917302630007,
+      "loss": 0.6287,
+      "step": 9155
+    },
+    {
+      "epoch": 0.6369612856099343,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0006151837396708337,
+      "loss": 1.0975,
+      "step": 9156
+    },
+    {
+      "epoch": 0.6370308532470694,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0006149757686176973,
+      "loss": 0.6502,
+      "step": 9157
+    },
+    {
+      "epoch": 0.6371004208842047,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0006147678171141504,
+      "loss": 0.8921,
+      "step": 9158
+    },
+    {
+      "epoch": 0.6371699885213399,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0006145598851707519,
+      "loss": 0.9472,
+      "step": 9159
+    },
+    {
+      "epoch": 0.637239556158475,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0006143519727980597,
+      "loss": 0.8791,
+      "step": 9160
+    },
+    {
+      "epoch": 0.6373091237956103,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0006141440800066309,
+      "loss": 0.9028,
+      "step": 9161
+    },
+    {
+      "epoch": 0.6373786914327455,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0006139362068070207,
+      "loss": 0.7902,
+      "step": 9162
+    },
+    {
+      "epoch": 0.6374482590698807,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0006137283532097837,
+      "loss": 0.7436,
+      "step": 9163
+    },
+    {
+      "epoch": 0.6375178267070158,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0006135205192254742,
+      "loss": 0.8525,
+      "step": 9164
+    },
+    {
+      "epoch": 0.6375873943441511,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0006133127048646448,
+      "loss": 0.8239,
+      "step": 9165
+    },
+    {
+      "epoch": 0.6376569619812863,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0006131049101378472,
+      "loss": 0.7516,
+      "step": 9166
+    },
+    {
+      "epoch": 0.6377265296184215,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0006128971350556319,
+      "loss": 0.7965,
+      "step": 9167
+    },
+    {
+      "epoch": 0.6377960972555567,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0006126893796285493,
+      "loss": 0.7398,
+      "step": 9168
+    },
+    {
+      "epoch": 0.6378656648926919,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0006124816438671476,
+      "loss": 0.7929,
+      "step": 9169
+    },
+    {
+      "epoch": 0.6379352325298271,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0006122739277819747,
+      "loss": 0.7309,
+      "step": 9170
+    },
+    {
+      "epoch": 0.6380048001669624,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0006120662313835776,
+      "loss": 1.0432,
+      "step": 9171
+    },
+    {
+      "epoch": 0.6380743678040975,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0006118585546825019,
+      "loss": 0.7969,
+      "step": 9172
+    },
+    {
+      "epoch": 0.6381439354412327,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0006116508976892925,
+      "loss": 0.6666,
+      "step": 9173
+    },
+    {
+      "epoch": 0.638213503078368,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0006114432604144928,
+      "loss": 0.6517,
+      "step": 9174
+    },
+    {
+      "epoch": 0.6382830707155032,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0006112356428686463,
+      "loss": 0.7091,
+      "step": 9175
+    },
+    {
+      "epoch": 0.6383526383526383,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0006110280450622943,
+      "loss": 0.9462,
+      "step": 9176
+    },
+    {
+      "epoch": 0.6384222059897735,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0006108204670059772,
+      "loss": 0.9103,
+      "step": 9177
+    },
+    {
+      "epoch": 0.6384917736269088,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0006106129087102354,
+      "loss": 0.827,
+      "step": 9178
+    },
+    {
+      "epoch": 0.638561341264044,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0006104053701856076,
+      "loss": 0.9415,
+      "step": 9179
+    },
+    {
+      "epoch": 0.6386309089011791,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0006101978514426312,
+      "loss": 0.7695,
+      "step": 9180
+    },
+    {
+      "epoch": 0.6387004765383144,
+      "grad_norm": 1.375,
+      "learning_rate": 0.000609990352491843,
+      "loss": 0.6326,
+      "step": 9181
+    },
+    {
+      "epoch": 0.6387700441754496,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0006097828733437794,
+      "loss": 0.6975,
+      "step": 9182
+    },
+    {
+      "epoch": 0.6388396118125848,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0006095754140089744,
+      "loss": 0.9786,
+      "step": 9183
+    },
+    {
+      "epoch": 0.63890917944972,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0006093679744979617,
+      "loss": 0.9109,
+      "step": 9184
+    },
+    {
+      "epoch": 0.6389787470868552,
+      "grad_norm": 0.79296875,
+      "learning_rate": 0.0006091605548212746,
+      "loss": 0.7764,
+      "step": 9185
+    },
+    {
+      "epoch": 0.6390483147239904,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0006089531549894447,
+      "loss": 0.6927,
+      "step": 9186
+    },
+    {
+      "epoch": 0.6391178823611257,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0006087457750130023,
+      "loss": 0.9123,
+      "step": 9187
+    },
+    {
+      "epoch": 0.6391874499982608,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0006085384149024773,
+      "loss": 0.7084,
+      "step": 9188
+    },
+    {
+      "epoch": 0.639257017635396,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.000608331074668399,
+      "loss": 1.138,
+      "step": 9189
+    },
+    {
+      "epoch": 0.6393265852725312,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.000608123754321294,
+      "loss": 0.9994,
+      "step": 9190
+    },
+    {
+      "epoch": 0.6393961529096664,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0006079164538716897,
+      "loss": 0.98,
+      "step": 9191
+    },
+    {
+      "epoch": 0.6394657205468016,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0006077091733301117,
+      "loss": 0.6393,
+      "step": 9192
+    },
+    {
+      "epoch": 0.6395352881839368,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.0006075019127070849,
+      "loss": 1.0215,
+      "step": 9193
+    },
+    {
+      "epoch": 0.6396048558210721,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0006072946720131323,
+      "loss": 0.9159,
+      "step": 9194
+    },
+    {
+      "epoch": 0.6396744234582072,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0006070874512587766,
+      "loss": 0.5661,
+      "step": 9195
+    },
+    {
+      "epoch": 0.6397439910953424,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0006068802504545402,
+      "loss": 0.8845,
+      "step": 9196
+    },
+    {
+      "epoch": 0.6398135587324777,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.000606673069610943,
+      "loss": 0.6966,
+      "step": 9197
+    },
+    {
+      "epoch": 0.6398831263696129,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0006064659087385047,
+      "loss": 0.691,
+      "step": 9198
+    },
+    {
+      "epoch": 0.639952694006748,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0006062587678477441,
+      "loss": 0.8337,
+      "step": 9199
+    },
+    {
+      "epoch": 0.6400222616438833,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0006060516469491788,
+      "loss": 0.7723,
+      "step": 9200
+    },
+    {
+      "epoch": 0.6400918292810185,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0006058445460533251,
+      "loss": 0.7276,
+      "step": 9201
+    },
+    {
+      "epoch": 0.6401613969181537,
+      "grad_norm": 1.515625,
+      "learning_rate": 0.0006056374651706985,
+      "loss": 0.9775,
+      "step": 9202
+    },
+    {
+      "epoch": 0.6402309645552888,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0006054304043118141,
+      "loss": 0.8342,
+      "step": 9203
+    },
+    {
+      "epoch": 0.6403005321924241,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0006052233634871847,
+      "loss": 0.741,
+      "step": 9204
+    },
+    {
+      "epoch": 0.6403700998295593,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.000605016342707323,
+      "loss": 0.9725,
+      "step": 9205
+    },
+    {
+      "epoch": 0.6404396674666945,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0006048093419827405,
+      "loss": 0.6572,
+      "step": 9206
+    },
+    {
+      "epoch": 0.6405092351038297,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0006046023613239482,
+      "loss": 0.7487,
+      "step": 9207
+    },
+    {
+      "epoch": 0.6405788027409649,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.0006043954007414548,
+      "loss": 1.0782,
+      "step": 9208
+    },
+    {
+      "epoch": 0.6406483703781001,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0006041884602457685,
+      "loss": 0.9721,
+      "step": 9209
+    },
+    {
+      "epoch": 0.6407179380152354,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0006039815398473978,
+      "loss": 1.022,
+      "step": 9210
+    },
+    {
+      "epoch": 0.6407875056523705,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0006037746395568481,
+      "loss": 0.9364,
+      "step": 9211
+    },
+    {
+      "epoch": 0.6408570732895057,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0006035677593846249,
+      "loss": 0.8639,
+      "step": 9212
+    },
+    {
+      "epoch": 0.640926640926641,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0006033608993412329,
+      "loss": 0.8383,
+      "step": 9213
+    },
+    {
+      "epoch": 0.6409962085637761,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0006031540594371755,
+      "loss": 0.7661,
+      "step": 9214
+    },
+    {
+      "epoch": 0.6410657762009113,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0006029472396829545,
+      "loss": 0.514,
+      "step": 9215
+    },
+    {
+      "epoch": 0.6411353438380465,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0006027404400890711,
+      "loss": 0.9713,
+      "step": 9216
+    },
+    {
+      "epoch": 0.6412049114751818,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0006025336606660262,
+      "loss": 0.6745,
+      "step": 9217
+    },
+    {
+      "epoch": 0.6412744791123169,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0006023269014243186,
+      "loss": 0.6754,
+      "step": 9218
+    },
+    {
+      "epoch": 0.6413440467494521,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0006021201623744462,
+      "loss": 0.7241,
+      "step": 9219
+    },
+    {
+      "epoch": 0.6414136143865874,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0006019134435269066,
+      "loss": 0.8975,
+      "step": 9220
+    },
+    {
+      "epoch": 0.6414831820237226,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0006017067448921962,
+      "loss": 0.732,
+      "step": 9221
+    },
+    {
+      "epoch": 0.6415527496608577,
+      "grad_norm": 0.71484375,
+      "learning_rate": 0.0006015000664808096,
+      "loss": 0.4636,
+      "step": 9222
+    },
+    {
+      "epoch": 0.641622317297993,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0006012934083032406,
+      "loss": 0.7586,
+      "step": 9223
+    },
+    {
+      "epoch": 0.6416918849351282,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0006010867703699831,
+      "loss": 0.8372,
+      "step": 9224
+    },
+    {
+      "epoch": 0.6417614525722634,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0006008801526915288,
+      "loss": 0.7143,
+      "step": 9225
+    },
+    {
+      "epoch": 0.6418310202093986,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0006006735552783683,
+      "loss": 0.8462,
+      "step": 9226
+    },
+    {
+      "epoch": 0.6419005878465338,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0006004669781409922,
+      "loss": 0.6911,
+      "step": 9227
+    },
+    {
+      "epoch": 0.641970155483669,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0006002604212898892,
+      "loss": 0.8371,
+      "step": 9228
+    },
+    {
+      "epoch": 0.6420397231208042,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.000600053884735547,
+      "loss": 0.7316,
+      "step": 9229
+    },
+    {
+      "epoch": 0.6421092907579394,
+      "grad_norm": 1.921875,
+      "learning_rate": 0.0005998473684884525,
+      "loss": 0.8465,
+      "step": 9230
+    },
+    {
+      "epoch": 0.6421788583950746,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0005996408725590918,
+      "loss": 0.7709,
+      "step": 9231
+    },
+    {
+      "epoch": 0.6422484260322098,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0005994343969579498,
+      "loss": 0.5967,
+      "step": 9232
+    },
+    {
+      "epoch": 0.642317993669345,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00059922794169551,
+      "loss": 0.9269,
+      "step": 9233
+    },
+    {
+      "epoch": 0.6423875613064802,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0005990215067822553,
+      "loss": 0.979,
+      "step": 9234
+    },
+    {
+      "epoch": 0.6424571289436154,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0005988150922286676,
+      "loss": 0.8963,
+      "step": 9235
+    },
+    {
+      "epoch": 0.6425266965807507,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0005986086980452272,
+      "loss": 0.7806,
+      "step": 9236
+    },
+    {
+      "epoch": 0.6425962642178858,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0005984023242424138,
+      "loss": 0.8006,
+      "step": 9237
+    },
+    {
+      "epoch": 0.642665831855021,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0005981959708307063,
+      "loss": 0.7691,
+      "step": 9238
+    },
+    {
+      "epoch": 0.6427353994921563,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0005979896378205824,
+      "loss": 0.477,
+      "step": 9239
+    },
+    {
+      "epoch": 0.6428049671292915,
+      "grad_norm": 1.125,
+      "learning_rate": 0.000597783325222518,
+      "loss": 0.8926,
+      "step": 9240
+    },
+    {
+      "epoch": 0.6428745347664266,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0005975770330469892,
+      "loss": 0.6737,
+      "step": 9241
+    },
+    {
+      "epoch": 0.6429441024035618,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.0005973707613044706,
+      "loss": 0.9849,
+      "step": 9242
+    },
+    {
+      "epoch": 0.6430136700406971,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.000597164510005435,
+      "loss": 0.8287,
+      "step": 9243
+    },
+    {
+      "epoch": 0.6430832376778323,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0005969582791603551,
+      "loss": 0.6986,
+      "step": 9244
+    },
+    {
+      "epoch": 0.6431528053149674,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0005967520687797023,
+      "loss": 0.9721,
+      "step": 9245
+    },
+    {
+      "epoch": 0.6432223729521027,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.0005965458788739473,
+      "loss": 0.8994,
+      "step": 9246
+    },
+    {
+      "epoch": 0.6432919405892379,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0005963397094535587,
+      "loss": 0.6767,
+      "step": 9247
+    },
+    {
+      "epoch": 0.6433615082263731,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.000596133560529005,
+      "loss": 0.9858,
+      "step": 9248
+    },
+    {
+      "epoch": 0.6434310758635083,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0005959274321107535,
+      "loss": 0.8106,
+      "step": 9249
+    },
+    {
+      "epoch": 0.6435006435006435,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0005957213242092707,
+      "loss": 0.7467,
+      "step": 9250
+    },
+    {
+      "epoch": 0.6435702111377787,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0005955152368350207,
+      "loss": 0.7088,
+      "step": 9251
+    },
+    {
+      "epoch": 0.643639778774914,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0005953091699984687,
+      "loss": 0.7812,
+      "step": 9252
+    },
+    {
+      "epoch": 0.6437093464120491,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.0005951031237100773,
+      "loss": 0.4536,
+      "step": 9253
+    },
+    {
+      "epoch": 0.6437789140491843,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0005948970979803082,
+      "loss": 1.0513,
+      "step": 9254
+    },
+    {
+      "epoch": 0.6438484816863195,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0005946910928196224,
+      "loss": 0.7437,
+      "step": 9255
+    },
+    {
+      "epoch": 0.6439180493234548,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.0005944851082384802,
+      "loss": 0.7235,
+      "step": 9256
+    },
+    {
+      "epoch": 0.6439876169605899,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0005942791442473405,
+      "loss": 0.9745,
+      "step": 9257
+    },
+    {
+      "epoch": 0.6440571845977251,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0005940732008566605,
+      "loss": 0.753,
+      "step": 9258
+    },
+    {
+      "epoch": 0.6441267522348604,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0005938672780768974,
+      "loss": 1.0134,
+      "step": 9259
+    },
+    {
+      "epoch": 0.6441963198719955,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0005936613759185073,
+      "loss": 0.9824,
+      "step": 9260
+    },
+    {
+      "epoch": 0.6442658875091307,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0005934554943919442,
+      "loss": 0.8076,
+      "step": 9261
+    },
+    {
+      "epoch": 0.644335455146266,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0005932496335076616,
+      "loss": 0.7829,
+      "step": 9262
+    },
+    {
+      "epoch": 0.6444050227834012,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0005930437932761126,
+      "loss": 0.7276,
+      "step": 9263
+    },
+    {
+      "epoch": 0.6444745904205363,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0005928379737077489,
+      "loss": 0.7165,
+      "step": 9264
+    },
+    {
+      "epoch": 0.6445441580576716,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0005926321748130201,
+      "loss": 0.7109,
+      "step": 9265
+    },
+    {
+      "epoch": 0.6446137256948068,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0005924263966023767,
+      "loss": 0.9391,
+      "step": 9266
+    },
+    {
+      "epoch": 0.644683293331942,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0005922206390862663,
+      "loss": 0.8388,
+      "step": 9267
+    },
+    {
+      "epoch": 0.6447528609690771,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0005920149022751366,
+      "loss": 0.9156,
+      "step": 9268
+    },
+    {
+      "epoch": 0.6448224286062124,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0005918091861794334,
+      "loss": 0.7867,
+      "step": 9269
+    },
+    {
+      "epoch": 0.6448919962433476,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0005916034908096026,
+      "loss": 0.8344,
+      "step": 9270
+    },
+    {
+      "epoch": 0.6449615638804828,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0005913978161760883,
+      "loss": 0.697,
+      "step": 9271
+    },
+    {
+      "epoch": 0.645031131517618,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0005911921622893331,
+      "loss": 0.7987,
+      "step": 9272
+    },
+    {
+      "epoch": 0.6451006991547532,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0005909865291597792,
+      "loss": 0.7588,
+      "step": 9273
+    },
+    {
+      "epoch": 0.6451702667918884,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.0005907809167978682,
+      "loss": 0.7214,
+      "step": 9274
+    },
+    {
+      "epoch": 0.6452398344290237,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0005905753252140394,
+      "loss": 0.8408,
+      "step": 9275
+    },
+    {
+      "epoch": 0.6453094020661588,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0005903697544187318,
+      "loss": 1.061,
+      "step": 9276
+    },
+    {
+      "epoch": 0.645378969703294,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0005901642044223834,
+      "loss": 0.6878,
+      "step": 9277
+    },
+    {
+      "epoch": 0.6454485373404293,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0005899586752354314,
+      "loss": 0.7811,
+      "step": 9278
+    },
+    {
+      "epoch": 0.6455181049775645,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0005897531668683104,
+      "loss": 0.7057,
+      "step": 9279
+    },
+    {
+      "epoch": 0.6455876726146996,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0005895476793314563,
+      "loss": 0.8484,
+      "step": 9280
+    },
+    {
+      "epoch": 0.6456572402518348,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0005893422126353021,
+      "loss": 0.8589,
+      "step": 9281
+    },
+    {
+      "epoch": 0.6457268078889701,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0005891367667902807,
+      "loss": 0.9069,
+      "step": 9282
+    },
+    {
+      "epoch": 0.6457963755261052,
+      "grad_norm": 1.65625,
+      "learning_rate": 0.0005889313418068229,
+      "loss": 0.9748,
+      "step": 9283
+    },
+    {
+      "epoch": 0.6458659431632404,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0005887259376953597,
+      "loss": 0.804,
+      "step": 9284
+    },
+    {
+      "epoch": 0.6459355108003757,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0005885205544663208,
+      "loss": 0.7784,
+      "step": 9285
+    },
+    {
+      "epoch": 0.6460050784375109,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0005883151921301337,
+      "loss": 0.6842,
+      "step": 9286
+    },
+    {
+      "epoch": 0.646074646074646,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0005881098506972265,
+      "loss": 0.709,
+      "step": 9287
+    },
+    {
+      "epoch": 0.6461442137117813,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0005879045301780247,
+      "loss": 0.8058,
+      "step": 9288
+    },
+    {
+      "epoch": 0.6462137813489165,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.000587699230582954,
+      "loss": 0.6286,
+      "step": 9289
+    },
+    {
+      "epoch": 0.6462833489860517,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0005874939519224378,
+      "loss": 1.0014,
+      "step": 9290
+    },
+    {
+      "epoch": 0.646352916623187,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0005872886942068999,
+      "loss": 0.8455,
+      "step": 9291
+    },
+    {
+      "epoch": 0.6464224842603221,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0005870834574467621,
+      "loss": 0.7493,
+      "step": 9292
+    },
+    {
+      "epoch": 0.6464920518974573,
+      "grad_norm": 2.328125,
+      "learning_rate": 0.0005868782416524446,
+      "loss": 0.8493,
+      "step": 9293
+    },
+    {
+      "epoch": 0.6465616195345925,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0005866730468343678,
+      "loss": 0.5316,
+      "step": 9294
+    },
+    {
+      "epoch": 0.6466311871717277,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0005864678730029503,
+      "loss": 0.6703,
+      "step": 9295
+    },
+    {
+      "epoch": 0.6467007548088629,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0005862627201686102,
+      "loss": 1.0093,
+      "step": 9296
+    },
+    {
+      "epoch": 0.6467703224459981,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0005860575883417634,
+      "loss": 0.7401,
+      "step": 9297
+    },
+    {
+      "epoch": 0.6468398900831334,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.000585852477532826,
+      "loss": 0.6057,
+      "step": 9298
+    },
+    {
+      "epoch": 0.6469094577202685,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0005856473877522126,
+      "loss": 0.7363,
+      "step": 9299
+    },
+    {
+      "epoch": 0.6469790253574037,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0005854423190103357,
+      "loss": 0.7201,
+      "step": 9300
+    },
+    {
+      "epoch": 0.647048592994539,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0005852372713176088,
+      "loss": 0.6854,
+      "step": 9301
+    },
+    {
+      "epoch": 0.6471181606316742,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0005850322446844427,
+      "loss": 0.8433,
+      "step": 9302
+    },
+    {
+      "epoch": 0.6471877282688093,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0005848272391212477,
+      "loss": 0.6544,
+      "step": 9303
+    },
+    {
+      "epoch": 0.6472572959059446,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0005846222546384325,
+      "loss": 1.06,
+      "step": 9304
+    },
+    {
+      "epoch": 0.6473268635430798,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0005844172912464057,
+      "loss": 0.6242,
+      "step": 9305
+    },
+    {
+      "epoch": 0.647396431180215,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0005842123489555744,
+      "loss": 0.7195,
+      "step": 9306
+    },
+    {
+      "epoch": 0.6474659988173501,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0005840074277763437,
+      "loss": 0.7005,
+      "step": 9307
+    },
+    {
+      "epoch": 0.6475355664544854,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0005838025277191197,
+      "loss": 0.7961,
+      "step": 9308
+    },
+    {
+      "epoch": 0.6476051340916206,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.0005835976487943055,
+      "loss": 0.9478,
+      "step": 9309
+    },
+    {
+      "epoch": 0.6476747017287557,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0005833927910123036,
+      "loss": 0.6468,
+      "step": 9310
+    },
+    {
+      "epoch": 0.647744269365891,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0005831879543835157,
+      "loss": 0.595,
+      "step": 9311
+    },
+    {
+      "epoch": 0.6478138370030262,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.0005829831389183431,
+      "loss": 1.113,
+      "step": 9312
+    },
+    {
+      "epoch": 0.6478834046401614,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0005827783446271848,
+      "loss": 0.9451,
+      "step": 9313
+    },
+    {
+      "epoch": 0.6479529722772966,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0005825735715204388,
+      "loss": 1.0168,
+      "step": 9314
+    },
+    {
+      "epoch": 0.6480225399144318,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0005823688196085028,
+      "loss": 0.735,
+      "step": 9315
+    },
+    {
+      "epoch": 0.648092107551567,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0005821640889017737,
+      "loss": 0.6886,
+      "step": 9316
+    },
+    {
+      "epoch": 0.6481616751887023,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.000581959379410646,
+      "loss": 0.7337,
+      "step": 9317
+    },
+    {
+      "epoch": 0.6482312428258374,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0005817546911455134,
+      "loss": 0.7089,
+      "step": 9318
+    },
+    {
+      "epoch": 0.6483008104629726,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0005815500241167699,
+      "loss": 0.7163,
+      "step": 9319
+    },
+    {
+      "epoch": 0.6483703781001078,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0005813453783348069,
+      "loss": 1.0089,
+      "step": 9320
+    },
+    {
+      "epoch": 0.6484399457372431,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0005811407538100151,
+      "loss": 0.6632,
+      "step": 9321
+    },
+    {
+      "epoch": 0.6485095133743782,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0005809361505527852,
+      "loss": 0.6799,
+      "step": 9322
+    },
+    {
+      "epoch": 0.6485790810115134,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0005807315685735052,
+      "loss": 0.7795,
+      "step": 9323
+    },
+    {
+      "epoch": 0.6486486486486487,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0005805270078825626,
+      "loss": 0.808,
+      "step": 9324
+    },
+    {
+      "epoch": 0.6487182162857839,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0005803224684903442,
+      "loss": 0.793,
+      "step": 9325
+    },
+    {
+      "epoch": 0.648787783922919,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0005801179504072359,
+      "loss": 0.8611,
+      "step": 9326
+    },
+    {
+      "epoch": 0.6488573515600543,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0005799134536436217,
+      "loss": 0.9363,
+      "step": 9327
+    },
+    {
+      "epoch": 0.6489269191971895,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0005797089782098846,
+      "loss": 0.7961,
+      "step": 9328
+    },
+    {
+      "epoch": 0.6489964868343246,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0005795045241164072,
+      "loss": 0.648,
+      "step": 9329
+    },
+    {
+      "epoch": 0.6490660544714599,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0005793000913735709,
+      "loss": 0.9337,
+      "step": 9330
+    },
+    {
+      "epoch": 0.6491356221085951,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0005790956799917555,
+      "loss": 0.8339,
+      "step": 9331
+    },
+    {
+      "epoch": 0.6492051897457303,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0005788912899813395,
+      "loss": 0.809,
+      "step": 9332
+    },
+    {
+      "epoch": 0.6492747573828654,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0005786869213527013,
+      "loss": 0.8671,
+      "step": 9333
+    },
+    {
+      "epoch": 0.6493443250200007,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0005784825741162181,
+      "loss": 0.8602,
+      "step": 9334
+    },
+    {
+      "epoch": 0.6494138926571359,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.0005782782482822653,
+      "loss": 0.6793,
+      "step": 9335
+    },
+    {
+      "epoch": 0.6494834602942711,
+      "grad_norm": 1.5546875,
+      "learning_rate": 0.0005780739438612169,
+      "loss": 0.775,
+      "step": 9336
+    },
+    {
+      "epoch": 0.6495530279314063,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.0005778696608634473,
+      "loss": 0.7829,
+      "step": 9337
+    },
+    {
+      "epoch": 0.6496225955685415,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0005776653992993282,
+      "loss": 0.6999,
+      "step": 9338
+    },
+    {
+      "epoch": 0.6496921632056767,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0005774611591792314,
+      "loss": 0.7163,
+      "step": 9339
+    },
+    {
+      "epoch": 0.649761730842812,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0005772569405135277,
+      "loss": 0.6897,
+      "step": 9340
+    },
+    {
+      "epoch": 0.6498312984799471,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0005770527433125857,
+      "loss": 0.8467,
+      "step": 9341
+    },
+    {
+      "epoch": 0.6499008661170823,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0005768485675867732,
+      "loss": 0.8483,
+      "step": 9342
+    },
+    {
+      "epoch": 0.6499704337542176,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0005766444133464577,
+      "loss": 0.8846,
+      "step": 9343
+    },
+    {
+      "epoch": 0.6500400013913528,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0005764402806020053,
+      "loss": 0.7636,
+      "step": 9344
+    },
+    {
+      "epoch": 0.6501095690284879,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0005762361693637805,
+      "loss": 0.8441,
+      "step": 9345
+    },
+    {
+      "epoch": 0.6501791366656231,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0005760320796421468,
+      "loss": 0.7062,
+      "step": 9346
+    },
+    {
+      "epoch": 0.6502487043027584,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0005758280114474671,
+      "loss": 0.7896,
+      "step": 9347
+    },
+    {
+      "epoch": 0.6503182719398936,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0005756239647901033,
+      "loss": 0.8114,
+      "step": 9348
+    },
+    {
+      "epoch": 0.6503878395770287,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0005754199396804157,
+      "loss": 0.7884,
+      "step": 9349
+    },
+    {
+      "epoch": 0.650457407214164,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0005752159361287631,
+      "loss": 1.0018,
+      "step": 9350
+    },
+    {
+      "epoch": 0.6505269748512992,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0005750119541455045,
+      "loss": 0.8045,
+      "step": 9351
+    },
+    {
+      "epoch": 0.6505965424884343,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0005748079937409965,
+      "loss": 0.8519,
+      "step": 9352
+    },
+    {
+      "epoch": 0.6506661101255696,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0005746040549255955,
+      "loss": 0.9646,
+      "step": 9353
+    },
+    {
+      "epoch": 0.6507356777627048,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0005744001377096566,
+      "loss": 0.8221,
+      "step": 9354
+    },
+    {
+      "epoch": 0.65080524539984,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0005741962421035337,
+      "loss": 1.0693,
+      "step": 9355
+    },
+    {
+      "epoch": 0.6508748130369753,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.0005739923681175789,
+      "loss": 0.7727,
+      "step": 9356
+    },
+    {
+      "epoch": 0.6509443806741104,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0005737885157621446,
+      "loss": 0.6614,
+      "step": 9357
+    },
+    {
+      "epoch": 0.6510139483112456,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0005735846850475814,
+      "loss": 0.8921,
+      "step": 9358
+    },
+    {
+      "epoch": 0.6510835159483808,
+      "grad_norm": 1.5234375,
+      "learning_rate": 0.0005733808759842387,
+      "loss": 0.9109,
+      "step": 9359
+    },
+    {
+      "epoch": 0.651153083585516,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0005731770885824643,
+      "loss": 0.916,
+      "step": 9360
+    },
+    {
+      "epoch": 0.6512226512226512,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0005729733228526061,
+      "loss": 1.0362,
+      "step": 9361
+    },
+    {
+      "epoch": 0.6512922188597864,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0005727695788050106,
+      "loss": 0.8676,
+      "step": 9362
+    },
+    {
+      "epoch": 0.6513617864969217,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0005725658564500225,
+      "loss": 0.7026,
+      "step": 9363
+    },
+    {
+      "epoch": 0.6514313541340568,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0005723621557979854,
+      "loss": 0.8282,
+      "step": 9364
+    },
+    {
+      "epoch": 0.651500921771192,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0005721584768592425,
+      "loss": 0.9341,
+      "step": 9365
+    },
+    {
+      "epoch": 0.6515704894083273,
+      "grad_norm": 1.5234375,
+      "learning_rate": 0.0005719548196441359,
+      "loss": 0.9879,
+      "step": 9366
+    },
+    {
+      "epoch": 0.6516400570454625,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0005717511841630058,
+      "loss": 0.6812,
+      "step": 9367
+    },
+    {
+      "epoch": 0.6517096246825976,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0005715475704261925,
+      "loss": 0.6552,
+      "step": 9368
+    },
+    {
+      "epoch": 0.6517791923197329,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0005713439784440341,
+      "loss": 0.6276,
+      "step": 9369
+    },
+    {
+      "epoch": 0.6518487599568681,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0005711404082268673,
+      "loss": 0.856,
+      "step": 9370
+    },
+    {
+      "epoch": 0.6519183275940033,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0005709368597850291,
+      "loss": 0.9023,
+      "step": 9371
+    },
+    {
+      "epoch": 0.6519878952311384,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0005707333331288548,
+      "loss": 1.0215,
+      "step": 9372
+    },
+    {
+      "epoch": 0.6520574628682737,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0005705298282686782,
+      "loss": 0.8265,
+      "step": 9373
+    },
+    {
+      "epoch": 0.6521270305054089,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0005703263452148319,
+      "loss": 0.772,
+      "step": 9374
+    },
+    {
+      "epoch": 0.652196598142544,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.000570122883977648,
+      "loss": 0.7103,
+      "step": 9375
+    },
+    {
+      "epoch": 0.6522661657796793,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0005699194445674577,
+      "loss": 0.9021,
+      "step": 9376
+    },
+    {
+      "epoch": 0.6523357334168145,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0005697160269945902,
+      "loss": 0.971,
+      "step": 9377
+    },
+    {
+      "epoch": 0.6524053010539497,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0005695126312693738,
+      "loss": 0.8955,
+      "step": 9378
+    },
+    {
+      "epoch": 0.652474868691085,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0005693092574021361,
+      "loss": 0.8563,
+      "step": 9379
+    },
+    {
+      "epoch": 0.6525444363282201,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0005691059054032039,
+      "loss": 0.721,
+      "step": 9380
+    },
+    {
+      "epoch": 0.6526140039653553,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0005689025752829014,
+      "loss": 1.0648,
+      "step": 9381
+    },
+    {
+      "epoch": 0.6526835716024906,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0005686992670515538,
+      "loss": 0.7004,
+      "step": 9382
+    },
+    {
+      "epoch": 0.6527531392396257,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0005684959807194835,
+      "loss": 0.8682,
+      "step": 9383
+    },
+    {
+      "epoch": 0.6528227068767609,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0005682927162970119,
+      "loss": 1.0182,
+      "step": 9384
+    },
+    {
+      "epoch": 0.6528922745138961,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0005680894737944602,
+      "loss": 0.9278,
+      "step": 9385
+    },
+    {
+      "epoch": 0.6529618421510314,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0005678862532221485,
+      "loss": 0.6509,
+      "step": 9386
+    },
+    {
+      "epoch": 0.6530314097881665,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0005676830545903948,
+      "loss": 0.7036,
+      "step": 9387
+    },
+    {
+      "epoch": 0.6531009774253017,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0005674798779095161,
+      "loss": 0.6861,
+      "step": 9388
+    },
+    {
+      "epoch": 0.653170545062437,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0005672767231898292,
+      "loss": 0.7598,
+      "step": 9389
+    },
+    {
+      "epoch": 0.6532401126995722,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0005670735904416495,
+      "loss": 0.7604,
+      "step": 9390
+    },
+    {
+      "epoch": 0.6533096803367073,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0005668704796752909,
+      "loss": 0.7848,
+      "step": 9391
+    },
+    {
+      "epoch": 0.6533792479738426,
+      "grad_norm": 1.71875,
+      "learning_rate": 0.0005666673909010658,
+      "loss": 0.9454,
+      "step": 9392
+    },
+    {
+      "epoch": 0.6534488156109778,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0005664643241292864,
+      "loss": 0.9623,
+      "step": 9393
+    },
+    {
+      "epoch": 0.653518383248113,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0005662612793702639,
+      "loss": 0.8303,
+      "step": 9394
+    },
+    {
+      "epoch": 0.6535879508852482,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0005660582566343068,
+      "loss": 0.7962,
+      "step": 9395
+    },
+    {
+      "epoch": 0.6536575185223834,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0005658552559317248,
+      "loss": 0.9712,
+      "step": 9396
+    },
+    {
+      "epoch": 0.6537270861595186,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0005656522772728243,
+      "loss": 0.9124,
+      "step": 9397
+    },
+    {
+      "epoch": 0.6537966537966537,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0005654493206679121,
+      "loss": 0.968,
+      "step": 9398
+    },
+    {
+      "epoch": 0.653866221433789,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0005652463861272928,
+      "loss": 0.6466,
+      "step": 9399
+    },
+    {
+      "epoch": 0.6539357890709242,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0005650434736612711,
+      "loss": 0.8994,
+      "step": 9400
+    },
+    {
+      "epoch": 0.6540053567080594,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.0005648405832801495,
+      "loss": 0.6486,
+      "step": 9401
+    },
+    {
+      "epoch": 0.6540749243451947,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.0005646377149942292,
+      "loss": 1.1225,
+      "step": 9402
+    },
+    {
+      "epoch": 0.6541444919823298,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0005644348688138114,
+      "loss": 0.7517,
+      "step": 9403
+    },
+    {
+      "epoch": 0.654214059619465,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.000564232044749196,
+      "loss": 0.7078,
+      "step": 9404
+    },
+    {
+      "epoch": 0.6542836272566003,
+      "grad_norm": 1.6484375,
+      "learning_rate": 0.000564029242810681,
+      "loss": 0.8726,
+      "step": 9405
+    },
+    {
+      "epoch": 0.6543531948937354,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.000563826463008563,
+      "loss": 0.7171,
+      "step": 9406
+    },
+    {
+      "epoch": 0.6544227625308706,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.0005636237053531388,
+      "loss": 0.5968,
+      "step": 9407
+    },
+    {
+      "epoch": 0.6544923301680059,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0005634209698547038,
+      "loss": 0.7417,
+      "step": 9408
+    },
+    {
+      "epoch": 0.6545618978051411,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0005632182565235514,
+      "loss": 0.8136,
+      "step": 9409
+    },
+    {
+      "epoch": 0.6546314654422762,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.000563015565369974,
+      "loss": 0.7378,
+      "step": 9410
+    },
+    {
+      "epoch": 0.6547010330794114,
+      "grad_norm": 1.59375,
+      "learning_rate": 0.0005628128964042636,
+      "loss": 0.919,
+      "step": 9411
+    },
+    {
+      "epoch": 0.6547706007165467,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0005626102496367111,
+      "loss": 0.9547,
+      "step": 9412
+    },
+    {
+      "epoch": 0.6548401683536819,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0005624076250776052,
+      "loss": 0.7509,
+      "step": 9413
+    },
+    {
+      "epoch": 0.654909735990817,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0005622050227372348,
+      "loss": 0.7757,
+      "step": 9414
+    },
+    {
+      "epoch": 0.6549793036279523,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0005620024426258867,
+      "loss": 0.6383,
+      "step": 9415
+    },
+    {
+      "epoch": 0.6550488712650875,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0005617998847538466,
+      "loss": 0.648,
+      "step": 9416
+    },
+    {
+      "epoch": 0.6551184389022227,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0005615973491313996,
+      "loss": 0.723,
+      "step": 9417
+    },
+    {
+      "epoch": 0.6551880065393579,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0005613948357688299,
+      "loss": 0.7752,
+      "step": 9418
+    },
+    {
+      "epoch": 0.6552575741764931,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0005611923446764196,
+      "loss": 0.8143,
+      "step": 9419
+    },
+    {
+      "epoch": 0.6553271418136283,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.00056098987586445,
+      "loss": 0.8424,
+      "step": 9420
+    },
+    {
+      "epoch": 0.6553967094507636,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0005607874293432017,
+      "loss": 0.987,
+      "step": 9421
+    },
+    {
+      "epoch": 0.6554662770878987,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.0005605850051229544,
+      "loss": 0.5593,
+      "step": 9422
+    },
+    {
+      "epoch": 0.6555358447250339,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0005603826032139856,
+      "loss": 0.7599,
+      "step": 9423
+    },
+    {
+      "epoch": 0.6556054123621691,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0005601802236265721,
+      "loss": 0.8143,
+      "step": 9424
+    },
+    {
+      "epoch": 0.6556749799993044,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0005599778663709898,
+      "loss": 0.5966,
+      "step": 9425
+    },
+    {
+      "epoch": 0.6557445476364395,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0005597755314575142,
+      "loss": 0.8998,
+      "step": 9426
+    },
+    {
+      "epoch": 0.6558141152735747,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0005595732188964177,
+      "loss": 1.1193,
+      "step": 9427
+    },
+    {
+      "epoch": 0.65588368291071,
+      "grad_norm": 1.5625,
+      "learning_rate": 0.0005593709286979736,
+      "loss": 0.8141,
+      "step": 9428
+    },
+    {
+      "epoch": 0.6559532505478451,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0005591686608724524,
+      "loss": 0.7236,
+      "step": 9429
+    },
+    {
+      "epoch": 0.6560228181849803,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.000558966415430125,
+      "loss": 0.8101,
+      "step": 9430
+    },
+    {
+      "epoch": 0.6560923858221156,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0005587641923812599,
+      "loss": 0.9927,
+      "step": 9431
+    },
+    {
+      "epoch": 0.6561619534592508,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0005585619917361254,
+      "loss": 0.7156,
+      "step": 9432
+    },
+    {
+      "epoch": 0.6562315210963859,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0005583598135049879,
+      "loss": 0.9143,
+      "step": 9433
+    },
+    {
+      "epoch": 0.6563010887335212,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0005581576576981125,
+      "loss": 0.8323,
+      "step": 9434
+    },
+    {
+      "epoch": 0.6563706563706564,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0005579555243257644,
+      "loss": 0.8342,
+      "step": 9435
+    },
+    {
+      "epoch": 0.6564402240077916,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0005577534133982071,
+      "loss": 0.8752,
+      "step": 9436
+    },
+    {
+      "epoch": 0.6565097916449267,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0005575513249257022,
+      "loss": 1.2009,
+      "step": 9437
+    },
+    {
+      "epoch": 0.656579359282062,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0005573492589185107,
+      "loss": 0.8155,
+      "step": 9438
+    },
+    {
+      "epoch": 0.6566489269191972,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0005571472153868926,
+      "loss": 1.0064,
+      "step": 9439
+    },
+    {
+      "epoch": 0.6567184945563324,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0005569451943411072,
+      "loss": 0.7553,
+      "step": 9440
+    },
+    {
+      "epoch": 0.6567880621934676,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.0005567431957914114,
+      "loss": 0.7241,
+      "step": 9441
+    },
+    {
+      "epoch": 0.6568576298306028,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0005565412197480621,
+      "loss": 0.8892,
+      "step": 9442
+    },
+    {
+      "epoch": 0.656927197467738,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.0005563392662213143,
+      "loss": 0.9475,
+      "step": 9443
+    },
+    {
+      "epoch": 0.6569967651048733,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0005561373352214225,
+      "loss": 0.8963,
+      "step": 9444
+    },
+    {
+      "epoch": 0.6570663327420084,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0005559354267586394,
+      "loss": 0.9154,
+      "step": 9445
+    },
+    {
+      "epoch": 0.6571359003791436,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0005557335408432174,
+      "loss": 0.7852,
+      "step": 9446
+    },
+    {
+      "epoch": 0.6572054680162789,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0005555316774854068,
+      "loss": 0.8167,
+      "step": 9447
+    },
+    {
+      "epoch": 0.657275035653414,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0005553298366954566,
+      "loss": 0.7987,
+      "step": 9448
+    },
+    {
+      "epoch": 0.6573446032905492,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.000555128018483617,
+      "loss": 0.8527,
+      "step": 9449
+    },
+    {
+      "epoch": 0.6574141709276844,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.000554926222860134,
+      "loss": 0.7769,
+      "step": 9450
+    },
+    {
+      "epoch": 0.6574837385648197,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0005547244498352542,
+      "loss": 0.7784,
+      "step": 9451
+    },
+    {
+      "epoch": 0.6575533062019548,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0005545226994192221,
+      "loss": 0.8866,
+      "step": 9452
+    },
+    {
+      "epoch": 0.65762287383909,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0005543209716222819,
+      "loss": 0.7285,
+      "step": 9453
+    },
+    {
+      "epoch": 0.6576924414762253,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0005541192664546768,
+      "loss": 0.8682,
+      "step": 9454
+    },
+    {
+      "epoch": 0.6577620091133605,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0005539175839266475,
+      "loss": 0.9855,
+      "step": 9455
+    },
+    {
+      "epoch": 0.6578315767504956,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0005537159240484353,
+      "loss": 1.0906,
+      "step": 9456
+    },
+    {
+      "epoch": 0.6579011443876309,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0005535142868302787,
+      "loss": 0.7366,
+      "step": 9457
+    },
+    {
+      "epoch": 0.6579707120247661,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0005533126722824164,
+      "loss": 0.7251,
+      "step": 9458
+    },
+    {
+      "epoch": 0.6580402796619013,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.000553111080415085,
+      "loss": 0.6313,
+      "step": 9459
+    },
+    {
+      "epoch": 0.6581098472990365,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0005529095112385207,
+      "loss": 0.8287,
+      "step": 9460
+    },
+    {
+      "epoch": 0.6581794149361717,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0005527079647629578,
+      "loss": 0.9603,
+      "step": 9461
+    },
+    {
+      "epoch": 0.6582489825733069,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0005525064409986292,
+      "loss": 0.7685,
+      "step": 9462
+    },
+    {
+      "epoch": 0.658318550210442,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0005523049399557689,
+      "loss": 0.7197,
+      "step": 9463
+    },
+    {
+      "epoch": 0.6583881178475773,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0005521034616446071,
+      "loss": 0.7905,
+      "step": 9464
+    },
+    {
+      "epoch": 0.6584576854847125,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0005519020060753739,
+      "loss": 0.9,
+      "step": 9465
+    },
+    {
+      "epoch": 0.6585272531218477,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0005517005732582981,
+      "loss": 0.7885,
+      "step": 9466
+    },
+    {
+      "epoch": 0.658596820758983,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0005514991632036073,
+      "loss": 1.0088,
+      "step": 9467
+    },
+    {
+      "epoch": 0.6586663883961181,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0005512977759215289,
+      "loss": 0.738,
+      "step": 9468
+    },
+    {
+      "epoch": 0.6587359560332533,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0005510964114222873,
+      "loss": 0.8165,
+      "step": 9469
+    },
+    {
+      "epoch": 0.6588055236703886,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0005508950697161079,
+      "loss": 0.7834,
+      "step": 9470
+    },
+    {
+      "epoch": 0.6588750913075238,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0005506937508132127,
+      "loss": 0.7316,
+      "step": 9471
+    },
+    {
+      "epoch": 0.6589446589446589,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0005504924547238245,
+      "loss": 0.6777,
+      "step": 9472
+    },
+    {
+      "epoch": 0.6590142265817942,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0005502911814581634,
+      "loss": 0.8826,
+      "step": 9473
+    },
+    {
+      "epoch": 0.6590837942189294,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.00055008993102645,
+      "loss": 0.6996,
+      "step": 9474
+    },
+    {
+      "epoch": 0.6591533618560645,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0005498887034389015,
+      "loss": 0.7108,
+      "step": 9475
+    },
+    {
+      "epoch": 0.6592229294931997,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0005496874987057361,
+      "loss": 0.6236,
+      "step": 9476
+    },
+    {
+      "epoch": 0.659292497130335,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0005494863168371701,
+      "loss": 0.7545,
+      "step": 9477
+    },
+    {
+      "epoch": 0.6593620647674702,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.0005492851578434182,
+      "loss": 0.6097,
+      "step": 9478
+    },
+    {
+      "epoch": 0.6594316324046053,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0005490840217346942,
+      "loss": 0.665,
+      "step": 9479
+    },
+    {
+      "epoch": 0.6595012000417406,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00054888290852121,
+      "loss": 0.8723,
+      "step": 9480
+    },
+    {
+      "epoch": 0.6595707676788758,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0005486818182131785,
+      "loss": 0.8145,
+      "step": 9481
+    },
+    {
+      "epoch": 0.659640335316011,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0005484807508208098,
+      "loss": 0.7581,
+      "step": 9482
+    },
+    {
+      "epoch": 0.6597099029531462,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0005482797063543125,
+      "loss": 0.6727,
+      "step": 9483
+    },
+    {
+      "epoch": 0.6597794705902814,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.0005480786848238946,
+      "loss": 0.6584,
+      "step": 9484
+    },
+    {
+      "epoch": 0.6598490382274166,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.0005478776862397631,
+      "loss": 0.7583,
+      "step": 9485
+    },
+    {
+      "epoch": 0.6599186058645519,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0005476767106121245,
+      "loss": 0.6691,
+      "step": 9486
+    },
+    {
+      "epoch": 0.659988173501687,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.000547475757951182,
+      "loss": 0.8247,
+      "step": 9487
+    },
+    {
+      "epoch": 0.6600577411388222,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.0005472748282671401,
+      "loss": 0.9761,
+      "step": 9488
+    },
+    {
+      "epoch": 0.6601273087759574,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0005470739215702001,
+      "loss": 0.8019,
+      "step": 9489
+    },
+    {
+      "epoch": 0.6601968764130927,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.000546873037870564,
+      "loss": 0.7796,
+      "step": 9490
+    },
+    {
+      "epoch": 0.6602664440502278,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.0005466721771784305,
+      "loss": 0.8948,
+      "step": 9491
+    },
+    {
+      "epoch": 0.660336011687363,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0005464713395039993,
+      "loss": 1.0112,
+      "step": 9492
+    },
+    {
+      "epoch": 0.6604055793244983,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0005462705248574677,
+      "loss": 0.852,
+      "step": 9493
+    },
+    {
+      "epoch": 0.6604751469616335,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.000546069733249031,
+      "loss": 0.9642,
+      "step": 9494
+    },
+    {
+      "epoch": 0.6605447145987686,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0005458689646888859,
+      "loss": 0.8078,
+      "step": 9495
+    },
+    {
+      "epoch": 0.6606142822359039,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.000545668219187226,
+      "loss": 0.9276,
+      "step": 9496
+    },
+    {
+      "epoch": 0.6606838498730391,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0005454674967542439,
+      "loss": 0.8876,
+      "step": 9497
+    },
+    {
+      "epoch": 0.6607534175101742,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0005452667974001308,
+      "loss": 0.7041,
+      "step": 9498
+    },
+    {
+      "epoch": 0.6608229851473094,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0005450661211350779,
+      "loss": 0.6879,
+      "step": 9499
+    },
+    {
+      "epoch": 0.6608925527844447,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0005448654679692745,
+      "loss": 0.8165,
+      "step": 9500
+    },
+    {
+      "epoch": 0.6609621204215799,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0005446648379129083,
+      "loss": 0.7812,
+      "step": 9501
+    },
+    {
+      "epoch": 0.661031688058715,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0005444642309761669,
+      "loss": 0.8959,
+      "step": 9502
+    },
+    {
+      "epoch": 0.6611012556958503,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0005442636471692355,
+      "loss": 0.9082,
+      "step": 9503
+    },
+    {
+      "epoch": 0.6611708233329855,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.0005440630865022993,
+      "loss": 0.5763,
+      "step": 9504
+    },
+    {
+      "epoch": 0.6612403909701207,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.0005438625489855412,
+      "loss": 0.844,
+      "step": 9505
+    },
+    {
+      "epoch": 0.6613099586072559,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.000543662034629144,
+      "loss": 1.0799,
+      "step": 9506
+    },
+    {
+      "epoch": 0.6613795262443911,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0005434615434432884,
+      "loss": 0.7713,
+      "step": 9507
+    },
+    {
+      "epoch": 0.6614490938815263,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0005432610754381543,
+      "loss": 0.7808,
+      "step": 9508
+    },
+    {
+      "epoch": 0.6615186615186616,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0005430606306239211,
+      "loss": 0.8285,
+      "step": 9509
+    },
+    {
+      "epoch": 0.6615882291557967,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.000542860209010766,
+      "loss": 0.6139,
+      "step": 9510
+    },
+    {
+      "epoch": 0.6616577967929319,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0005426598106088651,
+      "loss": 0.7686,
+      "step": 9511
+    },
+    {
+      "epoch": 0.6617273644300671,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0005424594354283937,
+      "loss": 0.8478,
+      "step": 9512
+    },
+    {
+      "epoch": 0.6617969320672024,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0005422590834795259,
+      "loss": 0.7465,
+      "step": 9513
+    },
+    {
+      "epoch": 0.6618664997043375,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0005420587547724352,
+      "loss": 1.0016,
+      "step": 9514
+    },
+    {
+      "epoch": 0.6619360673414727,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0005418584493172921,
+      "loss": 0.6856,
+      "step": 9515
+    },
+    {
+      "epoch": 0.662005634978608,
+      "grad_norm": 1.5390625,
+      "learning_rate": 0.0005416581671242682,
+      "loss": 1.1134,
+      "step": 9516
+    },
+    {
+      "epoch": 0.6620752026157432,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.000541457908203532,
+      "loss": 0.5706,
+      "step": 9517
+    },
+    {
+      "epoch": 0.6621447702528783,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0005412576725652525,
+      "loss": 0.8443,
+      "step": 9518
+    },
+    {
+      "epoch": 0.6622143378900136,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.0005410574602195957,
+      "loss": 0.9196,
+      "step": 9519
+    },
+    {
+      "epoch": 0.6622839055271488,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0005408572711767282,
+      "loss": 0.9009,
+      "step": 9520
+    },
+    {
+      "epoch": 0.662353473164284,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0005406571054468137,
+      "loss": 0.9215,
+      "step": 9521
+    },
+    {
+      "epoch": 0.6624230408014192,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0005404569630400163,
+      "loss": 0.9282,
+      "step": 9522
+    },
+    {
+      "epoch": 0.6624926084385544,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0005402568439664983,
+      "loss": 0.7831,
+      "step": 9523
+    },
+    {
+      "epoch": 0.6625621760756896,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0005400567482364207,
+      "loss": 0.5068,
+      "step": 9524
+    },
+    {
+      "epoch": 0.6626317437128247,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0005398566758599429,
+      "loss": 0.6971,
+      "step": 9525
+    },
+    {
+      "epoch": 0.66270131134996,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0005396566268472231,
+      "loss": 0.9397,
+      "step": 9526
+    },
+    {
+      "epoch": 0.6627708789870952,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0005394566012084203,
+      "loss": 0.5619,
+      "step": 9527
+    },
+    {
+      "epoch": 0.6628404466242304,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00053925659895369,
+      "loss": 0.9043,
+      "step": 9528
+    },
+    {
+      "epoch": 0.6629100142613656,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0005390566200931869,
+      "loss": 0.9458,
+      "step": 9529
+    },
+    {
+      "epoch": 0.6629795818985008,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.0005388566646370656,
+      "loss": 0.7127,
+      "step": 9530
+    },
+    {
+      "epoch": 0.663049149535636,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0005386567325954783,
+      "loss": 0.7,
+      "step": 9531
+    },
+    {
+      "epoch": 0.6631187171727713,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0005384568239785771,
+      "loss": 0.7347,
+      "step": 9532
+    },
+    {
+      "epoch": 0.6631882848099064,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0005382569387965115,
+      "loss": 0.8503,
+      "step": 9533
+    },
+    {
+      "epoch": 0.6632578524470416,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0005380570770594317,
+      "loss": 0.8793,
+      "step": 9534
+    },
+    {
+      "epoch": 0.6633274200841769,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0005378572387774849,
+      "loss": 0.8123,
+      "step": 9535
+    },
+    {
+      "epoch": 0.6633969877213121,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0005376574239608179,
+      "loss": 0.9453,
+      "step": 9536
+    },
+    {
+      "epoch": 0.6634665553584472,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.000537457632619577,
+      "loss": 0.8145,
+      "step": 9537
+    },
+    {
+      "epoch": 0.6635361229955824,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0005372578647639063,
+      "loss": 0.9752,
+      "step": 9538
+    },
+    {
+      "epoch": 0.6636056906327177,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0005370581204039482,
+      "loss": 0.7784,
+      "step": 9539
+    },
+    {
+      "epoch": 0.6636752582698529,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0005368583995498455,
+      "loss": 0.7265,
+      "step": 9540
+    },
+    {
+      "epoch": 0.663744825906988,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0005366587022117392,
+      "loss": 0.5821,
+      "step": 9541
+    },
+    {
+      "epoch": 0.6638143935441233,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0005364590283997685,
+      "loss": 1.0459,
+      "step": 9542
+    },
+    {
+      "epoch": 0.6638839611812585,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0005362593781240716,
+      "loss": 0.8329,
+      "step": 9543
+    },
+    {
+      "epoch": 0.6639535288183936,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0005360597513947866,
+      "loss": 0.5252,
+      "step": 9544
+    },
+    {
+      "epoch": 0.6640230964555289,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0005358601482220484,
+      "loss": 0.6583,
+      "step": 9545
+    },
+    {
+      "epoch": 0.6640926640926641,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.000535660568615993,
+      "loss": 0.8504,
+      "step": 9546
+    },
+    {
+      "epoch": 0.6641622317297993,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0005354610125867529,
+      "loss": 0.8956,
+      "step": 9547
+    },
+    {
+      "epoch": 0.6642317993669345,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0005352614801444617,
+      "loss": 1.0768,
+      "step": 9548
+    },
+    {
+      "epoch": 0.6643013670040697,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0005350619712992495,
+      "loss": 0.8248,
+      "step": 9549
+    },
+    {
+      "epoch": 0.6643709346412049,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0005348624860612471,
+      "loss": 0.7256,
+      "step": 9550
+    },
+    {
+      "epoch": 0.6644405022783401,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.0005346630244405835,
+      "loss": 0.7343,
+      "step": 9551
+    },
+    {
+      "epoch": 0.6645100699154753,
+      "grad_norm": 0.875,
+      "learning_rate": 0.0005344635864473861,
+      "loss": 0.6724,
+      "step": 9552
+    },
+    {
+      "epoch": 0.6645796375526105,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.0005342641720917809,
+      "loss": 0.6969,
+      "step": 9553
+    },
+    {
+      "epoch": 0.6646492051897457,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.0005340647813838935,
+      "loss": 1.1734,
+      "step": 9554
+    },
+    {
+      "epoch": 0.664718772826881,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0005338654143338484,
+      "loss": 0.9715,
+      "step": 9555
+    },
+    {
+      "epoch": 0.6647883404640161,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0005336660709517681,
+      "loss": 0.6627,
+      "step": 9556
+    },
+    {
+      "epoch": 0.6648579081011513,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0005334667512477742,
+      "loss": 0.8215,
+      "step": 9557
+    },
+    {
+      "epoch": 0.6649274757382866,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0005332674552319865,
+      "loss": 0.7588,
+      "step": 9558
+    },
+    {
+      "epoch": 0.6649970433754218,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0005330681829145257,
+      "loss": 0.8211,
+      "step": 9559
+    },
+    {
+      "epoch": 0.6650666110125569,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0005328689343055089,
+      "loss": 0.7802,
+      "step": 9560
+    },
+    {
+      "epoch": 0.6651361786496922,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0005326697094150528,
+      "loss": 0.699,
+      "step": 9561
+    },
+    {
+      "epoch": 0.6652057462868274,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0005324705082532737,
+      "loss": 0.6794,
+      "step": 9562
+    },
+    {
+      "epoch": 0.6652753139239626,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0005322713308302852,
+      "loss": 0.6506,
+      "step": 9563
+    },
+    {
+      "epoch": 0.6653448815610977,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0005320721771562015,
+      "loss": 0.8187,
+      "step": 9564
+    },
+    {
+      "epoch": 0.665414449198233,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0005318730472411337,
+      "loss": 0.9967,
+      "step": 9565
+    },
+    {
+      "epoch": 0.6654840168353682,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0005316739410951934,
+      "loss": 0.7501,
+      "step": 9566
+    },
+    {
+      "epoch": 0.6655535844725033,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0005314748587284895,
+      "loss": 0.7076,
+      "step": 9567
+    },
+    {
+      "epoch": 0.6656231521096386,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0005312758001511307,
+      "loss": 1.0002,
+      "step": 9568
+    },
+    {
+      "epoch": 0.6656927197467738,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0005310767653732246,
+      "loss": 0.8335,
+      "step": 9569
+    },
+    {
+      "epoch": 0.665762287383909,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0005308777544048767,
+      "loss": 0.8125,
+      "step": 9570
+    },
+    {
+      "epoch": 0.6658318550210442,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0005306787672561917,
+      "loss": 0.7869,
+      "step": 9571
+    },
+    {
+      "epoch": 0.6659014226581794,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0005304798039372731,
+      "loss": 0.7647,
+      "step": 9572
+    },
+    {
+      "epoch": 0.6659709902953146,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0005302808644582241,
+      "loss": 0.8244,
+      "step": 9573
+    },
+    {
+      "epoch": 0.6660405579324499,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0005300819488291452,
+      "loss": 0.6193,
+      "step": 9574
+    },
+    {
+      "epoch": 0.666110125569585,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.000529883057060136,
+      "loss": 0.8622,
+      "step": 9575
+    },
+    {
+      "epoch": 0.6661796932067202,
+      "grad_norm": 0.8203125,
+      "learning_rate": 0.0005296841891612959,
+      "loss": 0.8218,
+      "step": 9576
+    },
+    {
+      "epoch": 0.6662492608438554,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0005294853451427217,
+      "loss": 0.8445,
+      "step": 9577
+    },
+    {
+      "epoch": 0.6663188284809907,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0005292865250145107,
+      "loss": 0.6962,
+      "step": 9578
+    },
+    {
+      "epoch": 0.6663883961181258,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0005290877287867568,
+      "loss": 0.9622,
+      "step": 9579
+    },
+    {
+      "epoch": 0.666457963755261,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.000528888956469555,
+      "loss": 0.8845,
+      "step": 9580
+    },
+    {
+      "epoch": 0.6665275313923963,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0005286902080729967,
+      "loss": 0.9223,
+      "step": 9581
+    },
+    {
+      "epoch": 0.6665970990295315,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0005284914836071743,
+      "loss": 0.8549,
+      "step": 9582
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0005282927830821782,
+      "loss": 0.9951,
+      "step": 9583
+    },
+    {
+      "epoch": 0.6667362343038019,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.000528094106508097,
+      "loss": 0.6767,
+      "step": 9584
+    },
+    {
+      "epoch": 0.6668058019409371,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.000527895453895018,
+      "loss": 0.9949,
+      "step": 9585
+    },
+    {
+      "epoch": 0.6668753695780723,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0005276968252530283,
+      "loss": 0.7045,
+      "step": 9586
+    },
+    {
+      "epoch": 0.6669449372152075,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0005274982205922136,
+      "loss": 0.8576,
+      "step": 9587
+    },
+    {
+      "epoch": 0.6670145048523427,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0005272996399226578,
+      "loss": 0.8649,
+      "step": 9588
+    },
+    {
+      "epoch": 0.6670840724894779,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0005271010832544431,
+      "loss": 0.7696,
+      "step": 9589
+    },
+    {
+      "epoch": 0.667153640126613,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0005269025505976521,
+      "loss": 0.8932,
+      "step": 9590
+    },
+    {
+      "epoch": 0.6672232077637483,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0005267040419623652,
+      "loss": 0.9085,
+      "step": 9591
+    },
+    {
+      "epoch": 0.6672927754008835,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0005265055573586614,
+      "loss": 0.7903,
+      "step": 9592
+    },
+    {
+      "epoch": 0.6673623430380187,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0005263070967966186,
+      "loss": 0.7596,
+      "step": 9593
+    },
+    {
+      "epoch": 0.667431910675154,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0005261086602863141,
+      "loss": 0.8148,
+      "step": 9594
+    },
+    {
+      "epoch": 0.6675014783122891,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0005259102478378228,
+      "loss": 0.6692,
+      "step": 9595
+    },
+    {
+      "epoch": 0.6675710459494243,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0005257118594612195,
+      "loss": 0.916,
+      "step": 9596
+    },
+    {
+      "epoch": 0.6676406135865596,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.000525513495166578,
+      "loss": 0.5399,
+      "step": 9597
+    },
+    {
+      "epoch": 0.6677101812236947,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0005253151549639694,
+      "loss": 0.7878,
+      "step": 9598
+    },
+    {
+      "epoch": 0.6677797488608299,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0005251168388634644,
+      "loss": 0.8096,
+      "step": 9599
+    },
+    {
+      "epoch": 0.6678493164979652,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0005249185468751327,
+      "loss": 0.762,
+      "step": 9600
+    },
+    {
+      "epoch": 0.6679188841351004,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.000524720279009043,
+      "loss": 0.7958,
+      "step": 9601
+    },
+    {
+      "epoch": 0.6679884517722355,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0005245220352752619,
+      "loss": 0.9365,
+      "step": 9602
+    },
+    {
+      "epoch": 0.6680580194093707,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0005243238156838548,
+      "loss": 0.8501,
+      "step": 9603
+    },
+    {
+      "epoch": 0.668127587046506,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.000524125620244887,
+      "loss": 0.9144,
+      "step": 9604
+    },
+    {
+      "epoch": 0.6681971546836412,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0005239274489684218,
+      "loss": 0.8119,
+      "step": 9605
+    },
+    {
+      "epoch": 0.6682667223207763,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0005237293018645211,
+      "loss": 0.9269,
+      "step": 9606
+    },
+    {
+      "epoch": 0.6683362899579116,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0005235311789432457,
+      "loss": 0.9449,
+      "step": 9607
+    },
+    {
+      "epoch": 0.6684058575950468,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0005233330802146556,
+      "loss": 0.9373,
+      "step": 9608
+    },
+    {
+      "epoch": 0.668475425232182,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0005231350056888089,
+      "loss": 0.823,
+      "step": 9609
+    },
+    {
+      "epoch": 0.6685449928693172,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.000522936955375763,
+      "loss": 0.9763,
+      "step": 9610
+    },
+    {
+      "epoch": 0.6686145605064524,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0005227389292855743,
+      "loss": 0.9294,
+      "step": 9611
+    },
+    {
+      "epoch": 0.6686841281435876,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0005225409274282973,
+      "loss": 0.8357,
+      "step": 9612
+    },
+    {
+      "epoch": 0.6687536957807229,
+      "grad_norm": 1.5625,
+      "learning_rate": 0.0005223429498139849,
+      "loss": 0.8657,
+      "step": 9613
+    },
+    {
+      "epoch": 0.668823263417858,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.0005221449964526899,
+      "loss": 1.0415,
+      "step": 9614
+    },
+    {
+      "epoch": 0.6688928310549932,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.000521947067354464,
+      "loss": 0.8141,
+      "step": 9615
+    },
+    {
+      "epoch": 0.6689623986921284,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0005217491625293562,
+      "loss": 0.8964,
+      "step": 9616
+    },
+    {
+      "epoch": 0.6690319663292636,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0005215512819874152,
+      "loss": 0.8618,
+      "step": 9617
+    },
+    {
+      "epoch": 0.6691015339663988,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0005213534257386885,
+      "loss": 0.9095,
+      "step": 9618
+    },
+    {
+      "epoch": 0.669171101603534,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0005211555937932225,
+      "loss": 0.58,
+      "step": 9619
+    },
+    {
+      "epoch": 0.6692406692406693,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0005209577861610621,
+      "loss": 0.7973,
+      "step": 9620
+    },
+    {
+      "epoch": 0.6693102368778044,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0005207600028522503,
+      "loss": 0.6226,
+      "step": 9621
+    },
+    {
+      "epoch": 0.6693798045149396,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.00052056224387683,
+      "loss": 0.8321,
+      "step": 9622
+    },
+    {
+      "epoch": 0.6694493721520749,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0005203645092448428,
+      "loss": 0.7101,
+      "step": 9623
+    },
+    {
+      "epoch": 0.6695189397892101,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0005201667989663279,
+      "loss": 0.7034,
+      "step": 9624
+    },
+    {
+      "epoch": 0.6695885074263452,
+      "grad_norm": 1.515625,
+      "learning_rate": 0.0005199691130513248,
+      "loss": 0.9162,
+      "step": 9625
+    },
+    {
+      "epoch": 0.6696580750634805,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.0005197714515098705,
+      "loss": 0.7831,
+      "step": 9626
+    },
+    {
+      "epoch": 0.6697276427006157,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0005195738143520012,
+      "loss": 0.8744,
+      "step": 9627
+    },
+    {
+      "epoch": 0.6697972103377509,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0005193762015877519,
+      "loss": 0.7083,
+      "step": 9628
+    },
+    {
+      "epoch": 0.669866777974886,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.000519178613227157,
+      "loss": 0.6718,
+      "step": 9629
+    },
+    {
+      "epoch": 0.6699363456120213,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0005189810492802485,
+      "loss": 0.915,
+      "step": 9630
+    },
+    {
+      "epoch": 0.6700059132491565,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0005187835097570576,
+      "loss": 0.7503,
+      "step": 9631
+    },
+    {
+      "epoch": 0.6700754808862917,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0005185859946676143,
+      "loss": 0.8699,
+      "step": 9632
+    },
+    {
+      "epoch": 0.6701450485234269,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0005183885040219484,
+      "loss": 0.9738,
+      "step": 9633
+    },
+    {
+      "epoch": 0.6702146161605621,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0005181910378300866,
+      "loss": 0.6297,
+      "step": 9634
+    },
+    {
+      "epoch": 0.6702841837976973,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.000517993596102055,
+      "loss": 0.779,
+      "step": 9635
+    },
+    {
+      "epoch": 0.6703537514348326,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.000517796178847879,
+      "loss": 0.8163,
+      "step": 9636
+    },
+    {
+      "epoch": 0.6704233190719677,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0005175987860775832,
+      "loss": 0.7354,
+      "step": 9637
+    },
+    {
+      "epoch": 0.6704928867091029,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0005174014178011894,
+      "loss": 0.8463,
+      "step": 9638
+    },
+    {
+      "epoch": 0.6705624543462382,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0005172040740287188,
+      "loss": 0.8223,
+      "step": 9639
+    },
+    {
+      "epoch": 0.6706320219833733,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0005170067547701922,
+      "loss": 0.7575,
+      "step": 9640
+    },
+    {
+      "epoch": 0.6707015896205085,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0005168094600356277,
+      "loss": 0.5398,
+      "step": 9641
+    },
+    {
+      "epoch": 0.6707711572576437,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0005166121898350434,
+      "loss": 0.681,
+      "step": 9642
+    },
+    {
+      "epoch": 0.670840724894779,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.000516414944178456,
+      "loss": 0.9722,
+      "step": 9643
+    },
+    {
+      "epoch": 0.6709102925319141,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0005162177230758803,
+      "loss": 0.8177,
+      "step": 9644
+    },
+    {
+      "epoch": 0.6709798601690493,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0005160205265373299,
+      "loss": 0.8753,
+      "step": 9645
+    },
+    {
+      "epoch": 0.6710494278061846,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0005158233545728175,
+      "loss": 1.0807,
+      "step": 9646
+    },
+    {
+      "epoch": 0.6711189954433198,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0005156262071923553,
+      "loss": 0.8875,
+      "step": 9647
+    },
+    {
+      "epoch": 0.6711885630804549,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0005154290844059528,
+      "loss": 1.0051,
+      "step": 9648
+    },
+    {
+      "epoch": 0.6712581307175902,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.0005152319862236185,
+      "loss": 1.0798,
+      "step": 9649
+    },
+    {
+      "epoch": 0.6713276983547254,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0005150349126553607,
+      "loss": 0.8117,
+      "step": 9650
+    },
+    {
+      "epoch": 0.6713972659918606,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.000514837863711186,
+      "loss": 0.7735,
+      "step": 9651
+    },
+    {
+      "epoch": 0.6714668336289958,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0005146408394010991,
+      "loss": 0.8665,
+      "step": 9652
+    },
+    {
+      "epoch": 0.671536401266131,
+      "grad_norm": 1.5625,
+      "learning_rate": 0.0005144438397351037,
+      "loss": 1.0766,
+      "step": 9653
+    },
+    {
+      "epoch": 0.6716059689032662,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0005142468647232025,
+      "loss": 0.843,
+      "step": 9654
+    },
+    {
+      "epoch": 0.6716755365404014,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0005140499143753978,
+      "loss": 0.746,
+      "step": 9655
+    },
+    {
+      "epoch": 0.6717451041775366,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0005138529887016885,
+      "loss": 0.844,
+      "step": 9656
+    },
+    {
+      "epoch": 0.6718146718146718,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0005136560877120746,
+      "loss": 0.7019,
+      "step": 9657
+    },
+    {
+      "epoch": 0.671884239451807,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0005134592114165531,
+      "loss": 0.7212,
+      "step": 9658
+    },
+    {
+      "epoch": 0.6719538070889423,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0005132623598251201,
+      "loss": 0.9283,
+      "step": 9659
+    },
+    {
+      "epoch": 0.6720233747260774,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0005130655329477712,
+      "loss": 0.7341,
+      "step": 9660
+    },
+    {
+      "epoch": 0.6720929423632126,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0005128687307945006,
+      "loss": 0.5998,
+      "step": 9661
+    },
+    {
+      "epoch": 0.6721625100003479,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0005126719533753006,
+      "loss": 0.8028,
+      "step": 9662
+    },
+    {
+      "epoch": 0.672232077637483,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0005124752007001619,
+      "loss": 0.9841,
+      "step": 9663
+    },
+    {
+      "epoch": 0.6723016452746182,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.0005122784727790752,
+      "loss": 1.1907,
+      "step": 9664
+    },
+    {
+      "epoch": 0.6723712129117535,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0005120817696220299,
+      "loss": 1.0263,
+      "step": 9665
+    },
+    {
+      "epoch": 0.6724407805488887,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0005118850912390131,
+      "loss": 0.827,
+      "step": 9666
+    },
+    {
+      "epoch": 0.6725103481860238,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0005116884376400107,
+      "loss": 0.6834,
+      "step": 9667
+    },
+    {
+      "epoch": 0.672579915823159,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0005114918088350079,
+      "loss": 0.9123,
+      "step": 9668
+    },
+    {
+      "epoch": 0.6726494834602943,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.0005112952048339894,
+      "loss": 0.8906,
+      "step": 9669
+    },
+    {
+      "epoch": 0.6727190510974295,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0005110986256469366,
+      "loss": 0.7647,
+      "step": 9670
+    },
+    {
+      "epoch": 0.6727886187345646,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0005109020712838318,
+      "loss": 0.5316,
+      "step": 9671
+    },
+    {
+      "epoch": 0.6728581863716999,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0005107055417546547,
+      "loss": 0.6323,
+      "step": 9672
+    },
+    {
+      "epoch": 0.6729277540088351,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0005105090370693835,
+      "loss": 0.7915,
+      "step": 9673
+    },
+    {
+      "epoch": 0.6729973216459703,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.000510312557237996,
+      "loss": 1.0018,
+      "step": 9674
+    },
+    {
+      "epoch": 0.6730668892831055,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0005101161022704692,
+      "loss": 0.6877,
+      "step": 9675
+    },
+    {
+      "epoch": 0.6731364569202407,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0005099196721767776,
+      "loss": 0.7115,
+      "step": 9676
+    },
+    {
+      "epoch": 0.6732060245573759,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0005097232669668943,
+      "loss": 0.8713,
+      "step": 9677
+    },
+    {
+      "epoch": 0.6732755921945112,
+      "grad_norm": 0.875,
+      "learning_rate": 0.0005095268866507924,
+      "loss": 0.4574,
+      "step": 9678
+    },
+    {
+      "epoch": 0.6733451598316463,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0005093305312384434,
+      "loss": 0.7687,
+      "step": 9679
+    },
+    {
+      "epoch": 0.6734147274687815,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.000509134200739817,
+      "loss": 0.7457,
+      "step": 9680
+    },
+    {
+      "epoch": 0.6734842951059167,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0005089378951648811,
+      "loss": 0.5863,
+      "step": 9681
+    },
+    {
+      "epoch": 0.673553862743052,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0005087416145236039,
+      "loss": 0.7629,
+      "step": 9682
+    },
+    {
+      "epoch": 0.6736234303801871,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0005085453588259519,
+      "loss": 0.6333,
+      "step": 9683
+    },
+    {
+      "epoch": 0.6736929980173223,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0005083491280818888,
+      "loss": 0.9301,
+      "step": 9684
+    },
+    {
+      "epoch": 0.6737625656544576,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0005081529223013795,
+      "loss": 0.7265,
+      "step": 9685
+    },
+    {
+      "epoch": 0.6738321332915927,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.0005079567414943856,
+      "loss": 0.5622,
+      "step": 9686
+    },
+    {
+      "epoch": 0.6739017009287279,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0005077605856708678,
+      "loss": 0.9549,
+      "step": 9687
+    },
+    {
+      "epoch": 0.6739712685658632,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0005075644548407865,
+      "loss": 0.6197,
+      "step": 9688
+    },
+    {
+      "epoch": 0.6740408362029984,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0005073683490141005,
+      "loss": 0.8045,
+      "step": 9689
+    },
+    {
+      "epoch": 0.6741104038401335,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.0005071722682007667,
+      "loss": 0.7151,
+      "step": 9690
+    },
+    {
+      "epoch": 0.6741799714772688,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0005069762124107408,
+      "loss": 0.726,
+      "step": 9691
+    },
+    {
+      "epoch": 0.674249539114404,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0005067801816539776,
+      "loss": 1.2405,
+      "step": 9692
+    },
+    {
+      "epoch": 0.6743191067515392,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0005065841759404313,
+      "loss": 0.7895,
+      "step": 9693
+    },
+    {
+      "epoch": 0.6743886743886743,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0005063881952800535,
+      "loss": 1.0126,
+      "step": 9694
+    },
+    {
+      "epoch": 0.6744582420258096,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0005061922396827947,
+      "loss": 0.7312,
+      "step": 9695
+    },
+    {
+      "epoch": 0.6745278096629448,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0005059963091586051,
+      "loss": 0.8364,
+      "step": 9696
+    },
+    {
+      "epoch": 0.67459737730008,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0005058004037174333,
+      "loss": 0.7215,
+      "step": 9697
+    },
+    {
+      "epoch": 0.6746669449372152,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0005056045233692257,
+      "loss": 0.9513,
+      "step": 9698
+    },
+    {
+      "epoch": 0.6747365125743504,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0005054086681239288,
+      "loss": 1.0549,
+      "step": 9699
+    },
+    {
+      "epoch": 0.6748060802114856,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0005052128379914864,
+      "loss": 0.5822,
+      "step": 9700
+    },
+    {
+      "epoch": 0.6748756478486209,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0005050170329818427,
+      "loss": 0.7373,
+      "step": 9701
+    },
+    {
+      "epoch": 0.674945215485756,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0005048212531049386,
+      "loss": 0.8542,
+      "step": 9702
+    },
+    {
+      "epoch": 0.6750147831228912,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0005046254983707159,
+      "loss": 0.8557,
+      "step": 9703
+    },
+    {
+      "epoch": 0.6750843507600265,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0005044297687891135,
+      "loss": 0.9148,
+      "step": 9704
+    },
+    {
+      "epoch": 0.6751539183971617,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0005042340643700687,
+      "loss": 0.7164,
+      "step": 9705
+    },
+    {
+      "epoch": 0.6752234860342968,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0005040383851235202,
+      "loss": 0.764,
+      "step": 9706
+    },
+    {
+      "epoch": 0.675293053671432,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0005038427310594026,
+      "loss": 0.6977,
+      "step": 9707
+    },
+    {
+      "epoch": 0.6753626213085673,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0005036471021876503,
+      "loss": 1.0362,
+      "step": 9708
+    },
+    {
+      "epoch": 0.6754321889457024,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.0005034514985181959,
+      "loss": 0.6269,
+      "step": 9709
+    },
+    {
+      "epoch": 0.6755017565828376,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0005032559200609716,
+      "loss": 0.8695,
+      "step": 9710
+    },
+    {
+      "epoch": 0.6755713242199729,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0005030603668259084,
+      "loss": 0.8633,
+      "step": 9711
+    },
+    {
+      "epoch": 0.6756408918571081,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0005028648388229346,
+      "loss": 0.5972,
+      "step": 9712
+    },
+    {
+      "epoch": 0.6757104594942432,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.000502669336061979,
+      "loss": 0.7126,
+      "step": 9713
+    },
+    {
+      "epoch": 0.6757800271313785,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0005024738585529672,
+      "loss": 0.9349,
+      "step": 9714
+    },
+    {
+      "epoch": 0.6758495947685137,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0005022784063058257,
+      "loss": 0.8628,
+      "step": 9715
+    },
+    {
+      "epoch": 0.6759191624056489,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.0005020829793304775,
+      "loss": 0.6641,
+      "step": 9716
+    },
+    {
+      "epoch": 0.6759887300427841,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.0005018875776368464,
+      "loss": 0.5537,
+      "step": 9717
+    },
+    {
+      "epoch": 0.6760582976799193,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0005016922012348535,
+      "loss": 0.6479,
+      "step": 9718
+    },
+    {
+      "epoch": 0.6761278653170545,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0005014968501344184,
+      "loss": 0.949,
+      "step": 9719
+    },
+    {
+      "epoch": 0.6761974329541897,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0005013015243454607,
+      "loss": 0.9195,
+      "step": 9720
+    },
+    {
+      "epoch": 0.6762670005913249,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.0005011062238778983,
+      "loss": 0.8893,
+      "step": 9721
+    },
+    {
+      "epoch": 0.6763365682284601,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0005009109487416473,
+      "loss": 0.6517,
+      "step": 9722
+    },
+    {
+      "epoch": 0.6764061358655953,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0005007156989466224,
+      "loss": 0.8944,
+      "step": 9723
+    },
+    {
+      "epoch": 0.6764757035027306,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0005005204745027376,
+      "loss": 0.7622,
+      "step": 9724
+    },
+    {
+      "epoch": 0.6765452711398657,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.000500325275419906,
+      "loss": 1.125,
+      "step": 9725
+    },
+    {
+      "epoch": 0.6766148387770009,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0005001301017080384,
+      "loss": 0.7286,
+      "step": 9726
+    },
+    {
+      "epoch": 0.6766844064141362,
+      "grad_norm": 1.578125,
+      "learning_rate": 0.0004999349533770444,
+      "loss": 0.8135,
+      "step": 9727
+    },
+    {
+      "epoch": 0.6767539740512714,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0004997398304368327,
+      "loss": 0.7637,
+      "step": 9728
+    },
+    {
+      "epoch": 0.6768235416884065,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0004995447328973114,
+      "loss": 0.6227,
+      "step": 9729
+    },
+    {
+      "epoch": 0.6768931093255418,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0004993496607683857,
+      "loss": 0.9152,
+      "step": 9730
+    },
+    {
+      "epoch": 0.676962676962677,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0004991546140599612,
+      "loss": 0.767,
+      "step": 9731
+    },
+    {
+      "epoch": 0.6770322445998121,
+      "grad_norm": 1.4765625,
+      "learning_rate": 0.0004989595927819406,
+      "loss": 0.9917,
+      "step": 9732
+    },
+    {
+      "epoch": 0.6771018122369473,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0004987645969442268,
+      "loss": 0.9896,
+      "step": 9733
+    },
+    {
+      "epoch": 0.6771713798740826,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0004985696265567198,
+      "loss": 0.6176,
+      "step": 9734
+    },
+    {
+      "epoch": 0.6772409475112178,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0004983746816293204,
+      "loss": 0.8158,
+      "step": 9735
+    },
+    {
+      "epoch": 0.6773105151483529,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0004981797621719262,
+      "loss": 0.7196,
+      "step": 9736
+    },
+    {
+      "epoch": 0.6773800827854882,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0004979848681944338,
+      "loss": 0.9503,
+      "step": 9737
+    },
+    {
+      "epoch": 0.6774496504226234,
+      "grad_norm": 1.6875,
+      "learning_rate": 0.0004977899997067396,
+      "loss": 0.9683,
+      "step": 9738
+    },
+    {
+      "epoch": 0.6775192180597586,
+      "grad_norm": 1.6328125,
+      "learning_rate": 0.0004975951567187382,
+      "loss": 0.7924,
+      "step": 9739
+    },
+    {
+      "epoch": 0.6775887856968938,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0004974003392403224,
+      "loss": 0.8315,
+      "step": 9740
+    },
+    {
+      "epoch": 0.677658353334029,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.0004972055472813839,
+      "loss": 1.0786,
+      "step": 9741
+    },
+    {
+      "epoch": 0.6777279209711642,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0004970107808518133,
+      "loss": 0.8551,
+      "step": 9742
+    },
+    {
+      "epoch": 0.6777974886082995,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0004968160399615003,
+      "loss": 0.6797,
+      "step": 9743
+    },
+    {
+      "epoch": 0.6778670562454346,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0004966213246203323,
+      "loss": 0.7715,
+      "step": 9744
+    },
+    {
+      "epoch": 0.6779366238825698,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0004964266348381965,
+      "loss": 0.5859,
+      "step": 9745
+    },
+    {
+      "epoch": 0.678006191519705,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0004962319706249777,
+      "loss": 0.7948,
+      "step": 9746
+    },
+    {
+      "epoch": 0.6780757591568403,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0004960373319905605,
+      "loss": 0.6442,
+      "step": 9747
+    },
+    {
+      "epoch": 0.6781453267939754,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0004958427189448272,
+      "loss": 0.5826,
+      "step": 9748
+    },
+    {
+      "epoch": 0.6782148944311106,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0004956481314976599,
+      "loss": 0.8098,
+      "step": 9749
+    },
+    {
+      "epoch": 0.6782844620682459,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0004954535696589382,
+      "loss": 0.9073,
+      "step": 9750
+    },
+    {
+      "epoch": 0.6783540297053811,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0004952590334385404,
+      "loss": 0.8885,
+      "step": 9751
+    },
+    {
+      "epoch": 0.6784235973425162,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0004950645228463457,
+      "loss": 0.9079,
+      "step": 9752
+    },
+    {
+      "epoch": 0.6784931649796515,
+      "grad_norm": 1.546875,
+      "learning_rate": 0.0004948700378922293,
+      "loss": 0.8953,
+      "step": 9753
+    },
+    {
+      "epoch": 0.6785627326167867,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0004946755785860664,
+      "loss": 0.839,
+      "step": 9754
+    },
+    {
+      "epoch": 0.6786323002539218,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0004944811449377301,
+      "loss": 0.8367,
+      "step": 9755
+    },
+    {
+      "epoch": 0.6787018678910571,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0004942867369570934,
+      "loss": 0.6971,
+      "step": 9756
+    },
+    {
+      "epoch": 0.6787714355281923,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0004940923546540276,
+      "loss": 0.9824,
+      "step": 9757
+    },
+    {
+      "epoch": 0.6788410031653275,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0004938979980384017,
+      "loss": 0.8169,
+      "step": 9758
+    },
+    {
+      "epoch": 0.6789105708024626,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0004937036671200847,
+      "loss": 0.8172,
+      "step": 9759
+    },
+    {
+      "epoch": 0.6789801384395979,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0004935093619089434,
+      "loss": 0.8487,
+      "step": 9760
+    },
+    {
+      "epoch": 0.6790497060767331,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0004933150824148441,
+      "loss": 0.7046,
+      "step": 9761
+    },
+    {
+      "epoch": 0.6791192737138683,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0004931208286476506,
+      "loss": 0.5163,
+      "step": 9762
+    },
+    {
+      "epoch": 0.6791888413510035,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.000492926600617227,
+      "loss": 0.7764,
+      "step": 9763
+    },
+    {
+      "epoch": 0.6792584089881387,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0004927323983334344,
+      "loss": 0.8477,
+      "step": 9764
+    },
+    {
+      "epoch": 0.6793279766252739,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0004925382218061338,
+      "loss": 0.8992,
+      "step": 9765
+    },
+    {
+      "epoch": 0.6793975442624092,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0004923440710451848,
+      "loss": 0.958,
+      "step": 9766
+    },
+    {
+      "epoch": 0.6794671118995443,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0004921499460604453,
+      "loss": 0.8821,
+      "step": 9767
+    },
+    {
+      "epoch": 0.6795366795366795,
+      "grad_norm": 1.6171875,
+      "learning_rate": 0.0004919558468617717,
+      "loss": 1.3672,
+      "step": 9768
+    },
+    {
+      "epoch": 0.6796062471738148,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.000491761773459019,
+      "loss": 0.6571,
+      "step": 9769
+    },
+    {
+      "epoch": 0.67967581481095,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0004915677258620416,
+      "loss": 0.8382,
+      "step": 9770
+    },
+    {
+      "epoch": 0.6797453824480851,
+      "grad_norm": 1.7265625,
+      "learning_rate": 0.0004913737040806931,
+      "loss": 1.2732,
+      "step": 9771
+    },
+    {
+      "epoch": 0.6798149500852203,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0004911797081248238,
+      "loss": 1.0633,
+      "step": 9772
+    },
+    {
+      "epoch": 0.6798845177223556,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0004909857380042845,
+      "loss": 0.8228,
+      "step": 9773
+    },
+    {
+      "epoch": 0.6799540853594908,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0004907917937289235,
+      "loss": 0.8447,
+      "step": 9774
+    },
+    {
+      "epoch": 0.6800236529966259,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0004905978753085889,
+      "loss": 0.8315,
+      "step": 9775
+    },
+    {
+      "epoch": 0.6800932206337612,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0004904039827531262,
+      "loss": 1.024,
+      "step": 9776
+    },
+    {
+      "epoch": 0.6801627882708964,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.0004902101160723813,
+      "loss": 1.0698,
+      "step": 9777
+    },
+    {
+      "epoch": 0.6802323559080315,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0004900162752761966,
+      "loss": 0.9307,
+      "step": 9778
+    },
+    {
+      "epoch": 0.6803019235451668,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0004898224603744151,
+      "loss": 1.0074,
+      "step": 9779
+    },
+    {
+      "epoch": 0.680371491182302,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0004896286713768778,
+      "loss": 0.7614,
+      "step": 9780
+    },
+    {
+      "epoch": 0.6804410588194372,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0004894349082934243,
+      "loss": 0.6902,
+      "step": 9781
+    },
+    {
+      "epoch": 0.6805106264565725,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0004892411711338925,
+      "loss": 0.786,
+      "step": 9782
+    },
+    {
+      "epoch": 0.6805801940937076,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.000489047459908119,
+      "loss": 0.7574,
+      "step": 9783
+    },
+    {
+      "epoch": 0.6806497617308428,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0004888537746259408,
+      "loss": 0.9947,
+      "step": 9784
+    },
+    {
+      "epoch": 0.680719329367978,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0004886601152971915,
+      "loss": 0.5515,
+      "step": 9785
+    },
+    {
+      "epoch": 0.6807888970051132,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.000488466481931704,
+      "loss": 0.7297,
+      "step": 9786
+    },
+    {
+      "epoch": 0.6808584646422484,
+      "grad_norm": 0.875,
+      "learning_rate": 0.0004882728745393105,
+      "loss": 0.6101,
+      "step": 9787
+    },
+    {
+      "epoch": 0.6809280322793836,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0004880792931298408,
+      "loss": 1.0144,
+      "step": 9788
+    },
+    {
+      "epoch": 0.6809975999165189,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0004878857377131246,
+      "loss": 0.7814,
+      "step": 9789
+    },
+    {
+      "epoch": 0.681067167553654,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0004876922082989891,
+      "loss": 0.7869,
+      "step": 9790
+    },
+    {
+      "epoch": 0.6811367351907892,
+      "grad_norm": 1.375,
+      "learning_rate": 0.00048749870489726133,
+      "loss": 0.883,
+      "step": 9791
+    },
+    {
+      "epoch": 0.6812063028279245,
+      "grad_norm": 1.828125,
+      "learning_rate": 0.00048730522751776586,
+      "loss": 0.9427,
+      "step": 9792
+    },
+    {
+      "epoch": 0.6812758704650597,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0004871117761703271,
+      "loss": 0.6713,
+      "step": 9793
+    },
+    {
+      "epoch": 0.6813454381021948,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0004869183508647668,
+      "loss": 0.8737,
+      "step": 9794
+    },
+    {
+      "epoch": 0.6814150057393301,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0004867249516109069,
+      "loss": 0.76,
+      "step": 9795
+    },
+    {
+      "epoch": 0.6814845733764653,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0004865315784185664,
+      "loss": 0.7439,
+      "step": 9796
+    },
+    {
+      "epoch": 0.6815541410136005,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0004863382312975644,
+      "loss": 0.8266,
+      "step": 9797
+    },
+    {
+      "epoch": 0.6816237086507356,
+      "grad_norm": 1.375,
+      "learning_rate": 0.00048614491025771836,
+      "loss": 0.9495,
+      "step": 9798
+    },
+    {
+      "epoch": 0.6816932762878709,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0004859516153088437,
+      "loss": 1.083,
+      "step": 9799
+    },
+    {
+      "epoch": 0.6817628439250061,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.00048575834646075503,
+      "loss": 0.7001,
+      "step": 9800
+    },
+    {
+      "epoch": 0.6818324115621412,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00048556510372326514,
+      "loss": 0.6385,
+      "step": 9801
+    },
+    {
+      "epoch": 0.6819019791992765,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0004853718871061863,
+      "loss": 1.0354,
+      "step": 9802
+    },
+    {
+      "epoch": 0.6819715468364117,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.00048517869661932956,
+      "loss": 0.7948,
+      "step": 9803
+    },
+    {
+      "epoch": 0.6820411144735469,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0004849855322725034,
+      "loss": 0.7419,
+      "step": 9804
+    },
+    {
+      "epoch": 0.6821106821106822,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.00048479239407551636,
+      "loss": 0.7721,
+      "step": 9805
+    },
+    {
+      "epoch": 0.6821802497478173,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0004845992820381743,
+      "loss": 0.9139,
+      "step": 9806
+    },
+    {
+      "epoch": 0.6822498173849525,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.00048440619617028325,
+      "loss": 0.8945,
+      "step": 9807
+    },
+    {
+      "epoch": 0.6823193850220878,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00048421313648164645,
+      "loss": 0.8678,
+      "step": 9808
+    },
+    {
+      "epoch": 0.682388952659223,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0004840201029820672,
+      "loss": 0.6539,
+      "step": 9809
+    },
+    {
+      "epoch": 0.6824585202963581,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.0004838270956813461,
+      "loss": 0.9426,
+      "step": 9810
+    },
+    {
+      "epoch": 0.6825280879334933,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0004836341145892832,
+      "loss": 0.6187,
+      "step": 9811
+    },
+    {
+      "epoch": 0.6825976555706286,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0004834411597156777,
+      "loss": 0.8741,
+      "step": 9812
+    },
+    {
+      "epoch": 0.6826672232077637,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00048324823107032653,
+      "loss": 0.7333,
+      "step": 9813
+    },
+    {
+      "epoch": 0.6827367908448989,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0004830553286630256,
+      "loss": 0.8243,
+      "step": 9814
+    },
+    {
+      "epoch": 0.6828063584820342,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00048286245250356866,
+      "loss": 0.8961,
+      "step": 9815
+    },
+    {
+      "epoch": 0.6828759261191694,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.00048266960260175053,
+      "loss": 0.8247,
+      "step": 9816
+    },
+    {
+      "epoch": 0.6829454937563045,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00048247677896736253,
+      "loss": 0.9039,
+      "step": 9817
+    },
+    {
+      "epoch": 0.6830150613934398,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00048228398161019473,
+      "loss": 0.7044,
+      "step": 9818
+    },
+    {
+      "epoch": 0.683084629030575,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.00048209121054003726,
+      "loss": 0.7803,
+      "step": 9819
+    },
+    {
+      "epoch": 0.6831541966677102,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00048189846576667726,
+      "loss": 0.6082,
+      "step": 9820
+    },
+    {
+      "epoch": 0.6832237643048454,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00048170574729990227,
+      "loss": 0.8898,
+      "step": 9821
+    },
+    {
+      "epoch": 0.6832933319419806,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0004815130551494965,
+      "loss": 1.072,
+      "step": 9822
+    },
+    {
+      "epoch": 0.6833628995791158,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00048132038932524493,
+      "loss": 0.7886,
+      "step": 9823
+    },
+    {
+      "epoch": 0.683432467216251,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.00048112774983692907,
+      "loss": 0.6386,
+      "step": 9824
+    },
+    {
+      "epoch": 0.6835020348533862,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.000480935136694331,
+      "loss": 0.7901,
+      "step": 9825
+    },
+    {
+      "epoch": 0.6835716024905214,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00048074254990723063,
+      "loss": 0.821,
+      "step": 9826
+    },
+    {
+      "epoch": 0.6836411701276566,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0004805499894854063,
+      "loss": 0.8903,
+      "step": 9827
+    },
+    {
+      "epoch": 0.6837107377647919,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.0004803574554386351,
+      "loss": 0.8313,
+      "step": 9828
+    },
+    {
+      "epoch": 0.683780305401927,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00048016494777669295,
+      "loss": 0.9615,
+      "step": 9829
+    },
+    {
+      "epoch": 0.6838498730390622,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.000479972466509355,
+      "loss": 0.7357,
+      "step": 9830
+    },
+    {
+      "epoch": 0.6839194406761975,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00047978001164639404,
+      "loss": 0.6229,
+      "step": 9831
+    },
+    {
+      "epoch": 0.6839890083133326,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00047958758319758166,
+      "loss": 0.688,
+      "step": 9832
+    },
+    {
+      "epoch": 0.6840585759504678,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0004793951811726891,
+      "loss": 0.8784,
+      "step": 9833
+    },
+    {
+      "epoch": 0.6841281435876031,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0004792028055814848,
+      "loss": 0.7363,
+      "step": 9834
+    },
+    {
+      "epoch": 0.6841977112247383,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0004790104564337374,
+      "loss": 0.8188,
+      "step": 9835
+    },
+    {
+      "epoch": 0.6842672788618734,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0004788181337392127,
+      "loss": 0.6419,
+      "step": 9836
+    },
+    {
+      "epoch": 0.6843368464990086,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.00047862583750767654,
+      "loss": 0.7032,
+      "step": 9837
+    },
+    {
+      "epoch": 0.6844064141361439,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0004784335677488921,
+      "loss": 0.6312,
+      "step": 9838
+    },
+    {
+      "epoch": 0.6844759817732791,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00047824132447262213,
+      "loss": 0.6002,
+      "step": 9839
+    },
+    {
+      "epoch": 0.6845455494104142,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0004780491076886283,
+      "loss": 0.8293,
+      "step": 9840
+    },
+    {
+      "epoch": 0.6846151170475495,
+      "grad_norm": 1.5703125,
+      "learning_rate": 0.0004778569174066699,
+      "loss": 1.0433,
+      "step": 9841
+    },
+    {
+      "epoch": 0.6846846846846847,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.0004776647536365051,
+      "loss": 1.086,
+      "step": 9842
+    },
+    {
+      "epoch": 0.6847542523218199,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0004774726163878914,
+      "loss": 0.5927,
+      "step": 9843
+    },
+    {
+      "epoch": 0.6848238199589551,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0004772805056705848,
+      "loss": 0.7805,
+      "step": 9844
+    },
+    {
+      "epoch": 0.6848933875960903,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0004770884214943394,
+      "loss": 0.7865,
+      "step": 9845
+    },
+    {
+      "epoch": 0.6849629552332255,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.000476896363868908,
+      "loss": 0.7678,
+      "step": 9846
+    },
+    {
+      "epoch": 0.6850325228703608,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00047670433280404257,
+      "loss": 0.952,
+      "step": 9847
+    },
+    {
+      "epoch": 0.6851020905074959,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.00047651232830949386,
+      "loss": 0.8057,
+      "step": 9848
+    },
+    {
+      "epoch": 0.6851716581446311,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00047632035039501055,
+      "loss": 0.7233,
+      "step": 9849
+    },
+    {
+      "epoch": 0.6852412257817663,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0004761283990703399,
+      "loss": 0.5269,
+      "step": 9850
+    },
+    {
+      "epoch": 0.6853107934189016,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.000475936474345229,
+      "loss": 0.5662,
+      "step": 9851
+    },
+    {
+      "epoch": 0.6853803610560367,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.00047574457622942225,
+      "loss": 0.7139,
+      "step": 9852
+    },
+    {
+      "epoch": 0.6854499286931719,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0004755527047326633,
+      "loss": 0.7737,
+      "step": 9853
+    },
+    {
+      "epoch": 0.6855194963303072,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.000475360859864695,
+      "loss": 0.7234,
+      "step": 9854
+    },
+    {
+      "epoch": 0.6855890639674423,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00047516904163525796,
+      "loss": 0.6907,
+      "step": 9855
+    },
+    {
+      "epoch": 0.6856586316045775,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.0004749772500540912,
+      "loss": 0.8297,
+      "step": 9856
+    },
+    {
+      "epoch": 0.6857281992417128,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00047478548513093334,
+      "loss": 0.6738,
+      "step": 9857
+    },
+    {
+      "epoch": 0.685797766878848,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0004745937468755217,
+      "loss": 1.108,
+      "step": 9858
+    },
+    {
+      "epoch": 0.6858673345159831,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.0004744020352975913,
+      "loss": 0.7054,
+      "step": 9859
+    },
+    {
+      "epoch": 0.6859369021531184,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.000474210350406876,
+      "loss": 0.7712,
+      "step": 9860
+    },
+    {
+      "epoch": 0.6860064697902536,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00047401869221310887,
+      "loss": 0.867,
+      "step": 9861
+    },
+    {
+      "epoch": 0.6860760374273888,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0004738270607260218,
+      "loss": 0.8149,
+      "step": 9862
+    },
+    {
+      "epoch": 0.6861456050645239,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0004736354559553445,
+      "loss": 0.74,
+      "step": 9863
+    },
+    {
+      "epoch": 0.6862151727016592,
+      "grad_norm": 0.7421875,
+      "learning_rate": 0.00047344387791080535,
+      "loss": 0.4866,
+      "step": 9864
+    },
+    {
+      "epoch": 0.6862847403387944,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.00047325232660213234,
+      "loss": 1.0872,
+      "step": 9865
+    },
+    {
+      "epoch": 0.6863543079759296,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.00047306080203905076,
+      "loss": 0.7222,
+      "step": 9866
+    },
+    {
+      "epoch": 0.6864238756130648,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00047286930423128584,
+      "loss": 0.6516,
+      "step": 9867
+    },
+    {
+      "epoch": 0.6864934432502,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00047267783318856097,
+      "loss": 0.9393,
+      "step": 9868
+    },
+    {
+      "epoch": 0.6865630108873352,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0004724863889205978,
+      "loss": 0.673,
+      "step": 9869
+    },
+    {
+      "epoch": 0.6866325785244705,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0004722949714371166,
+      "loss": 0.7548,
+      "step": 9870
+    },
+    {
+      "epoch": 0.6867021461616056,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.000472103580747837,
+      "loss": 0.899,
+      "step": 9871
+    },
+    {
+      "epoch": 0.6867717137987408,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0004719122168624771,
+      "loss": 0.892,
+      "step": 9872
+    },
+    {
+      "epoch": 0.6868412814358761,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00047172087979075307,
+      "loss": 1.0007,
+      "step": 9873
+    },
+    {
+      "epoch": 0.6869108490730113,
+      "grad_norm": 1.5546875,
+      "learning_rate": 0.00047152956954237967,
+      "loss": 0.9274,
+      "step": 9874
+    },
+    {
+      "epoch": 0.6869804167101464,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00047133828612707095,
+      "loss": 0.8581,
+      "step": 9875
+    },
+    {
+      "epoch": 0.6870499843472816,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0004711470295545399,
+      "loss": 0.8484,
+      "step": 9876
+    },
+    {
+      "epoch": 0.6871195519844169,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0004709557998344971,
+      "loss": 0.7848,
+      "step": 9877
+    },
+    {
+      "epoch": 0.687189119621552,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.00047076459697665174,
+      "loss": 0.9097,
+      "step": 9878
+    },
+    {
+      "epoch": 0.6872586872586872,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00047057342099071257,
+      "loss": 0.9504,
+      "step": 9879
+    },
+    {
+      "epoch": 0.6873282548958225,
+      "grad_norm": 1.375,
+      "learning_rate": 0.00047038227188638703,
+      "loss": 0.9776,
+      "step": 9880
+    },
+    {
+      "epoch": 0.6873978225329577,
+      "grad_norm": 1.25,
+      "learning_rate": 0.00047019114967338015,
+      "loss": 0.7916,
+      "step": 9881
+    },
+    {
+      "epoch": 0.6874673901700928,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.000470000054361396,
+      "loss": 0.5856,
+      "step": 9882
+    },
+    {
+      "epoch": 0.6875369578072281,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00046980898596013797,
+      "loss": 0.6789,
+      "step": 9883
+    },
+    {
+      "epoch": 0.6876065254443633,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0004696179444793071,
+      "loss": 0.83,
+      "step": 9884
+    },
+    {
+      "epoch": 0.6876760930814985,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.00046942692992860347,
+      "loss": 1.0817,
+      "step": 9885
+    },
+    {
+      "epoch": 0.6877456607186337,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0004692359423177265,
+      "loss": 0.8211,
+      "step": 9886
+    },
+    {
+      "epoch": 0.6878152283557689,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.0004690449816563731,
+      "loss": 0.6253,
+      "step": 9887
+    },
+    {
+      "epoch": 0.6878847959929041,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.00046885404795423894,
+      "loss": 0.749,
+      "step": 9888
+    },
+    {
+      "epoch": 0.6879543636300393,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00046866314122101906,
+      "loss": 1.0462,
+      "step": 9889
+    },
+    {
+      "epoch": 0.6880239312671745,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0004684722614664072,
+      "loss": 0.7023,
+      "step": 9890
+    },
+    {
+      "epoch": 0.6880934989043097,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00046828140870009473,
+      "loss": 0.7339,
+      "step": 9891
+    },
+    {
+      "epoch": 0.6881630665414449,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00046809058293177186,
+      "loss": 0.7907,
+      "step": 9892
+    },
+    {
+      "epoch": 0.6882326341785802,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00046789978417112823,
+      "loss": 0.8627,
+      "step": 9893
+    },
+    {
+      "epoch": 0.6883022018157153,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0004677090124278519,
+      "loss": 0.7997,
+      "step": 9894
+    },
+    {
+      "epoch": 0.6883717694528505,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00046751826771162895,
+      "loss": 0.8273,
+      "step": 9895
+    },
+    {
+      "epoch": 0.6884413370899858,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0004673275500321441,
+      "loss": 0.875,
+      "step": 9896
+    },
+    {
+      "epoch": 0.688510904727121,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.0004671368593990818,
+      "loss": 0.7543,
+      "step": 9897
+    },
+    {
+      "epoch": 0.6885804723642561,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0004669461958221236,
+      "loss": 0.983,
+      "step": 9898
+    },
+    {
+      "epoch": 0.6886500400013914,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0004667555593109507,
+      "loss": 0.843,
+      "step": 9899
+    },
+    {
+      "epoch": 0.6887196076385266,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0004665649498752432,
+      "loss": 0.6497,
+      "step": 9900
+    },
+    {
+      "epoch": 0.6887891752756617,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00046637436752467874,
+      "loss": 0.8861,
+      "step": 9901
+    },
+    {
+      "epoch": 0.6888587429127969,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.00046618381226893403,
+      "loss": 0.8833,
+      "step": 9902
+    },
+    {
+      "epoch": 0.6889283105499322,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.0004659932841176845,
+      "loss": 0.7966,
+      "step": 9903
+    },
+    {
+      "epoch": 0.6889978781870674,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0004658027830806049,
+      "loss": 0.6317,
+      "step": 9904
+    },
+    {
+      "epoch": 0.6890674458242025,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0004656123091673674,
+      "loss": 1.1123,
+      "step": 9905
+    },
+    {
+      "epoch": 0.6891370134613378,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.00046542186238764295,
+      "loss": 0.8116,
+      "step": 9906
+    },
+    {
+      "epoch": 0.689206581098473,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.00046523144275110187,
+      "loss": 0.9125,
+      "step": 9907
+    },
+    {
+      "epoch": 0.6892761487356082,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0004650410502674131,
+      "loss": 0.9166,
+      "step": 9908
+    },
+    {
+      "epoch": 0.6893457163727434,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0004648506849462433,
+      "loss": 0.8827,
+      "step": 9909
+    },
+    {
+      "epoch": 0.6894152840098786,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.000464660346797258,
+      "loss": 0.645,
+      "step": 9910
+    },
+    {
+      "epoch": 0.6894848516470138,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0004644700358301224,
+      "loss": 0.9024,
+      "step": 9911
+    },
+    {
+      "epoch": 0.6895544192841491,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0004642797520544987,
+      "loss": 0.8633,
+      "step": 9912
+    },
+    {
+      "epoch": 0.6896239869212842,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00046408949548004897,
+      "loss": 0.6726,
+      "step": 9913
+    },
+    {
+      "epoch": 0.6896935545584194,
+      "grad_norm": 2.15625,
+      "learning_rate": 0.00046389926611643394,
+      "loss": 1.0634,
+      "step": 9914
+    },
+    {
+      "epoch": 0.6897631221955546,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0004637090639733119,
+      "loss": 0.9568,
+      "step": 9915
+    },
+    {
+      "epoch": 0.6898326898326899,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0004635188890603402,
+      "loss": 0.7871,
+      "step": 9916
+    },
+    {
+      "epoch": 0.689902257469825,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00046332874138717517,
+      "loss": 0.731,
+      "step": 9917
+    },
+    {
+      "epoch": 0.6899718251069602,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.00046313862096347203,
+      "loss": 0.9366,
+      "step": 9918
+    },
+    {
+      "epoch": 0.6900413927440955,
+      "grad_norm": 1.6796875,
+      "learning_rate": 0.00046294852779888384,
+      "loss": 1.1027,
+      "step": 9919
+    },
+    {
+      "epoch": 0.6901109603812307,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00046275846190306193,
+      "loss": 0.6191,
+      "step": 9920
+    },
+    {
+      "epoch": 0.6901805280183658,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0004625684232856575,
+      "loss": 0.7207,
+      "step": 9921
+    },
+    {
+      "epoch": 0.6902500956555011,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00046237841195632013,
+      "loss": 0.797,
+      "step": 9922
+    },
+    {
+      "epoch": 0.6903196632926363,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0004621884279246971,
+      "loss": 0.6195,
+      "step": 9923
+    },
+    {
+      "epoch": 0.6903892309297714,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0004619984712004346,
+      "loss": 0.5835,
+      "step": 9924
+    },
+    {
+      "epoch": 0.6904587985669067,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0004618085417931779,
+      "loss": 0.696,
+      "step": 9925
+    },
+    {
+      "epoch": 0.6905283662040419,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00046161863971257123,
+      "loss": 0.8957,
+      "step": 9926
+    },
+    {
+      "epoch": 0.6905979338411771,
+      "grad_norm": 0.875,
+      "learning_rate": 0.00046142876496825606,
+      "loss": 0.7213,
+      "step": 9927
+    },
+    {
+      "epoch": 0.6906675014783122,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0004612389175698739,
+      "loss": 0.855,
+      "step": 9928
+    },
+    {
+      "epoch": 0.6907370691154475,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0004610490975270639,
+      "loss": 0.7702,
+      "step": 9929
+    },
+    {
+      "epoch": 0.6908066367525827,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.0004608593048494639,
+      "loss": 1.1194,
+      "step": 9930
+    },
+    {
+      "epoch": 0.6908762043897179,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.000460669539546711,
+      "loss": 0.4911,
+      "step": 9931
+    },
+    {
+      "epoch": 0.6909457720268531,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00046047980162844073,
+      "loss": 0.8049,
+      "step": 9932
+    },
+    {
+      "epoch": 0.6910153396639883,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0004602900911042868,
+      "loss": 0.8622,
+      "step": 9933
+    },
+    {
+      "epoch": 0.6910849073011235,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0004601004079838813,
+      "loss": 0.7903,
+      "step": 9934
+    },
+    {
+      "epoch": 0.6911544749382588,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0004599107522768557,
+      "loss": 0.9072,
+      "step": 9935
+    },
+    {
+      "epoch": 0.6912240425753939,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.00045972112399284037,
+      "loss": 0.7728,
+      "step": 9936
+    },
+    {
+      "epoch": 0.6912936102125291,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0004595315231414632,
+      "loss": 1.007,
+      "step": 9937
+    },
+    {
+      "epoch": 0.6913631778496644,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00045934194973235054,
+      "loss": 0.7445,
+      "step": 9938
+    },
+    {
+      "epoch": 0.6914327454867996,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00045915240377512867,
+      "loss": 0.963,
+      "step": 9939
+    },
+    {
+      "epoch": 0.6915023131239347,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0004589628852794221,
+      "loss": 0.7394,
+      "step": 9940
+    },
+    {
+      "epoch": 0.6915718807610699,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00045877339425485277,
+      "loss": 0.778,
+      "step": 9941
+    },
+    {
+      "epoch": 0.6916414483982052,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0004585839307110428,
+      "loss": 0.7646,
+      "step": 9942
+    },
+    {
+      "epoch": 0.6917110160353404,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.00045839449465761195,
+      "loss": 0.9427,
+      "step": 9943
+    },
+    {
+      "epoch": 0.6917805836724755,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0004582050861041783,
+      "loss": 0.5645,
+      "step": 9944
+    },
+    {
+      "epoch": 0.6918501513096108,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.00045801570506035974,
+      "loss": 0.7495,
+      "step": 9945
+    },
+    {
+      "epoch": 0.691919718946746,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.000457826351535772,
+      "loss": 0.8523,
+      "step": 9946
+    },
+    {
+      "epoch": 0.6919892865838811,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0004576370255400295,
+      "loss": 1.0741,
+      "step": 9947
+    },
+    {
+      "epoch": 0.6920588542210164,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.00045744772708274485,
+      "loss": 0.8472,
+      "step": 9948
+    },
+    {
+      "epoch": 0.6921284218581516,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.00045725845617352977,
+      "loss": 0.7895,
+      "step": 9949
+    },
+    {
+      "epoch": 0.6921979894952868,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0004570692128219951,
+      "loss": 0.7883,
+      "step": 9950
+    },
+    {
+      "epoch": 0.692267557132422,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0004568799970377493,
+      "loss": 0.8317,
+      "step": 9951
+    },
+    {
+      "epoch": 0.6923371247695572,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00045669080883039924,
+      "loss": 0.9023,
+      "step": 9952
+    },
+    {
+      "epoch": 0.6924066924066924,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0004565016482095515,
+      "loss": 0.7524,
+      "step": 9953
+    },
+    {
+      "epoch": 0.6924762600438276,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.000456312515184811,
+      "loss": 0.7842,
+      "step": 9954
+    },
+    {
+      "epoch": 0.6925458276809628,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0004561234097657806,
+      "loss": 0.8211,
+      "step": 9955
+    },
+    {
+      "epoch": 0.692615395318098,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0004559343319620617,
+      "loss": 0.6618,
+      "step": 9956
+    },
+    {
+      "epoch": 0.6926849629552332,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0004557452817832551,
+      "loss": 0.8867,
+      "step": 9957
+    },
+    {
+      "epoch": 0.6927545305923685,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0004555562592389603,
+      "loss": 0.7492,
+      "step": 9958
+    },
+    {
+      "epoch": 0.6928240982295036,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.00045536726433877405,
+      "loss": 0.9273,
+      "step": 9959
+    },
+    {
+      "epoch": 0.6928936658666388,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0004551782970922933,
+      "loss": 1.032,
+      "step": 9960
+    },
+    {
+      "epoch": 0.6929632335037741,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00045498935750911253,
+      "loss": 0.7288,
+      "step": 9961
+    },
+    {
+      "epoch": 0.6930328011409093,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0004548004455988248,
+      "loss": 0.9221,
+      "step": 9962
+    },
+    {
+      "epoch": 0.6931023687780444,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0004546115613710224,
+      "loss": 0.6233,
+      "step": 9963
+    },
+    {
+      "epoch": 0.6931719364151797,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00045442270483529636,
+      "loss": 0.7356,
+      "step": 9964
+    },
+    {
+      "epoch": 0.6932415040523149,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.00045423387600123543,
+      "loss": 1.0036,
+      "step": 9965
+    },
+    {
+      "epoch": 0.69331107168945,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.00045404507487842706,
+      "loss": 1.0622,
+      "step": 9966
+    },
+    {
+      "epoch": 0.6933806393265852,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00045385630147645793,
+      "loss": 0.5868,
+      "step": 9967
+    },
+    {
+      "epoch": 0.6934502069637205,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00045366755580491337,
+      "loss": 0.9475,
+      "step": 9968
+    },
+    {
+      "epoch": 0.6935197746008557,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00045347883787337674,
+      "loss": 1.0282,
+      "step": 9969
+    },
+    {
+      "epoch": 0.6935893422379908,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00045329014769142963,
+      "loss": 0.9051,
+      "step": 9970
+    },
+    {
+      "epoch": 0.6936589098751261,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00045310148526865314,
+      "loss": 0.6954,
+      "step": 9971
+    },
+    {
+      "epoch": 0.6937284775122613,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00045291285061462705,
+      "loss": 0.8558,
+      "step": 9972
+    },
+    {
+      "epoch": 0.6937980451493965,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.0004527242437389285,
+      "loss": 0.6456,
+      "step": 9973
+    },
+    {
+      "epoch": 0.6938676127865318,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0004525356646511348,
+      "loss": 0.754,
+      "step": 9974
+    },
+    {
+      "epoch": 0.6939371804236669,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0004523471133608206,
+      "loss": 0.8408,
+      "step": 9975
+    },
+    {
+      "epoch": 0.6940067480608021,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0004521585898775592,
+      "loss": 0.629,
+      "step": 9976
+    },
+    {
+      "epoch": 0.6940763156979374,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0004519700942109234,
+      "loss": 0.861,
+      "step": 9977
+    },
+    {
+      "epoch": 0.6941458833350725,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.00045178162637048413,
+      "loss": 0.7202,
+      "step": 9978
+    },
+    {
+      "epoch": 0.6942154509722077,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00045159318636581083,
+      "loss": 0.8389,
+      "step": 9979
+    },
+    {
+      "epoch": 0.6942850186093429,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0004514047742064709,
+      "loss": 0.689,
+      "step": 9980
+    },
+    {
+      "epoch": 0.6943545862464782,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0004512163899020314,
+      "loss": 0.6303,
+      "step": 9981
+    },
+    {
+      "epoch": 0.6944241538836133,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0004510280334620579,
+      "loss": 0.7624,
+      "step": 9982
+    },
+    {
+      "epoch": 0.6944937215207485,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00045083970489611383,
+      "loss": 0.6146,
+      "step": 9983
+    },
+    {
+      "epoch": 0.6945632891578838,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00045065140421376125,
+      "loss": 0.9113,
+      "step": 9984
+    },
+    {
+      "epoch": 0.694632856795019,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0004504631314245614,
+      "loss": 0.7849,
+      "step": 9985
+    },
+    {
+      "epoch": 0.6947024244321541,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.00045027488653807425,
+      "loss": 0.7079,
+      "step": 9986
+    },
+    {
+      "epoch": 0.6947719920692894,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.00045008666956385725,
+      "loss": 0.8342,
+      "step": 9987
+    },
+    {
+      "epoch": 0.6948415597064246,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.00044989848051146765,
+      "loss": 0.7096,
+      "step": 9988
+    },
+    {
+      "epoch": 0.6949111273435598,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0004497103193904601,
+      "loss": 0.8621,
+      "step": 9989
+    },
+    {
+      "epoch": 0.694980694980695,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00044952218621038944,
+      "loss": 0.7878,
+      "step": 9990
+    },
+    {
+      "epoch": 0.6950502626178302,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.000449334080980807,
+      "loss": 0.925,
+      "step": 9991
+    },
+    {
+      "epoch": 0.6951198302549654,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0004491460037112648,
+      "loss": 0.8893,
+      "step": 9992
+    },
+    {
+      "epoch": 0.6951893978921005,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.000448957954411312,
+      "loss": 0.9219,
+      "step": 9993
+    },
+    {
+      "epoch": 0.6952589655292358,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.00044876993309049654,
+      "loss": 1.1209,
+      "step": 9994
+    },
+    {
+      "epoch": 0.695328533166371,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.00044858193975836534,
+      "loss": 0.7067,
+      "step": 9995
+    },
+    {
+      "epoch": 0.6953981008035062,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0004483939744244643,
+      "loss": 0.5887,
+      "step": 9996
+    },
+    {
+      "epoch": 0.6954676684406415,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.000448206037098337,
+      "loss": 0.9967,
+      "step": 9997
+    },
+    {
+      "epoch": 0.6955372360777766,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00044801812778952544,
+      "loss": 0.8292,
+      "step": 9998
+    },
+    {
+      "epoch": 0.6956068037149118,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0004478302465075711,
+      "loss": 0.8827,
+      "step": 9999
+    },
+    {
+      "epoch": 0.6956763713520471,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00044764239326201415,
+      "loss": 0.809,
+      "step": 10000
+    },
+    {
+      "epoch": 0.6957459389891822,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00044745456806239206,
+      "loss": 0.6268,
+      "step": 10001
+    },
+    {
+      "epoch": 0.6958155066263174,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0004472667709182423,
+      "loss": 0.8195,
+      "step": 10002
+    },
+    {
+      "epoch": 0.6958850742634527,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00044707900183909953,
+      "loss": 0.6104,
+      "step": 10003
+    },
+    {
+      "epoch": 0.6959546419005879,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0004468912608344985,
+      "loss": 0.6953,
+      "step": 10004
+    },
+    {
+      "epoch": 0.696024209537723,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00044670354791397104,
+      "loss": 0.547,
+      "step": 10005
+    },
+    {
+      "epoch": 0.6960937771748582,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00044651586308704896,
+      "loss": 0.7972,
+      "step": 10006
+    },
+    {
+      "epoch": 0.6961633448119935,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.00044632820636326156,
+      "loss": 0.7603,
+      "step": 10007
+    },
+    {
+      "epoch": 0.6962329124491287,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.00044614057775213637,
+      "loss": 0.9619,
+      "step": 10008
+    },
+    {
+      "epoch": 0.6963024800862638,
+      "grad_norm": 2.859375,
+      "learning_rate": 0.00044595297726320173,
+      "loss": 1.0059,
+      "step": 10009
+    },
+    {
+      "epoch": 0.6963720477233991,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.00044576540490598226,
+      "loss": 0.8679,
+      "step": 10010
+    },
+    {
+      "epoch": 0.6964416153605343,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0004455778606900021,
+      "loss": 0.5389,
+      "step": 10011
+    },
+    {
+      "epoch": 0.6965111829976695,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.00044539034462478324,
+      "loss": 0.7035,
+      "step": 10012
+    },
+    {
+      "epoch": 0.6965807506348047,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00044520285671984715,
+      "loss": 0.6121,
+      "step": 10013
+    },
+    {
+      "epoch": 0.6966503182719399,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00044501539698471414,
+      "loss": 0.9757,
+      "step": 10014
+    },
+    {
+      "epoch": 0.6967198859090751,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0004448279654289015,
+      "loss": 0.7322,
+      "step": 10015
+    },
+    {
+      "epoch": 0.6967894535462104,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00044464056206192684,
+      "loss": 0.6639,
+      "step": 10016
+    },
+    {
+      "epoch": 0.6968590211833455,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.00044445318689330496,
+      "loss": 0.6743,
+      "step": 10017
+    },
+    {
+      "epoch": 0.6969285888204807,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0004442658399325503,
+      "loss": 1.0035,
+      "step": 10018
+    },
+    {
+      "epoch": 0.6969981564576159,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.0004440785211891749,
+      "loss": 0.9152,
+      "step": 10019
+    },
+    {
+      "epoch": 0.6970677240947512,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00044389123067269055,
+      "loss": 0.7669,
+      "step": 10020
+    },
+    {
+      "epoch": 0.6971372917318863,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00044370396839260606,
+      "loss": 0.9853,
+      "step": 10021
+    },
+    {
+      "epoch": 0.6972068593690215,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0004435167343584302,
+      "loss": 0.827,
+      "step": 10022
+    },
+    {
+      "epoch": 0.6972764270061568,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0004433295285796699,
+      "loss": 0.8992,
+      "step": 10023
+    },
+    {
+      "epoch": 0.6973459946432919,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.0004431423510658304,
+      "loss": 0.6252,
+      "step": 10024
+    },
+    {
+      "epoch": 0.6974155622804271,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0004429552018264157,
+      "loss": 0.955,
+      "step": 10025
+    },
+    {
+      "epoch": 0.6974851299175624,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0004427680808709276,
+      "loss": 1.0229,
+      "step": 10026
+    },
+    {
+      "epoch": 0.6975546975546976,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00044258098820886774,
+      "loss": 0.8838,
+      "step": 10027
+    },
+    {
+      "epoch": 0.6976242651918327,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.000442393923849736,
+      "loss": 0.9667,
+      "step": 10028
+    },
+    {
+      "epoch": 0.697693832828968,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0004422068878030303,
+      "loss": 0.8528,
+      "step": 10029
+    },
+    {
+      "epoch": 0.6977634004661032,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.000442019880078247,
+      "loss": 0.851,
+      "step": 10030
+    },
+    {
+      "epoch": 0.6978329681032384,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0004418329006848818,
+      "loss": 0.7651,
+      "step": 10031
+    },
+    {
+      "epoch": 0.6979025357403735,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0004416459496324289,
+      "loss": 0.8588,
+      "step": 10032
+    },
+    {
+      "epoch": 0.6979721033775088,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.00044145902693037986,
+      "loss": 1.0708,
+      "step": 10033
+    },
+    {
+      "epoch": 0.698041671014644,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.0004412721325882266,
+      "loss": 0.701,
+      "step": 10034
+    },
+    {
+      "epoch": 0.6981112386517792,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.000441085266615458,
+      "loss": 0.7535,
+      "step": 10035
+    },
+    {
+      "epoch": 0.6981808062889144,
+      "grad_norm": 1.25,
+      "learning_rate": 0.00044089842902156275,
+      "loss": 1.0042,
+      "step": 10036
+    },
+    {
+      "epoch": 0.6982503739260496,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00044071161981602667,
+      "loss": 0.9096,
+      "step": 10037
+    },
+    {
+      "epoch": 0.6983199415631848,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0004405248390083361,
+      "loss": 1.0399,
+      "step": 10038
+    },
+    {
+      "epoch": 0.6983895092003201,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.0004403380866079741,
+      "loss": 1.0811,
+      "step": 10039
+    },
+    {
+      "epoch": 0.6984590768374552,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00044015136262442247,
+      "loss": 0.4406,
+      "step": 10040
+    },
+    {
+      "epoch": 0.6985286444745904,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00043996466706716354,
+      "loss": 0.8179,
+      "step": 10041
+    },
+    {
+      "epoch": 0.6985982121117257,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.00043977799994567604,
+      "loss": 0.7871,
+      "step": 10042
+    },
+    {
+      "epoch": 0.6986677797488609,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.0004395913612694379,
+      "loss": 1.0078,
+      "step": 10043
+    },
+    {
+      "epoch": 0.698737347385996,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0004394047510479254,
+      "loss": 0.721,
+      "step": 10044
+    },
+    {
+      "epoch": 0.6988069150231312,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.0004392181692906142,
+      "loss": 0.8101,
+      "step": 10045
+    },
+    {
+      "epoch": 0.6988764826602665,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00043903161600697806,
+      "loss": 0.7656,
+      "step": 10046
+    },
+    {
+      "epoch": 0.6989460502974016,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00043884509120648864,
+      "loss": 0.5622,
+      "step": 10047
+    },
+    {
+      "epoch": 0.6990156179345368,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0004386585948986174,
+      "loss": 0.7001,
+      "step": 10048
+    },
+    {
+      "epoch": 0.6990851855716721,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0004384721270928329,
+      "loss": 0.8348,
+      "step": 10049
+    },
+    {
+      "epoch": 0.6991547532088073,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0004382856877986039,
+      "loss": 0.7234,
+      "step": 10050
+    },
+    {
+      "epoch": 0.6992243208459424,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00043809927702539607,
+      "loss": 0.5557,
+      "step": 10051
+    },
+    {
+      "epoch": 0.6992938884830777,
+      "grad_norm": 1.609375,
+      "learning_rate": 0.00043791289478267514,
+      "loss": 1.1759,
+      "step": 10052
+    },
+    {
+      "epoch": 0.6993634561202129,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00043772654107990385,
+      "loss": 0.8746,
+      "step": 10053
+    },
+    {
+      "epoch": 0.6994330237573481,
+      "grad_norm": 0.8125,
+      "learning_rate": 0.0004375402159265448,
+      "loss": 0.8024,
+      "step": 10054
+    },
+    {
+      "epoch": 0.6995025913944833,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0004373539193320589,
+      "loss": 0.7398,
+      "step": 10055
+    },
+    {
+      "epoch": 0.6995721590316185,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00043716765130590507,
+      "loss": 0.6992,
+      "step": 10056
+    },
+    {
+      "epoch": 0.6996417266687537,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.00043698141185754104,
+      "loss": 0.8592,
+      "step": 10057
+    },
+    {
+      "epoch": 0.6997112943058889,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00043679520099642276,
+      "loss": 0.7887,
+      "step": 10058
+    },
+    {
+      "epoch": 0.6997808619430241,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00043660901873200533,
+      "loss": 0.7324,
+      "step": 10059
+    },
+    {
+      "epoch": 0.6998504295801593,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0004364228650737426,
+      "loss": 0.7056,
+      "step": 10060
+    },
+    {
+      "epoch": 0.6999199972172945,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.00043623674003108584,
+      "loss": 0.8171,
+      "step": 10061
+    },
+    {
+      "epoch": 0.6999895648544298,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00043605064361348613,
+      "loss": 0.7432,
+      "step": 10062
+    },
+    {
+      "epoch": 0.7000591324915649,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00043586457583039183,
+      "loss": 0.6791,
+      "step": 10063
+    },
+    {
+      "epoch": 0.7001287001287001,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.00043567853669125133,
+      "loss": 0.8353,
+      "step": 10064
+    },
+    {
+      "epoch": 0.7001982677658354,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00043549252620551004,
+      "loss": 0.7989,
+      "step": 10065
+    },
+    {
+      "epoch": 0.7002678354029706,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0004353065443826133,
+      "loss": 0.6995,
+      "step": 10066
+    },
+    {
+      "epoch": 0.7003374030401057,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00043512059123200356,
+      "loss": 0.6129,
+      "step": 10067
+    },
+    {
+      "epoch": 0.7004069706772409,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.000434934666763123,
+      "loss": 0.8444,
+      "step": 10068
+    },
+    {
+      "epoch": 0.7004765383143762,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0004347487709854122,
+      "loss": 0.8488,
+      "step": 10069
+    },
+    {
+      "epoch": 0.7005461059515113,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0004345629039083099,
+      "loss": 0.8437,
+      "step": 10070
+    },
+    {
+      "epoch": 0.7006156735886465,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0004343770655412532,
+      "loss": 1.0469,
+      "step": 10071
+    },
+    {
+      "epoch": 0.7006852412257818,
+      "grad_norm": 0.875,
+      "learning_rate": 0.00043419125589367745,
+      "loss": 0.7606,
+      "step": 10072
+    },
+    {
+      "epoch": 0.700754808862917,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00043400547497501863,
+      "loss": 0.6727,
+      "step": 10073
+    },
+    {
+      "epoch": 0.7008243765000521,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00043381972279470896,
+      "loss": 0.7686,
+      "step": 10074
+    },
+    {
+      "epoch": 0.7008939441371874,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.0004336339993621795,
+      "loss": 0.6855,
+      "step": 10075
+    },
+    {
+      "epoch": 0.7009635117743226,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00043344830468686137,
+      "loss": 0.7121,
+      "step": 10076
+    },
+    {
+      "epoch": 0.7010330794114578,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.00043326263877818227,
+      "loss": 0.8287,
+      "step": 10077
+    },
+    {
+      "epoch": 0.701102647048593,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00043307700164557016,
+      "loss": 0.7769,
+      "step": 10078
+    },
+    {
+      "epoch": 0.7011722146857282,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00043289139329845004,
+      "loss": 0.8616,
+      "step": 10079
+    },
+    {
+      "epoch": 0.7012417823228634,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.00043270581374624695,
+      "loss": 0.8422,
+      "step": 10080
+    },
+    {
+      "epoch": 0.7013113499599986,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0004325202629983829,
+      "loss": 0.9342,
+      "step": 10081
+    },
+    {
+      "epoch": 0.7013809175971338,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0004323347410642795,
+      "loss": 0.7916,
+      "step": 10082
+    },
+    {
+      "epoch": 0.701450485234269,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00043214924795335717,
+      "loss": 0.7212,
+      "step": 10083
+    },
+    {
+      "epoch": 0.7015200528714042,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00043196378367503377,
+      "loss": 0.7458,
+      "step": 10084
+    },
+    {
+      "epoch": 0.7015896205085395,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.00043177834823872644,
+      "loss": 0.8478,
+      "step": 10085
+    },
+    {
+      "epoch": 0.7016591881456746,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00043159294165384963,
+      "loss": 0.8785,
+      "step": 10086
+    },
+    {
+      "epoch": 0.7017287557828098,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.00043140756392981905,
+      "loss": 0.6397,
+      "step": 10087
+    },
+    {
+      "epoch": 0.7017983234199451,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00043122221507604653,
+      "loss": 0.8018,
+      "step": 10088
+    },
+    {
+      "epoch": 0.7018678910570803,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00043103689510194264,
+      "loss": 0.6511,
+      "step": 10089
+    },
+    {
+      "epoch": 0.7019374586942154,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.0004308516040169178,
+      "loss": 0.7782,
+      "step": 10090
+    },
+    {
+      "epoch": 0.7020070263313507,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00043066634183037945,
+      "loss": 0.7878,
+      "step": 10091
+    },
+    {
+      "epoch": 0.7020765939684859,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00043048110855173507,
+      "loss": 0.688,
+      "step": 10092
+    },
+    {
+      "epoch": 0.702146161605621,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.0004302959041903889,
+      "loss": 0.619,
+      "step": 10093
+    },
+    {
+      "epoch": 0.7022157292427562,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0004301107287557455,
+      "loss": 0.8383,
+      "step": 10094
+    },
+    {
+      "epoch": 0.7022852968798915,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0004299255822572067,
+      "loss": 0.7158,
+      "step": 10095
+    },
+    {
+      "epoch": 0.7023548645170267,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.00042974046470417327,
+      "loss": 0.8948,
+      "step": 10096
+    },
+    {
+      "epoch": 0.7024244321541618,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0004295553761060451,
+      "loss": 0.9329,
+      "step": 10097
+    },
+    {
+      "epoch": 0.7024939997912971,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0004293703164722197,
+      "loss": 0.7611,
+      "step": 10098
+    },
+    {
+      "epoch": 0.7025635674284323,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00042918528581209313,
+      "loss": 0.6111,
+      "step": 10099
+    },
+    {
+      "epoch": 0.7026331350655675,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00042900028413506055,
+      "loss": 0.6119,
+      "step": 10100
+    },
+    {
+      "epoch": 0.7027027027027027,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0004288153114505159,
+      "loss": 0.7833,
+      "step": 10101
+    },
+    {
+      "epoch": 0.7027722703398379,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0004286303677678508,
+      "loss": 0.6972,
+      "step": 10102
+    },
+    {
+      "epoch": 0.7028418379769731,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0004284454530964552,
+      "loss": 0.7726,
+      "step": 10103
+    },
+    {
+      "epoch": 0.7029114056141084,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00042826056744571875,
+      "loss": 0.7007,
+      "step": 10104
+    },
+    {
+      "epoch": 0.7029809732512435,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0004280757108250293,
+      "loss": 0.838,
+      "step": 10105
+    },
+    {
+      "epoch": 0.7030505408883787,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.00042789088324377244,
+      "loss": 0.9073,
+      "step": 10106
+    },
+    {
+      "epoch": 0.7031201085255139,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00042770608471133254,
+      "loss": 0.642,
+      "step": 10107
+    },
+    {
+      "epoch": 0.7031896761626492,
+      "grad_norm": 1.5,
+      "learning_rate": 0.00042752131523709347,
+      "loss": 1.1096,
+      "step": 10108
+    },
+    {
+      "epoch": 0.7032592437997843,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0004273365748304362,
+      "loss": 0.7165,
+      "step": 10109
+    },
+    {
+      "epoch": 0.7033288114369195,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0004271518635007415,
+      "loss": 0.7117,
+      "step": 10110
+    },
+    {
+      "epoch": 0.7033983790740548,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.00042696718125738756,
+      "loss": 0.8286,
+      "step": 10111
+    },
+    {
+      "epoch": 0.70346794671119,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.00042678252810975206,
+      "loss": 0.8924,
+      "step": 10112
+    },
+    {
+      "epoch": 0.7035375143483251,
+      "grad_norm": 1.578125,
+      "learning_rate": 0.00042659790406721033,
+      "loss": 0.6813,
+      "step": 10113
+    },
+    {
+      "epoch": 0.7036070819854604,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.00042641330913913676,
+      "loss": 1.0315,
+      "step": 10114
+    },
+    {
+      "epoch": 0.7036766496225956,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0004262287433349047,
+      "loss": 0.8003,
+      "step": 10115
+    },
+    {
+      "epoch": 0.7037462172597307,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.000426044206663885,
+      "loss": 0.9367,
+      "step": 10116
+    },
+    {
+      "epoch": 0.703815784896866,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0004258596991354475,
+      "loss": 0.7856,
+      "step": 10117
+    },
+    {
+      "epoch": 0.7038853525340012,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.0004256752207589599,
+      "loss": 0.776,
+      "step": 10118
+    },
+    {
+      "epoch": 0.7039549201711364,
+      "grad_norm": 1.953125,
+      "learning_rate": 0.00042549077154379055,
+      "loss": 0.6142,
+      "step": 10119
+    },
+    {
+      "epoch": 0.7040244878082715,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00042530635149930397,
+      "loss": 0.5824,
+      "step": 10120
+    },
+    {
+      "epoch": 0.7040940554454068,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.00042512196063486396,
+      "loss": 0.8986,
+      "step": 10121
+    },
+    {
+      "epoch": 0.704163623082542,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0004249375989598335,
+      "loss": 0.9061,
+      "step": 10122
+    },
+    {
+      "epoch": 0.7042331907196772,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00042475326648357283,
+      "loss": 0.6117,
+      "step": 10123
+    },
+    {
+      "epoch": 0.7043027583568124,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00042456896321544225,
+      "loss": 0.7902,
+      "step": 10124
+    },
+    {
+      "epoch": 0.7043723259939476,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0004243846891647989,
+      "loss": 0.8703,
+      "step": 10125
+    },
+    {
+      "epoch": 0.7044418936310828,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.00042420044434100015,
+      "loss": 0.9079,
+      "step": 10126
+    },
+    {
+      "epoch": 0.7045114612682181,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00042401622875340016,
+      "loss": 0.7512,
+      "step": 10127
+    },
+    {
+      "epoch": 0.7045810289053532,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.000423832042411353,
+      "loss": 0.7617,
+      "step": 10128
+    },
+    {
+      "epoch": 0.7046505965424884,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.00042364788532421095,
+      "loss": 0.6617,
+      "step": 10129
+    },
+    {
+      "epoch": 0.7047201641796237,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.00042346375750132415,
+      "loss": 0.5288,
+      "step": 10130
+    },
+    {
+      "epoch": 0.7047897318167589,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.00042327965895204157,
+      "loss": 0.9053,
+      "step": 10131
+    },
+    {
+      "epoch": 0.704859299453894,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0004230955896857109,
+      "loss": 0.7674,
+      "step": 10132
+    },
+    {
+      "epoch": 0.7049288670910292,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.0004229115497116788,
+      "loss": 0.5979,
+      "step": 10133
+    },
+    {
+      "epoch": 0.7049984347281645,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0004227275390392895,
+      "loss": 0.9274,
+      "step": 10134
+    },
+    {
+      "epoch": 0.7050680023652997,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00042254355767788564,
+      "loss": 0.8571,
+      "step": 10135
+    },
+    {
+      "epoch": 0.7051375700024348,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0004223596056368094,
+      "loss": 0.9398,
+      "step": 10136
+    },
+    {
+      "epoch": 0.7052071376395701,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0004221756829254012,
+      "loss": 0.9064,
+      "step": 10137
+    },
+    {
+      "epoch": 0.7052767052767053,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00042199178955299946,
+      "loss": 0.6491,
+      "step": 10138
+    },
+    {
+      "epoch": 0.7053462729138404,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00042180792552894077,
+      "loss": 0.7337,
+      "step": 10139
+    },
+    {
+      "epoch": 0.7054158405509757,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0004216240908625617,
+      "loss": 0.7206,
+      "step": 10140
+    },
+    {
+      "epoch": 0.7054854081881109,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0004214402855631958,
+      "loss": 0.6149,
+      "step": 10141
+    },
+    {
+      "epoch": 0.7055549758252461,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00042125650964017593,
+      "loss": 0.8573,
+      "step": 10142
+    },
+    {
+      "epoch": 0.7056245434623813,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00042107276310283384,
+      "loss": 0.7419,
+      "step": 10143
+    },
+    {
+      "epoch": 0.7056941110995165,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00042088904596049884,
+      "loss": 0.8879,
+      "step": 10144
+    },
+    {
+      "epoch": 0.7057636787366517,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00042070535822249865,
+      "loss": 0.8415,
+      "step": 10145
+    },
+    {
+      "epoch": 0.7058332463737869,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0004205216998981607,
+      "loss": 0.7848,
+      "step": 10146
+    },
+    {
+      "epoch": 0.7059028140109221,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0004203380709968103,
+      "loss": 0.7432,
+      "step": 10147
+    },
+    {
+      "epoch": 0.7059723816480573,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.000420154471527771,
+      "loss": 0.7737,
+      "step": 10148
+    },
+    {
+      "epoch": 0.7060419492851925,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.0004199709015003645,
+      "loss": 0.7522,
+      "step": 10149
+    },
+    {
+      "epoch": 0.7061115169223278,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00041978736092391226,
+      "loss": 0.5649,
+      "step": 10150
+    },
+    {
+      "epoch": 0.7061810845594629,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00041960384980773357,
+      "loss": 0.7011,
+      "step": 10151
+    },
+    {
+      "epoch": 0.7062506521965981,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00041942036816114604,
+      "loss": 0.6468,
+      "step": 10152
+    },
+    {
+      "epoch": 0.7063202198337334,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.00041923691599346545,
+      "loss": 0.7579,
+      "step": 10153
+    },
+    {
+      "epoch": 0.7063897874708686,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00041905349331400744,
+      "loss": 0.7975,
+      "step": 10154
+    },
+    {
+      "epoch": 0.7064593551080037,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0004188701001320845,
+      "loss": 1.0643,
+      "step": 10155
+    },
+    {
+      "epoch": 0.706528922745139,
+      "grad_norm": 1.59375,
+      "learning_rate": 0.0004186867364570087,
+      "loss": 0.741,
+      "step": 10156
+    },
+    {
+      "epoch": 0.7065984903822742,
+      "grad_norm": 1.5703125,
+      "learning_rate": 0.0004185034022980907,
+      "loss": 1.1413,
+      "step": 10157
+    },
+    {
+      "epoch": 0.7066680580194094,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.0004183200976646391,
+      "loss": 0.5771,
+      "step": 10158
+    },
+    {
+      "epoch": 0.7067376256565445,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00041813682256596065,
+      "loss": 0.9539,
+      "step": 10159
+    },
+    {
+      "epoch": 0.7068071932936798,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0004179535770113615,
+      "loss": 0.6908,
+      "step": 10160
+    },
+    {
+      "epoch": 0.706876760930815,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0004177703610101463,
+      "loss": 0.6224,
+      "step": 10161
+    },
+    {
+      "epoch": 0.7069463285679501,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00041758717457161766,
+      "loss": 0.8689,
+      "step": 10162
+    },
+    {
+      "epoch": 0.7070158962050854,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0004174040177050762,
+      "loss": 0.7937,
+      "step": 10163
+    },
+    {
+      "epoch": 0.7070854638422206,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00041722089041982234,
+      "loss": 0.8885,
+      "step": 10164
+    },
+    {
+      "epoch": 0.7071550314793558,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0004170377927251545,
+      "loss": 0.9376,
+      "step": 10165
+    },
+    {
+      "epoch": 0.707224599116491,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00041685472463036936,
+      "loss": 0.6855,
+      "step": 10166
+    },
+    {
+      "epoch": 0.7072941667536262,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0004166716861447615,
+      "loss": 0.7672,
+      "step": 10167
+    },
+    {
+      "epoch": 0.7073637343907614,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00041648867727762565,
+      "loss": 0.8199,
+      "step": 10168
+    },
+    {
+      "epoch": 0.7074333020278967,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00041630569803825324,
+      "loss": 0.8799,
+      "step": 10169
+    },
+    {
+      "epoch": 0.7075028696650318,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.00041612274843593557,
+      "loss": 0.6829,
+      "step": 10170
+    },
+    {
+      "epoch": 0.707572437302167,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00041593982847996203,
+      "loss": 0.6841,
+      "step": 10171
+    },
+    {
+      "epoch": 0.7076420049393022,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.00041575693817962013,
+      "loss": 0.8985,
+      "step": 10172
+    },
+    {
+      "epoch": 0.7077115725764375,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0004155740775441957,
+      "loss": 0.8377,
+      "step": 10173
+    },
+    {
+      "epoch": 0.7077811402135726,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.000415391246582974,
+      "loss": 0.8208,
+      "step": 10174
+    },
+    {
+      "epoch": 0.7078507078507078,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0004152084453052385,
+      "loss": 0.8361,
+      "step": 10175
+    },
+    {
+      "epoch": 0.7079202754878431,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.00041502567372027056,
+      "loss": 1.1543,
+      "step": 10176
+    },
+    {
+      "epoch": 0.7079898431249783,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.00041484293183735,
+      "loss": 1.0145,
+      "step": 10177
+    },
+    {
+      "epoch": 0.7080594107621134,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0004146602196657561,
+      "loss": 0.8072,
+      "step": 10178
+    },
+    {
+      "epoch": 0.7081289783992487,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0004144775372147661,
+      "loss": 0.8695,
+      "step": 10179
+    },
+    {
+      "epoch": 0.7081985460363839,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0004142948844936556,
+      "loss": 0.8563,
+      "step": 10180
+    },
+    {
+      "epoch": 0.708268113673519,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0004141122615116982,
+      "loss": 0.7231,
+      "step": 10181
+    },
+    {
+      "epoch": 0.7083376813106543,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00041392966827816723,
+      "loss": 0.677,
+      "step": 10182
+    },
+    {
+      "epoch": 0.7084072489477895,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00041374710480233403,
+      "loss": 1.0409,
+      "step": 10183
+    },
+    {
+      "epoch": 0.7084768165849247,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0004135645710934679,
+      "loss": 1.1684,
+      "step": 10184
+    },
+    {
+      "epoch": 0.7085463842220598,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0004133820671608366,
+      "loss": 0.6784,
+      "step": 10185
+    },
+    {
+      "epoch": 0.7086159518591951,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0004131995930137076,
+      "loss": 0.8895,
+      "step": 10186
+    },
+    {
+      "epoch": 0.7086855194963303,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0004130171486613451,
+      "loss": 1.0721,
+      "step": 10187
+    },
+    {
+      "epoch": 0.7087550871334655,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0004128347341130132,
+      "loss": 0.7085,
+      "step": 10188
+    },
+    {
+      "epoch": 0.7088246547706007,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00041265234937797437,
+      "loss": 0.6015,
+      "step": 10189
+    },
+    {
+      "epoch": 0.7088942224077359,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00041246999446548885,
+      "loss": 0.859,
+      "step": 10190
+    },
+    {
+      "epoch": 0.7089637900448711,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0004122876693848151,
+      "loss": 0.8156,
+      "step": 10191
+    },
+    {
+      "epoch": 0.7090333576820064,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.0004121053741452113,
+      "loss": 0.5851,
+      "step": 10192
+    },
+    {
+      "epoch": 0.7091029253191415,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00041192310875593386,
+      "loss": 0.7188,
+      "step": 10193
+    },
+    {
+      "epoch": 0.7091724929562767,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00041174087322623667,
+      "loss": 0.5221,
+      "step": 10194
+    },
+    {
+      "epoch": 0.709242060593412,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.00041155866756537263,
+      "loss": 0.8736,
+      "step": 10195
+    },
+    {
+      "epoch": 0.7093116282305472,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0004113764917825935,
+      "loss": 0.6578,
+      "step": 10196
+    },
+    {
+      "epoch": 0.7093811958676823,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0004111943458871495,
+      "loss": 0.7692,
+      "step": 10197
+    },
+    {
+      "epoch": 0.7094507635048175,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0004110122298882889,
+      "loss": 1.0225,
+      "step": 10198
+    },
+    {
+      "epoch": 0.7095203311419528,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0004108301437952582,
+      "loss": 0.9021,
+      "step": 10199
+    },
+    {
+      "epoch": 0.709589898779088,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00041064808761730344,
+      "loss": 0.7685,
+      "step": 10200
+    },
+    {
+      "epoch": 0.7096594664162231,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00041046606136366795,
+      "loss": 0.843,
+      "step": 10201
+    },
+    {
+      "epoch": 0.7097290340533584,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0004102840650435943,
+      "loss": 0.6716,
+      "step": 10202
+    },
+    {
+      "epoch": 0.7097986016904936,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0004101020986663239,
+      "loss": 0.9474,
+      "step": 10203
+    },
+    {
+      "epoch": 0.7098681693276288,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.00040992016224109554,
+      "loss": 0.6837,
+      "step": 10204
+    },
+    {
+      "epoch": 0.709937736964764,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.00040973825577714674,
+      "loss": 0.5875,
+      "step": 10205
+    },
+    {
+      "epoch": 0.7100073046018992,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00040955637928371424,
+      "loss": 0.7265,
+      "step": 10206
+    },
+    {
+      "epoch": 0.7100768722390344,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0004093745327700331,
+      "loss": 0.9111,
+      "step": 10207
+    },
+    {
+      "epoch": 0.7101464398761697,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.00040919271624533627,
+      "loss": 0.8341,
+      "step": 10208
+    },
+    {
+      "epoch": 0.7102160075133048,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00040901092971885503,
+      "loss": 0.6712,
+      "step": 10209
+    },
+    {
+      "epoch": 0.71028557515044,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0004088291731998198,
+      "loss": 0.7024,
+      "step": 10210
+    },
+    {
+      "epoch": 0.7103551427875752,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.00040864744669746,
+      "loss": 0.6244,
+      "step": 10211
+    },
+    {
+      "epoch": 0.7104247104247104,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0004084657502210022,
+      "loss": 0.6959,
+      "step": 10212
+    },
+    {
+      "epoch": 0.7104942780618456,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.00040828408377967165,
+      "loss": 0.5662,
+      "step": 10213
+    },
+    {
+      "epoch": 0.7105638456989808,
+      "grad_norm": 1.25,
+      "learning_rate": 0.00040810244738269277,
+      "loss": 0.7056,
+      "step": 10214
+    },
+    {
+      "epoch": 0.7106334133361161,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0004079208410392887,
+      "loss": 0.9715,
+      "step": 10215
+    },
+    {
+      "epoch": 0.7107029809732512,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0004077392647586796,
+      "loss": 0.7252,
+      "step": 10216
+    },
+    {
+      "epoch": 0.7107725486103864,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0004075577185500858,
+      "loss": 0.768,
+      "step": 10217
+    },
+    {
+      "epoch": 0.7108421162475217,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.000407376202422725,
+      "loss": 0.789,
+      "step": 10218
+    },
+    {
+      "epoch": 0.7109116838846569,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0004071947163858131,
+      "loss": 0.8223,
+      "step": 10219
+    },
+    {
+      "epoch": 0.710981251521792,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.00040701326044856556,
+      "loss": 0.6058,
+      "step": 10220
+    },
+    {
+      "epoch": 0.7110508191589273,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0004068318346201962,
+      "loss": 0.8007,
+      "step": 10221
+    },
+    {
+      "epoch": 0.7111203867960625,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0004066504389099165,
+      "loss": 0.7994,
+      "step": 10222
+    },
+    {
+      "epoch": 0.7111899544331977,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.0004064690733269365,
+      "loss": 0.7742,
+      "step": 10223
+    },
+    {
+      "epoch": 0.7112595220703328,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00040628773788046525,
+      "loss": 0.7676,
+      "step": 10224
+    },
+    {
+      "epoch": 0.7113290897074681,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0004061064325797105,
+      "loss": 0.7144,
+      "step": 10225
+    },
+    {
+      "epoch": 0.7113986573446033,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0004059251574338776,
+      "loss": 0.8438,
+      "step": 10226
+    },
+    {
+      "epoch": 0.7114682249817385,
+      "grad_norm": 1.515625,
+      "learning_rate": 0.0004057439124521706,
+      "loss": 1.0629,
+      "step": 10227
+    },
+    {
+      "epoch": 0.7115377926188737,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0004055626976437924,
+      "loss": 0.816,
+      "step": 10228
+    },
+    {
+      "epoch": 0.7116073602560089,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00040538151301794455,
+      "loss": 0.7275,
+      "step": 10229
+    },
+    {
+      "epoch": 0.7116769278931441,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0004052003585838261,
+      "loss": 0.7616,
+      "step": 10230
+    },
+    {
+      "epoch": 0.7117464955302794,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0004050192343506358,
+      "loss": 0.9566,
+      "step": 10231
+    },
+    {
+      "epoch": 0.7118160631674145,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.0004048381403275697,
+      "loss": 0.7487,
+      "step": 10232
+    },
+    {
+      "epoch": 0.7118856308045497,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00040465707652382276,
+      "loss": 0.855,
+      "step": 10233
+    },
+    {
+      "epoch": 0.711955198441685,
+      "grad_norm": 0.7734375,
+      "learning_rate": 0.00040447604294858877,
+      "loss": 0.6897,
+      "step": 10234
+    },
+    {
+      "epoch": 0.7120247660788201,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00040429503961106,
+      "loss": 0.9849,
+      "step": 10235
+    },
+    {
+      "epoch": 0.7120943337159553,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0004041140665204264,
+      "loss": 0.6458,
+      "step": 10236
+    },
+    {
+      "epoch": 0.7121639013530905,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00040393312368587674,
+      "loss": 0.8279,
+      "step": 10237
+    },
+    {
+      "epoch": 0.7122334689902258,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00040375221111659866,
+      "loss": 0.8865,
+      "step": 10238
+    },
+    {
+      "epoch": 0.7123030366273609,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0004035713288217784,
+      "loss": 0.9888,
+      "step": 10239
+    },
+    {
+      "epoch": 0.7123726042644961,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0004033904768105997,
+      "loss": 0.8738,
+      "step": 10240
+    },
+    {
+      "epoch": 0.7124421719016314,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0004032096550922453,
+      "loss": 0.8892,
+      "step": 10241
+    },
+    {
+      "epoch": 0.7125117395387666,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0004030288636758964,
+      "loss": 0.7223,
+      "step": 10242
+    },
+    {
+      "epoch": 0.7125813071759017,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00040284810257073324,
+      "loss": 0.6951,
+      "step": 10243
+    },
+    {
+      "epoch": 0.712650874813037,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00040266737178593326,
+      "loss": 0.7866,
+      "step": 10244
+    },
+    {
+      "epoch": 0.7127204424501722,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.00040248667133067364,
+      "loss": 0.693,
+      "step": 10245
+    },
+    {
+      "epoch": 0.7127900100873074,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.00040230600121412885,
+      "loss": 0.9007,
+      "step": 10246
+    },
+    {
+      "epoch": 0.7128595777244426,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0004021253614454731,
+      "loss": 0.6981,
+      "step": 10247
+    },
+    {
+      "epoch": 0.7129291453615778,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0004019447520338776,
+      "loss": 0.7544,
+      "step": 10248
+    },
+    {
+      "epoch": 0.712998712998713,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0004017641729885134,
+      "loss": 0.8259,
+      "step": 10249
+    },
+    {
+      "epoch": 0.7130682806358482,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00040158362431854934,
+      "loss": 0.9236,
+      "step": 10250
+    },
+    {
+      "epoch": 0.7131378482729834,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0004014031060331522,
+      "loss": 0.7259,
+      "step": 10251
+    },
+    {
+      "epoch": 0.7132074159101186,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0004012226181414882,
+      "loss": 0.8314,
+      "step": 10252
+    },
+    {
+      "epoch": 0.7132769835472538,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0004010421606527218,
+      "loss": 0.7988,
+      "step": 10253
+    },
+    {
+      "epoch": 0.713346551184389,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.00040086173357601566,
+      "loss": 0.8601,
+      "step": 10254
+    },
+    {
+      "epoch": 0.7134161188215242,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00040068133692053044,
+      "loss": 0.9581,
+      "step": 10255
+    },
+    {
+      "epoch": 0.7134856864586594,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.00040050097069542614,
+      "loss": 0.9045,
+      "step": 10256
+    },
+    {
+      "epoch": 0.7135552540957947,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.00040032063490986114,
+      "loss": 0.9228,
+      "step": 10257
+    },
+    {
+      "epoch": 0.7136248217329298,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0004001403295729914,
+      "loss": 0.7246,
+      "step": 10258
+    },
+    {
+      "epoch": 0.713694389370065,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0003999600546939726,
+      "loss": 0.9472,
+      "step": 10259
+    },
+    {
+      "epoch": 0.7137639570072003,
+      "grad_norm": 1.5625,
+      "learning_rate": 0.0003997798102819573,
+      "loss": 0.6811,
+      "step": 10260
+    },
+    {
+      "epoch": 0.7138335246443355,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0003995995963460983,
+      "loss": 0.8605,
+      "step": 10261
+    },
+    {
+      "epoch": 0.7139030922814706,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00039941941289554526,
+      "loss": 0.5264,
+      "step": 10262
+    },
+    {
+      "epoch": 0.7139726599186058,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00039923925993944764,
+      "loss": 0.945,
+      "step": 10263
+    },
+    {
+      "epoch": 0.7140422275557411,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0003990591374869523,
+      "loss": 0.958,
+      "step": 10264
+    },
+    {
+      "epoch": 0.7141117951928763,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00039887904554720467,
+      "loss": 1.0233,
+      "step": 10265
+    },
+    {
+      "epoch": 0.7141813628300114,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.00039869898412934926,
+      "loss": 0.8459,
+      "step": 10266
+    },
+    {
+      "epoch": 0.7142509304671467,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00039851895324252896,
+      "loss": 0.757,
+      "step": 10267
+    },
+    {
+      "epoch": 0.7143204981042819,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0003983389528958845,
+      "loss": 0.9271,
+      "step": 10268
+    },
+    {
+      "epoch": 0.7143900657414171,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.000398158983098555,
+      "loss": 0.8022,
+      "step": 10269
+    },
+    {
+      "epoch": 0.7144596333785523,
+      "grad_norm": 1.125,
+      "learning_rate": 0.000397979043859679,
+      "loss": 0.7929,
+      "step": 10270
+    },
+    {
+      "epoch": 0.7145292010156875,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00039779913518839304,
+      "loss": 0.6102,
+      "step": 10271
+    },
+    {
+      "epoch": 0.7145987686528227,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0003976192570938316,
+      "loss": 0.7635,
+      "step": 10272
+    },
+    {
+      "epoch": 0.714668336289958,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00039743940958512783,
+      "loss": 0.8293,
+      "step": 10273
+    },
+    {
+      "epoch": 0.7147379039270931,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00039725959267141364,
+      "loss": 0.8435,
+      "step": 10274
+    },
+    {
+      "epoch": 0.7148074715642283,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0003970798063618196,
+      "loss": 0.832,
+      "step": 10275
+    },
+    {
+      "epoch": 0.7148770392013635,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.00039690005066547377,
+      "loss": 0.8953,
+      "step": 10276
+    },
+    {
+      "epoch": 0.7149466068384988,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.00039672032559150383,
+      "loss": 0.8819,
+      "step": 10277
+    },
+    {
+      "epoch": 0.7150161744756339,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0003965406311490347,
+      "loss": 0.7694,
+      "step": 10278
+    },
+    {
+      "epoch": 0.7150857421127691,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00039636096734719096,
+      "loss": 0.7337,
+      "step": 10279
+    },
+    {
+      "epoch": 0.7151553097499044,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0003961813341950945,
+      "loss": 0.9547,
+      "step": 10280
+    },
+    {
+      "epoch": 0.7152248773870395,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0003960017317018666,
+      "loss": 0.815,
+      "step": 10281
+    },
+    {
+      "epoch": 0.7152944450241747,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0003958221598766265,
+      "loss": 0.9948,
+      "step": 10282
+    },
+    {
+      "epoch": 0.71536401266131,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0003956426187284915,
+      "loss": 0.6415,
+      "step": 10283
+    },
+    {
+      "epoch": 0.7154335802984452,
+      "grad_norm": 1.4765625,
+      "learning_rate": 0.000395463108266578,
+      "loss": 0.8901,
+      "step": 10284
+    },
+    {
+      "epoch": 0.7155031479355803,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0003952836285000012,
+      "loss": 0.8202,
+      "step": 10285
+    },
+    {
+      "epoch": 0.7155727155727156,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0003951041794378738,
+      "loss": 0.6531,
+      "step": 10286
+    },
+    {
+      "epoch": 0.7156422832098508,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00039492476108930687,
+      "loss": 0.8524,
+      "step": 10287
+    },
+    {
+      "epoch": 0.715711850846986,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00039474537346341075,
+      "loss": 0.7424,
+      "step": 10288
+    },
+    {
+      "epoch": 0.7157814184841211,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0003945660165692942,
+      "loss": 0.8893,
+      "step": 10289
+    },
+    {
+      "epoch": 0.7158509861212564,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00039438669041606345,
+      "loss": 0.738,
+      "step": 10290
+    },
+    {
+      "epoch": 0.7159205537583916,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0003942073950128243,
+      "loss": 0.7486,
+      "step": 10291
+    },
+    {
+      "epoch": 0.7159901213955268,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0003940281303686799,
+      "loss": 0.8488,
+      "step": 10292
+    },
+    {
+      "epoch": 0.716059689032662,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00039384889649273305,
+      "loss": 0.9268,
+      "step": 10293
+    },
+    {
+      "epoch": 0.7161292566697972,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00039366969339408366,
+      "loss": 0.6976,
+      "step": 10294
+    },
+    {
+      "epoch": 0.7161988243069324,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00039349052108183147,
+      "loss": 0.8058,
+      "step": 10295
+    },
+    {
+      "epoch": 0.7162683919440677,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0003933113795650737,
+      "loss": 0.7992,
+      "step": 10296
+    },
+    {
+      "epoch": 0.7163379595812028,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0003931322688529052,
+      "loss": 0.7471,
+      "step": 10297
+    },
+    {
+      "epoch": 0.716407527218338,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0003929531889544221,
+      "loss": 0.8424,
+      "step": 10298
+    },
+    {
+      "epoch": 0.7164770948554733,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00039277413987871633,
+      "loss": 0.805,
+      "step": 10299
+    },
+    {
+      "epoch": 0.7165466624926085,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00039259512163487896,
+      "loss": 0.6088,
+      "step": 10300
+    },
+    {
+      "epoch": 0.7166162301297436,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0003924161342319996,
+      "loss": 0.8697,
+      "step": 10301
+    },
+    {
+      "epoch": 0.7166857977668788,
+      "grad_norm": 1.5390625,
+      "learning_rate": 0.00039223717767916633,
+      "loss": 0.8839,
+      "step": 10302
+    },
+    {
+      "epoch": 0.7167553654040141,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.00039205825198546627,
+      "loss": 0.8275,
+      "step": 10303
+    },
+    {
+      "epoch": 0.7168249330411492,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0003918793571599836,
+      "loss": 0.5849,
+      "step": 10304
+    },
+    {
+      "epoch": 0.7168945006782844,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0003917004932118023,
+      "loss": 1.1259,
+      "step": 10305
+    },
+    {
+      "epoch": 0.7169640683154197,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00039152166015000354,
+      "loss": 0.7061,
+      "step": 10306
+    },
+    {
+      "epoch": 0.7170336359525549,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0003913428579836683,
+      "loss": 0.9957,
+      "step": 10307
+    },
+    {
+      "epoch": 0.71710320358969,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0003911640867218745,
+      "loss": 0.837,
+      "step": 10308
+    },
+    {
+      "epoch": 0.7171727712268253,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.00039098534637369996,
+      "loss": 0.8902,
+      "step": 10309
+    },
+    {
+      "epoch": 0.7172423388639605,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0003908066369482196,
+      "loss": 0.7183,
+      "step": 10310
+    },
+    {
+      "epoch": 0.7173119065010957,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0003906279584545076,
+      "loss": 0.6909,
+      "step": 10311
+    },
+    {
+      "epoch": 0.717381474138231,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0003904493109016367,
+      "loss": 0.6833,
+      "step": 10312
+    },
+    {
+      "epoch": 0.7174510417753661,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00039027069429867754,
+      "loss": 0.8421,
+      "step": 10313
+    },
+    {
+      "epoch": 0.7175206094125013,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.00039009210865469926,
+      "loss": 0.759,
+      "step": 10314
+    },
+    {
+      "epoch": 0.7175901770496365,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00038991355397876903,
+      "loss": 0.9539,
+      "step": 10315
+    },
+    {
+      "epoch": 0.7176597446867717,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0003897350302799536,
+      "loss": 0.8893,
+      "step": 10316
+    },
+    {
+      "epoch": 0.7177293123239069,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0003895565375673177,
+      "loss": 1.0996,
+      "step": 10317
+    },
+    {
+      "epoch": 0.7177988799610421,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0003893780758499236,
+      "loss": 0.701,
+      "step": 10318
+    },
+    {
+      "epoch": 0.7178684475981774,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00038919964513683334,
+      "loss": 0.6515,
+      "step": 10319
+    },
+    {
+      "epoch": 0.7179380152353125,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00038902124543710616,
+      "loss": 0.7534,
+      "step": 10320
+    },
+    {
+      "epoch": 0.7180075828724477,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0003888428767598009,
+      "loss": 0.8148,
+      "step": 10321
+    },
+    {
+      "epoch": 0.718077150509583,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0003886645391139736,
+      "loss": 0.938,
+      "step": 10322
+    },
+    {
+      "epoch": 0.7181467181467182,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.00038848623250867985,
+      "loss": 1.1074,
+      "step": 10323
+    },
+    {
+      "epoch": 0.7182162857838533,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0003883079569529727,
+      "loss": 0.8166,
+      "step": 10324
+    },
+    {
+      "epoch": 0.7182858534209886,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00038812971245590446,
+      "loss": 0.7357,
+      "step": 10325
+    },
+    {
+      "epoch": 0.7183554210581238,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0003879514990265255,
+      "loss": 0.8994,
+      "step": 10326
+    },
+    {
+      "epoch": 0.718424988695259,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0003877733166738846,
+      "loss": 0.8689,
+      "step": 10327
+    },
+    {
+      "epoch": 0.7184945563323941,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.00038759516540702875,
+      "loss": 0.8483,
+      "step": 10328
+    },
+    {
+      "epoch": 0.7185641239695294,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0003874170452350031,
+      "loss": 0.8876,
+      "step": 10329
+    },
+    {
+      "epoch": 0.7186336916066646,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.00038723895616685276,
+      "loss": 0.7773,
+      "step": 10330
+    },
+    {
+      "epoch": 0.7187032592437997,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0003870608982116198,
+      "loss": 0.7464,
+      "step": 10331
+    },
+    {
+      "epoch": 0.718772826880935,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.00038688287137834455,
+      "loss": 0.8565,
+      "step": 10332
+    },
+    {
+      "epoch": 0.7188423945180702,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.00038670487567606717,
+      "loss": 0.7739,
+      "step": 10333
+    },
+    {
+      "epoch": 0.7189119621552054,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.0003865269111138247,
+      "loss": 1.0526,
+      "step": 10334
+    },
+    {
+      "epoch": 0.7189815297923406,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0003863489777006537,
+      "loss": 0.8619,
+      "step": 10335
+    },
+    {
+      "epoch": 0.7190510974294758,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0003861710754455884,
+      "loss": 0.6509,
+      "step": 10336
+    },
+    {
+      "epoch": 0.719120665066611,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.00038599320435766214,
+      "loss": 0.7926,
+      "step": 10337
+    },
+    {
+      "epoch": 0.7191902327037463,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0003858153644459059,
+      "loss": 0.804,
+      "step": 10338
+    },
+    {
+      "epoch": 0.7192598003408814,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00038563755571934975,
+      "loss": 0.8925,
+      "step": 10339
+    },
+    {
+      "epoch": 0.7193293679780166,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00038545977818702225,
+      "loss": 0.5914,
+      "step": 10340
+    },
+    {
+      "epoch": 0.7193989356151518,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00038528203185794963,
+      "loss": 0.8468,
+      "step": 10341
+    },
+    {
+      "epoch": 0.7194685032522871,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0003851043167411571,
+      "loss": 0.8216,
+      "step": 10342
+    },
+    {
+      "epoch": 0.7195380708894222,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0003849266328456673,
+      "loss": 0.8859,
+      "step": 10343
+    },
+    {
+      "epoch": 0.7196076385265574,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0003847489801805036,
+      "loss": 0.7897,
+      "step": 10344
+    },
+    {
+      "epoch": 0.7196772061636927,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00038457135875468574,
+      "loss": 0.7999,
+      "step": 10345
+    },
+    {
+      "epoch": 0.7197467738008279,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.0003843937685772321,
+      "loss": 0.8396,
+      "step": 10346
+    },
+    {
+      "epoch": 0.719816341437963,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00038421620965715974,
+      "loss": 0.6124,
+      "step": 10347
+    },
+    {
+      "epoch": 0.7198859090750983,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.00038403868200348446,
+      "loss": 0.6664,
+      "step": 10348
+    },
+    {
+      "epoch": 0.7199554767122335,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.00038386118562522053,
+      "loss": 0.9508,
+      "step": 10349
+    },
+    {
+      "epoch": 0.7200250443493686,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0003836837205313798,
+      "loss": 0.9475,
+      "step": 10350
+    },
+    {
+      "epoch": 0.7200946119865039,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00038350628673097353,
+      "loss": 0.6598,
+      "step": 10351
+    },
+    {
+      "epoch": 0.7201641796236391,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.00038332888423301027,
+      "loss": 0.7691,
+      "step": 10352
+    },
+    {
+      "epoch": 0.7202337472607743,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00038315151304649844,
+      "loss": 0.6976,
+      "step": 10353
+    },
+    {
+      "epoch": 0.7203033148979094,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00038297417318044325,
+      "loss": 0.8059,
+      "step": 10354
+    },
+    {
+      "epoch": 0.7203728825350447,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00038279686464384987,
+      "loss": 0.9312,
+      "step": 10355
+    },
+    {
+      "epoch": 0.7204424501721799,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00038261958744572044,
+      "loss": 0.7603,
+      "step": 10356
+    },
+    {
+      "epoch": 0.7205120178093151,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00038244234159505653,
+      "loss": 0.803,
+      "step": 10357
+    },
+    {
+      "epoch": 0.7205815854464503,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00038226512710085817,
+      "loss": 0.745,
+      "step": 10358
+    },
+    {
+      "epoch": 0.7206511530835855,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.000382087943972123,
+      "loss": 0.7745,
+      "step": 10359
+    },
+    {
+      "epoch": 0.7207207207207207,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00038191079221784754,
+      "loss": 0.8289,
+      "step": 10360
+    },
+    {
+      "epoch": 0.720790288357856,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0003817336718470259,
+      "loss": 1.0168,
+      "step": 10361
+    },
+    {
+      "epoch": 0.7208598559949911,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0003815565828686528,
+      "loss": 0.7349,
+      "step": 10362
+    },
+    {
+      "epoch": 0.7209294236321263,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00038137952529171924,
+      "loss": 0.8569,
+      "step": 10363
+    },
+    {
+      "epoch": 0.7209989912692616,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0003812024991252151,
+      "loss": 0.9164,
+      "step": 10364
+    },
+    {
+      "epoch": 0.7210685589063968,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.00038102550437812933,
+      "loss": 0.8319,
+      "step": 10365
+    },
+    {
+      "epoch": 0.7211381265435319,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0003808485410594482,
+      "loss": 0.676,
+      "step": 10366
+    },
+    {
+      "epoch": 0.7212076941806671,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0003806716091781578,
+      "loss": 0.7555,
+      "step": 10367
+    },
+    {
+      "epoch": 0.7212772618178024,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0003804947087432411,
+      "loss": 0.5666,
+      "step": 10368
+    },
+    {
+      "epoch": 0.7213468294549376,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0003803178397636808,
+      "loss": 0.9253,
+      "step": 10369
+    },
+    {
+      "epoch": 0.7214163970920727,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0003801410022484569,
+      "loss": 0.8512,
+      "step": 10370
+    },
+    {
+      "epoch": 0.721485964729208,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00037996419620654867,
+      "loss": 0.7916,
+      "step": 10371
+    },
+    {
+      "epoch": 0.7215555323663432,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0003797874216469336,
+      "loss": 0.8352,
+      "step": 10372
+    },
+    {
+      "epoch": 0.7216251000034783,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0003796106785785871,
+      "loss": 0.7665,
+      "step": 10373
+    },
+    {
+      "epoch": 0.7216946676406136,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.0003794339670104835,
+      "loss": 0.8361,
+      "step": 10374
+    },
+    {
+      "epoch": 0.7217642352777488,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.00037925728695159435,
+      "loss": 0.6771,
+      "step": 10375
+    },
+    {
+      "epoch": 0.721833802914884,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00037908063841089214,
+      "loss": 0.6722,
+      "step": 10376
+    },
+    {
+      "epoch": 0.7219033705520193,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.0003789040213973454,
+      "loss": 0.87,
+      "step": 10377
+    },
+    {
+      "epoch": 0.7219729381891544,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00037872743591992156,
+      "loss": 0.804,
+      "step": 10378
+    },
+    {
+      "epoch": 0.7220425058262896,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00037855088198758747,
+      "loss": 1.0457,
+      "step": 10379
+    },
+    {
+      "epoch": 0.7221120734634248,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00037837435960930686,
+      "loss": 0.8591,
+      "step": 10380
+    },
+    {
+      "epoch": 0.72218164110056,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00037819786879404336,
+      "loss": 0.8595,
+      "step": 10381
+    },
+    {
+      "epoch": 0.7222512087376952,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0003780214095507577,
+      "loss": 0.8191,
+      "step": 10382
+    },
+    {
+      "epoch": 0.7223207763748304,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0003778449818884102,
+      "loss": 0.8937,
+      "step": 10383
+    },
+    {
+      "epoch": 0.7223903440119657,
+      "grad_norm": 1.4765625,
+      "learning_rate": 0.0003776685858159583,
+      "loss": 0.6803,
+      "step": 10384
+    },
+    {
+      "epoch": 0.7224599116491008,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0003774922213423588,
+      "loss": 0.9583,
+      "step": 10385
+    },
+    {
+      "epoch": 0.722529479286236,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0003773158884765669,
+      "loss": 0.9292,
+      "step": 10386
+    },
+    {
+      "epoch": 0.7225990469233713,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0003771395872275357,
+      "loss": 0.5646,
+      "step": 10387
+    },
+    {
+      "epoch": 0.7226686145605065,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00037696331760421654,
+      "loss": 0.8099,
+      "step": 10388
+    },
+    {
+      "epoch": 0.7227381821976416,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0003767870796155597,
+      "loss": 0.7322,
+      "step": 10389
+    },
+    {
+      "epoch": 0.7228077498347769,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.000376610873270514,
+      "loss": 0.8178,
+      "step": 10390
+    },
+    {
+      "epoch": 0.7228773174719121,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00037643469857802614,
+      "loss": 0.7163,
+      "step": 10391
+    },
+    {
+      "epoch": 0.7229468851090473,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0003762585555470409,
+      "loss": 0.7489,
+      "step": 10392
+    },
+    {
+      "epoch": 0.7230164527461824,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0003760824441865026,
+      "loss": 0.4969,
+      "step": 10393
+    },
+    {
+      "epoch": 0.7230860203833177,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0003759063645053528,
+      "loss": 0.983,
+      "step": 10394
+    },
+    {
+      "epoch": 0.7231555880204529,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00037573031651253245,
+      "loss": 0.98,
+      "step": 10395
+    },
+    {
+      "epoch": 0.723225155657588,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.0003755543002169797,
+      "loss": 1.1609,
+      "step": 10396
+    },
+    {
+      "epoch": 0.7232947232947233,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0003753783156276325,
+      "loss": 0.8617,
+      "step": 10397
+    },
+    {
+      "epoch": 0.7233642909318585,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00037520236275342565,
+      "loss": 0.8373,
+      "step": 10398
+    },
+    {
+      "epoch": 0.7234338585689937,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0003750264416032938,
+      "loss": 0.8973,
+      "step": 10399
+    },
+    {
+      "epoch": 0.723503426206129,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0003748505521861694,
+      "loss": 0.6331,
+      "step": 10400
+    },
+    {
+      "epoch": 0.7235729938432641,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.00037467469451098293,
+      "loss": 0.8331,
+      "step": 10401
+    },
+    {
+      "epoch": 0.7236425614803993,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.0003744988685866633,
+      "loss": 0.9041,
+      "step": 10402
+    },
+    {
+      "epoch": 0.7237121291175346,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0003743230744221383,
+      "loss": 1.0202,
+      "step": 10403
+    },
+    {
+      "epoch": 0.7237816967546697,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0003741473120263345,
+      "loss": 0.8418,
+      "step": 10404
+    },
+    {
+      "epoch": 0.7238512643918049,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0003739715814081754,
+      "loss": 0.7421,
+      "step": 10405
+    },
+    {
+      "epoch": 0.7239208320289401,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0003737958825765837,
+      "loss": 0.9736,
+      "step": 10406
+    },
+    {
+      "epoch": 0.7239903996660754,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0003736202155404809,
+      "loss": 0.6861,
+      "step": 10407
+    },
+    {
+      "epoch": 0.7240599673032105,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0003734445803087866,
+      "loss": 1.0322,
+      "step": 10408
+    },
+    {
+      "epoch": 0.7241295349403457,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0003732689768904185,
+      "loss": 0.7579,
+      "step": 10409
+    },
+    {
+      "epoch": 0.724199102577481,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0003730934052942924,
+      "loss": 0.7262,
+      "step": 10410
+    },
+    {
+      "epoch": 0.7242686702146162,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00037291786552932373,
+      "loss": 0.6312,
+      "step": 10411
+    },
+    {
+      "epoch": 0.7243382378517513,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00037274235760442466,
+      "loss": 0.7394,
+      "step": 10412
+    },
+    {
+      "epoch": 0.7244078054888866,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00037256688152850716,
+      "loss": 1.1719,
+      "step": 10413
+    },
+    {
+      "epoch": 0.7244773731260218,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.0003723914373104813,
+      "loss": 0.5643,
+      "step": 10414
+    },
+    {
+      "epoch": 0.724546940763157,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0003722160249592548,
+      "loss": 0.6835,
+      "step": 10415
+    },
+    {
+      "epoch": 0.7246165084002922,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.000372040644483734,
+      "loss": 0.7814,
+      "step": 10416
+    },
+    {
+      "epoch": 0.7246860760374274,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.00037186529589282405,
+      "loss": 0.7209,
+      "step": 10417
+    },
+    {
+      "epoch": 0.7247556436745626,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0003716899791954287,
+      "loss": 0.8136,
+      "step": 10418
+    },
+    {
+      "epoch": 0.7248252113116977,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0003715146944004494,
+      "loss": 0.941,
+      "step": 10419
+    },
+    {
+      "epoch": 0.724894778948833,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0003713394415167856,
+      "loss": 0.6571,
+      "step": 10420
+    },
+    {
+      "epoch": 0.7249643465859682,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.00037116422055333634,
+      "loss": 1.0111,
+      "step": 10421
+    },
+    {
+      "epoch": 0.7250339142231034,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0003709890315189988,
+      "loss": 0.8752,
+      "step": 10422
+    },
+    {
+      "epoch": 0.7251034818602387,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.0003708138744226678,
+      "loss": 1.1282,
+      "step": 10423
+    },
+    {
+      "epoch": 0.7251730494973738,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0003706387492732365,
+      "loss": 0.925,
+      "step": 10424
+    },
+    {
+      "epoch": 0.725242617134509,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0003704636560795976,
+      "loss": 0.9034,
+      "step": 10425
+    },
+    {
+      "epoch": 0.7253121847716443,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00037028859485064094,
+      "loss": 0.7064,
+      "step": 10426
+    },
+    {
+      "epoch": 0.7253817524087794,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0003701135655952557,
+      "loss": 0.7784,
+      "step": 10427
+    },
+    {
+      "epoch": 0.7254513200459146,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.00036993856832232843,
+      "loss": 0.9509,
+      "step": 10428
+    },
+    {
+      "epoch": 0.7255208876830499,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.00036976360304074516,
+      "loss": 0.8873,
+      "step": 10429
+    },
+    {
+      "epoch": 0.7255904553201851,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.0003695886697593893,
+      "loss": 0.8801,
+      "step": 10430
+    },
+    {
+      "epoch": 0.7256600229573202,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.00036941376848714325,
+      "loss": 0.5566,
+      "step": 10431
+    },
+    {
+      "epoch": 0.7257295905944554,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0003692388992328879,
+      "loss": 0.6438,
+      "step": 10432
+    },
+    {
+      "epoch": 0.7257991582315907,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00036906406200550213,
+      "loss": 0.9409,
+      "step": 10433
+    },
+    {
+      "epoch": 0.7258687258687259,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.00036888925681386267,
+      "loss": 0.8923,
+      "step": 10434
+    },
+    {
+      "epoch": 0.725938293505861,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0003687144836668457,
+      "loss": 0.9672,
+      "step": 10435
+    },
+    {
+      "epoch": 0.7260078611429963,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0003685397425733258,
+      "loss": 0.9184,
+      "step": 10436
+    },
+    {
+      "epoch": 0.7260774287801315,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0003683650335421749,
+      "loss": 0.8752,
+      "step": 10437
+    },
+    {
+      "epoch": 0.7261469964172667,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0003681903565822635,
+      "loss": 0.9065,
+      "step": 10438
+    },
+    {
+      "epoch": 0.7262165640544019,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0003680157117024614,
+      "loss": 0.8314,
+      "step": 10439
+    },
+    {
+      "epoch": 0.7262861316915371,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0003678410989116362,
+      "loss": 0.7061,
+      "step": 10440
+    },
+    {
+      "epoch": 0.7263556993286723,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0003676665182186538,
+      "loss": 0.702,
+      "step": 10441
+    },
+    {
+      "epoch": 0.7264252669658076,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.0003674919696323781,
+      "loss": 0.7032,
+      "step": 10442
+    },
+    {
+      "epoch": 0.7264948346029427,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0003673174531616723,
+      "loss": 1.001,
+      "step": 10443
+    },
+    {
+      "epoch": 0.7265644022400779,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.000367142968815397,
+      "loss": 0.6741,
+      "step": 10444
+    },
+    {
+      "epoch": 0.7266339698772131,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0003669685166024119,
+      "loss": 0.7309,
+      "step": 10445
+    },
+    {
+      "epoch": 0.7267035375143484,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.00036679409653157525,
+      "loss": 0.7796,
+      "step": 10446
+    },
+    {
+      "epoch": 0.7267731051514835,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00036661970861174263,
+      "loss": 0.832,
+      "step": 10447
+    },
+    {
+      "epoch": 0.7268426727886187,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0003664453528517685,
+      "loss": 0.9177,
+      "step": 10448
+    },
+    {
+      "epoch": 0.726912240425754,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.00036627102926050596,
+      "loss": 1.0279,
+      "step": 10449
+    },
+    {
+      "epoch": 0.7269818080628891,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.00036609673784680666,
+      "loss": 0.7577,
+      "step": 10450
+    },
+    {
+      "epoch": 0.7270513757000243,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0003659224786195199,
+      "loss": 0.6044,
+      "step": 10451
+    },
+    {
+      "epoch": 0.7271209433371596,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00036574825158749335,
+      "loss": 0.5833,
+      "step": 10452
+    },
+    {
+      "epoch": 0.7271905109742948,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0003655740567595738,
+      "loss": 0.8016,
+      "step": 10453
+    },
+    {
+      "epoch": 0.7272600786114299,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00036539989414460615,
+      "loss": 0.7016,
+      "step": 10454
+    },
+    {
+      "epoch": 0.7273296462485652,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00036522576375143325,
+      "loss": 0.7156,
+      "step": 10455
+    },
+    {
+      "epoch": 0.7273992138857004,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.00036505166558889625,
+      "loss": 0.8444,
+      "step": 10456
+    },
+    {
+      "epoch": 0.7274687815228356,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00036487759966583565,
+      "loss": 0.9313,
+      "step": 10457
+    },
+    {
+      "epoch": 0.7275383491599707,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00036470356599108887,
+      "loss": 0.8356,
+      "step": 10458
+    },
+    {
+      "epoch": 0.727607916797106,
+      "grad_norm": 1.71875,
+      "learning_rate": 0.0003645295645734931,
+      "loss": 1.0616,
+      "step": 10459
+    },
+    {
+      "epoch": 0.7276774844342412,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.00036435559542188315,
+      "loss": 0.7141,
+      "step": 10460
+    },
+    {
+      "epoch": 0.7277470520713764,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0003641816585450922,
+      "loss": 0.7129,
+      "step": 10461
+    },
+    {
+      "epoch": 0.7278166197085116,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0003640077539519516,
+      "loss": 0.9076,
+      "step": 10462
+    },
+    {
+      "epoch": 0.7278861873456468,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0003638338816512916,
+      "loss": 0.7787,
+      "step": 10463
+    },
+    {
+      "epoch": 0.727955754982782,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0003636600416519409,
+      "loss": 0.9157,
+      "step": 10464
+    },
+    {
+      "epoch": 0.7280253226199173,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0003634862339627258,
+      "loss": 0.7722,
+      "step": 10465
+    },
+    {
+      "epoch": 0.7280948902570524,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0003633124585924713,
+      "loss": 0.914,
+      "step": 10466
+    },
+    {
+      "epoch": 0.7281644578941876,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.00036313871555000086,
+      "loss": 0.6732,
+      "step": 10467
+    },
+    {
+      "epoch": 0.7282340255313229,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00036296500484413695,
+      "loss": 0.7468,
+      "step": 10468
+    },
+    {
+      "epoch": 0.728303593168458,
+      "grad_norm": 1.5,
+      "learning_rate": 0.0003627913264836991,
+      "loss": 0.7366,
+      "step": 10469
+    },
+    {
+      "epoch": 0.7283731608055932,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00036261768047750554,
+      "loss": 1.0082,
+      "step": 10470
+    },
+    {
+      "epoch": 0.7284427284427284,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0003624440668343736,
+      "loss": 0.7463,
+      "step": 10471
+    },
+    {
+      "epoch": 0.7285122960798637,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0003622704855631187,
+      "loss": 0.6717,
+      "step": 10472
+    },
+    {
+      "epoch": 0.7285818637169988,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0003620969366725538,
+      "loss": 0.9143,
+      "step": 10473
+    },
+    {
+      "epoch": 0.728651431354134,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0003619234201714916,
+      "loss": 0.9855,
+      "step": 10474
+    },
+    {
+      "epoch": 0.7287209989912693,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.00036174993606874186,
+      "loss": 1.0841,
+      "step": 10475
+    },
+    {
+      "epoch": 0.7287905666284045,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0003615764843731131,
+      "loss": 1.0146,
+      "step": 10476
+    },
+    {
+      "epoch": 0.7288601342655396,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0003614030650934126,
+      "loss": 0.6484,
+      "step": 10477
+    },
+    {
+      "epoch": 0.7289297019026749,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.000361229678238446,
+      "loss": 0.7777,
+      "step": 10478
+    },
+    {
+      "epoch": 0.7289992695398101,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0003610563238170166,
+      "loss": 0.7467,
+      "step": 10479
+    },
+    {
+      "epoch": 0.7290688371769453,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00036088300183792634,
+      "loss": 0.6488,
+      "step": 10480
+    },
+    {
+      "epoch": 0.7291384048140805,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.00036070971230997583,
+      "loss": 0.9898,
+      "step": 10481
+    },
+    {
+      "epoch": 0.7292079724512157,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0003605364552419642,
+      "loss": 0.7607,
+      "step": 10482
+    },
+    {
+      "epoch": 0.7292775400883509,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.00036036323064268815,
+      "loss": 0.862,
+      "step": 10483
+    },
+    {
+      "epoch": 0.729347107725486,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00036019003852094303,
+      "loss": 0.9068,
+      "step": 10484
+    },
+    {
+      "epoch": 0.7294166753626213,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0003600168788855228,
+      "loss": 0.6621,
+      "step": 10485
+    },
+    {
+      "epoch": 0.7294862429997565,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0003598437517452201,
+      "loss": 0.8121,
+      "step": 10486
+    },
+    {
+      "epoch": 0.7295558106368917,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00035967065710882474,
+      "loss": 0.9737,
+      "step": 10487
+    },
+    {
+      "epoch": 0.729625378274027,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0003594975949851261,
+      "loss": 0.7987,
+      "step": 10488
+    },
+    {
+      "epoch": 0.7296949459111621,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00035932456538291134,
+      "loss": 0.7368,
+      "step": 10489
+    },
+    {
+      "epoch": 0.7297645135482973,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0003591515683109656,
+      "loss": 0.8261,
+      "step": 10490
+    },
+    {
+      "epoch": 0.7298340811854326,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.00035897860377807303,
+      "loss": 1.0825,
+      "step": 10491
+    },
+    {
+      "epoch": 0.7299036488225678,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.00035880567179301636,
+      "loss": 1.1469,
+      "step": 10492
+    },
+    {
+      "epoch": 0.7299732164597029,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0003586327723645758,
+      "loss": 0.6994,
+      "step": 10493
+    },
+    {
+      "epoch": 0.7300427840968382,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00035845990550153,
+      "loss": 0.9453,
+      "step": 10494
+    },
+    {
+      "epoch": 0.7301123517339734,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0003582870712126566,
+      "loss": 0.8885,
+      "step": 10495
+    },
+    {
+      "epoch": 0.7301819193711085,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00035811426950673153,
+      "loss": 0.6837,
+      "step": 10496
+    },
+    {
+      "epoch": 0.7302514870082437,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.0003579415003925285,
+      "loss": 0.8565,
+      "step": 10497
+    },
+    {
+      "epoch": 0.730321054645379,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.00035776876387881964,
+      "loss": 0.8585,
+      "step": 10498
+    },
+    {
+      "epoch": 0.7303906222825142,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.00035759605997437574,
+      "loss": 0.7695,
+      "step": 10499
+    },
+    {
+      "epoch": 0.7304601899196493,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0003574233886879664,
+      "loss": 1.0199,
+      "step": 10500
+    },
+    {
+      "epoch": 0.7305297575567846,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0003572507500283585,
+      "loss": 0.9845,
+      "step": 10501
+    },
+    {
+      "epoch": 0.7305993251939198,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00035707814400431746,
+      "loss": 0.7833,
+      "step": 10502
+    },
+    {
+      "epoch": 0.730668892831055,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.0003569055706246077,
+      "loss": 1.0962,
+      "step": 10503
+    },
+    {
+      "epoch": 0.7307384604681902,
+      "grad_norm": 1.375,
+      "learning_rate": 0.00035673302989799204,
+      "loss": 0.8713,
+      "step": 10504
+    },
+    {
+      "epoch": 0.7308080281053254,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0003565605218332304,
+      "loss": 0.7375,
+      "step": 10505
+    },
+    {
+      "epoch": 0.7308775957424606,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00035638804643908274,
+      "loss": 0.7821,
+      "step": 10506
+    },
+    {
+      "epoch": 0.7309471633795959,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00035621560372430596,
+      "loss": 0.6925,
+      "step": 10507
+    },
+    {
+      "epoch": 0.731016731016731,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0003560431936976556,
+      "loss": 0.9956,
+      "step": 10508
+    },
+    {
+      "epoch": 0.7310862986538662,
+      "grad_norm": 1.6015625,
+      "learning_rate": 0.000355870816367886,
+      "loss": 0.8902,
+      "step": 10509
+    },
+    {
+      "epoch": 0.7311558662910014,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.00035569847174375,
+      "loss": 0.8974,
+      "step": 10510
+    },
+    {
+      "epoch": 0.7312254339281367,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0003555261598339983,
+      "loss": 0.7936,
+      "step": 10511
+    },
+    {
+      "epoch": 0.7312950015652718,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0003553538806473793,
+      "loss": 0.8037,
+      "step": 10512
+    },
+    {
+      "epoch": 0.731364569202407,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.00035518163419264104,
+      "loss": 0.6703,
+      "step": 10513
+    },
+    {
+      "epoch": 0.7314341368395423,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0003550094204785296,
+      "loss": 0.7503,
+      "step": 10514
+    },
+    {
+      "epoch": 0.7315037044766775,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0003548372395137888,
+      "loss": 0.7974,
+      "step": 10515
+    },
+    {
+      "epoch": 0.7315732721138126,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0003546650913071607,
+      "loss": 0.8388,
+      "step": 10516
+    },
+    {
+      "epoch": 0.7316428397509479,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0003544929758673866,
+      "loss": 0.7568,
+      "step": 10517
+    },
+    {
+      "epoch": 0.7317124073880831,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00035432089320320593,
+      "loss": 1.0404,
+      "step": 10518
+    },
+    {
+      "epoch": 0.7317819750252182,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0003541488433233555,
+      "loss": 0.5822,
+      "step": 10519
+    },
+    {
+      "epoch": 0.7318515426623535,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0003539768262365719,
+      "loss": 0.9355,
+      "step": 10520
+    },
+    {
+      "epoch": 0.7319211102994887,
+      "grad_norm": 1.796875,
+      "learning_rate": 0.0003538048419515887,
+      "loss": 1.4892,
+      "step": 10521
+    },
+    {
+      "epoch": 0.7319906779366239,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0003536328904771383,
+      "loss": 0.7739,
+      "step": 10522
+    },
+    {
+      "epoch": 0.732060245573759,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0003534609718219518,
+      "loss": 0.9225,
+      "step": 10523
+    },
+    {
+      "epoch": 0.7321298132108943,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00035328908599475874,
+      "loss": 0.712,
+      "step": 10524
+    },
+    {
+      "epoch": 0.7321993808480295,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0003531172330042861,
+      "loss": 0.865,
+      "step": 10525
+    },
+    {
+      "epoch": 0.7322689484851647,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00035294541285925965,
+      "loss": 0.5902,
+      "step": 10526
+    },
+    {
+      "epoch": 0.7323385161222999,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00035277362556840363,
+      "loss": 0.6576,
+      "step": 10527
+    },
+    {
+      "epoch": 0.7324080837594351,
+      "grad_norm": 1.5859375,
+      "learning_rate": 0.00035260187114044095,
+      "loss": 0.8863,
+      "step": 10528
+    },
+    {
+      "epoch": 0.7324776513965703,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0003524301495840923,
+      "loss": 0.5281,
+      "step": 10529
+    },
+    {
+      "epoch": 0.7325472190337056,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0003522584609080761,
+      "loss": 0.7786,
+      "step": 10530
+    },
+    {
+      "epoch": 0.7326167866708407,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00035208680512111056,
+      "loss": 0.6755,
+      "step": 10531
+    },
+    {
+      "epoch": 0.7326863543079759,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00035191518223191153,
+      "loss": 0.6931,
+      "step": 10532
+    },
+    {
+      "epoch": 0.7327559219451112,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.00035174359224919273,
+      "loss": 0.9638,
+      "step": 10533
+    },
+    {
+      "epoch": 0.7328254895822464,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00035157203518166723,
+      "loss": 0.6878,
+      "step": 10534
+    },
+    {
+      "epoch": 0.7328950572193815,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00035140051103804503,
+      "loss": 0.9544,
+      "step": 10535
+    },
+    {
+      "epoch": 0.7329646248565167,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00035122901982703606,
+      "loss": 0.6548,
+      "step": 10536
+    },
+    {
+      "epoch": 0.733034192493652,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.0003510575615573471,
+      "loss": 0.6057,
+      "step": 10537
+    },
+    {
+      "epoch": 0.7331037601307872,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0003508861362376846,
+      "loss": 0.8301,
+      "step": 10538
+    },
+    {
+      "epoch": 0.7331733277679223,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00035071474387675226,
+      "loss": 0.8204,
+      "step": 10539
+    },
+    {
+      "epoch": 0.7332428954050576,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0003505433844832523,
+      "loss": 0.7581,
+      "step": 10540
+    },
+    {
+      "epoch": 0.7333124630421928,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0003503720580658858,
+      "loss": 0.9967,
+      "step": 10541
+    },
+    {
+      "epoch": 0.733382030679328,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00035020076463335213,
+      "loss": 0.8009,
+      "step": 10542
+    },
+    {
+      "epoch": 0.7334515983164632,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0003500295041943484,
+      "loss": 0.7618,
+      "step": 10543
+    },
+    {
+      "epoch": 0.7335211659535984,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00034985827675756997,
+      "loss": 0.772,
+      "step": 10544
+    },
+    {
+      "epoch": 0.7335907335907336,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.00034968708233171133,
+      "loss": 0.9681,
+      "step": 10545
+    },
+    {
+      "epoch": 0.7336603012278688,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0003495159209254651,
+      "loss": 0.6452,
+      "step": 10546
+    },
+    {
+      "epoch": 0.733729868865004,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.0003493447925475215,
+      "loss": 0.7094,
+      "step": 10547
+    },
+    {
+      "epoch": 0.7337994365021392,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00034917369720657013,
+      "loss": 0.6876,
+      "step": 10548
+    },
+    {
+      "epoch": 0.7338690041392744,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0003490026349112976,
+      "loss": 0.7527,
+      "step": 10549
+    },
+    {
+      "epoch": 0.7339385717764096,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0003488316056703904,
+      "loss": 0.8257,
+      "step": 10550
+    },
+    {
+      "epoch": 0.7340081394135448,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00034866060949253173,
+      "loss": 0.6408,
+      "step": 10551
+    },
+    {
+      "epoch": 0.73407770705068,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.0003484896463864047,
+      "loss": 0.6556,
+      "step": 10552
+    },
+    {
+      "epoch": 0.7341472746878153,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0003483187163606895,
+      "loss": 0.8944,
+      "step": 10553
+    },
+    {
+      "epoch": 0.7342168423249504,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0003481478194240645,
+      "loss": 0.8914,
+      "step": 10554
+    },
+    {
+      "epoch": 0.7342864099620856,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00034797695558520835,
+      "loss": 0.756,
+      "step": 10555
+    },
+    {
+      "epoch": 0.7343559775992209,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.00034780612485279605,
+      "loss": 1.0581,
+      "step": 10556
+    },
+    {
+      "epoch": 0.7344255452363561,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0003476353272355013,
+      "loss": 1.0234,
+      "step": 10557
+    },
+    {
+      "epoch": 0.7344951128734912,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.00034746456274199625,
+      "loss": 0.6046,
+      "step": 10558
+    },
+    {
+      "epoch": 0.7345646805106265,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0003472938313809515,
+      "loss": 0.7405,
+      "step": 10559
+    },
+    {
+      "epoch": 0.7346342481477617,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00034712313316103663,
+      "loss": 0.9239,
+      "step": 10560
+    },
+    {
+      "epoch": 0.7347038157848969,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00034695246809091784,
+      "loss": 0.6487,
+      "step": 10561
+    },
+    {
+      "epoch": 0.734773383422032,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0003467818361792615,
+      "loss": 0.791,
+      "step": 10562
+    },
+    {
+      "epoch": 0.7348429510591673,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.00034661123743473076,
+      "loss": 0.4346,
+      "step": 10563
+    },
+    {
+      "epoch": 0.7349125186963025,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00034644067186598835,
+      "loss": 0.7441,
+      "step": 10564
+    },
+    {
+      "epoch": 0.7349820863334376,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0003462701394816942,
+      "loss": 0.607,
+      "step": 10565
+    },
+    {
+      "epoch": 0.7350516539705729,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00034609964029050757,
+      "loss": 0.8549,
+      "step": 10566
+    },
+    {
+      "epoch": 0.7351212216077081,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.00034592917430108537,
+      "loss": 0.5786,
+      "step": 10567
+    },
+    {
+      "epoch": 0.7351907892448433,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0003457587415220822,
+      "loss": 0.7486,
+      "step": 10568
+    },
+    {
+      "epoch": 0.7352603568819785,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0003455883419621532,
+      "loss": 0.8182,
+      "step": 10569
+    },
+    {
+      "epoch": 0.7353299245191137,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0003454179756299497,
+      "loss": 0.8274,
+      "step": 10570
+    },
+    {
+      "epoch": 0.7353994921562489,
+      "grad_norm": 0.765625,
+      "learning_rate": 0.0003452476425341221,
+      "loss": 0.732,
+      "step": 10571
+    },
+    {
+      "epoch": 0.7354690597933842,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0003450773426833187,
+      "loss": 0.7192,
+      "step": 10572
+    },
+    {
+      "epoch": 0.7355386274305193,
+      "grad_norm": 1.25,
+      "learning_rate": 0.00034490707608618676,
+      "loss": 1.0045,
+      "step": 10573
+    },
+    {
+      "epoch": 0.7356081950676545,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.00034473684275137184,
+      "loss": 1.0613,
+      "step": 10574
+    },
+    {
+      "epoch": 0.7356777627047897,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.000344566642687517,
+      "loss": 1.0541,
+      "step": 10575
+    },
+    {
+      "epoch": 0.735747330341925,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0003443964759032647,
+      "loss": 0.9216,
+      "step": 10576
+    },
+    {
+      "epoch": 0.7358168979790601,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.0003442263424072547,
+      "loss": 0.7047,
+      "step": 10577
+    },
+    {
+      "epoch": 0.7358864656161953,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0003440562422081259,
+      "loss": 0.8835,
+      "step": 10578
+    },
+    {
+      "epoch": 0.7359560332533306,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0003438861753145146,
+      "loss": 0.8249,
+      "step": 10579
+    },
+    {
+      "epoch": 0.7360256008904658,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0003437161417350565,
+      "loss": 0.9425,
+      "step": 10580
+    },
+    {
+      "epoch": 0.7360951685276009,
+      "grad_norm": 1.546875,
+      "learning_rate": 0.0003435461414783846,
+      "loss": 0.9638,
+      "step": 10581
+    },
+    {
+      "epoch": 0.7361647361647362,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00034337617455313117,
+      "loss": 0.7829,
+      "step": 10582
+    },
+    {
+      "epoch": 0.7362343038018714,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0003432062409679256,
+      "loss": 0.5947,
+      "step": 10583
+    },
+    {
+      "epoch": 0.7363038714390066,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.000343036340731397,
+      "loss": 0.7484,
+      "step": 10584
+    },
+    {
+      "epoch": 0.7363734390761418,
+      "grad_norm": 1.6015625,
+      "learning_rate": 0.00034286647385217163,
+      "loss": 1.0335,
+      "step": 10585
+    },
+    {
+      "epoch": 0.736443006713277,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0003426966403388739,
+      "loss": 0.9315,
+      "step": 10586
+    },
+    {
+      "epoch": 0.7365125743504122,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0003425268402001284,
+      "loss": 0.5161,
+      "step": 10587
+    },
+    {
+      "epoch": 0.7365821419875473,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.00034235707344455605,
+      "loss": 0.8376,
+      "step": 10588
+    },
+    {
+      "epoch": 0.7366517096246826,
+      "grad_norm": 1.375,
+      "learning_rate": 0.00034218734008077667,
+      "loss": 0.9266,
+      "step": 10589
+    },
+    {
+      "epoch": 0.7367212772618178,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0003420176401174082,
+      "loss": 0.6244,
+      "step": 10590
+    },
+    {
+      "epoch": 0.736790844898953,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0003418479735630675,
+      "loss": 0.8644,
+      "step": 10591
+    },
+    {
+      "epoch": 0.7368604125360882,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0003416783404263698,
+      "loss": 1.0935,
+      "step": 10592
+    },
+    {
+      "epoch": 0.7369299801732234,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0003415087407159273,
+      "loss": 0.9523,
+      "step": 10593
+    },
+    {
+      "epoch": 0.7369995478103586,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0003413391744403523,
+      "loss": 0.8776,
+      "step": 10594
+    },
+    {
+      "epoch": 0.7370691154474939,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00034116964160825394,
+      "loss": 0.7411,
+      "step": 10595
+    },
+    {
+      "epoch": 0.737138683084629,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0003410001422282406,
+      "loss": 0.9877,
+      "step": 10596
+    },
+    {
+      "epoch": 0.7372082507217642,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0003408306763089182,
+      "loss": 0.9689,
+      "step": 10597
+    },
+    {
+      "epoch": 0.7372778183588995,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.00034066124385889176,
+      "loss": 0.7874,
+      "step": 10598
+    },
+    {
+      "epoch": 0.7373473859960347,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.00034049184488676423,
+      "loss": 0.8638,
+      "step": 10599
+    },
+    {
+      "epoch": 0.7374169536331698,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0003403224794011358,
+      "loss": 0.7705,
+      "step": 10600
+    },
+    {
+      "epoch": 0.737486521270305,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00034015314741060764,
+      "loss": 0.5618,
+      "step": 10601
+    },
+    {
+      "epoch": 0.7375560889074403,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.00033998384892377673,
+      "loss": 1.0214,
+      "step": 10602
+    },
+    {
+      "epoch": 0.7376256565445755,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00033981458394923936,
+      "loss": 0.7092,
+      "step": 10603
+    },
+    {
+      "epoch": 0.7376952241817106,
+      "grad_norm": 1.5390625,
+      "learning_rate": 0.0003396453524955894,
+      "loss": 0.8239,
+      "step": 10604
+    },
+    {
+      "epoch": 0.7377647918188459,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00033947615457142,
+      "loss": 0.8572,
+      "step": 10605
+    },
+    {
+      "epoch": 0.7378343594559811,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0003393069901853225,
+      "loss": 0.905,
+      "step": 10606
+    },
+    {
+      "epoch": 0.7379039270931163,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.00033913785934588556,
+      "loss": 0.6206,
+      "step": 10607
+    },
+    {
+      "epoch": 0.7379734947302515,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0003389687620616976,
+      "loss": 0.7289,
+      "step": 10608
+    },
+    {
+      "epoch": 0.7380430623673867,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0003387996983413436,
+      "loss": 0.8671,
+      "step": 10609
+    },
+    {
+      "epoch": 0.7381126300045219,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0003386306681934086,
+      "loss": 0.6568,
+      "step": 10610
+    },
+    {
+      "epoch": 0.7381821976416572,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.00033846167162647435,
+      "loss": 0.6951,
+      "step": 10611
+    },
+    {
+      "epoch": 0.7382517652787923,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0003382927086491223,
+      "loss": 0.5366,
+      "step": 10612
+    },
+    {
+      "epoch": 0.7383213329159275,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.000338123779269931,
+      "loss": 0.9697,
+      "step": 10613
+    },
+    {
+      "epoch": 0.7383909005530627,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.00033795488349747815,
+      "loss": 0.9811,
+      "step": 10614
+    },
+    {
+      "epoch": 0.738460468190198,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0003377860213403395,
+      "loss": 0.965,
+      "step": 10615
+    },
+    {
+      "epoch": 0.7385300358273331,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00033761719280708905,
+      "loss": 0.7906,
+      "step": 10616
+    },
+    {
+      "epoch": 0.7385996034644683,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0003374483979062989,
+      "loss": 0.8549,
+      "step": 10617
+    },
+    {
+      "epoch": 0.7386691711016036,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00033727963664653915,
+      "loss": 1.0295,
+      "step": 10618
+    },
+    {
+      "epoch": 0.7387387387387387,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0003371109090363792,
+      "loss": 0.6678,
+      "step": 10619
+    },
+    {
+      "epoch": 0.7388083063758739,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0003369422150843863,
+      "loss": 0.8005,
+      "step": 10620
+    },
+    {
+      "epoch": 0.7388778740130092,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.00033677355479912543,
+      "loss": 0.6075,
+      "step": 10621
+    },
+    {
+      "epoch": 0.7389474416501444,
+      "grad_norm": 1.6796875,
+      "learning_rate": 0.0003366049281891608,
+      "loss": 1.0783,
+      "step": 10622
+    },
+    {
+      "epoch": 0.7390170092872795,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0003364363352630538,
+      "loss": 0.8589,
+      "step": 10623
+    },
+    {
+      "epoch": 0.7390865769244148,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00033626777602936556,
+      "loss": 0.8846,
+      "step": 10624
+    },
+    {
+      "epoch": 0.73915614456155,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00033609925049665377,
+      "loss": 0.7845,
+      "step": 10625
+    },
+    {
+      "epoch": 0.7392257121986852,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.000335930758673476,
+      "loss": 0.6639,
+      "step": 10626
+    },
+    {
+      "epoch": 0.7392952798358203,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.000335762300568387,
+      "loss": 0.8502,
+      "step": 10627
+    },
+    {
+      "epoch": 0.7393648474729556,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0003355938761899402,
+      "loss": 0.7402,
+      "step": 10628
+    },
+    {
+      "epoch": 0.7394344151100908,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00033542548554668785,
+      "loss": 0.852,
+      "step": 10629
+    },
+    {
+      "epoch": 0.739503982747226,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0003352571286471797,
+      "loss": 0.8183,
+      "step": 10630
+    },
+    {
+      "epoch": 0.7395735503843612,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.000335088805499964,
+      "loss": 0.8635,
+      "step": 10631
+    },
+    {
+      "epoch": 0.7396431180214964,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00033492051611358665,
+      "loss": 0.9351,
+      "step": 10632
+    },
+    {
+      "epoch": 0.7397126856586316,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00033475226049659403,
+      "loss": 0.9912,
+      "step": 10633
+    },
+    {
+      "epoch": 0.7397822532957669,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0003345840386575284,
+      "loss": 0.6397,
+      "step": 10634
+    },
+    {
+      "epoch": 0.739851820932902,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00033441585060493107,
+      "loss": 0.9123,
+      "step": 10635
+    },
+    {
+      "epoch": 0.7399213885700372,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00033424769634734234,
+      "loss": 0.6297,
+      "step": 10636
+    },
+    {
+      "epoch": 0.7399909562071725,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0003340795758932996,
+      "loss": 0.8262,
+      "step": 10637
+    },
+    {
+      "epoch": 0.7400605238443076,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.00033391148925133996,
+      "loss": 0.6155,
+      "step": 10638
+    },
+    {
+      "epoch": 0.7401300914814428,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0003337434364299972,
+      "loss": 0.7819,
+      "step": 10639
+    },
+    {
+      "epoch": 0.740199659118578,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0003335754174378047,
+      "loss": 0.8932,
+      "step": 10640
+    },
+    {
+      "epoch": 0.7402692267557133,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0003334074322832933,
+      "loss": 0.6714,
+      "step": 10641
+    },
+    {
+      "epoch": 0.7403387943928484,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0003332394809749927,
+      "loss": 0.8553,
+      "step": 10642
+    },
+    {
+      "epoch": 0.7404083620299836,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.00033307156352143063,
+      "loss": 1.0901,
+      "step": 10643
+    },
+    {
+      "epoch": 0.7404779296671189,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0003329036799311331,
+      "loss": 0.6526,
+      "step": 10644
+    },
+    {
+      "epoch": 0.7405474973042541,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0003327358302126241,
+      "loss": 0.9371,
+      "step": 10645
+    },
+    {
+      "epoch": 0.7406170649413892,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0003325680143744262,
+      "loss": 0.6818,
+      "step": 10646
+    },
+    {
+      "epoch": 0.7406866325785245,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0003324002324250609,
+      "loss": 0.7781,
+      "step": 10647
+    },
+    {
+      "epoch": 0.7407562002156597,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0003322324843730468,
+      "loss": 0.6444,
+      "step": 10648
+    },
+    {
+      "epoch": 0.7408257678527949,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00033206477022690084,
+      "loss": 0.6224,
+      "step": 10649
+    },
+    {
+      "epoch": 0.74089533548993,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0003318970899951397,
+      "loss": 0.6883,
+      "step": 10650
+    },
+    {
+      "epoch": 0.7409649031270653,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.00033172944368627653,
+      "loss": 0.8715,
+      "step": 10651
+    },
+    {
+      "epoch": 0.7410344707642005,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0003315618313088241,
+      "loss": 0.7715,
+      "step": 10652
+    },
+    {
+      "epoch": 0.7411040384013357,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0003313942528712924,
+      "loss": 0.659,
+      "step": 10653
+    },
+    {
+      "epoch": 0.7411736060384709,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0003312267083821909,
+      "loss": 0.602,
+      "step": 10654
+    },
+    {
+      "epoch": 0.7412431736756061,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.00033105919785002594,
+      "loss": 1.0,
+      "step": 10655
+    },
+    {
+      "epoch": 0.7413127413127413,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0003308917212833036,
+      "loss": 0.7299,
+      "step": 10656
+    },
+    {
+      "epoch": 0.7413823089498766,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00033072427869052667,
+      "loss": 0.6468,
+      "step": 10657
+    },
+    {
+      "epoch": 0.7414518765870117,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.00033055687008019775,
+      "loss": 0.7774,
+      "step": 10658
+    },
+    {
+      "epoch": 0.7415214442241469,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0003303894954608165,
+      "loss": 0.7469,
+      "step": 10659
+    },
+    {
+      "epoch": 0.7415910118612822,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.00033022215484088157,
+      "loss": 0.9953,
+      "step": 10660
+    },
+    {
+      "epoch": 0.7416605794984173,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00033005484822889,
+      "loss": 0.6639,
+      "step": 10661
+    },
+    {
+      "epoch": 0.7417301471355525,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00032988757563333636,
+      "loss": 0.689,
+      "step": 10662
+    },
+    {
+      "epoch": 0.7417997147726877,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0003297203370627142,
+      "loss": 0.7924,
+      "step": 10663
+    },
+    {
+      "epoch": 0.741869282409823,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0003295531325255141,
+      "loss": 0.845,
+      "step": 10664
+    },
+    {
+      "epoch": 0.7419388500469581,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0003293859620302273,
+      "loss": 0.8241,
+      "step": 10665
+    },
+    {
+      "epoch": 0.7420084176840933,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.00032921882558534113,
+      "loss": 1.2517,
+      "step": 10666
+    },
+    {
+      "epoch": 0.7420779853212286,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00032905172319934174,
+      "loss": 0.7807,
+      "step": 10667
+    },
+    {
+      "epoch": 0.7421475529583638,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00032888465488071437,
+      "loss": 0.6846,
+      "step": 10668
+    },
+    {
+      "epoch": 0.7422171205954989,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.0003287176206379412,
+      "loss": 0.5882,
+      "step": 10669
+    },
+    {
+      "epoch": 0.7422866882326342,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.00032855062047950414,
+      "loss": 0.9032,
+      "step": 10670
+    },
+    {
+      "epoch": 0.7423562558697694,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0003283836544138818,
+      "loss": 0.8563,
+      "step": 10671
+    },
+    {
+      "epoch": 0.7424258235069046,
+      "grad_norm": 1.6484375,
+      "learning_rate": 0.0003282167224495527,
+      "loss": 0.6975,
+      "step": 10672
+    },
+    {
+      "epoch": 0.7424953911440398,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.000328049824594992,
+      "loss": 0.8196,
+      "step": 10673
+    },
+    {
+      "epoch": 0.742564958781175,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0003278829608586743,
+      "loss": 0.8175,
+      "step": 10674
+    },
+    {
+      "epoch": 0.7426345264183102,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0003277161312490725,
+      "loss": 0.72,
+      "step": 10675
+    },
+    {
+      "epoch": 0.7427040940554454,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00032754933577465694,
+      "loss": 0.8003,
+      "step": 10676
+    },
+    {
+      "epoch": 0.7427736616925806,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0003273825744438965,
+      "loss": 0.6127,
+      "step": 10677
+    },
+    {
+      "epoch": 0.7428432293297158,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00032721584726525855,
+      "loss": 0.8807,
+      "step": 10678
+    },
+    {
+      "epoch": 0.742912796966851,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0003270491542472092,
+      "loss": 0.9,
+      "step": 10679
+    },
+    {
+      "epoch": 0.7429823646039863,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0003268824953982119,
+      "loss": 0.75,
+      "step": 10680
+    },
+    {
+      "epoch": 0.7430519322411214,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0003267158707267284,
+      "loss": 0.835,
+      "step": 10681
+    },
+    {
+      "epoch": 0.7431214998782566,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.00032654928024121953,
+      "loss": 0.9836,
+      "step": 10682
+    },
+    {
+      "epoch": 0.7431910675153919,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00032638272395014355,
+      "loss": 0.6932,
+      "step": 10683
+    },
+    {
+      "epoch": 0.743260635152527,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00032621620186195797,
+      "loss": 0.6945,
+      "step": 10684
+    },
+    {
+      "epoch": 0.7433302027896622,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0003260497139851172,
+      "loss": 0.7294,
+      "step": 10685
+    },
+    {
+      "epoch": 0.7433997704267975,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.00032588326032807524,
+      "loss": 0.8689,
+      "step": 10686
+    },
+    {
+      "epoch": 0.7434693380639327,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.00032571684089928324,
+      "loss": 0.7692,
+      "step": 10687
+    },
+    {
+      "epoch": 0.7435389057010678,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00032555045570719135,
+      "loss": 0.8178,
+      "step": 10688
+    },
+    {
+      "epoch": 0.743608473338203,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0003253841047602483,
+      "loss": 0.9112,
+      "step": 10689
+    },
+    {
+      "epoch": 0.7436780409753383,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0003252177880668999,
+      "loss": 0.7473,
+      "step": 10690
+    },
+    {
+      "epoch": 0.7437476086124735,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00032505150563559094,
+      "loss": 0.8301,
+      "step": 10691
+    },
+    {
+      "epoch": 0.7438171762496086,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0003248852574747644,
+      "loss": 0.6411,
+      "step": 10692
+    },
+    {
+      "epoch": 0.7438867438867439,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0003247190435928621,
+      "loss": 0.7741,
+      "step": 10693
+    },
+    {
+      "epoch": 0.7439563115238791,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.00032455286399832295,
+      "loss": 0.8128,
+      "step": 10694
+    },
+    {
+      "epoch": 0.7440258791610143,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0003243867186995847,
+      "loss": 0.757,
+      "step": 10695
+    },
+    {
+      "epoch": 0.7440954467981495,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0003242206077050834,
+      "loss": 0.7755,
+      "step": 10696
+    },
+    {
+      "epoch": 0.7441650144352847,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0003240545310232538,
+      "loss": 0.7966,
+      "step": 10697
+    },
+    {
+      "epoch": 0.7442345820724199,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0003238884886625282,
+      "loss": 0.5772,
+      "step": 10698
+    },
+    {
+      "epoch": 0.7443041497095552,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0003237224806313368,
+      "loss": 0.832,
+      "step": 10699
+    },
+    {
+      "epoch": 0.7443737173466903,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.00032355650693810956,
+      "loss": 1.0497,
+      "step": 10700
+    },
+    {
+      "epoch": 0.7444432849838255,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00032339056759127303,
+      "loss": 0.7867,
+      "step": 10701
+    },
+    {
+      "epoch": 0.7445128526209607,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0003232246625992532,
+      "loss": 0.6914,
+      "step": 10702
+    },
+    {
+      "epoch": 0.744582420258096,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.00032305879197047405,
+      "loss": 0.7868,
+      "step": 10703
+    },
+    {
+      "epoch": 0.7446519878952311,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.00032289295571335744,
+      "loss": 0.9864,
+      "step": 10704
+    },
+    {
+      "epoch": 0.7447215555323663,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0003227271538363232,
+      "loss": 0.74,
+      "step": 10705
+    },
+    {
+      "epoch": 0.7447911231695016,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.00032256138634779053,
+      "loss": 1.1954,
+      "step": 10706
+    },
+    {
+      "epoch": 0.7448606908066367,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0003223956532561765,
+      "loss": 0.7271,
+      "step": 10707
+    },
+    {
+      "epoch": 0.7449302584437719,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00032222995456989567,
+      "loss": 0.7618,
+      "step": 10708
+    },
+    {
+      "epoch": 0.7449998260809072,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0003220642902973613,
+      "loss": 0.9101,
+      "step": 10709
+    },
+    {
+      "epoch": 0.7450693937180424,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0003218986604469851,
+      "loss": 0.808,
+      "step": 10710
+    },
+    {
+      "epoch": 0.7451389613551775,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0003217330650271775,
+      "loss": 0.6827,
+      "step": 10711
+    },
+    {
+      "epoch": 0.7452085289923128,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.00032156750404634604,
+      "loss": 0.7297,
+      "step": 10712
+    },
+    {
+      "epoch": 0.745278096629448,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00032140197751289693,
+      "loss": 0.8698,
+      "step": 10713
+    },
+    {
+      "epoch": 0.7453476642665832,
+      "grad_norm": 1.6015625,
+      "learning_rate": 0.00032123648543523533,
+      "loss": 0.9402,
+      "step": 10714
+    },
+    {
+      "epoch": 0.7454172319037183,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0003210710278217634,
+      "loss": 0.7883,
+      "step": 10715
+    },
+    {
+      "epoch": 0.7454867995408536,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0003209056046808827,
+      "loss": 0.6464,
+      "step": 10716
+    },
+    {
+      "epoch": 0.7455563671779888,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0003207402160209927,
+      "loss": 0.7639,
+      "step": 10717
+    },
+    {
+      "epoch": 0.745625934815124,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0003205748618504909,
+      "loss": 0.8756,
+      "step": 10718
+    },
+    {
+      "epoch": 0.7456955024522592,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00032040954217777274,
+      "loss": 0.7125,
+      "step": 10719
+    },
+    {
+      "epoch": 0.7457650700893944,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.00032024425701123263,
+      "loss": 0.9542,
+      "step": 10720
+    },
+    {
+      "epoch": 0.7458346377265296,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.00032007900635926324,
+      "loss": 0.6046,
+      "step": 10721
+    },
+    {
+      "epoch": 0.7459042053636649,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0003199137902302548,
+      "loss": 0.751,
+      "step": 10722
+    },
+    {
+      "epoch": 0.7459737730008,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.0003197486086325959,
+      "loss": 0.806,
+      "step": 10723
+    },
+    {
+      "epoch": 0.7460433406379352,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.000319583461574674,
+      "loss": 0.7304,
+      "step": 10724
+    },
+    {
+      "epoch": 0.7461129082750705,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.00031941834906487463,
+      "loss": 0.7119,
+      "step": 10725
+    },
+    {
+      "epoch": 0.7461824759122057,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0003192532711115812,
+      "loss": 0.6333,
+      "step": 10726
+    },
+    {
+      "epoch": 0.7462520435493408,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00031908822772317504,
+      "loss": 0.7809,
+      "step": 10727
+    },
+    {
+      "epoch": 0.746321611186476,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00031892321890803654,
+      "loss": 0.8851,
+      "step": 10728
+    },
+    {
+      "epoch": 0.7463911788236113,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0003187582446745446,
+      "loss": 0.9008,
+      "step": 10729
+    },
+    {
+      "epoch": 0.7464607464607464,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0003185933050310749,
+      "loss": 0.6785,
+      "step": 10730
+    },
+    {
+      "epoch": 0.7465303140978816,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0003184283999860029,
+      "loss": 0.8088,
+      "step": 10731
+    },
+    {
+      "epoch": 0.7465998817350169,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0003182635295477014,
+      "loss": 0.7669,
+      "step": 10732
+    },
+    {
+      "epoch": 0.7466694493721521,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00031809869372454136,
+      "loss": 0.6425,
+      "step": 10733
+    },
+    {
+      "epoch": 0.7467390170092872,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0003179338925248926,
+      "loss": 0.7079,
+      "step": 10734
+    },
+    {
+      "epoch": 0.7468085846464225,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.0003177691259571233,
+      "loss": 0.782,
+      "step": 10735
+    },
+    {
+      "epoch": 0.7468781522835577,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00031760439402959896,
+      "loss": 0.6856,
+      "step": 10736
+    },
+    {
+      "epoch": 0.7469477199206929,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0003174396967506837,
+      "loss": 0.6101,
+      "step": 10737
+    },
+    {
+      "epoch": 0.7470172875578281,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00031727503412874025,
+      "loss": 0.9982,
+      "step": 10738
+    },
+    {
+      "epoch": 0.7470868551949633,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00031711040617212973,
+      "loss": 0.8929,
+      "step": 10739
+    },
+    {
+      "epoch": 0.7471564228320985,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.00031694581288921076,
+      "loss": 0.9524,
+      "step": 10740
+    },
+    {
+      "epoch": 0.7472259904692337,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.00031678125428834025,
+      "loss": 0.8278,
+      "step": 10741
+    },
+    {
+      "epoch": 0.7472955581063689,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.000316616730377874,
+      "loss": 0.8842,
+      "step": 10742
+    },
+    {
+      "epoch": 0.7473651257435041,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.000316452241166166,
+      "loss": 0.8872,
+      "step": 10743
+    },
+    {
+      "epoch": 0.7474346933806393,
+      "grad_norm": 1.609375,
+      "learning_rate": 0.00031628778666156776,
+      "loss": 0.7571,
+      "step": 10744
+    },
+    {
+      "epoch": 0.7475042610177746,
+      "grad_norm": 1.578125,
+      "learning_rate": 0.00031612336687242927,
+      "loss": 0.7759,
+      "step": 10745
+    },
+    {
+      "epoch": 0.7475738286549097,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00031595898180709957,
+      "loss": 0.7668,
+      "step": 10746
+    },
+    {
+      "epoch": 0.7476433962920449,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00031579463147392463,
+      "loss": 0.771,
+      "step": 10747
+    },
+    {
+      "epoch": 0.7477129639291802,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.00031563031588124966,
+      "loss": 0.8609,
+      "step": 10748
+    },
+    {
+      "epoch": 0.7477825315663154,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0003154660350374181,
+      "loss": 0.7597,
+      "step": 10749
+    },
+    {
+      "epoch": 0.7478520992034505,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0003153017889507709,
+      "loss": 0.9094,
+      "step": 10750
+    },
+    {
+      "epoch": 0.7479216668405858,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.00031513757762964746,
+      "loss": 0.8527,
+      "step": 10751
+    },
+    {
+      "epoch": 0.747991234477721,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0003149734010823858,
+      "loss": 0.6744,
+      "step": 10752
+    },
+    {
+      "epoch": 0.7480608021148561,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00031480925931732254,
+      "loss": 0.8425,
+      "step": 10753
+    },
+    {
+      "epoch": 0.7481303697519913,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0003146451523427912,
+      "loss": 0.8637,
+      "step": 10754
+    },
+    {
+      "epoch": 0.7481999373891266,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0003144810801671245,
+      "loss": 0.7141,
+      "step": 10755
+    },
+    {
+      "epoch": 0.7482695050262618,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0003143170427986531,
+      "loss": 0.932,
+      "step": 10756
+    },
+    {
+      "epoch": 0.7483390726633969,
+      "grad_norm": 0.72265625,
+      "learning_rate": 0.0003141530402457067,
+      "loss": 0.571,
+      "step": 10757
+    },
+    {
+      "epoch": 0.7484086403005322,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0003139890725166118,
+      "loss": 1.0473,
+      "step": 10758
+    },
+    {
+      "epoch": 0.7484782079376674,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.00031382513961969384,
+      "loss": 0.5942,
+      "step": 10759
+    },
+    {
+      "epoch": 0.7485477755748026,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00031366124156327667,
+      "loss": 0.7604,
+      "step": 10760
+    },
+    {
+      "epoch": 0.7486173432119378,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0003134973783556825,
+      "loss": 0.8232,
+      "step": 10761
+    },
+    {
+      "epoch": 0.748686910849073,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.000313333550005231,
+      "loss": 0.9719,
+      "step": 10762
+    },
+    {
+      "epoch": 0.7487564784862082,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00031316975652024106,
+      "loss": 0.8764,
+      "step": 10763
+    },
+    {
+      "epoch": 0.7488260461233435,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.00031300599790902905,
+      "loss": 0.6826,
+      "step": 10764
+    },
+    {
+      "epoch": 0.7488956137604786,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0003128422741799094,
+      "loss": 0.7145,
+      "step": 10765
+    },
+    {
+      "epoch": 0.7489651813976138,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.00031267858534119553,
+      "loss": 0.9347,
+      "step": 10766
+    },
+    {
+      "epoch": 0.749034749034749,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.000312514931401199,
+      "loss": 0.6208,
+      "step": 10767
+    },
+    {
+      "epoch": 0.7491043166718843,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0003123513123682292,
+      "loss": 0.7783,
+      "step": 10768
+    },
+    {
+      "epoch": 0.7491738843090194,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.00031218772825059336,
+      "loss": 0.8459,
+      "step": 10769
+    },
+    {
+      "epoch": 0.7492434519461546,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0003120241790565979,
+      "loss": 0.7067,
+      "step": 10770
+    },
+    {
+      "epoch": 0.7493130195832899,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.0003118606647945472,
+      "loss": 0.7017,
+      "step": 10771
+    },
+    {
+      "epoch": 0.7493825872204251,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0003116971854727435,
+      "loss": 0.7706,
+      "step": 10772
+    },
+    {
+      "epoch": 0.7494521548575602,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0003115337410994872,
+      "loss": 0.7049,
+      "step": 10773
+    },
+    {
+      "epoch": 0.7495217224946955,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00031137033168307727,
+      "loss": 0.8327,
+      "step": 10774
+    },
+    {
+      "epoch": 0.7495912901318307,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00031120695723181125,
+      "loss": 0.5652,
+      "step": 10775
+    },
+    {
+      "epoch": 0.7496608577689658,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0003110436177539839,
+      "loss": 0.9325,
+      "step": 10776
+    },
+    {
+      "epoch": 0.7497304254061011,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00031088031325788944,
+      "loss": 0.7637,
+      "step": 10777
+    },
+    {
+      "epoch": 0.7497999930432363,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0003107170437518192,
+      "loss": 0.5993,
+      "step": 10778
+    },
+    {
+      "epoch": 0.7498695606803715,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00031055380924406285,
+      "loss": 1.0458,
+      "step": 10779
+    },
+    {
+      "epoch": 0.7499391283175066,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.0003103906097429091,
+      "loss": 0.6709,
+      "step": 10780
+    },
+    {
+      "epoch": 0.7500086959546419,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0003102274452566445,
+      "loss": 0.5583,
+      "step": 10781
+    },
+    {
+      "epoch": 0.7500782635917771,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.00031006431579355367,
+      "loss": 0.8395,
+      "step": 10782
+    },
+    {
+      "epoch": 0.7501478312289123,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0003099012213619189,
+      "loss": 0.8713,
+      "step": 10783
+    },
+    {
+      "epoch": 0.7502173988660475,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0003097381619700218,
+      "loss": 0.7511,
+      "step": 10784
+    },
+    {
+      "epoch": 0.7502869665031827,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.00030957513762614196,
+      "loss": 0.6935,
+      "step": 10785
+    },
+    {
+      "epoch": 0.7503565341403179,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.0003094121483385567,
+      "loss": 0.4937,
+      "step": 10786
+    },
+    {
+      "epoch": 0.7504261017774532,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0003092491941155413,
+      "loss": 0.9161,
+      "step": 10787
+    },
+    {
+      "epoch": 0.7504956694145883,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0003090862749653702,
+      "loss": 1.0645,
+      "step": 10788
+    },
+    {
+      "epoch": 0.7505652370517235,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00030892339089631603,
+      "loss": 0.5419,
+      "step": 10789
+    },
+    {
+      "epoch": 0.7506348046888588,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.0003087605419166484,
+      "loss": 0.7095,
+      "step": 10790
+    },
+    {
+      "epoch": 0.750704372325994,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0003085977280346366,
+      "loss": 0.9747,
+      "step": 10791
+    },
+    {
+      "epoch": 0.7507739399631291,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0003084349492585473,
+      "loss": 1.0787,
+      "step": 10792
+    },
+    {
+      "epoch": 0.7508435076002643,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00030827220559664524,
+      "loss": 0.7317,
+      "step": 10793
+    },
+    {
+      "epoch": 0.7509130752373996,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00030810949705719395,
+      "loss": 0.7491,
+      "step": 10794
+    },
+    {
+      "epoch": 0.7509826428745348,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0003079468236484554,
+      "loss": 0.8051,
+      "step": 10795
+    },
+    {
+      "epoch": 0.7510522105116699,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00030778418537868893,
+      "loss": 0.6746,
+      "step": 10796
+    },
+    {
+      "epoch": 0.7511217781488052,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0003076215822561521,
+      "loss": 0.8839,
+      "step": 10797
+    },
+    {
+      "epoch": 0.7511913457859404,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0003074590142891015,
+      "loss": 0.8582,
+      "step": 10798
+    },
+    {
+      "epoch": 0.7512609134230755,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0003072964814857918,
+      "loss": 1.0314,
+      "step": 10799
+    },
+    {
+      "epoch": 0.7513304810602108,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.00030713398385447534,
+      "loss": 0.5999,
+      "step": 10800
+    },
+    {
+      "epoch": 0.751400048697346,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00030697152140340256,
+      "loss": 0.761,
+      "step": 10801
+    },
+    {
+      "epoch": 0.7514696163344812,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0003068090941408228,
+      "loss": 0.906,
+      "step": 10802
+    },
+    {
+      "epoch": 0.7515391839716165,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0003066467020749836,
+      "loss": 0.7689,
+      "step": 10803
+    },
+    {
+      "epoch": 0.7516087516087516,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00030648434521412984,
+      "loss": 0.8183,
+      "step": 10804
+    },
+    {
+      "epoch": 0.7516783192458868,
+      "grad_norm": 1.609375,
+      "learning_rate": 0.0003063220235665056,
+      "loss": 0.7437,
+      "step": 10805
+    },
+    {
+      "epoch": 0.751747886883022,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.0003061597371403525,
+      "loss": 0.4423,
+      "step": 10806
+    },
+    {
+      "epoch": 0.7518174545201572,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00030599748594391094,
+      "loss": 0.5525,
+      "step": 10807
+    },
+    {
+      "epoch": 0.7518870221572924,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.00030583526998541875,
+      "loss": 1.2907,
+      "step": 10808
+    },
+    {
+      "epoch": 0.7519565897944276,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.000305673089273113,
+      "loss": 0.757,
+      "step": 10809
+    },
+    {
+      "epoch": 0.7520261574315629,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.00030551094381522806,
+      "loss": 0.7258,
+      "step": 10810
+    },
+    {
+      "epoch": 0.752095725068698,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00030534883361999664,
+      "loss": 0.8237,
+      "step": 10811
+    },
+    {
+      "epoch": 0.7521652927058332,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0003051867586956502,
+      "loss": 0.5544,
+      "step": 10812
+    },
+    {
+      "epoch": 0.7522348603429685,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00030502471905041815,
+      "loss": 0.7884,
+      "step": 10813
+    },
+    {
+      "epoch": 0.7523044279801037,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0003048627146925281,
+      "loss": 0.4507,
+      "step": 10814
+    },
+    {
+      "epoch": 0.7523739956172388,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00030470074563020534,
+      "loss": 0.5509,
+      "step": 10815
+    },
+    {
+      "epoch": 0.7524435632543741,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0003045388118716741,
+      "loss": 0.9134,
+      "step": 10816
+    },
+    {
+      "epoch": 0.7525131308915093,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.00030437691342515694,
+      "loss": 0.5909,
+      "step": 10817
+    },
+    {
+      "epoch": 0.7525826985286445,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0003042150502988739,
+      "loss": 0.9357,
+      "step": 10818
+    },
+    {
+      "epoch": 0.7526522661657796,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0003040532225010433,
+      "loss": 0.7893,
+      "step": 10819
+    },
+    {
+      "epoch": 0.7527218338029149,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.00030389143003988216,
+      "loss": 0.9734,
+      "step": 10820
+    },
+    {
+      "epoch": 0.7527914014400501,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00030372967292360587,
+      "loss": 0.8374,
+      "step": 10821
+    },
+    {
+      "epoch": 0.7528609690771852,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00030356795116042714,
+      "loss": 0.8235,
+      "step": 10822
+    },
+    {
+      "epoch": 0.7529305367143205,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00030340626475855784,
+      "loss": 1.0226,
+      "step": 10823
+    },
+    {
+      "epoch": 0.7530001043514557,
+      "grad_norm": 1.6328125,
+      "learning_rate": 0.00030324461372620726,
+      "loss": 0.727,
+      "step": 10824
+    },
+    {
+      "epoch": 0.7530696719885909,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.000303082998071583,
+      "loss": 0.7792,
+      "step": 10825
+    },
+    {
+      "epoch": 0.7531392396257262,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0003029214178028914,
+      "loss": 0.5346,
+      "step": 10826
+    },
+    {
+      "epoch": 0.7532088072628613,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.000302759872928337,
+      "loss": 0.797,
+      "step": 10827
+    },
+    {
+      "epoch": 0.7532783748999965,
+      "grad_norm": 1.609375,
+      "learning_rate": 0.0003025983634561218,
+      "loss": 1.0819,
+      "step": 10828
+    },
+    {
+      "epoch": 0.7533479425371318,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0003024368893944462,
+      "loss": 0.6368,
+      "step": 10829
+    },
+    {
+      "epoch": 0.753417510174267,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.00030227545075150954,
+      "loss": 1.2654,
+      "step": 10830
+    },
+    {
+      "epoch": 0.7534870778114021,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.000302114047535509,
+      "loss": 0.7587,
+      "step": 10831
+    },
+    {
+      "epoch": 0.7535566454485373,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0003019526797546395,
+      "loss": 0.5841,
+      "step": 10832
+    },
+    {
+      "epoch": 0.7536262130856726,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.00030179134741709405,
+      "loss": 0.8582,
+      "step": 10833
+    },
+    {
+      "epoch": 0.7536957807228077,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00030163005053106484,
+      "loss": 0.6291,
+      "step": 10834
+    },
+    {
+      "epoch": 0.7537653483599429,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00030146878910474194,
+      "loss": 0.6024,
+      "step": 10835
+    },
+    {
+      "epoch": 0.7538349159970782,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0003013075631463128,
+      "loss": 0.5758,
+      "step": 10836
+    },
+    {
+      "epoch": 0.7539044836342134,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00030114637266396416,
+      "loss": 0.7231,
+      "step": 10837
+    },
+    {
+      "epoch": 0.7539740512713485,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.00030098521766587993,
+      "loss": 0.8559,
+      "step": 10838
+    },
+    {
+      "epoch": 0.7540436189084838,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0003008240981602435,
+      "loss": 0.8338,
+      "step": 10839
+    },
+    {
+      "epoch": 0.754113186545619,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.00030066301415523477,
+      "loss": 0.732,
+      "step": 10840
+    },
+    {
+      "epoch": 0.7541827541827542,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.00030050196565903364,
+      "loss": 0.814,
+      "step": 10841
+    },
+    {
+      "epoch": 0.7542523218198894,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.000300340952679817,
+      "loss": 0.8623,
+      "step": 10842
+    },
+    {
+      "epoch": 0.7543218894570246,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.00030017997522575993,
+      "loss": 0.7784,
+      "step": 10843
+    },
+    {
+      "epoch": 0.7543914570941598,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0003000190333050363,
+      "loss": 0.8666,
+      "step": 10844
+    },
+    {
+      "epoch": 0.754461024731295,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0002998581269258183,
+      "loss": 0.8736,
+      "step": 10845
+    },
+    {
+      "epoch": 0.7545305923684302,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0002996972560962757,
+      "loss": 0.7556,
+      "step": 10846
+    },
+    {
+      "epoch": 0.7546001600055654,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.00029953642082457634,
+      "loss": 0.9385,
+      "step": 10847
+    },
+    {
+      "epoch": 0.7546697276427006,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.00029937562111888685,
+      "loss": 0.6932,
+      "step": 10848
+    },
+    {
+      "epoch": 0.7547392952798359,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0002992148569873723,
+      "loss": 0.9168,
+      "step": 10849
+    },
+    {
+      "epoch": 0.754808862916971,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0002990541284381947,
+      "loss": 0.799,
+      "step": 10850
+    },
+    {
+      "epoch": 0.7548784305541062,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.00029889343547951584,
+      "loss": 0.957,
+      "step": 10851
+    },
+    {
+      "epoch": 0.7549479981912415,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0002987327781194942,
+      "loss": 0.8664,
+      "step": 10852
+    },
+    {
+      "epoch": 0.7550175658283766,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00029857215636628763,
+      "loss": 1.0017,
+      "step": 10853
+    },
+    {
+      "epoch": 0.7550871334655118,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.0002984115702280512,
+      "loss": 0.7072,
+      "step": 10854
+    },
+    {
+      "epoch": 0.7551567011026471,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0002982510197129393,
+      "loss": 0.7619,
+      "step": 10855
+    },
+    {
+      "epoch": 0.7552262687397823,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0002980905048291036,
+      "loss": 0.7234,
+      "step": 10856
+    },
+    {
+      "epoch": 0.7552958363769174,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0002979300255846935,
+      "loss": 0.6323,
+      "step": 10857
+    },
+    {
+      "epoch": 0.7553654040140526,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.00029776958198785865,
+      "loss": 0.7847,
+      "step": 10858
+    },
+    {
+      "epoch": 0.7554349716511879,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0002976091740467449,
+      "loss": 0.8902,
+      "step": 10859
+    },
+    {
+      "epoch": 0.7555045392883231,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00029744880176949706,
+      "loss": 0.934,
+      "step": 10860
+    },
+    {
+      "epoch": 0.7555741069254582,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0002972884651642576,
+      "loss": 0.6618,
+      "step": 10861
+    },
+    {
+      "epoch": 0.7556436745625935,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0002971281642391679,
+      "loss": 0.9202,
+      "step": 10862
+    },
+    {
+      "epoch": 0.7557132421997287,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.00029696789900236754,
+      "loss": 0.8987,
+      "step": 10863
+    },
+    {
+      "epoch": 0.7557828098368639,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00029680766946199355,
+      "loss": 0.7069,
+      "step": 10864
+    },
+    {
+      "epoch": 0.7558523774739991,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.000296647475626182,
+      "loss": 0.8637,
+      "step": 10865
+    },
+    {
+      "epoch": 0.7559219451111343,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0002964873175030661,
+      "loss": 0.8416,
+      "step": 10866
+    },
+    {
+      "epoch": 0.7559915127482695,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.00029632719510077867,
+      "loss": 0.6538,
+      "step": 10867
+    },
+    {
+      "epoch": 0.7560610803854048,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0002961671084274492,
+      "loss": 0.639,
+      "step": 10868
+    },
+    {
+      "epoch": 0.7561306480225399,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0002960070574912066,
+      "loss": 0.6999,
+      "step": 10869
+    },
+    {
+      "epoch": 0.7562002156596751,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.000295847042300177,
+      "loss": 0.8679,
+      "step": 10870
+    },
+    {
+      "epoch": 0.7562697832968103,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0002956870628624854,
+      "loss": 0.6761,
+      "step": 10871
+    },
+    {
+      "epoch": 0.7563393509339456,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00029552711918625496,
+      "loss": 0.7618,
+      "step": 10872
+    },
+    {
+      "epoch": 0.7564089185710807,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.00029536721127960676,
+      "loss": 0.9856,
+      "step": 10873
+    },
+    {
+      "epoch": 0.7564784862082159,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0002952073391506598,
+      "loss": 0.6998,
+      "step": 10874
+    },
+    {
+      "epoch": 0.7565480538453512,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00029504750280753145,
+      "loss": 0.8895,
+      "step": 10875
+    },
+    {
+      "epoch": 0.7566176214824863,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0002948877022583378,
+      "loss": 0.6585,
+      "step": 10876
+    },
+    {
+      "epoch": 0.7566871891196215,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00029472793751119286,
+      "loss": 0.5516,
+      "step": 10877
+    },
+    {
+      "epoch": 0.7567567567567568,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0002945682085742081,
+      "loss": 0.8468,
+      "step": 10878
+    },
+    {
+      "epoch": 0.756826324393892,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0002944085154554943,
+      "loss": 0.9927,
+      "step": 10879
+    },
+    {
+      "epoch": 0.7568958920310271,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0002942488581631594,
+      "loss": 0.776,
+      "step": 10880
+    },
+    {
+      "epoch": 0.7569654596681624,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0002940892367053105,
+      "loss": 0.7683,
+      "step": 10881
+    },
+    {
+      "epoch": 0.7570350273052976,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0002939296510900519,
+      "loss": 0.6813,
+      "step": 10882
+    },
+    {
+      "epoch": 0.7571045949424328,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00029377010132548696,
+      "loss": 0.6979,
+      "step": 10883
+    },
+    {
+      "epoch": 0.7571741625795679,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00029361058741971636,
+      "loss": 0.9223,
+      "step": 10884
+    },
+    {
+      "epoch": 0.7572437302167032,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00029345110938083964,
+      "loss": 0.5794,
+      "step": 10885
+    },
+    {
+      "epoch": 0.7573132978538384,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00029329166721695464,
+      "loss": 0.7523,
+      "step": 10886
+    },
+    {
+      "epoch": 0.7573828654909736,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0002931322609361567,
+      "loss": 0.7918,
+      "step": 10887
+    },
+    {
+      "epoch": 0.7574524331281088,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00029297289054653974,
+      "loss": 0.9891,
+      "step": 10888
+    },
+    {
+      "epoch": 0.757522000765244,
+      "grad_norm": 1.5546875,
+      "learning_rate": 0.00029281355605619496,
+      "loss": 0.9407,
+      "step": 10889
+    },
+    {
+      "epoch": 0.7575915684023792,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0002926542574732141,
+      "loss": 0.7065,
+      "step": 10890
+    },
+    {
+      "epoch": 0.7576611360395145,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00029249499480568463,
+      "loss": 0.7619,
+      "step": 10891
+    },
+    {
+      "epoch": 0.7577307036766496,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00029233576806169325,
+      "loss": 0.7175,
+      "step": 10892
+    },
+    {
+      "epoch": 0.7578002713137848,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00029217657724932446,
+      "loss": 0.7108,
+      "step": 10893
+    },
+    {
+      "epoch": 0.7578698389509201,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0002920174223766613,
+      "loss": 0.7984,
+      "step": 10894
+    },
+    {
+      "epoch": 0.7579394065880553,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0002918583034517852,
+      "loss": 0.7702,
+      "step": 10895
+    },
+    {
+      "epoch": 0.7580089742251904,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.00029169922048277486,
+      "loss": 0.8288,
+      "step": 10896
+    },
+    {
+      "epoch": 0.7580785418623256,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00029154017347770845,
+      "loss": 0.7305,
+      "step": 10897
+    },
+    {
+      "epoch": 0.7581481094994609,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0002913811624446606,
+      "loss": 0.7453,
+      "step": 10898
+    },
+    {
+      "epoch": 0.758217677136596,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.00029122218739170615,
+      "loss": 0.5697,
+      "step": 10899
+    },
+    {
+      "epoch": 0.7582872447737312,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0002910632483269161,
+      "loss": 0.5748,
+      "step": 10900
+    },
+    {
+      "epoch": 0.7583568124108665,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00029090434525836127,
+      "loss": 0.6577,
+      "step": 10901
+    },
+    {
+      "epoch": 0.7584263800480017,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00029074547819410944,
+      "loss": 0.596,
+      "step": 10902
+    },
+    {
+      "epoch": 0.7584959476851368,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00029058664714222724,
+      "loss": 0.7455,
+      "step": 10903
+    },
+    {
+      "epoch": 0.7585655153222721,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.00029042785211077983,
+      "loss": 0.8898,
+      "step": 10904
+    },
+    {
+      "epoch": 0.7586350829594073,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00029026909310782945,
+      "loss": 0.8803,
+      "step": 10905
+    },
+    {
+      "epoch": 0.7587046505965425,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.00029011037014143725,
+      "loss": 0.8165,
+      "step": 10906
+    },
+    {
+      "epoch": 0.7587742182336777,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00028995168321966215,
+      "loss": 0.5155,
+      "step": 10907
+    },
+    {
+      "epoch": 0.7588437858708129,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0002897930323505615,
+      "loss": 1.0407,
+      "step": 10908
+    },
+    {
+      "epoch": 0.7589133535079481,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00028963441754219135,
+      "loss": 0.9402,
+      "step": 10909
+    },
+    {
+      "epoch": 0.7589829211450833,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00028947583880260466,
+      "loss": 0.8225,
+      "step": 10910
+    },
+    {
+      "epoch": 0.7590524887822185,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00028931729613985394,
+      "loss": 0.7145,
+      "step": 10911
+    },
+    {
+      "epoch": 0.7591220564193537,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.00028915878956198835,
+      "loss": 0.733,
+      "step": 10912
+    },
+    {
+      "epoch": 0.7591916240564889,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0002890003190770569,
+      "loss": 0.8173,
+      "step": 10913
+    },
+    {
+      "epoch": 0.7592611916936242,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.00028884188469310525,
+      "loss": 0.8284,
+      "step": 10914
+    },
+    {
+      "epoch": 0.7593307593307593,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.00028868348641817855,
+      "loss": 0.7452,
+      "step": 10915
+    },
+    {
+      "epoch": 0.7594003269678945,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.00028852512426031876,
+      "loss": 0.4362,
+      "step": 10916
+    },
+    {
+      "epoch": 0.7594698946050298,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0002883667982275671,
+      "loss": 0.8837,
+      "step": 10917
+    },
+    {
+      "epoch": 0.759539462242165,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.00028820850832796276,
+      "loss": 0.9928,
+      "step": 10918
+    },
+    {
+      "epoch": 0.7596090298793001,
+      "grad_norm": 0.7421875,
+      "learning_rate": 0.00028805025456954256,
+      "loss": 0.6872,
+      "step": 10919
+    },
+    {
+      "epoch": 0.7596785975164354,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.00028789203696034216,
+      "loss": 1.023,
+      "step": 10920
+    },
+    {
+      "epoch": 0.7597481651535706,
+      "grad_norm": 1.59375,
+      "learning_rate": 0.00028773385550839414,
+      "loss": 0.7345,
+      "step": 10921
+    },
+    {
+      "epoch": 0.7598177327907057,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00028757571022173145,
+      "loss": 0.8374,
+      "step": 10922
+    },
+    {
+      "epoch": 0.7598873004278409,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.00028741760110838333,
+      "loss": 0.7209,
+      "step": 10923
+    },
+    {
+      "epoch": 0.7599568680649762,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00028725952817637747,
+      "loss": 0.69,
+      "step": 10924
+    },
+    {
+      "epoch": 0.7600264357021114,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.00028710149143374055,
+      "loss": 0.9986,
+      "step": 10925
+    },
+    {
+      "epoch": 0.7600960033392465,
+      "grad_norm": 0.79296875,
+      "learning_rate": 0.00028694349088849625,
+      "loss": 0.4994,
+      "step": 10926
+    },
+    {
+      "epoch": 0.7601655709763818,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00028678552654866785,
+      "loss": 0.8706,
+      "step": 10927
+    },
+    {
+      "epoch": 0.760235138613517,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00028662759842227513,
+      "loss": 0.6004,
+      "step": 10928
+    },
+    {
+      "epoch": 0.7603047062506522,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0002864697065173377,
+      "loss": 0.878,
+      "step": 10929
+    },
+    {
+      "epoch": 0.7603742738877874,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0002863118508418717,
+      "loss": 0.7029,
+      "step": 10930
+    },
+    {
+      "epoch": 0.7604438415249226,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0002861540314038927,
+      "loss": 0.9266,
+      "step": 10931
+    },
+    {
+      "epoch": 0.7605134091620578,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.00028599624821141437,
+      "loss": 0.8503,
+      "step": 10932
+    },
+    {
+      "epoch": 0.7605829767991931,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0002858385012724476,
+      "loss": 0.6257,
+      "step": 10933
+    },
+    {
+      "epoch": 0.7606525444363282,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00028568079059500175,
+      "loss": 0.7435,
+      "step": 10934
+    },
+    {
+      "epoch": 0.7607221120734634,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.00028552311618708495,
+      "loss": 0.8444,
+      "step": 10935
+    },
+    {
+      "epoch": 0.7607916797105986,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0002853654780567034,
+      "loss": 0.7446,
+      "step": 10936
+    },
+    {
+      "epoch": 0.7608612473477339,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0002852078762118608,
+      "loss": 0.8777,
+      "step": 10937
+    },
+    {
+      "epoch": 0.760930814984869,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0002850503106605592,
+      "loss": 0.7718,
+      "step": 10938
+    },
+    {
+      "epoch": 0.7610003826220042,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0002848927814107994,
+      "loss": 0.6905,
+      "step": 10939
+    },
+    {
+      "epoch": 0.7610699502591395,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0002847352884705796,
+      "loss": 0.9064,
+      "step": 10940
+    },
+    {
+      "epoch": 0.7611395178962747,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0002845778318478969,
+      "loss": 0.6481,
+      "step": 10941
+    },
+    {
+      "epoch": 0.7612090855334098,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0002844204115507456,
+      "loss": 0.7602,
+      "step": 10942
+    },
+    {
+      "epoch": 0.7612786531705451,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0002842630275871193,
+      "loss": 0.7536,
+      "step": 10943
+    },
+    {
+      "epoch": 0.7613482208076803,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00028410567996500855,
+      "loss": 0.9728,
+      "step": 10944
+    },
+    {
+      "epoch": 0.7614177884448154,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.000283948368692403,
+      "loss": 0.5581,
+      "step": 10945
+    },
+    {
+      "epoch": 0.7614873560819507,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0002837910937772905,
+      "loss": 0.9369,
+      "step": 10946
+    },
+    {
+      "epoch": 0.7615569237190859,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00028363385522765615,
+      "loss": 0.7163,
+      "step": 10947
+    },
+    {
+      "epoch": 0.7616264913562211,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.0002834766530514837,
+      "loss": 0.9173,
+      "step": 10948
+    },
+    {
+      "epoch": 0.7616960589933562,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00028331948725675526,
+      "loss": 0.9139,
+      "step": 10949
+    },
+    {
+      "epoch": 0.7617656266304915,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.00028316235785145116,
+      "loss": 0.8441,
+      "step": 10950
+    },
+    {
+      "epoch": 0.7618351942676267,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0002830052648435495,
+      "loss": 0.9639,
+      "step": 10951
+    },
+    {
+      "epoch": 0.7619047619047619,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0002828482082410262,
+      "loss": 0.6816,
+      "step": 10952
+    },
+    {
+      "epoch": 0.7619743295418971,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0002826911880518561,
+      "loss": 0.7788,
+      "step": 10953
+    },
+    {
+      "epoch": 0.7620438971790323,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0002825342042840123,
+      "loss": 0.6936,
+      "step": 10954
+    },
+    {
+      "epoch": 0.7621134648161675,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00028237725694546544,
+      "loss": 0.7557,
+      "step": 10955
+    },
+    {
+      "epoch": 0.7621830324533028,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.000282220346044184,
+      "loss": 0.7433,
+      "step": 10956
+    },
+    {
+      "epoch": 0.7622526000904379,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.0002820634715881358,
+      "loss": 0.6299,
+      "step": 10957
+    },
+    {
+      "epoch": 0.7623221677275731,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0002819066335852856,
+      "loss": 0.8281,
+      "step": 10958
+    },
+    {
+      "epoch": 0.7623917353647084,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0002817498320435969,
+      "loss": 0.4377,
+      "step": 10959
+    },
+    {
+      "epoch": 0.7624613030018436,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0002815930669710319,
+      "loss": 0.9633,
+      "step": 10960
+    },
+    {
+      "epoch": 0.7625308706389787,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.00028143633837555005,
+      "loss": 0.9027,
+      "step": 10961
+    },
+    {
+      "epoch": 0.7626004382761139,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0002812796462651087,
+      "loss": 0.7701,
+      "step": 10962
+    },
+    {
+      "epoch": 0.7626700059132492,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00028112299064766424,
+      "loss": 0.7056,
+      "step": 10963
+    },
+    {
+      "epoch": 0.7627395735503844,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00028096637153117123,
+      "loss": 0.8179,
+      "step": 10964
+    },
+    {
+      "epoch": 0.7628091411875195,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00028080978892358176,
+      "loss": 0.9911,
+      "step": 10965
+    },
+    {
+      "epoch": 0.7628787088246548,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.00028065324283284586,
+      "loss": 0.755,
+      "step": 10966
+    },
+    {
+      "epoch": 0.76294827646179,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0002804967332669125,
+      "loss": 0.6879,
+      "step": 10967
+    },
+    {
+      "epoch": 0.7630178440989251,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.00028034026023372873,
+      "loss": 0.8447,
+      "step": 10968
+    },
+    {
+      "epoch": 0.7630874117360604,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0002801838237412393,
+      "loss": 0.6667,
+      "step": 10969
+    },
+    {
+      "epoch": 0.7631569793731956,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00028002742379738674,
+      "loss": 0.6053,
+      "step": 10970
+    },
+    {
+      "epoch": 0.7632265470103308,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.000279871060410113,
+      "loss": 0.9168,
+      "step": 10971
+    },
+    {
+      "epoch": 0.763296114647466,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0002797147335873569,
+      "loss": 0.7791,
+      "step": 10972
+    },
+    {
+      "epoch": 0.7633656822846012,
+      "grad_norm": 1.5625,
+      "learning_rate": 0.00027955844333705626,
+      "loss": 1.2102,
+      "step": 10973
+    },
+    {
+      "epoch": 0.7634352499217364,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.00027940218966714635,
+      "loss": 0.7311,
+      "step": 10974
+    },
+    {
+      "epoch": 0.7635048175588716,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0002792459725855615,
+      "loss": 0.6264,
+      "step": 10975
+    },
+    {
+      "epoch": 0.7635743851960068,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.000279089792100233,
+      "loss": 0.868,
+      "step": 10976
+    },
+    {
+      "epoch": 0.763643952833142,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0002789336482190912,
+      "loss": 0.7842,
+      "step": 10977
+    },
+    {
+      "epoch": 0.7637135204702772,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0002787775409500645,
+      "loss": 0.8368,
+      "step": 10978
+    },
+    {
+      "epoch": 0.7637830881074125,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.0002786214703010791,
+      "loss": 0.9723,
+      "step": 10979
+    },
+    {
+      "epoch": 0.7638526557445476,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00027846543628005916,
+      "loss": 0.7424,
+      "step": 10980
+    },
+    {
+      "epoch": 0.7639222233816828,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0002783094388949274,
+      "loss": 0.6343,
+      "step": 10981
+    },
+    {
+      "epoch": 0.7639917910188181,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.00027815347815360526,
+      "loss": 0.7459,
+      "step": 10982
+    },
+    {
+      "epoch": 0.7640613586559533,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0002779975540640111,
+      "loss": 0.7888,
+      "step": 10983
+    },
+    {
+      "epoch": 0.7641309262930884,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0002778416666340615,
+      "loss": 0.8225,
+      "step": 10984
+    },
+    {
+      "epoch": 0.7642004939302237,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0002776858158716723,
+      "loss": 0.8207,
+      "step": 10985
+    },
+    {
+      "epoch": 0.7642700615673589,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.00027753000178475687,
+      "loss": 0.9062,
+      "step": 10986
+    },
+    {
+      "epoch": 0.764339629204494,
+      "grad_norm": 1.25,
+      "learning_rate": 0.00027737422438122637,
+      "loss": 0.902,
+      "step": 10987
+    },
+    {
+      "epoch": 0.7644091968416292,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00027721848366899025,
+      "loss": 0.9645,
+      "step": 10988
+    },
+    {
+      "epoch": 0.7644787644787645,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0002770627796559567,
+      "loss": 1.067,
+      "step": 10989
+    },
+    {
+      "epoch": 0.7645483321158997,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.000276907112350031,
+      "loss": 0.7567,
+      "step": 10990
+    },
+    {
+      "epoch": 0.7646178997530348,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00027675148175911746,
+      "loss": 0.6529,
+      "step": 10991
+    },
+    {
+      "epoch": 0.7646874673901701,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0002765958878911187,
+      "loss": 0.9196,
+      "step": 10992
+    },
+    {
+      "epoch": 0.7647570350273053,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00027644033075393436,
+      "loss": 0.6483,
+      "step": 10993
+    },
+    {
+      "epoch": 0.7648266026644405,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0002762848103554627,
+      "loss": 0.7832,
+      "step": 10994
+    },
+    {
+      "epoch": 0.7648961703015758,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0002761293267036007,
+      "loss": 0.7285,
+      "step": 10995
+    },
+    {
+      "epoch": 0.7649657379387109,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0002759738798062431,
+      "loss": 1.087,
+      "step": 10996
+    },
+    {
+      "epoch": 0.7650353055758461,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.00027581846967128255,
+      "loss": 0.7953,
+      "step": 10997
+    },
+    {
+      "epoch": 0.7651048732129814,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.0002756630963066097,
+      "loss": 0.9286,
+      "step": 10998
+    },
+    {
+      "epoch": 0.7651744408501165,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0002755077597201139,
+      "loss": 0.7367,
+      "step": 10999
+    },
+    {
+      "epoch": 0.7652440084872517,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.0002753524599196826,
+      "loss": 0.8037,
+      "step": 11000
+    },
+    {
+      "epoch": 0.7653135761243869,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0002751971969132009,
+      "loss": 0.7853,
+      "step": 11001
+    },
+    {
+      "epoch": 0.7653831437615222,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.00027504197070855196,
+      "loss": 0.8712,
+      "step": 11002
+    },
+    {
+      "epoch": 0.7654527113986573,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.000274886781313618,
+      "loss": 0.7074,
+      "step": 11003
+    },
+    {
+      "epoch": 0.7655222790357925,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.0002747316287362782,
+      "loss": 0.8645,
+      "step": 11004
+    },
+    {
+      "epoch": 0.7655918466729278,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00027457651298441055,
+      "loss": 0.7833,
+      "step": 11005
+    },
+    {
+      "epoch": 0.765661414310063,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0002744214340658916,
+      "loss": 0.788,
+      "step": 11006
+    },
+    {
+      "epoch": 0.7657309819471981,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0002742663919885949,
+      "loss": 1.0066,
+      "step": 11007
+    },
+    {
+      "epoch": 0.7658005495843334,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0002741113867603927,
+      "loss": 0.7631,
+      "step": 11008
+    },
+    {
+      "epoch": 0.7658701172214686,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0002739564183891554,
+      "loss": 0.7219,
+      "step": 11009
+    },
+    {
+      "epoch": 0.7659396848586038,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0002738014868827521,
+      "loss": 0.7114,
+      "step": 11010
+    },
+    {
+      "epoch": 0.766009252495739,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00027364659224904885,
+      "loss": 0.7598,
+      "step": 11011
+    },
+    {
+      "epoch": 0.7660788201328742,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0002734917344959103,
+      "loss": 0.8081,
+      "step": 11012
+    },
+    {
+      "epoch": 0.7661483877700094,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0002733369136311995,
+      "loss": 0.6663,
+      "step": 11013
+    },
+    {
+      "epoch": 0.7662179554071445,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.000273182129662778,
+      "loss": 0.8881,
+      "step": 11014
+    },
+    {
+      "epoch": 0.7662875230442798,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00027302738259850443,
+      "loss": 0.8484,
+      "step": 11015
+    },
+    {
+      "epoch": 0.766357090681415,
+      "grad_norm": 1.5234375,
+      "learning_rate": 0.0002728726724462359,
+      "loss": 0.8909,
+      "step": 11016
+    },
+    {
+      "epoch": 0.7664266583185502,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00027271799921382844,
+      "loss": 0.5954,
+      "step": 11017
+    },
+    {
+      "epoch": 0.7664962259556855,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00027256336290913484,
+      "loss": 0.7347,
+      "step": 11018
+    },
+    {
+      "epoch": 0.7665657935928206,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0002724087635400071,
+      "loss": 0.8684,
+      "step": 11019
+    },
+    {
+      "epoch": 0.7666353612299558,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.00027225420111429534,
+      "loss": 0.8686,
+      "step": 11020
+    },
+    {
+      "epoch": 0.7667049288670911,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.00027209967563984717,
+      "loss": 0.963,
+      "step": 11021
+    },
+    {
+      "epoch": 0.7667744965042262,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.0002719451871245082,
+      "loss": 0.7989,
+      "step": 11022
+    },
+    {
+      "epoch": 0.7668440641413614,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.000271790735576123,
+      "loss": 0.6866,
+      "step": 11023
+    },
+    {
+      "epoch": 0.7669136317784967,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0002716363210025341,
+      "loss": 0.8577,
+      "step": 11024
+    },
+    {
+      "epoch": 0.7669831994156319,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.0002714819434115816,
+      "loss": 0.5434,
+      "step": 11025
+    },
+    {
+      "epoch": 0.767052767052767,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0002713276028111037,
+      "loss": 0.8487,
+      "step": 11026
+    },
+    {
+      "epoch": 0.7671223346899022,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0002711732992089374,
+      "loss": 0.8545,
+      "step": 11027
+    },
+    {
+      "epoch": 0.7671919023270375,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00027101903261291763,
+      "loss": 0.9335,
+      "step": 11028
+    },
+    {
+      "epoch": 0.7672614699641727,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00027086480303087715,
+      "loss": 0.6288,
+      "step": 11029
+    },
+    {
+      "epoch": 0.7673310376013078,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0002707106104706464,
+      "loss": 0.8595,
+      "step": 11030
+    },
+    {
+      "epoch": 0.7674006052384431,
+      "grad_norm": 3.515625,
+      "learning_rate": 0.0002705564549400551,
+      "loss": 0.6332,
+      "step": 11031
+    },
+    {
+      "epoch": 0.7674701728755783,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0002704023364469306,
+      "loss": 0.8574,
+      "step": 11032
+    },
+    {
+      "epoch": 0.7675397405127135,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0002702482549990977,
+      "loss": 0.6316,
+      "step": 11033
+    },
+    {
+      "epoch": 0.7676093081498487,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0002700942106043804,
+      "loss": 0.6021,
+      "step": 11034
+    },
+    {
+      "epoch": 0.7676788757869839,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0002699402032706003,
+      "loss": 0.5737,
+      "step": 11035
+    },
+    {
+      "epoch": 0.7677484434241191,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.00026978623300557647,
+      "loss": 0.9823,
+      "step": 11036
+    },
+    {
+      "epoch": 0.7678180110612544,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00026963229981712724,
+      "loss": 0.9057,
+      "step": 11037
+    },
+    {
+      "epoch": 0.7678875786983895,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00026947840371306875,
+      "loss": 0.6561,
+      "step": 11038
+    },
+    {
+      "epoch": 0.7679571463355247,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.00026932454470121484,
+      "loss": 0.8715,
+      "step": 11039
+    },
+    {
+      "epoch": 0.7680267139726599,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0002691707227893774,
+      "loss": 0.7328,
+      "step": 11040
+    },
+    {
+      "epoch": 0.7680962816097952,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00026901693798536686,
+      "loss": 0.7496,
+      "step": 11041
+    },
+    {
+      "epoch": 0.7681658492469303,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.00026886319029699224,
+      "loss": 0.6002,
+      "step": 11042
+    },
+    {
+      "epoch": 0.7682354168840655,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00026870947973205953,
+      "loss": 0.7732,
+      "step": 11043
+    },
+    {
+      "epoch": 0.7683049845212008,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0002685558062983732,
+      "loss": 0.8224,
+      "step": 11044
+    },
+    {
+      "epoch": 0.7683745521583359,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00026840217000373624,
+      "loss": 0.8444,
+      "step": 11045
+    },
+    {
+      "epoch": 0.7684441197954711,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00026824857085594987,
+      "loss": 0.7547,
+      "step": 11046
+    },
+    {
+      "epoch": 0.7685136874326064,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0002680950088628128,
+      "loss": 0.6605,
+      "step": 11047
+    },
+    {
+      "epoch": 0.7685832550697416,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.00026794148403212184,
+      "loss": 0.6999,
+      "step": 11048
+    },
+    {
+      "epoch": 0.7686528227068767,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.00026778799637167274,
+      "loss": 0.7683,
+      "step": 11049
+    },
+    {
+      "epoch": 0.768722390344012,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.00026763454588925816,
+      "loss": 0.9527,
+      "step": 11050
+    },
+    {
+      "epoch": 0.7687919579811472,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.00026748113259267005,
+      "loss": 0.7597,
+      "step": 11051
+    },
+    {
+      "epoch": 0.7688615256182824,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0002673277564896982,
+      "loss": 0.6541,
+      "step": 11052
+    },
+    {
+      "epoch": 0.7689310932554175,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0002671744175881299,
+      "loss": 0.8399,
+      "step": 11053
+    },
+    {
+      "epoch": 0.7690006608925528,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0002670211158957506,
+      "loss": 0.832,
+      "step": 11054
+    },
+    {
+      "epoch": 0.769070228529688,
+      "grad_norm": 1.59375,
+      "learning_rate": 0.00026686785142034455,
+      "loss": 0.9521,
+      "step": 11055
+    },
+    {
+      "epoch": 0.7691397961668232,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00026671462416969416,
+      "loss": 0.8878,
+      "step": 11056
+    },
+    {
+      "epoch": 0.7692093638039584,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00026656143415157896,
+      "loss": 0.7894,
+      "step": 11057
+    },
+    {
+      "epoch": 0.7692789314410936,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00026640828137377713,
+      "loss": 0.8664,
+      "step": 11058
+    },
+    {
+      "epoch": 0.7693484990782288,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.00026625516584406517,
+      "loss": 0.9427,
+      "step": 11059
+    },
+    {
+      "epoch": 0.7694180667153641,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00026610208757021784,
+      "loss": 0.7319,
+      "step": 11060
+    },
+    {
+      "epoch": 0.7694876343524992,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0002659490465600074,
+      "loss": 0.7949,
+      "step": 11061
+    },
+    {
+      "epoch": 0.7695572019896344,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.00026579604282120416,
+      "loss": 0.9307,
+      "step": 11062
+    },
+    {
+      "epoch": 0.7696267696267697,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00026564307636157725,
+      "loss": 0.8738,
+      "step": 11063
+    },
+    {
+      "epoch": 0.7696963372639049,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00026549014718889373,
+      "loss": 0.7137,
+      "step": 11064
+    },
+    {
+      "epoch": 0.76976590490104,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0002653372553109181,
+      "loss": 0.8228,
+      "step": 11065
+    },
+    {
+      "epoch": 0.7698354725381752,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.00026518440073541394,
+      "loss": 0.87,
+      "step": 11066
+    },
+    {
+      "epoch": 0.7699050401753105,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0002650315834701421,
+      "loss": 0.8271,
+      "step": 11067
+    },
+    {
+      "epoch": 0.7699746078124456,
+      "grad_norm": 0.875,
+      "learning_rate": 0.00026487880352286177,
+      "loss": 0.5988,
+      "step": 11068
+    },
+    {
+      "epoch": 0.7700441754495808,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0002647260609013303,
+      "loss": 0.8066,
+      "step": 11069
+    },
+    {
+      "epoch": 0.7701137430867161,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0002645733556133039,
+      "loss": 0.7901,
+      "step": 11070
+    },
+    {
+      "epoch": 0.7701833107238513,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0002644206876665356,
+      "loss": 0.7836,
+      "step": 11071
+    },
+    {
+      "epoch": 0.7702528783609864,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00026426805706877685,
+      "loss": 0.9634,
+      "step": 11072
+    },
+    {
+      "epoch": 0.7703224459981217,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.00026411546382777793,
+      "loss": 1.0034,
+      "step": 11073
+    },
+    {
+      "epoch": 0.7703920136352569,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00026396290795128687,
+      "loss": 0.6641,
+      "step": 11074
+    },
+    {
+      "epoch": 0.7704615812723921,
+      "grad_norm": 0.765625,
+      "learning_rate": 0.0002638103894470494,
+      "loss": 0.7064,
+      "step": 11075
+    },
+    {
+      "epoch": 0.7705311489095273,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0002636579083228093,
+      "loss": 0.6517,
+      "step": 11076
+    },
+    {
+      "epoch": 0.7706007165466625,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0002635054645863093,
+      "loss": 0.7051,
+      "step": 11077
+    },
+    {
+      "epoch": 0.7706702841837977,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.00026335305824528985,
+      "loss": 1.0038,
+      "step": 11078
+    },
+    {
+      "epoch": 0.7707398518209329,
+      "grad_norm": 1.5625,
+      "learning_rate": 0.00026320068930748896,
+      "loss": 0.7383,
+      "step": 11079
+    },
+    {
+      "epoch": 0.7708094194580681,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0002630483577806435,
+      "loss": 0.8704,
+      "step": 11080
+    },
+    {
+      "epoch": 0.7708789870952033,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.00026289606367248784,
+      "loss": 0.6069,
+      "step": 11081
+    },
+    {
+      "epoch": 0.7709485547323385,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0002627438069907546,
+      "loss": 0.6408,
+      "step": 11082
+    },
+    {
+      "epoch": 0.7710181223694738,
+      "grad_norm": 0.8125,
+      "learning_rate": 0.00026259158774317483,
+      "loss": 0.6818,
+      "step": 11083
+    },
+    {
+      "epoch": 0.7710876900066089,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00026243940593747764,
+      "loss": 0.9507,
+      "step": 11084
+    },
+    {
+      "epoch": 0.7711572576437441,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00026228726158138984,
+      "loss": 0.6779,
+      "step": 11085
+    },
+    {
+      "epoch": 0.7712268252808794,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00026213515468263626,
+      "loss": 0.6792,
+      "step": 11086
+    },
+    {
+      "epoch": 0.7712963929180146,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0002619830852489404,
+      "loss": 0.8555,
+      "step": 11087
+    },
+    {
+      "epoch": 0.7713659605551497,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.000261831053288024,
+      "loss": 0.8009,
+      "step": 11088
+    },
+    {
+      "epoch": 0.771435528192285,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.000261679058807606,
+      "loss": 0.861,
+      "step": 11089
+    },
+    {
+      "epoch": 0.7715050958294202,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0002615271018154036,
+      "loss": 0.8647,
+      "step": 11090
+    },
+    {
+      "epoch": 0.7715746634665553,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0002613751823191328,
+      "loss": 0.8819,
+      "step": 11091
+    },
+    {
+      "epoch": 0.7716442311036905,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00026122330032650774,
+      "loss": 0.6314,
+      "step": 11092
+    },
+    {
+      "epoch": 0.7717137987408258,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.0002610714558452394,
+      "loss": 0.7506,
+      "step": 11093
+    },
+    {
+      "epoch": 0.771783366377961,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0002609196488830383,
+      "loss": 0.6546,
+      "step": 11094
+    },
+    {
+      "epoch": 0.7718529340150961,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0002607678794476119,
+      "loss": 1.1312,
+      "step": 11095
+    },
+    {
+      "epoch": 0.7719225016522314,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00026061614754666697,
+      "loss": 0.766,
+      "step": 11096
+    },
+    {
+      "epoch": 0.7719920692893666,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.0002604644531879069,
+      "loss": 0.8864,
+      "step": 11097
+    },
+    {
+      "epoch": 0.7720616369265018,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0002603127963790347,
+      "loss": 0.7961,
+      "step": 11098
+    },
+    {
+      "epoch": 0.772131204563637,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0002601611771277505,
+      "loss": 0.6875,
+      "step": 11099
+    },
+    {
+      "epoch": 0.7722007722007722,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0002600095954417522,
+      "loss": 0.8267,
+      "step": 11100
+    },
+    {
+      "epoch": 0.7722703398379074,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00025985805132873685,
+      "loss": 0.9563,
+      "step": 11101
+    },
+    {
+      "epoch": 0.7723399074750427,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0002597065447963993,
+      "loss": 0.7816,
+      "step": 11102
+    },
+    {
+      "epoch": 0.7724094751121778,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0002595550758524322,
+      "loss": 0.72,
+      "step": 11103
+    },
+    {
+      "epoch": 0.772479042749313,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0002594036445045258,
+      "loss": 0.9023,
+      "step": 11104
+    },
+    {
+      "epoch": 0.7725486103864482,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0002592522507603695,
+      "loss": 0.8009,
+      "step": 11105
+    },
+    {
+      "epoch": 0.7726181780235835,
+      "grad_norm": 3.078125,
+      "learning_rate": 0.0002591008946276506,
+      "loss": 0.6459,
+      "step": 11106
+    },
+    {
+      "epoch": 0.7726877456607186,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00025894957611405356,
+      "loss": 0.6435,
+      "step": 11107
+    },
+    {
+      "epoch": 0.7727573132978538,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00025879829522726215,
+      "loss": 0.8786,
+      "step": 11108
+    },
+    {
+      "epoch": 0.7728268809349891,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.0002586470519749571,
+      "loss": 0.8265,
+      "step": 11109
+    },
+    {
+      "epoch": 0.7728964485721243,
+      "grad_norm": 1.25,
+      "learning_rate": 0.00025849584636481826,
+      "loss": 1.0323,
+      "step": 11110
+    },
+    {
+      "epoch": 0.7729660162092594,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0002583446784045227,
+      "loss": 0.7944,
+      "step": 11111
+    },
+    {
+      "epoch": 0.7730355838463947,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.00025819354810174643,
+      "loss": 0.4472,
+      "step": 11112
+    },
+    {
+      "epoch": 0.7731051514835299,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00025804245546416274,
+      "loss": 0.9385,
+      "step": 11113
+    },
+    {
+      "epoch": 0.773174719120665,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0002578914004994429,
+      "loss": 1.0097,
+      "step": 11114
+    },
+    {
+      "epoch": 0.7732442867578003,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0002577403832152578,
+      "loss": 0.9413,
+      "step": 11115
+    },
+    {
+      "epoch": 0.7733138543949355,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00025758940361927474,
+      "loss": 0.7039,
+      "step": 11116
+    },
+    {
+      "epoch": 0.7733834220320707,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.00025743846171915973,
+      "loss": 0.913,
+      "step": 11117
+    },
+    {
+      "epoch": 0.7734529896692058,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0002572875575225766,
+      "loss": 0.6947,
+      "step": 11118
+    },
+    {
+      "epoch": 0.7735225573063411,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00025713669103718774,
+      "loss": 0.8442,
+      "step": 11119
+    },
+    {
+      "epoch": 0.7735921249434763,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0002569858622706537,
+      "loss": 0.8794,
+      "step": 11120
+    },
+    {
+      "epoch": 0.7736616925806115,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0002568350712306322,
+      "loss": 0.9511,
+      "step": 11121
+    },
+    {
+      "epoch": 0.7737312602177467,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00025668431792478033,
+      "loss": 0.8524,
+      "step": 11122
+    },
+    {
+      "epoch": 0.7738008278548819,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00025653360236075186,
+      "loss": 0.7128,
+      "step": 11123
+    },
+    {
+      "epoch": 0.7738703954920171,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.00025638292454619995,
+      "loss": 1.1436,
+      "step": 11124
+    },
+    {
+      "epoch": 0.7739399631291524,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0002562322844887748,
+      "loss": 0.7928,
+      "step": 11125
+    },
+    {
+      "epoch": 0.7740095307662875,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.0002560816821961256,
+      "loss": 0.5738,
+      "step": 11126
+    },
+    {
+      "epoch": 0.7740790984034227,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0002559311176758986,
+      "loss": 0.8763,
+      "step": 11127
+    },
+    {
+      "epoch": 0.774148666040558,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.00025578059093573946,
+      "loss": 0.836,
+      "step": 11128
+    },
+    {
+      "epoch": 0.7742182336776932,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0002556301019832905,
+      "loss": 0.962,
+      "step": 11129
+    },
+    {
+      "epoch": 0.7742878013148283,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0002554796508261933,
+      "loss": 0.8005,
+      "step": 11130
+    },
+    {
+      "epoch": 0.7743573689519635,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0002553292374720868,
+      "loss": 0.7667,
+      "step": 11131
+    },
+    {
+      "epoch": 0.7744269365890988,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.00025517886192860786,
+      "loss": 0.7471,
+      "step": 11132
+    },
+    {
+      "epoch": 0.774496504226234,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0002550285242033922,
+      "loss": 0.9119,
+      "step": 11133
+    },
+    {
+      "epoch": 0.7745660718633691,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00025487822430407336,
+      "loss": 0.709,
+      "step": 11134
+    },
+    {
+      "epoch": 0.7746356395005044,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.00025472796223828265,
+      "loss": 0.9223,
+      "step": 11135
+    },
+    {
+      "epoch": 0.7747052071376396,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00025457773801364935,
+      "loss": 0.8418,
+      "step": 11136
+    },
+    {
+      "epoch": 0.7747747747747747,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0002544275516378012,
+      "loss": 0.933,
+      "step": 11137
+    },
+    {
+      "epoch": 0.77484434241191,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00025427740311836434,
+      "loss": 0.7165,
+      "step": 11138
+    },
+    {
+      "epoch": 0.7749139100490452,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00025412729246296193,
+      "loss": 0.6515,
+      "step": 11139
+    },
+    {
+      "epoch": 0.7749834776861804,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0002539772196792164,
+      "loss": 0.8536,
+      "step": 11140
+    },
+    {
+      "epoch": 0.7750530453233156,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0002538271847747472,
+      "loss": 0.9593,
+      "step": 11141
+    },
+    {
+      "epoch": 0.7751226129604508,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.00025367718775717277,
+      "loss": 0.6178,
+      "step": 11142
+    },
+    {
+      "epoch": 0.775192180597586,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.0002535272286341087,
+      "loss": 1.0278,
+      "step": 11143
+    },
+    {
+      "epoch": 0.7752617482347212,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0002533773074131699,
+      "loss": 0.6262,
+      "step": 11144
+    },
+    {
+      "epoch": 0.7753313158718564,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0002532274241019681,
+      "loss": 0.9228,
+      "step": 11145
+    },
+    {
+      "epoch": 0.7754008835089916,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.000253077578708113,
+      "loss": 0.7004,
+      "step": 11146
+    },
+    {
+      "epoch": 0.7754704511461268,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.0002529277712392144,
+      "loss": 1.1014,
+      "step": 11147
+    },
+    {
+      "epoch": 0.7755400187832621,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.000252778001702878,
+      "loss": 0.6228,
+      "step": 11148
+    },
+    {
+      "epoch": 0.7756095864203972,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0002526282701067084,
+      "loss": 0.7733,
+      "step": 11149
+    },
+    {
+      "epoch": 0.7756791540575324,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.00025247857645830784,
+      "loss": 0.9438,
+      "step": 11150
+    },
+    {
+      "epoch": 0.7757487216946677,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00025232892076527746,
+      "loss": 0.8475,
+      "step": 11151
+    },
+    {
+      "epoch": 0.7758182893318029,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0002521793030352163,
+      "loss": 0.9927,
+      "step": 11152
+    },
+    {
+      "epoch": 0.775887856968938,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0002520297232757205,
+      "loss": 0.6633,
+      "step": 11153
+    },
+    {
+      "epoch": 0.7759574246060733,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0002518801814943855,
+      "loss": 0.8391,
+      "step": 11154
+    },
+    {
+      "epoch": 0.7760269922432085,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00025173067769880384,
+      "loss": 0.7216,
+      "step": 11155
+    },
+    {
+      "epoch": 0.7760965598803437,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.00025158121189656715,
+      "loss": 0.9152,
+      "step": 11156
+    },
+    {
+      "epoch": 0.7761661275174788,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0002514317840952639,
+      "loss": 0.7242,
+      "step": 11157
+    },
+    {
+      "epoch": 0.7762356951546141,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0002512823943024819,
+      "loss": 0.9426,
+      "step": 11158
+    },
+    {
+      "epoch": 0.7763052627917493,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0002511330425258057,
+      "loss": 0.9526,
+      "step": 11159
+    },
+    {
+      "epoch": 0.7763748304288844,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00025098372877281914,
+      "loss": 0.7022,
+      "step": 11160
+    },
+    {
+      "epoch": 0.7764443980660197,
+      "grad_norm": 2.15625,
+      "learning_rate": 0.00025083445305110387,
+      "loss": 0.53,
+      "step": 11161
+    },
+    {
+      "epoch": 0.7765139657031549,
+      "grad_norm": 1.25,
+      "learning_rate": 0.00025068521536823887,
+      "loss": 0.7777,
+      "step": 11162
+    },
+    {
+      "epoch": 0.7765835333402901,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.00025053601573180186,
+      "loss": 0.9637,
+      "step": 11163
+    },
+    {
+      "epoch": 0.7766531009774253,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.000250386854149368,
+      "loss": 0.8286,
+      "step": 11164
+    },
+    {
+      "epoch": 0.7767226686145605,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0002502377306285115,
+      "loss": 0.919,
+      "step": 11165
+    },
+    {
+      "epoch": 0.7767922362516957,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00025008864517680416,
+      "loss": 0.8048,
+      "step": 11166
+    },
+    {
+      "epoch": 0.776861803888831,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0002499395978018153,
+      "loss": 0.8079,
+      "step": 11167
+    },
+    {
+      "epoch": 0.7769313715259661,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0002497905885111135,
+      "loss": 0.8159,
+      "step": 11168
+    },
+    {
+      "epoch": 0.7770009391631013,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00024964161731226374,
+      "loss": 0.6467,
+      "step": 11169
+    },
+    {
+      "epoch": 0.7770705068002365,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0002494926842128311,
+      "loss": 0.8966,
+      "step": 11170
+    },
+    {
+      "epoch": 0.7771400744373718,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00024934378922037673,
+      "loss": 0.8229,
+      "step": 11171
+    },
+    {
+      "epoch": 0.7772096420745069,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.00024919493234246137,
+      "loss": 0.7604,
+      "step": 11172
+    },
+    {
+      "epoch": 0.7772792097116421,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00024904611358664286,
+      "loss": 0.7434,
+      "step": 11173
+    },
+    {
+      "epoch": 0.7773487773487774,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.0002488973329604774,
+      "loss": 0.9076,
+      "step": 11174
+    },
+    {
+      "epoch": 0.7774183449859126,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0002487485904715201,
+      "loss": 0.6852,
+      "step": 11175
+    },
+    {
+      "epoch": 0.7774879126230477,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0002485998861273226,
+      "loss": 0.9038,
+      "step": 11176
+    },
+    {
+      "epoch": 0.777557480260183,
+      "grad_norm": 1.625,
+      "learning_rate": 0.00024845121993543565,
+      "loss": 0.9135,
+      "step": 11177
+    },
+    {
+      "epoch": 0.7776270478973182,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.000248302591903407,
+      "loss": 0.7006,
+      "step": 11178
+    },
+    {
+      "epoch": 0.7776966155344534,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.00024815400203878445,
+      "loss": 0.6716,
+      "step": 11179
+    },
+    {
+      "epoch": 0.7777661831715886,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.00024800545034911226,
+      "loss": 0.8515,
+      "step": 11180
+    },
+    {
+      "epoch": 0.7778357508087238,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00024785693684193256,
+      "loss": 0.8551,
+      "step": 11181
+    },
+    {
+      "epoch": 0.777905318445859,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0002477084615247868,
+      "loss": 1.0675,
+      "step": 11182
+    },
+    {
+      "epoch": 0.7779748860829941,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0002475600244052133,
+      "loss": 0.8185,
+      "step": 11183
+    },
+    {
+      "epoch": 0.7780444537201294,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0002474116254907495,
+      "loss": 0.7336,
+      "step": 11184
+    },
+    {
+      "epoch": 0.7781140213572646,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.00024726326478892956,
+      "loss": 0.8614,
+      "step": 11185
+    },
+    {
+      "epoch": 0.7781835889943998,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.00024711494230728737,
+      "loss": 0.8187,
+      "step": 11186
+    },
+    {
+      "epoch": 0.778253156631535,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.0002469666580533534,
+      "loss": 0.6623,
+      "step": 11187
+    },
+    {
+      "epoch": 0.7783227242686702,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0002468184120346568,
+      "loss": 0.7252,
+      "step": 11188
+    },
+    {
+      "epoch": 0.7783922919058054,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0002466702042587253,
+      "loss": 0.6136,
+      "step": 11189
+    },
+    {
+      "epoch": 0.7784618595429407,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.00024652203473308375,
+      "loss": 1.0125,
+      "step": 11190
+    },
+    {
+      "epoch": 0.7785314271800758,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.00024637390346525544,
+      "loss": 0.8981,
+      "step": 11191
+    },
+    {
+      "epoch": 0.778600994817211,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.0002462258104627612,
+      "loss": 1.1634,
+      "step": 11192
+    },
+    {
+      "epoch": 0.7786705624543463,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0002460777557331215,
+      "loss": 0.8263,
+      "step": 11193
+    },
+    {
+      "epoch": 0.7787401300914815,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0002459297392838534,
+      "loss": 0.7874,
+      "step": 11194
+    },
+    {
+      "epoch": 0.7788096977286166,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0002457817611224721,
+      "loss": 0.9767,
+      "step": 11195
+    },
+    {
+      "epoch": 0.7788792653657518,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00024563382125649167,
+      "loss": 0.7474,
+      "step": 11196
+    },
+    {
+      "epoch": 0.7789488330028871,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00024548591969342313,
+      "loss": 0.8021,
+      "step": 11197
+    },
+    {
+      "epoch": 0.7790184006400223,
+      "grad_norm": 1.125,
+      "learning_rate": 0.000245338056440777,
+      "loss": 0.8787,
+      "step": 11198
+    },
+    {
+      "epoch": 0.7790879682771574,
+      "grad_norm": 1.7265625,
+      "learning_rate": 0.00024519023150606026,
+      "loss": 0.7636,
+      "step": 11199
+    },
+    {
+      "epoch": 0.7791575359142927,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0002450424448967793,
+      "loss": 0.6921,
+      "step": 11200
+    },
+    {
+      "epoch": 0.7792271035514279,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0002448946966204374,
+      "loss": 0.548,
+      "step": 11201
+    },
+    {
+      "epoch": 0.779296671188563,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0002447469866845371,
+      "loss": 1.0118,
+      "step": 11202
+    },
+    {
+      "epoch": 0.7793662388256983,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00024459931509657776,
+      "loss": 0.8319,
+      "step": 11203
+    },
+    {
+      "epoch": 0.7794358064628335,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00024445168186405797,
+      "loss": 0.794,
+      "step": 11204
+    },
+    {
+      "epoch": 0.7795053740999687,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.00024430408699447324,
+      "loss": 0.6468,
+      "step": 11205
+    },
+    {
+      "epoch": 0.779574941737104,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00024415653049531807,
+      "loss": 0.6303,
+      "step": 11206
+    },
+    {
+      "epoch": 0.7796445093742391,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0002440090123740848,
+      "loss": 0.7152,
+      "step": 11207
+    },
+    {
+      "epoch": 0.7797140770113743,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00024386153263826339,
+      "loss": 0.8969,
+      "step": 11208
+    },
+    {
+      "epoch": 0.7797836446485095,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.00024371409129534205,
+      "loss": 0.9184,
+      "step": 11209
+    },
+    {
+      "epoch": 0.7798532122856447,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0002435666883528067,
+      "loss": 0.9784,
+      "step": 11210
+    },
+    {
+      "epoch": 0.7799227799227799,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0002434193238181428,
+      "loss": 0.7509,
+      "step": 11211
+    },
+    {
+      "epoch": 0.7799923475599151,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.00024327199769883222,
+      "loss": 0.6518,
+      "step": 11212
+    },
+    {
+      "epoch": 0.7800619151970504,
+      "grad_norm": 1.25,
+      "learning_rate": 0.00024312471000235503,
+      "loss": 0.8756,
+      "step": 11213
+    },
+    {
+      "epoch": 0.7801314828341855,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00024297746073619043,
+      "loss": 1.0232,
+      "step": 11214
+    },
+    {
+      "epoch": 0.7802010504713207,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00024283024990781444,
+      "loss": 0.7334,
+      "step": 11215
+    },
+    {
+      "epoch": 0.780270618108456,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0002426830775247022,
+      "loss": 0.8859,
+      "step": 11216
+    },
+    {
+      "epoch": 0.7803401857455912,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00024253594359432585,
+      "loss": 0.7879,
+      "step": 11217
+    },
+    {
+      "epoch": 0.7804097533827263,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.00024238884812415674,
+      "loss": 0.9066,
+      "step": 11218
+    },
+    {
+      "epoch": 0.7804793210198615,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0002422417911216629,
+      "loss": 0.7788,
+      "step": 11219
+    },
+    {
+      "epoch": 0.7805488886569968,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00024209477259431157,
+      "loss": 0.5782,
+      "step": 11220
+    },
+    {
+      "epoch": 0.780618456294132,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00024194779254956778,
+      "loss": 0.6061,
+      "step": 11221
+    },
+    {
+      "epoch": 0.7806880239312671,
+      "grad_norm": 1.5625,
+      "learning_rate": 0.00024180085099489423,
+      "loss": 1.0929,
+      "step": 11222
+    },
+    {
+      "epoch": 0.7807575915684024,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.00024165394793775196,
+      "loss": 0.5921,
+      "step": 11223
+    },
+    {
+      "epoch": 0.7808271592055376,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.00024150708338559922,
+      "loss": 0.6992,
+      "step": 11224
+    },
+    {
+      "epoch": 0.7808967268426728,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00024136025734589428,
+      "loss": 0.7722,
+      "step": 11225
+    },
+    {
+      "epoch": 0.780966294479808,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0002412134698260916,
+      "loss": 0.8283,
+      "step": 11226
+    },
+    {
+      "epoch": 0.7810358621169432,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00024106672083364412,
+      "loss": 0.8134,
+      "step": 11227
+    },
+    {
+      "epoch": 0.7811054297540784,
+      "grad_norm": 1.5546875,
+      "learning_rate": 0.00024092001037600354,
+      "loss": 0.8106,
+      "step": 11228
+    },
+    {
+      "epoch": 0.7811749973912137,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.00024077333846061856,
+      "loss": 0.5998,
+      "step": 11229
+    },
+    {
+      "epoch": 0.7812445650283488,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0002406267050949369,
+      "loss": 0.7085,
+      "step": 11230
+    },
+    {
+      "epoch": 0.781314132665484,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00024048011028640328,
+      "loss": 0.9104,
+      "step": 11231
+    },
+    {
+      "epoch": 0.7813837003026192,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00024033355404246172,
+      "loss": 0.8226,
+      "step": 11232
+    },
+    {
+      "epoch": 0.7814532679397544,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.00024018703637055305,
+      "loss": 0.809,
+      "step": 11233
+    },
+    {
+      "epoch": 0.7815228355768896,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0002400405572781168,
+      "loss": 0.7225,
+      "step": 11234
+    },
+    {
+      "epoch": 0.7815924032140248,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.000239894116772591,
+      "loss": 0.7899,
+      "step": 11235
+    },
+    {
+      "epoch": 0.7816619708511601,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00023974771486141066,
+      "loss": 0.8799,
+      "step": 11236
+    },
+    {
+      "epoch": 0.7817315384882952,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00023960135155200914,
+      "loss": 0.8214,
+      "step": 11237
+    },
+    {
+      "epoch": 0.7818011061254304,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0002394550268518183,
+      "loss": 0.7401,
+      "step": 11238
+    },
+    {
+      "epoch": 0.7818706737625657,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.00023930874076826802,
+      "loss": 0.823,
+      "step": 11239
+    },
+    {
+      "epoch": 0.7819402413997009,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.00023916249330878581,
+      "loss": 0.5478,
+      "step": 11240
+    },
+    {
+      "epoch": 0.782009809036836,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.00023901628448079693,
+      "loss": 0.804,
+      "step": 11241
+    },
+    {
+      "epoch": 0.7820793766739713,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00023887011429172568,
+      "loss": 0.8168,
+      "step": 11242
+    },
+    {
+      "epoch": 0.7821489443111065,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00023872398274899344,
+      "loss": 0.8006,
+      "step": 11243
+    },
+    {
+      "epoch": 0.7822185119482417,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.0002385778898600206,
+      "loss": 0.5458,
+      "step": 11244
+    },
+    {
+      "epoch": 0.7822880795853768,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00023843183563222425,
+      "loss": 0.9043,
+      "step": 11245
+    },
+    {
+      "epoch": 0.7823576472225121,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00023828582007302102,
+      "loss": 0.9319,
+      "step": 11246
+    },
+    {
+      "epoch": 0.7824272148596473,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00023813984318982428,
+      "loss": 0.6755,
+      "step": 11247
+    },
+    {
+      "epoch": 0.7824967824967825,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00023799390499004626,
+      "loss": 0.7779,
+      "step": 11248
+    },
+    {
+      "epoch": 0.7825663501339177,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0002378480054810972,
+      "loss": 0.7601,
+      "step": 11249
+    },
+    {
+      "epoch": 0.7826359177710529,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00023770214467038487,
+      "loss": 0.6702,
+      "step": 11250
+    },
+    {
+      "epoch": 0.7827054854081881,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00023755632256531513,
+      "loss": 0.6623,
+      "step": 11251
+    },
+    {
+      "epoch": 0.7827750530453234,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00023741053917329224,
+      "loss": 0.868,
+      "step": 11252
+    },
+    {
+      "epoch": 0.7828446206824585,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00023726479450171878,
+      "loss": 0.6187,
+      "step": 11253
+    },
+    {
+      "epoch": 0.7829141883195937,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0002371190885579946,
+      "loss": 1.0516,
+      "step": 11254
+    },
+    {
+      "epoch": 0.782983755956729,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.0002369734213495176,
+      "loss": 1.0133,
+      "step": 11255
+    },
+    {
+      "epoch": 0.7830533235938641,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00023682779288368438,
+      "loss": 0.8297,
+      "step": 11256
+    },
+    {
+      "epoch": 0.7831228912309993,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.00023668220316788935,
+      "loss": 0.5686,
+      "step": 11257
+    },
+    {
+      "epoch": 0.7831924588681345,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0002365366522095247,
+      "loss": 0.82,
+      "step": 11258
+    },
+    {
+      "epoch": 0.7832620265052698,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00023639114001598038,
+      "loss": 0.6162,
+      "step": 11259
+    },
+    {
+      "epoch": 0.7833315941424049,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.00023624566659464542,
+      "loss": 0.7459,
+      "step": 11260
+    },
+    {
+      "epoch": 0.7834011617795401,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00023610023195290563,
+      "loss": 0.6846,
+      "step": 11261
+    },
+    {
+      "epoch": 0.7834707294166754,
+      "grad_norm": 1.515625,
+      "learning_rate": 0.0002359548360981457,
+      "loss": 0.9254,
+      "step": 11262
+    },
+    {
+      "epoch": 0.7835402970538106,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0002358094790377484,
+      "loss": 0.8209,
+      "step": 11263
+    },
+    {
+      "epoch": 0.7836098646909457,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.0002356641607790939,
+      "loss": 1.0074,
+      "step": 11264
+    },
+    {
+      "epoch": 0.783679432328081,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.00023551888132956056,
+      "loss": 0.7177,
+      "step": 11265
+    },
+    {
+      "epoch": 0.7837489999652162,
+      "grad_norm": 0.76953125,
+      "learning_rate": 0.00023537364069652511,
+      "loss": 0.4447,
+      "step": 11266
+    },
+    {
+      "epoch": 0.7838185676023514,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00023522843888736257,
+      "loss": 0.7159,
+      "step": 11267
+    },
+    {
+      "epoch": 0.7838881352394866,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.000235083275909445,
+      "loss": 0.7927,
+      "step": 11268
+    },
+    {
+      "epoch": 0.7839577028766218,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0002349381517701431,
+      "loss": 0.7813,
+      "step": 11269
+    },
+    {
+      "epoch": 0.784027270513757,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.00023479306647682552,
+      "loss": 0.8981,
+      "step": 11270
+    },
+    {
+      "epoch": 0.7840968381508922,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.00023464802003685947,
+      "loss": 0.9437,
+      "step": 11271
+    },
+    {
+      "epoch": 0.7841664057880274,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0002345030124576093,
+      "loss": 0.7387,
+      "step": 11272
+    },
+    {
+      "epoch": 0.7842359734251626,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.00023435804374643743,
+      "loss": 0.9306,
+      "step": 11273
+    },
+    {
+      "epoch": 0.7843055410622978,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00023421311391070532,
+      "loss": 0.6672,
+      "step": 11274
+    },
+    {
+      "epoch": 0.784375108699433,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00023406822295777107,
+      "loss": 1.0116,
+      "step": 11275
+    },
+    {
+      "epoch": 0.7844446763365682,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.00023392337089499194,
+      "loss": 0.7879,
+      "step": 11276
+    },
+    {
+      "epoch": 0.7845142439737034,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.000233778557729723,
+      "loss": 0.7684,
+      "step": 11277
+    },
+    {
+      "epoch": 0.7845838116108387,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.00023363378346931684,
+      "loss": 0.7108,
+      "step": 11278
+    },
+    {
+      "epoch": 0.7846533792479738,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00023348904812112403,
+      "loss": 0.7429,
+      "step": 11279
+    },
+    {
+      "epoch": 0.784722946885109,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.00023334435169249402,
+      "loss": 0.7404,
+      "step": 11280
+    },
+    {
+      "epoch": 0.7847925145222443,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0002331996941907738,
+      "loss": 0.842,
+      "step": 11281
+    },
+    {
+      "epoch": 0.7848620821593795,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00023305507562330807,
+      "loss": 0.8211,
+      "step": 11282
+    },
+    {
+      "epoch": 0.7849316497965146,
+      "grad_norm": 1.375,
+      "learning_rate": 0.00023291049599743975,
+      "loss": 0.8249,
+      "step": 11283
+    },
+    {
+      "epoch": 0.7850012174336498,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0002327659553205099,
+      "loss": 0.6402,
+      "step": 11284
+    },
+    {
+      "epoch": 0.7850707850707851,
+      "grad_norm": 1.5234375,
+      "learning_rate": 0.00023262145359985808,
+      "loss": 0.6879,
+      "step": 11285
+    },
+    {
+      "epoch": 0.7851403527079203,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00023247699084282092,
+      "loss": 0.69,
+      "step": 11286
+    },
+    {
+      "epoch": 0.7852099203450554,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.00023233256705673333,
+      "loss": 0.6935,
+      "step": 11287
+    },
+    {
+      "epoch": 0.7852794879821907,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00023218818224892868,
+      "loss": 0.6691,
+      "step": 11288
+    },
+    {
+      "epoch": 0.7853490556193259,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0002320438364267383,
+      "loss": 0.9906,
+      "step": 11289
+    },
+    {
+      "epoch": 0.7854186232564611,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00023189952959749106,
+      "loss": 0.7326,
+      "step": 11290
+    },
+    {
+      "epoch": 0.7854881908935963,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00023175526176851403,
+      "loss": 0.7043,
+      "step": 11291
+    },
+    {
+      "epoch": 0.7855577585307315,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00023161103294713282,
+      "loss": 0.7164,
+      "step": 11292
+    },
+    {
+      "epoch": 0.7856273261678667,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00023146684314067002,
+      "loss": 1.0191,
+      "step": 11293
+    },
+    {
+      "epoch": 0.785696893805002,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.00023132269235644733,
+      "loss": 0.9472,
+      "step": 11294
+    },
+    {
+      "epoch": 0.7857664614421371,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0002311785806017842,
+      "loss": 0.6708,
+      "step": 11295
+    },
+    {
+      "epoch": 0.7858360290792723,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0002310345078839975,
+      "loss": 0.707,
+      "step": 11296
+    },
+    {
+      "epoch": 0.7859055967164075,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00023089047421040243,
+      "loss": 0.5383,
+      "step": 11297
+    },
+    {
+      "epoch": 0.7859751643535428,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0002307464795883124,
+      "loss": 0.7086,
+      "step": 11298
+    },
+    {
+      "epoch": 0.7860447319906779,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00023060252402503913,
+      "loss": 0.6749,
+      "step": 11299
+    },
+    {
+      "epoch": 0.7861142996278131,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0002304586075278916,
+      "loss": 0.8607,
+      "step": 11300
+    },
+    {
+      "epoch": 0.7861838672649484,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00023031473010417703,
+      "loss": 0.8593,
+      "step": 11301
+    },
+    {
+      "epoch": 0.7862534349020835,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.00023017089176120088,
+      "loss": 0.6794,
+      "step": 11302
+    },
+    {
+      "epoch": 0.7863230025392187,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.00023002709250626686,
+      "loss": 0.9,
+      "step": 11303
+    },
+    {
+      "epoch": 0.786392570176354,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.00022988333234667626,
+      "loss": 0.8108,
+      "step": 11304
+    },
+    {
+      "epoch": 0.7864621378134892,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.00022973961128972797,
+      "loss": 0.6458,
+      "step": 11305
+    },
+    {
+      "epoch": 0.7865317054506243,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0002295959293427201,
+      "loss": 0.7002,
+      "step": 11306
+    },
+    {
+      "epoch": 0.7866012730877596,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0002294522865129476,
+      "loss": 0.8288,
+      "step": 11307
+    },
+    {
+      "epoch": 0.7866708407248948,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00022930868280770413,
+      "loss": 0.841,
+      "step": 11308
+    },
+    {
+      "epoch": 0.78674040836203,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00022916511823428142,
+      "loss": 0.6988,
+      "step": 11309
+    },
+    {
+      "epoch": 0.7868099759991651,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.00022902159279996871,
+      "loss": 0.6887,
+      "step": 11310
+    },
+    {
+      "epoch": 0.7868795436363004,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00022887810651205331,
+      "loss": 0.9695,
+      "step": 11311
+    },
+    {
+      "epoch": 0.7869491112734356,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.00022873465937782079,
+      "loss": 0.7344,
+      "step": 11312
+    },
+    {
+      "epoch": 0.7870186789105708,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00022859125140455515,
+      "loss": 0.9219,
+      "step": 11313
+    },
+    {
+      "epoch": 0.787088246547706,
+      "grad_norm": 1.25,
+      "learning_rate": 0.00022844788259953765,
+      "loss": 0.923,
+      "step": 11314
+    },
+    {
+      "epoch": 0.7871578141848412,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.00022830455297004738,
+      "loss": 0.7885,
+      "step": 11315
+    },
+    {
+      "epoch": 0.7872273818219764,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.00022816126252336223,
+      "loss": 0.9752,
+      "step": 11316
+    },
+    {
+      "epoch": 0.7872969494591117,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.00022801801126675814,
+      "loss": 0.7138,
+      "step": 11317
+    },
+    {
+      "epoch": 0.7873665170962468,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00022787479920750842,
+      "loss": 0.9382,
+      "step": 11318
+    },
+    {
+      "epoch": 0.787436084733382,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00022773162635288425,
+      "loss": 0.7596,
+      "step": 11319
+    },
+    {
+      "epoch": 0.7875056523705173,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0002275884927101557,
+      "loss": 0.6886,
+      "step": 11320
+    },
+    {
+      "epoch": 0.7875752200076525,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0002274453982865904,
+      "loss": 0.6172,
+      "step": 11321
+    },
+    {
+      "epoch": 0.7876447876447876,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00022730234308945352,
+      "loss": 0.5757,
+      "step": 11322
+    },
+    {
+      "epoch": 0.7877143552819228,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00022715932712600928,
+      "loss": 0.8766,
+      "step": 11323
+    },
+    {
+      "epoch": 0.7877839229190581,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.00022701635040351897,
+      "loss": 0.6226,
+      "step": 11324
+    },
+    {
+      "epoch": 0.7878534905561932,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00022687341292924212,
+      "loss": 0.5474,
+      "step": 11325
+    },
+    {
+      "epoch": 0.7879230581933284,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.00022673051471043637,
+      "loss": 0.6295,
+      "step": 11326
+    },
+    {
+      "epoch": 0.7879926258304637,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.00022658765575435792,
+      "loss": 0.7875,
+      "step": 11327
+    },
+    {
+      "epoch": 0.7880621934675989,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00022644483606825994,
+      "loss": 0.7761,
+      "step": 11328
+    },
+    {
+      "epoch": 0.788131761104734,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00022630205565939387,
+      "loss": 0.8934,
+      "step": 11329
+    },
+    {
+      "epoch": 0.7882013287418693,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00022615931453500972,
+      "loss": 0.9277,
+      "step": 11330
+    },
+    {
+      "epoch": 0.7882708963790045,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0002260166127023554,
+      "loss": 0.5723,
+      "step": 11331
+    },
+    {
+      "epoch": 0.7883404640161397,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0002258739501686763,
+      "loss": 0.7372,
+      "step": 11332
+    },
+    {
+      "epoch": 0.788410031653275,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0002257313269412159,
+      "loss": 0.7885,
+      "step": 11333
+    },
+    {
+      "epoch": 0.7884795992904101,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0002255887430272161,
+      "loss": 0.7335,
+      "step": 11334
+    },
+    {
+      "epoch": 0.7885491669275453,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.0002254461984339169,
+      "loss": 1.1112,
+      "step": 11335
+    },
+    {
+      "epoch": 0.7886187345646805,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.00022530369316855537,
+      "loss": 0.7953,
+      "step": 11336
+    },
+    {
+      "epoch": 0.7886883022018157,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00022516122723836786,
+      "loss": 0.5744,
+      "step": 11337
+    },
+    {
+      "epoch": 0.7887578698389509,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00022501880065058777,
+      "loss": 0.859,
+      "step": 11338
+    },
+    {
+      "epoch": 0.7888274374760861,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00022487641341244647,
+      "loss": 0.7815,
+      "step": 11339
+    },
+    {
+      "epoch": 0.7888970051132214,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00022473406553117403,
+      "loss": 0.7874,
+      "step": 11340
+    },
+    {
+      "epoch": 0.7889665727503565,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.00022459175701399837,
+      "loss": 0.8343,
+      "step": 11341
+    },
+    {
+      "epoch": 0.7890361403874917,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00022444948786814502,
+      "loss": 0.7663,
+      "step": 11342
+    },
+    {
+      "epoch": 0.789105708024627,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00022430725810083718,
+      "loss": 0.7282,
+      "step": 11343
+    },
+    {
+      "epoch": 0.7891752756617622,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.00022416506771929712,
+      "loss": 0.7486,
+      "step": 11344
+    },
+    {
+      "epoch": 0.7892448432988973,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0002240229167307446,
+      "loss": 0.8693,
+      "step": 11345
+    },
+    {
+      "epoch": 0.7893144109360326,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00022388080514239718,
+      "loss": 0.8566,
+      "step": 11346
+    },
+    {
+      "epoch": 0.7893839785731678,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0002237387329614703,
+      "loss": 0.5645,
+      "step": 11347
+    },
+    {
+      "epoch": 0.789453546210303,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00022359670019517797,
+      "loss": 0.8409,
+      "step": 11348
+    },
+    {
+      "epoch": 0.7895231138474381,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.000223454706850732,
+      "loss": 0.8261,
+      "step": 11349
+    },
+    {
+      "epoch": 0.7895926814845734,
+      "grad_norm": 0.75390625,
+      "learning_rate": 0.0002233127529353417,
+      "loss": 0.5983,
+      "step": 11350
+    },
+    {
+      "epoch": 0.7896622491217086,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00022317083845621534,
+      "loss": 0.8145,
+      "step": 11351
+    },
+    {
+      "epoch": 0.7897318167588437,
+      "grad_norm": 1.5546875,
+      "learning_rate": 0.00022302896342055802,
+      "loss": 0.9216,
+      "step": 11352
+    },
+    {
+      "epoch": 0.789801384395979,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00022288712783557387,
+      "loss": 0.7054,
+      "step": 11353
+    },
+    {
+      "epoch": 0.7898709520331142,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00022274533170846424,
+      "loss": 0.6841,
+      "step": 11354
+    },
+    {
+      "epoch": 0.7899405196702494,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00022260357504642924,
+      "loss": 0.811,
+      "step": 11355
+    },
+    {
+      "epoch": 0.7900100873073846,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0002224618578566664,
+      "loss": 0.7281,
+      "step": 11356
+    },
+    {
+      "epoch": 0.7900796549445198,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.00022232018014637102,
+      "loss": 0.8313,
+      "step": 11357
+    },
+    {
+      "epoch": 0.790149222581655,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0002221785419227371,
+      "loss": 0.7241,
+      "step": 11358
+    },
+    {
+      "epoch": 0.7902187902187903,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.00022203694319295665,
+      "loss": 0.9019,
+      "step": 11359
+    },
+    {
+      "epoch": 0.7902883578559254,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00022189538396421893,
+      "loss": 0.867,
+      "step": 11360
+    },
+    {
+      "epoch": 0.7903579254930606,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00022175386424371136,
+      "loss": 0.8799,
+      "step": 11361
+    },
+    {
+      "epoch": 0.7904274931301958,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00022161238403861993,
+      "loss": 0.991,
+      "step": 11362
+    },
+    {
+      "epoch": 0.7904970607673311,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0002214709433561286,
+      "loss": 1.1083,
+      "step": 11363
+    },
+    {
+      "epoch": 0.7905666284044662,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.00022132954220341873,
+      "loss": 0.6552,
+      "step": 11364
+    },
+    {
+      "epoch": 0.7906361960416014,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00022118818058766953,
+      "loss": 0.6947,
+      "step": 11365
+    },
+    {
+      "epoch": 0.7907057636787367,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0002210468585160591,
+      "loss": 0.8956,
+      "step": 11366
+    },
+    {
+      "epoch": 0.7907753313158719,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0002209055759957632,
+      "loss": 0.762,
+      "step": 11367
+    },
+    {
+      "epoch": 0.790844898953007,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.00022076433303395504,
+      "loss": 0.9421,
+      "step": 11368
+    },
+    {
+      "epoch": 0.7909144665901423,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00022062312963780663,
+      "loss": 0.5986,
+      "step": 11369
+    },
+    {
+      "epoch": 0.7909840342272775,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00022048196581448732,
+      "loss": 0.7251,
+      "step": 11370
+    },
+    {
+      "epoch": 0.7910536018644126,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0002203408415711644,
+      "loss": 0.6971,
+      "step": 11371
+    },
+    {
+      "epoch": 0.7911231695015479,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.00022019975691500382,
+      "loss": 0.9502,
+      "step": 11372
+    },
+    {
+      "epoch": 0.7911927371386831,
+      "grad_norm": 0.78125,
+      "learning_rate": 0.0002200587118531694,
+      "loss": 0.6688,
+      "step": 11373
+    },
+    {
+      "epoch": 0.7912623047758183,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00021991770639282238,
+      "loss": 0.7167,
+      "step": 11374
+    },
+    {
+      "epoch": 0.7913318724129534,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00021977674054112205,
+      "loss": 0.8944,
+      "step": 11375
+    },
+    {
+      "epoch": 0.7914014400500887,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.00021963581430522628,
+      "loss": 0.5945,
+      "step": 11376
+    },
+    {
+      "epoch": 0.7914710076872239,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00021949492769229073,
+      "loss": 0.5457,
+      "step": 11377
+    },
+    {
+      "epoch": 0.7915405753243591,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0002193540807094687,
+      "loss": 0.9095,
+      "step": 11378
+    },
+    {
+      "epoch": 0.7916101429614943,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0002192132733639115,
+      "loss": 0.8057,
+      "step": 11379
+    },
+    {
+      "epoch": 0.7916797105986295,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00021907250566276882,
+      "loss": 0.6894,
+      "step": 11380
+    },
+    {
+      "epoch": 0.7917492782357647,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0002189317776131884,
+      "loss": 0.586,
+      "step": 11381
+    },
+    {
+      "epoch": 0.7918188458729,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00021879108922231516,
+      "loss": 0.8967,
+      "step": 11382
+    },
+    {
+      "epoch": 0.7918884135100351,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.000218650440497293,
+      "loss": 0.7873,
+      "step": 11383
+    },
+    {
+      "epoch": 0.7919579811471703,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00021850983144526304,
+      "loss": 0.8728,
+      "step": 11384
+    },
+    {
+      "epoch": 0.7920275487843056,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.00021836926207336504,
+      "loss": 0.9168,
+      "step": 11385
+    },
+    {
+      "epoch": 0.7920971164214408,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.00021822873238873597,
+      "loss": 0.7407,
+      "step": 11386
+    },
+    {
+      "epoch": 0.7921666840585759,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00021808824239851165,
+      "loss": 0.9536,
+      "step": 11387
+    },
+    {
+      "epoch": 0.7922362516957111,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0002179477921098253,
+      "loss": 0.9262,
+      "step": 11388
+    },
+    {
+      "epoch": 0.7923058193328464,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00021780738152980795,
+      "loss": 0.7183,
+      "step": 11389
+    },
+    {
+      "epoch": 0.7923753869699816,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00021766701066558924,
+      "loss": 0.7785,
+      "step": 11390
+    },
+    {
+      "epoch": 0.7924449546071167,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00021752667952429673,
+      "loss": 0.7651,
+      "step": 11391
+    },
+    {
+      "epoch": 0.792514522244252,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00021738638811305555,
+      "loss": 0.6941,
+      "step": 11392
+    },
+    {
+      "epoch": 0.7925840898813872,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.00021724613643898848,
+      "loss": 0.7646,
+      "step": 11393
+    },
+    {
+      "epoch": 0.7926536575185223,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0002171059245092174,
+      "loss": 0.9519,
+      "step": 11394
+    },
+    {
+      "epoch": 0.7927232251556576,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00021696575233086157,
+      "loss": 0.8139,
+      "step": 11395
+    },
+    {
+      "epoch": 0.7927927927927928,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0002168256199110379,
+      "loss": 1.0322,
+      "step": 11396
+    },
+    {
+      "epoch": 0.792862360429928,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00021668552725686186,
+      "loss": 0.8298,
+      "step": 11397
+    },
+    {
+      "epoch": 0.7929319280670633,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00021654547437544635,
+      "loss": 0.6793,
+      "step": 11398
+    },
+    {
+      "epoch": 0.7930014957041984,
+      "grad_norm": 1.5859375,
+      "learning_rate": 0.00021640546127390302,
+      "loss": 0.9698,
+      "step": 11399
+    },
+    {
+      "epoch": 0.7930710633413336,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.00021626548795934054,
+      "loss": 0.8393,
+      "step": 11400
+    },
+    {
+      "epoch": 0.7931406309784688,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.0002161255544388665,
+      "loss": 0.6756,
+      "step": 11401
+    },
+    {
+      "epoch": 0.793210198615604,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0002159856607195857,
+      "loss": 0.4974,
+      "step": 11402
+    },
+    {
+      "epoch": 0.7932797662527392,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.00021584580680860088,
+      "loss": 1.0398,
+      "step": 11403
+    },
+    {
+      "epoch": 0.7933493338898744,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.00021570599271301404,
+      "loss": 0.9352,
+      "step": 11404
+    },
+    {
+      "epoch": 0.7934189015270097,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.00021556621843992385,
+      "loss": 0.6301,
+      "step": 11405
+    },
+    {
+      "epoch": 0.7934884691641448,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00021542648399642717,
+      "loss": 0.6083,
+      "step": 11406
+    },
+    {
+      "epoch": 0.79355803680128,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.00021528678938961888,
+      "loss": 0.6556,
+      "step": 11407
+    },
+    {
+      "epoch": 0.7936276044384153,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.00021514713462659208,
+      "loss": 0.9146,
+      "step": 11408
+    },
+    {
+      "epoch": 0.7936971720755505,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0002150075197144382,
+      "loss": 0.8798,
+      "step": 11409
+    },
+    {
+      "epoch": 0.7937667397126856,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0002148679446602455,
+      "loss": 0.7407,
+      "step": 11410
+    },
+    {
+      "epoch": 0.7938363073498209,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0002147284094711015,
+      "loss": 0.8417,
+      "step": 11411
+    },
+    {
+      "epoch": 0.7939058749869561,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00021458891415409055,
+      "loss": 0.8481,
+      "step": 11412
+    },
+    {
+      "epoch": 0.7939754426240913,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.00021444945871629595,
+      "loss": 0.724,
+      "step": 11413
+    },
+    {
+      "epoch": 0.7940450102612264,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.00021431004316479818,
+      "loss": 1.0432,
+      "step": 11414
+    },
+    {
+      "epoch": 0.7941145778983617,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00021417066750667658,
+      "loss": 0.9333,
+      "step": 11415
+    },
+    {
+      "epoch": 0.7941841455354969,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00021403133174900747,
+      "loss": 0.4688,
+      "step": 11416
+    },
+    {
+      "epoch": 0.794253713172632,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0002138920358988653,
+      "loss": 0.8709,
+      "step": 11417
+    },
+    {
+      "epoch": 0.7943232808097673,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00021375277996332377,
+      "loss": 0.6321,
+      "step": 11418
+    },
+    {
+      "epoch": 0.7943928484469025,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.00021361356394945308,
+      "loss": 1.1456,
+      "step": 11419
+    },
+    {
+      "epoch": 0.7944624160840377,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00021347438786432205,
+      "loss": 0.6778,
+      "step": 11420
+    },
+    {
+      "epoch": 0.794531983721173,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.0002133352517149968,
+      "loss": 0.7393,
+      "step": 11421
+    },
+    {
+      "epoch": 0.7946015513583081,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00021319615550854243,
+      "loss": 0.5918,
+      "step": 11422
+    },
+    {
+      "epoch": 0.7946711189954433,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0002130570992520219,
+      "loss": 0.5849,
+      "step": 11423
+    },
+    {
+      "epoch": 0.7947406866325786,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.00021291808295249493,
+      "loss": 0.7043,
+      "step": 11424
+    },
+    {
+      "epoch": 0.7948102542697137,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0002127791066170208,
+      "loss": 0.5975,
+      "step": 11425
+    },
+    {
+      "epoch": 0.7948798219068489,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.00021264017025265558,
+      "loss": 0.5729,
+      "step": 11426
+    },
+    {
+      "epoch": 0.7949493895439841,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00021250127386645412,
+      "loss": 0.7633,
+      "step": 11427
+    },
+    {
+      "epoch": 0.7950189571811194,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00021236241746546848,
+      "loss": 0.8501,
+      "step": 11428
+    },
+    {
+      "epoch": 0.7950885248182545,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00021222360105674953,
+      "loss": 0.7237,
+      "step": 11429
+    },
+    {
+      "epoch": 0.7951580924553897,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00021208482464734525,
+      "loss": 0.7258,
+      "step": 11430
+    },
+    {
+      "epoch": 0.795227660092525,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.00021194608824430205,
+      "loss": 0.781,
+      "step": 11431
+    },
+    {
+      "epoch": 0.7952972277296602,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.00021180739185466468,
+      "loss": 1.0382,
+      "step": 11432
+    },
+    {
+      "epoch": 0.7953667953667953,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00021166873548547526,
+      "loss": 0.7659,
+      "step": 11433
+    },
+    {
+      "epoch": 0.7954363630039306,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00021153011914377395,
+      "loss": 0.8325,
+      "step": 11434
+    },
+    {
+      "epoch": 0.7955059306410658,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00021139154283659846,
+      "loss": 0.8347,
+      "step": 11435
+    },
+    {
+      "epoch": 0.795575498278201,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.000211253006570986,
+      "loss": 0.7483,
+      "step": 11436
+    },
+    {
+      "epoch": 0.7956450659153362,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.00021111451035397033,
+      "loss": 0.7951,
+      "step": 11437
+    },
+    {
+      "epoch": 0.7957146335524714,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0002109760541925836,
+      "loss": 0.8256,
+      "step": 11438
+    },
+    {
+      "epoch": 0.7957842011896066,
+      "grad_norm": 3.0,
+      "learning_rate": 0.0002108376380938556,
+      "loss": 0.8697,
+      "step": 11439
+    },
+    {
+      "epoch": 0.7958537688267417,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.0002106992620648146,
+      "loss": 1.0132,
+      "step": 11440
+    },
+    {
+      "epoch": 0.795923336463877,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.000210560926112487,
+      "loss": 0.8459,
+      "step": 11441
+    },
+    {
+      "epoch": 0.7959929041010122,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00021042263024389617,
+      "loss": 0.6435,
+      "step": 11442
+    },
+    {
+      "epoch": 0.7960624717381474,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.00021028437446606475,
+      "loss": 0.9039,
+      "step": 11443
+    },
+    {
+      "epoch": 0.7961320393752827,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00021014615878601207,
+      "loss": 0.7823,
+      "step": 11444
+    },
+    {
+      "epoch": 0.7962016070124178,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00021000798321075653,
+      "loss": 0.7463,
+      "step": 11445
+    },
+    {
+      "epoch": 0.796271174649553,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00020986984774731354,
+      "loss": 0.6783,
+      "step": 11446
+    },
+    {
+      "epoch": 0.7963407422866883,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.00020973175240269739,
+      "loss": 0.7749,
+      "step": 11447
+    },
+    {
+      "epoch": 0.7964103099238234,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0002095936971839195,
+      "loss": 0.691,
+      "step": 11448
+    },
+    {
+      "epoch": 0.7964798775609586,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.00020945568209798928,
+      "loss": 0.886,
+      "step": 11449
+    },
+    {
+      "epoch": 0.7965494451980939,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00020931770715191533,
+      "loss": 1.0378,
+      "step": 11450
+    },
+    {
+      "epoch": 0.7966190128352291,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00020917977235270302,
+      "loss": 0.8532,
+      "step": 11451
+    },
+    {
+      "epoch": 0.7966885804723642,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00020904187770735572,
+      "loss": 0.838,
+      "step": 11452
+    },
+    {
+      "epoch": 0.7967581481094994,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.00020890402322287495,
+      "loss": 0.6772,
+      "step": 11453
+    },
+    {
+      "epoch": 0.7968277157466347,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00020876620890626041,
+      "loss": 0.8102,
+      "step": 11454
+    },
+    {
+      "epoch": 0.7968972833837699,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00020862843476451,
+      "loss": 0.8577,
+      "step": 11455
+    },
+    {
+      "epoch": 0.796966851020905,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00020849070080461852,
+      "loss": 0.9275,
+      "step": 11456
+    },
+    {
+      "epoch": 0.7970364186580403,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.00020835300703358006,
+      "loss": 0.7921,
+      "step": 11457
+    },
+    {
+      "epoch": 0.7971059862951755,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00020821535345838537,
+      "loss": 1.0517,
+      "step": 11458
+    },
+    {
+      "epoch": 0.7971755539323107,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00020807774008602454,
+      "loss": 0.6087,
+      "step": 11459
+    },
+    {
+      "epoch": 0.7972451215694459,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00020794016692348417,
+      "loss": 0.8276,
+      "step": 11460
+    },
+    {
+      "epoch": 0.7973146892065811,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.00020780263397775,
+      "loss": 0.7144,
+      "step": 11461
+    },
+    {
+      "epoch": 0.7973842568437163,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00020766514125580493,
+      "loss": 0.6881,
+      "step": 11462
+    },
+    {
+      "epoch": 0.7974538244808516,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00020752768876463034,
+      "loss": 0.8766,
+      "step": 11463
+    },
+    {
+      "epoch": 0.7975233921179867,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00020739027651120567,
+      "loss": 0.6938,
+      "step": 11464
+    },
+    {
+      "epoch": 0.7975929597551219,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00020725290450250767,
+      "loss": 0.7493,
+      "step": 11465
+    },
+    {
+      "epoch": 0.7976625273922571,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0002071155727455114,
+      "loss": 0.8856,
+      "step": 11466
+    },
+    {
+      "epoch": 0.7977320950293924,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.00020697828124718965,
+      "loss": 0.8574,
+      "step": 11467
+    },
+    {
+      "epoch": 0.7978016626665275,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0002068410300145136,
+      "loss": 0.9403,
+      "step": 11468
+    },
+    {
+      "epoch": 0.7978712303036627,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00020670381905445257,
+      "loss": 0.9696,
+      "step": 11469
+    },
+    {
+      "epoch": 0.797940797940798,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.00020656664837397288,
+      "loss": 0.8581,
+      "step": 11470
+    },
+    {
+      "epoch": 0.7980103655779331,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.00020642951798003972,
+      "loss": 0.88,
+      "step": 11471
+    },
+    {
+      "epoch": 0.7980799332150683,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00020629242787961556,
+      "loss": 0.8249,
+      "step": 11472
+    },
+    {
+      "epoch": 0.7981495008522036,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00020615537807966167,
+      "loss": 0.8592,
+      "step": 11473
+    },
+    {
+      "epoch": 0.7982190684893388,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00020601836858713597,
+      "loss": 0.7575,
+      "step": 11474
+    },
+    {
+      "epoch": 0.7982886361264739,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00020588139940899597,
+      "loss": 0.5698,
+      "step": 11475
+    },
+    {
+      "epoch": 0.7983582037636092,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00020574447055219546,
+      "loss": 0.9127,
+      "step": 11476
+    },
+    {
+      "epoch": 0.7984277714007444,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00020560758202368745,
+      "loss": 0.7286,
+      "step": 11477
+    },
+    {
+      "epoch": 0.7984973390378796,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0002054707338304227,
+      "loss": 0.9412,
+      "step": 11478
+    },
+    {
+      "epoch": 0.7985669066750147,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0002053339259793493,
+      "loss": 0.7786,
+      "step": 11479
+    },
+    {
+      "epoch": 0.79863647431215,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0002051971584774137,
+      "loss": 0.6275,
+      "step": 11480
+    },
+    {
+      "epoch": 0.7987060419492852,
+      "grad_norm": 0.79296875,
+      "learning_rate": 0.00020506043133155982,
+      "loss": 0.801,
+      "step": 11481
+    },
+    {
+      "epoch": 0.7987756095864204,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00020492374454873097,
+      "loss": 0.8623,
+      "step": 11482
+    },
+    {
+      "epoch": 0.7988451772235556,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00020478709813586692,
+      "loss": 0.7663,
+      "step": 11483
+    },
+    {
+      "epoch": 0.7989147448606908,
+      "grad_norm": 1.796875,
+      "learning_rate": 0.0002046504920999056,
+      "loss": 1.0197,
+      "step": 11484
+    },
+    {
+      "epoch": 0.798984312497826,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00020451392644778356,
+      "loss": 0.7304,
+      "step": 11485
+    },
+    {
+      "epoch": 0.7990538801349613,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00020437740118643466,
+      "loss": 0.6665,
+      "step": 11486
+    },
+    {
+      "epoch": 0.7991234477720964,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00020424091632279128,
+      "loss": 0.8603,
+      "step": 11487
+    },
+    {
+      "epoch": 0.7991930154092316,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.000204104471863783,
+      "loss": 1.0289,
+      "step": 11488
+    },
+    {
+      "epoch": 0.7992625830463669,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.00020396806781633836,
+      "loss": 0.8675,
+      "step": 11489
+    },
+    {
+      "epoch": 0.799332150683502,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0002038317041873826,
+      "loss": 0.7787,
+      "step": 11490
+    },
+    {
+      "epoch": 0.7994017183206372,
+      "grad_norm": 1.6875,
+      "learning_rate": 0.00020369538098383987,
+      "loss": 0.9907,
+      "step": 11491
+    },
+    {
+      "epoch": 0.7994712859577724,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0002035590982126324,
+      "loss": 0.8315,
+      "step": 11492
+    },
+    {
+      "epoch": 0.7995408535949077,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.00020342285588067954,
+      "loss": 0.6062,
+      "step": 11493
+    },
+    {
+      "epoch": 0.7996104212320428,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.00020328665399489866,
+      "loss": 0.7618,
+      "step": 11494
+    },
+    {
+      "epoch": 0.799679988869178,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00020315049256220584,
+      "loss": 0.9091,
+      "step": 11495
+    },
+    {
+      "epoch": 0.7997495565063133,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00020301437158951486,
+      "loss": 0.9664,
+      "step": 11496
+    },
+    {
+      "epoch": 0.7998191241434485,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0002028782910837369,
+      "loss": 0.9051,
+      "step": 11497
+    },
+    {
+      "epoch": 0.7998886917805836,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.00020274225105178134,
+      "loss": 0.9678,
+      "step": 11498
+    },
+    {
+      "epoch": 0.7999582594177189,
+      "grad_norm": 1.25,
+      "learning_rate": 0.00020260625150055612,
+      "loss": 0.7862,
+      "step": 11499
+    },
+    {
+      "epoch": 0.8000278270548541,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0002024702924369659,
+      "loss": 0.8813,
+      "step": 11500
+    },
+    {
+      "epoch": 0.8000973946919893,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.00020233437386791463,
+      "loss": 0.6225,
+      "step": 11501
+    },
+    {
+      "epoch": 0.8001669623291245,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.00020219849580030313,
+      "loss": 0.7075,
+      "step": 11502
+    },
+    {
+      "epoch": 0.8002365299662597,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0002020626582410311,
+      "loss": 0.6389,
+      "step": 11503
+    },
+    {
+      "epoch": 0.8003060976033949,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.000201926861196995,
+      "loss": 0.8935,
+      "step": 11504
+    },
+    {
+      "epoch": 0.80037566524053,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00020179110467509042,
+      "loss": 0.8508,
+      "step": 11505
+    },
+    {
+      "epoch": 0.8004452328776653,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00020165538868221046,
+      "loss": 0.8624,
+      "step": 11506
+    },
+    {
+      "epoch": 0.8005148005148005,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00020151971322524597,
+      "loss": 0.7857,
+      "step": 11507
+    },
+    {
+      "epoch": 0.8005843681519357,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0002013840783110854,
+      "loss": 0.8685,
+      "step": 11508
+    },
+    {
+      "epoch": 0.800653935789071,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00020124848394661622,
+      "loss": 0.8577,
+      "step": 11509
+    },
+    {
+      "epoch": 0.8007235034262061,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0002011129301387231,
+      "loss": 0.6494,
+      "step": 11510
+    },
+    {
+      "epoch": 0.8007930710633413,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00020097741689428884,
+      "loss": 0.7739,
+      "step": 11511
+    },
+    {
+      "epoch": 0.8008626387004766,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.00020084194422019365,
+      "loss": 0.7364,
+      "step": 11512
+    },
+    {
+      "epoch": 0.8009322063376118,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.00020070651212331648,
+      "loss": 0.7561,
+      "step": 11513
+    },
+    {
+      "epoch": 0.8010017739747469,
+      "grad_norm": 1.25,
+      "learning_rate": 0.00020057112061053407,
+      "loss": 0.8435,
+      "step": 11514
+    },
+    {
+      "epoch": 0.8010713416118822,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0002004357696887208,
+      "loss": 0.7197,
+      "step": 11515
+    },
+    {
+      "epoch": 0.8011409092490174,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.00020030045936474884,
+      "loss": 0.9916,
+      "step": 11516
+    },
+    {
+      "epoch": 0.8012104768861525,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0002001651896454889,
+      "loss": 0.8623,
+      "step": 11517
+    },
+    {
+      "epoch": 0.8012800445232877,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00020002996053780907,
+      "loss": 0.875,
+      "step": 11518
+    },
+    {
+      "epoch": 0.801349612160423,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.00019989477204857586,
+      "loss": 0.8547,
+      "step": 11519
+    },
+    {
+      "epoch": 0.8014191797975582,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.00019975962418465298,
+      "loss": 0.6164,
+      "step": 11520
+    },
+    {
+      "epoch": 0.8014887474346933,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.00019962451695290328,
+      "loss": 0.8538,
+      "step": 11521
+    },
+    {
+      "epoch": 0.8015583150718286,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.00019948945036018606,
+      "loss": 0.6978,
+      "step": 11522
+    },
+    {
+      "epoch": 0.8016278827089638,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0001993544244133597,
+      "loss": 0.5762,
+      "step": 11523
+    },
+    {
+      "epoch": 0.801697450346099,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00019921943911928032,
+      "loss": 0.742,
+      "step": 11524
+    },
+    {
+      "epoch": 0.8017670179832342,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0001990844944848017,
+      "loss": 0.6807,
+      "step": 11525
+    },
+    {
+      "epoch": 0.8018365856203694,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0001989495905167752,
+      "loss": 0.8195,
+      "step": 11526
+    },
+    {
+      "epoch": 0.8019061532575046,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.00019881472722205085,
+      "loss": 0.9336,
+      "step": 11527
+    },
+    {
+      "epoch": 0.8019757208946399,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.00019867990460747676,
+      "loss": 0.6865,
+      "step": 11528
+    },
+    {
+      "epoch": 0.802045288531775,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.00019854512267989812,
+      "loss": 0.8677,
+      "step": 11529
+    },
+    {
+      "epoch": 0.8021148561689102,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0001984103814461582,
+      "loss": 0.721,
+      "step": 11530
+    },
+    {
+      "epoch": 0.8021844238060454,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.0001982756809130991,
+      "loss": 0.7316,
+      "step": 11531
+    },
+    {
+      "epoch": 0.8022539914431807,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00019814102108755972,
+      "loss": 0.7935,
+      "step": 11532
+    },
+    {
+      "epoch": 0.8023235590803158,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.00019800640197637786,
+      "loss": 1.0023,
+      "step": 11533
+    },
+    {
+      "epoch": 0.802393126717451,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.00019787182358638823,
+      "loss": 0.5747,
+      "step": 11534
+    },
+    {
+      "epoch": 0.8024626943545863,
+      "grad_norm": 1.5703125,
+      "learning_rate": 0.00019773728592442465,
+      "loss": 0.7026,
+      "step": 11535
+    },
+    {
+      "epoch": 0.8025322619917215,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.00019760278899731777,
+      "loss": 0.8311,
+      "step": 11536
+    },
+    {
+      "epoch": 0.8026018296288566,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0001974683328118969,
+      "loss": 0.8812,
+      "step": 11537
+    },
+    {
+      "epoch": 0.8026713972659919,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0001973339173749893,
+      "loss": 0.5614,
+      "step": 11538
+    },
+    {
+      "epoch": 0.8027409649031271,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00019719954269341956,
+      "loss": 0.8661,
+      "step": 11539
+    },
+    {
+      "epoch": 0.8028105325402622,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.00019706520877401035,
+      "loss": 0.9264,
+      "step": 11540
+    },
+    {
+      "epoch": 0.8028801001773975,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0001969309156235829,
+      "loss": 0.7194,
+      "step": 11541
+    },
+    {
+      "epoch": 0.8029496678145327,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00019679666324895595,
+      "loss": 0.5175,
+      "step": 11542
+    },
+    {
+      "epoch": 0.8030192354516679,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00019666245165694596,
+      "loss": 0.6996,
+      "step": 11543
+    },
+    {
+      "epoch": 0.803088803088803,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00019652828085436736,
+      "loss": 0.6839,
+      "step": 11544
+    },
+    {
+      "epoch": 0.8031583707259383,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0001963941508480328,
+      "loss": 0.914,
+      "step": 11545
+    },
+    {
+      "epoch": 0.8032279383630735,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.00019626006164475307,
+      "loss": 0.8159,
+      "step": 11546
+    },
+    {
+      "epoch": 0.8032975060002087,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00019612601325133628,
+      "loss": 0.6157,
+      "step": 11547
+    },
+    {
+      "epoch": 0.8033670736373439,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0001959920056745884,
+      "loss": 0.7854,
+      "step": 11548
+    },
+    {
+      "epoch": 0.8034366412744791,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00019585803892131426,
+      "loss": 0.8643,
+      "step": 11549
+    },
+    {
+      "epoch": 0.8035062089116143,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0001957241129983155,
+      "loss": 1.024,
+      "step": 11550
+    },
+    {
+      "epoch": 0.8035757765487496,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00019559022791239245,
+      "loss": 0.6395,
+      "step": 11551
+    },
+    {
+      "epoch": 0.8036453441858847,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00019545638367034335,
+      "loss": 0.6166,
+      "step": 11552
+    },
+    {
+      "epoch": 0.8037149118230199,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00019532258027896377,
+      "loss": 0.5566,
+      "step": 11553
+    },
+    {
+      "epoch": 0.8037844794601552,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.0001951888177450476,
+      "loss": 0.947,
+      "step": 11554
+    },
+    {
+      "epoch": 0.8038540470972904,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00019505509607538663,
+      "loss": 0.8042,
+      "step": 11555
+    },
+    {
+      "epoch": 0.8039236147344255,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00019492141527677087,
+      "loss": 0.8172,
+      "step": 11556
+    },
+    {
+      "epoch": 0.8039931823715607,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0001947877753559878,
+      "loss": 0.7079,
+      "step": 11557
+    },
+    {
+      "epoch": 0.804062750008696,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00019465417631982262,
+      "loss": 0.8163,
+      "step": 11558
+    },
+    {
+      "epoch": 0.8041323176458312,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00019452061817505918,
+      "loss": 0.9206,
+      "step": 11559
+    },
+    {
+      "epoch": 0.8042018852829663,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0001943871009284791,
+      "loss": 0.7406,
+      "step": 11560
+    },
+    {
+      "epoch": 0.8042714529201016,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.00019425362458686148,
+      "loss": 0.8649,
+      "step": 11561
+    },
+    {
+      "epoch": 0.8043410205572368,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00019412018915698315,
+      "loss": 0.8075,
+      "step": 11562
+    },
+    {
+      "epoch": 0.804410588194372,
+      "grad_norm": 2.25,
+      "learning_rate": 0.00019398679464562008,
+      "loss": 1.022,
+      "step": 11563
+    },
+    {
+      "epoch": 0.8044801558315072,
+      "grad_norm": 1.5703125,
+      "learning_rate": 0.00019385344105954462,
+      "loss": 1.0943,
+      "step": 11564
+    },
+    {
+      "epoch": 0.8045497234686424,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.00019372012840552822,
+      "loss": 0.769,
+      "step": 11565
+    },
+    {
+      "epoch": 0.8046192911057776,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.00019358685669033994,
+      "loss": 1.0729,
+      "step": 11566
+    },
+    {
+      "epoch": 0.8046888587429128,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.00019345362592074645,
+      "loss": 0.7369,
+      "step": 11567
+    },
+    {
+      "epoch": 0.804758426380048,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.00019332043610351224,
+      "loss": 0.7023,
+      "step": 11568
+    },
+    {
+      "epoch": 0.8048279940171832,
+      "grad_norm": 1.6015625,
+      "learning_rate": 0.00019318728724540047,
+      "loss": 0.8439,
+      "step": 11569
+    },
+    {
+      "epoch": 0.8048975616543184,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0001930541793531717,
+      "loss": 0.9113,
+      "step": 11570
+    },
+    {
+      "epoch": 0.8049671292914536,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.00019292111243358445,
+      "loss": 0.9856,
+      "step": 11571
+    },
+    {
+      "epoch": 0.8050366969285888,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.00019278808649339496,
+      "loss": 0.688,
+      "step": 11572
+    },
+    {
+      "epoch": 0.805106264565724,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00019265510153935772,
+      "loss": 0.7357,
+      "step": 11573
+    },
+    {
+      "epoch": 0.8051758322028593,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00019252215757822533,
+      "loss": 0.6238,
+      "step": 11574
+    },
+    {
+      "epoch": 0.8052453998399944,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00019238925461674783,
+      "loss": 0.6776,
+      "step": 11575
+    },
+    {
+      "epoch": 0.8053149674771296,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00019225639266167317,
+      "loss": 0.8295,
+      "step": 11576
+    },
+    {
+      "epoch": 0.8053845351142649,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.00019212357171974738,
+      "loss": 0.8657,
+      "step": 11577
+    },
+    {
+      "epoch": 0.8054541027514001,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00019199079179771494,
+      "loss": 0.6886,
+      "step": 11578
+    },
+    {
+      "epoch": 0.8055236703885352,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00019185805290231718,
+      "loss": 0.8363,
+      "step": 11579
+    },
+    {
+      "epoch": 0.8055932380256705,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.00019172535504029443,
+      "loss": 0.8789,
+      "step": 11580
+    },
+    {
+      "epoch": 0.8056628056628057,
+      "grad_norm": 1.6015625,
+      "learning_rate": 0.0001915926982183841,
+      "loss": 1.0713,
+      "step": 11581
+    },
+    {
+      "epoch": 0.8057323732999409,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0001914600824433217,
+      "loss": 0.7367,
+      "step": 11582
+    },
+    {
+      "epoch": 0.805801940937076,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.00019132750772184092,
+      "loss": 0.7935,
+      "step": 11583
+    },
+    {
+      "epoch": 0.8058715085742113,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00019119497406067354,
+      "loss": 0.7253,
+      "step": 11584
+    },
+    {
+      "epoch": 0.8059410762113465,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00019106248146654869,
+      "loss": 0.6743,
+      "step": 11585
+    },
+    {
+      "epoch": 0.8060106438484816,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.00019093002994619346,
+      "loss": 0.9133,
+      "step": 11586
+    },
+    {
+      "epoch": 0.8060802114856169,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00019079761950633323,
+      "loss": 0.763,
+      "step": 11587
+    },
+    {
+      "epoch": 0.8061497791227521,
+      "grad_norm": 0.8125,
+      "learning_rate": 0.0001906652501536915,
+      "loss": 0.5673,
+      "step": 11588
+    },
+    {
+      "epoch": 0.8062193467598873,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00019053292189498904,
+      "loss": 0.6943,
+      "step": 11589
+    },
+    {
+      "epoch": 0.8062889143970225,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.00019040063473694448,
+      "loss": 0.8947,
+      "step": 11590
+    },
+    {
+      "epoch": 0.8063584820341577,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00019026838868627506,
+      "loss": 0.7626,
+      "step": 11591
+    },
+    {
+      "epoch": 0.8064280496712929,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.00019013618374969578,
+      "loss": 0.8147,
+      "step": 11592
+    },
+    {
+      "epoch": 0.8064976173084282,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00019000401993391868,
+      "loss": 0.4795,
+      "step": 11593
+    },
+    {
+      "epoch": 0.8065671849455633,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.00018987189724565512,
+      "loss": 0.7489,
+      "step": 11594
+    },
+    {
+      "epoch": 0.8066367525826985,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.00018973981569161337,
+      "loss": 0.6915,
+      "step": 11595
+    },
+    {
+      "epoch": 0.8067063202198337,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.00018960777527849936,
+      "loss": 0.8171,
+      "step": 11596
+    },
+    {
+      "epoch": 0.806775887856969,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0001894757760130179,
+      "loss": 1.0254,
+      "step": 11597
+    },
+    {
+      "epoch": 0.8068454554941041,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00018934381790187139,
+      "loss": 0.8368,
+      "step": 11598
+    },
+    {
+      "epoch": 0.8069150231312393,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0001892119009517599,
+      "loss": 0.91,
+      "step": 11599
+    },
+    {
+      "epoch": 0.8069845907683746,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00018908002516938106,
+      "loss": 0.8141,
+      "step": 11600
+    },
+    {
+      "epoch": 0.8070541584055098,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0001889481905614313,
+      "loss": 0.947,
+      "step": 11601
+    },
+    {
+      "epoch": 0.8071237260426449,
+      "grad_norm": 1.5625,
+      "learning_rate": 0.00018881639713460452,
+      "loss": 0.9287,
+      "step": 11602
+    },
+    {
+      "epoch": 0.8071932936797802,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.00018868464489559257,
+      "loss": 0.5424,
+      "step": 11603
+    },
+    {
+      "epoch": 0.8072628613169154,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.00018855293385108474,
+      "loss": 0.6286,
+      "step": 11604
+    },
+    {
+      "epoch": 0.8073324289540506,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00018842126400776883,
+      "loss": 0.6904,
+      "step": 11605
+    },
+    {
+      "epoch": 0.8074019965911858,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0001882896353723308,
+      "loss": 0.667,
+      "step": 11606
+    },
+    {
+      "epoch": 0.807471564228321,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00018815804795145385,
+      "loss": 0.9498,
+      "step": 11607
+    },
+    {
+      "epoch": 0.8075411318654562,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0001880265017518189,
+      "loss": 0.6469,
+      "step": 11608
+    },
+    {
+      "epoch": 0.8076106995025913,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.00018789499678010548,
+      "loss": 1.0743,
+      "step": 11609
+    },
+    {
+      "epoch": 0.8076802671397266,
+      "grad_norm": 0.95703125,
+      "learning_rate": 0.0001877635330429911,
+      "loss": 0.6298,
+      "step": 11610
+    },
+    {
+      "epoch": 0.8077498347768618,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.00018763211054715034,
+      "loss": 0.7808,
+      "step": 11611
+    },
+    {
+      "epoch": 0.807819402413997,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00018750072929925654,
+      "loss": 0.7542,
+      "step": 11612
+    },
+    {
+      "epoch": 0.8078889700511322,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.00018736938930598047,
+      "loss": 0.8713,
+      "step": 11613
+    },
+    {
+      "epoch": 0.8079585376882674,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00018723809057399066,
+      "loss": 0.8394,
+      "step": 11614
+    },
+    {
+      "epoch": 0.8080281053254026,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.00018710683310995392,
+      "loss": 0.5079,
+      "step": 11615
+    },
+    {
+      "epoch": 0.8080976729625379,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00018697561692053512,
+      "loss": 0.762,
+      "step": 11616
+    },
+    {
+      "epoch": 0.808167240599673,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00018684444201239658,
+      "loss": 0.9277,
+      "step": 11617
+    },
+    {
+      "epoch": 0.8082368082368082,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00018671330839219836,
+      "loss": 0.8453,
+      "step": 11618
+    },
+    {
+      "epoch": 0.8083063758739435,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0001865822160665992,
+      "loss": 0.8398,
+      "step": 11619
+    },
+    {
+      "epoch": 0.8083759435110787,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.00018645116504225536,
+      "loss": 0.8521,
+      "step": 11620
+    },
+    {
+      "epoch": 0.8084455111482138,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0001863201553258207,
+      "loss": 0.8618,
+      "step": 11621
+    },
+    {
+      "epoch": 0.808515078785349,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00018618918692394715,
+      "loss": 0.7516,
+      "step": 11622
+    },
+    {
+      "epoch": 0.8085846464224843,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.00018605825984328473,
+      "loss": 0.8396,
+      "step": 11623
+    },
+    {
+      "epoch": 0.8086542140596195,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00018592737409048156,
+      "loss": 0.7484,
+      "step": 11624
+    },
+    {
+      "epoch": 0.8087237816967546,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.00018579652967218286,
+      "loss": 0.7865,
+      "step": 11625
+    },
+    {
+      "epoch": 0.8087933493338899,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.0001856657265950328,
+      "loss": 0.7438,
+      "step": 11626
+    },
+    {
+      "epoch": 0.8088629169710251,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00018553496486567244,
+      "loss": 0.7413,
+      "step": 11627
+    },
+    {
+      "epoch": 0.8089324846081603,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.00018540424449074123,
+      "loss": 0.7804,
+      "step": 11628
+    },
+    {
+      "epoch": 0.8090020522452955,
+      "grad_norm": 1.25,
+      "learning_rate": 0.00018527356547687657,
+      "loss": 0.8049,
+      "step": 11629
+    },
+    {
+      "epoch": 0.8090716198824307,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00018514292783071407,
+      "loss": 0.5443,
+      "step": 11630
+    },
+    {
+      "epoch": 0.8091411875195659,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0001850123315588864,
+      "loss": 1.0762,
+      "step": 11631
+    },
+    {
+      "epoch": 0.8092107551567012,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00018488177666802454,
+      "loss": 1.1306,
+      "step": 11632
+    },
+    {
+      "epoch": 0.8092803227938363,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.00018475126316475744,
+      "loss": 0.8331,
+      "step": 11633
+    },
+    {
+      "epoch": 0.8093498904309715,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0001846207910557124,
+      "loss": 0.815,
+      "step": 11634
+    },
+    {
+      "epoch": 0.8094194580681067,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00018449036034751375,
+      "loss": 0.7388,
+      "step": 11635
+    },
+    {
+      "epoch": 0.809489025705242,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.00018435997104678382,
+      "loss": 0.784,
+      "step": 11636
+    },
+    {
+      "epoch": 0.8095585933423771,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.00018422962316014347,
+      "loss": 0.7113,
+      "step": 11637
+    },
+    {
+      "epoch": 0.8096281609795123,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00018409931669421132,
+      "loss": 0.7346,
+      "step": 11638
+    },
+    {
+      "epoch": 0.8096977286166476,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.0001839690516556032,
+      "loss": 0.6103,
+      "step": 11639
+    },
+    {
+      "epoch": 0.8097672962537827,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.00018383882805093367,
+      "loss": 0.841,
+      "step": 11640
+    },
+    {
+      "epoch": 0.8098368638909179,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0001837086458868148,
+      "loss": 0.8778,
+      "step": 11641
+    },
+    {
+      "epoch": 0.8099064315280532,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.0001835785051698562,
+      "loss": 0.6279,
+      "step": 11642
+    },
+    {
+      "epoch": 0.8099759991651884,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.00018344840590666612,
+      "loss": 1.0347,
+      "step": 11643
+    },
+    {
+      "epoch": 0.8100455668023235,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0001833183481038504,
+      "loss": 0.8542,
+      "step": 11644
+    },
+    {
+      "epoch": 0.8101151344394588,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00018318833176801265,
+      "loss": 0.6418,
+      "step": 11645
+    },
+    {
+      "epoch": 0.810184702076594,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.00018305835690575413,
+      "loss": 0.7551,
+      "step": 11646
+    },
+    {
+      "epoch": 0.8102542697137292,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.00018292842352367444,
+      "loss": 0.5889,
+      "step": 11647
+    },
+    {
+      "epoch": 0.8103238373508643,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.00018279853162837145,
+      "loss": 1.0018,
+      "step": 11648
+    },
+    {
+      "epoch": 0.8103934049879996,
+      "grad_norm": 1.5234375,
+      "learning_rate": 0.00018266868122643998,
+      "loss": 0.9089,
+      "step": 11649
+    },
+    {
+      "epoch": 0.8104629726251348,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00018253887232447285,
+      "loss": 0.8482,
+      "step": 11650
+    },
+    {
+      "epoch": 0.81053254026227,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0001824091049290616,
+      "loss": 0.8073,
+      "step": 11651
+    },
+    {
+      "epoch": 0.8106021078994052,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.00018227937904679526,
+      "loss": 0.7902,
+      "step": 11652
+    },
+    {
+      "epoch": 0.8106716755365404,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00018214969468426022,
+      "loss": 0.8469,
+      "step": 11653
+    },
+    {
+      "epoch": 0.8107412431736756,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.00018202005184804172,
+      "loss": 0.9903,
+      "step": 11654
+    },
+    {
+      "epoch": 0.8108108108108109,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.00018189045054472163,
+      "loss": 0.8441,
+      "step": 11655
+    },
+    {
+      "epoch": 0.810880378447946,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00018176089078088132,
+      "loss": 0.7733,
+      "step": 11656
+    },
+    {
+      "epoch": 0.8109499460850812,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00018163137256309837,
+      "loss": 0.5121,
+      "step": 11657
+    },
+    {
+      "epoch": 0.8110195137222165,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.00018150189589794975,
+      "loss": 0.7866,
+      "step": 11658
+    },
+    {
+      "epoch": 0.8110890813593516,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0001813724607920093,
+      "loss": 0.7702,
+      "step": 11659
+    },
+    {
+      "epoch": 0.8111586489964868,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00018124306725184858,
+      "loss": 0.703,
+      "step": 11660
+    },
+    {
+      "epoch": 0.811228216633622,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.00018111371528403851,
+      "loss": 0.7809,
+      "step": 11661
+    },
+    {
+      "epoch": 0.8112977842707573,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00018098440489514668,
+      "loss": 0.6439,
+      "step": 11662
+    },
+    {
+      "epoch": 0.8113673519078924,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0001808551360917384,
+      "loss": 0.7882,
+      "step": 11663
+    },
+    {
+      "epoch": 0.8114369195450276,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.00018072590888037744,
+      "loss": 0.9247,
+      "step": 11664
+    },
+    {
+      "epoch": 0.8115064871821629,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00018059672326762533,
+      "loss": 0.7383,
+      "step": 11665
+    },
+    {
+      "epoch": 0.8115760548192981,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.00018046757926004164,
+      "loss": 0.8242,
+      "step": 11666
+    },
+    {
+      "epoch": 0.8116456224564332,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00018033847686418347,
+      "loss": 0.91,
+      "step": 11667
+    },
+    {
+      "epoch": 0.8117151900935685,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00018020941608660614,
+      "loss": 0.7519,
+      "step": 11668
+    },
+    {
+      "epoch": 0.8117847577307037,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.00018008039693386246,
+      "loss": 0.7026,
+      "step": 11669
+    },
+    {
+      "epoch": 0.8118543253678389,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.0001799514194125037,
+      "loss": 0.749,
+      "step": 11670
+    },
+    {
+      "epoch": 0.8119238930049741,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00017982248352907827,
+      "loss": 0.8047,
+      "step": 11671
+    },
+    {
+      "epoch": 0.8119934606421093,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.00017969358929013346,
+      "loss": 0.8779,
+      "step": 11672
+    },
+    {
+      "epoch": 0.8120630282792445,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0001795647367022135,
+      "loss": 0.8413,
+      "step": 11673
+    },
+    {
+      "epoch": 0.8121325959163797,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00017943592577186063,
+      "loss": 0.8145,
+      "step": 11674
+    },
+    {
+      "epoch": 0.8122021635535149,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00017930715650561546,
+      "loss": 0.9092,
+      "step": 11675
+    },
+    {
+      "epoch": 0.8122717311906501,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00017917842891001658,
+      "loss": 0.871,
+      "step": 11676
+    },
+    {
+      "epoch": 0.8123412988277853,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00017904974299159983,
+      "loss": 0.7144,
+      "step": 11677
+    },
+    {
+      "epoch": 0.8124108664649206,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.000178921098756899,
+      "loss": 0.7592,
+      "step": 11678
+    },
+    {
+      "epoch": 0.8124804341020557,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0001787924962124462,
+      "loss": 0.7098,
+      "step": 11679
+    },
+    {
+      "epoch": 0.8125500017391909,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00017866393536477155,
+      "loss": 0.8082,
+      "step": 11680
+    },
+    {
+      "epoch": 0.8126195693763262,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00017853541622040237,
+      "loss": 0.5663,
+      "step": 11681
+    },
+    {
+      "epoch": 0.8126891370134613,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.000178406938785864,
+      "loss": 0.9504,
+      "step": 11682
+    },
+    {
+      "epoch": 0.8127587046505965,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.00017827850306768024,
+      "loss": 0.9961,
+      "step": 11683
+    },
+    {
+      "epoch": 0.8128282722877318,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0001781501090723725,
+      "loss": 0.8448,
+      "step": 11684
+    },
+    {
+      "epoch": 0.812897839924867,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00017802175680645948,
+      "loss": 0.8075,
+      "step": 11685
+    },
+    {
+      "epoch": 0.8129674075620021,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.00017789344627645897,
+      "loss": 0.9241,
+      "step": 11686
+    },
+    {
+      "epoch": 0.8130369751991373,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0001777651774888851,
+      "loss": 0.7554,
+      "step": 11687
+    },
+    {
+      "epoch": 0.8131065428362726,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.00017763695045025152,
+      "loss": 0.8555,
+      "step": 11688
+    },
+    {
+      "epoch": 0.8131761104734078,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00017750876516706837,
+      "loss": 0.7459,
+      "step": 11689
+    },
+    {
+      "epoch": 0.8132456781105429,
+      "grad_norm": 1.375,
+      "learning_rate": 0.00017738062164584457,
+      "loss": 0.5829,
+      "step": 11690
+    },
+    {
+      "epoch": 0.8133152457476782,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.00017725251989308654,
+      "loss": 0.7101,
+      "step": 11691
+    },
+    {
+      "epoch": 0.8133848133848134,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.00017712445991529814,
+      "loss": 0.7135,
+      "step": 11692
+    },
+    {
+      "epoch": 0.8134543810219486,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00017699644171898256,
+      "loss": 0.6507,
+      "step": 11693
+    },
+    {
+      "epoch": 0.8135239486590838,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0001768684653106395,
+      "loss": 0.8359,
+      "step": 11694
+    },
+    {
+      "epoch": 0.813593516296219,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.00017674053069676677,
+      "loss": 0.9942,
+      "step": 11695
+    },
+    {
+      "epoch": 0.8136630839333542,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00017661263788386005,
+      "loss": 0.7005,
+      "step": 11696
+    },
+    {
+      "epoch": 0.8137326515704895,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00017648478687841353,
+      "loss": 0.706,
+      "step": 11697
+    },
+    {
+      "epoch": 0.8138022192076246,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.00017635697768691894,
+      "loss": 0.918,
+      "step": 11698
+    },
+    {
+      "epoch": 0.8138717868447598,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00017622921031586525,
+      "loss": 0.7621,
+      "step": 11699
+    },
+    {
+      "epoch": 0.813941354481895,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00017610148477174037,
+      "loss": 0.7783,
+      "step": 11700
+    },
+    {
+      "epoch": 0.8140109221190303,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00017597380106102923,
+      "loss": 0.7429,
+      "step": 11701
+    },
+    {
+      "epoch": 0.8140804897561654,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0001758461591902152,
+      "loss": 0.9648,
+      "step": 11702
+    },
+    {
+      "epoch": 0.8141500573933006,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00017571855916577895,
+      "loss": 0.9206,
+      "step": 11703
+    },
+    {
+      "epoch": 0.8142196250304359,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0001755910009941998,
+      "loss": 0.9077,
+      "step": 11704
+    },
+    {
+      "epoch": 0.814289192667571,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0001754634846819543,
+      "loss": 0.772,
+      "step": 11705
+    },
+    {
+      "epoch": 0.8143587603047062,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0001753360102355166,
+      "loss": 1.0154,
+      "step": 11706
+    },
+    {
+      "epoch": 0.8144283279418415,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00017520857766136012,
+      "loss": 0.7274,
+      "step": 11707
+    },
+    {
+      "epoch": 0.8144978955789767,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.00017508118696595487,
+      "loss": 0.9585,
+      "step": 11708
+    },
+    {
+      "epoch": 0.8145674632161118,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00017495383815576904,
+      "loss": 0.7493,
+      "step": 11709
+    },
+    {
+      "epoch": 0.8146370308532471,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00017482653123726855,
+      "loss": 0.7809,
+      "step": 11710
+    },
+    {
+      "epoch": 0.8147065984903823,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00017469926621691757,
+      "loss": 0.694,
+      "step": 11711
+    },
+    {
+      "epoch": 0.8147761661275175,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.00017457204310117837,
+      "loss": 0.7324,
+      "step": 11712
+    },
+    {
+      "epoch": 0.8148457337646526,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00017444486189651,
+      "loss": 0.9471,
+      "step": 11713
+    },
+    {
+      "epoch": 0.8149153014017879,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.00017431772260937073,
+      "loss": 0.8573,
+      "step": 11714
+    },
+    {
+      "epoch": 0.8149848690389231,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.00017419062524621544,
+      "loss": 0.904,
+      "step": 11715
+    },
+    {
+      "epoch": 0.8150544366760583,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.00017406356981349813,
+      "loss": 0.8472,
+      "step": 11716
+    },
+    {
+      "epoch": 0.8151240043131935,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.00017393655631766947,
+      "loss": 0.5351,
+      "step": 11717
+    },
+    {
+      "epoch": 0.8151935719503287,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00017380958476517904,
+      "loss": 0.8369,
+      "step": 11718
+    },
+    {
+      "epoch": 0.8152631395874639,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.00017368265516247338,
+      "loss": 0.7326,
+      "step": 11719
+    },
+    {
+      "epoch": 0.8153327072245992,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.00017355576751599744,
+      "loss": 0.8288,
+      "step": 11720
+    },
+    {
+      "epoch": 0.8154022748617343,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0001734289218321944,
+      "loss": 0.7971,
+      "step": 11721
+    },
+    {
+      "epoch": 0.8154718424988695,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.0001733021181175044,
+      "loss": 1.0176,
+      "step": 11722
+    },
+    {
+      "epoch": 0.8155414101360048,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.00017317535637836602,
+      "loss": 0.7279,
+      "step": 11723
+    },
+    {
+      "epoch": 0.81561097777314,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.00017304863662121527,
+      "loss": 1.0087,
+      "step": 11724
+    },
+    {
+      "epoch": 0.8156805454102751,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00017292195885248662,
+      "loss": 0.5907,
+      "step": 11725
+    },
+    {
+      "epoch": 0.8157501130474103,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.00017279532307861245,
+      "loss": 0.9269,
+      "step": 11726
+    },
+    {
+      "epoch": 0.8158196806845456,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00017266872930602197,
+      "loss": 0.7344,
+      "step": 11727
+    },
+    {
+      "epoch": 0.8158892483216807,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00017254217754114365,
+      "loss": 0.7148,
+      "step": 11728
+    },
+    {
+      "epoch": 0.8159588159588159,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00017241566779040263,
+      "loss": 0.7071,
+      "step": 11729
+    },
+    {
+      "epoch": 0.8160283835959512,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00017228920006022287,
+      "loss": 1.045,
+      "step": 11730
+    },
+    {
+      "epoch": 0.8160979512330864,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.00017216277435702542,
+      "loss": 0.9198,
+      "step": 11731
+    },
+    {
+      "epoch": 0.8161675188702215,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00017203639068722975,
+      "loss": 0.7394,
+      "step": 11732
+    },
+    {
+      "epoch": 0.8162370865073568,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00017191004905725283,
+      "loss": 0.8574,
+      "step": 11733
+    },
+    {
+      "epoch": 0.816306654144492,
+      "grad_norm": 0.8203125,
+      "learning_rate": 0.0001717837494735097,
+      "loss": 0.6116,
+      "step": 11734
+    },
+    {
+      "epoch": 0.8163762217816272,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00017165749194241343,
+      "loss": 0.6934,
+      "step": 11735
+    },
+    {
+      "epoch": 0.8164457894187624,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00017153127647037458,
+      "loss": 0.7483,
+      "step": 11736
+    },
+    {
+      "epoch": 0.8165153570558976,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00017140510306380176,
+      "loss": 0.8049,
+      "step": 11737
+    },
+    {
+      "epoch": 0.8165849246930328,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0001712789717291009,
+      "loss": 0.9112,
+      "step": 11738
+    },
+    {
+      "epoch": 0.816654492330168,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00017115288247267725,
+      "loss": 0.868,
+      "step": 11739
+    },
+    {
+      "epoch": 0.8167240599673032,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.00017102683530093255,
+      "loss": 0.7093,
+      "step": 11740
+    },
+    {
+      "epoch": 0.8167936276044384,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0001709008302202666,
+      "loss": 0.8052,
+      "step": 11741
+    },
+    {
+      "epoch": 0.8168631952415736,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0001707748672370777,
+      "loss": 0.9135,
+      "step": 11742
+    },
+    {
+      "epoch": 0.8169327628787089,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.00017064894635776117,
+      "loss": 0.7799,
+      "step": 11743
+    },
+    {
+      "epoch": 0.817002330515844,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.00017052306758871127,
+      "loss": 0.9484,
+      "step": 11744
+    },
+    {
+      "epoch": 0.8170718981529792,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00017039723093631876,
+      "loss": 0.8796,
+      "step": 11745
+    },
+    {
+      "epoch": 0.8171414657901145,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00017027143640697362,
+      "loss": 0.6473,
+      "step": 11746
+    },
+    {
+      "epoch": 0.8172110334272497,
+      "grad_norm": 1.25,
+      "learning_rate": 0.00017014568400706265,
+      "loss": 0.7669,
+      "step": 11747
+    },
+    {
+      "epoch": 0.8172806010643848,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.00017001997374297095,
+      "loss": 0.9723,
+      "step": 11748
+    },
+    {
+      "epoch": 0.8173501687015201,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00016989430562108188,
+      "loss": 0.7161,
+      "step": 11749
+    },
+    {
+      "epoch": 0.8174197363386553,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00016976867964777598,
+      "loss": 0.894,
+      "step": 11750
+    },
+    {
+      "epoch": 0.8174893039757904,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0001696430958294315,
+      "loss": 0.7189,
+      "step": 11751
+    },
+    {
+      "epoch": 0.8175588716129256,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.0001695175541724253,
+      "loss": 0.8627,
+      "step": 11752
+    },
+    {
+      "epoch": 0.8176284392500609,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.00016939205468313213,
+      "loss": 0.9201,
+      "step": 11753
+    },
+    {
+      "epoch": 0.8176980068871961,
+      "grad_norm": 0.76953125,
+      "learning_rate": 0.0001692665973679237,
+      "loss": 0.4185,
+      "step": 11754
+    },
+    {
+      "epoch": 0.8177675745243312,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00016914118223317033,
+      "loss": 0.7814,
+      "step": 11755
+    },
+    {
+      "epoch": 0.8178371421614665,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.00016901580928523963,
+      "loss": 0.5873,
+      "step": 11756
+    },
+    {
+      "epoch": 0.8179067097986017,
+      "grad_norm": 1.5546875,
+      "learning_rate": 0.00016889047853049766,
+      "loss": 0.9774,
+      "step": 11757
+    },
+    {
+      "epoch": 0.8179762774357369,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.00016876518997530843,
+      "loss": 0.6748,
+      "step": 11758
+    },
+    {
+      "epoch": 0.8180458450728721,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00016863994362603275,
+      "loss": 0.7434,
+      "step": 11759
+    },
+    {
+      "epoch": 0.8181154127100073,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.00016851473948903062,
+      "loss": 0.9599,
+      "step": 11760
+    },
+    {
+      "epoch": 0.8181849803471425,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00016838957757065877,
+      "loss": 0.6192,
+      "step": 11761
+    },
+    {
+      "epoch": 0.8182545479842778,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00016826445787727285,
+      "loss": 0.7509,
+      "step": 11762
+    },
+    {
+      "epoch": 0.8183241156214129,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00016813938041522526,
+      "loss": 0.7313,
+      "step": 11763
+    },
+    {
+      "epoch": 0.8183936832585481,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00016801434519086723,
+      "loss": 0.8128,
+      "step": 11764
+    },
+    {
+      "epoch": 0.8184632508956833,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00016788935221054703,
+      "loss": 0.9444,
+      "step": 11765
+    },
+    {
+      "epoch": 0.8185328185328186,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00016776440148061133,
+      "loss": 0.9276,
+      "step": 11766
+    },
+    {
+      "epoch": 0.8186023861699537,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.0001676394930074049,
+      "loss": 0.6666,
+      "step": 11767
+    },
+    {
+      "epoch": 0.8186719538070889,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.00016751462679726948,
+      "loss": 0.7016,
+      "step": 11768
+    },
+    {
+      "epoch": 0.8187415214442242,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.00016738980285654537,
+      "loss": 0.8631,
+      "step": 11769
+    },
+    {
+      "epoch": 0.8188110890813594,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00016726502119156984,
+      "loss": 0.7953,
+      "step": 11770
+    },
+    {
+      "epoch": 0.8188806567184945,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0001671402818086797,
+      "loss": 0.8171,
+      "step": 11771
+    },
+    {
+      "epoch": 0.8189502243556298,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0001670155847142082,
+      "loss": 0.5757,
+      "step": 11772
+    },
+    {
+      "epoch": 0.819019791992765,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0001668909299144865,
+      "loss": 0.9231,
+      "step": 11773
+    },
+    {
+      "epoch": 0.8190893596299001,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.00016676631741584447,
+      "loss": 0.7563,
+      "step": 11774
+    },
+    {
+      "epoch": 0.8191589272670354,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00016664174722460866,
+      "loss": 0.6916,
+      "step": 11775
+    },
+    {
+      "epoch": 0.8192284949041706,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00016651721934710483,
+      "loss": 0.7551,
+      "step": 11776
+    },
+    {
+      "epoch": 0.8192980625413058,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.00016639273378965536,
+      "loss": 0.697,
+      "step": 11777
+    },
+    {
+      "epoch": 0.8193676301784409,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.00016626829055858128,
+      "loss": 0.9492,
+      "step": 11778
+    },
+    {
+      "epoch": 0.8194371978155762,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.000166143889660201,
+      "loss": 0.6445,
+      "step": 11779
+    },
+    {
+      "epoch": 0.8195067654527114,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0001660195311008309,
+      "loss": 0.8859,
+      "step": 11780
+    },
+    {
+      "epoch": 0.8195763330898466,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.00016589521488678582,
+      "loss": 0.8174,
+      "step": 11781
+    },
+    {
+      "epoch": 0.8196459007269818,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.0001657709410243774,
+      "loss": 0.6393,
+      "step": 11782
+    },
+    {
+      "epoch": 0.819715468364117,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.00016564670951991556,
+      "loss": 0.9107,
+      "step": 11783
+    },
+    {
+      "epoch": 0.8197850360012522,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.00016552252037970838,
+      "loss": 0.965,
+      "step": 11784
+    },
+    {
+      "epoch": 0.8198546036383875,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.00016539837361006184,
+      "loss": 0.6285,
+      "step": 11785
+    },
+    {
+      "epoch": 0.8199241712755226,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00016527426921727917,
+      "loss": 0.5821,
+      "step": 11786
+    },
+    {
+      "epoch": 0.8199937389126578,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.00016515020720766149,
+      "loss": 0.7352,
+      "step": 11787
+    },
+    {
+      "epoch": 0.820063306549793,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00016502618758750854,
+      "loss": 0.9191,
+      "step": 11788
+    },
+    {
+      "epoch": 0.8201328741869283,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00016490221036311704,
+      "loss": 0.8733,
+      "step": 11789
+    },
+    {
+      "epoch": 0.8202024418240634,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.00016477827554078228,
+      "loss": 0.9191,
+      "step": 11790
+    },
+    {
+      "epoch": 0.8202720094611986,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0001646543831267966,
+      "loss": 0.7753,
+      "step": 11791
+    },
+    {
+      "epoch": 0.8203415770983339,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.00016453053312745115,
+      "loss": 0.689,
+      "step": 11792
+    },
+    {
+      "epoch": 0.8204111447354691,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0001644067255490339,
+      "loss": 0.8691,
+      "step": 11793
+    },
+    {
+      "epoch": 0.8204807123726042,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.00016428296039783152,
+      "loss": 0.6144,
+      "step": 11794
+    },
+    {
+      "epoch": 0.8205502800097395,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0001641592376801282,
+      "loss": 1.0364,
+      "step": 11795
+    },
+    {
+      "epoch": 0.8206198476468747,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.0001640355574022059,
+      "loss": 0.694,
+      "step": 11796
+    },
+    {
+      "epoch": 0.8206894152840098,
+      "grad_norm": 1.5703125,
+      "learning_rate": 0.00016391191957034422,
+      "loss": 0.8732,
+      "step": 11797
+    },
+    {
+      "epoch": 0.8207589829211451,
+      "grad_norm": 1.515625,
+      "learning_rate": 0.00016378832419082102,
+      "loss": 0.9643,
+      "step": 11798
+    },
+    {
+      "epoch": 0.8208285505582803,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.00016366477126991208,
+      "loss": 0.8182,
+      "step": 11799
+    },
+    {
+      "epoch": 0.8208981181954155,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.00016354126081389076,
+      "loss": 0.705,
+      "step": 11800
+    },
+    {
+      "epoch": 0.8209676858325506,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0001634177928290278,
+      "loss": 0.7874,
+      "step": 11801
+    },
+    {
+      "epoch": 0.8210372534696859,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00016329436732159263,
+      "loss": 0.5235,
+      "step": 11802
+    },
+    {
+      "epoch": 0.8211068211068211,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.00016317098429785248,
+      "loss": 0.6098,
+      "step": 11803
+    },
+    {
+      "epoch": 0.8211763887439563,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.00016304764376407177,
+      "loss": 0.7253,
+      "step": 11804
+    },
+    {
+      "epoch": 0.8212459563810915,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00016292434572651293,
+      "loss": 0.6622,
+      "step": 11805
+    },
+    {
+      "epoch": 0.8213155240182267,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00016280109019143685,
+      "loss": 0.5838,
+      "step": 11806
+    },
+    {
+      "epoch": 0.8213850916553619,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00016267787716510142,
+      "loss": 0.7913,
+      "step": 11807
+    },
+    {
+      "epoch": 0.8214546592924972,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.00016255470665376304,
+      "loss": 0.6731,
+      "step": 11808
+    },
+    {
+      "epoch": 0.8215242269296323,
+      "grad_norm": 1.546875,
+      "learning_rate": 0.00016243157866367575,
+      "loss": 1.1183,
+      "step": 11809
+    },
+    {
+      "epoch": 0.8215937945667675,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0001623084932010912,
+      "loss": 0.8298,
+      "step": 11810
+    },
+    {
+      "epoch": 0.8216633622039028,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.00016218545027225895,
+      "loss": 0.7191,
+      "step": 11811
+    },
+    {
+      "epoch": 0.821732929841038,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00016206244988342666,
+      "loss": 0.7012,
+      "step": 11812
+    },
+    {
+      "epoch": 0.8218024974781731,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0001619394920408398,
+      "loss": 0.8181,
+      "step": 11813
+    },
+    {
+      "epoch": 0.8218720651153083,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.00016181657675074147,
+      "loss": 0.6448,
+      "step": 11814
+    },
+    {
+      "epoch": 0.8219416327524436,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.00016169370401937223,
+      "loss": 0.7898,
+      "step": 11815
+    },
+    {
+      "epoch": 0.8220112003895788,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00016157087385297142,
+      "loss": 0.797,
+      "step": 11816
+    },
+    {
+      "epoch": 0.8220807680267139,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00016144808625777595,
+      "loss": 0.7012,
+      "step": 11817
+    },
+    {
+      "epoch": 0.8221503356638492,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.00016132534124001997,
+      "loss": 0.7456,
+      "step": 11818
+    },
+    {
+      "epoch": 0.8222199033009844,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.00016120263880593566,
+      "loss": 0.7268,
+      "step": 11819
+    },
+    {
+      "epoch": 0.8222894709381195,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00016107997896175374,
+      "loss": 0.7633,
+      "step": 11820
+    },
+    {
+      "epoch": 0.8223590385752548,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0001609573617137019,
+      "loss": 0.8158,
+      "step": 11821
+    },
+    {
+      "epoch": 0.82242860621239,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00016083478706800604,
+      "loss": 0.6253,
+      "step": 11822
+    },
+    {
+      "epoch": 0.8224981738495252,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00016071225503089026,
+      "loss": 0.5911,
+      "step": 11823
+    },
+    {
+      "epoch": 0.8225677414866605,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00016058976560857574,
+      "loss": 0.5473,
+      "step": 11824
+    },
+    {
+      "epoch": 0.8226373091237956,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00016046731880728184,
+      "loss": 0.6328,
+      "step": 11825
+    },
+    {
+      "epoch": 0.8227068767609308,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.000160344914633226,
+      "loss": 0.8415,
+      "step": 11826
+    },
+    {
+      "epoch": 0.822776444398066,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00016022255309262334,
+      "loss": 0.8935,
+      "step": 11827
+    },
+    {
+      "epoch": 0.8228460120352012,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00016010023419168673,
+      "loss": 0.7528,
+      "step": 11828
+    },
+    {
+      "epoch": 0.8229155796723364,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0001599779579366265,
+      "loss": 0.7167,
+      "step": 11829
+    },
+    {
+      "epoch": 0.8229851473094716,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.00015985572433365158,
+      "loss": 0.9432,
+      "step": 11830
+    },
+    {
+      "epoch": 0.8230547149466069,
+      "grad_norm": 1.4765625,
+      "learning_rate": 0.00015973353338896856,
+      "loss": 0.9314,
+      "step": 11831
+    },
+    {
+      "epoch": 0.823124282583742,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0001596113851087815,
+      "loss": 0.818,
+      "step": 11832
+    },
+    {
+      "epoch": 0.8231938502208772,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.00015948927949929216,
+      "loss": 0.7425,
+      "step": 11833
+    },
+    {
+      "epoch": 0.8232634178580125,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0001593672165667007,
+      "loss": 0.6268,
+      "step": 11834
+    },
+    {
+      "epoch": 0.8233329854951477,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00015924519631720514,
+      "loss": 0.8678,
+      "step": 11835
+    },
+    {
+      "epoch": 0.8234025531322828,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.00015912321875700074,
+      "loss": 1.029,
+      "step": 11836
+    },
+    {
+      "epoch": 0.8234721207694181,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00015900128389228086,
+      "loss": 0.8239,
+      "step": 11837
+    },
+    {
+      "epoch": 0.8235416884065533,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.00015887939172923692,
+      "loss": 0.6994,
+      "step": 11838
+    },
+    {
+      "epoch": 0.8236112560436885,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0001587575422740578,
+      "loss": 0.5914,
+      "step": 11839
+    },
+    {
+      "epoch": 0.8236808236808236,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.00015863573553293042,
+      "loss": 0.7486,
+      "step": 11840
+    },
+    {
+      "epoch": 0.8237503913179589,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00015851397151203983,
+      "loss": 0.7262,
+      "step": 11841
+    },
+    {
+      "epoch": 0.8238199589550941,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.0001583922502175684,
+      "loss": 0.7706,
+      "step": 11842
+    },
+    {
+      "epoch": 0.8238895265922292,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00015827057165569624,
+      "loss": 0.7025,
+      "step": 11843
+    },
+    {
+      "epoch": 0.8239590942293645,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.0001581489358326018,
+      "loss": 0.7226,
+      "step": 11844
+    },
+    {
+      "epoch": 0.8240286618664997,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.0001580273427544614,
+      "loss": 0.5513,
+      "step": 11845
+    },
+    {
+      "epoch": 0.8240982295036349,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00015790579242744873,
+      "loss": 0.8411,
+      "step": 11846
+    },
+    {
+      "epoch": 0.8241677971407702,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00015778428485773522,
+      "loss": 0.8452,
+      "step": 11847
+    },
+    {
+      "epoch": 0.8242373647779053,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00015766282005149056,
+      "loss": 0.6987,
+      "step": 11848
+    },
+    {
+      "epoch": 0.8243069324150405,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.00015754139801488256,
+      "loss": 0.9424,
+      "step": 11849
+    },
+    {
+      "epoch": 0.8243765000521758,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00015742001875407598,
+      "loss": 0.6831,
+      "step": 11850
+    },
+    {
+      "epoch": 0.824446067689311,
+      "grad_norm": 0.7109375,
+      "learning_rate": 0.0001572986822752336,
+      "loss": 0.6033,
+      "step": 11851
+    },
+    {
+      "epoch": 0.8245156353264461,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.000157177388584517,
+      "loss": 0.9047,
+      "step": 11852
+    },
+    {
+      "epoch": 0.8245852029635813,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00015705613768808414,
+      "loss": 0.8724,
+      "step": 11853
+    },
+    {
+      "epoch": 0.8246547706007166,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00015693492959209187,
+      "loss": 0.8429,
+      "step": 11854
+    },
+    {
+      "epoch": 0.8247243382378517,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.0001568137643026948,
+      "loss": 0.6627,
+      "step": 11855
+    },
+    {
+      "epoch": 0.8247939058749869,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0001566926418260447,
+      "loss": 0.8518,
+      "step": 11856
+    },
+    {
+      "epoch": 0.8248634735121222,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00015657156216829148,
+      "loss": 0.938,
+      "step": 11857
+    },
+    {
+      "epoch": 0.8249330411492574,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.00015645052533558323,
+      "loss": 0.7418,
+      "step": 11858
+    },
+    {
+      "epoch": 0.8250026087863925,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0001563295313340657,
+      "loss": 0.7214,
+      "step": 11859
+    },
+    {
+      "epoch": 0.8250721764235278,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00015620858016988205,
+      "loss": 0.857,
+      "step": 11860
+    },
+    {
+      "epoch": 0.825141744060663,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.0001560876718491735,
+      "loss": 0.5527,
+      "step": 11861
+    },
+    {
+      "epoch": 0.8252113116977982,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.00015596680637807936,
+      "loss": 0.8925,
+      "step": 11862
+    },
+    {
+      "epoch": 0.8252808793349334,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.00015584598376273674,
+      "loss": 0.6308,
+      "step": 11863
+    },
+    {
+      "epoch": 0.8253504469720686,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00015572520400928026,
+      "loss": 0.7314,
+      "step": 11864
+    },
+    {
+      "epoch": 0.8254200146092038,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00015560446712384223,
+      "loss": 0.7527,
+      "step": 11865
+    },
+    {
+      "epoch": 0.825489582246339,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00015548377311255324,
+      "loss": 0.6309,
+      "step": 11866
+    },
+    {
+      "epoch": 0.8255591498834742,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.0001553631219815419,
+      "loss": 0.4498,
+      "step": 11867
+    },
+    {
+      "epoch": 0.8256287175206094,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00015524251373693354,
+      "loss": 0.7638,
+      "step": 11868
+    },
+    {
+      "epoch": 0.8256982851577446,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.00015512194838485284,
+      "loss": 0.846,
+      "step": 11869
+    },
+    {
+      "epoch": 0.8257678527948799,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.0001550014259314211,
+      "loss": 0.983,
+      "step": 11870
+    },
+    {
+      "epoch": 0.825837420432015,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.00015488094638275751,
+      "loss": 0.7911,
+      "step": 11871
+    },
+    {
+      "epoch": 0.8259069880691502,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0001547605097449798,
+      "loss": 0.9909,
+      "step": 11872
+    },
+    {
+      "epoch": 0.8259765557062855,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00015464011602420324,
+      "loss": 0.8125,
+      "step": 11873
+    },
+    {
+      "epoch": 0.8260461233434206,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00015451976522654076,
+      "loss": 0.9632,
+      "step": 11874
+    },
+    {
+      "epoch": 0.8261156909805558,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0001543994573581028,
+      "loss": 0.7242,
+      "step": 11875
+    },
+    {
+      "epoch": 0.8261852586176911,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00015427919242499822,
+      "loss": 0.5407,
+      "step": 11876
+    },
+    {
+      "epoch": 0.8262548262548263,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0001541589704333337,
+      "loss": 0.798,
+      "step": 11877
+    },
+    {
+      "epoch": 0.8263243938919614,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0001540387913892134,
+      "loss": 0.8822,
+      "step": 11878
+    },
+    {
+      "epoch": 0.8263939615290966,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00015391865529873906,
+      "loss": 0.7547,
+      "step": 11879
+    },
+    {
+      "epoch": 0.8264635291662319,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0001537985621680108,
+      "loss": 0.7576,
+      "step": 11880
+    },
+    {
+      "epoch": 0.8265330968033671,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00015367851200312666,
+      "loss": 0.6437,
+      "step": 11881
+    },
+    {
+      "epoch": 0.8266026644405022,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.00015355850481018162,
+      "loss": 0.859,
+      "step": 11882
+    },
+    {
+      "epoch": 0.8266722320776375,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00015343854059526952,
+      "loss": 0.703,
+      "step": 11883
+    },
+    {
+      "epoch": 0.8267417997147727,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00015331861936448144,
+      "loss": 0.7281,
+      "step": 11884
+    },
+    {
+      "epoch": 0.8268113673519079,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00015319874112390598,
+      "loss": 0.8862,
+      "step": 11885
+    },
+    {
+      "epoch": 0.8268809349890431,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.00015307890587963036,
+      "loss": 0.8087,
+      "step": 11886
+    },
+    {
+      "epoch": 0.8269505026261783,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00015295911363773918,
+      "loss": 0.7998,
+      "step": 11887
+    },
+    {
+      "epoch": 0.8270200702633135,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0001528393644043149,
+      "loss": 0.9382,
+      "step": 11888
+    },
+    {
+      "epoch": 0.8270896379004488,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00015271965818543744,
+      "loss": 0.8733,
+      "step": 11889
+    },
+    {
+      "epoch": 0.8271592055375839,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.00015259999498718513,
+      "loss": 0.8205,
+      "step": 11890
+    },
+    {
+      "epoch": 0.8272287731747191,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00015248037481563415,
+      "loss": 0.7617,
+      "step": 11891
+    },
+    {
+      "epoch": 0.8272983408118543,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00015236079767685785,
+      "loss": 0.6772,
+      "step": 11892
+    },
+    {
+      "epoch": 0.8273679084489896,
+      "grad_norm": 0.77734375,
+      "learning_rate": 0.00015224126357692757,
+      "loss": 0.4945,
+      "step": 11893
+    },
+    {
+      "epoch": 0.8274374760861247,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00015212177252191294,
+      "loss": 0.8874,
+      "step": 11894
+    },
+    {
+      "epoch": 0.8275070437232599,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.00015200232451788133,
+      "loss": 0.8229,
+      "step": 11895
+    },
+    {
+      "epoch": 0.8275766113603952,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00015188291957089718,
+      "loss": 0.6975,
+      "step": 11896
+    },
+    {
+      "epoch": 0.8276461789975303,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00015176355768702388,
+      "loss": 0.6228,
+      "step": 11897
+    },
+    {
+      "epoch": 0.8277157466346655,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0001516442388723216,
+      "loss": 0.8128,
+      "step": 11898
+    },
+    {
+      "epoch": 0.8277853142718008,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.0001515249631328486,
+      "loss": 1.1627,
+      "step": 11899
+    },
+    {
+      "epoch": 0.827854881908936,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.00015140573047466133,
+      "loss": 0.6686,
+      "step": 11900
+    },
+    {
+      "epoch": 0.8279244495460711,
+      "grad_norm": 1.25,
+      "learning_rate": 0.0001512865409038141,
+      "loss": 1.0271,
+      "step": 11901
+    },
+    {
+      "epoch": 0.8279940171832064,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00015116739442635853,
+      "loss": 0.8546,
+      "step": 11902
+    },
+    {
+      "epoch": 0.8280635848203416,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.00015104829104834394,
+      "loss": 0.7779,
+      "step": 11903
+    },
+    {
+      "epoch": 0.8281331524574768,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0001509292307758181,
+      "loss": 0.7168,
+      "step": 11904
+    },
+    {
+      "epoch": 0.8282027200946119,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.00015081021361482662,
+      "loss": 0.8175,
+      "step": 11905
+    },
+    {
+      "epoch": 0.8282722877317472,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00015069123957141219,
+      "loss": 0.8798,
+      "step": 11906
+    },
+    {
+      "epoch": 0.8283418553688824,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.00015057230865161552,
+      "loss": 1.0727,
+      "step": 11907
+    },
+    {
+      "epoch": 0.8284114230060176,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00015045342086147562,
+      "loss": 1.0258,
+      "step": 11908
+    },
+    {
+      "epoch": 0.8284809906431528,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00015033457620702918,
+      "loss": 0.9374,
+      "step": 11909
+    },
+    {
+      "epoch": 0.828550558280288,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.00015021577469431037,
+      "loss": 0.7597,
+      "step": 11910
+    },
+    {
+      "epoch": 0.8286201259174232,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.00015009701632935103,
+      "loss": 0.9634,
+      "step": 11911
+    },
+    {
+      "epoch": 0.8286896935545585,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00014997830111818133,
+      "loss": 0.9563,
+      "step": 11912
+    },
+    {
+      "epoch": 0.8287592611916936,
+      "grad_norm": 1.5859375,
+      "learning_rate": 0.00014985962906682938,
+      "loss": 0.5603,
+      "step": 11913
+    },
+    {
+      "epoch": 0.8288288288288288,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00014974100018132018,
+      "loss": 0.818,
+      "step": 11914
+    },
+    {
+      "epoch": 0.8288983964659641,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00014962241446767765,
+      "loss": 0.9945,
+      "step": 11915
+    },
+    {
+      "epoch": 0.8289679641030993,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0001495038719319226,
+      "loss": 0.8931,
+      "step": 11916
+    },
+    {
+      "epoch": 0.8290375317402344,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0001493853725800739,
+      "loss": 0.7958,
+      "step": 11917
+    },
+    {
+      "epoch": 0.8291070993773696,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0001492669164181486,
+      "loss": 0.9674,
+      "step": 11918
+    },
+    {
+      "epoch": 0.8291766670145049,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00014914850345216146,
+      "loss": 0.8706,
+      "step": 11919
+    },
+    {
+      "epoch": 0.82924623465164,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.00014903013368812478,
+      "loss": 0.7766,
+      "step": 11920
+    },
+    {
+      "epoch": 0.8293158022887752,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.00014891180713204845,
+      "loss": 1.0476,
+      "step": 11921
+    },
+    {
+      "epoch": 0.8293853699259105,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0001487935237899407,
+      "loss": 0.9776,
+      "step": 11922
+    },
+    {
+      "epoch": 0.8294549375630457,
+      "grad_norm": 1.375,
+      "learning_rate": 0.0001486752836678077,
+      "loss": 0.7633,
+      "step": 11923
+    },
+    {
+      "epoch": 0.8295245052001808,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0001485570867716528,
+      "loss": 0.9992,
+      "step": 11924
+    },
+    {
+      "epoch": 0.8295940728373161,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00014843893310747714,
+      "loss": 0.8413,
+      "step": 11925
+    },
+    {
+      "epoch": 0.8296636404744513,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00014832082268128032,
+      "loss": 0.8898,
+      "step": 11926
+    },
+    {
+      "epoch": 0.8297332081115865,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.00014820275549905958,
+      "loss": 0.7373,
+      "step": 11927
+    },
+    {
+      "epoch": 0.8298027757487217,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00014808473156680934,
+      "loss": 0.8684,
+      "step": 11928
+    },
+    {
+      "epoch": 0.8298723433858569,
+      "grad_norm": 1.546875,
+      "learning_rate": 0.0001479667508905227,
+      "loss": 1.0323,
+      "step": 11929
+    },
+    {
+      "epoch": 0.8299419110229921,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00014784881347618985,
+      "loss": 1.0172,
+      "step": 11930
+    },
+    {
+      "epoch": 0.8300114786601273,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00014773091932979886,
+      "loss": 0.8575,
+      "step": 11931
+    },
+    {
+      "epoch": 0.8300810462972625,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00014761306845733602,
+      "loss": 0.7712,
+      "step": 11932
+    },
+    {
+      "epoch": 0.8301506139343977,
+      "grad_norm": 1.6328125,
+      "learning_rate": 0.00014749526086478538,
+      "loss": 1.0773,
+      "step": 11933
+    },
+    {
+      "epoch": 0.8302201815715329,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0001473774965581286,
+      "loss": 0.836,
+      "step": 11934
+    },
+    {
+      "epoch": 0.8302897492086682,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0001472597755433447,
+      "loss": 0.9085,
+      "step": 11935
+    },
+    {
+      "epoch": 0.8303593168458033,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0001471420978264112,
+      "loss": 1.0149,
+      "step": 11936
+    },
+    {
+      "epoch": 0.8304288844829385,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00014702446341330355,
+      "loss": 0.6521,
+      "step": 11937
+    },
+    {
+      "epoch": 0.8304984521200738,
+      "grad_norm": 0.875,
+      "learning_rate": 0.00014690687230999434,
+      "loss": 0.6956,
+      "step": 11938
+    },
+    {
+      "epoch": 0.830568019757209,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00014678932452245397,
+      "loss": 0.8258,
+      "step": 11939
+    },
+    {
+      "epoch": 0.8306375873943441,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00014667182005665124,
+      "loss": 0.6902,
+      "step": 11940
+    },
+    {
+      "epoch": 0.8307071550314794,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.00014655435891855261,
+      "loss": 0.9419,
+      "step": 11941
+    },
+    {
+      "epoch": 0.8307767226686146,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00014643694111412175,
+      "loss": 0.5936,
+      "step": 11942
+    },
+    {
+      "epoch": 0.8308462903057497,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.000146319566649321,
+      "loss": 1.076,
+      "step": 11943
+    },
+    {
+      "epoch": 0.8309158579428849,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00014620223553010947,
+      "loss": 0.9286,
+      "step": 11944
+    },
+    {
+      "epoch": 0.8309854255800202,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00014608494776244529,
+      "loss": 0.7935,
+      "step": 11945
+    },
+    {
+      "epoch": 0.8310549932171554,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00014596770335228315,
+      "loss": 0.9173,
+      "step": 11946
+    },
+    {
+      "epoch": 0.8311245608542905,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0001458505023055765,
+      "loss": 0.8054,
+      "step": 11947
+    },
+    {
+      "epoch": 0.8311941284914258,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00014573334462827624,
+      "loss": 0.8516,
+      "step": 11948
+    },
+    {
+      "epoch": 0.831263696128561,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00014561623032633065,
+      "loss": 1.006,
+      "step": 11949
+    },
+    {
+      "epoch": 0.8313332637656962,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.00014549915940568648,
+      "loss": 0.5378,
+      "step": 11950
+    },
+    {
+      "epoch": 0.8314028314028314,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0001453821318722882,
+      "loss": 0.8296,
+      "step": 11951
+    },
+    {
+      "epoch": 0.8314723990399666,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.00014526514773207776,
+      "loss": 0.8133,
+      "step": 11952
+    },
+    {
+      "epoch": 0.8315419666771018,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00014514820699099463,
+      "loss": 0.7953,
+      "step": 11953
+    },
+    {
+      "epoch": 0.8316115343142371,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0001450313096549768,
+      "loss": 0.8305,
+      "step": 11954
+    },
+    {
+      "epoch": 0.8316811019513722,
+      "grad_norm": 1.25,
+      "learning_rate": 0.00014491445572995988,
+      "loss": 0.968,
+      "step": 11955
+    },
+    {
+      "epoch": 0.8317506695885074,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00014479764522187677,
+      "loss": 0.6758,
+      "step": 11956
+    },
+    {
+      "epoch": 0.8318202372256426,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.00014468087813665888,
+      "loss": 0.7468,
+      "step": 11957
+    },
+    {
+      "epoch": 0.8318898048627779,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.00014456415448023464,
+      "loss": 0.9035,
+      "step": 11958
+    },
+    {
+      "epoch": 0.831959372499913,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00014444747425853123,
+      "loss": 0.6326,
+      "step": 11959
+    },
+    {
+      "epoch": 0.8320289401370482,
+      "grad_norm": 0.69921875,
+      "learning_rate": 0.00014433083747747243,
+      "loss": 0.4847,
+      "step": 11960
+    },
+    {
+      "epoch": 0.8320985077741835,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00014421424414298113,
+      "loss": 0.8069,
+      "step": 11961
+    },
+    {
+      "epoch": 0.8321680754113187,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00014409769426097695,
+      "loss": 0.7904,
+      "step": 11962
+    },
+    {
+      "epoch": 0.8322376430484538,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00014398118783737746,
+      "loss": 0.8436,
+      "step": 11963
+    },
+    {
+      "epoch": 0.8323072106855891,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.00014386472487809898,
+      "loss": 0.6022,
+      "step": 11964
+    },
+    {
+      "epoch": 0.8323767783227243,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.00014374830538905448,
+      "loss": 0.9282,
+      "step": 11965
+    },
+    {
+      "epoch": 0.8324463459598594,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.0001436319293761552,
+      "loss": 0.7943,
+      "step": 11966
+    },
+    {
+      "epoch": 0.8325159135969947,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.00014351559684531,
+      "loss": 0.5992,
+      "step": 11967
+    },
+    {
+      "epoch": 0.8325854812341299,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00014339930780242572,
+      "loss": 0.7814,
+      "step": 11968
+    },
+    {
+      "epoch": 0.8326550488712651,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00014328306225340725,
+      "loss": 0.823,
+      "step": 11969
+    },
+    {
+      "epoch": 0.8327246165084002,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00014316686020415649,
+      "loss": 0.8234,
+      "step": 11970
+    },
+    {
+      "epoch": 0.8327941841455355,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0001430507016605741,
+      "loss": 0.554,
+      "step": 11971
+    },
+    {
+      "epoch": 0.8328637517826707,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.00014293458662855741,
+      "loss": 0.6621,
+      "step": 11972
+    },
+    {
+      "epoch": 0.8329333194198059,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0001428185151140028,
+      "loss": 0.6714,
+      "step": 11973
+    },
+    {
+      "epoch": 0.8330028870569411,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0001427024871228031,
+      "loss": 0.5449,
+      "step": 11974
+    },
+    {
+      "epoch": 0.8330724546940763,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.00014258650266085038,
+      "loss": 1.0034,
+      "step": 11975
+    },
+    {
+      "epoch": 0.8331420223312115,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00014247056173403305,
+      "loss": 0.8965,
+      "step": 11976
+    },
+    {
+      "epoch": 0.8332115899683468,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0001423546643482384,
+      "loss": 0.8702,
+      "step": 11977
+    },
+    {
+      "epoch": 0.8332811576054819,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00014223881050935117,
+      "loss": 0.7934,
+      "step": 11978
+    },
+    {
+      "epoch": 0.8333507252426171,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00014212300022325376,
+      "loss": 0.9499,
+      "step": 11979
+    },
+    {
+      "epoch": 0.8334202928797524,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00014200723349582644,
+      "loss": 1.0192,
+      "step": 11980
+    },
+    {
+      "epoch": 0.8334898605168876,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00014189151033294688,
+      "loss": 0.7489,
+      "step": 11981
+    },
+    {
+      "epoch": 0.8335594281540227,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.00014177583074049128,
+      "loss": 0.6674,
+      "step": 11982
+    },
+    {
+      "epoch": 0.8336289957911579,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00014166019472433344,
+      "loss": 0.7599,
+      "step": 11983
+    },
+    {
+      "epoch": 0.8336985634282932,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.00014154460229034427,
+      "loss": 0.7389,
+      "step": 11984
+    },
+    {
+      "epoch": 0.8337681310654284,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.0001414290534443936,
+      "loss": 0.8186,
+      "step": 11985
+    },
+    {
+      "epoch": 0.8338376987025635,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.00014131354819234775,
+      "loss": 0.7043,
+      "step": 11986
+    },
+    {
+      "epoch": 0.8339072663396988,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00014119808654007216,
+      "loss": 0.965,
+      "step": 11987
+    },
+    {
+      "epoch": 0.833976833976834,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00014108266849342877,
+      "loss": 0.7556,
+      "step": 11988
+    },
+    {
+      "epoch": 0.8340464016139691,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.00014096729405827847,
+      "loss": 0.8747,
+      "step": 11989
+    },
+    {
+      "epoch": 0.8341159692511044,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.00014085196324047878,
+      "loss": 0.7994,
+      "step": 11990
+    },
+    {
+      "epoch": 0.8341855368882396,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00014073667604588635,
+      "loss": 0.645,
+      "step": 11991
+    },
+    {
+      "epoch": 0.8342551045253748,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0001406214324803542,
+      "loss": 0.8727,
+      "step": 11992
+    },
+    {
+      "epoch": 0.83432467216251,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.0001405062325497344,
+      "loss": 0.6552,
+      "step": 11993
+    },
+    {
+      "epoch": 0.8343942397996452,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.0001403910762598758,
+      "loss": 0.5523,
+      "step": 11994
+    },
+    {
+      "epoch": 0.8344638074367804,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0001402759636166253,
+      "loss": 0.7422,
+      "step": 11995
+    },
+    {
+      "epoch": 0.8345333750739156,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00014016089462582837,
+      "loss": 0.9296,
+      "step": 11996
+    },
+    {
+      "epoch": 0.8346029427110508,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.00014004586929332742,
+      "loss": 0.887,
+      "step": 11997
+    },
+    {
+      "epoch": 0.834672510348186,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00013993088762496265,
+      "loss": 0.8131,
+      "step": 11998
+    },
+    {
+      "epoch": 0.8347420779853212,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.00013981594962657218,
+      "loss": 0.4077,
+      "step": 11999
+    },
+    {
+      "epoch": 0.8348116456224565,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.00013970105530399212,
+      "loss": 1.0036,
+      "step": 12000
+    },
+    {
+      "epoch": 0.8348812132595916,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0001395862046630564,
+      "loss": 0.7612,
+      "step": 12001
+    },
+    {
+      "epoch": 0.8349507808967268,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00013947139770959627,
+      "loss": 0.5829,
+      "step": 12002
+    },
+    {
+      "epoch": 0.8350203485338621,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.00013935663444944135,
+      "loss": 0.8279,
+      "step": 12003
+    },
+    {
+      "epoch": 0.8350899161709973,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0001392419148884183,
+      "loss": 0.9097,
+      "step": 12004
+    },
+    {
+      "epoch": 0.8351594838081324,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00013912723903235257,
+      "loss": 0.65,
+      "step": 12005
+    },
+    {
+      "epoch": 0.8352290514452677,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0001390126068870663,
+      "loss": 0.7578,
+      "step": 12006
+    },
+    {
+      "epoch": 0.8352986190824029,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00013889801845838034,
+      "loss": 0.7598,
+      "step": 12007
+    },
+    {
+      "epoch": 0.835368186719538,
+      "grad_norm": 1.25,
+      "learning_rate": 0.00013878347375211253,
+      "loss": 0.863,
+      "step": 12008
+    },
+    {
+      "epoch": 0.8354377543566732,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.00013866897277407908,
+      "loss": 0.5281,
+      "step": 12009
+    },
+    {
+      "epoch": 0.8355073219938085,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.00013855451553009392,
+      "loss": 1.1497,
+      "step": 12010
+    },
+    {
+      "epoch": 0.8355768896309437,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.00013844010202596847,
+      "loss": 0.7283,
+      "step": 12011
+    },
+    {
+      "epoch": 0.8356464572680788,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0001383257322675121,
+      "loss": 0.8684,
+      "step": 12012
+    },
+    {
+      "epoch": 0.8357160249052141,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.00013821140626053163,
+      "loss": 0.9184,
+      "step": 12013
+    },
+    {
+      "epoch": 0.8357855925423493,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00013809712401083229,
+      "loss": 0.9563,
+      "step": 12014
+    },
+    {
+      "epoch": 0.8358551601794845,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.0001379828855242168,
+      "loss": 1.0051,
+      "step": 12015
+    },
+    {
+      "epoch": 0.8359247278166198,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.00013786869080648534,
+      "loss": 0.5833,
+      "step": 12016
+    },
+    {
+      "epoch": 0.8359942954537549,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00013775453986343645,
+      "loss": 0.7164,
+      "step": 12017
+    },
+    {
+      "epoch": 0.8360638630908901,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0001376404327008659,
+      "loss": 0.8782,
+      "step": 12018
+    },
+    {
+      "epoch": 0.8361334307280254,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.00013752636932456763,
+      "loss": 0.6167,
+      "step": 12019
+    },
+    {
+      "epoch": 0.8362029983651605,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.000137412349740333,
+      "loss": 0.9512,
+      "step": 12020
+    },
+    {
+      "epoch": 0.8362725660022957,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00013729837395395173,
+      "loss": 0.8319,
+      "step": 12021
+    },
+    {
+      "epoch": 0.8363421336394309,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00013718444197121038,
+      "loss": 0.8411,
+      "step": 12022
+    },
+    {
+      "epoch": 0.8364117012765662,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.00013707055379789425,
+      "loss": 1.0388,
+      "step": 12023
+    },
+    {
+      "epoch": 0.8364812689137013,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.000136956709439786,
+      "loss": 0.7488,
+      "step": 12024
+    },
+    {
+      "epoch": 0.8365508365508365,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00013684290890266605,
+      "loss": 0.9733,
+      "step": 12025
+    },
+    {
+      "epoch": 0.8366204041879718,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00013672915219231264,
+      "loss": 0.8369,
+      "step": 12026
+    },
+    {
+      "epoch": 0.836689971825107,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00013661543931450115,
+      "loss": 0.8344,
+      "step": 12027
+    },
+    {
+      "epoch": 0.8367595394622421,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00013650177027500632,
+      "loss": 0.9055,
+      "step": 12028
+    },
+    {
+      "epoch": 0.8368291070993774,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0001363881450795993,
+      "loss": 0.6351,
+      "step": 12029
+    },
+    {
+      "epoch": 0.8368986747365126,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.000136274563734049,
+      "loss": 0.8663,
+      "step": 12030
+    },
+    {
+      "epoch": 0.8369682423736478,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00013616102624412318,
+      "loss": 0.6302,
+      "step": 12031
+    },
+    {
+      "epoch": 0.837037810010783,
+      "grad_norm": 1.5,
+      "learning_rate": 0.0001360475326155861,
+      "loss": 0.8401,
+      "step": 12032
+    },
+    {
+      "epoch": 0.8371073776479182,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.00013593408285420095,
+      "loss": 0.7076,
+      "step": 12033
+    },
+    {
+      "epoch": 0.8371769452850534,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.00013582067696572752,
+      "loss": 1.0298,
+      "step": 12034
+    },
+    {
+      "epoch": 0.8372465129221885,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.0001357073149559247,
+      "loss": 0.6113,
+      "step": 12035
+    },
+    {
+      "epoch": 0.8373160805593238,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00013559399683054773,
+      "loss": 0.8717,
+      "step": 12036
+    },
+    {
+      "epoch": 0.837385648196459,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0001354807225953507,
+      "loss": 0.7921,
+      "step": 12037
+    },
+    {
+      "epoch": 0.8374552158335942,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.00013536749225608535,
+      "loss": 0.7053,
+      "step": 12038
+    },
+    {
+      "epoch": 0.8375247834707295,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0001352543058185006,
+      "loss": 0.8521,
+      "step": 12039
+    },
+    {
+      "epoch": 0.8375943511078646,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.00013514116328834348,
+      "loss": 0.7449,
+      "step": 12040
+    },
+    {
+      "epoch": 0.8376639187449998,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.00013502806467135874,
+      "loss": 0.9089,
+      "step": 12041
+    },
+    {
+      "epoch": 0.8377334863821351,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0001349150099732893,
+      "loss": 0.6579,
+      "step": 12042
+    },
+    {
+      "epoch": 0.8378030540192702,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.00013480199919987536,
+      "loss": 0.9044,
+      "step": 12043
+    },
+    {
+      "epoch": 0.8378726216564054,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0001346890323568548,
+      "loss": 0.7812,
+      "step": 12044
+    },
+    {
+      "epoch": 0.8379421892935407,
+      "grad_norm": 1.515625,
+      "learning_rate": 0.0001345761094499638,
+      "loss": 0.8827,
+      "step": 12045
+    },
+    {
+      "epoch": 0.8380117569306759,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.0001344632304849358,
+      "loss": 1.0183,
+      "step": 12046
+    },
+    {
+      "epoch": 0.838081324567811,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0001343503954675025,
+      "loss": 0.9414,
+      "step": 12047
+    },
+    {
+      "epoch": 0.8381508922049462,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.00013423760440339262,
+      "loss": 0.6596,
+      "step": 12048
+    },
+    {
+      "epoch": 0.8382204598420815,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00013412485729833367,
+      "loss": 0.6656,
+      "step": 12049
+    },
+    {
+      "epoch": 0.8382900274792167,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00013401215415805002,
+      "loss": 0.8514,
+      "step": 12050
+    },
+    {
+      "epoch": 0.8383595951163518,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.00013389949498826415,
+      "loss": 0.8535,
+      "step": 12051
+    },
+    {
+      "epoch": 0.8384291627534871,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00013378687979469684,
+      "loss": 0.5699,
+      "step": 12052
+    },
+    {
+      "epoch": 0.8384987303906223,
+      "grad_norm": 1.4765625,
+      "learning_rate": 0.00013367430858306562,
+      "loss": 0.9435,
+      "step": 12053
+    },
+    {
+      "epoch": 0.8385682980277575,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.00013356178135908613,
+      "loss": 0.5726,
+      "step": 12054
+    },
+    {
+      "epoch": 0.8386378656648927,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0001334492981284723,
+      "loss": 0.7919,
+      "step": 12055
+    },
+    {
+      "epoch": 0.8387074333020279,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00013333685889693557,
+      "loss": 0.5164,
+      "step": 12056
+    },
+    {
+      "epoch": 0.8387770009391631,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0001332244636701848,
+      "loss": 0.5733,
+      "step": 12057
+    },
+    {
+      "epoch": 0.8388465685762984,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00013311211245392674,
+      "loss": 0.6505,
+      "step": 12058
+    },
+    {
+      "epoch": 0.8389161362134335,
+      "grad_norm": 1.5,
+      "learning_rate": 0.00013299980525386613,
+      "loss": 0.8734,
+      "step": 12059
+    },
+    {
+      "epoch": 0.8389857038505687,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.00013288754207570563,
+      "loss": 0.9035,
+      "step": 12060
+    },
+    {
+      "epoch": 0.8390552714877039,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.00013277532292514527,
+      "loss": 0.685,
+      "step": 12061
+    },
+    {
+      "epoch": 0.8391248391248392,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00013266314780788246,
+      "loss": 0.7693,
+      "step": 12062
+    },
+    {
+      "epoch": 0.8391944067619743,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.00013255101672961366,
+      "loss": 0.8275,
+      "step": 12063
+    },
+    {
+      "epoch": 0.8392639743991095,
+      "grad_norm": 1.5,
+      "learning_rate": 0.00013243892969603177,
+      "loss": 0.8141,
+      "step": 12064
+    },
+    {
+      "epoch": 0.8393335420362448,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.00013232688671282832,
+      "loss": 0.8422,
+      "step": 12065
+    },
+    {
+      "epoch": 0.8394031096733799,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.000132214887785692,
+      "loss": 0.7412,
+      "step": 12066
+    },
+    {
+      "epoch": 0.8394726773105151,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00013210293292030995,
+      "loss": 0.9879,
+      "step": 12067
+    },
+    {
+      "epoch": 0.8395422449476504,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.00013199102212236614,
+      "loss": 0.6968,
+      "step": 12068
+    },
+    {
+      "epoch": 0.8396118125847856,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00013187915539754325,
+      "loss": 0.6176,
+      "step": 12069
+    },
+    {
+      "epoch": 0.8396813802219207,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0001317673327515213,
+      "loss": 0.7005,
+      "step": 12070
+    },
+    {
+      "epoch": 0.839750947859056,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0001316555541899781,
+      "loss": 0.9586,
+      "step": 12071
+    },
+    {
+      "epoch": 0.8398205154961912,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.00013154381971858898,
+      "loss": 0.8076,
+      "step": 12072
+    },
+    {
+      "epoch": 0.8398900831333264,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.00013143212934302694,
+      "loss": 0.8819,
+      "step": 12073
+    },
+    {
+      "epoch": 0.8399596507704615,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00013132048306896394,
+      "loss": 0.6674,
+      "step": 12074
+    },
+    {
+      "epoch": 0.8400292184075968,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00013120888090206828,
+      "loss": 0.8355,
+      "step": 12075
+    },
+    {
+      "epoch": 0.840098786044732,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00013109732284800646,
+      "loss": 0.712,
+      "step": 12076
+    },
+    {
+      "epoch": 0.8401683536818672,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.00013098580891244315,
+      "loss": 0.6425,
+      "step": 12077
+    },
+    {
+      "epoch": 0.8402379213190024,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00013087433910104006,
+      "loss": 0.6536,
+      "step": 12078
+    },
+    {
+      "epoch": 0.8403074889561376,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00013076291341945756,
+      "loss": 0.7956,
+      "step": 12079
+    },
+    {
+      "epoch": 0.8403770565932728,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0001306515318733529,
+      "loss": 0.9083,
+      "step": 12080
+    },
+    {
+      "epoch": 0.8404466242304081,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.00013054019446838173,
+      "loss": 1.0065,
+      "step": 12081
+    },
+    {
+      "epoch": 0.8405161918675432,
+      "grad_norm": 1.25,
+      "learning_rate": 0.00013042890121019691,
+      "loss": 1.097,
+      "step": 12082
+    },
+    {
+      "epoch": 0.8405857595046784,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.00013031765210444956,
+      "loss": 0.8163,
+      "step": 12083
+    },
+    {
+      "epoch": 0.8406553271418137,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00013020644715678855,
+      "loss": 0.7708,
+      "step": 12084
+    },
+    {
+      "epoch": 0.8407248947789489,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.00013009528637285994,
+      "loss": 0.6088,
+      "step": 12085
+    },
+    {
+      "epoch": 0.840794462416084,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00012998416975830795,
+      "loss": 0.8125,
+      "step": 12086
+    },
+    {
+      "epoch": 0.8408640300532192,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0001298730973187745,
+      "loss": 0.7017,
+      "step": 12087
+    },
+    {
+      "epoch": 0.8409335976903545,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00012976206905989973,
+      "loss": 0.7749,
+      "step": 12088
+    },
+    {
+      "epoch": 0.8410031653274896,
+      "grad_norm": 1.5234375,
+      "learning_rate": 0.0001296510849873207,
+      "loss": 1.0097,
+      "step": 12089
+    },
+    {
+      "epoch": 0.8410727329646248,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00012954014510667246,
+      "loss": 0.7875,
+      "step": 12090
+    },
+    {
+      "epoch": 0.8411423006017601,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00012942924942358825,
+      "loss": 0.811,
+      "step": 12091
+    },
+    {
+      "epoch": 0.8412118682388953,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00012931839794369892,
+      "loss": 0.7561,
+      "step": 12092
+    },
+    {
+      "epoch": 0.8412814358760304,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00012920759067263287,
+      "loss": 0.7486,
+      "step": 12093
+    },
+    {
+      "epoch": 0.8413510035131657,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.00012909682761601604,
+      "loss": 0.9831,
+      "step": 12094
+    },
+    {
+      "epoch": 0.8414205711503009,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.0001289861087794727,
+      "loss": 0.9107,
+      "step": 12095
+    },
+    {
+      "epoch": 0.8414901387874361,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.00012887543416862445,
+      "loss": 0.8535,
+      "step": 12096
+    },
+    {
+      "epoch": 0.8415597064245713,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00012876480378909083,
+      "loss": 0.8429,
+      "step": 12097
+    },
+    {
+      "epoch": 0.8416292740617065,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.0001286542176464892,
+      "loss": 0.56,
+      "step": 12098
+    },
+    {
+      "epoch": 0.8416988416988417,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00012854367574643467,
+      "loss": 0.8433,
+      "step": 12099
+    },
+    {
+      "epoch": 0.8417684093359769,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00012843317809453959,
+      "loss": 0.958,
+      "step": 12100
+    },
+    {
+      "epoch": 0.8418379769731121,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00012832272469641458,
+      "loss": 0.8084,
+      "step": 12101
+    },
+    {
+      "epoch": 0.8419075446102473,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.00012821231555766832,
+      "loss": 0.6842,
+      "step": 12102
+    },
+    {
+      "epoch": 0.8419771122473825,
+      "grad_norm": 1.546875,
+      "learning_rate": 0.0001281019506839065,
+      "loss": 1.1108,
+      "step": 12103
+    },
+    {
+      "epoch": 0.8420466798845178,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.00012799163008073278,
+      "loss": 0.794,
+      "step": 12104
+    },
+    {
+      "epoch": 0.8421162475216529,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0001278813537537489,
+      "loss": 0.732,
+      "step": 12105
+    },
+    {
+      "epoch": 0.8421858151587881,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.0001277711217085541,
+      "loss": 0.544,
+      "step": 12106
+    },
+    {
+      "epoch": 0.8422553827959234,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00012766093395074552,
+      "loss": 0.7447,
+      "step": 12107
+    },
+    {
+      "epoch": 0.8423249504330586,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00012755079048591756,
+      "loss": 0.7633,
+      "step": 12108
+    },
+    {
+      "epoch": 0.8423945180701937,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00012744069131966318,
+      "loss": 0.934,
+      "step": 12109
+    },
+    {
+      "epoch": 0.842464085707329,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.00012733063645757226,
+      "loss": 0.8014,
+      "step": 12110
+    },
+    {
+      "epoch": 0.8425336533444642,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.000127220625905233,
+      "loss": 0.7666,
+      "step": 12111
+    },
+    {
+      "epoch": 0.8426032209815993,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00012711065966823155,
+      "loss": 1.0765,
+      "step": 12112
+    },
+    {
+      "epoch": 0.8426727886187345,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.00012700073775215093,
+      "loss": 0.6896,
+      "step": 12113
+    },
+    {
+      "epoch": 0.8427423562558698,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00012689086016257257,
+      "loss": 0.8598,
+      "step": 12114
+    },
+    {
+      "epoch": 0.842811923893005,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.00012678102690507544,
+      "loss": 0.9347,
+      "step": 12115
+    },
+    {
+      "epoch": 0.8428814915301401,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0001266712379852367,
+      "loss": 0.8,
+      "step": 12116
+    },
+    {
+      "epoch": 0.8429510591672754,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.00012656149340863055,
+      "loss": 0.9972,
+      "step": 12117
+    },
+    {
+      "epoch": 0.8430206268044106,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00012645179318082912,
+      "loss": 0.768,
+      "step": 12118
+    },
+    {
+      "epoch": 0.8430901944415458,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00012634213730740253,
+      "loss": 0.8108,
+      "step": 12119
+    },
+    {
+      "epoch": 0.843159762078681,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.00012623252579391898,
+      "loss": 0.8261,
+      "step": 12120
+    },
+    {
+      "epoch": 0.8432293297158162,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00012612295864594358,
+      "loss": 0.6399,
+      "step": 12121
+    },
+    {
+      "epoch": 0.8432988973529514,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00012601343586903947,
+      "loss": 0.9218,
+      "step": 12122
+    },
+    {
+      "epoch": 0.8433684649900867,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00012590395746876802,
+      "loss": 0.6682,
+      "step": 12123
+    },
+    {
+      "epoch": 0.8434380326272218,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.00012579452345068775,
+      "loss": 0.7739,
+      "step": 12124
+    },
+    {
+      "epoch": 0.843507600264357,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.0001256851338203552,
+      "loss": 0.966,
+      "step": 12125
+    },
+    {
+      "epoch": 0.8435771679014922,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00012557578858332486,
+      "loss": 0.6529,
+      "step": 12126
+    },
+    {
+      "epoch": 0.8436467355386275,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00012546648774514868,
+      "loss": 0.7381,
+      "step": 12127
+    },
+    {
+      "epoch": 0.8437163031757626,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.00012535723131137588,
+      "loss": 0.7895,
+      "step": 12128
+    },
+    {
+      "epoch": 0.8437858708128978,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.00012524801928755447,
+      "loss": 0.9043,
+      "step": 12129
+    },
+    {
+      "epoch": 0.8438554384500331,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00012513885167922978,
+      "loss": 0.8401,
+      "step": 12130
+    },
+    {
+      "epoch": 0.8439250060871683,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0001250297284919445,
+      "loss": 0.6916,
+      "step": 12131
+    },
+    {
+      "epoch": 0.8439945737243034,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.0001249206497312393,
+      "loss": 0.808,
+      "step": 12132
+    },
+    {
+      "epoch": 0.8440641413614387,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00012481161540265273,
+      "loss": 0.8094,
+      "step": 12133
+    },
+    {
+      "epoch": 0.8441337089985739,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.0001247026255117213,
+      "loss": 0.8593,
+      "step": 12134
+    },
+    {
+      "epoch": 0.844203276635709,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00012459368006397865,
+      "loss": 0.4995,
+      "step": 12135
+    },
+    {
+      "epoch": 0.8442728442728443,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.0001244847790649565,
+      "loss": 0.8877,
+      "step": 12136
+    },
+    {
+      "epoch": 0.8443424119099795,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00012437592252018416,
+      "loss": 0.8176,
+      "step": 12137
+    },
+    {
+      "epoch": 0.8444119795471147,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00012426711043518924,
+      "loss": 0.6735,
+      "step": 12138
+    },
+    {
+      "epoch": 0.8444815471842498,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.0001241583428154963,
+      "loss": 0.8597,
+      "step": 12139
+    },
+    {
+      "epoch": 0.8445511148213851,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.0001240496196666283,
+      "loss": 0.8347,
+      "step": 12140
+    },
+    {
+      "epoch": 0.8446206824585203,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0001239409409941056,
+      "loss": 0.7573,
+      "step": 12141
+    },
+    {
+      "epoch": 0.8446902500956555,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.00012383230680344592,
+      "loss": 0.886,
+      "step": 12142
+    },
+    {
+      "epoch": 0.8447598177327907,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0001237237171001655,
+      "loss": 0.9381,
+      "step": 12143
+    },
+    {
+      "epoch": 0.8448293853699259,
+      "grad_norm": 1.4765625,
+      "learning_rate": 0.00012361517188977822,
+      "loss": 0.957,
+      "step": 12144
+    },
+    {
+      "epoch": 0.8448989530070611,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00012350667117779512,
+      "loss": 0.8213,
+      "step": 12145
+    },
+    {
+      "epoch": 0.8449685206441964,
+      "grad_norm": 1.609375,
+      "learning_rate": 0.00012339821496972536,
+      "loss": 0.7806,
+      "step": 12146
+    },
+    {
+      "epoch": 0.8450380882813315,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00012328980327107575,
+      "loss": 0.972,
+      "step": 12147
+    },
+    {
+      "epoch": 0.8451076559184667,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0001231814360873511,
+      "loss": 0.7905,
+      "step": 12148
+    },
+    {
+      "epoch": 0.845177223555602,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.0001230731134240538,
+      "loss": 0.7074,
+      "step": 12149
+    },
+    {
+      "epoch": 0.8452467911927372,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.00012296483528668345,
+      "loss": 0.8852,
+      "step": 12150
+    },
+    {
+      "epoch": 0.8453163588298723,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0001228566016807382,
+      "loss": 0.7658,
+      "step": 12151
+    },
+    {
+      "epoch": 0.8453859264670075,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00012274841261171376,
+      "loss": 0.588,
+      "step": 12152
+    },
+    {
+      "epoch": 0.8454554941041428,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0001226402680851033,
+      "loss": 0.9275,
+      "step": 12153
+    },
+    {
+      "epoch": 0.845525061741278,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.00012253216810639755,
+      "loss": 0.8288,
+      "step": 12154
+    },
+    {
+      "epoch": 0.8455946293784131,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.00012242411268108578,
+      "loss": 0.9221,
+      "step": 12155
+    },
+    {
+      "epoch": 0.8456641970155484,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00012231610181465415,
+      "loss": 0.8036,
+      "step": 12156
+    },
+    {
+      "epoch": 0.8457337646526836,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0001222081355125868,
+      "loss": 0.5976,
+      "step": 12157
+    },
+    {
+      "epoch": 0.8458033322898187,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00012210021378036628,
+      "loss": 0.7004,
+      "step": 12158
+    },
+    {
+      "epoch": 0.845872899926954,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00012199233662347198,
+      "loss": 0.7648,
+      "step": 12159
+    },
+    {
+      "epoch": 0.8459424675640892,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00012188450404738105,
+      "loss": 0.7067,
+      "step": 12160
+    },
+    {
+      "epoch": 0.8460120352012244,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00012177671605756901,
+      "loss": 0.7604,
+      "step": 12161
+    },
+    {
+      "epoch": 0.8460816028383596,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00012166897265950894,
+      "loss": 0.886,
+      "step": 12162
+    },
+    {
+      "epoch": 0.8461511704754948,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.00012156127385867144,
+      "loss": 0.542,
+      "step": 12163
+    },
+    {
+      "epoch": 0.84622073811263,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00012145361966052449,
+      "loss": 0.6837,
+      "step": 12164
+    },
+    {
+      "epoch": 0.8462903057497652,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00012134601007053447,
+      "loss": 0.7801,
+      "step": 12165
+    },
+    {
+      "epoch": 0.8463598733869004,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.00012123844509416559,
+      "loss": 0.4521,
+      "step": 12166
+    },
+    {
+      "epoch": 0.8464294410240356,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.00012113092473687914,
+      "loss": 1.0481,
+      "step": 12167
+    },
+    {
+      "epoch": 0.8464990086611708,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00012102344900413442,
+      "loss": 1.0554,
+      "step": 12168
+    },
+    {
+      "epoch": 0.8465685762983061,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.00012091601790138851,
+      "loss": 0.7972,
+      "step": 12169
+    },
+    {
+      "epoch": 0.8466381439354412,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00012080863143409648,
+      "loss": 0.7867,
+      "step": 12170
+    },
+    {
+      "epoch": 0.8467077115725764,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00012070128960771043,
+      "loss": 0.7568,
+      "step": 12171
+    },
+    {
+      "epoch": 0.8467772792097117,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00012059399242768122,
+      "loss": 0.6988,
+      "step": 12172
+    },
+    {
+      "epoch": 0.8468468468468469,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00012048673989945657,
+      "loss": 0.6091,
+      "step": 12173
+    },
+    {
+      "epoch": 0.846916414483982,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00012037953202848184,
+      "loss": 0.5821,
+      "step": 12174
+    },
+    {
+      "epoch": 0.8469859821211173,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00012027236882020099,
+      "loss": 0.829,
+      "step": 12175
+    },
+    {
+      "epoch": 0.8470555497582525,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00012016525028005521,
+      "loss": 0.5804,
+      "step": 12176
+    },
+    {
+      "epoch": 0.8471251173953877,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00012005817641348337,
+      "loss": 0.6997,
+      "step": 12177
+    },
+    {
+      "epoch": 0.8471946850325228,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.00011995114722592193,
+      "loss": 0.8979,
+      "step": 12178
+    },
+    {
+      "epoch": 0.8472642526696581,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.0001198441627228054,
+      "loss": 1.0183,
+      "step": 12179
+    },
+    {
+      "epoch": 0.8473338203067933,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.00011973722290956613,
+      "loss": 0.8141,
+      "step": 12180
+    },
+    {
+      "epoch": 0.8474033879439284,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00011963032779163397,
+      "loss": 0.7746,
+      "step": 12181
+    },
+    {
+      "epoch": 0.8474729555810637,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00011952347737443603,
+      "loss": 0.8633,
+      "step": 12182
+    },
+    {
+      "epoch": 0.8475425232181989,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.00011941667166339809,
+      "loss": 0.6313,
+      "step": 12183
+    },
+    {
+      "epoch": 0.8476120908553341,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.00011930991066394315,
+      "loss": 0.8317,
+      "step": 12184
+    },
+    {
+      "epoch": 0.8476816584924693,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00011920319438149185,
+      "loss": 0.7958,
+      "step": 12185
+    },
+    {
+      "epoch": 0.8477512261296045,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00011909652282146299,
+      "loss": 0.8036,
+      "step": 12186
+    },
+    {
+      "epoch": 0.8478207937667397,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.00011898989598927257,
+      "loss": 0.9245,
+      "step": 12187
+    },
+    {
+      "epoch": 0.847890361403875,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00011888331389033447,
+      "loss": 0.9106,
+      "step": 12188
+    },
+    {
+      "epoch": 0.8479599290410101,
+      "grad_norm": 0.875,
+      "learning_rate": 0.00011877677653006058,
+      "loss": 0.5841,
+      "step": 12189
+    },
+    {
+      "epoch": 0.8480294966781453,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.00011867028391386037,
+      "loss": 0.4885,
+      "step": 12190
+    },
+    {
+      "epoch": 0.8480990643152805,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.00011856383604714094,
+      "loss": 1.0532,
+      "step": 12191
+    },
+    {
+      "epoch": 0.8481686319524158,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00011845743293530697,
+      "loss": 0.8783,
+      "step": 12192
+    },
+    {
+      "epoch": 0.8482381995895509,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00011835107458376126,
+      "loss": 0.8023,
+      "step": 12193
+    },
+    {
+      "epoch": 0.8483077672266861,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.00011824476099790426,
+      "loss": 0.784,
+      "step": 12194
+    },
+    {
+      "epoch": 0.8483773348638214,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.000118138492183134,
+      "loss": 0.7627,
+      "step": 12195
+    },
+    {
+      "epoch": 0.8484469025009566,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00011803226814484602,
+      "loss": 0.8796,
+      "step": 12196
+    },
+    {
+      "epoch": 0.8485164701380917,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00011792608888843392,
+      "loss": 1.01,
+      "step": 12197
+    },
+    {
+      "epoch": 0.848586037775227,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00011781995441928939,
+      "loss": 0.7642,
+      "step": 12198
+    },
+    {
+      "epoch": 0.8486556054123622,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00011771386474280077,
+      "loss": 0.9497,
+      "step": 12199
+    },
+    {
+      "epoch": 0.8487251730494974,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0001176078198643552,
+      "loss": 0.8968,
+      "step": 12200
+    },
+    {
+      "epoch": 0.8487947406866326,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00011750181978933682,
+      "loss": 0.8285,
+      "step": 12201
+    },
+    {
+      "epoch": 0.8488643083237678,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.00011739586452312812,
+      "loss": 1.0651,
+      "step": 12202
+    },
+    {
+      "epoch": 0.848933875960903,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00011728995407110854,
+      "loss": 0.757,
+      "step": 12203
+    },
+    {
+      "epoch": 0.8490034435980381,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00011718408843865602,
+      "loss": 0.727,
+      "step": 12204
+    },
+    {
+      "epoch": 0.8490730112351734,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.00011707826763114593,
+      "loss": 0.773,
+      "step": 12205
+    },
+    {
+      "epoch": 0.8491425788723086,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00011697249165395085,
+      "loss": 0.7744,
+      "step": 12206
+    },
+    {
+      "epoch": 0.8492121465094438,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00011686676051244183,
+      "loss": 0.8447,
+      "step": 12207
+    },
+    {
+      "epoch": 0.849281714146579,
+      "grad_norm": 0.76953125,
+      "learning_rate": 0.00011676107421198767,
+      "loss": 0.4904,
+      "step": 12208
+    },
+    {
+      "epoch": 0.8493512817837142,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00011665543275795432,
+      "loss": 0.9249,
+      "step": 12209
+    },
+    {
+      "epoch": 0.8494208494208494,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00011654983615570546,
+      "loss": 0.5941,
+      "step": 12210
+    },
+    {
+      "epoch": 0.8494904170579847,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00011644428441060295,
+      "loss": 0.7498,
+      "step": 12211
+    },
+    {
+      "epoch": 0.8495599846951198,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.00011633877752800648,
+      "loss": 0.8989,
+      "step": 12212
+    },
+    {
+      "epoch": 0.849629552332255,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00011623331551327276,
+      "loss": 0.8272,
+      "step": 12213
+    },
+    {
+      "epoch": 0.8496991199693903,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.00011612789837175686,
+      "loss": 0.5662,
+      "step": 12214
+    },
+    {
+      "epoch": 0.8497686876065255,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.00011602252610881115,
+      "loss": 0.7696,
+      "step": 12215
+    },
+    {
+      "epoch": 0.8498382552436606,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00011591719872978601,
+      "loss": 0.7914,
+      "step": 12216
+    },
+    {
+      "epoch": 0.8499078228807958,
+      "grad_norm": 1.5234375,
+      "learning_rate": 0.0001158119162400294,
+      "loss": 1.0314,
+      "step": 12217
+    },
+    {
+      "epoch": 0.8499773905179311,
+      "grad_norm": 0.74609375,
+      "learning_rate": 0.00011570667864488716,
+      "loss": 0.6842,
+      "step": 12218
+    },
+    {
+      "epoch": 0.8500469581550663,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.00011560148594970266,
+      "loss": 0.8372,
+      "step": 12219
+    },
+    {
+      "epoch": 0.8501165257922014,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00011549633815981652,
+      "loss": 0.7289,
+      "step": 12220
+    },
+    {
+      "epoch": 0.8501860934293367,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0001153912352805685,
+      "loss": 0.746,
+      "step": 12221
+    },
+    {
+      "epoch": 0.8502556610664719,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00011528617731729485,
+      "loss": 0.7059,
+      "step": 12222
+    },
+    {
+      "epoch": 0.850325228703607,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.00011518116427532988,
+      "loss": 0.6602,
+      "step": 12223
+    },
+    {
+      "epoch": 0.8503947963407423,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.0001150761961600052,
+      "loss": 0.7342,
+      "step": 12224
+    },
+    {
+      "epoch": 0.8504643639778775,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00011497127297665111,
+      "loss": 0.836,
+      "step": 12225
+    },
+    {
+      "epoch": 0.8505339316150127,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.00011486639473059502,
+      "loss": 0.7602,
+      "step": 12226
+    },
+    {
+      "epoch": 0.850603499252148,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00011476156142716198,
+      "loss": 0.6743,
+      "step": 12227
+    },
+    {
+      "epoch": 0.8506730668892831,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.00011465677307167477,
+      "loss": 1.1329,
+      "step": 12228
+    },
+    {
+      "epoch": 0.8507426345264183,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.0001145520296694541,
+      "loss": 0.999,
+      "step": 12229
+    },
+    {
+      "epoch": 0.8508122021635535,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00011444733122581863,
+      "loss": 0.7615,
+      "step": 12230
+    },
+    {
+      "epoch": 0.8508817698006887,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00011434267774608398,
+      "loss": 0.7004,
+      "step": 12231
+    },
+    {
+      "epoch": 0.8509513374378239,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00011423806923556424,
+      "loss": 0.9571,
+      "step": 12232
+    },
+    {
+      "epoch": 0.8510209050749591,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.0001141335056995706,
+      "loss": 0.8939,
+      "step": 12233
+    },
+    {
+      "epoch": 0.8510904727120944,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.00011402898714341269,
+      "loss": 0.7862,
+      "step": 12234
+    },
+    {
+      "epoch": 0.8511600403492295,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00011392451357239697,
+      "loss": 0.7262,
+      "step": 12235
+    },
+    {
+      "epoch": 0.8512296079863647,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.0001138200849918285,
+      "loss": 0.7603,
+      "step": 12236
+    },
+    {
+      "epoch": 0.8512991756235,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.00011371570140700937,
+      "loss": 0.7464,
+      "step": 12237
+    },
+    {
+      "epoch": 0.8513687432606352,
+      "grad_norm": 1.875,
+      "learning_rate": 0.00011361136282323959,
+      "loss": 0.8486,
+      "step": 12238
+    },
+    {
+      "epoch": 0.8514383108977703,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00011350706924581711,
+      "loss": 0.6902,
+      "step": 12239
+    },
+    {
+      "epoch": 0.8515078785349056,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.00011340282068003749,
+      "loss": 0.9536,
+      "step": 12240
+    },
+    {
+      "epoch": 0.8515774461720408,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00011329861713119394,
+      "loss": 0.6185,
+      "step": 12241
+    },
+    {
+      "epoch": 0.851647013809176,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.00011319445860457711,
+      "loss": 0.9331,
+      "step": 12242
+    },
+    {
+      "epoch": 0.8517165814463111,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.00011309034510547578,
+      "loss": 0.9381,
+      "step": 12243
+    },
+    {
+      "epoch": 0.8517861490834464,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.0001129862766391766,
+      "loss": 0.8453,
+      "step": 12244
+    },
+    {
+      "epoch": 0.8518557167205816,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.00011288225321096323,
+      "loss": 0.8247,
+      "step": 12245
+    },
+    {
+      "epoch": 0.8519252843577167,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0001127782748261178,
+      "loss": 0.7709,
+      "step": 12246
+    },
+    {
+      "epoch": 0.851994851994852,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0001126743414899194,
+      "loss": 0.7957,
+      "step": 12247
+    },
+    {
+      "epoch": 0.8520644196319872,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.00011257045320764581,
+      "loss": 0.975,
+      "step": 12248
+    },
+    {
+      "epoch": 0.8521339872691224,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.00011246660998457136,
+      "loss": 1.0589,
+      "step": 12249
+    },
+    {
+      "epoch": 0.8522035549062577,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0001123628118259692,
+      "loss": 0.8049,
+      "step": 12250
+    },
+    {
+      "epoch": 0.8522731225433928,
+      "grad_norm": 1.4765625,
+      "learning_rate": 0.00011225905873710929,
+      "loss": 1.0467,
+      "step": 12251
+    },
+    {
+      "epoch": 0.852342690180528,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00011215535072325956,
+      "loss": 0.5724,
+      "step": 12252
+    },
+    {
+      "epoch": 0.8524122578176633,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00011205168778968644,
+      "loss": 0.9426,
+      "step": 12253
+    },
+    {
+      "epoch": 0.8524818254547984,
+      "grad_norm": 1.0,
+      "learning_rate": 0.00011194806994165297,
+      "loss": 0.6056,
+      "step": 12254
+    },
+    {
+      "epoch": 0.8525513930919336,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.00011184449718442047,
+      "loss": 0.711,
+      "step": 12255
+    },
+    {
+      "epoch": 0.8526209607290688,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00011174096952324753,
+      "loss": 0.9735,
+      "step": 12256
+    },
+    {
+      "epoch": 0.8526905283662041,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00011163748696339104,
+      "loss": 0.7256,
+      "step": 12257
+    },
+    {
+      "epoch": 0.8527600960033392,
+      "grad_norm": 1.375,
+      "learning_rate": 0.00011153404951010537,
+      "loss": 0.6559,
+      "step": 12258
+    },
+    {
+      "epoch": 0.8528296636404744,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00011143065716864243,
+      "loss": 0.8445,
+      "step": 12259
+    },
+    {
+      "epoch": 0.8528992312776097,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00011132730994425211,
+      "loss": 0.6826,
+      "step": 12260
+    },
+    {
+      "epoch": 0.8529687989147449,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.00011122400784218157,
+      "loss": 0.8554,
+      "step": 12261
+    },
+    {
+      "epoch": 0.85303836655188,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.00011112075086767626,
+      "loss": 0.8118,
+      "step": 12262
+    },
+    {
+      "epoch": 0.8531079341890153,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00011101753902597877,
+      "loss": 0.693,
+      "step": 12263
+    },
+    {
+      "epoch": 0.8531775018261505,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.00011091437232233015,
+      "loss": 0.8587,
+      "step": 12264
+    },
+    {
+      "epoch": 0.8532470694632857,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00011081125076196807,
+      "loss": 0.7478,
+      "step": 12265
+    },
+    {
+      "epoch": 0.8533166371004209,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00011070817435012892,
+      "loss": 0.8519,
+      "step": 12266
+    },
+    {
+      "epoch": 0.8533862047375561,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00011060514309204639,
+      "loss": 0.5889,
+      "step": 12267
+    },
+    {
+      "epoch": 0.8534557723746913,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.00011050215699295196,
+      "loss": 1.0257,
+      "step": 12268
+    },
+    {
+      "epoch": 0.8535253400118264,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.00011039921605807446,
+      "loss": 0.7956,
+      "step": 12269
+    },
+    {
+      "epoch": 0.8535949076489617,
+      "grad_norm": 2.1875,
+      "learning_rate": 0.00011029632029264069,
+      "loss": 1.1196,
+      "step": 12270
+    },
+    {
+      "epoch": 0.8536644752860969,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.00011019346970187538,
+      "loss": 0.9829,
+      "step": 12271
+    },
+    {
+      "epoch": 0.8537340429232321,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.0001100906642910009,
+      "loss": 0.9512,
+      "step": 12272
+    },
+    {
+      "epoch": 0.8538036105603674,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00010998790406523685,
+      "loss": 0.6774,
+      "step": 12273
+    },
+    {
+      "epoch": 0.8538731781975025,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00010988518902980115,
+      "loss": 0.8759,
+      "step": 12274
+    },
+    {
+      "epoch": 0.8539427458346377,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.00010978251918990889,
+      "loss": 0.6949,
+      "step": 12275
+    },
+    {
+      "epoch": 0.854012313471773,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00010967989455077353,
+      "loss": 0.8158,
+      "step": 12276
+    },
+    {
+      "epoch": 0.8540818811089081,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00010957731511760527,
+      "loss": 0.7616,
+      "step": 12277
+    },
+    {
+      "epoch": 0.8541514487460433,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00010947478089561314,
+      "loss": 0.6717,
+      "step": 12278
+    },
+    {
+      "epoch": 0.8542210163831786,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.00010937229189000286,
+      "loss": 0.8055,
+      "step": 12279
+    },
+    {
+      "epoch": 0.8542905840203138,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.00010926984810597851,
+      "loss": 0.8088,
+      "step": 12280
+    },
+    {
+      "epoch": 0.8543601516574489,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00010916744954874192,
+      "loss": 0.8147,
+      "step": 12281
+    },
+    {
+      "epoch": 0.8544297192945841,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.00010906509622349204,
+      "loss": 0.8183,
+      "step": 12282
+    },
+    {
+      "epoch": 0.8544992869317194,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.00010896278813542593,
+      "loss": 0.7092,
+      "step": 12283
+    },
+    {
+      "epoch": 0.8545688545688546,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.00010886052528973789,
+      "loss": 0.9953,
+      "step": 12284
+    },
+    {
+      "epoch": 0.8546384222059897,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00010875830769162109,
+      "loss": 0.9157,
+      "step": 12285
+    },
+    {
+      "epoch": 0.854707989843125,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00010865613534626517,
+      "loss": 0.6253,
+      "step": 12286
+    },
+    {
+      "epoch": 0.8547775574802602,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.00010855400825885786,
+      "loss": 0.9757,
+      "step": 12287
+    },
+    {
+      "epoch": 0.8548471251173954,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00010845192643458501,
+      "loss": 0.7175,
+      "step": 12288
+    },
+    {
+      "epoch": 0.8549166927545306,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.00010834988987862936,
+      "loss": 0.6697,
+      "step": 12289
+    },
+    {
+      "epoch": 0.8549862603916658,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00010824789859617224,
+      "loss": 0.9938,
+      "step": 12290
+    },
+    {
+      "epoch": 0.855055828028801,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0001081459525923919,
+      "loss": 0.766,
+      "step": 12291
+    },
+    {
+      "epoch": 0.8551253956659363,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00010804405187246502,
+      "loss": 0.6445,
+      "step": 12292
+    },
+    {
+      "epoch": 0.8551949633030714,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00010794219644156522,
+      "loss": 0.7632,
+      "step": 12293
+    },
+    {
+      "epoch": 0.8552645309402066,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00010784038630486437,
+      "loss": 0.8305,
+      "step": 12294
+    },
+    {
+      "epoch": 0.8553340985773418,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.000107738621467532,
+      "loss": 0.765,
+      "step": 12295
+    },
+    {
+      "epoch": 0.855403666214477,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00010763690193473519,
+      "loss": 0.7739,
+      "step": 12296
+    },
+    {
+      "epoch": 0.8554732338516122,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.0001075352277116386,
+      "loss": 0.8307,
+      "step": 12297
+    },
+    {
+      "epoch": 0.8555428014887474,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00010743359880340442,
+      "loss": 0.648,
+      "step": 12298
+    },
+    {
+      "epoch": 0.8556123691258827,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.00010733201521519364,
+      "loss": 0.8163,
+      "step": 12299
+    },
+    {
+      "epoch": 0.8556819367630178,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0001072304769521637,
+      "loss": 0.5939,
+      "step": 12300
+    },
+    {
+      "epoch": 0.855751504400153,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.00010712898401947024,
+      "loss": 0.8583,
+      "step": 12301
+    },
+    {
+      "epoch": 0.8558210720372883,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.00010702753642226649,
+      "loss": 0.7241,
+      "step": 12302
+    },
+    {
+      "epoch": 0.8558906396744235,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00010692613416570341,
+      "loss": 0.7889,
+      "step": 12303
+    },
+    {
+      "epoch": 0.8559602073115586,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00010682477725493,
+      "loss": 0.7806,
+      "step": 12304
+    },
+    {
+      "epoch": 0.8560297749486939,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00010672346569509229,
+      "loss": 0.7109,
+      "step": 12305
+    },
+    {
+      "epoch": 0.8560993425858291,
+      "grad_norm": 0.77734375,
+      "learning_rate": 0.00010662219949133478,
+      "loss": 0.5379,
+      "step": 12306
+    },
+    {
+      "epoch": 0.8561689102229643,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00010652097864879884,
+      "loss": 0.8115,
+      "step": 12307
+    },
+    {
+      "epoch": 0.8562384778600994,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00010641980317262423,
+      "loss": 0.7521,
+      "step": 12308
+    },
+    {
+      "epoch": 0.8563080454972347,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00010631867306794795,
+      "loss": 0.7504,
+      "step": 12309
+    },
+    {
+      "epoch": 0.8563776131343699,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.00010621758833990513,
+      "loss": 0.701,
+      "step": 12310
+    },
+    {
+      "epoch": 0.8564471807715051,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00010611654899362789,
+      "loss": 0.8729,
+      "step": 12311
+    },
+    {
+      "epoch": 0.8565167484086403,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00010601555503424687,
+      "loss": 0.7054,
+      "step": 12312
+    },
+    {
+      "epoch": 0.8565863160457755,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00010591460646689022,
+      "loss": 0.78,
+      "step": 12313
+    },
+    {
+      "epoch": 0.8566558836829107,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00010581370329668316,
+      "loss": 0.7408,
+      "step": 12314
+    },
+    {
+      "epoch": 0.856725451320046,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00010571284552874939,
+      "loss": 0.7002,
+      "step": 12315
+    },
+    {
+      "epoch": 0.8567950189571811,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00010561203316820922,
+      "loss": 0.7092,
+      "step": 12316
+    },
+    {
+      "epoch": 0.8568645865943163,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00010551126622018248,
+      "loss": 0.858,
+      "step": 12317
+    },
+    {
+      "epoch": 0.8569341542314516,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.00010541054468978507,
+      "loss": 1.0974,
+      "step": 12318
+    },
+    {
+      "epoch": 0.8570037218685868,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.00010530986858213088,
+      "loss": 0.9695,
+      "step": 12319
+    },
+    {
+      "epoch": 0.8570732895057219,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00010520923790233217,
+      "loss": 0.8819,
+      "step": 12320
+    },
+    {
+      "epoch": 0.8571428571428571,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.00010510865265549818,
+      "loss": 0.8206,
+      "step": 12321
+    },
+    {
+      "epoch": 0.8572124247799924,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00010500811284673628,
+      "loss": 0.7581,
+      "step": 12322
+    },
+    {
+      "epoch": 0.8572819924171275,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00010490761848115127,
+      "loss": 1.0358,
+      "step": 12323
+    },
+    {
+      "epoch": 0.8573515600542627,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.00010480716956384584,
+      "loss": 0.8605,
+      "step": 12324
+    },
+    {
+      "epoch": 0.857421127691398,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.00010470676609992014,
+      "loss": 0.928,
+      "step": 12325
+    },
+    {
+      "epoch": 0.8574906953285332,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.0001046064080944723,
+      "loss": 0.8935,
+      "step": 12326
+    },
+    {
+      "epoch": 0.8575602629656683,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.00010450609555259805,
+      "loss": 0.9788,
+      "step": 12327
+    },
+    {
+      "epoch": 0.8576298306028036,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00010440582847939061,
+      "loss": 0.6736,
+      "step": 12328
+    },
+    {
+      "epoch": 0.8576993982399388,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00010430560687994117,
+      "loss": 0.6788,
+      "step": 12329
+    },
+    {
+      "epoch": 0.857768965877074,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.00010420543075933786,
+      "loss": 0.7944,
+      "step": 12330
+    },
+    {
+      "epoch": 0.8578385335142092,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.00010410530012266817,
+      "loss": 0.8764,
+      "step": 12331
+    },
+    {
+      "epoch": 0.8579081011513444,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00010400521497501558,
+      "loss": 0.6809,
+      "step": 12332
+    },
+    {
+      "epoch": 0.8579776687884796,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.00010390517532146182,
+      "loss": 0.8418,
+      "step": 12333
+    },
+    {
+      "epoch": 0.8580472364256148,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.00010380518116708692,
+      "loss": 0.6544,
+      "step": 12334
+    },
+    {
+      "epoch": 0.85811680406275,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.00010370523251696751,
+      "loss": 0.891,
+      "step": 12335
+    },
+    {
+      "epoch": 0.8581863716998852,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.00010360532937617894,
+      "loss": 1.1266,
+      "step": 12336
+    },
+    {
+      "epoch": 0.8582559393370204,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0001035054717497933,
+      "loss": 0.5519,
+      "step": 12337
+    },
+    {
+      "epoch": 0.8583255069741557,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0001034056596428814,
+      "loss": 0.9505,
+      "step": 12338
+    },
+    {
+      "epoch": 0.8583950746112908,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.00010330589306051074,
+      "loss": 1.0161,
+      "step": 12339
+    },
+    {
+      "epoch": 0.858464642248426,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.00010320617200774718,
+      "loss": 0.7717,
+      "step": 12340
+    },
+    {
+      "epoch": 0.8585342098855613,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.0001031064964896542,
+      "loss": 0.9509,
+      "step": 12341
+    },
+    {
+      "epoch": 0.8586037775226965,
+      "grad_norm": 1.375,
+      "learning_rate": 0.00010300686651129265,
+      "loss": 1.0319,
+      "step": 12342
+    },
+    {
+      "epoch": 0.8586733451598316,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00010290728207772104,
+      "loss": 0.734,
+      "step": 12343
+    },
+    {
+      "epoch": 0.8587429127969669,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00010280774319399599,
+      "loss": 0.7712,
+      "step": 12344
+    },
+    {
+      "epoch": 0.8588124804341021,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00010270824986517169,
+      "loss": 0.87,
+      "step": 12345
+    },
+    {
+      "epoch": 0.8588820480712372,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.00010260880209629985,
+      "loss": 0.8685,
+      "step": 12346
+    },
+    {
+      "epoch": 0.8589516157083724,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.00010250939989242957,
+      "loss": 0.662,
+      "step": 12347
+    },
+    {
+      "epoch": 0.8590211833455077,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00010241004325860859,
+      "loss": 0.6317,
+      "step": 12348
+    },
+    {
+      "epoch": 0.8590907509826429,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00010231073219988108,
+      "loss": 0.663,
+      "step": 12349
+    },
+    {
+      "epoch": 0.859160318619778,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.00010221146672129022,
+      "loss": 1.0907,
+      "step": 12350
+    },
+    {
+      "epoch": 0.8592298862569133,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00010211224682787567,
+      "loss": 0.7271,
+      "step": 12351
+    },
+    {
+      "epoch": 0.8592994538940485,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00010201307252467573,
+      "loss": 0.7229,
+      "step": 12352
+    },
+    {
+      "epoch": 0.8593690215311837,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00010191394381672547,
+      "loss": 0.848,
+      "step": 12353
+    },
+    {
+      "epoch": 0.859438589168319,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00010181486070905855,
+      "loss": 0.9485,
+      "step": 12354
+    },
+    {
+      "epoch": 0.8595081568054541,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.00010171582320670602,
+      "loss": 0.6691,
+      "step": 12355
+    },
+    {
+      "epoch": 0.8595777244425893,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00010161683131469635,
+      "loss": 0.9354,
+      "step": 12356
+    },
+    {
+      "epoch": 0.8596472920797246,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00010151788503805548,
+      "loss": 0.8412,
+      "step": 12357
+    },
+    {
+      "epoch": 0.8597168597168597,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.00010141898438180785,
+      "loss": 0.8609,
+      "step": 12358
+    },
+    {
+      "epoch": 0.8597864273539949,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00010132012935097512,
+      "loss": 1.051,
+      "step": 12359
+    },
+    {
+      "epoch": 0.8598559949911301,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0001012213199505766,
+      "loss": 0.9528,
+      "step": 12360
+    },
+    {
+      "epoch": 0.8599255626282654,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.00010112255618562894,
+      "loss": 0.9181,
+      "step": 12361
+    },
+    {
+      "epoch": 0.8599951302654005,
+      "grad_norm": 1.25,
+      "learning_rate": 0.00010102383806114735,
+      "loss": 0.9213,
+      "step": 12362
+    },
+    {
+      "epoch": 0.8600646979025357,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00010092516558214427,
+      "loss": 0.7589,
+      "step": 12363
+    },
+    {
+      "epoch": 0.860134265539671,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00010082653875362946,
+      "loss": 0.8638,
+      "step": 12364
+    },
+    {
+      "epoch": 0.8602038331768062,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.00010072795758061082,
+      "loss": 0.7913,
+      "step": 12365
+    },
+    {
+      "epoch": 0.8602734008139413,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0001006294220680939,
+      "loss": 0.7034,
+      "step": 12366
+    },
+    {
+      "epoch": 0.8603429684510766,
+      "grad_norm": 1.5234375,
+      "learning_rate": 0.00010053093222108168,
+      "loss": 0.9527,
+      "step": 12367
+    },
+    {
+      "epoch": 0.8604125360882118,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.00010043248804457494,
+      "loss": 0.7554,
+      "step": 12368
+    },
+    {
+      "epoch": 0.860482103725347,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0001003340895435726,
+      "loss": 0.9477,
+      "step": 12369
+    },
+    {
+      "epoch": 0.8605516713624821,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00010023573672307052,
+      "loss": 0.8501,
+      "step": 12370
+    },
+    {
+      "epoch": 0.8606212389996174,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.00010013742958806238,
+      "loss": 0.6945,
+      "step": 12371
+    },
+    {
+      "epoch": 0.8606908066367526,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.00010003916814353986,
+      "loss": 0.9602,
+      "step": 12372
+    },
+    {
+      "epoch": 0.8607603742738877,
+      "grad_norm": 1.140625,
+      "learning_rate": 9.994095239449253e-05,
+      "loss": 0.6929,
+      "step": 12373
+    },
+    {
+      "epoch": 0.860829941911023,
+      "grad_norm": 0.828125,
+      "learning_rate": 9.984278234590694e-05,
+      "loss": 0.6558,
+      "step": 12374
+    },
+    {
+      "epoch": 0.8608995095481582,
+      "grad_norm": 1.15625,
+      "learning_rate": 9.974465800276755e-05,
+      "loss": 0.8409,
+      "step": 12375
+    },
+    {
+      "epoch": 0.8609690771852934,
+      "grad_norm": 1.1875,
+      "learning_rate": 9.964657937005683e-05,
+      "loss": 0.8428,
+      "step": 12376
+    },
+    {
+      "epoch": 0.8610386448224286,
+      "grad_norm": 0.78125,
+      "learning_rate": 9.95485464527549e-05,
+      "loss": 0.9456,
+      "step": 12377
+    },
+    {
+      "epoch": 0.8611082124595638,
+      "grad_norm": 1.265625,
+      "learning_rate": 9.945055925583913e-05,
+      "loss": 0.7942,
+      "step": 12378
+    },
+    {
+      "epoch": 0.861177780096699,
+      "grad_norm": 0.88671875,
+      "learning_rate": 9.935261778428473e-05,
+      "loss": 0.6238,
+      "step": 12379
+    },
+    {
+      "epoch": 0.8612473477338343,
+      "grad_norm": 1.1015625,
+      "learning_rate": 9.925472204306485e-05,
+      "loss": 0.8494,
+      "step": 12380
+    },
+    {
+      "epoch": 0.8613169153709694,
+      "grad_norm": 1.140625,
+      "learning_rate": 9.915687203715007e-05,
+      "loss": 0.7037,
+      "step": 12381
+    },
+    {
+      "epoch": 0.8613864830081046,
+      "grad_norm": 1.5546875,
+      "learning_rate": 9.905906777150874e-05,
+      "loss": 0.9521,
+      "step": 12382
+    },
+    {
+      "epoch": 0.8614560506452398,
+      "grad_norm": 1.3359375,
+      "learning_rate": 9.89613092511068e-05,
+      "loss": 0.9395,
+      "step": 12383
+    },
+    {
+      "epoch": 0.8615256182823751,
+      "grad_norm": 1.046875,
+      "learning_rate": 9.886359648090826e-05,
+      "loss": 0.7269,
+      "step": 12384
+    },
+    {
+      "epoch": 0.8615951859195102,
+      "grad_norm": 1.0390625,
+      "learning_rate": 9.876592946587393e-05,
+      "loss": 0.7753,
+      "step": 12385
+    },
+    {
+      "epoch": 0.8616647535566454,
+      "grad_norm": 0.98828125,
+      "learning_rate": 9.866830821096318e-05,
+      "loss": 0.7973,
+      "step": 12386
+    },
+    {
+      "epoch": 0.8617343211937807,
+      "grad_norm": 1.546875,
+      "learning_rate": 9.857073272113282e-05,
+      "loss": 0.6929,
+      "step": 12387
+    },
+    {
+      "epoch": 0.8618038888309159,
+      "grad_norm": 0.94140625,
+      "learning_rate": 9.847320300133722e-05,
+      "loss": 0.8213,
+      "step": 12388
+    },
+    {
+      "epoch": 0.861873456468051,
+      "grad_norm": 1.1015625,
+      "learning_rate": 9.837571905652808e-05,
+      "loss": 0.8065,
+      "step": 12389
+    },
+    {
+      "epoch": 0.8619430241051863,
+      "grad_norm": 1.21875,
+      "learning_rate": 9.827828089165547e-05,
+      "loss": 0.7282,
+      "step": 12390
+    },
+    {
+      "epoch": 0.8620125917423215,
+      "grad_norm": 0.83984375,
+      "learning_rate": 9.818088851166684e-05,
+      "loss": 0.6526,
+      "step": 12391
+    },
+    {
+      "epoch": 0.8620821593794566,
+      "grad_norm": 1.03125,
+      "learning_rate": 9.808354192150725e-05,
+      "loss": 0.918,
+      "step": 12392
+    },
+    {
+      "epoch": 0.8621517270165919,
+      "grad_norm": 1.1171875,
+      "learning_rate": 9.79862411261192e-05,
+      "loss": 1.1219,
+      "step": 12393
+    },
+    {
+      "epoch": 0.8622212946537271,
+      "grad_norm": 1.296875,
+      "learning_rate": 9.788898613044328e-05,
+      "loss": 0.8244,
+      "step": 12394
+    },
+    {
+      "epoch": 0.8622908622908623,
+      "grad_norm": 1.0546875,
+      "learning_rate": 9.779177693941799e-05,
+      "loss": 0.7914,
+      "step": 12395
+    },
+    {
+      "epoch": 0.8623604299279974,
+      "grad_norm": 0.91015625,
+      "learning_rate": 9.76946135579787e-05,
+      "loss": 0.6921,
+      "step": 12396
+    },
+    {
+      "epoch": 0.8624299975651327,
+      "grad_norm": 1.15625,
+      "learning_rate": 9.759749599105883e-05,
+      "loss": 0.9035,
+      "step": 12397
+    },
+    {
+      "epoch": 0.8624995652022679,
+      "grad_norm": 0.7578125,
+      "learning_rate": 9.750042424358984e-05,
+      "loss": 0.6005,
+      "step": 12398
+    },
+    {
+      "epoch": 0.8625691328394031,
+      "grad_norm": 1.2109375,
+      "learning_rate": 9.740339832050016e-05,
+      "loss": 0.8299,
+      "step": 12399
+    },
+    {
+      "epoch": 0.8626387004765383,
+      "grad_norm": 0.94140625,
+      "learning_rate": 9.730641822671649e-05,
+      "loss": 0.8842,
+      "step": 12400
+    },
+    {
+      "epoch": 0.8627082681136735,
+      "grad_norm": 1.1171875,
+      "learning_rate": 9.720948396716323e-05,
+      "loss": 1.0405,
+      "step": 12401
+    },
+    {
+      "epoch": 0.8627778357508087,
+      "grad_norm": 1.4609375,
+      "learning_rate": 9.711259554676188e-05,
+      "loss": 0.8272,
+      "step": 12402
+    },
+    {
+      "epoch": 0.862847403387944,
+      "grad_norm": 1.109375,
+      "learning_rate": 9.701575297043197e-05,
+      "loss": 0.9613,
+      "step": 12403
+    },
+    {
+      "epoch": 0.8629169710250791,
+      "grad_norm": 0.96484375,
+      "learning_rate": 9.691895624309066e-05,
+      "loss": 0.6554,
+      "step": 12404
+    },
+    {
+      "epoch": 0.8629865386622143,
+      "grad_norm": 0.79296875,
+      "learning_rate": 9.682220536965314e-05,
+      "loss": 0.6862,
+      "step": 12405
+    },
+    {
+      "epoch": 0.8630561062993496,
+      "grad_norm": 1.2890625,
+      "learning_rate": 9.672550035503158e-05,
+      "loss": 0.6961,
+      "step": 12406
+    },
+    {
+      "epoch": 0.8631256739364848,
+      "grad_norm": 1.203125,
+      "learning_rate": 9.662884120413617e-05,
+      "loss": 0.9276,
+      "step": 12407
+    },
+    {
+      "epoch": 0.8631952415736199,
+      "grad_norm": 0.890625,
+      "learning_rate": 9.653222792187489e-05,
+      "loss": 0.584,
+      "step": 12408
+    },
+    {
+      "epoch": 0.8632648092107551,
+      "grad_norm": 1.015625,
+      "learning_rate": 9.643566051315334e-05,
+      "loss": 0.6404,
+      "step": 12409
+    },
+    {
+      "epoch": 0.8633343768478904,
+      "grad_norm": 1.1484375,
+      "learning_rate": 9.633913898287472e-05,
+      "loss": 0.7392,
+      "step": 12410
+    },
+    {
+      "epoch": 0.8634039444850256,
+      "grad_norm": 1.0078125,
+      "learning_rate": 9.624266333593968e-05,
+      "loss": 0.6214,
+      "step": 12411
+    },
+    {
+      "epoch": 0.8634735121221607,
+      "grad_norm": 1.21875,
+      "learning_rate": 9.614623357724706e-05,
+      "loss": 0.9785,
+      "step": 12412
+    },
+    {
+      "epoch": 0.863543079759296,
+      "grad_norm": 1.0078125,
+      "learning_rate": 9.604984971169273e-05,
+      "loss": 0.7385,
+      "step": 12413
+    },
+    {
+      "epoch": 0.8636126473964312,
+      "grad_norm": 0.796875,
+      "learning_rate": 9.595351174417089e-05,
+      "loss": 0.6108,
+      "step": 12414
+    },
+    {
+      "epoch": 0.8636822150335663,
+      "grad_norm": 1.1640625,
+      "learning_rate": 9.585721967957306e-05,
+      "loss": 0.8732,
+      "step": 12415
+    },
+    {
+      "epoch": 0.8637517826707016,
+      "grad_norm": 1.4609375,
+      "learning_rate": 9.576097352278846e-05,
+      "loss": 0.8376,
+      "step": 12416
+    },
+    {
+      "epoch": 0.8638213503078368,
+      "grad_norm": 0.9765625,
+      "learning_rate": 9.566477327870371e-05,
+      "loss": 0.883,
+      "step": 12417
+    },
+    {
+      "epoch": 0.863890917944972,
+      "grad_norm": 1.34375,
+      "learning_rate": 9.55686189522036e-05,
+      "loss": 0.9439,
+      "step": 12418
+    },
+    {
+      "epoch": 0.8639604855821073,
+      "grad_norm": 1.1875,
+      "learning_rate": 9.547251054817052e-05,
+      "loss": 0.9326,
+      "step": 12419
+    },
+    {
+      "epoch": 0.8640300532192424,
+      "grad_norm": 1.1484375,
+      "learning_rate": 9.537644807148416e-05,
+      "loss": 0.6504,
+      "step": 12420
+    },
+    {
+      "epoch": 0.8640996208563776,
+      "grad_norm": 1.2421875,
+      "learning_rate": 9.528043152702204e-05,
+      "loss": 0.9633,
+      "step": 12421
+    },
+    {
+      "epoch": 0.8641691884935128,
+      "grad_norm": 0.9453125,
+      "learning_rate": 9.518446091965938e-05,
+      "loss": 0.7757,
+      "step": 12422
+    },
+    {
+      "epoch": 0.864238756130648,
+      "grad_norm": 1.203125,
+      "learning_rate": 9.50885362542695e-05,
+      "loss": 0.8814,
+      "step": 12423
+    },
+    {
+      "epoch": 0.8643083237677832,
+      "grad_norm": 1.109375,
+      "learning_rate": 9.49926575357225e-05,
+      "loss": 0.6348,
+      "step": 12424
+    },
+    {
+      "epoch": 0.8643778914049184,
+      "grad_norm": 1.15625,
+      "learning_rate": 9.489682476888673e-05,
+      "loss": 0.8008,
+      "step": 12425
+    },
+    {
+      "epoch": 0.8644474590420537,
+      "grad_norm": 1.1796875,
+      "learning_rate": 9.480103795862805e-05,
+      "loss": 0.8026,
+      "step": 12426
+    },
+    {
+      "epoch": 0.8645170266791888,
+      "grad_norm": 1.15625,
+      "learning_rate": 9.470529710981036e-05,
+      "loss": 0.8007,
+      "step": 12427
+    },
+    {
+      "epoch": 0.864586594316324,
+      "grad_norm": 1.15625,
+      "learning_rate": 9.460960222729443e-05,
+      "loss": 0.9158,
+      "step": 12428
+    },
+    {
+      "epoch": 0.8646561619534593,
+      "grad_norm": 1.4375,
+      "learning_rate": 9.45139533159396e-05,
+      "loss": 0.6803,
+      "step": 12429
+    },
+    {
+      "epoch": 0.8647257295905945,
+      "grad_norm": 1.203125,
+      "learning_rate": 9.441835038060221e-05,
+      "loss": 1.0275,
+      "step": 12430
+    },
+    {
+      "epoch": 0.8647952972277296,
+      "grad_norm": 1.234375,
+      "learning_rate": 9.432279342613637e-05,
+      "loss": 0.9827,
+      "step": 12431
+    },
+    {
+      "epoch": 0.8648648648648649,
+      "grad_norm": 0.984375,
+      "learning_rate": 9.42272824573941e-05,
+      "loss": 0.469,
+      "step": 12432
+    },
+    {
+      "epoch": 0.8649344325020001,
+      "grad_norm": 1.28125,
+      "learning_rate": 9.413181747922517e-05,
+      "loss": 0.8856,
+      "step": 12433
+    },
+    {
+      "epoch": 0.8650040001391353,
+      "grad_norm": 1.265625,
+      "learning_rate": 9.403639849647672e-05,
+      "loss": 0.8125,
+      "step": 12434
+    },
+    {
+      "epoch": 0.8650735677762704,
+      "grad_norm": 1.3984375,
+      "learning_rate": 9.39410255139933e-05,
+      "loss": 0.6766,
+      "step": 12435
+    },
+    {
+      "epoch": 0.8651431354134057,
+      "grad_norm": 0.99609375,
+      "learning_rate": 9.384569853661773e-05,
+      "loss": 0.8128,
+      "step": 12436
+    },
+    {
+      "epoch": 0.8652127030505409,
+      "grad_norm": 1.2421875,
+      "learning_rate": 9.375041756919045e-05,
+      "loss": 0.9982,
+      "step": 12437
+    },
+    {
+      "epoch": 0.865282270687676,
+      "grad_norm": 0.8828125,
+      "learning_rate": 9.365518261654904e-05,
+      "loss": 0.5813,
+      "step": 12438
+    },
+    {
+      "epoch": 0.8653518383248113,
+      "grad_norm": 1.0546875,
+      "learning_rate": 9.355999368352907e-05,
+      "loss": 0.8297,
+      "step": 12439
+    },
+    {
+      "epoch": 0.8654214059619465,
+      "grad_norm": 1.1015625,
+      "learning_rate": 9.346485077496369e-05,
+      "loss": 0.8446,
+      "step": 12440
+    },
+    {
+      "epoch": 0.8654909735990817,
+      "grad_norm": 1.0390625,
+      "learning_rate": 9.336975389568425e-05,
+      "loss": 0.7497,
+      "step": 12441
+    },
+    {
+      "epoch": 0.865560541236217,
+      "grad_norm": 0.94140625,
+      "learning_rate": 9.327470305051866e-05,
+      "loss": 0.7889,
+      "step": 12442
+    },
+    {
+      "epoch": 0.8656301088733521,
+      "grad_norm": 1.0859375,
+      "learning_rate": 9.317969824429363e-05,
+      "loss": 0.6662,
+      "step": 12443
+    },
+    {
+      "epoch": 0.8656996765104873,
+      "grad_norm": 1.015625,
+      "learning_rate": 9.308473948183283e-05,
+      "loss": 0.7784,
+      "step": 12444
+    },
+    {
+      "epoch": 0.8657692441476226,
+      "grad_norm": 1.0859375,
+      "learning_rate": 9.298982676795764e-05,
+      "loss": 0.8745,
+      "step": 12445
+    },
+    {
+      "epoch": 0.8658388117847577,
+      "grad_norm": 1.1015625,
+      "learning_rate": 9.289496010748722e-05,
+      "loss": 0.7813,
+      "step": 12446
+    },
+    {
+      "epoch": 0.8659083794218929,
+      "grad_norm": 1.109375,
+      "learning_rate": 9.280013950523891e-05,
+      "loss": 0.9556,
+      "step": 12447
+    },
+    {
+      "epoch": 0.8659779470590281,
+      "grad_norm": 0.98046875,
+      "learning_rate": 9.270536496602678e-05,
+      "loss": 0.8412,
+      "step": 12448
+    },
+    {
+      "epoch": 0.8660475146961634,
+      "grad_norm": 1.0625,
+      "learning_rate": 9.261063649466306e-05,
+      "loss": 0.7001,
+      "step": 12449
+    },
+    {
+      "epoch": 0.8661170823332985,
+      "grad_norm": 1.171875,
+      "learning_rate": 9.251595409595748e-05,
+      "loss": 0.8509,
+      "step": 12450
+    },
+    {
+      "epoch": 0.8661866499704337,
+      "grad_norm": 1.234375,
+      "learning_rate": 9.242131777471796e-05,
+      "loss": 0.6475,
+      "step": 12451
+    },
+    {
+      "epoch": 0.866256217607569,
+      "grad_norm": 1.21875,
+      "learning_rate": 9.232672753574944e-05,
+      "loss": 0.7832,
+      "step": 12452
+    },
+    {
+      "epoch": 0.8663257852447042,
+      "grad_norm": 0.9140625,
+      "learning_rate": 9.223218338385441e-05,
+      "loss": 0.7964,
+      "step": 12453
+    },
+    {
+      "epoch": 0.8663953528818393,
+      "grad_norm": 1.0234375,
+      "learning_rate": 9.21376853238336e-05,
+      "loss": 0.6256,
+      "step": 12454
+    },
+    {
+      "epoch": 0.8664649205189746,
+      "grad_norm": 1.2421875,
+      "learning_rate": 9.204323336048548e-05,
+      "loss": 0.8907,
+      "step": 12455
+    },
+    {
+      "epoch": 0.8665344881561098,
+      "grad_norm": 1.3203125,
+      "learning_rate": 9.194882749860545e-05,
+      "loss": 0.839,
+      "step": 12456
+    },
+    {
+      "epoch": 0.866604055793245,
+      "grad_norm": 1.109375,
+      "learning_rate": 9.185446774298678e-05,
+      "loss": 0.9559,
+      "step": 12457
+    },
+    {
+      "epoch": 0.8666736234303802,
+      "grad_norm": 1.28125,
+      "learning_rate": 9.176015409842098e-05,
+      "loss": 0.9003,
+      "step": 12458
+    },
+    {
+      "epoch": 0.8667431910675154,
+      "grad_norm": 1.1328125,
+      "learning_rate": 9.166588656969676e-05,
+      "loss": 1.0415,
+      "step": 12459
+    },
+    {
+      "epoch": 0.8668127587046506,
+      "grad_norm": 1.2734375,
+      "learning_rate": 9.157166516160031e-05,
+      "loss": 0.8276,
+      "step": 12460
+    },
+    {
+      "epoch": 0.8668823263417857,
+      "grad_norm": 1.25,
+      "learning_rate": 9.147748987891614e-05,
+      "loss": 0.9967,
+      "step": 12461
+    },
+    {
+      "epoch": 0.866951893978921,
+      "grad_norm": 1.59375,
+      "learning_rate": 9.138336072642573e-05,
+      "loss": 0.4936,
+      "step": 12462
+    },
+    {
+      "epoch": 0.8670214616160562,
+      "grad_norm": 1.015625,
+      "learning_rate": 9.128927770890826e-05,
+      "loss": 0.682,
+      "step": 12463
+    },
+    {
+      "epoch": 0.8670910292531914,
+      "grad_norm": 0.98828125,
+      "learning_rate": 9.119524083114106e-05,
+      "loss": 0.5948,
+      "step": 12464
+    },
+    {
+      "epoch": 0.8671605968903267,
+      "grad_norm": 1.203125,
+      "learning_rate": 9.110125009789905e-05,
+      "loss": 0.8522,
+      "step": 12465
+    },
+    {
+      "epoch": 0.8672301645274618,
+      "grad_norm": 0.9765625,
+      "learning_rate": 9.100730551395431e-05,
+      "loss": 0.9638,
+      "step": 12466
+    },
+    {
+      "epoch": 0.867299732164597,
+      "grad_norm": 1.109375,
+      "learning_rate": 9.09134070840767e-05,
+      "loss": 0.8597,
+      "step": 12467
+    },
+    {
+      "epoch": 0.8673692998017323,
+      "grad_norm": 1.0234375,
+      "learning_rate": 9.081955481303416e-05,
+      "loss": 0.6316,
+      "step": 12468
+    },
+    {
+      "epoch": 0.8674388674388674,
+      "grad_norm": 1.328125,
+      "learning_rate": 9.072574870559224e-05,
+      "loss": 1.031,
+      "step": 12469
+    },
+    {
+      "epoch": 0.8675084350760026,
+      "grad_norm": 1.09375,
+      "learning_rate": 9.06319887665138e-05,
+      "loss": 0.7982,
+      "step": 12470
+    },
+    {
+      "epoch": 0.8675780027131379,
+      "grad_norm": 0.953125,
+      "learning_rate": 9.053827500055911e-05,
+      "loss": 0.7964,
+      "step": 12471
+    },
+    {
+      "epoch": 0.8676475703502731,
+      "grad_norm": 1.1171875,
+      "learning_rate": 9.044460741248683e-05,
+      "loss": 0.8397,
+      "step": 12472
+    },
+    {
+      "epoch": 0.8677171379874082,
+      "grad_norm": 0.890625,
+      "learning_rate": 9.035098600705305e-05,
+      "loss": 0.7533,
+      "step": 12473
+    },
+    {
+      "epoch": 0.8677867056245434,
+      "grad_norm": 1.046875,
+      "learning_rate": 9.025741078901106e-05,
+      "loss": 0.6866,
+      "step": 12474
+    },
+    {
+      "epoch": 0.8678562732616787,
+      "grad_norm": 0.953125,
+      "learning_rate": 9.016388176311251e-05,
+      "loss": 0.5942,
+      "step": 12475
+    },
+    {
+      "epoch": 0.8679258408988139,
+      "grad_norm": 1.03125,
+      "learning_rate": 9.007039893410607e-05,
+      "loss": 0.7647,
+      "step": 12476
+    },
+    {
+      "epoch": 0.867995408535949,
+      "grad_norm": 1.2890625,
+      "learning_rate": 8.997696230673824e-05,
+      "loss": 0.8657,
+      "step": 12477
+    },
+    {
+      "epoch": 0.8680649761730843,
+      "grad_norm": 1.109375,
+      "learning_rate": 8.988357188575347e-05,
+      "loss": 0.6619,
+      "step": 12478
+    },
+    {
+      "epoch": 0.8681345438102195,
+      "grad_norm": 1.0859375,
+      "learning_rate": 8.979022767589373e-05,
+      "loss": 0.6795,
+      "step": 12479
+    },
+    {
+      "epoch": 0.8682041114473547,
+      "grad_norm": 1.0703125,
+      "learning_rate": 8.969692968189835e-05,
+      "loss": 0.7326,
+      "step": 12480
+    },
+    {
+      "epoch": 0.8682736790844899,
+      "grad_norm": 0.9921875,
+      "learning_rate": 8.960367790850455e-05,
+      "loss": 0.7366,
+      "step": 12481
+    },
+    {
+      "epoch": 0.8683432467216251,
+      "grad_norm": 1.296875,
+      "learning_rate": 8.951047236044719e-05,
+      "loss": 0.8505,
+      "step": 12482
+    },
+    {
+      "epoch": 0.8684128143587603,
+      "grad_norm": 0.97265625,
+      "learning_rate": 8.941731304245903e-05,
+      "loss": 1.0211,
+      "step": 12483
+    },
+    {
+      "epoch": 0.8684823819958956,
+      "grad_norm": 1.1328125,
+      "learning_rate": 8.932419995927e-05,
+      "loss": 0.8623,
+      "step": 12484
+    },
+    {
+      "epoch": 0.8685519496330307,
+      "grad_norm": 1.15625,
+      "learning_rate": 8.923113311560782e-05,
+      "loss": 0.746,
+      "step": 12485
+    },
+    {
+      "epoch": 0.8686215172701659,
+      "grad_norm": 1.296875,
+      "learning_rate": 8.913811251619807e-05,
+      "loss": 0.9333,
+      "step": 12486
+    },
+    {
+      "epoch": 0.8686910849073011,
+      "grad_norm": 1.1328125,
+      "learning_rate": 8.90451381657641e-05,
+      "loss": 0.9065,
+      "step": 12487
+    },
+    {
+      "epoch": 0.8687606525444364,
+      "grad_norm": 1.1796875,
+      "learning_rate": 8.89522100690262e-05,
+      "loss": 1.2835,
+      "step": 12488
+    },
+    {
+      "epoch": 0.8688302201815715,
+      "grad_norm": 1.1484375,
+      "learning_rate": 8.88593282307033e-05,
+      "loss": 0.8051,
+      "step": 12489
+    },
+    {
+      "epoch": 0.8688997878187067,
+      "grad_norm": 1.0,
+      "learning_rate": 8.876649265551107e-05,
+      "loss": 0.7495,
+      "step": 12490
+    },
+    {
+      "epoch": 0.868969355455842,
+      "grad_norm": 1.1484375,
+      "learning_rate": 8.86737033481635e-05,
+      "loss": 0.869,
+      "step": 12491
+    },
+    {
+      "epoch": 0.8690389230929771,
+      "grad_norm": 1.34375,
+      "learning_rate": 8.85809603133716e-05,
+      "loss": 0.8236,
+      "step": 12492
+    },
+    {
+      "epoch": 0.8691084907301123,
+      "grad_norm": 0.98828125,
+      "learning_rate": 8.848826355584494e-05,
+      "loss": 0.8449,
+      "step": 12493
+    },
+    {
+      "epoch": 0.8691780583672476,
+      "grad_norm": 1.046875,
+      "learning_rate": 8.839561308028987e-05,
+      "loss": 0.6904,
+      "step": 12494
+    },
+    {
+      "epoch": 0.8692476260043828,
+      "grad_norm": 0.88671875,
+      "learning_rate": 8.830300889141051e-05,
+      "loss": 0.5591,
+      "step": 12495
+    },
+    {
+      "epoch": 0.8693171936415179,
+      "grad_norm": 1.140625,
+      "learning_rate": 8.821045099390911e-05,
+      "loss": 0.8662,
+      "step": 12496
+    },
+    {
+      "epoch": 0.8693867612786532,
+      "grad_norm": 1.4296875,
+      "learning_rate": 8.811793939248547e-05,
+      "loss": 0.7743,
+      "step": 12497
+    },
+    {
+      "epoch": 0.8694563289157884,
+      "grad_norm": 1.140625,
+      "learning_rate": 8.802547409183659e-05,
+      "loss": 0.7568,
+      "step": 12498
+    },
+    {
+      "epoch": 0.8695258965529236,
+      "grad_norm": 1.0078125,
+      "learning_rate": 8.793305509665727e-05,
+      "loss": 0.7029,
+      "step": 12499
+    },
+    {
+      "epoch": 0.8695954641900587,
+      "grad_norm": 1.015625,
+      "learning_rate": 8.784068241164023e-05,
+      "loss": 0.7975,
+      "step": 12500
+    },
+    {
+      "epoch": 0.869665031827194,
+      "grad_norm": 1.109375,
+      "learning_rate": 8.774835604147602e-05,
+      "loss": 0.8389,
+      "step": 12501
+    },
+    {
+      "epoch": 0.8697345994643292,
+      "grad_norm": 0.9921875,
+      "learning_rate": 8.76560759908519e-05,
+      "loss": 0.9547,
+      "step": 12502
+    },
+    {
+      "epoch": 0.8698041671014644,
+      "grad_norm": 0.96484375,
+      "learning_rate": 8.75638422644539e-05,
+      "loss": 0.6406,
+      "step": 12503
+    },
+    {
+      "epoch": 0.8698737347385996,
+      "grad_norm": 1.078125,
+      "learning_rate": 8.747165486696474e-05,
+      "loss": 0.75,
+      "step": 12504
+    },
+    {
+      "epoch": 0.8699433023757348,
+      "grad_norm": 1.046875,
+      "learning_rate": 8.737951380306564e-05,
+      "loss": 0.7778,
+      "step": 12505
+    },
+    {
+      "epoch": 0.87001287001287,
+      "grad_norm": 0.95703125,
+      "learning_rate": 8.728741907743476e-05,
+      "loss": 0.7914,
+      "step": 12506
+    },
+    {
+      "epoch": 0.8700824376500053,
+      "grad_norm": 1.171875,
+      "learning_rate": 8.719537069474848e-05,
+      "loss": 0.8327,
+      "step": 12507
+    },
+    {
+      "epoch": 0.8701520052871404,
+      "grad_norm": 1.3515625,
+      "learning_rate": 8.71033686596805e-05,
+      "loss": 1.1135,
+      "step": 12508
+    },
+    {
+      "epoch": 0.8702215729242756,
+      "grad_norm": 0.92578125,
+      "learning_rate": 8.701141297690163e-05,
+      "loss": 0.8336,
+      "step": 12509
+    },
+    {
+      "epoch": 0.8702911405614109,
+      "grad_norm": 1.0703125,
+      "learning_rate": 8.69195036510818e-05,
+      "loss": 0.6165,
+      "step": 12510
+    },
+    {
+      "epoch": 0.870360708198546,
+      "grad_norm": 1.140625,
+      "learning_rate": 8.68276406868873e-05,
+      "loss": 0.895,
+      "step": 12511
+    },
+    {
+      "epoch": 0.8704302758356812,
+      "grad_norm": 1.421875,
+      "learning_rate": 8.673582408898251e-05,
+      "loss": 0.8865,
+      "step": 12512
+    },
+    {
+      "epoch": 0.8704998434728164,
+      "grad_norm": 1.0546875,
+      "learning_rate": 8.664405386202911e-05,
+      "loss": 0.735,
+      "step": 12513
+    },
+    {
+      "epoch": 0.8705694111099517,
+      "grad_norm": 1.3984375,
+      "learning_rate": 8.655233001068708e-05,
+      "loss": 0.9591,
+      "step": 12514
+    },
+    {
+      "epoch": 0.8706389787470868,
+      "grad_norm": 0.97265625,
+      "learning_rate": 8.646065253961377e-05,
+      "loss": 0.8405,
+      "step": 12515
+    },
+    {
+      "epoch": 0.870708546384222,
+      "grad_norm": 0.94921875,
+      "learning_rate": 8.636902145346381e-05,
+      "loss": 0.7767,
+      "step": 12516
+    },
+    {
+      "epoch": 0.8707781140213573,
+      "grad_norm": 1.1875,
+      "learning_rate": 8.627743675689004e-05,
+      "loss": 0.8149,
+      "step": 12517
+    },
+    {
+      "epoch": 0.8708476816584925,
+      "grad_norm": 0.85546875,
+      "learning_rate": 8.618589845454239e-05,
+      "loss": 0.7053,
+      "step": 12518
+    },
+    {
+      "epoch": 0.8709172492956276,
+      "grad_norm": 1.0234375,
+      "learning_rate": 8.609440655106903e-05,
+      "loss": 0.5503,
+      "step": 12519
+    },
+    {
+      "epoch": 0.8709868169327629,
+      "grad_norm": 1.125,
+      "learning_rate": 8.600296105111505e-05,
+      "loss": 0.6353,
+      "step": 12520
+    },
+    {
+      "epoch": 0.8710563845698981,
+      "grad_norm": 1.0234375,
+      "learning_rate": 8.591156195932403e-05,
+      "loss": 0.6243,
+      "step": 12521
+    },
+    {
+      "epoch": 0.8711259522070333,
+      "grad_norm": 1.3046875,
+      "learning_rate": 8.582020928033651e-05,
+      "loss": 0.9454,
+      "step": 12522
+    },
+    {
+      "epoch": 0.8711955198441685,
+      "grad_norm": 0.9296875,
+      "learning_rate": 8.572890301879066e-05,
+      "loss": 0.7238,
+      "step": 12523
+    },
+    {
+      "epoch": 0.8712650874813037,
+      "grad_norm": 1.3359375,
+      "learning_rate": 8.56376431793231e-05,
+      "loss": 0.849,
+      "step": 12524
+    },
+    {
+      "epoch": 0.8713346551184389,
+      "grad_norm": 1.234375,
+      "learning_rate": 8.554642976656734e-05,
+      "loss": 0.5616,
+      "step": 12525
+    },
+    {
+      "epoch": 0.871404222755574,
+      "grad_norm": 1.21875,
+      "learning_rate": 8.54552627851548e-05,
+      "loss": 0.7973,
+      "step": 12526
+    },
+    {
+      "epoch": 0.8714737903927093,
+      "grad_norm": 1.0703125,
+      "learning_rate": 8.5364142239714e-05,
+      "loss": 0.7257,
+      "step": 12527
+    },
+    {
+      "epoch": 0.8715433580298445,
+      "grad_norm": 0.97265625,
+      "learning_rate": 8.527306813487213e-05,
+      "loss": 0.6967,
+      "step": 12528
+    },
+    {
+      "epoch": 0.8716129256669797,
+      "grad_norm": 1.1796875,
+      "learning_rate": 8.518204047525336e-05,
+      "loss": 0.8197,
+      "step": 12529
+    },
+    {
+      "epoch": 0.871682493304115,
+      "grad_norm": 0.9140625,
+      "learning_rate": 8.509105926547945e-05,
+      "loss": 0.4892,
+      "step": 12530
+    },
+    {
+      "epoch": 0.8717520609412501,
+      "grad_norm": 1.40625,
+      "learning_rate": 8.500012451017014e-05,
+      "loss": 1.0708,
+      "step": 12531
+    },
+    {
+      "epoch": 0.8718216285783853,
+      "grad_norm": 0.9765625,
+      "learning_rate": 8.490923621394242e-05,
+      "loss": 0.7588,
+      "step": 12532
+    },
+    {
+      "epoch": 0.8718911962155206,
+      "grad_norm": 1.078125,
+      "learning_rate": 8.481839438141159e-05,
+      "loss": 0.7692,
+      "step": 12533
+    },
+    {
+      "epoch": 0.8719607638526558,
+      "grad_norm": 1.1015625,
+      "learning_rate": 8.472759901718952e-05,
+      "loss": 0.9075,
+      "step": 12534
+    },
+    {
+      "epoch": 0.8720303314897909,
+      "grad_norm": 0.6640625,
+      "learning_rate": 8.463685012588685e-05,
+      "loss": 0.5494,
+      "step": 12535
+    },
+    {
+      "epoch": 0.8720998991269262,
+      "grad_norm": 1.1015625,
+      "learning_rate": 8.4546147712111e-05,
+      "loss": 0.7523,
+      "step": 12536
+    },
+    {
+      "epoch": 0.8721694667640614,
+      "grad_norm": 1.0859375,
+      "learning_rate": 8.445549178046774e-05,
+      "loss": 0.7254,
+      "step": 12537
+    },
+    {
+      "epoch": 0.8722390344011965,
+      "grad_norm": 0.8515625,
+      "learning_rate": 8.436488233555973e-05,
+      "loss": 0.5229,
+      "step": 12538
+    },
+    {
+      "epoch": 0.8723086020383317,
+      "grad_norm": 0.984375,
+      "learning_rate": 8.427431938198805e-05,
+      "loss": 0.5742,
+      "step": 12539
+    },
+    {
+      "epoch": 0.872378169675467,
+      "grad_norm": 1.1015625,
+      "learning_rate": 8.418380292435079e-05,
+      "loss": 0.8908,
+      "step": 12540
+    },
+    {
+      "epoch": 0.8724477373126022,
+      "grad_norm": 1.1171875,
+      "learning_rate": 8.409333296724364e-05,
+      "loss": 0.7893,
+      "step": 12541
+    },
+    {
+      "epoch": 0.8725173049497373,
+      "grad_norm": 1.1015625,
+      "learning_rate": 8.40029095152609e-05,
+      "loss": 0.546,
+      "step": 12542
+    },
+    {
+      "epoch": 0.8725868725868726,
+      "grad_norm": 1.2890625,
+      "learning_rate": 8.391253257299336e-05,
+      "loss": 0.6896,
+      "step": 12543
+    },
+    {
+      "epoch": 0.8726564402240078,
+      "grad_norm": 1.0859375,
+      "learning_rate": 8.382220214503011e-05,
+      "loss": 0.6414,
+      "step": 12544
+    },
+    {
+      "epoch": 0.872726007861143,
+      "grad_norm": 1.1875,
+      "learning_rate": 8.373191823595727e-05,
+      "loss": 0.7615,
+      "step": 12545
+    },
+    {
+      "epoch": 0.8727955754982782,
+      "grad_norm": 1.2421875,
+      "learning_rate": 8.364168085035939e-05,
+      "loss": 0.8378,
+      "step": 12546
+    },
+    {
+      "epoch": 0.8728651431354134,
+      "grad_norm": 0.95703125,
+      "learning_rate": 8.355148999281825e-05,
+      "loss": 0.7706,
+      "step": 12547
+    },
+    {
+      "epoch": 0.8729347107725486,
+      "grad_norm": 1.15625,
+      "learning_rate": 8.346134566791308e-05,
+      "loss": 0.863,
+      "step": 12548
+    },
+    {
+      "epoch": 0.8730042784096839,
+      "grad_norm": 1.1171875,
+      "learning_rate": 8.337124788022122e-05,
+      "loss": 0.632,
+      "step": 12549
+    },
+    {
+      "epoch": 0.873073846046819,
+      "grad_norm": 1.0390625,
+      "learning_rate": 8.32811966343171e-05,
+      "loss": 0.8922,
+      "step": 12550
+    },
+    {
+      "epoch": 0.8731434136839542,
+      "grad_norm": 1.2109375,
+      "learning_rate": 8.319119193477342e-05,
+      "loss": 0.8238,
+      "step": 12551
+    },
+    {
+      "epoch": 0.8732129813210894,
+      "grad_norm": 1.3203125,
+      "learning_rate": 8.310123378615975e-05,
+      "loss": 1.0244,
+      "step": 12552
+    },
+    {
+      "epoch": 0.8732825489582247,
+      "grad_norm": 1.3046875,
+      "learning_rate": 8.301132219304408e-05,
+      "loss": 0.9827,
+      "step": 12553
+    },
+    {
+      "epoch": 0.8733521165953598,
+      "grad_norm": 1.21875,
+      "learning_rate": 8.292145715999144e-05,
+      "loss": 0.6882,
+      "step": 12554
+    },
+    {
+      "epoch": 0.873421684232495,
+      "grad_norm": 0.99609375,
+      "learning_rate": 8.283163869156451e-05,
+      "loss": 0.6744,
+      "step": 12555
+    },
+    {
+      "epoch": 0.8734912518696303,
+      "grad_norm": 1.09375,
+      "learning_rate": 8.274186679232443e-05,
+      "loss": 0.7423,
+      "step": 12556
+    },
+    {
+      "epoch": 0.8735608195067655,
+      "grad_norm": 1.484375,
+      "learning_rate": 8.265214146682909e-05,
+      "loss": 0.9127,
+      "step": 12557
+    },
+    {
+      "epoch": 0.8736303871439006,
+      "grad_norm": 1.34375,
+      "learning_rate": 8.256246271963419e-05,
+      "loss": 0.7658,
+      "step": 12558
+    },
+    {
+      "epoch": 0.8736999547810359,
+      "grad_norm": 1.1328125,
+      "learning_rate": 8.247283055529298e-05,
+      "loss": 0.9081,
+      "step": 12559
+    },
+    {
+      "epoch": 0.8737695224181711,
+      "grad_norm": 1.578125,
+      "learning_rate": 8.238324497835681e-05,
+      "loss": 0.7911,
+      "step": 12560
+    },
+    {
+      "epoch": 0.8738390900553062,
+      "grad_norm": 1.0625,
+      "learning_rate": 8.229370599337449e-05,
+      "loss": 0.7267,
+      "step": 12561
+    },
+    {
+      "epoch": 0.8739086576924415,
+      "grad_norm": 1.375,
+      "learning_rate": 8.220421360489205e-05,
+      "loss": 0.823,
+      "step": 12562
+    },
+    {
+      "epoch": 0.8739782253295767,
+      "grad_norm": 1.1015625,
+      "learning_rate": 8.211476781745375e-05,
+      "loss": 0.8053,
+      "step": 12563
+    },
+    {
+      "epoch": 0.8740477929667119,
+      "grad_norm": 1.0078125,
+      "learning_rate": 8.202536863560083e-05,
+      "loss": 0.6133,
+      "step": 12564
+    },
+    {
+      "epoch": 0.874117360603847,
+      "grad_norm": 1.140625,
+      "learning_rate": 8.193601606387302e-05,
+      "loss": 0.8156,
+      "step": 12565
+    },
+    {
+      "epoch": 0.8741869282409823,
+      "grad_norm": 1.359375,
+      "learning_rate": 8.184671010680677e-05,
+      "loss": 1.0027,
+      "step": 12566
+    },
+    {
+      "epoch": 0.8742564958781175,
+      "grad_norm": 1.0546875,
+      "learning_rate": 8.175745076893681e-05,
+      "loss": 0.8141,
+      "step": 12567
+    },
+    {
+      "epoch": 0.8743260635152527,
+      "grad_norm": 1.4140625,
+      "learning_rate": 8.166823805479507e-05,
+      "loss": 0.8223,
+      "step": 12568
+    },
+    {
+      "epoch": 0.8743956311523879,
+      "grad_norm": 1.0234375,
+      "learning_rate": 8.157907196891157e-05,
+      "loss": 0.6991,
+      "step": 12569
+    },
+    {
+      "epoch": 0.8744651987895231,
+      "grad_norm": 0.83984375,
+      "learning_rate": 8.14899525158137e-05,
+      "loss": 0.6608,
+      "step": 12570
+    },
+    {
+      "epoch": 0.8745347664266583,
+      "grad_norm": 0.94140625,
+      "learning_rate": 8.14008797000264e-05,
+      "loss": 0.9501,
+      "step": 12571
+    },
+    {
+      "epoch": 0.8746043340637936,
+      "grad_norm": 1.046875,
+      "learning_rate": 8.13118535260724e-05,
+      "loss": 0.6796,
+      "step": 12572
+    },
+    {
+      "epoch": 0.8746739017009287,
+      "grad_norm": 2.09375,
+      "learning_rate": 8.122287399847173e-05,
+      "loss": 1.0462,
+      "step": 12573
+    },
+    {
+      "epoch": 0.8747434693380639,
+      "grad_norm": 1.203125,
+      "learning_rate": 8.113394112174255e-05,
+      "loss": 0.7808,
+      "step": 12574
+    },
+    {
+      "epoch": 0.8748130369751992,
+      "grad_norm": 1.203125,
+      "learning_rate": 8.10450549004006e-05,
+      "loss": 0.829,
+      "step": 12575
+    },
+    {
+      "epoch": 0.8748826046123344,
+      "grad_norm": 0.8828125,
+      "learning_rate": 8.095621533895869e-05,
+      "loss": 0.7831,
+      "step": 12576
+    },
+    {
+      "epoch": 0.8749521722494695,
+      "grad_norm": 1.21875,
+      "learning_rate": 8.086742244192802e-05,
+      "loss": 0.7498,
+      "step": 12577
+    },
+    {
+      "epoch": 0.8750217398866047,
+      "grad_norm": 1.2578125,
+      "learning_rate": 8.077867621381662e-05,
+      "loss": 0.8737,
+      "step": 12578
+    },
+    {
+      "epoch": 0.87509130752374,
+      "grad_norm": 0.8984375,
+      "learning_rate": 8.068997665913113e-05,
+      "loss": 0.7675,
+      "step": 12579
+    },
+    {
+      "epoch": 0.8751608751608752,
+      "grad_norm": 1.1953125,
+      "learning_rate": 8.060132378237473e-05,
+      "loss": 1.011,
+      "step": 12580
+    },
+    {
+      "epoch": 0.8752304427980103,
+      "grad_norm": 1.140625,
+      "learning_rate": 8.051271758804913e-05,
+      "loss": 0.7039,
+      "step": 12581
+    },
+    {
+      "epoch": 0.8753000104351456,
+      "grad_norm": 1.1171875,
+      "learning_rate": 8.042415808065306e-05,
+      "loss": 0.7051,
+      "step": 12582
+    },
+    {
+      "epoch": 0.8753695780722808,
+      "grad_norm": 1.046875,
+      "learning_rate": 8.033564526468318e-05,
+      "loss": 0.7209,
+      "step": 12583
+    },
+    {
+      "epoch": 0.875439145709416,
+      "grad_norm": 1.4375,
+      "learning_rate": 8.024717914463397e-05,
+      "loss": 0.9931,
+      "step": 12584
+    },
+    {
+      "epoch": 0.8755087133465512,
+      "grad_norm": 1.21875,
+      "learning_rate": 8.01587597249972e-05,
+      "loss": 1.0059,
+      "step": 12585
+    },
+    {
+      "epoch": 0.8755782809836864,
+      "grad_norm": 1.0,
+      "learning_rate": 8.007038701026215e-05,
+      "loss": 0.8795,
+      "step": 12586
+    },
+    {
+      "epoch": 0.8756478486208216,
+      "grad_norm": 1.0390625,
+      "learning_rate": 7.998206100491578e-05,
+      "loss": 0.7554,
+      "step": 12587
+    },
+    {
+      "epoch": 0.8757174162579568,
+      "grad_norm": 1.0234375,
+      "learning_rate": 7.989378171344341e-05,
+      "loss": 0.7503,
+      "step": 12588
+    },
+    {
+      "epoch": 0.875786983895092,
+      "grad_norm": 1.171875,
+      "learning_rate": 7.980554914032712e-05,
+      "loss": 0.9079,
+      "step": 12589
+    },
+    {
+      "epoch": 0.8758565515322272,
+      "grad_norm": 0.6953125,
+      "learning_rate": 7.971736329004675e-05,
+      "loss": 0.5777,
+      "step": 12590
+    },
+    {
+      "epoch": 0.8759261191693624,
+      "grad_norm": 2.09375,
+      "learning_rate": 7.962922416708029e-05,
+      "loss": 0.9667,
+      "step": 12591
+    },
+    {
+      "epoch": 0.8759956868064976,
+      "grad_norm": 1.1875,
+      "learning_rate": 7.954113177590272e-05,
+      "loss": 0.9835,
+      "step": 12592
+    },
+    {
+      "epoch": 0.8760652544436328,
+      "grad_norm": 1.28125,
+      "learning_rate": 7.945308612098712e-05,
+      "loss": 0.9883,
+      "step": 12593
+    },
+    {
+      "epoch": 0.876134822080768,
+      "grad_norm": 1.21875,
+      "learning_rate": 7.93650872068038e-05,
+      "loss": 0.8747,
+      "step": 12594
+    },
+    {
+      "epoch": 0.8762043897179033,
+      "grad_norm": 1.046875,
+      "learning_rate": 7.927713503782107e-05,
+      "loss": 0.8781,
+      "step": 12595
+    },
+    {
+      "epoch": 0.8762739573550384,
+      "grad_norm": 1.1328125,
+      "learning_rate": 7.91892296185045e-05,
+      "loss": 0.7986,
+      "step": 12596
+    },
+    {
+      "epoch": 0.8763435249921736,
+      "grad_norm": 1.1015625,
+      "learning_rate": 7.91013709533177e-05,
+      "loss": 0.7469,
+      "step": 12597
+    },
+    {
+      "epoch": 0.8764130926293089,
+      "grad_norm": 1.0,
+      "learning_rate": 7.90135590467217e-05,
+      "loss": 0.9099,
+      "step": 12598
+    },
+    {
+      "epoch": 0.8764826602664441,
+      "grad_norm": 1.1484375,
+      "learning_rate": 7.892579390317511e-05,
+      "loss": 0.7349,
+      "step": 12599
+    },
+    {
+      "epoch": 0.8765522279035792,
+      "grad_norm": 0.9765625,
+      "learning_rate": 7.883807552713384e-05,
+      "loss": 0.6099,
+      "step": 12600
+    },
+    {
+      "epoch": 0.8766217955407145,
+      "grad_norm": 1.046875,
+      "learning_rate": 7.875040392305222e-05,
+      "loss": 0.9179,
+      "step": 12601
+    },
+    {
+      "epoch": 0.8766913631778497,
+      "grad_norm": 1.03125,
+      "learning_rate": 7.866277909538177e-05,
+      "loss": 0.8175,
+      "step": 12602
+    },
+    {
+      "epoch": 0.8767609308149849,
+      "grad_norm": 1.4609375,
+      "learning_rate": 7.857520104857163e-05,
+      "loss": 1.1682,
+      "step": 12603
+    },
+    {
+      "epoch": 0.87683049845212,
+      "grad_norm": 1.0703125,
+      "learning_rate": 7.848766978706812e-05,
+      "loss": 0.6443,
+      "step": 12604
+    },
+    {
+      "epoch": 0.8769000660892553,
+      "grad_norm": 1.296875,
+      "learning_rate": 7.840018531531623e-05,
+      "loss": 0.7135,
+      "step": 12605
+    },
+    {
+      "epoch": 0.8769696337263905,
+      "grad_norm": 1.2421875,
+      "learning_rate": 7.831274763775754e-05,
+      "loss": 0.9032,
+      "step": 12606
+    },
+    {
+      "epoch": 0.8770392013635256,
+      "grad_norm": 1.453125,
+      "learning_rate": 7.822535675883202e-05,
+      "loss": 0.734,
+      "step": 12607
+    },
+    {
+      "epoch": 0.8771087690006609,
+      "grad_norm": 0.875,
+      "learning_rate": 7.813801268297672e-05,
+      "loss": 0.726,
+      "step": 12608
+    },
+    {
+      "epoch": 0.8771783366377961,
+      "grad_norm": 1.109375,
+      "learning_rate": 7.805071541462672e-05,
+      "loss": 0.7206,
+      "step": 12609
+    },
+    {
+      "epoch": 0.8772479042749313,
+      "grad_norm": 1.2734375,
+      "learning_rate": 7.796346495821415e-05,
+      "loss": 0.7493,
+      "step": 12610
+    },
+    {
+      "epoch": 0.8773174719120665,
+      "grad_norm": 0.83984375,
+      "learning_rate": 7.78762613181696e-05,
+      "loss": 0.8072,
+      "step": 12611
+    },
+    {
+      "epoch": 0.8773870395492017,
+      "grad_norm": 0.84765625,
+      "learning_rate": 7.778910449892074e-05,
+      "loss": 0.6677,
+      "step": 12612
+    },
+    {
+      "epoch": 0.8774566071863369,
+      "grad_norm": 1.296875,
+      "learning_rate": 7.770199450489279e-05,
+      "loss": 0.7072,
+      "step": 12613
+    },
+    {
+      "epoch": 0.8775261748234722,
+      "grad_norm": 1.4453125,
+      "learning_rate": 7.761493134050879e-05,
+      "loss": 0.9677,
+      "step": 12614
+    },
+    {
+      "epoch": 0.8775957424606073,
+      "grad_norm": 1.078125,
+      "learning_rate": 7.75279150101893e-05,
+      "loss": 1.0538,
+      "step": 12615
+    },
+    {
+      "epoch": 0.8776653100977425,
+      "grad_norm": 1.15625,
+      "learning_rate": 7.744094551835291e-05,
+      "loss": 0.9044,
+      "step": 12616
+    },
+    {
+      "epoch": 0.8777348777348777,
+      "grad_norm": 1.40625,
+      "learning_rate": 7.735402286941528e-05,
+      "loss": 1.0882,
+      "step": 12617
+    },
+    {
+      "epoch": 0.877804445372013,
+      "grad_norm": 1.484375,
+      "learning_rate": 7.726714706778992e-05,
+      "loss": 0.9486,
+      "step": 12618
+    },
+    {
+      "epoch": 0.8778740130091481,
+      "grad_norm": 1.0703125,
+      "learning_rate": 7.71803181178875e-05,
+      "loss": 0.8556,
+      "step": 12619
+    },
+    {
+      "epoch": 0.8779435806462833,
+      "grad_norm": 1.09375,
+      "learning_rate": 7.709353602411751e-05,
+      "loss": 0.6466,
+      "step": 12620
+    },
+    {
+      "epoch": 0.8780131482834186,
+      "grad_norm": 1.1953125,
+      "learning_rate": 7.700680079088595e-05,
+      "loss": 0.9507,
+      "step": 12621
+    },
+    {
+      "epoch": 0.8780827159205538,
+      "grad_norm": 1.046875,
+      "learning_rate": 7.692011242259677e-05,
+      "loss": 0.7419,
+      "step": 12622
+    },
+    {
+      "epoch": 0.8781522835576889,
+      "grad_norm": 1.2578125,
+      "learning_rate": 7.683347092365166e-05,
+      "loss": 0.9697,
+      "step": 12623
+    },
+    {
+      "epoch": 0.8782218511948242,
+      "grad_norm": 0.98046875,
+      "learning_rate": 7.674687629844967e-05,
+      "loss": 0.7267,
+      "step": 12624
+    },
+    {
+      "epoch": 0.8782914188319594,
+      "grad_norm": 1.0078125,
+      "learning_rate": 7.666032855138793e-05,
+      "loss": 0.6941,
+      "step": 12625
+    },
+    {
+      "epoch": 0.8783609864690946,
+      "grad_norm": 0.890625,
+      "learning_rate": 7.65738276868605e-05,
+      "loss": 0.6775,
+      "step": 12626
+    },
+    {
+      "epoch": 0.8784305541062298,
+      "grad_norm": 1.1484375,
+      "learning_rate": 7.648737370925995e-05,
+      "loss": 0.8829,
+      "step": 12627
+    },
+    {
+      "epoch": 0.878500121743365,
+      "grad_norm": 1.140625,
+      "learning_rate": 7.640096662297547e-05,
+      "loss": 1.0224,
+      "step": 12628
+    },
+    {
+      "epoch": 0.8785696893805002,
+      "grad_norm": 1.078125,
+      "learning_rate": 7.631460643239463e-05,
+      "loss": 0.9203,
+      "step": 12629
+    },
+    {
+      "epoch": 0.8786392570176353,
+      "grad_norm": 1.140625,
+      "learning_rate": 7.62282931419026e-05,
+      "loss": 0.7338,
+      "step": 12630
+    },
+    {
+      "epoch": 0.8787088246547706,
+      "grad_norm": 1.1171875,
+      "learning_rate": 7.614202675588167e-05,
+      "loss": 0.7872,
+      "step": 12631
+    },
+    {
+      "epoch": 0.8787783922919058,
+      "grad_norm": 0.91796875,
+      "learning_rate": 7.605580727871175e-05,
+      "loss": 0.8034,
+      "step": 12632
+    },
+    {
+      "epoch": 0.878847959929041,
+      "grad_norm": 0.94921875,
+      "learning_rate": 7.596963471477103e-05,
+      "loss": 0.4924,
+      "step": 12633
+    },
+    {
+      "epoch": 0.8789175275661762,
+      "grad_norm": 1.0546875,
+      "learning_rate": 7.5883509068435e-05,
+      "loss": 0.9425,
+      "step": 12634
+    },
+    {
+      "epoch": 0.8789870952033114,
+      "grad_norm": 1.203125,
+      "learning_rate": 7.579743034407638e-05,
+      "loss": 1.0256,
+      "step": 12635
+    },
+    {
+      "epoch": 0.8790566628404466,
+      "grad_norm": 1.1640625,
+      "learning_rate": 7.571139854606579e-05,
+      "loss": 0.8888,
+      "step": 12636
+    },
+    {
+      "epoch": 0.8791262304775819,
+      "grad_norm": 0.9453125,
+      "learning_rate": 7.562541367877184e-05,
+      "loss": 0.8218,
+      "step": 12637
+    },
+    {
+      "epoch": 0.879195798114717,
+      "grad_norm": 0.9375,
+      "learning_rate": 7.553947574655995e-05,
+      "loss": 0.5736,
+      "step": 12638
+    },
+    {
+      "epoch": 0.8792653657518522,
+      "grad_norm": 0.90625,
+      "learning_rate": 7.545358475379405e-05,
+      "loss": 0.7649,
+      "step": 12639
+    },
+    {
+      "epoch": 0.8793349333889875,
+      "grad_norm": 0.984375,
+      "learning_rate": 7.536774070483488e-05,
+      "loss": 0.7031,
+      "step": 12640
+    },
+    {
+      "epoch": 0.8794045010261227,
+      "grad_norm": 1.0703125,
+      "learning_rate": 7.52819436040415e-05,
+      "loss": 0.8432,
+      "step": 12641
+    },
+    {
+      "epoch": 0.8794740686632578,
+      "grad_norm": 1.3203125,
+      "learning_rate": 7.519619345577e-05,
+      "loss": 1.072,
+      "step": 12642
+    },
+    {
+      "epoch": 0.879543636300393,
+      "grad_norm": 1.234375,
+      "learning_rate": 7.511049026437434e-05,
+      "loss": 0.7029,
+      "step": 12643
+    },
+    {
+      "epoch": 0.8796132039375283,
+      "grad_norm": 0.92578125,
+      "learning_rate": 7.502483403420646e-05,
+      "loss": 0.656,
+      "step": 12644
+    },
+    {
+      "epoch": 0.8796827715746635,
+      "grad_norm": 0.99609375,
+      "learning_rate": 7.493922476961523e-05,
+      "loss": 0.7228,
+      "step": 12645
+    },
+    {
+      "epoch": 0.8797523392117986,
+      "grad_norm": 1.203125,
+      "learning_rate": 7.48536624749474e-05,
+      "loss": 0.7988,
+      "step": 12646
+    },
+    {
+      "epoch": 0.8798219068489339,
+      "grad_norm": 1.15625,
+      "learning_rate": 7.476814715454738e-05,
+      "loss": 0.9107,
+      "step": 12647
+    },
+    {
+      "epoch": 0.8798914744860691,
+      "grad_norm": 0.8984375,
+      "learning_rate": 7.46826788127577e-05,
+      "loss": 0.5806,
+      "step": 12648
+    },
+    {
+      "epoch": 0.8799610421232043,
+      "grad_norm": 1.0234375,
+      "learning_rate": 7.459725745391743e-05,
+      "loss": 0.7292,
+      "step": 12649
+    },
+    {
+      "epoch": 0.8800306097603395,
+      "grad_norm": 0.94921875,
+      "learning_rate": 7.451188308236401e-05,
+      "loss": 0.7109,
+      "step": 12650
+    },
+    {
+      "epoch": 0.8801001773974747,
+      "grad_norm": 1.03125,
+      "learning_rate": 7.44265557024324e-05,
+      "loss": 0.7169,
+      "step": 12651
+    },
+    {
+      "epoch": 0.8801697450346099,
+      "grad_norm": 1.4140625,
+      "learning_rate": 7.434127531845514e-05,
+      "loss": 0.8128,
+      "step": 12652
+    },
+    {
+      "epoch": 0.8802393126717452,
+      "grad_norm": 1.5859375,
+      "learning_rate": 7.425604193476232e-05,
+      "loss": 1.0139,
+      "step": 12653
+    },
+    {
+      "epoch": 0.8803088803088803,
+      "grad_norm": 1.1484375,
+      "learning_rate": 7.417085555568137e-05,
+      "loss": 0.8128,
+      "step": 12654
+    },
+    {
+      "epoch": 0.8803784479460155,
+      "grad_norm": 0.83984375,
+      "learning_rate": 7.408571618553794e-05,
+      "loss": 0.8119,
+      "step": 12655
+    },
+    {
+      "epoch": 0.8804480155831507,
+      "grad_norm": 0.91015625,
+      "learning_rate": 7.400062382865491e-05,
+      "loss": 0.6313,
+      "step": 12656
+    },
+    {
+      "epoch": 0.880517583220286,
+      "grad_norm": 1.1953125,
+      "learning_rate": 7.39155784893527e-05,
+      "loss": 0.7273,
+      "step": 12657
+    },
+    {
+      "epoch": 0.8805871508574211,
+      "grad_norm": 1.3671875,
+      "learning_rate": 7.383058017194976e-05,
+      "loss": 0.977,
+      "step": 12658
+    },
+    {
+      "epoch": 0.8806567184945563,
+      "grad_norm": 0.84765625,
+      "learning_rate": 7.374562888076175e-05,
+      "loss": 0.4533,
+      "step": 12659
+    },
+    {
+      "epoch": 0.8807262861316916,
+      "grad_norm": 1.0546875,
+      "learning_rate": 7.366072462010187e-05,
+      "loss": 0.6656,
+      "step": 12660
+    },
+    {
+      "epoch": 0.8807958537688267,
+      "grad_norm": 0.9921875,
+      "learning_rate": 7.357586739428135e-05,
+      "loss": 0.7439,
+      "step": 12661
+    },
+    {
+      "epoch": 0.8808654214059619,
+      "grad_norm": 1.21875,
+      "learning_rate": 7.349105720760884e-05,
+      "loss": 0.7649,
+      "step": 12662
+    },
+    {
+      "epoch": 0.8809349890430972,
+      "grad_norm": 0.9296875,
+      "learning_rate": 7.340629406439048e-05,
+      "loss": 0.7316,
+      "step": 12663
+    },
+    {
+      "epoch": 0.8810045566802324,
+      "grad_norm": 1.0390625,
+      "learning_rate": 7.332157796893002e-05,
+      "loss": 0.6436,
+      "step": 12664
+    },
+    {
+      "epoch": 0.8810741243173675,
+      "grad_norm": 1.2421875,
+      "learning_rate": 7.323690892552903e-05,
+      "loss": 0.7449,
+      "step": 12665
+    },
+    {
+      "epoch": 0.8811436919545028,
+      "grad_norm": 1.1171875,
+      "learning_rate": 7.315228693848674e-05,
+      "loss": 0.7347,
+      "step": 12666
+    },
+    {
+      "epoch": 0.881213259591638,
+      "grad_norm": 0.9140625,
+      "learning_rate": 7.306771201209961e-05,
+      "loss": 0.604,
+      "step": 12667
+    },
+    {
+      "epoch": 0.8812828272287732,
+      "grad_norm": 1.3046875,
+      "learning_rate": 7.298318415066186e-05,
+      "loss": 1.0214,
+      "step": 12668
+    },
+    {
+      "epoch": 0.8813523948659083,
+      "grad_norm": 1.03125,
+      "learning_rate": 7.289870335846571e-05,
+      "loss": 0.683,
+      "step": 12669
+    },
+    {
+      "epoch": 0.8814219625030436,
+      "grad_norm": 0.87890625,
+      "learning_rate": 7.28142696398002e-05,
+      "loss": 0.4949,
+      "step": 12670
+    },
+    {
+      "epoch": 0.8814915301401788,
+      "grad_norm": 1.125,
+      "learning_rate": 7.272988299895278e-05,
+      "loss": 0.9083,
+      "step": 12671
+    },
+    {
+      "epoch": 0.881561097777314,
+      "grad_norm": 1.5078125,
+      "learning_rate": 7.264554344020835e-05,
+      "loss": 0.964,
+      "step": 12672
+    },
+    {
+      "epoch": 0.8816306654144492,
+      "grad_norm": 1.28125,
+      "learning_rate": 7.256125096784893e-05,
+      "loss": 0.7362,
+      "step": 12673
+    },
+    {
+      "epoch": 0.8817002330515844,
+      "grad_norm": 1.2109375,
+      "learning_rate": 7.247700558615433e-05,
+      "loss": 0.9041,
+      "step": 12674
+    },
+    {
+      "epoch": 0.8817698006887196,
+      "grad_norm": 0.96875,
+      "learning_rate": 7.239280729940234e-05,
+      "loss": 0.6304,
+      "step": 12675
+    },
+    {
+      "epoch": 0.8818393683258549,
+      "grad_norm": 1.328125,
+      "learning_rate": 7.230865611186833e-05,
+      "loss": 0.8177,
+      "step": 12676
+    },
+    {
+      "epoch": 0.88190893596299,
+      "grad_norm": 0.96875,
+      "learning_rate": 7.222455202782485e-05,
+      "loss": 0.6874,
+      "step": 12677
+    },
+    {
+      "epoch": 0.8819785036001252,
+      "grad_norm": 1.125,
+      "learning_rate": 7.214049505154207e-05,
+      "loss": 0.7348,
+      "step": 12678
+    },
+    {
+      "epoch": 0.8820480712372605,
+      "grad_norm": 0.91796875,
+      "learning_rate": 7.205648518728824e-05,
+      "loss": 0.6891,
+      "step": 12679
+    },
+    {
+      "epoch": 0.8821176388743956,
+      "grad_norm": 1.125,
+      "learning_rate": 7.197252243932906e-05,
+      "loss": 0.6281,
+      "step": 12680
+    },
+    {
+      "epoch": 0.8821872065115308,
+      "grad_norm": 1.2265625,
+      "learning_rate": 7.188860681192766e-05,
+      "loss": 0.8182,
+      "step": 12681
+    },
+    {
+      "epoch": 0.882256774148666,
+      "grad_norm": 1.34375,
+      "learning_rate": 7.180473830934453e-05,
+      "loss": 0.6742,
+      "step": 12682
+    },
+    {
+      "epoch": 0.8823263417858013,
+      "grad_norm": 1.1484375,
+      "learning_rate": 7.172091693583826e-05,
+      "loss": 0.7446,
+      "step": 12683
+    },
+    {
+      "epoch": 0.8823959094229364,
+      "grad_norm": 1.21875,
+      "learning_rate": 7.163714269566524e-05,
+      "loss": 0.6961,
+      "step": 12684
+    },
+    {
+      "epoch": 0.8824654770600716,
+      "grad_norm": 1.40625,
+      "learning_rate": 7.15534155930786e-05,
+      "loss": 0.7753,
+      "step": 12685
+    },
+    {
+      "epoch": 0.8825350446972069,
+      "grad_norm": 1.296875,
+      "learning_rate": 7.146973563233005e-05,
+      "loss": 0.8588,
+      "step": 12686
+    },
+    {
+      "epoch": 0.8826046123343421,
+      "grad_norm": 1.4375,
+      "learning_rate": 7.138610281766811e-05,
+      "loss": 0.6299,
+      "step": 12687
+    },
+    {
+      "epoch": 0.8826741799714772,
+      "grad_norm": 1.15625,
+      "learning_rate": 7.130251715333913e-05,
+      "loss": 0.6606,
+      "step": 12688
+    },
+    {
+      "epoch": 0.8827437476086125,
+      "grad_norm": 1.0859375,
+      "learning_rate": 7.12189786435874e-05,
+      "loss": 0.7995,
+      "step": 12689
+    },
+    {
+      "epoch": 0.8828133152457477,
+      "grad_norm": 1.078125,
+      "learning_rate": 7.113548729265462e-05,
+      "loss": 0.7172,
+      "step": 12690
+    },
+    {
+      "epoch": 0.8828828828828829,
+      "grad_norm": 0.9453125,
+      "learning_rate": 7.105204310478009e-05,
+      "loss": 0.7823,
+      "step": 12691
+    },
+    {
+      "epoch": 0.8829524505200181,
+      "grad_norm": 0.921875,
+      "learning_rate": 7.096864608420029e-05,
+      "loss": 0.6294,
+      "step": 12692
+    },
+    {
+      "epoch": 0.8830220181571533,
+      "grad_norm": 1.1015625,
+      "learning_rate": 7.088529623514995e-05,
+      "loss": 0.6797,
+      "step": 12693
+    },
+    {
+      "epoch": 0.8830915857942885,
+      "grad_norm": 1.3359375,
+      "learning_rate": 7.080199356186146e-05,
+      "loss": 0.814,
+      "step": 12694
+    },
+    {
+      "epoch": 0.8831611534314237,
+      "grad_norm": 1.2265625,
+      "learning_rate": 7.071873806856422e-05,
+      "loss": 0.8889,
+      "step": 12695
+    },
+    {
+      "epoch": 0.8832307210685589,
+      "grad_norm": 1.3515625,
+      "learning_rate": 7.063552975948528e-05,
+      "loss": 0.9012,
+      "step": 12696
+    },
+    {
+      "epoch": 0.8833002887056941,
+      "grad_norm": 1.4453125,
+      "learning_rate": 7.055236863884984e-05,
+      "loss": 1.0218,
+      "step": 12697
+    },
+    {
+      "epoch": 0.8833698563428293,
+      "grad_norm": 1.5546875,
+      "learning_rate": 7.04692547108805e-05,
+      "loss": 0.6138,
+      "step": 12698
+    },
+    {
+      "epoch": 0.8834394239799646,
+      "grad_norm": 1.3671875,
+      "learning_rate": 7.038618797979735e-05,
+      "loss": 0.8291,
+      "step": 12699
+    },
+    {
+      "epoch": 0.8835089916170997,
+      "grad_norm": 1.1484375,
+      "learning_rate": 7.030316844981766e-05,
+      "loss": 0.87,
+      "step": 12700
+    },
+    {
+      "epoch": 0.8835785592542349,
+      "grad_norm": 1.3359375,
+      "learning_rate": 7.022019612515728e-05,
+      "loss": 0.8387,
+      "step": 12701
+    },
+    {
+      "epoch": 0.8836481268913702,
+      "grad_norm": 1.046875,
+      "learning_rate": 7.013727101002876e-05,
+      "loss": 0.5919,
+      "step": 12702
+    },
+    {
+      "epoch": 0.8837176945285053,
+      "grad_norm": 1.203125,
+      "learning_rate": 7.00543931086427e-05,
+      "loss": 0.8006,
+      "step": 12703
+    },
+    {
+      "epoch": 0.8837872621656405,
+      "grad_norm": 1.3046875,
+      "learning_rate": 6.997156242520752e-05,
+      "loss": 1.0003,
+      "step": 12704
+    },
+    {
+      "epoch": 0.8838568298027758,
+      "grad_norm": 1.109375,
+      "learning_rate": 6.988877896392864e-05,
+      "loss": 0.7739,
+      "step": 12705
+    },
+    {
+      "epoch": 0.883926397439911,
+      "grad_norm": 1.1171875,
+      "learning_rate": 6.980604272900937e-05,
+      "loss": 0.9019,
+      "step": 12706
+    },
+    {
+      "epoch": 0.8839959650770461,
+      "grad_norm": 0.91015625,
+      "learning_rate": 6.972335372465067e-05,
+      "loss": 0.7214,
+      "step": 12707
+    },
+    {
+      "epoch": 0.8840655327141813,
+      "grad_norm": 1.15625,
+      "learning_rate": 6.964071195505129e-05,
+      "loss": 0.8192,
+      "step": 12708
+    },
+    {
+      "epoch": 0.8841351003513166,
+      "grad_norm": 0.9375,
+      "learning_rate": 6.955811742440721e-05,
+      "loss": 0.5364,
+      "step": 12709
+    },
+    {
+      "epoch": 0.8842046679884518,
+      "grad_norm": 0.91015625,
+      "learning_rate": 6.947557013691197e-05,
+      "loss": 0.6884,
+      "step": 12710
+    },
+    {
+      "epoch": 0.8842742356255869,
+      "grad_norm": 1.3515625,
+      "learning_rate": 6.939307009675711e-05,
+      "loss": 1.0167,
+      "step": 12711
+    },
+    {
+      "epoch": 0.8843438032627222,
+      "grad_norm": 1.0625,
+      "learning_rate": 6.931061730813171e-05,
+      "loss": 0.9485,
+      "step": 12712
+    },
+    {
+      "epoch": 0.8844133708998574,
+      "grad_norm": 1.0703125,
+      "learning_rate": 6.92282117752221e-05,
+      "loss": 0.7769,
+      "step": 12713
+    },
+    {
+      "epoch": 0.8844829385369926,
+      "grad_norm": 1.0,
+      "learning_rate": 6.914585350221236e-05,
+      "loss": 0.938,
+      "step": 12714
+    },
+    {
+      "epoch": 0.8845525061741278,
+      "grad_norm": 1.046875,
+      "learning_rate": 6.906354249328428e-05,
+      "loss": 0.7045,
+      "step": 12715
+    },
+    {
+      "epoch": 0.884622073811263,
+      "grad_norm": 1.171875,
+      "learning_rate": 6.89812787526175e-05,
+      "loss": 0.8892,
+      "step": 12716
+    },
+    {
+      "epoch": 0.8846916414483982,
+      "grad_norm": 1.0859375,
+      "learning_rate": 6.889906228438847e-05,
+      "loss": 1.0516,
+      "step": 12717
+    },
+    {
+      "epoch": 0.8847612090855335,
+      "grad_norm": 0.859375,
+      "learning_rate": 6.881689309277206e-05,
+      "loss": 0.5978,
+      "step": 12718
+    },
+    {
+      "epoch": 0.8848307767226686,
+      "grad_norm": 0.984375,
+      "learning_rate": 6.873477118194038e-05,
+      "loss": 0.8943,
+      "step": 12719
+    },
+    {
+      "epoch": 0.8849003443598038,
+      "grad_norm": 1.1328125,
+      "learning_rate": 6.865269655606288e-05,
+      "loss": 0.8368,
+      "step": 12720
+    },
+    {
+      "epoch": 0.884969911996939,
+      "grad_norm": 1.2265625,
+      "learning_rate": 6.857066921930721e-05,
+      "loss": 0.7237,
+      "step": 12721
+    },
+    {
+      "epoch": 0.8850394796340743,
+      "grad_norm": 1.046875,
+      "learning_rate": 6.848868917583828e-05,
+      "loss": 0.8268,
+      "step": 12722
+    },
+    {
+      "epoch": 0.8851090472712094,
+      "grad_norm": 1.1484375,
+      "learning_rate": 6.840675642981864e-05,
+      "loss": 0.9917,
+      "step": 12723
+    },
+    {
+      "epoch": 0.8851786149083446,
+      "grad_norm": 1.1484375,
+      "learning_rate": 6.832487098540807e-05,
+      "loss": 0.7869,
+      "step": 12724
+    },
+    {
+      "epoch": 0.8852481825454799,
+      "grad_norm": 1.09375,
+      "learning_rate": 6.824303284676459e-05,
+      "loss": 0.7963,
+      "step": 12725
+    },
+    {
+      "epoch": 0.885317750182615,
+      "grad_norm": 1.0625,
+      "learning_rate": 6.816124201804364e-05,
+      "loss": 0.5991,
+      "step": 12726
+    },
+    {
+      "epoch": 0.8853873178197502,
+      "grad_norm": 1.3671875,
+      "learning_rate": 6.807949850339801e-05,
+      "loss": 0.8378,
+      "step": 12727
+    },
+    {
+      "epoch": 0.8854568854568855,
+      "grad_norm": 1.1015625,
+      "learning_rate": 6.799780230697816e-05,
+      "loss": 0.6915,
+      "step": 12728
+    },
+    {
+      "epoch": 0.8855264530940207,
+      "grad_norm": 1.0078125,
+      "learning_rate": 6.791615343293211e-05,
+      "loss": 0.8808,
+      "step": 12729
+    },
+    {
+      "epoch": 0.8855960207311558,
+      "grad_norm": 1.0078125,
+      "learning_rate": 6.783455188540599e-05,
+      "loss": 0.7991,
+      "step": 12730
+    },
+    {
+      "epoch": 0.8856655883682911,
+      "grad_norm": 1.0,
+      "learning_rate": 6.775299766854271e-05,
+      "loss": 0.8685,
+      "step": 12731
+    },
+    {
+      "epoch": 0.8857351560054263,
+      "grad_norm": 1.1796875,
+      "learning_rate": 6.767149078648348e-05,
+      "loss": 1.0224,
+      "step": 12732
+    },
+    {
+      "epoch": 0.8858047236425615,
+      "grad_norm": 1.0859375,
+      "learning_rate": 6.759003124336671e-05,
+      "loss": 0.695,
+      "step": 12733
+    },
+    {
+      "epoch": 0.8858742912796966,
+      "grad_norm": 1.5,
+      "learning_rate": 6.750861904332817e-05,
+      "loss": 1.0809,
+      "step": 12734
+    },
+    {
+      "epoch": 0.8859438589168319,
+      "grad_norm": 1.1640625,
+      "learning_rate": 6.7427254190502e-05,
+      "loss": 0.6709,
+      "step": 12735
+    },
+    {
+      "epoch": 0.8860134265539671,
+      "grad_norm": 1.09375,
+      "learning_rate": 6.734593668901945e-05,
+      "loss": 0.7581,
+      "step": 12736
+    },
+    {
+      "epoch": 0.8860829941911023,
+      "grad_norm": 1.1875,
+      "learning_rate": 6.726466654300922e-05,
+      "loss": 0.7828,
+      "step": 12737
+    },
+    {
+      "epoch": 0.8861525618282375,
+      "grad_norm": 1.1171875,
+      "learning_rate": 6.718344375659779e-05,
+      "loss": 0.7232,
+      "step": 12738
+    },
+    {
+      "epoch": 0.8862221294653727,
+      "grad_norm": 1.0390625,
+      "learning_rate": 6.710226833390942e-05,
+      "loss": 0.7902,
+      "step": 12739
+    },
+    {
+      "epoch": 0.8862916971025079,
+      "grad_norm": 1.0703125,
+      "learning_rate": 6.702114027906581e-05,
+      "loss": 0.8745,
+      "step": 12740
+    },
+    {
+      "epoch": 0.8863612647396432,
+      "grad_norm": 1.1640625,
+      "learning_rate": 6.694005959618609e-05,
+      "loss": 0.8363,
+      "step": 12741
+    },
+    {
+      "epoch": 0.8864308323767783,
+      "grad_norm": 1.4765625,
+      "learning_rate": 6.685902628938711e-05,
+      "loss": 0.8588,
+      "step": 12742
+    },
+    {
+      "epoch": 0.8865004000139135,
+      "grad_norm": 1.2421875,
+      "learning_rate": 6.677804036278334e-05,
+      "loss": 1.0298,
+      "step": 12743
+    },
+    {
+      "epoch": 0.8865699676510488,
+      "grad_norm": 1.2265625,
+      "learning_rate": 6.669710182048705e-05,
+      "loss": 0.7713,
+      "step": 12744
+    },
+    {
+      "epoch": 0.886639535288184,
+      "grad_norm": 0.8046875,
+      "learning_rate": 6.66162106666075e-05,
+      "loss": 0.6444,
+      "step": 12745
+    },
+    {
+      "epoch": 0.8867091029253191,
+      "grad_norm": 1.2265625,
+      "learning_rate": 6.653536690525241e-05,
+      "loss": 0.8326,
+      "step": 12746
+    },
+    {
+      "epoch": 0.8867786705624543,
+      "grad_norm": 0.984375,
+      "learning_rate": 6.645457054052639e-05,
+      "loss": 0.7872,
+      "step": 12747
+    },
+    {
+      "epoch": 0.8868482381995896,
+      "grad_norm": 0.89453125,
+      "learning_rate": 6.637382157653171e-05,
+      "loss": 0.7572,
+      "step": 12748
+    },
+    {
+      "epoch": 0.8869178058367247,
+      "grad_norm": 0.94140625,
+      "learning_rate": 6.629312001736853e-05,
+      "loss": 0.5386,
+      "step": 12749
+    },
+    {
+      "epoch": 0.8869873734738599,
+      "grad_norm": 1.0546875,
+      "learning_rate": 6.62124658671347e-05,
+      "loss": 0.9461,
+      "step": 12750
+    },
+    {
+      "epoch": 0.8870569411109952,
+      "grad_norm": 1.03125,
+      "learning_rate": 6.613185912992514e-05,
+      "loss": 0.843,
+      "step": 12751
+    },
+    {
+      "epoch": 0.8871265087481304,
+      "grad_norm": 1.0390625,
+      "learning_rate": 6.605129980983249e-05,
+      "loss": 0.9026,
+      "step": 12752
+    },
+    {
+      "epoch": 0.8871960763852655,
+      "grad_norm": 1.109375,
+      "learning_rate": 6.597078791094757e-05,
+      "loss": 0.8575,
+      "step": 12753
+    },
+    {
+      "epoch": 0.8872656440224008,
+      "grad_norm": 1.515625,
+      "learning_rate": 6.589032343735823e-05,
+      "loss": 0.8761,
+      "step": 12754
+    },
+    {
+      "epoch": 0.887335211659536,
+      "grad_norm": 1.21875,
+      "learning_rate": 6.580990639314998e-05,
+      "loss": 1.0263,
+      "step": 12755
+    },
+    {
+      "epoch": 0.8874047792966712,
+      "grad_norm": 1.046875,
+      "learning_rate": 6.57295367824059e-05,
+      "loss": 0.653,
+      "step": 12756
+    },
+    {
+      "epoch": 0.8874743469338064,
+      "grad_norm": 1.1328125,
+      "learning_rate": 6.564921460920692e-05,
+      "loss": 0.8159,
+      "step": 12757
+    },
+    {
+      "epoch": 0.8875439145709416,
+      "grad_norm": 1.15625,
+      "learning_rate": 6.556893987763146e-05,
+      "loss": 0.8262,
+      "step": 12758
+    },
+    {
+      "epoch": 0.8876134822080768,
+      "grad_norm": 1.0078125,
+      "learning_rate": 6.548871259175516e-05,
+      "loss": 0.7602,
+      "step": 12759
+    },
+    {
+      "epoch": 0.887683049845212,
+      "grad_norm": 1.21875,
+      "learning_rate": 6.540853275565195e-05,
+      "loss": 0.876,
+      "step": 12760
+    },
+    {
+      "epoch": 0.8877526174823472,
+      "grad_norm": 1.140625,
+      "learning_rate": 6.532840037339261e-05,
+      "loss": 0.8405,
+      "step": 12761
+    },
+    {
+      "epoch": 0.8878221851194824,
+      "grad_norm": 1.1171875,
+      "learning_rate": 6.524831544904609e-05,
+      "loss": 0.9209,
+      "step": 12762
+    },
+    {
+      "epoch": 0.8878917527566176,
+      "grad_norm": 1.1171875,
+      "learning_rate": 6.516827798667857e-05,
+      "loss": 0.6923,
+      "step": 12763
+    },
+    {
+      "epoch": 0.8879613203937529,
+      "grad_norm": 1.4296875,
+      "learning_rate": 6.508828799035404e-05,
+      "loss": 0.8913,
+      "step": 12764
+    },
+    {
+      "epoch": 0.888030888030888,
+      "grad_norm": 1.328125,
+      "learning_rate": 6.500834546413404e-05,
+      "loss": 0.9831,
+      "step": 12765
+    },
+    {
+      "epoch": 0.8881004556680232,
+      "grad_norm": 1.1328125,
+      "learning_rate": 6.492845041207707e-05,
+      "loss": 0.8282,
+      "step": 12766
+    },
+    {
+      "epoch": 0.8881700233051585,
+      "grad_norm": 1.65625,
+      "learning_rate": 6.484860283824079e-05,
+      "loss": 0.9055,
+      "step": 12767
+    },
+    {
+      "epoch": 0.8882395909422937,
+      "grad_norm": 1.3515625,
+      "learning_rate": 6.476880274667885e-05,
+      "loss": 0.8114,
+      "step": 12768
+    },
+    {
+      "epoch": 0.8883091585794288,
+      "grad_norm": 0.796875,
+      "learning_rate": 6.468905014144322e-05,
+      "loss": 0.6349,
+      "step": 12769
+    },
+    {
+      "epoch": 0.8883787262165641,
+      "grad_norm": 1.328125,
+      "learning_rate": 6.460934502658311e-05,
+      "loss": 1.0425,
+      "step": 12770
+    },
+    {
+      "epoch": 0.8884482938536993,
+      "grad_norm": 0.828125,
+      "learning_rate": 6.452968740614574e-05,
+      "loss": 0.7217,
+      "step": 12771
+    },
+    {
+      "epoch": 0.8885178614908344,
+      "grad_norm": 1.0234375,
+      "learning_rate": 6.445007728417596e-05,
+      "loss": 0.9206,
+      "step": 12772
+    },
+    {
+      "epoch": 0.8885874291279696,
+      "grad_norm": 1.15625,
+      "learning_rate": 6.437051466471567e-05,
+      "loss": 0.8422,
+      "step": 12773
+    },
+    {
+      "epoch": 0.8886569967651049,
+      "grad_norm": 1.15625,
+      "learning_rate": 6.429099955180451e-05,
+      "loss": 0.9084,
+      "step": 12774
+    },
+    {
+      "epoch": 0.8887265644022401,
+      "grad_norm": 1.203125,
+      "learning_rate": 6.421153194948015e-05,
+      "loss": 0.86,
+      "step": 12775
+    },
+    {
+      "epoch": 0.8887961320393752,
+      "grad_norm": 0.89453125,
+      "learning_rate": 6.413211186177759e-05,
+      "loss": 0.8385,
+      "step": 12776
+    },
+    {
+      "epoch": 0.8888656996765105,
+      "grad_norm": 1.0078125,
+      "learning_rate": 6.405273929272914e-05,
+      "loss": 0.8658,
+      "step": 12777
+    },
+    {
+      "epoch": 0.8889352673136457,
+      "grad_norm": 1.25,
+      "learning_rate": 6.397341424636527e-05,
+      "loss": 0.789,
+      "step": 12778
+    },
+    {
+      "epoch": 0.8890048349507809,
+      "grad_norm": 1.453125,
+      "learning_rate": 6.38941367267134e-05,
+      "loss": 0.9334,
+      "step": 12779
+    },
+    {
+      "epoch": 0.8890744025879161,
+      "grad_norm": 1.2734375,
+      "learning_rate": 6.381490673779888e-05,
+      "loss": 1.0498,
+      "step": 12780
+    },
+    {
+      "epoch": 0.8891439702250513,
+      "grad_norm": 1.0703125,
+      "learning_rate": 6.37357242836446e-05,
+      "loss": 0.787,
+      "step": 12781
+    },
+    {
+      "epoch": 0.8892135378621865,
+      "grad_norm": 1.2734375,
+      "learning_rate": 6.365658936827135e-05,
+      "loss": 0.8056,
+      "step": 12782
+    },
+    {
+      "epoch": 0.8892831054993218,
+      "grad_norm": 1.0546875,
+      "learning_rate": 6.35775019956969e-05,
+      "loss": 0.7835,
+      "step": 12783
+    },
+    {
+      "epoch": 0.8893526731364569,
+      "grad_norm": 1.2890625,
+      "learning_rate": 6.349846216993682e-05,
+      "loss": 0.6149,
+      "step": 12784
+    },
+    {
+      "epoch": 0.8894222407735921,
+      "grad_norm": 1.09375,
+      "learning_rate": 6.341946989500458e-05,
+      "loss": 0.8973,
+      "step": 12785
+    },
+    {
+      "epoch": 0.8894918084107273,
+      "grad_norm": 1.1328125,
+      "learning_rate": 6.334052517491107e-05,
+      "loss": 0.7343,
+      "step": 12786
+    },
+    {
+      "epoch": 0.8895613760478626,
+      "grad_norm": 1.265625,
+      "learning_rate": 6.326162801366453e-05,
+      "loss": 0.9022,
+      "step": 12787
+    },
+    {
+      "epoch": 0.8896309436849977,
+      "grad_norm": 1.1484375,
+      "learning_rate": 6.318277841527087e-05,
+      "loss": 0.9184,
+      "step": 12788
+    },
+    {
+      "epoch": 0.8897005113221329,
+      "grad_norm": 1.140625,
+      "learning_rate": 6.310397638373388e-05,
+      "loss": 0.9703,
+      "step": 12789
+    },
+    {
+      "epoch": 0.8897700789592682,
+      "grad_norm": 1.1328125,
+      "learning_rate": 6.302522192305471e-05,
+      "loss": 0.8512,
+      "step": 12790
+    },
+    {
+      "epoch": 0.8898396465964034,
+      "grad_norm": 1.2578125,
+      "learning_rate": 6.294651503723204e-05,
+      "loss": 0.7479,
+      "step": 12791
+    },
+    {
+      "epoch": 0.8899092142335385,
+      "grad_norm": 1.0390625,
+      "learning_rate": 6.286785573026232e-05,
+      "loss": 0.6878,
+      "step": 12792
+    },
+    {
+      "epoch": 0.8899787818706738,
+      "grad_norm": 0.9140625,
+      "learning_rate": 6.278924400613928e-05,
+      "loss": 0.4392,
+      "step": 12793
+    },
+    {
+      "epoch": 0.890048349507809,
+      "grad_norm": 1.2890625,
+      "learning_rate": 6.271067986885459e-05,
+      "loss": 0.8874,
+      "step": 12794
+    },
+    {
+      "epoch": 0.8901179171449441,
+      "grad_norm": 1.328125,
+      "learning_rate": 6.263216332239718e-05,
+      "loss": 0.9909,
+      "step": 12795
+    },
+    {
+      "epoch": 0.8901874847820794,
+      "grad_norm": 1.046875,
+      "learning_rate": 6.255369437075409e-05,
+      "loss": 0.7922,
+      "step": 12796
+    },
+    {
+      "epoch": 0.8902570524192146,
+      "grad_norm": 1.0234375,
+      "learning_rate": 6.247527301790922e-05,
+      "loss": 0.8277,
+      "step": 12797
+    },
+    {
+      "epoch": 0.8903266200563498,
+      "grad_norm": 1.171875,
+      "learning_rate": 6.23968992678443e-05,
+      "loss": 0.8435,
+      "step": 12798
+    },
+    {
+      "epoch": 0.8903961876934849,
+      "grad_norm": 1.078125,
+      "learning_rate": 6.231857312453903e-05,
+      "loss": 0.6398,
+      "step": 12799
+    },
+    {
+      "epoch": 0.8904657553306202,
+      "grad_norm": 0.88671875,
+      "learning_rate": 6.224029459197056e-05,
+      "loss": 0.6042,
+      "step": 12800
+    },
+    {
+      "epoch": 0.8905353229677554,
+      "grad_norm": 1.046875,
+      "learning_rate": 6.216206367411326e-05,
+      "loss": 0.7371,
+      "step": 12801
+    },
+    {
+      "epoch": 0.8906048906048906,
+      "grad_norm": 1.0859375,
+      "learning_rate": 6.208388037493906e-05,
+      "loss": 0.7232,
+      "step": 12802
+    },
+    {
+      "epoch": 0.8906744582420258,
+      "grad_norm": 1.0078125,
+      "learning_rate": 6.200574469841813e-05,
+      "loss": 0.8836,
+      "step": 12803
+    },
+    {
+      "epoch": 0.890744025879161,
+      "grad_norm": 1.21875,
+      "learning_rate": 6.192765664851763e-05,
+      "loss": 1.2466,
+      "step": 12804
+    },
+    {
+      "epoch": 0.8908135935162962,
+      "grad_norm": 0.91015625,
+      "learning_rate": 6.184961622920237e-05,
+      "loss": 0.766,
+      "step": 12805
+    },
+    {
+      "epoch": 0.8908831611534315,
+      "grad_norm": 1.390625,
+      "learning_rate": 6.177162344443521e-05,
+      "loss": 0.7032,
+      "step": 12806
+    },
+    {
+      "epoch": 0.8909527287905666,
+      "grad_norm": 1.3125,
+      "learning_rate": 6.169367829817573e-05,
+      "loss": 0.6213,
+      "step": 12807
+    },
+    {
+      "epoch": 0.8910222964277018,
+      "grad_norm": 1.1640625,
+      "learning_rate": 6.161578079438212e-05,
+      "loss": 0.815,
+      "step": 12808
+    },
+    {
+      "epoch": 0.8910918640648371,
+      "grad_norm": 0.87109375,
+      "learning_rate": 6.15379309370091e-05,
+      "loss": 0.6714,
+      "step": 12809
+    },
+    {
+      "epoch": 0.8911614317019723,
+      "grad_norm": 1.09375,
+      "learning_rate": 6.146012873000994e-05,
+      "loss": 0.9554,
+      "step": 12810
+    },
+    {
+      "epoch": 0.8912309993391074,
+      "grad_norm": 1.421875,
+      "learning_rate": 6.138237417733494e-05,
+      "loss": 0.9702,
+      "step": 12811
+    },
+    {
+      "epoch": 0.8913005669762426,
+      "grad_norm": 1.3046875,
+      "learning_rate": 6.130466728293161e-05,
+      "loss": 1.0276,
+      "step": 12812
+    },
+    {
+      "epoch": 0.8913701346133779,
+      "grad_norm": 1.2265625,
+      "learning_rate": 6.122700805074622e-05,
+      "loss": 0.9332,
+      "step": 12813
+    },
+    {
+      "epoch": 0.891439702250513,
+      "grad_norm": 1.21875,
+      "learning_rate": 6.114939648472151e-05,
+      "loss": 0.8333,
+      "step": 12814
+    },
+    {
+      "epoch": 0.8915092698876482,
+      "grad_norm": 1.1640625,
+      "learning_rate": 6.107183258879833e-05,
+      "loss": 0.9553,
+      "step": 12815
+    },
+    {
+      "epoch": 0.8915788375247835,
+      "grad_norm": 1.4296875,
+      "learning_rate": 6.099431636691488e-05,
+      "loss": 0.9614,
+      "step": 12816
+    },
+    {
+      "epoch": 0.8916484051619187,
+      "grad_norm": 1.109375,
+      "learning_rate": 6.0916847823006994e-05,
+      "loss": 0.6986,
+      "step": 12817
+    },
+    {
+      "epoch": 0.8917179727990538,
+      "grad_norm": 1.25,
+      "learning_rate": 6.083942696100842e-05,
+      "loss": 0.8352,
+      "step": 12818
+    },
+    {
+      "epoch": 0.8917875404361891,
+      "grad_norm": 1.46875,
+      "learning_rate": 6.076205378484989e-05,
+      "loss": 1.0811,
+      "step": 12819
+    },
+    {
+      "epoch": 0.8918571080733243,
+      "grad_norm": 1.21875,
+      "learning_rate": 6.068472829846039e-05,
+      "loss": 0.7868,
+      "step": 12820
+    },
+    {
+      "epoch": 0.8919266757104595,
+      "grad_norm": 0.9609375,
+      "learning_rate": 6.060745050576566e-05,
+      "loss": 0.7291,
+      "step": 12821
+    },
+    {
+      "epoch": 0.8919962433475948,
+      "grad_norm": 1.1015625,
+      "learning_rate": 6.0530220410689786e-05,
+      "loss": 0.9385,
+      "step": 12822
+    },
+    {
+      "epoch": 0.8920658109847299,
+      "grad_norm": 0.89453125,
+      "learning_rate": 6.045303801715396e-05,
+      "loss": 0.6088,
+      "step": 12823
+    },
+    {
+      "epoch": 0.8921353786218651,
+      "grad_norm": 0.9609375,
+      "learning_rate": 6.037590332907739e-05,
+      "loss": 0.679,
+      "step": 12824
+    },
+    {
+      "epoch": 0.8922049462590003,
+      "grad_norm": 0.9296875,
+      "learning_rate": 6.029881635037615e-05,
+      "loss": 0.628,
+      "step": 12825
+    },
+    {
+      "epoch": 0.8922745138961355,
+      "grad_norm": 0.96484375,
+      "learning_rate": 6.022177708496468e-05,
+      "loss": 0.7714,
+      "step": 12826
+    },
+    {
+      "epoch": 0.8923440815332707,
+      "grad_norm": 1.125,
+      "learning_rate": 6.014478553675462e-05,
+      "loss": 0.8259,
+      "step": 12827
+    },
+    {
+      "epoch": 0.8924136491704059,
+      "grad_norm": 0.84765625,
+      "learning_rate": 6.006784170965518e-05,
+      "loss": 0.7613,
+      "step": 12828
+    },
+    {
+      "epoch": 0.8924832168075412,
+      "grad_norm": 1.0078125,
+      "learning_rate": 5.999094560757301e-05,
+      "loss": 0.5784,
+      "step": 12829
+    },
+    {
+      "epoch": 0.8925527844446763,
+      "grad_norm": 1.3203125,
+      "learning_rate": 5.991409723441255e-05,
+      "loss": 0.6955,
+      "step": 12830
+    },
+    {
+      "epoch": 0.8926223520818115,
+      "grad_norm": 0.98828125,
+      "learning_rate": 5.983729659407589e-05,
+      "loss": 0.9643,
+      "step": 12831
+    },
+    {
+      "epoch": 0.8926919197189468,
+      "grad_norm": 1.1640625,
+      "learning_rate": 5.976054369046269e-05,
+      "loss": 0.8414,
+      "step": 12832
+    },
+    {
+      "epoch": 0.892761487356082,
+      "grad_norm": 1.25,
+      "learning_rate": 5.968383852746973e-05,
+      "loss": 0.825,
+      "step": 12833
+    },
+    {
+      "epoch": 0.8928310549932171,
+      "grad_norm": 1.015625,
+      "learning_rate": 5.9607181108991994e-05,
+      "loss": 0.9173,
+      "step": 12834
+    },
+    {
+      "epoch": 0.8929006226303524,
+      "grad_norm": 1.0859375,
+      "learning_rate": 5.95305714389216e-05,
+      "loss": 0.6716,
+      "step": 12835
+    },
+    {
+      "epoch": 0.8929701902674876,
+      "grad_norm": 1.015625,
+      "learning_rate": 5.945400952114866e-05,
+      "loss": 0.7928,
+      "step": 12836
+    },
+    {
+      "epoch": 0.8930397579046228,
+      "grad_norm": 1.265625,
+      "learning_rate": 5.9377495359560165e-05,
+      "loss": 0.9662,
+      "step": 12837
+    },
+    {
+      "epoch": 0.8931093255417579,
+      "grad_norm": 1.375,
+      "learning_rate": 5.930102895804157e-05,
+      "loss": 0.9403,
+      "step": 12838
+    },
+    {
+      "epoch": 0.8931788931788932,
+      "grad_norm": 1.4375,
+      "learning_rate": 5.92246103204751e-05,
+      "loss": 0.7703,
+      "step": 12839
+    },
+    {
+      "epoch": 0.8932484608160284,
+      "grad_norm": 1.25,
+      "learning_rate": 5.914823945074099e-05,
+      "loss": 0.6388,
+      "step": 12840
+    },
+    {
+      "epoch": 0.8933180284531635,
+      "grad_norm": 1.015625,
+      "learning_rate": 5.907191635271725e-05,
+      "loss": 0.8618,
+      "step": 12841
+    },
+    {
+      "epoch": 0.8933875960902988,
+      "grad_norm": 0.91796875,
+      "learning_rate": 5.899564103027899e-05,
+      "loss": 0.6604,
+      "step": 12842
+    },
+    {
+      "epoch": 0.893457163727434,
+      "grad_norm": 1.046875,
+      "learning_rate": 5.891941348729901e-05,
+      "loss": 0.7512,
+      "step": 12843
+    },
+    {
+      "epoch": 0.8935267313645692,
+      "grad_norm": 1.09375,
+      "learning_rate": 5.884323372764755e-05,
+      "loss": 0.853,
+      "step": 12844
+    },
+    {
+      "epoch": 0.8935962990017045,
+      "grad_norm": 1.078125,
+      "learning_rate": 5.8767101755193174e-05,
+      "loss": 0.6933,
+      "step": 12845
+    },
+    {
+      "epoch": 0.8936658666388396,
+      "grad_norm": 0.86328125,
+      "learning_rate": 5.8691017573801244e-05,
+      "loss": 0.6461,
+      "step": 12846
+    },
+    {
+      "epoch": 0.8937354342759748,
+      "grad_norm": 1.078125,
+      "learning_rate": 5.8614981187334884e-05,
+      "loss": 0.8107,
+      "step": 12847
+    },
+    {
+      "epoch": 0.8938050019131101,
+      "grad_norm": 1.125,
+      "learning_rate": 5.853899259965467e-05,
+      "loss": 0.7591,
+      "step": 12848
+    },
+    {
+      "epoch": 0.8938745695502452,
+      "grad_norm": 0.87109375,
+      "learning_rate": 5.846305181461908e-05,
+      "loss": 0.64,
+      "step": 12849
+    },
+    {
+      "epoch": 0.8939441371873804,
+      "grad_norm": 1.0,
+      "learning_rate": 5.8387158836084254e-05,
+      "loss": 0.5869,
+      "step": 12850
+    },
+    {
+      "epoch": 0.8940137048245156,
+      "grad_norm": 0.94921875,
+      "learning_rate": 5.8311313667903206e-05,
+      "loss": 0.6576,
+      "step": 12851
+    },
+    {
+      "epoch": 0.8940832724616509,
+      "grad_norm": 1.015625,
+      "learning_rate": 5.8235516313927316e-05,
+      "loss": 0.7791,
+      "step": 12852
+    },
+    {
+      "epoch": 0.894152840098786,
+      "grad_norm": 1.2734375,
+      "learning_rate": 5.815976677800505e-05,
+      "loss": 0.9427,
+      "step": 12853
+    },
+    {
+      "epoch": 0.8942224077359212,
+      "grad_norm": 1.375,
+      "learning_rate": 5.808406506398256e-05,
+      "loss": 0.7885,
+      "step": 12854
+    },
+    {
+      "epoch": 0.8942919753730565,
+      "grad_norm": 1.1640625,
+      "learning_rate": 5.800841117570366e-05,
+      "loss": 0.7102,
+      "step": 12855
+    },
+    {
+      "epoch": 0.8943615430101917,
+      "grad_norm": 1.1328125,
+      "learning_rate": 5.793280511700971e-05,
+      "loss": 0.7416,
+      "step": 12856
+    },
+    {
+      "epoch": 0.8944311106473268,
+      "grad_norm": 1.1171875,
+      "learning_rate": 5.7857246891739324e-05,
+      "loss": 0.8422,
+      "step": 12857
+    },
+    {
+      "epoch": 0.8945006782844621,
+      "grad_norm": 1.21875,
+      "learning_rate": 5.778173650372931e-05,
+      "loss": 0.5853,
+      "step": 12858
+    },
+    {
+      "epoch": 0.8945702459215973,
+      "grad_norm": 1.9140625,
+      "learning_rate": 5.7706273956813716e-05,
+      "loss": 1.1291,
+      "step": 12859
+    },
+    {
+      "epoch": 0.8946398135587325,
+      "grad_norm": 1.03125,
+      "learning_rate": 5.763085925482403e-05,
+      "loss": 0.9255,
+      "step": 12860
+    },
+    {
+      "epoch": 0.8947093811958677,
+      "grad_norm": 0.890625,
+      "learning_rate": 5.7555492401589304e-05,
+      "loss": 0.7343,
+      "step": 12861
+    },
+    {
+      "epoch": 0.8947789488330029,
+      "grad_norm": 0.9765625,
+      "learning_rate": 5.748017340093636e-05,
+      "loss": 0.8176,
+      "step": 12862
+    },
+    {
+      "epoch": 0.8948485164701381,
+      "grad_norm": 1.1171875,
+      "learning_rate": 5.7404902256689596e-05,
+      "loss": 0.834,
+      "step": 12863
+    },
+    {
+      "epoch": 0.8949180841072732,
+      "grad_norm": 1.2265625,
+      "learning_rate": 5.732967897267094e-05,
+      "loss": 0.6891,
+      "step": 12864
+    },
+    {
+      "epoch": 0.8949876517444085,
+      "grad_norm": 1.09375,
+      "learning_rate": 5.725450355269957e-05,
+      "loss": 0.6077,
+      "step": 12865
+    },
+    {
+      "epoch": 0.8950572193815437,
+      "grad_norm": 1.2734375,
+      "learning_rate": 5.7179376000592975e-05,
+      "loss": 0.9541,
+      "step": 12866
+    },
+    {
+      "epoch": 0.8951267870186789,
+      "grad_norm": 1.0546875,
+      "learning_rate": 5.710429632016534e-05,
+      "loss": 0.7923,
+      "step": 12867
+    },
+    {
+      "epoch": 0.8951963546558142,
+      "grad_norm": 0.984375,
+      "learning_rate": 5.702926451522905e-05,
+      "loss": 0.4878,
+      "step": 12868
+    },
+    {
+      "epoch": 0.8952659222929493,
+      "grad_norm": 1.1796875,
+      "learning_rate": 5.695428058959373e-05,
+      "loss": 1.0191,
+      "step": 12869
+    },
+    {
+      "epoch": 0.8953354899300845,
+      "grad_norm": 1.0390625,
+      "learning_rate": 5.687934454706689e-05,
+      "loss": 0.7994,
+      "step": 12870
+    },
+    {
+      "epoch": 0.8954050575672198,
+      "grad_norm": 1.3984375,
+      "learning_rate": 5.680445639145304e-05,
+      "loss": 0.8081,
+      "step": 12871
+    },
+    {
+      "epoch": 0.895474625204355,
+      "grad_norm": 1.5859375,
+      "learning_rate": 5.67296161265548e-05,
+      "loss": 0.9523,
+      "step": 12872
+    },
+    {
+      "epoch": 0.8955441928414901,
+      "grad_norm": 1.3203125,
+      "learning_rate": 5.665482375617248e-05,
+      "loss": 0.9154,
+      "step": 12873
+    },
+    {
+      "epoch": 0.8956137604786254,
+      "grad_norm": 1.1796875,
+      "learning_rate": 5.658007928410336e-05,
+      "loss": 0.7976,
+      "step": 12874
+    },
+    {
+      "epoch": 0.8956833281157606,
+      "grad_norm": 1.390625,
+      "learning_rate": 5.6505382714142626e-05,
+      "loss": 0.9391,
+      "step": 12875
+    },
+    {
+      "epoch": 0.8957528957528957,
+      "grad_norm": 1.0234375,
+      "learning_rate": 5.64307340500827e-05,
+      "loss": 0.9901,
+      "step": 12876
+    },
+    {
+      "epoch": 0.8958224633900309,
+      "grad_norm": 1.0078125,
+      "learning_rate": 5.6356133295714426e-05,
+      "loss": 0.759,
+      "step": 12877
+    },
+    {
+      "epoch": 0.8958920310271662,
+      "grad_norm": 1.765625,
+      "learning_rate": 5.6281580454825344e-05,
+      "loss": 0.9905,
+      "step": 12878
+    },
+    {
+      "epoch": 0.8959615986643014,
+      "grad_norm": 1.0703125,
+      "learning_rate": 5.620707553120086e-05,
+      "loss": 0.7871,
+      "step": 12879
+    },
+    {
+      "epoch": 0.8960311663014365,
+      "grad_norm": 0.94921875,
+      "learning_rate": 5.6132618528624055e-05,
+      "loss": 0.6924,
+      "step": 12880
+    },
+    {
+      "epoch": 0.8961007339385718,
+      "grad_norm": 1.046875,
+      "learning_rate": 5.605820945087536e-05,
+      "loss": 0.8007,
+      "step": 12881
+    },
+    {
+      "epoch": 0.896170301575707,
+      "grad_norm": 0.95703125,
+      "learning_rate": 5.598384830173309e-05,
+      "loss": 0.7307,
+      "step": 12882
+    },
+    {
+      "epoch": 0.8962398692128422,
+      "grad_norm": 0.91796875,
+      "learning_rate": 5.590953508497276e-05,
+      "loss": 0.8733,
+      "step": 12883
+    },
+    {
+      "epoch": 0.8963094368499774,
+      "grad_norm": 1.09375,
+      "learning_rate": 5.583526980436771e-05,
+      "loss": 0.9888,
+      "step": 12884
+    },
+    {
+      "epoch": 0.8963790044871126,
+      "grad_norm": 1.015625,
+      "learning_rate": 5.576105246368857e-05,
+      "loss": 0.7823,
+      "step": 12885
+    },
+    {
+      "epoch": 0.8964485721242478,
+      "grad_norm": 1.2578125,
+      "learning_rate": 5.568688306670389e-05,
+      "loss": 0.8014,
+      "step": 12886
+    },
+    {
+      "epoch": 0.8965181397613831,
+      "grad_norm": 1.2265625,
+      "learning_rate": 5.5612761617179766e-05,
+      "loss": 0.7285,
+      "step": 12887
+    },
+    {
+      "epoch": 0.8965877073985182,
+      "grad_norm": 1.6171875,
+      "learning_rate": 5.553868811887952e-05,
+      "loss": 0.9137,
+      "step": 12888
+    },
+    {
+      "epoch": 0.8966572750356534,
+      "grad_norm": 0.93359375,
+      "learning_rate": 5.546466257556415e-05,
+      "loss": 0.7395,
+      "step": 12889
+    },
+    {
+      "epoch": 0.8967268426727886,
+      "grad_norm": 1.0546875,
+      "learning_rate": 5.539068499099231e-05,
+      "loss": 0.9236,
+      "step": 12890
+    },
+    {
+      "epoch": 0.8967964103099239,
+      "grad_norm": 1.0703125,
+      "learning_rate": 5.5316755368920554e-05,
+      "loss": 0.7547,
+      "step": 12891
+    },
+    {
+      "epoch": 0.896865977947059,
+      "grad_norm": 1.0078125,
+      "learning_rate": 5.5242873713102326e-05,
+      "loss": 0.8267,
+      "step": 12892
+    },
+    {
+      "epoch": 0.8969355455841942,
+      "grad_norm": 1.515625,
+      "learning_rate": 5.516904002728895e-05,
+      "loss": 0.8644,
+      "step": 12893
+    },
+    {
+      "epoch": 0.8970051132213295,
+      "grad_norm": 1.125,
+      "learning_rate": 5.509525431522955e-05,
+      "loss": 0.7402,
+      "step": 12894
+    },
+    {
+      "epoch": 0.8970746808584646,
+      "grad_norm": 1.0390625,
+      "learning_rate": 5.502151658067034e-05,
+      "loss": 0.8537,
+      "step": 12895
+    },
+    {
+      "epoch": 0.8971442484955998,
+      "grad_norm": 1.203125,
+      "learning_rate": 5.494782682735555e-05,
+      "loss": 0.6948,
+      "step": 12896
+    },
+    {
+      "epoch": 0.8972138161327351,
+      "grad_norm": 1.0703125,
+      "learning_rate": 5.487418505902664e-05,
+      "loss": 0.8975,
+      "step": 12897
+    },
+    {
+      "epoch": 0.8972833837698703,
+      "grad_norm": 1.2578125,
+      "learning_rate": 5.480059127942283e-05,
+      "loss": 0.8004,
+      "step": 12898
+    },
+    {
+      "epoch": 0.8973529514070054,
+      "grad_norm": 1.0390625,
+      "learning_rate": 5.47270454922808e-05,
+      "loss": 0.7677,
+      "step": 12899
+    },
+    {
+      "epoch": 0.8974225190441407,
+      "grad_norm": 1.1015625,
+      "learning_rate": 5.465354770133491e-05,
+      "loss": 1.0044,
+      "step": 12900
+    },
+    {
+      "epoch": 0.8974920866812759,
+      "grad_norm": 0.984375,
+      "learning_rate": 5.4580097910317036e-05,
+      "loss": 0.6903,
+      "step": 12901
+    },
+    {
+      "epoch": 0.8975616543184111,
+      "grad_norm": 1.0390625,
+      "learning_rate": 5.4506696122956556e-05,
+      "loss": 0.9146,
+      "step": 12902
+    },
+    {
+      "epoch": 0.8976312219555462,
+      "grad_norm": 1.609375,
+      "learning_rate": 5.443334234298025e-05,
+      "loss": 0.8403,
+      "step": 12903
+    },
+    {
+      "epoch": 0.8977007895926815,
+      "grad_norm": 1.28125,
+      "learning_rate": 5.436003657411281e-05,
+      "loss": 0.7782,
+      "step": 12904
+    },
+    {
+      "epoch": 0.8977703572298167,
+      "grad_norm": 1.09375,
+      "learning_rate": 5.4286778820076486e-05,
+      "loss": 0.9536,
+      "step": 12905
+    },
+    {
+      "epoch": 0.8978399248669519,
+      "grad_norm": 1.421875,
+      "learning_rate": 5.421356908459074e-05,
+      "loss": 0.9864,
+      "step": 12906
+    },
+    {
+      "epoch": 0.8979094925040871,
+      "grad_norm": 1.0546875,
+      "learning_rate": 5.414040737137271e-05,
+      "loss": 0.636,
+      "step": 12907
+    },
+    {
+      "epoch": 0.8979790601412223,
+      "grad_norm": 1.421875,
+      "learning_rate": 5.406729368413743e-05,
+      "loss": 0.9173,
+      "step": 12908
+    },
+    {
+      "epoch": 0.8980486277783575,
+      "grad_norm": 1.3203125,
+      "learning_rate": 5.399422802659715e-05,
+      "loss": 0.8698,
+      "step": 12909
+    },
+    {
+      "epoch": 0.8981181954154928,
+      "grad_norm": 0.9296875,
+      "learning_rate": 5.3921210402461785e-05,
+      "loss": 0.6691,
+      "step": 12910
+    },
+    {
+      "epoch": 0.8981877630526279,
+      "grad_norm": 1.015625,
+      "learning_rate": 5.38482408154386e-05,
+      "loss": 0.7015,
+      "step": 12911
+    },
+    {
+      "epoch": 0.8982573306897631,
+      "grad_norm": 1.6484375,
+      "learning_rate": 5.377531926923285e-05,
+      "loss": 0.8865,
+      "step": 12912
+    },
+    {
+      "epoch": 0.8983268983268984,
+      "grad_norm": 1.1171875,
+      "learning_rate": 5.3702445767547015e-05,
+      "loss": 0.9282,
+      "step": 12913
+    },
+    {
+      "epoch": 0.8983964659640336,
+      "grad_norm": 1.25,
+      "learning_rate": 5.362962031408136e-05,
+      "loss": 0.925,
+      "step": 12914
+    },
+    {
+      "epoch": 0.8984660336011687,
+      "grad_norm": 1.296875,
+      "learning_rate": 5.35568429125336e-05,
+      "loss": 0.8217,
+      "step": 12915
+    },
+    {
+      "epoch": 0.8985356012383039,
+      "grad_norm": 1.1171875,
+      "learning_rate": 5.348411356659888e-05,
+      "loss": 0.897,
+      "step": 12916
+    },
+    {
+      "epoch": 0.8986051688754392,
+      "grad_norm": 1.4453125,
+      "learning_rate": 5.341143227996992e-05,
+      "loss": 0.8641,
+      "step": 12917
+    },
+    {
+      "epoch": 0.8986747365125743,
+      "grad_norm": 1.0859375,
+      "learning_rate": 5.3338799056337316e-05,
+      "loss": 0.7648,
+      "step": 12918
+    },
+    {
+      "epoch": 0.8987443041497095,
+      "grad_norm": 1.03125,
+      "learning_rate": 5.326621389938913e-05,
+      "loss": 0.7663,
+      "step": 12919
+    },
+    {
+      "epoch": 0.8988138717868448,
+      "grad_norm": 1.0234375,
+      "learning_rate": 5.319367681281073e-05,
+      "loss": 1.0038,
+      "step": 12920
+    },
+    {
+      "epoch": 0.89888343942398,
+      "grad_norm": 1.046875,
+      "learning_rate": 5.312118780028496e-05,
+      "loss": 0.8561,
+      "step": 12921
+    },
+    {
+      "epoch": 0.8989530070611151,
+      "grad_norm": 1.078125,
+      "learning_rate": 5.304874686549277e-05,
+      "loss": 0.8276,
+      "step": 12922
+    },
+    {
+      "epoch": 0.8990225746982504,
+      "grad_norm": 0.9609375,
+      "learning_rate": 5.29763540121122e-05,
+      "loss": 0.7428,
+      "step": 12923
+    },
+    {
+      "epoch": 0.8990921423353856,
+      "grad_norm": 1.2421875,
+      "learning_rate": 5.290400924381911e-05,
+      "loss": 0.7342,
+      "step": 12924
+    },
+    {
+      "epoch": 0.8991617099725208,
+      "grad_norm": 1.1640625,
+      "learning_rate": 5.2831712564286536e-05,
+      "loss": 1.0198,
+      "step": 12925
+    },
+    {
+      "epoch": 0.899231277609656,
+      "grad_norm": 0.96484375,
+      "learning_rate": 5.275946397718578e-05,
+      "loss": 0.9774,
+      "step": 12926
+    },
+    {
+      "epoch": 0.8993008452467912,
+      "grad_norm": 1.390625,
+      "learning_rate": 5.2687263486184686e-05,
+      "loss": 0.9127,
+      "step": 12927
+    },
+    {
+      "epoch": 0.8993704128839264,
+      "grad_norm": 0.8046875,
+      "learning_rate": 5.2615111094949765e-05,
+      "loss": 0.6506,
+      "step": 12928
+    },
+    {
+      "epoch": 0.8994399805210616,
+      "grad_norm": 1.0859375,
+      "learning_rate": 5.254300680714419e-05,
+      "loss": 0.9133,
+      "step": 12929
+    },
+    {
+      "epoch": 0.8995095481581968,
+      "grad_norm": 0.9921875,
+      "learning_rate": 5.247095062642937e-05,
+      "loss": 0.8668,
+      "step": 12930
+    },
+    {
+      "epoch": 0.899579115795332,
+      "grad_norm": 1.171875,
+      "learning_rate": 5.23989425564636e-05,
+      "loss": 0.7779,
+      "step": 12931
+    },
+    {
+      "epoch": 0.8996486834324672,
+      "grad_norm": 1.1484375,
+      "learning_rate": 5.2326982600903184e-05,
+      "loss": 0.8159,
+      "step": 12932
+    },
+    {
+      "epoch": 0.8997182510696025,
+      "grad_norm": 0.86328125,
+      "learning_rate": 5.225507076340219e-05,
+      "loss": 0.6071,
+      "step": 12933
+    },
+    {
+      "epoch": 0.8997878187067376,
+      "grad_norm": 1.0390625,
+      "learning_rate": 5.21832070476117e-05,
+      "loss": 0.6333,
+      "step": 12934
+    },
+    {
+      "epoch": 0.8998573863438728,
+      "grad_norm": 1.4453125,
+      "learning_rate": 5.211139145718047e-05,
+      "loss": 0.9271,
+      "step": 12935
+    },
+    {
+      "epoch": 0.8999269539810081,
+      "grad_norm": 0.81640625,
+      "learning_rate": 5.2039623995755126e-05,
+      "loss": 0.6386,
+      "step": 12936
+    },
+    {
+      "epoch": 0.8999965216181433,
+      "grad_norm": 1.0390625,
+      "learning_rate": 5.196790466697965e-05,
+      "loss": 0.7896,
+      "step": 12937
+    },
+    {
+      "epoch": 0.9000660892552784,
+      "grad_norm": 1.484375,
+      "learning_rate": 5.189623347449557e-05,
+      "loss": 0.9823,
+      "step": 12938
+    },
+    {
+      "epoch": 0.9001356568924136,
+      "grad_norm": 1.015625,
+      "learning_rate": 5.182461042194175e-05,
+      "loss": 0.727,
+      "step": 12939
+    },
+    {
+      "epoch": 0.9002052245295489,
+      "grad_norm": 1.1640625,
+      "learning_rate": 5.1753035512955184e-05,
+      "loss": 0.9766,
+      "step": 12940
+    },
+    {
+      "epoch": 0.900274792166684,
+      "grad_norm": 1.3125,
+      "learning_rate": 5.168150875117006e-05,
+      "loss": 0.7615,
+      "step": 12941
+    },
+    {
+      "epoch": 0.9003443598038192,
+      "grad_norm": 0.8984375,
+      "learning_rate": 5.161003014021792e-05,
+      "loss": 0.7439,
+      "step": 12942
+    },
+    {
+      "epoch": 0.9004139274409545,
+      "grad_norm": 1.3828125,
+      "learning_rate": 5.1538599683728206e-05,
+      "loss": 0.8239,
+      "step": 12943
+    },
+    {
+      "epoch": 0.9004834950780897,
+      "grad_norm": 1.140625,
+      "learning_rate": 5.146721738532789e-05,
+      "loss": 0.6905,
+      "step": 12944
+    },
+    {
+      "epoch": 0.9005530627152248,
+      "grad_norm": 1.2734375,
+      "learning_rate": 5.1395883248641196e-05,
+      "loss": 0.7003,
+      "step": 12945
+    },
+    {
+      "epoch": 0.9006226303523601,
+      "grad_norm": 1.0625,
+      "learning_rate": 5.132459727729022e-05,
+      "loss": 1.021,
+      "step": 12946
+    },
+    {
+      "epoch": 0.9006921979894953,
+      "grad_norm": 1.15625,
+      "learning_rate": 5.125335947489462e-05,
+      "loss": 0.7083,
+      "step": 12947
+    },
+    {
+      "epoch": 0.9007617656266305,
+      "grad_norm": 1.015625,
+      "learning_rate": 5.118216984507151e-05,
+      "loss": 1.0288,
+      "step": 12948
+    },
+    {
+      "epoch": 0.9008313332637657,
+      "grad_norm": 1.4375,
+      "learning_rate": 5.111102839143511e-05,
+      "loss": 0.9,
+      "step": 12949
+    },
+    {
+      "epoch": 0.9009009009009009,
+      "grad_norm": 0.90625,
+      "learning_rate": 5.103993511759808e-05,
+      "loss": 0.6472,
+      "step": 12950
+    },
+    {
+      "epoch": 0.9009704685380361,
+      "grad_norm": 1.359375,
+      "learning_rate": 5.096889002717009e-05,
+      "loss": 0.9621,
+      "step": 12951
+    },
+    {
+      "epoch": 0.9010400361751713,
+      "grad_norm": 1.0546875,
+      "learning_rate": 5.0897893123758365e-05,
+      "loss": 0.7863,
+      "step": 12952
+    },
+    {
+      "epoch": 0.9011096038123065,
+      "grad_norm": 1.28125,
+      "learning_rate": 5.08269444109678e-05,
+      "loss": 0.9322,
+      "step": 12953
+    },
+    {
+      "epoch": 0.9011791714494417,
+      "grad_norm": 1.2890625,
+      "learning_rate": 5.0756043892400626e-05,
+      "loss": 0.8211,
+      "step": 12954
+    },
+    {
+      "epoch": 0.9012487390865769,
+      "grad_norm": 1.0625,
+      "learning_rate": 5.0685191571657294e-05,
+      "loss": 0.9078,
+      "step": 12955
+    },
+    {
+      "epoch": 0.9013183067237122,
+      "grad_norm": 0.98828125,
+      "learning_rate": 5.061438745233493e-05,
+      "loss": 0.6872,
+      "step": 12956
+    },
+    {
+      "epoch": 0.9013878743608473,
+      "grad_norm": 1.125,
+      "learning_rate": 5.054363153802865e-05,
+      "loss": 0.8901,
+      "step": 12957
+    },
+    {
+      "epoch": 0.9014574419979825,
+      "grad_norm": 1.046875,
+      "learning_rate": 5.0472923832331266e-05,
+      "loss": 0.8544,
+      "step": 12958
+    },
+    {
+      "epoch": 0.9015270096351178,
+      "grad_norm": 1.125,
+      "learning_rate": 5.040226433883266e-05,
+      "loss": 0.9961,
+      "step": 12959
+    },
+    {
+      "epoch": 0.901596577272253,
+      "grad_norm": 1.0234375,
+      "learning_rate": 5.0331653061120755e-05,
+      "loss": 0.7943,
+      "step": 12960
+    },
+    {
+      "epoch": 0.9016661449093881,
+      "grad_norm": 1.15625,
+      "learning_rate": 5.0261090002781004e-05,
+      "loss": 0.716,
+      "step": 12961
+    },
+    {
+      "epoch": 0.9017357125465234,
+      "grad_norm": 0.77734375,
+      "learning_rate": 5.0190575167396e-05,
+      "loss": 0.7278,
+      "step": 12962
+    },
+    {
+      "epoch": 0.9018052801836586,
+      "grad_norm": 1.1015625,
+      "learning_rate": 5.01201085585461e-05,
+      "loss": 0.8286,
+      "step": 12963
+    },
+    {
+      "epoch": 0.9018748478207937,
+      "grad_norm": 1.015625,
+      "learning_rate": 5.0049690179809315e-05,
+      "loss": 0.6693,
+      "step": 12964
+    },
+    {
+      "epoch": 0.9019444154579289,
+      "grad_norm": 0.984375,
+      "learning_rate": 4.997932003476124e-05,
+      "loss": 0.7188,
+      "step": 12965
+    },
+    {
+      "epoch": 0.9020139830950642,
+      "grad_norm": 1.0859375,
+      "learning_rate": 4.9908998126974915e-05,
+      "loss": 0.7914,
+      "step": 12966
+    },
+    {
+      "epoch": 0.9020835507321994,
+      "grad_norm": 0.87109375,
+      "learning_rate": 4.9838724460020693e-05,
+      "loss": 0.7081,
+      "step": 12967
+    },
+    {
+      "epoch": 0.9021531183693345,
+      "grad_norm": 0.96875,
+      "learning_rate": 4.9768499037466944e-05,
+      "loss": 0.7018,
+      "step": 12968
+    },
+    {
+      "epoch": 0.9022226860064698,
+      "grad_norm": 1.0546875,
+      "learning_rate": 4.969832186287937e-05,
+      "loss": 0.7203,
+      "step": 12969
+    },
+    {
+      "epoch": 0.902292253643605,
+      "grad_norm": 0.9765625,
+      "learning_rate": 4.962819293982113e-05,
+      "loss": 0.8163,
+      "step": 12970
+    },
+    {
+      "epoch": 0.9023618212807402,
+      "grad_norm": 1.015625,
+      "learning_rate": 4.9558112271852916e-05,
+      "loss": 0.7653,
+      "step": 12971
+    },
+    {
+      "epoch": 0.9024313889178754,
+      "grad_norm": 1.09375,
+      "learning_rate": 4.948807986253323e-05,
+      "loss": 0.8682,
+      "step": 12972
+    },
+    {
+      "epoch": 0.9025009565550106,
+      "grad_norm": 1.3515625,
+      "learning_rate": 4.9418095715417885e-05,
+      "loss": 0.6972,
+      "step": 12973
+    },
+    {
+      "epoch": 0.9025705241921458,
+      "grad_norm": 1.5390625,
+      "learning_rate": 4.934815983406027e-05,
+      "loss": 0.7371,
+      "step": 12974
+    },
+    {
+      "epoch": 0.9026400918292811,
+      "grad_norm": 1.53125,
+      "learning_rate": 4.927827222201165e-05,
+      "loss": 0.9693,
+      "step": 12975
+    },
+    {
+      "epoch": 0.9027096594664162,
+      "grad_norm": 1.296875,
+      "learning_rate": 4.9208432882820396e-05,
+      "loss": 0.691,
+      "step": 12976
+    },
+    {
+      "epoch": 0.9027792271035514,
+      "grad_norm": 1.03125,
+      "learning_rate": 4.913864182003236e-05,
+      "loss": 0.7755,
+      "step": 12977
+    },
+    {
+      "epoch": 0.9028487947406866,
+      "grad_norm": 1.3515625,
+      "learning_rate": 4.9068899037191364e-05,
+      "loss": 0.7264,
+      "step": 12978
+    },
+    {
+      "epoch": 0.9029183623778219,
+      "grad_norm": 1.1171875,
+      "learning_rate": 4.8999204537838906e-05,
+      "loss": 0.7654,
+      "step": 12979
+    },
+    {
+      "epoch": 0.902987930014957,
+      "grad_norm": 1.0078125,
+      "learning_rate": 4.892955832551338e-05,
+      "loss": 0.7029,
+      "step": 12980
+    },
+    {
+      "epoch": 0.9030574976520922,
+      "grad_norm": 1.140625,
+      "learning_rate": 4.885996040375096e-05,
+      "loss": 1.0747,
+      "step": 12981
+    },
+    {
+      "epoch": 0.9031270652892275,
+      "grad_norm": 1.2421875,
+      "learning_rate": 4.8790410776085705e-05,
+      "loss": 0.8239,
+      "step": 12982
+    },
+    {
+      "epoch": 0.9031966329263627,
+      "grad_norm": 1.0234375,
+      "learning_rate": 4.872090944604901e-05,
+      "loss": 0.8294,
+      "step": 12983
+    },
+    {
+      "epoch": 0.9032662005634978,
+      "grad_norm": 1.390625,
+      "learning_rate": 4.865145641716972e-05,
+      "loss": 1.057,
+      "step": 12984
+    },
+    {
+      "epoch": 0.9033357682006331,
+      "grad_norm": 1.109375,
+      "learning_rate": 4.858205169297425e-05,
+      "loss": 0.7924,
+      "step": 12985
+    },
+    {
+      "epoch": 0.9034053358377683,
+      "grad_norm": 1.2734375,
+      "learning_rate": 4.851269527698665e-05,
+      "loss": 0.6779,
+      "step": 12986
+    },
+    {
+      "epoch": 0.9034749034749034,
+      "grad_norm": 1.3515625,
+      "learning_rate": 4.8443387172728784e-05,
+      "loss": 0.7835,
+      "step": 12987
+    },
+    {
+      "epoch": 0.9035444711120387,
+      "grad_norm": 1.3671875,
+      "learning_rate": 4.837412738371927e-05,
+      "loss": 1.1405,
+      "step": 12988
+    },
+    {
+      "epoch": 0.9036140387491739,
+      "grad_norm": 1.03125,
+      "learning_rate": 4.830491591347519e-05,
+      "loss": 0.7007,
+      "step": 12989
+    },
+    {
+      "epoch": 0.9036836063863091,
+      "grad_norm": 1.1328125,
+      "learning_rate": 4.823575276551051e-05,
+      "loss": 0.8549,
+      "step": 12990
+    },
+    {
+      "epoch": 0.9037531740234442,
+      "grad_norm": 1.0078125,
+      "learning_rate": 4.816663794333698e-05,
+      "loss": 0.673,
+      "step": 12991
+    },
+    {
+      "epoch": 0.9038227416605795,
+      "grad_norm": 1.015625,
+      "learning_rate": 4.8097571450464006e-05,
+      "loss": 0.9113,
+      "step": 12992
+    },
+    {
+      "epoch": 0.9038923092977147,
+      "grad_norm": 1.21875,
+      "learning_rate": 4.802855329039846e-05,
+      "loss": 0.6998,
+      "step": 12993
+    },
+    {
+      "epoch": 0.9039618769348499,
+      "grad_norm": 1.0859375,
+      "learning_rate": 4.795958346664475e-05,
+      "loss": 0.7755,
+      "step": 12994
+    },
+    {
+      "epoch": 0.9040314445719851,
+      "grad_norm": 1.6484375,
+      "learning_rate": 4.789066198270464e-05,
+      "loss": 0.6506,
+      "step": 12995
+    },
+    {
+      "epoch": 0.9041010122091203,
+      "grad_norm": 1.1328125,
+      "learning_rate": 4.782178884207766e-05,
+      "loss": 0.8783,
+      "step": 12996
+    },
+    {
+      "epoch": 0.9041705798462555,
+      "grad_norm": 1.203125,
+      "learning_rate": 4.775296404826113e-05,
+      "loss": 0.8482,
+      "step": 12997
+    },
+    {
+      "epoch": 0.9042401474833908,
+      "grad_norm": 1.046875,
+      "learning_rate": 4.768418760474935e-05,
+      "loss": 0.721,
+      "step": 12998
+    },
+    {
+      "epoch": 0.9043097151205259,
+      "grad_norm": 0.96875,
+      "learning_rate": 4.761545951503432e-05,
+      "loss": 0.9024,
+      "step": 12999
+    },
+    {
+      "epoch": 0.9043792827576611,
+      "grad_norm": 0.90234375,
+      "learning_rate": 4.7546779782605906e-05,
+      "loss": 0.7128,
+      "step": 13000
+    },
+    {
+      "epoch": 0.9044488503947964,
+      "grad_norm": 0.8671875,
+      "learning_rate": 4.7478148410951546e-05,
+      "loss": 0.7462,
+      "step": 13001
+    },
+    {
+      "epoch": 0.9045184180319316,
+      "grad_norm": 1.265625,
+      "learning_rate": 4.7409565403555456e-05,
+      "loss": 0.6865,
+      "step": 13002
+    },
+    {
+      "epoch": 0.9045879856690667,
+      "grad_norm": 1.1484375,
+      "learning_rate": 4.734103076390039e-05,
+      "loss": 0.8292,
+      "step": 13003
+    },
+    {
+      "epoch": 0.9046575533062019,
+      "grad_norm": 1.6171875,
+      "learning_rate": 4.727254449546614e-05,
+      "loss": 0.6711,
+      "step": 13004
+    },
+    {
+      "epoch": 0.9047271209433372,
+      "grad_norm": 0.8984375,
+      "learning_rate": 4.720410660172969e-05,
+      "loss": 0.878,
+      "step": 13005
+    },
+    {
+      "epoch": 0.9047966885804724,
+      "grad_norm": 1.1796875,
+      "learning_rate": 4.7135717086166375e-05,
+      "loss": 0.882,
+      "step": 13006
+    },
+    {
+      "epoch": 0.9048662562176075,
+      "grad_norm": 1.21875,
+      "learning_rate": 4.7067375952248637e-05,
+      "loss": 0.848,
+      "step": 13007
+    },
+    {
+      "epoch": 0.9049358238547428,
+      "grad_norm": 1.4453125,
+      "learning_rate": 4.6999083203446366e-05,
+      "loss": 0.9239,
+      "step": 13008
+    },
+    {
+      "epoch": 0.905005391491878,
+      "grad_norm": 0.96875,
+      "learning_rate": 4.693083884322713e-05,
+      "loss": 0.6508,
+      "step": 13009
+    },
+    {
+      "epoch": 0.9050749591290131,
+      "grad_norm": 1.0859375,
+      "learning_rate": 4.686264287505604e-05,
+      "loss": 0.7265,
+      "step": 13010
+    },
+    {
+      "epoch": 0.9051445267661484,
+      "grad_norm": 1.234375,
+      "learning_rate": 4.679449530239588e-05,
+      "loss": 0.8146,
+      "step": 13011
+    },
+    {
+      "epoch": 0.9052140944032836,
+      "grad_norm": 1.171875,
+      "learning_rate": 4.6726396128706774e-05,
+      "loss": 0.8923,
+      "step": 13012
+    },
+    {
+      "epoch": 0.9052836620404188,
+      "grad_norm": 1.734375,
+      "learning_rate": 4.665834535744617e-05,
+      "loss": 0.8934,
+      "step": 13013
+    },
+    {
+      "epoch": 0.905353229677554,
+      "grad_norm": 0.8671875,
+      "learning_rate": 4.659034299206977e-05,
+      "loss": 0.6589,
+      "step": 13014
+    },
+    {
+      "epoch": 0.9054227973146892,
+      "grad_norm": 1.1796875,
+      "learning_rate": 4.652238903603023e-05,
+      "loss": 0.9328,
+      "step": 13015
+    },
+    {
+      "epoch": 0.9054923649518244,
+      "grad_norm": 1.171875,
+      "learning_rate": 4.6454483492777925e-05,
+      "loss": 0.9544,
+      "step": 13016
+    },
+    {
+      "epoch": 0.9055619325889596,
+      "grad_norm": 1.0703125,
+      "learning_rate": 4.638662636576052e-05,
+      "loss": 0.9136,
+      "step": 13017
+    },
+    {
+      "epoch": 0.9056315002260948,
+      "grad_norm": 1.125,
+      "learning_rate": 4.6318817658423715e-05,
+      "loss": 0.7097,
+      "step": 13018
+    },
+    {
+      "epoch": 0.90570106786323,
+      "grad_norm": 1.09375,
+      "learning_rate": 4.625105737421065e-05,
+      "loss": 0.8913,
+      "step": 13019
+    },
+    {
+      "epoch": 0.9057706355003652,
+      "grad_norm": 0.89453125,
+      "learning_rate": 4.618334551656145e-05,
+      "loss": 0.7505,
+      "step": 13020
+    },
+    {
+      "epoch": 0.9058402031375005,
+      "grad_norm": 0.859375,
+      "learning_rate": 4.611568208891448e-05,
+      "loss": 0.7989,
+      "step": 13021
+    },
+    {
+      "epoch": 0.9059097707746356,
+      "grad_norm": 0.9609375,
+      "learning_rate": 4.6048067094705216e-05,
+      "loss": 0.68,
+      "step": 13022
+    },
+    {
+      "epoch": 0.9059793384117708,
+      "grad_norm": 1.296875,
+      "learning_rate": 4.59805005373668e-05,
+      "loss": 0.8832,
+      "step": 13023
+    },
+    {
+      "epoch": 0.9060489060489061,
+      "grad_norm": 1.3671875,
+      "learning_rate": 4.591298242032982e-05,
+      "loss": 0.9539,
+      "step": 13024
+    },
+    {
+      "epoch": 0.9061184736860413,
+      "grad_norm": 0.9609375,
+      "learning_rate": 4.5845512747022865e-05,
+      "loss": 0.6243,
+      "step": 13025
+    },
+    {
+      "epoch": 0.9061880413231764,
+      "grad_norm": 0.94921875,
+      "learning_rate": 4.577809152087142e-05,
+      "loss": 0.7258,
+      "step": 13026
+    },
+    {
+      "epoch": 0.9062576089603117,
+      "grad_norm": 1.1328125,
+      "learning_rate": 4.571071874529886e-05,
+      "loss": 0.6775,
+      "step": 13027
+    },
+    {
+      "epoch": 0.9063271765974469,
+      "grad_norm": 1.1796875,
+      "learning_rate": 4.5643394423725895e-05,
+      "loss": 0.8348,
+      "step": 13028
+    },
+    {
+      "epoch": 0.906396744234582,
+      "grad_norm": 0.9296875,
+      "learning_rate": 4.5576118559571224e-05,
+      "loss": 0.7729,
+      "step": 13029
+    },
+    {
+      "epoch": 0.9064663118717172,
+      "grad_norm": 1.0859375,
+      "learning_rate": 4.5508891156250565e-05,
+      "loss": 0.7014,
+      "step": 13030
+    },
+    {
+      "epoch": 0.9065358795088525,
+      "grad_norm": 1.140625,
+      "learning_rate": 4.54417122171773e-05,
+      "loss": 0.7075,
+      "step": 13031
+    },
+    {
+      "epoch": 0.9066054471459877,
+      "grad_norm": 1.1015625,
+      "learning_rate": 4.537458174576259e-05,
+      "loss": 0.7201,
+      "step": 13032
+    },
+    {
+      "epoch": 0.9066750147831228,
+      "grad_norm": 1.25,
+      "learning_rate": 4.530749974541504e-05,
+      "loss": 0.7543,
+      "step": 13033
+    },
+    {
+      "epoch": 0.9067445824202581,
+      "grad_norm": 1.1796875,
+      "learning_rate": 4.524046621954048e-05,
+      "loss": 0.7822,
+      "step": 13034
+    },
+    {
+      "epoch": 0.9068141500573933,
+      "grad_norm": 1.3125,
+      "learning_rate": 4.517348117154296e-05,
+      "loss": 0.8342,
+      "step": 13035
+    },
+    {
+      "epoch": 0.9068837176945285,
+      "grad_norm": 0.9921875,
+      "learning_rate": 4.510654460482322e-05,
+      "loss": 0.8462,
+      "step": 13036
+    },
+    {
+      "epoch": 0.9069532853316637,
+      "grad_norm": 1.1796875,
+      "learning_rate": 4.503965652278008e-05,
+      "loss": 0.6069,
+      "step": 13037
+    },
+    {
+      "epoch": 0.9070228529687989,
+      "grad_norm": 1.4453125,
+      "learning_rate": 4.497281692880983e-05,
+      "loss": 0.9126,
+      "step": 13038
+    },
+    {
+      "epoch": 0.9070924206059341,
+      "grad_norm": 1.078125,
+      "learning_rate": 4.490602582630643e-05,
+      "loss": 0.8152,
+      "step": 13039
+    },
+    {
+      "epoch": 0.9071619882430694,
+      "grad_norm": 1.2109375,
+      "learning_rate": 4.483928321866093e-05,
+      "loss": 1.0337,
+      "step": 13040
+    },
+    {
+      "epoch": 0.9072315558802045,
+      "grad_norm": 1.265625,
+      "learning_rate": 4.4772589109262184e-05,
+      "loss": 0.6577,
+      "step": 13041
+    },
+    {
+      "epoch": 0.9073011235173397,
+      "grad_norm": 0.9296875,
+      "learning_rate": 4.4705943501496596e-05,
+      "loss": 1.0599,
+      "step": 13042
+    },
+    {
+      "epoch": 0.9073706911544749,
+      "grad_norm": 0.99609375,
+      "learning_rate": 4.463934639874834e-05,
+      "loss": 0.7178,
+      "step": 13043
+    },
+    {
+      "epoch": 0.9074402587916102,
+      "grad_norm": 1.1640625,
+      "learning_rate": 4.45727978043986e-05,
+      "loss": 0.7231,
+      "step": 13044
+    },
+    {
+      "epoch": 0.9075098264287453,
+      "grad_norm": 1.046875,
+      "learning_rate": 4.450629772182646e-05,
+      "loss": 0.8416,
+      "step": 13045
+    },
+    {
+      "epoch": 0.9075793940658805,
+      "grad_norm": 1.9296875,
+      "learning_rate": 4.4439846154408435e-05,
+      "loss": 1.1456,
+      "step": 13046
+    },
+    {
+      "epoch": 0.9076489617030158,
+      "grad_norm": 0.99609375,
+      "learning_rate": 4.4373443105518827e-05,
+      "loss": 0.7196,
+      "step": 13047
+    },
+    {
+      "epoch": 0.907718529340151,
+      "grad_norm": 1.046875,
+      "learning_rate": 4.430708857852883e-05,
+      "loss": 1.0032,
+      "step": 13048
+    },
+    {
+      "epoch": 0.9077880969772861,
+      "grad_norm": 1.25,
+      "learning_rate": 4.424078257680808e-05,
+      "loss": 0.9284,
+      "step": 13049
+    },
+    {
+      "epoch": 0.9078576646144214,
+      "grad_norm": 1.7578125,
+      "learning_rate": 4.417452510372277e-05,
+      "loss": 0.8973,
+      "step": 13050
+    },
+    {
+      "epoch": 0.9079272322515566,
+      "grad_norm": 1.0859375,
+      "learning_rate": 4.410831616263755e-05,
+      "loss": 0.9603,
+      "step": 13051
+    },
+    {
+      "epoch": 0.9079967998886918,
+      "grad_norm": 1.2890625,
+      "learning_rate": 4.404215575691384e-05,
+      "loss": 0.8146,
+      "step": 13052
+    },
+    {
+      "epoch": 0.908066367525827,
+      "grad_norm": 1.3515625,
+      "learning_rate": 4.397604388991116e-05,
+      "loss": 0.7503,
+      "step": 13053
+    },
+    {
+      "epoch": 0.9081359351629622,
+      "grad_norm": 0.953125,
+      "learning_rate": 4.3909980564986294e-05,
+      "loss": 0.6531,
+      "step": 13054
+    },
+    {
+      "epoch": 0.9082055028000974,
+      "grad_norm": 1.0703125,
+      "learning_rate": 4.3843965785493435e-05,
+      "loss": 0.6746,
+      "step": 13055
+    },
+    {
+      "epoch": 0.9082750704372325,
+      "grad_norm": 1.1328125,
+      "learning_rate": 4.377799955478456e-05,
+      "loss": 0.6423,
+      "step": 13056
+    },
+    {
+      "epoch": 0.9083446380743678,
+      "grad_norm": 0.98046875,
+      "learning_rate": 4.371208187620934e-05,
+      "loss": 0.6185,
+      "step": 13057
+    },
+    {
+      "epoch": 0.908414205711503,
+      "grad_norm": 1.25,
+      "learning_rate": 4.364621275311453e-05,
+      "loss": 0.846,
+      "step": 13058
+    },
+    {
+      "epoch": 0.9084837733486382,
+      "grad_norm": 1.0859375,
+      "learning_rate": 4.358039218884458e-05,
+      "loss": 0.8656,
+      "step": 13059
+    },
+    {
+      "epoch": 0.9085533409857734,
+      "grad_norm": 1.0,
+      "learning_rate": 4.351462018674157e-05,
+      "loss": 0.5444,
+      "step": 13060
+    },
+    {
+      "epoch": 0.9086229086229086,
+      "grad_norm": 1.0859375,
+      "learning_rate": 4.3448896750145184e-05,
+      "loss": 0.8221,
+      "step": 13061
+    },
+    {
+      "epoch": 0.9086924762600438,
+      "grad_norm": 0.95703125,
+      "learning_rate": 4.338322188239241e-05,
+      "loss": 0.6084,
+      "step": 13062
+    },
+    {
+      "epoch": 0.9087620438971791,
+      "grad_norm": 0.86328125,
+      "learning_rate": 4.331759558681803e-05,
+      "loss": 0.5762,
+      "step": 13063
+    },
+    {
+      "epoch": 0.9088316115343142,
+      "grad_norm": 1.59375,
+      "learning_rate": 4.3252017866753926e-05,
+      "loss": 1.0052,
+      "step": 13064
+    },
+    {
+      "epoch": 0.9089011791714494,
+      "grad_norm": 1.3125,
+      "learning_rate": 4.318648872553011e-05,
+      "loss": 0.9105,
+      "step": 13065
+    },
+    {
+      "epoch": 0.9089707468085847,
+      "grad_norm": 1.140625,
+      "learning_rate": 4.3121008166473576e-05,
+      "loss": 0.843,
+      "step": 13066
+    },
+    {
+      "epoch": 0.9090403144457199,
+      "grad_norm": 1.3671875,
+      "learning_rate": 4.305557619290934e-05,
+      "loss": 0.968,
+      "step": 13067
+    },
+    {
+      "epoch": 0.909109882082855,
+      "grad_norm": 1.4765625,
+      "learning_rate": 4.2990192808159636e-05,
+      "loss": 1.0298,
+      "step": 13068
+    },
+    {
+      "epoch": 0.9091794497199902,
+      "grad_norm": 0.88671875,
+      "learning_rate": 4.292485801554402e-05,
+      "loss": 0.5187,
+      "step": 13069
+    },
+    {
+      "epoch": 0.9092490173571255,
+      "grad_norm": 1.3984375,
+      "learning_rate": 4.2859571818380295e-05,
+      "loss": 1.0645,
+      "step": 13070
+    },
+    {
+      "epoch": 0.9093185849942607,
+      "grad_norm": 1.1953125,
+      "learning_rate": 4.279433421998324e-05,
+      "loss": 1.0641,
+      "step": 13071
+    },
+    {
+      "epoch": 0.9093881526313958,
+      "grad_norm": 0.890625,
+      "learning_rate": 4.272914522366511e-05,
+      "loss": 0.715,
+      "step": 13072
+    },
+    {
+      "epoch": 0.9094577202685311,
+      "grad_norm": 1.203125,
+      "learning_rate": 4.266400483273591e-05,
+      "loss": 1.082,
+      "step": 13073
+    },
+    {
+      "epoch": 0.9095272879056663,
+      "grad_norm": 1.0078125,
+      "learning_rate": 4.259891305050323e-05,
+      "loss": 0.4616,
+      "step": 13074
+    },
+    {
+      "epoch": 0.9095968555428015,
+      "grad_norm": 1.046875,
+      "learning_rate": 4.253386988027219e-05,
+      "loss": 0.7929,
+      "step": 13075
+    },
+    {
+      "epoch": 0.9096664231799367,
+      "grad_norm": 1.2265625,
+      "learning_rate": 4.246887532534516e-05,
+      "loss": 0.7197,
+      "step": 13076
+    },
+    {
+      "epoch": 0.9097359908170719,
+      "grad_norm": 1.1796875,
+      "learning_rate": 4.240392938902238e-05,
+      "loss": 1.2147,
+      "step": 13077
+    },
+    {
+      "epoch": 0.9098055584542071,
+      "grad_norm": 1.25,
+      "learning_rate": 4.2339032074601326e-05,
+      "loss": 1.0718,
+      "step": 13078
+    },
+    {
+      "epoch": 0.9098751260913424,
+      "grad_norm": 1.046875,
+      "learning_rate": 4.2274183385377476e-05,
+      "loss": 0.5959,
+      "step": 13079
+    },
+    {
+      "epoch": 0.9099446937284775,
+      "grad_norm": 1.0546875,
+      "learning_rate": 4.220938332464308e-05,
+      "loss": 0.9636,
+      "step": 13080
+    },
+    {
+      "epoch": 0.9100142613656127,
+      "grad_norm": 1.1953125,
+      "learning_rate": 4.214463189568874e-05,
+      "loss": 0.9016,
+      "step": 13081
+    },
+    {
+      "epoch": 0.9100838290027479,
+      "grad_norm": 1.203125,
+      "learning_rate": 4.20799291018018e-05,
+      "loss": 0.8073,
+      "step": 13082
+    },
+    {
+      "epoch": 0.9101533966398831,
+      "grad_norm": 1.1171875,
+      "learning_rate": 4.2015274946268115e-05,
+      "loss": 0.9264,
+      "step": 13083
+    },
+    {
+      "epoch": 0.9102229642770183,
+      "grad_norm": 1.3671875,
+      "learning_rate": 4.195066943236991e-05,
+      "loss": 0.7468,
+      "step": 13084
+    },
+    {
+      "epoch": 0.9102925319141535,
+      "grad_norm": 0.96875,
+      "learning_rate": 4.1886112563387924e-05,
+      "loss": 0.8731,
+      "step": 13085
+    },
+    {
+      "epoch": 0.9103620995512888,
+      "grad_norm": 1.25,
+      "learning_rate": 4.1821604342599854e-05,
+      "loss": 0.9215,
+      "step": 13086
+    },
+    {
+      "epoch": 0.9104316671884239,
+      "grad_norm": 1.03125,
+      "learning_rate": 4.175714477328108e-05,
+      "loss": 0.8136,
+      "step": 13087
+    },
+    {
+      "epoch": 0.9105012348255591,
+      "grad_norm": 0.9453125,
+      "learning_rate": 4.169273385870454e-05,
+      "loss": 0.7271,
+      "step": 13088
+    },
+    {
+      "epoch": 0.9105708024626944,
+      "grad_norm": 1.2265625,
+      "learning_rate": 4.162837160214095e-05,
+      "loss": 0.6549,
+      "step": 13089
+    },
+    {
+      "epoch": 0.9106403700998296,
+      "grad_norm": 1.2734375,
+      "learning_rate": 4.156405800685803e-05,
+      "loss": 1.0101,
+      "step": 13090
+    },
+    {
+      "epoch": 0.9107099377369647,
+      "grad_norm": 1.5625,
+      "learning_rate": 4.1499793076121285e-05,
+      "loss": 0.7849,
+      "step": 13091
+    },
+    {
+      "epoch": 0.9107795053741,
+      "grad_norm": 0.921875,
+      "learning_rate": 4.1435576813193765e-05,
+      "loss": 0.8024,
+      "step": 13092
+    },
+    {
+      "epoch": 0.9108490730112352,
+      "grad_norm": 1.8671875,
+      "learning_rate": 4.137140922133642e-05,
+      "loss": 0.7898,
+      "step": 13093
+    },
+    {
+      "epoch": 0.9109186406483704,
+      "grad_norm": 1.1875,
+      "learning_rate": 4.130729030380675e-05,
+      "loss": 0.732,
+      "step": 13094
+    },
+    {
+      "epoch": 0.9109882082855055,
+      "grad_norm": 0.890625,
+      "learning_rate": 4.1243220063860944e-05,
+      "loss": 0.779,
+      "step": 13095
+    },
+    {
+      "epoch": 0.9110577759226408,
+      "grad_norm": 1.0859375,
+      "learning_rate": 4.117919850475183e-05,
+      "loss": 0.6693,
+      "step": 13096
+    },
+    {
+      "epoch": 0.911127343559776,
+      "grad_norm": 1.21875,
+      "learning_rate": 4.111522562973025e-05,
+      "loss": 1.0803,
+      "step": 13097
+    },
+    {
+      "epoch": 0.9111969111969112,
+      "grad_norm": 1.046875,
+      "learning_rate": 4.1051301442044276e-05,
+      "loss": 0.9407,
+      "step": 13098
+    },
+    {
+      "epoch": 0.9112664788340464,
+      "grad_norm": 1.4375,
+      "learning_rate": 4.098742594493998e-05,
+      "loss": 0.7143,
+      "step": 13099
+    },
+    {
+      "epoch": 0.9113360464711816,
+      "grad_norm": 0.97265625,
+      "learning_rate": 4.092359914166033e-05,
+      "loss": 0.6816,
+      "step": 13100
+    },
+    {
+      "epoch": 0.9114056141083168,
+      "grad_norm": 1.109375,
+      "learning_rate": 4.0859821035445946e-05,
+      "loss": 0.7882,
+      "step": 13101
+    },
+    {
+      "epoch": 0.9114751817454521,
+      "grad_norm": 1.5703125,
+      "learning_rate": 4.079609162953568e-05,
+      "loss": 1.231,
+      "step": 13102
+    },
+    {
+      "epoch": 0.9115447493825872,
+      "grad_norm": 1.3046875,
+      "learning_rate": 4.0732410927165067e-05,
+      "loss": 0.9799,
+      "step": 13103
+    },
+    {
+      "epoch": 0.9116143170197224,
+      "grad_norm": 1.03125,
+      "learning_rate": 4.066877893156762e-05,
+      "loss": 0.7711,
+      "step": 13104
+    },
+    {
+      "epoch": 0.9116838846568577,
+      "grad_norm": 1.0,
+      "learning_rate": 4.0605195645974094e-05,
+      "loss": 0.527,
+      "step": 13105
+    },
+    {
+      "epoch": 0.9117534522939928,
+      "grad_norm": 0.87890625,
+      "learning_rate": 4.054166107361301e-05,
+      "loss": 0.6169,
+      "step": 13106
+    },
+    {
+      "epoch": 0.911823019931128,
+      "grad_norm": 1.1484375,
+      "learning_rate": 4.0478175217710466e-05,
+      "loss": 0.8007,
+      "step": 13107
+    },
+    {
+      "epoch": 0.9118925875682632,
+      "grad_norm": 1.125,
+      "learning_rate": 4.041473808148977e-05,
+      "loss": 0.6891,
+      "step": 13108
+    },
+    {
+      "epoch": 0.9119621552053985,
+      "grad_norm": 1.25,
+      "learning_rate": 4.035134966817211e-05,
+      "loss": 0.8601,
+      "step": 13109
+    },
+    {
+      "epoch": 0.9120317228425336,
+      "grad_norm": 1.359375,
+      "learning_rate": 4.0288009980975706e-05,
+      "loss": 1.1731,
+      "step": 13110
+    },
+    {
+      "epoch": 0.9121012904796688,
+      "grad_norm": 1.09375,
+      "learning_rate": 4.022471902311709e-05,
+      "loss": 0.7185,
+      "step": 13111
+    },
+    {
+      "epoch": 0.9121708581168041,
+      "grad_norm": 1.25,
+      "learning_rate": 4.0161476797809456e-05,
+      "loss": 0.9068,
+      "step": 13112
+    },
+    {
+      "epoch": 0.9122404257539393,
+      "grad_norm": 1.046875,
+      "learning_rate": 4.009828330826415e-05,
+      "loss": 0.6831,
+      "step": 13113
+    },
+    {
+      "epoch": 0.9123099933910744,
+      "grad_norm": 1.078125,
+      "learning_rate": 4.00351385576897e-05,
+      "loss": 0.6043,
+      "step": 13114
+    },
+    {
+      "epoch": 0.9123795610282097,
+      "grad_norm": 0.8984375,
+      "learning_rate": 3.997204254929232e-05,
+      "loss": 0.8852,
+      "step": 13115
+    },
+    {
+      "epoch": 0.9124491286653449,
+      "grad_norm": 1.046875,
+      "learning_rate": 3.9908995286275784e-05,
+      "loss": 0.8096,
+      "step": 13116
+    },
+    {
+      "epoch": 0.9125186963024801,
+      "grad_norm": 1.1796875,
+      "learning_rate": 3.984599677184131e-05,
+      "loss": 0.6992,
+      "step": 13117
+    },
+    {
+      "epoch": 0.9125882639396153,
+      "grad_norm": 0.859375,
+      "learning_rate": 3.978304700918755e-05,
+      "loss": 0.6913,
+      "step": 13118
+    },
+    {
+      "epoch": 0.9126578315767505,
+      "grad_norm": 1.1171875,
+      "learning_rate": 3.9720146001510746e-05,
+      "loss": 0.8249,
+      "step": 13119
+    },
+    {
+      "epoch": 0.9127273992138857,
+      "grad_norm": 1.0546875,
+      "learning_rate": 3.965729375200477e-05,
+      "loss": 0.7471,
+      "step": 13120
+    },
+    {
+      "epoch": 0.9127969668510209,
+      "grad_norm": 1.234375,
+      "learning_rate": 3.959449026386097e-05,
+      "loss": 1.1734,
+      "step": 13121
+    },
+    {
+      "epoch": 0.9128665344881561,
+      "grad_norm": 1.0546875,
+      "learning_rate": 3.953173554026801e-05,
+      "loss": 0.72,
+      "step": 13122
+    },
+    {
+      "epoch": 0.9129361021252913,
+      "grad_norm": 1.03125,
+      "learning_rate": 3.9469029584412676e-05,
+      "loss": 0.7768,
+      "step": 13123
+    },
+    {
+      "epoch": 0.9130056697624265,
+      "grad_norm": 1.1875,
+      "learning_rate": 3.940637239947831e-05,
+      "loss": 0.7127,
+      "step": 13124
+    },
+    {
+      "epoch": 0.9130752373995618,
+      "grad_norm": 1.0703125,
+      "learning_rate": 3.9343763988646807e-05,
+      "loss": 0.8563,
+      "step": 13125
+    },
+    {
+      "epoch": 0.9131448050366969,
+      "grad_norm": 1.4140625,
+      "learning_rate": 3.928120435509675e-05,
+      "loss": 1.2617,
+      "step": 13126
+    },
+    {
+      "epoch": 0.9132143726738321,
+      "grad_norm": 1.09375,
+      "learning_rate": 3.921869350200491e-05,
+      "loss": 0.9248,
+      "step": 13127
+    },
+    {
+      "epoch": 0.9132839403109674,
+      "grad_norm": 1.03125,
+      "learning_rate": 3.915623143254488e-05,
+      "loss": 0.7759,
+      "step": 13128
+    },
+    {
+      "epoch": 0.9133535079481025,
+      "grad_norm": 1.03125,
+      "learning_rate": 3.909381814988855e-05,
+      "loss": 0.7245,
+      "step": 13129
+    },
+    {
+      "epoch": 0.9134230755852377,
+      "grad_norm": 1.09375,
+      "learning_rate": 3.903145365720484e-05,
+      "loss": 0.8968,
+      "step": 13130
+    },
+    {
+      "epoch": 0.913492643222373,
+      "grad_norm": 1.21875,
+      "learning_rate": 3.896913795766033e-05,
+      "loss": 0.9318,
+      "step": 13131
+    },
+    {
+      "epoch": 0.9135622108595082,
+      "grad_norm": 0.94140625,
+      "learning_rate": 3.8906871054419034e-05,
+      "loss": 0.6763,
+      "step": 13132
+    },
+    {
+      "epoch": 0.9136317784966433,
+      "grad_norm": 1.0234375,
+      "learning_rate": 3.884465295064232e-05,
+      "loss": 0.7571,
+      "step": 13133
+    },
+    {
+      "epoch": 0.9137013461337785,
+      "grad_norm": 1.0390625,
+      "learning_rate": 3.878248364948978e-05,
+      "loss": 0.7489,
+      "step": 13134
+    },
+    {
+      "epoch": 0.9137709137709138,
+      "grad_norm": 1.3203125,
+      "learning_rate": 3.8720363154117755e-05,
+      "loss": 0.8554,
+      "step": 13135
+    },
+    {
+      "epoch": 0.913840481408049,
+      "grad_norm": 1.140625,
+      "learning_rate": 3.865829146768041e-05,
+      "loss": 0.7288,
+      "step": 13136
+    },
+    {
+      "epoch": 0.9139100490451841,
+      "grad_norm": 1.1953125,
+      "learning_rate": 3.859626859332965e-05,
+      "loss": 0.8952,
+      "step": 13137
+    },
+    {
+      "epoch": 0.9139796166823194,
+      "grad_norm": 1.1875,
+      "learning_rate": 3.853429453421442e-05,
+      "loss": 0.799,
+      "step": 13138
+    },
+    {
+      "epoch": 0.9140491843194546,
+      "grad_norm": 1.2890625,
+      "learning_rate": 3.847236929348163e-05,
+      "loss": 0.5849,
+      "step": 13139
+    },
+    {
+      "epoch": 0.9141187519565898,
+      "grad_norm": 1.4296875,
+      "learning_rate": 3.8410492874275335e-05,
+      "loss": 0.6744,
+      "step": 13140
+    },
+    {
+      "epoch": 0.914188319593725,
+      "grad_norm": 1.140625,
+      "learning_rate": 3.8348665279737684e-05,
+      "loss": 0.833,
+      "step": 13141
+    },
+    {
+      "epoch": 0.9142578872308602,
+      "grad_norm": 1.1015625,
+      "learning_rate": 3.828688651300749e-05,
+      "loss": 0.7386,
+      "step": 13142
+    },
+    {
+      "epoch": 0.9143274548679954,
+      "grad_norm": 1.1875,
+      "learning_rate": 3.822515657722181e-05,
+      "loss": 0.8128,
+      "step": 13143
+    },
+    {
+      "epoch": 0.9143970225051307,
+      "grad_norm": 1.109375,
+      "learning_rate": 3.816347547551524e-05,
+      "loss": 0.8064,
+      "step": 13144
+    },
+    {
+      "epoch": 0.9144665901422658,
+      "grad_norm": 1.3359375,
+      "learning_rate": 3.810184321101917e-05,
+      "loss": 0.9626,
+      "step": 13145
+    },
+    {
+      "epoch": 0.914536157779401,
+      "grad_norm": 1.125,
+      "learning_rate": 3.8040259786863315e-05,
+      "loss": 0.8708,
+      "step": 13146
+    },
+    {
+      "epoch": 0.9146057254165362,
+      "grad_norm": 1.1796875,
+      "learning_rate": 3.797872520617418e-05,
+      "loss": 0.6174,
+      "step": 13147
+    },
+    {
+      "epoch": 0.9146752930536715,
+      "grad_norm": 1.1875,
+      "learning_rate": 3.791723947207659e-05,
+      "loss": 0.8264,
+      "step": 13148
+    },
+    {
+      "epoch": 0.9147448606908066,
+      "grad_norm": 1.1171875,
+      "learning_rate": 3.785580258769239e-05,
+      "loss": 0.5879,
+      "step": 13149
+    },
+    {
+      "epoch": 0.9148144283279418,
+      "grad_norm": 0.91796875,
+      "learning_rate": 3.779441455614086e-05,
+      "loss": 0.5422,
+      "step": 13150
+    },
+    {
+      "epoch": 0.9148839959650771,
+      "grad_norm": 1.0703125,
+      "learning_rate": 3.773307538053916e-05,
+      "loss": 0.5574,
+      "step": 13151
+    },
+    {
+      "epoch": 0.9149535636022122,
+      "grad_norm": 1.1015625,
+      "learning_rate": 3.76717850640016e-05,
+      "loss": 0.6906,
+      "step": 13152
+    },
+    {
+      "epoch": 0.9150231312393474,
+      "grad_norm": 1.125,
+      "learning_rate": 3.7610543609640444e-05,
+      "loss": 0.8463,
+      "step": 13153
+    },
+    {
+      "epoch": 0.9150926988764827,
+      "grad_norm": 1.0859375,
+      "learning_rate": 3.754935102056489e-05,
+      "loss": 0.9383,
+      "step": 13154
+    },
+    {
+      "epoch": 0.9151622665136179,
+      "grad_norm": 1.0546875,
+      "learning_rate": 3.7488207299882336e-05,
+      "loss": 0.7741,
+      "step": 13155
+    },
+    {
+      "epoch": 0.915231834150753,
+      "grad_norm": 1.21875,
+      "learning_rate": 3.7427112450697075e-05,
+      "loss": 0.9069,
+      "step": 13156
+    },
+    {
+      "epoch": 0.9153014017878883,
+      "grad_norm": 1.15625,
+      "learning_rate": 3.736606647611141e-05,
+      "loss": 0.6133,
+      "step": 13157
+    },
+    {
+      "epoch": 0.9153709694250235,
+      "grad_norm": 1.1015625,
+      "learning_rate": 3.730506937922484e-05,
+      "loss": 0.7051,
+      "step": 13158
+    },
+    {
+      "epoch": 0.9154405370621587,
+      "grad_norm": 1.015625,
+      "learning_rate": 3.7244121163134584e-05,
+      "loss": 0.6315,
+      "step": 13159
+    },
+    {
+      "epoch": 0.9155101046992938,
+      "grad_norm": 0.953125,
+      "learning_rate": 3.718322183093503e-05,
+      "loss": 0.7364,
+      "step": 13160
+    },
+    {
+      "epoch": 0.9155796723364291,
+      "grad_norm": 1.109375,
+      "learning_rate": 3.7122371385718614e-05,
+      "loss": 0.6312,
+      "step": 13161
+    },
+    {
+      "epoch": 0.9156492399735643,
+      "grad_norm": 1.2265625,
+      "learning_rate": 3.706156983057496e-05,
+      "loss": 0.8725,
+      "step": 13162
+    },
+    {
+      "epoch": 0.9157188076106995,
+      "grad_norm": 1.359375,
+      "learning_rate": 3.700081716859116e-05,
+      "loss": 1.1978,
+      "step": 13163
+    },
+    {
+      "epoch": 0.9157883752478347,
+      "grad_norm": 1.203125,
+      "learning_rate": 3.694011340285208e-05,
+      "loss": 1.0957,
+      "step": 13164
+    },
+    {
+      "epoch": 0.9158579428849699,
+      "grad_norm": 1.5546875,
+      "learning_rate": 3.687945853643959e-05,
+      "loss": 0.6028,
+      "step": 13165
+    },
+    {
+      "epoch": 0.9159275105221051,
+      "grad_norm": 1.0078125,
+      "learning_rate": 3.6818852572434e-05,
+      "loss": 0.7117,
+      "step": 13166
+    },
+    {
+      "epoch": 0.9159970781592404,
+      "grad_norm": 0.9375,
+      "learning_rate": 3.6758295513912185e-05,
+      "loss": 0.6572,
+      "step": 13167
+    },
+    {
+      "epoch": 0.9160666457963755,
+      "grad_norm": 1.578125,
+      "learning_rate": 3.669778736394902e-05,
+      "loss": 0.7317,
+      "step": 13168
+    },
+    {
+      "epoch": 0.9161362134335107,
+      "grad_norm": 1.046875,
+      "learning_rate": 3.663732812561682e-05,
+      "loss": 0.729,
+      "step": 13169
+    },
+    {
+      "epoch": 0.916205781070646,
+      "grad_norm": 1.2109375,
+      "learning_rate": 3.6576917801985355e-05,
+      "loss": 0.8536,
+      "step": 13170
+    },
+    {
+      "epoch": 0.9162753487077812,
+      "grad_norm": 1.3125,
+      "learning_rate": 3.651655639612206e-05,
+      "loss": 0.9006,
+      "step": 13171
+    },
+    {
+      "epoch": 0.9163449163449163,
+      "grad_norm": 1.09375,
+      "learning_rate": 3.64562439110917e-05,
+      "loss": 0.6241,
+      "step": 13172
+    },
+    {
+      "epoch": 0.9164144839820515,
+      "grad_norm": 1.59375,
+      "learning_rate": 3.6395980349956616e-05,
+      "loss": 0.7321,
+      "step": 13173
+    },
+    {
+      "epoch": 0.9164840516191868,
+      "grad_norm": 1.2890625,
+      "learning_rate": 3.6335765715776684e-05,
+      "loss": 0.6549,
+      "step": 13174
+    },
+    {
+      "epoch": 0.916553619256322,
+      "grad_norm": 1.140625,
+      "learning_rate": 3.627560001160935e-05,
+      "loss": 0.8101,
+      "step": 13175
+    },
+    {
+      "epoch": 0.9166231868934571,
+      "grad_norm": 1.109375,
+      "learning_rate": 3.6215483240509604e-05,
+      "loss": 0.7974,
+      "step": 13176
+    },
+    {
+      "epoch": 0.9166927545305924,
+      "grad_norm": 1.15625,
+      "learning_rate": 3.61554154055298e-05,
+      "loss": 0.7702,
+      "step": 13177
+    },
+    {
+      "epoch": 0.9167623221677276,
+      "grad_norm": 1.59375,
+      "learning_rate": 3.6095396509719934e-05,
+      "loss": 0.8351,
+      "step": 13178
+    },
+    {
+      "epoch": 0.9168318898048627,
+      "grad_norm": 0.7734375,
+      "learning_rate": 3.603542655612702e-05,
+      "loss": 0.5195,
+      "step": 13179
+    },
+    {
+      "epoch": 0.916901457441998,
+      "grad_norm": 1.2578125,
+      "learning_rate": 3.5975505547796714e-05,
+      "loss": 0.8202,
+      "step": 13180
+    },
+    {
+      "epoch": 0.9169710250791332,
+      "grad_norm": 1.1484375,
+      "learning_rate": 3.591563348777127e-05,
+      "loss": 0.7548,
+      "step": 13181
+    },
+    {
+      "epoch": 0.9170405927162684,
+      "grad_norm": 1.0078125,
+      "learning_rate": 3.585581037909036e-05,
+      "loss": 0.615,
+      "step": 13182
+    },
+    {
+      "epoch": 0.9171101603534036,
+      "grad_norm": 1.3671875,
+      "learning_rate": 3.5796036224791884e-05,
+      "loss": 1.1456,
+      "step": 13183
+    },
+    {
+      "epoch": 0.9171797279905388,
+      "grad_norm": 0.9765625,
+      "learning_rate": 3.573631102791075e-05,
+      "loss": 0.747,
+      "step": 13184
+    },
+    {
+      "epoch": 0.917249295627674,
+      "grad_norm": 1.171875,
+      "learning_rate": 3.5676634791479535e-05,
+      "loss": 0.8888,
+      "step": 13185
+    },
+    {
+      "epoch": 0.9173188632648092,
+      "grad_norm": 1.1328125,
+      "learning_rate": 3.561700751852803e-05,
+      "loss": 0.8245,
+      "step": 13186
+    },
+    {
+      "epoch": 0.9173884309019444,
+      "grad_norm": 1.0546875,
+      "learning_rate": 3.555742921208427e-05,
+      "loss": 0.8364,
+      "step": 13187
+    },
+    {
+      "epoch": 0.9174579985390796,
+      "grad_norm": 0.9609375,
+      "learning_rate": 3.5497899875172935e-05,
+      "loss": 0.8607,
+      "step": 13188
+    },
+    {
+      "epoch": 0.9175275661762148,
+      "grad_norm": 1.109375,
+      "learning_rate": 3.5438419510816834e-05,
+      "loss": 0.9017,
+      "step": 13189
+    },
+    {
+      "epoch": 0.9175971338133501,
+      "grad_norm": 0.9765625,
+      "learning_rate": 3.537898812203621e-05,
+      "loss": 0.6674,
+      "step": 13190
+    },
+    {
+      "epoch": 0.9176667014504852,
+      "grad_norm": 0.84765625,
+      "learning_rate": 3.531960571184845e-05,
+      "loss": 0.7925,
+      "step": 13191
+    },
+    {
+      "epoch": 0.9177362690876204,
+      "grad_norm": 1.1171875,
+      "learning_rate": 3.526027228326867e-05,
+      "loss": 0.9025,
+      "step": 13192
+    },
+    {
+      "epoch": 0.9178058367247557,
+      "grad_norm": 0.94921875,
+      "learning_rate": 3.520098783930958e-05,
+      "loss": 0.6693,
+      "step": 13193
+    },
+    {
+      "epoch": 0.9178754043618909,
+      "grad_norm": 1.34375,
+      "learning_rate": 3.514175238298145e-05,
+      "loss": 0.8207,
+      "step": 13194
+    },
+    {
+      "epoch": 0.917944971999026,
+      "grad_norm": 1.328125,
+      "learning_rate": 3.508256591729198e-05,
+      "loss": 1.0026,
+      "step": 13195
+    },
+    {
+      "epoch": 0.9180145396361613,
+      "grad_norm": 1.1328125,
+      "learning_rate": 3.5023428445246085e-05,
+      "loss": 0.9401,
+      "step": 13196
+    },
+    {
+      "epoch": 0.9180841072732965,
+      "grad_norm": 1.265625,
+      "learning_rate": 3.496433996984682e-05,
+      "loss": 0.8634,
+      "step": 13197
+    },
+    {
+      "epoch": 0.9181536749104316,
+      "grad_norm": 1.265625,
+      "learning_rate": 3.4905300494094125e-05,
+      "loss": 0.6497,
+      "step": 13198
+    },
+    {
+      "epoch": 0.9182232425475668,
+      "grad_norm": 1.3984375,
+      "learning_rate": 3.4846310020985925e-05,
+      "loss": 0.7236,
+      "step": 13199
+    },
+    {
+      "epoch": 0.9182928101847021,
+      "grad_norm": 1.046875,
+      "learning_rate": 3.478736855351727e-05,
+      "loss": 0.8447,
+      "step": 13200
+    },
+    {
+      "epoch": 0.9183623778218373,
+      "grad_norm": 1.1015625,
+      "learning_rate": 3.4728476094681105e-05,
+      "loss": 0.8562,
+      "step": 13201
+    },
+    {
+      "epoch": 0.9184319454589724,
+      "grad_norm": 1.328125,
+      "learning_rate": 3.466963264746748e-05,
+      "loss": 0.723,
+      "step": 13202
+    },
+    {
+      "epoch": 0.9185015130961077,
+      "grad_norm": 1.2890625,
+      "learning_rate": 3.461083821486421e-05,
+      "loss": 0.9176,
+      "step": 13203
+    },
+    {
+      "epoch": 0.9185710807332429,
+      "grad_norm": 1.21875,
+      "learning_rate": 3.4552092799856826e-05,
+      "loss": 0.6648,
+      "step": 13204
+    },
+    {
+      "epoch": 0.9186406483703781,
+      "grad_norm": 1.1484375,
+      "learning_rate": 3.449339640542804e-05,
+      "loss": 0.5927,
+      "step": 13205
+    },
+    {
+      "epoch": 0.9187102160075133,
+      "grad_norm": 1.15625,
+      "learning_rate": 3.44347490345579e-05,
+      "loss": 1.0229,
+      "step": 13206
+    },
+    {
+      "epoch": 0.9187797836446485,
+      "grad_norm": 1.1796875,
+      "learning_rate": 3.4376150690224375e-05,
+      "loss": 0.9107,
+      "step": 13207
+    },
+    {
+      "epoch": 0.9188493512817837,
+      "grad_norm": 1.328125,
+      "learning_rate": 3.431760137540285e-05,
+      "loss": 0.7092,
+      "step": 13208
+    },
+    {
+      "epoch": 0.918918918918919,
+      "grad_norm": 1.4453125,
+      "learning_rate": 3.425910109306618e-05,
+      "loss": 0.8211,
+      "step": 13209
+    },
+    {
+      "epoch": 0.9189884865560541,
+      "grad_norm": 1.1796875,
+      "learning_rate": 3.4200649846184654e-05,
+      "loss": 0.9294,
+      "step": 13210
+    },
+    {
+      "epoch": 0.9190580541931893,
+      "grad_norm": 1.296875,
+      "learning_rate": 3.4142247637726e-05,
+      "loss": 1.0419,
+      "step": 13211
+    },
+    {
+      "epoch": 0.9191276218303245,
+      "grad_norm": 1.0546875,
+      "learning_rate": 3.408389447065596e-05,
+      "loss": 0.9011,
+      "step": 13212
+    },
+    {
+      "epoch": 0.9191971894674598,
+      "grad_norm": 1.15625,
+      "learning_rate": 3.4025590347937066e-05,
+      "loss": 0.6634,
+      "step": 13213
+    },
+    {
+      "epoch": 0.9192667571045949,
+      "grad_norm": 1.0859375,
+      "learning_rate": 3.396733527252982e-05,
+      "loss": 0.9252,
+      "step": 13214
+    },
+    {
+      "epoch": 0.9193363247417301,
+      "grad_norm": 1.0703125,
+      "learning_rate": 3.390912924739209e-05,
+      "loss": 0.6012,
+      "step": 13215
+    },
+    {
+      "epoch": 0.9194058923788654,
+      "grad_norm": 1.28125,
+      "learning_rate": 3.385097227547929e-05,
+      "loss": 0.7775,
+      "step": 13216
+    },
+    {
+      "epoch": 0.9194754600160006,
+      "grad_norm": 1.1875,
+      "learning_rate": 3.379286435974438e-05,
+      "loss": 0.8743,
+      "step": 13217
+    },
+    {
+      "epoch": 0.9195450276531357,
+      "grad_norm": 1.1640625,
+      "learning_rate": 3.37348055031379e-05,
+      "loss": 0.9789,
+      "step": 13218
+    },
+    {
+      "epoch": 0.919614595290271,
+      "grad_norm": 1.0546875,
+      "learning_rate": 3.36767957086076e-05,
+      "loss": 0.7683,
+      "step": 13219
+    },
+    {
+      "epoch": 0.9196841629274062,
+      "grad_norm": 0.86328125,
+      "learning_rate": 3.361883497909901e-05,
+      "loss": 0.7352,
+      "step": 13220
+    },
+    {
+      "epoch": 0.9197537305645413,
+      "grad_norm": 1.375,
+      "learning_rate": 3.356092331755489e-05,
+      "loss": 0.8684,
+      "step": 13221
+    },
+    {
+      "epoch": 0.9198232982016766,
+      "grad_norm": 0.96484375,
+      "learning_rate": 3.350306072691611e-05,
+      "loss": 0.798,
+      "step": 13222
+    },
+    {
+      "epoch": 0.9198928658388118,
+      "grad_norm": 1.3984375,
+      "learning_rate": 3.3445247210120324e-05,
+      "loss": 0.9874,
+      "step": 13223
+    },
+    {
+      "epoch": 0.919962433475947,
+      "grad_norm": 1.1875,
+      "learning_rate": 3.338748277010295e-05,
+      "loss": 0.6423,
+      "step": 13224
+    },
+    {
+      "epoch": 0.9200320011130821,
+      "grad_norm": 1.1953125,
+      "learning_rate": 3.33297674097971e-05,
+      "loss": 0.7429,
+      "step": 13225
+    },
+    {
+      "epoch": 0.9201015687502174,
+      "grad_norm": 1.171875,
+      "learning_rate": 3.327210113213353e-05,
+      "loss": 0.9039,
+      "step": 13226
+    },
+    {
+      "epoch": 0.9201711363873526,
+      "grad_norm": 1.15625,
+      "learning_rate": 3.3214483940039894e-05,
+      "loss": 0.7173,
+      "step": 13227
+    },
+    {
+      "epoch": 0.9202407040244878,
+      "grad_norm": 1.1953125,
+      "learning_rate": 3.315691583644165e-05,
+      "loss": 0.6867,
+      "step": 13228
+    },
+    {
+      "epoch": 0.920310271661623,
+      "grad_norm": 1.078125,
+      "learning_rate": 3.309939682426222e-05,
+      "loss": 0.8921,
+      "step": 13229
+    },
+    {
+      "epoch": 0.9203798392987582,
+      "grad_norm": 0.94140625,
+      "learning_rate": 3.30419269064216e-05,
+      "loss": 0.7802,
+      "step": 13230
+    },
+    {
+      "epoch": 0.9204494069358934,
+      "grad_norm": 1.21875,
+      "learning_rate": 3.298450608583825e-05,
+      "loss": 0.9367,
+      "step": 13231
+    },
+    {
+      "epoch": 0.9205189745730287,
+      "grad_norm": 1.109375,
+      "learning_rate": 3.29271343654276e-05,
+      "loss": 0.7717,
+      "step": 13232
+    },
+    {
+      "epoch": 0.9205885422101638,
+      "grad_norm": 1.2421875,
+      "learning_rate": 3.286981174810266e-05,
+      "loss": 0.8901,
+      "step": 13233
+    },
+    {
+      "epoch": 0.920658109847299,
+      "grad_norm": 1.1640625,
+      "learning_rate": 3.281253823677388e-05,
+      "loss": 0.7818,
+      "step": 13234
+    },
+    {
+      "epoch": 0.9207276774844343,
+      "grad_norm": 1.15625,
+      "learning_rate": 3.275531383434938e-05,
+      "loss": 0.8075,
+      "step": 13235
+    },
+    {
+      "epoch": 0.9207972451215695,
+      "grad_norm": 1.0703125,
+      "learning_rate": 3.269813854373493e-05,
+      "loss": 0.793,
+      "step": 13236
+    },
+    {
+      "epoch": 0.9208668127587046,
+      "grad_norm": 1.1875,
+      "learning_rate": 3.264101236783346e-05,
+      "loss": 0.8248,
+      "step": 13237
+    },
+    {
+      "epoch": 0.9209363803958398,
+      "grad_norm": 1.1015625,
+      "learning_rate": 3.25839353095454e-05,
+      "loss": 0.7525,
+      "step": 13238
+    },
+    {
+      "epoch": 0.9210059480329751,
+      "grad_norm": 1.0859375,
+      "learning_rate": 3.2526907371768996e-05,
+      "loss": 0.5513,
+      "step": 13239
+    },
+    {
+      "epoch": 0.9210755156701103,
+      "grad_norm": 0.890625,
+      "learning_rate": 3.246992855739983e-05,
+      "loss": 0.7061,
+      "step": 13240
+    },
+    {
+      "epoch": 0.9211450833072454,
+      "grad_norm": 1.1171875,
+      "learning_rate": 3.2412998869331134e-05,
+      "loss": 0.7867,
+      "step": 13241
+    },
+    {
+      "epoch": 0.9212146509443807,
+      "grad_norm": 1.0,
+      "learning_rate": 3.235611831045304e-05,
+      "loss": 0.6275,
+      "step": 13242
+    },
+    {
+      "epoch": 0.9212842185815159,
+      "grad_norm": 0.9296875,
+      "learning_rate": 3.229928688365413e-05,
+      "loss": 0.5783,
+      "step": 13243
+    },
+    {
+      "epoch": 0.921353786218651,
+      "grad_norm": 1.4609375,
+      "learning_rate": 3.224250459181988e-05,
+      "loss": 0.6496,
+      "step": 13244
+    },
+    {
+      "epoch": 0.9214233538557863,
+      "grad_norm": 1.0078125,
+      "learning_rate": 3.218577143783341e-05,
+      "loss": 0.8002,
+      "step": 13245
+    },
+    {
+      "epoch": 0.9214929214929215,
+      "grad_norm": 1.015625,
+      "learning_rate": 3.212908742457532e-05,
+      "loss": 0.7586,
+      "step": 13246
+    },
+    {
+      "epoch": 0.9215624891300567,
+      "grad_norm": 1.5859375,
+      "learning_rate": 3.2072452554923746e-05,
+      "loss": 0.9243,
+      "step": 13247
+    },
+    {
+      "epoch": 0.921632056767192,
+      "grad_norm": 0.90625,
+      "learning_rate": 3.201586683175417e-05,
+      "loss": 0.7317,
+      "step": 13248
+    },
+    {
+      "epoch": 0.9217016244043271,
+      "grad_norm": 1.2734375,
+      "learning_rate": 3.1959330257939957e-05,
+      "loss": 0.9879,
+      "step": 13249
+    },
+    {
+      "epoch": 0.9217711920414623,
+      "grad_norm": 1.703125,
+      "learning_rate": 3.1902842836351696e-05,
+      "loss": 0.7883,
+      "step": 13250
+    },
+    {
+      "epoch": 0.9218407596785975,
+      "grad_norm": 1.171875,
+      "learning_rate": 3.184640456985755e-05,
+      "loss": 0.8505,
+      "step": 13251
+    },
+    {
+      "epoch": 0.9219103273157327,
+      "grad_norm": 1.03125,
+      "learning_rate": 3.179001546132298e-05,
+      "loss": 0.6454,
+      "step": 13252
+    },
+    {
+      "epoch": 0.9219798949528679,
+      "grad_norm": 1.0859375,
+      "learning_rate": 3.173367551361139e-05,
+      "loss": 0.7965,
+      "step": 13253
+    },
+    {
+      "epoch": 0.9220494625900031,
+      "grad_norm": 0.9375,
+      "learning_rate": 3.167738472958337e-05,
+      "loss": 0.6902,
+      "step": 13254
+    },
+    {
+      "epoch": 0.9221190302271384,
+      "grad_norm": 1.109375,
+      "learning_rate": 3.162114311209707e-05,
+      "loss": 0.9991,
+      "step": 13255
+    },
+    {
+      "epoch": 0.9221885978642735,
+      "grad_norm": 0.84765625,
+      "learning_rate": 3.1564950664007996e-05,
+      "loss": 0.5433,
+      "step": 13256
+    },
+    {
+      "epoch": 0.9222581655014087,
+      "grad_norm": 1.0546875,
+      "learning_rate": 3.1508807388169414e-05,
+      "loss": 0.781,
+      "step": 13257
+    },
+    {
+      "epoch": 0.922327733138544,
+      "grad_norm": 0.91015625,
+      "learning_rate": 3.1452713287432154e-05,
+      "loss": 0.7424,
+      "step": 13258
+    },
+    {
+      "epoch": 0.9223973007756792,
+      "grad_norm": 1.171875,
+      "learning_rate": 3.139666836464439e-05,
+      "loss": 0.8479,
+      "step": 13259
+    },
+    {
+      "epoch": 0.9224668684128143,
+      "grad_norm": 0.76953125,
+      "learning_rate": 3.13406726226515e-05,
+      "loss": 0.6565,
+      "step": 13260
+    },
+    {
+      "epoch": 0.9225364360499496,
+      "grad_norm": 1.515625,
+      "learning_rate": 3.128472606429689e-05,
+      "loss": 0.8064,
+      "step": 13261
+    },
+    {
+      "epoch": 0.9226060036870848,
+      "grad_norm": 1.0390625,
+      "learning_rate": 3.122882869242116e-05,
+      "loss": 0.7445,
+      "step": 13262
+    },
+    {
+      "epoch": 0.92267557132422,
+      "grad_norm": 1.0,
+      "learning_rate": 3.1172980509862504e-05,
+      "loss": 0.5144,
+      "step": 13263
+    },
+    {
+      "epoch": 0.9227451389613551,
+      "grad_norm": 1.125,
+      "learning_rate": 3.111718151945686e-05,
+      "loss": 0.8032,
+      "step": 13264
+    },
+    {
+      "epoch": 0.9228147065984904,
+      "grad_norm": 0.94921875,
+      "learning_rate": 3.106143172403708e-05,
+      "loss": 0.7622,
+      "step": 13265
+    },
+    {
+      "epoch": 0.9228842742356256,
+      "grad_norm": 1.0546875,
+      "learning_rate": 3.10057311264339e-05,
+      "loss": 0.5968,
+      "step": 13266
+    },
+    {
+      "epoch": 0.9229538418727607,
+      "grad_norm": 1.1484375,
+      "learning_rate": 3.095007972947572e-05,
+      "loss": 0.8642,
+      "step": 13267
+    },
+    {
+      "epoch": 0.923023409509896,
+      "grad_norm": 0.96484375,
+      "learning_rate": 3.089447753598806e-05,
+      "loss": 0.7421,
+      "step": 13268
+    },
+    {
+      "epoch": 0.9230929771470312,
+      "grad_norm": 1.09375,
+      "learning_rate": 3.083892454879433e-05,
+      "loss": 0.6603,
+      "step": 13269
+    },
+    {
+      "epoch": 0.9231625447841664,
+      "grad_norm": 0.95703125,
+      "learning_rate": 3.0783420770714834e-05,
+      "loss": 0.9264,
+      "step": 13270
+    },
+    {
+      "epoch": 0.9232321124213017,
+      "grad_norm": 1.2421875,
+      "learning_rate": 3.072796620456808e-05,
+      "loss": 0.7977,
+      "step": 13271
+    },
+    {
+      "epoch": 0.9233016800584368,
+      "grad_norm": 1.078125,
+      "learning_rate": 3.067256085316983e-05,
+      "loss": 0.7678,
+      "step": 13272
+    },
+    {
+      "epoch": 0.923371247695572,
+      "grad_norm": 0.90625,
+      "learning_rate": 3.0617204719333155e-05,
+      "loss": 0.7053,
+      "step": 13273
+    },
+    {
+      "epoch": 0.9234408153327073,
+      "grad_norm": 1.0859375,
+      "learning_rate": 3.056189780586871e-05,
+      "loss": 0.6626,
+      "step": 13274
+    },
+    {
+      "epoch": 0.9235103829698424,
+      "grad_norm": 2.109375,
+      "learning_rate": 3.0506640115584682e-05,
+      "loss": 1.0136,
+      "step": 13275
+    },
+    {
+      "epoch": 0.9235799506069776,
+      "grad_norm": 0.98046875,
+      "learning_rate": 3.0451431651286943e-05,
+      "loss": 0.6453,
+      "step": 13276
+    },
+    {
+      "epoch": 0.9236495182441128,
+      "grad_norm": 1.53125,
+      "learning_rate": 3.039627241577858e-05,
+      "loss": 0.9795,
+      "step": 13277
+    },
+    {
+      "epoch": 0.9237190858812481,
+      "grad_norm": 0.97265625,
+      "learning_rate": 3.0341162411860466e-05,
+      "loss": 0.7024,
+      "step": 13278
+    },
+    {
+      "epoch": 0.9237886535183832,
+      "grad_norm": 1.1328125,
+      "learning_rate": 3.02861016423307e-05,
+      "loss": 0.8726,
+      "step": 13279
+    },
+    {
+      "epoch": 0.9238582211555184,
+      "grad_norm": 0.98828125,
+      "learning_rate": 3.0231090109984814e-05,
+      "loss": 0.5164,
+      "step": 13280
+    },
+    {
+      "epoch": 0.9239277887926537,
+      "grad_norm": 0.8359375,
+      "learning_rate": 3.0176127817616138e-05,
+      "loss": 0.6693,
+      "step": 13281
+    },
+    {
+      "epoch": 0.9239973564297889,
+      "grad_norm": 1.390625,
+      "learning_rate": 3.0121214768015548e-05,
+      "loss": 0.7757,
+      "step": 13282
+    },
+    {
+      "epoch": 0.924066924066924,
+      "grad_norm": 1.0,
+      "learning_rate": 3.0066350963971145e-05,
+      "loss": 0.6186,
+      "step": 13283
+    },
+    {
+      "epoch": 0.9241364917040593,
+      "grad_norm": 1.0625,
+      "learning_rate": 3.0011536408268482e-05,
+      "loss": 0.8729,
+      "step": 13284
+    },
+    {
+      "epoch": 0.9242060593411945,
+      "grad_norm": 1.328125,
+      "learning_rate": 2.995677110369088e-05,
+      "loss": 1.0522,
+      "step": 13285
+    },
+    {
+      "epoch": 0.9242756269783297,
+      "grad_norm": 1.3828125,
+      "learning_rate": 2.9902055053019238e-05,
+      "loss": 0.9778,
+      "step": 13286
+    },
+    {
+      "epoch": 0.9243451946154649,
+      "grad_norm": 1.0,
+      "learning_rate": 2.984738825903155e-05,
+      "loss": 0.628,
+      "step": 13287
+    },
+    {
+      "epoch": 0.9244147622526001,
+      "grad_norm": 1.421875,
+      "learning_rate": 2.979277072450348e-05,
+      "loss": 0.7561,
+      "step": 13288
+    },
+    {
+      "epoch": 0.9244843298897353,
+      "grad_norm": 1.203125,
+      "learning_rate": 2.9738202452208263e-05,
+      "loss": 0.8658,
+      "step": 13289
+    },
+    {
+      "epoch": 0.9245538975268704,
+      "grad_norm": 1.15625,
+      "learning_rate": 2.9683683444916787e-05,
+      "loss": 0.6984,
+      "step": 13290
+    },
+    {
+      "epoch": 0.9246234651640057,
+      "grad_norm": 1.421875,
+      "learning_rate": 2.9629213705396953e-05,
+      "loss": 0.6508,
+      "step": 13291
+    },
+    {
+      "epoch": 0.9246930328011409,
+      "grad_norm": 0.96875,
+      "learning_rate": 2.9574793236414764e-05,
+      "loss": 0.6579,
+      "step": 13292
+    },
+    {
+      "epoch": 0.9247626004382761,
+      "grad_norm": 1.4453125,
+      "learning_rate": 2.952042204073324e-05,
+      "loss": 0.9784,
+      "step": 13293
+    },
+    {
+      "epoch": 0.9248321680754114,
+      "grad_norm": 1.2734375,
+      "learning_rate": 2.9466100121112947e-05,
+      "loss": 0.8942,
+      "step": 13294
+    },
+    {
+      "epoch": 0.9249017357125465,
+      "grad_norm": 1.3671875,
+      "learning_rate": 2.941182748031235e-05,
+      "loss": 0.9168,
+      "step": 13295
+    },
+    {
+      "epoch": 0.9249713033496817,
+      "grad_norm": 1.15625,
+      "learning_rate": 2.935760412108701e-05,
+      "loss": 1.0138,
+      "step": 13296
+    },
+    {
+      "epoch": 0.925040870986817,
+      "grad_norm": 1.515625,
+      "learning_rate": 2.9303430046190184e-05,
+      "loss": 0.614,
+      "step": 13297
+    },
+    {
+      "epoch": 0.9251104386239521,
+      "grad_norm": 1.0703125,
+      "learning_rate": 2.9249305258372437e-05,
+      "loss": 0.535,
+      "step": 13298
+    },
+    {
+      "epoch": 0.9251800062610873,
+      "grad_norm": 1.3671875,
+      "learning_rate": 2.9195229760382026e-05,
+      "loss": 0.9334,
+      "step": 13299
+    },
+    {
+      "epoch": 0.9252495738982226,
+      "grad_norm": 0.953125,
+      "learning_rate": 2.9141203554964745e-05,
+      "loss": 0.8297,
+      "step": 13300
+    },
+    {
+      "epoch": 0.9253191415353578,
+      "grad_norm": 1.21875,
+      "learning_rate": 2.9087226644863628e-05,
+      "loss": 0.8797,
+      "step": 13301
+    },
+    {
+      "epoch": 0.9253887091724929,
+      "grad_norm": 1.3359375,
+      "learning_rate": 2.903329903281926e-05,
+      "loss": 0.893,
+      "step": 13302
+    },
+    {
+      "epoch": 0.9254582768096281,
+      "grad_norm": 0.86328125,
+      "learning_rate": 2.8979420721569892e-05,
+      "loss": 0.7535,
+      "step": 13303
+    },
+    {
+      "epoch": 0.9255278444467634,
+      "grad_norm": 0.9375,
+      "learning_rate": 2.892559171385145e-05,
+      "loss": 0.6196,
+      "step": 13304
+    },
+    {
+      "epoch": 0.9255974120838986,
+      "grad_norm": 0.9140625,
+      "learning_rate": 2.8871812012396635e-05,
+      "loss": 0.6602,
+      "step": 13305
+    },
+    {
+      "epoch": 0.9256669797210337,
+      "grad_norm": 1.0546875,
+      "learning_rate": 2.88180816199366e-05,
+      "loss": 0.8892,
+      "step": 13306
+    },
+    {
+      "epoch": 0.925736547358169,
+      "grad_norm": 0.96484375,
+      "learning_rate": 2.876440053919904e-05,
+      "loss": 0.7827,
+      "step": 13307
+    },
+    {
+      "epoch": 0.9258061149953042,
+      "grad_norm": 0.8828125,
+      "learning_rate": 2.871076877291001e-05,
+      "loss": 0.6778,
+      "step": 13308
+    },
+    {
+      "epoch": 0.9258756826324394,
+      "grad_norm": 0.91796875,
+      "learning_rate": 2.8657186323792438e-05,
+      "loss": 0.6027,
+      "step": 13309
+    },
+    {
+      "epoch": 0.9259452502695746,
+      "grad_norm": 1.03125,
+      "learning_rate": 2.8603653194567036e-05,
+      "loss": 0.9155,
+      "step": 13310
+    },
+    {
+      "epoch": 0.9260148179067098,
+      "grad_norm": 1.0234375,
+      "learning_rate": 2.8550169387951852e-05,
+      "loss": 0.6383,
+      "step": 13311
+    },
+    {
+      "epoch": 0.926084385543845,
+      "grad_norm": 1.1171875,
+      "learning_rate": 2.8496734906662604e-05,
+      "loss": 0.7247,
+      "step": 13312
+    },
+    {
+      "epoch": 0.9261539531809803,
+      "grad_norm": 1.203125,
+      "learning_rate": 2.844334975341234e-05,
+      "loss": 0.9816,
+      "step": 13313
+    },
+    {
+      "epoch": 0.9262235208181154,
+      "grad_norm": 1.1875,
+      "learning_rate": 2.8390013930912008e-05,
+      "loss": 0.8588,
+      "step": 13314
+    },
+    {
+      "epoch": 0.9262930884552506,
+      "grad_norm": 1.265625,
+      "learning_rate": 2.8336727441869326e-05,
+      "loss": 0.7097,
+      "step": 13315
+    },
+    {
+      "epoch": 0.9263626560923858,
+      "grad_norm": 1.2890625,
+      "learning_rate": 2.828349028899002e-05,
+      "loss": 0.7123,
+      "step": 13316
+    },
+    {
+      "epoch": 0.926432223729521,
+      "grad_norm": 1.109375,
+      "learning_rate": 2.8230302474977376e-05,
+      "loss": 0.7025,
+      "step": 13317
+    },
+    {
+      "epoch": 0.9265017913666562,
+      "grad_norm": 1.1015625,
+      "learning_rate": 2.8177164002531897e-05,
+      "loss": 0.9708,
+      "step": 13318
+    },
+    {
+      "epoch": 0.9265713590037914,
+      "grad_norm": 1.6015625,
+      "learning_rate": 2.8124074874351646e-05,
+      "loss": 0.7133,
+      "step": 13319
+    },
+    {
+      "epoch": 0.9266409266409267,
+      "grad_norm": 1.1953125,
+      "learning_rate": 2.8071035093132247e-05,
+      "loss": 0.7549,
+      "step": 13320
+    },
+    {
+      "epoch": 0.9267104942780618,
+      "grad_norm": 0.89453125,
+      "learning_rate": 2.8018044661566768e-05,
+      "loss": 0.7371,
+      "step": 13321
+    },
+    {
+      "epoch": 0.926780061915197,
+      "grad_norm": 1.0,
+      "learning_rate": 2.796510358234583e-05,
+      "loss": 0.8252,
+      "step": 13322
+    },
+    {
+      "epoch": 0.9268496295523323,
+      "grad_norm": 1.0390625,
+      "learning_rate": 2.791221185815751e-05,
+      "loss": 0.6271,
+      "step": 13323
+    },
+    {
+      "epoch": 0.9269191971894675,
+      "grad_norm": 1.1640625,
+      "learning_rate": 2.7859369491687547e-05,
+      "loss": 0.7798,
+      "step": 13324
+    },
+    {
+      "epoch": 0.9269887648266026,
+      "grad_norm": 0.984375,
+      "learning_rate": 2.7806576485618683e-05,
+      "loss": 0.7248,
+      "step": 13325
+    },
+    {
+      "epoch": 0.9270583324637379,
+      "grad_norm": 1.109375,
+      "learning_rate": 2.7753832842631665e-05,
+      "loss": 0.6875,
+      "step": 13326
+    },
+    {
+      "epoch": 0.9271279001008731,
+      "grad_norm": 1.09375,
+      "learning_rate": 2.770113856540457e-05,
+      "loss": 0.8432,
+      "step": 13327
+    },
+    {
+      "epoch": 0.9271974677380083,
+      "grad_norm": 1.203125,
+      "learning_rate": 2.7648493656612926e-05,
+      "loss": 0.8636,
+      "step": 13328
+    },
+    {
+      "epoch": 0.9272670353751434,
+      "grad_norm": 1.4453125,
+      "learning_rate": 2.7595898118929706e-05,
+      "loss": 1.0768,
+      "step": 13329
+    },
+    {
+      "epoch": 0.9273366030122787,
+      "grad_norm": 1.109375,
+      "learning_rate": 2.7543351955025552e-05,
+      "loss": 0.8567,
+      "step": 13330
+    },
+    {
+      "epoch": 0.9274061706494139,
+      "grad_norm": 1.2578125,
+      "learning_rate": 2.749085516756833e-05,
+      "loss": 0.801,
+      "step": 13331
+    },
+    {
+      "epoch": 0.9274757382865491,
+      "grad_norm": 1.0625,
+      "learning_rate": 2.7438407759223793e-05,
+      "loss": 0.8172,
+      "step": 13332
+    },
+    {
+      "epoch": 0.9275453059236843,
+      "grad_norm": 0.859375,
+      "learning_rate": 2.7386009732654815e-05,
+      "loss": 0.6904,
+      "step": 13333
+    },
+    {
+      "epoch": 0.9276148735608195,
+      "grad_norm": 1.09375,
+      "learning_rate": 2.7333661090521932e-05,
+      "loss": 0.8203,
+      "step": 13334
+    },
+    {
+      "epoch": 0.9276844411979547,
+      "grad_norm": 0.97265625,
+      "learning_rate": 2.7281361835483022e-05,
+      "loss": 0.7293,
+      "step": 13335
+    },
+    {
+      "epoch": 0.92775400883509,
+      "grad_norm": 1.0703125,
+      "learning_rate": 2.7229111970193842e-05,
+      "loss": 0.8736,
+      "step": 13336
+    },
+    {
+      "epoch": 0.9278235764722251,
+      "grad_norm": 1.34375,
+      "learning_rate": 2.7176911497307166e-05,
+      "loss": 0.9797,
+      "step": 13337
+    },
+    {
+      "epoch": 0.9278931441093603,
+      "grad_norm": 1.4375,
+      "learning_rate": 2.7124760419473537e-05,
+      "loss": 0.8142,
+      "step": 13338
+    },
+    {
+      "epoch": 0.9279627117464956,
+      "grad_norm": 0.96484375,
+      "learning_rate": 2.7072658739340837e-05,
+      "loss": 0.6258,
+      "step": 13339
+    },
+    {
+      "epoch": 0.9280322793836308,
+      "grad_norm": 0.92578125,
+      "learning_rate": 2.702060645955473e-05,
+      "loss": 0.8679,
+      "step": 13340
+    },
+    {
+      "epoch": 0.9281018470207659,
+      "grad_norm": 1.234375,
+      "learning_rate": 2.696860358275799e-05,
+      "loss": 0.946,
+      "step": 13341
+    },
+    {
+      "epoch": 0.9281714146579011,
+      "grad_norm": 0.87109375,
+      "learning_rate": 2.691665011159117e-05,
+      "loss": 0.6146,
+      "step": 13342
+    },
+    {
+      "epoch": 0.9282409822950364,
+      "grad_norm": 1.5234375,
+      "learning_rate": 2.6864746048692156e-05,
+      "loss": 1.1254,
+      "step": 13343
+    },
+    {
+      "epoch": 0.9283105499321715,
+      "grad_norm": 1.2265625,
+      "learning_rate": 2.6812891396696294e-05,
+      "loss": 0.9415,
+      "step": 13344
+    },
+    {
+      "epoch": 0.9283801175693067,
+      "grad_norm": 1.015625,
+      "learning_rate": 2.676108615823658e-05,
+      "loss": 0.6591,
+      "step": 13345
+    },
+    {
+      "epoch": 0.928449685206442,
+      "grad_norm": 1.6015625,
+      "learning_rate": 2.670933033594358e-05,
+      "loss": 0.712,
+      "step": 13346
+    },
+    {
+      "epoch": 0.9285192528435772,
+      "grad_norm": 1.21875,
+      "learning_rate": 2.6657623932444975e-05,
+      "loss": 0.6956,
+      "step": 13347
+    },
+    {
+      "epoch": 0.9285888204807123,
+      "grad_norm": 1.203125,
+      "learning_rate": 2.66059669503661e-05,
+      "loss": 0.8038,
+      "step": 13348
+    },
+    {
+      "epoch": 0.9286583881178476,
+      "grad_norm": 1.109375,
+      "learning_rate": 2.6554359392329973e-05,
+      "loss": 0.7031,
+      "step": 13349
+    },
+    {
+      "epoch": 0.9287279557549828,
+      "grad_norm": 1.3046875,
+      "learning_rate": 2.6502801260957054e-05,
+      "loss": 0.8144,
+      "step": 13350
+    },
+    {
+      "epoch": 0.928797523392118,
+      "grad_norm": 1.125,
+      "learning_rate": 2.6451292558864915e-05,
+      "loss": 0.8768,
+      "step": 13351
+    },
+    {
+      "epoch": 0.9288670910292532,
+      "grad_norm": 1.046875,
+      "learning_rate": 2.639983328866935e-05,
+      "loss": 0.7542,
+      "step": 13352
+    },
+    {
+      "epoch": 0.9289366586663884,
+      "grad_norm": 0.98828125,
+      "learning_rate": 2.6348423452982717e-05,
+      "loss": 0.9279,
+      "step": 13353
+    },
+    {
+      "epoch": 0.9290062263035236,
+      "grad_norm": 1.2734375,
+      "learning_rate": 2.6297063054415705e-05,
+      "loss": 0.8012,
+      "step": 13354
+    },
+    {
+      "epoch": 0.9290757939406588,
+      "grad_norm": 1.2734375,
+      "learning_rate": 2.624575209557589e-05,
+      "loss": 0.9576,
+      "step": 13355
+    },
+    {
+      "epoch": 0.929145361577794,
+      "grad_norm": 1.34375,
+      "learning_rate": 2.6194490579068864e-05,
+      "loss": 0.7498,
+      "step": 13356
+    },
+    {
+      "epoch": 0.9292149292149292,
+      "grad_norm": 1.1875,
+      "learning_rate": 2.6143278507497203e-05,
+      "loss": 0.7359,
+      "step": 13357
+    },
+    {
+      "epoch": 0.9292844968520644,
+      "grad_norm": 1.3046875,
+      "learning_rate": 2.6092115883461054e-05,
+      "loss": 0.9154,
+      "step": 13358
+    },
+    {
+      "epoch": 0.9293540644891997,
+      "grad_norm": 1.1484375,
+      "learning_rate": 2.604100270955867e-05,
+      "loss": 1.0482,
+      "step": 13359
+    },
+    {
+      "epoch": 0.9294236321263348,
+      "grad_norm": 1.1484375,
+      "learning_rate": 2.5989938988384976e-05,
+      "loss": 0.7849,
+      "step": 13360
+    },
+    {
+      "epoch": 0.92949319976347,
+      "grad_norm": 1.3046875,
+      "learning_rate": 2.5938924722532788e-05,
+      "loss": 0.9157,
+      "step": 13361
+    },
+    {
+      "epoch": 0.9295627674006053,
+      "grad_norm": 1.09375,
+      "learning_rate": 2.5887959914592364e-05,
+      "loss": 0.8902,
+      "step": 13362
+    },
+    {
+      "epoch": 0.9296323350377405,
+      "grad_norm": 1.3671875,
+      "learning_rate": 2.5837044567151412e-05,
+      "loss": 0.8757,
+      "step": 13363
+    },
+    {
+      "epoch": 0.9297019026748756,
+      "grad_norm": 0.9140625,
+      "learning_rate": 2.5786178682795204e-05,
+      "loss": 0.6357,
+      "step": 13364
+    },
+    {
+      "epoch": 0.9297714703120109,
+      "grad_norm": 1.1875,
+      "learning_rate": 2.5735362264106442e-05,
+      "loss": 0.7472,
+      "step": 13365
+    },
+    {
+      "epoch": 0.9298410379491461,
+      "grad_norm": 1.015625,
+      "learning_rate": 2.5684595313665405e-05,
+      "loss": 0.7725,
+      "step": 13366
+    },
+    {
+      "epoch": 0.9299106055862812,
+      "grad_norm": 1.21875,
+      "learning_rate": 2.5633877834049578e-05,
+      "loss": 0.9519,
+      "step": 13367
+    },
+    {
+      "epoch": 0.9299801732234164,
+      "grad_norm": 1.34375,
+      "learning_rate": 2.5583209827834353e-05,
+      "loss": 0.8959,
+      "step": 13368
+    },
+    {
+      "epoch": 0.9300497408605517,
+      "grad_norm": 0.95703125,
+      "learning_rate": 2.5532591297592333e-05,
+      "loss": 0.807,
+      "step": 13369
+    },
+    {
+      "epoch": 0.9301193084976869,
+      "grad_norm": 1.2421875,
+      "learning_rate": 2.5482022245893578e-05,
+      "loss": 0.9299,
+      "step": 13370
+    },
+    {
+      "epoch": 0.930188876134822,
+      "grad_norm": 0.80859375,
+      "learning_rate": 2.543150267530592e-05,
+      "loss": 0.661,
+      "step": 13371
+    },
+    {
+      "epoch": 0.9302584437719573,
+      "grad_norm": 1.0234375,
+      "learning_rate": 2.538103258839408e-05,
+      "loss": 0.862,
+      "step": 13372
+    },
+    {
+      "epoch": 0.9303280114090925,
+      "grad_norm": 0.96875,
+      "learning_rate": 2.533061198772124e-05,
+      "loss": 0.7233,
+      "step": 13373
+    },
+    {
+      "epoch": 0.9303975790462277,
+      "grad_norm": 1.4921875,
+      "learning_rate": 2.5280240875847126e-05,
+      "loss": 0.6801,
+      "step": 13374
+    },
+    {
+      "epoch": 0.930467146683363,
+      "grad_norm": 1.0390625,
+      "learning_rate": 2.522991925532958e-05,
+      "loss": 0.9739,
+      "step": 13375
+    },
+    {
+      "epoch": 0.9305367143204981,
+      "grad_norm": 1.4453125,
+      "learning_rate": 2.5179647128723337e-05,
+      "loss": 0.9727,
+      "step": 13376
+    },
+    {
+      "epoch": 0.9306062819576333,
+      "grad_norm": 1.2109375,
+      "learning_rate": 2.5129424498581132e-05,
+      "loss": 0.6165,
+      "step": 13377
+    },
+    {
+      "epoch": 0.9306758495947686,
+      "grad_norm": 1.2109375,
+      "learning_rate": 2.507925136745315e-05,
+      "loss": 0.6481,
+      "step": 13378
+    },
+    {
+      "epoch": 0.9307454172319037,
+      "grad_norm": 1.0234375,
+      "learning_rate": 2.5029127737886793e-05,
+      "loss": 0.8426,
+      "step": 13379
+    },
+    {
+      "epoch": 0.9308149848690389,
+      "grad_norm": 1.25,
+      "learning_rate": 2.497905361242714e-05,
+      "loss": 0.7517,
+      "step": 13380
+    },
+    {
+      "epoch": 0.9308845525061741,
+      "grad_norm": 1.2421875,
+      "learning_rate": 2.4929028993616598e-05,
+      "loss": 0.7481,
+      "step": 13381
+    },
+    {
+      "epoch": 0.9309541201433094,
+      "grad_norm": 1.015625,
+      "learning_rate": 2.487905388399525e-05,
+      "loss": 0.612,
+      "step": 13382
+    },
+    {
+      "epoch": 0.9310236877804445,
+      "grad_norm": 1.484375,
+      "learning_rate": 2.482912828610062e-05,
+      "loss": 0.7068,
+      "step": 13383
+    },
+    {
+      "epoch": 0.9310932554175797,
+      "grad_norm": 1.390625,
+      "learning_rate": 2.4779252202467685e-05,
+      "loss": 0.823,
+      "step": 13384
+    },
+    {
+      "epoch": 0.931162823054715,
+      "grad_norm": 1.046875,
+      "learning_rate": 2.4729425635628634e-05,
+      "loss": 0.6847,
+      "step": 13385
+    },
+    {
+      "epoch": 0.9312323906918502,
+      "grad_norm": 1.234375,
+      "learning_rate": 2.4679648588113777e-05,
+      "loss": 0.8414,
+      "step": 13386
+    },
+    {
+      "epoch": 0.9313019583289853,
+      "grad_norm": 1.0078125,
+      "learning_rate": 2.462992106245043e-05,
+      "loss": 0.9564,
+      "step": 13387
+    },
+    {
+      "epoch": 0.9313715259661206,
+      "grad_norm": 1.375,
+      "learning_rate": 2.4580243061163466e-05,
+      "loss": 0.709,
+      "step": 13388
+    },
+    {
+      "epoch": 0.9314410936032558,
+      "grad_norm": 0.92578125,
+      "learning_rate": 2.453061458677519e-05,
+      "loss": 0.7071,
+      "step": 13389
+    },
+    {
+      "epoch": 0.931510661240391,
+      "grad_norm": 0.8984375,
+      "learning_rate": 2.448103564180548e-05,
+      "loss": 0.6645,
+      "step": 13390
+    },
+    {
+      "epoch": 0.9315802288775262,
+      "grad_norm": 1.828125,
+      "learning_rate": 2.4431506228771993e-05,
+      "loss": 0.8114,
+      "step": 13391
+    },
+    {
+      "epoch": 0.9316497965146614,
+      "grad_norm": 1.0078125,
+      "learning_rate": 2.438202635018938e-05,
+      "loss": 0.735,
+      "step": 13392
+    },
+    {
+      "epoch": 0.9317193641517966,
+      "grad_norm": 1.046875,
+      "learning_rate": 2.4332596008569853e-05,
+      "loss": 0.9012,
+      "step": 13393
+    },
+    {
+      "epoch": 0.9317889317889317,
+      "grad_norm": 1.25,
+      "learning_rate": 2.4283215206423514e-05,
+      "loss": 0.8712,
+      "step": 13394
+    },
+    {
+      "epoch": 0.931858499426067,
+      "grad_norm": 1.1171875,
+      "learning_rate": 2.4233883946257364e-05,
+      "loss": 0.8293,
+      "step": 13395
+    },
+    {
+      "epoch": 0.9319280670632022,
+      "grad_norm": 1.15625,
+      "learning_rate": 2.4184602230576613e-05,
+      "loss": 0.698,
+      "step": 13396
+    },
+    {
+      "epoch": 0.9319976347003374,
+      "grad_norm": 1.125,
+      "learning_rate": 2.4135370061883045e-05,
+      "loss": 0.7936,
+      "step": 13397
+    },
+    {
+      "epoch": 0.9320672023374726,
+      "grad_norm": 1.21875,
+      "learning_rate": 2.4086187442676766e-05,
+      "loss": 0.9763,
+      "step": 13398
+    },
+    {
+      "epoch": 0.9321367699746078,
+      "grad_norm": 0.91015625,
+      "learning_rate": 2.403705437545489e-05,
+      "loss": 0.7281,
+      "step": 13399
+    },
+    {
+      "epoch": 0.932206337611743,
+      "grad_norm": 1.3046875,
+      "learning_rate": 2.3987970862712204e-05,
+      "loss": 0.7393,
+      "step": 13400
+    },
+    {
+      "epoch": 0.9322759052488783,
+      "grad_norm": 1.390625,
+      "learning_rate": 2.3938936906940824e-05,
+      "loss": 0.9169,
+      "step": 13401
+    },
+    {
+      "epoch": 0.9323454728860134,
+      "grad_norm": 1.3203125,
+      "learning_rate": 2.3889952510630643e-05,
+      "loss": 0.7412,
+      "step": 13402
+    },
+    {
+      "epoch": 0.9324150405231486,
+      "grad_norm": 1.0234375,
+      "learning_rate": 2.3841017676268673e-05,
+      "loss": 0.6227,
+      "step": 13403
+    },
+    {
+      "epoch": 0.9324846081602839,
+      "grad_norm": 1.1328125,
+      "learning_rate": 2.3792132406339485e-05,
+      "loss": 1.0202,
+      "step": 13404
+    },
+    {
+      "epoch": 0.9325541757974191,
+      "grad_norm": 1.2265625,
+      "learning_rate": 2.3743296703325533e-05,
+      "loss": 0.7722,
+      "step": 13405
+    },
+    {
+      "epoch": 0.9326237434345542,
+      "grad_norm": 1.390625,
+      "learning_rate": 2.3694510569706285e-05,
+      "loss": 1.0787,
+      "step": 13406
+    },
+    {
+      "epoch": 0.9326933110716894,
+      "grad_norm": 0.8671875,
+      "learning_rate": 2.3645774007958754e-05,
+      "loss": 0.6709,
+      "step": 13407
+    },
+    {
+      "epoch": 0.9327628787088247,
+      "grad_norm": 0.9453125,
+      "learning_rate": 2.3597087020557628e-05,
+      "loss": 0.6799,
+      "step": 13408
+    },
+    {
+      "epoch": 0.9328324463459599,
+      "grad_norm": 1.1015625,
+      "learning_rate": 2.354844960997493e-05,
+      "loss": 0.726,
+      "step": 13409
+    },
+    {
+      "epoch": 0.932902013983095,
+      "grad_norm": 0.9375,
+      "learning_rate": 2.3499861778680463e-05,
+      "loss": 1.0161,
+      "step": 13410
+    },
+    {
+      "epoch": 0.9329715816202303,
+      "grad_norm": 1.265625,
+      "learning_rate": 2.3451323529140923e-05,
+      "loss": 0.8431,
+      "step": 13411
+    },
+    {
+      "epoch": 0.9330411492573655,
+      "grad_norm": 1.0625,
+      "learning_rate": 2.340283486382111e-05,
+      "loss": 0.8835,
+      "step": 13412
+    },
+    {
+      "epoch": 0.9331107168945006,
+      "grad_norm": 1.046875,
+      "learning_rate": 2.3354395785182836e-05,
+      "loss": 0.6826,
+      "step": 13413
+    },
+    {
+      "epoch": 0.9331802845316359,
+      "grad_norm": 1.1015625,
+      "learning_rate": 2.330600629568569e-05,
+      "loss": 0.8793,
+      "step": 13414
+    },
+    {
+      "epoch": 0.9332498521687711,
+      "grad_norm": 1.0546875,
+      "learning_rate": 2.3257666397786702e-05,
+      "loss": 0.9135,
+      "step": 13415
+    },
+    {
+      "epoch": 0.9333194198059063,
+      "grad_norm": 0.9765625,
+      "learning_rate": 2.320937609394025e-05,
+      "loss": 0.6617,
+      "step": 13416
+    },
+    {
+      "epoch": 0.9333889874430416,
+      "grad_norm": 1.3515625,
+      "learning_rate": 2.3161135386598255e-05,
+      "loss": 0.8328,
+      "step": 13417
+    },
+    {
+      "epoch": 0.9334585550801767,
+      "grad_norm": 1.5859375,
+      "learning_rate": 2.31129442782102e-05,
+      "loss": 0.822,
+      "step": 13418
+    },
+    {
+      "epoch": 0.9335281227173119,
+      "grad_norm": 1.1796875,
+      "learning_rate": 2.3064802771223026e-05,
+      "loss": 0.5763,
+      "step": 13419
+    },
+    {
+      "epoch": 0.9335976903544471,
+      "grad_norm": 1.0234375,
+      "learning_rate": 2.301671086808099e-05,
+      "loss": 0.7431,
+      "step": 13420
+    },
+    {
+      "epoch": 0.9336672579915823,
+      "grad_norm": 0.94140625,
+      "learning_rate": 2.2968668571226038e-05,
+      "loss": 0.7305,
+      "step": 13421
+    },
+    {
+      "epoch": 0.9337368256287175,
+      "grad_norm": 1.125,
+      "learning_rate": 2.292067588309732e-05,
+      "loss": 0.6679,
+      "step": 13422
+    },
+    {
+      "epoch": 0.9338063932658527,
+      "grad_norm": 1.359375,
+      "learning_rate": 2.287273280613211e-05,
+      "loss": 0.9048,
+      "step": 13423
+    },
+    {
+      "epoch": 0.933875960902988,
+      "grad_norm": 1.0390625,
+      "learning_rate": 2.282483934276436e-05,
+      "loss": 0.873,
+      "step": 13424
+    },
+    {
+      "epoch": 0.9339455285401231,
+      "grad_norm": 1.1640625,
+      "learning_rate": 2.2776995495425778e-05,
+      "loss": 0.6686,
+      "step": 13425
+    },
+    {
+      "epoch": 0.9340150961772583,
+      "grad_norm": 1.1328125,
+      "learning_rate": 2.2729201266545983e-05,
+      "loss": 0.7767,
+      "step": 13426
+    },
+    {
+      "epoch": 0.9340846638143936,
+      "grad_norm": 1.265625,
+      "learning_rate": 2.268145665855148e-05,
+      "loss": 0.8113,
+      "step": 13427
+    },
+    {
+      "epoch": 0.9341542314515288,
+      "grad_norm": 1.4296875,
+      "learning_rate": 2.2633761673866548e-05,
+      "loss": 0.8365,
+      "step": 13428
+    },
+    {
+      "epoch": 0.9342237990886639,
+      "grad_norm": 0.859375,
+      "learning_rate": 2.2586116314912807e-05,
+      "loss": 0.7165,
+      "step": 13429
+    },
+    {
+      "epoch": 0.9342933667257992,
+      "grad_norm": 1.3515625,
+      "learning_rate": 2.2538520584109766e-05,
+      "loss": 0.7445,
+      "step": 13430
+    },
+    {
+      "epoch": 0.9343629343629344,
+      "grad_norm": 1.0078125,
+      "learning_rate": 2.2490974483873715e-05,
+      "loss": 0.788,
+      "step": 13431
+    },
+    {
+      "epoch": 0.9344325020000696,
+      "grad_norm": 1.1640625,
+      "learning_rate": 2.2443478016618945e-05,
+      "loss": 0.8022,
+      "step": 13432
+    },
+    {
+      "epoch": 0.9345020696372047,
+      "grad_norm": 1.1171875,
+      "learning_rate": 2.2396031184757193e-05,
+      "loss": 0.8723,
+      "step": 13433
+    },
+    {
+      "epoch": 0.93457163727434,
+      "grad_norm": 1.015625,
+      "learning_rate": 2.234863399069753e-05,
+      "loss": 0.7293,
+      "step": 13434
+    },
+    {
+      "epoch": 0.9346412049114752,
+      "grad_norm": 1.1953125,
+      "learning_rate": 2.230128643684648e-05,
+      "loss": 0.659,
+      "step": 13435
+    },
+    {
+      "epoch": 0.9347107725486103,
+      "grad_norm": 1.1171875,
+      "learning_rate": 2.2253988525608004e-05,
+      "loss": 0.6691,
+      "step": 13436
+    },
+    {
+      "epoch": 0.9347803401857456,
+      "grad_norm": 1.390625,
+      "learning_rate": 2.2206740259383963e-05,
+      "loss": 1.1001,
+      "step": 13437
+    },
+    {
+      "epoch": 0.9348499078228808,
+      "grad_norm": 1.03125,
+      "learning_rate": 2.2159541640573212e-05,
+      "loss": 0.7743,
+      "step": 13438
+    },
+    {
+      "epoch": 0.934919475460016,
+      "grad_norm": 1.1640625,
+      "learning_rate": 2.2112392671572058e-05,
+      "loss": 0.9172,
+      "step": 13439
+    },
+    {
+      "epoch": 0.9349890430971513,
+      "grad_norm": 1.1640625,
+      "learning_rate": 2.2065293354774916e-05,
+      "loss": 0.8179,
+      "step": 13440
+    },
+    {
+      "epoch": 0.9350586107342864,
+      "grad_norm": 0.8515625,
+      "learning_rate": 2.201824369257288e-05,
+      "loss": 0.6874,
+      "step": 13441
+    },
+    {
+      "epoch": 0.9351281783714216,
+      "grad_norm": 0.9765625,
+      "learning_rate": 2.1971243687355034e-05,
+      "loss": 0.7957,
+      "step": 13442
+    },
+    {
+      "epoch": 0.9351977460085569,
+      "grad_norm": 1.2109375,
+      "learning_rate": 2.1924293341507804e-05,
+      "loss": 0.9496,
+      "step": 13443
+    },
+    {
+      "epoch": 0.935267313645692,
+      "grad_norm": 1.25,
+      "learning_rate": 2.1877392657415172e-05,
+      "loss": 0.8483,
+      "step": 13444
+    },
+    {
+      "epoch": 0.9353368812828272,
+      "grad_norm": 0.875,
+      "learning_rate": 2.1830541637458347e-05,
+      "loss": 0.7571,
+      "step": 13445
+    },
+    {
+      "epoch": 0.9354064489199624,
+      "grad_norm": 1.1328125,
+      "learning_rate": 2.1783740284016306e-05,
+      "loss": 0.9877,
+      "step": 13446
+    },
+    {
+      "epoch": 0.9354760165570977,
+      "grad_norm": 1.375,
+      "learning_rate": 2.173698859946538e-05,
+      "loss": 1.0809,
+      "step": 13447
+    },
+    {
+      "epoch": 0.9355455841942328,
+      "grad_norm": 1.875,
+      "learning_rate": 2.169028658617944e-05,
+      "loss": 0.8189,
+      "step": 13448
+    },
+    {
+      "epoch": 0.935615151831368,
+      "grad_norm": 1.1171875,
+      "learning_rate": 2.1643634246529597e-05,
+      "loss": 0.7207,
+      "step": 13449
+    },
+    {
+      "epoch": 0.9356847194685033,
+      "grad_norm": 1.125,
+      "learning_rate": 2.159703158288462e-05,
+      "loss": 0.608,
+      "step": 13450
+    },
+    {
+      "epoch": 0.9357542871056385,
+      "grad_norm": 1.1328125,
+      "learning_rate": 2.1550478597611055e-05,
+      "loss": 0.6458,
+      "step": 13451
+    },
+    {
+      "epoch": 0.9358238547427736,
+      "grad_norm": 1.171875,
+      "learning_rate": 2.1503975293072466e-05,
+      "loss": 0.7931,
+      "step": 13452
+    },
+    {
+      "epoch": 0.9358934223799089,
+      "grad_norm": 1.1171875,
+      "learning_rate": 2.1457521671629842e-05,
+      "loss": 0.8415,
+      "step": 13453
+    },
+    {
+      "epoch": 0.9359629900170441,
+      "grad_norm": 1.0390625,
+      "learning_rate": 2.1411117735642194e-05,
+      "loss": 0.7379,
+      "step": 13454
+    },
+    {
+      "epoch": 0.9360325576541793,
+      "grad_norm": 1.2734375,
+      "learning_rate": 2.136476348746541e-05,
+      "loss": 0.7582,
+      "step": 13455
+    },
+    {
+      "epoch": 0.9361021252913145,
+      "grad_norm": 1.1875,
+      "learning_rate": 2.1318458929453388e-05,
+      "loss": 0.8989,
+      "step": 13456
+    },
+    {
+      "epoch": 0.9361716929284497,
+      "grad_norm": 1.2578125,
+      "learning_rate": 2.1272204063957022e-05,
+      "loss": 0.571,
+      "step": 13457
+    },
+    {
+      "epoch": 0.9362412605655849,
+      "grad_norm": 1.109375,
+      "learning_rate": 2.1225998893324993e-05,
+      "loss": 0.6005,
+      "step": 13458
+    },
+    {
+      "epoch": 0.93631082820272,
+      "grad_norm": 1.46875,
+      "learning_rate": 2.117984341990331e-05,
+      "loss": 0.7056,
+      "step": 13459
+    },
+    {
+      "epoch": 0.9363803958398553,
+      "grad_norm": 1.1953125,
+      "learning_rate": 2.1133737646035544e-05,
+      "loss": 0.7055,
+      "step": 13460
+    },
+    {
+      "epoch": 0.9364499634769905,
+      "grad_norm": 0.984375,
+      "learning_rate": 2.1087681574062824e-05,
+      "loss": 0.6748,
+      "step": 13461
+    },
+    {
+      "epoch": 0.9365195311141257,
+      "grad_norm": 0.984375,
+      "learning_rate": 2.1041675206323498e-05,
+      "loss": 0.7552,
+      "step": 13462
+    },
+    {
+      "epoch": 0.936589098751261,
+      "grad_norm": 1.3828125,
+      "learning_rate": 2.0995718545153585e-05,
+      "loss": 0.9645,
+      "step": 13463
+    },
+    {
+      "epoch": 0.9366586663883961,
+      "grad_norm": 1.4765625,
+      "learning_rate": 2.094981159288656e-05,
+      "loss": 0.7061,
+      "step": 13464
+    },
+    {
+      "epoch": 0.9367282340255313,
+      "grad_norm": 1.546875,
+      "learning_rate": 2.0903954351853328e-05,
+      "loss": 0.7951,
+      "step": 13465
+    },
+    {
+      "epoch": 0.9367978016626666,
+      "grad_norm": 1.0546875,
+      "learning_rate": 2.085814682438225e-05,
+      "loss": 0.8461,
+      "step": 13466
+    },
+    {
+      "epoch": 0.9368673692998017,
+      "grad_norm": 1.015625,
+      "learning_rate": 2.0812389012799248e-05,
+      "loss": 0.7609,
+      "step": 13467
+    },
+    {
+      "epoch": 0.9369369369369369,
+      "grad_norm": 1.078125,
+      "learning_rate": 2.0766680919427682e-05,
+      "loss": 0.7872,
+      "step": 13468
+    },
+    {
+      "epoch": 0.9370065045740722,
+      "grad_norm": 1.6796875,
+      "learning_rate": 2.0721022546588362e-05,
+      "loss": 0.8846,
+      "step": 13469
+    },
+    {
+      "epoch": 0.9370760722112074,
+      "grad_norm": 1.3125,
+      "learning_rate": 2.0675413896599548e-05,
+      "loss": 0.7828,
+      "step": 13470
+    },
+    {
+      "epoch": 0.9371456398483425,
+      "grad_norm": 1.078125,
+      "learning_rate": 2.0629854971777053e-05,
+      "loss": 0.5559,
+      "step": 13471
+    },
+    {
+      "epoch": 0.9372152074854777,
+      "grad_norm": 1.2734375,
+      "learning_rate": 2.0584345774434243e-05,
+      "loss": 0.8942,
+      "step": 13472
+    },
+    {
+      "epoch": 0.937284775122613,
+      "grad_norm": 0.9296875,
+      "learning_rate": 2.053888630688161e-05,
+      "loss": 0.7171,
+      "step": 13473
+    },
+    {
+      "epoch": 0.9373543427597482,
+      "grad_norm": 1.34375,
+      "learning_rate": 2.0493476571427526e-05,
+      "loss": 0.8903,
+      "step": 13474
+    },
+    {
+      "epoch": 0.9374239103968833,
+      "grad_norm": 1.328125,
+      "learning_rate": 2.0448116570377596e-05,
+      "loss": 0.8405,
+      "step": 13475
+    },
+    {
+      "epoch": 0.9374934780340186,
+      "grad_norm": 1.078125,
+      "learning_rate": 2.0402806306034973e-05,
+      "loss": 0.5641,
+      "step": 13476
+    },
+    {
+      "epoch": 0.9375630456711538,
+      "grad_norm": 1.046875,
+      "learning_rate": 2.035754578070037e-05,
+      "loss": 0.7364,
+      "step": 13477
+    },
+    {
+      "epoch": 0.937632613308289,
+      "grad_norm": 1.3046875,
+      "learning_rate": 2.0312334996671734e-05,
+      "loss": 0.9378,
+      "step": 13478
+    },
+    {
+      "epoch": 0.9377021809454242,
+      "grad_norm": 1.03125,
+      "learning_rate": 2.0267173956244887e-05,
+      "loss": 0.7104,
+      "step": 13479
+    },
+    {
+      "epoch": 0.9377717485825594,
+      "grad_norm": 1.484375,
+      "learning_rate": 2.022206266171267e-05,
+      "loss": 0.8865,
+      "step": 13480
+    },
+    {
+      "epoch": 0.9378413162196946,
+      "grad_norm": 1.2890625,
+      "learning_rate": 2.017700111536558e-05,
+      "loss": 0.7749,
+      "step": 13481
+    },
+    {
+      "epoch": 0.9379108838568299,
+      "grad_norm": 1.0859375,
+      "learning_rate": 2.0131989319491784e-05,
+      "loss": 0.9203,
+      "step": 13482
+    },
+    {
+      "epoch": 0.937980451493965,
+      "grad_norm": 1.40625,
+      "learning_rate": 2.008702727637668e-05,
+      "loss": 1.0085,
+      "step": 13483
+    },
+    {
+      "epoch": 0.9380500191311002,
+      "grad_norm": 1.0703125,
+      "learning_rate": 2.0042114988303217e-05,
+      "loss": 0.7835,
+      "step": 13484
+    },
+    {
+      "epoch": 0.9381195867682354,
+      "grad_norm": 1.09375,
+      "learning_rate": 1.9997252457551685e-05,
+      "loss": 0.7453,
+      "step": 13485
+    },
+    {
+      "epoch": 0.9381891544053707,
+      "grad_norm": 1.0546875,
+      "learning_rate": 1.9952439686400148e-05,
+      "loss": 0.5907,
+      "step": 13486
+    },
+    {
+      "epoch": 0.9382587220425058,
+      "grad_norm": 0.9765625,
+      "learning_rate": 1.9907676677123898e-05,
+      "loss": 0.6764,
+      "step": 13487
+    },
+    {
+      "epoch": 0.938328289679641,
+      "grad_norm": 1.3984375,
+      "learning_rate": 1.9862963431995895e-05,
+      "loss": 0.912,
+      "step": 13488
+    },
+    {
+      "epoch": 0.9383978573167763,
+      "grad_norm": 1.0703125,
+      "learning_rate": 1.981829995328621e-05,
+      "loss": 0.8614,
+      "step": 13489
+    },
+    {
+      "epoch": 0.9384674249539114,
+      "grad_norm": 1.328125,
+      "learning_rate": 1.9773686243262924e-05,
+      "loss": 1.0099,
+      "step": 13490
+    },
+    {
+      "epoch": 0.9385369925910466,
+      "grad_norm": 1.0859375,
+      "learning_rate": 1.9729122304191104e-05,
+      "loss": 0.6065,
+      "step": 13491
+    },
+    {
+      "epoch": 0.9386065602281819,
+      "grad_norm": 1.546875,
+      "learning_rate": 1.9684608138333392e-05,
+      "loss": 0.9515,
+      "step": 13492
+    },
+    {
+      "epoch": 0.9386761278653171,
+      "grad_norm": 1.0390625,
+      "learning_rate": 1.9640143747950312e-05,
+      "loss": 0.6751,
+      "step": 13493
+    },
+    {
+      "epoch": 0.9387456955024522,
+      "grad_norm": 1.0,
+      "learning_rate": 1.959572913529928e-05,
+      "loss": 0.6923,
+      "step": 13494
+    },
+    {
+      "epoch": 0.9388152631395875,
+      "grad_norm": 1.2421875,
+      "learning_rate": 1.9551364302635377e-05,
+      "loss": 1.0487,
+      "step": 13495
+    },
+    {
+      "epoch": 0.9388848307767227,
+      "grad_norm": 1.3359375,
+      "learning_rate": 1.9507049252211472e-05,
+      "loss": 0.9824,
+      "step": 13496
+    },
+    {
+      "epoch": 0.9389543984138579,
+      "grad_norm": 1.15625,
+      "learning_rate": 1.9462783986277655e-05,
+      "loss": 0.9057,
+      "step": 13497
+    },
+    {
+      "epoch": 0.939023966050993,
+      "grad_norm": 0.85546875,
+      "learning_rate": 1.9418568507081346e-05,
+      "loss": 0.6693,
+      "step": 13498
+    },
+    {
+      "epoch": 0.9390935336881283,
+      "grad_norm": 1.0,
+      "learning_rate": 1.937440281686753e-05,
+      "loss": 0.5237,
+      "step": 13499
+    },
+    {
+      "epoch": 0.9391631013252635,
+      "grad_norm": 0.87890625,
+      "learning_rate": 1.933028691787886e-05,
+      "loss": 0.6226,
+      "step": 13500
+    },
+    {
+      "epoch": 0.9392326689623987,
+      "grad_norm": 1.2734375,
+      "learning_rate": 1.9286220812355317e-05,
+      "loss": 0.77,
+      "step": 13501
+    },
+    {
+      "epoch": 0.9393022365995339,
+      "grad_norm": 0.9453125,
+      "learning_rate": 1.9242204502534332e-05,
+      "loss": 0.6295,
+      "step": 13502
+    },
+    {
+      "epoch": 0.9393718042366691,
+      "grad_norm": 1.2578125,
+      "learning_rate": 1.9198237990650792e-05,
+      "loss": 0.6855,
+      "step": 13503
+    },
+    {
+      "epoch": 0.9394413718738043,
+      "grad_norm": 1.0234375,
+      "learning_rate": 1.9154321278937126e-05,
+      "loss": 0.7297,
+      "step": 13504
+    },
+    {
+      "epoch": 0.9395109395109396,
+      "grad_norm": 1.0390625,
+      "learning_rate": 1.911045436962322e-05,
+      "loss": 0.8111,
+      "step": 13505
+    },
+    {
+      "epoch": 0.9395805071480747,
+      "grad_norm": 1.0703125,
+      "learning_rate": 1.9066637264936293e-05,
+      "loss": 0.7445,
+      "step": 13506
+    },
+    {
+      "epoch": 0.9396500747852099,
+      "grad_norm": 1.1796875,
+      "learning_rate": 1.902286996710134e-05,
+      "loss": 0.8456,
+      "step": 13507
+    },
+    {
+      "epoch": 0.9397196424223452,
+      "grad_norm": 1.203125,
+      "learning_rate": 1.8979152478340588e-05,
+      "loss": 0.8239,
+      "step": 13508
+    },
+    {
+      "epoch": 0.9397892100594804,
+      "grad_norm": 1.1171875,
+      "learning_rate": 1.8935484800873702e-05,
+      "loss": 0.6998,
+      "step": 13509
+    },
+    {
+      "epoch": 0.9398587776966155,
+      "grad_norm": 1.0546875,
+      "learning_rate": 1.8891866936917913e-05,
+      "loss": 0.8455,
+      "step": 13510
+    },
+    {
+      "epoch": 0.9399283453337507,
+      "grad_norm": 1.140625,
+      "learning_rate": 1.8848298888688108e-05,
+      "loss": 0.7153,
+      "step": 13511
+    },
+    {
+      "epoch": 0.939997912970886,
+      "grad_norm": 1.0546875,
+      "learning_rate": 1.8804780658396303e-05,
+      "loss": 0.9088,
+      "step": 13512
+    },
+    {
+      "epoch": 0.9400674806080211,
+      "grad_norm": 1.0234375,
+      "learning_rate": 1.876131224825195e-05,
+      "loss": 0.6779,
+      "step": 13513
+    },
+    {
+      "epoch": 0.9401370482451563,
+      "grad_norm": 1.1796875,
+      "learning_rate": 1.8717893660462502e-05,
+      "loss": 0.6937,
+      "step": 13514
+    },
+    {
+      "epoch": 0.9402066158822916,
+      "grad_norm": 0.98046875,
+      "learning_rate": 1.8674524897232427e-05,
+      "loss": 0.6491,
+      "step": 13515
+    },
+    {
+      "epoch": 0.9402761835194268,
+      "grad_norm": 0.9609375,
+      "learning_rate": 1.863120596076373e-05,
+      "loss": 0.7033,
+      "step": 13516
+    },
+    {
+      "epoch": 0.9403457511565619,
+      "grad_norm": 1.15625,
+      "learning_rate": 1.858793685325577e-05,
+      "loss": 0.6249,
+      "step": 13517
+    },
+    {
+      "epoch": 0.9404153187936972,
+      "grad_norm": 1.2265625,
+      "learning_rate": 1.85447175769059e-05,
+      "loss": 0.7647,
+      "step": 13518
+    },
+    {
+      "epoch": 0.9404848864308324,
+      "grad_norm": 1.28125,
+      "learning_rate": 1.850154813390814e-05,
+      "loss": 0.8839,
+      "step": 13519
+    },
+    {
+      "epoch": 0.9405544540679676,
+      "grad_norm": 1.8203125,
+      "learning_rate": 1.845842852645474e-05,
+      "loss": 1.343,
+      "step": 13520
+    },
+    {
+      "epoch": 0.9406240217051027,
+      "grad_norm": 1.125,
+      "learning_rate": 1.8415358756735168e-05,
+      "loss": 0.6996,
+      "step": 13521
+    },
+    {
+      "epoch": 0.940693589342238,
+      "grad_norm": 1.3515625,
+      "learning_rate": 1.8372338826936007e-05,
+      "loss": 0.9822,
+      "step": 13522
+    },
+    {
+      "epoch": 0.9407631569793732,
+      "grad_norm": 1.0,
+      "learning_rate": 1.8329368739241625e-05,
+      "loss": 0.5543,
+      "step": 13523
+    },
+    {
+      "epoch": 0.9408327246165084,
+      "grad_norm": 1.1015625,
+      "learning_rate": 1.828644849583394e-05,
+      "loss": 0.9282,
+      "step": 13524
+    },
+    {
+      "epoch": 0.9409022922536436,
+      "grad_norm": 1.453125,
+      "learning_rate": 1.8243578098892322e-05,
+      "loss": 0.9974,
+      "step": 13525
+    },
+    {
+      "epoch": 0.9409718598907788,
+      "grad_norm": 1.2265625,
+      "learning_rate": 1.820075755059336e-05,
+      "loss": 0.7867,
+      "step": 13526
+    },
+    {
+      "epoch": 0.941041427527914,
+      "grad_norm": 1.1484375,
+      "learning_rate": 1.8157986853111208e-05,
+      "loss": 1.1717,
+      "step": 13527
+    },
+    {
+      "epoch": 0.9411109951650493,
+      "grad_norm": 1.1015625,
+      "learning_rate": 1.811526600861757e-05,
+      "loss": 0.8979,
+      "step": 13528
+    },
+    {
+      "epoch": 0.9411805628021844,
+      "grad_norm": 0.97265625,
+      "learning_rate": 1.8072595019281824e-05,
+      "loss": 0.5598,
+      "step": 13529
+    },
+    {
+      "epoch": 0.9412501304393196,
+      "grad_norm": 1.5078125,
+      "learning_rate": 1.8029973887270344e-05,
+      "loss": 1.0097,
+      "step": 13530
+    },
+    {
+      "epoch": 0.9413196980764549,
+      "grad_norm": 1.046875,
+      "learning_rate": 1.7987402614747296e-05,
+      "loss": 0.7811,
+      "step": 13531
+    },
+    {
+      "epoch": 0.94138926571359,
+      "grad_norm": 1.09375,
+      "learning_rate": 1.7944881203874162e-05,
+      "loss": 0.7663,
+      "step": 13532
+    },
+    {
+      "epoch": 0.9414588333507252,
+      "grad_norm": 1.421875,
+      "learning_rate": 1.7902409656810226e-05,
+      "loss": 0.9591,
+      "step": 13533
+    },
+    {
+      "epoch": 0.9415284009878604,
+      "grad_norm": 1.171875,
+      "learning_rate": 1.7859987975711644e-05,
+      "loss": 0.8446,
+      "step": 13534
+    },
+    {
+      "epoch": 0.9415979686249957,
+      "grad_norm": 1.2265625,
+      "learning_rate": 1.7817616162732587e-05,
+      "loss": 0.7937,
+      "step": 13535
+    },
+    {
+      "epoch": 0.9416675362621308,
+      "grad_norm": 1.234375,
+      "learning_rate": 1.777529422002444e-05,
+      "loss": 1.1306,
+      "step": 13536
+    },
+    {
+      "epoch": 0.941737103899266,
+      "grad_norm": 0.9921875,
+      "learning_rate": 1.7733022149735934e-05,
+      "loss": 0.5752,
+      "step": 13537
+    },
+    {
+      "epoch": 0.9418066715364013,
+      "grad_norm": 1.0390625,
+      "learning_rate": 1.769079995401357e-05,
+      "loss": 0.7649,
+      "step": 13538
+    },
+    {
+      "epoch": 0.9418762391735365,
+      "grad_norm": 1.515625,
+      "learning_rate": 1.76486276350013e-05,
+      "loss": 0.9554,
+      "step": 13539
+    },
+    {
+      "epoch": 0.9419458068106716,
+      "grad_norm": 1.140625,
+      "learning_rate": 1.7606505194840304e-05,
+      "loss": 0.9893,
+      "step": 13540
+    },
+    {
+      "epoch": 0.9420153744478069,
+      "grad_norm": 1.0859375,
+      "learning_rate": 1.7564432635669314e-05,
+      "loss": 0.7798,
+      "step": 13541
+    },
+    {
+      "epoch": 0.9420849420849421,
+      "grad_norm": 1.21875,
+      "learning_rate": 1.752240995962451e-05,
+      "loss": 0.8572,
+      "step": 13542
+    },
+    {
+      "epoch": 0.9421545097220773,
+      "grad_norm": 1.078125,
+      "learning_rate": 1.7480437168839847e-05,
+      "loss": 0.9295,
+      "step": 13543
+    },
+    {
+      "epoch": 0.9422240773592125,
+      "grad_norm": 1.1953125,
+      "learning_rate": 1.743851426544618e-05,
+      "loss": 0.7993,
+      "step": 13544
+    },
+    {
+      "epoch": 0.9422936449963477,
+      "grad_norm": 1.203125,
+      "learning_rate": 1.7396641251572364e-05,
+      "loss": 0.8911,
+      "step": 13545
+    },
+    {
+      "epoch": 0.9423632126334829,
+      "grad_norm": 1.3046875,
+      "learning_rate": 1.7354818129344253e-05,
+      "loss": 0.9248,
+      "step": 13546
+    },
+    {
+      "epoch": 0.942432780270618,
+      "grad_norm": 1.1484375,
+      "learning_rate": 1.731304490088581e-05,
+      "loss": 0.7356,
+      "step": 13547
+    },
+    {
+      "epoch": 0.9425023479077533,
+      "grad_norm": 1.0078125,
+      "learning_rate": 1.7271321568317677e-05,
+      "loss": 0.5402,
+      "step": 13548
+    },
+    {
+      "epoch": 0.9425719155448885,
+      "grad_norm": 1.0,
+      "learning_rate": 1.72296481337586e-05,
+      "loss": 0.5653,
+      "step": 13549
+    },
+    {
+      "epoch": 0.9426414831820237,
+      "grad_norm": 1.0625,
+      "learning_rate": 1.7188024599324448e-05,
+      "loss": 0.7083,
+      "step": 13550
+    },
+    {
+      "epoch": 0.942711050819159,
+      "grad_norm": 1.1328125,
+      "learning_rate": 1.7146450967128635e-05,
+      "loss": 0.8384,
+      "step": 13551
+    },
+    {
+      "epoch": 0.9427806184562941,
+      "grad_norm": 1.15625,
+      "learning_rate": 1.710492723928203e-05,
+      "loss": 0.6302,
+      "step": 13552
+    },
+    {
+      "epoch": 0.9428501860934293,
+      "grad_norm": 1.2890625,
+      "learning_rate": 1.7063453417893173e-05,
+      "loss": 0.7427,
+      "step": 13553
+    },
+    {
+      "epoch": 0.9429197537305646,
+      "grad_norm": 1.078125,
+      "learning_rate": 1.7022029505067816e-05,
+      "loss": 0.7552,
+      "step": 13554
+    },
+    {
+      "epoch": 0.9429893213676998,
+      "grad_norm": 1.0390625,
+      "learning_rate": 1.698065550290906e-05,
+      "loss": 0.6747,
+      "step": 13555
+    },
+    {
+      "epoch": 0.9430588890048349,
+      "grad_norm": 1.1015625,
+      "learning_rate": 1.693933141351789e-05,
+      "loss": 0.7567,
+      "step": 13556
+    },
+    {
+      "epoch": 0.9431284566419702,
+      "grad_norm": 1.0234375,
+      "learning_rate": 1.6898057238992625e-05,
+      "loss": 0.7553,
+      "step": 13557
+    },
+    {
+      "epoch": 0.9431980242791054,
+      "grad_norm": 1.109375,
+      "learning_rate": 1.6856832981428706e-05,
+      "loss": 0.7016,
+      "step": 13558
+    },
+    {
+      "epoch": 0.9432675919162405,
+      "grad_norm": 0.94140625,
+      "learning_rate": 1.681565864291934e-05,
+      "loss": 0.7851,
+      "step": 13559
+    },
+    {
+      "epoch": 0.9433371595533757,
+      "grad_norm": 1.0859375,
+      "learning_rate": 1.6774534225555194e-05,
+      "loss": 0.9058,
+      "step": 13560
+    },
+    {
+      "epoch": 0.943406727190511,
+      "grad_norm": 1.390625,
+      "learning_rate": 1.6733459731424594e-05,
+      "loss": 0.7591,
+      "step": 13561
+    },
+    {
+      "epoch": 0.9434762948276462,
+      "grad_norm": 1.09375,
+      "learning_rate": 1.6692435162612764e-05,
+      "loss": 0.6447,
+      "step": 13562
+    },
+    {
+      "epoch": 0.9435458624647813,
+      "grad_norm": 1.1796875,
+      "learning_rate": 1.66514605212027e-05,
+      "loss": 1.0391,
+      "step": 13563
+    },
+    {
+      "epoch": 0.9436154301019166,
+      "grad_norm": 1.4296875,
+      "learning_rate": 1.6610535809275185e-05,
+      "loss": 0.7364,
+      "step": 13564
+    },
+    {
+      "epoch": 0.9436849977390518,
+      "grad_norm": 1.0625,
+      "learning_rate": 1.6569661028908e-05,
+      "loss": 0.6518,
+      "step": 13565
+    },
+    {
+      "epoch": 0.943754565376187,
+      "grad_norm": 1.2109375,
+      "learning_rate": 1.6528836182176487e-05,
+      "loss": 0.8753,
+      "step": 13566
+    },
+    {
+      "epoch": 0.9438241330133222,
+      "grad_norm": 1.15625,
+      "learning_rate": 1.6488061271153653e-05,
+      "loss": 0.9546,
+      "step": 13567
+    },
+    {
+      "epoch": 0.9438937006504574,
+      "grad_norm": 0.859375,
+      "learning_rate": 1.6447336297909842e-05,
+      "loss": 0.7661,
+      "step": 13568
+    },
+    {
+      "epoch": 0.9439632682875926,
+      "grad_norm": 1.1484375,
+      "learning_rate": 1.6406661264512733e-05,
+      "loss": 0.8817,
+      "step": 13569
+    },
+    {
+      "epoch": 0.9440328359247279,
+      "grad_norm": 0.9375,
+      "learning_rate": 1.6366036173027676e-05,
+      "loss": 0.7684,
+      "step": 13570
+    },
+    {
+      "epoch": 0.944102403561863,
+      "grad_norm": 1.3671875,
+      "learning_rate": 1.6325461025517574e-05,
+      "loss": 0.9332,
+      "step": 13571
+    },
+    {
+      "epoch": 0.9441719711989982,
+      "grad_norm": 1.3671875,
+      "learning_rate": 1.6284935824042447e-05,
+      "loss": 0.7933,
+      "step": 13572
+    },
+    {
+      "epoch": 0.9442415388361334,
+      "grad_norm": 1.3671875,
+      "learning_rate": 1.624446057065987e-05,
+      "loss": 0.8663,
+      "step": 13573
+    },
+    {
+      "epoch": 0.9443111064732687,
+      "grad_norm": 1.3203125,
+      "learning_rate": 1.6204035267425088e-05,
+      "loss": 0.9225,
+      "step": 13574
+    },
+    {
+      "epoch": 0.9443806741104038,
+      "grad_norm": 1.4296875,
+      "learning_rate": 1.6163659916390794e-05,
+      "loss": 1.2551,
+      "step": 13575
+    },
+    {
+      "epoch": 0.944450241747539,
+      "grad_norm": 0.9921875,
+      "learning_rate": 1.61233345196069e-05,
+      "loss": 0.6812,
+      "step": 13576
+    },
+    {
+      "epoch": 0.9445198093846743,
+      "grad_norm": 1.0625,
+      "learning_rate": 1.6083059079121e-05,
+      "loss": 0.5759,
+      "step": 13577
+    },
+    {
+      "epoch": 0.9445893770218095,
+      "grad_norm": 1.2890625,
+      "learning_rate": 1.6042833596978e-05,
+      "loss": 0.8327,
+      "step": 13578
+    },
+    {
+      "epoch": 0.9446589446589446,
+      "grad_norm": 0.8984375,
+      "learning_rate": 1.600265807522039e-05,
+      "loss": 0.9751,
+      "step": 13579
+    },
+    {
+      "epoch": 0.9447285122960799,
+      "grad_norm": 1.1640625,
+      "learning_rate": 1.5962532515888086e-05,
+      "loss": 0.964,
+      "step": 13580
+    },
+    {
+      "epoch": 0.9447980799332151,
+      "grad_norm": 1.140625,
+      "learning_rate": 1.592245692101857e-05,
+      "loss": 0.7652,
+      "step": 13581
+    },
+    {
+      "epoch": 0.9448676475703502,
+      "grad_norm": 1.265625,
+      "learning_rate": 1.588243129264655e-05,
+      "loss": 1.1237,
+      "step": 13582
+    },
+    {
+      "epoch": 0.9449372152074855,
+      "grad_norm": 1.0390625,
+      "learning_rate": 1.5842455632804288e-05,
+      "loss": 0.6775,
+      "step": 13583
+    },
+    {
+      "epoch": 0.9450067828446207,
+      "grad_norm": 1.0390625,
+      "learning_rate": 1.5802529943521604e-05,
+      "loss": 0.6229,
+      "step": 13584
+    },
+    {
+      "epoch": 0.9450763504817559,
+      "grad_norm": 1.2265625,
+      "learning_rate": 1.576265422682577e-05,
+      "loss": 0.6356,
+      "step": 13585
+    },
+    {
+      "epoch": 0.945145918118891,
+      "grad_norm": 1.2890625,
+      "learning_rate": 1.5722828484741382e-05,
+      "loss": 0.9438,
+      "step": 13586
+    },
+    {
+      "epoch": 0.9452154857560263,
+      "grad_norm": 1.1796875,
+      "learning_rate": 1.5683052719290714e-05,
+      "loss": 0.9678,
+      "step": 13587
+    },
+    {
+      "epoch": 0.9452850533931615,
+      "grad_norm": 1.109375,
+      "learning_rate": 1.564332693249315e-05,
+      "loss": 0.9181,
+      "step": 13588
+    },
+    {
+      "epoch": 0.9453546210302967,
+      "grad_norm": 1.0390625,
+      "learning_rate": 1.560365112636608e-05,
+      "loss": 0.927,
+      "step": 13589
+    },
+    {
+      "epoch": 0.9454241886674319,
+      "grad_norm": 0.91796875,
+      "learning_rate": 1.556402530292389e-05,
+      "loss": 0.9889,
+      "step": 13590
+    },
+    {
+      "epoch": 0.9454937563045671,
+      "grad_norm": 1.2109375,
+      "learning_rate": 1.5524449464178413e-05,
+      "loss": 0.8351,
+      "step": 13591
+    },
+    {
+      "epoch": 0.9455633239417023,
+      "grad_norm": 0.94140625,
+      "learning_rate": 1.548492361213938e-05,
+      "loss": 0.7599,
+      "step": 13592
+    },
+    {
+      "epoch": 0.9456328915788376,
+      "grad_norm": 1.0859375,
+      "learning_rate": 1.5445447748813624e-05,
+      "loss": 0.7491,
+      "step": 13593
+    },
+    {
+      "epoch": 0.9457024592159727,
+      "grad_norm": 1.25,
+      "learning_rate": 1.5406021876205435e-05,
+      "loss": 0.8565,
+      "step": 13594
+    },
+    {
+      "epoch": 0.9457720268531079,
+      "grad_norm": 1.046875,
+      "learning_rate": 1.5366645996316764e-05,
+      "loss": 0.8743,
+      "step": 13595
+    },
+    {
+      "epoch": 0.9458415944902432,
+      "grad_norm": 1.15625,
+      "learning_rate": 1.5327320111146904e-05,
+      "loss": 0.8134,
+      "step": 13596
+    },
+    {
+      "epoch": 0.9459111621273784,
+      "grad_norm": 1.28125,
+      "learning_rate": 1.528804422269259e-05,
+      "loss": 0.8351,
+      "step": 13597
+    },
+    {
+      "epoch": 0.9459807297645135,
+      "grad_norm": 1.359375,
+      "learning_rate": 1.5248818332948e-05,
+      "loss": 1.1409,
+      "step": 13598
+    },
+    {
+      "epoch": 0.9460502974016487,
+      "grad_norm": 1.6171875,
+      "learning_rate": 1.5209642443905103e-05,
+      "loss": 0.7357,
+      "step": 13599
+    },
+    {
+      "epoch": 0.946119865038784,
+      "grad_norm": 1.1796875,
+      "learning_rate": 1.517051655755275e-05,
+      "loss": 0.8461,
+      "step": 13600
+    },
+    {
+      "epoch": 0.9461894326759192,
+      "grad_norm": 1.0390625,
+      "learning_rate": 1.5131440675877572e-05,
+      "loss": 0.7245,
+      "step": 13601
+    },
+    {
+      "epoch": 0.9462590003130543,
+      "grad_norm": 1.21875,
+      "learning_rate": 1.5092414800863763e-05,
+      "loss": 0.8758,
+      "step": 13602
+    },
+    {
+      "epoch": 0.9463285679501896,
+      "grad_norm": 1.03125,
+      "learning_rate": 1.5053438934492958e-05,
+      "loss": 0.8778,
+      "step": 13603
+    },
+    {
+      "epoch": 0.9463981355873248,
+      "grad_norm": 1.2265625,
+      "learning_rate": 1.5014513078743907e-05,
+      "loss": 0.8159,
+      "step": 13604
+    },
+    {
+      "epoch": 0.9464677032244599,
+      "grad_norm": 1.109375,
+      "learning_rate": 1.4975637235593253e-05,
+      "loss": 0.7748,
+      "step": 13605
+    },
+    {
+      "epoch": 0.9465372708615952,
+      "grad_norm": 1.140625,
+      "learning_rate": 1.493681140701475e-05,
+      "loss": 0.7375,
+      "step": 13606
+    },
+    {
+      "epoch": 0.9466068384987304,
+      "grad_norm": 1.0859375,
+      "learning_rate": 1.4898035594979931e-05,
+      "loss": 0.7438,
+      "step": 13607
+    },
+    {
+      "epoch": 0.9466764061358656,
+      "grad_norm": 1.0625,
+      "learning_rate": 1.4859309801457555e-05,
+      "loss": 0.7884,
+      "step": 13608
+    },
+    {
+      "epoch": 0.9467459737730008,
+      "grad_norm": 1.6796875,
+      "learning_rate": 1.4820634028414049e-05,
+      "loss": 0.7238,
+      "step": 13609
+    },
+    {
+      "epoch": 0.946815541410136,
+      "grad_norm": 1.015625,
+      "learning_rate": 1.4782008277812953e-05,
+      "loss": 0.937,
+      "step": 13610
+    },
+    {
+      "epoch": 0.9468851090472712,
+      "grad_norm": 0.96484375,
+      "learning_rate": 1.4743432551615698e-05,
+      "loss": 0.8037,
+      "step": 13611
+    },
+    {
+      "epoch": 0.9469546766844064,
+      "grad_norm": 0.99609375,
+      "learning_rate": 1.470490685178083e-05,
+      "loss": 0.7272,
+      "step": 13612
+    },
+    {
+      "epoch": 0.9470242443215416,
+      "grad_norm": 0.92578125,
+      "learning_rate": 1.4666431180264561e-05,
+      "loss": 0.777,
+      "step": 13613
+    },
+    {
+      "epoch": 0.9470938119586768,
+      "grad_norm": 1.125,
+      "learning_rate": 1.4628005539020551e-05,
+      "loss": 0.8623,
+      "step": 13614
+    },
+    {
+      "epoch": 0.947163379595812,
+      "grad_norm": 1.0859375,
+      "learning_rate": 1.458962992999957e-05,
+      "loss": 0.6876,
+      "step": 13615
+    },
+    {
+      "epoch": 0.9472329472329473,
+      "grad_norm": 1.0703125,
+      "learning_rate": 1.4551304355150396e-05,
+      "loss": 1.0869,
+      "step": 13616
+    },
+    {
+      "epoch": 0.9473025148700824,
+      "grad_norm": 1.28125,
+      "learning_rate": 1.4513028816419138e-05,
+      "loss": 0.9281,
+      "step": 13617
+    },
+    {
+      "epoch": 0.9473720825072176,
+      "grad_norm": 1.1015625,
+      "learning_rate": 1.4474803315748908e-05,
+      "loss": 0.7179,
+      "step": 13618
+    },
+    {
+      "epoch": 0.9474416501443529,
+      "grad_norm": 0.9765625,
+      "learning_rate": 1.443662785508082e-05,
+      "loss": 0.8745,
+      "step": 13619
+    },
+    {
+      "epoch": 0.9475112177814881,
+      "grad_norm": 1.0859375,
+      "learning_rate": 1.43985024363531e-05,
+      "loss": 0.6506,
+      "step": 13620
+    },
+    {
+      "epoch": 0.9475807854186232,
+      "grad_norm": 0.92578125,
+      "learning_rate": 1.4360427061501646e-05,
+      "loss": 0.6436,
+      "step": 13621
+    },
+    {
+      "epoch": 0.9476503530557585,
+      "grad_norm": 1.1953125,
+      "learning_rate": 1.432240173245969e-05,
+      "loss": 0.8159,
+      "step": 13622
+    },
+    {
+      "epoch": 0.9477199206928937,
+      "grad_norm": 1.0625,
+      "learning_rate": 1.4284426451158018e-05,
+      "loss": 0.9967,
+      "step": 13623
+    },
+    {
+      "epoch": 0.9477894883300289,
+      "grad_norm": 1.21875,
+      "learning_rate": 1.4246501219524754e-05,
+      "loss": 0.7271,
+      "step": 13624
+    },
+    {
+      "epoch": 0.947859055967164,
+      "grad_norm": 1.2109375,
+      "learning_rate": 1.4208626039485695e-05,
+      "loss": 0.832,
+      "step": 13625
+    },
+    {
+      "epoch": 0.9479286236042993,
+      "grad_norm": 1.0703125,
+      "learning_rate": 1.4170800912963744e-05,
+      "loss": 0.7337,
+      "step": 13626
+    },
+    {
+      "epoch": 0.9479981912414345,
+      "grad_norm": 1.15625,
+      "learning_rate": 1.4133025841879699e-05,
+      "loss": 0.7956,
+      "step": 13627
+    },
+    {
+      "epoch": 0.9480677588785696,
+      "grad_norm": 1.546875,
+      "learning_rate": 1.4095300828151358e-05,
+      "loss": 0.7781,
+      "step": 13628
+    },
+    {
+      "epoch": 0.9481373265157049,
+      "grad_norm": 1.4765625,
+      "learning_rate": 1.4057625873694191e-05,
+      "loss": 0.8593,
+      "step": 13629
+    },
+    {
+      "epoch": 0.9482068941528401,
+      "grad_norm": 1.0703125,
+      "learning_rate": 1.4020000980421554e-05,
+      "loss": 0.7329,
+      "step": 13630
+    },
+    {
+      "epoch": 0.9482764617899753,
+      "grad_norm": 1.21875,
+      "learning_rate": 1.3982426150243366e-05,
+      "loss": 0.8627,
+      "step": 13631
+    },
+    {
+      "epoch": 0.9483460294271105,
+      "grad_norm": 1.2421875,
+      "learning_rate": 1.3944901385067765e-05,
+      "loss": 0.7527,
+      "step": 13632
+    },
+    {
+      "epoch": 0.9484155970642457,
+      "grad_norm": 1.03125,
+      "learning_rate": 1.3907426686800007e-05,
+      "loss": 0.9456,
+      "step": 13633
+    },
+    {
+      "epoch": 0.9484851647013809,
+      "grad_norm": 1.0546875,
+      "learning_rate": 1.3870002057342679e-05,
+      "loss": 0.6164,
+      "step": 13634
+    },
+    {
+      "epoch": 0.9485547323385162,
+      "grad_norm": 1.109375,
+      "learning_rate": 1.3832627498596372e-05,
+      "loss": 0.8265,
+      "step": 13635
+    },
+    {
+      "epoch": 0.9486242999756513,
+      "grad_norm": 0.9453125,
+      "learning_rate": 1.379530301245857e-05,
+      "loss": 0.7207,
+      "step": 13636
+    },
+    {
+      "epoch": 0.9486938676127865,
+      "grad_norm": 1.4765625,
+      "learning_rate": 1.3758028600824313e-05,
+      "loss": 0.9727,
+      "step": 13637
+    },
+    {
+      "epoch": 0.9487634352499217,
+      "grad_norm": 1.3984375,
+      "learning_rate": 1.3720804265586417e-05,
+      "loss": 0.8802,
+      "step": 13638
+    },
+    {
+      "epoch": 0.948833002887057,
+      "grad_norm": 1.7109375,
+      "learning_rate": 1.3683630008634817e-05,
+      "loss": 1.1856,
+      "step": 13639
+    },
+    {
+      "epoch": 0.9489025705241921,
+      "grad_norm": 1.1953125,
+      "learning_rate": 1.3646505831857115e-05,
+      "loss": 0.7854,
+      "step": 13640
+    },
+    {
+      "epoch": 0.9489721381613273,
+      "grad_norm": 0.921875,
+      "learning_rate": 1.3609431737138356e-05,
+      "loss": 0.7218,
+      "step": 13641
+    },
+    {
+      "epoch": 0.9490417057984626,
+      "grad_norm": 1.2734375,
+      "learning_rate": 1.3572407726360703e-05,
+      "loss": 0.9039,
+      "step": 13642
+    },
+    {
+      "epoch": 0.9491112734355978,
+      "grad_norm": 1.0703125,
+      "learning_rate": 1.3535433801404317e-05,
+      "loss": 0.5143,
+      "step": 13643
+    },
+    {
+      "epoch": 0.9491808410727329,
+      "grad_norm": 1.203125,
+      "learning_rate": 1.3498509964146366e-05,
+      "loss": 0.6741,
+      "step": 13644
+    },
+    {
+      "epoch": 0.9492504087098682,
+      "grad_norm": 1.1328125,
+      "learning_rate": 1.3461636216461904e-05,
+      "loss": 0.7088,
+      "step": 13645
+    },
+    {
+      "epoch": 0.9493199763470034,
+      "grad_norm": 1.265625,
+      "learning_rate": 1.3424812560222987e-05,
+      "loss": 0.877,
+      "step": 13646
+    },
+    {
+      "epoch": 0.9493895439841386,
+      "grad_norm": 1.1796875,
+      "learning_rate": 1.3388038997299235e-05,
+      "loss": 0.8244,
+      "step": 13647
+    },
+    {
+      "epoch": 0.9494591116212738,
+      "grad_norm": 1.1875,
+      "learning_rate": 1.335131552955815e-05,
+      "loss": 0.7707,
+      "step": 13648
+    },
+    {
+      "epoch": 0.949528679258409,
+      "grad_norm": 0.9609375,
+      "learning_rate": 1.3314642158864132e-05,
+      "loss": 0.7041,
+      "step": 13649
+    },
+    {
+      "epoch": 0.9495982468955442,
+      "grad_norm": 1.0859375,
+      "learning_rate": 1.3278018887079247e-05,
+      "loss": 0.751,
+      "step": 13650
+    },
+    {
+      "epoch": 0.9496678145326793,
+      "grad_norm": 1.328125,
+      "learning_rate": 1.3241445716063227e-05,
+      "loss": 0.8735,
+      "step": 13651
+    },
+    {
+      "epoch": 0.9497373821698146,
+      "grad_norm": 0.8828125,
+      "learning_rate": 1.3204922647672813e-05,
+      "loss": 0.6172,
+      "step": 13652
+    },
+    {
+      "epoch": 0.9498069498069498,
+      "grad_norm": 1.1953125,
+      "learning_rate": 1.316844968376274e-05,
+      "loss": 0.9435,
+      "step": 13653
+    },
+    {
+      "epoch": 0.949876517444085,
+      "grad_norm": 0.9296875,
+      "learning_rate": 1.3132026826184751e-05,
+      "loss": 0.6585,
+      "step": 13654
+    },
+    {
+      "epoch": 0.9499460850812202,
+      "grad_norm": 1.375,
+      "learning_rate": 1.3095654076788254e-05,
+      "loss": 0.9507,
+      "step": 13655
+    },
+    {
+      "epoch": 0.9500156527183554,
+      "grad_norm": 1.109375,
+      "learning_rate": 1.3059331437420108e-05,
+      "loss": 0.8002,
+      "step": 13656
+    },
+    {
+      "epoch": 0.9500852203554906,
+      "grad_norm": 1.390625,
+      "learning_rate": 1.30230589099245e-05,
+      "loss": 0.8445,
+      "step": 13657
+    },
+    {
+      "epoch": 0.9501547879926259,
+      "grad_norm": 1.171875,
+      "learning_rate": 1.2986836496143295e-05,
+      "loss": 0.8644,
+      "step": 13658
+    },
+    {
+      "epoch": 0.950224355629761,
+      "grad_norm": 1.6015625,
+      "learning_rate": 1.2950664197915573e-05,
+      "loss": 0.6882,
+      "step": 13659
+    },
+    {
+      "epoch": 0.9502939232668962,
+      "grad_norm": 0.96875,
+      "learning_rate": 1.291454201707809e-05,
+      "loss": 0.7682,
+      "step": 13660
+    },
+    {
+      "epoch": 0.9503634909040315,
+      "grad_norm": 1.21875,
+      "learning_rate": 1.2878469955464712e-05,
+      "loss": 0.8046,
+      "step": 13661
+    },
+    {
+      "epoch": 0.9504330585411667,
+      "grad_norm": 1.28125,
+      "learning_rate": 1.2842448014907304e-05,
+      "loss": 0.7754,
+      "step": 13662
+    },
+    {
+      "epoch": 0.9505026261783018,
+      "grad_norm": 1.3359375,
+      "learning_rate": 1.280647619723474e-05,
+      "loss": 0.7069,
+      "step": 13663
+    },
+    {
+      "epoch": 0.950572193815437,
+      "grad_norm": 1.1640625,
+      "learning_rate": 1.2770554504273557e-05,
+      "loss": 0.9939,
+      "step": 13664
+    },
+    {
+      "epoch": 0.9506417614525723,
+      "grad_norm": 0.8671875,
+      "learning_rate": 1.273468293784752e-05,
+      "loss": 0.7638,
+      "step": 13665
+    },
+    {
+      "epoch": 0.9507113290897075,
+      "grad_norm": 1.1484375,
+      "learning_rate": 1.2698861499778058e-05,
+      "loss": 0.8706,
+      "step": 13666
+    },
+    {
+      "epoch": 0.9507808967268426,
+      "grad_norm": 1.2109375,
+      "learning_rate": 1.2663090191884164e-05,
+      "loss": 0.8316,
+      "step": 13667
+    },
+    {
+      "epoch": 0.9508504643639779,
+      "grad_norm": 1.203125,
+      "learning_rate": 1.2627369015981827e-05,
+      "loss": 0.9694,
+      "step": 13668
+    },
+    {
+      "epoch": 0.9509200320011131,
+      "grad_norm": 1.4140625,
+      "learning_rate": 1.2591697973885152e-05,
+      "loss": 0.9504,
+      "step": 13669
+    },
+    {
+      "epoch": 0.9509895996382483,
+      "grad_norm": 1.265625,
+      "learning_rate": 1.2556077067405026e-05,
+      "loss": 0.8707,
+      "step": 13670
+    },
+    {
+      "epoch": 0.9510591672753835,
+      "grad_norm": 1.375,
+      "learning_rate": 1.2520506298350332e-05,
+      "loss": 0.8861,
+      "step": 13671
+    },
+    {
+      "epoch": 0.9511287349125187,
+      "grad_norm": 0.9296875,
+      "learning_rate": 1.248498566852696e-05,
+      "loss": 0.5927,
+      "step": 13672
+    },
+    {
+      "epoch": 0.9511983025496539,
+      "grad_norm": 0.8515625,
+      "learning_rate": 1.244951517973858e-05,
+      "loss": 0.6532,
+      "step": 13673
+    },
+    {
+      "epoch": 0.9512678701867892,
+      "grad_norm": 1.15625,
+      "learning_rate": 1.2414094833786194e-05,
+      "loss": 0.7398,
+      "step": 13674
+    },
+    {
+      "epoch": 0.9513374378239243,
+      "grad_norm": 1.171875,
+      "learning_rate": 1.2378724632468253e-05,
+      "loss": 0.9259,
+      "step": 13675
+    },
+    {
+      "epoch": 0.9514070054610595,
+      "grad_norm": 1.0859375,
+      "learning_rate": 1.2343404577580764e-05,
+      "loss": 0.9632,
+      "step": 13676
+    },
+    {
+      "epoch": 0.9514765730981947,
+      "grad_norm": 1.03125,
+      "learning_rate": 1.230813467091707e-05,
+      "loss": 0.6046,
+      "step": 13677
+    },
+    {
+      "epoch": 0.95154614073533,
+      "grad_norm": 1.046875,
+      "learning_rate": 1.2272914914267963e-05,
+      "loss": 0.6549,
+      "step": 13678
+    },
+    {
+      "epoch": 0.9516157083724651,
+      "grad_norm": 1.1328125,
+      "learning_rate": 1.2237745309421567e-05,
+      "loss": 0.6816,
+      "step": 13679
+    },
+    {
+      "epoch": 0.9516852760096003,
+      "grad_norm": 1.8046875,
+      "learning_rate": 1.2202625858163896e-05,
+      "loss": 0.8623,
+      "step": 13680
+    },
+    {
+      "epoch": 0.9517548436467356,
+      "grad_norm": 1.09375,
+      "learning_rate": 1.216755656227797e-05,
+      "loss": 0.7623,
+      "step": 13681
+    },
+    {
+      "epoch": 0.9518244112838707,
+      "grad_norm": 1.546875,
+      "learning_rate": 1.2132537423544476e-05,
+      "loss": 1.0027,
+      "step": 13682
+    },
+    {
+      "epoch": 0.9518939789210059,
+      "grad_norm": 1.203125,
+      "learning_rate": 1.2097568443741547e-05,
+      "loss": 0.8778,
+      "step": 13683
+    },
+    {
+      "epoch": 0.9519635465581412,
+      "grad_norm": 0.93359375,
+      "learning_rate": 1.206264962464465e-05,
+      "loss": 0.8399,
+      "step": 13684
+    },
+    {
+      "epoch": 0.9520331141952764,
+      "grad_norm": 1.0546875,
+      "learning_rate": 1.2027780968026925e-05,
+      "loss": 0.8131,
+      "step": 13685
+    },
+    {
+      "epoch": 0.9521026818324115,
+      "grad_norm": 1.1953125,
+      "learning_rate": 1.199296247565862e-05,
+      "loss": 0.719,
+      "step": 13686
+    },
+    {
+      "epoch": 0.9521722494695468,
+      "grad_norm": 0.890625,
+      "learning_rate": 1.1958194149307767e-05,
+      "loss": 0.7673,
+      "step": 13687
+    },
+    {
+      "epoch": 0.952241817106682,
+      "grad_norm": 0.9609375,
+      "learning_rate": 1.1923475990739729e-05,
+      "loss": 0.5885,
+      "step": 13688
+    },
+    {
+      "epoch": 0.9523113847438172,
+      "grad_norm": 1.3203125,
+      "learning_rate": 1.1888808001717321e-05,
+      "loss": 0.7645,
+      "step": 13689
+    },
+    {
+      "epoch": 0.9523809523809523,
+      "grad_norm": 0.96484375,
+      "learning_rate": 1.1854190184000801e-05,
+      "loss": 0.6647,
+      "step": 13690
+    },
+    {
+      "epoch": 0.9524505200180876,
+      "grad_norm": 1.625,
+      "learning_rate": 1.1819622539347985e-05,
+      "loss": 0.9428,
+      "step": 13691
+    },
+    {
+      "epoch": 0.9525200876552228,
+      "grad_norm": 1.0,
+      "learning_rate": 1.1785105069513802e-05,
+      "loss": 0.7621,
+      "step": 13692
+    },
+    {
+      "epoch": 0.952589655292358,
+      "grad_norm": 0.81640625,
+      "learning_rate": 1.1750637776250961e-05,
+      "loss": 0.439,
+      "step": 13693
+    },
+    {
+      "epoch": 0.9526592229294932,
+      "grad_norm": 1.2578125,
+      "learning_rate": 1.171622066130973e-05,
+      "loss": 0.8382,
+      "step": 13694
+    },
+    {
+      "epoch": 0.9527287905666284,
+      "grad_norm": 1.1484375,
+      "learning_rate": 1.1681853726437376e-05,
+      "loss": 0.6706,
+      "step": 13695
+    },
+    {
+      "epoch": 0.9527983582037636,
+      "grad_norm": 1.1875,
+      "learning_rate": 1.1647536973379058e-05,
+      "loss": 0.7817,
+      "step": 13696
+    },
+    {
+      "epoch": 0.9528679258408989,
+      "grad_norm": 1.46875,
+      "learning_rate": 1.1613270403877163e-05,
+      "loss": 1.0401,
+      "step": 13697
+    },
+    {
+      "epoch": 0.952937493478034,
+      "grad_norm": 1.109375,
+      "learning_rate": 1.157905401967152e-05,
+      "loss": 0.9601,
+      "step": 13698
+    },
+    {
+      "epoch": 0.9530070611151692,
+      "grad_norm": 1.046875,
+      "learning_rate": 1.1544887822499517e-05,
+      "loss": 0.8719,
+      "step": 13699
+    },
+    {
+      "epoch": 0.9530766287523045,
+      "grad_norm": 1.1171875,
+      "learning_rate": 1.1510771814095989e-05,
+      "loss": 0.6628,
+      "step": 13700
+    },
+    {
+      "epoch": 0.9531461963894396,
+      "grad_norm": 0.95703125,
+      "learning_rate": 1.1476705996192993e-05,
+      "loss": 0.7796,
+      "step": 13701
+    },
+    {
+      "epoch": 0.9532157640265748,
+      "grad_norm": 1.2578125,
+      "learning_rate": 1.144269037052037e-05,
+      "loss": 0.9382,
+      "step": 13702
+    },
+    {
+      "epoch": 0.95328533166371,
+      "grad_norm": 1.046875,
+      "learning_rate": 1.1408724938805293e-05,
+      "loss": 0.927,
+      "step": 13703
+    },
+    {
+      "epoch": 0.9533548993008453,
+      "grad_norm": 0.8671875,
+      "learning_rate": 1.137480970277227e-05,
+      "loss": 0.7409,
+      "step": 13704
+    },
+    {
+      "epoch": 0.9534244669379804,
+      "grad_norm": 0.8515625,
+      "learning_rate": 1.1340944664143371e-05,
+      "loss": 0.5701,
+      "step": 13705
+    },
+    {
+      "epoch": 0.9534940345751156,
+      "grad_norm": 1.234375,
+      "learning_rate": 1.1307129824638108e-05,
+      "loss": 0.8807,
+      "step": 13706
+    },
+    {
+      "epoch": 0.9535636022122509,
+      "grad_norm": 0.8671875,
+      "learning_rate": 1.1273365185973328e-05,
+      "loss": 0.7222,
+      "step": 13707
+    },
+    {
+      "epoch": 0.9536331698493861,
+      "grad_norm": 1.0078125,
+      "learning_rate": 1.1239650749863662e-05,
+      "loss": 0.7902,
+      "step": 13708
+    },
+    {
+      "epoch": 0.9537027374865212,
+      "grad_norm": 1.1328125,
+      "learning_rate": 1.1205986518020738e-05,
+      "loss": 0.8234,
+      "step": 13709
+    },
+    {
+      "epoch": 0.9537723051236565,
+      "grad_norm": 1.2109375,
+      "learning_rate": 1.1172372492153859e-05,
+      "loss": 0.7902,
+      "step": 13710
+    },
+    {
+      "epoch": 0.9538418727607917,
+      "grad_norm": 0.8671875,
+      "learning_rate": 1.113880867396988e-05,
+      "loss": 0.6262,
+      "step": 13711
+    },
+    {
+      "epoch": 0.9539114403979269,
+      "grad_norm": 1.1171875,
+      "learning_rate": 1.1105295065172993e-05,
+      "loss": 0.7055,
+      "step": 13712
+    },
+    {
+      "epoch": 0.9539810080350621,
+      "grad_norm": 1.0859375,
+      "learning_rate": 1.1071831667464838e-05,
+      "loss": 0.7361,
+      "step": 13713
+    },
+    {
+      "epoch": 0.9540505756721973,
+      "grad_norm": 1.421875,
+      "learning_rate": 1.1038418482544387e-05,
+      "loss": 0.9626,
+      "step": 13714
+    },
+    {
+      "epoch": 0.9541201433093325,
+      "grad_norm": 1.2890625,
+      "learning_rate": 1.1005055512108508e-05,
+      "loss": 1.0645,
+      "step": 13715
+    },
+    {
+      "epoch": 0.9541897109464677,
+      "grad_norm": 1.390625,
+      "learning_rate": 1.0971742757850844e-05,
+      "loss": 0.8561,
+      "step": 13716
+    },
+    {
+      "epoch": 0.9542592785836029,
+      "grad_norm": 1.0859375,
+      "learning_rate": 1.0938480221463155e-05,
+      "loss": 0.8213,
+      "step": 13717
+    },
+    {
+      "epoch": 0.9543288462207381,
+      "grad_norm": 1.078125,
+      "learning_rate": 1.0905267904633975e-05,
+      "loss": 0.7922,
+      "step": 13718
+    },
+    {
+      "epoch": 0.9543984138578733,
+      "grad_norm": 1.03125,
+      "learning_rate": 1.087210580905007e-05,
+      "loss": 0.7726,
+      "step": 13719
+    },
+    {
+      "epoch": 0.9544679814950086,
+      "grad_norm": 1.078125,
+      "learning_rate": 1.083899393639498e-05,
+      "loss": 0.6426,
+      "step": 13720
+    },
+    {
+      "epoch": 0.9545375491321437,
+      "grad_norm": 1.59375,
+      "learning_rate": 1.0805932288350029e-05,
+      "loss": 0.9605,
+      "step": 13721
+    },
+    {
+      "epoch": 0.9546071167692789,
+      "grad_norm": 1.578125,
+      "learning_rate": 1.0772920866593983e-05,
+      "loss": 0.7406,
+      "step": 13722
+    },
+    {
+      "epoch": 0.9546766844064142,
+      "grad_norm": 1.4296875,
+      "learning_rate": 1.0739959672803057e-05,
+      "loss": 1.0407,
+      "step": 13723
+    },
+    {
+      "epoch": 0.9547462520435493,
+      "grad_norm": 1.015625,
+      "learning_rate": 1.070704870865058e-05,
+      "loss": 0.8359,
+      "step": 13724
+    },
+    {
+      "epoch": 0.9548158196806845,
+      "grad_norm": 1.1328125,
+      "learning_rate": 1.0674187975807659e-05,
+      "loss": 0.8006,
+      "step": 13725
+    },
+    {
+      "epoch": 0.9548853873178198,
+      "grad_norm": 1.4140625,
+      "learning_rate": 1.064137747594307e-05,
+      "loss": 1.2597,
+      "step": 13726
+    },
+    {
+      "epoch": 0.954954954954955,
+      "grad_norm": 1.015625,
+      "learning_rate": 1.0608617210722594e-05,
+      "loss": 0.8139,
+      "step": 13727
+    },
+    {
+      "epoch": 0.9550245225920901,
+      "grad_norm": 1.0078125,
+      "learning_rate": 1.0575907181809563e-05,
+      "loss": 0.667,
+      "step": 13728
+    },
+    {
+      "epoch": 0.9550940902292253,
+      "grad_norm": 1.09375,
+      "learning_rate": 1.0543247390864984e-05,
+      "loss": 0.7991,
+      "step": 13729
+    },
+    {
+      "epoch": 0.9551636578663606,
+      "grad_norm": 1.09375,
+      "learning_rate": 1.0510637839546977e-05,
+      "loss": 0.9141,
+      "step": 13730
+    },
+    {
+      "epoch": 0.9552332255034958,
+      "grad_norm": 1.265625,
+      "learning_rate": 1.0478078529511436e-05,
+      "loss": 0.9213,
+      "step": 13731
+    },
+    {
+      "epoch": 0.9553027931406309,
+      "grad_norm": 1.1171875,
+      "learning_rate": 1.0445569462411487e-05,
+      "loss": 0.9326,
+      "step": 13732
+    },
+    {
+      "epoch": 0.9553723607777662,
+      "grad_norm": 1.109375,
+      "learning_rate": 1.0413110639897916e-05,
+      "loss": 0.8153,
+      "step": 13733
+    },
+    {
+      "epoch": 0.9554419284149014,
+      "grad_norm": 1.1875,
+      "learning_rate": 1.038070206361852e-05,
+      "loss": 0.7108,
+      "step": 13734
+    },
+    {
+      "epoch": 0.9555114960520366,
+      "grad_norm": 0.8125,
+      "learning_rate": 1.034834373521909e-05,
+      "loss": 0.5579,
+      "step": 13735
+    },
+    {
+      "epoch": 0.9555810636891718,
+      "grad_norm": 0.9453125,
+      "learning_rate": 1.0316035656342537e-05,
+      "loss": 0.8924,
+      "step": 13736
+    },
+    {
+      "epoch": 0.955650631326307,
+      "grad_norm": 1.3125,
+      "learning_rate": 1.0283777828629437e-05,
+      "loss": 0.8648,
+      "step": 13737
+    },
+    {
+      "epoch": 0.9557201989634422,
+      "grad_norm": 0.8515625,
+      "learning_rate": 1.0251570253717369e-05,
+      "loss": 0.6623,
+      "step": 13738
+    },
+    {
+      "epoch": 0.9557897666005775,
+      "grad_norm": 0.90234375,
+      "learning_rate": 1.0219412933241911e-05,
+      "loss": 0.5868,
+      "step": 13739
+    },
+    {
+      "epoch": 0.9558593342377126,
+      "grad_norm": 1.0546875,
+      "learning_rate": 1.0187305868835872e-05,
+      "loss": 0.7931,
+      "step": 13740
+    },
+    {
+      "epoch": 0.9559289018748478,
+      "grad_norm": 1.046875,
+      "learning_rate": 1.015524906212939e-05,
+      "loss": 0.8052,
+      "step": 13741
+    },
+    {
+      "epoch": 0.955998469511983,
+      "grad_norm": 1.0078125,
+      "learning_rate": 1.0123242514750163e-05,
+      "loss": 0.7964,
+      "step": 13742
+    },
+    {
+      "epoch": 0.9560680371491183,
+      "grad_norm": 1.1484375,
+      "learning_rate": 1.0091286228323338e-05,
+      "loss": 0.9343,
+      "step": 13743
+    },
+    {
+      "epoch": 0.9561376047862534,
+      "grad_norm": 1.09375,
+      "learning_rate": 1.0059380204471503e-05,
+      "loss": 0.816,
+      "step": 13744
+    },
+    {
+      "epoch": 0.9562071724233886,
+      "grad_norm": 1.1953125,
+      "learning_rate": 1.0027524444814694e-05,
+      "loss": 0.917,
+      "step": 13745
+    },
+    {
+      "epoch": 0.9562767400605239,
+      "grad_norm": 1.015625,
+      "learning_rate": 9.995718950970289e-06,
+      "loss": 0.5941,
+      "step": 13746
+    },
+    {
+      "epoch": 0.956346307697659,
+      "grad_norm": 1.265625,
+      "learning_rate": 9.963963724553327e-06,
+      "loss": 0.8269,
+      "step": 13747
+    },
+    {
+      "epoch": 0.9564158753347942,
+      "grad_norm": 2.015625,
+      "learning_rate": 9.932258767176072e-06,
+      "loss": 0.8375,
+      "step": 13748
+    },
+    {
+      "epoch": 0.9564854429719295,
+      "grad_norm": 0.99609375,
+      "learning_rate": 9.900604080448461e-06,
+      "loss": 0.6939,
+      "step": 13749
+    },
+    {
+      "epoch": 0.9565550106090647,
+      "grad_norm": 1.328125,
+      "learning_rate": 9.868999665977763e-06,
+      "loss": 0.8623,
+      "step": 13750
+    },
+    {
+      "epoch": 0.9566245782461998,
+      "grad_norm": 1.0703125,
+      "learning_rate": 9.837445525368582e-06,
+      "loss": 1.0773,
+      "step": 13751
+    },
+    {
+      "epoch": 0.9566941458833351,
+      "grad_norm": 1.09375,
+      "learning_rate": 9.805941660223083e-06,
+      "loss": 0.8229,
+      "step": 13752
+    },
+    {
+      "epoch": 0.9567637135204703,
+      "grad_norm": 1.078125,
+      "learning_rate": 9.774488072140874e-06,
+      "loss": 0.7507,
+      "step": 13753
+    },
+    {
+      "epoch": 0.9568332811576055,
+      "grad_norm": 1.4765625,
+      "learning_rate": 9.743084762719235e-06,
+      "loss": 1.0015,
+      "step": 13754
+    },
+    {
+      "epoch": 0.9569028487947406,
+      "grad_norm": 1.3515625,
+      "learning_rate": 9.711731733552442e-06,
+      "loss": 0.9465,
+      "step": 13755
+    },
+    {
+      "epoch": 0.9569724164318759,
+      "grad_norm": 1.1875,
+      "learning_rate": 9.680428986232337e-06,
+      "loss": 0.9657,
+      "step": 13756
+    },
+    {
+      "epoch": 0.9570419840690111,
+      "grad_norm": 1.0625,
+      "learning_rate": 9.649176522348535e-06,
+      "loss": 0.9344,
+      "step": 13757
+    },
+    {
+      "epoch": 0.9571115517061463,
+      "grad_norm": 1.1953125,
+      "learning_rate": 9.617974343487878e-06,
+      "loss": 0.7727,
+      "step": 13758
+    },
+    {
+      "epoch": 0.9571811193432815,
+      "grad_norm": 1.2265625,
+      "learning_rate": 9.586822451234546e-06,
+      "loss": 0.7814,
+      "step": 13759
+    },
+    {
+      "epoch": 0.9572506869804167,
+      "grad_norm": 1.0625,
+      "learning_rate": 9.555720847170379e-06,
+      "loss": 0.5553,
+      "step": 13760
+    },
+    {
+      "epoch": 0.9573202546175519,
+      "grad_norm": 1.46875,
+      "learning_rate": 9.524669532874452e-06,
+      "loss": 0.8443,
+      "step": 13761
+    },
+    {
+      "epoch": 0.9573898222546872,
+      "grad_norm": 1.109375,
+      "learning_rate": 9.493668509923392e-06,
+      "loss": 1.0941,
+      "step": 13762
+    },
+    {
+      "epoch": 0.9574593898918223,
+      "grad_norm": 0.890625,
+      "learning_rate": 9.462717779891273e-06,
+      "loss": 0.6499,
+      "step": 13763
+    },
+    {
+      "epoch": 0.9575289575289575,
+      "grad_norm": 1.1015625,
+      "learning_rate": 9.431817344349835e-06,
+      "loss": 0.7202,
+      "step": 13764
+    },
+    {
+      "epoch": 0.9575985251660928,
+      "grad_norm": 1.015625,
+      "learning_rate": 9.400967204867827e-06,
+      "loss": 0.7296,
+      "step": 13765
+    },
+    {
+      "epoch": 0.957668092803228,
+      "grad_norm": 0.8984375,
+      "learning_rate": 9.370167363011662e-06,
+      "loss": 0.7224,
+      "step": 13766
+    },
+    {
+      "epoch": 0.9577376604403631,
+      "grad_norm": 1.4921875,
+      "learning_rate": 9.339417820345198e-06,
+      "loss": 0.8337,
+      "step": 13767
+    },
+    {
+      "epoch": 0.9578072280774983,
+      "grad_norm": 1.1875,
+      "learning_rate": 9.308718578429964e-06,
+      "loss": 1.0526,
+      "step": 13768
+    },
+    {
+      "epoch": 0.9578767957146336,
+      "grad_norm": 1.078125,
+      "learning_rate": 9.278069638824494e-06,
+      "loss": 0.7711,
+      "step": 13769
+    },
+    {
+      "epoch": 0.9579463633517687,
+      "grad_norm": 1.015625,
+      "learning_rate": 9.247471003084984e-06,
+      "loss": 0.5816,
+      "step": 13770
+    },
+    {
+      "epoch": 0.9580159309889039,
+      "grad_norm": 1.078125,
+      "learning_rate": 9.216922672765082e-06,
+      "loss": 0.752,
+      "step": 13771
+    },
+    {
+      "epoch": 0.9580854986260392,
+      "grad_norm": 1.0859375,
+      "learning_rate": 9.186424649416103e-06,
+      "loss": 0.8951,
+      "step": 13772
+    },
+    {
+      "epoch": 0.9581550662631744,
+      "grad_norm": 1.109375,
+      "learning_rate": 9.155976934586251e-06,
+      "loss": 0.7709,
+      "step": 13773
+    },
+    {
+      "epoch": 0.9582246339003095,
+      "grad_norm": 1.0390625,
+      "learning_rate": 9.125579529821736e-06,
+      "loss": 0.7626,
+      "step": 13774
+    },
+    {
+      "epoch": 0.9582942015374448,
+      "grad_norm": 1.390625,
+      "learning_rate": 9.09523243666599e-06,
+      "loss": 0.7321,
+      "step": 13775
+    },
+    {
+      "epoch": 0.95836376917458,
+      "grad_norm": 1.0625,
+      "learning_rate": 9.064935656659668e-06,
+      "loss": 0.763,
+      "step": 13776
+    },
+    {
+      "epoch": 0.9584333368117152,
+      "grad_norm": 1.0703125,
+      "learning_rate": 9.034689191341206e-06,
+      "loss": 0.59,
+      "step": 13777
+    },
+    {
+      "epoch": 0.9585029044488504,
+      "grad_norm": 1.03125,
+      "learning_rate": 9.004493042246487e-06,
+      "loss": 0.7677,
+      "step": 13778
+    },
+    {
+      "epoch": 0.9585724720859856,
+      "grad_norm": 1.09375,
+      "learning_rate": 8.974347210908729e-06,
+      "loss": 0.9117,
+      "step": 13779
+    },
+    {
+      "epoch": 0.9586420397231208,
+      "grad_norm": 1.2109375,
+      "learning_rate": 8.944251698858263e-06,
+      "loss": 0.752,
+      "step": 13780
+    },
+    {
+      "epoch": 0.958711607360256,
+      "grad_norm": 1.5703125,
+      "learning_rate": 8.914206507623535e-06,
+      "loss": 0.6105,
+      "step": 13781
+    },
+    {
+      "epoch": 0.9587811749973912,
+      "grad_norm": 1.203125,
+      "learning_rate": 8.884211638729877e-06,
+      "loss": 0.7613,
+      "step": 13782
+    },
+    {
+      "epoch": 0.9588507426345264,
+      "grad_norm": 1.1484375,
+      "learning_rate": 8.854267093700518e-06,
+      "loss": 0.9493,
+      "step": 13783
+    },
+    {
+      "epoch": 0.9589203102716616,
+      "grad_norm": 1.1875,
+      "learning_rate": 8.824372874055575e-06,
+      "loss": 0.7822,
+      "step": 13784
+    },
+    {
+      "epoch": 0.9589898779087969,
+      "grad_norm": 1.34375,
+      "learning_rate": 8.794528981313055e-06,
+      "loss": 1.0353,
+      "step": 13785
+    },
+    {
+      "epoch": 0.959059445545932,
+      "grad_norm": 1.3515625,
+      "learning_rate": 8.764735416988413e-06,
+      "loss": 1.0167,
+      "step": 13786
+    },
+    {
+      "epoch": 0.9591290131830672,
+      "grad_norm": 1.109375,
+      "learning_rate": 8.734992182594325e-06,
+      "loss": 0.9073,
+      "step": 13787
+    },
+    {
+      "epoch": 0.9591985808202025,
+      "grad_norm": 1.328125,
+      "learning_rate": 8.705299279640921e-06,
+      "loss": 0.8753,
+      "step": 13788
+    },
+    {
+      "epoch": 0.9592681484573377,
+      "grad_norm": 1.4453125,
+      "learning_rate": 8.675656709635882e-06,
+      "loss": 0.8206,
+      "step": 13789
+    },
+    {
+      "epoch": 0.9593377160944728,
+      "grad_norm": 1.3671875,
+      "learning_rate": 8.646064474084447e-06,
+      "loss": 1.1265,
+      "step": 13790
+    },
+    {
+      "epoch": 0.9594072837316081,
+      "grad_norm": 1.0390625,
+      "learning_rate": 8.616522574489083e-06,
+      "loss": 0.7658,
+      "step": 13791
+    },
+    {
+      "epoch": 0.9594768513687433,
+      "grad_norm": 1.1875,
+      "learning_rate": 8.5870310123497e-06,
+      "loss": 0.8042,
+      "step": 13792
+    },
+    {
+      "epoch": 0.9595464190058784,
+      "grad_norm": 0.9296875,
+      "learning_rate": 8.557589789163767e-06,
+      "loss": 0.6322,
+      "step": 13793
+    },
+    {
+      "epoch": 0.9596159866430136,
+      "grad_norm": 0.93359375,
+      "learning_rate": 8.528198906426198e-06,
+      "loss": 0.9129,
+      "step": 13794
+    },
+    {
+      "epoch": 0.9596855542801489,
+      "grad_norm": 0.87109375,
+      "learning_rate": 8.498858365629359e-06,
+      "loss": 0.5803,
+      "step": 13795
+    },
+    {
+      "epoch": 0.9597551219172841,
+      "grad_norm": 0.98828125,
+      "learning_rate": 8.469568168262943e-06,
+      "loss": 0.7071,
+      "step": 13796
+    },
+    {
+      "epoch": 0.9598246895544192,
+      "grad_norm": 1.0390625,
+      "learning_rate": 8.440328315814094e-06,
+      "loss": 0.928,
+      "step": 13797
+    },
+    {
+      "epoch": 0.9598942571915545,
+      "grad_norm": 0.94921875,
+      "learning_rate": 8.411138809767626e-06,
+      "loss": 0.55,
+      "step": 13798
+    },
+    {
+      "epoch": 0.9599638248286897,
+      "grad_norm": 1.0078125,
+      "learning_rate": 8.381999651605466e-06,
+      "loss": 0.8848,
+      "step": 13799
+    },
+    {
+      "epoch": 0.9600333924658249,
+      "grad_norm": 1.171875,
+      "learning_rate": 8.352910842807315e-06,
+      "loss": 0.9132,
+      "step": 13800
+    },
+    {
+      "epoch": 0.9601029601029601,
+      "grad_norm": 1.2109375,
+      "learning_rate": 8.323872384850106e-06,
+      "loss": 0.7374,
+      "step": 13801
+    },
+    {
+      "epoch": 0.9601725277400953,
+      "grad_norm": 1.5234375,
+      "learning_rate": 8.294884279208104e-06,
+      "loss": 1.0615,
+      "step": 13802
+    },
+    {
+      "epoch": 0.9602420953772305,
+      "grad_norm": 1.15625,
+      "learning_rate": 8.265946527353462e-06,
+      "loss": 0.7314,
+      "step": 13803
+    },
+    {
+      "epoch": 0.9603116630143658,
+      "grad_norm": 1.203125,
+      "learning_rate": 8.237059130755232e-06,
+      "loss": 0.5477,
+      "step": 13804
+    },
+    {
+      "epoch": 0.9603812306515009,
+      "grad_norm": 1.1796875,
+      "learning_rate": 8.208222090880346e-06,
+      "loss": 0.8335,
+      "step": 13805
+    },
+    {
+      "epoch": 0.9604507982886361,
+      "grad_norm": 1.4609375,
+      "learning_rate": 8.17943540919297e-06,
+      "loss": 0.7856,
+      "step": 13806
+    },
+    {
+      "epoch": 0.9605203659257713,
+      "grad_norm": 0.84765625,
+      "learning_rate": 8.150699087154712e-06,
+      "loss": 0.7394,
+      "step": 13807
+    },
+    {
+      "epoch": 0.9605899335629066,
+      "grad_norm": 1.1640625,
+      "learning_rate": 8.122013126224514e-06,
+      "loss": 0.9102,
+      "step": 13808
+    },
+    {
+      "epoch": 0.9606595012000417,
+      "grad_norm": 1.28125,
+      "learning_rate": 8.093377527859213e-06,
+      "loss": 0.7345,
+      "step": 13809
+    },
+    {
+      "epoch": 0.9607290688371769,
+      "grad_norm": 1.21875,
+      "learning_rate": 8.064792293512535e-06,
+      "loss": 0.8206,
+      "step": 13810
+    },
+    {
+      "epoch": 0.9607986364743122,
+      "grad_norm": 1.1796875,
+      "learning_rate": 8.036257424636096e-06,
+      "loss": 0.9497,
+      "step": 13811
+    },
+    {
+      "epoch": 0.9608682041114474,
+      "grad_norm": 1.0625,
+      "learning_rate": 8.007772922678514e-06,
+      "loss": 0.6294,
+      "step": 13812
+    },
+    {
+      "epoch": 0.9609377717485825,
+      "grad_norm": 1.0546875,
+      "learning_rate": 7.979338789086299e-06,
+      "loss": 0.5506,
+      "step": 13813
+    },
+    {
+      "epoch": 0.9610073393857178,
+      "grad_norm": 0.828125,
+      "learning_rate": 7.950955025303076e-06,
+      "loss": 0.7495,
+      "step": 13814
+    },
+    {
+      "epoch": 0.961076907022853,
+      "grad_norm": 1.125,
+      "learning_rate": 7.922621632770022e-06,
+      "loss": 0.6539,
+      "step": 13815
+    },
+    {
+      "epoch": 0.9611464746599881,
+      "grad_norm": 1.1796875,
+      "learning_rate": 7.894338612925877e-06,
+      "loss": 0.9119,
+      "step": 13816
+    },
+    {
+      "epoch": 0.9612160422971234,
+      "grad_norm": 1.0546875,
+      "learning_rate": 7.866105967206493e-06,
+      "loss": 0.6867,
+      "step": 13817
+    },
+    {
+      "epoch": 0.9612856099342586,
+      "grad_norm": 1.0390625,
+      "learning_rate": 7.837923697045613e-06,
+      "loss": 0.9962,
+      "step": 13818
+    },
+    {
+      "epoch": 0.9613551775713938,
+      "grad_norm": 1.1640625,
+      "learning_rate": 7.8097918038742e-06,
+      "loss": 0.8745,
+      "step": 13819
+    },
+    {
+      "epoch": 0.9614247452085289,
+      "grad_norm": 0.90625,
+      "learning_rate": 7.781710289120447e-06,
+      "loss": 0.9529,
+      "step": 13820
+    },
+    {
+      "epoch": 0.9614943128456642,
+      "grad_norm": 1.21875,
+      "learning_rate": 7.753679154210214e-06,
+      "loss": 0.8506,
+      "step": 13821
+    },
+    {
+      "epoch": 0.9615638804827994,
+      "grad_norm": 0.9765625,
+      "learning_rate": 7.725698400567026e-06,
+      "loss": 0.7563,
+      "step": 13822
+    },
+    {
+      "epoch": 0.9616334481199346,
+      "grad_norm": 1.5234375,
+      "learning_rate": 7.697768029611308e-06,
+      "loss": 1.1066,
+      "step": 13823
+    },
+    {
+      "epoch": 0.9617030157570698,
+      "grad_norm": 0.87890625,
+      "learning_rate": 7.669888042761475e-06,
+      "loss": 0.6832,
+      "step": 13824
+    },
+    {
+      "epoch": 0.961772583394205,
+      "grad_norm": 1.390625,
+      "learning_rate": 7.642058441432953e-06,
+      "loss": 0.8851,
+      "step": 13825
+    },
+    {
+      "epoch": 0.9618421510313402,
+      "grad_norm": 0.8984375,
+      "learning_rate": 7.614279227038834e-06,
+      "loss": 0.7575,
+      "step": 13826
+    },
+    {
+      "epoch": 0.9619117186684755,
+      "grad_norm": 0.921875,
+      "learning_rate": 7.5865504009895445e-06,
+      "loss": 0.6526,
+      "step": 13827
+    },
+    {
+      "epoch": 0.9619812863056106,
+      "grad_norm": 1.1640625,
+      "learning_rate": 7.558871964693181e-06,
+      "loss": 0.741,
+      "step": 13828
+    },
+    {
+      "epoch": 0.9620508539427458,
+      "grad_norm": 1.1484375,
+      "learning_rate": 7.531243919555064e-06,
+      "loss": 0.8455,
+      "step": 13829
+    },
+    {
+      "epoch": 0.9621204215798811,
+      "grad_norm": 1.5390625,
+      "learning_rate": 7.50366626697796e-06,
+      "loss": 0.8973,
+      "step": 13830
+    },
+    {
+      "epoch": 0.9621899892170163,
+      "grad_norm": 0.8984375,
+      "learning_rate": 7.4761390083619706e-06,
+      "loss": 0.6072,
+      "step": 13831
+    },
+    {
+      "epoch": 0.9622595568541514,
+      "grad_norm": 1.1796875,
+      "learning_rate": 7.4486621451052e-06,
+      "loss": 0.6326,
+      "step": 13832
+    },
+    {
+      "epoch": 0.9623291244912866,
+      "grad_norm": 1.0625,
+      "learning_rate": 7.421235678602423e-06,
+      "loss": 0.722,
+      "step": 13833
+    },
+    {
+      "epoch": 0.9623986921284219,
+      "grad_norm": 1.2890625,
+      "learning_rate": 7.3938596102463005e-06,
+      "loss": 0.7974,
+      "step": 13834
+    },
+    {
+      "epoch": 0.962468259765557,
+      "grad_norm": 1.2890625,
+      "learning_rate": 7.366533941426834e-06,
+      "loss": 1.1735,
+      "step": 13835
+    },
+    {
+      "epoch": 0.9625378274026922,
+      "grad_norm": 1.265625,
+      "learning_rate": 7.339258673531579e-06,
+      "loss": 0.6646,
+      "step": 13836
+    },
+    {
+      "epoch": 0.9626073950398275,
+      "grad_norm": 0.97265625,
+      "learning_rate": 7.3120338079454285e-06,
+      "loss": 0.8732,
+      "step": 13837
+    },
+    {
+      "epoch": 0.9626769626769627,
+      "grad_norm": 1.3515625,
+      "learning_rate": 7.28485934605072e-06,
+      "loss": 0.9918,
+      "step": 13838
+    },
+    {
+      "epoch": 0.9627465303140978,
+      "grad_norm": 1.0234375,
+      "learning_rate": 7.25773528922713e-06,
+      "loss": 0.54,
+      "step": 13839
+    },
+    {
+      "epoch": 0.9628160979512331,
+      "grad_norm": 1.140625,
+      "learning_rate": 7.230661638851887e-06,
+      "loss": 0.891,
+      "step": 13840
+    },
+    {
+      "epoch": 0.9628856655883683,
+      "grad_norm": 1.515625,
+      "learning_rate": 7.2036383962997835e-06,
+      "loss": 0.645,
+      "step": 13841
+    },
+    {
+      "epoch": 0.9629552332255035,
+      "grad_norm": 1.296875,
+      "learning_rate": 7.176665562942941e-06,
+      "loss": 0.7792,
+      "step": 13842
+    },
+    {
+      "epoch": 0.9630248008626388,
+      "grad_norm": 0.94921875,
+      "learning_rate": 7.149743140150711e-06,
+      "loss": 0.7466,
+      "step": 13843
+    },
+    {
+      "epoch": 0.9630943684997739,
+      "grad_norm": 1.2421875,
+      "learning_rate": 7.12287112929022e-06,
+      "loss": 0.9052,
+      "step": 13844
+    },
+    {
+      "epoch": 0.9631639361369091,
+      "grad_norm": 1.3671875,
+      "learning_rate": 7.096049531725823e-06,
+      "loss": 0.9388,
+      "step": 13845
+    },
+    {
+      "epoch": 0.9632335037740443,
+      "grad_norm": 1.3046875,
+      "learning_rate": 7.069278348819541e-06,
+      "loss": 0.8991,
+      "step": 13846
+    },
+    {
+      "epoch": 0.9633030714111795,
+      "grad_norm": 1.1875,
+      "learning_rate": 7.042557581930508e-06,
+      "loss": 0.9729,
+      "step": 13847
+    },
+    {
+      "epoch": 0.9633726390483147,
+      "grad_norm": 0.8828125,
+      "learning_rate": 7.015887232415419e-06,
+      "loss": 0.775,
+      "step": 13848
+    },
+    {
+      "epoch": 0.9634422066854499,
+      "grad_norm": 0.8828125,
+      "learning_rate": 6.989267301628632e-06,
+      "loss": 0.5252,
+      "step": 13849
+    },
+    {
+      "epoch": 0.9635117743225852,
+      "grad_norm": 0.953125,
+      "learning_rate": 6.9626977909217346e-06,
+      "loss": 0.7728,
+      "step": 13850
+    },
+    {
+      "epoch": 0.9635813419597203,
+      "grad_norm": 1.15625,
+      "learning_rate": 6.936178701643758e-06,
+      "loss": 0.7419,
+      "step": 13851
+    },
+    {
+      "epoch": 0.9636509095968555,
+      "grad_norm": 1.1796875,
+      "learning_rate": 6.909710035141292e-06,
+      "loss": 1.1057,
+      "step": 13852
+    },
+    {
+      "epoch": 0.9637204772339908,
+      "grad_norm": 1.296875,
+      "learning_rate": 6.883291792758151e-06,
+      "loss": 0.8733,
+      "step": 13853
+    },
+    {
+      "epoch": 0.963790044871126,
+      "grad_norm": 0.9765625,
+      "learning_rate": 6.856923975835705e-06,
+      "loss": 0.9535,
+      "step": 13854
+    },
+    {
+      "epoch": 0.9638596125082611,
+      "grad_norm": 1.3203125,
+      "learning_rate": 6.830606585712884e-06,
+      "loss": 0.9676,
+      "step": 13855
+    },
+    {
+      "epoch": 0.9639291801453964,
+      "grad_norm": 1.0859375,
+      "learning_rate": 6.804339623725842e-06,
+      "loss": 0.7737,
+      "step": 13856
+    },
+    {
+      "epoch": 0.9639987477825316,
+      "grad_norm": 1.0546875,
+      "learning_rate": 6.77812309120851e-06,
+      "loss": 1.0122,
+      "step": 13857
+    },
+    {
+      "epoch": 0.9640683154196668,
+      "grad_norm": 1.109375,
+      "learning_rate": 6.751956989491825e-06,
+      "loss": 0.8278,
+      "step": 13858
+    },
+    {
+      "epoch": 0.9641378830568019,
+      "grad_norm": 1.078125,
+      "learning_rate": 6.72584131990428e-06,
+      "loss": 0.5063,
+      "step": 13859
+    },
+    {
+      "epoch": 0.9642074506939372,
+      "grad_norm": 1.1953125,
+      "learning_rate": 6.699776083772257e-06,
+      "loss": 0.7343,
+      "step": 13860
+    },
+    {
+      "epoch": 0.9642770183310724,
+      "grad_norm": 1.5234375,
+      "learning_rate": 6.673761282418922e-06,
+      "loss": 0.8882,
+      "step": 13861
+    },
+    {
+      "epoch": 0.9643465859682075,
+      "grad_norm": 1.09375,
+      "learning_rate": 6.647796917165216e-06,
+      "loss": 0.6325,
+      "step": 13862
+    },
+    {
+      "epoch": 0.9644161536053428,
+      "grad_norm": 1.0546875,
+      "learning_rate": 6.621882989329531e-06,
+      "loss": 0.6688,
+      "step": 13863
+    },
+    {
+      "epoch": 0.964485721242478,
+      "grad_norm": 1.15625,
+      "learning_rate": 6.59601950022759e-06,
+      "loss": 0.8961,
+      "step": 13864
+    },
+    {
+      "epoch": 0.9645552888796132,
+      "grad_norm": 1.109375,
+      "learning_rate": 6.570206451172789e-06,
+      "loss": 1.0174,
+      "step": 13865
+    },
+    {
+      "epoch": 0.9646248565167485,
+      "grad_norm": 1.28125,
+      "learning_rate": 6.544443843475523e-06,
+      "loss": 0.8975,
+      "step": 13866
+    },
+    {
+      "epoch": 0.9646944241538836,
+      "grad_norm": 0.98046875,
+      "learning_rate": 6.518731678443968e-06,
+      "loss": 0.7626,
+      "step": 13867
+    },
+    {
+      "epoch": 0.9647639917910188,
+      "grad_norm": 1.234375,
+      "learning_rate": 6.493069957383857e-06,
+      "loss": 0.808,
+      "step": 13868
+    },
+    {
+      "epoch": 0.9648335594281541,
+      "grad_norm": 0.8125,
+      "learning_rate": 6.467458681597926e-06,
+      "loss": 0.6525,
+      "step": 13869
+    },
+    {
+      "epoch": 0.9649031270652892,
+      "grad_norm": 1.15625,
+      "learning_rate": 6.441897852386691e-06,
+      "loss": 1.021,
+      "step": 13870
+    },
+    {
+      "epoch": 0.9649726947024244,
+      "grad_norm": 1.0703125,
+      "learning_rate": 6.416387471047891e-06,
+      "loss": 0.7757,
+      "step": 13871
+    },
+    {
+      "epoch": 0.9650422623395596,
+      "grad_norm": 1.0859375,
+      "learning_rate": 6.390927538877045e-06,
+      "loss": 0.7479,
+      "step": 13872
+    },
+    {
+      "epoch": 0.9651118299766949,
+      "grad_norm": 1.03125,
+      "learning_rate": 6.365518057166564e-06,
+      "loss": 0.7607,
+      "step": 13873
+    },
+    {
+      "epoch": 0.96518139761383,
+      "grad_norm": 1.0078125,
+      "learning_rate": 6.340159027206971e-06,
+      "loss": 0.9347,
+      "step": 13874
+    },
+    {
+      "epoch": 0.9652509652509652,
+      "grad_norm": 1.46875,
+      "learning_rate": 6.3148504502855695e-06,
+      "loss": 1.1005,
+      "step": 13875
+    },
+    {
+      "epoch": 0.9653205328881005,
+      "grad_norm": 1.3515625,
+      "learning_rate": 6.289592327687554e-06,
+      "loss": 1.03,
+      "step": 13876
+    },
+    {
+      "epoch": 0.9653901005252357,
+      "grad_norm": 1.125,
+      "learning_rate": 6.264384660695343e-06,
+      "loss": 0.6541,
+      "step": 13877
+    },
+    {
+      "epoch": 0.9654596681623708,
+      "grad_norm": 1.1171875,
+      "learning_rate": 6.239227450588914e-06,
+      "loss": 1.0166,
+      "step": 13878
+    },
+    {
+      "epoch": 0.9655292357995061,
+      "grad_norm": 0.9140625,
+      "learning_rate": 6.214120698645575e-06,
+      "loss": 0.7993,
+      "step": 13879
+    },
+    {
+      "epoch": 0.9655988034366413,
+      "grad_norm": 1.2421875,
+      "learning_rate": 6.189064406140199e-06,
+      "loss": 0.723,
+      "step": 13880
+    },
+    {
+      "epoch": 0.9656683710737765,
+      "grad_norm": 1.2421875,
+      "learning_rate": 6.164058574344766e-06,
+      "loss": 0.5604,
+      "step": 13881
+    },
+    {
+      "epoch": 0.9657379387109117,
+      "grad_norm": 1.25,
+      "learning_rate": 6.139103204529372e-06,
+      "loss": 0.8618,
+      "step": 13882
+    },
+    {
+      "epoch": 0.9658075063480469,
+      "grad_norm": 1.2109375,
+      "learning_rate": 6.114198297960672e-06,
+      "loss": 0.741,
+      "step": 13883
+    },
+    {
+      "epoch": 0.9658770739851821,
+      "grad_norm": 1.078125,
+      "learning_rate": 6.089343855903651e-06,
+      "loss": 0.8718,
+      "step": 13884
+    },
+    {
+      "epoch": 0.9659466416223172,
+      "grad_norm": 1.09375,
+      "learning_rate": 6.064539879619968e-06,
+      "loss": 0.5796,
+      "step": 13885
+    },
+    {
+      "epoch": 0.9660162092594525,
+      "grad_norm": 1.0546875,
+      "learning_rate": 6.03978637036906e-06,
+      "loss": 0.8759,
+      "step": 13886
+    },
+    {
+      "epoch": 0.9660857768965877,
+      "grad_norm": 0.9296875,
+      "learning_rate": 6.015083329407922e-06,
+      "loss": 0.5381,
+      "step": 13887
+    },
+    {
+      "epoch": 0.9661553445337229,
+      "grad_norm": 1.1640625,
+      "learning_rate": 5.990430757990773e-06,
+      "loss": 1.0053,
+      "step": 13888
+    },
+    {
+      "epoch": 0.9662249121708582,
+      "grad_norm": 1.359375,
+      "learning_rate": 5.9658286573694984e-06,
+      "loss": 1.0386,
+      "step": 13889
+    },
+    {
+      "epoch": 0.9662944798079933,
+      "grad_norm": 1.4296875,
+      "learning_rate": 5.941277028792991e-06,
+      "loss": 1.0889,
+      "step": 13890
+    },
+    {
+      "epoch": 0.9663640474451285,
+      "grad_norm": 1.328125,
+      "learning_rate": 5.916775873508029e-06,
+      "loss": 0.8672,
+      "step": 13891
+    },
+    {
+      "epoch": 0.9664336150822638,
+      "grad_norm": 1.1640625,
+      "learning_rate": 5.89232519275873e-06,
+      "loss": 0.7844,
+      "step": 13892
+    },
+    {
+      "epoch": 0.966503182719399,
+      "grad_norm": 1.53125,
+      "learning_rate": 5.867924987786432e-06,
+      "loss": 1.0494,
+      "step": 13893
+    },
+    {
+      "epoch": 0.9665727503565341,
+      "grad_norm": 1.09375,
+      "learning_rate": 5.843575259830036e-06,
+      "loss": 0.5348,
+      "step": 13894
+    },
+    {
+      "epoch": 0.9666423179936694,
+      "grad_norm": 0.93359375,
+      "learning_rate": 5.819276010126107e-06,
+      "loss": 0.7969,
+      "step": 13895
+    },
+    {
+      "epoch": 0.9667118856308046,
+      "grad_norm": 1.3359375,
+      "learning_rate": 5.795027239908213e-06,
+      "loss": 0.6913,
+      "step": 13896
+    },
+    {
+      "epoch": 0.9667814532679397,
+      "grad_norm": 1.0703125,
+      "learning_rate": 5.7708289504077024e-06,
+      "loss": 0.6975,
+      "step": 13897
+    },
+    {
+      "epoch": 0.9668510209050749,
+      "grad_norm": 1.015625,
+      "learning_rate": 5.746681142853149e-06,
+      "loss": 0.6556,
+      "step": 13898
+    },
+    {
+      "epoch": 0.9669205885422102,
+      "grad_norm": 0.94921875,
+      "learning_rate": 5.722583818470795e-06,
+      "loss": 0.7655,
+      "step": 13899
+    },
+    {
+      "epoch": 0.9669901561793454,
+      "grad_norm": 0.94921875,
+      "learning_rate": 5.698536978484104e-06,
+      "loss": 0.6782,
+      "step": 13900
+    },
+    {
+      "epoch": 0.9670597238164805,
+      "grad_norm": 1.265625,
+      "learning_rate": 5.674540624113988e-06,
+      "loss": 0.7805,
+      "step": 13901
+    },
+    {
+      "epoch": 0.9671292914536158,
+      "grad_norm": 1.046875,
+      "learning_rate": 5.650594756579031e-06,
+      "loss": 0.7414,
+      "step": 13902
+    },
+    {
+      "epoch": 0.967198859090751,
+      "grad_norm": 0.99609375,
+      "learning_rate": 5.626699377094924e-06,
+      "loss": 0.8086,
+      "step": 13903
+    },
+    {
+      "epoch": 0.9672684267278862,
+      "grad_norm": 0.98828125,
+      "learning_rate": 5.6028544868749194e-06,
+      "loss": 0.7736,
+      "step": 13904
+    },
+    {
+      "epoch": 0.9673379943650214,
+      "grad_norm": 1.1484375,
+      "learning_rate": 5.579060087129939e-06,
+      "loss": 0.6967,
+      "step": 13905
+    },
+    {
+      "epoch": 0.9674075620021566,
+      "grad_norm": 1.109375,
+      "learning_rate": 5.555316179068015e-06,
+      "loss": 0.9138,
+      "step": 13906
+    },
+    {
+      "epoch": 0.9674771296392918,
+      "grad_norm": 1.1796875,
+      "learning_rate": 5.531622763894739e-06,
+      "loss": 0.837,
+      "step": 13907
+    },
+    {
+      "epoch": 0.9675466972764271,
+      "grad_norm": 0.90234375,
+      "learning_rate": 5.507979842813149e-06,
+      "loss": 0.7755,
+      "step": 13908
+    },
+    {
+      "epoch": 0.9676162649135622,
+      "grad_norm": 1.1953125,
+      "learning_rate": 5.48438741702384e-06,
+      "loss": 0.7594,
+      "step": 13909
+    },
+    {
+      "epoch": 0.9676858325506974,
+      "grad_norm": 1.0859375,
+      "learning_rate": 5.46084548772452e-06,
+      "loss": 0.816,
+      "step": 13910
+    },
+    {
+      "epoch": 0.9677554001878326,
+      "grad_norm": 1.0390625,
+      "learning_rate": 5.437354056110566e-06,
+      "loss": 0.5594,
+      "step": 13911
+    },
+    {
+      "epoch": 0.9678249678249679,
+      "grad_norm": 1.1015625,
+      "learning_rate": 5.413913123374914e-06,
+      "loss": 0.8616,
+      "step": 13912
+    },
+    {
+      "epoch": 0.967894535462103,
+      "grad_norm": 1.1640625,
+      "learning_rate": 5.390522690707611e-06,
+      "loss": 0.7617,
+      "step": 13913
+    },
+    {
+      "epoch": 0.9679641030992382,
+      "grad_norm": 1.3359375,
+      "learning_rate": 5.367182759296374e-06,
+      "loss": 0.8175,
+      "step": 13914
+    },
+    {
+      "epoch": 0.9680336707363735,
+      "grad_norm": 1.1640625,
+      "learning_rate": 5.343893330326255e-06,
+      "loss": 0.7259,
+      "step": 13915
+    },
+    {
+      "epoch": 0.9681032383735086,
+      "grad_norm": 0.9296875,
+      "learning_rate": 5.320654404979863e-06,
+      "loss": 0.6797,
+      "step": 13916
+    },
+    {
+      "epoch": 0.9681728060106438,
+      "grad_norm": 1.4375,
+      "learning_rate": 5.297465984437033e-06,
+      "loss": 0.8994,
+      "step": 13917
+    },
+    {
+      "epoch": 0.9682423736477791,
+      "grad_norm": 1.25,
+      "learning_rate": 5.274328069875156e-06,
+      "loss": 0.8834,
+      "step": 13918
+    },
+    {
+      "epoch": 0.9683119412849143,
+      "grad_norm": 1.1875,
+      "learning_rate": 5.251240662469181e-06,
+      "loss": 0.7321,
+      "step": 13919
+    },
+    {
+      "epoch": 0.9683815089220494,
+      "grad_norm": 1.0625,
+      "learning_rate": 5.228203763391392e-06,
+      "loss": 0.6997,
+      "step": 13920
+    },
+    {
+      "epoch": 0.9684510765591847,
+      "grad_norm": 1.2734375,
+      "learning_rate": 5.2052173738113e-06,
+      "loss": 0.7915,
+      "step": 13921
+    },
+    {
+      "epoch": 0.9685206441963199,
+      "grad_norm": 1.1484375,
+      "learning_rate": 5.18228149489608e-06,
+      "loss": 0.8359,
+      "step": 13922
+    },
+    {
+      "epoch": 0.9685902118334551,
+      "grad_norm": 1.1796875,
+      "learning_rate": 5.1593961278103566e-06,
+      "loss": 0.82,
+      "step": 13923
+    },
+    {
+      "epoch": 0.9686597794705902,
+      "grad_norm": 1.0859375,
+      "learning_rate": 5.136561273716201e-06,
+      "loss": 0.8778,
+      "step": 13924
+    },
+    {
+      "epoch": 0.9687293471077255,
+      "grad_norm": 1.1640625,
+      "learning_rate": 5.113776933772907e-06,
+      "loss": 0.8501,
+      "step": 13925
+    },
+    {
+      "epoch": 0.9687989147448607,
+      "grad_norm": 0.91015625,
+      "learning_rate": 5.09104310913755e-06,
+      "loss": 0.5983,
+      "step": 13926
+    },
+    {
+      "epoch": 0.9688684823819959,
+      "grad_norm": 1.2890625,
+      "learning_rate": 5.068359800964206e-06,
+      "loss": 0.8232,
+      "step": 13927
+    },
+    {
+      "epoch": 0.9689380500191311,
+      "grad_norm": 0.91015625,
+      "learning_rate": 5.045727010404733e-06,
+      "loss": 0.6038,
+      "step": 13928
+    },
+    {
+      "epoch": 0.9690076176562663,
+      "grad_norm": 1.484375,
+      "learning_rate": 5.023144738608321e-06,
+      "loss": 0.6608,
+      "step": 13929
+    },
+    {
+      "epoch": 0.9690771852934015,
+      "grad_norm": 1.0078125,
+      "learning_rate": 5.000612986721498e-06,
+      "loss": 0.6612,
+      "step": 13930
+    },
+    {
+      "epoch": 0.9691467529305368,
+      "grad_norm": 1.0703125,
+      "learning_rate": 4.9781317558884596e-06,
+      "loss": 0.6919,
+      "step": 13931
+    },
+    {
+      "epoch": 0.9692163205676719,
+      "grad_norm": 1.1484375,
+      "learning_rate": 4.955701047250516e-06,
+      "loss": 0.9562,
+      "step": 13932
+    },
+    {
+      "epoch": 0.9692858882048071,
+      "grad_norm": 1.421875,
+      "learning_rate": 4.933320861946866e-06,
+      "loss": 1.0431,
+      "step": 13933
+    },
+    {
+      "epoch": 0.9693554558419424,
+      "grad_norm": 0.9609375,
+      "learning_rate": 4.91099120111349e-06,
+      "loss": 0.7166,
+      "step": 13934
+    },
+    {
+      "epoch": 0.9694250234790776,
+      "grad_norm": 1.2265625,
+      "learning_rate": 4.888712065884482e-06,
+      "loss": 1.1322,
+      "step": 13935
+    },
+    {
+      "epoch": 0.9694945911162127,
+      "grad_norm": 0.97265625,
+      "learning_rate": 4.866483457390825e-06,
+      "loss": 0.738,
+      "step": 13936
+    },
+    {
+      "epoch": 0.9695641587533479,
+      "grad_norm": 1.046875,
+      "learning_rate": 4.844305376761393e-06,
+      "loss": 0.931,
+      "step": 13937
+    },
+    {
+      "epoch": 0.9696337263904832,
+      "grad_norm": 1.0390625,
+      "learning_rate": 4.822177825122176e-06,
+      "loss": 0.8767,
+      "step": 13938
+    },
+    {
+      "epoch": 0.9697032940276183,
+      "grad_norm": 1.125,
+      "learning_rate": 4.800100803596607e-06,
+      "loss": 0.6092,
+      "step": 13939
+    },
+    {
+      "epoch": 0.9697728616647535,
+      "grad_norm": 1.1484375,
+      "learning_rate": 4.778074313305791e-06,
+      "loss": 0.7926,
+      "step": 13940
+    },
+    {
+      "epoch": 0.9698424293018888,
+      "grad_norm": 1.21875,
+      "learning_rate": 4.756098355368055e-06,
+      "loss": 0.9556,
+      "step": 13941
+    },
+    {
+      "epoch": 0.969911996939024,
+      "grad_norm": 1.265625,
+      "learning_rate": 4.734172930899283e-06,
+      "loss": 0.7866,
+      "step": 13942
+    },
+    {
+      "epoch": 0.9699815645761591,
+      "grad_norm": 1.46875,
+      "learning_rate": 4.712298041012697e-06,
+      "loss": 0.9331,
+      "step": 13943
+    },
+    {
+      "epoch": 0.9700511322132944,
+      "grad_norm": 1.1796875,
+      "learning_rate": 4.690473686819075e-06,
+      "loss": 0.9383,
+      "step": 13944
+    },
+    {
+      "epoch": 0.9701206998504296,
+      "grad_norm": 1.0234375,
+      "learning_rate": 4.668699869426308e-06,
+      "loss": 0.6549,
+      "step": 13945
+    },
+    {
+      "epoch": 0.9701902674875648,
+      "grad_norm": 1.078125,
+      "learning_rate": 4.646976589940177e-06,
+      "loss": 0.7424,
+      "step": 13946
+    },
+    {
+      "epoch": 0.9702598351247,
+      "grad_norm": 1.1328125,
+      "learning_rate": 4.625303849463581e-06,
+      "loss": 0.8849,
+      "step": 13947
+    },
+    {
+      "epoch": 0.9703294027618352,
+      "grad_norm": 1.2109375,
+      "learning_rate": 4.6036816490970805e-06,
+      "loss": 0.8294,
+      "step": 13948
+    },
+    {
+      "epoch": 0.9703989703989704,
+      "grad_norm": 1.1484375,
+      "learning_rate": 4.582109989938465e-06,
+      "loss": 1.1237,
+      "step": 13949
+    },
+    {
+      "epoch": 0.9704685380361056,
+      "grad_norm": 1.0546875,
+      "learning_rate": 4.560588873082972e-06,
+      "loss": 0.8116,
+      "step": 13950
+    },
+    {
+      "epoch": 0.9705381056732408,
+      "grad_norm": 1.1015625,
+      "learning_rate": 4.539118299623391e-06,
+      "loss": 0.793,
+      "step": 13951
+    },
+    {
+      "epoch": 0.970607673310376,
+      "grad_norm": 1.0390625,
+      "learning_rate": 4.517698270649961e-06,
+      "loss": 0.8075,
+      "step": 13952
+    },
+    {
+      "epoch": 0.9706772409475112,
+      "grad_norm": 1.15625,
+      "learning_rate": 4.496328787250148e-06,
+      "loss": 0.7519,
+      "step": 13953
+    },
+    {
+      "epoch": 0.9707468085846465,
+      "grad_norm": 1.078125,
+      "learning_rate": 4.4750098505089705e-06,
+      "loss": 0.7596,
+      "step": 13954
+    },
+    {
+      "epoch": 0.9708163762217816,
+      "grad_norm": 1.390625,
+      "learning_rate": 4.45374146150912e-06,
+      "loss": 0.8662,
+      "step": 13955
+    },
+    {
+      "epoch": 0.9708859438589168,
+      "grad_norm": 0.98828125,
+      "learning_rate": 4.4325236213302865e-06,
+      "loss": 0.7271,
+      "step": 13956
+    },
+    {
+      "epoch": 0.9709555114960521,
+      "grad_norm": 1.1328125,
+      "learning_rate": 4.411356331049832e-06,
+      "loss": 1.0329,
+      "step": 13957
+    },
+    {
+      "epoch": 0.9710250791331873,
+      "grad_norm": 1.4375,
+      "learning_rate": 4.390239591742562e-06,
+      "loss": 0.7974,
+      "step": 13958
+    },
+    {
+      "epoch": 0.9710946467703224,
+      "grad_norm": 0.98828125,
+      "learning_rate": 4.369173404480731e-06,
+      "loss": 0.7028,
+      "step": 13959
+    },
+    {
+      "epoch": 0.9711642144074577,
+      "grad_norm": 1.25,
+      "learning_rate": 4.348157770333927e-06,
+      "loss": 0.7796,
+      "step": 13960
+    },
+    {
+      "epoch": 0.9712337820445929,
+      "grad_norm": 1.234375,
+      "learning_rate": 4.327192690369186e-06,
+      "loss": 0.7537,
+      "step": 13961
+    },
+    {
+      "epoch": 0.971303349681728,
+      "grad_norm": 1.2109375,
+      "learning_rate": 4.306278165651101e-06,
+      "loss": 0.9896,
+      "step": 13962
+    },
+    {
+      "epoch": 0.9713729173188632,
+      "grad_norm": 0.8828125,
+      "learning_rate": 4.2854141972414885e-06,
+      "loss": 0.9013,
+      "step": 13963
+    },
+    {
+      "epoch": 0.9714424849559985,
+      "grad_norm": 1.1640625,
+      "learning_rate": 4.2646007861997235e-06,
+      "loss": 0.8468,
+      "step": 13964
+    },
+    {
+      "epoch": 0.9715120525931337,
+      "grad_norm": 0.97265625,
+      "learning_rate": 4.243837933582739e-06,
+      "loss": 0.7459,
+      "step": 13965
+    },
+    {
+      "epoch": 0.9715816202302688,
+      "grad_norm": 1.078125,
+      "learning_rate": 4.2231256404446916e-06,
+      "loss": 0.856,
+      "step": 13966
+    },
+    {
+      "epoch": 0.9716511878674041,
+      "grad_norm": 1.046875,
+      "learning_rate": 4.202463907837184e-06,
+      "loss": 0.8179,
+      "step": 13967
+    },
+    {
+      "epoch": 0.9717207555045393,
+      "grad_norm": 0.89453125,
+      "learning_rate": 4.1818527368093775e-06,
+      "loss": 0.6342,
+      "step": 13968
+    },
+    {
+      "epoch": 0.9717903231416745,
+      "grad_norm": 1.171875,
+      "learning_rate": 4.161292128407767e-06,
+      "loss": 1.057,
+      "step": 13969
+    },
+    {
+      "epoch": 0.9718598907788097,
+      "grad_norm": 1.3046875,
+      "learning_rate": 4.140782083676409e-06,
+      "loss": 0.8391,
+      "step": 13970
+    },
+    {
+      "epoch": 0.9719294584159449,
+      "grad_norm": 1.0078125,
+      "learning_rate": 4.1203226036565785e-06,
+      "loss": 0.7712,
+      "step": 13971
+    },
+    {
+      "epoch": 0.9719990260530801,
+      "grad_norm": 0.796875,
+      "learning_rate": 4.099913689387114e-06,
+      "loss": 0.5111,
+      "step": 13972
+    },
+    {
+      "epoch": 0.9720685936902154,
+      "grad_norm": 1.4921875,
+      "learning_rate": 4.079555341904406e-06,
+      "loss": 0.9152,
+      "step": 13973
+    },
+    {
+      "epoch": 0.9721381613273505,
+      "grad_norm": 1.2578125,
+      "learning_rate": 4.059247562242074e-06,
+      "loss": 0.758,
+      "step": 13974
+    },
+    {
+      "epoch": 0.9722077289644857,
+      "grad_norm": 1.2109375,
+      "learning_rate": 4.038990351431182e-06,
+      "loss": 0.7742,
+      "step": 13975
+    },
+    {
+      "epoch": 0.9722772966016209,
+      "grad_norm": 1.1328125,
+      "learning_rate": 4.018783710500462e-06,
+      "loss": 0.8875,
+      "step": 13976
+    },
+    {
+      "epoch": 0.9723468642387562,
+      "grad_norm": 0.9765625,
+      "learning_rate": 3.998627640475649e-06,
+      "loss": 0.7069,
+      "step": 13977
+    },
+    {
+      "epoch": 0.9724164318758913,
+      "grad_norm": 0.99609375,
+      "learning_rate": 3.978522142380259e-06,
+      "loss": 0.6547,
+      "step": 13978
+    },
+    {
+      "epoch": 0.9724859995130265,
+      "grad_norm": 0.9609375,
+      "learning_rate": 3.958467217235362e-06,
+      "loss": 0.9462,
+      "step": 13979
+    },
+    {
+      "epoch": 0.9725555671501618,
+      "grad_norm": 1.3359375,
+      "learning_rate": 3.938462866059034e-06,
+      "loss": 0.5854,
+      "step": 13980
+    },
+    {
+      "epoch": 0.972625134787297,
+      "grad_norm": 1.0703125,
+      "learning_rate": 3.918509089867017e-06,
+      "loss": 1.0063,
+      "step": 13981
+    },
+    {
+      "epoch": 0.9726947024244321,
+      "grad_norm": 1.1171875,
+      "learning_rate": 3.898605889672391e-06,
+      "loss": 0.7507,
+      "step": 13982
+    },
+    {
+      "epoch": 0.9727642700615674,
+      "grad_norm": 0.97265625,
+      "learning_rate": 3.878753266486013e-06,
+      "loss": 0.7795,
+      "step": 13983
+    },
+    {
+      "epoch": 0.9728338376987026,
+      "grad_norm": 1.0625,
+      "learning_rate": 3.858951221315632e-06,
+      "loss": 0.865,
+      "step": 13984
+    },
+    {
+      "epoch": 0.9729034053358377,
+      "grad_norm": 0.96484375,
+      "learning_rate": 3.839199755166778e-06,
+      "loss": 0.7666,
+      "step": 13985
+    },
+    {
+      "epoch": 0.972972972972973,
+      "grad_norm": 0.85546875,
+      "learning_rate": 3.819498869042315e-06,
+      "loss": 0.5805,
+      "step": 13986
+    },
+    {
+      "epoch": 0.9730425406101082,
+      "grad_norm": 1.09375,
+      "learning_rate": 3.7998485639426648e-06,
+      "loss": 0.776,
+      "step": 13987
+    },
+    {
+      "epoch": 0.9731121082472434,
+      "grad_norm": 1.1484375,
+      "learning_rate": 3.7802488408653635e-06,
+      "loss": 0.8188,
+      "step": 13988
+    },
+    {
+      "epoch": 0.9731816758843785,
+      "grad_norm": 1.15625,
+      "learning_rate": 3.7606997008058363e-06,
+      "loss": 0.9584,
+      "step": 13989
+    },
+    {
+      "epoch": 0.9732512435215138,
+      "grad_norm": 1.2890625,
+      "learning_rate": 3.741201144756512e-06,
+      "loss": 0.8802,
+      "step": 13990
+    },
+    {
+      "epoch": 0.973320811158649,
+      "grad_norm": 1.125,
+      "learning_rate": 3.7217531737073762e-06,
+      "loss": 0.7809,
+      "step": 13991
+    },
+    {
+      "epoch": 0.9733903787957842,
+      "grad_norm": 0.98046875,
+      "learning_rate": 3.7023557886460833e-06,
+      "loss": 0.7774,
+      "step": 13992
+    },
+    {
+      "epoch": 0.9734599464329194,
+      "grad_norm": 0.90234375,
+      "learning_rate": 3.6830089905575128e-06,
+      "loss": 0.9329,
+      "step": 13993
+    },
+    {
+      "epoch": 0.9735295140700546,
+      "grad_norm": 1.015625,
+      "learning_rate": 3.663712780423878e-06,
+      "loss": 0.582,
+      "step": 13994
+    },
+    {
+      "epoch": 0.9735990817071898,
+      "grad_norm": 1.09375,
+      "learning_rate": 3.644467159224951e-06,
+      "loss": 0.8051,
+      "step": 13995
+    },
+    {
+      "epoch": 0.9736686493443251,
+      "grad_norm": 0.87890625,
+      "learning_rate": 3.625272127938062e-06,
+      "loss": 0.745,
+      "step": 13996
+    },
+    {
+      "epoch": 0.9737382169814602,
+      "grad_norm": 1.2109375,
+      "learning_rate": 3.6061276875376527e-06,
+      "loss": 0.7743,
+      "step": 13997
+    },
+    {
+      "epoch": 0.9738077846185954,
+      "grad_norm": 1.4609375,
+      "learning_rate": 3.5870338389959454e-06,
+      "loss": 0.9294,
+      "step": 13998
+    },
+    {
+      "epoch": 0.9738773522557307,
+      "grad_norm": 1.3984375,
+      "learning_rate": 3.567990583282388e-06,
+      "loss": 0.7511,
+      "step": 13999
+    },
+    {
+      "epoch": 0.9739469198928659,
+      "grad_norm": 1.0234375,
+      "learning_rate": 3.5489979213638724e-06,
+      "loss": 0.8622,
+      "step": 14000
+    },
+    {
+      "epoch": 0.974016487530001,
+      "grad_norm": 1.03125,
+      "learning_rate": 3.530055854204739e-06,
+      "loss": 0.5685,
+      "step": 14001
+    },
+    {
+      "epoch": 0.9740860551671362,
+      "grad_norm": 1.078125,
+      "learning_rate": 3.5111643827667737e-06,
+      "loss": 0.6536,
+      "step": 14002
+    },
+    {
+      "epoch": 0.9741556228042715,
+      "grad_norm": 1.2578125,
+      "learning_rate": 3.4923235080092095e-06,
+      "loss": 1.0551,
+      "step": 14003
+    },
+    {
+      "epoch": 0.9742251904414067,
+      "grad_norm": 1.3828125,
+      "learning_rate": 3.473533230888726e-06,
+      "loss": 0.8098,
+      "step": 14004
+    },
+    {
+      "epoch": 0.9742947580785418,
+      "grad_norm": 1.0625,
+      "learning_rate": 3.4547935523593366e-06,
+      "loss": 0.6146,
+      "step": 14005
+    },
+    {
+      "epoch": 0.9743643257156771,
+      "grad_norm": 1.0390625,
+      "learning_rate": 3.436104473372503e-06,
+      "loss": 0.7175,
+      "step": 14006
+    },
+    {
+      "epoch": 0.9744338933528123,
+      "grad_norm": 1.3984375,
+      "learning_rate": 3.4174659948773554e-06,
+      "loss": 0.8898,
+      "step": 14007
+    },
+    {
+      "epoch": 0.9745034609899474,
+      "grad_norm": 1.203125,
+      "learning_rate": 3.3988781178201366e-06,
+      "loss": 0.7473,
+      "step": 14008
+    },
+    {
+      "epoch": 0.9745730286270827,
+      "grad_norm": 1.2578125,
+      "learning_rate": 3.380340843144536e-06,
+      "loss": 0.9942,
+      "step": 14009
+    },
+    {
+      "epoch": 0.9746425962642179,
+      "grad_norm": 1.015625,
+      "learning_rate": 3.3618541717919117e-06,
+      "loss": 0.7472,
+      "step": 14010
+    },
+    {
+      "epoch": 0.9747121639013531,
+      "grad_norm": 1.1484375,
+      "learning_rate": 3.343418104700957e-06,
+      "loss": 0.7854,
+      "step": 14011
+    },
+    {
+      "epoch": 0.9747817315384883,
+      "grad_norm": 1.015625,
+      "learning_rate": 3.3250326428077014e-06,
+      "loss": 0.7731,
+      "step": 14012
+    },
+    {
+      "epoch": 0.9748512991756235,
+      "grad_norm": 1.0546875,
+      "learning_rate": 3.3066977870456205e-06,
+      "loss": 0.7759,
+      "step": 14013
+    },
+    {
+      "epoch": 0.9749208668127587,
+      "grad_norm": 0.921875,
+      "learning_rate": 3.288413538345747e-06,
+      "loss": 0.6518,
+      "step": 14014
+    },
+    {
+      "epoch": 0.9749904344498939,
+      "grad_norm": 1.1796875,
+      "learning_rate": 3.2701798976364494e-06,
+      "loss": 0.8985,
+      "step": 14015
+    },
+    {
+      "epoch": 0.9750600020870291,
+      "grad_norm": 1.1015625,
+      "learning_rate": 3.2519968658435427e-06,
+      "loss": 0.7888,
+      "step": 14016
+    },
+    {
+      "epoch": 0.9751295697241643,
+      "grad_norm": 0.9375,
+      "learning_rate": 3.2338644438902887e-06,
+      "loss": 0.6723,
+      "step": 14017
+    },
+    {
+      "epoch": 0.9751991373612995,
+      "grad_norm": 1.0234375,
+      "learning_rate": 3.2157826326972837e-06,
+      "loss": 0.8776,
+      "step": 14018
+    },
+    {
+      "epoch": 0.9752687049984348,
+      "grad_norm": 1.5625,
+      "learning_rate": 3.197751433182572e-06,
+      "loss": 0.848,
+      "step": 14019
+    },
+    {
+      "epoch": 0.9753382726355699,
+      "grad_norm": 1.359375,
+      "learning_rate": 3.179770846261865e-06,
+      "loss": 1.0315,
+      "step": 14020
+    },
+    {
+      "epoch": 0.9754078402727051,
+      "grad_norm": 1.234375,
+      "learning_rate": 3.1618408728480985e-06,
+      "loss": 0.8892,
+      "step": 14021
+    },
+    {
+      "epoch": 0.9754774079098404,
+      "grad_norm": 1.171875,
+      "learning_rate": 3.1439615138515454e-06,
+      "loss": 1.0339,
+      "step": 14022
+    },
+    {
+      "epoch": 0.9755469755469756,
+      "grad_norm": 1.3671875,
+      "learning_rate": 3.126132770180146e-06,
+      "loss": 0.9649,
+      "step": 14023
+    },
+    {
+      "epoch": 0.9756165431841107,
+      "grad_norm": 0.984375,
+      "learning_rate": 3.108354642739064e-06,
+      "loss": 0.7231,
+      "step": 14024
+    },
+    {
+      "epoch": 0.975686110821246,
+      "grad_norm": 0.9453125,
+      "learning_rate": 3.090627132431023e-06,
+      "loss": 0.8066,
+      "step": 14025
+    },
+    {
+      "epoch": 0.9757556784583812,
+      "grad_norm": 0.96875,
+      "learning_rate": 3.0729502401561916e-06,
+      "loss": 0.5879,
+      "step": 14026
+    },
+    {
+      "epoch": 0.9758252460955164,
+      "grad_norm": 0.7421875,
+      "learning_rate": 3.0553239668120735e-06,
+      "loss": 0.5491,
+      "step": 14027
+    },
+    {
+      "epoch": 0.9758948137326515,
+      "grad_norm": 1.03125,
+      "learning_rate": 3.037748313293509e-06,
+      "loss": 0.7964,
+      "step": 14028
+    },
+    {
+      "epoch": 0.9759643813697868,
+      "grad_norm": 1.2421875,
+      "learning_rate": 3.020223280493228e-06,
+      "loss": 0.7861,
+      "step": 14029
+    },
+    {
+      "epoch": 0.976033949006922,
+      "grad_norm": 1.09375,
+      "learning_rate": 3.0027488693007422e-06,
+      "loss": 0.8609,
+      "step": 14030
+    },
+    {
+      "epoch": 0.9761035166440571,
+      "grad_norm": 1.21875,
+      "learning_rate": 2.9853250806033407e-06,
+      "loss": 0.7567,
+      "step": 14031
+    },
+    {
+      "epoch": 0.9761730842811924,
+      "grad_norm": 1.1796875,
+      "learning_rate": 2.9679519152859824e-06,
+      "loss": 0.8647,
+      "step": 14032
+    },
+    {
+      "epoch": 0.9762426519183276,
+      "grad_norm": 1.125,
+      "learning_rate": 2.950629374230518e-06,
+      "loss": 0.8666,
+      "step": 14033
+    },
+    {
+      "epoch": 0.9763122195554628,
+      "grad_norm": 1.2578125,
+      "learning_rate": 2.9333574583165767e-06,
+      "loss": 0.9188,
+      "step": 14034
+    },
+    {
+      "epoch": 0.976381787192598,
+      "grad_norm": 1.5234375,
+      "learning_rate": 2.916136168421124e-06,
+      "loss": 0.9102,
+      "step": 14035
+    },
+    {
+      "epoch": 0.9764513548297332,
+      "grad_norm": 1.1875,
+      "learning_rate": 2.8989655054186827e-06,
+      "loss": 0.6939,
+      "step": 14036
+    },
+    {
+      "epoch": 0.9765209224668684,
+      "grad_norm": 0.92578125,
+      "learning_rate": 2.881845470180999e-06,
+      "loss": 0.6198,
+      "step": 14037
+    },
+    {
+      "epoch": 0.9765904901040037,
+      "grad_norm": 1.0625,
+      "learning_rate": 2.864776063577268e-06,
+      "loss": 0.7109,
+      "step": 14038
+    },
+    {
+      "epoch": 0.9766600577411388,
+      "grad_norm": 0.9375,
+      "learning_rate": 2.8477572864744616e-06,
+      "loss": 0.7561,
+      "step": 14039
+    },
+    {
+      "epoch": 0.976729625378274,
+      "grad_norm": 1.0703125,
+      "learning_rate": 2.830789139736334e-06,
+      "loss": 0.6205,
+      "step": 14040
+    },
+    {
+      "epoch": 0.9767991930154092,
+      "grad_norm": 1.1171875,
+      "learning_rate": 2.8138716242247507e-06,
+      "loss": 0.882,
+      "step": 14041
+    },
+    {
+      "epoch": 0.9768687606525445,
+      "grad_norm": 0.99609375,
+      "learning_rate": 2.79700474079847e-06,
+      "loss": 0.9954,
+      "step": 14042
+    },
+    {
+      "epoch": 0.9769383282896796,
+      "grad_norm": 0.99609375,
+      "learning_rate": 2.7801884903141396e-06,
+      "loss": 0.8026,
+      "step": 14043
+    },
+    {
+      "epoch": 0.9770078959268148,
+      "grad_norm": 1.34375,
+      "learning_rate": 2.7634228736254097e-06,
+      "loss": 0.8356,
+      "step": 14044
+    },
+    {
+      "epoch": 0.9770774635639501,
+      "grad_norm": 0.91015625,
+      "learning_rate": 2.7467078915835996e-06,
+      "loss": 0.5747,
+      "step": 14045
+    },
+    {
+      "epoch": 0.9771470312010853,
+      "grad_norm": 0.9375,
+      "learning_rate": 2.730043545037364e-06,
+      "loss": 0.8995,
+      "step": 14046
+    },
+    {
+      "epoch": 0.9772165988382204,
+      "grad_norm": 1.578125,
+      "learning_rate": 2.7134298348330257e-06,
+      "loss": 0.9942,
+      "step": 14047
+    },
+    {
+      "epoch": 0.9772861664753557,
+      "grad_norm": 1.21875,
+      "learning_rate": 2.6968667618140207e-06,
+      "loss": 0.9477,
+      "step": 14048
+    },
+    {
+      "epoch": 0.9773557341124909,
+      "grad_norm": 1.1171875,
+      "learning_rate": 2.6803543268213436e-06,
+      "loss": 0.725,
+      "step": 14049
+    },
+    {
+      "epoch": 0.977425301749626,
+      "grad_norm": 1.0390625,
+      "learning_rate": 2.663892530693324e-06,
+      "loss": 0.843,
+      "step": 14050
+    },
+    {
+      "epoch": 0.9774948693867613,
+      "grad_norm": 1.1484375,
+      "learning_rate": 2.6474813742659587e-06,
+      "loss": 0.7772,
+      "step": 14051
+    },
+    {
+      "epoch": 0.9775644370238965,
+      "grad_norm": 1.1484375,
+      "learning_rate": 2.631120858372471e-06,
+      "loss": 0.8957,
+      "step": 14052
+    },
+    {
+      "epoch": 0.9776340046610317,
+      "grad_norm": 1.328125,
+      "learning_rate": 2.6148109838435297e-06,
+      "loss": 0.8659,
+      "step": 14053
+    },
+    {
+      "epoch": 0.9777035722981668,
+      "grad_norm": 1.0546875,
+      "learning_rate": 2.59855175150725e-06,
+      "loss": 0.8784,
+      "step": 14054
+    },
+    {
+      "epoch": 0.9777731399353021,
+      "grad_norm": 1.2109375,
+      "learning_rate": 2.5823431621893046e-06,
+      "loss": 0.7079,
+      "step": 14055
+    },
+    {
+      "epoch": 0.9778427075724373,
+      "grad_norm": 1.3828125,
+      "learning_rate": 2.566185216712591e-06,
+      "loss": 0.8376,
+      "step": 14056
+    },
+    {
+      "epoch": 0.9779122752095725,
+      "grad_norm": 1.4140625,
+      "learning_rate": 2.550077915897564e-06,
+      "loss": 0.9035,
+      "step": 14057
+    },
+    {
+      "epoch": 0.9779818428467077,
+      "grad_norm": 0.78125,
+      "learning_rate": 2.534021260562014e-06,
+      "loss": 0.597,
+      "step": 14058
+    },
+    {
+      "epoch": 0.9780514104838429,
+      "grad_norm": 0.984375,
+      "learning_rate": 2.5180152515212885e-06,
+      "loss": 0.8717,
+      "step": 14059
+    },
+    {
+      "epoch": 0.9781209781209781,
+      "grad_norm": 1.140625,
+      "learning_rate": 2.5020598895880706e-06,
+      "loss": 0.985,
+      "step": 14060
+    },
+    {
+      "epoch": 0.9781905457581134,
+      "grad_norm": 1.3671875,
+      "learning_rate": 2.486155175572491e-06,
+      "loss": 0.7367,
+      "step": 14061
+    },
+    {
+      "epoch": 0.9782601133952485,
+      "grad_norm": 1.296875,
+      "learning_rate": 2.470301110282236e-06,
+      "loss": 0.9985,
+      "step": 14062
+    },
+    {
+      "epoch": 0.9783296810323837,
+      "grad_norm": 1.140625,
+      "learning_rate": 2.4544976945219953e-06,
+      "loss": 0.7257,
+      "step": 14063
+    },
+    {
+      "epoch": 0.978399248669519,
+      "grad_norm": 0.90625,
+      "learning_rate": 2.438744929094461e-06,
+      "loss": 0.7439,
+      "step": 14064
+    },
+    {
+      "epoch": 0.9784688163066542,
+      "grad_norm": 0.90234375,
+      "learning_rate": 2.4230428147992146e-06,
+      "loss": 0.66,
+      "step": 14065
+    },
+    {
+      "epoch": 0.9785383839437893,
+      "grad_norm": 1.21875,
+      "learning_rate": 2.407391352433841e-06,
+      "loss": 0.7612,
+      "step": 14066
+    },
+    {
+      "epoch": 0.9786079515809245,
+      "grad_norm": 1.0078125,
+      "learning_rate": 2.3917905427929265e-06,
+      "loss": 0.649,
+      "step": 14067
+    },
+    {
+      "epoch": 0.9786775192180598,
+      "grad_norm": 0.9140625,
+      "learning_rate": 2.3762403866685046e-06,
+      "loss": 0.8349,
+      "step": 14068
+    },
+    {
+      "epoch": 0.978747086855195,
+      "grad_norm": 1.390625,
+      "learning_rate": 2.3607408848501655e-06,
+      "loss": 0.7158,
+      "step": 14069
+    },
+    {
+      "epoch": 0.9788166544923301,
+      "grad_norm": 1.1796875,
+      "learning_rate": 2.3452920381249466e-06,
+      "loss": 1.0078,
+      "step": 14070
+    },
+    {
+      "epoch": 0.9788862221294654,
+      "grad_norm": 0.95703125,
+      "learning_rate": 2.329893847277331e-06,
+      "loss": 0.9462,
+      "step": 14071
+    },
+    {
+      "epoch": 0.9789557897666006,
+      "grad_norm": 1.03125,
+      "learning_rate": 2.3145463130890276e-06,
+      "loss": 0.7862,
+      "step": 14072
+    },
+    {
+      "epoch": 0.9790253574037358,
+      "grad_norm": 1.0546875,
+      "learning_rate": 2.299249436339301e-06,
+      "loss": 0.9804,
+      "step": 14073
+    },
+    {
+      "epoch": 0.979094925040871,
+      "grad_norm": 1.3515625,
+      "learning_rate": 2.284003217804864e-06,
+      "loss": 0.9441,
+      "step": 14074
+    },
+    {
+      "epoch": 0.9791644926780062,
+      "grad_norm": 1.2109375,
+      "learning_rate": 2.268807658259986e-06,
+      "loss": 0.9997,
+      "step": 14075
+    },
+    {
+      "epoch": 0.9792340603151414,
+      "grad_norm": 1.296875,
+      "learning_rate": 2.2536627584761603e-06,
+      "loss": 1.0353,
+      "step": 14076
+    },
+    {
+      "epoch": 0.9793036279522767,
+      "grad_norm": 1.296875,
+      "learning_rate": 2.2385685192222173e-06,
+      "loss": 0.9707,
+      "step": 14077
+    },
+    {
+      "epoch": 0.9793731955894118,
+      "grad_norm": 1.2109375,
+      "learning_rate": 2.2235249412647653e-06,
+      "loss": 0.7933,
+      "step": 14078
+    },
+    {
+      "epoch": 0.979442763226547,
+      "grad_norm": 1.453125,
+      "learning_rate": 2.2085320253674155e-06,
+      "loss": 0.9587,
+      "step": 14079
+    },
+    {
+      "epoch": 0.9795123308636822,
+      "grad_norm": 0.92578125,
+      "learning_rate": 2.193589772291671e-06,
+      "loss": 0.6871,
+      "step": 14080
+    },
+    {
+      "epoch": 0.9795818985008174,
+      "grad_norm": 1.5703125,
+      "learning_rate": 2.178698182796146e-06,
+      "loss": 1.0512,
+      "step": 14081
+    },
+    {
+      "epoch": 0.9796514661379526,
+      "grad_norm": 1.0390625,
+      "learning_rate": 2.163857257636903e-06,
+      "loss": 0.6405,
+      "step": 14082
+    },
+    {
+      "epoch": 0.9797210337750878,
+      "grad_norm": 0.7734375,
+      "learning_rate": 2.1490669975674506e-06,
+      "loss": 0.6477,
+      "step": 14083
+    },
+    {
+      "epoch": 0.9797906014122231,
+      "grad_norm": 1.28125,
+      "learning_rate": 2.134327403338854e-06,
+      "loss": 0.9768,
+      "step": 14084
+    },
+    {
+      "epoch": 0.9798601690493582,
+      "grad_norm": 1.234375,
+      "learning_rate": 2.1196384756995145e-06,
+      "loss": 0.6931,
+      "step": 14085
+    },
+    {
+      "epoch": 0.9799297366864934,
+      "grad_norm": 1.2109375,
+      "learning_rate": 2.105000215395281e-06,
+      "loss": 0.8718,
+      "step": 14086
+    },
+    {
+      "epoch": 0.9799993043236287,
+      "grad_norm": 1.09375,
+      "learning_rate": 2.0904126231693355e-06,
+      "loss": 0.7593,
+      "step": 14087
+    },
+    {
+      "epoch": 0.9800688719607639,
+      "grad_norm": 1.125,
+      "learning_rate": 2.0758756997624194e-06,
+      "loss": 0.6853,
+      "step": 14088
+    },
+    {
+      "epoch": 0.980138439597899,
+      "grad_norm": 0.90625,
+      "learning_rate": 2.0613894459127204e-06,
+      "loss": 0.7454,
+      "step": 14089
+    },
+    {
+      "epoch": 0.9802080072350342,
+      "grad_norm": 0.96484375,
+      "learning_rate": 2.0469538623555385e-06,
+      "loss": 0.7557,
+      "step": 14090
+    },
+    {
+      "epoch": 0.9802775748721695,
+      "grad_norm": 1.09375,
+      "learning_rate": 2.032568949824065e-06,
+      "loss": 0.5998,
+      "step": 14091
+    },
+    {
+      "epoch": 0.9803471425093047,
+      "grad_norm": 1.0,
+      "learning_rate": 2.0182347090484944e-06,
+      "loss": 0.7196,
+      "step": 14092
+    },
+    {
+      "epoch": 0.9804167101464398,
+      "grad_norm": 1.2734375,
+      "learning_rate": 2.00395114075691e-06,
+      "loss": 0.9062,
+      "step": 14093
+    },
+    {
+      "epoch": 0.9804862777835751,
+      "grad_norm": 1.109375,
+      "learning_rate": 1.989718245674288e-06,
+      "loss": 0.8441,
+      "step": 14094
+    },
+    {
+      "epoch": 0.9805558454207103,
+      "grad_norm": 1.203125,
+      "learning_rate": 1.9755360245236055e-06,
+      "loss": 0.7855,
+      "step": 14095
+    },
+    {
+      "epoch": 0.9806254130578455,
+      "grad_norm": 1.015625,
+      "learning_rate": 1.9614044780246198e-06,
+      "loss": 0.9628,
+      "step": 14096
+    },
+    {
+      "epoch": 0.9806949806949807,
+      "grad_norm": 1.0234375,
+      "learning_rate": 1.9473236068950905e-06,
+      "loss": 0.8135,
+      "step": 14097
+    },
+    {
+      "epoch": 0.9807645483321159,
+      "grad_norm": 0.94140625,
+      "learning_rate": 1.93329341184989e-06,
+      "loss": 0.6462,
+      "step": 14098
+    },
+    {
+      "epoch": 0.9808341159692511,
+      "grad_norm": 1.0625,
+      "learning_rate": 1.9193138936014488e-06,
+      "loss": 0.5737,
+      "step": 14099
+    },
+    {
+      "epoch": 0.9809036836063864,
+      "grad_norm": 0.8828125,
+      "learning_rate": 1.9053850528595318e-06,
+      "loss": 0.6714,
+      "step": 14100
+    },
+    {
+      "epoch": 0.9809732512435215,
+      "grad_norm": 1.1328125,
+      "learning_rate": 1.8915068903313515e-06,
+      "loss": 0.8413,
+      "step": 14101
+    },
+    {
+      "epoch": 0.9810428188806567,
+      "grad_norm": 1.125,
+      "learning_rate": 1.8776794067216774e-06,
+      "loss": 0.6286,
+      "step": 14102
+    },
+    {
+      "epoch": 0.9811123865177919,
+      "grad_norm": 1.1484375,
+      "learning_rate": 1.8639026027325035e-06,
+      "loss": 0.8265,
+      "step": 14103
+    },
+    {
+      "epoch": 0.9811819541549271,
+      "grad_norm": 1.453125,
+      "learning_rate": 1.8501764790633814e-06,
+      "loss": 0.7367,
+      "step": 14104
+    },
+    {
+      "epoch": 0.9812515217920623,
+      "grad_norm": 0.953125,
+      "learning_rate": 1.8365010364113089e-06,
+      "loss": 0.8278,
+      "step": 14105
+    },
+    {
+      "epoch": 0.9813210894291975,
+      "grad_norm": 1.6875,
+      "learning_rate": 1.8228762754705086e-06,
+      "loss": 0.925,
+      "step": 14106
+    },
+    {
+      "epoch": 0.9813906570663328,
+      "grad_norm": 1.2265625,
+      "learning_rate": 1.8093021969328716e-06,
+      "loss": 0.7409,
+      "step": 14107
+    },
+    {
+      "epoch": 0.9814602247034679,
+      "grad_norm": 0.8671875,
+      "learning_rate": 1.7957788014877352e-06,
+      "loss": 0.7799,
+      "step": 14108
+    },
+    {
+      "epoch": 0.9815297923406031,
+      "grad_norm": 0.9375,
+      "learning_rate": 1.7823060898214395e-06,
+      "loss": 0.8179,
+      "step": 14109
+    },
+    {
+      "epoch": 0.9815993599777384,
+      "grad_norm": 1.21875,
+      "learning_rate": 1.7688840626184367e-06,
+      "loss": 0.7299,
+      "step": 14110
+    },
+    {
+      "epoch": 0.9816689276148736,
+      "grad_norm": 1.3203125,
+      "learning_rate": 1.7555127205598487e-06,
+      "loss": 0.8655,
+      "step": 14111
+    },
+    {
+      "epoch": 0.9817384952520087,
+      "grad_norm": 1.0546875,
+      "learning_rate": 1.74219206432491e-06,
+      "loss": 0.7925,
+      "step": 14112
+    },
+    {
+      "epoch": 0.981808062889144,
+      "grad_norm": 0.99609375,
+      "learning_rate": 1.7289220945898576e-06,
+      "loss": 0.6584,
+      "step": 14113
+    },
+    {
+      "epoch": 0.9818776305262792,
+      "grad_norm": 1.21875,
+      "learning_rate": 1.7157028120284857e-06,
+      "loss": 0.8806,
+      "step": 14114
+    },
+    {
+      "epoch": 0.9819471981634144,
+      "grad_norm": 0.9609375,
+      "learning_rate": 1.702534217312035e-06,
+      "loss": 0.8664,
+      "step": 14115
+    },
+    {
+      "epoch": 0.9820167658005495,
+      "grad_norm": 1.3125,
+      "learning_rate": 1.689416311109082e-06,
+      "loss": 0.7739,
+      "step": 14116
+    },
+    {
+      "epoch": 0.9820863334376848,
+      "grad_norm": 1.1953125,
+      "learning_rate": 1.6763490940856496e-06,
+      "loss": 0.6885,
+      "step": 14117
+    },
+    {
+      "epoch": 0.98215590107482,
+      "grad_norm": 0.890625,
+      "learning_rate": 1.6633325669054289e-06,
+      "loss": 0.6807,
+      "step": 14118
+    },
+    {
+      "epoch": 0.9822254687119552,
+      "grad_norm": 1.125,
+      "learning_rate": 1.6503667302290027e-06,
+      "loss": 0.8408,
+      "step": 14119
+    },
+    {
+      "epoch": 0.9822950363490904,
+      "grad_norm": 1.4140625,
+      "learning_rate": 1.6374515847149552e-06,
+      "loss": 0.8461,
+      "step": 14120
+    },
+    {
+      "epoch": 0.9823646039862256,
+      "grad_norm": 1.1328125,
+      "learning_rate": 1.6245871310190952e-06,
+      "loss": 0.9569,
+      "step": 14121
+    },
+    {
+      "epoch": 0.9824341716233608,
+      "grad_norm": 1.046875,
+      "learning_rate": 1.611773369794456e-06,
+      "loss": 0.8808,
+      "step": 14122
+    },
+    {
+      "epoch": 0.9825037392604961,
+      "grad_norm": 1.046875,
+      "learning_rate": 1.599010301691739e-06,
+      "loss": 0.6954,
+      "step": 14123
+    },
+    {
+      "epoch": 0.9825733068976312,
+      "grad_norm": 1.1015625,
+      "learning_rate": 1.5862979273588707e-06,
+      "loss": 1.0216,
+      "step": 14124
+    },
+    {
+      "epoch": 0.9826428745347664,
+      "grad_norm": 0.82421875,
+      "learning_rate": 1.5736362474415567e-06,
+      "loss": 0.7222,
+      "step": 14125
+    },
+    {
+      "epoch": 0.9827124421719017,
+      "grad_norm": 1.28125,
+      "learning_rate": 1.561025262582394e-06,
+      "loss": 0.8758,
+      "step": 14126
+    },
+    {
+      "epoch": 0.9827820098090368,
+      "grad_norm": 1.4296875,
+      "learning_rate": 1.5484649734219814e-06,
+      "loss": 0.7369,
+      "step": 14127
+    },
+    {
+      "epoch": 0.982851577446172,
+      "grad_norm": 1.0234375,
+      "learning_rate": 1.5359553805979198e-06,
+      "loss": 0.9374,
+      "step": 14128
+    },
+    {
+      "epoch": 0.9829211450833072,
+      "grad_norm": 1.0546875,
+      "learning_rate": 1.523496484745368e-06,
+      "loss": 0.8218,
+      "step": 14129
+    },
+    {
+      "epoch": 0.9829907127204425,
+      "grad_norm": 1.3359375,
+      "learning_rate": 1.5110882864970422e-06,
+      "loss": 0.7643,
+      "step": 14130
+    },
+    {
+      "epoch": 0.9830602803575776,
+      "grad_norm": 1.1171875,
+      "learning_rate": 1.4987307864828825e-06,
+      "loss": 0.8093,
+      "step": 14131
+    },
+    {
+      "epoch": 0.9831298479947128,
+      "grad_norm": 1.09375,
+      "learning_rate": 1.4864239853303873e-06,
+      "loss": 0.5871,
+      "step": 14132
+    },
+    {
+      "epoch": 0.9831994156318481,
+      "grad_norm": 1.0703125,
+      "learning_rate": 1.474167883664279e-06,
+      "loss": 0.7188,
+      "step": 14133
+    },
+    {
+      "epoch": 0.9832689832689833,
+      "grad_norm": 1.078125,
+      "learning_rate": 1.4619624821070599e-06,
+      "loss": 0.8958,
+      "step": 14134
+    },
+    {
+      "epoch": 0.9833385509061184,
+      "grad_norm": 1.2578125,
+      "learning_rate": 1.449807781278345e-06,
+      "loss": 0.8396,
+      "step": 14135
+    },
+    {
+      "epoch": 0.9834081185432537,
+      "grad_norm": 0.98046875,
+      "learning_rate": 1.4377037817954186e-06,
+      "loss": 0.7646,
+      "step": 14136
+    },
+    {
+      "epoch": 0.9834776861803889,
+      "grad_norm": 1.0234375,
+      "learning_rate": 1.425650484272678e-06,
+      "loss": 0.7525,
+      "step": 14137
+    },
+    {
+      "epoch": 0.9835472538175241,
+      "grad_norm": 1.265625,
+      "learning_rate": 1.4136478893221894e-06,
+      "loss": 0.7781,
+      "step": 14138
+    },
+    {
+      "epoch": 0.9836168214546593,
+      "grad_norm": 1.09375,
+      "learning_rate": 1.401695997553465e-06,
+      "loss": 0.831,
+      "step": 14139
+    },
+    {
+      "epoch": 0.9836863890917945,
+      "grad_norm": 1.03125,
+      "learning_rate": 1.3897948095733525e-06,
+      "loss": 0.9721,
+      "step": 14140
+    },
+    {
+      "epoch": 0.9837559567289297,
+      "grad_norm": 1.2265625,
+      "learning_rate": 1.3779443259860359e-06,
+      "loss": 0.7669,
+      "step": 14141
+    },
+    {
+      "epoch": 0.9838255243660649,
+      "grad_norm": 1.34375,
+      "learning_rate": 1.3661445473933664e-06,
+      "loss": 0.8089,
+      "step": 14142
+    },
+    {
+      "epoch": 0.9838950920032001,
+      "grad_norm": 0.9921875,
+      "learning_rate": 1.35439547439431e-06,
+      "loss": 0.674,
+      "step": 14143
+    },
+    {
+      "epoch": 0.9839646596403353,
+      "grad_norm": 0.7890625,
+      "learning_rate": 1.3426971075855e-06,
+      "loss": 0.5941,
+      "step": 14144
+    },
+    {
+      "epoch": 0.9840342272774705,
+      "grad_norm": 0.9609375,
+      "learning_rate": 1.3310494475609058e-06,
+      "loss": 0.6877,
+      "step": 14145
+    },
+    {
+      "epoch": 0.9841037949146058,
+      "grad_norm": 1.2578125,
+      "learning_rate": 1.3194524949119435e-06,
+      "loss": 0.6499,
+      "step": 14146
+    },
+    {
+      "epoch": 0.9841733625517409,
+      "grad_norm": 1.2421875,
+      "learning_rate": 1.3079062502275863e-06,
+      "loss": 0.8235,
+      "step": 14147
+    },
+    {
+      "epoch": 0.9842429301888761,
+      "grad_norm": 1.375,
+      "learning_rate": 1.2964107140938096e-06,
+      "loss": 0.8694,
+      "step": 14148
+    },
+    {
+      "epoch": 0.9843124978260114,
+      "grad_norm": 1.1015625,
+      "learning_rate": 1.2849658870945914e-06,
+      "loss": 0.9599,
+      "step": 14149
+    },
+    {
+      "epoch": 0.9843820654631465,
+      "grad_norm": 0.84765625,
+      "learning_rate": 1.2735717698107996e-06,
+      "loss": 0.5055,
+      "step": 14150
+    },
+    {
+      "epoch": 0.9844516331002817,
+      "grad_norm": 1.171875,
+      "learning_rate": 1.262228362821194e-06,
+      "loss": 1.1511,
+      "step": 14151
+    },
+    {
+      "epoch": 0.984521200737417,
+      "grad_norm": 1.0859375,
+      "learning_rate": 1.250935666701536e-06,
+      "loss": 0.7013,
+      "step": 14152
+    },
+    {
+      "epoch": 0.9845907683745522,
+      "grad_norm": 1.1328125,
+      "learning_rate": 1.2396936820252557e-06,
+      "loss": 0.8599,
+      "step": 14153
+    },
+    {
+      "epoch": 0.9846603360116873,
+      "grad_norm": 1.140625,
+      "learning_rate": 1.2285024093632303e-06,
+      "loss": 0.8528,
+      "step": 14154
+    },
+    {
+      "epoch": 0.9847299036488225,
+      "grad_norm": 0.9609375,
+      "learning_rate": 1.2173618492837823e-06,
+      "loss": 0.8547,
+      "step": 14155
+    },
+    {
+      "epoch": 0.9847994712859578,
+      "grad_norm": 1.109375,
+      "learning_rate": 1.2062720023523488e-06,
+      "loss": 1.0045,
+      "step": 14156
+    },
+    {
+      "epoch": 0.984869038923093,
+      "grad_norm": 0.9765625,
+      "learning_rate": 1.1952328691321457e-06,
+      "loss": 0.5509,
+      "step": 14157
+    },
+    {
+      "epoch": 0.9849386065602281,
+      "grad_norm": 1.40625,
+      "learning_rate": 1.1842444501837245e-06,
+      "loss": 0.9216,
+      "step": 14158
+    },
+    {
+      "epoch": 0.9850081741973634,
+      "grad_norm": 1.09375,
+      "learning_rate": 1.1733067460649727e-06,
+      "loss": 0.8681,
+      "step": 14159
+    },
+    {
+      "epoch": 0.9850777418344986,
+      "grad_norm": 0.91796875,
+      "learning_rate": 1.1624197573312234e-06,
+      "loss": 0.8702,
+      "step": 14160
+    },
+    {
+      "epoch": 0.9851473094716338,
+      "grad_norm": 1.0390625,
+      "learning_rate": 1.1515834845352568e-06,
+      "loss": 0.859,
+      "step": 14161
+    },
+    {
+      "epoch": 0.985216877108769,
+      "grad_norm": 1.2109375,
+      "learning_rate": 1.1407979282272996e-06,
+      "loss": 0.6349,
+      "step": 14162
+    },
+    {
+      "epoch": 0.9852864447459042,
+      "grad_norm": 1.2109375,
+      "learning_rate": 1.1300630889550245e-06,
+      "loss": 0.6855,
+      "step": 14163
+    },
+    {
+      "epoch": 0.9853560123830394,
+      "grad_norm": 1.0859375,
+      "learning_rate": 1.1193789672634402e-06,
+      "loss": 0.8024,
+      "step": 14164
+    },
+    {
+      "epoch": 0.9854255800201747,
+      "grad_norm": 1.078125,
+      "learning_rate": 1.1087455636951128e-06,
+      "loss": 0.6758,
+      "step": 14165
+    },
+    {
+      "epoch": 0.9854951476573098,
+      "grad_norm": 1.0234375,
+      "learning_rate": 1.0981628787898323e-06,
+      "loss": 0.612,
+      "step": 14166
+    },
+    {
+      "epoch": 0.985564715294445,
+      "grad_norm": 1.015625,
+      "learning_rate": 1.0876309130850582e-06,
+      "loss": 0.7018,
+      "step": 14167
+    },
+    {
+      "epoch": 0.9856342829315802,
+      "grad_norm": 1.1953125,
+      "learning_rate": 1.0771496671154736e-06,
+      "loss": 0.8467,
+      "step": 14168
+    },
+    {
+      "epoch": 0.9857038505687155,
+      "grad_norm": 1.2265625,
+      "learning_rate": 1.0667191414133192e-06,
+      "loss": 0.8525,
+      "step": 14169
+    },
+    {
+      "epoch": 0.9857734182058506,
+      "grad_norm": 1.4453125,
+      "learning_rate": 1.0563393365080609e-06,
+      "loss": 1.0241,
+      "step": 14170
+    },
+    {
+      "epoch": 0.9858429858429858,
+      "grad_norm": 0.97265625,
+      "learning_rate": 1.0460102529269432e-06,
+      "loss": 0.7085,
+      "step": 14171
+    },
+    {
+      "epoch": 0.9859125534801211,
+      "grad_norm": 1.546875,
+      "learning_rate": 1.0357318911943247e-06,
+      "loss": 0.9521,
+      "step": 14172
+    },
+    {
+      "epoch": 0.9859821211172562,
+      "grad_norm": 0.98046875,
+      "learning_rate": 1.0255042518320102e-06,
+      "loss": 0.8574,
+      "step": 14173
+    },
+    {
+      "epoch": 0.9860516887543914,
+      "grad_norm": 0.96875,
+      "learning_rate": 1.0153273353594727e-06,
+      "loss": 0.6841,
+      "step": 14174
+    },
+    {
+      "epoch": 0.9861212563915267,
+      "grad_norm": 1.015625,
+      "learning_rate": 1.0052011422932994e-06,
+      "loss": 0.5727,
+      "step": 14175
+    },
+    {
+      "epoch": 0.9861908240286619,
+      "grad_norm": 1.0703125,
+      "learning_rate": 9.951256731477453e-07,
+      "loss": 0.6811,
+      "step": 14176
+    },
+    {
+      "epoch": 0.986260391665797,
+      "grad_norm": 1.1953125,
+      "learning_rate": 9.851009284344016e-07,
+      "loss": 0.8459,
+      "step": 14177
+    },
+    {
+      "epoch": 0.9863299593029323,
+      "grad_norm": 1.0546875,
+      "learning_rate": 9.751269086620829e-07,
+      "loss": 0.642,
+      "step": 14178
+    },
+    {
+      "epoch": 0.9863995269400675,
+      "grad_norm": 1.5546875,
+      "learning_rate": 9.652036143374953e-07,
+      "loss": 0.965,
+      "step": 14179
+    },
+    {
+      "epoch": 0.9864690945772027,
+      "grad_norm": 1.2734375,
+      "learning_rate": 9.55331045964236e-07,
+      "loss": 0.9367,
+      "step": 14180
+    },
+    {
+      "epoch": 0.9865386622143378,
+      "grad_norm": 1.1328125,
+      "learning_rate": 9.455092040437929e-07,
+      "loss": 0.6349,
+      "step": 14181
+    },
+    {
+      "epoch": 0.9866082298514731,
+      "grad_norm": 0.98046875,
+      "learning_rate": 9.357380890747668e-07,
+      "loss": 0.5858,
+      "step": 14182
+    },
+    {
+      "epoch": 0.9866777974886083,
+      "grad_norm": 0.9375,
+      "learning_rate": 9.260177015533167e-07,
+      "loss": 0.6042,
+      "step": 14183
+    },
+    {
+      "epoch": 0.9867473651257435,
+      "grad_norm": 1.1015625,
+      "learning_rate": 9.163480419729365e-07,
+      "loss": 0.6278,
+      "step": 14184
+    },
+    {
+      "epoch": 0.9868169327628787,
+      "grad_norm": 1.0859375,
+      "learning_rate": 9.067291108246778e-07,
+      "loss": 0.9643,
+      "step": 14185
+    },
+    {
+      "epoch": 0.9868865004000139,
+      "grad_norm": 0.8671875,
+      "learning_rate": 8.971609085969279e-07,
+      "loss": 0.5668,
+      "step": 14186
+    },
+    {
+      "epoch": 0.9869560680371491,
+      "grad_norm": 1.171875,
+      "learning_rate": 8.876434357755203e-07,
+      "loss": 0.8407,
+      "step": 14187
+    },
+    {
+      "epoch": 0.9870256356742844,
+      "grad_norm": 1.1484375,
+      "learning_rate": 8.781766928436241e-07,
+      "loss": 0.8556,
+      "step": 14188
+    },
+    {
+      "epoch": 0.9870952033114195,
+      "grad_norm": 1.0390625,
+      "learning_rate": 8.687606802819659e-07,
+      "loss": 0.6536,
+      "step": 14189
+    },
+    {
+      "epoch": 0.9871647709485547,
+      "grad_norm": 1.453125,
+      "learning_rate": 8.593953985687186e-07,
+      "loss": 0.8054,
+      "step": 14190
+    },
+    {
+      "epoch": 0.98723433858569,
+      "grad_norm": 1.078125,
+      "learning_rate": 8.500808481792799e-07,
+      "loss": 0.6906,
+      "step": 14191
+    },
+    {
+      "epoch": 0.9873039062228252,
+      "grad_norm": 0.90625,
+      "learning_rate": 8.408170295866046e-07,
+      "loss": 0.6133,
+      "step": 14192
+    },
+    {
+      "epoch": 0.9873734738599603,
+      "grad_norm": 0.8828125,
+      "learning_rate": 8.316039432612055e-07,
+      "loss": 0.5521,
+      "step": 14193
+    },
+    {
+      "epoch": 0.9874430414970955,
+      "grad_norm": 1.359375,
+      "learning_rate": 8.224415896705972e-07,
+      "loss": 0.7895,
+      "step": 14194
+    },
+    {
+      "epoch": 0.9875126091342308,
+      "grad_norm": 1.046875,
+      "learning_rate": 8.133299692804075e-07,
+      "loss": 0.9231,
+      "step": 14195
+    },
+    {
+      "epoch": 0.987582176771366,
+      "grad_norm": 1.0234375,
+      "learning_rate": 8.042690825529331e-07,
+      "loss": 0.7716,
+      "step": 14196
+    },
+    {
+      "epoch": 0.9876517444085011,
+      "grad_norm": 1.0625,
+      "learning_rate": 7.952589299483615e-07,
+      "loss": 0.75,
+      "step": 14197
+    },
+    {
+      "epoch": 0.9877213120456364,
+      "grad_norm": 1.5,
+      "learning_rate": 7.862995119241045e-07,
+      "loss": 0.9462,
+      "step": 14198
+    },
+    {
+      "epoch": 0.9877908796827716,
+      "grad_norm": 0.96484375,
+      "learning_rate": 7.773908289352427e-07,
+      "loss": 0.8243,
+      "step": 14199
+    },
+    {
+      "epoch": 0.9878604473199067,
+      "grad_norm": 1.4921875,
+      "learning_rate": 7.685328814339698e-07,
+      "loss": 1.1487,
+      "step": 14200
+    },
+    {
+      "epoch": 0.987930014957042,
+      "grad_norm": 1.1875,
+      "learning_rate": 7.597256698701482e-07,
+      "loss": 0.9426,
+      "step": 14201
+    },
+    {
+      "epoch": 0.9879995825941772,
+      "grad_norm": 1.171875,
+      "learning_rate": 7.509691946908648e-07,
+      "loss": 0.785,
+      "step": 14202
+    },
+    {
+      "epoch": 0.9880691502313124,
+      "grad_norm": 1.125,
+      "learning_rate": 7.422634563407638e-07,
+      "loss": 0.989,
+      "step": 14203
+    },
+    {
+      "epoch": 0.9881387178684476,
+      "grad_norm": 1.0703125,
+      "learning_rate": 7.33608455261936e-07,
+      "loss": 0.9506,
+      "step": 14204
+    },
+    {
+      "epoch": 0.9882082855055828,
+      "grad_norm": 0.875,
+      "learning_rate": 7.250041918938077e-07,
+      "loss": 0.5925,
+      "step": 14205
+    },
+    {
+      "epoch": 0.988277853142718,
+      "grad_norm": 1.1640625,
+      "learning_rate": 7.164506666732518e-07,
+      "loss": 0.8583,
+      "step": 14206
+    },
+    {
+      "epoch": 0.9883474207798532,
+      "grad_norm": 1.296875,
+      "learning_rate": 7.079478800344763e-07,
+      "loss": 0.835,
+      "step": 14207
+    },
+    {
+      "epoch": 0.9884169884169884,
+      "grad_norm": 1.265625,
+      "learning_rate": 6.994958324093581e-07,
+      "loss": 0.7567,
+      "step": 14208
+    },
+    {
+      "epoch": 0.9884865560541236,
+      "grad_norm": 1.1328125,
+      "learning_rate": 6.910945242269983e-07,
+      "loss": 0.8193,
+      "step": 14209
+    },
+    {
+      "epoch": 0.9885561236912588,
+      "grad_norm": 1.25,
+      "learning_rate": 6.827439559140558e-07,
+      "loss": 0.8375,
+      "step": 14210
+    },
+    {
+      "epoch": 0.9886256913283941,
+      "grad_norm": 0.9140625,
+      "learning_rate": 6.744441278943025e-07,
+      "loss": 0.7245,
+      "step": 14211
+    },
+    {
+      "epoch": 0.9886952589655292,
+      "grad_norm": 1.34375,
+      "learning_rate": 6.661950405894013e-07,
+      "loss": 0.8492,
+      "step": 14212
+    },
+    {
+      "epoch": 0.9887648266026644,
+      "grad_norm": 1.4765625,
+      "learning_rate": 6.579966944180172e-07,
+      "loss": 0.8967,
+      "step": 14213
+    },
+    {
+      "epoch": 0.9888343942397997,
+      "grad_norm": 1.0703125,
+      "learning_rate": 6.498490897965948e-07,
+      "loss": 0.6543,
+      "step": 14214
+    },
+    {
+      "epoch": 0.9889039618769349,
+      "grad_norm": 1.125,
+      "learning_rate": 6.417522271386922e-07,
+      "loss": 0.7294,
+      "step": 14215
+    },
+    {
+      "epoch": 0.98897352951407,
+      "grad_norm": 1.25,
+      "learning_rate": 6.33706106855425e-07,
+      "loss": 0.7326,
+      "step": 14216
+    },
+    {
+      "epoch": 0.9890430971512053,
+      "grad_norm": 1.203125,
+      "learning_rate": 6.257107293554664e-07,
+      "loss": 0.9212,
+      "step": 14217
+    },
+    {
+      "epoch": 0.9891126647883405,
+      "grad_norm": 1.1875,
+      "learning_rate": 6.177660950446029e-07,
+      "loss": 0.9369,
+      "step": 14218
+    },
+    {
+      "epoch": 0.9891822324254756,
+      "grad_norm": 1.2421875,
+      "learning_rate": 6.098722043264005e-07,
+      "loss": 0.7213,
+      "step": 14219
+    },
+    {
+      "epoch": 0.9892518000626108,
+      "grad_norm": 1.125,
+      "learning_rate": 6.020290576015386e-07,
+      "loss": 0.8029,
+      "step": 14220
+    },
+    {
+      "epoch": 0.9893213676997461,
+      "grad_norm": 2.359375,
+      "learning_rate": 5.942366552683654e-07,
+      "loss": 0.724,
+      "step": 14221
+    },
+    {
+      "epoch": 0.9893909353368813,
+      "grad_norm": 1.203125,
+      "learning_rate": 5.864949977224532e-07,
+      "loss": 0.5431,
+      "step": 14222
+    },
+    {
+      "epoch": 0.9894605029740164,
+      "grad_norm": 1.2578125,
+      "learning_rate": 5.788040853568211e-07,
+      "loss": 0.772,
+      "step": 14223
+    },
+    {
+      "epoch": 0.9895300706111517,
+      "grad_norm": 1.0859375,
+      "learning_rate": 5.711639185621564e-07,
+      "loss": 0.7427,
+      "step": 14224
+    },
+    {
+      "epoch": 0.9895996382482869,
+      "grad_norm": 1.0390625,
+      "learning_rate": 5.635744977262603e-07,
+      "loss": 0.7589,
+      "step": 14225
+    },
+    {
+      "epoch": 0.9896692058854221,
+      "grad_norm": 0.98046875,
+      "learning_rate": 5.560358232344909e-07,
+      "loss": 0.716,
+      "step": 14226
+    },
+    {
+      "epoch": 0.9897387735225573,
+      "grad_norm": 1.25,
+      "learning_rate": 5.485478954697643e-07,
+      "loss": 1.0223,
+      "step": 14227
+    },
+    {
+      "epoch": 0.9898083411596925,
+      "grad_norm": 1.171875,
+      "learning_rate": 5.411107148119987e-07,
+      "loss": 0.9527,
+      "step": 14228
+    },
+    {
+      "epoch": 0.9898779087968277,
+      "grad_norm": 1.09375,
+      "learning_rate": 5.337242816391142e-07,
+      "loss": 0.7207,
+      "step": 14229
+    },
+    {
+      "epoch": 0.989947476433963,
+      "grad_norm": 1.21875,
+      "learning_rate": 5.263885963260329e-07,
+      "loss": 0.8742,
+      "step": 14230
+    },
+    {
+      "epoch": 0.9900170440710981,
+      "grad_norm": 1.2109375,
+      "learning_rate": 5.191036592451237e-07,
+      "loss": 0.6883,
+      "step": 14231
+    },
+    {
+      "epoch": 0.9900866117082333,
+      "grad_norm": 1.1484375,
+      "learning_rate": 5.11869470766424e-07,
+      "loss": 0.8554,
+      "step": 14232
+    },
+    {
+      "epoch": 0.9901561793453685,
+      "grad_norm": 1.234375,
+      "learning_rate": 5.046860312571955e-07,
+      "loss": 0.8186,
+      "step": 14233
+    },
+    {
+      "epoch": 0.9902257469825038,
+      "grad_norm": 1.2265625,
+      "learning_rate": 4.975533410821465e-07,
+      "loss": 1.1046,
+      "step": 14234
+    },
+    {
+      "epoch": 0.9902953146196389,
+      "grad_norm": 1.21875,
+      "learning_rate": 4.904714006035427e-07,
+      "loss": 0.7421,
+      "step": 14235
+    },
+    {
+      "epoch": 0.9903648822567741,
+      "grad_norm": 1.1171875,
+      "learning_rate": 4.834402101808743e-07,
+      "loss": 0.7434,
+      "step": 14236
+    },
+    {
+      "epoch": 0.9904344498939094,
+      "grad_norm": 1.4609375,
+      "learning_rate": 4.7645977017118926e-07,
+      "loss": 1.1464,
+      "step": 14237
+    },
+    {
+      "epoch": 0.9905040175310446,
+      "grad_norm": 1.0,
+      "learning_rate": 4.695300809288705e-07,
+      "loss": 0.9736,
+      "step": 14238
+    },
+    {
+      "epoch": 0.9905735851681797,
+      "grad_norm": 1.0703125,
+      "learning_rate": 4.626511428058588e-07,
+      "loss": 0.8671,
+      "step": 14239
+    },
+    {
+      "epoch": 0.990643152805315,
+      "grad_norm": 1.171875,
+      "learning_rate": 4.558229561513194e-07,
+      "loss": 0.6504,
+      "step": 14240
+    },
+    {
+      "epoch": 0.9907127204424502,
+      "grad_norm": 1.2890625,
+      "learning_rate": 4.4904552131197485e-07,
+      "loss": 0.8912,
+      "step": 14241
+    },
+    {
+      "epoch": 0.9907822880795853,
+      "grad_norm": 1.1484375,
+      "learning_rate": 4.423188386321053e-07,
+      "loss": 0.6163,
+      "step": 14242
+    },
+    {
+      "epoch": 0.9908518557167206,
+      "grad_norm": 1.1484375,
+      "learning_rate": 4.356429084531044e-07,
+      "loss": 0.8492,
+      "step": 14243
+    },
+    {
+      "epoch": 0.9909214233538558,
+      "grad_norm": 0.9921875,
+      "learning_rate": 4.2901773111392317e-07,
+      "loss": 0.8083,
+      "step": 14244
+    },
+    {
+      "epoch": 0.990990990990991,
+      "grad_norm": 1.1484375,
+      "learning_rate": 4.2244330695107024e-07,
+      "loss": 0.7564,
+      "step": 14245
+    },
+    {
+      "epoch": 0.9910605586281261,
+      "grad_norm": 1.1953125,
+      "learning_rate": 4.1591963629827867e-07,
+      "loss": 0.8627,
+      "step": 14246
+    },
+    {
+      "epoch": 0.9911301262652614,
+      "grad_norm": 1.15625,
+      "learning_rate": 4.09446719486839e-07,
+      "loss": 0.8946,
+      "step": 14247
+    },
+    {
+      "epoch": 0.9911996939023966,
+      "grad_norm": 1.0,
+      "learning_rate": 4.030245568453772e-07,
+      "loss": 0.7983,
+      "step": 14248
+    },
+    {
+      "epoch": 0.9912692615395318,
+      "grad_norm": 1.203125,
+      "learning_rate": 3.966531486998548e-07,
+      "loss": 0.8036,
+      "step": 14249
+    },
+    {
+      "epoch": 0.991338829176667,
+      "grad_norm": 1.0859375,
+      "learning_rate": 3.9033249537412384e-07,
+      "loss": 0.8449,
+      "step": 14250
+    },
+    {
+      "epoch": 0.9914083968138022,
+      "grad_norm": 1.0703125,
+      "learning_rate": 3.8406259718881673e-07,
+      "loss": 0.8014,
+      "step": 14251
+    },
+    {
+      "epoch": 0.9914779644509374,
+      "grad_norm": 0.96484375,
+      "learning_rate": 3.7784345446234545e-07,
+      "loss": 0.7806,
+      "step": 14252
+    },
+    {
+      "epoch": 0.9915475320880727,
+      "grad_norm": 1.015625,
+      "learning_rate": 3.716750675104574e-07,
+      "loss": 0.6997,
+      "step": 14253
+    },
+    {
+      "epoch": 0.9916170997252078,
+      "grad_norm": 1.234375,
+      "learning_rate": 3.6555743664645757e-07,
+      "loss": 0.707,
+      "step": 14254
+    },
+    {
+      "epoch": 0.991686667362343,
+      "grad_norm": 1.4296875,
+      "learning_rate": 3.594905621809863e-07,
+      "loss": 0.9648,
+      "step": 14255
+    },
+    {
+      "epoch": 0.9917562349994783,
+      "grad_norm": 1.1484375,
+      "learning_rate": 3.534744444220195e-07,
+      "loss": 0.6445,
+      "step": 14256
+    },
+    {
+      "epoch": 0.9918258026366135,
+      "grad_norm": 1.046875,
+      "learning_rate": 3.4750908367497946e-07,
+      "loss": 0.9463,
+      "step": 14257
+    },
+    {
+      "epoch": 0.9918953702737486,
+      "grad_norm": 1.2265625,
+      "learning_rate": 3.415944802428461e-07,
+      "loss": 0.9668,
+      "step": 14258
+    },
+    {
+      "epoch": 0.9919649379108838,
+      "grad_norm": 1.25,
+      "learning_rate": 3.3573063442582376e-07,
+      "loss": 0.9442,
+      "step": 14259
+    },
+    {
+      "epoch": 0.9920345055480191,
+      "grad_norm": 1.265625,
+      "learning_rate": 3.299175465217852e-07,
+      "loss": 0.9371,
+      "step": 14260
+    },
+    {
+      "epoch": 0.9921040731851543,
+      "grad_norm": 1.0625,
+      "learning_rate": 3.241552168257167e-07,
+      "loss": 0.8297,
+      "step": 14261
+    },
+    {
+      "epoch": 0.9921736408222894,
+      "grad_norm": 1.3515625,
+      "learning_rate": 3.1844364563038407e-07,
+      "loss": 0.9003,
+      "step": 14262
+    },
+    {
+      "epoch": 0.9922432084594247,
+      "grad_norm": 0.96875,
+      "learning_rate": 3.127828332257776e-07,
+      "loss": 0.8385,
+      "step": 14263
+    },
+    {
+      "epoch": 0.9923127760965599,
+      "grad_norm": 1.0859375,
+      "learning_rate": 3.07172779899223e-07,
+      "loss": 0.8694,
+      "step": 14264
+    },
+    {
+      "epoch": 0.992382343733695,
+      "grad_norm": 1.09375,
+      "learning_rate": 3.016134859354924e-07,
+      "loss": 0.7328,
+      "step": 14265
+    },
+    {
+      "epoch": 0.9924519113708303,
+      "grad_norm": 1.3203125,
+      "learning_rate": 2.961049516171377e-07,
+      "loss": 0.911,
+      "step": 14266
+    },
+    {
+      "epoch": 0.9925214790079655,
+      "grad_norm": 1.6171875,
+      "learning_rate": 2.906471772236019e-07,
+      "loss": 0.773,
+      "step": 14267
+    },
+    {
+      "epoch": 0.9925910466451007,
+      "grad_norm": 1.0859375,
+      "learning_rate": 2.852401630321078e-07,
+      "loss": 0.7979,
+      "step": 14268
+    },
+    {
+      "epoch": 0.992660614282236,
+      "grad_norm": 1.1015625,
+      "learning_rate": 2.798839093172134e-07,
+      "loss": 0.7917,
+      "step": 14269
+    },
+    {
+      "epoch": 0.9927301819193711,
+      "grad_norm": 1.3359375,
+      "learning_rate": 2.745784163508125e-07,
+      "loss": 0.6746,
+      "step": 14270
+    },
+    {
+      "epoch": 0.9927997495565063,
+      "grad_norm": 1.3125,
+      "learning_rate": 2.693236844023561e-07,
+      "loss": 0.9603,
+      "step": 14271
+    },
+    {
+      "epoch": 0.9928693171936415,
+      "grad_norm": 1.296875,
+      "learning_rate": 2.6411971373863086e-07,
+      "loss": 0.9019,
+      "step": 14272
+    },
+    {
+      "epoch": 0.9929388848307767,
+      "grad_norm": 1.4375,
+      "learning_rate": 2.5896650462386985e-07,
+      "loss": 0.8084,
+      "step": 14273
+    },
+    {
+      "epoch": 0.9930084524679119,
+      "grad_norm": 1.1328125,
+      "learning_rate": 2.5386405731964157e-07,
+      "loss": 0.8645,
+      "step": 14274
+    },
+    {
+      "epoch": 0.9930780201050471,
+      "grad_norm": 0.9453125,
+      "learning_rate": 2.4881237208518313e-07,
+      "loss": 0.7938,
+      "step": 14275
+    },
+    {
+      "epoch": 0.9931475877421824,
+      "grad_norm": 1.125,
+      "learning_rate": 2.4381144917695606e-07,
+      "loss": 0.8516,
+      "step": 14276
+    },
+    {
+      "epoch": 0.9932171553793175,
+      "grad_norm": 0.99609375,
+      "learning_rate": 2.3886128884875737e-07,
+      "loss": 0.743,
+      "step": 14277
+    },
+    {
+      "epoch": 0.9932867230164527,
+      "grad_norm": 0.80078125,
+      "learning_rate": 2.3396189135205248e-07,
+      "loss": 0.7701,
+      "step": 14278
+    },
+    {
+      "epoch": 0.993356290653588,
+      "grad_norm": 1.046875,
+      "learning_rate": 2.2911325693553142e-07,
+      "loss": 0.8914,
+      "step": 14279
+    },
+    {
+      "epoch": 0.9934258582907232,
+      "grad_norm": 1.296875,
+      "learning_rate": 2.2431538584544164e-07,
+      "loss": 0.5938,
+      "step": 14280
+    },
+    {
+      "epoch": 0.9934954259278583,
+      "grad_norm": 1.2265625,
+      "learning_rate": 2.1956827832536608e-07,
+      "loss": 0.8855,
+      "step": 14281
+    },
+    {
+      "epoch": 0.9935649935649936,
+      "grad_norm": 1.109375,
+      "learning_rate": 2.1487193461633415e-07,
+      "loss": 0.8047,
+      "step": 14282
+    },
+    {
+      "epoch": 0.9936345612021288,
+      "grad_norm": 1.1875,
+      "learning_rate": 2.1022635495682174e-07,
+      "loss": 0.9119,
+      "step": 14283
+    },
+    {
+      "epoch": 0.993704128839264,
+      "grad_norm": 0.82421875,
+      "learning_rate": 2.0563153958275128e-07,
+      "loss": 0.8005,
+      "step": 14284
+    },
+    {
+      "epoch": 0.9937736964763991,
+      "grad_norm": 1.015625,
+      "learning_rate": 2.0108748872726956e-07,
+      "loss": 0.9094,
+      "step": 14285
+    },
+    {
+      "epoch": 0.9938432641135344,
+      "grad_norm": 0.91015625,
+      "learning_rate": 1.9659420262130302e-07,
+      "loss": 0.8076,
+      "step": 14286
+    },
+    {
+      "epoch": 0.9939128317506696,
+      "grad_norm": 1.296875,
+      "learning_rate": 1.9215168149289143e-07,
+      "loss": 0.953,
+      "step": 14287
+    },
+    {
+      "epoch": 0.9939823993878047,
+      "grad_norm": 1.1328125,
+      "learning_rate": 1.8775992556752108e-07,
+      "loss": 0.8002,
+      "step": 14288
+    },
+    {
+      "epoch": 0.99405196702494,
+      "grad_norm": 1.1015625,
+      "learning_rate": 1.8341893506834684e-07,
+      "loss": 0.8113,
+      "step": 14289
+    },
+    {
+      "epoch": 0.9941215346620752,
+      "grad_norm": 1.15625,
+      "learning_rate": 1.7912871021574794e-07,
+      "loss": 0.7923,
+      "step": 14290
+    },
+    {
+      "epoch": 0.9941911022992104,
+      "grad_norm": 0.90234375,
+      "learning_rate": 1.7488925122743916e-07,
+      "loss": 0.8136,
+      "step": 14291
+    },
+    {
+      "epoch": 0.9942606699363457,
+      "grad_norm": 1.203125,
+      "learning_rate": 1.7070055831880372e-07,
+      "loss": 0.7789,
+      "step": 14292
+    },
+    {
+      "epoch": 0.9943302375734808,
+      "grad_norm": 1.015625,
+      "learning_rate": 1.6656263170244934e-07,
+      "loss": 0.9456,
+      "step": 14293
+    },
+    {
+      "epoch": 0.994399805210616,
+      "grad_norm": 1.140625,
+      "learning_rate": 1.6247547158854125e-07,
+      "loss": 0.9678,
+      "step": 14294
+    },
+    {
+      "epoch": 0.9944693728477513,
+      "grad_norm": 1.109375,
+      "learning_rate": 1.5843907818458015e-07,
+      "loss": 0.8439,
+      "step": 14295
+    },
+    {
+      "epoch": 0.9945389404848864,
+      "grad_norm": 1.1171875,
+      "learning_rate": 1.5445345169551316e-07,
+      "loss": 0.7758,
+      "step": 14296
+    },
+    {
+      "epoch": 0.9946085081220216,
+      "grad_norm": 1.046875,
+      "learning_rate": 1.5051859232373398e-07,
+      "loss": 0.7314,
+      "step": 14297
+    },
+    {
+      "epoch": 0.9946780757591568,
+      "grad_norm": 1.2890625,
+      "learning_rate": 1.4663450026897174e-07,
+      "loss": 1.0071,
+      "step": 14298
+    },
+    {
+      "epoch": 0.9947476433962921,
+      "grad_norm": 0.96484375,
+      "learning_rate": 1.4280117572840202e-07,
+      "loss": 0.6489,
+      "step": 14299
+    },
+    {
+      "epoch": 0.9948172110334272,
+      "grad_norm": 1.0390625,
+      "learning_rate": 1.3901861889686895e-07,
+      "loss": 0.7407,
+      "step": 14300
+    },
+    {
+      "epoch": 0.9948867786705624,
+      "grad_norm": 0.83203125,
+      "learning_rate": 1.352868299662191e-07,
+      "loss": 0.6603,
+      "step": 14301
+    },
+    {
+      "epoch": 0.9949563463076977,
+      "grad_norm": 0.9296875,
+      "learning_rate": 1.3160580912596753e-07,
+      "loss": 0.8934,
+      "step": 14302
+    },
+    {
+      "epoch": 0.9950259139448329,
+      "grad_norm": 1.640625,
+      "learning_rate": 1.2797555656318682e-07,
+      "loss": 0.9611,
+      "step": 14303
+    },
+    {
+      "epoch": 0.995095481581968,
+      "grad_norm": 1.21875,
+      "learning_rate": 1.2439607246195194e-07,
+      "loss": 0.6547,
+      "step": 14304
+    },
+    {
+      "epoch": 0.9951650492191033,
+      "grad_norm": 1.0390625,
+      "learning_rate": 1.2086735700422846e-07,
+      "loss": 0.9057,
+      "step": 14305
+    },
+    {
+      "epoch": 0.9952346168562385,
+      "grad_norm": 1.109375,
+      "learning_rate": 1.1738941036909535e-07,
+      "loss": 0.9191,
+      "step": 14306
+    },
+    {
+      "epoch": 0.9953041844933737,
+      "grad_norm": 1.515625,
+      "learning_rate": 1.1396223273307804e-07,
+      "loss": 0.7479,
+      "step": 14307
+    },
+    {
+      "epoch": 0.9953737521305089,
+      "grad_norm": 0.9609375,
+      "learning_rate": 1.1058582427025954e-07,
+      "loss": 0.7768,
+      "step": 14308
+    },
+    {
+      "epoch": 0.9954433197676441,
+      "grad_norm": 1.296875,
+      "learning_rate": 1.0726018515216929e-07,
+      "loss": 0.6844,
+      "step": 14309
+    },
+    {
+      "epoch": 0.9955128874047793,
+      "grad_norm": 1.2265625,
+      "learning_rate": 1.0398531554745017e-07,
+      "loss": 0.8366,
+      "step": 14310
+    },
+    {
+      "epoch": 0.9955824550419144,
+      "grad_norm": 1.6640625,
+      "learning_rate": 1.0076121562263563e-07,
+      "loss": 0.8562,
+      "step": 14311
+    },
+    {
+      "epoch": 0.9956520226790497,
+      "grad_norm": 1.359375,
+      "learning_rate": 9.758788554126152e-08,
+      "loss": 0.812,
+      "step": 14312
+    },
+    {
+      "epoch": 0.9957215903161849,
+      "grad_norm": 1.1484375,
+      "learning_rate": 9.446532546442121e-08,
+      "loss": 0.9063,
+      "step": 14313
+    },
+    {
+      "epoch": 0.9957911579533201,
+      "grad_norm": 1.015625,
+      "learning_rate": 9.139353555076557e-08,
+      "loss": 0.688,
+      "step": 14314
+    },
+    {
+      "epoch": 0.9958607255904554,
+      "grad_norm": 1.3828125,
+      "learning_rate": 8.837251595628093e-08,
+      "loss": 0.8599,
+      "step": 14315
+    },
+    {
+      "epoch": 0.9959302932275905,
+      "grad_norm": 1.1796875,
+      "learning_rate": 8.540226683428908e-08,
+      "loss": 0.8422,
+      "step": 14316
+    },
+    {
+      "epoch": 0.9959998608647257,
+      "grad_norm": 1.1015625,
+      "learning_rate": 8.248278833566936e-08,
+      "loss": 0.7857,
+      "step": 14317
+    },
+    {
+      "epoch": 0.996069428501861,
+      "grad_norm": 0.90625,
+      "learning_rate": 7.961408060852549e-08,
+      "loss": 0.6352,
+      "step": 14318
+    },
+    {
+      "epoch": 0.9961389961389961,
+      "grad_norm": 1.171875,
+      "learning_rate": 7.679614379862976e-08,
+      "loss": 0.8828,
+      "step": 14319
+    },
+    {
+      "epoch": 0.9962085637761313,
+      "grad_norm": 1.40625,
+      "learning_rate": 7.402897804908992e-08,
+      "loss": 0.9948,
+      "step": 14320
+    },
+    {
+      "epoch": 0.9962781314132666,
+      "grad_norm": 1.1640625,
+      "learning_rate": 7.13125835003492e-08,
+      "loss": 0.9744,
+      "step": 14321
+    },
+    {
+      "epoch": 0.9963476990504018,
+      "grad_norm": 1.078125,
+      "learning_rate": 6.864696029029727e-08,
+      "loss": 0.6612,
+      "step": 14322
+    },
+    {
+      "epoch": 0.9964172666875369,
+      "grad_norm": 1.140625,
+      "learning_rate": 6.603210855438136e-08,
+      "loss": 0.705,
+      "step": 14323
+    },
+    {
+      "epoch": 0.9964868343246721,
+      "grad_norm": 1.4140625,
+      "learning_rate": 6.34680284252731e-08,
+      "loss": 0.9975,
+      "step": 14324
+    },
+    {
+      "epoch": 0.9965564019618074,
+      "grad_norm": 1.109375,
+      "learning_rate": 6.095472003320169e-08,
+      "loss": 0.5625,
+      "step": 14325
+    },
+    {
+      "epoch": 0.9966259695989426,
+      "grad_norm": 1.140625,
+      "learning_rate": 5.849218350573171e-08,
+      "loss": 0.9337,
+      "step": 14326
+    },
+    {
+      "epoch": 0.9966955372360777,
+      "grad_norm": 1.03125,
+      "learning_rate": 5.6080418968096346e-08,
+      "loss": 0.665,
+      "step": 14327
+    },
+    {
+      "epoch": 0.996765104873213,
+      "grad_norm": 0.9609375,
+      "learning_rate": 5.371942654242012e-08,
+      "loss": 0.8357,
+      "step": 14328
+    },
+    {
+      "epoch": 0.9968346725103482,
+      "grad_norm": 1.1328125,
+      "learning_rate": 5.14092063489402e-08,
+      "loss": 0.7825,
+      "step": 14329
+    },
+    {
+      "epoch": 0.9969042401474834,
+      "grad_norm": 1.046875,
+      "learning_rate": 4.914975850467407e-08,
+      "loss": 0.9078,
+      "step": 14330
+    },
+    {
+      "epoch": 0.9969738077846186,
+      "grad_norm": 1.2421875,
+      "learning_rate": 4.6941083124529824e-08,
+      "loss": 1.0158,
+      "step": 14331
+    },
+    {
+      "epoch": 0.9970433754217538,
+      "grad_norm": 1.0078125,
+      "learning_rate": 4.478318032052897e-08,
+      "loss": 0.8284,
+      "step": 14332
+    },
+    {
+      "epoch": 0.997112943058889,
+      "grad_norm": 1.3671875,
+      "learning_rate": 4.267605020236154e-08,
+      "loss": 1.2325,
+      "step": 14333
+    },
+    {
+      "epoch": 0.9971825106960243,
+      "grad_norm": 1.28125,
+      "learning_rate": 4.061969287683098e-08,
+      "loss": 0.8964,
+      "step": 14334
+    },
+    {
+      "epoch": 0.9972520783331594,
+      "grad_norm": 0.84375,
+      "learning_rate": 3.8614108448520316e-08,
+      "loss": 0.7729,
+      "step": 14335
+    },
+    {
+      "epoch": 0.9973216459702946,
+      "grad_norm": 1.046875,
+      "learning_rate": 3.665929701923698e-08,
+      "loss": 0.966,
+      "step": 14336
+    },
+    {
+      "epoch": 0.9973912136074298,
+      "grad_norm": 0.8984375,
+      "learning_rate": 3.475525868823493e-08,
+      "loss": 0.5503,
+      "step": 14337
+    },
+    {
+      "epoch": 0.997460781244565,
+      "grad_norm": 0.95703125,
+      "learning_rate": 3.2901993552103546e-08,
+      "loss": 0.7011,
+      "step": 14338
+    },
+    {
+      "epoch": 0.9975303488817002,
+      "grad_norm": 1.078125,
+      "learning_rate": 3.109950170498976e-08,
+      "loss": 0.9463,
+      "step": 14339
+    },
+    {
+      "epoch": 0.9975999165188354,
+      "grad_norm": 1.09375,
+      "learning_rate": 2.934778323848697e-08,
+      "loss": 0.7466,
+      "step": 14340
+    },
+    {
+      "epoch": 0.9976694841559707,
+      "grad_norm": 1.140625,
+      "learning_rate": 2.764683824141301e-08,
+      "loss": 0.9995,
+      "step": 14341
+    },
+    {
+      "epoch": 0.9977390517931058,
+      "grad_norm": 0.91796875,
+      "learning_rate": 2.5996666800254253e-08,
+      "loss": 0.6633,
+      "step": 14342
+    },
+    {
+      "epoch": 0.997808619430241,
+      "grad_norm": 2.296875,
+      "learning_rate": 2.4397268998721523e-08,
+      "loss": 1.3606,
+      "step": 14343
+    },
+    {
+      "epoch": 0.9978781870673763,
+      "grad_norm": 1.0703125,
+      "learning_rate": 2.2848644917972116e-08,
+      "loss": 0.6953,
+      "step": 14344
+    },
+    {
+      "epoch": 0.9979477547045115,
+      "grad_norm": 1.21875,
+      "learning_rate": 2.1350794636831872e-08,
+      "loss": 0.9079,
+      "step": 14345
+    },
+    {
+      "epoch": 0.9980173223416466,
+      "grad_norm": 1.2421875,
+      "learning_rate": 1.990371823112902e-08,
+      "loss": 0.9467,
+      "step": 14346
+    },
+    {
+      "epoch": 0.9980868899787819,
+      "grad_norm": 1.109375,
+      "learning_rate": 1.850741577447135e-08,
+      "loss": 0.6985,
+      "step": 14347
+    },
+    {
+      "epoch": 0.9981564576159171,
+      "grad_norm": 1.0,
+      "learning_rate": 1.7161887337802108e-08,
+      "loss": 0.6278,
+      "step": 14348
+    },
+    {
+      "epoch": 0.9982260252530523,
+      "grad_norm": 1.5703125,
+      "learning_rate": 1.586713298928899e-08,
+      "loss": 0.6759,
+      "step": 14349
+    },
+    {
+      "epoch": 0.9982955928901874,
+      "grad_norm": 1.15625,
+      "learning_rate": 1.4623152794768224e-08,
+      "loss": 0.6557,
+      "step": 14350
+    },
+    {
+      "epoch": 0.9983651605273227,
+      "grad_norm": 1.390625,
+      "learning_rate": 1.3429946817300476e-08,
+      "loss": 0.7722,
+      "step": 14351
+    },
+    {
+      "epoch": 0.9984347281644579,
+      "grad_norm": 1.0390625,
+      "learning_rate": 1.2287515117725968e-08,
+      "loss": 0.8084,
+      "step": 14352
+    },
+    {
+      "epoch": 0.9985042958015931,
+      "grad_norm": 0.98046875,
+      "learning_rate": 1.1195857753776295e-08,
+      "loss": 0.7004,
+      "step": 14353
+    },
+    {
+      "epoch": 0.9985738634387283,
+      "grad_norm": 0.9609375,
+      "learning_rate": 1.0154974780962611e-08,
+      "loss": 0.6947,
+      "step": 14354
+    },
+    {
+      "epoch": 0.9986434310758635,
+      "grad_norm": 1.0234375,
+      "learning_rate": 9.164866252242554e-09,
+      "loss": 0.674,
+      "step": 14355
+    },
+    {
+      "epoch": 0.9987129987129987,
+      "grad_norm": 1.09375,
+      "learning_rate": 8.225532217687181e-09,
+      "loss": 0.8365,
+      "step": 14356
+    },
+    {
+      "epoch": 0.998782566350134,
+      "grad_norm": 1.1015625,
+      "learning_rate": 7.336972725147106e-09,
+      "loss": 0.6817,
+      "step": 14357
+    },
+    {
+      "epoch": 0.9988521339872691,
+      "grad_norm": 1.015625,
+      "learning_rate": 6.499187819808405e-09,
+      "loss": 0.6803,
+      "step": 14358
+    },
+    {
+      "epoch": 0.9989217016244043,
+      "grad_norm": 1.0390625,
+      "learning_rate": 5.712177543970576e-09,
+      "loss": 0.8999,
+      "step": 14359
+    },
+    {
+      "epoch": 0.9989912692615396,
+      "grad_norm": 1.015625,
+      "learning_rate": 4.975941937823691e-09,
+      "loss": 0.7464,
+      "step": 14360
+    },
+    {
+      "epoch": 0.9990608368986748,
+      "grad_norm": 1.8359375,
+      "learning_rate": 4.290481038560223e-09,
+      "loss": 0.9455,
+      "step": 14361
+    },
+    {
+      "epoch": 0.9991304045358099,
+      "grad_norm": 1.0,
+      "learning_rate": 3.655794881152197e-09,
+      "loss": 0.7925,
+      "step": 14362
+    },
+    {
+      "epoch": 0.9991999721729451,
+      "grad_norm": 1.0546875,
+      "learning_rate": 3.0718834976850575e-09,
+      "loss": 0.656,
+      "step": 14363
+    },
+    {
+      "epoch": 0.9992695398100804,
+      "grad_norm": 1.2109375,
+      "learning_rate": 2.538746917912782e-09,
+      "loss": 0.6469,
+      "step": 14364
+    },
+    {
+      "epoch": 0.9993391074472155,
+      "grad_norm": 1.265625,
+      "learning_rate": 2.0563851688137903e-09,
+      "loss": 0.8579,
+      "step": 14365
+    },
+    {
+      "epoch": 0.9994086750843507,
+      "grad_norm": 0.9609375,
+      "learning_rate": 1.624798274924011e-09,
+      "loss": 0.7825,
+      "step": 14366
+    },
+    {
+      "epoch": 0.999478242721486,
+      "grad_norm": 1.03125,
+      "learning_rate": 1.24398625822586e-09,
+      "loss": 0.834,
+      "step": 14367
+    },
+    {
+      "epoch": 0.9995478103586212,
+      "grad_norm": 1.34375,
+      "learning_rate": 9.139491379261955e-10,
+      "loss": 1.0088,
+      "step": 14368
+    },
+    {
+      "epoch": 0.9996173779957563,
+      "grad_norm": 1.359375,
+      "learning_rate": 6.346869309004078e-10,
+      "loss": 0.7419,
+      "step": 14369
+    },
+    {
+      "epoch": 0.9996869456328916,
+      "grad_norm": 1.015625,
+      "learning_rate": 4.06199651248329e-10,
+      "loss": 0.6799,
+      "step": 14370
+    },
+    {
+      "epoch": 0.9997565132700268,
+      "grad_norm": 1.046875,
+      "learning_rate": 2.284873106273011e-10,
+      "loss": 0.9677,
+      "step": 14371
+    },
+    {
+      "epoch": 0.999826080907162,
+      "grad_norm": 0.98046875,
+      "learning_rate": 1.0154991791910816e-10,
+      "loss": 0.6741,
+      "step": 14372
+    },
+    {
+      "epoch": 0.9998956485442972,
+      "grad_norm": 1.4453125,
+      "learning_rate": 2.5387479785088375e-11,
+      "loss": 0.7453,
+      "step": 14373
+    },
+    {
+      "epoch": 0.9999652161814324,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.0,
+      "loss": 1.1312,
+      "step": 14374
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 14374,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 1000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.7122538767292826e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}