diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,7821 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 5.0,
+  "eval_steps": 1000,
+  "global_step": 100000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.01,
+      "grad_norm": 3.153648614883423,
+      "learning_rate": 5.9999999999999995e-05,
+      "loss": 1.9475,
+      "step": 100
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 1.318032145500183,
+      "learning_rate": 0.00011999999999999999,
+      "loss": 1.7581,
+      "step": 200
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 1.7753959894180298,
+      "learning_rate": 0.00017999999999999998,
+      "loss": 1.7353,
+      "step": 300
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 1.262397289276123,
+      "learning_rate": 0.00023999999999999998,
+      "loss": 1.7634,
+      "step": 400
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 1.793628454208374,
+      "learning_rate": 0.0003,
+      "loss": 1.7828,
+      "step": 500
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 1.29915452003479,
+      "learning_rate": 0.00029969849246231153,
+      "loss": 1.7973,
+      "step": 600
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.3753058910369873,
+      "learning_rate": 0.0002993969849246231,
+      "loss": 1.7677,
+      "step": 700
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 3.032518148422241,
+      "learning_rate": 0.00029909547738693465,
+      "loss": 1.7775,
+      "step": 800
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.4150569438934326,
+      "learning_rate": 0.0002987939698492462,
+      "loss": 1.7872,
+      "step": 900
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 1.6522352695465088,
+      "learning_rate": 0.00029849246231155777,
+      "loss": 1.7433,
+      "step": 1000
+    },
+    {
+      "epoch": 0.05,
+      "eval_loss": 1.7443219423294067,
+      "eval_runtime": 37.3589,
+      "eval_samples_per_second": 26.767,
+      "eval_steps_per_second": 3.346,
+      "step": 1000
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 1.8125498294830322,
+      "learning_rate": 0.00029819095477386933,
+      "loss": 1.7885,
+      "step": 1100
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 1.4740030765533447,
+      "learning_rate": 0.0002978894472361809,
+      "loss": 1.7616,
+      "step": 1200
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 1.670320749282837,
+      "learning_rate": 0.00029758793969849245,
+      "loss": 1.7545,
+      "step": 1300
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 1.096781611442566,
+      "learning_rate": 0.000297286432160804,
+      "loss": 1.7072,
+      "step": 1400
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 1.8927794694900513,
+      "learning_rate": 0.0002969849246231155,
+      "loss": 1.7243,
+      "step": 1500
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.244074583053589,
+      "learning_rate": 0.00029668341708542713,
+      "loss": 1.7369,
+      "step": 1600
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 1.6228864192962646,
+      "learning_rate": 0.0002963819095477387,
+      "loss": 1.7541,
+      "step": 1700
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.9201287627220154,
+      "learning_rate": 0.00029608040201005025,
+      "loss": 1.7236,
+      "step": 1800
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 1.308199405670166,
+      "learning_rate": 0.00029577889447236175,
+      "loss": 1.7345,
+      "step": 1900
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 1.5356643199920654,
+      "learning_rate": 0.00029548040201005023,
+      "loss": 1.7256,
+      "step": 2000
+    },
+    {
+      "epoch": 0.1,
+      "eval_loss": 1.7340322732925415,
+      "eval_runtime": 37.5614,
+      "eval_samples_per_second": 26.623,
+      "eval_steps_per_second": 3.328,
+      "step": 2000
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.5252068042755127,
+      "learning_rate": 0.0002951788944723618,
+      "loss": 1.7454,
+      "step": 2100
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 1.5635749101638794,
+      "learning_rate": 0.00029487738693467335,
+      "loss": 1.7461,
+      "step": 2200
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 1.3426711559295654,
+      "learning_rate": 0.0002945758793969849,
+      "loss": 1.75,
+      "step": 2300
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 1.3829035758972168,
+      "learning_rate": 0.00029427437185929647,
+      "loss": 1.7554,
+      "step": 2400
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 1.2779866456985474,
+      "learning_rate": 0.00029397286432160803,
+      "loss": 1.7262,
+      "step": 2500
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 1.4913188219070435,
+      "learning_rate": 0.0002936713567839196,
+      "loss": 1.7333,
+      "step": 2600
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 1.5596439838409424,
+      "learning_rate": 0.00029336984924623115,
+      "loss": 1.728,
+      "step": 2700
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 1.1473088264465332,
+      "learning_rate": 0.0002930683417085427,
+      "loss": 1.7063,
+      "step": 2800
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 1.4872281551361084,
+      "learning_rate": 0.0002927668341708542,
+      "loss": 1.7417,
+      "step": 2900
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 1.3485779762268066,
+      "learning_rate": 0.0002924653266331658,
+      "loss": 1.7314,
+      "step": 3000
+    },
+    {
+      "epoch": 0.15,
+      "eval_loss": 1.680450677871704,
+      "eval_runtime": 38.1024,
+      "eval_samples_per_second": 26.245,
+      "eval_steps_per_second": 3.281,
+      "step": 3000
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 1.3290046453475952,
+      "learning_rate": 0.0002921638190954774,
+      "loss": 1.7101,
+      "step": 3100
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.9259174466133118,
+      "learning_rate": 0.00029186231155778895,
+      "loss": 1.75,
+      "step": 3200
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 1.4094349145889282,
+      "learning_rate": 0.00029156080402010045,
+      "loss": 1.7219,
+      "step": 3300
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 1.2119574546813965,
+      "learning_rate": 0.000291259296482412,
+      "loss": 1.741,
+      "step": 3400
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 1.1941887140274048,
+      "learning_rate": 0.0002909577889447236,
+      "loss": 1.7005,
+      "step": 3500
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 1.1339538097381592,
+      "learning_rate": 0.00029065628140703513,
+      "loss": 1.6965,
+      "step": 3600
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.2265231609344482,
+      "learning_rate": 0.0002903547738693467,
+      "loss": 1.701,
+      "step": 3700
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.011225938796997,
+      "learning_rate": 0.00029005326633165825,
+      "loss": 1.7241,
+      "step": 3800
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 1.3544070720672607,
+      "learning_rate": 0.00028975175879396986,
+      "loss": 1.7039,
+      "step": 3900
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 1.909501314163208,
+      "learning_rate": 0.00028945025125628137,
+      "loss": 1.676,
+      "step": 4000
+    },
+    {
+      "epoch": 0.2,
+      "eval_loss": 1.6920135021209717,
+      "eval_runtime": 37.5936,
+      "eval_samples_per_second": 26.6,
+      "eval_steps_per_second": 3.325,
+      "step": 4000
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 1.7186845541000366,
+      "learning_rate": 0.0002891487437185929,
+      "loss": 1.7,
+      "step": 4100
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.9834026098251343,
+      "learning_rate": 0.0002888502512562814,
+      "loss": 1.6639,
+      "step": 4200
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.9373641014099121,
+      "learning_rate": 0.0002885487437185929,
+      "loss": 1.6786,
+      "step": 4300
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 1.4471676349639893,
+      "learning_rate": 0.00028824723618090447,
+      "loss": 1.7199,
+      "step": 4400
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.9259161949157715,
+      "learning_rate": 0.0002879457286432161,
+      "loss": 1.7013,
+      "step": 4500
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 1.3199846744537354,
+      "learning_rate": 0.0002876442211055276,
+      "loss": 1.6865,
+      "step": 4600
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 1.6591229438781738,
+      "learning_rate": 0.00028734271356783915,
+      "loss": 1.6976,
+      "step": 4700
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 1.0676679611206055,
+      "learning_rate": 0.0002870412060301507,
+      "loss": 1.6921,
+      "step": 4800
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 1.3471635580062866,
+      "learning_rate": 0.0002867396984924623,
+      "loss": 1.6951,
+      "step": 4900
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 1.6372334957122803,
+      "learning_rate": 0.00028643819095477383,
+      "loss": 1.6951,
+      "step": 5000
+    },
+    {
+      "epoch": 0.25,
+      "eval_loss": 1.6992712020874023,
+      "eval_runtime": 37.5723,
+      "eval_samples_per_second": 26.615,
+      "eval_steps_per_second": 3.327,
+      "step": 5000
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 2.9456429481506348,
+      "learning_rate": 0.0002861366834170854,
+      "loss": 1.6708,
+      "step": 5100
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 1.8768843412399292,
+      "learning_rate": 0.00028583517587939695,
+      "loss": 1.701,
+      "step": 5200
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 1.4709163904190063,
+      "learning_rate": 0.0002855336683417085,
+      "loss": 1.7119,
+      "step": 5300
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 1.15412175655365,
+      "learning_rate": 0.00028523216080402007,
+      "loss": 1.6864,
+      "step": 5400
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.9388611316680908,
+      "learning_rate": 0.0002849306532663316,
+      "loss": 1.6781,
+      "step": 5500
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 2.746967315673828,
+      "learning_rate": 0.0002846291457286432,
+      "loss": 1.7134,
+      "step": 5600
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.7656490802764893,
+      "learning_rate": 0.00028432763819095474,
+      "loss": 1.6441,
+      "step": 5700
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 1.4275192022323608,
+      "learning_rate": 0.0002840261306532663,
+      "loss": 1.6893,
+      "step": 5800
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 1.8908779621124268,
+      "learning_rate": 0.00028372462311557786,
+      "loss": 1.6814,
+      "step": 5900
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 2.0805633068084717,
+      "learning_rate": 0.0002834231155778894,
+      "loss": 1.6765,
+      "step": 6000
+    },
+    {
+      "epoch": 0.3,
+      "eval_loss": 1.6724690198898315,
+      "eval_runtime": 37.9367,
+      "eval_samples_per_second": 26.36,
+      "eval_steps_per_second": 3.295,
+      "step": 6000
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.8758082389831543,
+      "learning_rate": 0.000283121608040201,
+      "loss": 1.6462,
+      "step": 6100
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 1.4686168432235718,
+      "learning_rate": 0.00028282010050251254,
+      "loss": 1.7117,
+      "step": 6200
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.857920527458191,
+      "learning_rate": 0.0002825185929648241,
+      "loss": 1.648,
+      "step": 6300
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.852232575416565,
+      "learning_rate": 0.00028221708542713566,
+      "loss": 1.7096,
+      "step": 6400
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.6206820011138916,
+      "learning_rate": 0.0002819155778894472,
+      "loss": 1.6379,
+      "step": 6500
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.9906002879142761,
+      "learning_rate": 0.0002816140703517588,
+      "loss": 1.6905,
+      "step": 6600
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.8640550374984741,
+      "learning_rate": 0.00028131256281407034,
+      "loss": 1.6842,
+      "step": 6700
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.5478594303131104,
+      "learning_rate": 0.00028101407035175876,
+      "loss": 1.6554,
+      "step": 6800
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.2689837217330933,
+      "learning_rate": 0.0002807125628140703,
+      "loss": 1.6775,
+      "step": 6900
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.8730539083480835,
+      "learning_rate": 0.0002804110552763819,
+      "loss": 1.6603,
+      "step": 7000
+    },
+    {
+      "epoch": 0.35,
+      "eval_loss": 1.6557646989822388,
+      "eval_runtime": 37.4609,
+      "eval_samples_per_second": 26.694,
+      "eval_steps_per_second": 3.337,
+      "step": 7000
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.455672025680542,
+      "learning_rate": 0.00028010954773869344,
+      "loss": 1.6635,
+      "step": 7100
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.6935358047485352,
+      "learning_rate": 0.000279808040201005,
+      "loss": 1.6796,
+      "step": 7200
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 2.0117626190185547,
+      "learning_rate": 0.00027950653266331656,
+      "loss": 1.6972,
+      "step": 7300
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.9937806129455566,
+      "learning_rate": 0.0002792050251256281,
+      "loss": 1.6666,
+      "step": 7400
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.1981546878814697,
+      "learning_rate": 0.0002789035175879397,
+      "loss": 1.665,
+      "step": 7500
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 2.0641427040100098,
+      "learning_rate": 0.00027860201005025124,
+      "loss": 1.6535,
+      "step": 7600
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.7236799001693726,
+      "learning_rate": 0.0002783005025125628,
+      "loss": 1.676,
+      "step": 7700
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 3.5143849849700928,
+      "learning_rate": 0.00027799899497487436,
+      "loss": 1.6521,
+      "step": 7800
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.7787047624588013,
+      "learning_rate": 0.0002776974874371859,
+      "loss": 1.6637,
+      "step": 7900
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.713392972946167,
+      "learning_rate": 0.0002773959798994975,
+      "loss": 1.6704,
+      "step": 8000
+    },
+    {
+      "epoch": 0.4,
+      "eval_loss": 1.656675100326538,
+      "eval_runtime": 37.9526,
+      "eval_samples_per_second": 26.349,
+      "eval_steps_per_second": 3.294,
+      "step": 8000
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.1325870752334595,
+      "learning_rate": 0.00027709447236180904,
+      "loss": 1.6118,
+      "step": 8100
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.746856689453125,
+      "learning_rate": 0.0002767929648241206,
+      "loss": 1.6718,
+      "step": 8200
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.4181280136108398,
+      "learning_rate": 0.0002764914572864321,
+      "loss": 1.6957,
+      "step": 8300
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 2.078387975692749,
+      "learning_rate": 0.0002761899497487437,
+      "loss": 1.6546,
+      "step": 8400
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 2.694249153137207,
+      "learning_rate": 0.0002758884422110553,
+      "loss": 1.6875,
+      "step": 8500
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 2.530956745147705,
+      "learning_rate": 0.00027558693467336683,
+      "loss": 1.6112,
+      "step": 8600
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 1.4322137832641602,
+      "learning_rate": 0.00027528542713567834,
+      "loss": 1.6572,
+      "step": 8700
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.996591329574585,
+      "learning_rate": 0.0002749839195979899,
+      "loss": 1.6442,
+      "step": 8800
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.6497749090194702,
+      "learning_rate": 0.0002746824120603015,
+      "loss": 1.6404,
+      "step": 8900
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 2.429196357727051,
+      "learning_rate": 0.000274380904522613,
+      "loss": 1.656,
+      "step": 9000
+    },
+    {
+      "epoch": 0.45,
+      "eval_loss": 1.6714575290679932,
+      "eval_runtime": 37.5615,
+      "eval_samples_per_second": 26.623,
+      "eval_steps_per_second": 3.328,
+      "step": 9000
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.9687163829803467,
+      "learning_rate": 0.0002740824120603015,
+      "loss": 1.6644,
+      "step": 9100
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.4858875274658203,
+      "learning_rate": 0.00027378090452261306,
+      "loss": 1.6434,
+      "step": 9200
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.8358262777328491,
+      "learning_rate": 0.0002734793969849246,
+      "loss": 1.6565,
+      "step": 9300
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.3822225332260132,
+      "learning_rate": 0.0002731778894472361,
+      "loss": 1.6506,
+      "step": 9400
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.9319818019866943,
+      "learning_rate": 0.00027287638190954774,
+      "loss": 1.6256,
+      "step": 9500
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 2.601515293121338,
+      "learning_rate": 0.0002725748743718593,
+      "loss": 1.6243,
+      "step": 9600
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.610561490058899,
+      "learning_rate": 0.0002722733668341708,
+      "loss": 1.6259,
+      "step": 9700
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 2.060863494873047,
+      "learning_rate": 0.00027197185929648236,
+      "loss": 1.655,
+      "step": 9800
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 2.7591397762298584,
+      "learning_rate": 0.000271670351758794,
+      "loss": 1.6342,
+      "step": 9900
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.2489566802978516,
+      "learning_rate": 0.00027136884422110553,
+      "loss": 1.6431,
+      "step": 10000
+    },
+    {
+      "epoch": 0.5,
+      "eval_loss": 1.6347763538360596,
+      "eval_runtime": 37.5252,
+      "eval_samples_per_second": 26.649,
+      "eval_steps_per_second": 3.331,
+      "step": 10000
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.2953132390975952,
+      "learning_rate": 0.00027106733668341704,
+      "loss": 1.6872,
+      "step": 10100
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.9919564723968506,
+      "learning_rate": 0.0002707658291457286,
+      "loss": 1.6576,
+      "step": 10200
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.3081834316253662,
+      "learning_rate": 0.0002704643216080402,
+      "loss": 1.6354,
+      "step": 10300
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.605245590209961,
+      "learning_rate": 0.0002701628140703517,
+      "loss": 1.6687,
+      "step": 10400
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 2.1541988849639893,
+      "learning_rate": 0.0002698613065326633,
+      "loss": 1.6631,
+      "step": 10500
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.8258408308029175,
+      "learning_rate": 0.00026955979899497484,
+      "loss": 1.6633,
+      "step": 10600
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.3377407789230347,
+      "learning_rate": 0.00026925829145728645,
+      "loss": 1.6544,
+      "step": 10700
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.998458206653595,
+      "learning_rate": 0.00026895678391959795,
+      "loss": 1.6126,
+      "step": 10800
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.3561229705810547,
+      "learning_rate": 0.0002686552763819095,
+      "loss": 1.6614,
+      "step": 10900
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 2.4729514122009277,
+      "learning_rate": 0.0002683537688442211,
+      "loss": 1.6439,
+      "step": 11000
+    },
+    {
+      "epoch": 0.55,
+      "eval_loss": 1.648992657661438,
+      "eval_runtime": 37.818,
+      "eval_samples_per_second": 26.442,
+      "eval_steps_per_second": 3.305,
+      "step": 11000
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.2856806516647339,
+      "learning_rate": 0.00026805226130653263,
+      "loss": 1.605,
+      "step": 11100
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.7708286046981812,
+      "learning_rate": 0.0002677507537688442,
+      "loss": 1.6257,
+      "step": 11200
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 2.085149049758911,
+      "learning_rate": 0.00026744924623115575,
+      "loss": 1.6347,
+      "step": 11300
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.9750702977180481,
+      "learning_rate": 0.0002671477386934673,
+      "loss": 1.6496,
+      "step": 11400
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.9253658056259155,
+      "learning_rate": 0.00026684623115577887,
+      "loss": 1.6395,
+      "step": 11500
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 2.441312313079834,
+      "learning_rate": 0.00026654472361809043,
+      "loss": 1.6444,
+      "step": 11600
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.259020447731018,
+      "learning_rate": 0.000266243216080402,
+      "loss": 1.6114,
+      "step": 11700
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.3337092399597168,
+      "learning_rate": 0.00026594170854271355,
+      "loss": 1.6243,
+      "step": 11800
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.423687219619751,
+      "learning_rate": 0.0002656402010050251,
+      "loss": 1.623,
+      "step": 11900
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.2547069787979126,
+      "learning_rate": 0.00026533869346733667,
+      "loss": 1.631,
+      "step": 12000
+    },
+    {
+      "epoch": 0.6,
+      "eval_loss": 1.6530547142028809,
+      "eval_runtime": 37.4898,
+      "eval_samples_per_second": 26.674,
+      "eval_steps_per_second": 3.334,
+      "step": 12000
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.1078051328659058,
+      "learning_rate": 0.00026503718592964823,
+      "loss": 1.6229,
+      "step": 12100
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.2142789363861084,
+      "learning_rate": 0.0002647356783919598,
+      "loss": 1.6251,
+      "step": 12200
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.5933152437210083,
+      "learning_rate": 0.00026443417085427135,
+      "loss": 1.627,
+      "step": 12300
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.6759315729141235,
+      "learning_rate": 0.0002641326633165829,
+      "loss": 1.6294,
+      "step": 12400
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.487029790878296,
+      "learning_rate": 0.00026383115577889447,
+      "loss": 1.6406,
+      "step": 12500
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 2.004643201828003,
+      "learning_rate": 0.000263529648241206,
+      "loss": 1.6342,
+      "step": 12600
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.94633948802948,
+      "learning_rate": 0.00026322814070351753,
+      "loss": 1.6279,
+      "step": 12700
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.5213886499404907,
+      "learning_rate": 0.00026292663316582914,
+      "loss": 1.6023,
+      "step": 12800
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 1.4710832834243774,
+      "learning_rate": 0.0002626251256281407,
+      "loss": 1.6202,
+      "step": 12900
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 2.367037296295166,
+      "learning_rate": 0.00026232361809045226,
+      "loss": 1.6463,
+      "step": 13000
+    },
+    {
+      "epoch": 0.65,
+      "eval_loss": 1.6226933002471924,
+      "eval_runtime": 37.4429,
+      "eval_samples_per_second": 26.707,
+      "eval_steps_per_second": 3.338,
+      "step": 13000
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 2.803264856338501,
+      "learning_rate": 0.0002620251256281407,
+      "loss": 1.6219,
+      "step": 13100
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 1.5915918350219727,
+      "learning_rate": 0.00026172361809045225,
+      "loss": 1.6252,
+      "step": 13200
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.5102565288543701,
+      "learning_rate": 0.0002614221105527638,
+      "loss": 1.6346,
+      "step": 13300
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.8278179168701172,
+      "learning_rate": 0.00026112060301507537,
+      "loss": 1.6386,
+      "step": 13400
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.6756057739257812,
+      "learning_rate": 0.0002608190954773869,
+      "loss": 1.6135,
+      "step": 13500
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.4984675645828247,
+      "learning_rate": 0.0002605175879396985,
+      "loss": 1.6316,
+      "step": 13600
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 2.2536373138427734,
+      "learning_rate": 0.00026021608040201005,
+      "loss": 1.6317,
+      "step": 13700
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.3149595260620117,
+      "learning_rate": 0.0002599145728643216,
+      "loss": 1.6311,
+      "step": 13800
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.662287712097168,
+      "learning_rate": 0.00025961306532663316,
+      "loss": 1.6229,
+      "step": 13900
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.2096275091171265,
+      "learning_rate": 0.0002593115577889447,
+      "loss": 1.6305,
+      "step": 14000
+    },
+    {
+      "epoch": 0.7,
+      "eval_loss": 1.632125735282898,
+      "eval_runtime": 37.4575,
+      "eval_samples_per_second": 26.697,
+      "eval_steps_per_second": 3.337,
+      "step": 14000
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 2.01643705368042,
+      "learning_rate": 0.00025901005025125623,
+      "loss": 1.6271,
+      "step": 14100
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.2067662477493286,
+      "learning_rate": 0.0002587085427135678,
+      "loss": 1.6082,
+      "step": 14200
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 2.0471389293670654,
+      "learning_rate": 0.0002584070351758794,
+      "loss": 1.6346,
+      "step": 14300
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.1723861694335938,
+      "learning_rate": 0.0002581055276381909,
+      "loss": 1.6449,
+      "step": 14400
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.3024895191192627,
+      "learning_rate": 0.00025780402010050247,
+      "loss": 1.6159,
+      "step": 14500
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.6715235710144043,
+      "learning_rate": 0.000257502512562814,
+      "loss": 1.6096,
+      "step": 14600
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 2.116154432296753,
+      "learning_rate": 0.00025720100502512564,
+      "loss": 1.5984,
+      "step": 14700
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.2046904563903809,
+      "learning_rate": 0.00025689949748743714,
+      "loss": 1.6125,
+      "step": 14800
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.5058480501174927,
+      "learning_rate": 0.0002565979899497487,
+      "loss": 1.6164,
+      "step": 14900
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 2.2814691066741943,
+      "learning_rate": 0.00025629648241206026,
+      "loss": 1.6312,
+      "step": 15000
+    },
+    {
+      "epoch": 0.75,
+      "eval_loss": 1.6935200691223145,
+      "eval_runtime": 37.4213,
+      "eval_samples_per_second": 26.723,
+      "eval_steps_per_second": 3.34,
+      "step": 15000
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.5283032655715942,
+      "learning_rate": 0.0002559979899497487,
+      "loss": 1.6201,
+      "step": 15100
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 2.6960535049438477,
+      "learning_rate": 0.00025569648241206025,
+      "loss": 1.6062,
+      "step": 15200
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.080701470375061,
+      "learning_rate": 0.00025539798994974873,
+      "loss": 1.622,
+      "step": 15300
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.6446688175201416,
+      "learning_rate": 0.0002550964824120603,
+      "loss": 1.6075,
+      "step": 15400
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.9143463373184204,
+      "learning_rate": 0.00025479497487437185,
+      "loss": 1.6363,
+      "step": 15500
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.6148111820220947,
+      "learning_rate": 0.0002544934673366834,
+      "loss": 1.6367,
+      "step": 15600
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.7216590642929077,
+      "learning_rate": 0.00025419195979899497,
+      "loss": 1.5997,
+      "step": 15700
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.442865014076233,
+      "learning_rate": 0.00025389045226130647,
+      "loss": 1.5776,
+      "step": 15800
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.9140738844871521,
+      "learning_rate": 0.0002535889447236181,
+      "loss": 1.6275,
+      "step": 15900
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 2.099306583404541,
+      "learning_rate": 0.00025328743718592965,
+      "loss": 1.6072,
+      "step": 16000
+    },
+    {
+      "epoch": 0.8,
+      "eval_loss": 1.5524722337722778,
+      "eval_runtime": 37.4031,
+      "eval_samples_per_second": 26.736,
+      "eval_steps_per_second": 3.342,
+      "step": 16000
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.560035228729248,
+      "learning_rate": 0.0002529859296482412,
+      "loss": 1.5983,
+      "step": 16100
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.8614755868911743,
+      "learning_rate": 0.0002526844221105527,
+      "loss": 1.6271,
+      "step": 16200
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.7034022808074951,
+      "learning_rate": 0.0002523829145728643,
+      "loss": 1.6238,
+      "step": 16300
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.086572527885437,
+      "learning_rate": 0.0002520814070351759,
+      "loss": 1.6241,
+      "step": 16400
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.8860014081001282,
+      "learning_rate": 0.0002517798994974874,
+      "loss": 1.562,
+      "step": 16500
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.7819429636001587,
+      "learning_rate": 0.00025147839195979895,
+      "loss": 1.5995,
+      "step": 16600
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 3.512892246246338,
+      "learning_rate": 0.00025117688442211056,
+      "loss": 1.5866,
+      "step": 16700
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.5962600708007812,
+      "learning_rate": 0.0002508753768844221,
+      "loss": 1.6358,
+      "step": 16800
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.7714247703552246,
+      "learning_rate": 0.0002505738693467336,
+      "loss": 1.6043,
+      "step": 16900
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.3199384212493896,
+      "learning_rate": 0.0002502723618090452,
+      "loss": 1.6102,
+      "step": 17000
+    },
+    {
+      "epoch": 0.85,
+      "eval_loss": 1.577386736869812,
+      "eval_runtime": 37.4173,
+      "eval_samples_per_second": 26.726,
+      "eval_steps_per_second": 3.341,
+      "step": 17000
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.777269721031189,
+      "learning_rate": 0.00024997085427135675,
+      "loss": 1.6064,
+      "step": 17100
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 2.2964231967926025,
+      "learning_rate": 0.0002496693467336683,
+      "loss": 1.5964,
+      "step": 17200
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.7127012014389038,
+      "learning_rate": 0.00024936783919597986,
+      "loss": 1.6334,
+      "step": 17300
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.442181944847107,
+      "learning_rate": 0.0002490663316582914,
+      "loss": 1.5679,
+      "step": 17400
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.7092599868774414,
+      "learning_rate": 0.000248764824120603,
+      "loss": 1.6125,
+      "step": 17500
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.8017587661743164,
+      "learning_rate": 0.00024846331658291454,
+      "loss": 1.6386,
+      "step": 17600
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 3.3435380458831787,
+      "learning_rate": 0.0002481618090452261,
+      "loss": 1.5632,
+      "step": 17700
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.2036772966384888,
+      "learning_rate": 0.00024786030150753766,
+      "loss": 1.5972,
+      "step": 17800
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 2.212369203567505,
+      "learning_rate": 0.0002475587939698492,
+      "loss": 1.5777,
+      "step": 17900
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.9149020910263062,
+      "learning_rate": 0.0002472572864321608,
+      "loss": 1.601,
+      "step": 18000
+    },
+    {
+      "epoch": 0.9,
+      "eval_loss": 1.5726821422576904,
+      "eval_runtime": 37.4487,
+      "eval_samples_per_second": 26.703,
+      "eval_steps_per_second": 3.338,
+      "step": 18000
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 2.2173101902008057,
+      "learning_rate": 0.00024695577889447234,
+      "loss": 1.5975,
+      "step": 18100
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 2.0151243209838867,
+      "learning_rate": 0.0002466542713567839,
+      "loss": 1.5975,
+      "step": 18200
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.6275769472122192,
+      "learning_rate": 0.00024635276381909546,
+      "loss": 1.5909,
+      "step": 18300
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.8248587846755981,
+      "learning_rate": 0.000246051256281407,
+      "loss": 1.5954,
+      "step": 18400
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 2.119255542755127,
+      "learning_rate": 0.0002457497487437186,
+      "loss": 1.5651,
+      "step": 18500
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.9526431560516357,
+      "learning_rate": 0.00024544824120603014,
+      "loss": 1.6184,
+      "step": 18600
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 2.407723903656006,
+      "learning_rate": 0.0002451467336683417,
+      "loss": 1.6181,
+      "step": 18700
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.526853084564209,
+      "learning_rate": 0.00024484522613065326,
+      "loss": 1.6172,
+      "step": 18800
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.4161404371261597,
+      "learning_rate": 0.0002445437185929648,
+      "loss": 1.6154,
+      "step": 18900
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.3028178215026855,
+      "learning_rate": 0.0002442422110552764,
+      "loss": 1.6053,
+      "step": 19000
+    },
+    {
+      "epoch": 0.95,
+      "eval_loss": 1.5818778276443481,
+      "eval_runtime": 37.4111,
+      "eval_samples_per_second": 26.73,
+      "eval_steps_per_second": 3.341,
+      "step": 19000
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.6454105377197266,
+      "learning_rate": 0.0002439407035175879,
+      "loss": 1.6149,
+      "step": 19100
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.664665699005127,
+      "learning_rate": 0.00024364221105527636,
+      "loss": 1.6178,
+      "step": 19200
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.53481125831604,
+      "learning_rate": 0.00024334070351758792,
+      "loss": 1.5858,
+      "step": 19300
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 6.695281982421875,
+      "learning_rate": 0.00024303919597989948,
+      "loss": 1.5717,
+      "step": 19400
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.6920980215072632,
+      "learning_rate": 0.00024273768844221104,
+      "loss": 1.5754,
+      "step": 19500
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 2.0135791301727295,
+      "learning_rate": 0.0002424361809045226,
+      "loss": 1.6219,
+      "step": 19600
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.3980337381362915,
+      "learning_rate": 0.00024213467336683413,
+      "loss": 1.5913,
+      "step": 19700
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.8416180610656738,
+      "learning_rate": 0.0002418331658291457,
+      "loss": 1.5691,
+      "step": 19800
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.366356372833252,
+      "learning_rate": 0.00024153165829145728,
+      "loss": 1.5804,
+      "step": 19900
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.3517309427261353,
+      "learning_rate": 0.00024123015075376884,
+      "loss": 1.5603,
+      "step": 20000
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 1.6314821243286133,
+      "eval_runtime": 37.4553,
+      "eval_samples_per_second": 26.698,
+      "eval_steps_per_second": 3.337,
+      "step": 20000
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.8194776773452759,
+      "learning_rate": 0.00024092864321608037,
+      "loss": 1.5964,
+      "step": 20100
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 3.187936305999756,
+      "learning_rate": 0.00024062713567839193,
+      "loss": 1.5926,
+      "step": 20200
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 1.2052127122879028,
+      "learning_rate": 0.00024032562814070351,
+      "loss": 1.5727,
+      "step": 20300
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 1.6609078645706177,
+      "learning_rate": 0.00024002412060301505,
+      "loss": 1.5967,
+      "step": 20400
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 1.502246379852295,
+      "learning_rate": 0.0002397226130653266,
+      "loss": 1.5708,
+      "step": 20500
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 2.180826425552368,
+      "learning_rate": 0.00023942110552763817,
+      "loss": 1.5688,
+      "step": 20600
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 2.257434606552124,
+      "learning_rate": 0.00023911959798994975,
+      "loss": 1.5263,
+      "step": 20700
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 1.9630309343338013,
+      "learning_rate": 0.00023881809045226128,
+      "loss": 1.5776,
+      "step": 20800
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 2.248621940612793,
+      "learning_rate": 0.00023851658291457284,
+      "loss": 1.6,
+      "step": 20900
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 1.0489450693130493,
+      "learning_rate": 0.0002382150753768844,
+      "loss": 1.5699,
+      "step": 21000
+    },
+    {
+      "epoch": 1.05,
+      "eval_loss": 1.5522246360778809,
+      "eval_runtime": 37.4533,
+      "eval_samples_per_second": 26.7,
+      "eval_steps_per_second": 3.337,
+      "step": 21000
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 1.457306981086731,
+      "learning_rate": 0.00023791356783919596,
+      "loss": 1.5953,
+      "step": 21100
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 2.088200807571411,
+      "learning_rate": 0.00023761206030150752,
+      "loss": 1.5881,
+      "step": 21200
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 1.4078480005264282,
+      "learning_rate": 0.00023731356783919598,
+      "loss": 1.5746,
+      "step": 21300
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 1.5498270988464355,
+      "learning_rate": 0.00023701206030150753,
+      "loss": 1.5878,
+      "step": 21400
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 1.8573285341262817,
+      "learning_rate": 0.00023671055276381907,
+      "loss": 1.574,
+      "step": 21500
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 1.3603477478027344,
+      "learning_rate": 0.00023640904522613063,
+      "loss": 1.5994,
+      "step": 21600
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 1.6878479719161987,
+      "learning_rate": 0.0002361075376884422,
+      "loss": 1.63,
+      "step": 21700
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 1.879296064376831,
+      "learning_rate": 0.00023580603015075375,
+      "loss": 1.6313,
+      "step": 21800
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 1.695983648300171,
+      "learning_rate": 0.0002355045226130653,
+      "loss": 1.5837,
+      "step": 21900
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 2.6149425506591797,
+      "learning_rate": 0.00023520301507537686,
+      "loss": 1.5967,
+      "step": 22000
+    },
+    {
+      "epoch": 1.1,
+      "eval_loss": 1.6031874418258667,
+      "eval_runtime": 37.522,
+      "eval_samples_per_second": 26.651,
+      "eval_steps_per_second": 3.331,
+      "step": 22000
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 2.0208663940429688,
+      "learning_rate": 0.00023490150753768845,
+      "loss": 1.5691,
+      "step": 22100
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 2.202256202697754,
+      "learning_rate": 0.00023459999999999998,
+      "loss": 1.5815,
+      "step": 22200
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 1.9692652225494385,
+      "learning_rate": 0.00023429849246231154,
+      "loss": 1.5854,
+      "step": 22300
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 1.9030089378356934,
+      "learning_rate": 0.00023399698492462308,
+      "loss": 1.5937,
+      "step": 22400
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 2.265805244445801,
+      "learning_rate": 0.00023369547738693463,
+      "loss": 1.5409,
+      "step": 22500
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 1.9824811220169067,
+      "learning_rate": 0.00023339396984924622,
+      "loss": 1.5977,
+      "step": 22600
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 1.3028334379196167,
+      "learning_rate": 0.00023309246231155778,
+      "loss": 1.573,
+      "step": 22700
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 2.472731590270996,
+      "learning_rate": 0.0002327909547738693,
+      "loss": 1.5962,
+      "step": 22800
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 1.1267619132995605,
+      "learning_rate": 0.00023248944723618087,
+      "loss": 1.5871,
+      "step": 22900
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 1.7546107769012451,
+      "learning_rate": 0.00023218793969849246,
+      "loss": 1.6053,
+      "step": 23000
+    },
+    {
+      "epoch": 1.15,
+      "eval_loss": 1.5985630750656128,
+      "eval_runtime": 37.3462,
+      "eval_samples_per_second": 26.777,
+      "eval_steps_per_second": 3.347,
+      "step": 23000
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 1.7325044870376587,
+      "learning_rate": 0.000231886432160804,
+      "loss": 1.557,
+      "step": 23100
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 1.771462321281433,
+      "learning_rate": 0.00023158492462311555,
+      "loss": 1.6102,
+      "step": 23200
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 1.077540397644043,
+      "learning_rate": 0.0002312834170854271,
+      "loss": 1.5607,
+      "step": 23300
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 1.2537496089935303,
+      "learning_rate": 0.0002309819095477387,
+      "loss": 1.5801,
+      "step": 23400
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 1.5025357007980347,
+      "learning_rate": 0.00023068040201005023,
+      "loss": 1.583,
+      "step": 23500
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 1.2420939207077026,
+      "learning_rate": 0.0002303788944723618,
+      "loss": 1.569,
+      "step": 23600
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 2.900120735168457,
+      "learning_rate": 0.00023007738693467335,
+      "loss": 1.5561,
+      "step": 23700
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 1.6182540655136108,
+      "learning_rate": 0.0002297758793969849,
+      "loss": 1.5673,
+      "step": 23800
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 1.5824536085128784,
+      "learning_rate": 0.00022947437185929647,
+      "loss": 1.5603,
+      "step": 23900
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 1.1365340948104858,
+      "learning_rate": 0.00022917286432160803,
+      "loss": 1.571,
+      "step": 24000
+    },
+    {
+      "epoch": 1.2,
+      "eval_loss": 1.5782357454299927,
+      "eval_runtime": 37.4619,
+      "eval_samples_per_second": 26.694,
+      "eval_steps_per_second": 3.337,
+      "step": 24000
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 1.8765733242034912,
+      "learning_rate": 0.00022887135678391956,
+      "loss": 1.5628,
+      "step": 24100
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 2.7541840076446533,
+      "learning_rate": 0.00022856984924623115,
+      "loss": 1.5967,
+      "step": 24200
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 1.2741538286209106,
+      "learning_rate": 0.0002282683417085427,
+      "loss": 1.5142,
+      "step": 24300
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 1.5907307863235474,
+      "learning_rate": 0.00022796683417085426,
+      "loss": 1.5685,
+      "step": 24400
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 2.187331438064575,
+      "learning_rate": 0.0002276653266331658,
+      "loss": 1.5788,
+      "step": 24500
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 2.0679118633270264,
+      "learning_rate": 0.00022736381909547736,
+      "loss": 1.5585,
+      "step": 24600
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 1.2467107772827148,
+      "learning_rate": 0.00022706231155778894,
+      "loss": 1.5531,
+      "step": 24700
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 1.334625005722046,
+      "learning_rate": 0.00022676080402010047,
+      "loss": 1.5831,
+      "step": 24800
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 1.466030240058899,
+      "learning_rate": 0.00022645929648241203,
+      "loss": 1.5753,
+      "step": 24900
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 2.8223164081573486,
+      "learning_rate": 0.0002261577889447236,
+      "loss": 1.6131,
+      "step": 25000
+    },
+    {
+      "epoch": 1.25,
+      "eval_loss": 1.5768604278564453,
+      "eval_runtime": 37.5229,
+      "eval_samples_per_second": 26.65,
+      "eval_steps_per_second": 3.331,
+      "step": 25000
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 1.1590194702148438,
+      "learning_rate": 0.00022585628140703518,
+      "loss": 1.5831,
+      "step": 25100
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 1.1502777338027954,
+      "learning_rate": 0.00022555778894472358,
+      "loss": 1.5673,
+      "step": 25200
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 1.8671566247940063,
+      "learning_rate": 0.00022525628140703517,
+      "loss": 1.5632,
+      "step": 25300
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 2.4084360599517822,
+      "learning_rate": 0.00022495477386934673,
+      "loss": 1.5727,
+      "step": 25400
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 1.3032163381576538,
+      "learning_rate": 0.00022465326633165826,
+      "loss": 1.5754,
+      "step": 25500
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 2.2420716285705566,
+      "learning_rate": 0.00022435175879396982,
+      "loss": 1.5802,
+      "step": 25600
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 1.3116205930709839,
+      "learning_rate": 0.0002240502512562814,
+      "loss": 1.5744,
+      "step": 25700
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 1.0008848905563354,
+      "learning_rate": 0.00022374874371859294,
+      "loss": 1.5423,
+      "step": 25800
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 2.07833194732666,
+      "learning_rate": 0.0002234472361809045,
+      "loss": 1.5657,
+      "step": 25900
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 1.894939661026001,
+      "learning_rate": 0.00022314572864321605,
+      "loss": 1.5464,
+      "step": 26000
+    },
+    {
+      "epoch": 1.3,
+      "eval_loss": 1.568109154701233,
+      "eval_runtime": 37.5055,
+      "eval_samples_per_second": 26.663,
+      "eval_steps_per_second": 3.333,
+      "step": 26000
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 2.3820385932922363,
+      "learning_rate": 0.00022284422110552764,
+      "loss": 1.562,
+      "step": 26100
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 1.6219903230667114,
+      "learning_rate": 0.00022254271356783917,
+      "loss": 1.5683,
+      "step": 26200
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 1.721933126449585,
+      "learning_rate": 0.00022224120603015073,
+      "loss": 1.5646,
+      "step": 26300
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 1.8133726119995117,
+      "learning_rate": 0.0002219396984924623,
+      "loss": 1.5574,
+      "step": 26400
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 1.871902585029602,
+      "learning_rate": 0.00022163819095477385,
+      "loss": 1.5728,
+      "step": 26500
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 2.7765748500823975,
+      "learning_rate": 0.0002213366834170854,
+      "loss": 1.5768,
+      "step": 26600
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 1.848578691482544,
+      "learning_rate": 0.00022103819095477386,
+      "loss": 1.5721,
+      "step": 26700
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 1.944535732269287,
+      "learning_rate": 0.00022073668341708542,
+      "loss": 1.5722,
+      "step": 26800
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 1.5262278318405151,
+      "learning_rate": 0.00022043517587939696,
+      "loss": 1.5642,
+      "step": 26900
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 2.425851821899414,
+      "learning_rate": 0.00022013366834170852,
+      "loss": 1.5441,
+      "step": 27000
+    },
+    {
+      "epoch": 1.35,
+      "eval_loss": 1.5537927150726318,
+      "eval_runtime": 37.516,
+      "eval_samples_per_second": 26.655,
+      "eval_steps_per_second": 3.332,
+      "step": 27000
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 1.2145837545394897,
+      "learning_rate": 0.0002198321608040201,
+      "loss": 1.5994,
+      "step": 27100
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 2.361107349395752,
+      "learning_rate": 0.00021953065326633163,
+      "loss": 1.5551,
+      "step": 27200
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 1.1575865745544434,
+      "learning_rate": 0.0002192291457286432,
+      "loss": 1.5749,
+      "step": 27300
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 1.6274192333221436,
+      "learning_rate": 0.00021892763819095475,
+      "loss": 1.5729,
+      "step": 27400
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 1.660609245300293,
+      "learning_rate": 0.00021862613065326629,
+      "loss": 1.577,
+      "step": 27500
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 2.8537380695343018,
+      "learning_rate": 0.00021832462311557787,
+      "loss": 1.5962,
+      "step": 27600
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 7.63838529586792,
+      "learning_rate": 0.00021802311557788943,
+      "loss": 1.5462,
+      "step": 27700
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 3.0265047550201416,
+      "learning_rate": 0.000217721608040201,
+      "loss": 1.6263,
+      "step": 27800
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.552452802658081,
+      "learning_rate": 0.00021742010050251252,
+      "loss": 1.5683,
+      "step": 27900
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.2861206531524658,
+      "learning_rate": 0.0002171185929648241,
+      "loss": 1.5776,
+      "step": 28000
+    },
+    {
+      "epoch": 1.4,
+      "eval_loss": 1.5543726682662964,
+      "eval_runtime": 37.5463,
+      "eval_samples_per_second": 26.634,
+      "eval_steps_per_second": 3.329,
+      "step": 28000
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 2.410630941390991,
+      "learning_rate": 0.00021681708542713567,
+      "loss": 1.5513,
+      "step": 28100
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 1.8043034076690674,
+      "learning_rate": 0.0002165155778894472,
+      "loss": 1.594,
+      "step": 28200
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 1.8414337635040283,
+      "learning_rate": 0.00021621407035175876,
+      "loss": 1.5547,
+      "step": 28300
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 1.9315450191497803,
+      "learning_rate": 0.00021591256281407035,
+      "loss": 1.5387,
+      "step": 28400
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 2.3810365200042725,
+      "learning_rate": 0.0002156110552763819,
+      "loss": 1.564,
+      "step": 28500
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 1.4747709035873413,
+      "learning_rate": 0.00021530954773869344,
+      "loss": 1.5325,
+      "step": 28600
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 1.264954924583435,
+      "learning_rate": 0.000215008040201005,
+      "loss": 1.5716,
+      "step": 28700
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 2.2243235111236572,
+      "learning_rate": 0.00021470653266331659,
+      "loss": 1.5388,
+      "step": 28800
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 4.794454097747803,
+      "learning_rate": 0.00021440502512562812,
+      "loss": 1.5404,
+      "step": 28900
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 2.5027246475219727,
+      "learning_rate": 0.00021410351758793968,
+      "loss": 1.5592,
+      "step": 29000
+    },
+    {
+      "epoch": 1.45,
+      "eval_loss": 1.569115400314331,
+      "eval_runtime": 37.5972,
+      "eval_samples_per_second": 26.598,
+      "eval_steps_per_second": 3.325,
+      "step": 29000
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 2.8168818950653076,
+      "learning_rate": 0.00021380201005025124,
+      "loss": 1.5781,
+      "step": 29100
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 1.4402812719345093,
+      "learning_rate": 0.0002135005025125628,
+      "loss": 1.5626,
+      "step": 29200
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 1.5837584733963013,
+      "learning_rate": 0.00021319899497487436,
+      "loss": 1.6064,
+      "step": 29300
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 1.435542106628418,
+      "learning_rate": 0.00021289748743718592,
+      "loss": 1.5621,
+      "step": 29400
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 1.8227218389511108,
+      "learning_rate": 0.00021259597989949745,
+      "loss": 1.5481,
+      "step": 29500
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 1.9331644773483276,
+      "learning_rate": 0.00021229447236180903,
+      "loss": 1.5811,
+      "step": 29600
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 2.971740484237671,
+      "learning_rate": 0.0002119929648241206,
+      "loss": 1.5412,
+      "step": 29700
+    },
+    {
+      "epoch": 1.49,
+      "grad_norm": 3.569145917892456,
+      "learning_rate": 0.00021169145728643215,
+      "loss": 1.5508,
+      "step": 29800
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 2.1709325313568115,
+      "learning_rate": 0.00021138994974874369,
+      "loss": 1.5802,
+      "step": 29900
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 1.8602783679962158,
+      "learning_rate": 0.00021108844221105524,
+      "loss": 1.5607,
+      "step": 30000
+    },
+    {
+      "epoch": 1.5,
+      "eval_loss": 1.5550180673599243,
+      "eval_runtime": 37.5792,
+      "eval_samples_per_second": 26.61,
+      "eval_steps_per_second": 3.326,
+      "step": 30000
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 1.3990256786346436,
+      "learning_rate": 0.00021078693467336683,
+      "loss": 1.5611,
+      "step": 30100
+    },
+    {
+      "epoch": 1.51,
+      "grad_norm": 4.223874568939209,
+      "learning_rate": 0.00021048542713567836,
+      "loss": 1.6039,
+      "step": 30200
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 1.763484001159668,
+      "learning_rate": 0.00021018391959798992,
+      "loss": 1.5938,
+      "step": 30300
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 2.2013938426971436,
+      "learning_rate": 0.00020988241206030148,
+      "loss": 1.5883,
+      "step": 30400
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 2.446477174758911,
+      "learning_rate": 0.00020958391959798994,
+      "loss": 1.5289,
+      "step": 30500
+    },
+    {
+      "epoch": 1.53,
+      "grad_norm": 1.6682789325714111,
+      "learning_rate": 0.00020928241206030147,
+      "loss": 1.5689,
+      "step": 30600
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 1.8999930620193481,
+      "learning_rate": 0.00020898090452261305,
+      "loss": 1.568,
+      "step": 30700
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 2.189378261566162,
+      "learning_rate": 0.00020867939698492461,
+      "loss": 1.5363,
+      "step": 30800
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 2.609349489212036,
+      "learning_rate": 0.00020837788944723615,
+      "loss": 1.5523,
+      "step": 30900
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 2.2627103328704834,
+      "learning_rate": 0.0002080763819095477,
+      "loss": 1.574,
+      "step": 31000
+    },
+    {
+      "epoch": 1.55,
+      "eval_loss": 1.5256458520889282,
+      "eval_runtime": 37.5514,
+      "eval_samples_per_second": 26.63,
+      "eval_steps_per_second": 3.329,
+      "step": 31000
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 1.9266563653945923,
+      "learning_rate": 0.0002077748743718593,
+      "loss": 1.5387,
+      "step": 31100
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 1.8447223901748657,
+      "learning_rate": 0.00020747336683417085,
+      "loss": 1.5632,
+      "step": 31200
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 1.375827670097351,
+      "learning_rate": 0.00020717185929648238,
+      "loss": 1.5494,
+      "step": 31300
+    },
+    {
+      "epoch": 1.57,
+      "grad_norm": 2.164782762527466,
+      "learning_rate": 0.00020687035175879394,
+      "loss": 1.5468,
+      "step": 31400
+    },
+    {
+      "epoch": 1.57,
+      "grad_norm": 1.186018943786621,
+      "learning_rate": 0.00020656884422110553,
+      "loss": 1.584,
+      "step": 31500
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 5.939981460571289,
+      "learning_rate": 0.00020626733668341706,
+      "loss": 1.5602,
+      "step": 31600
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 2.185728073120117,
+      "learning_rate": 0.00020596582914572862,
+      "loss": 1.5301,
+      "step": 31700
+    },
+    {
+      "epoch": 1.59,
+      "grad_norm": 1.4321199655532837,
+      "learning_rate": 0.00020566432160804018,
+      "loss": 1.5308,
+      "step": 31800
+    },
+    {
+      "epoch": 1.59,
+      "grad_norm": 1.8272013664245605,
+      "learning_rate": 0.00020536281407035177,
+      "loss": 1.5828,
+      "step": 31900
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 1.3199920654296875,
+      "learning_rate": 0.0002050613065326633,
+      "loss": 1.5704,
+      "step": 32000
+    },
+    {
+      "epoch": 1.6,
+      "eval_loss": 1.536350131034851,
+      "eval_runtime": 37.4489,
+      "eval_samples_per_second": 26.703,
+      "eval_steps_per_second": 3.338,
+      "step": 32000
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 1.796789526939392,
+      "learning_rate": 0.00020475979899497486,
+      "loss": 1.56,
+      "step": 32100
+    },
+    {
+      "epoch": 1.61,
+      "grad_norm": 1.4835096597671509,
+      "learning_rate": 0.0002044582914572864,
+      "loss": 1.5409,
+      "step": 32200
+    },
+    {
+      "epoch": 1.61,
+      "grad_norm": 1.225799798965454,
+      "learning_rate": 0.00020415678391959798,
+      "loss": 1.5109,
+      "step": 32300
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 1.5552102327346802,
+      "learning_rate": 0.00020385527638190954,
+      "loss": 1.5717,
+      "step": 32400
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 1.3638191223144531,
+      "learning_rate": 0.0002035537688442211,
+      "loss": 1.5763,
+      "step": 32500
+    },
+    {
+      "epoch": 1.63,
+      "grad_norm": 2.177093505859375,
+      "learning_rate": 0.00020325226130653263,
+      "loss": 1.5173,
+      "step": 32600
+    },
+    {
+      "epoch": 1.64,
+      "grad_norm": 3.643524169921875,
+      "learning_rate": 0.0002029507537688442,
+      "loss": 1.5596,
+      "step": 32700
+    },
+    {
+      "epoch": 1.64,
+      "grad_norm": 1.2808345556259155,
+      "learning_rate": 0.00020264924623115578,
+      "loss": 1.5812,
+      "step": 32800
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 2.262430191040039,
+      "learning_rate": 0.0002023477386934673,
+      "loss": 1.5424,
+      "step": 32900
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 1.8705729246139526,
+      "learning_rate": 0.00020204623115577887,
+      "loss": 1.5552,
+      "step": 33000
+    },
+    {
+      "epoch": 1.65,
+      "eval_loss": 1.5897144079208374,
+      "eval_runtime": 37.4808,
+      "eval_samples_per_second": 26.68,
+      "eval_steps_per_second": 3.335,
+      "step": 33000
+    },
+    {
+      "epoch": 1.66,
+      "grad_norm": 1.52475106716156,
+      "learning_rate": 0.00020174472361809043,
+      "loss": 1.5737,
+      "step": 33100
+    },
+    {
+      "epoch": 1.66,
+      "grad_norm": 1.8177305459976196,
+      "learning_rate": 0.00020144321608040201,
+      "loss": 1.5638,
+      "step": 33200
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 1.8988004922866821,
+      "learning_rate": 0.00020114170854271355,
+      "loss": 1.54,
+      "step": 33300
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 1.61077880859375,
+      "learning_rate": 0.0002008402010050251,
+      "loss": 1.5463,
+      "step": 33400
+    },
+    {
+      "epoch": 1.68,
+      "grad_norm": 1.3590441942214966,
+      "learning_rate": 0.00020053869346733667,
+      "loss": 1.5646,
+      "step": 33500
+    },
+    {
+      "epoch": 1.68,
+      "grad_norm": 1.5618160963058472,
+      "learning_rate": 0.00020023718592964822,
+      "loss": 1.5316,
+      "step": 33600
+    },
+    {
+      "epoch": 1.69,
+      "grad_norm": 1.7087860107421875,
+      "learning_rate": 0.00019993567839195978,
+      "loss": 1.5429,
+      "step": 33700
+    },
+    {
+      "epoch": 1.69,
+      "grad_norm": 2.558692693710327,
+      "learning_rate": 0.00019963417085427134,
+      "loss": 1.558,
+      "step": 33800
+    },
+    {
+      "epoch": 1.69,
+      "grad_norm": 2.3594534397125244,
+      "learning_rate": 0.00019933266331658288,
+      "loss": 1.5453,
+      "step": 33900
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 1.57821786403656,
+      "learning_rate": 0.00019903115577889446,
+      "loss": 1.5541,
+      "step": 34000
+    },
+    {
+      "epoch": 1.7,
+      "eval_loss": 1.5565224885940552,
+      "eval_runtime": 37.4415,
+      "eval_samples_per_second": 26.708,
+      "eval_steps_per_second": 3.339,
+      "step": 34000
+    },
+    {
+      "epoch": 1.71,
+      "grad_norm": 1.9368255138397217,
+      "learning_rate": 0.00019872964824120602,
+      "loss": 1.5808,
+      "step": 34100
+    },
+    {
+      "epoch": 1.71,
+      "grad_norm": 3.0767080783843994,
+      "learning_rate": 0.00019842814070351758,
+      "loss": 1.5613,
+      "step": 34200
+    },
+    {
+      "epoch": 1.71,
+      "grad_norm": 1.8630317449569702,
+      "learning_rate": 0.00019812663316582911,
+      "loss": 1.5638,
+      "step": 34300
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 3.0488193035125732,
+      "learning_rate": 0.0001978251256281407,
+      "loss": 1.5615,
+      "step": 34400
+    },
+    {
+      "epoch": 1.73,
+      "grad_norm": 3.8991503715515137,
+      "learning_rate": 0.00019752361809045226,
+      "loss": 1.4857,
+      "step": 34500
+    },
+    {
+      "epoch": 1.73,
+      "grad_norm": 2.0849859714508057,
+      "learning_rate": 0.0001972221105527638,
+      "loss": 1.535,
+      "step": 34600
+    },
+    {
+      "epoch": 1.73,
+      "grad_norm": 1.825913906097412,
+      "learning_rate": 0.00019692060301507535,
+      "loss": 1.5664,
+      "step": 34700
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 5.08195686340332,
+      "learning_rate": 0.0001966190954773869,
+      "loss": 1.5252,
+      "step": 34800
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 1.785659909248352,
+      "learning_rate": 0.0001963175879396985,
+      "loss": 1.5391,
+      "step": 34900
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 1.4543670415878296,
+      "learning_rate": 0.00019601909547738692,
+      "loss": 1.5484,
+      "step": 35000
+    },
+    {
+      "epoch": 1.75,
+      "eval_loss": 1.573486566543579,
+      "eval_runtime": 37.5549,
+      "eval_samples_per_second": 26.628,
+      "eval_steps_per_second": 3.328,
+      "step": 35000
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 1.6768901348114014,
+      "learning_rate": 0.00019571758793969848,
+      "loss": 1.5479,
+      "step": 35100
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 1.9926207065582275,
+      "learning_rate": 0.00019541608040201004,
+      "loss": 1.5712,
+      "step": 35200
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 1.9285818338394165,
+      "learning_rate": 0.00019511457286432157,
+      "loss": 1.567,
+      "step": 35300
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 4.1938252449035645,
+      "learning_rate": 0.00019481306532663313,
+      "loss": 1.5538,
+      "step": 35400
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 3.3867177963256836,
+      "learning_rate": 0.00019451155778894472,
+      "loss": 1.532,
+      "step": 35500
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 2.6039962768554688,
+      "learning_rate": 0.00019421005025125625,
+      "loss": 1.5914,
+      "step": 35600
+    },
+    {
+      "epoch": 1.79,
+      "grad_norm": 1.900150179862976,
+      "learning_rate": 0.0001939085427135678,
+      "loss": 1.5435,
+      "step": 35700
+    },
+    {
+      "epoch": 1.79,
+      "grad_norm": 6.110165596008301,
+      "learning_rate": 0.00019360703517587937,
+      "loss": 1.5532,
+      "step": 35800
+    },
+    {
+      "epoch": 1.79,
+      "grad_norm": 2.2953763008117676,
+      "learning_rate": 0.00019330552763819096,
+      "loss": 1.5573,
+      "step": 35900
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 1.9452694654464722,
+      "learning_rate": 0.0001930040201005025,
+      "loss": 1.5602,
+      "step": 36000
+    },
+    {
+      "epoch": 1.8,
+      "eval_loss": 1.5185086727142334,
+      "eval_runtime": 37.4604,
+      "eval_samples_per_second": 26.695,
+      "eval_steps_per_second": 3.337,
+      "step": 36000
+    },
+    {
+      "epoch": 1.81,
+      "grad_norm": 1.729576587677002,
+      "learning_rate": 0.00019270251256281405,
+      "loss": 1.5778,
+      "step": 36100
+    },
+    {
+      "epoch": 1.81,
+      "grad_norm": 2.98600172996521,
+      "learning_rate": 0.0001924010050251256,
+      "loss": 1.5318,
+      "step": 36200
+    },
+    {
+      "epoch": 1.81,
+      "grad_norm": 2.2745935916900635,
+      "learning_rate": 0.00019209949748743717,
+      "loss": 1.5475,
+      "step": 36300
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 1.7677953243255615,
+      "learning_rate": 0.00019179798994974873,
+      "loss": 1.5174,
+      "step": 36400
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 3.6853749752044678,
+      "learning_rate": 0.0001914964824120603,
+      "loss": 1.5709,
+      "step": 36500
+    },
+    {
+      "epoch": 1.83,
+      "grad_norm": 2.8981404304504395,
+      "learning_rate": 0.00019119497487437182,
+      "loss": 1.5519,
+      "step": 36600
+    },
+    {
+      "epoch": 1.83,
+      "grad_norm": 1.9862598180770874,
+      "learning_rate": 0.0001908934673366834,
+      "loss": 1.5547,
+      "step": 36700
+    },
+    {
+      "epoch": 1.84,
+      "grad_norm": 2.030618667602539,
+      "learning_rate": 0.00019059195979899497,
+      "loss": 1.5388,
+      "step": 36800
+    },
+    {
+      "epoch": 1.84,
+      "grad_norm": 1.609573483467102,
+      "learning_rate": 0.00019029045226130653,
+      "loss": 1.5727,
+      "step": 36900
+    },
+    {
+      "epoch": 1.85,
+      "grad_norm": 2.0508673191070557,
+      "learning_rate": 0.00018998894472361806,
+      "loss": 1.5522,
+      "step": 37000
+    },
+    {
+      "epoch": 1.85,
+      "eval_loss": 1.4889146089553833,
+      "eval_runtime": 37.4776,
+      "eval_samples_per_second": 26.683,
+      "eval_steps_per_second": 3.335,
+      "step": 37000
+    },
+    {
+      "epoch": 1.85,
+      "grad_norm": 1.5507237911224365,
+      "learning_rate": 0.00018968743718592964,
+      "loss": 1.5196,
+      "step": 37100
+    },
+    {
+      "epoch": 1.86,
+      "grad_norm": 2.1241345405578613,
+      "learning_rate": 0.0001893859296482412,
+      "loss": 1.553,
+      "step": 37200
+    },
+    {
+      "epoch": 1.86,
+      "grad_norm": 1.6903210878372192,
+      "learning_rate": 0.00018908442211055274,
+      "loss": 1.5292,
+      "step": 37300
+    },
+    {
+      "epoch": 1.87,
+      "grad_norm": 1.684809923171997,
+      "learning_rate": 0.0001887829145728643,
+      "loss": 1.5597,
+      "step": 37400
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 2.072158098220825,
+      "learning_rate": 0.00018848442211055275,
+      "loss": 1.4914,
+      "step": 37500
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 2.545748472213745,
+      "learning_rate": 0.0001881829145728643,
+      "loss": 1.54,
+      "step": 37600
+    },
+    {
+      "epoch": 1.89,
+      "grad_norm": 2.4186930656433105,
+      "learning_rate": 0.00018788140703517587,
+      "loss": 1.5298,
+      "step": 37700
+    },
+    {
+      "epoch": 1.89,
+      "grad_norm": 1.8362512588500977,
+      "learning_rate": 0.00018757989949748743,
+      "loss": 1.5372,
+      "step": 37800
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 2.376615047454834,
+      "learning_rate": 0.000187278391959799,
+      "loss": 1.5343,
+      "step": 37900
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 2.72920823097229,
+      "learning_rate": 0.00018697688442211052,
+      "loss": 1.4955,
+      "step": 38000
+    },
+    {
+      "epoch": 1.9,
+      "eval_loss": 1.5501998662948608,
+      "eval_runtime": 37.4829,
+      "eval_samples_per_second": 26.679,
+      "eval_steps_per_second": 3.335,
+      "step": 38000
+    },
+    {
+      "epoch": 1.91,
+      "grad_norm": 2.2700889110565186,
+      "learning_rate": 0.00018667537688442208,
+      "loss": 1.5149,
+      "step": 38100
+    },
+    {
+      "epoch": 1.91,
+      "grad_norm": 2.9030048847198486,
+      "learning_rate": 0.00018637386934673367,
+      "loss": 1.5451,
+      "step": 38200
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 3.4646873474121094,
+      "learning_rate": 0.00018607236180904522,
+      "loss": 1.5477,
+      "step": 38300
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 2.5595543384552,
+      "learning_rate": 0.00018577085427135676,
+      "loss": 1.5013,
+      "step": 38400
+    },
+    {
+      "epoch": 1.93,
+      "grad_norm": 1.3651905059814453,
+      "learning_rate": 0.00018546934673366832,
+      "loss": 1.5199,
+      "step": 38500
+    },
+    {
+      "epoch": 1.93,
+      "grad_norm": 2.045830011367798,
+      "learning_rate": 0.0001851678391959799,
+      "loss": 1.5677,
+      "step": 38600
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 1.8075040578842163,
+      "learning_rate": 0.00018486633165829144,
+      "loss": 1.5564,
+      "step": 38700
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 1.4847893714904785,
+      "learning_rate": 0.000184564824120603,
+      "loss": 1.5178,
+      "step": 38800
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 3.2269062995910645,
+      "learning_rate": 0.00018426331658291455,
+      "loss": 1.513,
+      "step": 38900
+    },
+    {
+      "epoch": 1.95,
+      "grad_norm": 1.8105851411819458,
+      "learning_rate": 0.00018396180904522614,
+      "loss": 1.5007,
+      "step": 39000
+    },
+    {
+      "epoch": 1.95,
+      "eval_loss": 1.5438071489334106,
+      "eval_runtime": 37.4427,
+      "eval_samples_per_second": 26.707,
+      "eval_steps_per_second": 3.338,
+      "step": 39000
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 1.8713701963424683,
+      "learning_rate": 0.00018366030150753767,
+      "loss": 1.5266,
+      "step": 39100
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 1.4177159070968628,
+      "learning_rate": 0.00018335879396984923,
+      "loss": 1.5345,
+      "step": 39200
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 2.068962574005127,
+      "learning_rate": 0.00018305728643216076,
+      "loss": 1.5136,
+      "step": 39300
+    },
+    {
+      "epoch": 1.97,
+      "grad_norm": 2.0991291999816895,
+      "learning_rate": 0.00018275577889447235,
+      "loss": 1.5509,
+      "step": 39400
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 2.322105884552002,
+      "learning_rate": 0.0001824542713567839,
+      "loss": 1.5222,
+      "step": 39500
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 2.1995983123779297,
+      "learning_rate": 0.00018215276381909547,
+      "loss": 1.544,
+      "step": 39600
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 1.4002470970153809,
+      "learning_rate": 0.000181851256281407,
+      "loss": 1.5184,
+      "step": 39700
+    },
+    {
+      "epoch": 1.99,
+      "grad_norm": 2.0827932357788086,
+      "learning_rate": 0.0001815497487437186,
+      "loss": 1.5388,
+      "step": 39800
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 3.9671342372894287,
+      "learning_rate": 0.00018124824120603015,
+      "loss": 1.5331,
+      "step": 39900
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 2.979598045349121,
+      "learning_rate": 0.00018094673366834168,
+      "loss": 1.5058,
+      "step": 40000
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.5161113739013672,
+      "eval_runtime": 37.4278,
+      "eval_samples_per_second": 26.718,
+      "eval_steps_per_second": 3.34,
+      "step": 40000
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 2.0998573303222656,
+      "learning_rate": 0.00018064824120603013,
+      "loss": 1.4831,
+      "step": 40100
+    },
+    {
+      "epoch": 2.01,
+      "grad_norm": 2.1450562477111816,
+      "learning_rate": 0.0001803467336683417,
+      "loss": 1.5567,
+      "step": 40200
+    },
+    {
+      "epoch": 2.02,
+      "grad_norm": 2.080376148223877,
+      "learning_rate": 0.00018004522613065325,
+      "loss": 1.5088,
+      "step": 40300
+    },
+    {
+      "epoch": 2.02,
+      "grad_norm": 1.682288408279419,
+      "learning_rate": 0.0001797437185929648,
+      "loss": 1.5461,
+      "step": 40400
+    },
+    {
+      "epoch": 2.02,
+      "grad_norm": 1.6313551664352417,
+      "learning_rate": 0.00017944221105527637,
+      "loss": 1.4776,
+      "step": 40500
+    },
+    {
+      "epoch": 2.03,
+      "grad_norm": 3.1746320724487305,
+      "learning_rate": 0.00017914070351758793,
+      "loss": 1.534,
+      "step": 40600
+    },
+    {
+      "epoch": 2.04,
+      "grad_norm": 1.7039170265197754,
+      "learning_rate": 0.00017883919597989946,
+      "loss": 1.565,
+      "step": 40700
+    },
+    {
+      "epoch": 2.04,
+      "grad_norm": 1.738771677017212,
+      "learning_rate": 0.00017853768844221102,
+      "loss": 1.5391,
+      "step": 40800
+    },
+    {
+      "epoch": 2.04,
+      "grad_norm": 4.0762715339660645,
+      "learning_rate": 0.0001782361809045226,
+      "loss": 1.5153,
+      "step": 40900
+    },
+    {
+      "epoch": 2.05,
+      "grad_norm": 3.6206607818603516,
+      "learning_rate": 0.00017793467336683417,
+      "loss": 1.5254,
+      "step": 41000
+    },
+    {
+      "epoch": 2.05,
+      "eval_loss": 1.5652326345443726,
+      "eval_runtime": 37.621,
+      "eval_samples_per_second": 26.581,
+      "eval_steps_per_second": 3.323,
+      "step": 41000
+    },
+    {
+      "epoch": 2.06,
+      "grad_norm": 1.2320189476013184,
+      "learning_rate": 0.0001776331658291457,
+      "loss": 1.5153,
+      "step": 41100
+    },
+    {
+      "epoch": 2.06,
+      "grad_norm": 2.275785446166992,
+      "learning_rate": 0.00017733165829145726,
+      "loss": 1.5192,
+      "step": 41200
+    },
+    {
+      "epoch": 2.06,
+      "grad_norm": 1.8659756183624268,
+      "learning_rate": 0.00017703015075376885,
+      "loss": 1.5563,
+      "step": 41300
+    },
+    {
+      "epoch": 2.07,
+      "grad_norm": 1.6176475286483765,
+      "learning_rate": 0.00017672864321608038,
+      "loss": 1.5251,
+      "step": 41400
+    },
+    {
+      "epoch": 2.08,
+      "grad_norm": 1.575243592262268,
+      "learning_rate": 0.00017642713567839194,
+      "loss": 1.5178,
+      "step": 41500
+    },
+    {
+      "epoch": 2.08,
+      "grad_norm": 1.6838304996490479,
+      "learning_rate": 0.0001761256281407035,
+      "loss": 1.5255,
+      "step": 41600
+    },
+    {
+      "epoch": 2.08,
+      "grad_norm": 5.561213493347168,
+      "learning_rate": 0.00017582412060301509,
+      "loss": 1.4885,
+      "step": 41700
+    },
+    {
+      "epoch": 2.09,
+      "grad_norm": 2.0810487270355225,
+      "learning_rate": 0.00017552261306532662,
+      "loss": 1.491,
+      "step": 41800
+    },
+    {
+      "epoch": 2.1,
+      "grad_norm": 1.894136667251587,
+      "learning_rate": 0.00017522110552763818,
+      "loss": 1.5072,
+      "step": 41900
+    },
+    {
+      "epoch": 2.1,
+      "grad_norm": 4.531317710876465,
+      "learning_rate": 0.00017492261306532663,
+      "loss": 1.5108,
+      "step": 42000
+    },
+    {
+      "epoch": 2.1,
+      "eval_loss": 1.5298963785171509,
+      "eval_runtime": 37.5318,
+      "eval_samples_per_second": 26.644,
+      "eval_steps_per_second": 3.331,
+      "step": 42000
+    },
+    {
+      "epoch": 2.1,
+      "grad_norm": 1.9595602750778198,
+      "learning_rate": 0.00017462110552763816,
+      "loss": 1.5178,
+      "step": 42100
+    },
+    {
+      "epoch": 2.11,
+      "grad_norm": 2.1969220638275146,
+      "learning_rate": 0.00017431959798994972,
+      "loss": 1.508,
+      "step": 42200
+    },
+    {
+      "epoch": 2.12,
+      "grad_norm": 3.0334370136260986,
+      "learning_rate": 0.0001740180904522613,
+      "loss": 1.5255,
+      "step": 42300
+    },
+    {
+      "epoch": 2.12,
+      "grad_norm": 1.8148131370544434,
+      "learning_rate": 0.00017371658291457287,
+      "loss": 1.4686,
+      "step": 42400
+    },
+    {
+      "epoch": 2.12,
+      "grad_norm": 5.726377964019775,
+      "learning_rate": 0.0001734150753768844,
+      "loss": 1.5355,
+      "step": 42500
+    },
+    {
+      "epoch": 2.13,
+      "grad_norm": 3.15856671333313,
+      "learning_rate": 0.00017311356783919596,
+      "loss": 1.534,
+      "step": 42600
+    },
+    {
+      "epoch": 2.13,
+      "grad_norm": 6.238559246063232,
+      "learning_rate": 0.00017281206030150755,
+      "loss": 1.5073,
+      "step": 42700
+    },
+    {
+      "epoch": 2.14,
+      "grad_norm": 2.281912326812744,
+      "learning_rate": 0.00017251055276381908,
+      "loss": 1.5341,
+      "step": 42800
+    },
+    {
+      "epoch": 2.15,
+      "grad_norm": 3.874361991882324,
+      "learning_rate": 0.00017220904522613064,
+      "loss": 1.5204,
+      "step": 42900
+    },
+    {
+      "epoch": 2.15,
+      "grad_norm": 2.5632574558258057,
+      "learning_rate": 0.0001719075376884422,
+      "loss": 1.5028,
+      "step": 43000
+    },
+    {
+      "epoch": 2.15,
+      "eval_loss": 1.5213427543640137,
+      "eval_runtime": 37.5103,
+      "eval_samples_per_second": 26.659,
+      "eval_steps_per_second": 3.332,
+      "step": 43000
+    },
+    {
+      "epoch": 2.15,
+      "grad_norm": 4.4505133628845215,
+      "learning_rate": 0.00017160603015075373,
+      "loss": 1.5318,
+      "step": 43100
+    },
+    {
+      "epoch": 2.16,
+      "grad_norm": 1.6099460124969482,
+      "learning_rate": 0.00017130452261306532,
+      "loss": 1.5093,
+      "step": 43200
+    },
+    {
+      "epoch": 2.17,
+      "grad_norm": 4.269103527069092,
+      "learning_rate": 0.00017100301507537688,
+      "loss": 1.4839,
+      "step": 43300
+    },
+    {
+      "epoch": 2.17,
+      "grad_norm": 10.070590019226074,
+      "learning_rate": 0.0001707015075376884,
+      "loss": 1.5407,
+      "step": 43400
+    },
+    {
+      "epoch": 2.17,
+      "grad_norm": 17.754796981811523,
+      "learning_rate": 0.00017039999999999997,
+      "loss": 1.5102,
+      "step": 43500
+    },
+    {
+      "epoch": 2.18,
+      "grad_norm": 2.8812096118927,
+      "learning_rate": 0.00017009849246231155,
+      "loss": 1.5227,
+      "step": 43600
+    },
+    {
+      "epoch": 2.19,
+      "grad_norm": 2.861893653869629,
+      "learning_rate": 0.00016979698492462311,
+      "loss": 1.544,
+      "step": 43700
+    },
+    {
+      "epoch": 2.19,
+      "grad_norm": 1.9935343265533447,
+      "learning_rate": 0.00016949547738693465,
+      "loss": 1.5209,
+      "step": 43800
+    },
+    {
+      "epoch": 2.19,
+      "grad_norm": 2.8261117935180664,
+      "learning_rate": 0.0001691939698492462,
+      "loss": 1.5404,
+      "step": 43900
+    },
+    {
+      "epoch": 2.2,
+      "grad_norm": 2.6541242599487305,
+      "learning_rate": 0.0001688924623115578,
+      "loss": 1.5048,
+      "step": 44000
+    },
+    {
+      "epoch": 2.2,
+      "eval_loss": 1.5309374332427979,
+      "eval_runtime": 65.7795,
+      "eval_samples_per_second": 15.202,
+      "eval_steps_per_second": 1.9,
+      "step": 44000
+    },
+    {
+      "epoch": 2.21,
+      "grad_norm": 1.1707357168197632,
+      "learning_rate": 0.00016859095477386932,
+      "loss": 1.5479,
+      "step": 44100
+    },
+    {
+      "epoch": 2.21,
+      "grad_norm": 2.058530807495117,
+      "learning_rate": 0.00016828944723618088,
+      "loss": 1.5423,
+      "step": 44200
+    },
+    {
+      "epoch": 2.21,
+      "grad_norm": 2.841702699661255,
+      "learning_rate": 0.00016798793969849244,
+      "loss": 1.5196,
+      "step": 44300
+    },
+    {
+      "epoch": 2.22,
+      "grad_norm": 1.6679179668426514,
+      "learning_rate": 0.00016768643216080403,
+      "loss": 1.5085,
+      "step": 44400
+    },
+    {
+      "epoch": 2.23,
+      "grad_norm": 2.1461362838745117,
+      "learning_rate": 0.00016738492462311556,
+      "loss": 1.538,
+      "step": 44500
+    },
+    {
+      "epoch": 2.23,
+      "grad_norm": 2.351161241531372,
+      "learning_rate": 0.00016708341708542712,
+      "loss": 1.4874,
+      "step": 44600
+    },
+    {
+      "epoch": 2.23,
+      "grad_norm": 2.474755048751831,
+      "learning_rate": 0.00016678190954773868,
+      "loss": 1.5646,
+      "step": 44700
+    },
+    {
+      "epoch": 2.24,
+      "grad_norm": 2.6611087322235107,
+      "learning_rate": 0.00016648040201005024,
+      "loss": 1.5373,
+      "step": 44800
+    },
+    {
+      "epoch": 2.25,
+      "grad_norm": 2.518184185028076,
+      "learning_rate": 0.0001661788944723618,
+      "loss": 1.4969,
+      "step": 44900
+    },
+    {
+      "epoch": 2.25,
+      "grad_norm": 4.304960250854492,
+      "learning_rate": 0.00016587738693467336,
+      "loss": 1.4827,
+      "step": 45000
+    },
+    {
+      "epoch": 2.25,
+      "eval_loss": 1.5653032064437866,
+      "eval_runtime": 39.6553,
+      "eval_samples_per_second": 25.217,
+      "eval_steps_per_second": 3.152,
+      "step": 45000
+    },
+    {
+      "epoch": 2.25,
+      "grad_norm": 1.1882766485214233,
+      "learning_rate": 0.0001655758793969849,
+      "loss": 1.5062,
+      "step": 45100
+    },
+    {
+      "epoch": 2.26,
+      "grad_norm": 2.196139335632324,
+      "learning_rate": 0.00016527437185929648,
+      "loss": 1.5026,
+      "step": 45200
+    },
+    {
+      "epoch": 2.27,
+      "grad_norm": 1.8797615766525269,
+      "learning_rate": 0.00016497286432160804,
+      "loss": 1.4947,
+      "step": 45300
+    },
+    {
+      "epoch": 2.27,
+      "grad_norm": 1.396849274635315,
+      "learning_rate": 0.00016467135678391957,
+      "loss": 1.5002,
+      "step": 45400
+    },
+    {
+      "epoch": 2.27,
+      "grad_norm": 2.432685375213623,
+      "learning_rate": 0.00016436984924623113,
+      "loss": 1.5021,
+      "step": 45500
+    },
+    {
+      "epoch": 2.28,
+      "grad_norm": 2.2086715698242188,
+      "learning_rate": 0.0001640683417085427,
+      "loss": 1.5067,
+      "step": 45600
+    },
+    {
+      "epoch": 2.29,
+      "grad_norm": 4.262127876281738,
+      "learning_rate": 0.00016376683417085428,
+      "loss": 1.519,
+      "step": 45700
+    },
+    {
+      "epoch": 2.29,
+      "grad_norm": 2.92459774017334,
+      "learning_rate": 0.0001634653266331658,
+      "loss": 1.5138,
+      "step": 45800
+    },
+    {
+      "epoch": 2.29,
+      "grad_norm": 1.3408390283584595,
+      "learning_rate": 0.00016316381909547737,
+      "loss": 1.5296,
+      "step": 45900
+    },
+    {
+      "epoch": 2.3,
+      "grad_norm": 1.9604805707931519,
+      "learning_rate": 0.00016286231155778893,
+      "loss": 1.5207,
+      "step": 46000
+    },
+    {
+      "epoch": 2.3,
+      "eval_loss": 1.5254641771316528,
+      "eval_runtime": 42.9693,
+      "eval_samples_per_second": 23.272,
+      "eval_steps_per_second": 2.909,
+      "step": 46000
+    },
+    {
+      "epoch": 2.31,
+      "grad_norm": 2.5174312591552734,
+      "learning_rate": 0.00016256080402010049,
+      "loss": 1.5487,
+      "step": 46100
+    },
+    {
+      "epoch": 2.31,
+      "grad_norm": 1.4481089115142822,
+      "learning_rate": 0.00016225929648241205,
+      "loss": 1.5167,
+      "step": 46200
+    },
+    {
+      "epoch": 2.31,
+      "grad_norm": 1.5994378328323364,
+      "learning_rate": 0.0001619577889447236,
+      "loss": 1.4549,
+      "step": 46300
+    },
+    {
+      "epoch": 2.32,
+      "grad_norm": 2.4599764347076416,
+      "learning_rate": 0.00016165628140703514,
+      "loss": 1.5128,
+      "step": 46400
+    },
+    {
+      "epoch": 2.33,
+      "grad_norm": 1.8390551805496216,
+      "learning_rate": 0.00016135477386934672,
+      "loss": 1.5248,
+      "step": 46500
+    },
+    {
+      "epoch": 2.33,
+      "grad_norm": 2.679804801940918,
+      "learning_rate": 0.00016105326633165828,
+      "loss": 1.5105,
+      "step": 46600
+    },
+    {
+      "epoch": 2.33,
+      "grad_norm": 1.7645354270935059,
+      "learning_rate": 0.00016075477386934674,
+      "loss": 1.4787,
+      "step": 46700
+    },
+    {
+      "epoch": 2.34,
+      "grad_norm": 1.035243034362793,
+      "learning_rate": 0.00016045326633165827,
+      "loss": 1.5257,
+      "step": 46800
+    },
+    {
+      "epoch": 2.34,
+      "grad_norm": 11.433294296264648,
+      "learning_rate": 0.00016015175879396983,
+      "loss": 1.4615,
+      "step": 46900
+    },
+    {
+      "epoch": 2.35,
+      "grad_norm": 1.600629448890686,
+      "learning_rate": 0.0001598502512562814,
+      "loss": 1.4973,
+      "step": 47000
+    },
+    {
+      "epoch": 2.35,
+      "eval_loss": 1.5015385150909424,
+      "eval_runtime": 41.9852,
+      "eval_samples_per_second": 23.818,
+      "eval_steps_per_second": 2.977,
+      "step": 47000
+    },
+    {
+      "epoch": 2.35,
+      "grad_norm": 2.1351780891418457,
+      "learning_rate": 0.00015954874371859297,
+      "loss": 1.5163,
+      "step": 47100
+    },
+    {
+      "epoch": 2.36,
+      "grad_norm": 2.6234447956085205,
+      "learning_rate": 0.0001592472361809045,
+      "loss": 1.5147,
+      "step": 47200
+    },
+    {
+      "epoch": 2.37,
+      "grad_norm": 2.0315120220184326,
+      "learning_rate": 0.00015894874371859296,
+      "loss": 1.5044,
+      "step": 47300
+    },
+    {
+      "epoch": 2.37,
+      "grad_norm": 2.7582693099975586,
+      "learning_rate": 0.00015864723618090452,
+      "loss": 1.5033,
+      "step": 47400
+    },
+    {
+      "epoch": 2.38,
+      "grad_norm": 2.5383968353271484,
+      "learning_rate": 0.00015834572864321605,
+      "loss": 1.5198,
+      "step": 47500
+    },
+    {
+      "epoch": 2.38,
+      "grad_norm": 3.1361851692199707,
+      "learning_rate": 0.0001580442211055276,
+      "loss": 1.5006,
+      "step": 47600
+    },
+    {
+      "epoch": 2.38,
+      "grad_norm": 1.669693946838379,
+      "learning_rate": 0.0001577427135678392,
+      "loss": 1.5203,
+      "step": 47700
+    },
+    {
+      "epoch": 2.39,
+      "grad_norm": 3.662080764770508,
+      "learning_rate": 0.00015744120603015076,
+      "loss": 1.5427,
+      "step": 47800
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": 2.101423740386963,
+      "learning_rate": 0.0001571396984924623,
+      "loss": 1.489,
+      "step": 47900
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": 8.889337539672852,
+      "learning_rate": 0.00015684120603015074,
+      "loss": 1.5198,
+      "step": 48000
+    },
+    {
+      "epoch": 2.4,
+      "eval_loss": 1.5436657667160034,
+      "eval_runtime": 42.227,
+      "eval_samples_per_second": 23.682,
+      "eval_steps_per_second": 2.96,
+      "step": 48000
+    },
+    {
+      "epoch": 2.41,
+      "grad_norm": 9.11814022064209,
+      "learning_rate": 0.0001565396984924623,
+      "loss": 1.5623,
+      "step": 48100
+    },
+    {
+      "epoch": 2.41,
+      "grad_norm": 11.104165077209473,
+      "learning_rate": 0.00015623819095477383,
+      "loss": 1.5019,
+      "step": 48200
+    },
+    {
+      "epoch": 2.42,
+      "grad_norm": 4.448848724365234,
+      "learning_rate": 0.00015593668341708542,
+      "loss": 1.5464,
+      "step": 48300
+    },
+    {
+      "epoch": 2.42,
+      "grad_norm": 2.207139253616333,
+      "learning_rate": 0.00015563517587939698,
+      "loss": 1.5105,
+      "step": 48400
+    },
+    {
+      "epoch": 2.42,
+      "grad_norm": 2.692350149154663,
+      "learning_rate": 0.00015533366834170854,
+      "loss": 1.5351,
+      "step": 48500
+    },
+    {
+      "epoch": 2.43,
+      "grad_norm": 1.8738429546356201,
+      "learning_rate": 0.00015503216080402007,
+      "loss": 1.4882,
+      "step": 48600
+    },
+    {
+      "epoch": 2.44,
+      "grad_norm": 2.8214309215545654,
+      "learning_rate": 0.00015473065326633166,
+      "loss": 1.4932,
+      "step": 48700
+    },
+    {
+      "epoch": 2.44,
+      "grad_norm": 1.4316879510879517,
+      "learning_rate": 0.00015442914572864322,
+      "loss": 1.498,
+      "step": 48800
+    },
+    {
+      "epoch": 2.44,
+      "grad_norm": 1.438391923904419,
+      "learning_rate": 0.00015412763819095475,
+      "loss": 1.4975,
+      "step": 48900
+    },
+    {
+      "epoch": 2.45,
+      "grad_norm": 2.7039265632629395,
+      "learning_rate": 0.0001538261306532663,
+      "loss": 1.511,
+      "step": 49000
+    },
+    {
+      "epoch": 2.45,
+      "eval_loss": 1.5066561698913574,
+      "eval_runtime": 42.052,
+      "eval_samples_per_second": 23.78,
+      "eval_steps_per_second": 2.973,
+      "step": 49000
+    },
+    {
+      "epoch": 2.46,
+      "grad_norm": 2.8025197982788086,
+      "learning_rate": 0.00015352462311557787,
+      "loss": 1.5082,
+      "step": 49100
+    },
+    {
+      "epoch": 2.46,
+      "grad_norm": 4.876307964324951,
+      "learning_rate": 0.00015322311557788946,
+      "loss": 1.4825,
+      "step": 49200
+    },
+    {
+      "epoch": 2.46,
+      "grad_norm": 4.34946346282959,
+      "learning_rate": 0.000152921608040201,
+      "loss": 1.5117,
+      "step": 49300
+    },
+    {
+      "epoch": 2.47,
+      "grad_norm": 2.5481321811676025,
+      "learning_rate": 0.00015262010050251255,
+      "loss": 1.5361,
+      "step": 49400
+    },
+    {
+      "epoch": 2.48,
+      "grad_norm": 1.5703433752059937,
+      "learning_rate": 0.00015231859296482408,
+      "loss": 1.5113,
+      "step": 49500
+    },
+    {
+      "epoch": 2.48,
+      "grad_norm": 1.8963671922683716,
+      "learning_rate": 0.00015201708542713567,
+      "loss": 1.5095,
+      "step": 49600
+    },
+    {
+      "epoch": 2.48,
+      "grad_norm": 2.296262264251709,
+      "learning_rate": 0.00015171557788944723,
+      "loss": 1.5479,
+      "step": 49700
+    },
+    {
+      "epoch": 2.49,
+      "grad_norm": 2.796741247177124,
+      "learning_rate": 0.00015141407035175879,
+      "loss": 1.4946,
+      "step": 49800
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 2.633420705795288,
+      "learning_rate": 0.00015111256281407032,
+      "loss": 1.4917,
+      "step": 49900
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 3.0930418968200684,
+      "learning_rate": 0.0001508110552763819,
+      "loss": 1.5058,
+      "step": 50000
+    },
+    {
+      "epoch": 2.5,
+      "eval_loss": 1.539115071296692,
+      "eval_runtime": 41.8844,
+      "eval_samples_per_second": 23.875,
+      "eval_steps_per_second": 2.984,
+      "step": 50000
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 2.2942283153533936,
+      "learning_rate": 0.00015050954773869346,
+      "loss": 1.4981,
+      "step": 50100
+    },
+    {
+      "epoch": 2.51,
+      "grad_norm": 2.622481346130371,
+      "learning_rate": 0.000150208040201005,
+      "loss": 1.4829,
+      "step": 50200
+    },
+    {
+      "epoch": 2.52,
+      "grad_norm": 2.5901362895965576,
+      "learning_rate": 0.00014990653266331658,
+      "loss": 1.495,
+      "step": 50300
+    },
+    {
+      "epoch": 2.52,
+      "grad_norm": 3.8583192825317383,
+      "learning_rate": 0.00014960502512562812,
+      "loss": 1.494,
+      "step": 50400
+    },
+    {
+      "epoch": 2.52,
+      "grad_norm": 2.841306209564209,
+      "learning_rate": 0.00014930653266331657,
+      "loss": 1.4806,
+      "step": 50500
+    },
+    {
+      "epoch": 2.53,
+      "grad_norm": 2.646027088165283,
+      "learning_rate": 0.00014900502512562813,
+      "loss": 1.4907,
+      "step": 50600
+    },
+    {
+      "epoch": 2.54,
+      "grad_norm": 2.384547472000122,
+      "learning_rate": 0.0001487035175879397,
+      "loss": 1.4809,
+      "step": 50700
+    },
+    {
+      "epoch": 2.54,
+      "grad_norm": 2.698951005935669,
+      "learning_rate": 0.00014840201005025125,
+      "loss": 1.5145,
+      "step": 50800
+    },
+    {
+      "epoch": 2.54,
+      "grad_norm": 4.710977077484131,
+      "learning_rate": 0.0001481035175879397,
+      "loss": 1.4805,
+      "step": 50900
+    },
+    {
+      "epoch": 2.55,
+      "grad_norm": 5.228128433227539,
+      "learning_rate": 0.00014780201005025123,
+      "loss": 1.512,
+      "step": 51000
+    },
+    {
+      "epoch": 2.55,
+      "eval_loss": 1.5644181966781616,
+      "eval_runtime": 37.1723,
+      "eval_samples_per_second": 26.902,
+      "eval_steps_per_second": 3.363,
+      "step": 51000
+    },
+    {
+      "epoch": 2.56,
+      "grad_norm": 5.459705829620361,
+      "learning_rate": 0.00014750050251256282,
+      "loss": 1.4861,
+      "step": 51100
+    },
+    {
+      "epoch": 2.56,
+      "grad_norm": 1.9696797132492065,
+      "learning_rate": 0.00014719899497487435,
+      "loss": 1.5177,
+      "step": 51200
+    },
+    {
+      "epoch": 2.56,
+      "grad_norm": 2.262742757797241,
+      "learning_rate": 0.0001468974874371859,
+      "loss": 1.5074,
+      "step": 51300
+    },
+    {
+      "epoch": 2.57,
+      "grad_norm": 3.0286054611206055,
+      "learning_rate": 0.00014659597989949747,
+      "loss": 1.5233,
+      "step": 51400
+    },
+    {
+      "epoch": 2.58,
+      "grad_norm": 2.805699110031128,
+      "learning_rate": 0.00014629447236180903,
+      "loss": 1.5002,
+      "step": 51500
+    },
+    {
+      "epoch": 2.58,
+      "grad_norm": 7.009899616241455,
+      "learning_rate": 0.0001459929648241206,
+      "loss": 1.5224,
+      "step": 51600
+    },
+    {
+      "epoch": 2.58,
+      "grad_norm": 2.203697443008423,
+      "learning_rate": 0.00014569145728643215,
+      "loss": 1.5264,
+      "step": 51700
+    },
+    {
+      "epoch": 2.59,
+      "grad_norm": 38.17327117919922,
+      "learning_rate": 0.0001453899497487437,
+      "loss": 1.5233,
+      "step": 51800
+    },
+    {
+      "epoch": 2.59,
+      "grad_norm": 3.463594436645508,
+      "learning_rate": 0.00014508844221105527,
+      "loss": 1.5476,
+      "step": 51900
+    },
+    {
+      "epoch": 2.6,
+      "grad_norm": 4.695711135864258,
+      "learning_rate": 0.00014478693467336683,
+      "loss": 1.5092,
+      "step": 52000
+    },
+    {
+      "epoch": 2.6,
+      "eval_loss": 1.524036169052124,
+      "eval_runtime": 41.6949,
+      "eval_samples_per_second": 23.984,
+      "eval_steps_per_second": 2.998,
+      "step": 52000
+    },
+    {
+      "epoch": 2.6,
+      "grad_norm": 3.0102827548980713,
+      "learning_rate": 0.00014448542713567836,
+      "loss": 1.4978,
+      "step": 52100
+    },
+    {
+      "epoch": 2.61,
+      "grad_norm": 4.087903022766113,
+      "learning_rate": 0.00014418391959798995,
+      "loss": 1.4772,
+      "step": 52200
+    },
+    {
+      "epoch": 2.62,
+      "grad_norm": 2.3847885131835938,
+      "learning_rate": 0.00014388241206030148,
+      "loss": 1.5173,
+      "step": 52300
+    },
+    {
+      "epoch": 2.62,
+      "grad_norm": 2.907341241836548,
+      "learning_rate": 0.00014358090452261306,
+      "loss": 1.5287,
+      "step": 52400
+    },
+    {
+      "epoch": 2.62,
+      "grad_norm": 3.0708484649658203,
+      "learning_rate": 0.0001432793969849246,
+      "loss": 1.524,
+      "step": 52500
+    },
+    {
+      "epoch": 2.63,
+      "grad_norm": 2.70914888381958,
+      "learning_rate": 0.00014297788944723618,
+      "loss": 1.5115,
+      "step": 52600
+    },
+    {
+      "epoch": 2.63,
+      "grad_norm": 3.0721583366394043,
+      "learning_rate": 0.00014267638190954772,
+      "loss": 1.5273,
+      "step": 52700
+    },
+    {
+      "epoch": 2.64,
+      "grad_norm": 4.067818641662598,
+      "learning_rate": 0.00014237487437185927,
+      "loss": 1.5237,
+      "step": 52800
+    },
+    {
+      "epoch": 2.65,
+      "grad_norm": 22.84881591796875,
+      "learning_rate": 0.00014207336683417083,
+      "loss": 1.489,
+      "step": 52900
+    },
+    {
+      "epoch": 2.65,
+      "grad_norm": 4.021462440490723,
+      "learning_rate": 0.0001417718592964824,
+      "loss": 1.5284,
+      "step": 53000
+    },
+    {
+      "epoch": 2.65,
+      "eval_loss": 1.5171101093292236,
+      "eval_runtime": 41.2537,
+      "eval_samples_per_second": 24.24,
+      "eval_steps_per_second": 3.03,
+      "step": 53000
+    },
+    {
+      "epoch": 2.66,
+      "grad_norm": 14.866168975830078,
+      "learning_rate": 0.00014147035175879395,
+      "loss": 1.5114,
+      "step": 53100
+    },
+    {
+      "epoch": 2.66,
+      "grad_norm": 8.153103828430176,
+      "learning_rate": 0.0001411688442211055,
+      "loss": 1.4384,
+      "step": 53200
+    },
+    {
+      "epoch": 2.67,
+      "grad_norm": 3.6942877769470215,
+      "learning_rate": 0.00014086733668341707,
+      "loss": 1.5189,
+      "step": 53300
+    },
+    {
+      "epoch": 2.67,
+      "grad_norm": 1.7417048215866089,
+      "learning_rate": 0.00014056582914572863,
+      "loss": 1.5491,
+      "step": 53400
+    },
+    {
+      "epoch": 2.67,
+      "grad_norm": 3.291808605194092,
+      "learning_rate": 0.0001402643216080402,
+      "loss": 1.5085,
+      "step": 53500
+    },
+    {
+      "epoch": 2.68,
+      "grad_norm": 4.061951637268066,
+      "learning_rate": 0.00013996281407035175,
+      "loss": 1.5328,
+      "step": 53600
+    },
+    {
+      "epoch": 2.69,
+      "grad_norm": 2.119297742843628,
+      "learning_rate": 0.0001396613065326633,
+      "loss": 1.4872,
+      "step": 53700
+    },
+    {
+      "epoch": 2.69,
+      "grad_norm": 3.23767352104187,
+      "learning_rate": 0.00013935979899497487,
+      "loss": 1.5208,
+      "step": 53800
+    },
+    {
+      "epoch": 2.69,
+      "grad_norm": 4.745997428894043,
+      "learning_rate": 0.00013905829145728643,
+      "loss": 1.5251,
+      "step": 53900
+    },
+    {
+      "epoch": 2.7,
+      "grad_norm": 1.6231697797775269,
+      "learning_rate": 0.000138756783919598,
+      "loss": 1.5151,
+      "step": 54000
+    },
+    {
+      "epoch": 2.7,
+      "eval_loss": 1.4974777698516846,
+      "eval_runtime": 37.2945,
+      "eval_samples_per_second": 26.814,
+      "eval_steps_per_second": 3.352,
+      "step": 54000
+    },
+    {
+      "epoch": 2.71,
+      "grad_norm": 2.7751882076263428,
+      "learning_rate": 0.00013845527638190955,
+      "loss": 1.4891,
+      "step": 54100
+    },
+    {
+      "epoch": 2.71,
+      "grad_norm": 3.2454652786254883,
+      "learning_rate": 0.00013815376884422108,
+      "loss": 1.4929,
+      "step": 54200
+    },
+    {
+      "epoch": 2.71,
+      "grad_norm": 2.5808818340301514,
+      "learning_rate": 0.00013785226130653264,
+      "loss": 1.5728,
+      "step": 54300
+    },
+    {
+      "epoch": 2.72,
+      "grad_norm": 3.754495859146118,
+      "learning_rate": 0.0001375507537688442,
+      "loss": 1.5073,
+      "step": 54400
+    },
+    {
+      "epoch": 2.73,
+      "grad_norm": 2.158400535583496,
+      "learning_rate": 0.00013724924623115576,
+      "loss": 1.5014,
+      "step": 54500
+    },
+    {
+      "epoch": 2.73,
+      "grad_norm": 2.022975206375122,
+      "learning_rate": 0.00013694773869346732,
+      "loss": 1.5147,
+      "step": 54600
+    },
+    {
+      "epoch": 2.73,
+      "grad_norm": 2.061910629272461,
+      "learning_rate": 0.00013664623115577888,
+      "loss": 1.4701,
+      "step": 54700
+    },
+    {
+      "epoch": 2.74,
+      "grad_norm": 2.8262743949890137,
+      "learning_rate": 0.00013634472361809044,
+      "loss": 1.4988,
+      "step": 54800
+    },
+    {
+      "epoch": 2.75,
+      "grad_norm": 2.389618158340454,
+      "learning_rate": 0.000136043216080402,
+      "loss": 1.5138,
+      "step": 54900
+    },
+    {
+      "epoch": 2.75,
+      "grad_norm": 2.31430983543396,
+      "learning_rate": 0.00013574170854271356,
+      "loss": 1.5234,
+      "step": 55000
+    },
+    {
+      "epoch": 2.75,
+      "eval_loss": 1.4553519487380981,
+      "eval_runtime": 43.1448,
+      "eval_samples_per_second": 23.178,
+      "eval_steps_per_second": 2.897,
+      "step": 55000
+    },
+    {
+      "epoch": 2.75,
+      "grad_norm": 2.14939022064209,
+      "learning_rate": 0.00013544020100502512,
+      "loss": 1.4971,
+      "step": 55100
+    },
+    {
+      "epoch": 2.76,
+      "grad_norm": 2.3362553119659424,
+      "learning_rate": 0.00013513869346733667,
+      "loss": 1.4899,
+      "step": 55200
+    },
+    {
+      "epoch": 2.77,
+      "grad_norm": 1.8975802659988403,
+      "learning_rate": 0.00013483718592964823,
+      "loss": 1.4855,
+      "step": 55300
+    },
+    {
+      "epoch": 2.77,
+      "grad_norm": 1.6080349683761597,
+      "learning_rate": 0.0001345356783919598,
+      "loss": 1.4789,
+      "step": 55400
+    },
+    {
+      "epoch": 2.77,
+      "grad_norm": 2.4170632362365723,
+      "learning_rate": 0.00013423417085427135,
+      "loss": 1.5052,
+      "step": 55500
+    },
+    {
+      "epoch": 2.78,
+      "grad_norm": 3.0807037353515625,
+      "learning_rate": 0.0001339326633165829,
+      "loss": 1.4848,
+      "step": 55600
+    },
+    {
+      "epoch": 2.79,
+      "grad_norm": 2.7296719551086426,
+      "learning_rate": 0.00013363115577889447,
+      "loss": 1.4695,
+      "step": 55700
+    },
+    {
+      "epoch": 2.79,
+      "grad_norm": 2.392073154449463,
+      "learning_rate": 0.000133329648241206,
+      "loss": 1.4757,
+      "step": 55800
+    },
+    {
+      "epoch": 2.79,
+      "grad_norm": 1.3958042860031128,
+      "learning_rate": 0.0001330281407035176,
+      "loss": 1.5152,
+      "step": 55900
+    },
+    {
+      "epoch": 2.8,
+      "grad_norm": 5.350344657897949,
+      "learning_rate": 0.00013272663316582912,
+      "loss": 1.4918,
+      "step": 56000
+    },
+    {
+      "epoch": 2.8,
+      "eval_loss": 1.4755498170852661,
+      "eval_runtime": 55.3712,
+      "eval_samples_per_second": 18.06,
+      "eval_steps_per_second": 2.257,
+      "step": 56000
+    },
+    {
+      "epoch": 2.81,
+      "grad_norm": 2.9437155723571777,
+      "learning_rate": 0.0001324251256281407,
+      "loss": 1.4673,
+      "step": 56100
+    },
+    {
+      "epoch": 2.81,
+      "grad_norm": 2.0883982181549072,
+      "learning_rate": 0.00013212361809045224,
+      "loss": 1.5435,
+      "step": 56200
+    },
+    {
+      "epoch": 2.81,
+      "grad_norm": 1.475618600845337,
+      "learning_rate": 0.00013182211055276383,
+      "loss": 1.4895,
+      "step": 56300
+    },
+    {
+      "epoch": 2.82,
+      "grad_norm": 3.1873602867126465,
+      "learning_rate": 0.00013152060301507536,
+      "loss": 1.4753,
+      "step": 56400
+    },
+    {
+      "epoch": 2.83,
+      "grad_norm": 1.1994943618774414,
+      "learning_rate": 0.00013121909547738692,
+      "loss": 1.4627,
+      "step": 56500
+    },
+    {
+      "epoch": 2.83,
+      "grad_norm": 2.3503193855285645,
+      "learning_rate": 0.00013091758793969848,
+      "loss": 1.5312,
+      "step": 56600
+    },
+    {
+      "epoch": 2.83,
+      "grad_norm": 1.5025432109832764,
+      "learning_rate": 0.00013061608040201004,
+      "loss": 1.4981,
+      "step": 56700
+    },
+    {
+      "epoch": 2.84,
+      "grad_norm": 1.2746458053588867,
+      "learning_rate": 0.0001303145728643216,
+      "loss": 1.4644,
+      "step": 56800
+    },
+    {
+      "epoch": 2.84,
+      "grad_norm": 2.1974568367004395,
+      "learning_rate": 0.00013001306532663316,
+      "loss": 1.4958,
+      "step": 56900
+    },
+    {
+      "epoch": 2.85,
+      "grad_norm": 2.137925863265991,
+      "learning_rate": 0.00012971155778894472,
+      "loss": 1.4808,
+      "step": 57000
+    },
+    {
+      "epoch": 2.85,
+      "eval_loss": 1.4832957983016968,
+      "eval_runtime": 58.8442,
+      "eval_samples_per_second": 16.994,
+      "eval_steps_per_second": 2.124,
+      "step": 57000
+    },
+    {
+      "epoch": 2.85,
+      "grad_norm": 1.9320988655090332,
+      "learning_rate": 0.00012941005025125628,
+      "loss": 1.4718,
+      "step": 57100
+    },
+    {
+      "epoch": 2.86,
+      "grad_norm": 2.317089080810547,
+      "learning_rate": 0.00012910854271356784,
+      "loss": 1.436,
+      "step": 57200
+    },
+    {
+      "epoch": 2.87,
+      "grad_norm": 2.3655266761779785,
+      "learning_rate": 0.00012880703517587937,
+      "loss": 1.4967,
+      "step": 57300
+    },
+    {
+      "epoch": 2.87,
+      "grad_norm": 2.1567230224609375,
+      "learning_rate": 0.00012850552763819096,
+      "loss": 1.4319,
+      "step": 57400
+    },
+    {
+      "epoch": 2.88,
+      "grad_norm": 1.759560227394104,
+      "learning_rate": 0.0001282040201005025,
+      "loss": 1.4719,
+      "step": 57500
+    },
+    {
+      "epoch": 2.88,
+      "grad_norm": 3.5496578216552734,
+      "learning_rate": 0.00012790251256281407,
+      "loss": 1.513,
+      "step": 57600
+    },
+    {
+      "epoch": 2.88,
+      "grad_norm": 1.4334951639175415,
+      "learning_rate": 0.0001276010050251256,
+      "loss": 1.5037,
+      "step": 57700
+    },
+    {
+      "epoch": 2.89,
+      "grad_norm": 1.5903607606887817,
+      "learning_rate": 0.0001272994974874372,
+      "loss": 1.4386,
+      "step": 57800
+    },
+    {
+      "epoch": 2.9,
+      "grad_norm": 1.8693747520446777,
+      "learning_rate": 0.00012699798994974873,
+      "loss": 1.4691,
+      "step": 57900
+    },
+    {
+      "epoch": 2.9,
+      "grad_norm": 1.4653220176696777,
+      "learning_rate": 0.00012669648241206029,
+      "loss": 1.4531,
+      "step": 58000
+    },
+    {
+      "epoch": 2.9,
+      "eval_loss": 1.4672846794128418,
+      "eval_runtime": 70.1651,
+      "eval_samples_per_second": 14.252,
+      "eval_steps_per_second": 1.782,
+      "step": 58000
+    },
+    {
+      "epoch": 2.91,
+      "grad_norm": 1.9329131841659546,
+      "learning_rate": 0.00012639497487437184,
+      "loss": 1.4538,
+      "step": 58100
+    },
+    {
+      "epoch": 2.91,
+      "grad_norm": 1.952959418296814,
+      "learning_rate": 0.0001260934673366834,
+      "loss": 1.4872,
+      "step": 58200
+    },
+    {
+      "epoch": 2.92,
+      "grad_norm": 1.1974529027938843,
+      "learning_rate": 0.00012579195979899496,
+      "loss": 1.4487,
+      "step": 58300
+    },
+    {
+      "epoch": 2.92,
+      "grad_norm": 1.7099848985671997,
+      "learning_rate": 0.00012549045226130652,
+      "loss": 1.4708,
+      "step": 58400
+    },
+    {
+      "epoch": 2.92,
+      "grad_norm": 1.2982145547866821,
+      "learning_rate": 0.00012518894472361808,
+      "loss": 1.4524,
+      "step": 58500
+    },
+    {
+      "epoch": 2.93,
+      "grad_norm": 1.275212049484253,
+      "learning_rate": 0.00012488743718592964,
+      "loss": 1.4771,
+      "step": 58600
+    },
+    {
+      "epoch": 2.94,
+      "grad_norm": 1.4571611881256104,
+      "learning_rate": 0.0001245859296482412,
+      "loss": 1.4454,
+      "step": 58700
+    },
+    {
+      "epoch": 2.94,
+      "grad_norm": 1.9355239868164062,
+      "learning_rate": 0.00012428442211055273,
+      "loss": 1.4873,
+      "step": 58800
+    },
+    {
+      "epoch": 2.94,
+      "grad_norm": 1.2806047201156616,
+      "learning_rate": 0.00012398291457286432,
+      "loss": 1.4566,
+      "step": 58900
+    },
+    {
+      "epoch": 2.95,
+      "grad_norm": 3.054337739944458,
+      "learning_rate": 0.00012368442211055277,
+      "loss": 1.5002,
+      "step": 59000
+    },
+    {
+      "epoch": 2.95,
+      "eval_loss": 1.4360119104385376,
+      "eval_runtime": 40.7087,
+      "eval_samples_per_second": 24.565,
+      "eval_steps_per_second": 3.071,
+      "step": 59000
+    },
+    {
+      "epoch": 2.96,
+      "grad_norm": 3.0794198513031006,
+      "learning_rate": 0.0001233829145728643,
+      "loss": 1.4463,
+      "step": 59100
+    },
+    {
+      "epoch": 2.96,
+      "grad_norm": 2.48854923248291,
+      "learning_rate": 0.00012308140703517586,
+      "loss": 1.4211,
+      "step": 59200
+    },
+    {
+      "epoch": 2.96,
+      "grad_norm": 1.163191556930542,
+      "learning_rate": 0.00012277989949748742,
+      "loss": 1.4596,
+      "step": 59300
+    },
+    {
+      "epoch": 2.97,
+      "grad_norm": 2.0918381214141846,
+      "learning_rate": 0.00012247839195979898,
+      "loss": 1.4713,
+      "step": 59400
+    },
+    {
+      "epoch": 2.98,
+      "grad_norm": 2.207432746887207,
+      "learning_rate": 0.00012217688442211054,
+      "loss": 1.4747,
+      "step": 59500
+    },
+    {
+      "epoch": 2.98,
+      "grad_norm": 2.485342502593994,
+      "learning_rate": 0.00012187537688442209,
+      "loss": 1.4531,
+      "step": 59600
+    },
+    {
+      "epoch": 2.98,
+      "grad_norm": 1.9726412296295166,
+      "learning_rate": 0.00012157386934673366,
+      "loss": 1.4685,
+      "step": 59700
+    },
+    {
+      "epoch": 2.99,
+      "grad_norm": 2.211527109146118,
+      "learning_rate": 0.00012127236180904521,
+      "loss": 1.488,
+      "step": 59800
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 2.2727925777435303,
+      "learning_rate": 0.00012097085427135678,
+      "loss": 1.4919,
+      "step": 59900
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 1.9899802207946777,
+      "learning_rate": 0.00012067236180904521,
+      "loss": 1.496,
+      "step": 60000
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 1.480151891708374,
+      "eval_runtime": 37.1123,
+      "eval_samples_per_second": 26.945,
+      "eval_steps_per_second": 3.368,
+      "step": 60000
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 2.080214023590088,
+      "learning_rate": 0.00012037085427135678,
+      "loss": 1.4381,
+      "step": 60100
+    },
+    {
+      "epoch": 3.01,
+      "grad_norm": 2.6185152530670166,
+      "learning_rate": 0.00012006934673366833,
+      "loss": 1.4701,
+      "step": 60200
+    },
+    {
+      "epoch": 3.02,
+      "grad_norm": 2.33237361907959,
+      "learning_rate": 0.0001197678391959799,
+      "loss": 1.4754,
+      "step": 60300
+    },
+    {
+      "epoch": 3.02,
+      "grad_norm": 1.6651071310043335,
+      "learning_rate": 0.00011946633165829144,
+      "loss": 1.4264,
+      "step": 60400
+    },
+    {
+      "epoch": 3.02,
+      "grad_norm": 2.55290150642395,
+      "learning_rate": 0.000119164824120603,
+      "loss": 1.4643,
+      "step": 60500
+    },
+    {
+      "epoch": 3.03,
+      "grad_norm": 2.1360106468200684,
+      "learning_rate": 0.00011886331658291456,
+      "loss": 1.4485,
+      "step": 60600
+    },
+    {
+      "epoch": 3.04,
+      "grad_norm": 2.012885808944702,
+      "learning_rate": 0.00011856180904522612,
+      "loss": 1.4613,
+      "step": 60700
+    },
+    {
+      "epoch": 3.04,
+      "grad_norm": 2.1006858348846436,
+      "learning_rate": 0.00011826030150753767,
+      "loss": 1.4536,
+      "step": 60800
+    },
+    {
+      "epoch": 3.04,
+      "grad_norm": 1.3656415939331055,
+      "learning_rate": 0.00011795879396984924,
+      "loss": 1.452,
+      "step": 60900
+    },
+    {
+      "epoch": 3.05,
+      "grad_norm": 2.107713460922241,
+      "learning_rate": 0.00011765728643216079,
+      "loss": 1.4506,
+      "step": 61000
+    },
+    {
+      "epoch": 3.05,
+      "eval_loss": 1.4423640966415405,
+      "eval_runtime": 36.6897,
+      "eval_samples_per_second": 27.256,
+      "eval_steps_per_second": 3.407,
+      "step": 61000
+    },
+    {
+      "epoch": 3.06,
+      "grad_norm": 1.7318499088287354,
+      "learning_rate": 0.00011735577889447236,
+      "loss": 1.4439,
+      "step": 61100
+    },
+    {
+      "epoch": 3.06,
+      "grad_norm": 2.1133487224578857,
+      "learning_rate": 0.00011705427135678391,
+      "loss": 1.4401,
+      "step": 61200
+    },
+    {
+      "epoch": 3.06,
+      "grad_norm": 1.763225793838501,
+      "learning_rate": 0.00011675276381909548,
+      "loss": 1.4196,
+      "step": 61300
+    },
+    {
+      "epoch": 3.07,
+      "grad_norm": 3.682509660720825,
+      "learning_rate": 0.00011645125628140703,
+      "loss": 1.4355,
+      "step": 61400
+    },
+    {
+      "epoch": 3.08,
+      "grad_norm": 1.6236958503723145,
+      "learning_rate": 0.00011614974874371859,
+      "loss": 1.4288,
+      "step": 61500
+    },
+    {
+      "epoch": 3.08,
+      "grad_norm": 2.428751230239868,
+      "learning_rate": 0.00011584824120603014,
+      "loss": 1.4445,
+      "step": 61600
+    },
+    {
+      "epoch": 3.08,
+      "grad_norm": 1.9624316692352295,
+      "learning_rate": 0.0001155467336683417,
+      "loss": 1.4509,
+      "step": 61700
+    },
+    {
+      "epoch": 3.09,
+      "grad_norm": 1.2408591508865356,
+      "learning_rate": 0.00011524522613065325,
+      "loss": 1.4604,
+      "step": 61800
+    },
+    {
+      "epoch": 3.1,
+      "grad_norm": 1.9674237966537476,
+      "learning_rate": 0.00011494371859296481,
+      "loss": 1.4187,
+      "step": 61900
+    },
+    {
+      "epoch": 3.1,
+      "grad_norm": 1.1599769592285156,
+      "learning_rate": 0.00011464221105527637,
+      "loss": 1.4393,
+      "step": 62000
+    },
+    {
+      "epoch": 3.1,
+      "eval_loss": 1.4237370491027832,
+      "eval_runtime": 36.8952,
+      "eval_samples_per_second": 27.104,
+      "eval_steps_per_second": 3.388,
+      "step": 62000
+    },
+    {
+      "epoch": 3.1,
+      "grad_norm": 1.2391512393951416,
+      "learning_rate": 0.00011434070351758793,
+      "loss": 1.4673,
+      "step": 62100
+    },
+    {
+      "epoch": 3.11,
+      "grad_norm": 1.8557316064834595,
+      "learning_rate": 0.00011403919597989949,
+      "loss": 1.423,
+      "step": 62200
+    },
+    {
+      "epoch": 3.12,
+      "grad_norm": 2.8055520057678223,
+      "learning_rate": 0.00011373768844221103,
+      "loss": 1.4722,
+      "step": 62300
+    },
+    {
+      "epoch": 3.12,
+      "grad_norm": 3.723037004470825,
+      "learning_rate": 0.0001134361809045226,
+      "loss": 1.4313,
+      "step": 62400
+    },
+    {
+      "epoch": 3.12,
+      "grad_norm": 1.5366125106811523,
+      "learning_rate": 0.00011313467336683415,
+      "loss": 1.4678,
+      "step": 62500
+    },
+    {
+      "epoch": 3.13,
+      "grad_norm": 1.5289653539657593,
+      "learning_rate": 0.00011283316582914573,
+      "loss": 1.4313,
+      "step": 62600
+    },
+    {
+      "epoch": 3.13,
+      "grad_norm": 1.496334195137024,
+      "learning_rate": 0.00011253165829145727,
+      "loss": 1.4249,
+      "step": 62700
+    },
+    {
+      "epoch": 3.14,
+      "grad_norm": 2.485732078552246,
+      "learning_rate": 0.00011223015075376884,
+      "loss": 1.4071,
+      "step": 62800
+    },
+    {
+      "epoch": 3.15,
+      "grad_norm": 1.784316897392273,
+      "learning_rate": 0.00011192864321608039,
+      "loss": 1.458,
+      "step": 62900
+    },
+    {
+      "epoch": 3.15,
+      "grad_norm": 1.126514196395874,
+      "learning_rate": 0.00011162713567839195,
+      "loss": 1.4606,
+      "step": 63000
+    },
+    {
+      "epoch": 3.15,
+      "eval_loss": 1.4603298902511597,
+      "eval_runtime": 36.816,
+      "eval_samples_per_second": 27.162,
+      "eval_steps_per_second": 3.395,
+      "step": 63000
+    },
+    {
+      "epoch": 3.15,
+      "grad_norm": 1.4732190370559692,
+      "learning_rate": 0.00011132562814070351,
+      "loss": 1.4118,
+      "step": 63100
+    },
+    {
+      "epoch": 3.16,
+      "grad_norm": 2.9530584812164307,
+      "learning_rate": 0.00011102412060301507,
+      "loss": 1.4413,
+      "step": 63200
+    },
+    {
+      "epoch": 3.17,
+      "grad_norm": 1.5375559329986572,
+      "learning_rate": 0.00011072261306532661,
+      "loss": 1.4301,
+      "step": 63300
+    },
+    {
+      "epoch": 3.17,
+      "grad_norm": 3.584829568862915,
+      "learning_rate": 0.00011042110552763819,
+      "loss": 1.4563,
+      "step": 63400
+    },
+    {
+      "epoch": 3.17,
+      "grad_norm": 3.209752082824707,
+      "learning_rate": 0.00011011959798994973,
+      "loss": 1.4456,
+      "step": 63500
+    },
+    {
+      "epoch": 3.18,
+      "grad_norm": 1.9197559356689453,
+      "learning_rate": 0.0001098180904522613,
+      "loss": 1.4324,
+      "step": 63600
+    },
+    {
+      "epoch": 3.19,
+      "grad_norm": 1.6928168535232544,
+      "learning_rate": 0.00010951658291457285,
+      "loss": 1.4237,
+      "step": 63700
+    },
+    {
+      "epoch": 3.19,
+      "grad_norm": 2.0929553508758545,
+      "learning_rate": 0.00010921507537688443,
+      "loss": 1.4348,
+      "step": 63800
+    },
+    {
+      "epoch": 3.19,
+      "grad_norm": 1.1685271263122559,
+      "learning_rate": 0.00010891356783919597,
+      "loss": 1.4297,
+      "step": 63900
+    },
+    {
+      "epoch": 3.2,
+      "grad_norm": 1.703434944152832,
+      "learning_rate": 0.00010861206030150753,
+      "loss": 1.4324,
+      "step": 64000
+    },
+    {
+      "epoch": 3.2,
+      "eval_loss": 1.458383560180664,
+      "eval_runtime": 37.3628,
+      "eval_samples_per_second": 26.765,
+      "eval_steps_per_second": 3.346,
+      "step": 64000
+    },
+    {
+      "epoch": 3.21,
+      "grad_norm": 2.0976409912109375,
+      "learning_rate": 0.00010831055276381909,
+      "loss": 1.4132,
+      "step": 64100
+    },
+    {
+      "epoch": 3.21,
+      "grad_norm": 2.9562056064605713,
+      "learning_rate": 0.00010801206030150753,
+      "loss": 1.4216,
+      "step": 64200
+    },
+    {
+      "epoch": 3.21,
+      "grad_norm": 3.074629545211792,
+      "learning_rate": 0.00010771055276381909,
+      "loss": 1.4099,
+      "step": 64300
+    },
+    {
+      "epoch": 3.22,
+      "grad_norm": 2.264169454574585,
+      "learning_rate": 0.00010740904522613064,
+      "loss": 1.4065,
+      "step": 64400
+    },
+    {
+      "epoch": 3.23,
+      "grad_norm": 1.6872237920761108,
+      "learning_rate": 0.00010710753768844221,
+      "loss": 1.4264,
+      "step": 64500
+    },
+    {
+      "epoch": 3.23,
+      "grad_norm": 1.8734939098358154,
+      "learning_rate": 0.00010680603015075375,
+      "loss": 1.4346,
+      "step": 64600
+    },
+    {
+      "epoch": 3.23,
+      "grad_norm": 2.0195159912109375,
+      "learning_rate": 0.00010650452261306531,
+      "loss": 1.4557,
+      "step": 64700
+    },
+    {
+      "epoch": 3.24,
+      "grad_norm": 1.7375565767288208,
+      "learning_rate": 0.00010620301507537687,
+      "loss": 1.4237,
+      "step": 64800
+    },
+    {
+      "epoch": 3.25,
+      "grad_norm": 3.346041202545166,
+      "learning_rate": 0.00010590452261306531,
+      "loss": 1.4393,
+      "step": 64900
+    },
+    {
+      "epoch": 3.25,
+      "grad_norm": 1.8216383457183838,
+      "learning_rate": 0.00010560301507537687,
+      "loss": 1.4029,
+      "step": 65000
+    },
+    {
+      "epoch": 3.25,
+      "eval_loss": 1.4565457105636597,
+      "eval_runtime": 36.7819,
+      "eval_samples_per_second": 27.187,
+      "eval_steps_per_second": 3.398,
+      "step": 65000
+    },
+    {
+      "epoch": 3.25,
+      "grad_norm": 2.628929853439331,
+      "learning_rate": 0.00010530150753768843,
+      "loss": 1.3928,
+      "step": 65100
+    },
+    {
+      "epoch": 3.26,
+      "grad_norm": 2.476813554763794,
+      "learning_rate": 0.00010499999999999999,
+      "loss": 1.4377,
+      "step": 65200
+    },
+    {
+      "epoch": 3.27,
+      "grad_norm": 4.745122909545898,
+      "learning_rate": 0.00010469849246231155,
+      "loss": 1.4599,
+      "step": 65300
+    },
+    {
+      "epoch": 3.27,
+      "grad_norm": 4.166439533233643,
+      "learning_rate": 0.0001043969849246231,
+      "loss": 1.4135,
+      "step": 65400
+    },
+    {
+      "epoch": 3.27,
+      "grad_norm": 1.3662863969802856,
+      "learning_rate": 0.00010409547738693467,
+      "loss": 1.4398,
+      "step": 65500
+    },
+    {
+      "epoch": 3.28,
+      "grad_norm": 1.9009519815444946,
+      "learning_rate": 0.00010379396984924622,
+      "loss": 1.4354,
+      "step": 65600
+    },
+    {
+      "epoch": 3.29,
+      "grad_norm": 1.9361233711242676,
+      "learning_rate": 0.00010349246231155779,
+      "loss": 1.382,
+      "step": 65700
+    },
+    {
+      "epoch": 3.29,
+      "grad_norm": 2.0850670337677,
+      "learning_rate": 0.00010319095477386933,
+      "loss": 1.3983,
+      "step": 65800
+    },
+    {
+      "epoch": 3.29,
+      "grad_norm": 3.42114520072937,
+      "learning_rate": 0.0001028894472361809,
+      "loss": 1.414,
+      "step": 65900
+    },
+    {
+      "epoch": 3.3,
+      "grad_norm": 1.9612314701080322,
+      "learning_rate": 0.00010258793969849245,
+      "loss": 1.4494,
+      "step": 66000
+    },
+    {
+      "epoch": 3.3,
+      "eval_loss": 1.4201833009719849,
+      "eval_runtime": 36.8374,
+      "eval_samples_per_second": 27.146,
+      "eval_steps_per_second": 3.393,
+      "step": 66000
+    },
+    {
+      "epoch": 3.31,
+      "grad_norm": 2.3793907165527344,
+      "learning_rate": 0.00010228643216080401,
+      "loss": 1.4567,
+      "step": 66100
+    },
+    {
+      "epoch": 3.31,
+      "grad_norm": 2.0631420612335205,
+      "learning_rate": 0.00010198492462311557,
+      "loss": 1.4246,
+      "step": 66200
+    },
+    {
+      "epoch": 3.31,
+      "grad_norm": 1.973575234413147,
+      "learning_rate": 0.00010168643216080401,
+      "loss": 1.4315,
+      "step": 66300
+    },
+    {
+      "epoch": 3.32,
+      "grad_norm": 4.41493034362793,
+      "learning_rate": 0.00010138492462311557,
+      "loss": 1.3869,
+      "step": 66400
+    },
+    {
+      "epoch": 3.33,
+      "grad_norm": 4.229668617248535,
+      "learning_rate": 0.00010108341708542713,
+      "loss": 1.4064,
+      "step": 66500
+    },
+    {
+      "epoch": 3.33,
+      "grad_norm": 2.8293817043304443,
+      "learning_rate": 0.00010078190954773868,
+      "loss": 1.4204,
+      "step": 66600
+    },
+    {
+      "epoch": 3.33,
+      "grad_norm": 2.0787456035614014,
+      "learning_rate": 0.00010048040201005025,
+      "loss": 1.4011,
+      "step": 66700
+    },
+    {
+      "epoch": 3.34,
+      "grad_norm": 2.429809093475342,
+      "learning_rate": 0.0001001788944723618,
+      "loss": 1.4205,
+      "step": 66800
+    },
+    {
+      "epoch": 3.34,
+      "grad_norm": 1.428712248802185,
+      "learning_rate": 9.987738693467337e-05,
+      "loss": 1.4363,
+      "step": 66900
+    },
+    {
+      "epoch": 3.35,
+      "grad_norm": 1.6250827312469482,
+      "learning_rate": 9.957587939698491e-05,
+      "loss": 1.4225,
+      "step": 67000
+    },
+    {
+      "epoch": 3.35,
+      "eval_loss": 1.4025810956954956,
+      "eval_runtime": 37.6269,
+      "eval_samples_per_second": 26.577,
+      "eval_steps_per_second": 3.322,
+      "step": 67000
+    },
+    {
+      "epoch": 3.35,
+      "grad_norm": 16.908023834228516,
+      "learning_rate": 9.927437185929649e-05,
+      "loss": 1.4019,
+      "step": 67100
+    },
+    {
+      "epoch": 3.36,
+      "grad_norm": 2.529090166091919,
+      "learning_rate": 9.897286432160803e-05,
+      "loss": 1.4536,
+      "step": 67200
+    },
+    {
+      "epoch": 3.37,
+      "grad_norm": 2.6016106605529785,
+      "learning_rate": 9.867135678391959e-05,
+      "loss": 1.4144,
+      "step": 67300
+    },
+    {
+      "epoch": 3.37,
+      "grad_norm": 6.646603584289551,
+      "learning_rate": 9.836984924623115e-05,
+      "loss": 1.4585,
+      "step": 67400
+    },
+    {
+      "epoch": 3.38,
+      "grad_norm": 2.8519158363342285,
+      "learning_rate": 9.80683417085427e-05,
+      "loss": 1.4281,
+      "step": 67500
+    },
+    {
+      "epoch": 3.38,
+      "grad_norm": 2.310377836227417,
+      "learning_rate": 9.776683417085426e-05,
+      "loss": 1.42,
+      "step": 67600
+    },
+    {
+      "epoch": 3.38,
+      "grad_norm": 1.6958341598510742,
+      "learning_rate": 9.746532663316582e-05,
+      "loss": 1.4229,
+      "step": 67700
+    },
+    {
+      "epoch": 3.39,
+      "grad_norm": 3.763411045074463,
+      "learning_rate": 9.716381909547738e-05,
+      "loss": 1.4212,
+      "step": 67800
+    },
+    {
+      "epoch": 3.4,
+      "grad_norm": 3.6792852878570557,
+      "learning_rate": 9.686231155778894e-05,
+      "loss": 1.4438,
+      "step": 67900
+    },
+    {
+      "epoch": 3.4,
+      "grad_norm": 1.9338295459747314,
+      "learning_rate": 9.65608040201005e-05,
+      "loss": 1.4285,
+      "step": 68000
+    },
+    {
+      "epoch": 3.4,
+      "eval_loss": 1.4275307655334473,
+      "eval_runtime": 36.7735,
+      "eval_samples_per_second": 27.193,
+      "eval_steps_per_second": 3.399,
+      "step": 68000
+    },
+    {
+      "epoch": 3.41,
+      "grad_norm": 2.4784669876098633,
+      "learning_rate": 9.625929648241204e-05,
+      "loss": 1.426,
+      "step": 68100
+    },
+    {
+      "epoch": 3.41,
+      "grad_norm": 2.1132450103759766,
+      "learning_rate": 9.595778894472361e-05,
+      "loss": 1.4243,
+      "step": 68200
+    },
+    {
+      "epoch": 3.42,
+      "grad_norm": 1.0781810283660889,
+      "learning_rate": 9.565628140703516e-05,
+      "loss": 1.3787,
+      "step": 68300
+    },
+    {
+      "epoch": 3.42,
+      "grad_norm": 1.9404023885726929,
+      "learning_rate": 9.535477386934673e-05,
+      "loss": 1.4554,
+      "step": 68400
+    },
+    {
+      "epoch": 3.42,
+      "grad_norm": 2.5888288021087646,
+      "learning_rate": 9.505326633165828e-05,
+      "loss": 1.4224,
+      "step": 68500
+    },
+    {
+      "epoch": 3.43,
+      "grad_norm": 1.2692792415618896,
+      "learning_rate": 9.475175879396985e-05,
+      "loss": 1.4262,
+      "step": 68600
+    },
+    {
+      "epoch": 3.44,
+      "grad_norm": 3.69474196434021,
+      "learning_rate": 9.44502512562814e-05,
+      "loss": 1.4348,
+      "step": 68700
+    },
+    {
+      "epoch": 3.44,
+      "grad_norm": 2.908108949661255,
+      "learning_rate": 9.414874371859296e-05,
+      "loss": 1.3915,
+      "step": 68800
+    },
+    {
+      "epoch": 3.44,
+      "grad_norm": 2.764848470687866,
+      "learning_rate": 9.384723618090452e-05,
+      "loss": 1.4007,
+      "step": 68900
+    },
+    {
+      "epoch": 3.45,
+      "grad_norm": 2.2583723068237305,
+      "learning_rate": 9.354572864321608e-05,
+      "loss": 1.4274,
+      "step": 69000
+    },
+    {
+      "epoch": 3.45,
+      "eval_loss": 1.4281996488571167,
+      "eval_runtime": 36.7261,
+      "eval_samples_per_second": 27.229,
+      "eval_steps_per_second": 3.404,
+      "step": 69000
+    },
+    {
+      "epoch": 3.46,
+      "grad_norm": 2.0189828872680664,
+      "learning_rate": 9.324422110552762e-05,
+      "loss": 1.3972,
+      "step": 69100
+    },
+    {
+      "epoch": 3.46,
+      "grad_norm": 2.173034191131592,
+      "learning_rate": 9.29427135678392e-05,
+      "loss": 1.431,
+      "step": 69200
+    },
+    {
+      "epoch": 3.46,
+      "grad_norm": 6.977257251739502,
+      "learning_rate": 9.264120603015074e-05,
+      "loss": 1.422,
+      "step": 69300
+    },
+    {
+      "epoch": 3.47,
+      "grad_norm": 1.9481549263000488,
+      "learning_rate": 9.233969849246231e-05,
+      "loss": 1.424,
+      "step": 69400
+    },
+    {
+      "epoch": 3.48,
+      "grad_norm": 1.5347398519515991,
+      "learning_rate": 9.203819095477386e-05,
+      "loss": 1.4017,
+      "step": 69500
+    },
+    {
+      "epoch": 3.48,
+      "grad_norm": 4.210050582885742,
+      "learning_rate": 9.173668341708543e-05,
+      "loss": 1.4088,
+      "step": 69600
+    },
+    {
+      "epoch": 3.48,
+      "grad_norm": 1.4653583765029907,
+      "learning_rate": 9.143517587939698e-05,
+      "loss": 1.4136,
+      "step": 69700
+    },
+    {
+      "epoch": 3.49,
+      "grad_norm": 1.9992530345916748,
+      "learning_rate": 9.113366834170852e-05,
+      "loss": 1.4389,
+      "step": 69800
+    },
+    {
+      "epoch": 3.5,
+      "grad_norm": 2.986983060836792,
+      "learning_rate": 9.08321608040201e-05,
+      "loss": 1.4207,
+      "step": 69900
+    },
+    {
+      "epoch": 3.5,
+      "grad_norm": 3.8553383350372314,
+      "learning_rate": 9.053065326633164e-05,
+      "loss": 1.4158,
+      "step": 70000
+    },
+    {
+      "epoch": 3.5,
+      "eval_loss": 1.4086616039276123,
+      "eval_runtime": 44.1292,
+      "eval_samples_per_second": 22.661,
+      "eval_steps_per_second": 2.833,
+      "step": 70000
+    },
+    {
+      "epoch": 3.5,
+      "grad_norm": 1.801664113998413,
+      "learning_rate": 9.022914572864322e-05,
+      "loss": 1.4129,
+      "step": 70100
+    },
+    {
+      "epoch": 3.51,
+      "grad_norm": 4.6173529624938965,
+      "learning_rate": 8.992763819095476e-05,
+      "loss": 1.4052,
+      "step": 70200
+    },
+    {
+      "epoch": 3.52,
+      "grad_norm": 2.9409828186035156,
+      "learning_rate": 8.962914572864322e-05,
+      "loss": 1.3987,
+      "step": 70300
+    },
+    {
+      "epoch": 3.52,
+      "grad_norm": 1.4580490589141846,
+      "learning_rate": 8.932763819095476e-05,
+      "loss": 1.4278,
+      "step": 70400
+    },
+    {
+      "epoch": 3.52,
+      "grad_norm": 3.432305335998535,
+      "learning_rate": 8.902613065326632e-05,
+      "loss": 1.4045,
+      "step": 70500
+    },
+    {
+      "epoch": 3.53,
+      "grad_norm": 1.5630279779434204,
+      "learning_rate": 8.872763819095476e-05,
+      "loss": 1.4617,
+      "step": 70600
+    },
+    {
+      "epoch": 3.54,
+      "grad_norm": 2.1527063846588135,
+      "learning_rate": 8.842613065326632e-05,
+      "loss": 1.4445,
+      "step": 70700
+    },
+    {
+      "epoch": 3.54,
+      "grad_norm": 4.876186847686768,
+      "learning_rate": 8.812462311557788e-05,
+      "loss": 1.4025,
+      "step": 70800
+    },
+    {
+      "epoch": 3.54,
+      "grad_norm": 2.359768867492676,
+      "learning_rate": 8.782311557788944e-05,
+      "loss": 1.3849,
+      "step": 70900
+    },
+    {
+      "epoch": 3.55,
+      "grad_norm": 1.0750428438186646,
+      "learning_rate": 8.7521608040201e-05,
+      "loss": 1.3715,
+      "step": 71000
+    },
+    {
+      "epoch": 3.55,
+      "eval_loss": 1.4002715349197388,
+      "eval_runtime": 44.6209,
+      "eval_samples_per_second": 22.411,
+      "eval_steps_per_second": 2.801,
+      "step": 71000
+    },
+    {
+      "epoch": 3.56,
+      "grad_norm": 3.729515552520752,
+      "learning_rate": 8.722010050251256e-05,
+      "loss": 1.4194,
+      "step": 71100
+    },
+    {
+      "epoch": 3.56,
+      "grad_norm": 3.288299560546875,
+      "learning_rate": 8.69185929648241e-05,
+      "loss": 1.4096,
+      "step": 71200
+    },
+    {
+      "epoch": 3.56,
+      "grad_norm": 1.2813684940338135,
+      "learning_rate": 8.661708542713568e-05,
+      "loss": 1.4256,
+      "step": 71300
+    },
+    {
+      "epoch": 3.57,
+      "grad_norm": 1.7768347263336182,
+      "learning_rate": 8.631557788944722e-05,
+      "loss": 1.4021,
+      "step": 71400
+    },
+    {
+      "epoch": 3.58,
+      "grad_norm": 1.5083990097045898,
+      "learning_rate": 8.60140703517588e-05,
+      "loss": 1.3925,
+      "step": 71500
+    },
+    {
+      "epoch": 3.58,
+      "grad_norm": 2.056110382080078,
+      "learning_rate": 8.571256281407034e-05,
+      "loss": 1.3997,
+      "step": 71600
+    },
+    {
+      "epoch": 3.58,
+      "grad_norm": 1.591378927230835,
+      "learning_rate": 8.54110552763819e-05,
+      "loss": 1.382,
+      "step": 71700
+    },
+    {
+      "epoch": 3.59,
+      "grad_norm": 2.8046460151672363,
+      "learning_rate": 8.510954773869346e-05,
+      "loss": 1.4041,
+      "step": 71800
+    },
+    {
+      "epoch": 3.59,
+      "grad_norm": 3.3818118572235107,
+      "learning_rate": 8.480804020100502e-05,
+      "loss": 1.3774,
+      "step": 71900
+    },
+    {
+      "epoch": 3.6,
+      "grad_norm": 1.9918705224990845,
+      "learning_rate": 8.450653266331658e-05,
+      "loss": 1.4071,
+      "step": 72000
+    },
+    {
+      "epoch": 3.6,
+      "eval_loss": 1.3878278732299805,
+      "eval_runtime": 44.5957,
+      "eval_samples_per_second": 22.424,
+      "eval_steps_per_second": 2.803,
+      "step": 72000
+    },
+    {
+      "epoch": 3.6,
+      "grad_norm": 2.911811351776123,
+      "learning_rate": 8.420502512562814e-05,
+      "loss": 1.3809,
+      "step": 72100
+    },
+    {
+      "epoch": 3.61,
+      "grad_norm": 1.6014336347579956,
+      "learning_rate": 8.390351758793968e-05,
+      "loss": 1.434,
+      "step": 72200
+    },
+    {
+      "epoch": 3.62,
+      "grad_norm": 2.2394659519195557,
+      "learning_rate": 8.360201005025126e-05,
+      "loss": 1.4076,
+      "step": 72300
+    },
+    {
+      "epoch": 3.62,
+      "grad_norm": 2.3014962673187256,
+      "learning_rate": 8.33005025125628e-05,
+      "loss": 1.4367,
+      "step": 72400
+    },
+    {
+      "epoch": 3.62,
+      "grad_norm": 3.4550154209136963,
+      "learning_rate": 8.299899497487438e-05,
+      "loss": 1.3938,
+      "step": 72500
+    },
+    {
+      "epoch": 3.63,
+      "grad_norm": 2.599987268447876,
+      "learning_rate": 8.269748743718592e-05,
+      "loss": 1.3873,
+      "step": 72600
+    },
+    {
+      "epoch": 3.63,
+      "grad_norm": 2.6119203567504883,
+      "learning_rate": 8.23959798994975e-05,
+      "loss": 1.4069,
+      "step": 72700
+    },
+    {
+      "epoch": 3.64,
+      "grad_norm": 1.702276349067688,
+      "learning_rate": 8.209447236180904e-05,
+      "loss": 1.3923,
+      "step": 72800
+    },
+    {
+      "epoch": 3.65,
+      "grad_norm": 2.805104970932007,
+      "learning_rate": 8.179296482412059e-05,
+      "loss": 1.386,
+      "step": 72900
+    },
+    {
+      "epoch": 3.65,
+      "grad_norm": 2.306410312652588,
+      "learning_rate": 8.149145728643216e-05,
+      "loss": 1.4118,
+      "step": 73000
+    },
+    {
+      "epoch": 3.65,
+      "eval_loss": 1.387115716934204,
+      "eval_runtime": 37.4562,
+      "eval_samples_per_second": 26.698,
+      "eval_steps_per_second": 3.337,
+      "step": 73000
+    },
+    {
+      "epoch": 3.66,
+      "grad_norm": 2.5508077144622803,
+      "learning_rate": 8.11899497487437e-05,
+      "loss": 1.3963,
+      "step": 73100
+    },
+    {
+      "epoch": 3.66,
+      "grad_norm": 1.1488244533538818,
+      "learning_rate": 8.088844221105527e-05,
+      "loss": 1.4107,
+      "step": 73200
+    },
+    {
+      "epoch": 3.67,
+      "grad_norm": 3.1272361278533936,
+      "learning_rate": 8.058693467336682e-05,
+      "loss": 1.3444,
+      "step": 73300
+    },
+    {
+      "epoch": 3.67,
+      "grad_norm": 2.3045852184295654,
+      "learning_rate": 8.028542713567838e-05,
+      "loss": 1.3874,
+      "step": 73400
+    },
+    {
+      "epoch": 3.67,
+      "grad_norm": 2.2844760417938232,
+      "learning_rate": 7.998391959798994e-05,
+      "loss": 1.3748,
+      "step": 73500
+    },
+    {
+      "epoch": 3.68,
+      "grad_norm": 2.9295897483825684,
+      "learning_rate": 7.96824120603015e-05,
+      "loss": 1.3786,
+      "step": 73600
+    },
+    {
+      "epoch": 3.69,
+      "grad_norm": 1.5064588785171509,
+      "learning_rate": 7.938090452261305e-05,
+      "loss": 1.4079,
+      "step": 73700
+    },
+    {
+      "epoch": 3.69,
+      "grad_norm": 1.8818271160125732,
+      "learning_rate": 7.907939698492462e-05,
+      "loss": 1.3954,
+      "step": 73800
+    },
+    {
+      "epoch": 3.69,
+      "grad_norm": 1.1885104179382324,
+      "learning_rate": 7.877788944723617e-05,
+      "loss": 1.3836,
+      "step": 73900
+    },
+    {
+      "epoch": 3.7,
+      "grad_norm": 3.0773730278015137,
+      "learning_rate": 7.847638190954774e-05,
+      "loss": 1.3724,
+      "step": 74000
+    },
+    {
+      "epoch": 3.7,
+      "eval_loss": 1.3931760787963867,
+      "eval_runtime": 37.2623,
+      "eval_samples_per_second": 26.837,
+      "eval_steps_per_second": 3.355,
+      "step": 74000
+    },
+    {
+      "epoch": 3.71,
+      "grad_norm": 1.4827876091003418,
+      "learning_rate": 7.817487437185929e-05,
+      "loss": 1.4122,
+      "step": 74100
+    },
+    {
+      "epoch": 3.71,
+      "grad_norm": 1.8637442588806152,
+      "learning_rate": 7.787336683417086e-05,
+      "loss": 1.3466,
+      "step": 74200
+    },
+    {
+      "epoch": 3.71,
+      "grad_norm": 2.286088228225708,
+      "learning_rate": 7.75718592964824e-05,
+      "loss": 1.3815,
+      "step": 74300
+    },
+    {
+      "epoch": 3.72,
+      "grad_norm": 1.8894226551055908,
+      "learning_rate": 7.727035175879396e-05,
+      "loss": 1.4096,
+      "step": 74400
+    },
+    {
+      "epoch": 3.73,
+      "grad_norm": 1.9993146657943726,
+      "learning_rate": 7.696884422110552e-05,
+      "loss": 1.4196,
+      "step": 74500
+    },
+    {
+      "epoch": 3.73,
+      "grad_norm": 2.170727491378784,
+      "learning_rate": 7.666733668341708e-05,
+      "loss": 1.4034,
+      "step": 74600
+    },
+    {
+      "epoch": 3.73,
+      "grad_norm": 1.9965826272964478,
+      "learning_rate": 7.636582914572863e-05,
+      "loss": 1.3971,
+      "step": 74700
+    },
+    {
+      "epoch": 3.74,
+      "grad_norm": 2.5607786178588867,
+      "learning_rate": 7.60643216080402e-05,
+      "loss": 1.4116,
+      "step": 74800
+    },
+    {
+      "epoch": 3.75,
+      "grad_norm": 1.5363885164260864,
+      "learning_rate": 7.576281407035175e-05,
+      "loss": 1.3932,
+      "step": 74900
+    },
+    {
+      "epoch": 3.75,
+      "grad_norm": 1.9062122106552124,
+      "learning_rate": 7.546130653266332e-05,
+      "loss": 1.3625,
+      "step": 75000
+    },
+    {
+      "epoch": 3.75,
+      "eval_loss": 1.3875294923782349,
+      "eval_runtime": 37.1595,
+      "eval_samples_per_second": 26.911,
+      "eval_steps_per_second": 3.364,
+      "step": 75000
+    },
+    {
+      "epoch": 3.75,
+      "grad_norm": 2.8305203914642334,
+      "learning_rate": 7.516281407035175e-05,
+      "loss": 1.3907,
+      "step": 75100
+    },
+    {
+      "epoch": 3.76,
+      "grad_norm": 1.6526367664337158,
+      "learning_rate": 7.486130653266331e-05,
+      "loss": 1.3605,
+      "step": 75200
+    },
+    {
+      "epoch": 3.77,
+      "grad_norm": 3.5784194469451904,
+      "learning_rate": 7.455979899497487e-05,
+      "loss": 1.3833,
+      "step": 75300
+    },
+    {
+      "epoch": 3.77,
+      "grad_norm": 0.9500262141227722,
+      "learning_rate": 7.425829145728643e-05,
+      "loss": 1.3548,
+      "step": 75400
+    },
+    {
+      "epoch": 3.77,
+      "grad_norm": 2.8620994091033936,
+      "learning_rate": 7.395678391959799e-05,
+      "loss": 1.3829,
+      "step": 75500
+    },
+    {
+      "epoch": 3.78,
+      "grad_norm": 1.9316253662109375,
+      "learning_rate": 7.365527638190954e-05,
+      "loss": 1.3801,
+      "step": 75600
+    },
+    {
+      "epoch": 3.79,
+      "grad_norm": 1.5958226919174194,
+      "learning_rate": 7.33537688442211e-05,
+      "loss": 1.3837,
+      "step": 75700
+    },
+    {
+      "epoch": 3.79,
+      "grad_norm": 1.995013952255249,
+      "learning_rate": 7.305226130653266e-05,
+      "loss": 1.3933,
+      "step": 75800
+    },
+    {
+      "epoch": 3.79,
+      "grad_norm": 1.4954237937927246,
+      "learning_rate": 7.275075376884422e-05,
+      "loss": 1.4223,
+      "step": 75900
+    },
+    {
+      "epoch": 3.8,
+      "grad_norm": 1.363682508468628,
+      "learning_rate": 7.244924623115577e-05,
+      "loss": 1.3811,
+      "step": 76000
+    },
+    {
+      "epoch": 3.8,
+      "eval_loss": 1.3870151042938232,
+      "eval_runtime": 37.1537,
+      "eval_samples_per_second": 26.915,
+      "eval_steps_per_second": 3.364,
+      "step": 76000
+    },
+    {
+      "epoch": 3.81,
+      "grad_norm": 1.6989458799362183,
+      "learning_rate": 7.214773869346733e-05,
+      "loss": 1.3755,
+      "step": 76100
+    },
+    {
+      "epoch": 3.81,
+      "grad_norm": 2.042025327682495,
+      "learning_rate": 7.184623115577889e-05,
+      "loss": 1.3641,
+      "step": 76200
+    },
+    {
+      "epoch": 3.81,
+      "grad_norm": 1.4355963468551636,
+      "learning_rate": 7.154472361809045e-05,
+      "loss": 1.3675,
+      "step": 76300
+    },
+    {
+      "epoch": 3.82,
+      "grad_norm": 1.0404947996139526,
+      "learning_rate": 7.124321608040201e-05,
+      "loss": 1.3755,
+      "step": 76400
+    },
+    {
+      "epoch": 3.83,
+      "grad_norm": 2.588839530944824,
+      "learning_rate": 7.094170854271357e-05,
+      "loss": 1.369,
+      "step": 76500
+    },
+    {
+      "epoch": 3.83,
+      "grad_norm": 2.2656760215759277,
+      "learning_rate": 7.064020100502511e-05,
+      "loss": 1.4254,
+      "step": 76600
+    },
+    {
+      "epoch": 3.83,
+      "grad_norm": 2.1572184562683105,
+      "learning_rate": 7.033869346733667e-05,
+      "loss": 1.3928,
+      "step": 76700
+    },
+    {
+      "epoch": 3.84,
+      "grad_norm": 1.9283751249313354,
+      "learning_rate": 7.003718592964823e-05,
+      "loss": 1.3502,
+      "step": 76800
+    },
+    {
+      "epoch": 3.84,
+      "grad_norm": 2.215160846710205,
+      "learning_rate": 6.973567839195979e-05,
+      "loss": 1.3701,
+      "step": 76900
+    },
+    {
+      "epoch": 3.85,
+      "grad_norm": 1.9962304830551147,
+      "learning_rate": 6.943417085427135e-05,
+      "loss": 1.3353,
+      "step": 77000
+    },
+    {
+      "epoch": 3.85,
+      "eval_loss": 1.3705061674118042,
+      "eval_runtime": 36.8621,
+      "eval_samples_per_second": 27.128,
+      "eval_steps_per_second": 3.391,
+      "step": 77000
+    },
+    {
+      "epoch": 3.85,
+      "grad_norm": 1.818708062171936,
+      "learning_rate": 6.913266331658291e-05,
+      "loss": 1.4337,
+      "step": 77100
+    },
+    {
+      "epoch": 3.86,
+      "grad_norm": 2.5512075424194336,
+      "learning_rate": 6.883115577889447e-05,
+      "loss": 1.3923,
+      "step": 77200
+    },
+    {
+      "epoch": 3.87,
+      "grad_norm": 2.3595049381256104,
+      "learning_rate": 6.852964824120603e-05,
+      "loss": 1.3626,
+      "step": 77300
+    },
+    {
+      "epoch": 3.87,
+      "grad_norm": 2.033395767211914,
+      "learning_rate": 6.822814070351757e-05,
+      "loss": 1.4061,
+      "step": 77400
+    },
+    {
+      "epoch": 3.88,
+      "grad_norm": 1.8597971200942993,
+      "learning_rate": 6.792663316582913e-05,
+      "loss": 1.3672,
+      "step": 77500
+    },
+    {
+      "epoch": 3.88,
+      "grad_norm": 6.214807987213135,
+      "learning_rate": 6.762512562814069e-05,
+      "loss": 1.3817,
+      "step": 77600
+    },
+    {
+      "epoch": 3.88,
+      "grad_norm": 2.384474515914917,
+      "learning_rate": 6.732361809045225e-05,
+      "loss": 1.3811,
+      "step": 77700
+    },
+    {
+      "epoch": 3.89,
+      "grad_norm": 2.6242575645446777,
+      "learning_rate": 6.702211055276381e-05,
+      "loss": 1.3883,
+      "step": 77800
+    },
+    {
+      "epoch": 3.9,
+      "grad_norm": 1.4827545881271362,
+      "learning_rate": 6.672060301507537e-05,
+      "loss": 1.3755,
+      "step": 77900
+    },
+    {
+      "epoch": 3.9,
+      "grad_norm": 2.150613307952881,
+      "learning_rate": 6.641909547738693e-05,
+      "loss": 1.3937,
+      "step": 78000
+    },
+    {
+      "epoch": 3.9,
+      "eval_loss": 1.379770278930664,
+      "eval_runtime": 37.0122,
+      "eval_samples_per_second": 27.018,
+      "eval_steps_per_second": 3.377,
+      "step": 78000
+    },
+    {
+      "epoch": 3.91,
+      "grad_norm": 1.4805549383163452,
+      "learning_rate": 6.611758793969849e-05,
+      "loss": 1.365,
+      "step": 78100
+    },
+    {
+      "epoch": 3.91,
+      "grad_norm": 1.699484944343567,
+      "learning_rate": 6.581608040201005e-05,
+      "loss": 1.3893,
+      "step": 78200
+    },
+    {
+      "epoch": 3.92,
+      "grad_norm": 2.039008140563965,
+      "learning_rate": 6.551758793969849e-05,
+      "loss": 1.3601,
+      "step": 78300
+    },
+    {
+      "epoch": 3.92,
+      "grad_norm": 1.3309062719345093,
+      "learning_rate": 6.521608040201005e-05,
+      "loss": 1.4056,
+      "step": 78400
+    },
+    {
+      "epoch": 3.92,
+      "grad_norm": 2.382688045501709,
+      "learning_rate": 6.491758793969849e-05,
+      "loss": 1.3758,
+      "step": 78500
+    },
+    {
+      "epoch": 3.93,
+      "grad_norm": 3.1065239906311035,
+      "learning_rate": 6.461608040201005e-05,
+      "loss": 1.3725,
+      "step": 78600
+    },
+    {
+      "epoch": 3.94,
+      "grad_norm": 2.193957805633545,
+      "learning_rate": 6.431457286432161e-05,
+      "loss": 1.3347,
+      "step": 78700
+    },
+    {
+      "epoch": 3.94,
+      "grad_norm": 1.8022537231445312,
+      "learning_rate": 6.401306532663317e-05,
+      "loss": 1.3568,
+      "step": 78800
+    },
+    {
+      "epoch": 3.94,
+      "grad_norm": 1.7433044910430908,
+      "learning_rate": 6.371155778894473e-05,
+      "loss": 1.3643,
+      "step": 78900
+    },
+    {
+      "epoch": 3.95,
+      "grad_norm": 3.883768081665039,
+      "learning_rate": 6.341005025125627e-05,
+      "loss": 1.3158,
+      "step": 79000
+    },
+    {
+      "epoch": 3.95,
+      "eval_loss": 1.37774658203125,
+      "eval_runtime": 36.9703,
+      "eval_samples_per_second": 27.049,
+      "eval_steps_per_second": 3.381,
+      "step": 79000
+    },
+    {
+      "epoch": 3.96,
+      "grad_norm": 2.9996466636657715,
+      "learning_rate": 6.310854271356783e-05,
+      "loss": 1.3295,
+      "step": 79100
+    },
+    {
+      "epoch": 3.96,
+      "grad_norm": 1.2524511814117432,
+      "learning_rate": 6.280703517587939e-05,
+      "loss": 1.3362,
+      "step": 79200
+    },
+    {
+      "epoch": 3.96,
+      "grad_norm": 1.8024784326553345,
+      "learning_rate": 6.250552763819095e-05,
+      "loss": 1.3857,
+      "step": 79300
+    },
+    {
+      "epoch": 3.97,
+      "grad_norm": 1.3988703489303589,
+      "learning_rate": 6.220402010050251e-05,
+      "loss": 1.3463,
+      "step": 79400
+    },
+    {
+      "epoch": 3.98,
+      "grad_norm": 2.499194622039795,
+      "learning_rate": 6.190251256281407e-05,
+      "loss": 1.3761,
+      "step": 79500
+    },
+    {
+      "epoch": 3.98,
+      "grad_norm": 2.799959897994995,
+      "learning_rate": 6.160100502512562e-05,
+      "loss": 1.3995,
+      "step": 79600
+    },
+    {
+      "epoch": 3.98,
+      "grad_norm": 1.2367918491363525,
+      "learning_rate": 6.129949748743717e-05,
+      "loss": 1.3789,
+      "step": 79700
+    },
+    {
+      "epoch": 3.99,
+      "grad_norm": 1.7651324272155762,
+      "learning_rate": 6.0997989949748734e-05,
+      "loss": 1.3691,
+      "step": 79800
+    },
+    {
+      "epoch": 4.0,
+      "grad_norm": 1.8847932815551758,
+      "learning_rate": 6.0696482412060293e-05,
+      "loss": 1.353,
+      "step": 79900
+    },
+    {
+      "epoch": 4.0,
+      "grad_norm": 1.8860230445861816,
+      "learning_rate": 6.039497487437185e-05,
+      "loss": 1.3168,
+      "step": 80000
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 1.3387900590896606,
+      "eval_runtime": 37.0136,
+      "eval_samples_per_second": 27.017,
+      "eval_steps_per_second": 3.377,
+      "step": 80000
+    },
+    {
+      "epoch": 4.0,
+      "grad_norm": 2.109632968902588,
+      "learning_rate": 6.009346733668341e-05,
+      "loss": 1.3518,
+      "step": 80100
+    },
+    {
+      "epoch": 4.01,
+      "grad_norm": 3.0571937561035156,
+      "learning_rate": 5.9791959798994965e-05,
+      "loss": 1.3664,
+      "step": 80200
+    },
+    {
+      "epoch": 4.01,
+      "grad_norm": 1.8600945472717285,
+      "learning_rate": 5.9490452261306525e-05,
+      "loss": 1.3252,
+      "step": 80300
+    },
+    {
+      "epoch": 4.02,
+      "grad_norm": 2.461089611053467,
+      "learning_rate": 5.9188944723618084e-05,
+      "loss": 1.3507,
+      "step": 80400
+    },
+    {
+      "epoch": 4.03,
+      "grad_norm": 3.1424946784973145,
+      "learning_rate": 5.8887437185929643e-05,
+      "loss": 1.3414,
+      "step": 80500
+    },
+    {
+      "epoch": 4.03,
+      "grad_norm": 1.701357126235962,
+      "learning_rate": 5.85859296482412e-05,
+      "loss": 1.3047,
+      "step": 80600
+    },
+    {
+      "epoch": 4.04,
+      "grad_norm": 2.331779956817627,
+      "learning_rate": 5.8284422110552756e-05,
+      "loss": 1.3309,
+      "step": 80700
+    },
+    {
+      "epoch": 4.04,
+      "grad_norm": 5.718899726867676,
+      "learning_rate": 5.7982914572864315e-05,
+      "loss": 1.3796,
+      "step": 80800
+    },
+    {
+      "epoch": 4.04,
+      "grad_norm": 1.1299536228179932,
+      "learning_rate": 5.7681407035175874e-05,
+      "loss": 1.3517,
+      "step": 80900
+    },
+    {
+      "epoch": 4.05,
+      "grad_norm": 2.256490468978882,
+      "learning_rate": 5.7379899497487434e-05,
+      "loss": 1.35,
+      "step": 81000
+    },
+    {
+      "epoch": 4.05,
+      "eval_loss": 1.3527089357376099,
+      "eval_runtime": 37.0846,
+      "eval_samples_per_second": 26.965,
+      "eval_steps_per_second": 3.371,
+      "step": 81000
+    },
+    {
+      "epoch": 4.05,
+      "grad_norm": 2.3526551723480225,
+      "learning_rate": 5.707839195979899e-05,
+      "loss": 1.3327,
+      "step": 81100
+    },
+    {
+      "epoch": 4.06,
+      "grad_norm": 2.987255573272705,
+      "learning_rate": 5.677688442211055e-05,
+      "loss": 1.3369,
+      "step": 81200
+    },
+    {
+      "epoch": 4.07,
+      "grad_norm": 3.35562801361084,
+      "learning_rate": 5.6475376884422105e-05,
+      "loss": 1.3583,
+      "step": 81300
+    },
+    {
+      "epoch": 4.07,
+      "grad_norm": 2.192720651626587,
+      "learning_rate": 5.6173869346733665e-05,
+      "loss": 1.3405,
+      "step": 81400
+    },
+    {
+      "epoch": 4.08,
+      "grad_norm": 1.91267728805542,
+      "learning_rate": 5.5872361809045224e-05,
+      "loss": 1.3281,
+      "step": 81500
+    },
+    {
+      "epoch": 4.08,
+      "grad_norm": 2.5109171867370605,
+      "learning_rate": 5.5570854271356784e-05,
+      "loss": 1.3126,
+      "step": 81600
+    },
+    {
+      "epoch": 4.08,
+      "grad_norm": 1.4430352449417114,
+      "learning_rate": 5.526934673366834e-05,
+      "loss": 1.2983,
+      "step": 81700
+    },
+    {
+      "epoch": 4.09,
+      "grad_norm": 1.873565912246704,
+      "learning_rate": 5.4967839195979896e-05,
+      "loss": 1.292,
+      "step": 81800
+    },
+    {
+      "epoch": 4.09,
+      "grad_norm": 1.6436150074005127,
+      "learning_rate": 5.466633165829145e-05,
+      "loss": 1.3268,
+      "step": 81900
+    },
+    {
+      "epoch": 4.1,
+      "grad_norm": 3.9100093841552734,
+      "learning_rate": 5.436482412060301e-05,
+      "loss": 1.3388,
+      "step": 82000
+    },
+    {
+      "epoch": 4.1,
+      "eval_loss": 1.3313664197921753,
+      "eval_runtime": 37.2377,
+      "eval_samples_per_second": 26.855,
+      "eval_steps_per_second": 3.357,
+      "step": 82000
+    },
+    {
+      "epoch": 4.11,
+      "grad_norm": 3.0072224140167236,
+      "learning_rate": 5.406331658291457e-05,
+      "loss": 1.2984,
+      "step": 82100
+    },
+    {
+      "epoch": 4.11,
+      "grad_norm": 1.3772695064544678,
+      "learning_rate": 5.376180904522612e-05,
+      "loss": 1.3182,
+      "step": 82200
+    },
+    {
+      "epoch": 4.12,
+      "grad_norm": 3.444744110107422,
+      "learning_rate": 5.346030150753768e-05,
+      "loss": 1.3237,
+      "step": 82300
+    },
+    {
+      "epoch": 4.12,
+      "grad_norm": 2.719045400619507,
+      "learning_rate": 5.315879396984924e-05,
+      "loss": 1.349,
+      "step": 82400
+    },
+    {
+      "epoch": 4.12,
+      "grad_norm": 4.052174091339111,
+      "learning_rate": 5.28572864321608e-05,
+      "loss": 1.3504,
+      "step": 82500
+    },
+    {
+      "epoch": 4.13,
+      "grad_norm": 1.3267349004745483,
+      "learning_rate": 5.255577889447236e-05,
+      "loss": 1.3038,
+      "step": 82600
+    },
+    {
+      "epoch": 4.13,
+      "grad_norm": 2.1242218017578125,
+      "learning_rate": 5.225427135678391e-05,
+      "loss": 1.3087,
+      "step": 82700
+    },
+    {
+      "epoch": 4.14,
+      "grad_norm": 3.2896034717559814,
+      "learning_rate": 5.195276381909547e-05,
+      "loss": 1.31,
+      "step": 82800
+    },
+    {
+      "epoch": 4.14,
+      "grad_norm": 2.3824546337127686,
+      "learning_rate": 5.165125628140703e-05,
+      "loss": 1.2972,
+      "step": 82900
+    },
+    {
+      "epoch": 4.15,
+      "grad_norm": 2.0106704235076904,
+      "learning_rate": 5.134974874371859e-05,
+      "loss": 1.3578,
+      "step": 83000
+    },
+    {
+      "epoch": 4.15,
+      "eval_loss": 1.3431867361068726,
+      "eval_runtime": 36.9495,
+      "eval_samples_per_second": 27.064,
+      "eval_steps_per_second": 3.383,
+      "step": 83000
+    },
+    {
+      "epoch": 4.16,
+      "grad_norm": 2.094095230102539,
+      "learning_rate": 5.104824120603015e-05,
+      "loss": 1.3172,
+      "step": 83100
+    },
+    {
+      "epoch": 4.16,
+      "grad_norm": 2.0634231567382812,
+      "learning_rate": 5.074673366834171e-05,
+      "loss": 1.3381,
+      "step": 83200
+    },
+    {
+      "epoch": 4.17,
+      "grad_norm": 4.8768486976623535,
+      "learning_rate": 5.044522613065326e-05,
+      "loss": 1.3148,
+      "step": 83300
+    },
+    {
+      "epoch": 4.17,
+      "grad_norm": 4.0618486404418945,
+      "learning_rate": 5.014371859296482e-05,
+      "loss": 1.3635,
+      "step": 83400
+    },
+    {
+      "epoch": 4.17,
+      "grad_norm": 1.6911975145339966,
+      "learning_rate": 4.984221105527638e-05,
+      "loss": 1.3333,
+      "step": 83500
+    },
+    {
+      "epoch": 4.18,
+      "grad_norm": 2.164687156677246,
+      "learning_rate": 4.954070351758794e-05,
+      "loss": 1.3391,
+      "step": 83600
+    },
+    {
+      "epoch": 4.18,
+      "grad_norm": 1.3353967666625977,
+      "learning_rate": 4.92391959798995e-05,
+      "loss": 1.2989,
+      "step": 83700
+    },
+    {
+      "epoch": 4.19,
+      "grad_norm": 3.4308252334594727,
+      "learning_rate": 4.893768844221105e-05,
+      "loss": 1.2966,
+      "step": 83800
+    },
+    {
+      "epoch": 4.2,
+      "grad_norm": 2.58341121673584,
+      "learning_rate": 4.863618090452261e-05,
+      "loss": 1.3036,
+      "step": 83900
+    },
+    {
+      "epoch": 4.2,
+      "grad_norm": 2.0035691261291504,
+      "learning_rate": 4.833467336683417e-05,
+      "loss": 1.3028,
+      "step": 84000
+    },
+    {
+      "epoch": 4.2,
+      "eval_loss": 1.3128948211669922,
+      "eval_runtime": 36.9125,
+      "eval_samples_per_second": 27.091,
+      "eval_steps_per_second": 3.386,
+      "step": 84000
+    },
+    {
+      "epoch": 4.21,
+      "grad_norm": 3.858201503753662,
+      "learning_rate": 4.803316582914573e-05,
+      "loss": 1.3459,
+      "step": 84100
+    },
+    {
+      "epoch": 4.21,
+      "grad_norm": 1.9417260885238647,
+      "learning_rate": 4.7731658291457275e-05,
+      "loss": 1.3243,
+      "step": 84200
+    },
+    {
+      "epoch": 4.21,
+      "grad_norm": 3.000342607498169,
+      "learning_rate": 4.7430150753768835e-05,
+      "loss": 1.3175,
+      "step": 84300
+    },
+    {
+      "epoch": 4.22,
+      "grad_norm": 5.205733776092529,
+      "learning_rate": 4.7128643216080394e-05,
+      "loss": 1.3486,
+      "step": 84400
+    },
+    {
+      "epoch": 4.22,
+      "grad_norm": 3.2263059616088867,
+      "learning_rate": 4.683015075376885e-05,
+      "loss": 1.3103,
+      "step": 84500
+    },
+    {
+      "epoch": 4.23,
+      "grad_norm": 2.210460662841797,
+      "learning_rate": 4.652864321608039e-05,
+      "loss": 1.339,
+      "step": 84600
+    },
+    {
+      "epoch": 4.24,
+      "grad_norm": 3.195037364959717,
+      "learning_rate": 4.622713567839195e-05,
+      "loss": 1.3472,
+      "step": 84700
+    },
+    {
+      "epoch": 4.24,
+      "grad_norm": 2.5944504737854004,
+      "learning_rate": 4.592562814070351e-05,
+      "loss": 1.3159,
+      "step": 84800
+    },
+    {
+      "epoch": 4.25,
+      "grad_norm": 2.6246960163116455,
+      "learning_rate": 4.562412060301507e-05,
+      "loss": 1.2896,
+      "step": 84900
+    },
+    {
+      "epoch": 4.25,
+      "grad_norm": 2.556218385696411,
+      "learning_rate": 4.5322613065326624e-05,
+      "loss": 1.3187,
+      "step": 85000
+    },
+    {
+      "epoch": 4.25,
+      "eval_loss": 1.324548363685608,
+      "eval_runtime": 36.9422,
+      "eval_samples_per_second": 27.069,
+      "eval_steps_per_second": 3.384,
+      "step": 85000
+    },
+    {
+      "epoch": 4.25,
+      "grad_norm": 2.348661184310913,
+      "learning_rate": 4.5021105527638184e-05,
+      "loss": 1.3004,
+      "step": 85100
+    },
+    {
+      "epoch": 4.26,
+      "grad_norm": 2.0715219974517822,
+      "learning_rate": 4.471959798994974e-05,
+      "loss": 1.3265,
+      "step": 85200
+    },
+    {
+      "epoch": 4.26,
+      "grad_norm": 2.5301461219787598,
+      "learning_rate": 4.44180904522613e-05,
+      "loss": 1.3187,
+      "step": 85300
+    },
+    {
+      "epoch": 4.27,
+      "grad_norm": 5.115157604217529,
+      "learning_rate": 4.411658291457286e-05,
+      "loss": 1.3179,
+      "step": 85400
+    },
+    {
+      "epoch": 4.28,
+      "grad_norm": 2.671520471572876,
+      "learning_rate": 4.3815075376884415e-05,
+      "loss": 1.2988,
+      "step": 85500
+    },
+    {
+      "epoch": 4.28,
+      "grad_norm": 2.0337090492248535,
+      "learning_rate": 4.3513567839195974e-05,
+      "loss": 1.2781,
+      "step": 85600
+    },
+    {
+      "epoch": 4.29,
+      "grad_norm": 2.735685348510742,
+      "learning_rate": 4.3212060301507534e-05,
+      "loss": 1.3167,
+      "step": 85700
+    },
+    {
+      "epoch": 4.29,
+      "grad_norm": 2.573694944381714,
+      "learning_rate": 4.291055276381909e-05,
+      "loss": 1.3057,
+      "step": 85800
+    },
+    {
+      "epoch": 4.29,
+      "grad_norm": 1.5959097146987915,
+      "learning_rate": 4.260904522613065e-05,
+      "loss": 1.327,
+      "step": 85900
+    },
+    {
+      "epoch": 4.3,
+      "grad_norm": 2.1494078636169434,
+      "learning_rate": 4.2307537688442205e-05,
+      "loss": 1.3291,
+      "step": 86000
+    },
+    {
+      "epoch": 4.3,
+      "eval_loss": 1.3199845552444458,
+      "eval_runtime": 36.8676,
+      "eval_samples_per_second": 27.124,
+      "eval_steps_per_second": 3.391,
+      "step": 86000
+    },
+    {
+      "epoch": 4.3,
+      "grad_norm": 1.7684657573699951,
+      "learning_rate": 4.2006030150753765e-05,
+      "loss": 1.3341,
+      "step": 86100
+    },
+    {
+      "epoch": 4.31,
+      "grad_norm": 2.656754732131958,
+      "learning_rate": 4.1704522613065324e-05,
+      "loss": 1.3275,
+      "step": 86200
+    },
+    {
+      "epoch": 4.32,
+      "grad_norm": 1.330156683921814,
+      "learning_rate": 4.1403015075376883e-05,
+      "loss": 1.3365,
+      "step": 86300
+    },
+    {
+      "epoch": 4.32,
+      "grad_norm": 3.0051920413970947,
+      "learning_rate": 4.110150753768844e-05,
+      "loss": 1.303,
+      "step": 86400
+    },
+    {
+      "epoch": 4.33,
+      "grad_norm": 1.494343638420105,
+      "learning_rate": 4.08e-05,
+      "loss": 1.2839,
+      "step": 86500
+    },
+    {
+      "epoch": 4.33,
+      "grad_norm": 3.456125020980835,
+      "learning_rate": 4.0498492462311555e-05,
+      "loss": 1.2964,
+      "step": 86600
+    },
+    {
+      "epoch": 4.33,
+      "grad_norm": 2.114023447036743,
+      "learning_rate": 4.0196984924623115e-05,
+      "loss": 1.3397,
+      "step": 86700
+    },
+    {
+      "epoch": 4.34,
+      "grad_norm": 2.5307486057281494,
+      "learning_rate": 3.9895477386934674e-05,
+      "loss": 1.2841,
+      "step": 86800
+    },
+    {
+      "epoch": 4.34,
+      "grad_norm": 1.866807222366333,
+      "learning_rate": 3.9596984924623113e-05,
+      "loss": 1.3218,
+      "step": 86900
+    },
+    {
+      "epoch": 4.35,
+      "grad_norm": 1.9067096710205078,
+      "learning_rate": 3.929547738693467e-05,
+      "loss": 1.3257,
+      "step": 87000
+    },
+    {
+      "epoch": 4.35,
+      "eval_loss": 1.3586480617523193,
+      "eval_runtime": 37.2768,
+      "eval_samples_per_second": 26.826,
+      "eval_steps_per_second": 3.353,
+      "step": 87000
+    },
+    {
+      "epoch": 4.36,
+      "grad_norm": 3.3964035511016846,
+      "learning_rate": 3.899396984924623e-05,
+      "loss": 1.3385,
+      "step": 87100
+    },
+    {
+      "epoch": 4.36,
+      "grad_norm": 3.567638874053955,
+      "learning_rate": 3.869246231155778e-05,
+      "loss": 1.3281,
+      "step": 87200
+    },
+    {
+      "epoch": 4.37,
+      "grad_norm": 2.170928955078125,
+      "learning_rate": 3.839095477386934e-05,
+      "loss": 1.3159,
+      "step": 87300
+    },
+    {
+      "epoch": 4.37,
+      "grad_norm": 2.326664924621582,
+      "learning_rate": 3.80894472361809e-05,
+      "loss": 1.3499,
+      "step": 87400
+    },
+    {
+      "epoch": 4.38,
+      "grad_norm": 2.8397111892700195,
+      "learning_rate": 3.778793969849246e-05,
+      "loss": 1.349,
+      "step": 87500
+    },
+    {
+      "epoch": 4.38,
+      "grad_norm": 2.2665247917175293,
+      "learning_rate": 3.7486432160804016e-05,
+      "loss": 1.3172,
+      "step": 87600
+    },
+    {
+      "epoch": 4.38,
+      "grad_norm": 4.748266696929932,
+      "learning_rate": 3.718793969849246e-05,
+      "loss": 1.2857,
+      "step": 87700
+    },
+    {
+      "epoch": 4.39,
+      "grad_norm": 1.4008738994598389,
+      "learning_rate": 3.6886432160804015e-05,
+      "loss": 1.276,
+      "step": 87800
+    },
+    {
+      "epoch": 4.39,
+      "grad_norm": 3.0103044509887695,
+      "learning_rate": 3.6584924623115574e-05,
+      "loss": 1.3114,
+      "step": 87900
+    },
+    {
+      "epoch": 4.4,
+      "grad_norm": 1.6098419427871704,
+      "learning_rate": 3.6283417085427134e-05,
+      "loss": 1.2995,
+      "step": 88000
+    },
+    {
+      "epoch": 4.4,
+      "eval_loss": 1.2803822755813599,
+      "eval_runtime": 37.2497,
+      "eval_samples_per_second": 26.846,
+      "eval_steps_per_second": 3.356,
+      "step": 88000
+    },
+    {
+      "epoch": 4.41,
+      "grad_norm": 1.6589020490646362,
+      "learning_rate": 3.5981909547738693e-05,
+      "loss": 1.3214,
+      "step": 88100
+    },
+    {
+      "epoch": 4.41,
+      "grad_norm": 3.2466797828674316,
+      "learning_rate": 3.5680402010050246e-05,
+      "loss": 1.2685,
+      "step": 88200
+    },
+    {
+      "epoch": 4.42,
+      "grad_norm": 3.273249626159668,
+      "learning_rate": 3.5378894472361806e-05,
+      "loss": 1.2773,
+      "step": 88300
+    },
+    {
+      "epoch": 4.42,
+      "grad_norm": 2.712141513824463,
+      "learning_rate": 3.5077386934673365e-05,
+      "loss": 1.3107,
+      "step": 88400
+    },
+    {
+      "epoch": 4.42,
+      "grad_norm": 2.671769857406616,
+      "learning_rate": 3.477587939698492e-05,
+      "loss": 1.3149,
+      "step": 88500
+    },
+    {
+      "epoch": 4.43,
+      "grad_norm": 1.6911038160324097,
+      "learning_rate": 3.447437185929648e-05,
+      "loss": 1.312,
+      "step": 88600
+    },
+    {
+      "epoch": 4.43,
+      "grad_norm": 2.3690972328186035,
+      "learning_rate": 3.4172864321608037e-05,
+      "loss": 1.2826,
+      "step": 88700
+    },
+    {
+      "epoch": 4.44,
+      "grad_norm": 2.1747915744781494,
+      "learning_rate": 3.3871356783919596e-05,
+      "loss": 1.3187,
+      "step": 88800
+    },
+    {
+      "epoch": 4.45,
+      "grad_norm": 1.7691452503204346,
+      "learning_rate": 3.3569849246231155e-05,
+      "loss": 1.3356,
+      "step": 88900
+    },
+    {
+      "epoch": 4.45,
+      "grad_norm": 2.9473211765289307,
+      "learning_rate": 3.3268341708542715e-05,
+      "loss": 1.2721,
+      "step": 89000
+    },
+    {
+      "epoch": 4.45,
+      "eval_loss": 1.3034113645553589,
+      "eval_runtime": 37.3017,
+      "eval_samples_per_second": 26.808,
+      "eval_steps_per_second": 3.351,
+      "step": 89000
+    },
+    {
+      "epoch": 4.46,
+      "grad_norm": 2.840188503265381,
+      "learning_rate": 3.296683417085427e-05,
+      "loss": 1.2842,
+      "step": 89100
+    },
+    {
+      "epoch": 4.46,
+      "grad_norm": 1.5584784746170044,
+      "learning_rate": 3.266532663316583e-05,
+      "loss": 1.2961,
+      "step": 89200
+    },
+    {
+      "epoch": 4.46,
+      "grad_norm": 1.889328956604004,
+      "learning_rate": 3.236381909547738e-05,
+      "loss": 1.3121,
+      "step": 89300
+    },
+    {
+      "epoch": 4.47,
+      "grad_norm": 1.627302646636963,
+      "learning_rate": 3.206231155778894e-05,
+      "loss": 1.3169,
+      "step": 89400
+    },
+    {
+      "epoch": 4.47,
+      "grad_norm": 3.445068359375,
+      "learning_rate": 3.17608040201005e-05,
+      "loss": 1.2691,
+      "step": 89500
+    },
+    {
+      "epoch": 4.48,
+      "grad_norm": 3.2616968154907227,
+      "learning_rate": 3.145929648241206e-05,
+      "loss": 1.2818,
+      "step": 89600
+    },
+    {
+      "epoch": 4.49,
+      "grad_norm": 2.2444956302642822,
+      "learning_rate": 3.115778894472362e-05,
+      "loss": 1.2836,
+      "step": 89700
+    },
+    {
+      "epoch": 4.49,
+      "grad_norm": 2.341099500656128,
+      "learning_rate": 3.085628140703517e-05,
+      "loss": 1.2712,
+      "step": 89800
+    },
+    {
+      "epoch": 4.5,
+      "grad_norm": 1.882042407989502,
+      "learning_rate": 3.055477386934673e-05,
+      "loss": 1.3316,
+      "step": 89900
+    },
+    {
+      "epoch": 4.5,
+      "grad_norm": 3.1667628288269043,
+      "learning_rate": 3.025326633165829e-05,
+      "loss": 1.312,
+      "step": 90000
+    },
+    {
+      "epoch": 4.5,
+      "eval_loss": 1.309124231338501,
+      "eval_runtime": 37.1306,
+      "eval_samples_per_second": 26.932,
+      "eval_steps_per_second": 3.366,
+      "step": 90000
+    },
+    {
+      "epoch": 4.5,
+      "grad_norm": 2.1243770122528076,
+      "learning_rate": 2.9954773869346732e-05,
+      "loss": 1.2924,
+      "step": 90100
+    },
+    {
+      "epoch": 4.51,
+      "grad_norm": 10.278544425964355,
+      "learning_rate": 2.9653266331658288e-05,
+      "loss": 1.3081,
+      "step": 90200
+    },
+    {
+      "epoch": 4.51,
+      "grad_norm": 8.060081481933594,
+      "learning_rate": 2.9351758793969847e-05,
+      "loss": 1.3317,
+      "step": 90300
+    },
+    {
+      "epoch": 4.52,
+      "grad_norm": 1.364973783493042,
+      "learning_rate": 2.9050251256281404e-05,
+      "loss": 1.2882,
+      "step": 90400
+    },
+    {
+      "epoch": 4.53,
+      "grad_norm": 1.1993753910064697,
+      "learning_rate": 2.8748743718592963e-05,
+      "loss": 1.292,
+      "step": 90500
+    },
+    {
+      "epoch": 4.53,
+      "grad_norm": 2.897918462753296,
+      "learning_rate": 2.8447236180904522e-05,
+      "loss": 1.2804,
+      "step": 90600
+    },
+    {
+      "epoch": 4.54,
+      "grad_norm": 2.7222065925598145,
+      "learning_rate": 2.814572864321608e-05,
+      "loss": 1.2985,
+      "step": 90700
+    },
+    {
+      "epoch": 4.54,
+      "grad_norm": 5.0719099044799805,
+      "learning_rate": 2.7844221105527635e-05,
+      "loss": 1.2768,
+      "step": 90800
+    },
+    {
+      "epoch": 4.54,
+      "grad_norm": 2.1225016117095947,
+      "learning_rate": 2.754271356783919e-05,
+      "loss": 1.2774,
+      "step": 90900
+    },
+    {
+      "epoch": 4.55,
+      "grad_norm": 1.314208984375,
+      "learning_rate": 2.724120603015075e-05,
+      "loss": 1.3064,
+      "step": 91000
+    },
+    {
+      "epoch": 4.55,
+      "eval_loss": 1.2666804790496826,
+      "eval_runtime": 37.0231,
+      "eval_samples_per_second": 27.01,
+      "eval_steps_per_second": 3.376,
+      "step": 91000
+    },
+    {
+      "epoch": 4.55,
+      "grad_norm": 1.5543466806411743,
+      "learning_rate": 2.693969849246231e-05,
+      "loss": 1.2905,
+      "step": 91100
+    },
+    {
+      "epoch": 4.56,
+      "grad_norm": 1.3213764429092407,
+      "learning_rate": 2.6638190954773866e-05,
+      "loss": 1.2541,
+      "step": 91200
+    },
+    {
+      "epoch": 4.56,
+      "grad_norm": 2.523181200027466,
+      "learning_rate": 2.6336683417085425e-05,
+      "loss": 1.2743,
+      "step": 91300
+    },
+    {
+      "epoch": 4.57,
+      "grad_norm": 3.6385650634765625,
+      "learning_rate": 2.6035175879396984e-05,
+      "loss": 1.2936,
+      "step": 91400
+    },
+    {
+      "epoch": 4.58,
+      "grad_norm": 2.2603909969329834,
+      "learning_rate": 2.573366834170854e-05,
+      "loss": 1.252,
+      "step": 91500
+    },
+    {
+      "epoch": 4.58,
+      "grad_norm": 3.178767442703247,
+      "learning_rate": 2.54321608040201e-05,
+      "loss": 1.2637,
+      "step": 91600
+    },
+    {
+      "epoch": 4.58,
+      "grad_norm": 1.8341214656829834,
+      "learning_rate": 2.5130653266331656e-05,
+      "loss": 1.2692,
+      "step": 91700
+    },
+    {
+      "epoch": 4.59,
+      "grad_norm": 3.1009461879730225,
+      "learning_rate": 2.4829145728643216e-05,
+      "loss": 1.2821,
+      "step": 91800
+    },
+    {
+      "epoch": 4.59,
+      "grad_norm": 2.75738263130188,
+      "learning_rate": 2.4527638190954775e-05,
+      "loss": 1.284,
+      "step": 91900
+    },
+    {
+      "epoch": 4.6,
+      "grad_norm": 3.137352466583252,
+      "learning_rate": 2.4226130653266328e-05,
+      "loss": 1.2846,
+      "step": 92000
+    },
+    {
+      "epoch": 4.6,
+      "eval_loss": 1.244607925415039,
+      "eval_runtime": 37.127,
+      "eval_samples_per_second": 26.935,
+      "eval_steps_per_second": 3.367,
+      "step": 92000
+    },
+    {
+      "epoch": 4.61,
+      "grad_norm": 2.5030505657196045,
+      "learning_rate": 2.3924623115577887e-05,
+      "loss": 1.2793,
+      "step": 92100
+    },
+    {
+      "epoch": 4.61,
+      "grad_norm": 2.625671625137329,
+      "learning_rate": 2.3623115577889443e-05,
+      "loss": 1.2575,
+      "step": 92200
+    },
+    {
+      "epoch": 4.62,
+      "grad_norm": 1.8129239082336426,
+      "learning_rate": 2.3321608040201003e-05,
+      "loss": 1.265,
+      "step": 92300
+    },
+    {
+      "epoch": 4.62,
+      "grad_norm": 2.747807502746582,
+      "learning_rate": 2.3020100502512562e-05,
+      "loss": 1.2781,
+      "step": 92400
+    },
+    {
+      "epoch": 4.62,
+      "grad_norm": 3.083634853363037,
+      "learning_rate": 2.2718592964824118e-05,
+      "loss": 1.2667,
+      "step": 92500
+    },
+    {
+      "epoch": 4.63,
+      "grad_norm": 3.475771427154541,
+      "learning_rate": 2.2417085427135678e-05,
+      "loss": 1.2278,
+      "step": 92600
+    },
+    {
+      "epoch": 4.63,
+      "grad_norm": 4.056103706359863,
+      "learning_rate": 2.2115577889447234e-05,
+      "loss": 1.276,
+      "step": 92700
+    },
+    {
+      "epoch": 4.64,
+      "grad_norm": 3.8437623977661133,
+      "learning_rate": 2.1814070351758793e-05,
+      "loss": 1.2799,
+      "step": 92800
+    },
+    {
+      "epoch": 4.64,
+      "grad_norm": 2.0102827548980713,
+      "learning_rate": 2.1512562814070353e-05,
+      "loss": 1.2589,
+      "step": 92900
+    },
+    {
+      "epoch": 4.65,
+      "grad_norm": 3.917171001434326,
+      "learning_rate": 2.121105527638191e-05,
+      "loss": 1.245,
+      "step": 93000
+    },
+    {
+      "epoch": 4.65,
+      "eval_loss": 1.2793523073196411,
+      "eval_runtime": 37.5859,
+      "eval_samples_per_second": 26.606,
+      "eval_steps_per_second": 3.326,
+      "step": 93000
+    },
+    {
+      "epoch": 4.66,
+      "grad_norm": 1.847579836845398,
+      "learning_rate": 2.0909547738693465e-05,
+      "loss": 1.2172,
+      "step": 93100
+    },
+    {
+      "epoch": 4.66,
+      "grad_norm": 1.8271080255508423,
+      "learning_rate": 2.060804020100502e-05,
+      "loss": 1.2781,
+      "step": 93200
+    },
+    {
+      "epoch": 4.67,
+      "grad_norm": 3.5897600650787354,
+      "learning_rate": 2.030653266331658e-05,
+      "loss": 1.2587,
+      "step": 93300
+    },
+    {
+      "epoch": 4.67,
+      "grad_norm": 2.1579065322875977,
+      "learning_rate": 2.0008040201005026e-05,
+      "loss": 1.2742,
+      "step": 93400
+    },
+    {
+      "epoch": 4.67,
+      "grad_norm": 2.0196428298950195,
+      "learning_rate": 1.970653266331658e-05,
+      "loss": 1.24,
+      "step": 93500
+    },
+    {
+      "epoch": 4.68,
+      "grad_norm": 2.4959633350372314,
+      "learning_rate": 1.940502512562814e-05,
+      "loss": 1.28,
+      "step": 93600
+    },
+    {
+      "epoch": 4.69,
+      "grad_norm": 2.4107792377471924,
+      "learning_rate": 1.9103517587939695e-05,
+      "loss": 1.326,
+      "step": 93700
+    },
+    {
+      "epoch": 4.69,
+      "grad_norm": 3.1433868408203125,
+      "learning_rate": 1.8802010050251254e-05,
+      "loss": 1.2423,
+      "step": 93800
+    },
+    {
+      "epoch": 4.7,
+      "grad_norm": 3.2713465690612793,
+      "learning_rate": 1.8500502512562814e-05,
+      "loss": 1.3136,
+      "step": 93900
+    },
+    {
+      "epoch": 4.7,
+      "grad_norm": 2.5864298343658447,
+      "learning_rate": 1.819899497487437e-05,
+      "loss": 1.3029,
+      "step": 94000
+    },
+    {
+      "epoch": 4.7,
+      "eval_loss": 1.3026132583618164,
+      "eval_runtime": 37.3025,
+      "eval_samples_per_second": 26.808,
+      "eval_steps_per_second": 3.351,
+      "step": 94000
+    },
+    {
+      "epoch": 4.71,
+      "grad_norm": 3.219913959503174,
+      "learning_rate": 1.789748743718593e-05,
+      "loss": 1.3279,
+      "step": 94100
+    },
+    {
+      "epoch": 4.71,
+      "grad_norm": 2.7392513751983643,
+      "learning_rate": 1.7595979899497485e-05,
+      "loss": 1.2637,
+      "step": 94200
+    },
+    {
+      "epoch": 4.71,
+      "grad_norm": 3.506613254547119,
+      "learning_rate": 1.7294472361809045e-05,
+      "loss": 1.2762,
+      "step": 94300
+    },
+    {
+      "epoch": 4.72,
+      "grad_norm": 1.6417380571365356,
+      "learning_rate": 1.69929648241206e-05,
+      "loss": 1.2733,
+      "step": 94400
+    },
+    {
+      "epoch": 4.72,
+      "grad_norm": 3.572312355041504,
+      "learning_rate": 1.669145728643216e-05,
+      "loss": 1.2524,
+      "step": 94500
+    },
+    {
+      "epoch": 4.73,
+      "grad_norm": 2.3081557750701904,
+      "learning_rate": 1.6389949748743716e-05,
+      "loss": 1.2388,
+      "step": 94600
+    },
+    {
+      "epoch": 4.74,
+      "grad_norm": 3.257410764694214,
+      "learning_rate": 1.6088442211055276e-05,
+      "loss": 1.2395,
+      "step": 94700
+    },
+    {
+      "epoch": 4.74,
+      "grad_norm": 3.3409955501556396,
+      "learning_rate": 1.5786934673366835e-05,
+      "loss": 1.2464,
+      "step": 94800
+    },
+    {
+      "epoch": 4.75,
+      "grad_norm": 2.011337995529175,
+      "learning_rate": 1.548542713567839e-05,
+      "loss": 1.2799,
+      "step": 94900
+    },
+    {
+      "epoch": 4.75,
+      "grad_norm": 3.929819107055664,
+      "learning_rate": 1.5183919597989947e-05,
+      "loss": 1.2408,
+      "step": 95000
+    },
+    {
+      "epoch": 4.75,
+      "eval_loss": 1.2593971490859985,
+      "eval_runtime": 37.2434,
+      "eval_samples_per_second": 26.85,
+      "eval_steps_per_second": 3.356,
+      "step": 95000
+    },
+    {
+      "epoch": 4.75,
+      "grad_norm": 3.053884983062744,
+      "learning_rate": 1.4882412060301507e-05,
+      "loss": 1.2509,
+      "step": 95100
+    },
+    {
+      "epoch": 4.76,
+      "grad_norm": 2.4242303371429443,
+      "learning_rate": 1.4580904522613064e-05,
+      "loss": 1.2419,
+      "step": 95200
+    },
+    {
+      "epoch": 4.76,
+      "grad_norm": 5.195686340332031,
+      "learning_rate": 1.4279396984924622e-05,
+      "loss": 1.2542,
+      "step": 95300
+    },
+    {
+      "epoch": 4.77,
+      "grad_norm": 2.4507782459259033,
+      "learning_rate": 1.397788944723618e-05,
+      "loss": 1.2388,
+      "step": 95400
+    },
+    {
+      "epoch": 4.78,
+      "grad_norm": 3.998619318008423,
+      "learning_rate": 1.3676381909547736e-05,
+      "loss": 1.225,
+      "step": 95500
+    },
+    {
+      "epoch": 4.78,
+      "grad_norm": 6.8137664794921875,
+      "learning_rate": 1.3374874371859295e-05,
+      "loss": 1.2379,
+      "step": 95600
+    },
+    {
+      "epoch": 4.79,
+      "grad_norm": 2.8627781867980957,
+      "learning_rate": 1.3073366834170853e-05,
+      "loss": 1.2752,
+      "step": 95700
+    },
+    {
+      "epoch": 4.79,
+      "grad_norm": 2.2454240322113037,
+      "learning_rate": 1.2771859296482411e-05,
+      "loss": 1.243,
+      "step": 95800
+    },
+    {
+      "epoch": 4.79,
+      "grad_norm": 1.6306633949279785,
+      "learning_rate": 1.2470351758793969e-05,
+      "loss": 1.2593,
+      "step": 95900
+    },
+    {
+      "epoch": 4.8,
+      "grad_norm": 2.288721799850464,
+      "learning_rate": 1.217185929648241e-05,
+      "loss": 1.246,
+      "step": 96000
+    },
+    {
+      "epoch": 4.8,
+      "eval_loss": 1.245192527770996,
+      "eval_runtime": 37.2658,
+      "eval_samples_per_second": 26.834,
+      "eval_steps_per_second": 3.354,
+      "step": 96000
+    },
+    {
+      "epoch": 4.8,
+      "grad_norm": 3.1359307765960693,
+      "learning_rate": 1.187035175879397e-05,
+      "loss": 1.2531,
+      "step": 96100
+    },
+    {
+      "epoch": 4.81,
+      "grad_norm": 3.468085527420044,
+      "learning_rate": 1.1568844221105527e-05,
+      "loss": 1.2602,
+      "step": 96200
+    },
+    {
+      "epoch": 4.81,
+      "grad_norm": 2.3539419174194336,
+      "learning_rate": 1.1267336683417085e-05,
+      "loss": 1.236,
+      "step": 96300
+    },
+    {
+      "epoch": 4.82,
+      "grad_norm": 2.901660442352295,
+      "learning_rate": 1.0965829145728641e-05,
+      "loss": 1.2401,
+      "step": 96400
+    },
+    {
+      "epoch": 4.83,
+      "grad_norm": 2.2228598594665527,
+      "learning_rate": 1.0664321608040199e-05,
+      "loss": 1.2718,
+      "step": 96500
+    },
+    {
+      "epoch": 4.83,
+      "grad_norm": 2.1923272609710693,
+      "learning_rate": 1.0362814070351758e-05,
+      "loss": 1.2642,
+      "step": 96600
+    },
+    {
+      "epoch": 4.83,
+      "grad_norm": 3.3871023654937744,
+      "learning_rate": 1.0061306532663316e-05,
+      "loss": 1.2247,
+      "step": 96700
+    },
+    {
+      "epoch": 4.84,
+      "grad_norm": 4.223761081695557,
+      "learning_rate": 9.759798994974874e-06,
+      "loss": 1.2761,
+      "step": 96800
+    },
+    {
+      "epoch": 4.84,
+      "grad_norm": 4.008758068084717,
+      "learning_rate": 9.458291457286431e-06,
+      "loss": 1.2895,
+      "step": 96900
+    },
+    {
+      "epoch": 4.85,
+      "grad_norm": 3.956162214279175,
+      "learning_rate": 9.156783919597989e-06,
+      "loss": 1.2573,
+      "step": 97000
+    },
+    {
+      "epoch": 4.85,
+      "eval_loss": 1.2709555625915527,
+      "eval_runtime": 37.1777,
+      "eval_samples_per_second": 26.898,
+      "eval_steps_per_second": 3.362,
+      "step": 97000
+    },
+    {
+      "epoch": 4.86,
+      "grad_norm": 1.0692745447158813,
+      "learning_rate": 8.855276381909547e-06,
+      "loss": 1.242,
+      "step": 97100
+    },
+    {
+      "epoch": 4.86,
+      "grad_norm": 2.2347793579101562,
+      "learning_rate": 8.553768844221105e-06,
+      "loss": 1.2211,
+      "step": 97200
+    },
+    {
+      "epoch": 4.87,
+      "grad_norm": 3.8911612033843994,
+      "learning_rate": 8.252261306532662e-06,
+      "loss": 1.2532,
+      "step": 97300
+    },
+    {
+      "epoch": 4.87,
+      "grad_norm": 3.3509294986724854,
+      "learning_rate": 7.95075376884422e-06,
+      "loss": 1.2599,
+      "step": 97400
+    },
+    {
+      "epoch": 4.88,
+      "grad_norm": 2.3662168979644775,
+      "learning_rate": 7.649246231155778e-06,
+      "loss": 1.2768,
+      "step": 97500
+    },
+    {
+      "epoch": 4.88,
+      "grad_norm": 3.0100605487823486,
+      "learning_rate": 7.3477386934673365e-06,
+      "loss": 1.266,
+      "step": 97600
+    },
+    {
+      "epoch": 4.88,
+      "grad_norm": 3.673358201980591,
+      "learning_rate": 7.046231155778893e-06,
+      "loss": 1.2288,
+      "step": 97700
+    },
+    {
+      "epoch": 4.89,
+      "grad_norm": 3.1406137943267822,
+      "learning_rate": 6.744723618090451e-06,
+      "loss": 1.2193,
+      "step": 97800
+    },
+    {
+      "epoch": 4.89,
+      "grad_norm": 2.9519011974334717,
+      "learning_rate": 6.44321608040201e-06,
+      "loss": 1.2347,
+      "step": 97900
+    },
+    {
+      "epoch": 4.9,
+      "grad_norm": 2.01814866065979,
+      "learning_rate": 6.141708542713567e-06,
+      "loss": 1.242,
+      "step": 98000
+    },
+    {
+      "epoch": 4.9,
+      "eval_loss": 1.2466384172439575,
+      "eval_runtime": 37.1499,
+      "eval_samples_per_second": 26.918,
+      "eval_steps_per_second": 3.365,
+      "step": 98000
+    },
+    {
+      "epoch": 4.91,
+      "grad_norm": 6.0040059089660645,
+      "learning_rate": 5.840201005025125e-06,
+      "loss": 1.2014,
+      "step": 98100
+    },
+    {
+      "epoch": 4.91,
+      "grad_norm": 5.865699291229248,
+      "learning_rate": 5.538693467336683e-06,
+      "loss": 1.2544,
+      "step": 98200
+    },
+    {
+      "epoch": 4.92,
+      "grad_norm": 3.115818500518799,
+      "learning_rate": 5.23718592964824e-06,
+      "loss": 1.2369,
+      "step": 98300
+    },
+    {
+      "epoch": 4.92,
+      "grad_norm": 2.521693468093872,
+      "learning_rate": 4.9356783919597985e-06,
+      "loss": 1.2166,
+      "step": 98400
+    },
+    {
+      "epoch": 4.92,
+      "grad_norm": 3.7152583599090576,
+      "learning_rate": 4.634170854271356e-06,
+      "loss": 1.2458,
+      "step": 98500
+    },
+    {
+      "epoch": 4.93,
+      "grad_norm": 8.62978458404541,
+      "learning_rate": 4.332663316582914e-06,
+      "loss": 1.2525,
+      "step": 98600
+    },
+    {
+      "epoch": 4.94,
+      "grad_norm": 3.633636951446533,
+      "learning_rate": 4.031155778894472e-06,
+      "loss": 1.2316,
+      "step": 98700
+    },
+    {
+      "epoch": 4.94,
+      "grad_norm": 4.357114791870117,
+      "learning_rate": 3.7296482412060296e-06,
+      "loss": 1.2195,
+      "step": 98800
+    },
+    {
+      "epoch": 4.95,
+      "grad_norm": 1.2212806940078735,
+      "learning_rate": 3.4281407035175877e-06,
+      "loss": 1.1938,
+      "step": 98900
+    },
+    {
+      "epoch": 4.95,
+      "grad_norm": 3.4345273971557617,
+      "learning_rate": 3.1266331658291455e-06,
+      "loss": 1.2312,
+      "step": 99000
+    },
+    {
+      "epoch": 4.95,
+      "eval_loss": 1.2566660642623901,
+      "eval_runtime": 37.0254,
+      "eval_samples_per_second": 27.009,
+      "eval_steps_per_second": 3.376,
+      "step": 99000
+    },
+    {
+      "epoch": 4.96,
+      "grad_norm": 4.7800188064575195,
+      "learning_rate": 2.8251256281407033e-06,
+      "loss": 1.1972,
+      "step": 99100
+    },
+    {
+      "epoch": 4.96,
+      "grad_norm": 2.3637685775756836,
+      "learning_rate": 2.5236180904522614e-06,
+      "loss": 1.2475,
+      "step": 99200
+    },
+    {
+      "epoch": 4.96,
+      "grad_norm": 3.591388463973999,
+      "learning_rate": 2.2221105527638188e-06,
+      "loss": 1.2505,
+      "step": 99300
+    },
+    {
+      "epoch": 4.97,
+      "grad_norm": 3.078840970993042,
+      "learning_rate": 1.9236180904522612e-06,
+      "loss": 1.2421,
+      "step": 99400
+    },
+    {
+      "epoch": 4.97,
+      "grad_norm": 3.289644479751587,
+      "learning_rate": 1.622110552763819e-06,
+      "loss": 1.257,
+      "step": 99500
+    },
+    {
+      "epoch": 4.98,
+      "grad_norm": 4.775394439697266,
+      "learning_rate": 1.3206030150753765e-06,
+      "loss": 1.256,
+      "step": 99600
+    },
+    {
+      "epoch": 4.99,
+      "grad_norm": 4.110093116760254,
+      "learning_rate": 1.0190954773869345e-06,
+      "loss": 1.1982,
+      "step": 99700
+    },
+    {
+      "epoch": 4.99,
+      "grad_norm": 1.86680006980896,
+      "learning_rate": 7.175879396984924e-07,
+      "loss": 1.2571,
+      "step": 99800
+    },
+    {
+      "epoch": 5.0,
+      "grad_norm": 3.5120298862457275,
+      "learning_rate": 4.160804020100502e-07,
+      "loss": 1.2196,
+      "step": 99900
+    },
+    {
+      "epoch": 5.0,
+      "grad_norm": 2.356171131134033,
+      "learning_rate": 1.1457286432160803e-07,
+      "loss": 1.2164,
+      "step": 100000
+    },
+    {
+      "epoch": 5.0,
+      "eval_loss": 1.2736848592758179,
+      "eval_runtime": 37.4852,
+      "eval_samples_per_second": 26.677,
+      "eval_steps_per_second": 3.335,
+      "step": 100000
+    }
+  ],
+  "logging_steps": 100,
+  "max_steps": 100000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 5,
+  "save_steps": 1000,
+  "total_flos": 1.2076594495488e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}