diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644
--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,9645 @@
+{ + "best_metric": 0.8222222222222222, + "best_model_checkpoint": "CTMAE-P2-V5-3g-S2/checkpoint-8613", + "epoch": 49.02, + "eval_steps": 500, + "global_step": 13050, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0007662835249042146, + "grad_norm": 5.429479122161865, + "learning_rate": 7.662835249042146e-08, + "loss": 0.6749, + "step": 10 + }, + { + "epoch": 0.0015325670498084292, + "grad_norm": 4.907957553863525, + "learning_rate": 1.5325670498084292e-07, + "loss": 0.6653, + "step": 20 + }, + { + "epoch": 0.0022988505747126436, + "grad_norm": 5.1367597579956055, + "learning_rate": 2.2988505747126437e-07, + "loss": 0.6889, + "step": 30 + }, + { + "epoch": 0.0030651340996168583, + "grad_norm": 6.116421699523926, + "learning_rate": 3.0651340996168583e-07, + "loss": 0.6844, + "step": 40 + }, + { + "epoch": 0.0038314176245210726, + "grad_norm": 5.355488300323486, + "learning_rate": 3.831417624521073e-07, + "loss": 0.6945, + "step": 50 + }, + { + "epoch": 0.004597701149425287, + "grad_norm": 4.986178398132324, + "learning_rate": 4.5977011494252875e-07, + "loss": 0.7041, + "step": 60 + }, + { + "epoch": 0.0053639846743295016, + "grad_norm": 4.705554008483887, + "learning_rate": 5.363984674329502e-07, + "loss": 0.6515, + "step": 70 + }, + { + "epoch": 0.006130268199233717, + "grad_norm": 5.1195902824401855, + "learning_rate": 6.130268199233717e-07, + "loss": 0.6834, + "step": 80 + }, + { + "epoch": 0.006896551724137931, + "grad_norm": 5.128364086151123, + "learning_rate": 6.896551724137931e-07, + "loss": 0.6325, + "step": 90 + }, + { + "epoch": 0.007662835249042145, + "grad_norm": 6.741299152374268, + "learning_rate": 7.662835249042146e-07, + "loss": 0.7427, + "step": 100 + }, + { + "epoch": 0.00842911877394636, + "grad_norm": 4.9074883460998535, + "learning_rate": 8.429118773946361e-07, + "loss": 0.6217, + "step": 110 + }, + { + "epoch": 0.009195402298850575, + "grad_norm": 7.730074405670166, + "learning_rate": 9.195402298850575e-07, + "loss": 0.5976, + "step": 120 + }, + { + "epoch": 0.00996168582375479, + "grad_norm": 5.705611705780029, + "learning_rate": 9.96168582375479e-07, + "loss": 0.5275, + "step": 130 + }, + { + "epoch": 0.010727969348659003, + "grad_norm": 5.608544826507568, + "learning_rate": 1.0727969348659004e-06, + "loss": 0.7498, + "step": 140 + }, + { + "epoch": 0.011494252873563218, + "grad_norm": 13.588452339172363, + "learning_rate": 1.1494252873563219e-06, + "loss": 0.5538, + "step": 150 + }, + { + "epoch": 0.012260536398467433, + "grad_norm": 14.538186073303223, + "learning_rate": 1.2260536398467433e-06, + "loss": 0.6394, + "step": 160 + }, + { + "epoch": 0.013026819923371647, + "grad_norm": 6.073761940002441, + "learning_rate": 1.3026819923371648e-06, + "loss": 0.6271, + "step": 170 + }, + { + "epoch": 0.013793103448275862, + "grad_norm": 7.062936782836914, + "learning_rate": 1.3793103448275862e-06, + "loss": 0.6278, + "step": 180 + }, + { + "epoch": 0.014559386973180077, + "grad_norm": 5.792269229888916, + "learning_rate": 1.455938697318008e-06, + "loss": 0.7695, + "step": 190 + }, + { + "epoch": 0.01532567049808429, + "grad_norm": 12.22718334197998, + "learning_rate": 1.5325670498084292e-06, + "loss": 0.6587, + "step": 200 + }, + { + "epoch": 0.016091954022988506, + "grad_norm": 6.609187126159668, + "learning_rate": 1.6091954022988506e-06, + "loss": 
0.789, + "step": 210 + }, + { + "epoch": 0.01685823754789272, + "grad_norm": 7.355467796325684, + "learning_rate": 1.6858237547892723e-06, + "loss": 0.8254, + "step": 220 + }, + { + "epoch": 0.017624521072796936, + "grad_norm": 5.16705322265625, + "learning_rate": 1.7624521072796935e-06, + "loss": 0.3686, + "step": 230 + }, + { + "epoch": 0.01839080459770115, + "grad_norm": 6.630442142486572, + "learning_rate": 1.839080459770115e-06, + "loss": 0.8785, + "step": 240 + }, + { + "epoch": 0.019157088122605363, + "grad_norm": 5.100066661834717, + "learning_rate": 1.9157088122605367e-06, + "loss": 0.763, + "step": 250 + }, + { + "epoch": 0.01992337164750958, + "grad_norm": 5.187286376953125, + "learning_rate": 1.992337164750958e-06, + "loss": 1.3342, + "step": 260 + }, + { + "epoch": 0.02, + "eval_accuracy": 0.4666666666666667, + "eval_loss": 1.334355354309082, + "eval_runtime": 18.4987, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 261 + }, + { + "epoch": 1.0006896551724138, + "grad_norm": 3.1064951419830322, + "learning_rate": 2.0689655172413796e-06, + "loss": 0.567, + "step": 270 + }, + { + "epoch": 1.001455938697318, + "grad_norm": 49.32820510864258, + "learning_rate": 2.145593869731801e-06, + "loss": 1.4948, + "step": 280 + }, + { + "epoch": 1.0022222222222221, + "grad_norm": 72.73252868652344, + "learning_rate": 2.222222222222222e-06, + "loss": 0.9745, + "step": 290 + }, + { + "epoch": 1.0029885057471264, + "grad_norm": 2.4721927642822266, + "learning_rate": 2.2988505747126437e-06, + "loss": 0.9915, + "step": 300 + }, + { + "epoch": 1.0037547892720307, + "grad_norm": 3.0997977256774902, + "learning_rate": 2.3754789272030654e-06, + "loss": 2.5202, + "step": 310 + }, + { + "epoch": 1.004521072796935, + "grad_norm": 5.511836051940918, + "learning_rate": 2.4521072796934867e-06, + "loss": 1.1405, + "step": 320 + }, + { + "epoch": 1.0052873563218392, + "grad_norm": 2.4431302547454834, + "learning_rate": 2.5287356321839083e-06, + "loss": 1.1954, + "step": 330 + }, + { + "epoch": 1.0060536398467432, + "grad_norm": 0.8971877098083496, + "learning_rate": 2.6053639846743296e-06, + "loss": 1.3971, + "step": 340 + }, + { + "epoch": 1.0068199233716475, + "grad_norm": 2.4012413024902344, + "learning_rate": 2.6819923371647512e-06, + "loss": 1.8745, + "step": 350 + }, + { + "epoch": 1.0075862068965518, + "grad_norm": 107.75028991699219, + "learning_rate": 2.7586206896551725e-06, + "loss": 1.5544, + "step": 360 + }, + { + "epoch": 1.008352490421456, + "grad_norm": 36.23213195800781, + "learning_rate": 2.835249042145594e-06, + "loss": 1.8731, + "step": 370 + }, + { + "epoch": 1.00911877394636, + "grad_norm": 1.6178969144821167, + "learning_rate": 2.911877394636016e-06, + "loss": 0.4851, + "step": 380 + }, + { + "epoch": 1.0098850574712643, + "grad_norm": 85.86695861816406, + "learning_rate": 2.988505747126437e-06, + "loss": 2.0466, + "step": 390 + }, + { + "epoch": 1.0106513409961686, + "grad_norm": 1.086233377456665, + "learning_rate": 3.0651340996168583e-06, + "loss": 0.4139, + "step": 400 + }, + { + "epoch": 1.0114176245210729, + "grad_norm": 40.69083023071289, + "learning_rate": 3.14176245210728e-06, + "loss": 1.4377, + "step": 410 + }, + { + "epoch": 1.012183908045977, + "grad_norm": 0.6512795090675354, + "learning_rate": 3.2183908045977012e-06, + "loss": 1.7803, + "step": 420 + }, + { + "epoch": 1.0129501915708812, + "grad_norm": 0.8672497868537903, + "learning_rate": 3.295019157088123e-06, + "loss": 1.1516, + "step": 430 + }, + { + "epoch": 1.0137164750957854, + 
"grad_norm": 1.2285021543502808, + "learning_rate": 3.3716475095785446e-06, + "loss": 1.4235, + "step": 440 + }, + { + "epoch": 1.0144827586206897, + "grad_norm": 0.27478310465812683, + "learning_rate": 3.448275862068966e-06, + "loss": 0.0096, + "step": 450 + }, + { + "epoch": 1.015249042145594, + "grad_norm": 47.281856536865234, + "learning_rate": 3.524904214559387e-06, + "loss": 2.1905, + "step": 460 + }, + { + "epoch": 1.016015325670498, + "grad_norm": 0.7081122994422913, + "learning_rate": 3.6015325670498087e-06, + "loss": 1.4566, + "step": 470 + }, + { + "epoch": 1.0167816091954023, + "grad_norm": 43.52406311035156, + "learning_rate": 3.67816091954023e-06, + "loss": 2.12, + "step": 480 + }, + { + "epoch": 1.0175478927203065, + "grad_norm": 0.9346103668212891, + "learning_rate": 3.7547892720306517e-06, + "loss": 1.9091, + "step": 490 + }, + { + "epoch": 1.0183141762452108, + "grad_norm": 0.331731915473938, + "learning_rate": 3.831417624521073e-06, + "loss": 0.3666, + "step": 500 + }, + { + "epoch": 1.0190804597701149, + "grad_norm": 0.25669941306114197, + "learning_rate": 3.908045977011495e-06, + "loss": 0.9004, + "step": 510 + }, + { + "epoch": 1.0198467432950191, + "grad_norm": 0.0836009681224823, + "learning_rate": 3.984674329501916e-06, + "loss": 0.5407, + "step": 520 + }, + { + "epoch": 1.02, + "eval_accuracy": 0.4666666666666667, + "eval_loss": 3.0925838947296143, + "eval_runtime": 17.5929, + "eval_samples_per_second": 2.558, + "eval_steps_per_second": 2.558, + "step": 522 + }, + { + "epoch": 2.0006130268199236, + "grad_norm": 61.599124908447266, + "learning_rate": 4.0613026819923375e-06, + "loss": 2.7935, + "step": 530 + }, + { + "epoch": 2.0013793103448276, + "grad_norm": 59.39590072631836, + "learning_rate": 4.137931034482759e-06, + "loss": 2.8997, + "step": 540 + }, + { + "epoch": 2.0021455938697317, + "grad_norm": 82.04475402832031, + "learning_rate": 4.214559386973181e-06, + "loss": 0.7127, + "step": 550 + }, + { + "epoch": 2.002911877394636, + "grad_norm": 53.86101531982422, + "learning_rate": 4.291187739463602e-06, + "loss": 1.9437, + "step": 560 + }, + { + "epoch": 2.00367816091954, + "grad_norm": 2.2206757068634033, + "learning_rate": 4.367816091954023e-06, + "loss": 1.5541, + "step": 570 + }, + { + "epoch": 2.0044444444444443, + "grad_norm": 0.2833731174468994, + "learning_rate": 4.444444444444444e-06, + "loss": 1.1967, + "step": 580 + }, + { + "epoch": 2.0052107279693487, + "grad_norm": 0.45417094230651855, + "learning_rate": 4.521072796934866e-06, + "loss": 1.3734, + "step": 590 + }, + { + "epoch": 2.005977011494253, + "grad_norm": 0.7539631128311157, + "learning_rate": 4.5977011494252875e-06, + "loss": 2.0189, + "step": 600 + }, + { + "epoch": 2.0067432950191573, + "grad_norm": 0.23031167685985565, + "learning_rate": 4.674329501915709e-06, + "loss": 0.3627, + "step": 610 + }, + { + "epoch": 2.0075095785440613, + "grad_norm": 0.4325632154941559, + "learning_rate": 4.750957854406131e-06, + "loss": 1.406, + "step": 620 + }, + { + "epoch": 2.0082758620689654, + "grad_norm": 0.14981916546821594, + "learning_rate": 4.8275862068965525e-06, + "loss": 0.0055, + "step": 630 + }, + { + "epoch": 2.00904214559387, + "grad_norm": 43.05668640136719, + "learning_rate": 4.904214559386973e-06, + "loss": 2.1516, + "step": 640 + }, + { + "epoch": 2.009808429118774, + "grad_norm": 87.29607391357422, + "learning_rate": 4.980842911877395e-06, + "loss": 1.5203, + "step": 650 + }, + { + "epoch": 2.0105747126436784, + "grad_norm": 0.2173413634300232, + "learning_rate": 5.057471264367817e-06, 
+ "loss": 0.4668, + "step": 660 + }, + { + "epoch": 2.0113409961685824, + "grad_norm": 32.00040817260742, + "learning_rate": 5.134099616858238e-06, + "loss": 1.004, + "step": 670 + }, + { + "epoch": 2.0121072796934865, + "grad_norm": 38.91421127319336, + "learning_rate": 5.210727969348659e-06, + "loss": 2.774, + "step": 680 + }, + { + "epoch": 2.012873563218391, + "grad_norm": 0.2492929846048355, + "learning_rate": 5.287356321839081e-06, + "loss": 0.4122, + "step": 690 + }, + { + "epoch": 2.013639846743295, + "grad_norm": 32.141231536865234, + "learning_rate": 5.3639846743295025e-06, + "loss": 1.6792, + "step": 700 + }, + { + "epoch": 2.014406130268199, + "grad_norm": 0.1514960080385208, + "learning_rate": 5.440613026819924e-06, + "loss": 1.2704, + "step": 710 + }, + { + "epoch": 2.0151724137931035, + "grad_norm": 1.851475715637207, + "learning_rate": 5.517241379310345e-06, + "loss": 2.1833, + "step": 720 + }, + { + "epoch": 2.0159386973180076, + "grad_norm": 40.27357482910156, + "learning_rate": 5.593869731800766e-06, + "loss": 1.8789, + "step": 730 + }, + { + "epoch": 2.016704980842912, + "grad_norm": 0.5653375387191772, + "learning_rate": 5.670498084291188e-06, + "loss": 0.7952, + "step": 740 + }, + { + "epoch": 2.017471264367816, + "grad_norm": 34.9031867980957, + "learning_rate": 5.747126436781609e-06, + "loss": 1.3938, + "step": 750 + }, + { + "epoch": 2.01823754789272, + "grad_norm": 0.23187388479709625, + "learning_rate": 5.823754789272032e-06, + "loss": 0.9256, + "step": 760 + }, + { + "epoch": 2.0190038314176246, + "grad_norm": 34.71418762207031, + "learning_rate": 5.9003831417624525e-06, + "loss": 1.4213, + "step": 770 + }, + { + "epoch": 2.0197701149425287, + "grad_norm": 0.48546066880226135, + "learning_rate": 5.977011494252874e-06, + "loss": 1.9217, + "step": 780 + }, + { + "epoch": 2.02, + "eval_accuracy": 0.4666666666666667, + "eval_loss": 2.3251731395721436, + "eval_runtime": 18.269, + "eval_samples_per_second": 2.463, + "eval_steps_per_second": 2.463, + "step": 783 + }, + { + "epoch": 3.000536398467433, + "grad_norm": 36.785194396972656, + "learning_rate": 6.053639846743296e-06, + "loss": 2.1571, + "step": 790 + }, + { + "epoch": 3.001302681992337, + "grad_norm": 1.3006188869476318, + "learning_rate": 6.130268199233717e-06, + "loss": 0.6094, + "step": 800 + }, + { + "epoch": 3.0020689655172412, + "grad_norm": 0.2517147362232208, + "learning_rate": 6.206896551724138e-06, + "loss": 1.2591, + "step": 810 + }, + { + "epoch": 3.0028352490421457, + "grad_norm": 0.14276565611362457, + "learning_rate": 6.28352490421456e-06, + "loss": 1.6718, + "step": 820 + }, + { + "epoch": 3.0036015325670498, + "grad_norm": 0.46172553300857544, + "learning_rate": 6.360153256704982e-06, + "loss": 1.3049, + "step": 830 + }, + { + "epoch": 3.004367816091954, + "grad_norm": 32.628814697265625, + "learning_rate": 6.4367816091954025e-06, + "loss": 1.0117, + "step": 840 + }, + { + "epoch": 3.0051340996168583, + "grad_norm": 0.6428257822990417, + "learning_rate": 6.513409961685824e-06, + "loss": 1.9741, + "step": 850 + }, + { + "epoch": 3.0059003831417623, + "grad_norm": 0.3465458154678345, + "learning_rate": 6.590038314176246e-06, + "loss": 0.8442, + "step": 860 + }, + { + "epoch": 3.006666666666667, + "grad_norm": 0.159845769405365, + "learning_rate": 6.666666666666667e-06, + "loss": 1.2305, + "step": 870 + }, + { + "epoch": 3.007432950191571, + "grad_norm": 0.09637564420700073, + "learning_rate": 6.743295019157089e-06, + "loss": 0.0061, + "step": 880 + }, + { + "epoch": 3.008199233716475, + 
"grad_norm": 35.53211975097656, + "learning_rate": 6.81992337164751e-06, + "loss": 2.2744, + "step": 890 + }, + { + "epoch": 3.0089655172413794, + "grad_norm": 0.5742450952529907, + "learning_rate": 6.896551724137932e-06, + "loss": 1.4589, + "step": 900 + }, + { + "epoch": 3.0097318007662834, + "grad_norm": 0.5233742594718933, + "learning_rate": 6.973180076628353e-06, + "loss": 1.8096, + "step": 910 + }, + { + "epoch": 3.010498084291188, + "grad_norm": 31.948074340820312, + "learning_rate": 7.049808429118774e-06, + "loss": 2.4072, + "step": 920 + }, + { + "epoch": 3.011264367816092, + "grad_norm": 1.22008216381073, + "learning_rate": 7.126436781609196e-06, + "loss": 0.298, + "step": 930 + }, + { + "epoch": 3.012030651340996, + "grad_norm": 72.74939727783203, + "learning_rate": 7.2030651340996175e-06, + "loss": 2.0516, + "step": 940 + }, + { + "epoch": 3.0127969348659005, + "grad_norm": 0.7335635423660278, + "learning_rate": 7.279693486590039e-06, + "loss": 1.2386, + "step": 950 + }, + { + "epoch": 3.0135632183908045, + "grad_norm": 37.59128952026367, + "learning_rate": 7.35632183908046e-06, + "loss": 1.3832, + "step": 960 + }, + { + "epoch": 3.014329501915709, + "grad_norm": 35.019805908203125, + "learning_rate": 7.4329501915708825e-06, + "loss": 2.6458, + "step": 970 + }, + { + "epoch": 3.015095785440613, + "grad_norm": 0.40072357654571533, + "learning_rate": 7.509578544061303e-06, + "loss": 0.8276, + "step": 980 + }, + { + "epoch": 3.015862068965517, + "grad_norm": 0.8792592287063599, + "learning_rate": 7.586206896551724e-06, + "loss": 1.6269, + "step": 990 + }, + { + "epoch": 3.0166283524904216, + "grad_norm": 0.8542113304138184, + "learning_rate": 7.662835249042147e-06, + "loss": 1.2116, + "step": 1000 + }, + { + "epoch": 3.0173946360153256, + "grad_norm": 31.216779708862305, + "learning_rate": 7.739463601532567e-06, + "loss": 2.2174, + "step": 1010 + }, + { + "epoch": 3.0181609195402297, + "grad_norm": 2.241011619567871, + "learning_rate": 7.81609195402299e-06, + "loss": 1.4982, + "step": 1020 + }, + { + "epoch": 3.018927203065134, + "grad_norm": 3.0700013637542725, + "learning_rate": 7.89272030651341e-06, + "loss": 0.8113, + "step": 1030 + }, + { + "epoch": 3.0196934865900382, + "grad_norm": 1.1464426517486572, + "learning_rate": 7.969348659003832e-06, + "loss": 0.6849, + "step": 1040 + }, + { + "epoch": 3.02, + "eval_accuracy": 0.4666666666666667, + "eval_loss": 2.6546289920806885, + "eval_runtime": 18.1963, + "eval_samples_per_second": 2.473, + "eval_steps_per_second": 2.473, + "step": 1044 + }, + { + "epoch": 4.000459770114943, + "grad_norm": 0.24025849997997284, + "learning_rate": 8.045977011494253e-06, + "loss": 0.9452, + "step": 1050 + }, + { + "epoch": 4.001226053639847, + "grad_norm": 31.608797073364258, + "learning_rate": 8.122605363984675e-06, + "loss": 0.5849, + "step": 1060 + }, + { + "epoch": 4.001992337164751, + "grad_norm": 28.283447265625, + "learning_rate": 8.199233716475097e-06, + "loss": 1.1191, + "step": 1070 + }, + { + "epoch": 4.002758620689655, + "grad_norm": 1.7416435480117798, + "learning_rate": 8.275862068965518e-06, + "loss": 2.2576, + "step": 1080 + }, + { + "epoch": 4.00352490421456, + "grad_norm": 0.3107624351978302, + "learning_rate": 8.35249042145594e-06, + "loss": 0.7798, + "step": 1090 + }, + { + "epoch": 4.004291187739463, + "grad_norm": 30.57634925842285, + "learning_rate": 8.429118773946362e-06, + "loss": 1.9536, + "step": 1100 + }, + { + "epoch": 4.005057471264368, + "grad_norm": 25.17679214477539, + "learning_rate": 8.505747126436782e-06, + 
"loss": 1.8114, + "step": 1110 + }, + { + "epoch": 4.005823754789272, + "grad_norm": 1.670562505722046, + "learning_rate": 8.582375478927203e-06, + "loss": 1.0891, + "step": 1120 + }, + { + "epoch": 4.006590038314176, + "grad_norm": 2.437887668609619, + "learning_rate": 8.659003831417625e-06, + "loss": 1.4449, + "step": 1130 + }, + { + "epoch": 4.00735632183908, + "grad_norm": 26.456653594970703, + "learning_rate": 8.735632183908047e-06, + "loss": 1.4887, + "step": 1140 + }, + { + "epoch": 4.008122605363985, + "grad_norm": 0.47752654552459717, + "learning_rate": 8.812260536398468e-06, + "loss": 0.7679, + "step": 1150 + }, + { + "epoch": 4.0088888888888885, + "grad_norm": 0.18593165278434753, + "learning_rate": 8.888888888888888e-06, + "loss": 0.9672, + "step": 1160 + }, + { + "epoch": 4.009655172413793, + "grad_norm": 0.3449030816555023, + "learning_rate": 8.965517241379312e-06, + "loss": 2.2728, + "step": 1170 + }, + { + "epoch": 4.0104214559386975, + "grad_norm": 0.7808371186256409, + "learning_rate": 9.042145593869732e-06, + "loss": 1.4851, + "step": 1180 + }, + { + "epoch": 4.011187739463602, + "grad_norm": 1.013738989830017, + "learning_rate": 9.118773946360155e-06, + "loss": 1.7739, + "step": 1190 + }, + { + "epoch": 4.011954022988506, + "grad_norm": 1.093010663986206, + "learning_rate": 9.195402298850575e-06, + "loss": 1.014, + "step": 1200 + }, + { + "epoch": 4.01272030651341, + "grad_norm": 0.2179373949766159, + "learning_rate": 9.272030651340997e-06, + "loss": 0.4338, + "step": 1210 + }, + { + "epoch": 4.0134865900383145, + "grad_norm": 0.27451786398887634, + "learning_rate": 9.348659003831418e-06, + "loss": 0.936, + "step": 1220 + }, + { + "epoch": 4.014252873563218, + "grad_norm": 0.5827163457870483, + "learning_rate": 9.42528735632184e-06, + "loss": 2.1782, + "step": 1230 + }, + { + "epoch": 4.015019157088123, + "grad_norm": 0.4018303155899048, + "learning_rate": 9.501915708812262e-06, + "loss": 1.5439, + "step": 1240 + }, + { + "epoch": 4.015785440613027, + "grad_norm": 29.42448616027832, + "learning_rate": 9.578544061302683e-06, + "loss": 1.035, + "step": 1250 + }, + { + "epoch": 4.016551724137931, + "grad_norm": 0.5086475014686584, + "learning_rate": 9.655172413793105e-06, + "loss": 1.7375, + "step": 1260 + }, + { + "epoch": 4.017318007662835, + "grad_norm": 0.15479078888893127, + "learning_rate": 9.731800766283525e-06, + "loss": 0.4349, + "step": 1270 + }, + { + "epoch": 4.01808429118774, + "grad_norm": 0.2352169156074524, + "learning_rate": 9.808429118773947e-06, + "loss": 1.3062, + "step": 1280 + }, + { + "epoch": 4.018850574712643, + "grad_norm": 1.066627860069275, + "learning_rate": 9.885057471264368e-06, + "loss": 1.597, + "step": 1290 + }, + { + "epoch": 4.019616858237548, + "grad_norm": 34.565189361572266, + "learning_rate": 9.96168582375479e-06, + "loss": 1.7893, + "step": 1300 + }, + { + "epoch": 4.02, + "eval_accuracy": 0.4666666666666667, + "eval_loss": 1.1092926263809204, + "eval_runtime": 17.0867, + "eval_samples_per_second": 2.634, + "eval_steps_per_second": 2.634, + "step": 1305 + }, + { + "epoch": 5.000383141762452, + "grad_norm": 2.3330183029174805, + "learning_rate": 9.995742869306088e-06, + "loss": 0.7353, + "step": 1310 + }, + { + "epoch": 5.001149425287356, + "grad_norm": 0.25167757272720337, + "learning_rate": 9.987228607918263e-06, + "loss": 0.9908, + "step": 1320 + }, + { + "epoch": 5.001915708812261, + "grad_norm": 0.1591140180826187, + "learning_rate": 9.97871434653044e-06, + "loss": 0.493, + "step": 1330 + }, + { + "epoch": 5.002681992337164, + 
"grad_norm": 0.3773989975452423, + "learning_rate": 9.970200085142615e-06, + "loss": 1.2863, + "step": 1340 + }, + { + "epoch": 5.003448275862069, + "grad_norm": 95.41172790527344, + "learning_rate": 9.96168582375479e-06, + "loss": 1.9463, + "step": 1350 + }, + { + "epoch": 5.004214559386973, + "grad_norm": 25.436981201171875, + "learning_rate": 9.953171562366965e-06, + "loss": 1.4723, + "step": 1360 + }, + { + "epoch": 5.004980842911878, + "grad_norm": 0.1610879898071289, + "learning_rate": 9.944657300979142e-06, + "loss": 0.0172, + "step": 1370 + }, + { + "epoch": 5.005747126436781, + "grad_norm": 0.39229804277420044, + "learning_rate": 9.936143039591317e-06, + "loss": 1.7084, + "step": 1380 + }, + { + "epoch": 5.006513409961686, + "grad_norm": 53.37186813354492, + "learning_rate": 9.927628778203492e-06, + "loss": 3.1111, + "step": 1390 + }, + { + "epoch": 5.00727969348659, + "grad_norm": 0.5538986921310425, + "learning_rate": 9.919114516815667e-06, + "loss": 0.4031, + "step": 1400 + }, + { + "epoch": 5.008045977011494, + "grad_norm": 45.50009536743164, + "learning_rate": 9.910600255427842e-06, + "loss": 2.3178, + "step": 1410 + }, + { + "epoch": 5.0088122605363985, + "grad_norm": 0.3090026378631592, + "learning_rate": 9.902085994040018e-06, + "loss": 0.4517, + "step": 1420 + }, + { + "epoch": 5.009578544061303, + "grad_norm": 26.65604019165039, + "learning_rate": 9.893571732652193e-06, + "loss": 1.3188, + "step": 1430 + }, + { + "epoch": 5.010344827586207, + "grad_norm": 0.24589665234088898, + "learning_rate": 9.885057471264368e-06, + "loss": 1.7328, + "step": 1440 + }, + { + "epoch": 5.011111111111111, + "grad_norm": 0.7630972266197205, + "learning_rate": 9.876543209876543e-06, + "loss": 1.6054, + "step": 1450 + }, + { + "epoch": 5.011877394636016, + "grad_norm": 0.7012438178062439, + "learning_rate": 9.86802894848872e-06, + "loss": 0.7106, + "step": 1460 + }, + { + "epoch": 5.012643678160919, + "grad_norm": 27.231481552124023, + "learning_rate": 9.859514687100895e-06, + "loss": 1.6023, + "step": 1470 + }, + { + "epoch": 5.013409961685824, + "grad_norm": 0.5588337182998657, + "learning_rate": 9.85100042571307e-06, + "loss": 1.8103, + "step": 1480 + }, + { + "epoch": 5.014176245210728, + "grad_norm": 0.27717721462249756, + "learning_rate": 9.842486164325245e-06, + "loss": 1.0512, + "step": 1490 + }, + { + "epoch": 5.014942528735633, + "grad_norm": 2.7197179794311523, + "learning_rate": 9.833971902937422e-06, + "loss": 0.9533, + "step": 1500 + }, + { + "epoch": 5.015708812260536, + "grad_norm": 25.656766891479492, + "learning_rate": 9.825457641549597e-06, + "loss": 1.2275, + "step": 1510 + }, + { + "epoch": 5.016475095785441, + "grad_norm": 0.29782333970069885, + "learning_rate": 9.816943380161772e-06, + "loss": 0.984, + "step": 1520 + }, + { + "epoch": 5.017241379310345, + "grad_norm": 57.583919525146484, + "learning_rate": 9.808429118773947e-06, + "loss": 2.2882, + "step": 1530 + }, + { + "epoch": 5.018007662835249, + "grad_norm": 14.479283332824707, + "learning_rate": 9.799914857386122e-06, + "loss": 1.3687, + "step": 1540 + }, + { + "epoch": 5.018773946360153, + "grad_norm": 8.05611801147461, + "learning_rate": 9.791400595998298e-06, + "loss": 0.652, + "step": 1550 + }, + { + "epoch": 5.019540229885058, + "grad_norm": 40.959903717041016, + "learning_rate": 9.782886334610473e-06, + "loss": 0.8412, + "step": 1560 + }, + { + "epoch": 5.02, + "eval_accuracy": 0.4666666666666667, + "eval_loss": 1.2740881443023682, + "eval_runtime": 17.3951, + "eval_samples_per_second": 2.587, + 
"eval_steps_per_second": 2.587, + "step": 1566 + }, + { + "epoch": 6.000306513409962, + "grad_norm": 34.23516082763672, + "learning_rate": 9.774372073222648e-06, + "loss": 0.9929, + "step": 1570 + }, + { + "epoch": 6.001072796934866, + "grad_norm": 403.937255859375, + "learning_rate": 9.765857811834825e-06, + "loss": 0.8105, + "step": 1580 + }, + { + "epoch": 6.00183908045977, + "grad_norm": 0.16264407336711884, + "learning_rate": 9.757343550447e-06, + "loss": 0.4363, + "step": 1590 + }, + { + "epoch": 6.002605363984674, + "grad_norm": 0.08945406228303909, + "learning_rate": 9.748829289059175e-06, + "loss": 1.4511, + "step": 1600 + }, + { + "epoch": 6.003371647509579, + "grad_norm": 1.7249908447265625, + "learning_rate": 9.74031502767135e-06, + "loss": 2.3425, + "step": 1610 + }, + { + "epoch": 6.0041379310344825, + "grad_norm": 4.1057024002075195, + "learning_rate": 9.731800766283525e-06, + "loss": 0.916, + "step": 1620 + }, + { + "epoch": 6.004904214559387, + "grad_norm": 42.91880416870117, + "learning_rate": 9.723286504895702e-06, + "loss": 1.7682, + "step": 1630 + }, + { + "epoch": 6.005670498084291, + "grad_norm": 2.0418484210968018, + "learning_rate": 9.714772243507877e-06, + "loss": 0.0397, + "step": 1640 + }, + { + "epoch": 6.006436781609195, + "grad_norm": 35.45446014404297, + "learning_rate": 9.706257982120052e-06, + "loss": 1.5629, + "step": 1650 + }, + { + "epoch": 6.0072030651340995, + "grad_norm": 33.16194534301758, + "learning_rate": 9.697743720732228e-06, + "loss": 1.0235, + "step": 1660 + }, + { + "epoch": 6.007969348659004, + "grad_norm": 28.418079376220703, + "learning_rate": 9.689229459344403e-06, + "loss": 2.4078, + "step": 1670 + }, + { + "epoch": 6.008735632183908, + "grad_norm": 4.29979944229126, + "learning_rate": 9.680715197956578e-06, + "loss": 0.6792, + "step": 1680 + }, + { + "epoch": 6.009501915708812, + "grad_norm": 0.6289230585098267, + "learning_rate": 9.672200936568753e-06, + "loss": 0.9775, + "step": 1690 + }, + { + "epoch": 6.010268199233717, + "grad_norm": 26.520431518554688, + "learning_rate": 9.663686675180928e-06, + "loss": 1.6951, + "step": 1700 + }, + { + "epoch": 6.011034482758621, + "grad_norm": 0.7467366456985474, + "learning_rate": 9.655172413793105e-06, + "loss": 1.6701, + "step": 1710 + }, + { + "epoch": 6.011800766283525, + "grad_norm": 60.05059051513672, + "learning_rate": 9.64665815240528e-06, + "loss": 0.9815, + "step": 1720 + }, + { + "epoch": 6.012567049808429, + "grad_norm": 0.07844887673854828, + "learning_rate": 9.638143891017455e-06, + "loss": 0.0322, + "step": 1730 + }, + { + "epoch": 6.013333333333334, + "grad_norm": 40.86720657348633, + "learning_rate": 9.62962962962963e-06, + "loss": 2.105, + "step": 1740 + }, + { + "epoch": 6.014099616858237, + "grad_norm": 28.481767654418945, + "learning_rate": 9.621115368241805e-06, + "loss": 1.7868, + "step": 1750 + }, + { + "epoch": 6.014865900383142, + "grad_norm": 1.4983817338943481, + "learning_rate": 9.612601106853982e-06, + "loss": 0.5038, + "step": 1760 + }, + { + "epoch": 6.015632183908046, + "grad_norm": 0.2759397029876709, + "learning_rate": 9.604086845466157e-06, + "loss": 0.3795, + "step": 1770 + }, + { + "epoch": 6.01639846743295, + "grad_norm": 35.15885543823242, + "learning_rate": 9.595572584078332e-06, + "loss": 1.6443, + "step": 1780 + }, + { + "epoch": 6.017164750957854, + "grad_norm": 29.816890716552734, + "learning_rate": 9.587058322690508e-06, + "loss": 2.461, + "step": 1790 + }, + { + "epoch": 6.017931034482759, + "grad_norm": 20.45577049255371, + "learning_rate": 
9.578544061302683e-06, + "loss": 1.9726, + "step": 1800 + }, + { + "epoch": 6.018697318007663, + "grad_norm": 0.6260418891906738, + "learning_rate": 9.570029799914858e-06, + "loss": 0.6286, + "step": 1810 + }, + { + "epoch": 6.019463601532567, + "grad_norm": 25.16329002380371, + "learning_rate": 9.561515538527033e-06, + "loss": 1.4469, + "step": 1820 + }, + { + "epoch": 6.02, + "eval_accuracy": 0.4666666666666667, + "eval_loss": 2.1842398643493652, + "eval_runtime": 17.2295, + "eval_samples_per_second": 2.612, + "eval_steps_per_second": 2.612, + "step": 1827 + }, + { + "epoch": 7.000229885057471, + "grad_norm": 24.148334503173828, + "learning_rate": 9.553001277139208e-06, + "loss": 0.847, + "step": 1830 + }, + { + "epoch": 7.000996168582375, + "grad_norm": 26.963687896728516, + "learning_rate": 9.544487015751385e-06, + "loss": 1.5664, + "step": 1840 + }, + { + "epoch": 7.00176245210728, + "grad_norm": 0.8083379864692688, + "learning_rate": 9.53597275436356e-06, + "loss": 0.7355, + "step": 1850 + }, + { + "epoch": 7.0025287356321835, + "grad_norm": 25.176177978515625, + "learning_rate": 9.527458492975735e-06, + "loss": 2.2096, + "step": 1860 + }, + { + "epoch": 7.003295019157088, + "grad_norm": 0.3400355875492096, + "learning_rate": 9.518944231587912e-06, + "loss": 0.8617, + "step": 1870 + }, + { + "epoch": 7.0040613026819925, + "grad_norm": 28.38319206237793, + "learning_rate": 9.510429970200085e-06, + "loss": 1.7961, + "step": 1880 + }, + { + "epoch": 7.004827586206897, + "grad_norm": 35.7610969543457, + "learning_rate": 9.501915708812262e-06, + "loss": 1.4665, + "step": 1890 + }, + { + "epoch": 7.0055938697318005, + "grad_norm": 25.14791488647461, + "learning_rate": 9.493401447424437e-06, + "loss": 1.8566, + "step": 1900 + }, + { + "epoch": 7.006360153256705, + "grad_norm": 2.018683671951294, + "learning_rate": 9.484887186036612e-06, + "loss": 0.7445, + "step": 1910 + }, + { + "epoch": 7.0071264367816095, + "grad_norm": 0.32963842153549194, + "learning_rate": 9.476372924648788e-06, + "loss": 1.2967, + "step": 1920 + }, + { + "epoch": 7.007892720306513, + "grad_norm": 0.20718662440776825, + "learning_rate": 9.467858663260963e-06, + "loss": 0.9241, + "step": 1930 + }, + { + "epoch": 7.008659003831418, + "grad_norm": 0.7027409076690674, + "learning_rate": 9.459344401873138e-06, + "loss": 1.2763, + "step": 1940 + }, + { + "epoch": 7.009425287356322, + "grad_norm": 25.53325843811035, + "learning_rate": 9.450830140485315e-06, + "loss": 1.4356, + "step": 1950 + }, + { + "epoch": 7.010191570881226, + "grad_norm": 24.687814712524414, + "learning_rate": 9.442315879097488e-06, + "loss": 0.8671, + "step": 1960 + }, + { + "epoch": 7.01095785440613, + "grad_norm": 18.038463592529297, + "learning_rate": 9.433801617709665e-06, + "loss": 1.5805, + "step": 1970 + }, + { + "epoch": 7.011724137931035, + "grad_norm": 9.751618385314941, + "learning_rate": 9.42528735632184e-06, + "loss": 0.2246, + "step": 1980 + }, + { + "epoch": 7.012490421455938, + "grad_norm": 32.85557174682617, + "learning_rate": 9.416773094934015e-06, + "loss": 1.6588, + "step": 1990 + }, + { + "epoch": 7.013256704980843, + "grad_norm": 0.3825106918811798, + "learning_rate": 9.408258833546192e-06, + "loss": 0.6602, + "step": 2000 + }, + { + "epoch": 7.014022988505747, + "grad_norm": 22.983888626098633, + "learning_rate": 9.399744572158365e-06, + "loss": 0.6765, + "step": 2010 + }, + { + "epoch": 7.014789272030652, + "grad_norm": 39.498531341552734, + "learning_rate": 9.391230310770542e-06, + "loss": 1.6487, + "step": 2020 + }, + { + 
"epoch": 7.015555555555555, + "grad_norm": 0.9633718132972717, + "learning_rate": 9.382716049382717e-06, + "loss": 0.9638, + "step": 2030 + }, + { + "epoch": 7.01632183908046, + "grad_norm": 0.31984907388687134, + "learning_rate": 9.374201787994892e-06, + "loss": 0.3533, + "step": 2040 + }, + { + "epoch": 7.017088122605364, + "grad_norm": 0.2616819739341736, + "learning_rate": 9.365687526607068e-06, + "loss": 0.7693, + "step": 2050 + }, + { + "epoch": 7.017854406130268, + "grad_norm": 4.678760051727295, + "learning_rate": 9.357173265219243e-06, + "loss": 2.8814, + "step": 2060 + }, + { + "epoch": 7.018620689655172, + "grad_norm": 0.432799369096756, + "learning_rate": 9.348659003831418e-06, + "loss": 1.3922, + "step": 2070 + }, + { + "epoch": 7.019386973180077, + "grad_norm": 51.95867156982422, + "learning_rate": 9.340144742443595e-06, + "loss": 1.2686, + "step": 2080 + }, + { + "epoch": 7.02, + "eval_accuracy": 0.4666666666666667, + "eval_loss": 1.870890498161316, + "eval_runtime": 16.3077, + "eval_samples_per_second": 2.759, + "eval_steps_per_second": 2.759, + "step": 2088 + }, + { + "epoch": 8.00015325670498, + "grad_norm": 0.3667392134666443, + "learning_rate": 9.331630481055768e-06, + "loss": 1.0531, + "step": 2090 + }, + { + "epoch": 8.000919540229885, + "grad_norm": 64.37175750732422, + "learning_rate": 9.323116219667945e-06, + "loss": 0.8667, + "step": 2100 + }, + { + "epoch": 8.001685823754789, + "grad_norm": 0.07098367810249329, + "learning_rate": 9.31460195828012e-06, + "loss": 1.5758, + "step": 2110 + }, + { + "epoch": 8.002452107279694, + "grad_norm": 0.17186589539051056, + "learning_rate": 9.306087696892295e-06, + "loss": 0.9477, + "step": 2120 + }, + { + "epoch": 8.003218390804598, + "grad_norm": 0.522612988948822, + "learning_rate": 9.297573435504472e-06, + "loss": 1.6729, + "step": 2130 + }, + { + "epoch": 8.003984674329502, + "grad_norm": 0.2258594036102295, + "learning_rate": 9.289059174116647e-06, + "loss": 0.0494, + "step": 2140 + }, + { + "epoch": 8.004750957854407, + "grad_norm": 0.04520529881119728, + "learning_rate": 9.280544912728822e-06, + "loss": 0.4919, + "step": 2150 + }, + { + "epoch": 8.00551724137931, + "grad_norm": 35.1839485168457, + "learning_rate": 9.272030651340997e-06, + "loss": 1.8108, + "step": 2160 + }, + { + "epoch": 8.006283524904214, + "grad_norm": 31.50969886779785, + "learning_rate": 9.263516389953172e-06, + "loss": 3.2791, + "step": 2170 + }, + { + "epoch": 8.00704980842912, + "grad_norm": 8.8189058303833, + "learning_rate": 9.255002128565348e-06, + "loss": 1.3613, + "step": 2180 + }, + { + "epoch": 8.007816091954023, + "grad_norm": 26.85751724243164, + "learning_rate": 9.246487867177523e-06, + "loss": 0.6001, + "step": 2190 + }, + { + "epoch": 8.008582375478927, + "grad_norm": 31.525379180908203, + "learning_rate": 9.237973605789698e-06, + "loss": 0.9019, + "step": 2200 + }, + { + "epoch": 8.009348659003832, + "grad_norm": 0.9542011618614197, + "learning_rate": 9.229459344401875e-06, + "loss": 2.1841, + "step": 2210 + }, + { + "epoch": 8.010114942528736, + "grad_norm": 0.21988973021507263, + "learning_rate": 9.220945083014048e-06, + "loss": 0.7927, + "step": 2220 + }, + { + "epoch": 8.01088122605364, + "grad_norm": 27.14443588256836, + "learning_rate": 9.212430821626225e-06, + "loss": 0.9833, + "step": 2230 + }, + { + "epoch": 8.011647509578545, + "grad_norm": 28.06905746459961, + "learning_rate": 9.2039165602384e-06, + "loss": 2.4601, + "step": 2240 + }, + { + "epoch": 8.012413793103448, + "grad_norm": 6.933136940002441, + "learning_rate": 
9.195402298850575e-06, + "loss": 0.4801, + "step": 2250 + }, + { + "epoch": 8.013180076628352, + "grad_norm": 46.680973052978516, + "learning_rate": 9.186888037462752e-06, + "loss": 1.6454, + "step": 2260 + }, + { + "epoch": 8.013946360153257, + "grad_norm": 30.986583709716797, + "learning_rate": 9.178373776074927e-06, + "loss": 1.0139, + "step": 2270 + }, + { + "epoch": 8.01471264367816, + "grad_norm": 0.4223449230194092, + "learning_rate": 9.169859514687102e-06, + "loss": 0.7044, + "step": 2280 + }, + { + "epoch": 8.015478927203064, + "grad_norm": 45.04050064086914, + "learning_rate": 9.161345253299277e-06, + "loss": 0.9361, + "step": 2290 + }, + { + "epoch": 8.01624521072797, + "grad_norm": 0.4287947416305542, + "learning_rate": 9.152830991911452e-06, + "loss": 1.894, + "step": 2300 + }, + { + "epoch": 8.017011494252873, + "grad_norm": 0.11499989032745361, + "learning_rate": 9.144316730523628e-06, + "loss": 0.8698, + "step": 2310 + }, + { + "epoch": 8.017777777777777, + "grad_norm": 37.786678314208984, + "learning_rate": 9.135802469135803e-06, + "loss": 1.6632, + "step": 2320 + }, + { + "epoch": 8.018544061302682, + "grad_norm": 3.584336757659912, + "learning_rate": 9.127288207747978e-06, + "loss": 0.6548, + "step": 2330 + }, + { + "epoch": 8.019310344827586, + "grad_norm": 0.4127309322357178, + "learning_rate": 9.118773946360155e-06, + "loss": 1.4495, + "step": 2340 + }, + { + "epoch": 8.02, + "eval_accuracy": 0.4666666666666667, + "eval_loss": 1.8210101127624512, + "eval_runtime": 17.1866, + "eval_samples_per_second": 2.618, + "eval_steps_per_second": 2.618, + "step": 2349 + }, + { + "epoch": 9.00007662835249, + "grad_norm": 0.06787708401679993, + "learning_rate": 9.110259684972328e-06, + "loss": 0.5414, + "step": 2350 + }, + { + "epoch": 9.000842911877395, + "grad_norm": 67.92351531982422, + "learning_rate": 9.101745423584505e-06, + "loss": 0.309, + "step": 2360 + }, + { + "epoch": 9.001609195402299, + "grad_norm": 90.45152282714844, + "learning_rate": 9.09323116219668e-06, + "loss": 1.1411, + "step": 2370 + }, + { + "epoch": 9.002375478927203, + "grad_norm": 23.41851806640625, + "learning_rate": 9.084716900808855e-06, + "loss": 0.1713, + "step": 2380 + }, + { + "epoch": 9.003141762452108, + "grad_norm": 0.02565217763185501, + "learning_rate": 9.076202639421032e-06, + "loss": 1.7839, + "step": 2390 + }, + { + "epoch": 9.003908045977012, + "grad_norm": 41.862152099609375, + "learning_rate": 9.067688378033207e-06, + "loss": 1.3918, + "step": 2400 + }, + { + "epoch": 9.004674329501915, + "grad_norm": 25.538257598876953, + "learning_rate": 9.059174116645382e-06, + "loss": 0.7879, + "step": 2410 + }, + { + "epoch": 9.00544061302682, + "grad_norm": 14.444324493408203, + "learning_rate": 9.050659855257558e-06, + "loss": 0.38, + "step": 2420 + }, + { + "epoch": 9.006206896551724, + "grad_norm": 16.116519927978516, + "learning_rate": 9.042145593869732e-06, + "loss": 1.1602, + "step": 2430 + }, + { + "epoch": 9.006973180076628, + "grad_norm": 23.733491897583008, + "learning_rate": 9.033631332481908e-06, + "loss": 0.5195, + "step": 2440 + }, + { + "epoch": 9.007739463601533, + "grad_norm": 0.08872228860855103, + "learning_rate": 9.025117071094083e-06, + "loss": 0.7457, + "step": 2450 + }, + { + "epoch": 9.008505747126437, + "grad_norm": 414.07208251953125, + "learning_rate": 9.016602809706258e-06, + "loss": 0.7873, + "step": 2460 + }, + { + "epoch": 9.00927203065134, + "grad_norm": 12.048069953918457, + "learning_rate": 9.008088548318435e-06, + "loss": 0.8082, + "step": 2470 + }, + { + "epoch": 
9.010038314176246, + "grad_norm": 0.1315668821334839, + "learning_rate": 8.999574286930608e-06, + "loss": 0.2489, + "step": 2480 + }, + { + "epoch": 9.01080459770115, + "grad_norm": 1.1408504247665405, + "learning_rate": 8.991060025542785e-06, + "loss": 1.1542, + "step": 2490 + }, + { + "epoch": 9.011570881226053, + "grad_norm": 0.1053474172949791, + "learning_rate": 8.98254576415496e-06, + "loss": 0.7823, + "step": 2500 + }, + { + "epoch": 9.012337164750958, + "grad_norm": 0.12916134297847748, + "learning_rate": 8.974031502767135e-06, + "loss": 0.8981, + "step": 2510 + }, + { + "epoch": 9.013103448275862, + "grad_norm": 0.052084363996982574, + "learning_rate": 8.965517241379312e-06, + "loss": 1.8993, + "step": 2520 + }, + { + "epoch": 9.013869731800765, + "grad_norm": 0.45651882886886597, + "learning_rate": 8.957002979991487e-06, + "loss": 0.8506, + "step": 2530 + }, + { + "epoch": 9.01463601532567, + "grad_norm": 0.6769386529922485, + "learning_rate": 8.948488718603662e-06, + "loss": 1.123, + "step": 2540 + }, + { + "epoch": 9.015402298850574, + "grad_norm": 39.06797409057617, + "learning_rate": 8.939974457215838e-06, + "loss": 0.9073, + "step": 2550 + }, + { + "epoch": 9.01616858237548, + "grad_norm": 9.656159400939941, + "learning_rate": 8.931460195828012e-06, + "loss": 0.7615, + "step": 2560 + }, + { + "epoch": 9.016934865900383, + "grad_norm": 312.3702087402344, + "learning_rate": 8.922945934440188e-06, + "loss": 0.6024, + "step": 2570 + }, + { + "epoch": 9.017701149425287, + "grad_norm": 65.21734619140625, + "learning_rate": 8.914431673052363e-06, + "loss": 1.669, + "step": 2580 + }, + { + "epoch": 9.018467432950192, + "grad_norm": 7.8115715980529785, + "learning_rate": 8.905917411664538e-06, + "loss": 1.291, + "step": 2590 + }, + { + "epoch": 9.019233716475096, + "grad_norm": 0.7356247305870056, + "learning_rate": 8.897403150276715e-06, + "loss": 1.0173, + "step": 2600 + }, + { + "epoch": 9.02, + "grad_norm": 184.26580810546875, + "learning_rate": 8.888888888888888e-06, + "loss": 0.7905, + "step": 2610 + }, + { + "epoch": 9.02, + "eval_accuracy": 0.7555555555555555, + "eval_loss": 0.6076056361198425, + "eval_runtime": 16.3109, + "eval_samples_per_second": 2.759, + "eval_steps_per_second": 2.759, + "step": 2610 + }, + { + "epoch": 10.000766283524904, + "grad_norm": 0.10444707423448563, + "learning_rate": 8.880374627501065e-06, + "loss": 0.8054, + "step": 2620 + }, + { + "epoch": 10.001532567049809, + "grad_norm": 30.12024688720703, + "learning_rate": 8.87186036611324e-06, + "loss": 0.6826, + "step": 2630 + }, + { + "epoch": 10.002298850574713, + "grad_norm": 0.5628738403320312, + "learning_rate": 8.863346104725415e-06, + "loss": 0.3652, + "step": 2640 + }, + { + "epoch": 10.003065134099616, + "grad_norm": 571.0258178710938, + "learning_rate": 8.854831843337592e-06, + "loss": 1.213, + "step": 2650 + }, + { + "epoch": 10.003831417624522, + "grad_norm": 0.017369825392961502, + "learning_rate": 8.846317581949767e-06, + "loss": 0.8148, + "step": 2660 + }, + { + "epoch": 10.004597701149425, + "grad_norm": 0.04240279644727707, + "learning_rate": 8.837803320561942e-06, + "loss": 0.3564, + "step": 2670 + }, + { + "epoch": 10.005363984674329, + "grad_norm": 1.3353875875473022, + "learning_rate": 8.829289059174118e-06, + "loss": 2.8184, + "step": 2680 + }, + { + "epoch": 10.006130268199234, + "grad_norm": 35.47079086303711, + "learning_rate": 8.820774797786292e-06, + "loss": 1.6083, + "step": 2690 + }, + { + "epoch": 10.006896551724138, + "grad_norm": 27.306764602661133, + "learning_rate": 
8.812260536398468e-06, + "loss": 0.7927, + "step": 2700 + }, + { + "epoch": 10.007662835249041, + "grad_norm": 75.7034912109375, + "learning_rate": 8.803746275010643e-06, + "loss": 2.0877, + "step": 2710 + }, + { + "epoch": 10.008429118773947, + "grad_norm": 0.37073442339897156, + "learning_rate": 8.795232013622818e-06, + "loss": 0.4999, + "step": 2720 + }, + { + "epoch": 10.00919540229885, + "grad_norm": 0.018161851912736893, + "learning_rate": 8.786717752234995e-06, + "loss": 1.0229, + "step": 2730 + }, + { + "epoch": 10.009961685823756, + "grad_norm": 0.04276080057024956, + "learning_rate": 8.77820349084717e-06, + "loss": 1.1344, + "step": 2740 + }, + { + "epoch": 10.01072796934866, + "grad_norm": 61.130096435546875, + "learning_rate": 8.769689229459345e-06, + "loss": 1.6218, + "step": 2750 + }, + { + "epoch": 10.011494252873563, + "grad_norm": 0.27917972207069397, + "learning_rate": 8.76117496807152e-06, + "loss": 1.1409, + "step": 2760 + }, + { + "epoch": 10.012260536398468, + "grad_norm": 22.165864944458008, + "learning_rate": 8.752660706683695e-06, + "loss": 0.1824, + "step": 2770 + }, + { + "epoch": 10.013026819923372, + "grad_norm": 0.050605203956365585, + "learning_rate": 8.744146445295872e-06, + "loss": 0.4492, + "step": 2780 + }, + { + "epoch": 10.013793103448275, + "grad_norm": 0.0357230044901371, + "learning_rate": 8.735632183908047e-06, + "loss": 0.3711, + "step": 2790 + }, + { + "epoch": 10.01455938697318, + "grad_norm": 185.62095642089844, + "learning_rate": 8.727117922520222e-06, + "loss": 1.4587, + "step": 2800 + }, + { + "epoch": 10.015325670498084, + "grad_norm": 11.161556243896484, + "learning_rate": 8.718603661132398e-06, + "loss": 1.2944, + "step": 2810 + }, + { + "epoch": 10.016091954022988, + "grad_norm": 1.5606610774993896, + "learning_rate": 8.710089399744572e-06, + "loss": 0.2557, + "step": 2820 + }, + { + "epoch": 10.016858237547893, + "grad_norm": 0.5291488766670227, + "learning_rate": 8.701575138356748e-06, + "loss": 0.7238, + "step": 2830 + }, + { + "epoch": 10.017624521072797, + "grad_norm": 0.37825533747673035, + "learning_rate": 8.693060876968923e-06, + "loss": 0.9816, + "step": 2840 + }, + { + "epoch": 10.0183908045977, + "grad_norm": 0.05667515844106674, + "learning_rate": 8.684546615581098e-06, + "loss": 1.5252, + "step": 2850 + }, + { + "epoch": 10.019157088122606, + "grad_norm": 0.023939739912748337, + "learning_rate": 8.676032354193275e-06, + "loss": 1.1097, + "step": 2860 + }, + { + "epoch": 10.01992337164751, + "grad_norm": 0.10473144799470901, + "learning_rate": 8.66751809280545e-06, + "loss": 0.8322, + "step": 2870 + }, + { + "epoch": 10.02, + "eval_accuracy": 0.7111111111111111, + "eval_loss": 0.9285709857940674, + "eval_runtime": 18.2418, + "eval_samples_per_second": 2.467, + "eval_steps_per_second": 2.467, + "step": 2871 + }, + { + "epoch": 11.000689655172414, + "grad_norm": 0.044995542615652084, + "learning_rate": 8.659003831417625e-06, + "loss": 0.602, + "step": 2880 + }, + { + "epoch": 11.001455938697317, + "grad_norm": 0.3559626042842865, + "learning_rate": 8.650489570029802e-06, + "loss": 0.8572, + "step": 2890 + }, + { + "epoch": 11.002222222222223, + "grad_norm": 0.23387664556503296, + "learning_rate": 8.641975308641975e-06, + "loss": 0.9715, + "step": 2900 + }, + { + "epoch": 11.002988505747126, + "grad_norm": 1.0412755012512207, + "learning_rate": 8.633461047254152e-06, + "loss": 0.0118, + "step": 2910 + }, + { + "epoch": 11.00375478927203, + "grad_norm": 0.013598160818219185, + "learning_rate": 8.624946785866327e-06, + "loss": 
1.2322, + "step": 2920 + }, + { + "epoch": 11.004521072796935, + "grad_norm": 66.33006286621094, + "learning_rate": 8.616432524478502e-06, + "loss": 1.3479, + "step": 2930 + }, + { + "epoch": 11.005287356321839, + "grad_norm": 53.71303176879883, + "learning_rate": 8.607918263090678e-06, + "loss": 1.6688, + "step": 2940 + }, + { + "epoch": 11.006053639846744, + "grad_norm": 0.3487199544906616, + "learning_rate": 8.599404001702853e-06, + "loss": 0.7742, + "step": 2950 + }, + { + "epoch": 11.006819923371648, + "grad_norm": 35.98875045776367, + "learning_rate": 8.590889740315028e-06, + "loss": 1.1983, + "step": 2960 + }, + { + "epoch": 11.007586206896551, + "grad_norm": 36.40443420410156, + "learning_rate": 8.582375478927203e-06, + "loss": 0.9121, + "step": 2970 + }, + { + "epoch": 11.008352490421457, + "grad_norm": 20.993436813354492, + "learning_rate": 8.573861217539378e-06, + "loss": 0.7821, + "step": 2980 + }, + { + "epoch": 11.00911877394636, + "grad_norm": 58.57808303833008, + "learning_rate": 8.565346956151555e-06, + "loss": 0.0333, + "step": 2990 + }, + { + "epoch": 11.009885057471264, + "grad_norm": 0.014679349958896637, + "learning_rate": 8.55683269476373e-06, + "loss": 1.2069, + "step": 3000 + }, + { + "epoch": 11.01065134099617, + "grad_norm": 40.324317932128906, + "learning_rate": 8.548318433375905e-06, + "loss": 0.5127, + "step": 3010 + }, + { + "epoch": 11.011417624521073, + "grad_norm": 0.19081498682498932, + "learning_rate": 8.539804171988082e-06, + "loss": 0.4939, + "step": 3020 + }, + { + "epoch": 11.012183908045976, + "grad_norm": 269.95263671875, + "learning_rate": 8.531289910600255e-06, + "loss": 0.4889, + "step": 3030 + }, + { + "epoch": 11.012950191570882, + "grad_norm": 0.0237162746489048, + "learning_rate": 8.522775649212432e-06, + "loss": 0.6376, + "step": 3040 + }, + { + "epoch": 11.013716475095785, + "grad_norm": 0.5160467028617859, + "learning_rate": 8.514261387824607e-06, + "loss": 0.8452, + "step": 3050 + }, + { + "epoch": 11.014482758620689, + "grad_norm": 0.05254511907696724, + "learning_rate": 8.505747126436782e-06, + "loss": 1.0592, + "step": 3060 + }, + { + "epoch": 11.015249042145594, + "grad_norm": 0.009695000015199184, + "learning_rate": 8.497232865048958e-06, + "loss": 0.4962, + "step": 3070 + }, + { + "epoch": 11.016015325670498, + "grad_norm": 0.35122835636138916, + "learning_rate": 8.488718603661133e-06, + "loss": 2.3977, + "step": 3080 + }, + { + "epoch": 11.016781609195402, + "grad_norm": 0.01762336865067482, + "learning_rate": 8.480204342273308e-06, + "loss": 0.9964, + "step": 3090 + }, + { + "epoch": 11.017547892720307, + "grad_norm": 0.10008629411458969, + "learning_rate": 8.471690080885483e-06, + "loss": 0.386, + "step": 3100 + }, + { + "epoch": 11.01831417624521, + "grad_norm": 0.5303282737731934, + "learning_rate": 8.463175819497658e-06, + "loss": 1.3695, + "step": 3110 + }, + { + "epoch": 11.019080459770114, + "grad_norm": 0.020414335653185844, + "learning_rate": 8.454661558109835e-06, + "loss": 0.5522, + "step": 3120 + }, + { + "epoch": 11.01984674329502, + "grad_norm": 252.580322265625, + "learning_rate": 8.44614729672201e-06, + "loss": 0.8618, + "step": 3130 + }, + { + "epoch": 11.02, + "eval_accuracy": 0.7111111111111111, + "eval_loss": 0.9191111922264099, + "eval_runtime": 17.0674, + "eval_samples_per_second": 2.637, + "eval_steps_per_second": 2.637, + "step": 3132 + }, + { + "epoch": 12.000613026819924, + "grad_norm": 1.643672227859497, + "learning_rate": 8.437633035334185e-06, + "loss": 0.5702, + "step": 3140 + }, + { + "epoch": 
12.001379310344827, + "grad_norm": 0.3253929615020752, + "learning_rate": 8.429118773946362e-06, + "loss": 0.0697, + "step": 3150 + }, + { + "epoch": 12.002145593869733, + "grad_norm": 159.8040008544922, + "learning_rate": 8.420604512558537e-06, + "loss": 1.9085, + "step": 3160 + }, + { + "epoch": 12.002911877394636, + "grad_norm": 0.5147111415863037, + "learning_rate": 8.412090251170712e-06, + "loss": 0.1654, + "step": 3170 + }, + { + "epoch": 12.00367816091954, + "grad_norm": 247.5801239013672, + "learning_rate": 8.403575989782887e-06, + "loss": 1.9794, + "step": 3180 + }, + { + "epoch": 12.004444444444445, + "grad_norm": 0.0515538789331913, + "learning_rate": 8.395061728395062e-06, + "loss": 0.3445, + "step": 3190 + }, + { + "epoch": 12.005210727969349, + "grad_norm": 620.4723510742188, + "learning_rate": 8.386547467007238e-06, + "loss": 0.402, + "step": 3200 + }, + { + "epoch": 12.005977011494252, + "grad_norm": 0.02806105650961399, + "learning_rate": 8.378033205619413e-06, + "loss": 0.6373, + "step": 3210 + }, + { + "epoch": 12.006743295019158, + "grad_norm": 0.017435293644666672, + "learning_rate": 8.369518944231588e-06, + "loss": 0.1915, + "step": 3220 + }, + { + "epoch": 12.007509578544061, + "grad_norm": 0.01980624347925186, + "learning_rate": 8.361004682843763e-06, + "loss": 0.4933, + "step": 3230 + }, + { + "epoch": 12.008275862068965, + "grad_norm": 0.09661005437374115, + "learning_rate": 8.35249042145594e-06, + "loss": 0.0652, + "step": 3240 + }, + { + "epoch": 12.00904214559387, + "grad_norm": 45.30793762207031, + "learning_rate": 8.343976160068115e-06, + "loss": 1.8681, + "step": 3250 + }, + { + "epoch": 12.009808429118774, + "grad_norm": 0.02687702141702175, + "learning_rate": 8.33546189868029e-06, + "loss": 0.0036, + "step": 3260 + }, + { + "epoch": 12.010574712643677, + "grad_norm": 759.1505126953125, + "learning_rate": 8.326947637292465e-06, + "loss": 1.6322, + "step": 3270 + }, + { + "epoch": 12.011340996168583, + "grad_norm": 0.32265758514404297, + "learning_rate": 8.318433375904642e-06, + "loss": 1.0913, + "step": 3280 + }, + { + "epoch": 12.012107279693486, + "grad_norm": 0.13585622608661652, + "learning_rate": 8.309919114516817e-06, + "loss": 0.0522, + "step": 3290 + }, + { + "epoch": 12.01287356321839, + "grad_norm": 35.06301498413086, + "learning_rate": 8.301404853128992e-06, + "loss": 1.8327, + "step": 3300 + }, + { + "epoch": 12.013639846743295, + "grad_norm": 558.0282592773438, + "learning_rate": 8.292890591741167e-06, + "loss": 1.0827, + "step": 3310 + }, + { + "epoch": 12.014406130268199, + "grad_norm": 160.3280792236328, + "learning_rate": 8.284376330353342e-06, + "loss": 1.803, + "step": 3320 + }, + { + "epoch": 12.015172413793103, + "grad_norm": 0.16818971931934357, + "learning_rate": 8.275862068965518e-06, + "loss": 0.9934, + "step": 3330 + }, + { + "epoch": 12.015938697318008, + "grad_norm": 0.0374433659017086, + "learning_rate": 8.267347807577693e-06, + "loss": 1.0756, + "step": 3340 + }, + { + "epoch": 12.016704980842912, + "grad_norm": 0.08079817146062851, + "learning_rate": 8.258833546189868e-06, + "loss": 0.8724, + "step": 3350 + }, + { + "epoch": 12.017471264367815, + "grad_norm": 0.3729625344276428, + "learning_rate": 8.250319284802043e-06, + "loss": 0.0375, + "step": 3360 + }, + { + "epoch": 12.01823754789272, + "grad_norm": 62.6796875, + "learning_rate": 8.24180502341422e-06, + "loss": 0.0757, + "step": 3370 + }, + { + "epoch": 12.019003831417624, + "grad_norm": 0.029427042230963707, + "learning_rate": 8.233290762026395e-06, + "loss": 0.5963, + 
"step": 3380 + }, + { + "epoch": 12.01977011494253, + "grad_norm": 28.314170837402344, + "learning_rate": 8.22477650063857e-06, + "loss": 1.1268, + "step": 3390 + }, + { + "epoch": 12.02, + "eval_accuracy": 0.5777777777777777, + "eval_loss": 1.766817331314087, + "eval_runtime": 17.0852, + "eval_samples_per_second": 2.634, + "eval_steps_per_second": 2.634, + "step": 3393 + }, + { + "epoch": 13.000536398467434, + "grad_norm": 3.081188917160034, + "learning_rate": 8.216262239250745e-06, + "loss": 0.0051, + "step": 3400 + }, + { + "epoch": 13.001302681992337, + "grad_norm": 0.017045721411705017, + "learning_rate": 8.207747977862922e-06, + "loss": 0.7076, + "step": 3410 + }, + { + "epoch": 13.00206896551724, + "grad_norm": 41.369873046875, + "learning_rate": 8.199233716475097e-06, + "loss": 1.238, + "step": 3420 + }, + { + "epoch": 13.002835249042146, + "grad_norm": 25.155235290527344, + "learning_rate": 8.190719455087272e-06, + "loss": 0.5341, + "step": 3430 + }, + { + "epoch": 13.00360153256705, + "grad_norm": 0.038196343928575516, + "learning_rate": 8.182205193699447e-06, + "loss": 0.8086, + "step": 3440 + }, + { + "epoch": 13.004367816091953, + "grad_norm": 8.437373161315918, + "learning_rate": 8.173690932311623e-06, + "loss": 0.9984, + "step": 3450 + }, + { + "epoch": 13.005134099616859, + "grad_norm": 28.750768661499023, + "learning_rate": 8.165176670923798e-06, + "loss": 1.0622, + "step": 3460 + }, + { + "epoch": 13.005900383141762, + "grad_norm": 0.03872830793261528, + "learning_rate": 8.156662409535973e-06, + "loss": 0.0428, + "step": 3470 + }, + { + "epoch": 13.006666666666666, + "grad_norm": 43.70376968383789, + "learning_rate": 8.148148148148148e-06, + "loss": 1.5042, + "step": 3480 + }, + { + "epoch": 13.007432950191571, + "grad_norm": 0.016942838206887245, + "learning_rate": 8.139633886760325e-06, + "loss": 0.4918, + "step": 3490 + }, + { + "epoch": 13.008199233716475, + "grad_norm": 69.89582061767578, + "learning_rate": 8.1311196253725e-06, + "loss": 1.3908, + "step": 3500 + }, + { + "epoch": 13.008965517241379, + "grad_norm": 0.02800031192600727, + "learning_rate": 8.122605363984675e-06, + "loss": 1.0224, + "step": 3510 + }, + { + "epoch": 13.009731800766284, + "grad_norm": 0.3693395256996155, + "learning_rate": 8.11409110259685e-06, + "loss": 1.1892, + "step": 3520 + }, + { + "epoch": 13.010498084291187, + "grad_norm": 4.660586833953857, + "learning_rate": 8.105576841209027e-06, + "loss": 0.4709, + "step": 3530 + }, + { + "epoch": 13.011264367816091, + "grad_norm": 52.9511833190918, + "learning_rate": 8.097062579821202e-06, + "loss": 0.3896, + "step": 3540 + }, + { + "epoch": 13.012030651340996, + "grad_norm": 0.05054105818271637, + "learning_rate": 8.088548318433377e-06, + "loss": 0.8281, + "step": 3550 + }, + { + "epoch": 13.0127969348659, + "grad_norm": 0.0865572988986969, + "learning_rate": 8.080034057045552e-06, + "loss": 1.0519, + "step": 3560 + }, + { + "epoch": 13.013563218390805, + "grad_norm": 9.639525413513184, + "learning_rate": 8.071519795657727e-06, + "loss": 1.6116, + "step": 3570 + }, + { + "epoch": 13.014329501915709, + "grad_norm": 0.02205851301550865, + "learning_rate": 8.063005534269903e-06, + "loss": 1.3852, + "step": 3580 + }, + { + "epoch": 13.015095785440613, + "grad_norm": 2.6666343212127686, + "learning_rate": 8.054491272882078e-06, + "loss": 0.8278, + "step": 3590 + }, + { + "epoch": 13.015862068965518, + "grad_norm": 2.7708146572113037, + "learning_rate": 8.045977011494253e-06, + "loss": 1.5062, + "step": 3600 + }, + { + "epoch": 13.016628352490422, 
+ "grad_norm": 4.875570774078369, + "learning_rate": 8.037462750106428e-06, + "loss": 0.509, + "step": 3610 + }, + { + "epoch": 13.017394636015325, + "grad_norm": 0.9614506959915161, + "learning_rate": 8.028948488718605e-06, + "loss": 1.2457, + "step": 3620 + }, + { + "epoch": 13.01816091954023, + "grad_norm": 2.391639232635498, + "learning_rate": 8.02043422733078e-06, + "loss": 0.1548, + "step": 3630 + }, + { + "epoch": 13.018927203065134, + "grad_norm": 0.40272876620292664, + "learning_rate": 8.011919965942955e-06, + "loss": 0.5445, + "step": 3640 + }, + { + "epoch": 13.019693486590038, + "grad_norm": 861.9212646484375, + "learning_rate": 8.00340570455513e-06, + "loss": 0.7087, + "step": 3650 + }, + { + "epoch": 13.02, + "eval_accuracy": 0.5333333333333333, + "eval_loss": 1.869879126548767, + "eval_runtime": 17.1706, + "eval_samples_per_second": 2.621, + "eval_steps_per_second": 2.621, + "step": 3654 + }, + { + "epoch": 14.000459770114942, + "grad_norm": 948.0760498046875, + "learning_rate": 7.994891443167307e-06, + "loss": 1.5876, + "step": 3660 + }, + { + "epoch": 14.001226053639847, + "grad_norm": 0.17879176139831543, + "learning_rate": 7.986377181779482e-06, + "loss": 0.386, + "step": 3670 + }, + { + "epoch": 14.00199233716475, + "grad_norm": 0.08500174432992935, + "learning_rate": 7.977862920391657e-06, + "loss": 1.078, + "step": 3680 + }, + { + "epoch": 14.002758620689654, + "grad_norm": 0.08254305273294449, + "learning_rate": 7.969348659003832e-06, + "loss": 0.602, + "step": 3690 + }, + { + "epoch": 14.00352490421456, + "grad_norm": 0.5430131554603577, + "learning_rate": 7.960834397616007e-06, + "loss": 1.4038, + "step": 3700 + }, + { + "epoch": 14.004291187739463, + "grad_norm": 116.88401794433594, + "learning_rate": 7.952320136228183e-06, + "loss": 1.7234, + "step": 3710 + }, + { + "epoch": 14.005057471264367, + "grad_norm": 40.495941162109375, + "learning_rate": 7.943805874840358e-06, + "loss": 0.3801, + "step": 3720 + }, + { + "epoch": 14.005823754789272, + "grad_norm": 0.008031925186514854, + "learning_rate": 7.935291613452533e-06, + "loss": 0.6601, + "step": 3730 + }, + { + "epoch": 14.006590038314176, + "grad_norm": 0.36836498975753784, + "learning_rate": 7.92677735206471e-06, + "loss": 0.9515, + "step": 3740 + }, + { + "epoch": 14.007356321839081, + "grad_norm": 0.18394853174686432, + "learning_rate": 7.918263090676885e-06, + "loss": 0.0218, + "step": 3750 + }, + { + "epoch": 14.008122605363985, + "grad_norm": 0.1189194992184639, + "learning_rate": 7.90974882928906e-06, + "loss": 0.6009, + "step": 3760 + }, + { + "epoch": 14.008888888888889, + "grad_norm": 0.1183149516582489, + "learning_rate": 7.901234567901235e-06, + "loss": 0.9259, + "step": 3770 + }, + { + "epoch": 14.009655172413794, + "grad_norm": 0.022695478051900864, + "learning_rate": 7.89272030651341e-06, + "loss": 0.6276, + "step": 3780 + }, + { + "epoch": 14.010421455938697, + "grad_norm": 0.058900777250528336, + "learning_rate": 7.884206045125587e-06, + "loss": 1.4136, + "step": 3790 + }, + { + "epoch": 14.011187739463601, + "grad_norm": 0.00957054179161787, + "learning_rate": 7.875691783737762e-06, + "loss": 0.1761, + "step": 3800 + }, + { + "epoch": 14.011954022988506, + "grad_norm": 0.03180115669965744, + "learning_rate": 7.867177522349937e-06, + "loss": 0.6962, + "step": 3810 + }, + { + "epoch": 14.01272030651341, + "grad_norm": 0.06167417764663696, + "learning_rate": 7.858663260962112e-06, + "loss": 0.9894, + "step": 3820 + }, + { + "epoch": 14.013486590038314, + "grad_norm": 0.08714844286441803, + 
"learning_rate": 7.850148999574287e-06, + "loss": 0.6252, + "step": 3830 + }, + { + "epoch": 14.014252873563219, + "grad_norm": 1.0463368892669678, + "learning_rate": 7.841634738186463e-06, + "loss": 1.2818, + "step": 3840 + }, + { + "epoch": 14.015019157088123, + "grad_norm": 77.14112091064453, + "learning_rate": 7.833120476798638e-06, + "loss": 0.7462, + "step": 3850 + }, + { + "epoch": 14.015785440613026, + "grad_norm": 954.8248901367188, + "learning_rate": 7.824606215410813e-06, + "loss": 0.7258, + "step": 3860 + }, + { + "epoch": 14.016551724137932, + "grad_norm": 0.7424802184104919, + "learning_rate": 7.81609195402299e-06, + "loss": 0.9292, + "step": 3870 + }, + { + "epoch": 14.017318007662835, + "grad_norm": 1.1518269777297974, + "learning_rate": 7.807577692635165e-06, + "loss": 1.1737, + "step": 3880 + }, + { + "epoch": 14.018084291187739, + "grad_norm": 34.38454055786133, + "learning_rate": 7.79906343124734e-06, + "loss": 2.1998, + "step": 3890 + }, + { + "epoch": 14.018850574712644, + "grad_norm": 49.79663848876953, + "learning_rate": 7.790549169859515e-06, + "loss": 1.1777, + "step": 3900 + }, + { + "epoch": 14.019616858237548, + "grad_norm": 61.34264373779297, + "learning_rate": 7.78203490847169e-06, + "loss": 1.5327, + "step": 3910 + }, + { + "epoch": 14.02, + "eval_accuracy": 0.5555555555555556, + "eval_loss": 1.4735982418060303, + "eval_runtime": 16.249, + "eval_samples_per_second": 2.769, + "eval_steps_per_second": 2.769, + "step": 3915 + }, + { + "epoch": 15.000383141762452, + "grad_norm": 0.053164273500442505, + "learning_rate": 7.773520647083867e-06, + "loss": 0.0647, + "step": 3920 + }, + { + "epoch": 15.001149425287357, + "grad_norm": 0.020094765350222588, + "learning_rate": 7.765006385696042e-06, + "loss": 0.5939, + "step": 3930 + }, + { + "epoch": 15.00191570881226, + "grad_norm": 0.2023773491382599, + "learning_rate": 7.756492124308217e-06, + "loss": 1.2966, + "step": 3940 + }, + { + "epoch": 15.002681992337164, + "grad_norm": 7.771537780761719, + "learning_rate": 7.747977862920393e-06, + "loss": 0.2176, + "step": 3950 + }, + { + "epoch": 15.00344827586207, + "grad_norm": 0.13149070739746094, + "learning_rate": 7.739463601532567e-06, + "loss": 0.0233, + "step": 3960 + }, + { + "epoch": 15.004214559386973, + "grad_norm": 0.008488762192428112, + "learning_rate": 7.730949340144743e-06, + "loss": 1.3726, + "step": 3970 + }, + { + "epoch": 15.004980842911877, + "grad_norm": 0.3997829854488373, + "learning_rate": 7.722435078756918e-06, + "loss": 0.8783, + "step": 3980 + }, + { + "epoch": 15.005747126436782, + "grad_norm": 0.5089442729949951, + "learning_rate": 7.713920817369093e-06, + "loss": 0.0101, + "step": 3990 + }, + { + "epoch": 15.006513409961686, + "grad_norm": 0.18670423328876495, + "learning_rate": 7.70540655598127e-06, + "loss": 1.2648, + "step": 4000 + }, + { + "epoch": 15.00727969348659, + "grad_norm": 0.2978835105895996, + "learning_rate": 7.696892294593445e-06, + "loss": 0.9482, + "step": 4010 + }, + { + "epoch": 15.008045977011495, + "grad_norm": 0.07457219809293747, + "learning_rate": 7.68837803320562e-06, + "loss": 0.9133, + "step": 4020 + }, + { + "epoch": 15.008812260536398, + "grad_norm": 0.013740615919232368, + "learning_rate": 7.679863771817797e-06, + "loss": 0.5137, + "step": 4030 + }, + { + "epoch": 15.009578544061302, + "grad_norm": 0.20961397886276245, + "learning_rate": 7.67134951042997e-06, + "loss": 1.0943, + "step": 4040 + }, + { + "epoch": 15.010344827586207, + "grad_norm": 1.4198709726333618, + "learning_rate": 7.662835249042147e-06, + 
"loss": 0.2508, + "step": 4050 + }, + { + "epoch": 15.011111111111111, + "grad_norm": 0.028359852731227875, + "learning_rate": 7.654320987654322e-06, + "loss": 1.0662, + "step": 4060 + }, + { + "epoch": 15.011877394636015, + "grad_norm": 0.09795750677585602, + "learning_rate": 7.645806726266497e-06, + "loss": 0.5543, + "step": 4070 + }, + { + "epoch": 15.01264367816092, + "grad_norm": 0.5180357098579407, + "learning_rate": 7.637292464878673e-06, + "loss": 0.0176, + "step": 4080 + }, + { + "epoch": 15.013409961685824, + "grad_norm": 176.0997314453125, + "learning_rate": 7.6287782034908475e-06, + "loss": 0.0743, + "step": 4090 + }, + { + "epoch": 15.014176245210727, + "grad_norm": 293.60955810546875, + "learning_rate": 7.620263942103023e-06, + "loss": 1.1809, + "step": 4100 + }, + { + "epoch": 15.014942528735633, + "grad_norm": 0.005142731126397848, + "learning_rate": 7.611749680715198e-06, + "loss": 0.0231, + "step": 4110 + }, + { + "epoch": 15.015708812260536, + "grad_norm": 0.13017666339874268, + "learning_rate": 7.603235419327374e-06, + "loss": 1.2608, + "step": 4120 + }, + { + "epoch": 15.01647509578544, + "grad_norm": 131.62759399414062, + "learning_rate": 7.59472115793955e-06, + "loss": 1.1, + "step": 4130 + }, + { + "epoch": 15.017241379310345, + "grad_norm": 575.9231567382812, + "learning_rate": 7.586206896551724e-06, + "loss": 0.6597, + "step": 4140 + }, + { + "epoch": 15.018007662835249, + "grad_norm": 0.03187684714794159, + "learning_rate": 7.5776926351639e-06, + "loss": 0.6267, + "step": 4150 + }, + { + "epoch": 15.018773946360152, + "grad_norm": 0.1445404440164566, + "learning_rate": 7.569178373776076e-06, + "loss": 0.558, + "step": 4160 + }, + { + "epoch": 15.019540229885058, + "grad_norm": 0.11278864741325378, + "learning_rate": 7.560664112388251e-06, + "loss": 0.531, + "step": 4170 + }, + { + "epoch": 15.02, + "eval_accuracy": 0.6666666666666666, + "eval_loss": 1.5144925117492676, + "eval_runtime": 16.4181, + "eval_samples_per_second": 2.741, + "eval_steps_per_second": 2.741, + "step": 4176 + }, + { + "epoch": 16.00030651340996, + "grad_norm": 1.0717499256134033, + "learning_rate": 7.552149851000427e-06, + "loss": 0.0066, + "step": 4180 + }, + { + "epoch": 16.001072796934867, + "grad_norm": 1.3242942094802856, + "learning_rate": 7.543635589612601e-06, + "loss": 0.6093, + "step": 4190 + }, + { + "epoch": 16.00183908045977, + "grad_norm": 0.05811280384659767, + "learning_rate": 7.535121328224777e-06, + "loss": 1.0724, + "step": 4200 + }, + { + "epoch": 16.002605363984674, + "grad_norm": 0.09780634194612503, + "learning_rate": 7.5266070668369525e-06, + "loss": 0.0013, + "step": 4210 + }, + { + "epoch": 16.003371647509578, + "grad_norm": 26.036725997924805, + "learning_rate": 7.5180928054491275e-06, + "loss": 1.7478, + "step": 4220 + }, + { + "epoch": 16.00413793103448, + "grad_norm": 24.482444763183594, + "learning_rate": 7.509578544061303e-06, + "loss": 1.364, + "step": 4230 + }, + { + "epoch": 16.00490421455939, + "grad_norm": 0.004039923660457134, + "learning_rate": 7.501064282673479e-06, + "loss": 0.6768, + "step": 4240 + }, + { + "epoch": 16.005670498084292, + "grad_norm": 38.828163146972656, + "learning_rate": 7.492550021285654e-06, + "loss": 0.6534, + "step": 4250 + }, + { + "epoch": 16.006436781609196, + "grad_norm": 0.015541213564574718, + "learning_rate": 7.48403575989783e-06, + "loss": 0.1335, + "step": 4260 + }, + { + "epoch": 16.0072030651341, + "grad_norm": 0.3942905366420746, + "learning_rate": 7.475521498510004e-06, + "loss": 0.6312, + "step": 4270 + }, + { + 
"epoch": 16.007969348659003, + "grad_norm": 1.2412219047546387, + "learning_rate": 7.46700723712218e-06, + "loss": 1.2928, + "step": 4280 + }, + { + "epoch": 16.008735632183907, + "grad_norm": 15.759116172790527, + "learning_rate": 7.458492975734356e-06, + "loss": 0.7285, + "step": 4290 + }, + { + "epoch": 16.009501915708814, + "grad_norm": 106.38333892822266, + "learning_rate": 7.449978714346531e-06, + "loss": 2.0474, + "step": 4300 + }, + { + "epoch": 16.010268199233717, + "grad_norm": 308.9496765136719, + "learning_rate": 7.441464452958707e-06, + "loss": 0.5782, + "step": 4310 + }, + { + "epoch": 16.01103448275862, + "grad_norm": 187.26907348632812, + "learning_rate": 7.4329501915708825e-06, + "loss": 0.8596, + "step": 4320 + }, + { + "epoch": 16.011800766283525, + "grad_norm": 25.111095428466797, + "learning_rate": 7.4244359301830575e-06, + "loss": 0.9912, + "step": 4330 + }, + { + "epoch": 16.01256704980843, + "grad_norm": 0.05673576518893242, + "learning_rate": 7.4159216687952325e-06, + "loss": 0.5216, + "step": 4340 + }, + { + "epoch": 16.013333333333332, + "grad_norm": 1.33418869972229, + "learning_rate": 7.4074074074074075e-06, + "loss": 0.5891, + "step": 4350 + }, + { + "epoch": 16.01409961685824, + "grad_norm": 0.22096247971057892, + "learning_rate": 7.398893146019583e-06, + "loss": 0.5017, + "step": 4360 + }, + { + "epoch": 16.014865900383143, + "grad_norm": 0.022658327594399452, + "learning_rate": 7.390378884631759e-06, + "loss": 0.0035, + "step": 4370 + }, + { + "epoch": 16.015632183908046, + "grad_norm": 0.02133125253021717, + "learning_rate": 7.381864623243934e-06, + "loss": 1.4307, + "step": 4380 + }, + { + "epoch": 16.01639846743295, + "grad_norm": 31.715421676635742, + "learning_rate": 7.37335036185611e-06, + "loss": 1.3714, + "step": 4390 + }, + { + "epoch": 16.017164750957853, + "grad_norm": 45.580810546875, + "learning_rate": 7.364836100468284e-06, + "loss": 0.0121, + "step": 4400 + }, + { + "epoch": 16.017931034482757, + "grad_norm": 0.2011858969926834, + "learning_rate": 7.35632183908046e-06, + "loss": 1.1134, + "step": 4410 + }, + { + "epoch": 16.018697318007664, + "grad_norm": 0.008143080398440361, + "learning_rate": 7.347807577692636e-06, + "loss": 0.7872, + "step": 4420 + }, + { + "epoch": 16.019463601532568, + "grad_norm": 0.09173174947500229, + "learning_rate": 7.339293316304811e-06, + "loss": 1.0064, + "step": 4430 + }, + { + "epoch": 16.02, + "eval_accuracy": 0.6222222222222222, + "eval_loss": 1.4190558195114136, + "eval_runtime": 17.1207, + "eval_samples_per_second": 2.628, + "eval_steps_per_second": 2.628, + "step": 4437 + }, + { + "epoch": 17.000229885057472, + "grad_norm": 0.03527137637138367, + "learning_rate": 7.330779054916987e-06, + "loss": 1.9455, + "step": 4440 + }, + { + "epoch": 17.000996168582375, + "grad_norm": 0.36683353781700134, + "learning_rate": 7.3222647935291625e-06, + "loss": 0.0178, + "step": 4450 + }, + { + "epoch": 17.00176245210728, + "grad_norm": 0.006566314492374659, + "learning_rate": 7.3137505321413375e-06, + "loss": 0.0074, + "step": 4460 + }, + { + "epoch": 17.002528735632183, + "grad_norm": 0.30550625920295715, + "learning_rate": 7.305236270753513e-06, + "loss": 1.6508, + "step": 4470 + }, + { + "epoch": 17.00329501915709, + "grad_norm": 0.012653980404138565, + "learning_rate": 7.2967220093656875e-06, + "loss": 0.4925, + "step": 4480 + }, + { + "epoch": 17.004061302681993, + "grad_norm": 29.529855728149414, + "learning_rate": 7.288207747977863e-06, + "loss": 0.9507, + "step": 4490 + }, + { + "epoch": 17.004827586206897, + 
"grad_norm": 0.32274582982063293, + "learning_rate": 7.279693486590039e-06, + "loss": 1.8334, + "step": 4500 + }, + { + "epoch": 17.0055938697318, + "grad_norm": 0.39408576488494873, + "learning_rate": 7.271179225202214e-06, + "loss": 0.0252, + "step": 4510 + }, + { + "epoch": 17.006360153256704, + "grad_norm": 0.02031654119491577, + "learning_rate": 7.26266496381439e-06, + "loss": 1.167, + "step": 4520 + }, + { + "epoch": 17.007126436781608, + "grad_norm": 0.2735036313533783, + "learning_rate": 7.254150702426566e-06, + "loss": 0.9168, + "step": 4530 + }, + { + "epoch": 17.007892720306515, + "grad_norm": 0.3391585946083069, + "learning_rate": 7.24563644103874e-06, + "loss": 0.4447, + "step": 4540 + }, + { + "epoch": 17.00865900383142, + "grad_norm": 0.009861011058092117, + "learning_rate": 7.237122179650916e-06, + "loss": 0.9521, + "step": 4550 + }, + { + "epoch": 17.009425287356322, + "grad_norm": 26.05072593688965, + "learning_rate": 7.228607918263091e-06, + "loss": 0.7494, + "step": 4560 + }, + { + "epoch": 17.010191570881226, + "grad_norm": 0.033406324684619904, + "learning_rate": 7.220093656875267e-06, + "loss": 0.0049, + "step": 4570 + }, + { + "epoch": 17.01095785440613, + "grad_norm": 25.96360969543457, + "learning_rate": 7.2115793954874425e-06, + "loss": 0.8902, + "step": 4580 + }, + { + "epoch": 17.011724137931033, + "grad_norm": 0.03734881803393364, + "learning_rate": 7.2030651340996175e-06, + "loss": 0.4747, + "step": 4590 + }, + { + "epoch": 17.01249042145594, + "grad_norm": 0.1114615648984909, + "learning_rate": 7.194550872711793e-06, + "loss": 0.8334, + "step": 4600 + }, + { + "epoch": 17.013256704980844, + "grad_norm": 0.484312504529953, + "learning_rate": 7.1860366113239675e-06, + "loss": 0.3247, + "step": 4610 + }, + { + "epoch": 17.014022988505747, + "grad_norm": 0.05565819889307022, + "learning_rate": 7.177522349936143e-06, + "loss": 0.017, + "step": 4620 + }, + { + "epoch": 17.01478927203065, + "grad_norm": 0.02689223363995552, + "learning_rate": 7.169008088548319e-06, + "loss": 0.2477, + "step": 4630 + }, + { + "epoch": 17.015555555555554, + "grad_norm": 252.61764526367188, + "learning_rate": 7.160493827160494e-06, + "loss": 2.1626, + "step": 4640 + }, + { + "epoch": 17.016321839080458, + "grad_norm": 0.009557435289025307, + "learning_rate": 7.15197956577267e-06, + "loss": 0.6396, + "step": 4650 + }, + { + "epoch": 17.017088122605365, + "grad_norm": 0.021211931481957436, + "learning_rate": 7.143465304384846e-06, + "loss": 0.4693, + "step": 4660 + }, + { + "epoch": 17.01785440613027, + "grad_norm": 0.0034942685160785913, + "learning_rate": 7.13495104299702e-06, + "loss": 0.5057, + "step": 4670 + }, + { + "epoch": 17.018620689655172, + "grad_norm": 0.11457903683185577, + "learning_rate": 7.126436781609196e-06, + "loss": 0.003, + "step": 4680 + }, + { + "epoch": 17.019386973180076, + "grad_norm": 191.91835021972656, + "learning_rate": 7.117922520221371e-06, + "loss": 0.2797, + "step": 4690 + }, + { + "epoch": 17.02, + "eval_accuracy": 0.7333333333333333, + "eval_loss": 1.3301029205322266, + "eval_runtime": 16.2656, + "eval_samples_per_second": 2.767, + "eval_steps_per_second": 2.767, + "step": 4698 + }, + { + "epoch": 18.00015325670498, + "grad_norm": 859.1838989257812, + "learning_rate": 7.109408258833547e-06, + "loss": 1.5085, + "step": 4700 + }, + { + "epoch": 18.000919540229884, + "grad_norm": 4.364364147186279, + "learning_rate": 7.1008939974457225e-06, + "loss": 1.7176, + "step": 4710 + }, + { + "epoch": 18.00168582375479, + "grad_norm": 24.725149154663086, + 
"learning_rate": 7.0923797360578975e-06, + "loss": 1.3584, + "step": 4720 + }, + { + "epoch": 18.002452107279694, + "grad_norm": 11.975507736206055, + "learning_rate": 7.083865474670073e-06, + "loss": 0.5036, + "step": 4730 + }, + { + "epoch": 18.003218390804598, + "grad_norm": 0.005032980814576149, + "learning_rate": 7.075351213282249e-06, + "loss": 0.4398, + "step": 4740 + }, + { + "epoch": 18.0039846743295, + "grad_norm": 0.017475353553891182, + "learning_rate": 7.066836951894423e-06, + "loss": 0.0112, + "step": 4750 + }, + { + "epoch": 18.004750957854405, + "grad_norm": 99.97907257080078, + "learning_rate": 7.058322690506599e-06, + "loss": 1.2898, + "step": 4760 + }, + { + "epoch": 18.00551724137931, + "grad_norm": 0.004234318155795336, + "learning_rate": 7.049808429118774e-06, + "loss": 1.0167, + "step": 4770 + }, + { + "epoch": 18.006283524904216, + "grad_norm": 3.233228921890259, + "learning_rate": 7.04129416773095e-06, + "loss": 0.0056, + "step": 4780 + }, + { + "epoch": 18.00704980842912, + "grad_norm": 0.014259359799325466, + "learning_rate": 7.032779906343126e-06, + "loss": 1.1798, + "step": 4790 + }, + { + "epoch": 18.007816091954023, + "grad_norm": 0.007539619691669941, + "learning_rate": 7.0242656449553e-06, + "loss": 0.9871, + "step": 4800 + }, + { + "epoch": 18.008582375478927, + "grad_norm": 0.2630937397480011, + "learning_rate": 7.015751383567476e-06, + "loss": 0.9212, + "step": 4810 + }, + { + "epoch": 18.00934865900383, + "grad_norm": 0.016338275745511055, + "learning_rate": 7.007237122179652e-06, + "loss": 0.891, + "step": 4820 + }, + { + "epoch": 18.010114942528734, + "grad_norm": 94.00718688964844, + "learning_rate": 6.998722860791827e-06, + "loss": 0.6238, + "step": 4830 + }, + { + "epoch": 18.01088122605364, + "grad_norm": 0.008056026883423328, + "learning_rate": 6.9902085994040025e-06, + "loss": 0.6769, + "step": 4840 + }, + { + "epoch": 18.011647509578545, + "grad_norm": 315.77288818359375, + "learning_rate": 6.9816943380161775e-06, + "loss": 0.8629, + "step": 4850 + }, + { + "epoch": 18.01241379310345, + "grad_norm": 0.05887777730822563, + "learning_rate": 6.973180076628353e-06, + "loss": 2.0352, + "step": 4860 + }, + { + "epoch": 18.013180076628352, + "grad_norm": 0.08154245465993881, + "learning_rate": 6.964665815240529e-06, + "loss": 0.9894, + "step": 4870 + }, + { + "epoch": 18.013946360153255, + "grad_norm": 0.3390685021877289, + "learning_rate": 6.956151553852703e-06, + "loss": 0.2945, + "step": 4880 + }, + { + "epoch": 18.014712643678163, + "grad_norm": 0.3962027132511139, + "learning_rate": 6.947637292464879e-06, + "loss": 1.3786, + "step": 4890 + }, + { + "epoch": 18.015478927203066, + "grad_norm": 25.168548583984375, + "learning_rate": 6.939123031077054e-06, + "loss": 1.5385, + "step": 4900 + }, + { + "epoch": 18.01624521072797, + "grad_norm": 0.11930970847606659, + "learning_rate": 6.93060876968923e-06, + "loss": 1.2444, + "step": 4910 + }, + { + "epoch": 18.017011494252873, + "grad_norm": 0.3769722878932953, + "learning_rate": 6.922094508301406e-06, + "loss": 0.0121, + "step": 4920 + }, + { + "epoch": 18.017777777777777, + "grad_norm": 0.007638899143785238, + "learning_rate": 6.913580246913581e-06, + "loss": 0.0801, + "step": 4930 + }, + { + "epoch": 18.01854406130268, + "grad_norm": 0.0043776677921414375, + "learning_rate": 6.905065985525757e-06, + "loss": 0.005, + "step": 4940 + }, + { + "epoch": 18.019310344827588, + "grad_norm": 0.29639992117881775, + "learning_rate": 6.896551724137932e-06, + "loss": 1.0355, + "step": 4950 + }, + { + "epoch": 
18.02, + "eval_accuracy": 0.7111111111111111, + "eval_loss": 1.4828884601593018, + "eval_runtime": 16.3116, + "eval_samples_per_second": 2.759, + "eval_steps_per_second": 2.759, + "step": 4959 + }, + { + "epoch": 19.000076628352492, + "grad_norm": 473.79254150390625, + "learning_rate": 6.888037462750107e-06, + "loss": 1.339, + "step": 4960 + }, + { + "epoch": 19.000842911877395, + "grad_norm": 1.6242972612380981, + "learning_rate": 6.8795232013622825e-06, + "loss": 0.5571, + "step": 4970 + }, + { + "epoch": 19.0016091954023, + "grad_norm": 0.00575720239430666, + "learning_rate": 6.8710089399744575e-06, + "loss": 0.5021, + "step": 4980 + }, + { + "epoch": 19.002375478927203, + "grad_norm": 0.030885737389326096, + "learning_rate": 6.862494678586633e-06, + "loss": 0.4887, + "step": 4990 + }, + { + "epoch": 19.003141762452106, + "grad_norm": 0.08673913776874542, + "learning_rate": 6.853980417198809e-06, + "loss": 0.9856, + "step": 5000 + }, + { + "epoch": 19.00390804597701, + "grad_norm": 0.006700743921101093, + "learning_rate": 6.845466155810983e-06, + "loss": 1.0664, + "step": 5010 + }, + { + "epoch": 19.004674329501917, + "grad_norm": 0.008593742735683918, + "learning_rate": 6.836951894423159e-06, + "loss": 1.8705, + "step": 5020 + }, + { + "epoch": 19.00544061302682, + "grad_norm": 0.005270008929073811, + "learning_rate": 6.828437633035335e-06, + "loss": 0.0189, + "step": 5030 + }, + { + "epoch": 19.006206896551724, + "grad_norm": 1.125020980834961, + "learning_rate": 6.81992337164751e-06, + "loss": 0.4754, + "step": 5040 + }, + { + "epoch": 19.006973180076628, + "grad_norm": 26.364669799804688, + "learning_rate": 6.811409110259686e-06, + "loss": 1.5002, + "step": 5050 + }, + { + "epoch": 19.00773946360153, + "grad_norm": 252.63575744628906, + "learning_rate": 6.802894848871861e-06, + "loss": 1.4733, + "step": 5060 + }, + { + "epoch": 19.00850574712644, + "grad_norm": 0.004729693289846182, + "learning_rate": 6.794380587484037e-06, + "loss": 1.1292, + "step": 5070 + }, + { + "epoch": 19.009272030651342, + "grad_norm": 853.1784057617188, + "learning_rate": 6.7858663260962125e-06, + "loss": 0.2903, + "step": 5080 + }, + { + "epoch": 19.010038314176246, + "grad_norm": 0.29687902331352234, + "learning_rate": 6.777352064708387e-06, + "loss": 0.9156, + "step": 5090 + }, + { + "epoch": 19.01080459770115, + "grad_norm": 85.77644348144531, + "learning_rate": 6.7688378033205625e-06, + "loss": 1.3553, + "step": 5100 + }, + { + "epoch": 19.011570881226053, + "grad_norm": 0.005442415829747915, + "learning_rate": 6.760323541932738e-06, + "loss": 0.4803, + "step": 5110 + }, + { + "epoch": 19.012337164750956, + "grad_norm": 0.004595646634697914, + "learning_rate": 6.751809280544913e-06, + "loss": 0.6559, + "step": 5120 + }, + { + "epoch": 19.013103448275864, + "grad_norm": 0.20090419054031372, + "learning_rate": 6.743295019157089e-06, + "loss": 0.6939, + "step": 5130 + }, + { + "epoch": 19.013869731800767, + "grad_norm": 1.3870974779129028, + "learning_rate": 6.734780757769263e-06, + "loss": 0.0032, + "step": 5140 + }, + { + "epoch": 19.01463601532567, + "grad_norm": 0.010297191329300404, + "learning_rate": 6.726266496381439e-06, + "loss": 0.8631, + "step": 5150 + }, + { + "epoch": 19.015402298850574, + "grad_norm": 0.14181283116340637, + "learning_rate": 6.717752234993615e-06, + "loss": 0.4941, + "step": 5160 + }, + { + "epoch": 19.016168582375478, + "grad_norm": 0.07767921686172485, + "learning_rate": 6.70923797360579e-06, + "loss": 0.5378, + "step": 5170 + }, + { + "epoch": 19.01693486590038, + 
"grad_norm": 0.4668266773223877, + "learning_rate": 6.700723712217966e-06, + "loss": 0.5063, + "step": 5180 + }, + { + "epoch": 19.01770114942529, + "grad_norm": 0.003612382337450981, + "learning_rate": 6.692209450830141e-06, + "loss": 0.0058, + "step": 5190 + }, + { + "epoch": 19.018467432950192, + "grad_norm": 0.12045297026634216, + "learning_rate": 6.683695189442317e-06, + "loss": 1.9205, + "step": 5200 + }, + { + "epoch": 19.019233716475096, + "grad_norm": 0.15484854578971863, + "learning_rate": 6.6751809280544925e-06, + "loss": 0.5399, + "step": 5210 + }, + { + "epoch": 19.02, + "grad_norm": 0.3223140835762024, + "learning_rate": 6.666666666666667e-06, + "loss": 0.937, + "step": 5220 + }, + { + "epoch": 19.02, + "eval_accuracy": 0.5555555555555556, + "eval_loss": 1.8992173671722412, + "eval_runtime": 17.4077, + "eval_samples_per_second": 2.585, + "eval_steps_per_second": 2.585, + "step": 5220 + }, + { + "epoch": 20.000766283524904, + "grad_norm": 0.0048758527263998985, + "learning_rate": 6.6581524052788425e-06, + "loss": 0.3996, + "step": 5230 + }, + { + "epoch": 20.001532567049807, + "grad_norm": 0.005408442113548517, + "learning_rate": 6.649638143891018e-06, + "loss": 0.0246, + "step": 5240 + }, + { + "epoch": 20.002298850574714, + "grad_norm": 0.03478803113102913, + "learning_rate": 6.641123882503193e-06, + "loss": 0.3835, + "step": 5250 + }, + { + "epoch": 20.003065134099618, + "grad_norm": 142.55801391601562, + "learning_rate": 6.632609621115369e-06, + "loss": 0.3611, + "step": 5260 + }, + { + "epoch": 20.00383141762452, + "grad_norm": 0.14220836758613586, + "learning_rate": 6.624095359727543e-06, + "loss": 0.4811, + "step": 5270 + }, + { + "epoch": 20.004597701149425, + "grad_norm": 0.024164844304323196, + "learning_rate": 6.615581098339719e-06, + "loss": 0.9137, + "step": 5280 + }, + { + "epoch": 20.00536398467433, + "grad_norm": 0.715275228023529, + "learning_rate": 6.607066836951895e-06, + "loss": 1.0824, + "step": 5290 + }, + { + "epoch": 20.006130268199232, + "grad_norm": 0.2047380656003952, + "learning_rate": 6.59855257556407e-06, + "loss": 0.4944, + "step": 5300 + }, + { + "epoch": 20.00689655172414, + "grad_norm": 0.22113081812858582, + "learning_rate": 6.590038314176246e-06, + "loss": 0.3149, + "step": 5310 + }, + { + "epoch": 20.007662835249043, + "grad_norm": 0.11102890223264694, + "learning_rate": 6.581524052788422e-06, + "loss": 0.0029, + "step": 5320 + }, + { + "epoch": 20.008429118773947, + "grad_norm": 0.014461666345596313, + "learning_rate": 6.573009791400597e-06, + "loss": 0.8954, + "step": 5330 + }, + { + "epoch": 20.00919540229885, + "grad_norm": 4.228750228881836, + "learning_rate": 6.5644955300127725e-06, + "loss": 0.3519, + "step": 5340 + }, + { + "epoch": 20.009961685823754, + "grad_norm": 0.9284133315086365, + "learning_rate": 6.555981268624947e-06, + "loss": 1.1342, + "step": 5350 + }, + { + "epoch": 20.010727969348657, + "grad_norm": 0.009762330912053585, + "learning_rate": 6.5474670072371225e-06, + "loss": 0.9981, + "step": 5360 + }, + { + "epoch": 20.011494252873565, + "grad_norm": 0.024653086438775063, + "learning_rate": 6.538952745849298e-06, + "loss": 0.2436, + "step": 5370 + }, + { + "epoch": 20.01226053639847, + "grad_norm": 0.3645969331264496, + "learning_rate": 6.530438484461473e-06, + "loss": 1.1069, + "step": 5380 + }, + { + "epoch": 20.013026819923372, + "grad_norm": 0.0035229420755058527, + "learning_rate": 6.521924223073649e-06, + "loss": 0.8913, + "step": 5390 + }, + { + "epoch": 20.013793103448275, + "grad_norm": 0.0032086996361613274, 
+ "learning_rate": 6.513409961685824e-06, + "loss": 1.7655, + "step": 5400 + }, + { + "epoch": 20.01455938697318, + "grad_norm": 0.0029455283656716347, + "learning_rate": 6.504895700297999e-06, + "loss": 0.008, + "step": 5410 + }, + { + "epoch": 20.015325670498083, + "grad_norm": 0.2534770965576172, + "learning_rate": 6.496381438910175e-06, + "loss": 0.0151, + "step": 5420 + }, + { + "epoch": 20.01609195402299, + "grad_norm": 0.2731533348560333, + "learning_rate": 6.48786717752235e-06, + "loss": 2.1, + "step": 5430 + }, + { + "epoch": 20.016858237547893, + "grad_norm": 0.15418186783790588, + "learning_rate": 6.479352916134526e-06, + "loss": 1.2051, + "step": 5440 + }, + { + "epoch": 20.017624521072797, + "grad_norm": 151.98146057128906, + "learning_rate": 6.470838654746702e-06, + "loss": 1.0109, + "step": 5450 + }, + { + "epoch": 20.0183908045977, + "grad_norm": 27.146896362304688, + "learning_rate": 6.462324393358877e-06, + "loss": 0.9353, + "step": 5460 + }, + { + "epoch": 20.019157088122604, + "grad_norm": 0.007136407773941755, + "learning_rate": 6.4538101319710525e-06, + "loss": 1.1601, + "step": 5470 + }, + { + "epoch": 20.01992337164751, + "grad_norm": 0.5974582433700562, + "learning_rate": 6.445295870583227e-06, + "loss": 0.0065, + "step": 5480 + }, + { + "epoch": 20.02, + "eval_accuracy": 0.6444444444444445, + "eval_loss": 1.1564029455184937, + "eval_runtime": 18.3465, + "eval_samples_per_second": 2.453, + "eval_steps_per_second": 2.453, + "step": 5481 + }, + { + "epoch": 21.000689655172415, + "grad_norm": 0.005541063379496336, + "learning_rate": 6.4367816091954025e-06, + "loss": 0.6849, + "step": 5490 + }, + { + "epoch": 21.00145593869732, + "grad_norm": 0.0032322239130735397, + "learning_rate": 6.428267347807578e-06, + "loss": 1.0243, + "step": 5500 + }, + { + "epoch": 21.002222222222223, + "grad_norm": 0.010749446228146553, + "learning_rate": 6.419753086419753e-06, + "loss": 0.0034, + "step": 5510 + }, + { + "epoch": 21.002988505747126, + "grad_norm": 0.014948604628443718, + "learning_rate": 6.411238825031929e-06, + "loss": 0.0597, + "step": 5520 + }, + { + "epoch": 21.00375478927203, + "grad_norm": 0.019523367285728455, + "learning_rate": 6.402724563644105e-06, + "loss": 0.8889, + "step": 5530 + }, + { + "epoch": 21.004521072796933, + "grad_norm": 0.22001327574253082, + "learning_rate": 6.39421030225628e-06, + "loss": 0.4529, + "step": 5540 + }, + { + "epoch": 21.00528735632184, + "grad_norm": 57.557777404785156, + "learning_rate": 6.385696040868455e-06, + "loss": 0.6169, + "step": 5550 + }, + { + "epoch": 21.006053639846744, + "grad_norm": 0.13096360862255096, + "learning_rate": 6.37718177948063e-06, + "loss": 0.6997, + "step": 5560 + }, + { + "epoch": 21.006819923371648, + "grad_norm": 0.15169228613376617, + "learning_rate": 6.368667518092806e-06, + "loss": 0.5378, + "step": 5570 + }, + { + "epoch": 21.00758620689655, + "grad_norm": 0.003085825825110078, + "learning_rate": 6.360153256704982e-06, + "loss": 1.4237, + "step": 5580 + }, + { + "epoch": 21.008352490421455, + "grad_norm": 0.3371953070163727, + "learning_rate": 6.351638995317157e-06, + "loss": 1.0755, + "step": 5590 + }, + { + "epoch": 21.00911877394636, + "grad_norm": 0.288158655166626, + "learning_rate": 6.3431247339293325e-06, + "loss": 0.6494, + "step": 5600 + }, + { + "epoch": 21.009885057471266, + "grad_norm": 0.4217139780521393, + "learning_rate": 6.334610472541508e-06, + "loss": 0.0029, + "step": 5610 + }, + { + "epoch": 21.01065134099617, + "grad_norm": 23.792030334472656, + "learning_rate": 
6.3260962111536825e-06, + "loss": 0.0085, + "step": 5620 + }, + { + "epoch": 21.011417624521073, + "grad_norm": 0.006410900969058275, + "learning_rate": 6.317581949765858e-06, + "loss": 0.7975, + "step": 5630 + }, + { + "epoch": 21.012183908045976, + "grad_norm": 0.14496740698814392, + "learning_rate": 6.309067688378033e-06, + "loss": 0.0156, + "step": 5640 + }, + { + "epoch": 21.01295019157088, + "grad_norm": 60.5942268371582, + "learning_rate": 6.300553426990209e-06, + "loss": 0.9892, + "step": 5650 + }, + { + "epoch": 21.013716475095784, + "grad_norm": 1.9329127073287964, + "learning_rate": 6.292039165602385e-06, + "loss": 0.5093, + "step": 5660 + }, + { + "epoch": 21.01448275862069, + "grad_norm": 0.002778069581836462, + "learning_rate": 6.28352490421456e-06, + "loss": 0.6323, + "step": 5670 + }, + { + "epoch": 21.015249042145594, + "grad_norm": 0.22024425864219666, + "learning_rate": 6.275010642826736e-06, + "loss": 0.0126, + "step": 5680 + }, + { + "epoch": 21.016015325670498, + "grad_norm": 0.014522101730108261, + "learning_rate": 6.26649638143891e-06, + "loss": 0.0332, + "step": 5690 + }, + { + "epoch": 21.0167816091954, + "grad_norm": 0.008470996282994747, + "learning_rate": 6.257982120051086e-06, + "loss": 1.4853, + "step": 5700 + }, + { + "epoch": 21.017547892720305, + "grad_norm": 0.08409818261861801, + "learning_rate": 6.249467858663262e-06, + "loss": 0.4452, + "step": 5710 + }, + { + "epoch": 21.018314176245212, + "grad_norm": 193.189208984375, + "learning_rate": 6.240953597275437e-06, + "loss": 1.815, + "step": 5720 + }, + { + "epoch": 21.019080459770116, + "grad_norm": 356.51177978515625, + "learning_rate": 6.2324393358876125e-06, + "loss": 0.4184, + "step": 5730 + }, + { + "epoch": 21.01984674329502, + "grad_norm": 0.005723089445382357, + "learning_rate": 6.223925074499788e-06, + "loss": 0.9542, + "step": 5740 + }, + { + "epoch": 21.02, + "eval_accuracy": 0.6888888888888889, + "eval_loss": 1.4033571481704712, + "eval_runtime": 17.2555, + "eval_samples_per_second": 2.608, + "eval_steps_per_second": 2.608, + "step": 5742 + }, + { + "epoch": 22.000613026819924, + "grad_norm": 0.02343740314245224, + "learning_rate": 6.2154108131119625e-06, + "loss": 1.0042, + "step": 5750 + }, + { + "epoch": 22.001379310344827, + "grad_norm": 0.23128117620944977, + "learning_rate": 6.206896551724138e-06, + "loss": 0.3785, + "step": 5760 + }, + { + "epoch": 22.00214559386973, + "grad_norm": 23.553518295288086, + "learning_rate": 6.198382290336313e-06, + "loss": 0.4821, + "step": 5770 + }, + { + "epoch": 22.002911877394634, + "grad_norm": 0.24422501027584076, + "learning_rate": 6.189868028948489e-06, + "loss": 0.4265, + "step": 5780 + }, + { + "epoch": 22.00367816091954, + "grad_norm": 0.154605433344841, + "learning_rate": 6.181353767560665e-06, + "loss": 0.0137, + "step": 5790 + }, + { + "epoch": 22.004444444444445, + "grad_norm": 186.47622680664062, + "learning_rate": 6.17283950617284e-06, + "loss": 1.1339, + "step": 5800 + }, + { + "epoch": 22.00521072796935, + "grad_norm": 268.74114990234375, + "learning_rate": 6.164325244785016e-06, + "loss": 0.505, + "step": 5810 + }, + { + "epoch": 22.005977011494252, + "grad_norm": 0.00425740797072649, + "learning_rate": 6.155810983397192e-06, + "loss": 0.755, + "step": 5820 + }, + { + "epoch": 22.006743295019156, + "grad_norm": 0.017831366509199142, + "learning_rate": 6.147296722009366e-06, + "loss": 0.8608, + "step": 5830 + }, + { + "epoch": 22.00750957854406, + "grad_norm": 0.11955715715885162, + "learning_rate": 6.138782460621542e-06, + "loss": 
0.6582, + "step": 5840 + }, + { + "epoch": 22.008275862068967, + "grad_norm": 0.021726898849010468, + "learning_rate": 6.130268199233717e-06, + "loss": 0.6259, + "step": 5850 + }, + { + "epoch": 22.00904214559387, + "grad_norm": 250.85305786132812, + "learning_rate": 6.1217539378458925e-06, + "loss": 0.9274, + "step": 5860 + }, + { + "epoch": 22.009808429118774, + "grad_norm": 0.11630397289991379, + "learning_rate": 6.113239676458068e-06, + "loss": 0.5529, + "step": 5870 + }, + { + "epoch": 22.010574712643677, + "grad_norm": 0.005806886591017246, + "learning_rate": 6.1047254150702425e-06, + "loss": 1.1417, + "step": 5880 + }, + { + "epoch": 22.01134099616858, + "grad_norm": 0.015259725041687489, + "learning_rate": 6.096211153682418e-06, + "loss": 0.0054, + "step": 5890 + }, + { + "epoch": 22.01210727969349, + "grad_norm": 59.33766555786133, + "learning_rate": 6.087696892294594e-06, + "loss": 0.9703, + "step": 5900 + }, + { + "epoch": 22.012873563218392, + "grad_norm": 0.06096731498837471, + "learning_rate": 6.079182630906769e-06, + "loss": 1.2769, + "step": 5910 + }, + { + "epoch": 22.013639846743295, + "grad_norm": 0.14904960989952087, + "learning_rate": 6.070668369518945e-06, + "loss": 0.0026, + "step": 5920 + }, + { + "epoch": 22.0144061302682, + "grad_norm": 0.13450872898101807, + "learning_rate": 6.06215410813112e-06, + "loss": 0.5342, + "step": 5930 + }, + { + "epoch": 22.015172413793103, + "grad_norm": 0.07450124621391296, + "learning_rate": 6.053639846743296e-06, + "loss": 0.5244, + "step": 5940 + }, + { + "epoch": 22.015938697318006, + "grad_norm": 0.030625246465206146, + "learning_rate": 6.045125585355472e-06, + "loss": 0.6235, + "step": 5950 + }, + { + "epoch": 22.016704980842913, + "grad_norm": 0.012795121408998966, + "learning_rate": 6.036611323967646e-06, + "loss": 0.4477, + "step": 5960 + }, + { + "epoch": 22.017471264367817, + "grad_norm": 0.003031395375728607, + "learning_rate": 6.028097062579822e-06, + "loss": 0.645, + "step": 5970 + }, + { + "epoch": 22.01823754789272, + "grad_norm": 0.19980162382125854, + "learning_rate": 6.019582801191997e-06, + "loss": 0.0042, + "step": 5980 + }, + { + "epoch": 22.019003831417624, + "grad_norm": 0.19694259762763977, + "learning_rate": 6.0110685398041725e-06, + "loss": 0.4617, + "step": 5990 + }, + { + "epoch": 22.019770114942528, + "grad_norm": 0.529032289981842, + "learning_rate": 6.002554278416348e-06, + "loss": 0.4833, + "step": 6000 + }, + { + "epoch": 22.02, + "eval_accuracy": 0.7111111111111111, + "eval_loss": 1.3207600116729736, + "eval_runtime": 17.1518, + "eval_samples_per_second": 2.624, + "eval_steps_per_second": 2.624, + "step": 6003 + }, + { + "epoch": 23.000536398467432, + "grad_norm": 0.008429036475718021, + "learning_rate": 5.9940400170285225e-06, + "loss": 0.3749, + "step": 6010 + }, + { + "epoch": 23.001302681992335, + "grad_norm": 0.008820030838251114, + "learning_rate": 5.985525755640698e-06, + "loss": 0.5081, + "step": 6020 + }, + { + "epoch": 23.002068965517243, + "grad_norm": 0.32108014822006226, + "learning_rate": 5.977011494252874e-06, + "loss": 0.0048, + "step": 6030 + }, + { + "epoch": 23.002835249042146, + "grad_norm": 0.0169094055891037, + "learning_rate": 5.968497232865049e-06, + "loss": 0.441, + "step": 6040 + }, + { + "epoch": 23.00360153256705, + "grad_norm": 0.004983820486813784, + "learning_rate": 5.959982971477225e-06, + "loss": 0.3971, + "step": 6050 + }, + { + "epoch": 23.004367816091953, + "grad_norm": 0.007570244837552309, + "learning_rate": 5.9514687100894e-06, + "loss": 0.0009, + "step": 6060 
+ }, + { + "epoch": 23.005134099616857, + "grad_norm": 0.012383556924760342, + "learning_rate": 5.942954448701576e-06, + "loss": 0.9855, + "step": 6070 + }, + { + "epoch": 23.005900383141764, + "grad_norm": 62.781803131103516, + "learning_rate": 5.934440187313752e-06, + "loss": 0.9697, + "step": 6080 + }, + { + "epoch": 23.006666666666668, + "grad_norm": 167.15147399902344, + "learning_rate": 5.925925925925926e-06, + "loss": 0.6894, + "step": 6090 + }, + { + "epoch": 23.00743295019157, + "grad_norm": 33.346561431884766, + "learning_rate": 5.917411664538102e-06, + "loss": 0.9735, + "step": 6100 + }, + { + "epoch": 23.008199233716475, + "grad_norm": 0.583283543586731, + "learning_rate": 5.9088974031502775e-06, + "loss": 0.1683, + "step": 6110 + }, + { + "epoch": 23.00896551724138, + "grad_norm": 210.42234802246094, + "learning_rate": 5.9003831417624525e-06, + "loss": 0.4048, + "step": 6120 + }, + { + "epoch": 23.009731800766282, + "grad_norm": 0.6796733140945435, + "learning_rate": 5.891868880374628e-06, + "loss": 0.7284, + "step": 6130 + }, + { + "epoch": 23.01049808429119, + "grad_norm": 0.1172986701130867, + "learning_rate": 5.883354618986803e-06, + "loss": 0.4795, + "step": 6140 + }, + { + "epoch": 23.011264367816093, + "grad_norm": 0.14602810144424438, + "learning_rate": 5.874840357598979e-06, + "loss": 0.0261, + "step": 6150 + }, + { + "epoch": 23.012030651340996, + "grad_norm": 242.8683624267578, + "learning_rate": 5.866326096211154e-06, + "loss": 0.9012, + "step": 6160 + }, + { + "epoch": 23.0127969348659, + "grad_norm": 0.04556625708937645, + "learning_rate": 5.857811834823329e-06, + "loss": 0.0102, + "step": 6170 + }, + { + "epoch": 23.013563218390804, + "grad_norm": 0.0033912360668182373, + "learning_rate": 5.849297573435505e-06, + "loss": 0.0064, + "step": 6180 + }, + { + "epoch": 23.014329501915707, + "grad_norm": 0.013051849789917469, + "learning_rate": 5.84078331204768e-06, + "loss": 3.0725, + "step": 6190 + }, + { + "epoch": 23.015095785440614, + "grad_norm": 0.04039206728339195, + "learning_rate": 5.832269050659856e-06, + "loss": 0.1902, + "step": 6200 + }, + { + "epoch": 23.015862068965518, + "grad_norm": 0.021253997460007668, + "learning_rate": 5.823754789272032e-06, + "loss": 0.9769, + "step": 6210 + }, + { + "epoch": 23.01662835249042, + "grad_norm": 0.011993280611932278, + "learning_rate": 5.815240527884206e-06, + "loss": 0.0031, + "step": 6220 + }, + { + "epoch": 23.017394636015325, + "grad_norm": 0.03635237738490105, + "learning_rate": 5.806726266496382e-06, + "loss": 1.0961, + "step": 6230 + }, + { + "epoch": 23.01816091954023, + "grad_norm": 0.007159947417676449, + "learning_rate": 5.7982120051085575e-06, + "loss": 0.6208, + "step": 6240 + }, + { + "epoch": 23.018927203065132, + "grad_norm": 0.3210875988006592, + "learning_rate": 5.7896977437207325e-06, + "loss": 0.5218, + "step": 6250 + }, + { + "epoch": 23.01969348659004, + "grad_norm": 0.06819188594818115, + "learning_rate": 5.781183482332908e-06, + "loss": 1.1719, + "step": 6260 + }, + { + "epoch": 23.02, + "eval_accuracy": 0.7333333333333333, + "eval_loss": 1.2116539478302002, + "eval_runtime": 16.2797, + "eval_samples_per_second": 2.764, + "eval_steps_per_second": 2.764, + "step": 6264 + }, + { + "epoch": 24.000459770114944, + "grad_norm": 111.79815673828125, + "learning_rate": 5.772669220945083e-06, + "loss": 0.3069, + "step": 6270 + }, + { + "epoch": 24.001226053639847, + "grad_norm": 0.24448484182357788, + "learning_rate": 5.764154959557259e-06, + "loss": 0.0516, + "step": 6280 + }, + { + "epoch": 
24.00199233716475, + "grad_norm": 0.024158824235200882, + "learning_rate": 5.755640698169435e-06, + "loss": 1.107, + "step": 6290 + }, + { + "epoch": 24.002758620689654, + "grad_norm": 0.004840515088289976, + "learning_rate": 5.747126436781609e-06, + "loss": 0.5633, + "step": 6300 + }, + { + "epoch": 24.003524904214558, + "grad_norm": 0.007637548726052046, + "learning_rate": 5.738612175393785e-06, + "loss": 0.5301, + "step": 6310 + }, + { + "epoch": 24.004291187739465, + "grad_norm": 0.010497420094907284, + "learning_rate": 5.730097914005961e-06, + "loss": 0.5864, + "step": 6320 + }, + { + "epoch": 24.00505747126437, + "grad_norm": 0.17650379240512848, + "learning_rate": 5.721583652618136e-06, + "loss": 0.0074, + "step": 6330 + }, + { + "epoch": 24.005823754789272, + "grad_norm": 0.16446323692798615, + "learning_rate": 5.713069391230312e-06, + "loss": 0.4316, + "step": 6340 + }, + { + "epoch": 24.006590038314176, + "grad_norm": 0.09322980791330338, + "learning_rate": 5.704555129842486e-06, + "loss": 0.0048, + "step": 6350 + }, + { + "epoch": 24.00735632183908, + "grad_norm": 9.977555274963379, + "learning_rate": 5.696040868454662e-06, + "loss": 0.005, + "step": 6360 + }, + { + "epoch": 24.008122605363983, + "grad_norm": 0.07110035419464111, + "learning_rate": 5.6875266070668375e-06, + "loss": 0.7764, + "step": 6370 + }, + { + "epoch": 24.00888888888889, + "grad_norm": 0.002316427417099476, + "learning_rate": 5.6790123456790125e-06, + "loss": 1.0909, + "step": 6380 + }, + { + "epoch": 24.009655172413794, + "grad_norm": 0.01620934158563614, + "learning_rate": 5.670498084291188e-06, + "loss": 0.2525, + "step": 6390 + }, + { + "epoch": 24.010421455938697, + "grad_norm": 0.3324863612651825, + "learning_rate": 5.661983822903364e-06, + "loss": 0.6407, + "step": 6400 + }, + { + "epoch": 24.0111877394636, + "grad_norm": 0.5573359727859497, + "learning_rate": 5.653469561515539e-06, + "loss": 0.5663, + "step": 6410 + }, + { + "epoch": 24.011954022988505, + "grad_norm": 0.15817981958389282, + "learning_rate": 5.644955300127715e-06, + "loss": 0.048, + "step": 6420 + }, + { + "epoch": 24.01272030651341, + "grad_norm": 29.228792190551758, + "learning_rate": 5.636441038739889e-06, + "loss": 0.6431, + "step": 6430 + }, + { + "epoch": 24.013486590038315, + "grad_norm": 0.00793869886547327, + "learning_rate": 5.627926777352065e-06, + "loss": 0.0019, + "step": 6440 + }, + { + "epoch": 24.01425287356322, + "grad_norm": 0.10121812671422958, + "learning_rate": 5.619412515964241e-06, + "loss": 0.4982, + "step": 6450 + }, + { + "epoch": 24.015019157088123, + "grad_norm": 0.2566695511341095, + "learning_rate": 5.610898254576416e-06, + "loss": 0.1485, + "step": 6460 + }, + { + "epoch": 24.015785440613026, + "grad_norm": 0.008360895328223705, + "learning_rate": 5.602383993188592e-06, + "loss": 0.002, + "step": 6470 + }, + { + "epoch": 24.01655172413793, + "grad_norm": 0.5510136485099792, + "learning_rate": 5.593869731800766e-06, + "loss": 0.3374, + "step": 6480 + }, + { + "epoch": 24.017318007662837, + "grad_norm": 0.005763225723057985, + "learning_rate": 5.585355470412942e-06, + "loss": 0.9687, + "step": 6490 + }, + { + "epoch": 24.01808429118774, + "grad_norm": 0.05473093315958977, + "learning_rate": 5.5768412090251175e-06, + "loss": 1.7902, + "step": 6500 + }, + { + "epoch": 24.018850574712644, + "grad_norm": 82.5860824584961, + "learning_rate": 5.5683269476372925e-06, + "loss": 0.9781, + "step": 6510 + }, + { + "epoch": 24.019616858237548, + "grad_norm": 0.035912372171878815, + "learning_rate": 
5.559812686249468e-06, + "loss": 0.9043, + "step": 6520 + }, + { + "epoch": 24.02, + "eval_accuracy": 0.8, + "eval_loss": 1.0468412637710571, + "eval_runtime": 16.2609, + "eval_samples_per_second": 2.767, + "eval_steps_per_second": 2.767, + "step": 6525 + }, + { + "epoch": 25.000383141762452, + "grad_norm": 0.297254741191864, + "learning_rate": 5.551298424861644e-06, + "loss": 0.4963, + "step": 6530 + }, + { + "epoch": 25.001149425287355, + "grad_norm": 0.10809308290481567, + "learning_rate": 5.542784163473819e-06, + "loss": 0.3862, + "step": 6540 + }, + { + "epoch": 25.00191570881226, + "grad_norm": 0.21265068650245667, + "learning_rate": 5.534269902085995e-06, + "loss": 0.1384, + "step": 6550 + }, + { + "epoch": 25.002681992337166, + "grad_norm": 0.006510636303573847, + "learning_rate": 5.525755640698169e-06, + "loss": 0.0024, + "step": 6560 + }, + { + "epoch": 25.00344827586207, + "grad_norm": 0.11088795959949493, + "learning_rate": 5.517241379310345e-06, + "loss": 0.4312, + "step": 6570 + }, + { + "epoch": 25.004214559386973, + "grad_norm": 0.003471273696050048, + "learning_rate": 5.508727117922521e-06, + "loss": 0.5084, + "step": 6580 + }, + { + "epoch": 25.004980842911877, + "grad_norm": 0.23261281847953796, + "learning_rate": 5.500212856534696e-06, + "loss": 0.0059, + "step": 6590 + }, + { + "epoch": 25.00574712643678, + "grad_norm": 0.002383657731115818, + "learning_rate": 5.491698595146872e-06, + "loss": 0.0013, + "step": 6600 + }, + { + "epoch": 25.006513409961684, + "grad_norm": 26.68533706665039, + "learning_rate": 5.4831843337590475e-06, + "loss": 0.8476, + "step": 6610 + }, + { + "epoch": 25.00727969348659, + "grad_norm": 0.004375293850898743, + "learning_rate": 5.474670072371222e-06, + "loss": 0.3487, + "step": 6620 + }, + { + "epoch": 25.008045977011495, + "grad_norm": 0.02778606489300728, + "learning_rate": 5.4661558109833975e-06, + "loss": 0.0959, + "step": 6630 + }, + { + "epoch": 25.0088122605364, + "grad_norm": 0.005876324605196714, + "learning_rate": 5.4576415495955725e-06, + "loss": 0.0023, + "step": 6640 + }, + { + "epoch": 25.009578544061302, + "grad_norm": 0.0023193422239273787, + "learning_rate": 5.449127288207748e-06, + "loss": 1.21, + "step": 6650 + }, + { + "epoch": 25.010344827586206, + "grad_norm": 0.16215364634990692, + "learning_rate": 5.440613026819924e-06, + "loss": 0.5084, + "step": 6660 + }, + { + "epoch": 25.011111111111113, + "grad_norm": 0.002316415077075362, + "learning_rate": 5.432098765432099e-06, + "loss": 0.7558, + "step": 6670 + }, + { + "epoch": 25.011877394636016, + "grad_norm": 199.45330810546875, + "learning_rate": 5.423584504044275e-06, + "loss": 1.2108, + "step": 6680 + }, + { + "epoch": 25.01264367816092, + "grad_norm": 0.008179481141269207, + "learning_rate": 5.415070242656451e-06, + "loss": 0.0028, + "step": 6690 + }, + { + "epoch": 25.013409961685824, + "grad_norm": 0.10535692423582077, + "learning_rate": 5.406555981268625e-06, + "loss": 0.0222, + "step": 6700 + }, + { + "epoch": 25.014176245210727, + "grad_norm": 0.008161659352481365, + "learning_rate": 5.398041719880801e-06, + "loss": 1.3135, + "step": 6710 + }, + { + "epoch": 25.01494252873563, + "grad_norm": 0.014941788278520107, + "learning_rate": 5.389527458492976e-06, + "loss": 1.7422, + "step": 6720 + }, + { + "epoch": 25.015708812260538, + "grad_norm": 0.09473875910043716, + "learning_rate": 5.381013197105152e-06, + "loss": 0.4056, + "step": 6730 + }, + { + "epoch": 25.01647509578544, + "grad_norm": 0.007046079728752375, + "learning_rate": 5.3724989357173275e-06, + "loss": 
1.6799, + "step": 6740 + }, + { + "epoch": 25.017241379310345, + "grad_norm": 31.201223373413086, + "learning_rate": 5.3639846743295025e-06, + "loss": 1.4365, + "step": 6750 + }, + { + "epoch": 25.01800766283525, + "grad_norm": 0.005747068673372269, + "learning_rate": 5.3554704129416775e-06, + "loss": 0.9901, + "step": 6760 + }, + { + "epoch": 25.018773946360152, + "grad_norm": 0.1358274668455124, + "learning_rate": 5.3469561515538525e-06, + "loss": 0.0099, + "step": 6770 + }, + { + "epoch": 25.019540229885056, + "grad_norm": 300.8212890625, + "learning_rate": 5.338441890166028e-06, + "loss": 1.1914, + "step": 6780 + }, + { + "epoch": 25.02, + "eval_accuracy": 0.7111111111111111, + "eval_loss": 1.3830008506774902, + "eval_runtime": 16.3647, + "eval_samples_per_second": 2.75, + "eval_steps_per_second": 2.75, + "step": 6786 + }, + { + "epoch": 26.00030651340996, + "grad_norm": 0.28865209221839905, + "learning_rate": 5.329927628778204e-06, + "loss": 0.916, + "step": 6790 + }, + { + "epoch": 26.001072796934867, + "grad_norm": 0.054898347705602646, + "learning_rate": 5.321413367390379e-06, + "loss": 0.3035, + "step": 6800 + }, + { + "epoch": 26.00183908045977, + "grad_norm": 177.0633087158203, + "learning_rate": 5.312899106002555e-06, + "loss": 0.6118, + "step": 6810 + }, + { + "epoch": 26.002605363984674, + "grad_norm": 0.15720027685165405, + "learning_rate": 5.304384844614731e-06, + "loss": 0.0062, + "step": 6820 + }, + { + "epoch": 26.003371647509578, + "grad_norm": 0.39405685663223267, + "learning_rate": 5.295870583226905e-06, + "loss": 0.4872, + "step": 6830 + }, + { + "epoch": 26.00413793103448, + "grad_norm": 1.5877101421356201, + "learning_rate": 5.287356321839081e-06, + "loss": 0.2091, + "step": 6840 + }, + { + "epoch": 26.00490421455939, + "grad_norm": 0.0017664715414866805, + "learning_rate": 5.278842060451256e-06, + "loss": 0.0455, + "step": 6850 + }, + { + "epoch": 26.005670498084292, + "grad_norm": 0.3004223108291626, + "learning_rate": 5.270327799063432e-06, + "loss": 0.2131, + "step": 6860 + }, + { + "epoch": 26.006436781609196, + "grad_norm": 0.002318794373422861, + "learning_rate": 5.2618135376756075e-06, + "loss": 0.003, + "step": 6870 + }, + { + "epoch": 26.0072030651341, + "grad_norm": 0.0031128637492656708, + "learning_rate": 5.2532992762877825e-06, + "loss": 0.553, + "step": 6880 + }, + { + "epoch": 26.007969348659003, + "grad_norm": 523.013671875, + "learning_rate": 5.244785014899958e-06, + "loss": 0.9374, + "step": 6890 + }, + { + "epoch": 26.008735632183907, + "grad_norm": 0.0014535968657582998, + "learning_rate": 5.236270753512134e-06, + "loss": 0.6743, + "step": 6900 + }, + { + "epoch": 26.009501915708814, + "grad_norm": 0.15418817102909088, + "learning_rate": 5.227756492124308e-06, + "loss": 0.3123, + "step": 6910 + }, + { + "epoch": 26.010268199233717, + "grad_norm": 0.22388340532779694, + "learning_rate": 5.219242230736484e-06, + "loss": 0.0047, + "step": 6920 + }, + { + "epoch": 26.01103448275862, + "grad_norm": 228.3011932373047, + "learning_rate": 5.210727969348659e-06, + "loss": 0.5351, + "step": 6930 + }, + { + "epoch": 26.011800766283525, + "grad_norm": 0.09498967975378036, + "learning_rate": 5.202213707960835e-06, + "loss": 0.0053, + "step": 6940 + }, + { + "epoch": 26.01256704980843, + "grad_norm": 0.03844846040010452, + "learning_rate": 5.193699446573011e-06, + "loss": 1.1011, + "step": 6950 + }, + { + "epoch": 26.013333333333332, + "grad_norm": 0.027119528502225876, + "learning_rate": 5.185185185185185e-06, + "loss": 0.0022, + "step": 6960 + }, + { + 
"epoch": 26.01409961685824, + "grad_norm": 0.0246810894459486, + "learning_rate": 5.176670923797361e-06, + "loss": 0.0014, + "step": 6970 + }, + { + "epoch": 26.014865900383143, + "grad_norm": 0.0030653183348476887, + "learning_rate": 5.168156662409536e-06, + "loss": 0.9798, + "step": 6980 + }, + { + "epoch": 26.015632183908046, + "grad_norm": 0.002964736893773079, + "learning_rate": 5.159642401021712e-06, + "loss": 0.0007, + "step": 6990 + }, + { + "epoch": 26.01639846743295, + "grad_norm": 0.13246798515319824, + "learning_rate": 5.1511281396338875e-06, + "loss": 0.5335, + "step": 7000 + }, + { + "epoch": 26.017164750957853, + "grad_norm": 0.048838112503290176, + "learning_rate": 5.1426138782460625e-06, + "loss": 1.0308, + "step": 7010 + }, + { + "epoch": 26.017931034482757, + "grad_norm": 0.036104172468185425, + "learning_rate": 5.134099616858238e-06, + "loss": 0.5105, + "step": 7020 + }, + { + "epoch": 26.018697318007664, + "grad_norm": 0.052102845162153244, + "learning_rate": 5.125585355470414e-06, + "loss": 0.0974, + "step": 7030 + }, + { + "epoch": 26.019463601532568, + "grad_norm": 0.002513181883841753, + "learning_rate": 5.117071094082588e-06, + "loss": 0.0032, + "step": 7040 + }, + { + "epoch": 26.02, + "eval_accuracy": 0.7111111111111111, + "eval_loss": 1.8348387479782104, + "eval_runtime": 17.6382, + "eval_samples_per_second": 2.551, + "eval_steps_per_second": 2.551, + "step": 7047 + }, + { + "epoch": 27.000229885057472, + "grad_norm": 0.0024699727073311806, + "learning_rate": 5.108556832694764e-06, + "loss": 0.7769, + "step": 7050 + }, + { + "epoch": 27.000996168582375, + "grad_norm": 0.002219507237896323, + "learning_rate": 5.100042571306939e-06, + "loss": 1.7644, + "step": 7060 + }, + { + "epoch": 27.00176245210728, + "grad_norm": 25.39436149597168, + "learning_rate": 5.091528309919115e-06, + "loss": 0.5427, + "step": 7070 + }, + { + "epoch": 27.002528735632183, + "grad_norm": 0.010430865921080112, + "learning_rate": 5.083014048531291e-06, + "loss": 0.0014, + "step": 7080 + }, + { + "epoch": 27.00329501915709, + "grad_norm": 0.0023462825920432806, + "learning_rate": 5.074499787143465e-06, + "loss": 1.0208, + "step": 7090 + }, + { + "epoch": 27.004061302681993, + "grad_norm": 0.0032093566842377186, + "learning_rate": 5.065985525755641e-06, + "loss": 0.0013, + "step": 7100 + }, + { + "epoch": 27.004827586206897, + "grad_norm": 0.0018312092870473862, + "learning_rate": 5.057471264367817e-06, + "loss": 0.0014, + "step": 7110 + }, + { + "epoch": 27.0055938697318, + "grad_norm": 0.003123511793091893, + "learning_rate": 5.048957002979992e-06, + "loss": 0.0012, + "step": 7120 + }, + { + "epoch": 27.006360153256704, + "grad_norm": 0.007787426933646202, + "learning_rate": 5.0404427415921675e-06, + "loss": 0.5561, + "step": 7130 + }, + { + "epoch": 27.007126436781608, + "grad_norm": 0.00325384340249002, + "learning_rate": 5.0319284802043425e-06, + "loss": 0.0646, + "step": 7140 + }, + { + "epoch": 27.007892720306515, + "grad_norm": 0.3762580454349518, + "learning_rate": 5.023414218816518e-06, + "loss": 2.3582, + "step": 7150 + }, + { + "epoch": 27.00865900383142, + "grad_norm": 0.07755059003829956, + "learning_rate": 5.014899957428694e-06, + "loss": 1.0152, + "step": 7160 + }, + { + "epoch": 27.009425287356322, + "grad_norm": 0.008119486272335052, + "learning_rate": 5.006385696040868e-06, + "loss": 0.0018, + "step": 7170 + }, + { + "epoch": 27.010191570881226, + "grad_norm": 0.11317072808742523, + "learning_rate": 4.997871434653044e-06, + "loss": 1.1145, + "step": 7180 + }, + { + 
"epoch": 27.01095785440613, + "grad_norm": 0.007997841574251652, + "learning_rate": 4.98935717326522e-06, + "loss": 0.0019, + "step": 7190 + }, + { + "epoch": 27.011724137931033, + "grad_norm": 0.022566286846995354, + "learning_rate": 4.980842911877395e-06, + "loss": 0.553, + "step": 7200 + }, + { + "epoch": 27.01249042145594, + "grad_norm": 0.22851060330867767, + "learning_rate": 4.972328650489571e-06, + "loss": 1.0581, + "step": 7210 + }, + { + "epoch": 27.013256704980844, + "grad_norm": 0.014644362963736057, + "learning_rate": 4.963814389101746e-06, + "loss": 0.0068, + "step": 7220 + }, + { + "epoch": 27.014022988505747, + "grad_norm": 0.08528231084346771, + "learning_rate": 4.955300127713921e-06, + "loss": 0.9701, + "step": 7230 + }, + { + "epoch": 27.01478927203065, + "grad_norm": 0.011525926180183887, + "learning_rate": 4.946785866326097e-06, + "loss": 0.0043, + "step": 7240 + }, + { + "epoch": 27.015555555555554, + "grad_norm": 612.269775390625, + "learning_rate": 4.938271604938272e-06, + "loss": 0.3909, + "step": 7250 + }, + { + "epoch": 27.016321839080458, + "grad_norm": 0.0028381366282701492, + "learning_rate": 4.9297573435504475e-06, + "loss": 0.0016, + "step": 7260 + }, + { + "epoch": 27.017088122605365, + "grad_norm": 0.019702916964888573, + "learning_rate": 4.9212430821626225e-06, + "loss": 0.9995, + "step": 7270 + }, + { + "epoch": 27.01785440613027, + "grad_norm": 0.05296581611037254, + "learning_rate": 4.912728820774798e-06, + "loss": 0.0031, + "step": 7280 + }, + { + "epoch": 27.018620689655172, + "grad_norm": 0.09188272058963776, + "learning_rate": 4.904214559386973e-06, + "loss": 0.5764, + "step": 7290 + }, + { + "epoch": 27.019386973180076, + "grad_norm": 0.07929536700248718, + "learning_rate": 4.895700297999149e-06, + "loss": 0.0022, + "step": 7300 + }, + { + "epoch": 27.02, + "eval_accuracy": 0.6888888888888889, + "eval_loss": 1.9530377388000488, + "eval_runtime": 17.1126, + "eval_samples_per_second": 2.63, + "eval_steps_per_second": 2.63, + "step": 7308 + }, + { + "epoch": 28.00015325670498, + "grad_norm": 0.0030589948873966932, + "learning_rate": 4.887186036611324e-06, + "loss": 0.4899, + "step": 7310 + }, + { + "epoch": 28.000919540229884, + "grad_norm": 0.007198402192443609, + "learning_rate": 4.8786717752235e-06, + "loss": 0.6669, + "step": 7320 + }, + { + "epoch": 28.00168582375479, + "grad_norm": 0.10107333958148956, + "learning_rate": 4.870157513835675e-06, + "loss": 0.002, + "step": 7330 + }, + { + "epoch": 28.002452107279694, + "grad_norm": 0.0018596283625811338, + "learning_rate": 4.861643252447851e-06, + "loss": 1.2037, + "step": 7340 + }, + { + "epoch": 28.003218390804598, + "grad_norm": 0.0019081667996942997, + "learning_rate": 4.853128991060026e-06, + "loss": 0.0018, + "step": 7350 + }, + { + "epoch": 28.0039846743295, + "grad_norm": 0.05873342975974083, + "learning_rate": 4.844614729672202e-06, + "loss": 0.0035, + "step": 7360 + }, + { + "epoch": 28.004750957854405, + "grad_norm": 0.0041256314143538475, + "learning_rate": 4.836100468284377e-06, + "loss": 0.3685, + "step": 7370 + }, + { + "epoch": 28.00551724137931, + "grad_norm": 0.07202044129371643, + "learning_rate": 4.8275862068965525e-06, + "loss": 0.6319, + "step": 7380 + }, + { + "epoch": 28.006283524904216, + "grad_norm": 0.045292168855667114, + "learning_rate": 4.8190719455087275e-06, + "loss": 1.3031, + "step": 7390 + }, + { + "epoch": 28.00704980842912, + "grad_norm": 0.053444087505340576, + "learning_rate": 4.8105576841209025e-06, + "loss": 0.5637, + "step": 7400 + }, + { + "epoch": 
28.007816091954023, + "grad_norm": 0.002044421387836337, + "learning_rate": 4.802043422733078e-06, + "loss": 0.5013, + "step": 7410 + }, + { + "epoch": 28.008582375478927, + "grad_norm": 23.52623176574707, + "learning_rate": 4.793529161345254e-06, + "loss": 0.5985, + "step": 7420 + }, + { + "epoch": 28.00934865900383, + "grad_norm": 0.09604190289974213, + "learning_rate": 4.785014899957429e-06, + "loss": 0.4264, + "step": 7430 + }, + { + "epoch": 28.010114942528734, + "grad_norm": 0.003636513603851199, + "learning_rate": 4.776500638569604e-06, + "loss": 0.6638, + "step": 7440 + }, + { + "epoch": 28.01088122605364, + "grad_norm": 0.2937626242637634, + "learning_rate": 4.76798637718178e-06, + "loss": 0.0016, + "step": 7450 + }, + { + "epoch": 28.011647509578545, + "grad_norm": 0.05882389470934868, + "learning_rate": 4.759472115793956e-06, + "loss": 0.0489, + "step": 7460 + }, + { + "epoch": 28.01241379310345, + "grad_norm": 0.005052788648754358, + "learning_rate": 4.750957854406131e-06, + "loss": 0.6296, + "step": 7470 + }, + { + "epoch": 28.013180076628352, + "grad_norm": 0.005214430391788483, + "learning_rate": 4.742443593018306e-06, + "loss": 1.0577, + "step": 7480 + }, + { + "epoch": 28.013946360153255, + "grad_norm": 621.5762939453125, + "learning_rate": 4.733929331630482e-06, + "loss": 0.4731, + "step": 7490 + }, + { + "epoch": 28.014712643678163, + "grad_norm": 0.008719748817384243, + "learning_rate": 4.7254150702426575e-06, + "loss": 0.0111, + "step": 7500 + }, + { + "epoch": 28.015478927203066, + "grad_norm": 0.19455721974372864, + "learning_rate": 4.7169008088548325e-06, + "loss": 0.0008, + "step": 7510 + }, + { + "epoch": 28.01624521072797, + "grad_norm": 0.0063719660975039005, + "learning_rate": 4.7083865474670075e-06, + "loss": 0.4166, + "step": 7520 + }, + { + "epoch": 28.017011494252873, + "grad_norm": 0.04965566471219063, + "learning_rate": 4.6998722860791825e-06, + "loss": 1.2705, + "step": 7530 + }, + { + "epoch": 28.017777777777777, + "grad_norm": 0.08509698510169983, + "learning_rate": 4.691358024691358e-06, + "loss": 0.6151, + "step": 7540 + }, + { + "epoch": 28.01854406130268, + "grad_norm": 0.06898875534534454, + "learning_rate": 4.682843763303534e-06, + "loss": 0.0018, + "step": 7550 + }, + { + "epoch": 28.019310344827588, + "grad_norm": 0.005845922045409679, + "learning_rate": 4.674329501915709e-06, + "loss": 1.1481, + "step": 7560 + }, + { + "epoch": 28.02, + "eval_accuracy": 0.7111111111111111, + "eval_loss": 1.433872938156128, + "eval_runtime": 17.3187, + "eval_samples_per_second": 2.598, + "eval_steps_per_second": 2.598, + "step": 7569 + }, + { + "epoch": 29.000076628352492, + "grad_norm": 0.004079683218151331, + "learning_rate": 4.665815240527884e-06, + "loss": 1.0156, + "step": 7570 + }, + { + "epoch": 29.000842911877395, + "grad_norm": 0.15890641510486603, + "learning_rate": 4.65730097914006e-06, + "loss": 0.2197, + "step": 7580 + }, + { + "epoch": 29.0016091954023, + "grad_norm": 0.06670423597097397, + "learning_rate": 4.648786717752236e-06, + "loss": 0.7569, + "step": 7590 + }, + { + "epoch": 29.002375478927203, + "grad_norm": 0.0019463635981082916, + "learning_rate": 4.640272456364411e-06, + "loss": 0.4151, + "step": 7600 + }, + { + "epoch": 29.003141762452106, + "grad_norm": 192.4460906982422, + "learning_rate": 4.631758194976586e-06, + "loss": 1.1192, + "step": 7610 + }, + { + "epoch": 29.00390804597701, + "grad_norm": 0.0057052550837397575, + "learning_rate": 4.623243933588762e-06, + "loss": 0.0022, + "step": 7620 + }, + { + "epoch": 29.004674329501917, 
+ "grad_norm": 0.0029543922282755375, + "learning_rate": 4.6147296722009375e-06, + "loss": 0.5465, + "step": 7630 + }, + { + "epoch": 29.00544061302682, + "grad_norm": 0.042063795030117035, + "learning_rate": 4.6062154108131125e-06, + "loss": 0.0017, + "step": 7640 + }, + { + "epoch": 29.006206896551724, + "grad_norm": 0.015184939838945866, + "learning_rate": 4.5977011494252875e-06, + "loss": 1.0624, + "step": 7650 + }, + { + "epoch": 29.006973180076628, + "grad_norm": 0.21465983986854553, + "learning_rate": 4.589186888037463e-06, + "loss": 0.0016, + "step": 7660 + }, + { + "epoch": 29.00773946360153, + "grad_norm": 0.0015866790199652314, + "learning_rate": 4.580672626649638e-06, + "loss": 0.5416, + "step": 7670 + }, + { + "epoch": 29.00850574712644, + "grad_norm": 0.03185751661658287, + "learning_rate": 4.572158365261814e-06, + "loss": 0.001, + "step": 7680 + }, + { + "epoch": 29.009272030651342, + "grad_norm": 0.01196732372045517, + "learning_rate": 4.563644103873989e-06, + "loss": 0.6732, + "step": 7690 + }, + { + "epoch": 29.010038314176246, + "grad_norm": 0.1618071347475052, + "learning_rate": 4.555129842486164e-06, + "loss": 0.3781, + "step": 7700 + }, + { + "epoch": 29.01080459770115, + "grad_norm": 0.003336111782118678, + "learning_rate": 4.54661558109834e-06, + "loss": 0.2088, + "step": 7710 + }, + { + "epoch": 29.011570881226053, + "grad_norm": 0.002319538267329335, + "learning_rate": 4.538101319710516e-06, + "loss": 0.478, + "step": 7720 + }, + { + "epoch": 29.012337164750956, + "grad_norm": 0.013910328038036823, + "learning_rate": 4.529587058322691e-06, + "loss": 0.0022, + "step": 7730 + }, + { + "epoch": 29.013103448275864, + "grad_norm": 0.005334469955414534, + "learning_rate": 4.521072796934866e-06, + "loss": 0.4131, + "step": 7740 + }, + { + "epoch": 29.013869731800767, + "grad_norm": 0.15581867098808289, + "learning_rate": 4.512558535547042e-06, + "loss": 0.0041, + "step": 7750 + }, + { + "epoch": 29.01463601532567, + "grad_norm": 0.06605182588100433, + "learning_rate": 4.5040442741592175e-06, + "loss": 0.0019, + "step": 7760 + }, + { + "epoch": 29.015402298850574, + "grad_norm": 0.03662487491965294, + "learning_rate": 4.4955300127713925e-06, + "loss": 0.0011, + "step": 7770 + }, + { + "epoch": 29.016168582375478, + "grad_norm": 0.012340080924332142, + "learning_rate": 4.4870157513835675e-06, + "loss": 0.0027, + "step": 7780 + }, + { + "epoch": 29.01693486590038, + "grad_norm": 0.0017297010635957122, + "learning_rate": 4.478501489995743e-06, + "loss": 0.0017, + "step": 7790 + }, + { + "epoch": 29.01770114942529, + "grad_norm": 0.0012412663782015443, + "learning_rate": 4.469987228607919e-06, + "loss": 0.6629, + "step": 7800 + }, + { + "epoch": 29.018467432950192, + "grad_norm": 0.004831795115023851, + "learning_rate": 4.461472967220094e-06, + "loss": 0.5858, + "step": 7810 + }, + { + "epoch": 29.019233716475096, + "grad_norm": 0.22104321420192719, + "learning_rate": 4.452958705832269e-06, + "loss": 0.5705, + "step": 7820 + }, + { + "epoch": 29.02, + "grad_norm": 0.0033627126831561327, + "learning_rate": 4.444444444444444e-06, + "loss": 0.8332, + "step": 7830 + }, + { + "epoch": 29.02, + "eval_accuracy": 0.6666666666666666, + "eval_loss": 2.084125518798828, + "eval_runtime": 17.0453, + "eval_samples_per_second": 2.64, + "eval_steps_per_second": 2.64, + "step": 7830 + }, + { + "epoch": 30.000766283524904, + "grad_norm": 0.0024623959325253963, + "learning_rate": 4.43593018305662e-06, + "loss": 0.8405, + "step": 7840 + }, + { + "epoch": 30.001532567049807, + "grad_norm": 
0.07097584009170532, + "learning_rate": 4.427415921668796e-06, + "loss": 0.5673, + "step": 7850 + }, + { + "epoch": 30.002298850574714, + "grad_norm": 188.15597534179688, + "learning_rate": 4.418901660280971e-06, + "loss": 1.0111, + "step": 7860 + }, + { + "epoch": 30.003065134099618, + "grad_norm": 0.4013839066028595, + "learning_rate": 4.410387398893146e-06, + "loss": 0.0009, + "step": 7870 + }, + { + "epoch": 30.00383141762452, + "grad_norm": 1.3998527526855469, + "learning_rate": 4.401873137505322e-06, + "loss": 0.4193, + "step": 7880 + }, + { + "epoch": 30.004597701149425, + "grad_norm": 0.08924932032823563, + "learning_rate": 4.3933588761174975e-06, + "loss": 0.0012, + "step": 7890 + }, + { + "epoch": 30.00536398467433, + "grad_norm": 0.012195151299238205, + "learning_rate": 4.3848446147296725e-06, + "loss": 0.5731, + "step": 7900 + }, + { + "epoch": 30.006130268199232, + "grad_norm": 1290.56103515625, + "learning_rate": 4.3763303533418475e-06, + "loss": 0.5276, + "step": 7910 + }, + { + "epoch": 30.00689655172414, + "grad_norm": 0.0538308285176754, + "learning_rate": 4.367816091954023e-06, + "loss": 0.5861, + "step": 7920 + }, + { + "epoch": 30.007662835249043, + "grad_norm": 0.06133956462144852, + "learning_rate": 4.359301830566199e-06, + "loss": 0.002, + "step": 7930 + }, + { + "epoch": 30.008429118773947, + "grad_norm": 0.06996101140975952, + "learning_rate": 4.350787569178374e-06, + "loss": 0.4464, + "step": 7940 + }, + { + "epoch": 30.00919540229885, + "grad_norm": 0.0770382285118103, + "learning_rate": 4.342273307790549e-06, + "loss": 0.6284, + "step": 7950 + }, + { + "epoch": 30.009961685823754, + "grad_norm": 0.08282287418842316, + "learning_rate": 4.333759046402725e-06, + "loss": 0.6054, + "step": 7960 + }, + { + "epoch": 30.010727969348657, + "grad_norm": 2.7787227630615234, + "learning_rate": 4.325244785014901e-06, + "loss": 0.5217, + "step": 7970 + }, + { + "epoch": 30.011494252873565, + "grad_norm": 0.1345757693052292, + "learning_rate": 4.316730523627076e-06, + "loss": 0.0027, + "step": 7980 + }, + { + "epoch": 30.01226053639847, + "grad_norm": 0.001271132379770279, + "learning_rate": 4.308216262239251e-06, + "loss": 1.4019, + "step": 7990 + }, + { + "epoch": 30.013026819923372, + "grad_norm": 0.15066084265708923, + "learning_rate": 4.299702000851427e-06, + "loss": 0.0512, + "step": 8000 + }, + { + "epoch": 30.013793103448275, + "grad_norm": 0.004003384616225958, + "learning_rate": 4.291187739463602e-06, + "loss": 0.462, + "step": 8010 + }, + { + "epoch": 30.01455938697318, + "grad_norm": 0.15488368272781372, + "learning_rate": 4.2826734780757775e-06, + "loss": 0.423, + "step": 8020 + }, + { + "epoch": 30.015325670498083, + "grad_norm": 0.001451969612389803, + "learning_rate": 4.2741592166879525e-06, + "loss": 0.0016, + "step": 8030 + }, + { + "epoch": 30.01609195402299, + "grad_norm": 0.2645537257194519, + "learning_rate": 4.2656449553001275e-06, + "loss": 0.5445, + "step": 8040 + }, + { + "epoch": 30.016858237547893, + "grad_norm": 0.02811589650809765, + "learning_rate": 4.257130693912303e-06, + "loss": 0.0682, + "step": 8050 + }, + { + "epoch": 30.017624521072797, + "grad_norm": 0.027506444603204727, + "learning_rate": 4.248616432524479e-06, + "loss": 0.0021, + "step": 8060 + }, + { + "epoch": 30.0183908045977, + "grad_norm": 0.013095653615891933, + "learning_rate": 4.240102171136654e-06, + "loss": 0.5415, + "step": 8070 + }, + { + "epoch": 30.019157088122604, + "grad_norm": 130.42623901367188, + "learning_rate": 4.231587909748829e-06, + "loss": 1.7849, + "step": 
8080 + }, + { + "epoch": 30.01992337164751, + "grad_norm": 0.0009240119252353907, + "learning_rate": 4.223073648361005e-06, + "loss": 0.9692, + "step": 8090 + }, + { + "epoch": 30.02, + "eval_accuracy": 0.7333333333333333, + "eval_loss": 1.4363021850585938, + "eval_runtime": 17.1593, + "eval_samples_per_second": 2.622, + "eval_steps_per_second": 2.622, + "step": 8091 + }, + { + "epoch": 31.000689655172415, + "grad_norm": 96.31036376953125, + "learning_rate": 4.214559386973181e-06, + "loss": 0.0125, + "step": 8100 + }, + { + "epoch": 31.00145593869732, + "grad_norm": 0.002800540067255497, + "learning_rate": 4.206045125585356e-06, + "loss": 0.0023, + "step": 8110 + }, + { + "epoch": 31.002222222222223, + "grad_norm": 0.002380869584158063, + "learning_rate": 4.197530864197531e-06, + "loss": 0.0021, + "step": 8120 + }, + { + "epoch": 31.002988505747126, + "grad_norm": 0.2211143970489502, + "learning_rate": 4.189016602809707e-06, + "loss": 0.6557, + "step": 8130 + }, + { + "epoch": 31.00375478927203, + "grad_norm": 169.05653381347656, + "learning_rate": 4.180502341421882e-06, + "loss": 0.3814, + "step": 8140 + }, + { + "epoch": 31.004521072796933, + "grad_norm": 0.36818021535873413, + "learning_rate": 4.1719880800340575e-06, + "loss": 0.006, + "step": 8150 + }, + { + "epoch": 31.00528735632184, + "grad_norm": 0.0029586083255708218, + "learning_rate": 4.1634738186462325e-06, + "loss": 0.4719, + "step": 8160 + }, + { + "epoch": 31.006053639846744, + "grad_norm": 0.0009810187621042132, + "learning_rate": 4.154959557258408e-06, + "loss": 0.0016, + "step": 8170 + }, + { + "epoch": 31.006819923371648, + "grad_norm": 0.08530862629413605, + "learning_rate": 4.146445295870583e-06, + "loss": 0.0007, + "step": 8180 + }, + { + "epoch": 31.00758620689655, + "grad_norm": 0.01836472563445568, + "learning_rate": 4.137931034482759e-06, + "loss": 0.0013, + "step": 8190 + }, + { + "epoch": 31.008352490421455, + "grad_norm": 0.0029333005659282207, + "learning_rate": 4.129416773094934e-06, + "loss": 0.0018, + "step": 8200 + }, + { + "epoch": 31.00911877394636, + "grad_norm": 204.29147338867188, + "learning_rate": 4.12090251170711e-06, + "loss": 1.7055, + "step": 8210 + }, + { + "epoch": 31.009885057471266, + "grad_norm": 0.0009609694825485349, + "learning_rate": 4.112388250319285e-06, + "loss": 0.5827, + "step": 8220 + }, + { + "epoch": 31.01065134099617, + "grad_norm": 1.4072681665420532, + "learning_rate": 4.103873988931461e-06, + "loss": 1.468, + "step": 8230 + }, + { + "epoch": 31.011417624521073, + "grad_norm": 0.007886682637035847, + "learning_rate": 4.095359727543636e-06, + "loss": 1.0163, + "step": 8240 + }, + { + "epoch": 31.012183908045976, + "grad_norm": 0.0027065242175012827, + "learning_rate": 4.086845466155812e-06, + "loss": 0.5125, + "step": 8250 + }, + { + "epoch": 31.01295019157088, + "grad_norm": 0.005601299926638603, + "learning_rate": 4.078331204767987e-06, + "loss": 0.0014, + "step": 8260 + }, + { + "epoch": 31.013716475095784, + "grad_norm": 0.11666177213191986, + "learning_rate": 4.0698169433801625e-06, + "loss": 0.002, + "step": 8270 + }, + { + "epoch": 31.01448275862069, + "grad_norm": 0.12932337820529938, + "learning_rate": 4.0613026819923375e-06, + "loss": 0.4866, + "step": 8280 + }, + { + "epoch": 31.015249042145594, + "grad_norm": 0.0018837143434211612, + "learning_rate": 4.052788420604513e-06, + "loss": 0.0025, + "step": 8290 + }, + { + "epoch": 31.016015325670498, + "grad_norm": 0.209938645362854, + "learning_rate": 4.044274159216688e-06, + "loss": 0.0042, + "step": 8300 + }, + { + 
"epoch": 31.0167816091954, + "grad_norm": 0.0014070343459025025, + "learning_rate": 4.035759897828863e-06, + "loss": 0.2825, + "step": 8310 + }, + { + "epoch": 31.017547892720305, + "grad_norm": 0.05112835019826889, + "learning_rate": 4.027245636441039e-06, + "loss": 0.5729, + "step": 8320 + }, + { + "epoch": 31.018314176245212, + "grad_norm": 0.059866052120923996, + "learning_rate": 4.018731375053214e-06, + "loss": 0.0009, + "step": 8330 + }, + { + "epoch": 31.019080459770116, + "grad_norm": 0.0009642989607527852, + "learning_rate": 4.01021711366539e-06, + "loss": 0.6588, + "step": 8340 + }, + { + "epoch": 31.01984674329502, + "grad_norm": 0.06982675939798355, + "learning_rate": 4.001702852277565e-06, + "loss": 0.0095, + "step": 8350 + }, + { + "epoch": 31.02, + "eval_accuracy": 0.7777777777777778, + "eval_loss": 1.6086761951446533, + "eval_runtime": 18.2248, + "eval_samples_per_second": 2.469, + "eval_steps_per_second": 2.469, + "step": 8352 + }, + { + "epoch": 32.00061302681992, + "grad_norm": 0.05028977617621422, + "learning_rate": 3.993188590889741e-06, + "loss": 1.2057, + "step": 8360 + }, + { + "epoch": 32.00137931034483, + "grad_norm": 0.06176095828413963, + "learning_rate": 3.984674329501916e-06, + "loss": 0.0006, + "step": 8370 + }, + { + "epoch": 32.002145593869734, + "grad_norm": 0.1036386638879776, + "learning_rate": 3.976160068114092e-06, + "loss": 0.7547, + "step": 8380 + }, + { + "epoch": 32.00291187739464, + "grad_norm": 0.06603002548217773, + "learning_rate": 3.967645806726267e-06, + "loss": 0.0676, + "step": 8390 + }, + { + "epoch": 32.00367816091954, + "grad_norm": 0.001678028260357678, + "learning_rate": 3.9591315453384425e-06, + "loss": 0.0003, + "step": 8400 + }, + { + "epoch": 32.004444444444445, + "grad_norm": 0.1354062259197235, + "learning_rate": 3.9506172839506175e-06, + "loss": 0.3922, + "step": 8410 + }, + { + "epoch": 32.00521072796935, + "grad_norm": 0.062066033482551575, + "learning_rate": 3.942103022562793e-06, + "loss": 0.0282, + "step": 8420 + }, + { + "epoch": 32.00597701149425, + "grad_norm": 0.11745186895132065, + "learning_rate": 3.933588761174968e-06, + "loss": 0.001, + "step": 8430 + }, + { + "epoch": 32.006743295019156, + "grad_norm": 377.28131103515625, + "learning_rate": 3.925074499787143e-06, + "loss": 0.6375, + "step": 8440 + }, + { + "epoch": 32.00750957854406, + "grad_norm": 0.004686414264142513, + "learning_rate": 3.916560238399319e-06, + "loss": 0.6466, + "step": 8450 + }, + { + "epoch": 32.00827586206896, + "grad_norm": 0.3824837803840637, + "learning_rate": 3.908045977011495e-06, + "loss": 0.8895, + "step": 8460 + }, + { + "epoch": 32.00904214559387, + "grad_norm": 0.02006428875029087, + "learning_rate": 3.89953171562367e-06, + "loss": 0.0004, + "step": 8470 + }, + { + "epoch": 32.00980842911878, + "grad_norm": 15.476459503173828, + "learning_rate": 3.891017454235845e-06, + "loss": 0.0031, + "step": 8480 + }, + { + "epoch": 32.01057471264368, + "grad_norm": 0.06542538106441498, + "learning_rate": 3.882503192848021e-06, + "loss": 0.5439, + "step": 8490 + }, + { + "epoch": 32.011340996168585, + "grad_norm": 0.17155639827251434, + "learning_rate": 3.873988931460197e-06, + "loss": 0.8883, + "step": 8500 + }, + { + "epoch": 32.01210727969349, + "grad_norm": 0.0681372657418251, + "learning_rate": 3.865474670072372e-06, + "loss": 0.5491, + "step": 8510 + }, + { + "epoch": 32.01287356321839, + "grad_norm": 0.0009823060827329755, + "learning_rate": 3.856960408684547e-06, + "loss": 0.5316, + "step": 8520 + }, + { + "epoch": 32.013639846743295, + 
"grad_norm": 0.0013014402939006686, + "learning_rate": 3.8484461472967225e-06, + "loss": 0.0013, + "step": 8530 + }, + { + "epoch": 32.0144061302682, + "grad_norm": 0.14377635717391968, + "learning_rate": 3.839931885908898e-06, + "loss": 0.5197, + "step": 8540 + }, + { + "epoch": 32.0151724137931, + "grad_norm": 762.5878295898438, + "learning_rate": 3.831417624521073e-06, + "loss": 0.43, + "step": 8550 + }, + { + "epoch": 32.015938697318006, + "grad_norm": 0.003731831442564726, + "learning_rate": 3.822903363133248e-06, + "loss": 0.0012, + "step": 8560 + }, + { + "epoch": 32.01670498084291, + "grad_norm": 0.09580423682928085, + "learning_rate": 3.8143891017454237e-06, + "loss": 0.0492, + "step": 8570 + }, + { + "epoch": 32.01747126436781, + "grad_norm": 0.0034091139677911997, + "learning_rate": 3.805874840357599e-06, + "loss": 1.4227, + "step": 8580 + }, + { + "epoch": 32.01823754789272, + "grad_norm": 0.12071370333433151, + "learning_rate": 3.797360578969775e-06, + "loss": 1.1441, + "step": 8590 + }, + { + "epoch": 32.01900383141763, + "grad_norm": 0.0012643920490518212, + "learning_rate": 3.78884631758195e-06, + "loss": 0.0012, + "step": 8600 + }, + { + "epoch": 32.01977011494253, + "grad_norm": 1.8759658336639404, + "learning_rate": 3.7803320561941254e-06, + "loss": 0.0007, + "step": 8610 + }, + { + "epoch": 32.02, + "eval_accuracy": 0.8222222222222222, + "eval_loss": 1.1478798389434814, + "eval_runtime": 18.3025, + "eval_samples_per_second": 2.459, + "eval_steps_per_second": 2.459, + "step": 8613 + }, + { + "epoch": 33.00053639846743, + "grad_norm": 0.37706437706947327, + "learning_rate": 3.7718177948063004e-06, + "loss": 0.0077, + "step": 8620 + }, + { + "epoch": 33.001302681992335, + "grad_norm": 0.33746206760406494, + "learning_rate": 3.7633035334184762e-06, + "loss": 0.0025, + "step": 8630 + }, + { + "epoch": 33.00206896551724, + "grad_norm": 3.6422576904296875, + "learning_rate": 3.7547892720306517e-06, + "loss": 0.0016, + "step": 8640 + }, + { + "epoch": 33.00283524904214, + "grad_norm": 0.0012748910812661052, + "learning_rate": 3.746275010642827e-06, + "loss": 0.5986, + "step": 8650 + }, + { + "epoch": 33.00360153256705, + "grad_norm": 0.002419630531221628, + "learning_rate": 3.737760749255002e-06, + "loss": 2.2494, + "step": 8660 + }, + { + "epoch": 33.00436781609196, + "grad_norm": 0.0009685585973784328, + "learning_rate": 3.729246487867178e-06, + "loss": 0.0017, + "step": 8670 + }, + { + "epoch": 33.00513409961686, + "grad_norm": 0.06901658326387405, + "learning_rate": 3.7207322264793533e-06, + "loss": 0.6327, + "step": 8680 + }, + { + "epoch": 33.005900383141764, + "grad_norm": 0.0039053920190781355, + "learning_rate": 3.7122179650915287e-06, + "loss": 0.0011, + "step": 8690 + }, + { + "epoch": 33.00666666666667, + "grad_norm": 0.0013945258688181639, + "learning_rate": 3.7037037037037037e-06, + "loss": 0.0004, + "step": 8700 + }, + { + "epoch": 33.00743295019157, + "grad_norm": 0.0010772545356303453, + "learning_rate": 3.6951894423158796e-06, + "loss": 0.0017, + "step": 8710 + }, + { + "epoch": 33.008199233716475, + "grad_norm": 0.002986000385135412, + "learning_rate": 3.686675180928055e-06, + "loss": 0.5708, + "step": 8720 + }, + { + "epoch": 33.00896551724138, + "grad_norm": 59.92686080932617, + "learning_rate": 3.67816091954023e-06, + "loss": 0.5376, + "step": 8730 + }, + { + "epoch": 33.00973180076628, + "grad_norm": 0.0016347280470654368, + "learning_rate": 3.6696466581524054e-06, + "loss": 0.0016, + "step": 8740 + }, + { + "epoch": 33.010498084291186, + "grad_norm": 
0.007008823566138744, + "learning_rate": 3.6611323967645812e-06, + "loss": 0.0053, + "step": 8750 + }, + { + "epoch": 33.01126436781609, + "grad_norm": 0.0015044922474771738, + "learning_rate": 3.6526181353767567e-06, + "loss": 0.0014, + "step": 8760 + }, + { + "epoch": 33.01203065134099, + "grad_norm": 0.09846771508455276, + "learning_rate": 3.6441038739889317e-06, + "loss": 0.0016, + "step": 8770 + }, + { + "epoch": 33.012796934865904, + "grad_norm": 0.009564564563333988, + "learning_rate": 3.635589612601107e-06, + "loss": 1.1216, + "step": 8780 + }, + { + "epoch": 33.01356321839081, + "grad_norm": 0.0015884574968367815, + "learning_rate": 3.627075351213283e-06, + "loss": 0.245, + "step": 8790 + }, + { + "epoch": 33.01432950191571, + "grad_norm": 0.0010513915913179517, + "learning_rate": 3.618561089825458e-06, + "loss": 0.8205, + "step": 8800 + }, + { + "epoch": 33.015095785440614, + "grad_norm": 0.0026978852692991495, + "learning_rate": 3.6100468284376333e-06, + "loss": 0.3304, + "step": 8810 + }, + { + "epoch": 33.01586206896552, + "grad_norm": 0.001224369159899652, + "learning_rate": 3.6015325670498087e-06, + "loss": 0.5547, + "step": 8820 + }, + { + "epoch": 33.01662835249042, + "grad_norm": 44.857872009277344, + "learning_rate": 3.5930183056619837e-06, + "loss": 0.8618, + "step": 8830 + }, + { + "epoch": 33.017394636015325, + "grad_norm": 0.02402423694729805, + "learning_rate": 3.5845040442741596e-06, + "loss": 0.0412, + "step": 8840 + }, + { + "epoch": 33.01816091954023, + "grad_norm": 0.011330818757414818, + "learning_rate": 3.575989782886335e-06, + "loss": 0.0006, + "step": 8850 + }, + { + "epoch": 33.01892720306513, + "grad_norm": 41.491024017333984, + "learning_rate": 3.56747552149851e-06, + "loss": 0.5524, + "step": 8860 + }, + { + "epoch": 33.019693486590036, + "grad_norm": 1.906652569770813, + "learning_rate": 3.5589612601106854e-06, + "loss": 0.534, + "step": 8870 + }, + { + "epoch": 33.02, + "eval_accuracy": 0.6666666666666666, + "eval_loss": 1.7736133337020874, + "eval_runtime": 17.1616, + "eval_samples_per_second": 2.622, + "eval_steps_per_second": 2.622, + "step": 8874 + }, + { + "epoch": 34.000459770114944, + "grad_norm": 0.11188524216413498, + "learning_rate": 3.5504469987228612e-06, + "loss": 0.0009, + "step": 8880 + }, + { + "epoch": 34.00122605363985, + "grad_norm": 0.005530110560357571, + "learning_rate": 3.5419327373350367e-06, + "loss": 0.0017, + "step": 8890 + }, + { + "epoch": 34.00199233716475, + "grad_norm": 0.000945807492826134, + "learning_rate": 3.5334184759472117e-06, + "loss": 1.0962, + "step": 8900 + }, + { + "epoch": 34.002758620689654, + "grad_norm": 0.051899876445531845, + "learning_rate": 3.524904214559387e-06, + "loss": 0.0103, + "step": 8910 + }, + { + "epoch": 34.00352490421456, + "grad_norm": 0.0009481986053287983, + "learning_rate": 3.516389953171563e-06, + "loss": 0.0034, + "step": 8920 + }, + { + "epoch": 34.00429118773946, + "grad_norm": 0.0034817762207239866, + "learning_rate": 3.507875691783738e-06, + "loss": 0.6872, + "step": 8930 + }, + { + "epoch": 34.005057471264365, + "grad_norm": 962.0402221679688, + "learning_rate": 3.4993614303959133e-06, + "loss": 0.4649, + "step": 8940 + }, + { + "epoch": 34.00582375478927, + "grad_norm": 0.04274230822920799, + "learning_rate": 3.4908471690080887e-06, + "loss": 0.0003, + "step": 8950 + }, + { + "epoch": 34.00659003831418, + "grad_norm": 0.009698279201984406, + "learning_rate": 3.4823329076202646e-06, + "loss": 0.0013, + "step": 8960 + }, + { + "epoch": 34.00735632183908, + "grad_norm": 
0.00844807829707861, + "learning_rate": 3.4738186462324396e-06, + "loss": 0.002, + "step": 8970 + }, + { + "epoch": 34.00812260536399, + "grad_norm": 0.002243434078991413, + "learning_rate": 3.465304384844615e-06, + "loss": 0.5904, + "step": 8980 + }, + { + "epoch": 34.00888888888889, + "grad_norm": 0.002584859263151884, + "learning_rate": 3.4567901234567904e-06, + "loss": 0.0012, + "step": 8990 + }, + { + "epoch": 34.009655172413794, + "grad_norm": 0.20528066158294678, + "learning_rate": 3.448275862068966e-06, + "loss": 1.1604, + "step": 9000 + }, + { + "epoch": 34.0104214559387, + "grad_norm": 0.0014755720039829612, + "learning_rate": 3.4397616006811412e-06, + "loss": 0.0023, + "step": 9010 + }, + { + "epoch": 34.0111877394636, + "grad_norm": 0.08162973821163177, + "learning_rate": 3.4312473392933167e-06, + "loss": 0.775, + "step": 9020 + }, + { + "epoch": 34.011954022988505, + "grad_norm": 0.0007822496118023992, + "learning_rate": 3.4227330779054917e-06, + "loss": 0.6544, + "step": 9030 + }, + { + "epoch": 34.01272030651341, + "grad_norm": 0.1420706808567047, + "learning_rate": 3.4142188165176675e-06, + "loss": 0.4001, + "step": 9040 + }, + { + "epoch": 34.01348659003831, + "grad_norm": 0.0020095347426831722, + "learning_rate": 3.405704555129843e-06, + "loss": 0.0021, + "step": 9050 + }, + { + "epoch": 34.014252873563215, + "grad_norm": 0.1937248259782791, + "learning_rate": 3.3971902937420183e-06, + "loss": 0.6137, + "step": 9060 + }, + { + "epoch": 34.01501915708812, + "grad_norm": 0.0025739138945937157, + "learning_rate": 3.3886760323541933e-06, + "loss": 0.0009, + "step": 9070 + }, + { + "epoch": 34.01578544061303, + "grad_norm": 0.0011383603559806943, + "learning_rate": 3.380161770966369e-06, + "loss": 0.3318, + "step": 9080 + }, + { + "epoch": 34.01655172413793, + "grad_norm": 0.001992373261600733, + "learning_rate": 3.3716475095785446e-06, + "loss": 1.352, + "step": 9090 + }, + { + "epoch": 34.01731800766284, + "grad_norm": 0.04177727550268173, + "learning_rate": 3.3631332481907196e-06, + "loss": 0.0543, + "step": 9100 + }, + { + "epoch": 34.01808429118774, + "grad_norm": 0.002260725712403655, + "learning_rate": 3.354618986802895e-06, + "loss": 0.0791, + "step": 9110 + }, + { + "epoch": 34.018850574712644, + "grad_norm": 0.010179917328059673, + "learning_rate": 3.3461047254150704e-06, + "loss": 0.6124, + "step": 9120 + }, + { + "epoch": 34.01961685823755, + "grad_norm": 32.5108642578125, + "learning_rate": 3.3375904640272463e-06, + "loss": 1.149, + "step": 9130 + }, + { + "epoch": 34.02, + "eval_accuracy": 0.8, + "eval_loss": 1.3922863006591797, + "eval_runtime": 17.1415, + "eval_samples_per_second": 2.625, + "eval_steps_per_second": 2.625, + "step": 9135 + }, + { + "epoch": 35.000383141762455, + "grad_norm": 2.1568820476531982, + "learning_rate": 3.3290762026394212e-06, + "loss": 0.074, + "step": 9140 + }, + { + "epoch": 35.00114942528736, + "grad_norm": 0.07320680469274521, + "learning_rate": 3.3205619412515967e-06, + "loss": 0.0014, + "step": 9150 + }, + { + "epoch": 35.00191570881226, + "grad_norm": 0.0020217241253703833, + "learning_rate": 3.3120476798637717e-06, + "loss": 0.0007, + "step": 9160 + }, + { + "epoch": 35.002681992337166, + "grad_norm": 0.0010427357628941536, + "learning_rate": 3.3035334184759475e-06, + "loss": 1.0602, + "step": 9170 + }, + { + "epoch": 35.00344827586207, + "grad_norm": 0.00230971397832036, + "learning_rate": 3.295019157088123e-06, + "loss": 1.0503, + "step": 9180 + }, + { + "epoch": 35.00421455938697, + "grad_norm": 0.004158778116106987, + 
"learning_rate": 3.2865048957002983e-06, + "loss": 0.0013, + "step": 9190 + }, + { + "epoch": 35.00498084291188, + "grad_norm": 0.05161009356379509, + "learning_rate": 3.2779906343124733e-06, + "loss": 1.088, + "step": 9200 + }, + { + "epoch": 35.00574712643678, + "grad_norm": 0.046077948063611984, + "learning_rate": 3.269476372924649e-06, + "loss": 0.5326, + "step": 9210 + }, + { + "epoch": 35.006513409961684, + "grad_norm": 0.19523532688617706, + "learning_rate": 3.2609621115368246e-06, + "loss": 0.001, + "step": 9220 + }, + { + "epoch": 35.00727969348659, + "grad_norm": 46.848873138427734, + "learning_rate": 3.2524478501489996e-06, + "loss": 0.5054, + "step": 9230 + }, + { + "epoch": 35.00804597701149, + "grad_norm": 0.3965376019477844, + "learning_rate": 3.243933588761175e-06, + "loss": 0.4618, + "step": 9240 + }, + { + "epoch": 35.008812260536395, + "grad_norm": 0.21164460480213165, + "learning_rate": 3.235419327373351e-06, + "loss": 0.543, + "step": 9250 + }, + { + "epoch": 35.009578544061306, + "grad_norm": 0.0010897840838879347, + "learning_rate": 3.2269050659855262e-06, + "loss": 0.442, + "step": 9260 + }, + { + "epoch": 35.01034482758621, + "grad_norm": 0.18790724873542786, + "learning_rate": 3.2183908045977012e-06, + "loss": 0.8635, + "step": 9270 + }, + { + "epoch": 35.01111111111111, + "grad_norm": 0.08552441000938416, + "learning_rate": 3.2098765432098767e-06, + "loss": 0.8206, + "step": 9280 + }, + { + "epoch": 35.01187739463602, + "grad_norm": 0.267426460981369, + "learning_rate": 3.2013622818220525e-06, + "loss": 1.0238, + "step": 9290 + }, + { + "epoch": 35.01264367816092, + "grad_norm": 0.030172178521752357, + "learning_rate": 3.1928480204342275e-06, + "loss": 0.0016, + "step": 9300 + }, + { + "epoch": 35.013409961685824, + "grad_norm": 0.00807393528521061, + "learning_rate": 3.184333759046403e-06, + "loss": 0.0015, + "step": 9310 + }, + { + "epoch": 35.01417624521073, + "grad_norm": 0.005051786545664072, + "learning_rate": 3.1758194976585783e-06, + "loss": 1.069, + "step": 9320 + }, + { + "epoch": 35.01494252873563, + "grad_norm": 0.01685529388487339, + "learning_rate": 3.167305236270754e-06, + "loss": 0.5458, + "step": 9330 + }, + { + "epoch": 35.015708812260534, + "grad_norm": 0.0008387691923417151, + "learning_rate": 3.158790974882929e-06, + "loss": 1.0544, + "step": 9340 + }, + { + "epoch": 35.01647509578544, + "grad_norm": 0.10800078511238098, + "learning_rate": 3.1502767134951046e-06, + "loss": 0.0012, + "step": 9350 + }, + { + "epoch": 35.01724137931034, + "grad_norm": 0.008371325209736824, + "learning_rate": 3.14176245210728e-06, + "loss": 0.0027, + "step": 9360 + }, + { + "epoch": 35.01800766283525, + "grad_norm": 0.015372390858829021, + "learning_rate": 3.133248190719455e-06, + "loss": 0.0022, + "step": 9370 + }, + { + "epoch": 35.018773946360156, + "grad_norm": 0.0026980929542332888, + "learning_rate": 3.124733929331631e-06, + "loss": 0.5343, + "step": 9380 + }, + { + "epoch": 35.01954022988506, + "grad_norm": 0.0746304914355278, + "learning_rate": 3.1162196679438062e-06, + "loss": 0.0015, + "step": 9390 + }, + { + "epoch": 35.02, + "eval_accuracy": 0.6888888888888889, + "eval_loss": 1.7895233631134033, + "eval_runtime": 17.0827, + "eval_samples_per_second": 2.634, + "eval_steps_per_second": 2.634, + "step": 9396 + }, + { + "epoch": 36.00030651340996, + "grad_norm": 0.14152012765407562, + "learning_rate": 3.1077054065559812e-06, + "loss": 0.0194, + "step": 9400 + }, + { + "epoch": 36.001072796934864, + "grad_norm": 0.025358257815241814, + "learning_rate": 
3.0991911451681567e-06, + "loss": 0.0006, + "step": 9410 + }, + { + "epoch": 36.00183908045977, + "grad_norm": 78.54558563232422, + "learning_rate": 3.0906768837803325e-06, + "loss": 0.5117, + "step": 9420 + }, + { + "epoch": 36.00260536398467, + "grad_norm": 0.11867334693670273, + "learning_rate": 3.082162622392508e-06, + "loss": 0.0017, + "step": 9430 + }, + { + "epoch": 36.00337164750958, + "grad_norm": 0.002273265039548278, + "learning_rate": 3.073648361004683e-06, + "loss": 0.1799, + "step": 9440 + }, + { + "epoch": 36.004137931034485, + "grad_norm": 203.22657775878906, + "learning_rate": 3.0651340996168583e-06, + "loss": 0.7579, + "step": 9450 + }, + { + "epoch": 36.00490421455939, + "grad_norm": 0.0017151336651295424, + "learning_rate": 3.056619838229034e-06, + "loss": 0.5871, + "step": 9460 + }, + { + "epoch": 36.00567049808429, + "grad_norm": 0.0021970414090901613, + "learning_rate": 3.048105576841209e-06, + "loss": 0.5158, + "step": 9470 + }, + { + "epoch": 36.006436781609196, + "grad_norm": 0.0016529745189473033, + "learning_rate": 3.0395913154533846e-06, + "loss": 0.0021, + "step": 9480 + }, + { + "epoch": 36.0072030651341, + "grad_norm": 0.09726948291063309, + "learning_rate": 3.03107705406556e-06, + "loss": 0.001, + "step": 9490 + }, + { + "epoch": 36.007969348659, + "grad_norm": 0.001085378578864038, + "learning_rate": 3.022562792677736e-06, + "loss": 0.248, + "step": 9500 + }, + { + "epoch": 36.00873563218391, + "grad_norm": 0.003718213876709342, + "learning_rate": 3.014048531289911e-06, + "loss": 0.0004, + "step": 9510 + }, + { + "epoch": 36.00950191570881, + "grad_norm": 0.0007817785954102874, + "learning_rate": 3.0055342699020862e-06, + "loss": 0.0009, + "step": 9520 + }, + { + "epoch": 36.010268199233714, + "grad_norm": 0.0016829303931444883, + "learning_rate": 2.9970200085142612e-06, + "loss": 0.2533, + "step": 9530 + }, + { + "epoch": 36.01103448275862, + "grad_norm": 0.0018956586718559265, + "learning_rate": 2.988505747126437e-06, + "loss": 0.1119, + "step": 9540 + }, + { + "epoch": 36.01180076628353, + "grad_norm": 0.0012920110020786524, + "learning_rate": 2.9799914857386125e-06, + "loss": 0.0008, + "step": 9550 + }, + { + "epoch": 36.01256704980843, + "grad_norm": 607.2188720703125, + "learning_rate": 2.971477224350788e-06, + "loss": 0.4537, + "step": 9560 + }, + { + "epoch": 36.013333333333335, + "grad_norm": 0.002291508950293064, + "learning_rate": 2.962962962962963e-06, + "loss": 2.0775, + "step": 9570 + }, + { + "epoch": 36.01409961685824, + "grad_norm": 2.0352463722229004, + "learning_rate": 2.9544487015751387e-06, + "loss": 0.3054, + "step": 9580 + }, + { + "epoch": 36.01486590038314, + "grad_norm": 0.006190211977809668, + "learning_rate": 2.945934440187314e-06, + "loss": 0.001, + "step": 9590 + }, + { + "epoch": 36.015632183908046, + "grad_norm": 0.0019515285966917872, + "learning_rate": 2.9374201787994896e-06, + "loss": 0.7126, + "step": 9600 + }, + { + "epoch": 36.01639846743295, + "grad_norm": 0.01208975724875927, + "learning_rate": 2.9289059174116646e-06, + "loss": 0.0005, + "step": 9610 + }, + { + "epoch": 36.01716475095785, + "grad_norm": 0.16302844882011414, + "learning_rate": 2.92039165602384e-06, + "loss": 0.0003, + "step": 9620 + }, + { + "epoch": 36.01793103448276, + "grad_norm": 380.09625244140625, + "learning_rate": 2.911877394636016e-06, + "loss": 0.3886, + "step": 9630 + }, + { + "epoch": 36.01869731800766, + "grad_norm": 0.03196420520544052, + "learning_rate": 2.903363133248191e-06, + "loss": 0.66, + "step": 9640 + }, + { + "epoch": 
36.019463601532564, + "grad_norm": 0.0020150793716311455, + "learning_rate": 2.8948488718603662e-06, + "loss": 0.0007, + "step": 9650 + }, + { + "epoch": 36.02, + "eval_accuracy": 0.7555555555555555, + "eval_loss": 1.693882942199707, + "eval_runtime": 17.1572, + "eval_samples_per_second": 2.623, + "eval_steps_per_second": 2.623, + "step": 9657 + }, + { + "epoch": 37.00022988505747, + "grad_norm": 0.04321293532848358, + "learning_rate": 2.8863346104725417e-06, + "loss": 0.0011, + "step": 9660 + }, + { + "epoch": 37.000996168582375, + "grad_norm": 0.0007681910647079349, + "learning_rate": 2.8778203490847175e-06, + "loss": 0.5889, + "step": 9670 + }, + { + "epoch": 37.00176245210728, + "grad_norm": 0.004244087263941765, + "learning_rate": 2.8693060876968925e-06, + "loss": 1.3506, + "step": 9680 + }, + { + "epoch": 37.00252873563218, + "grad_norm": 0.0014911898178979754, + "learning_rate": 2.860791826309068e-06, + "loss": 0.0024, + "step": 9690 + }, + { + "epoch": 37.003295019157086, + "grad_norm": 0.03196387365460396, + "learning_rate": 2.852277564921243e-06, + "loss": 0.0004, + "step": 9700 + }, + { + "epoch": 37.00406130268199, + "grad_norm": 0.009950182400643826, + "learning_rate": 2.8437633035334187e-06, + "loss": 0.495, + "step": 9710 + }, + { + "epoch": 37.00482758620689, + "grad_norm": 0.0021573223639279604, + "learning_rate": 2.835249042145594e-06, + "loss": 0.6604, + "step": 9720 + }, + { + "epoch": 37.005593869731804, + "grad_norm": 0.00755895534530282, + "learning_rate": 2.8267347807577696e-06, + "loss": 0.0004, + "step": 9730 + }, + { + "epoch": 37.00636015325671, + "grad_norm": 0.001713043311610818, + "learning_rate": 2.8182205193699446e-06, + "loss": 1.3733, + "step": 9740 + }, + { + "epoch": 37.00712643678161, + "grad_norm": 0.003327340120449662, + "learning_rate": 2.8097062579821204e-06, + "loss": 0.5853, + "step": 9750 + }, + { + "epoch": 37.007892720306515, + "grad_norm": 0.0016224593855440617, + "learning_rate": 2.801191996594296e-06, + "loss": 1.2983, + "step": 9760 + }, + { + "epoch": 37.00865900383142, + "grad_norm": 0.0023931225296109915, + "learning_rate": 2.792677735206471e-06, + "loss": 0.0121, + "step": 9770 + }, + { + "epoch": 37.00942528735632, + "grad_norm": 0.0011039819801226258, + "learning_rate": 2.7841634738186462e-06, + "loss": 0.009, + "step": 9780 + }, + { + "epoch": 37.010191570881226, + "grad_norm": 0.04504973813891411, + "learning_rate": 2.775649212430822e-06, + "loss": 0.004, + "step": 9790 + }, + { + "epoch": 37.01095785440613, + "grad_norm": 0.006777584087103605, + "learning_rate": 2.7671349510429975e-06, + "loss": 0.8713, + "step": 9800 + }, + { + "epoch": 37.01172413793103, + "grad_norm": 157.15858459472656, + "learning_rate": 2.7586206896551725e-06, + "loss": 0.5456, + "step": 9810 + }, + { + "epoch": 37.01249042145594, + "grad_norm": 0.016833683475852013, + "learning_rate": 2.750106428267348e-06, + "loss": 0.4411, + "step": 9820 + }, + { + "epoch": 37.01325670498084, + "grad_norm": 0.028045177459716797, + "learning_rate": 2.7415921668795238e-06, + "loss": 0.0386, + "step": 9830 + }, + { + "epoch": 37.014022988505744, + "grad_norm": 0.0006943064508959651, + "learning_rate": 2.7330779054916987e-06, + "loss": 0.2108, + "step": 9840 + }, + { + "epoch": 37.014789272030654, + "grad_norm": 0.030458789318799973, + "learning_rate": 2.724563644103874e-06, + "loss": 0.0005, + "step": 9850 + }, + { + "epoch": 37.01555555555556, + "grad_norm": 0.059811390936374664, + "learning_rate": 2.7160493827160496e-06, + "loss": 0.0005, + "step": 9860 + }, + { + "epoch": 
37.01632183908046, + "grad_norm": 0.10825932770967484, + "learning_rate": 2.7075351213282254e-06, + "loss": 0.0009, + "step": 9870 + }, + { + "epoch": 37.017088122605365, + "grad_norm": 0.008525940589606762, + "learning_rate": 2.6990208599404004e-06, + "loss": 0.0004, + "step": 9880 + }, + { + "epoch": 37.01785440613027, + "grad_norm": 0.0531802698969841, + "learning_rate": 2.690506598552576e-06, + "loss": 0.0004, + "step": 9890 + }, + { + "epoch": 37.01862068965517, + "grad_norm": 0.029235534369945526, + "learning_rate": 2.6819923371647512e-06, + "loss": 0.0004, + "step": 9900 + }, + { + "epoch": 37.019386973180076, + "grad_norm": 146.88075256347656, + "learning_rate": 2.6734780757769262e-06, + "loss": 0.6659, + "step": 9910 + }, + { + "epoch": 37.02, + "eval_accuracy": 0.6888888888888889, + "eval_loss": 1.9129738807678223, + "eval_runtime": 17.1472, + "eval_samples_per_second": 2.624, + "eval_steps_per_second": 2.624, + "step": 9918 + }, + { + "epoch": 38.000153256704984, + "grad_norm": 0.025278517976403236, + "learning_rate": 2.664963814389102e-06, + "loss": 0.0007, + "step": 9920 + }, + { + "epoch": 38.00091954022989, + "grad_norm": 0.04159550368785858, + "learning_rate": 2.6564495530012775e-06, + "loss": 0.0007, + "step": 9930 + }, + { + "epoch": 38.00168582375479, + "grad_norm": 0.020553968846797943, + "learning_rate": 2.6479352916134525e-06, + "loss": 0.0003, + "step": 9940 + }, + { + "epoch": 38.002452107279694, + "grad_norm": 0.2901555895805359, + "learning_rate": 2.639421030225628e-06, + "loss": 0.2573, + "step": 9950 + }, + { + "epoch": 38.0032183908046, + "grad_norm": 0.12247475236654282, + "learning_rate": 2.6309067688378037e-06, + "loss": 0.0007, + "step": 9960 + }, + { + "epoch": 38.0039846743295, + "grad_norm": 0.11259821802377701, + "learning_rate": 2.622392507449979e-06, + "loss": 0.0003, + "step": 9970 + }, + { + "epoch": 38.004750957854405, + "grad_norm": 0.014909302815794945, + "learning_rate": 2.613878246062154e-06, + "loss": 0.0006, + "step": 9980 + }, + { + "epoch": 38.00551724137931, + "grad_norm": 196.90673828125, + "learning_rate": 2.6053639846743296e-06, + "loss": 0.0095, + "step": 9990 + }, + { + "epoch": 38.00628352490421, + "grad_norm": 280.4552307128906, + "learning_rate": 2.5968497232865054e-06, + "loss": 1.1779, + "step": 10000 + }, + { + "epoch": 38.007049808429116, + "grad_norm": 0.0010620058747008443, + "learning_rate": 2.5883354618986804e-06, + "loss": 0.6285, + "step": 10010 + }, + { + "epoch": 38.00781609195402, + "grad_norm": 13.668376922607422, + "learning_rate": 2.579821200510856e-06, + "loss": 0.7079, + "step": 10020 + }, + { + "epoch": 38.00858237547893, + "grad_norm": 0.024773895740509033, + "learning_rate": 2.5713069391230312e-06, + "loss": 0.0007, + "step": 10030 + }, + { + "epoch": 38.009348659003834, + "grad_norm": 0.00320147885940969, + "learning_rate": 2.562792677735207e-06, + "loss": 0.0002, + "step": 10040 + }, + { + "epoch": 38.01011494252874, + "grad_norm": 0.0015671950532123446, + "learning_rate": 2.554278416347382e-06, + "loss": 0.0001, + "step": 10050 + }, + { + "epoch": 38.01088122605364, + "grad_norm": 0.01653999276459217, + "learning_rate": 2.5457641549595575e-06, + "loss": 0.6464, + "step": 10060 + }, + { + "epoch": 38.011647509578545, + "grad_norm": 0.017531948164105415, + "learning_rate": 2.5372498935717325e-06, + "loss": 0.002, + "step": 10070 + }, + { + "epoch": 38.01241379310345, + "grad_norm": 0.04314403235912323, + "learning_rate": 2.5287356321839083e-06, + "loss": 0.0003, + "step": 10080 + }, + { + "epoch": 
38.01318007662835, + "grad_norm": 0.012460527941584587, + "learning_rate": 2.5202213707960837e-06, + "loss": 0.5619, + "step": 10090 + }, + { + "epoch": 38.013946360153255, + "grad_norm": 0.001233008224517107, + "learning_rate": 2.511707109408259e-06, + "loss": 0.0004, + "step": 10100 + }, + { + "epoch": 38.01471264367816, + "grad_norm": 0.023712268099188805, + "learning_rate": 2.503192848020434e-06, + "loss": 0.0825, + "step": 10110 + }, + { + "epoch": 38.01547892720306, + "grad_norm": 0.0006288658478297293, + "learning_rate": 2.49467858663261e-06, + "loss": 0.0009, + "step": 10120 + }, + { + "epoch": 38.016245210727966, + "grad_norm": 495.86383056640625, + "learning_rate": 2.4861643252447854e-06, + "loss": 0.0351, + "step": 10130 + }, + { + "epoch": 38.01701149425288, + "grad_norm": 0.014433549717068672, + "learning_rate": 2.4776500638569604e-06, + "loss": 0.0729, + "step": 10140 + }, + { + "epoch": 38.01777777777778, + "grad_norm": 0.01770133152604103, + "learning_rate": 2.469135802469136e-06, + "loss": 0.0002, + "step": 10150 + }, + { + "epoch": 38.018544061302684, + "grad_norm": 37.95453643798828, + "learning_rate": 2.4606215410813112e-06, + "loss": 0.7288, + "step": 10160 + }, + { + "epoch": 38.01931034482759, + "grad_norm": 0.017218776047229767, + "learning_rate": 2.4521072796934867e-06, + "loss": 0.2733, + "step": 10170 + }, + { + "epoch": 38.02, + "eval_accuracy": 0.6666666666666666, + "eval_loss": 1.8643172979354858, + "eval_runtime": 17.1245, + "eval_samples_per_second": 2.628, + "eval_steps_per_second": 2.628, + "step": 10179 + }, + { + "epoch": 39.00007662835249, + "grad_norm": 0.00903532188385725, + "learning_rate": 2.443593018305662e-06, + "loss": 1.556, + "step": 10180 + }, + { + "epoch": 39.00084291187739, + "grad_norm": 0.002156023168936372, + "learning_rate": 2.4350787569178375e-06, + "loss": 0.0006, + "step": 10190 + }, + { + "epoch": 39.001609195402295, + "grad_norm": 362.9033508300781, + "learning_rate": 2.426564495530013e-06, + "loss": 0.897, + "step": 10200 + }, + { + "epoch": 39.002375478927206, + "grad_norm": 0.010444692336022854, + "learning_rate": 2.4180502341421883e-06, + "loss": 0.0004, + "step": 10210 + }, + { + "epoch": 39.00314176245211, + "grad_norm": 0.0009983752388507128, + "learning_rate": 2.4095359727543637e-06, + "loss": 0.0004, + "step": 10220 + }, + { + "epoch": 39.00390804597701, + "grad_norm": 0.05090656131505966, + "learning_rate": 2.401021711366539e-06, + "loss": 0.0871, + "step": 10230 + }, + { + "epoch": 39.00467432950192, + "grad_norm": 0.019571129232645035, + "learning_rate": 2.3925074499787146e-06, + "loss": 0.0001, + "step": 10240 + }, + { + "epoch": 39.00544061302682, + "grad_norm": 2.97189998626709, + "learning_rate": 2.38399318859089e-06, + "loss": 0.0006, + "step": 10250 + }, + { + "epoch": 39.006206896551724, + "grad_norm": 2.586013078689575, + "learning_rate": 2.3754789272030654e-06, + "loss": 0.0012, + "step": 10260 + }, + { + "epoch": 39.00697318007663, + "grad_norm": 0.043128736317157745, + "learning_rate": 2.366964665815241e-06, + "loss": 0.595, + "step": 10270 + }, + { + "epoch": 39.00773946360153, + "grad_norm": 0.01944294199347496, + "learning_rate": 2.3584504044274162e-06, + "loss": 0.0021, + "step": 10280 + }, + { + "epoch": 39.008505747126435, + "grad_norm": 0.02368260733783245, + "learning_rate": 2.3499361430395912e-06, + "loss": 0.1503, + "step": 10290 + }, + { + "epoch": 39.00927203065134, + "grad_norm": 0.02799367532134056, + "learning_rate": 2.341421881651767e-06, + "loss": 0.0004, + "step": 10300 + }, + { + "epoch": 
39.01003831417624, + "grad_norm": 0.0008915068465285003, + "learning_rate": 2.332907620263942e-06, + "loss": 0.1278, + "step": 10310 + }, + { + "epoch": 39.01080459770115, + "grad_norm": 0.0006595042650587857, + "learning_rate": 2.324393358876118e-06, + "loss": 0.8967, + "step": 10320 + }, + { + "epoch": 39.011570881226056, + "grad_norm": 0.0006145153311081231, + "learning_rate": 2.315879097488293e-06, + "loss": 0.0124, + "step": 10330 + }, + { + "epoch": 39.01233716475096, + "grad_norm": 0.08073096722364426, + "learning_rate": 2.3073648361004688e-06, + "loss": 0.0009, + "step": 10340 + }, + { + "epoch": 39.013103448275864, + "grad_norm": 0.0006259952206164598, + "learning_rate": 2.2988505747126437e-06, + "loss": 1.3612, + "step": 10350 + }, + { + "epoch": 39.01386973180077, + "grad_norm": 0.044197119772434235, + "learning_rate": 2.290336313324819e-06, + "loss": 0.7216, + "step": 10360 + }, + { + "epoch": 39.01463601532567, + "grad_norm": 0.0038827096577733755, + "learning_rate": 2.2818220519369946e-06, + "loss": 0.1281, + "step": 10370 + }, + { + "epoch": 39.015402298850574, + "grad_norm": 0.000527941738255322, + "learning_rate": 2.27330779054917e-06, + "loss": 0.4761, + "step": 10380 + }, + { + "epoch": 39.01616858237548, + "grad_norm": 0.10415507853031158, + "learning_rate": 2.2647935291613454e-06, + "loss": 0.0022, + "step": 10390 + }, + { + "epoch": 39.01693486590038, + "grad_norm": 0.12362392991781235, + "learning_rate": 2.256279267773521e-06, + "loss": 0.0013, + "step": 10400 + }, + { + "epoch": 39.017701149425285, + "grad_norm": 0.0005639142473228276, + "learning_rate": 2.2477650063856962e-06, + "loss": 0.0011, + "step": 10410 + }, + { + "epoch": 39.01846743295019, + "grad_norm": 0.0033310065045952797, + "learning_rate": 2.2392507449978717e-06, + "loss": 0.0004, + "step": 10420 + }, + { + "epoch": 39.01923371647509, + "grad_norm": 0.0061408476904034615, + "learning_rate": 2.230736483610047e-06, + "loss": 0.001, + "step": 10430 + }, + { + "epoch": 39.02, + "grad_norm": 0.0009819255210459232, + "learning_rate": 2.222222222222222e-06, + "loss": 0.3147, + "step": 10440 + }, + { + "epoch": 39.02, + "eval_accuracy": 0.6888888888888889, + "eval_loss": 1.8535500764846802, + "eval_runtime": 17.2255, + "eval_samples_per_second": 2.612, + "eval_steps_per_second": 2.612, + "step": 10440 + }, + { + "epoch": 40.000766283524904, + "grad_norm": 0.0008266578079201281, + "learning_rate": 2.213707960834398e-06, + "loss": 0.0004, + "step": 10450 + }, + { + "epoch": 40.00153256704981, + "grad_norm": 0.0013300732243806124, + "learning_rate": 2.205193699446573e-06, + "loss": 0.0005, + "step": 10460 + }, + { + "epoch": 40.00229885057471, + "grad_norm": 0.0724252238869667, + "learning_rate": 2.1966794380587487e-06, + "loss": 0.6297, + "step": 10470 + }, + { + "epoch": 40.003065134099614, + "grad_norm": 109.5344009399414, + "learning_rate": 2.1881651766709237e-06, + "loss": 0.6123, + "step": 10480 + }, + { + "epoch": 40.00383141762452, + "grad_norm": 0.0006431580404751003, + "learning_rate": 2.1796509152830996e-06, + "loss": 0.0003, + "step": 10490 + }, + { + "epoch": 40.00459770114943, + "grad_norm": 0.0006655193283222616, + "learning_rate": 2.1711366538952746e-06, + "loss": 0.0004, + "step": 10500 + }, + { + "epoch": 40.00536398467433, + "grad_norm": 0.040592875331640244, + "learning_rate": 2.1626223925074504e-06, + "loss": 0.0007, + "step": 10510 + }, + { + "epoch": 40.006130268199236, + "grad_norm": 0.029155420139431953, + "learning_rate": 2.1541081311196254e-06, + "loss": 0.0007, + "step": 10520 + }, + 
{ + "epoch": 40.00689655172414, + "grad_norm": 0.001168000278994441, + "learning_rate": 2.145593869731801e-06, + "loss": 0.0007, + "step": 10530 + }, + { + "epoch": 40.00766283524904, + "grad_norm": 0.020939696580171585, + "learning_rate": 2.1370796083439762e-06, + "loss": 0.0003, + "step": 10540 + }, + { + "epoch": 40.00842911877395, + "grad_norm": 0.36834532022476196, + "learning_rate": 2.1285653469561517e-06, + "loss": 0.0458, + "step": 10550 + }, + { + "epoch": 40.00919540229885, + "grad_norm": 0.0005809459835290909, + "learning_rate": 2.120051085568327e-06, + "loss": 0.0003, + "step": 10560 + }, + { + "epoch": 40.009961685823754, + "grad_norm": 0.002721979282796383, + "learning_rate": 2.1115368241805025e-06, + "loss": 1.2751, + "step": 10570 + }, + { + "epoch": 40.01072796934866, + "grad_norm": 0.0007245915476232767, + "learning_rate": 2.103022562792678e-06, + "loss": 0.0008, + "step": 10580 + }, + { + "epoch": 40.01149425287356, + "grad_norm": 0.6239440441131592, + "learning_rate": 2.0945083014048533e-06, + "loss": 0.9704, + "step": 10590 + }, + { + "epoch": 40.012260536398465, + "grad_norm": 0.0005134647362865508, + "learning_rate": 2.0859940400170287e-06, + "loss": 0.6371, + "step": 10600 + }, + { + "epoch": 40.01302681992337, + "grad_norm": 0.035365816205739975, + "learning_rate": 2.077479778629204e-06, + "loss": 0.024, + "step": 10610 + }, + { + "epoch": 40.01379310344828, + "grad_norm": 0.0009426081087440252, + "learning_rate": 2.0689655172413796e-06, + "loss": 0.0084, + "step": 10620 + }, + { + "epoch": 40.01455938697318, + "grad_norm": 0.07545214146375656, + "learning_rate": 2.060451255853555e-06, + "loss": 0.0625, + "step": 10630 + }, + { + "epoch": 40.015325670498086, + "grad_norm": 0.0008644378394819796, + "learning_rate": 2.0519369944657304e-06, + "loss": 0.0005, + "step": 10640 + }, + { + "epoch": 40.01609195402299, + "grad_norm": 0.0005829230649396777, + "learning_rate": 2.043422733077906e-06, + "loss": 0.001, + "step": 10650 + }, + { + "epoch": 40.01685823754789, + "grad_norm": 0.002161706332117319, + "learning_rate": 2.0349084716900813e-06, + "loss": 0.0004, + "step": 10660 + }, + { + "epoch": 40.0176245210728, + "grad_norm": 0.0008721326594240963, + "learning_rate": 2.0263942103022567e-06, + "loss": 0.0004, + "step": 10670 + }, + { + "epoch": 40.0183908045977, + "grad_norm": 0.03235024958848953, + "learning_rate": 2.0178799489144317e-06, + "loss": 0.4799, + "step": 10680 + }, + { + "epoch": 40.019157088122604, + "grad_norm": 14.713685035705566, + "learning_rate": 2.009365687526607e-06, + "loss": 0.3028, + "step": 10690 + }, + { + "epoch": 40.01992337164751, + "grad_norm": 0.0006150007247924805, + "learning_rate": 2.0008514261387825e-06, + "loss": 0.0003, + "step": 10700 + }, + { + "epoch": 40.02, + "eval_accuracy": 0.7777777777777778, + "eval_loss": 1.6605358123779297, + "eval_runtime": 16.9691, + "eval_samples_per_second": 2.652, + "eval_steps_per_second": 2.652, + "step": 10701 + }, + { + "epoch": 41.000689655172415, + "grad_norm": 1.057999610900879, + "learning_rate": 1.992337164750958e-06, + "loss": 0.9546, + "step": 10710 + }, + { + "epoch": 41.00145593869732, + "grad_norm": 54.23439025878906, + "learning_rate": 1.9838229033631333e-06, + "loss": 1.4644, + "step": 10720 + }, + { + "epoch": 41.00222222222222, + "grad_norm": 0.009929795749485493, + "learning_rate": 1.9753086419753087e-06, + "loss": 0.0006, + "step": 10730 + }, + { + "epoch": 41.002988505747126, + "grad_norm": 0.0005556952673941851, + "learning_rate": 1.966794380587484e-06, + "loss": 0.1006, + 
"step": 10740 + }, + { + "epoch": 41.00375478927203, + "grad_norm": 0.09005004167556763, + "learning_rate": 1.9582801191996596e-06, + "loss": 0.0003, + "step": 10750 + }, + { + "epoch": 41.00452107279693, + "grad_norm": 0.006389525718986988, + "learning_rate": 1.949765857811835e-06, + "loss": 0.0019, + "step": 10760 + }, + { + "epoch": 41.00528735632184, + "grad_norm": 3.5716090202331543, + "learning_rate": 1.9412515964240104e-06, + "loss": 0.0116, + "step": 10770 + }, + { + "epoch": 41.00605363984674, + "grad_norm": 0.0018415194936096668, + "learning_rate": 1.932737335036186e-06, + "loss": 0.0003, + "step": 10780 + }, + { + "epoch": 41.006819923371644, + "grad_norm": 0.015272000804543495, + "learning_rate": 1.9242230736483612e-06, + "loss": 0.0003, + "step": 10790 + }, + { + "epoch": 41.007586206896555, + "grad_norm": 0.033736713230609894, + "learning_rate": 1.9157088122605367e-06, + "loss": 0.0004, + "step": 10800 + }, + { + "epoch": 41.00835249042146, + "grad_norm": 0.0019670985639095306, + "learning_rate": 1.9071945508727119e-06, + "loss": 1.3008, + "step": 10810 + }, + { + "epoch": 41.00911877394636, + "grad_norm": 0.03387555480003357, + "learning_rate": 1.8986802894848875e-06, + "loss": 0.0006, + "step": 10820 + }, + { + "epoch": 41.009885057471266, + "grad_norm": 0.0023133272770792246, + "learning_rate": 1.8901660280970627e-06, + "loss": 0.0001, + "step": 10830 + }, + { + "epoch": 41.01065134099617, + "grad_norm": 0.0012174684088677168, + "learning_rate": 1.8816517667092381e-06, + "loss": 0.0001, + "step": 10840 + }, + { + "epoch": 41.01141762452107, + "grad_norm": 0.0011998199624940753, + "learning_rate": 1.8731375053214135e-06, + "loss": 0.0005, + "step": 10850 + }, + { + "epoch": 41.01218390804598, + "grad_norm": 0.07908272743225098, + "learning_rate": 1.864623243933589e-06, + "loss": 0.0006, + "step": 10860 + }, + { + "epoch": 41.01295019157088, + "grad_norm": 0.022703932598233223, + "learning_rate": 1.8561089825457644e-06, + "loss": 0.0008, + "step": 10870 + }, + { + "epoch": 41.013716475095784, + "grad_norm": 0.017189128324389458, + "learning_rate": 1.8475947211579398e-06, + "loss": 0.6056, + "step": 10880 + }, + { + "epoch": 41.01448275862069, + "grad_norm": 0.0010669787880033255, + "learning_rate": 1.839080459770115e-06, + "loss": 0.5873, + "step": 10890 + }, + { + "epoch": 41.01524904214559, + "grad_norm": 0.06532321125268936, + "learning_rate": 1.8305661983822906e-06, + "loss": 0.2424, + "step": 10900 + }, + { + "epoch": 41.0160153256705, + "grad_norm": 0.0010624454589560628, + "learning_rate": 1.8220519369944658e-06, + "loss": 0.0001, + "step": 10910 + }, + { + "epoch": 41.016781609195405, + "grad_norm": 0.5484663844108582, + "learning_rate": 1.8135376756066415e-06, + "loss": 0.0014, + "step": 10920 + }, + { + "epoch": 41.01754789272031, + "grad_norm": 1.631242036819458, + "learning_rate": 1.8050234142188167e-06, + "loss": 0.0113, + "step": 10930 + }, + { + "epoch": 41.01831417624521, + "grad_norm": 0.020637700334191322, + "learning_rate": 1.7965091528309919e-06, + "loss": 0.0004, + "step": 10940 + }, + { + "epoch": 41.019080459770116, + "grad_norm": 0.033113788813352585, + "learning_rate": 1.7879948914431675e-06, + "loss": 0.0004, + "step": 10950 + }, + { + "epoch": 41.01984674329502, + "grad_norm": 0.0013185038696974516, + "learning_rate": 1.7794806300553427e-06, + "loss": 0.4885, + "step": 10960 + }, + { + "epoch": 41.02, + "eval_accuracy": 0.7333333333333333, + "eval_loss": 1.7212529182434082, + "eval_runtime": 17.9943, + "eval_samples_per_second": 2.501, + 
"eval_steps_per_second": 2.501, + "step": 10962 + }, + { + "epoch": 42.00061302681992, + "grad_norm": 511.4117431640625, + "learning_rate": 1.7709663686675183e-06, + "loss": 1.1464, + "step": 10970 + }, + { + "epoch": 42.00137931034483, + "grad_norm": 0.03529811650514603, + "learning_rate": 1.7624521072796935e-06, + "loss": 0.0019, + "step": 10980 + }, + { + "epoch": 42.002145593869734, + "grad_norm": 0.0008390815346501768, + "learning_rate": 1.753937845891869e-06, + "loss": 0.0004, + "step": 10990 + }, + { + "epoch": 42.00291187739464, + "grad_norm": 0.0013074242742732167, + "learning_rate": 1.7454235845040444e-06, + "loss": 0.2003, + "step": 11000 + }, + { + "epoch": 42.00367816091954, + "grad_norm": 0.0029662505257874727, + "learning_rate": 1.7369093231162198e-06, + "loss": 0.7556, + "step": 11010 + }, + { + "epoch": 42.004444444444445, + "grad_norm": 0.000991358421742916, + "learning_rate": 1.7283950617283952e-06, + "loss": 0.0005, + "step": 11020 + }, + { + "epoch": 42.00521072796935, + "grad_norm": 0.07371685653924942, + "learning_rate": 1.7198808003405706e-06, + "loss": 0.0001, + "step": 11030 + }, + { + "epoch": 42.00597701149425, + "grad_norm": 0.0010008163517341018, + "learning_rate": 1.7113665389527458e-06, + "loss": 0.0007, + "step": 11040 + }, + { + "epoch": 42.006743295019156, + "grad_norm": 0.004903385415673256, + "learning_rate": 1.7028522775649215e-06, + "loss": 0.0009, + "step": 11050 + }, + { + "epoch": 42.00750957854406, + "grad_norm": 0.0005655346321873367, + "learning_rate": 1.6943380161770967e-06, + "loss": 0.0002, + "step": 11060 + }, + { + "epoch": 42.00827586206896, + "grad_norm": 0.0006228500860743225, + "learning_rate": 1.6858237547892723e-06, + "loss": 0.0002, + "step": 11070 + }, + { + "epoch": 42.00904214559387, + "grad_norm": 249.06565856933594, + "learning_rate": 1.6773094934014475e-06, + "loss": 0.0268, + "step": 11080 + }, + { + "epoch": 42.00980842911878, + "grad_norm": 0.016563888639211655, + "learning_rate": 1.6687952320136231e-06, + "loss": 0.0144, + "step": 11090 + }, + { + "epoch": 42.01057471264368, + "grad_norm": 0.0037711604963988066, + "learning_rate": 1.6602809706257983e-06, + "loss": 0.0003, + "step": 11100 + }, + { + "epoch": 42.011340996168585, + "grad_norm": 0.0009828859474509954, + "learning_rate": 1.6517667092379737e-06, + "loss": 0.0002, + "step": 11110 + }, + { + "epoch": 42.01210727969349, + "grad_norm": 0.002542088972404599, + "learning_rate": 1.6432524478501492e-06, + "loss": 0.0003, + "step": 11120 + }, + { + "epoch": 42.01287356321839, + "grad_norm": 0.00075729307718575, + "learning_rate": 1.6347381864623246e-06, + "loss": 0.0001, + "step": 11130 + }, + { + "epoch": 42.013639846743295, + "grad_norm": 0.018644072115421295, + "learning_rate": 1.6262239250744998e-06, + "loss": 0.001, + "step": 11140 + }, + { + "epoch": 42.0144061302682, + "grad_norm": 0.0005072808125987649, + "learning_rate": 1.6177096636866754e-06, + "loss": 0.1782, + "step": 11150 + }, + { + "epoch": 42.0151724137931, + "grad_norm": 0.0010206066071987152, + "learning_rate": 1.6091954022988506e-06, + "loss": 0.0002, + "step": 11160 + }, + { + "epoch": 42.015938697318006, + "grad_norm": 0.0035783525090664625, + "learning_rate": 1.6006811409110262e-06, + "loss": 0.0003, + "step": 11170 + }, + { + "epoch": 42.01670498084291, + "grad_norm": 0.014116492122411728, + "learning_rate": 1.5921668795232015e-06, + "loss": 0.6274, + "step": 11180 + }, + { + "epoch": 42.01747126436781, + "grad_norm": 0.0007213352364487946, + "learning_rate": 1.583652618135377e-06, + "loss": 
0.0002, + "step": 11190 + }, + { + "epoch": 42.01823754789272, + "grad_norm": 0.0006297453655861318, + "learning_rate": 1.5751383567475523e-06, + "loss": 0.0003, + "step": 11200 + }, + { + "epoch": 42.01900383141763, + "grad_norm": 1331.007568359375, + "learning_rate": 1.5666240953597275e-06, + "loss": 0.0974, + "step": 11210 + }, + { + "epoch": 42.01977011494253, + "grad_norm": 0.02300421893596649, + "learning_rate": 1.5581098339719031e-06, + "loss": 0.0086, + "step": 11220 + }, + { + "epoch": 42.02, + "eval_accuracy": 0.7111111111111111, + "eval_loss": 1.9743114709854126, + "eval_runtime": 17.4074, + "eval_samples_per_second": 2.585, + "eval_steps_per_second": 2.585, + "step": 11223 + }, + { + "epoch": 43.00053639846743, + "grad_norm": 0.07084634155035019, + "learning_rate": 1.5495955725840783e-06, + "loss": 0.9528, + "step": 11230 + }, + { + "epoch": 43.001302681992335, + "grad_norm": 0.011919819749891758, + "learning_rate": 1.541081311196254e-06, + "loss": 0.0005, + "step": 11240 + }, + { + "epoch": 43.00206896551724, + "grad_norm": 0.000885434914380312, + "learning_rate": 1.5325670498084292e-06, + "loss": 0.5355, + "step": 11250 + }, + { + "epoch": 43.00283524904214, + "grad_norm": 0.0007969682919792831, + "learning_rate": 1.5240527884206046e-06, + "loss": 0.0007, + "step": 11260 + }, + { + "epoch": 43.00360153256705, + "grad_norm": 0.007111699786037207, + "learning_rate": 1.51553852703278e-06, + "loss": 0.0004, + "step": 11270 + }, + { + "epoch": 43.00436781609196, + "grad_norm": 0.0009880387224256992, + "learning_rate": 1.5070242656449554e-06, + "loss": 0.0003, + "step": 11280 + }, + { + "epoch": 43.00513409961686, + "grad_norm": 0.003055058652535081, + "learning_rate": 1.4985100042571306e-06, + "loss": 0.0002, + "step": 11290 + }, + { + "epoch": 43.005900383141764, + "grad_norm": 0.21318435668945312, + "learning_rate": 1.4899957428693062e-06, + "loss": 0.6109, + "step": 11300 + }, + { + "epoch": 43.00666666666667, + "grad_norm": 0.01973254792392254, + "learning_rate": 1.4814814814814815e-06, + "loss": 0.0011, + "step": 11310 + }, + { + "epoch": 43.00743295019157, + "grad_norm": 0.013083684258162975, + "learning_rate": 1.472967220093657e-06, + "loss": 0.0002, + "step": 11320 + }, + { + "epoch": 43.008199233716475, + "grad_norm": 0.0007720005232840776, + "learning_rate": 1.4644529587058323e-06, + "loss": 0.4174, + "step": 11330 + }, + { + "epoch": 43.00896551724138, + "grad_norm": 0.0007410580874420702, + "learning_rate": 1.455938697318008e-06, + "loss": 0.629, + "step": 11340 + }, + { + "epoch": 43.00973180076628, + "grad_norm": 0.0692082941532135, + "learning_rate": 1.4474244359301831e-06, + "loss": 0.6658, + "step": 11350 + }, + { + "epoch": 43.010498084291186, + "grad_norm": 0.002989733824506402, + "learning_rate": 1.4389101745423588e-06, + "loss": 0.0006, + "step": 11360 + }, + { + "epoch": 43.01126436781609, + "grad_norm": 0.14531533420085907, + "learning_rate": 1.430395913154534e-06, + "loss": 0.0004, + "step": 11370 + }, + { + "epoch": 43.01203065134099, + "grad_norm": 208.61495971679688, + "learning_rate": 1.4218816517667094e-06, + "loss": 0.0193, + "step": 11380 + }, + { + "epoch": 43.012796934865904, + "grad_norm": 0.009606936946511269, + "learning_rate": 1.4133673903788848e-06, + "loss": 0.0003, + "step": 11390 + }, + { + "epoch": 43.01356321839081, + "grad_norm": 0.00043209921568632126, + "learning_rate": 1.4048531289910602e-06, + "loss": 0.6961, + "step": 11400 + }, + { + "epoch": 43.01432950191571, + "grad_norm": 0.009452243335545063, + "learning_rate": 
1.3963388676032354e-06, + "loss": 0.0004, + "step": 11410 + }, + { + "epoch": 43.015095785440614, + "grad_norm": 0.0018241740763187408, + "learning_rate": 1.387824606215411e-06, + "loss": 0.0002, + "step": 11420 + }, + { + "epoch": 43.01586206896552, + "grad_norm": 0.000981905497610569, + "learning_rate": 1.3793103448275862e-06, + "loss": 0.0008, + "step": 11430 + }, + { + "epoch": 43.01662835249042, + "grad_norm": 0.0007704213494434953, + "learning_rate": 1.3707960834397619e-06, + "loss": 0.0004, + "step": 11440 + }, + { + "epoch": 43.017394636015325, + "grad_norm": 1331.0528564453125, + "learning_rate": 1.362281822051937e-06, + "loss": 0.4822, + "step": 11450 + }, + { + "epoch": 43.01816091954023, + "grad_norm": 0.0009956113062798977, + "learning_rate": 1.3537675606641127e-06, + "loss": 0.0004, + "step": 11460 + }, + { + "epoch": 43.01892720306513, + "grad_norm": 0.0004226352903060615, + "learning_rate": 1.345253299276288e-06, + "loss": 0.0002, + "step": 11470 + }, + { + "epoch": 43.019693486590036, + "grad_norm": 0.0016257098177447915, + "learning_rate": 1.3367390378884631e-06, + "loss": 0.0004, + "step": 11480 + }, + { + "epoch": 43.02, + "eval_accuracy": 0.7555555555555555, + "eval_loss": 1.5626163482666016, + "eval_runtime": 16.3, + "eval_samples_per_second": 2.761, + "eval_steps_per_second": 2.761, + "step": 11484 + }, + { + "epoch": 44.000459770114944, + "grad_norm": 0.0017896126955747604, + "learning_rate": 1.3282247765006387e-06, + "loss": 0.0002, + "step": 11490 + }, + { + "epoch": 44.00122605363985, + "grad_norm": 0.05169057101011276, + "learning_rate": 1.319710515112814e-06, + "loss": 0.0004, + "step": 11500 + }, + { + "epoch": 44.00199233716475, + "grad_norm": 0.03178132697939873, + "learning_rate": 1.3111962537249896e-06, + "loss": 0.0001, + "step": 11510 + }, + { + "epoch": 44.002758620689654, + "grad_norm": 0.0395672582089901, + "learning_rate": 1.3026819923371648e-06, + "loss": 0.0004, + "step": 11520 + }, + { + "epoch": 44.00352490421456, + "grad_norm": 0.020331287756562233, + "learning_rate": 1.2941677309493402e-06, + "loss": 0.0003, + "step": 11530 + }, + { + "epoch": 44.00429118773946, + "grad_norm": 0.0004271493526175618, + "learning_rate": 1.2856534695615156e-06, + "loss": 0.0003, + "step": 11540 + }, + { + "epoch": 44.005057471264365, + "grad_norm": 0.00042054656660184264, + "learning_rate": 1.277139208173691e-06, + "loss": 0.0039, + "step": 11550 + }, + { + "epoch": 44.00582375478927, + "grad_norm": 0.00051471235929057, + "learning_rate": 1.2686249467858662e-06, + "loss": 0.8308, + "step": 11560 + }, + { + "epoch": 44.00659003831418, + "grad_norm": 0.00109906739089638, + "learning_rate": 1.2601106853980419e-06, + "loss": 0.0002, + "step": 11570 + }, + { + "epoch": 44.00735632183908, + "grad_norm": 0.014966871589422226, + "learning_rate": 1.251596424010217e-06, + "loss": 0.0056, + "step": 11580 + }, + { + "epoch": 44.00812260536399, + "grad_norm": 0.03208824247121811, + "learning_rate": 1.2430821626223927e-06, + "loss": 0.001, + "step": 11590 + }, + { + "epoch": 44.00888888888889, + "grad_norm": 0.001142689841799438, + "learning_rate": 1.234567901234568e-06, + "loss": 0.0014, + "step": 11600 + }, + { + "epoch": 44.009655172413794, + "grad_norm": 0.00575136486440897, + "learning_rate": 1.2260536398467433e-06, + "loss": 0.0002, + "step": 11610 + }, + { + "epoch": 44.0104214559387, + "grad_norm": 0.0004102112434338778, + "learning_rate": 1.2175393784589187e-06, + "loss": 0.0001, + "step": 11620 + }, + { + "epoch": 44.0111877394636, + "grad_norm": 
0.0004949597641825676, + "learning_rate": 1.2090251170710942e-06, + "loss": 0.0002, + "step": 11630 + }, + { + "epoch": 44.011954022988505, + "grad_norm": 0.0005726158851757646, + "learning_rate": 1.2005108556832696e-06, + "loss": 0.6512, + "step": 11640 + }, + { + "epoch": 44.01272030651341, + "grad_norm": 0.1821271926164627, + "learning_rate": 1.191996594295445e-06, + "loss": 0.0002, + "step": 11650 + }, + { + "epoch": 44.01348659003831, + "grad_norm": 0.0020704707130789757, + "learning_rate": 1.1834823329076204e-06, + "loss": 0.0003, + "step": 11660 + }, + { + "epoch": 44.014252873563215, + "grad_norm": 0.04254362732172012, + "learning_rate": 1.1749680715197956e-06, + "loss": 0.0002, + "step": 11670 + }, + { + "epoch": 44.01501915708812, + "grad_norm": 0.0006496690912172198, + "learning_rate": 1.166453810131971e-06, + "loss": 0.6609, + "step": 11680 + }, + { + "epoch": 44.01578544061303, + "grad_norm": 0.0007688644109293818, + "learning_rate": 1.1579395487441465e-06, + "loss": 0.0003, + "step": 11690 + }, + { + "epoch": 44.01655172413793, + "grad_norm": 0.0023576049134135246, + "learning_rate": 1.1494252873563219e-06, + "loss": 0.0003, + "step": 11700 + }, + { + "epoch": 44.01731800766284, + "grad_norm": 0.0004909622948616743, + "learning_rate": 1.1409110259684973e-06, + "loss": 0.0014, + "step": 11710 + }, + { + "epoch": 44.01808429118774, + "grad_norm": 0.0005817667115479708, + "learning_rate": 1.1323967645806727e-06, + "loss": 0.4629, + "step": 11720 + }, + { + "epoch": 44.018850574712644, + "grad_norm": 0.0340137705206871, + "learning_rate": 1.1238825031928481e-06, + "loss": 0.8261, + "step": 11730 + }, + { + "epoch": 44.01961685823755, + "grad_norm": 0.0010054216254502535, + "learning_rate": 1.1153682418050235e-06, + "loss": 0.0005, + "step": 11740 + }, + { + "epoch": 44.02, + "eval_accuracy": 0.7333333333333333, + "eval_loss": 1.5693700313568115, + "eval_runtime": 16.2882, + "eval_samples_per_second": 2.763, + "eval_steps_per_second": 2.763, + "step": 11745 + }, + { + "epoch": 45.000383141762455, + "grad_norm": 0.0026499666273593903, + "learning_rate": 1.106853980417199e-06, + "loss": 0.0003, + "step": 11750 + }, + { + "epoch": 45.00114942528736, + "grad_norm": 0.0010546314297243953, + "learning_rate": 1.0983397190293744e-06, + "loss": 0.5939, + "step": 11760 + }, + { + "epoch": 45.00191570881226, + "grad_norm": 0.0005378160858526826, + "learning_rate": 1.0898254576415498e-06, + "loss": 0.0002, + "step": 11770 + }, + { + "epoch": 45.002681992337166, + "grad_norm": 0.001701247412711382, + "learning_rate": 1.0813111962537252e-06, + "loss": 0.0003, + "step": 11780 + }, + { + "epoch": 45.00344827586207, + "grad_norm": 0.0022411306854337454, + "learning_rate": 1.0727969348659004e-06, + "loss": 0.6054, + "step": 11790 + }, + { + "epoch": 45.00421455938697, + "grad_norm": 0.010880502872169018, + "learning_rate": 1.0642826734780758e-06, + "loss": 0.0002, + "step": 11800 + }, + { + "epoch": 45.00498084291188, + "grad_norm": 0.0004433699941728264, + "learning_rate": 1.0557684120902512e-06, + "loss": 0.0004, + "step": 11810 + }, + { + "epoch": 45.00574712643678, + "grad_norm": 0.0003745293361134827, + "learning_rate": 1.0472541507024267e-06, + "loss": 0.0002, + "step": 11820 + }, + { + "epoch": 45.006513409961684, + "grad_norm": 0.0005537657416425645, + "learning_rate": 1.038739889314602e-06, + "loss": 0.0002, + "step": 11830 + }, + { + "epoch": 45.00727969348659, + "grad_norm": 0.0006434423266910017, + "learning_rate": 1.0302256279267775e-06, + "loss": 0.0114, + "step": 11840 + }, + { + 
"epoch": 45.00804597701149, + "grad_norm": 0.0012914492981508374, + "learning_rate": 1.021711366538953e-06, + "loss": 0.0003, + "step": 11850 + }, + { + "epoch": 45.008812260536395, + "grad_norm": 0.0007428836543112993, + "learning_rate": 1.0131971051511283e-06, + "loss": 0.0004, + "step": 11860 + }, + { + "epoch": 45.009578544061306, + "grad_norm": 0.062420498579740524, + "learning_rate": 1.0046828437633035e-06, + "loss": 0.0003, + "step": 11870 + }, + { + "epoch": 45.01034482758621, + "grad_norm": 0.007152738980948925, + "learning_rate": 9.96168582375479e-07, + "loss": 0.0002, + "step": 11880 + }, + { + "epoch": 45.01111111111111, + "grad_norm": 0.02456565760076046, + "learning_rate": 9.876543209876544e-07, + "loss": 0.0003, + "step": 11890 + }, + { + "epoch": 45.01187739463602, + "grad_norm": 0.0014226444764062762, + "learning_rate": 9.791400595998298e-07, + "loss": 0.0003, + "step": 11900 + }, + { + "epoch": 45.01264367816092, + "grad_norm": 0.0007156677893362939, + "learning_rate": 9.706257982120052e-07, + "loss": 0.6446, + "step": 11910 + }, + { + "epoch": 45.013409961685824, + "grad_norm": 0.001060871290974319, + "learning_rate": 9.621115368241806e-07, + "loss": 0.0002, + "step": 11920 + }, + { + "epoch": 45.01417624521073, + "grad_norm": 0.01419631578028202, + "learning_rate": 9.535972754363559e-07, + "loss": 0.0003, + "step": 11930 + }, + { + "epoch": 45.01494252873563, + "grad_norm": 0.0027811871841549873, + "learning_rate": 9.450830140485314e-07, + "loss": 0.0003, + "step": 11940 + }, + { + "epoch": 45.015708812260534, + "grad_norm": 0.011095247231423855, + "learning_rate": 9.365687526607068e-07, + "loss": 0.6532, + "step": 11950 + }, + { + "epoch": 45.01647509578544, + "grad_norm": 0.0004981970996595919, + "learning_rate": 9.280544912728822e-07, + "loss": 0.8515, + "step": 11960 + }, + { + "epoch": 45.01724137931034, + "grad_norm": 0.006281125824898481, + "learning_rate": 9.195402298850575e-07, + "loss": 0.6951, + "step": 11970 + }, + { + "epoch": 45.01800766283525, + "grad_norm": 0.023470468819141388, + "learning_rate": 9.110259684972329e-07, + "loss": 0.0003, + "step": 11980 + }, + { + "epoch": 45.018773946360156, + "grad_norm": 0.0008022257243283093, + "learning_rate": 9.025117071094083e-07, + "loss": 0.0175, + "step": 11990 + }, + { + "epoch": 45.01954022988506, + "grad_norm": 0.027053140103816986, + "learning_rate": 8.939974457215837e-07, + "loss": 0.0003, + "step": 12000 + }, + { + "epoch": 45.02, + "eval_accuracy": 0.7777777777777778, + "eval_loss": 1.653847575187683, + "eval_runtime": 17.1103, + "eval_samples_per_second": 2.63, + "eval_steps_per_second": 2.63, + "step": 12006 + }, + { + "epoch": 46.00030651340996, + "grad_norm": 0.0039772093296051025, + "learning_rate": 8.854831843337592e-07, + "loss": 0.5268, + "step": 12010 + }, + { + "epoch": 46.001072796934864, + "grad_norm": 0.0015084996121004224, + "learning_rate": 8.769689229459345e-07, + "loss": 0.485, + "step": 12020 + }, + { + "epoch": 46.00183908045977, + "grad_norm": 0.019918980076909065, + "learning_rate": 8.684546615581099e-07, + "loss": 0.0003, + "step": 12030 + }, + { + "epoch": 46.00260536398467, + "grad_norm": 0.016599399968981743, + "learning_rate": 8.599404001702853e-07, + "loss": 0.3725, + "step": 12040 + }, + { + "epoch": 46.00337164750958, + "grad_norm": 0.011276071891188622, + "learning_rate": 8.514261387824607e-07, + "loss": 0.6707, + "step": 12050 + }, + { + "epoch": 46.004137931034485, + "grad_norm": 0.0011176816187798977, + "learning_rate": 8.429118773946361e-07, + "loss": 0.4441, + "step": 
12060 + }, + { + "epoch": 46.00490421455939, + "grad_norm": 0.05207664519548416, + "learning_rate": 8.343976160068116e-07, + "loss": 0.0002, + "step": 12070 + }, + { + "epoch": 46.00567049808429, + "grad_norm": 0.0005521380226127803, + "learning_rate": 8.258833546189869e-07, + "loss": 0.0002, + "step": 12080 + }, + { + "epoch": 46.006436781609196, + "grad_norm": 0.0004713898815680295, + "learning_rate": 8.173690932311623e-07, + "loss": 0.7625, + "step": 12090 + }, + { + "epoch": 46.0072030651341, + "grad_norm": 0.006049466319382191, + "learning_rate": 8.088548318433377e-07, + "loss": 0.8684, + "step": 12100 + }, + { + "epoch": 46.007969348659, + "grad_norm": 0.010157289914786816, + "learning_rate": 8.003405704555131e-07, + "loss": 0.0003, + "step": 12110 + }, + { + "epoch": 46.00873563218391, + "grad_norm": 0.1842534989118576, + "learning_rate": 7.918263090676885e-07, + "loss": 0.0968, + "step": 12120 + }, + { + "epoch": 46.00950191570881, + "grad_norm": 0.001697059953585267, + "learning_rate": 7.833120476798637e-07, + "loss": 0.0005, + "step": 12130 + }, + { + "epoch": 46.010268199233714, + "grad_norm": 0.003639540169388056, + "learning_rate": 7.747977862920392e-07, + "loss": 0.0002, + "step": 12140 + }, + { + "epoch": 46.01103448275862, + "grad_norm": 0.0007196399383246899, + "learning_rate": 7.662835249042146e-07, + "loss": 0.0108, + "step": 12150 + }, + { + "epoch": 46.01180076628353, + "grad_norm": 0.001255286275409162, + "learning_rate": 7.5776926351639e-07, + "loss": 0.0001, + "step": 12160 + }, + { + "epoch": 46.01256704980843, + "grad_norm": 0.004178752191364765, + "learning_rate": 7.492550021285653e-07, + "loss": 0.0001, + "step": 12170 + }, + { + "epoch": 46.013333333333335, + "grad_norm": 0.0006210522842593491, + "learning_rate": 7.407407407407407e-07, + "loss": 0.0002, + "step": 12180 + }, + { + "epoch": 46.01409961685824, + "grad_norm": 0.0007648312603123486, + "learning_rate": 7.322264793529161e-07, + "loss": 0.0002, + "step": 12190 + }, + { + "epoch": 46.01486590038314, + "grad_norm": 0.00938662514090538, + "learning_rate": 7.237122179650916e-07, + "loss": 0.0002, + "step": 12200 + }, + { + "epoch": 46.015632183908046, + "grad_norm": 0.009409523569047451, + "learning_rate": 7.15197956577267e-07, + "loss": 0.0001, + "step": 12210 + }, + { + "epoch": 46.01639846743295, + "grad_norm": 0.017453685402870178, + "learning_rate": 7.066836951894424e-07, + "loss": 0.7013, + "step": 12220 + }, + { + "epoch": 46.01716475095785, + "grad_norm": 0.000773155246861279, + "learning_rate": 6.981694338016177e-07, + "loss": 0.0005, + "step": 12230 + }, + { + "epoch": 46.01793103448276, + "grad_norm": 0.014481929130852222, + "learning_rate": 6.896551724137931e-07, + "loss": 0.03, + "step": 12240 + }, + { + "epoch": 46.01869731800766, + "grad_norm": 0.21550171077251434, + "learning_rate": 6.811409110259685e-07, + "loss": 0.0004, + "step": 12250 + }, + { + "epoch": 46.019463601532564, + "grad_norm": 0.002777979476377368, + "learning_rate": 6.72626649638144e-07, + "loss": 0.0002, + "step": 12260 + }, + { + "epoch": 46.02, + "eval_accuracy": 0.7111111111111111, + "eval_loss": 1.8243345022201538, + "eval_runtime": 18.0214, + "eval_samples_per_second": 2.497, + "eval_steps_per_second": 2.497, + "step": 12267 + }, + { + "epoch": 47.00022988505747, + "grad_norm": 0.040269769728183746, + "learning_rate": 6.641123882503194e-07, + "loss": 0.5425, + "step": 12270 + }, + { + "epoch": 47.000996168582375, + "grad_norm": 0.0042386422865092754, + "learning_rate": 6.555981268624948e-07, + "loss": 0.0002, + 
"step": 12280 + }, + { + "epoch": 47.00176245210728, + "grad_norm": 0.009512203745543957, + "learning_rate": 6.470838654746701e-07, + "loss": 0.0004, + "step": 12290 + }, + { + "epoch": 47.00252873563218, + "grad_norm": 0.0015298279467970133, + "learning_rate": 6.385696040868455e-07, + "loss": 0.0932, + "step": 12300 + }, + { + "epoch": 47.003295019157086, + "grad_norm": 0.0006447223131544888, + "learning_rate": 6.300553426990209e-07, + "loss": 0.0002, + "step": 12310 + }, + { + "epoch": 47.00406130268199, + "grad_norm": 0.0025422817561775446, + "learning_rate": 6.215410813111964e-07, + "loss": 0.0001, + "step": 12320 + }, + { + "epoch": 47.00482758620689, + "grad_norm": 0.0007822861662134528, + "learning_rate": 6.130268199233717e-07, + "loss": 0.0001, + "step": 12330 + }, + { + "epoch": 47.005593869731804, + "grad_norm": 0.0012738342629745603, + "learning_rate": 6.045125585355471e-07, + "loss": 0.0002, + "step": 12340 + }, + { + "epoch": 47.00636015325671, + "grad_norm": 0.0005095800152048469, + "learning_rate": 5.959982971477225e-07, + "loss": 0.0001, + "step": 12350 + }, + { + "epoch": 47.00712643678161, + "grad_norm": 0.08711840957403183, + "learning_rate": 5.874840357598978e-07, + "loss": 0.0004, + "step": 12360 + }, + { + "epoch": 47.007892720306515, + "grad_norm": 0.000607455032877624, + "learning_rate": 5.789697743720732e-07, + "loss": 0.0003, + "step": 12370 + }, + { + "epoch": 47.00865900383142, + "grad_norm": 0.0007099256035871804, + "learning_rate": 5.704555129842486e-07, + "loss": 0.0002, + "step": 12380 + }, + { + "epoch": 47.00942528735632, + "grad_norm": 0.024994418025016785, + "learning_rate": 5.619412515964241e-07, + "loss": 0.666, + "step": 12390 + }, + { + "epoch": 47.010191570881226, + "grad_norm": 0.0008645387133583426, + "learning_rate": 5.534269902085995e-07, + "loss": 0.3129, + "step": 12400 + }, + { + "epoch": 47.01095785440613, + "grad_norm": 0.000409560336265713, + "learning_rate": 5.449127288207749e-07, + "loss": 0.0001, + "step": 12410 + }, + { + "epoch": 47.01172413793103, + "grad_norm": 0.004132567439228296, + "learning_rate": 5.363984674329502e-07, + "loss": 0.0002, + "step": 12420 + }, + { + "epoch": 47.01249042145594, + "grad_norm": 0.0008664773195050657, + "learning_rate": 5.278842060451256e-07, + "loss": 0.0003, + "step": 12430 + }, + { + "epoch": 47.01325670498084, + "grad_norm": 0.0006345092551782727, + "learning_rate": 5.19369944657301e-07, + "loss": 0.656, + "step": 12440 + }, + { + "epoch": 47.014022988505744, + "grad_norm": 0.002261986257508397, + "learning_rate": 5.108556832694765e-07, + "loss": 0.0002, + "step": 12450 + }, + { + "epoch": 47.014789272030654, + "grad_norm": 0.0006148297106847167, + "learning_rate": 5.023414218816518e-07, + "loss": 0.0004, + "step": 12460 + }, + { + "epoch": 47.01555555555556, + "grad_norm": 0.017736852169036865, + "learning_rate": 4.938271604938272e-07, + "loss": 0.0002, + "step": 12470 + }, + { + "epoch": 47.01632183908046, + "grad_norm": 0.03878958523273468, + "learning_rate": 4.853128991060026e-07, + "loss": 0.6275, + "step": 12480 + }, + { + "epoch": 47.017088122605365, + "grad_norm": 0.00793781690299511, + "learning_rate": 4.7679863771817797e-07, + "loss": 0.0002, + "step": 12490 + }, + { + "epoch": 47.01785440613027, + "grad_norm": 0.0006141722551546991, + "learning_rate": 4.682843763303534e-07, + "loss": 0.0003, + "step": 12500 + }, + { + "epoch": 47.01862068965517, + "grad_norm": 0.009914839640259743, + "learning_rate": 4.5977011494252875e-07, + "loss": 0.0019, + "step": 12510 + }, + { + "epoch": 
47.019386973180076, + "grad_norm": 0.00038367536035366356, + "learning_rate": 4.5125585355470417e-07, + "loss": 0.0001, + "step": 12520 + }, + { + "epoch": 47.02, + "eval_accuracy": 0.7555555555555555, + "eval_loss": 1.7648043632507324, + "eval_runtime": 18.542, + "eval_samples_per_second": 2.427, + "eval_steps_per_second": 2.427, + "step": 12528 + }, + { + "epoch": 48.000153256704984, + "grad_norm": 0.004572936799377203, + "learning_rate": 4.427415921668796e-07, + "loss": 0.0002, + "step": 12530 + }, + { + "epoch": 48.00091954022989, + "grad_norm": 0.000799658359028399, + "learning_rate": 4.3422733077905495e-07, + "loss": 0.0002, + "step": 12540 + }, + { + "epoch": 48.00168582375479, + "grad_norm": 0.003123689442873001, + "learning_rate": 4.2571306939123036e-07, + "loss": 0.0002, + "step": 12550 + }, + { + "epoch": 48.002452107279694, + "grad_norm": 0.000493537459988147, + "learning_rate": 4.171988080034058e-07, + "loss": 0.5813, + "step": 12560 + }, + { + "epoch": 48.0032183908046, + "grad_norm": 0.010670820251107216, + "learning_rate": 4.0868454661558115e-07, + "loss": 0.761, + "step": 12570 + }, + { + "epoch": 48.0039846743295, + "grad_norm": 0.00417841924354434, + "learning_rate": 4.0017028522775656e-07, + "loss": 0.0001, + "step": 12580 + }, + { + "epoch": 48.004750957854405, + "grad_norm": 206.2652587890625, + "learning_rate": 3.916560238399319e-07, + "loss": 0.0169, + "step": 12590 + }, + { + "epoch": 48.00551724137931, + "grad_norm": 0.0004213191568851471, + "learning_rate": 3.831417624521073e-07, + "loss": 0.0001, + "step": 12600 + }, + { + "epoch": 48.00628352490421, + "grad_norm": 0.007983396761119366, + "learning_rate": 3.7462750106428265e-07, + "loss": 0.0003, + "step": 12610 + }, + { + "epoch": 48.007049808429116, + "grad_norm": 0.0005406314157880843, + "learning_rate": 3.6611323967645807e-07, + "loss": 0.0032, + "step": 12620 + }, + { + "epoch": 48.00781609195402, + "grad_norm": 0.0017200593138113618, + "learning_rate": 3.575989782886335e-07, + "loss": 0.0001, + "step": 12630 + }, + { + "epoch": 48.00858237547893, + "grad_norm": 0.0007993885083124042, + "learning_rate": 3.4908471690080885e-07, + "loss": 0.0001, + "step": 12640 + }, + { + "epoch": 48.009348659003834, + "grad_norm": 0.00035948160802945495, + "learning_rate": 3.4057045551298427e-07, + "loss": 0.6733, + "step": 12650 + }, + { + "epoch": 48.01011494252874, + "grad_norm": 0.0006828159675933421, + "learning_rate": 3.320561941251597e-07, + "loss": 0.0144, + "step": 12660 + }, + { + "epoch": 48.01088122605364, + "grad_norm": 0.0004076082550454885, + "learning_rate": 3.2354193273733505e-07, + "loss": 0.0001, + "step": 12670 + }, + { + "epoch": 48.011647509578545, + "grad_norm": 0.009096966125071049, + "learning_rate": 3.1502767134951047e-07, + "loss": 0.0002, + "step": 12680 + }, + { + "epoch": 48.01241379310345, + "grad_norm": 0.03335254639387131, + "learning_rate": 3.0651340996168583e-07, + "loss": 0.0002, + "step": 12690 + }, + { + "epoch": 48.01318007662835, + "grad_norm": 0.005149259697645903, + "learning_rate": 2.9799914857386125e-07, + "loss": 0.6139, + "step": 12700 + }, + { + "epoch": 48.013946360153255, + "grad_norm": 0.002243490656837821, + "learning_rate": 2.894848871860366e-07, + "loss": 0.0001, + "step": 12710 + }, + { + "epoch": 48.01471264367816, + "grad_norm": 0.03874532878398895, + "learning_rate": 2.8097062579821203e-07, + "loss": 0.0002, + "step": 12720 + }, + { + "epoch": 48.01547892720306, + "grad_norm": 0.21416586637496948, + "learning_rate": 2.7245636441038745e-07, + "loss": 0.0004, + "step": 
12730 + }, + { + "epoch": 48.016245210727966, + "grad_norm": 0.0011970993364229798, + "learning_rate": 2.639421030225628e-07, + "loss": 0.0001, + "step": 12740 + }, + { + "epoch": 48.01701149425288, + "grad_norm": 0.0005035316571593285, + "learning_rate": 2.5542784163473823e-07, + "loss": 0.0006, + "step": 12750 + }, + { + "epoch": 48.01777777777778, + "grad_norm": 0.00046502408804371953, + "learning_rate": 2.469135802469136e-07, + "loss": 0.7216, + "step": 12760 + }, + { + "epoch": 48.018544061302684, + "grad_norm": 0.0005701349582523108, + "learning_rate": 2.3839931885908898e-07, + "loss": 0.2455, + "step": 12770 + }, + { + "epoch": 48.01931034482759, + "grad_norm": 0.0006246402626857162, + "learning_rate": 2.2988505747126437e-07, + "loss": 0.0002, + "step": 12780 + }, + { + "epoch": 48.02, + "eval_accuracy": 0.7333333333333333, + "eval_loss": 1.6705189943313599, + "eval_runtime": 17.3488, + "eval_samples_per_second": 2.594, + "eval_steps_per_second": 2.594, + "step": 12789 + }, + { + "epoch": 49.00007662835249, + "grad_norm": 0.0009901125449687243, + "learning_rate": 2.213707960834398e-07, + "loss": 0.0002, + "step": 12790 + }, + { + "epoch": 49.00084291187739, + "grad_norm": 0.00033536559203639627, + "learning_rate": 2.1285653469561518e-07, + "loss": 0.0002, + "step": 12800 + }, + { + "epoch": 49.001609195402295, + "grad_norm": 0.02358914539217949, + "learning_rate": 2.0434227330779057e-07, + "loss": 0.0004, + "step": 12810 + }, + { + "epoch": 49.002375478927206, + "grad_norm": 0.00042075422243215144, + "learning_rate": 1.9582801191996594e-07, + "loss": 0.5953, + "step": 12820 + }, + { + "epoch": 49.00314176245211, + "grad_norm": 0.0014315516455098987, + "learning_rate": 1.8731375053214133e-07, + "loss": 0.0001, + "step": 12830 + }, + { + "epoch": 49.00390804597701, + "grad_norm": 0.0005138420965522528, + "learning_rate": 1.7879948914431674e-07, + "loss": 0.0001, + "step": 12840 + }, + { + "epoch": 49.00467432950192, + "grad_norm": 0.0008787927799858153, + "learning_rate": 1.7028522775649214e-07, + "loss": 0.0001, + "step": 12850 + }, + { + "epoch": 49.00544061302682, + "grad_norm": 0.0018959089647978544, + "learning_rate": 1.6177096636866753e-07, + "loss": 0.0002, + "step": 12860 + }, + { + "epoch": 49.006206896551724, + "grad_norm": 0.002935798140242696, + "learning_rate": 1.5325670498084292e-07, + "loss": 0.0002, + "step": 12870 + }, + { + "epoch": 49.00697318007663, + "grad_norm": 0.021103687584400177, + "learning_rate": 1.447424435930183e-07, + "loss": 0.0019, + "step": 12880 + }, + { + "epoch": 49.00773946360153, + "grad_norm": 0.0006676113116554916, + "learning_rate": 1.3622818220519372e-07, + "loss": 0.0001, + "step": 12890 + }, + { + "epoch": 49.008505747126435, + "grad_norm": 0.0005559830460697412, + "learning_rate": 1.2771392081736911e-07, + "loss": 0.3907, + "step": 12900 + }, + { + "epoch": 49.00927203065134, + "grad_norm": 0.0017491438193246722, + "learning_rate": 1.1919965942954449e-07, + "loss": 0.5897, + "step": 12910 + }, + { + "epoch": 49.01003831417624, + "grad_norm": 0.0014832859160378575, + "learning_rate": 1.106853980417199e-07, + "loss": 0.0001, + "step": 12920 + }, + { + "epoch": 49.01080459770115, + "grad_norm": 0.001833673333749175, + "learning_rate": 1.0217113665389529e-07, + "loss": 0.0002, + "step": 12930 + }, + { + "epoch": 49.011570881226056, + "grad_norm": 0.00040809944039210677, + "learning_rate": 9.365687526607066e-08, + "loss": 0.0002, + "step": 12940 + }, + { + "epoch": 49.01233716475096, + "grad_norm": 0.0005316553870216012, + "learning_rate": 
8.514261387824607e-08, + "loss": 0.0002, + "step": 12950 + }, + { + "epoch": 49.013103448275864, + "grad_norm": 0.0025573335587978363, + "learning_rate": 7.662835249042146e-08, + "loss": 0.0003, + "step": 12960 + }, + { + "epoch": 49.01386973180077, + "grad_norm": 0.0003654268220998347, + "learning_rate": 6.811409110259686e-08, + "loss": 0.0004, + "step": 12970 + }, + { + "epoch": 49.01463601532567, + "grad_norm": 0.02465401589870453, + "learning_rate": 5.9599829714772246e-08, + "loss": 0.0001, + "step": 12980 + }, + { + "epoch": 49.015402298850574, + "grad_norm": 0.0006912121898494661, + "learning_rate": 5.108556832694764e-08, + "loss": 0.0002, + "step": 12990 + }, + { + "epoch": 49.01616858237548, + "grad_norm": 0.020561689510941505, + "learning_rate": 4.2571306939123034e-08, + "loss": 0.0003, + "step": 13000 + }, + { + "epoch": 49.01693486590038, + "grad_norm": 0.013793153688311577, + "learning_rate": 3.405704555129843e-08, + "loss": 0.0008, + "step": 13010 + }, + { + "epoch": 49.017701149425285, + "grad_norm": 0.0005276736337691545, + "learning_rate": 2.554278416347382e-08, + "loss": 0.1876, + "step": 13020 + }, + { + "epoch": 49.01846743295019, + "grad_norm": 0.0007357195718213916, + "learning_rate": 1.7028522775649215e-08, + "loss": 0.0001, + "step": 13030 + }, + { + "epoch": 49.01923371647509, + "grad_norm": 0.0007454999140463769, + "learning_rate": 8.514261387824608e-09, + "loss": 0.0001, + "step": 13040 + }, + { + "epoch": 49.02, + "grad_norm": 0.009587045758962631, + "learning_rate": 0.0, + "loss": 0.0008, + "step": 13050 + }, + { + "epoch": 49.02, + "eval_accuracy": 0.7555555555555555, + "eval_loss": 1.6518940925598145, + "eval_runtime": 20.5521, + "eval_samples_per_second": 2.19, + "eval_steps_per_second": 2.19, + "step": 13050 + }, + { + "epoch": 49.02, + "step": 13050, + "total_flos": 5.730289341462282e+19, + "train_loss": 0.6190862722078898, + "train_runtime": 11470.2888, + "train_samples_per_second": 1.138, + "train_steps_per_second": 1.138 + }, + { + "epoch": 49.02, + "eval_accuracy": 0.8222222222222222, + "eval_loss": 1.1478798389434814, + "eval_runtime": 17.189, + "eval_samples_per_second": 2.618, + "eval_steps_per_second": 2.618, + "step": 13050 + }, + { + "epoch": 49.02, + "eval_accuracy": 0.8222222222222222, + "eval_loss": 1.1478798389434814, + "eval_runtime": 17.1577, + "eval_samples_per_second": 2.623, + "eval_steps_per_second": 2.623, + "step": 13050 + } + ], + "logging_steps": 10, + "max_steps": 13050, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.730289341462282e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}