diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,7395 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 9.990375360923965,
+  "eval_steps": 500,
+  "global_step": 5190,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0019249278152069298,
+      "grad_norm": 9.235594749450684,
+      "learning_rate": 3.8535645472061657e-07,
+      "loss": 2.3328,
+      "step": 1
+    },
+    {
+      "epoch": 0.009624639076034648,
+      "grad_norm": 9.342337608337402,
+      "learning_rate": 1.9267822736030827e-06,
+      "loss": 2.3107,
+      "step": 5
+    },
+    {
+      "epoch": 0.019249278152069296,
+      "grad_norm": 8.154550552368164,
+      "learning_rate": 3.853564547206165e-06,
+      "loss": 2.3049,
+      "step": 10
+    },
+    {
+      "epoch": 0.028873917228103944,
+      "grad_norm": 5.9875688552856445,
+      "learning_rate": 5.780346820809249e-06,
+      "loss": 2.1949,
+      "step": 15
+    },
+    {
+      "epoch": 0.03849855630413859,
+      "grad_norm": 2.7122750282287598,
+      "learning_rate": 7.70712909441233e-06,
+      "loss": 2.0383,
+      "step": 20
+    },
+    {
+      "epoch": 0.04812319538017324,
+      "grad_norm": 1.6343287229537964,
+      "learning_rate": 9.633911368015415e-06,
+      "loss": 1.9244,
+      "step": 25
+    },
+    {
+      "epoch": 0.05774783445620789,
+      "grad_norm": 0.805985152721405,
+      "learning_rate": 1.1560693641618498e-05,
+      "loss": 1.8037,
+      "step": 30
+    },
+    {
+      "epoch": 0.06737247353224254,
+      "grad_norm": 0.685213029384613,
+      "learning_rate": 1.348747591522158e-05,
+      "loss": 1.7133,
+      "step": 35
+    },
+    {
+      "epoch": 0.07699711260827719,
+      "grad_norm": 0.5439901351928711,
+      "learning_rate": 1.541425818882466e-05,
+      "loss": 1.6271,
+      "step": 40
+    },
+    {
+      "epoch": 0.08662175168431184,
+      "grad_norm": 0.5319092273712158,
+      "learning_rate": 1.7341040462427746e-05,
+      "loss": 1.5405,
+      "step": 45
+    },
+    {
+      "epoch": 0.09624639076034648,
+      "grad_norm": 0.5163573026657104,
+      "learning_rate": 1.926782273603083e-05,
+      "loss": 1.4612,
+      "step": 50
+    },
+    {
+      "epoch": 0.10587102983638114,
+      "grad_norm": 0.4213581085205078,
+      "learning_rate": 2.119460500963391e-05,
+      "loss": 1.3647,
+      "step": 55
+    },
+    {
+      "epoch": 0.11549566891241578,
+      "grad_norm": 0.37413254380226135,
+      "learning_rate": 2.3121387283236996e-05,
+      "loss": 1.3279,
+      "step": 60
+    },
+    {
+      "epoch": 0.12512030798845045,
+      "grad_norm": 0.3393540680408478,
+      "learning_rate": 2.504816955684008e-05,
+      "loss": 1.2962,
+      "step": 65
+    },
+    {
+      "epoch": 0.1347449470644851,
+      "grad_norm": 0.3041280210018158,
+      "learning_rate": 2.697495183044316e-05,
+      "loss": 1.2746,
+      "step": 70
+    },
+    {
+      "epoch": 0.14436958614051973,
+      "grad_norm": 0.29960623383522034,
+      "learning_rate": 2.8901734104046245e-05,
+      "loss": 1.2432,
+      "step": 75
+    },
+    {
+      "epoch": 0.15399422521655437,
+      "grad_norm": 0.28563690185546875,
+      "learning_rate": 3.082851637764932e-05,
+      "loss": 1.224,
+      "step": 80
+    },
+    {
+      "epoch": 0.16361886429258904,
+      "grad_norm": 0.3082931339740753,
+      "learning_rate": 3.275529865125241e-05,
+      "loss": 1.2034,
+      "step": 85
+    },
+    {
+      "epoch": 0.17324350336862368,
+      "grad_norm": 0.3015296757221222,
+      "learning_rate": 3.468208092485549e-05,
+      "loss": 1.186,
+      "step": 90
+    },
+    {
+      "epoch": 0.18286814244465832,
+      "grad_norm": 0.330247163772583,
+      "learning_rate": 3.660886319845858e-05,
+      "loss": 1.1795,
+      "step": 95
+    },
+    {
+      "epoch": 0.19249278152069296,
+      "grad_norm": 0.30705899000167847,
+      "learning_rate": 3.853564547206166e-05,
+      "loss": 1.171,
+      "step": 100
+    },
+    {
+      "epoch": 0.20211742059672763,
+      "grad_norm": 0.3239520192146301,
+      "learning_rate": 4.046242774566474e-05,
+      "loss": 1.1629,
+      "step": 105
+    },
+    {
+      "epoch": 0.21174205967276227,
+      "grad_norm": 0.31190788745880127,
+      "learning_rate": 4.238921001926782e-05,
+      "loss": 1.1507,
+      "step": 110
+    },
+    {
+      "epoch": 0.22136669874879691,
+      "grad_norm": 0.3129926025867462,
+      "learning_rate": 4.43159922928709e-05,
+      "loss": 1.1597,
+      "step": 115
+    },
+    {
+      "epoch": 0.23099133782483156,
+      "grad_norm": 0.32413914799690247,
+      "learning_rate": 4.624277456647399e-05,
+      "loss": 1.1507,
+      "step": 120
+    },
+    {
+      "epoch": 0.24061597690086622,
+      "grad_norm": 0.41083359718322754,
+      "learning_rate": 4.816955684007707e-05,
+      "loss": 1.1259,
+      "step": 125
+    },
+    {
+      "epoch": 0.2502406159769009,
+      "grad_norm": 0.3095736801624298,
+      "learning_rate": 5.009633911368016e-05,
+      "loss": 1.124,
+      "step": 130
+    },
+    {
+      "epoch": 0.2598652550529355,
+      "grad_norm": 0.3358061611652374,
+      "learning_rate": 5.2023121387283234e-05,
+      "loss": 1.1299,
+      "step": 135
+    },
+    {
+      "epoch": 0.2694898941289702,
+      "grad_norm": 0.37028777599334717,
+      "learning_rate": 5.394990366088632e-05,
+      "loss": 1.1085,
+      "step": 140
+    },
+    {
+      "epoch": 0.2791145332050048,
+      "grad_norm": 0.3638240396976471,
+      "learning_rate": 5.58766859344894e-05,
+      "loss": 1.1139,
+      "step": 145
+    },
+    {
+      "epoch": 0.28873917228103946,
+      "grad_norm": 0.3208532929420471,
+      "learning_rate": 5.780346820809249e-05,
+      "loss": 1.0867,
+      "step": 150
+    },
+    {
+      "epoch": 0.2983638113570741,
+      "grad_norm": 0.325976699590683,
+      "learning_rate": 5.973025048169557e-05,
+      "loss": 1.0794,
+      "step": 155
+    },
+    {
+      "epoch": 0.30798845043310874,
+      "grad_norm": 0.3301510214805603,
+      "learning_rate": 6.165703275529865e-05,
+      "loss": 1.0811,
+      "step": 160
+    },
+    {
+      "epoch": 0.3176130895091434,
+      "grad_norm": 0.35519587993621826,
+      "learning_rate": 6.358381502890174e-05,
+      "loss": 1.076,
+      "step": 165
+    },
+    {
+      "epoch": 0.3272377285851781,
+      "grad_norm": 0.38242989778518677,
+      "learning_rate": 6.551059730250482e-05,
+      "loss": 1.0774,
+      "step": 170
+    },
+    {
+      "epoch": 0.3368623676612127,
+      "grad_norm": 0.3178574740886688,
+      "learning_rate": 6.74373795761079e-05,
+      "loss": 1.0678,
+      "step": 175
+    },
+    {
+      "epoch": 0.34648700673724736,
+      "grad_norm": 0.2955685257911682,
+      "learning_rate": 6.936416184971098e-05,
+      "loss": 1.0741,
+      "step": 180
+    },
+    {
+      "epoch": 0.35611164581328203,
+      "grad_norm": 0.3037715554237366,
+      "learning_rate": 7.129094412331408e-05,
+      "loss": 1.0649,
+      "step": 185
+    },
+    {
+      "epoch": 0.36573628488931664,
+      "grad_norm": 0.3199213445186615,
+      "learning_rate": 7.321772639691716e-05,
+      "loss": 1.0635,
+      "step": 190
+    },
+    {
+      "epoch": 0.3753609239653513,
+      "grad_norm": 0.317488431930542,
+      "learning_rate": 7.514450867052023e-05,
+      "loss": 1.0526,
+      "step": 195
+    },
+    {
+      "epoch": 0.3849855630413859,
+      "grad_norm": 0.3228258490562439,
+      "learning_rate": 7.707129094412332e-05,
+      "loss": 1.064,
+      "step": 200
+    },
+    {
+      "epoch": 0.3946102021174206,
+      "grad_norm": 0.2934040129184723,
+      "learning_rate": 7.89980732177264e-05,
+      "loss": 1.0544,
+      "step": 205
+    },
+    {
+      "epoch": 0.40423484119345526,
+      "grad_norm": 0.32170167565345764,
+      "learning_rate": 8.092485549132948e-05,
+      "loss": 1.0508,
+      "step": 210
+    },
+    {
+      "epoch": 0.4138594802694899,
+      "grad_norm": 0.29049986600875854,
+      "learning_rate": 8.285163776493256e-05,
+      "loss": 1.0611,
+      "step": 215
+    },
+    {
+      "epoch": 0.42348411934552455,
+      "grad_norm": 0.31131693720817566,
+      "learning_rate": 8.477842003853564e-05,
+      "loss": 1.0581,
+      "step": 220
+    },
+    {
+      "epoch": 0.4331087584215592,
+      "grad_norm": 0.2872338891029358,
+      "learning_rate": 8.670520231213874e-05,
+      "loss": 1.0512,
+      "step": 225
+    },
+    {
+      "epoch": 0.44273339749759383,
+      "grad_norm": 0.3063661754131317,
+      "learning_rate": 8.86319845857418e-05,
+      "loss": 1.0508,
+      "step": 230
+    },
+    {
+      "epoch": 0.4523580365736285,
+      "grad_norm": 0.30761733651161194,
+      "learning_rate": 9.05587668593449e-05,
+      "loss": 1.0549,
+      "step": 235
+    },
+    {
+      "epoch": 0.4619826756496631,
+      "grad_norm": 0.2758205533027649,
+      "learning_rate": 9.248554913294798e-05,
+      "loss": 1.0446,
+      "step": 240
+    },
+    {
+      "epoch": 0.4716073147256978,
+      "grad_norm": 0.3492432236671448,
+      "learning_rate": 9.441233140655106e-05,
+      "loss": 1.0511,
+      "step": 245
+    },
+    {
+      "epoch": 0.48123195380173245,
+      "grad_norm": 0.27041804790496826,
+      "learning_rate": 9.633911368015414e-05,
+      "loss": 1.0275,
+      "step": 250
+    },
+    {
+      "epoch": 0.49085659287776706,
+      "grad_norm": 0.2999095916748047,
+      "learning_rate": 9.826589595375723e-05,
+      "loss": 1.0433,
+      "step": 255
+    },
+    {
+      "epoch": 0.5004812319538018,
+      "grad_norm": 0.297323614358902,
+      "learning_rate": 0.00010019267822736032,
+      "loss": 1.0416,
+      "step": 260
+    },
+    {
+      "epoch": 0.5101058710298364,
+      "grad_norm": 0.3357987403869629,
+      "learning_rate": 0.00010211946050096339,
+      "loss": 1.0374,
+      "step": 265
+    },
+    {
+      "epoch": 0.519730510105871,
+      "grad_norm": 0.2953435778617859,
+      "learning_rate": 0.00010404624277456647,
+      "loss": 1.0352,
+      "step": 270
+    },
+    {
+      "epoch": 0.5293551491819056,
+      "grad_norm": 0.32853737473487854,
+      "learning_rate": 0.00010597302504816958,
+      "loss": 1.0529,
+      "step": 275
+    },
+    {
+      "epoch": 0.5389797882579404,
+      "grad_norm": 0.28152966499328613,
+      "learning_rate": 0.00010789980732177264,
+      "loss": 1.0427,
+      "step": 280
+    },
+    {
+      "epoch": 0.548604427333975,
+      "grad_norm": 0.2928714454174042,
+      "learning_rate": 0.00010982658959537572,
+      "loss": 1.0375,
+      "step": 285
+    },
+    {
+      "epoch": 0.5582290664100096,
+      "grad_norm": 0.29662230610847473,
+      "learning_rate": 0.0001117533718689788,
+      "loss": 1.0326,
+      "step": 290
+    },
+    {
+      "epoch": 0.5678537054860443,
+      "grad_norm": 0.2677823305130005,
+      "learning_rate": 0.00011368015414258189,
+      "loss": 1.0477,
+      "step": 295
+    },
+    {
+      "epoch": 0.5774783445620789,
+      "grad_norm": 0.2860727906227112,
+      "learning_rate": 0.00011560693641618498,
+      "loss": 1.0272,
+      "step": 300
+    },
+    {
+      "epoch": 0.5871029836381135,
+      "grad_norm": 0.2599497437477112,
+      "learning_rate": 0.00011753371868978806,
+      "loss": 1.0364,
+      "step": 305
+    },
+    {
+      "epoch": 0.5967276227141483,
+      "grad_norm": 0.26607978343963623,
+      "learning_rate": 0.00011946050096339114,
+      "loss": 1.0338,
+      "step": 310
+    },
+    {
+      "epoch": 0.6063522617901829,
+      "grad_norm": 0.2653907239437103,
+      "learning_rate": 0.00012138728323699422,
+      "loss": 1.0274,
+      "step": 315
+    },
+    {
+      "epoch": 0.6159769008662175,
+      "grad_norm": 0.2570829689502716,
+      "learning_rate": 0.0001233140655105973,
+      "loss": 1.0349,
+      "step": 320
+    },
+    {
+      "epoch": 0.6256015399422522,
+      "grad_norm": 0.2542014420032501,
+      "learning_rate": 0.00012524084778420039,
+      "loss": 1.0306,
+      "step": 325
+    },
+    {
+      "epoch": 0.6352261790182868,
+      "grad_norm": 0.2354612797498703,
+      "learning_rate": 0.00012716763005780348,
+      "loss": 1.0336,
+      "step": 330
+    },
+    {
+      "epoch": 0.6448508180943214,
+      "grad_norm": 0.26090219616889954,
+      "learning_rate": 0.00012909441233140655,
+      "loss": 1.0319,
+      "step": 335
+    },
+    {
+      "epoch": 0.6544754571703562,
+      "grad_norm": 0.2287357598543167,
+      "learning_rate": 0.00013102119460500964,
+      "loss": 1.0228,
+      "step": 340
+    },
+    {
+      "epoch": 0.6641000962463908,
+      "grad_norm": 0.2653840184211731,
+      "learning_rate": 0.0001329479768786127,
+      "loss": 1.019,
+      "step": 345
+    },
+    {
+      "epoch": 0.6737247353224254,
+      "grad_norm": 0.25462430715560913,
+      "learning_rate": 0.0001348747591522158,
+      "loss": 1.0289,
+      "step": 350
+    },
+    {
+      "epoch": 0.6833493743984601,
+      "grad_norm": 0.24566137790679932,
+      "learning_rate": 0.0001368015414258189,
+      "loss": 1.0286,
+      "step": 355
+    },
+    {
+      "epoch": 0.6929740134744947,
+      "grad_norm": 0.24448491632938385,
+      "learning_rate": 0.00013872832369942197,
+      "loss": 1.0195,
+      "step": 360
+    },
+    {
+      "epoch": 0.7025986525505293,
+      "grad_norm": 0.2303464114665985,
+      "learning_rate": 0.00014065510597302506,
+      "loss": 1.0328,
+      "step": 365
+    },
+    {
+      "epoch": 0.7122232916265641,
+      "grad_norm": 0.2552158832550049,
+      "learning_rate": 0.00014258188824662816,
+      "loss": 1.0366,
+      "step": 370
+    },
+    {
+      "epoch": 0.7218479307025987,
+      "grad_norm": 0.22079892456531525,
+      "learning_rate": 0.00014450867052023122,
+      "loss": 1.024,
+      "step": 375
+    },
+    {
+      "epoch": 0.7314725697786333,
+      "grad_norm": 0.32242512702941895,
+      "learning_rate": 0.00014643545279383432,
+      "loss": 1.033,
+      "step": 380
+    },
+    {
+      "epoch": 0.7410972088546679,
+      "grad_norm": 0.2999092936515808,
+      "learning_rate": 0.00014836223506743738,
+      "loss": 1.0204,
+      "step": 385
+    },
+    {
+      "epoch": 0.7507218479307026,
+      "grad_norm": 0.26794490218162537,
+      "learning_rate": 0.00015028901734104045,
+      "loss": 1.0305,
+      "step": 390
+    },
+    {
+      "epoch": 0.7603464870067372,
+      "grad_norm": 0.33896663784980774,
+      "learning_rate": 0.00015221579961464357,
+      "loss": 1.0308,
+      "step": 395
+    },
+    {
+      "epoch": 0.7699711260827719,
+      "grad_norm": 0.22748759388923645,
+      "learning_rate": 0.00015414258188824664,
+      "loss": 1.0197,
+      "step": 400
+    },
+    {
+      "epoch": 0.7795957651588066,
+      "grad_norm": 0.23324738442897797,
+      "learning_rate": 0.0001560693641618497,
+      "loss": 1.0131,
+      "step": 405
+    },
+    {
+      "epoch": 0.7892204042348412,
+      "grad_norm": 0.24805064499378204,
+      "learning_rate": 0.0001579961464354528,
+      "loss": 1.0094,
+      "step": 410
+    },
+    {
+      "epoch": 0.7988450433108758,
+      "grad_norm": 0.24965739250183105,
+      "learning_rate": 0.00015992292870905587,
+      "loss": 1.0203,
+      "step": 415
+    },
+    {
+      "epoch": 0.8084696823869105,
+      "grad_norm": 0.22509600222110748,
+      "learning_rate": 0.00016184971098265897,
+      "loss": 1.0265,
+      "step": 420
+    },
+    {
+      "epoch": 0.8180943214629451,
+      "grad_norm": 0.2149883359670639,
+      "learning_rate": 0.00016377649325626206,
+      "loss": 1.0171,
+      "step": 425
+    },
+    {
+      "epoch": 0.8277189605389798,
+      "grad_norm": 0.24780240654945374,
+      "learning_rate": 0.00016570327552986513,
+      "loss": 1.0144,
+      "step": 430
+    },
+    {
+      "epoch": 0.8373435996150145,
+      "grad_norm": 0.2780991494655609,
+      "learning_rate": 0.00016763005780346822,
+      "loss": 1.0145,
+      "step": 435
+    },
+    {
+      "epoch": 0.8469682386910491,
+      "grad_norm": 0.22135606408119202,
+      "learning_rate": 0.0001695568400770713,
+      "loss": 1.0187,
+      "step": 440
+    },
+    {
+      "epoch": 0.8565928777670837,
+      "grad_norm": 0.20605282485485077,
+      "learning_rate": 0.00017148362235067438,
+      "loss": 1.0197,
+      "step": 445
+    },
+    {
+      "epoch": 0.8662175168431184,
+      "grad_norm": 0.24270793795585632,
+      "learning_rate": 0.00017341040462427748,
+      "loss": 1.0106,
+      "step": 450
+    },
+    {
+      "epoch": 0.875842155919153,
+      "grad_norm": 0.24285346269607544,
+      "learning_rate": 0.00017533718689788055,
+      "loss": 1.0242,
+      "step": 455
+    },
+    {
+      "epoch": 0.8854667949951877,
+      "grad_norm": 0.21814145147800446,
+      "learning_rate": 0.0001772639691714836,
+      "loss": 1.0176,
+      "step": 460
+    },
+    {
+      "epoch": 0.8950914340712224,
+      "grad_norm": 0.22261013090610504,
+      "learning_rate": 0.00017919075144508673,
+      "loss": 1.0099,
+      "step": 465
+    },
+    {
+      "epoch": 0.904716073147257,
+      "grad_norm": 0.21424554288387299,
+      "learning_rate": 0.0001811175337186898,
+      "loss": 1.0097,
+      "step": 470
+    },
+    {
+      "epoch": 0.9143407122232916,
+      "grad_norm": 0.2335994988679886,
+      "learning_rate": 0.00018304431599229287,
+      "loss": 1.0179,
+      "step": 475
+    },
+    {
+      "epoch": 0.9239653512993262,
+      "grad_norm": 0.20568034052848816,
+      "learning_rate": 0.00018497109826589596,
+      "loss": 1.0067,
+      "step": 480
+    },
+    {
+      "epoch": 0.933589990375361,
+      "grad_norm": 0.20264984667301178,
+      "learning_rate": 0.00018689788053949903,
+      "loss": 1.0147,
+      "step": 485
+    },
+    {
+      "epoch": 0.9432146294513956,
+      "grad_norm": 0.2133115977048874,
+      "learning_rate": 0.00018882466281310213,
+      "loss": 1.0071,
+      "step": 490
+    },
+    {
+      "epoch": 0.9528392685274302,
+      "grad_norm": 0.2007424235343933,
+      "learning_rate": 0.00019075144508670522,
+      "loss": 1.0095,
+      "step": 495
+    },
+    {
+      "epoch": 0.9624639076034649,
+      "grad_norm": 0.20568867027759552,
+      "learning_rate": 0.0001926782273603083,
+      "loss": 1.0113,
+      "step": 500
+    },
+    {
+      "epoch": 0.9720885466794995,
+      "grad_norm": 0.19897951185703278,
+      "learning_rate": 0.00019460500963391138,
+      "loss": 1.0129,
+      "step": 505
+    },
+    {
+      "epoch": 0.9817131857555341,
+      "grad_norm": 0.21554742753505707,
+      "learning_rate": 0.00019653179190751445,
+      "loss": 1.0107,
+      "step": 510
+    },
+    {
+      "epoch": 0.9913378248315688,
+      "grad_norm": 0.20981793105602264,
+      "learning_rate": 0.00019845857418111754,
+      "loss": 1.0008,
+      "step": 515
+    },
+    {
+      "epoch": 0.9990375360923965,
+      "eval_loss": 2.1032063961029053,
+      "eval_runtime": 0.7869,
+      "eval_samples_per_second": 13.979,
+      "eval_steps_per_second": 2.542,
+      "step": 519
+    },
+    {
+      "epoch": 1.0009624639076036,
+      "grad_norm": 0.22678163647651672,
+      "learning_rate": 0.0001999999773822188,
+      "loss": 1.0012,
+      "step": 520
+    },
+    {
+      "epoch": 1.0105871029836382,
+      "grad_norm": 0.2608613967895508,
+      "learning_rate": 0.00019999918576095053,
+      "loss": 0.9875,
+      "step": 525
+    },
+    {
+      "epoch": 1.0202117420596728,
+      "grad_norm": 0.2601936459541321,
+      "learning_rate": 0.0001999972632608527,
+      "loss": 0.9805,
+      "step": 530
+    },
+    {
+      "epoch": 1.0298363811357074,
+      "grad_norm": 0.21544857323169708,
+      "learning_rate": 0.00019999420990366674,
+      "loss": 0.9805,
+      "step": 535
+    },
+    {
+      "epoch": 1.039461020211742,
+      "grad_norm": 0.20171190798282623,
+      "learning_rate": 0.00019999002572392255,
+      "loss": 0.9798,
+      "step": 540
+    },
+    {
+      "epoch": 1.0490856592877766,
+      "grad_norm": 0.2205726057291031,
+      "learning_rate": 0.0001999847107689386,
+      "loss": 0.9805,
+      "step": 545
+    },
+    {
+      "epoch": 1.0587102983638113,
+      "grad_norm": 0.20397739112377167,
+      "learning_rate": 0.0001999782650988211,
+      "loss": 0.9952,
+      "step": 550
+    },
+    {
+      "epoch": 1.068334937439846,
+      "grad_norm": 0.207752525806427,
+      "learning_rate": 0.00019997068878646333,
+      "loss": 0.9786,
+      "step": 555
+    },
+    {
+      "epoch": 1.0779595765158807,
+      "grad_norm": 0.2041793167591095,
+      "learning_rate": 0.0001999619819175449,
+      "loss": 0.9951,
+      "step": 560
+    },
+    {
+      "epoch": 1.0875842155919153,
+      "grad_norm": 0.19135500490665436,
+      "learning_rate": 0.00019995214459053075,
+      "loss": 0.9912,
+      "step": 565
+    },
+    {
+      "epoch": 1.09720885466795,
+      "grad_norm": 0.2038804590702057,
+      "learning_rate": 0.00019994117691667004,
+      "loss": 0.9821,
+      "step": 570
+    },
+    {
+      "epoch": 1.1068334937439845,
+      "grad_norm": 0.21948496997356415,
+      "learning_rate": 0.00019992907901999484,
+      "loss": 0.9933,
+      "step": 575
+    },
+    {
+      "epoch": 1.1164581328200192,
+      "grad_norm": 0.21123313903808594,
+      "learning_rate": 0.0001999158510373189,
+      "loss": 0.9723,
+      "step": 580
+    },
+    {
+      "epoch": 1.126082771896054,
+      "grad_norm": 0.2110896110534668,
+      "learning_rate": 0.00019990149311823588,
+      "loss": 0.9789,
+      "step": 585
+    },
+    {
+      "epoch": 1.1357074109720886,
+      "grad_norm": 0.20370599627494812,
+      "learning_rate": 0.00019988600542511766,
+      "loss": 0.9902,
+      "step": 590
+    },
+    {
+      "epoch": 1.1453320500481232,
+      "grad_norm": 0.19531656801700592,
+      "learning_rate": 0.00019986938813311284,
+      "loss": 0.9846,
+      "step": 595
+    },
+    {
+      "epoch": 1.1549566891241578,
+      "grad_norm": 0.2497565895318985,
+      "learning_rate": 0.00019985164143014432,
+      "loss": 0.9864,
+      "step": 600
+    },
+    {
+      "epoch": 1.1645813282001924,
+      "grad_norm": 0.2870050072669983,
+      "learning_rate": 0.00019983276551690745,
+      "loss": 0.9851,
+      "step": 605
+    },
+    {
+      "epoch": 1.174205967276227,
+      "grad_norm": 0.20774626731872559,
+      "learning_rate": 0.0001998127606068677,
+      "loss": 0.9819,
+      "step": 610
+    },
+    {
+      "epoch": 1.1838306063522617,
+      "grad_norm": 0.2567305266857147,
+      "learning_rate": 0.00019979162692625817,
+      "loss": 0.9754,
+      "step": 615
+    },
+    {
+      "epoch": 1.1934552454282965,
+      "grad_norm": 0.1896723359823227,
+      "learning_rate": 0.00019976936471407717,
+      "loss": 0.9762,
+      "step": 620
+    },
+    {
+      "epoch": 1.2030798845043311,
+      "grad_norm": 0.19382244348526,
+      "learning_rate": 0.00019974597422208533,
+      "loss": 0.9783,
+      "step": 625
+    },
+    {
+      "epoch": 1.2127045235803657,
+      "grad_norm": 0.19210918247699738,
+      "learning_rate": 0.00019972145571480295,
+      "loss": 0.9778,
+      "step": 630
+    },
+    {
+      "epoch": 1.2223291626564003,
+      "grad_norm": 0.2057211995124817,
+      "learning_rate": 0.00019969580946950695,
+      "loss": 0.9632,
+      "step": 635
+    },
+    {
+      "epoch": 1.231953801732435,
+      "grad_norm": 0.23469866812229156,
+      "learning_rate": 0.0001996690357762276,
+      "loss": 0.9824,
+      "step": 640
+    },
+    {
+      "epoch": 1.2415784408084698,
+      "grad_norm": 0.19450876116752625,
+      "learning_rate": 0.00019964113493774538,
+      "loss": 0.9788,
+      "step": 645
+    },
+    {
+      "epoch": 1.2512030798845044,
+      "grad_norm": 0.18963035941123962,
+      "learning_rate": 0.00019961210726958758,
+      "loss": 0.9854,
+      "step": 650
+    },
+    {
+      "epoch": 1.260827718960539,
+      "grad_norm": 0.2049696296453476,
+      "learning_rate": 0.00019958195310002457,
+      "loss": 0.9901,
+      "step": 655
+    },
+    {
+      "epoch": 1.2704523580365736,
+      "grad_norm": 0.18745918571949005,
+      "learning_rate": 0.00019955067277006633,
+      "loss": 0.9772,
+      "step": 660
+    },
+    {
+      "epoch": 1.2800769971126083,
+      "grad_norm": 0.1893537938594818,
+      "learning_rate": 0.00019951826663345827,
+      "loss": 0.9862,
+      "step": 665
+    },
+    {
+      "epoch": 1.2897016361886429,
+      "grad_norm": 0.18441106379032135,
+      "learning_rate": 0.00019948473505667757,
+      "loss": 0.9836,
+      "step": 670
+    },
+    {
+      "epoch": 1.2993262752646775,
+      "grad_norm": 0.21260684728622437,
+      "learning_rate": 0.00019945007841892884,
+      "loss": 0.9878,
+      "step": 675
+    },
+    {
+      "epoch": 1.3089509143407123,
+      "grad_norm": 0.19159361720085144,
+      "learning_rate": 0.00019941429711213982,
+      "loss": 1.0004,
+      "step": 680
+    },
+    {
+      "epoch": 1.318575553416747,
+      "grad_norm": 0.19893284142017365,
+      "learning_rate": 0.000199377391540957,
+      "loss": 0.9728,
+      "step": 685
+    },
+    {
+      "epoch": 1.3282001924927815,
+      "grad_norm": 0.2625219225883484,
+      "learning_rate": 0.00019933936212274115,
+      "loss": 0.9815,
+      "step": 690
+    },
+    {
+      "epoch": 1.3378248315688162,
+      "grad_norm": 0.20059077441692352,
+      "learning_rate": 0.00019930020928756232,
+      "loss": 0.9869,
+      "step": 695
+    },
+    {
+      "epoch": 1.3474494706448508,
+      "grad_norm": 0.19443583488464355,
+      "learning_rate": 0.00019925993347819532,
+      "loss": 0.9852,
+      "step": 700
+    },
+    {
+      "epoch": 1.3570741097208856,
+      "grad_norm": 0.19254858791828156,
+      "learning_rate": 0.00019921853515011438,
+      "loss": 0.9768,
+      "step": 705
+    },
+    {
+      "epoch": 1.36669874879692,
+      "grad_norm": 0.1973366141319275,
+      "learning_rate": 0.0001991760147714883,
+      "loss": 0.9865,
+      "step": 710
+    },
+    {
+      "epoch": 1.3763233878729548,
+      "grad_norm": 0.2019069492816925,
+      "learning_rate": 0.00019913237282317495,
+      "loss": 0.9701,
+      "step": 715
+    },
+    {
+      "epoch": 1.3859480269489894,
+      "grad_norm": 0.20254430174827576,
+      "learning_rate": 0.0001990876097987159,
+      "loss": 0.9867,
+      "step": 720
+    },
+    {
+      "epoch": 1.395572666025024,
+      "grad_norm": 0.2121659815311432,
+      "learning_rate": 0.00019904172620433078,
+      "loss": 0.9688,
+      "step": 725
+    },
+    {
+      "epoch": 1.4051973051010587,
+      "grad_norm": 0.2147083729505539,
+      "learning_rate": 0.00019899472255891176,
+      "loss": 0.9802,
+      "step": 730
+    },
+    {
+      "epoch": 1.4148219441770933,
+      "grad_norm": 0.21038152277469635,
+      "learning_rate": 0.0001989465993940174,
+      "loss": 0.9759,
+      "step": 735
+    },
+    {
+      "epoch": 1.4244465832531281,
+      "grad_norm": 0.21153226494789124,
+      "learning_rate": 0.00019889735725386683,
+      "loss": 0.9735,
+      "step": 740
+    },
+    {
+      "epoch": 1.4340712223291627,
+      "grad_norm": 0.2074025273323059,
+      "learning_rate": 0.00019884699669533347,
+      "loss": 0.9913,
+      "step": 745
+    },
+    {
+      "epoch": 1.4436958614051973,
+      "grad_norm": 0.21015384793281555,
+      "learning_rate": 0.00019879551828793892,
+      "loss": 0.9737,
+      "step": 750
+    },
+    {
+      "epoch": 1.453320500481232,
+      "grad_norm": 0.21345528960227966,
+      "learning_rate": 0.0001987429226138463,
+      "loss": 0.9675,
+      "step": 755
+    },
+    {
+      "epoch": 1.4629451395572666,
+      "grad_norm": 0.21284109354019165,
+      "learning_rate": 0.0001986892102678538,
+      "loss": 0.9787,
+      "step": 760
+    },
+    {
+      "epoch": 1.4725697786333012,
+      "grad_norm": 0.19105084240436554,
+      "learning_rate": 0.0001986343818573879,
+      "loss": 0.9714,
+      "step": 765
+    },
+    {
+      "epoch": 1.4821944177093358,
+      "grad_norm": 0.18031322956085205,
+      "learning_rate": 0.0001985784380024966,
+      "loss": 0.965,
+      "step": 770
+    },
+    {
+      "epoch": 1.4918190567853706,
+      "grad_norm": 0.19423770904541016,
+      "learning_rate": 0.00019852137933584215,
+      "loss": 0.9743,
+      "step": 775
+    },
+    {
+      "epoch": 1.5014436958614052,
+      "grad_norm": 0.1923457533121109,
+      "learning_rate": 0.0001984632065026943,
+      "loss": 0.9872,
+      "step": 780
+    },
+    {
+      "epoch": 1.5110683349374399,
+      "grad_norm": 0.1957743912935257,
+      "learning_rate": 0.0001984039201609226,
+      "loss": 0.9799,
+      "step": 785
+    },
+    {
+      "epoch": 1.5206929740134745,
+      "grad_norm": 0.17838570475578308,
+      "learning_rate": 0.0001983435209809892,
+      "loss": 0.9765,
+      "step": 790
+    },
+    {
+      "epoch": 1.530317613089509,
+      "grad_norm": 0.1872684508562088,
+      "learning_rate": 0.00019828200964594123,
+      "loss": 0.9768,
+      "step": 795
+    },
+    {
+      "epoch": 1.539942252165544,
+      "grad_norm": 0.19497379660606384,
+      "learning_rate": 0.00019821938685140298,
+      "loss": 0.9686,
+      "step": 800
+    },
+    {
+      "epoch": 1.5495668912415783,
+      "grad_norm": 0.18703444302082062,
+      "learning_rate": 0.00019815565330556816,
+      "loss": 0.9785,
+      "step": 805
+    },
+    {
+      "epoch": 1.5591915303176132,
+      "grad_norm": 0.18727166950702667,
+      "learning_rate": 0.00019809080972919181,
+      "loss": 0.9748,
+      "step": 810
+    },
+    {
+      "epoch": 1.5688161693936478,
+      "grad_norm": 0.19498740136623383,
+      "learning_rate": 0.00019802485685558222,
+      "loss": 0.975,
+      "step": 815
+    },
+    {
+      "epoch": 1.5784408084696824,
+      "grad_norm": 0.1881551891565323,
+      "learning_rate": 0.00019795779543059248,
+      "loss": 0.9749,
+      "step": 820
+    },
+    {
+      "epoch": 1.588065447545717,
+      "grad_norm": 0.17449571192264557,
+      "learning_rate": 0.00019788962621261226,
+      "loss": 0.9676,
+      "step": 825
+    },
+    {
+      "epoch": 1.5976900866217516,
+      "grad_norm": 0.1892375349998474,
+      "learning_rate": 0.0001978203499725591,
+      "loss": 0.9722,
+      "step": 830
+    },
+    {
+      "epoch": 1.6073147256977864,
+      "grad_norm": 0.19908097386360168,
+      "learning_rate": 0.00019774996749386968,
+      "loss": 0.9674,
+      "step": 835
+    },
+    {
+      "epoch": 1.6169393647738208,
+      "grad_norm": 0.17946478724479675,
+      "learning_rate": 0.00019767847957249108,
+      "loss": 0.9741,
+      "step": 840
+    },
+    {
+      "epoch": 1.6265640038498557,
+      "grad_norm": 0.17460967600345612,
+      "learning_rate": 0.0001976058870168716,
+      "loss": 0.9726,
+      "step": 845
+    },
+    {
+      "epoch": 1.6361886429258903,
+      "grad_norm": 0.17595893144607544,
+      "learning_rate": 0.0001975321906479518,
+      "loss": 0.9783,
+      "step": 850
+    },
+    {
+      "epoch": 1.645813282001925,
+      "grad_norm": 0.18718552589416504,
+      "learning_rate": 0.00019745739129915508,
+      "loss": 0.9746,
+      "step": 855
+    },
+    {
+      "epoch": 1.6554379210779597,
+      "grad_norm": 0.18480895459651947,
+      "learning_rate": 0.00019738148981637835,
+      "loss": 0.9675,
+      "step": 860
+    },
+    {
+      "epoch": 1.6650625601539941,
+      "grad_norm": 0.1780669093132019,
+      "learning_rate": 0.00019730448705798239,
+      "loss": 0.9648,
+      "step": 865
+    },
+    {
+      "epoch": 1.674687199230029,
+      "grad_norm": 0.17525720596313477,
+      "learning_rate": 0.00019722638389478217,
+      "loss": 0.9911,
+      "step": 870
+    },
+    {
+      "epoch": 1.6843118383060636,
+      "grad_norm": 0.1761050969362259,
+      "learning_rate": 0.00019714718121003705,
+      "loss": 0.9745,
+      "step": 875
+    },
+    {
+      "epoch": 1.6939364773820982,
+      "grad_norm": 0.193415105342865,
+      "learning_rate": 0.00019706687989944072,
+      "loss": 0.9669,
+      "step": 880
+    },
+    {
+      "epoch": 1.7035611164581328,
+      "grad_norm": 0.18116651475429535,
+      "learning_rate": 0.00019698548087111102,
+      "loss": 0.9573,
+      "step": 885
+    },
+    {
+      "epoch": 1.7131857555341674,
+      "grad_norm": 0.17790788412094116,
+      "learning_rate": 0.0001969029850455799,
+      "loss": 0.9738,
+      "step": 890
+    },
+    {
+      "epoch": 1.7228103946102022,
+      "grad_norm": 0.18143677711486816,
+      "learning_rate": 0.00019681939335578275,
+      "loss": 0.9641,
+      "step": 895
+    },
+    {
+      "epoch": 1.7324350336862366,
+      "grad_norm": 0.1727439910173416,
+      "learning_rate": 0.00019673470674704801,
+      "loss": 0.9612,
+      "step": 900
+    },
+    {
+      "epoch": 1.7420596727622715,
+      "grad_norm": 0.17776042222976685,
+      "learning_rate": 0.00019664892617708642,
+      "loss": 0.9704,
+      "step": 905
+    },
+    {
+      "epoch": 1.751684311838306,
+      "grad_norm": 0.1788305640220642,
+      "learning_rate": 0.00019656205261598016,
+      "loss": 0.9822,
+      "step": 910
+    },
+    {
+      "epoch": 1.7613089509143407,
+      "grad_norm": 0.18292832374572754,
+      "learning_rate": 0.00019647408704617192,
+      "loss": 0.981,
+      "step": 915
+    },
+    {
+      "epoch": 1.7709335899903753,
+      "grad_norm": 0.1903613954782486,
+      "learning_rate": 0.00019638503046245383,
+      "loss": 0.9815,
+      "step": 920
+    },
+    {
+      "epoch": 1.78055822906641,
+      "grad_norm": 0.18801650404930115,
+      "learning_rate": 0.00019629488387195614,
+      "loss": 0.9723,
+      "step": 925
+    },
+    {
+      "epoch": 1.7901828681424448,
+      "grad_norm": 0.19215719401836395,
+      "learning_rate": 0.0001962036482941359,
+      "loss": 0.9785,
+      "step": 930
+    },
+    {
+      "epoch": 1.7998075072184792,
+      "grad_norm": 0.1913854032754898,
+      "learning_rate": 0.00019611132476076527,
+      "loss": 0.9661,
+      "step": 935
+    },
+    {
+      "epoch": 1.809432146294514,
+      "grad_norm": 0.19718807935714722,
+      "learning_rate": 0.00019601791431592006,
+      "loss": 0.9791,
+      "step": 940
+    },
+    {
+      "epoch": 1.8190567853705486,
+      "grad_norm": 0.18217253684997559,
+      "learning_rate": 0.00019592341801596787,
+      "loss": 0.9575,
+      "step": 945
+    },
+    {
+      "epoch": 1.8286814244465832,
+      "grad_norm": 0.17967750132083893,
+      "learning_rate": 0.00019582783692955605,
+      "loss": 0.9637,
+      "step": 950
+    },
+    {
+      "epoch": 1.838306063522618,
+      "grad_norm": 0.17850783467292786,
+      "learning_rate": 0.00019573117213759957,
+      "loss": 0.9605,
+      "step": 955
+    },
+    {
+      "epoch": 1.8479307025986524,
+      "grad_norm": 0.19147521257400513,
+      "learning_rate": 0.00019563342473326913,
+      "loss": 0.9818,
+      "step": 960
+    },
+    {
+      "epoch": 1.8575553416746873,
+      "grad_norm": 0.17569051682949066,
+      "learning_rate": 0.00019553459582197835,
+      "loss": 0.9642,
+      "step": 965
+    },
+    {
+      "epoch": 1.867179980750722,
+      "grad_norm": 0.18762874603271484,
+      "learning_rate": 0.00019543468652137157,
+      "loss": 0.9744,
+      "step": 970
+    },
+    {
+      "epoch": 1.8768046198267565,
+      "grad_norm": 0.17426376044750214,
+      "learning_rate": 0.00019533369796131118,
+      "loss": 0.9725,
+      "step": 975
+    },
+    {
+      "epoch": 1.8864292589027911,
+      "grad_norm": 0.21174634993076324,
+      "learning_rate": 0.00019523163128386465,
+      "loss": 0.9793,
+      "step": 980
+    },
+    {
+      "epoch": 1.8960538979788257,
+      "grad_norm": 0.19356350600719452,
+      "learning_rate": 0.00019512848764329188,
+      "loss": 0.9632,
+      "step": 985
+    },
+    {
+      "epoch": 1.9056785370548606,
+      "grad_norm": 0.1924716979265213,
+      "learning_rate": 0.00019502426820603192,
+      "loss": 0.9791,
+      "step": 990
+    },
+    {
+      "epoch": 1.915303176130895,
+      "grad_norm": 0.20623841881752014,
+      "learning_rate": 0.00019491897415068997,
+      "loss": 0.9678,
+      "step": 995
+    },
+    {
+      "epoch": 1.9249278152069298,
+      "grad_norm": 0.1916794627904892,
+      "learning_rate": 0.00019481260666802386,
+      "loss": 0.9677,
+      "step": 1000
+    },
+    {
+      "epoch": 1.9345524542829644,
+      "grad_norm": 0.17562657594680786,
+      "learning_rate": 0.00019470516696093072,
+      "loss": 0.9818,
+      "step": 1005
+    },
+    {
+      "epoch": 1.944177093358999,
+      "grad_norm": 0.17680735886096954,
+      "learning_rate": 0.00019459665624443342,
+      "loss": 0.9789,
+      "step": 1010
+    },
+    {
+      "epoch": 1.9538017324350336,
+      "grad_norm": 0.17583592236042023,
+      "learning_rate": 0.00019448707574566657,
+      "loss": 0.9758,
+      "step": 1015
+    },
+    {
+      "epoch": 1.9634263715110682,
+      "grad_norm": 0.18164704740047455,
+      "learning_rate": 0.00019437642670386304,
+      "loss": 0.9596,
+      "step": 1020
+    },
+    {
+      "epoch": 1.973051010587103,
+      "grad_norm": 0.16976359486579895,
+      "learning_rate": 0.0001942647103703395,
+      "loss": 0.9725,
+      "step": 1025
+    },
+    {
+      "epoch": 1.9826756496631375,
+      "grad_norm": 0.16864246129989624,
+      "learning_rate": 0.00019415192800848263,
+      "loss": 0.9788,
+      "step": 1030
+    },
+    {
+      "epoch": 1.9923002887391723,
+      "grad_norm": 0.17933247983455658,
+      "learning_rate": 0.00019403808089373472,
+      "loss": 0.9747,
+      "step": 1035
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 2.1443910598754883,
+      "eval_runtime": 0.7795,
+      "eval_samples_per_second": 14.112,
+      "eval_steps_per_second": 2.566,
+      "step": 1039
+    },
+    {
+      "epoch": 2.001924927815207,
+      "grad_norm": 0.17538660764694214,
+      "learning_rate": 0.00019392317031357908,
+      "loss": 0.9576,
+      "step": 1040
+    },
+    {
+      "epoch": 2.0115495668912415,
+      "grad_norm": 0.18830431997776031,
+      "learning_rate": 0.00019380719756752584,
+      "loss": 0.9117,
+      "step": 1045
+    },
+    {
+      "epoch": 2.0211742059672764,
+      "grad_norm": 0.18357954919338226,
+      "learning_rate": 0.00019369016396709681,
+      "loss": 0.9149,
+      "step": 1050
+    },
+    {
+      "epoch": 2.0307988450433108,
+      "grad_norm": 0.19075176119804382,
+      "learning_rate": 0.000193572070835811,
+      "loss": 0.9114,
+      "step": 1055
+    },
+    {
+      "epoch": 2.0404234841193456,
+      "grad_norm": 0.19288337230682373,
+      "learning_rate": 0.0001934529195091695,
+      "loss": 0.9061,
+      "step": 1060
+    },
+    {
+      "epoch": 2.05004812319538,
+      "grad_norm": 0.1923680603504181,
+      "learning_rate": 0.00019333271133464047,
+      "loss": 0.9165,
+      "step": 1065
+    },
+    {
+      "epoch": 2.059672762271415,
+      "grad_norm": 0.19743940234184265,
+      "learning_rate": 0.00019321144767164367,
+      "loss": 0.9115,
+      "step": 1070
+    },
+    {
+      "epoch": 2.0692974013474497,
+      "grad_norm": 0.18134470283985138,
+      "learning_rate": 0.00019308912989153548,
+      "loss": 0.9117,
+      "step": 1075
+    },
+    {
+      "epoch": 2.078922040423484,
+      "grad_norm": 0.19912441074848175,
+      "learning_rate": 0.00019296575937759292,
+      "loss": 0.9139,
+      "step": 1080
+    },
+    {
+      "epoch": 2.088546679499519,
+      "grad_norm": 0.20187345147132874,
+      "learning_rate": 0.00019284133752499848,
+      "loss": 0.9233,
+      "step": 1085
+    },
+    {
+      "epoch": 2.0981713185755533,
+      "grad_norm": 0.19697998464107513,
+      "learning_rate": 0.00019271586574082393,
+      "loss": 0.9189,
+      "step": 1090
+    },
+    {
+      "epoch": 2.107795957651588,
+      "grad_norm": 0.1886579543352127,
+      "learning_rate": 0.0001925893454440147,
+      "loss": 0.9157,
+      "step": 1095
+    },
+    {
+      "epoch": 2.1174205967276225,
+      "grad_norm": 0.1850527673959732,
+      "learning_rate": 0.00019246177806537377,
+      "loss": 0.9173,
+      "step": 1100
+    },
+    {
+      "epoch": 2.1270452358036573,
+      "grad_norm": 0.19263537228107452,
+      "learning_rate": 0.00019233316504754523,
+      "loss": 0.9213,
+      "step": 1105
+    },
+    {
+      "epoch": 2.136669874879692,
+      "grad_norm": 0.18643324077129364,
+      "learning_rate": 0.00019220350784499837,
+      "loss": 0.9281,
+      "step": 1110
+    },
+    {
+      "epoch": 2.1462945139557266,
+      "grad_norm": 0.20145340263843536,
+      "learning_rate": 0.00019207280792401098,
+      "loss": 0.9154,
+      "step": 1115
+    },
+    {
+      "epoch": 2.1559191530317614,
+      "grad_norm": 0.20724299550056458,
+      "learning_rate": 0.00019194106676265283,
+      "loss": 0.9216,
+      "step": 1120
+    },
+    {
+      "epoch": 2.165543792107796,
+      "grad_norm": 0.20987021923065186,
+      "learning_rate": 0.000191808285850769,
+      "loss": 0.9191,
+      "step": 1125
+    },
+    {
+      "epoch": 2.1751684311838306,
+      "grad_norm": 0.19462813436985016,
+      "learning_rate": 0.00019167446668996285,
+      "loss": 0.9206,
+      "step": 1130
+    },
+    {
+      "epoch": 2.1847930702598655,
+      "grad_norm": 0.18060922622680664,
+      "learning_rate": 0.00019153961079357935,
+      "loss": 0.9194,
+      "step": 1135
+    },
+    {
+      "epoch": 2.1944177093359,
+      "grad_norm": 0.19130302965641022,
+      "learning_rate": 0.00019140371968668767,
+      "loss": 0.9209,
+      "step": 1140
+    },
+    {
+      "epoch": 2.2040423484119347,
+      "grad_norm": 0.1925574392080307,
+      "learning_rate": 0.00019126679490606404,
+      "loss": 0.915,
+      "step": 1145
+    },
+    {
+      "epoch": 2.213666987487969,
+      "grad_norm": 0.18374784290790558,
+      "learning_rate": 0.00019112883800017448,
+      "loss": 0.9266,
+      "step": 1150
+    },
+    {
+      "epoch": 2.223291626564004,
+      "grad_norm": 0.1928727775812149,
+      "learning_rate": 0.0001909898505291571,
+      "loss": 0.9177,
+      "step": 1155
+    },
+    {
+      "epoch": 2.2329162656400383,
+      "grad_norm": 0.19703041017055511,
+      "learning_rate": 0.00019084983406480462,
+      "loss": 0.9129,
+      "step": 1160
+    },
+    {
+      "epoch": 2.242540904716073,
+      "grad_norm": 0.19135095179080963,
+      "learning_rate": 0.00019070879019054645,
+      "loss": 0.9204,
+      "step": 1165
+    },
+    {
+      "epoch": 2.252165543792108,
+      "grad_norm": 0.18242081999778748,
+      "learning_rate": 0.00019056672050143087,
+      "loss": 0.9158,
+      "step": 1170
+    },
+    {
+      "epoch": 2.2617901828681424,
+      "grad_norm": 0.19838295876979828,
+      "learning_rate": 0.00019042362660410706,
+      "loss": 0.9282,
+      "step": 1175
+    },
+    {
+      "epoch": 2.271414821944177,
+      "grad_norm": 0.1942119151353836,
+      "learning_rate": 0.0001902795101168068,
+      "loss": 0.9224,
+      "step": 1180
+    },
+    {
+      "epoch": 2.2810394610202116,
+      "grad_norm": 0.1880965530872345,
+      "learning_rate": 0.00019013437266932615,
+      "loss": 0.919,
+      "step": 1185
+    },
+    {
+      "epoch": 2.2906641000962464,
+      "grad_norm": 0.18855926394462585,
+      "learning_rate": 0.00018998821590300713,
+      "loss": 0.9314,
+      "step": 1190
+    },
+    {
+      "epoch": 2.300288739172281,
+      "grad_norm": 0.20218202471733093,
+      "learning_rate": 0.00018984104147071917,
+      "loss": 0.9209,
+      "step": 1195
+    },
+    {
+      "epoch": 2.3099133782483157,
+      "grad_norm": 0.19384799897670746,
+      "learning_rate": 0.00018969285103684032,
+      "loss": 0.9147,
+      "step": 1200
+    },
+    {
+      "epoch": 2.3195380173243505,
+      "grad_norm": 0.1903255134820938,
+      "learning_rate": 0.00018954364627723843,
+      "loss": 0.9178,
+      "step": 1205
+    },
+    {
+      "epoch": 2.329162656400385,
+      "grad_norm": 0.180522158741951,
+      "learning_rate": 0.00018939342887925234,
+      "loss": 0.9215,
+      "step": 1210
+    },
+    {
+      "epoch": 2.3387872954764197,
+      "grad_norm": 0.1928156316280365,
+      "learning_rate": 0.00018924220054167257,
+      "loss": 0.9274,
+      "step": 1215
+    },
+    {
+      "epoch": 2.348411934552454,
+      "grad_norm": 0.19860059022903442,
+      "learning_rate": 0.00018908996297472235,
+      "loss": 0.9281,
+      "step": 1220
+    },
+    {
+      "epoch": 2.358036573628489,
+      "grad_norm": 0.19085602462291718,
+      "learning_rate": 0.00018893671790003804,
+      "loss": 0.9288,
+      "step": 1225
+    },
+    {
+      "epoch": 2.3676612127045233,
+      "grad_norm": 0.20947015285491943,
+      "learning_rate": 0.00018878246705064994,
+      "loss": 0.9245,
+      "step": 1230
+    },
+    {
+      "epoch": 2.377285851780558,
+      "grad_norm": 0.2144593894481659,
+      "learning_rate": 0.00018862721217096243,
+      "loss": 0.9122,
+      "step": 1235
+    },
+    {
+      "epoch": 2.386910490856593,
+      "grad_norm": 0.2063259780406952,
+      "learning_rate": 0.00018847095501673438,
+      "loss": 0.915,
+      "step": 1240
+    },
+    {
+      "epoch": 2.3965351299326274,
+      "grad_norm": 0.19159218668937683,
+      "learning_rate": 0.0001883136973550592,
+      "loss": 0.9172,
+      "step": 1245
+    },
+    {
+      "epoch": 2.4061597690086622,
+      "grad_norm": 0.19970135390758514,
+      "learning_rate": 0.00018815544096434503,
+      "loss": 0.9356,
+      "step": 1250
+    },
+    {
+      "epoch": 2.4157844080846966,
+      "grad_norm": 0.19337432086467743,
+      "learning_rate": 0.00018799618763429445,
+      "loss": 0.9284,
+      "step": 1255
+    },
+    {
+      "epoch": 2.4254090471607315,
+      "grad_norm": 0.19304610788822174,
+      "learning_rate": 0.00018783593916588432,
+      "loss": 0.9278,
+      "step": 1260
+    },
+    {
+      "epoch": 2.4350336862367663,
+      "grad_norm": 0.18972693383693695,
+      "learning_rate": 0.00018767469737134538,
+      "loss": 0.9251,
+      "step": 1265
+    },
+    {
+      "epoch": 2.4446583253128007,
+      "grad_norm": 0.19995278120040894,
+      "learning_rate": 0.0001875124640741418,
+      "loss": 0.9231,
+      "step": 1270
+    },
+    {
+      "epoch": 2.4542829643888355,
+      "grad_norm": 0.1899886578321457,
+      "learning_rate": 0.00018734924110895055,
+      "loss": 0.9289,
+      "step": 1275
+    },
+    {
+      "epoch": 2.46390760346487,
+      "grad_norm": 0.1865253895521164,
+      "learning_rate": 0.0001871850303216406,
+      "loss": 0.9141,
+      "step": 1280
+    },
+    {
+      "epoch": 2.4735322425409048,
+      "grad_norm": 0.205548956990242,
+      "learning_rate": 0.00018701983356925214,
+      "loss": 0.92,
+      "step": 1285
+    },
+    {
+      "epoch": 2.4831568816169396,
+      "grad_norm": 0.20036041736602783,
+      "learning_rate": 0.00018685365271997544,
+      "loss": 0.9274,
+      "step": 1290
+    },
+    {
+      "epoch": 2.492781520692974,
+      "grad_norm": 0.20605804026126862,
+      "learning_rate": 0.00018668648965312982,
+      "loss": 0.9262,
+      "step": 1295
+    },
+    {
+      "epoch": 2.502406159769009,
+      "grad_norm": 0.19875019788742065,
+      "learning_rate": 0.00018651834625914247,
+      "loss": 0.9288,
+      "step": 1300
+    },
+    {
+      "epoch": 2.512030798845043,
+      "grad_norm": 0.20208601653575897,
+      "learning_rate": 0.00018634922443952693,
+      "loss": 0.9246,
+      "step": 1305
+    },
+    {
+      "epoch": 2.521655437921078,
+      "grad_norm": 0.20923365652561188,
+      "learning_rate": 0.00018617912610686155,
+      "loss": 0.9285,
+      "step": 1310
+    },
+    {
+      "epoch": 2.5312800769971124,
+      "grad_norm": 0.21708457171916962,
+      "learning_rate": 0.00018600805318476807,
+      "loss": 0.9244,
+      "step": 1315
+    },
+    {
+      "epoch": 2.5409047160731473,
+      "grad_norm": 0.19935211539268494,
+      "learning_rate": 0.00018583600760788967,
+      "loss": 0.9261,
+      "step": 1320
+    },
+    {
+      "epoch": 2.550529355149182,
+      "grad_norm": 0.19352373480796814,
+      "learning_rate": 0.00018566299132186925,
+      "loss": 0.9203,
+      "step": 1325
+    },
+    {
+      "epoch": 2.5601539942252165,
+      "grad_norm": 0.18096321821212769,
+      "learning_rate": 0.00018548900628332726,
+      "loss": 0.924,
+      "step": 1330
+    },
+    {
+      "epoch": 2.5697786333012513,
+      "grad_norm": 0.20240572094917297,
+      "learning_rate": 0.0001853140544598397,
+      "loss": 0.9242,
+      "step": 1335
+    },
+    {
+      "epoch": 2.5794032723772857,
+      "grad_norm": 0.18877889215946198,
+      "learning_rate": 0.00018513813782991578,
+      "loss": 0.9101,
+      "step": 1340
+    },
+    {
+      "epoch": 2.5890279114533206,
+      "grad_norm": 0.1912551075220108,
+      "learning_rate": 0.00018496125838297572,
+      "loss": 0.9201,
+      "step": 1345
+    },
+    {
+      "epoch": 2.598652550529355,
+      "grad_norm": 0.19026340544223785,
+      "learning_rate": 0.0001847834181193279,
+      "loss": 0.9356,
+      "step": 1350
+    },
+    {
+      "epoch": 2.60827718960539,
+      "grad_norm": 0.19470341503620148,
+      "learning_rate": 0.00018460461905014664,
+      "loss": 0.9213,
+      "step": 1355
+    },
+    {
+      "epoch": 2.6179018286814246,
+      "grad_norm": 0.1977526694536209,
+      "learning_rate": 0.00018442486319744926,
+      "loss": 0.9292,
+      "step": 1360
+    },
+    {
+      "epoch": 2.627526467757459,
+      "grad_norm": 0.19127926230430603,
+      "learning_rate": 0.00018424415259407317,
+      "loss": 0.9283,
+      "step": 1365
+    },
+    {
+      "epoch": 2.637151106833494,
+      "grad_norm": 0.18255840241909027,
+      "learning_rate": 0.00018406248928365295,
+      "loss": 0.9179,
+      "step": 1370
+    },
+    {
+      "epoch": 2.6467757459095282,
+      "grad_norm": 0.18344487249851227,
+      "learning_rate": 0.00018387987532059725,
+      "loss": 0.9397,
+      "step": 1375
+    },
+    {
+      "epoch": 2.656400384985563,
+      "grad_norm": 0.1913861185312271,
+      "learning_rate": 0.00018369631277006555,
+      "loss": 0.9248,
+      "step": 1380
+    },
+    {
+      "epoch": 2.6660250240615975,
+      "grad_norm": 0.1795121282339096,
+      "learning_rate": 0.00018351180370794479,
+      "loss": 0.9223,
+      "step": 1385
+    },
+    {
+      "epoch": 2.6756496631376323,
+      "grad_norm": 0.19478866457939148,
+      "learning_rate": 0.00018332635022082582,
+      "loss": 0.9282,
+      "step": 1390
+    },
+    {
+      "epoch": 2.685274302213667,
+      "grad_norm": 0.1917424350976944,
+      "learning_rate": 0.00018313995440598002,
+      "loss": 0.9228,
+      "step": 1395
+    },
+    {
+      "epoch": 2.6948989412897015,
+      "grad_norm": 0.18964500725269318,
+      "learning_rate": 0.00018295261837133532,
+      "loss": 0.928,
+      "step": 1400
+    },
+    {
+      "epoch": 2.7045235803657364,
+      "grad_norm": 0.19044145941734314,
+      "learning_rate": 0.00018276434423545253,
+      "loss": 0.926,
+      "step": 1405
+    },
+    {
+      "epoch": 2.714148219441771,
+      "grad_norm": 0.1876942664384842,
+      "learning_rate": 0.0001825751341275013,
+      "loss": 0.9224,
+      "step": 1410
+    },
+    {
+      "epoch": 2.7237728585178056,
+      "grad_norm": 0.19307979941368103,
+      "learning_rate": 0.00018238499018723614,
+      "loss": 0.9322,
+      "step": 1415
+    },
+    {
+      "epoch": 2.73339749759384,
+      "grad_norm": 0.1879437267780304,
+      "learning_rate": 0.00018219391456497216,
+      "loss": 0.9104,
+      "step": 1420
+    },
+    {
+      "epoch": 2.743022136669875,
+      "grad_norm": 0.2002253383398056,
+      "learning_rate": 0.00018200190942156062,
+      "loss": 0.9266,
+      "step": 1425
+    },
+    {
+      "epoch": 2.7526467757459097,
+      "grad_norm": 0.1822872757911682,
+      "learning_rate": 0.00018180897692836483,
+      "loss": 0.9245,
+      "step": 1430
+    },
+    {
+      "epoch": 2.762271414821944,
+      "grad_norm": 0.1884424090385437,
+      "learning_rate": 0.0001816151192672352,
+      "loss": 0.9273,
+      "step": 1435
+    },
+    {
+      "epoch": 2.771896053897979,
+      "grad_norm": 0.1969207227230072,
+      "learning_rate": 0.00018142033863048485,
+      "loss": 0.929,
+      "step": 1440
+    },
+    {
+      "epoch": 2.7815206929740137,
+      "grad_norm": 0.1919521689414978,
+      "learning_rate": 0.0001812246372208647,
+      "loss": 0.9213,
+      "step": 1445
+    },
+    {
+      "epoch": 2.791145332050048,
+      "grad_norm": 0.18795301020145416,
+      "learning_rate": 0.00018102801725153862,
+      "loss": 0.9281,
+      "step": 1450
+    },
+    {
+      "epoch": 2.8007699711260825,
+      "grad_norm": 0.19035767018795013,
+      "learning_rate": 0.00018083048094605825,
+      "loss": 0.9264,
+      "step": 1455
+    },
+    {
+      "epoch": 2.8103946102021173,
+      "grad_norm": 0.181080624461174,
+      "learning_rate": 0.0001806320305383381,
+      "loss": 0.926,
+      "step": 1460
+    },
+    {
+      "epoch": 2.820019249278152,
+      "grad_norm": 0.18840213119983673,
+      "learning_rate": 0.00018043266827263003,
+      "loss": 0.9327,
+      "step": 1465
+    },
+    {
+      "epoch": 2.8296438883541866,
+      "grad_norm": 0.18549908697605133,
+      "learning_rate": 0.0001802323964034981,
+      "loss": 0.9345,
+      "step": 1470
+    },
+    {
+      "epoch": 2.8392685274302214,
+      "grad_norm": 0.18507707118988037,
+      "learning_rate": 0.00018003121719579294,
+      "loss": 0.9243,
+      "step": 1475
+    },
+    {
+      "epoch": 2.8488931665062562,
+      "grad_norm": 0.19053645431995392,
+      "learning_rate": 0.0001798291329246261,
+      "loss": 0.9136,
+      "step": 1480
+    },
+    {
+      "epoch": 2.8585178055822906,
+      "grad_norm": 0.18798498809337616,
+      "learning_rate": 0.00017962614587534444,
+      "loss": 0.9296,
+      "step": 1485
+    },
+    {
+      "epoch": 2.8681424446583255,
+      "grad_norm": 0.19244647026062012,
+      "learning_rate": 0.00017942225834350424,
+      "loss": 0.9212,
+      "step": 1490
+    },
+    {
+      "epoch": 2.87776708373436,
+      "grad_norm": 0.18958385288715363,
+      "learning_rate": 0.00017921747263484518,
+      "loss": 0.9204,
+      "step": 1495
+    },
+    {
+      "epoch": 2.8873917228103947,
+      "grad_norm": 0.1872030794620514,
+      "learning_rate": 0.00017901179106526434,
+      "loss": 0.9167,
+      "step": 1500
+    },
+    {
+      "epoch": 2.897016361886429,
+      "grad_norm": 0.1842317432165146,
+      "learning_rate": 0.00017880521596079003,
+      "loss": 0.9295,
+      "step": 1505
+    },
+    {
+      "epoch": 2.906641000962464,
+      "grad_norm": 0.1908566802740097,
+      "learning_rate": 0.00017859774965755534,
+      "loss": 0.933,
+      "step": 1510
+    },
+    {
+      "epoch": 2.9162656400384988,
+      "grad_norm": 0.17877928912639618,
+      "learning_rate": 0.0001783893945017719,
+      "loss": 0.9209,
+      "step": 1515
+    },
+    {
+      "epoch": 2.925890279114533,
+      "grad_norm": 0.19019804894924164,
+      "learning_rate": 0.00017818015284970328,
+      "loss": 0.9298,
+      "step": 1520
+    },
+    {
+      "epoch": 2.935514918190568,
+      "grad_norm": 0.17898397147655487,
+      "learning_rate": 0.0001779700270676382,
+      "loss": 0.9149,
+      "step": 1525
+    },
+    {
+      "epoch": 2.9451395572666024,
+      "grad_norm": 0.19317851960659027,
+      "learning_rate": 0.0001777590195318641,
+      "loss": 0.9268,
+      "step": 1530
+    },
+    {
+      "epoch": 2.954764196342637,
+      "grad_norm": 0.1835252344608307,
+      "learning_rate": 0.00017754713262863985,
+      "loss": 0.9156,
+      "step": 1535
+    },
+    {
+      "epoch": 2.9643888354186716,
+      "grad_norm": 0.18219447135925293,
+      "learning_rate": 0.00017733436875416917,
+      "loss": 0.928,
+      "step": 1540
+    },
+    {
+      "epoch": 2.9740134744947064,
+      "grad_norm": 0.19455976784229279,
+      "learning_rate": 0.00017712073031457331,
+      "loss": 0.9358,
+      "step": 1545
+    },
+    {
+      "epoch": 2.9836381135707413,
+      "grad_norm": 0.19101083278656006,
+      "learning_rate": 0.0001769062197258637,
+      "loss": 0.919,
+      "step": 1550
+    },
+    {
+      "epoch": 2.9932627526467757,
+      "grad_norm": 0.1850951611995697,
+      "learning_rate": 0.00017669083941391502,
+      "loss": 0.9289,
+      "step": 1555
+    },
+    {
+      "epoch": 2.9990375360923966,
+      "eval_loss": 2.251723289489746,
+      "eval_runtime": 0.7901,
+      "eval_samples_per_second": 13.922,
+      "eval_steps_per_second": 2.531,
+      "step": 1558
+    },
+    {
+      "epoch": 3.0028873917228105,
+      "grad_norm": 0.1839417815208435,
+      "learning_rate": 0.00017647459181443739,
+      "loss": 0.9099,
+      "step": 1560
+    },
+    {
+      "epoch": 3.012512030798845,
+      "grad_norm": 0.21318542957305908,
+      "learning_rate": 0.0001762574793729491,
+      "loss": 0.8622,
+      "step": 1565
+    },
+    {
+      "epoch": 3.0221366698748797,
+      "grad_norm": 0.20732618868350983,
+      "learning_rate": 0.00017603950454474877,
+      "loss": 0.8502,
+      "step": 1570
+    },
+    {
+      "epoch": 3.0317613089509146,
+      "grad_norm": 0.20737336575984955,
+      "learning_rate": 0.00017582066979488764,
+      "loss": 0.8479,
+      "step": 1575
+    },
+    {
+      "epoch": 3.041385948026949,
+      "grad_norm": 0.2138897329568863,
+      "learning_rate": 0.00017560097759814172,
+      "loss": 0.8517,
+      "step": 1580
+    },
+    {
+      "epoch": 3.051010587102984,
+      "grad_norm": 0.20526482164859772,
+      "learning_rate": 0.00017538043043898376,
+      "loss": 0.8548,
+      "step": 1585
+    },
+    {
+      "epoch": 3.060635226179018,
+      "grad_norm": 0.21120765805244446,
+      "learning_rate": 0.00017515903081155525,
+      "loss": 0.8531,
+      "step": 1590
+    },
+    {
+      "epoch": 3.070259865255053,
+      "grad_norm": 0.20420415699481964,
+      "learning_rate": 0.00017493678121963807,
+      "loss": 0.8607,
+      "step": 1595
+    },
+    {
+      "epoch": 3.0798845043310874,
+      "grad_norm": 0.2265135943889618,
+      "learning_rate": 0.00017471368417662627,
+      "loss": 0.8638,
+      "step": 1600
+    },
+    {
+      "epoch": 3.0895091434071222,
+      "grad_norm": 0.2099863588809967,
+      "learning_rate": 0.00017448974220549764,
+      "loss": 0.8648,
+      "step": 1605
+    },
+    {
+      "epoch": 3.099133782483157,
+      "grad_norm": 0.2183115929365158,
+      "learning_rate": 0.00017426495783878508,
+      "loss": 0.8554,
+      "step": 1610
+    },
+    {
+      "epoch": 3.1087584215591915,
+      "grad_norm": 0.2061695158481598,
+      "learning_rate": 0.00017403933361854814,
+      "loss": 0.8561,
+      "step": 1615
+    },
+    {
+      "epoch": 3.1183830606352263,
+      "grad_norm": 0.21093107759952545,
+      "learning_rate": 0.0001738128720963442,
+      "loss": 0.8639,
+      "step": 1620
+    },
+    {
+      "epoch": 3.1280076997112607,
+      "grad_norm": 0.22155196964740753,
+      "learning_rate": 0.0001735855758331994,
+      "loss": 0.8687,
+      "step": 1625
+    },
+    {
+      "epoch": 3.1376323387872955,
+      "grad_norm": 0.21988868713378906,
+      "learning_rate": 0.0001733574473995801,
+      "loss": 0.8571,
+      "step": 1630
+    },
+    {
+      "epoch": 3.14725697786333,
+      "grad_norm": 0.20397303998470306,
+      "learning_rate": 0.00017312848937536338,
+      "loss": 0.8556,
+      "step": 1635
+    },
+    {
+      "epoch": 3.1568816169393648,
+      "grad_norm": 0.21777671575546265,
+      "learning_rate": 0.00017289870434980824,
+      "loss": 0.8657,
+      "step": 1640
+    },
+    {
+      "epoch": 3.1665062560153996,
+      "grad_norm": 0.20753996074199677,
+      "learning_rate": 0.00017266809492152597,
+      "loss": 0.8578,
+      "step": 1645
+    },
+    {
+      "epoch": 3.176130895091434,
+      "grad_norm": 0.22726857662200928,
+      "learning_rate": 0.00017243666369845103,
+      "loss": 0.8713,
+      "step": 1650
+    },
+    {
+      "epoch": 3.185755534167469,
+      "grad_norm": 0.20830857753753662,
+      "learning_rate": 0.00017220441329781147,
+      "loss": 0.8621,
+      "step": 1655
+    },
+    {
+      "epoch": 3.195380173243503,
+      "grad_norm": 0.21678543090820312,
+      "learning_rate": 0.00017197134634609924,
+      "loss": 0.8589,
+      "step": 1660
+    },
+    {
+      "epoch": 3.205004812319538,
+      "grad_norm": 0.21865533292293549,
+      "learning_rate": 0.00017173746547904063,
+      "loss": 0.872,
+      "step": 1665
+    },
+    {
+      "epoch": 3.214629451395573,
+      "grad_norm": 0.20973502099514008,
+      "learning_rate": 0.0001715027733415664,
+      "loss": 0.8624,
+      "step": 1670
+    },
+    {
+      "epoch": 3.2242540904716073,
+      "grad_norm": 0.21278487145900726,
+      "learning_rate": 0.00017126727258778187,
+      "loss": 0.8693,
+      "step": 1675
+    },
+    {
+      "epoch": 3.233878729547642,
+      "grad_norm": 0.2145373523235321,
+      "learning_rate": 0.00017103096588093686,
+      "loss": 0.8665,
+      "step": 1680
+    },
+    {
+      "epoch": 3.2435033686236765,
+      "grad_norm": 0.21175837516784668,
+      "learning_rate": 0.00017079385589339568,
+      "loss": 0.8592,
+      "step": 1685
+    },
+    {
+      "epoch": 3.2531280076997113,
+      "grad_norm": 0.21969176828861237,
+      "learning_rate": 0.00017055594530660678,
+      "loss": 0.8686,
+      "step": 1690
+    },
+    {
+      "epoch": 3.2627526467757457,
+      "grad_norm": 0.23275814950466156,
+      "learning_rate": 0.00017031723681107256,
+      "loss": 0.8643,
+      "step": 1695
+    },
+    {
+      "epoch": 3.2723772858517806,
+      "grad_norm": 0.22712193429470062,
+      "learning_rate": 0.0001700777331063188,
+      "loss": 0.8774,
+      "step": 1700
+    },
+    {
+      "epoch": 3.2820019249278154,
+      "grad_norm": 0.2357400804758072,
+      "learning_rate": 0.0001698374369008643,
+      "loss": 0.8654,
+      "step": 1705
+    },
+    {
+      "epoch": 3.29162656400385,
+      "grad_norm": 0.21586911380290985,
+      "learning_rate": 0.00016959635091219011,
+      "loss": 0.8682,
+      "step": 1710
+    },
+    {
+      "epoch": 3.3012512030798846,
+      "grad_norm": 0.20854496955871582,
+      "learning_rate": 0.00016935447786670875,
+      "loss": 0.872,
+      "step": 1715
+    },
+    {
+      "epoch": 3.310875842155919,
+      "grad_norm": 0.22415196895599365,
+      "learning_rate": 0.00016911182049973364,
+      "loss": 0.8691,
+      "step": 1720
+    },
+    {
+      "epoch": 3.320500481231954,
+      "grad_norm": 0.21514172852039337,
+      "learning_rate": 0.00016886838155544785,
+      "loss": 0.8662,
+      "step": 1725
+    },
+    {
+      "epoch": 3.3301251203079882,
+      "grad_norm": 0.21508009731769562,
+      "learning_rate": 0.0001686241637868734,
+      "loss": 0.8677,
+      "step": 1730
+    },
+    {
+      "epoch": 3.339749759384023,
+      "grad_norm": 0.21434170007705688,
+      "learning_rate": 0.00016837916995583965,
+      "loss": 0.8691,
+      "step": 1735
+    },
+    {
+      "epoch": 3.349374398460058,
+      "grad_norm": 0.21920685470104218,
+      "learning_rate": 0.00016813340283295265,
+      "loss": 0.8632,
+      "step": 1740
+    },
+    {
+      "epoch": 3.3589990375360923,
+      "grad_norm": 0.20799002051353455,
+      "learning_rate": 0.00016788686519756337,
+      "loss": 0.8711,
+      "step": 1745
+    },
+    {
+      "epoch": 3.368623676612127,
+      "grad_norm": 0.22760187089443207,
+      "learning_rate": 0.00016763955983773642,
+      "loss": 0.8716,
+      "step": 1750
+    },
+    {
+      "epoch": 3.3782483156881615,
+      "grad_norm": 0.20473913848400116,
+      "learning_rate": 0.00016739148955021853,
+      "loss": 0.8672,
+      "step": 1755
+    },
+    {
+      "epoch": 3.3878729547641964,
+      "grad_norm": 0.2237493246793747,
+      "learning_rate": 0.00016714265714040688,
+      "loss": 0.8711,
+      "step": 1760
+    },
+    {
+      "epoch": 3.3974975938402308,
+      "grad_norm": 0.21266481280326843,
+      "learning_rate": 0.00016689306542231754,
+      "loss": 0.8581,
+      "step": 1765
+    },
+    {
+      "epoch": 3.4071222329162656,
+      "grad_norm": 0.21926787495613098,
+      "learning_rate": 0.00016664271721855323,
+      "loss": 0.8647,
+      "step": 1770
+    },
+    {
+      "epoch": 3.4167468719923004,
+      "grad_norm": 0.21556758880615234,
+      "learning_rate": 0.00016639161536027196,
+      "loss": 0.8627,
+      "step": 1775
+    },
+    {
+      "epoch": 3.426371511068335,
+      "grad_norm": 0.22477813065052032,
+      "learning_rate": 0.00016613976268715458,
+      "loss": 0.8734,
+      "step": 1780
+    },
+    {
+      "epoch": 3.4359961501443697,
+      "grad_norm": 0.22144025564193726,
+      "learning_rate": 0.00016588716204737281,
+      "loss": 0.8633,
+      "step": 1785
+    },
+    {
+      "epoch": 3.445620789220404,
+      "grad_norm": 0.21546606719493866,
+      "learning_rate": 0.00016563381629755713,
+      "loss": 0.87,
+      "step": 1790
+    },
+    {
+      "epoch": 3.455245428296439,
+      "grad_norm": 0.21200338006019592,
+      "learning_rate": 0.00016537972830276424,
+      "loss": 0.8749,
+      "step": 1795
+    },
+    {
+      "epoch": 3.4648700673724737,
+      "grad_norm": 0.21702003479003906,
+      "learning_rate": 0.00016512490093644491,
+      "loss": 0.8736,
+      "step": 1800
+    },
+    {
+      "epoch": 3.474494706448508,
+      "grad_norm": 0.20890291035175323,
+      "learning_rate": 0.00016486933708041138,
+      "loss": 0.8658,
+      "step": 1805
+    },
+    {
+      "epoch": 3.484119345524543,
+      "grad_norm": 0.21432092785835266,
+      "learning_rate": 0.0001646130396248047,
+      "loss": 0.8671,
+      "step": 1810
+    },
+    {
+      "epoch": 3.4937439846005773,
+      "grad_norm": 0.21486730873584747,
+      "learning_rate": 0.0001643560114680621,
+      "loss": 0.8624,
+      "step": 1815
+    },
+    {
+      "epoch": 3.503368623676612,
+      "grad_norm": 0.2079630345106125,
+      "learning_rate": 0.0001640982555168843,
+      "loss": 0.8623,
+      "step": 1820
+    },
+    {
+      "epoch": 3.512993262752647,
+      "grad_norm": 0.21051821112632751,
+      "learning_rate": 0.00016383977468620252,
+      "loss": 0.8694,
+      "step": 1825
+    },
+    {
+      "epoch": 3.5226179018286814,
+      "grad_norm": 0.22331751883029938,
+      "learning_rate": 0.00016358057189914553,
+      "loss": 0.8867,
+      "step": 1830
+    },
+    {
+      "epoch": 3.5322425409047162,
+      "grad_norm": 0.21272289752960205,
+      "learning_rate": 0.00016332065008700666,
+      "loss": 0.8643,
+      "step": 1835
+    },
+    {
+      "epoch": 3.5418671799807506,
+      "grad_norm": 0.2075881063938141,
+      "learning_rate": 0.00016306001218921055,
+      "loss": 0.8758,
+      "step": 1840
+    },
+    {
+      "epoch": 3.5514918190567855,
+      "grad_norm": 0.21468383073806763,
+      "learning_rate": 0.00016279866115328012,
+      "loss": 0.8743,
+      "step": 1845
+    },
+    {
+      "epoch": 3.56111645813282,
+      "grad_norm": 0.20136167109012604,
+      "learning_rate": 0.00016253659993480284,
+      "loss": 0.874,
+      "step": 1850
+    },
+    {
+      "epoch": 3.5707410972088547,
+      "grad_norm": 0.2094564139842987,
+      "learning_rate": 0.00016227383149739776,
+      "loss": 0.8798,
+      "step": 1855
+    },
+    {
+      "epoch": 3.5803657362848895,
+      "grad_norm": 0.21963797509670258,
+      "learning_rate": 0.00016201035881268166,
+      "loss": 0.8751,
+      "step": 1860
+    },
+    {
+      "epoch": 3.589990375360924,
+      "grad_norm": 0.22210368514060974,
+      "learning_rate": 0.00016174618486023565,
+      "loss": 0.8709,
+      "step": 1865
+    },
+    {
+      "epoch": 3.5996150144369587,
+      "grad_norm": 0.22002506256103516,
+      "learning_rate": 0.00016148131262757134,
+      "loss": 0.8724,
+      "step": 1870
+    },
+    {
+      "epoch": 3.609239653512993,
+      "grad_norm": 0.21862515807151794,
+      "learning_rate": 0.0001612157451100971,
+      "loss": 0.8715,
+      "step": 1875
+    },
+    {
+      "epoch": 3.618864292589028,
+      "grad_norm": 0.21481823921203613,
+      "learning_rate": 0.0001609494853110843,
+      "loss": 0.8727,
+      "step": 1880
+    },
+    {
+      "epoch": 3.6284889316650624,
+      "grad_norm": 0.21671965718269348,
+      "learning_rate": 0.00016068253624163307,
+      "loss": 0.8695,
+      "step": 1885
+    },
+    {
+      "epoch": 3.638113570741097,
+      "grad_norm": 0.22262564301490784,
+      "learning_rate": 0.00016041490092063852,
+      "loss": 0.8707,
+      "step": 1890
+    },
+    {
+      "epoch": 3.647738209817132,
+      "grad_norm": 0.21777838468551636,
+      "learning_rate": 0.0001601465823747565,
+      "loss": 0.8719,
+      "step": 1895
+    },
+    {
+      "epoch": 3.6573628488931664,
+      "grad_norm": 0.2157593071460724,
+      "learning_rate": 0.00015987758363836932,
+      "loss": 0.8649,
+      "step": 1900
+    },
+    {
+      "epoch": 3.6669874879692013,
+      "grad_norm": 0.21907728910446167,
+      "learning_rate": 0.00015960790775355159,
+      "loss": 0.8727,
+      "step": 1905
+    },
+    {
+      "epoch": 3.6766121270452357,
+      "grad_norm": 0.2181127518415451,
+      "learning_rate": 0.00015933755777003552,
+      "loss": 0.8642,
+      "step": 1910
+    },
+    {
+      "epoch": 3.6862367661212705,
+      "grad_norm": 0.21002036333084106,
+      "learning_rate": 0.0001590665367451768,
+      "loss": 0.8853,
+      "step": 1915
+    },
+    {
+      "epoch": 3.695861405197305,
+      "grad_norm": 0.21628259122371674,
+      "learning_rate": 0.0001587948477439198,
+      "loss": 0.8781,
+      "step": 1920
+    },
+    {
+      "epoch": 3.7054860442733397,
+      "grad_norm": 0.21025903522968292,
+      "learning_rate": 0.00015852249383876285,
+      "loss": 0.8788,
+      "step": 1925
+    },
+    {
+      "epoch": 3.7151106833493746,
+      "grad_norm": 0.21036125719547272,
+      "learning_rate": 0.00015824947810972378,
+      "loss": 0.8769,
+      "step": 1930
+    },
+    {
+      "epoch": 3.724735322425409,
+      "grad_norm": 0.20949947834014893,
+      "learning_rate": 0.00015797580364430473,
+      "loss": 0.8689,
+      "step": 1935
+    },
+    {
+      "epoch": 3.734359961501444,
+      "grad_norm": 0.22593073546886444,
+      "learning_rate": 0.00015770147353745754,
+      "loss": 0.8763,
+      "step": 1940
+    },
+    {
+      "epoch": 3.7439846005774786,
+      "grad_norm": 0.22361914813518524,
+      "learning_rate": 0.00015742649089154858,
+      "loss": 0.8743,
+      "step": 1945
+    },
+    {
+      "epoch": 3.753609239653513,
+      "grad_norm": 0.21210341155529022,
+      "learning_rate": 0.00015715085881632366,
+      "loss": 0.8754,
+      "step": 1950
+    },
+    {
+      "epoch": 3.7632338787295474,
+      "grad_norm": 0.21233123540878296,
+      "learning_rate": 0.00015687458042887298,
+      "loss": 0.8823,
+      "step": 1955
+    },
+    {
+      "epoch": 3.7728585178055822,
+      "grad_norm": 0.20900115370750427,
+      "learning_rate": 0.00015659765885359572,
+      "loss": 0.8601,
+      "step": 1960
+    },
+    {
+      "epoch": 3.782483156881617,
+      "grad_norm": 0.20850348472595215,
+      "learning_rate": 0.0001563200972221649,
+      "loss": 0.8748,
+      "step": 1965
+    },
+    {
+      "epoch": 3.7921077959576515,
+      "grad_norm": 0.2235669642686844,
+      "learning_rate": 0.00015604189867349182,
+      "loss": 0.8767,
+      "step": 1970
+    },
+    {
+      "epoch": 3.8017324350336863,
+      "grad_norm": 0.20681613683700562,
+      "learning_rate": 0.00015576306635369053,
+      "loss": 0.87,
+      "step": 1975
+    },
+    {
+      "epoch": 3.811357074109721,
+      "grad_norm": 0.2126859724521637,
+      "learning_rate": 0.00015548360341604244,
+      "loss": 0.8767,
+      "step": 1980
+    },
+    {
+      "epoch": 3.8209817131857555,
+      "grad_norm": 0.21969568729400635,
+      "learning_rate": 0.00015520351302096043,
+      "loss": 0.8619,
+      "step": 1985
+    },
+    {
+      "epoch": 3.83060635226179,
+      "grad_norm": 0.20034681260585785,
+      "learning_rate": 0.0001549227983359533,
+      "loss": 0.879,
+      "step": 1990
+    },
+    {
+      "epoch": 3.8402309913378248,
+      "grad_norm": 0.22048155963420868,
+      "learning_rate": 0.00015464146253558987,
+      "loss": 0.8704,
+      "step": 1995
+    },
+    {
+      "epoch": 3.8498556304138596,
+      "grad_norm": 0.2217637300491333,
+      "learning_rate": 0.00015435950880146297,
+      "loss": 0.874,
+      "step": 2000
+    },
+    {
+      "epoch": 3.859480269489894,
+      "grad_norm": 0.2207387238740921,
+      "learning_rate": 0.00015407694032215375,
+      "loss": 0.871,
+      "step": 2005
+    },
+    {
+      "epoch": 3.869104908565929,
+      "grad_norm": 0.21759381890296936,
+      "learning_rate": 0.00015379376029319526,
+      "loss": 0.881,
+      "step": 2010
+    },
+    {
+      "epoch": 3.8787295476419636,
+      "grad_norm": 0.21979306638240814,
+      "learning_rate": 0.00015350997191703662,
+      "loss": 0.8707,
+      "step": 2015
+    },
+    {
+      "epoch": 3.888354186717998,
+      "grad_norm": 0.2088766098022461,
+      "learning_rate": 0.0001532255784030066,
+      "loss": 0.8715,
+      "step": 2020
+    },
+    {
+      "epoch": 3.897978825794033,
+      "grad_norm": 0.23208843171596527,
+      "learning_rate": 0.00015294058296727746,
+      "loss": 0.867,
+      "step": 2025
+    },
+    {
+      "epoch": 3.9076034648700673,
+      "grad_norm": 0.211493119597435,
+      "learning_rate": 0.00015265498883282848,
+      "loss": 0.8746,
+      "step": 2030
+    },
+    {
+      "epoch": 3.917228103946102,
+      "grad_norm": 0.2072470784187317,
+      "learning_rate": 0.00015236879922940952,
+      "loss": 0.8815,
+      "step": 2035
+    },
+    {
+      "epoch": 3.9268527430221365,
+      "grad_norm": 0.2107774019241333,
+      "learning_rate": 0.0001520820173935046,
+      "loss": 0.8762,
+      "step": 2040
+    },
+    {
+      "epoch": 3.9364773820981713,
+      "grad_norm": 0.22592873871326447,
+      "learning_rate": 0.00015179464656829526,
+      "loss": 0.8781,
+      "step": 2045
+    },
+    {
+      "epoch": 3.946102021174206,
+      "grad_norm": 0.210884690284729,
+      "learning_rate": 0.00015150669000362372,
+      "loss": 0.8759,
+      "step": 2050
+    },
+    {
+      "epoch": 3.9557266602502406,
+      "grad_norm": 0.22325028479099274,
+      "learning_rate": 0.00015121815095595631,
+      "loss": 0.8759,
+      "step": 2055
+    },
+    {
+      "epoch": 3.9653512993262754,
+      "grad_norm": 0.20822718739509583,
+      "learning_rate": 0.0001509290326883466,
+      "loss": 0.8743,
+      "step": 2060
+    },
+    {
+      "epoch": 3.97497593840231,
+      "grad_norm": 0.22340907156467438,
+      "learning_rate": 0.00015063933847039856,
+      "loss": 0.8768,
+      "step": 2065
+    },
+    {
+      "epoch": 3.9846005774783446,
+      "grad_norm": 0.21545882523059845,
+      "learning_rate": 0.0001503490715782294,
+      "loss": 0.8737,
+      "step": 2070
+    },
+    {
+      "epoch": 3.994225216554379,
+      "grad_norm": 0.21250423789024353,
+      "learning_rate": 0.00015005823529443268,
+      "loss": 0.8818,
+      "step": 2075
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 2.463193893432617,
+      "eval_runtime": 0.7794,
+      "eval_samples_per_second": 14.113,
+      "eval_steps_per_second": 2.566,
+      "step": 2078
+    },
+    {
+      "epoch": 4.003849855630414,
+      "grad_norm": 0.20480164885520935,
+      "learning_rate": 0.00014976683290804116,
+      "loss": 0.8452,
+      "step": 2080
+    },
+    {
+      "epoch": 4.013474494706449,
+      "grad_norm": 0.24909119307994843,
+      "learning_rate": 0.00014947486771448956,
+      "loss": 0.799,
+      "step": 2085
+    },
+    {
+      "epoch": 4.023099133782483,
+      "grad_norm": 0.2511972188949585,
+      "learning_rate": 0.00014918234301557732,
+      "loss": 0.7996,
+      "step": 2090
+    },
+    {
+      "epoch": 4.0327237728585175,
+      "grad_norm": 0.2290249615907669,
+      "learning_rate": 0.00014888926211943128,
+      "loss": 0.7821,
+      "step": 2095
+    },
+    {
+      "epoch": 4.042348411934553,
+      "grad_norm": 0.2516845762729645,
+      "learning_rate": 0.0001485956283404682,
+      "loss": 0.8027,
+      "step": 2100
+    },
+    {
+      "epoch": 4.051973051010587,
+      "grad_norm": 0.23257547616958618,
+      "learning_rate": 0.00014830144499935742,
+      "loss": 0.8051,
+      "step": 2105
+    },
+    {
+      "epoch": 4.0615976900866215,
+      "grad_norm": 0.23365622758865356,
+      "learning_rate": 0.00014800671542298312,
+      "loss": 0.8046,
+      "step": 2110
+    },
+    {
+      "epoch": 4.071222329162657,
+      "grad_norm": 0.25398579239845276,
+      "learning_rate": 0.00014771144294440682,
+      "loss": 0.7998,
+      "step": 2115
+    },
+    {
+      "epoch": 4.080846968238691,
+      "grad_norm": 0.25395774841308594,
+      "learning_rate": 0.00014741563090282965,
+      "loss": 0.7896,
+      "step": 2120
+    },
+    {
+      "epoch": 4.090471607314726,
+      "grad_norm": 0.23397642374038696,
+      "learning_rate": 0.00014711928264355466,
+      "loss": 0.7982,
+      "step": 2125
+    },
+    {
+      "epoch": 4.10009624639076,
+      "grad_norm": 0.24863800406455994,
+      "learning_rate": 0.0001468224015179488,
+      "loss": 0.8004,
+      "step": 2130
+    },
+    {
+      "epoch": 4.109720885466795,
+      "grad_norm": 0.24272161722183228,
+      "learning_rate": 0.00014652499088340523,
+      "loss": 0.7949,
+      "step": 2135
+    },
+    {
+      "epoch": 4.11934552454283,
+      "grad_norm": 0.24697747826576233,
+      "learning_rate": 0.00014622705410330522,
+      "loss": 0.792,
+      "step": 2140
+    },
+    {
+      "epoch": 4.128970163618864,
+      "grad_norm": 0.2412373572587967,
+      "learning_rate": 0.0001459285945469802,
+      "loss": 0.7999,
+      "step": 2145
+    },
+    {
+      "epoch": 4.138594802694899,
+      "grad_norm": 0.255993127822876,
+      "learning_rate": 0.0001456296155896736,
+      "loss": 0.7965,
+      "step": 2150
+    },
+    {
+      "epoch": 4.148219441770934,
+      "grad_norm": 0.23746897280216217,
+      "learning_rate": 0.00014533012061250264,
+      "loss": 0.8057,
+      "step": 2155
+    },
+    {
+      "epoch": 4.157844080846968,
+      "grad_norm": 0.24358995258808136,
+      "learning_rate": 0.00014503011300242023,
+      "loss": 0.8073,
+      "step": 2160
+    },
+    {
+      "epoch": 4.1674687199230025,
+      "grad_norm": 0.2651350498199463,
+      "learning_rate": 0.0001447295961521765,
+      "loss": 0.7961,
+      "step": 2165
+    },
+    {
+      "epoch": 4.177093358999038,
+      "grad_norm": 0.25750139355659485,
+      "learning_rate": 0.0001444285734602806,
+      "loss": 0.7961,
+      "step": 2170
+    },
+    {
+      "epoch": 4.186717998075072,
+      "grad_norm": 0.2478739321231842,
+      "learning_rate": 0.00014412704833096217,
+      "loss": 0.7955,
+      "step": 2175
+    },
+    {
+      "epoch": 4.196342637151107,
+      "grad_norm": 0.26254719495773315,
+      "learning_rate": 0.00014382502417413276,
+      "loss": 0.7929,
+      "step": 2180
+    },
+    {
+      "epoch": 4.205967276227142,
+      "grad_norm": 0.25435176491737366,
+      "learning_rate": 0.00014352250440534747,
+      "loss": 0.8052,
+      "step": 2185
+    },
+    {
+      "epoch": 4.215591915303176,
+      "grad_norm": 0.24811629951000214,
+      "learning_rate": 0.00014321949244576617,
+      "loss": 0.7989,
+      "step": 2190
+    },
+    {
+      "epoch": 4.225216554379211,
+      "grad_norm": 0.2621951103210449,
+      "learning_rate": 0.00014291599172211485,
+      "loss": 0.8092,
+      "step": 2195
+    },
+    {
+      "epoch": 4.234841193455245,
+      "grad_norm": 0.2780658006668091,
+      "learning_rate": 0.0001426120056666469,
+      "loss": 0.8058,
+      "step": 2200
+    },
+    {
+      "epoch": 4.24446583253128,
+      "grad_norm": 0.233393132686615,
+      "learning_rate": 0.0001423075377171043,
+      "loss": 0.8049,
+      "step": 2205
+    },
+    {
+      "epoch": 4.254090471607315,
+      "grad_norm": 0.26003360748291016,
+      "learning_rate": 0.00014200259131667858,
+      "loss": 0.8091,
+      "step": 2210
+    },
+    {
+      "epoch": 4.263715110683349,
+      "grad_norm": 0.25277137756347656,
+      "learning_rate": 0.00014169716991397214,
+      "loss": 0.8126,
+      "step": 2215
+    },
+    {
+      "epoch": 4.273339749759384,
+      "grad_norm": 0.23928789794445038,
+      "learning_rate": 0.00014139127696295912,
+      "loss": 0.8044,
+      "step": 2220
+    },
+    {
+      "epoch": 4.282964388835419,
+      "grad_norm": 0.254984587430954,
+      "learning_rate": 0.00014108491592294627,
+      "loss": 0.8036,
+      "step": 2225
+    },
+    {
+      "epoch": 4.292589027911453,
+      "grad_norm": 0.2602671682834625,
+      "learning_rate": 0.000140778090258534,
+      "loss": 0.8147,
+      "step": 2230
+    },
+    {
+      "epoch": 4.3022136669874875,
+      "grad_norm": 0.24539902806282043,
+      "learning_rate": 0.000140470803439577,
+      "loss": 0.8078,
+      "step": 2235
+    },
+    {
+      "epoch": 4.311838306063523,
+      "grad_norm": 0.24983367323875427,
+      "learning_rate": 0.00014016305894114516,
+      "loss": 0.8089,
+      "step": 2240
+    },
+    {
+      "epoch": 4.321462945139557,
+      "grad_norm": 0.2500509023666382,
+      "learning_rate": 0.0001398548602434842,
+      "loss": 0.8053,
+      "step": 2245
+    },
+    {
+      "epoch": 4.331087584215592,
+      "grad_norm": 0.24786844849586487,
+      "learning_rate": 0.00013954621083197628,
+      "loss": 0.8091,
+      "step": 2250
+    },
+    {
+      "epoch": 4.340712223291627,
+      "grad_norm": 0.2504083216190338,
+      "learning_rate": 0.00013923711419710076,
+      "loss": 0.8122,
+      "step": 2255
+    },
+    {
+      "epoch": 4.350336862367661,
+      "grad_norm": 0.24594616889953613,
+      "learning_rate": 0.0001389275738343944,
+      "loss": 0.8142,
+      "step": 2260
+    },
+    {
+      "epoch": 4.359961501443696,
+      "grad_norm": 0.25497034192085266,
+      "learning_rate": 0.00013861759324441223,
+      "loss": 0.8102,
+      "step": 2265
+    },
+    {
+      "epoch": 4.369586140519731,
+      "grad_norm": 0.26248982548713684,
+      "learning_rate": 0.00013830717593268764,
+      "loss": 0.8106,
+      "step": 2270
+    },
+    {
+      "epoch": 4.379210779595765,
+      "grad_norm": 0.24808135628700256,
+      "learning_rate": 0.00013799632540969286,
+      "loss": 0.8069,
+      "step": 2275
+    },
+    {
+      "epoch": 4.3888354186718,
+      "grad_norm": 0.2534014582633972,
+      "learning_rate": 0.00013768504519079923,
+      "loss": 0.8166,
+      "step": 2280
+    },
+    {
+      "epoch": 4.398460057747834,
+      "grad_norm": 0.24292294681072235,
+      "learning_rate": 0.0001373733387962376,
+      "loss": 0.8072,
+      "step": 2285
+    },
+    {
+      "epoch": 4.408084696823869,
+      "grad_norm": 0.24815544486045837,
+      "learning_rate": 0.00013706120975105822,
+      "loss": 0.8189,
+      "step": 2290
+    },
+    {
+      "epoch": 4.417709335899904,
+      "grad_norm": 0.24199172854423523,
+      "learning_rate": 0.00013674866158509117,
+      "loss": 0.8084,
+      "step": 2295
+    },
+    {
+      "epoch": 4.427333974975938,
+      "grad_norm": 0.26282939314842224,
+      "learning_rate": 0.00013643569783290622,
+      "loss": 0.8103,
+      "step": 2300
+    },
+    {
+      "epoch": 4.4369586140519734,
+      "grad_norm": 0.2644505202770233,
+      "learning_rate": 0.00013612232203377307,
+      "loss": 0.8106,
+      "step": 2305
+    },
+    {
+      "epoch": 4.446583253128008,
+      "grad_norm": 0.250636488199234,
+      "learning_rate": 0.0001358085377316211,
+      "loss": 0.823,
+      "step": 2310
+    },
+    {
+      "epoch": 4.456207892204042,
+      "grad_norm": 0.2760376036167145,
+      "learning_rate": 0.00013549434847499945,
+      "loss": 0.8109,
+      "step": 2315
+    },
+    {
+      "epoch": 4.465832531280077,
+      "grad_norm": 0.24669407308101654,
+      "learning_rate": 0.00013517975781703688,
+      "loss": 0.8135,
+      "step": 2320
+    },
+    {
+      "epoch": 4.475457170356112,
+      "grad_norm": 0.24369503557682037,
+      "learning_rate": 0.00013486476931540145,
+      "loss": 0.8083,
+      "step": 2325
+    },
+    {
+      "epoch": 4.485081809432146,
+      "grad_norm": 0.2656605839729309,
+      "learning_rate": 0.00013454938653226047,
+      "loss": 0.8082,
+      "step": 2330
+    },
+    {
+      "epoch": 4.494706448508181,
+      "grad_norm": 0.24139179289340973,
+      "learning_rate": 0.0001342336130342401,
+      "loss": 0.8046,
+      "step": 2335
+    },
+    {
+      "epoch": 4.504331087584216,
+      "grad_norm": 0.2464561015367508,
+      "learning_rate": 0.00013391745239238508,
+      "loss": 0.8205,
+      "step": 2340
+    },
+    {
+      "epoch": 4.51395572666025,
+      "grad_norm": 0.25290533900260925,
+      "learning_rate": 0.0001336009081821183,
+      "loss": 0.8135,
+      "step": 2345
+    },
+    {
+      "epoch": 4.523580365736285,
+      "grad_norm": 0.2681277096271515,
+      "learning_rate": 0.00013328398398320036,
+      "loss": 0.8111,
+      "step": 2350
+    },
+    {
+      "epoch": 4.53320500481232,
+      "grad_norm": 0.24826329946517944,
+      "learning_rate": 0.00013296668337968903,
+      "loss": 0.8161,
+      "step": 2355
+    },
+    {
+      "epoch": 4.542829643888354,
+      "grad_norm": 0.26754263043403625,
+      "learning_rate": 0.000132649009959899,
+      "loss": 0.8103,
+      "step": 2360
+    },
+    {
+      "epoch": 4.552454282964389,
+      "grad_norm": 0.2548888027667999,
+      "learning_rate": 0.00013233096731636088,
+      "loss": 0.8114,
+      "step": 2365
+    },
+    {
+      "epoch": 4.562078922040423,
+      "grad_norm": 0.2608910799026489,
+      "learning_rate": 0.00013201255904578095,
+      "loss": 0.8104,
+      "step": 2370
+    },
+    {
+      "epoch": 4.5717035611164585,
+      "grad_norm": 0.2469130903482437,
+      "learning_rate": 0.00013169378874900017,
+      "loss": 0.8084,
+      "step": 2375
+    },
+    {
+      "epoch": 4.581328200192493,
+      "grad_norm": 0.26305124163627625,
+      "learning_rate": 0.0001313746600309538,
+      "loss": 0.8198,
+      "step": 2380
+    },
+    {
+      "epoch": 4.590952839268527,
+      "grad_norm": 0.2730869650840759,
+      "learning_rate": 0.00013105517650063026,
+      "loss": 0.823,
+      "step": 2385
+    },
+    {
+      "epoch": 4.600577478344562,
+      "grad_norm": 0.25265151262283325,
+      "learning_rate": 0.0001307353417710306,
+      "loss": 0.8166,
+      "step": 2390
+    },
+    {
+      "epoch": 4.610202117420597,
+      "grad_norm": 0.2525179386138916,
+      "learning_rate": 0.00013041515945912753,
+      "loss": 0.8142,
+      "step": 2395
+    },
+    {
+      "epoch": 4.619826756496631,
+      "grad_norm": 0.2585461735725403,
+      "learning_rate": 0.00013009463318582447,
+      "loss": 0.8112,
+      "step": 2400
+    },
+    {
+      "epoch": 4.629451395572666,
+      "grad_norm": 0.25699469447135925,
+      "learning_rate": 0.00012977376657591474,
+      "loss": 0.815,
+      "step": 2405
+    },
+    {
+      "epoch": 4.639076034648701,
+      "grad_norm": 0.2651076316833496,
+      "learning_rate": 0.00012945256325804048,
+      "loss": 0.8215,
+      "step": 2410
+    },
+    {
+      "epoch": 4.648700673724735,
+      "grad_norm": 0.2517280876636505,
+      "learning_rate": 0.0001291310268646515,
+      "loss": 0.8126,
+      "step": 2415
+    },
+    {
+      "epoch": 4.65832531280077,
+      "grad_norm": 0.25369200110435486,
+      "learning_rate": 0.00012880916103196448,
+      "loss": 0.811,
+      "step": 2420
+    },
+    {
+      "epoch": 4.667949951876805,
+      "grad_norm": 0.2509647011756897,
+      "learning_rate": 0.0001284869693999216,
+      "loss": 0.8144,
+      "step": 2425
+    },
+    {
+      "epoch": 4.6775745909528395,
+      "grad_norm": 0.25037845969200134,
+      "learning_rate": 0.00012816445561214946,
+      "loss": 0.8145,
+      "step": 2430
+    },
+    {
+      "epoch": 4.687199230028874,
+      "grad_norm": 0.24885617196559906,
+      "learning_rate": 0.000127841623315918,
+      "loss": 0.815,
+      "step": 2435
+    },
+    {
+      "epoch": 4.696823869104908,
+      "grad_norm": 0.26731571555137634,
+      "learning_rate": 0.0001275184761620989,
+      "loss": 0.8151,
+      "step": 2440
+    },
+    {
+      "epoch": 4.7064485081809435,
+      "grad_norm": 0.24980269372463226,
+      "learning_rate": 0.00012719501780512476,
+      "loss": 0.8189,
+      "step": 2445
+    },
+    {
+      "epoch": 4.716073147256978,
+      "grad_norm": 0.26535722613334656,
+      "learning_rate": 0.0001268712519029474,
+      "loss": 0.8205,
+      "step": 2450
+    },
+    {
+      "epoch": 4.725697786333012,
+      "grad_norm": 0.24587014317512512,
+      "learning_rate": 0.00012654718211699674,
+      "loss": 0.8127,
+      "step": 2455
+    },
+    {
+      "epoch": 4.735322425409047,
+      "grad_norm": 0.26100653409957886,
+      "learning_rate": 0.00012622281211213915,
+      "loss": 0.8146,
+      "step": 2460
+    },
+    {
+      "epoch": 4.744947064485082,
+      "grad_norm": 0.24849233031272888,
+      "learning_rate": 0.00012589814555663626,
+      "loss": 0.8107,
+      "step": 2465
+    },
+    {
+      "epoch": 4.754571703561116,
+      "grad_norm": 0.2688848078250885,
+      "learning_rate": 0.0001255731861221033,
+      "loss": 0.8209,
+      "step": 2470
+    },
+    {
+      "epoch": 4.764196342637151,
+      "grad_norm": 0.2500625550746918,
+      "learning_rate": 0.00012524793748346758,
+      "loss": 0.815,
+      "step": 2475
+    },
+    {
+      "epoch": 4.773820981713186,
+      "grad_norm": 0.2789405882358551,
+      "learning_rate": 0.00012492240331892716,
+      "loss": 0.8196,
+      "step": 2480
+    },
+    {
+      "epoch": 4.78344562078922,
+      "grad_norm": 0.25875410437583923,
+      "learning_rate": 0.00012459658730990891,
+      "loss": 0.8196,
+      "step": 2485
+    },
+    {
+      "epoch": 4.793070259865255,
+      "grad_norm": 0.25247231125831604,
+      "learning_rate": 0.00012427049314102707,
+      "loss": 0.8242,
+      "step": 2490
+    },
+    {
+      "epoch": 4.80269489894129,
+      "grad_norm": 0.2572121024131775,
+      "learning_rate": 0.00012394412450004164,
+      "loss": 0.8215,
+      "step": 2495
+    },
+    {
+      "epoch": 4.8123195380173245,
+      "grad_norm": 0.25512033700942993,
+      "learning_rate": 0.0001236174850778165,
+      "loss": 0.8163,
+      "step": 2500
+    },
+    {
+      "epoch": 4.821944177093359,
+      "grad_norm": 0.25790128111839294,
+      "learning_rate": 0.0001232905785682778,
+      "loss": 0.8119,
+      "step": 2505
+    },
+    {
+      "epoch": 4.831568816169393,
+      "grad_norm": 0.26126110553741455,
+      "learning_rate": 0.00012296340866837222,
+      "loss": 0.8133,
+      "step": 2510
+    },
+    {
+      "epoch": 4.8411934552454285,
+      "grad_norm": 0.2542867362499237,
+      "learning_rate": 0.00012263597907802493,
+      "loss": 0.818,
+      "step": 2515
+    },
+    {
+      "epoch": 4.850818094321463,
+      "grad_norm": 0.2690134048461914,
+      "learning_rate": 0.00012230829350009804,
+      "loss": 0.8106,
+      "step": 2520
+    },
+    {
+      "epoch": 4.860442733397497,
+      "grad_norm": 0.25750601291656494,
+      "learning_rate": 0.00012198035564034856,
+      "loss": 0.8125,
+      "step": 2525
+    },
+    {
+      "epoch": 4.870067372473533,
+      "grad_norm": 0.2467714548110962,
+      "learning_rate": 0.00012165216920738651,
+      "loss": 0.8172,
+      "step": 2530
+    },
+    {
+      "epoch": 4.879692011549567,
+      "grad_norm": 0.25768086314201355,
+      "learning_rate": 0.000121323737912633,
+      "loss": 0.8186,
+      "step": 2535
+    },
+    {
+      "epoch": 4.889316650625601,
+      "grad_norm": 0.2579788863658905,
+      "learning_rate": 0.00012099506547027826,
+      "loss": 0.8124,
+      "step": 2540
+    },
+    {
+      "epoch": 4.898941289701636,
+      "grad_norm": 0.250635027885437,
+      "learning_rate": 0.00012066615559723961,
+      "loss": 0.8185,
+      "step": 2545
+    },
+    {
+      "epoch": 4.908565928777671,
+      "grad_norm": 0.24465559422969818,
+      "learning_rate": 0.00012033701201311945,
+      "loss": 0.8246,
+      "step": 2550
+    },
+    {
+      "epoch": 4.9181905678537055,
+      "grad_norm": 0.24917738139629364,
+      "learning_rate": 0.00012000763844016321,
+      "loss": 0.8112,
+      "step": 2555
+    },
+    {
+      "epoch": 4.92781520692974,
+      "grad_norm": 0.24168001115322113,
+      "learning_rate": 0.00011967803860321726,
+      "loss": 0.8169,
+      "step": 2560
+    },
+    {
+      "epoch": 4.937439846005775,
+      "grad_norm": 0.2604310214519501,
+      "learning_rate": 0.0001193482162296867,
+      "loss": 0.8092,
+      "step": 2565
+    },
+    {
+      "epoch": 4.9470644850818095,
+      "grad_norm": 0.2558085024356842,
+      "learning_rate": 0.00011901817504949331,
+      "loss": 0.8226,
+      "step": 2570
+    },
+    {
+      "epoch": 4.956689124157844,
+      "grad_norm": 0.2698078155517578,
+      "learning_rate": 0.00011868791879503324,
+      "loss": 0.8147,
+      "step": 2575
+    },
+    {
+      "epoch": 4.966313763233879,
+      "grad_norm": 0.268557608127594,
+      "learning_rate": 0.00011835745120113508,
+      "loss": 0.8039,
+      "step": 2580
+    },
+    {
+      "epoch": 4.975938402309914,
+      "grad_norm": 0.25237688422203064,
+      "learning_rate": 0.00011802677600501725,
+      "loss": 0.8129,
+      "step": 2585
+    },
+    {
+      "epoch": 4.985563041385948,
+      "grad_norm": 0.24979138374328613,
+      "learning_rate": 0.00011769589694624601,
+      "loss": 0.8222,
+      "step": 2590
+    },
+    {
+      "epoch": 4.995187680461982,
+      "grad_norm": 0.282382071018219,
+      "learning_rate": 0.00011736481776669306,
+      "loss": 0.8109,
+      "step": 2595
+    },
+    {
+      "epoch": 4.999037536092397,
+      "eval_loss": 2.708376407623291,
+      "eval_runtime": 0.7926,
+      "eval_samples_per_second": 13.879,
+      "eval_steps_per_second": 2.523,
+      "step": 2597
+    },
+    {
+      "epoch": 5.004812319538018,
+      "grad_norm": 0.23464234173297882,
+      "learning_rate": 0.00011703354221049318,
+      "loss": 0.78,
+      "step": 2600
+    },
+    {
+      "epoch": 5.014436958614052,
+      "grad_norm": 0.29451891779899597,
+      "learning_rate": 0.0001167020740240021,
+      "loss": 0.7451,
+      "step": 2605
+    },
+    {
+      "epoch": 5.024061597690086,
+      "grad_norm": 0.26757895946502686,
+      "learning_rate": 0.00011637041695575383,
+      "loss": 0.7358,
+      "step": 2610
+    },
+    {
+      "epoch": 5.033686236766122,
+      "grad_norm": 0.27909424901008606,
+      "learning_rate": 0.00011603857475641846,
+      "loss": 0.7299,
+      "step": 2615
+    },
+    {
+      "epoch": 5.043310875842156,
+      "grad_norm": 0.27936622500419617,
+      "learning_rate": 0.0001157065511787598,
+      "loss": 0.7264,
+      "step": 2620
+    },
+    {
+      "epoch": 5.0529355149181905,
+      "grad_norm": 0.2764790952205658,
+      "learning_rate": 0.0001153743499775927,
+      "loss": 0.7414,
+      "step": 2625
+    },
+    {
+      "epoch": 5.062560153994225,
+      "grad_norm": 0.28827911615371704,
+      "learning_rate": 0.00011504197490974085,
+      "loss": 0.7344,
+      "step": 2630
+    },
+    {
+      "epoch": 5.07218479307026,
+      "grad_norm": 0.29319116473197937,
+      "learning_rate": 0.0001147094297339941,
+      "loss": 0.7419,
+      "step": 2635
+    },
+    {
+      "epoch": 5.0818094321462945,
+      "grad_norm": 0.27190330624580383,
+      "learning_rate": 0.0001143767182110661,
+      "loss": 0.7384,
+      "step": 2640
+    },
+    {
+      "epoch": 5.091434071222329,
+      "grad_norm": 0.28567731380462646,
+      "learning_rate": 0.00011404384410355167,
+      "loss": 0.7428,
+      "step": 2645
+    },
+    {
+      "epoch": 5.101058710298364,
+      "grad_norm": 0.27502113580703735,
+      "learning_rate": 0.00011371081117588417,
+      "loss": 0.751,
+      "step": 2650
+    },
+    {
+      "epoch": 5.110683349374399,
+      "grad_norm": 0.2895454168319702,
+      "learning_rate": 0.00011337762319429326,
+      "loss": 0.7389,
+      "step": 2655
+    },
+    {
+      "epoch": 5.120307988450433,
+      "grad_norm": 0.28590232133865356,
+      "learning_rate": 0.00011304428392676194,
+      "loss": 0.7351,
+      "step": 2660
+    },
+    {
+      "epoch": 5.129932627526467,
+      "grad_norm": 0.29666268825531006,
+      "learning_rate": 0.00011271079714298405,
+      "loss": 0.7437,
+      "step": 2665
+    },
+    {
+      "epoch": 5.139557266602503,
+      "grad_norm": 0.2858620584011078,
+      "learning_rate": 0.00011237716661432181,
+      "loss": 0.7393,
+      "step": 2670
+    },
+    {
+      "epoch": 5.149181905678537,
+      "grad_norm": 0.29355934262275696,
+      "learning_rate": 0.00011204339611376291,
+      "loss": 0.7429,
+      "step": 2675
+    },
+    {
+      "epoch": 5.1588065447545715,
+      "grad_norm": 0.31132546067237854,
+      "learning_rate": 0.00011170948941587805,
+      "loss": 0.7477,
+      "step": 2680
+    },
+    {
+      "epoch": 5.168431183830607,
+      "grad_norm": 0.2698726952075958,
+      "learning_rate": 0.00011137545029677809,
+      "loss": 0.7453,
+      "step": 2685
+    },
+    {
+      "epoch": 5.178055822906641,
+      "grad_norm": 0.2867010533809662,
+      "learning_rate": 0.0001110412825340715,
+      "loss": 0.7375,
+      "step": 2690
+    },
+    {
+      "epoch": 5.1876804619826755,
+      "grad_norm": 0.2847628593444824,
+      "learning_rate": 0.00011070698990682156,
+      "loss": 0.7492,
+      "step": 2695
+    },
+    {
+      "epoch": 5.19730510105871,
+      "grad_norm": 0.29182639718055725,
+      "learning_rate": 0.00011037257619550352,
+      "loss": 0.7399,
+      "step": 2700
+    },
+    {
+      "epoch": 5.206929740134745,
+      "grad_norm": 0.30024299025535583,
+      "learning_rate": 0.0001100380451819621,
+      "loss": 0.7509,
+      "step": 2705
+    },
+    {
+      "epoch": 5.21655437921078,
+      "grad_norm": 0.2791791260242462,
+      "learning_rate": 0.00010970340064936853,
+      "loss": 0.7515,
+      "step": 2710
+    },
+    {
+      "epoch": 5.226179018286814,
+      "grad_norm": 0.3051394522190094,
+      "learning_rate": 0.00010936864638217776,
+      "loss": 0.7458,
+      "step": 2715
+    },
+    {
+      "epoch": 5.235803657362849,
+      "grad_norm": 0.2900809049606323,
+      "learning_rate": 0.00010903378616608573,
+      "loss": 0.7433,
+      "step": 2720
+    },
+    {
+      "epoch": 5.245428296438884,
+      "grad_norm": 0.2843543291091919,
+      "learning_rate": 0.00010869882378798663,
+      "loss": 0.7454,
+      "step": 2725
+    },
+    {
+      "epoch": 5.255052935514918,
+      "grad_norm": 0.30490627884864807,
+      "learning_rate": 0.0001083637630359299,
+      "loss": 0.7461,
+      "step": 2730
+    },
+    {
+      "epoch": 5.264677574590952,
+      "grad_norm": 0.28951317071914673,
+      "learning_rate": 0.00010802860769907748,
+      "loss": 0.7496,
+      "step": 2735
+    },
+    {
+      "epoch": 5.274302213666988,
+      "grad_norm": 0.2910211980342865,
+      "learning_rate": 0.00010769336156766101,
+      "loss": 0.7465,
+      "step": 2740
+    },
+    {
+      "epoch": 5.283926852743022,
+      "grad_norm": 0.2923705279827118,
+      "learning_rate": 0.00010735802843293888,
+      "loss": 0.7409,
+      "step": 2745
+    },
+    {
+      "epoch": 5.2935514918190565,
+      "grad_norm": 0.2950255274772644,
+      "learning_rate": 0.0001070226120871534,
+      "loss": 0.7471,
+      "step": 2750
+    },
+    {
+      "epoch": 5.303176130895092,
+      "grad_norm": 0.29950594902038574,
+      "learning_rate": 0.00010668711632348787,
+      "loss": 0.7417,
+      "step": 2755
+    },
+    {
+      "epoch": 5.312800769971126,
+      "grad_norm": 0.28509971499443054,
+      "learning_rate": 0.0001063515449360238,
+      "loss": 0.7515,
+      "step": 2760
+    },
+    {
+      "epoch": 5.3224254090471605,
+      "grad_norm": 0.3036702871322632,
+      "learning_rate": 0.00010601590171969782,
+      "loss": 0.7395,
+      "step": 2765
+    },
+    {
+      "epoch": 5.332050048123195,
+      "grad_norm": 0.2864932119846344,
+      "learning_rate": 0.00010568019047025893,
+      "loss": 0.7473,
+      "step": 2770
+    },
+    {
+      "epoch": 5.34167468719923,
+      "grad_norm": 0.29944750666618347,
+      "learning_rate": 0.00010534441498422552,
+      "loss": 0.7454,
+      "step": 2775
+    },
+    {
+      "epoch": 5.351299326275265,
+      "grad_norm": 0.2880357503890991,
+      "learning_rate": 0.00010500857905884233,
+      "loss": 0.7455,
+      "step": 2780
+    },
+    {
+      "epoch": 5.360923965351299,
+      "grad_norm": 0.2973332107067108,
+      "learning_rate": 0.00010467268649203774,
+      "loss": 0.7607,
+      "step": 2785
+    },
+    {
+      "epoch": 5.370548604427334,
+      "grad_norm": 0.28307193517684937,
+      "learning_rate": 0.00010433674108238059,
+      "loss": 0.7522,
+      "step": 2790
+    },
+    {
+      "epoch": 5.380173243503369,
+      "grad_norm": 0.29455214738845825,
+      "learning_rate": 0.00010400074662903729,
+      "loss": 0.748,
+      "step": 2795
+    },
+    {
+      "epoch": 5.389797882579403,
+      "grad_norm": 0.2844898998737335,
+      "learning_rate": 0.00010366470693172896,
+      "loss": 0.7523,
+      "step": 2800
+    },
+    {
+      "epoch": 5.399422521655438,
+      "grad_norm": 0.29002171754837036,
+      "learning_rate": 0.0001033286257906883,
+      "loss": 0.7493,
+      "step": 2805
+    },
+    {
+      "epoch": 5.409047160731473,
+      "grad_norm": 0.2987057566642761,
+      "learning_rate": 0.00010299250700661678,
+      "loss": 0.7436,
+      "step": 2810
+    },
+    {
+      "epoch": 5.418671799807507,
+      "grad_norm": 0.2843535244464874,
+      "learning_rate": 0.00010265635438064145,
+      "loss": 0.7418,
+      "step": 2815
+    },
+    {
+      "epoch": 5.4282964388835415,
+      "grad_norm": 0.2849405109882355,
+      "learning_rate": 0.00010232017171427223,
+      "loss": 0.7489,
+      "step": 2820
+    },
+    {
+      "epoch": 5.437921077959577,
+      "grad_norm": 0.2880149781703949,
+      "learning_rate": 0.00010198396280935866,
+      "loss": 0.7563,
+      "step": 2825
+    },
+    {
+      "epoch": 5.447545717035611,
+      "grad_norm": 0.31955039501190186,
+      "learning_rate": 0.00010164773146804696,
+      "loss": 0.7544,
+      "step": 2830
+    },
+    {
+      "epoch": 5.457170356111646,
+      "grad_norm": 0.32376500964164734,
+      "learning_rate": 0.00010131148149273723,
+      "loss": 0.7469,
+      "step": 2835
+    },
+    {
+      "epoch": 5.466794995187681,
+      "grad_norm": 0.2932649254798889,
+      "learning_rate": 0.00010097521668604015,
+      "loss": 0.7548,
+      "step": 2840
+    },
+    {
+      "epoch": 5.476419634263715,
+      "grad_norm": 0.29111912846565247,
+      "learning_rate": 0.00010063894085073424,
+      "loss": 0.7517,
+      "step": 2845
+    },
+    {
+      "epoch": 5.48604427333975,
+      "grad_norm": 0.30052244663238525,
+      "learning_rate": 0.0001003026577897227,
+      "loss": 0.7468,
+      "step": 2850
+    },
+    {
+      "epoch": 5.495668912415784,
+      "grad_norm": 0.27394363284111023,
+      "learning_rate": 9.99663713059904e-05,
+      "loss": 0.7545,
+      "step": 2855
+    },
+    {
+      "epoch": 5.505293551491819,
+      "grad_norm": 0.29920995235443115,
+      "learning_rate": 9.9630085202561e-05,
+      "loss": 0.7578,
+      "step": 2860
+    },
+    {
+      "epoch": 5.514918190567854,
+      "grad_norm": 0.301736056804657,
+      "learning_rate": 9.929380328245378e-05,
+      "loss": 0.7474,
+      "step": 2865
+    },
+    {
+      "epoch": 5.524542829643888,
+      "grad_norm": 0.29692158102989197,
+      "learning_rate": 9.895752934864078e-05,
+      "loss": 0.7529,
+      "step": 2870
+    },
+    {
+      "epoch": 5.534167468719923,
+      "grad_norm": 0.28971490263938904,
+      "learning_rate": 9.862126720400364e-05,
+      "loss": 0.7533,
+      "step": 2875
+    },
+    {
+      "epoch": 5.543792107795958,
+      "grad_norm": 0.2907530665397644,
+      "learning_rate": 9.828502065129076e-05,
+      "loss": 0.7488,
+      "step": 2880
+    },
+    {
+      "epoch": 5.553416746871992,
+      "grad_norm": 0.28844624757766724,
+      "learning_rate": 9.794879349307419e-05,
+      "loss": 0.7534,
+      "step": 2885
+    },
+    {
+      "epoch": 5.563041385948027,
+      "grad_norm": 0.29487764835357666,
+      "learning_rate": 9.761258953170667e-05,
+      "loss": 0.7503,
+      "step": 2890
+    },
+    {
+      "epoch": 5.572666025024062,
+      "grad_norm": 0.29256966710090637,
+      "learning_rate": 9.72764125692785e-05,
+      "loss": 0.7516,
+      "step": 2895
+    },
+    {
+      "epoch": 5.582290664100096,
+      "grad_norm": 0.2992061376571655,
+      "learning_rate": 9.694026640757481e-05,
+      "loss": 0.7544,
+      "step": 2900
+    },
+    {
+      "epoch": 5.591915303176131,
+      "grad_norm": 0.28604987263679504,
+      "learning_rate": 9.660415484803226e-05,
+      "loss": 0.7484,
+      "step": 2905
+    },
+    {
+      "epoch": 5.601539942252166,
+      "grad_norm": 0.28531113266944885,
+      "learning_rate": 9.626808169169634e-05,
+      "loss": 0.7437,
+      "step": 2910
+    },
+    {
+      "epoch": 5.6111645813282,
+      "grad_norm": 0.2726121246814728,
+      "learning_rate": 9.593205073917817e-05,
+      "loss": 0.7589,
+      "step": 2915
+    },
+    {
+      "epoch": 5.620789220404235,
+      "grad_norm": 0.29796725511550903,
+      "learning_rate": 9.559606579061154e-05,
+      "loss": 0.7524,
+      "step": 2920
+    },
+    {
+      "epoch": 5.630413859480269,
+      "grad_norm": 0.3006713390350342,
+      "learning_rate": 9.526013064561006e-05,
+      "loss": 0.765,
+      "step": 2925
+    },
+    {
+      "epoch": 5.640038498556304,
+      "grad_norm": 0.30542224645614624,
+      "learning_rate": 9.492424910322413e-05,
+      "loss": 0.7545,
+      "step": 2930
+    },
+    {
+      "epoch": 5.649663137632339,
+      "grad_norm": 0.2783224284648895,
+      "learning_rate": 9.458842496189789e-05,
+      "loss": 0.7493,
+      "step": 2935
+    },
+    {
+      "epoch": 5.659287776708373,
+      "grad_norm": 0.3057067394256592,
+      "learning_rate": 9.425266201942645e-05,
+      "loss": 0.7668,
+      "step": 2940
+    },
+    {
+      "epoch": 5.668912415784408,
+      "grad_norm": 0.29461607336997986,
+      "learning_rate": 9.391696407291269e-05,
+      "loss": 0.7544,
+      "step": 2945
+    },
+    {
+      "epoch": 5.678537054860443,
+      "grad_norm": 0.2968499958515167,
+      "learning_rate": 9.358133491872453e-05,
+      "loss": 0.7508,
+      "step": 2950
+    },
+    {
+      "epoch": 5.688161693936477,
+      "grad_norm": 0.3040287494659424,
+      "learning_rate": 9.324577835245197e-05,
+      "loss": 0.7618,
+      "step": 2955
+    },
+    {
+      "epoch": 5.6977863330125125,
+      "grad_norm": 0.29871127009391785,
+      "learning_rate": 9.291029816886405e-05,
+      "loss": 0.7537,
+      "step": 2960
+    },
+    {
+      "epoch": 5.707410972088547,
+      "grad_norm": 0.2989570200443268,
+      "learning_rate": 9.257489816186606e-05,
+      "loss": 0.7472,
+      "step": 2965
+    },
+    {
+      "epoch": 5.717035611164581,
+      "grad_norm": 0.2932529151439667,
+      "learning_rate": 9.223958212445656e-05,
+      "loss": 0.7488,
+      "step": 2970
+    },
+    {
+      "epoch": 5.726660250240616,
+      "grad_norm": 0.29275083541870117,
+      "learning_rate": 9.190435384868448e-05,
+      "loss": 0.7532,
+      "step": 2975
+    },
+    {
+      "epoch": 5.736284889316651,
+      "grad_norm": 0.2811647057533264,
+      "learning_rate": 9.156921712560626e-05,
+      "loss": 0.7597,
+      "step": 2980
+    },
+    {
+      "epoch": 5.745909528392685,
+      "grad_norm": 0.2994243800640106,
+      "learning_rate": 9.123417574524307e-05,
+      "loss": 0.7494,
+      "step": 2985
+    },
+    {
+      "epoch": 5.75553416746872,
+      "grad_norm": 0.29861563444137573,
+      "learning_rate": 9.089923349653776e-05,
+      "loss": 0.7513,
+      "step": 2990
+    },
+    {
+      "epoch": 5.765158806544754,
+      "grad_norm": 0.27614521980285645,
+      "learning_rate": 9.056439416731223e-05,
+      "loss": 0.7498,
+      "step": 2995
+    },
+    {
+      "epoch": 5.774783445620789,
+      "grad_norm": 0.29117491841316223,
+      "learning_rate": 9.02296615442243e-05,
+      "loss": 0.7571,
+      "step": 3000
+    },
+    {
+      "epoch": 5.784408084696824,
+      "grad_norm": 0.32449835538864136,
+      "learning_rate": 8.989503941272522e-05,
+      "loss": 0.7575,
+      "step": 3005
+    },
+    {
+      "epoch": 5.794032723772858,
+      "grad_norm": 0.2920955419540405,
+      "learning_rate": 8.956053155701661e-05,
+      "loss": 0.7445,
+      "step": 3010
+    },
+    {
+      "epoch": 5.803657362848893,
+      "grad_norm": 0.3088265061378479,
+      "learning_rate": 8.922614176000783e-05,
+      "loss": 0.7534,
+      "step": 3015
+    },
+    {
+      "epoch": 5.813282001924928,
+      "grad_norm": 0.3056049942970276,
+      "learning_rate": 8.889187380327312e-05,
+      "loss": 0.7548,
+      "step": 3020
+    },
+    {
+      "epoch": 5.822906641000962,
+      "grad_norm": 0.28941500186920166,
+      "learning_rate": 8.855773146700872e-05,
+      "loss": 0.7485,
+      "step": 3025
+    },
+    {
+      "epoch": 5.8325312800769975,
+      "grad_norm": 0.2886408269405365,
+      "learning_rate": 8.82237185299904e-05,
+      "loss": 0.7422,
+      "step": 3030
+    },
+    {
+      "epoch": 5.842155919153032,
+      "grad_norm": 0.2928673028945923,
+      "learning_rate": 8.788983876953051e-05,
+      "loss": 0.7551,
+      "step": 3035
+    },
+    {
+      "epoch": 5.851780558229066,
+      "grad_norm": 0.3021661043167114,
+      "learning_rate": 8.755609596143534e-05,
+      "loss": 0.7445,
+      "step": 3040
+    },
+    {
+      "epoch": 5.861405197305101,
+      "grad_norm": 0.2965797781944275,
+      "learning_rate": 8.722249387996237e-05,
+      "loss": 0.7502,
+      "step": 3045
+    },
+    {
+      "epoch": 5.871029836381136,
+      "grad_norm": 0.3059804141521454,
+      "learning_rate": 8.688903629777762e-05,
+      "loss": 0.7544,
+      "step": 3050
+    },
+    {
+      "epoch": 5.88065447545717,
+      "grad_norm": 0.2819983661174774,
+      "learning_rate": 8.655572698591297e-05,
+      "loss": 0.7611,
+      "step": 3055
+    },
+    {
+      "epoch": 5.890279114533205,
+      "grad_norm": 0.297858327627182,
+      "learning_rate": 8.62225697137236e-05,
+      "loss": 0.7526,
+      "step": 3060
+    },
+    {
+      "epoch": 5.89990375360924,
+      "grad_norm": 0.2882884740829468,
+      "learning_rate": 8.588956824884523e-05,
+      "loss": 0.762,
+      "step": 3065
+    },
+    {
+      "epoch": 5.909528392685274,
+      "grad_norm": 0.31062471866607666,
+      "learning_rate": 8.555672635715162e-05,
+      "loss": 0.7537,
+      "step": 3070
+    },
+    {
+      "epoch": 5.919153031761309,
+      "grad_norm": 0.30393049120903015,
+      "learning_rate": 8.522404780271186e-05,
+      "loss": 0.75,
+      "step": 3075
+    },
+    {
+      "epoch": 5.928777670837343,
+      "grad_norm": 0.2902856469154358,
+      "learning_rate": 8.489153634774796e-05,
+      "loss": 0.7459,
+      "step": 3080
+    },
+    {
+      "epoch": 5.9384023099133785,
+      "grad_norm": 0.2876073718070984,
+      "learning_rate": 8.455919575259217e-05,
+      "loss": 0.7541,
+      "step": 3085
+    },
+    {
+      "epoch": 5.948026948989413,
+      "grad_norm": 0.3035559058189392,
+      "learning_rate": 8.422702977564453e-05,
+      "loss": 0.7564,
+      "step": 3090
+    },
+    {
+      "epoch": 5.957651588065447,
+      "grad_norm": 0.2893913984298706,
+      "learning_rate": 8.389504217333039e-05,
+      "loss": 0.749,
+      "step": 3095
+    },
+    {
+      "epoch": 5.9672762271414825,
+      "grad_norm": 0.2977910041809082,
+      "learning_rate": 8.356323670005772e-05,
+      "loss": 0.7509,
+      "step": 3100
+    },
+    {
+      "epoch": 5.976900866217517,
+      "grad_norm": 0.27759596705436707,
+      "learning_rate": 8.3231617108175e-05,
+      "loss": 0.7623,
+      "step": 3105
+    },
+    {
+      "epoch": 5.986525505293551,
+      "grad_norm": 0.30392059683799744,
+      "learning_rate": 8.290018714792852e-05,
+      "loss": 0.7565,
+      "step": 3110
+    },
+    {
+      "epoch": 5.996150144369587,
+      "grad_norm": 0.2790631949901581,
+      "learning_rate": 8.256895056742006e-05,
+      "loss": 0.7513,
+      "step": 3115
+    },
+    {
+      "epoch": 6.0,
+      "eval_loss": 2.9357750415802,
+      "eval_runtime": 0.7796,
+      "eval_samples_per_second": 14.111,
+      "eval_steps_per_second": 2.566,
+      "step": 3117
+    },
+    {
+      "epoch": 6.005774783445621,
+      "grad_norm": 0.27328184247016907,
+      "learning_rate": 8.223791111256447e-05,
+      "loss": 0.7169,
+      "step": 3120
+    },
+    {
+      "epoch": 6.015399422521655,
+      "grad_norm": 0.3284066319465637,
+      "learning_rate": 8.190707252704736e-05,
+      "loss": 0.6924,
+      "step": 3125
+    },
+    {
+      "epoch": 6.02502406159769,
+      "grad_norm": 0.30812135338783264,
+      "learning_rate": 8.157643855228267e-05,
+      "loss": 0.6785,
+      "step": 3130
+    },
+    {
+      "epoch": 6.034648700673725,
+      "grad_norm": 0.3338078558444977,
+      "learning_rate": 8.12460129273705e-05,
+      "loss": 0.6847,
+      "step": 3135
+    },
+    {
+      "epoch": 6.044273339749759,
+      "grad_norm": 0.3224867284297943,
+      "learning_rate": 8.091579938905474e-05,
+      "loss": 0.6756,
+      "step": 3140
+    },
+    {
+      "epoch": 6.053897978825794,
+      "grad_norm": 0.317451149225235,
+      "learning_rate": 8.05858016716808e-05,
+      "loss": 0.6758,
+      "step": 3145
+    },
+    {
+      "epoch": 6.063522617901829,
+      "grad_norm": 0.29282692074775696,
+      "learning_rate": 8.025602350715332e-05,
+      "loss": 0.687,
+      "step": 3150
+    },
+    {
+      "epoch": 6.0731472569778635,
+      "grad_norm": 0.3204721510410309,
+      "learning_rate": 7.992646862489417e-05,
+      "loss": 0.6808,
+      "step": 3155
+    },
+    {
+      "epoch": 6.082771896053898,
+      "grad_norm": 0.3063673675060272,
+      "learning_rate": 7.959714075180008e-05,
+      "loss": 0.6764,
+      "step": 3160
+    },
+    {
+      "epoch": 6.092396535129932,
+      "grad_norm": 0.3125745356082916,
+      "learning_rate": 7.926804361220055e-05,
+      "loss": 0.6852,
+      "step": 3165
+    },
+    {
+      "epoch": 6.102021174205968,
+      "grad_norm": 0.31588083505630493,
+      "learning_rate": 7.893918092781583e-05,
+      "loss": 0.6805,
+      "step": 3170
+    },
+    {
+      "epoch": 6.111645813282002,
+      "grad_norm": 0.3146851062774658,
+      "learning_rate": 7.861055641771459e-05,
+      "loss": 0.6862,
+      "step": 3175
+    },
+    {
+      "epoch": 6.121270452358036,
+      "grad_norm": 0.33888891339302063,
+      "learning_rate": 7.828217379827215e-05,
+      "loss": 0.6943,
+      "step": 3180
+    },
+    {
+      "epoch": 6.130895091434072,
+      "grad_norm": 0.33557072281837463,
+      "learning_rate": 7.79540367831283e-05,
+      "loss": 0.6936,
+      "step": 3185
+    },
+    {
+      "epoch": 6.140519730510106,
+      "grad_norm": 0.33382484316825867,
+      "learning_rate": 7.762614908314521e-05,
+      "loss": 0.6935,
+      "step": 3190
+    },
+    {
+      "epoch": 6.15014436958614,
+      "grad_norm": 0.31766244769096375,
+      "learning_rate": 7.729851440636575e-05,
+      "loss": 0.6927,
+      "step": 3195
+    },
+    {
+      "epoch": 6.159769008662175,
+      "grad_norm": 0.3161802291870117,
+      "learning_rate": 7.69711364579712e-05,
+      "loss": 0.6902,
+      "step": 3200
+    },
+    {
+      "epoch": 6.16939364773821,
+      "grad_norm": 0.31405240297317505,
+      "learning_rate": 7.664401894023967e-05,
+      "loss": 0.6824,
+      "step": 3205
+    },
+    {
+      "epoch": 6.1790182868142445,
+      "grad_norm": 0.31615492701530457,
+      "learning_rate": 7.6317165552504e-05,
+      "loss": 0.6893,
+      "step": 3210
+    },
+    {
+      "epoch": 6.188642925890279,
+      "grad_norm": 0.3123544454574585,
+      "learning_rate": 7.59905799911101e-05,
+      "loss": 0.6788,
+      "step": 3215
+    },
+    {
+      "epoch": 6.198267564966314,
+      "grad_norm": 0.3448927402496338,
+      "learning_rate": 7.566426594937503e-05,
+      "loss": 0.6829,
+      "step": 3220
+    },
+    {
+      "epoch": 6.2078922040423485,
+      "grad_norm": 0.2904527187347412,
+      "learning_rate": 7.533822711754515e-05,
+      "loss": 0.6953,
+      "step": 3225
+    },
+    {
+      "epoch": 6.217516843118383,
+      "grad_norm": 0.31403473019599915,
+      "learning_rate": 7.501246718275471e-05,
+      "loss": 0.6819,
+      "step": 3230
+    },
+    {
+      "epoch": 6.227141482194417,
+      "grad_norm": 0.31581783294677734,
+      "learning_rate": 7.468698982898382e-05,
+      "loss": 0.6838,
+      "step": 3235
+    },
+    {
+      "epoch": 6.236766121270453,
+      "grad_norm": 0.3196973204612732,
+      "learning_rate": 7.436179873701688e-05,
+      "loss": 0.687,
+      "step": 3240
+    },
+    {
+      "epoch": 6.246390760346487,
+      "grad_norm": 0.3196184039115906,
+      "learning_rate": 7.403689758440115e-05,
+      "loss": 0.6897,
+      "step": 3245
+    },
+    {
+      "epoch": 6.256015399422521,
+      "grad_norm": 0.32126832008361816,
+      "learning_rate": 7.371229004540481e-05,
+      "loss": 0.6954,
+      "step": 3250
+    },
+    {
+      "epoch": 6.265640038498557,
+      "grad_norm": 0.3566059470176697,
+      "learning_rate": 7.338797979097571e-05,
+      "loss": 0.698,
+      "step": 3255
+    },
+    {
+      "epoch": 6.275264677574591,
+      "grad_norm": 0.3231862783432007,
+      "learning_rate": 7.306397048869977e-05,
+      "loss": 0.6864,
+      "step": 3260
+    },
+    {
+      "epoch": 6.2848893166506254,
+      "grad_norm": 0.3360905945301056,
+      "learning_rate": 7.274026580275937e-05,
+      "loss": 0.6981,
+      "step": 3265
+    },
+    {
+      "epoch": 6.29451395572666,
+      "grad_norm": 0.30905240774154663,
+      "learning_rate": 7.241686939389214e-05,
+      "loss": 0.6839,
+      "step": 3270
+    },
+    {
+      "epoch": 6.304138594802695,
+      "grad_norm": 0.29758358001708984,
+      "learning_rate": 7.20937849193493e-05,
+      "loss": 0.6899,
+      "step": 3275
+    },
+    {
+      "epoch": 6.3137632338787295,
+      "grad_norm": 0.32738837599754333,
+      "learning_rate": 7.177101603285458e-05,
+      "loss": 0.6907,
+      "step": 3280
+    },
+    {
+      "epoch": 6.323387872954764,
+      "grad_norm": 0.30813169479370117,
+      "learning_rate": 7.144856638456272e-05,
+      "loss": 0.6919,
+      "step": 3285
+    },
+    {
+      "epoch": 6.333012512030799,
+      "grad_norm": 0.340621680021286,
+      "learning_rate": 7.112643962101817e-05,
+      "loss": 0.6884,
+      "step": 3290
+    },
+    {
+      "epoch": 6.342637151106834,
+      "grad_norm": 0.3451749384403229,
+      "learning_rate": 7.080463938511405e-05,
+      "loss": 0.6937,
+      "step": 3295
+    },
+    {
+      "epoch": 6.352261790182868,
+      "grad_norm": 0.32087814807891846,
+      "learning_rate": 7.048316931605062e-05,
+      "loss": 0.6929,
+      "step": 3300
+    },
+    {
+      "epoch": 6.361886429258902,
+      "grad_norm": 0.30795004963874817,
+      "learning_rate": 7.016203304929451e-05,
+      "loss": 0.6983,
+      "step": 3305
+    },
+    {
+      "epoch": 6.371511068334938,
+      "grad_norm": 0.3312138617038727,
+      "learning_rate": 6.984123421653733e-05,
+      "loss": 0.6845,
+      "step": 3310
+    },
+    {
+      "epoch": 6.381135707410972,
+      "grad_norm": 0.3371661901473999,
+      "learning_rate": 6.952077644565469e-05,
+      "loss": 0.6898,
+      "step": 3315
+    },
+    {
+      "epoch": 6.390760346487006,
+      "grad_norm": 0.3481803834438324,
+      "learning_rate": 6.920066336066524e-05,
+      "loss": 0.6912,
+      "step": 3320
+    },
+    {
+      "epoch": 6.400384985563042,
+      "grad_norm": 0.32163578271865845,
+      "learning_rate": 6.888089858168949e-05,
+      "loss": 0.6901,
+      "step": 3325
+    },
+    {
+      "epoch": 6.410009624639076,
+      "grad_norm": 0.3223172128200531,
+      "learning_rate": 6.85614857249091e-05,
+      "loss": 0.6944,
+      "step": 3330
+    },
+    {
+      "epoch": 6.4196342637151105,
+      "grad_norm": 0.30212926864624023,
+      "learning_rate": 6.824242840252588e-05,
+      "loss": 0.7016,
+      "step": 3335
+    },
+    {
+      "epoch": 6.429258902791146,
+      "grad_norm": 0.32831230759620667,
+      "learning_rate": 6.79237302227209e-05,
+      "loss": 0.6869,
+      "step": 3340
+    },
+    {
+      "epoch": 6.43888354186718,
+      "grad_norm": 0.3248232305049896,
+      "learning_rate": 6.76053947896138e-05,
+      "loss": 0.6945,
+      "step": 3345
+    },
+    {
+      "epoch": 6.4485081809432145,
+      "grad_norm": 0.3347261846065521,
+      "learning_rate": 6.728742570322181e-05,
+      "loss": 0.6911,
+      "step": 3350
+    },
+    {
+      "epoch": 6.458132820019249,
+      "grad_norm": 0.3434222936630249,
+      "learning_rate": 6.69698265594194e-05,
+      "loss": 0.7001,
+      "step": 3355
+    },
+    {
+      "epoch": 6.467757459095284,
+      "grad_norm": 0.31891781091690063,
+      "learning_rate": 6.66526009498972e-05,
+      "loss": 0.6961,
+      "step": 3360
+    },
+    {
+      "epoch": 6.477382098171319,
+      "grad_norm": 0.32785654067993164,
+      "learning_rate": 6.633575246212175e-05,
+      "loss": 0.6986,
+      "step": 3365
+    },
+    {
+      "epoch": 6.487006737247353,
+      "grad_norm": 0.3148154020309448,
+      "learning_rate": 6.601928467929472e-05,
+      "loss": 0.6857,
+      "step": 3370
+    },
+    {
+      "epoch": 6.496631376323388,
+      "grad_norm": 0.3220577836036682,
+      "learning_rate": 6.570320118031232e-05,
+      "loss": 0.6933,
+      "step": 3375
+    },
+    {
+      "epoch": 6.506256015399423,
+      "grad_norm": 0.3030003309249878,
+      "learning_rate": 6.538750553972509e-05,
+      "loss": 0.6963,
+      "step": 3380
+    },
+    {
+      "epoch": 6.515880654475457,
+      "grad_norm": 0.32863059639930725,
+      "learning_rate": 6.507220132769723e-05,
+      "loss": 0.6929,
+      "step": 3385
+    },
+    {
+      "epoch": 6.5255052935514914,
+      "grad_norm": 0.35064488649368286,
+      "learning_rate": 6.475729210996637e-05,
+      "loss": 0.6864,
+      "step": 3390
+    },
+    {
+      "epoch": 6.535129932627527,
+      "grad_norm": 0.32089149951934814,
+      "learning_rate": 6.444278144780325e-05,
+      "loss": 0.6858,
+      "step": 3395
+    },
+    {
+      "epoch": 6.544754571703561,
+      "grad_norm": 0.4273422658443451,
+      "learning_rate": 6.41286728979712e-05,
+      "loss": 0.6968,
+      "step": 3400
+    },
+    {
+      "epoch": 6.5543792107795955,
+      "grad_norm": 0.33466604351997375,
+      "learning_rate": 6.38149700126863e-05,
+      "loss": 0.6966,
+      "step": 3405
+    },
+    {
+      "epoch": 6.564003849855631,
+      "grad_norm": 0.3052511513233185,
+      "learning_rate": 6.350167633957698e-05,
+      "loss": 0.6983,
+      "step": 3410
+    },
+    {
+      "epoch": 6.573628488931665,
+      "grad_norm": 0.3621208071708679,
+      "learning_rate": 6.318879542164385e-05,
+      "loss": 0.6986,
+      "step": 3415
+    },
+    {
+      "epoch": 6.5832531280077,
+      "grad_norm": 0.32712018489837646,
+      "learning_rate": 6.287633079721986e-05,
+      "loss": 0.6927,
+      "step": 3420
+    },
+    {
+      "epoch": 6.592877767083735,
+      "grad_norm": 0.3064589202404022,
+      "learning_rate": 6.256428599993e-05,
+      "loss": 0.6995,
+      "step": 3425
+    },
+    {
+      "epoch": 6.602502406159769,
+      "grad_norm": 0.3126335144042969,
+      "learning_rate": 6.225266455865157e-05,
+      "loss": 0.6985,
+      "step": 3430
+    },
+    {
+      "epoch": 6.612127045235804,
+      "grad_norm": 0.35115116834640503,
+      "learning_rate": 6.194146999747419e-05,
+      "loss": 0.6918,
+      "step": 3435
+    },
+    {
+      "epoch": 6.621751684311838,
+      "grad_norm": 0.32435253262519836,
+      "learning_rate": 6.163070583565993e-05,
+      "loss": 0.6988,
+      "step": 3440
+    },
+    {
+      "epoch": 6.631376323387873,
+      "grad_norm": 0.3202888071537018,
+      "learning_rate": 6.13203755876035e-05,
+      "loss": 0.6895,
+      "step": 3445
+    },
+    {
+      "epoch": 6.641000962463908,
+      "grad_norm": 0.3102019131183624,
+      "learning_rate": 6.1010482762792585e-05,
+      "loss": 0.6923,
+      "step": 3450
+    },
+    {
+      "epoch": 6.650625601539942,
+      "grad_norm": 0.3367016911506653,
+      "learning_rate": 6.070103086576802e-05,
+      "loss": 0.6915,
+      "step": 3455
+    },
+    {
+      "epoch": 6.6602502406159765,
+      "grad_norm": 0.3353261351585388,
+      "learning_rate": 6.039202339608432e-05,
+      "loss": 0.687,
+      "step": 3460
+    },
+    {
+      "epoch": 6.669874879692012,
+      "grad_norm": 0.30828601121902466,
+      "learning_rate": 6.0083463848269995e-05,
+      "loss": 0.6934,
+      "step": 3465
+    },
+    {
+      "epoch": 6.679499518768046,
+      "grad_norm": 0.3269566595554352,
+      "learning_rate": 5.977535571178809e-05,
+      "loss": 0.6967,
+      "step": 3470
+    },
+    {
+      "epoch": 6.6891241578440805,
+      "grad_norm": 0.339278906583786,
+      "learning_rate": 5.946770247099661e-05,
+      "loss": 0.691,
+      "step": 3475
+    },
+    {
+      "epoch": 6.698748796920116,
+      "grad_norm": 0.33345827460289,
+      "learning_rate": 5.9160507605109275e-05,
+      "loss": 0.7039,
+      "step": 3480
+    },
+    {
+      "epoch": 6.70837343599615,
+      "grad_norm": 0.318852037191391,
+      "learning_rate": 5.885377458815609e-05,
+      "loss": 0.7019,
+      "step": 3485
+    },
+    {
+      "epoch": 6.717998075072185,
+      "grad_norm": 0.3394601047039032,
+      "learning_rate": 5.8547506888944007e-05,
+      "loss": 0.6881,
+      "step": 3490
+    },
+    {
+      "epoch": 6.72762271414822,
+      "grad_norm": 0.32474079728126526,
+      "learning_rate": 5.824170797101787e-05,
+      "loss": 0.6879,
+      "step": 3495
+    },
+    {
+      "epoch": 6.737247353224254,
+      "grad_norm": 0.325595885515213,
+      "learning_rate": 5.7936381292621e-05,
+      "loss": 0.6951,
+      "step": 3500
+    },
+    {
+      "epoch": 6.746871992300289,
+      "grad_norm": 0.3558216989040375,
+      "learning_rate": 5.763153030665629e-05,
+      "loss": 0.6947,
+      "step": 3505
+    },
+    {
+      "epoch": 6.756496631376323,
+      "grad_norm": 0.3530566692352295,
+      "learning_rate": 5.7327158460647065e-05,
+      "loss": 0.6986,
+      "step": 3510
+    },
+    {
+      "epoch": 6.766121270452358,
+      "grad_norm": 0.33962172269821167,
+      "learning_rate": 5.702326919669817e-05,
+      "loss": 0.6964,
+      "step": 3515
+    },
+    {
+      "epoch": 6.775745909528393,
+      "grad_norm": 0.3108658790588379,
+      "learning_rate": 5.671986595145693e-05,
+      "loss": 0.6923,
+      "step": 3520
+    },
+    {
+      "epoch": 6.785370548604427,
+      "grad_norm": 0.32073214650154114,
+      "learning_rate": 5.64169521560743e-05,
+      "loss": 0.6792,
+      "step": 3525
+    },
+    {
+      "epoch": 6.7949951876804615,
+      "grad_norm": 0.3249306380748749,
+      "learning_rate": 5.611453123616618e-05,
+      "loss": 0.7013,
+      "step": 3530
+    },
+    {
+      "epoch": 6.804619826756497,
+      "grad_norm": 0.333997905254364,
+      "learning_rate": 5.581260661177463e-05,
+      "loss": 0.6923,
+      "step": 3535
+    },
+    {
+      "epoch": 6.814244465832531,
+      "grad_norm": 0.3433645963668823,
+      "learning_rate": 5.551118169732901e-05,
+      "loss": 0.7014,
+      "step": 3540
+    },
+    {
+      "epoch": 6.823869104908566,
+      "grad_norm": 0.3301408886909485,
+      "learning_rate": 5.521025990160772e-05,
+      "loss": 0.6966,
+      "step": 3545
+    },
+    {
+      "epoch": 6.833493743984601,
+      "grad_norm": 0.341169148683548,
+      "learning_rate": 5.4909844627699255e-05,
+      "loss": 0.6963,
+      "step": 3550
+    },
+    {
+      "epoch": 6.843118383060635,
+      "grad_norm": 0.31754934787750244,
+      "learning_rate": 5.460993927296407e-05,
+      "loss": 0.6996,
+      "step": 3555
+    },
+    {
+      "epoch": 6.85274302213667,
+      "grad_norm": 0.3002949655056,
+      "learning_rate": 5.4310547228995936e-05,
+      "loss": 0.6946,
+      "step": 3560
+    },
+    {
+      "epoch": 6.862367661212705,
+      "grad_norm": 0.3369508981704712,
+      "learning_rate": 5.4011671881583656e-05,
+      "loss": 0.6902,
+      "step": 3565
+    },
+    {
+      "epoch": 6.871992300288739,
+      "grad_norm": 0.3112001419067383,
+      "learning_rate": 5.371331661067284e-05,
+      "loss": 0.6935,
+      "step": 3570
+    },
+    {
+      "epoch": 6.881616939364774,
+      "grad_norm": 0.3145786225795746,
+      "learning_rate": 5.341548479032745e-05,
+      "loss": 0.7027,
+      "step": 3575
+    },
+    {
+      "epoch": 6.891241578440808,
+      "grad_norm": 0.32883113622665405,
+      "learning_rate": 5.311817978869198e-05,
+      "loss": 0.6928,
+      "step": 3580
+    },
+    {
+      "epoch": 6.900866217516843,
+      "grad_norm": 0.3237265646457672,
+      "learning_rate": 5.2821404967953114e-05,
+      "loss": 0.6865,
+      "step": 3585
+    },
+    {
+      "epoch": 6.910490856592878,
+      "grad_norm": 0.32935890555381775,
+      "learning_rate": 5.2525163684301806e-05,
+      "loss": 0.687,
+      "step": 3590
+    },
+    {
+      "epoch": 6.920115495668912,
+      "grad_norm": 0.342359721660614,
+      "learning_rate": 5.222945928789533e-05,
+      "loss": 0.691,
+      "step": 3595
+    },
+    {
+      "epoch": 6.929740134744947,
+      "grad_norm": 0.3421998620033264,
+      "learning_rate": 5.193429512281926e-05,
+      "loss": 0.6863,
+      "step": 3600
+    },
+    {
+      "epoch": 6.939364773820982,
+      "grad_norm": 0.33589935302734375,
+      "learning_rate": 5.1639674527049855e-05,
+      "loss": 0.6916,
+      "step": 3605
+    },
+    {
+      "epoch": 6.948989412897016,
+      "grad_norm": 0.3499864637851715,
+      "learning_rate": 5.134560083241624e-05,
+      "loss": 0.6878,
+      "step": 3610
+    },
+    {
+      "epoch": 6.958614051973051,
+      "grad_norm": 0.3289993405342102,
+      "learning_rate": 5.105207736456257e-05,
+      "loss": 0.6976,
+      "step": 3615
+    },
+    {
+      "epoch": 6.968238691049086,
+      "grad_norm": 0.32949408888816833,
+      "learning_rate": 5.0759107442910715e-05,
+      "loss": 0.6949,
+      "step": 3620
+    },
+    {
+      "epoch": 6.97786333012512,
+      "grad_norm": 0.3234226703643799,
+      "learning_rate": 5.046669438062238e-05,
+      "loss": 0.6958,
+      "step": 3625
+    },
+    {
+      "epoch": 6.987487969201155,
+      "grad_norm": 0.3094496726989746,
+      "learning_rate": 5.0174841484561953e-05,
+      "loss": 0.6938,
+      "step": 3630
+    },
+    {
+      "epoch": 6.99711260827719,
+      "grad_norm": 0.31556159257888794,
+      "learning_rate": 4.988355205525893e-05,
+      "loss": 0.7004,
+      "step": 3635
+    },
+    {
+      "epoch": 6.999037536092397,
+      "eval_loss": 3.276942253112793,
+      "eval_runtime": 0.7888,
+      "eval_samples_per_second": 13.945,
+      "eval_steps_per_second": 2.535,
+      "step": 3636
+    },
+    {
+      "epoch": 7.006737247353224,
+      "grad_norm": 0.26794806122779846,
+      "learning_rate": 4.959282938687061e-05,
+      "loss": 0.6482,
+      "step": 3640
+    },
+    {
+      "epoch": 7.016361886429259,
+      "grad_norm": 0.3672392666339874,
+      "learning_rate": 4.9302676767144926e-05,
+      "loss": 0.6471,
+      "step": 3645
+    },
+    {
+      "epoch": 7.025986525505293,
+      "grad_norm": 0.2901393175125122,
+      "learning_rate": 4.901309747738305e-05,
+      "loss": 0.654,
+      "step": 3650
+    },
+    {
+      "epoch": 7.035611164581328,
+      "grad_norm": 0.3516036868095398,
+      "learning_rate": 4.872409479240259e-05,
+      "loss": 0.6452,
+      "step": 3655
+    },
+    {
+      "epoch": 7.045235803657363,
+      "grad_norm": 0.3640913665294647,
+      "learning_rate": 4.843567198050031e-05,
+      "loss": 0.6369,
+      "step": 3660
+    },
+    {
+      "epoch": 7.054860442733397,
+      "grad_norm": 0.2963874936103821,
+      "learning_rate": 4.814783230341531e-05,
+      "loss": 0.6353,
+      "step": 3665
+    },
+    {
+      "epoch": 7.0644850818094325,
+      "grad_norm": 0.3295438587665558,
+      "learning_rate": 4.786057901629209e-05,
+      "loss": 0.6398,
+      "step": 3670
+    },
+    {
+      "epoch": 7.074109720885467,
+      "grad_norm": 0.3382556736469269,
+      "learning_rate": 4.757391536764366e-05,
+      "loss": 0.6452,
+      "step": 3675
+    },
+    {
+      "epoch": 7.083734359961501,
+      "grad_norm": 0.3277692496776581,
+      "learning_rate": 4.728784459931495e-05,
+      "loss": 0.637,
+      "step": 3680
+    },
+    {
+      "epoch": 7.0933589990375365,
+      "grad_norm": 0.3565356433391571,
+      "learning_rate": 4.700236994644609e-05,
+      "loss": 0.6379,
+      "step": 3685
+    },
+    {
+      "epoch": 7.102983638113571,
+      "grad_norm": 0.35193830728530884,
+      "learning_rate": 4.671749463743572e-05,
+      "loss": 0.6512,
+      "step": 3690
+    },
+    {
+      "epoch": 7.112608277189605,
+      "grad_norm": 0.32000118494033813,
+      "learning_rate": 4.64332218939047e-05,
+      "loss": 0.6445,
+      "step": 3695
+    },
+    {
+      "epoch": 7.12223291626564,
+      "grad_norm": 0.33006584644317627,
+      "learning_rate": 4.61495549306594e-05,
+      "loss": 0.6381,
+      "step": 3700
+    },
+    {
+      "epoch": 7.131857555341675,
+      "grad_norm": 0.3775092661380768,
+      "learning_rate": 4.586649695565563e-05,
+      "loss": 0.6331,
+      "step": 3705
+    },
+    {
+      "epoch": 7.141482194417709,
+      "grad_norm": 0.3325980007648468,
+      "learning_rate": 4.558405116996214e-05,
+      "loss": 0.6436,
+      "step": 3710
+    },
+    {
+      "epoch": 7.151106833493744,
+      "grad_norm": 0.3391129970550537,
+      "learning_rate": 4.530222076772456e-05,
+      "loss": 0.6415,
+      "step": 3715
+    },
+    {
+      "epoch": 7.160731472569779,
+      "grad_norm": 0.31919702887535095,
+      "learning_rate": 4.5021008936129216e-05,
+      "loss": 0.6441,
+      "step": 3720
+    },
+    {
+      "epoch": 7.170356111645813,
+      "grad_norm": 0.3420950770378113,
+      "learning_rate": 4.4740418855367005e-05,
+      "loss": 0.6524,
+      "step": 3725
+    },
+    {
+      "epoch": 7.179980750721848,
+      "grad_norm": 0.354056179523468,
+      "learning_rate": 4.4460453698597623e-05,
+      "loss": 0.6476,
+      "step": 3730
+    },
+    {
+      "epoch": 7.189605389797882,
+      "grad_norm": 0.31593650579452515,
+      "learning_rate": 4.418111663191354e-05,
+      "loss": 0.6473,
+      "step": 3735
+    },
+    {
+      "epoch": 7.1992300288739175,
+      "grad_norm": 0.33761167526245117,
+      "learning_rate": 4.390241081430423e-05,
+      "loss": 0.6402,
+      "step": 3740
+    },
+    {
+      "epoch": 7.208854667949952,
+      "grad_norm": 0.35358771681785583,
+      "learning_rate": 4.362433939762046e-05,
+      "loss": 0.6471,
+      "step": 3745
+    },
+    {
+      "epoch": 7.218479307025986,
+      "grad_norm": 0.32182127237319946,
+      "learning_rate": 4.3346905526538574e-05,
+      "loss": 0.6408,
+      "step": 3750
+    },
+    {
+      "epoch": 7.228103946102022,
+      "grad_norm": 0.3282702565193176,
+      "learning_rate": 4.307011233852505e-05,
+      "loss": 0.642,
+      "step": 3755
+    },
+    {
+      "epoch": 7.237728585178056,
+      "grad_norm": 0.33513620495796204,
+      "learning_rate": 4.279396296380097e-05,
+      "loss": 0.6391,
+      "step": 3760
+    },
+    {
+      "epoch": 7.24735322425409,
+      "grad_norm": 0.33494138717651367,
+      "learning_rate": 4.2518460525306524e-05,
+      "loss": 0.6401,
+      "step": 3765
+    },
+    {
+      "epoch": 7.256977863330125,
+      "grad_norm": 0.33716508746147156,
+      "learning_rate": 4.2243608138665906e-05,
+      "loss": 0.6499,
+      "step": 3770
+    },
+    {
+      "epoch": 7.26660250240616,
+      "grad_norm": 0.3404597043991089,
+      "learning_rate": 4.19694089121518e-05,
+      "loss": 0.6385,
+      "step": 3775
+    },
+    {
+      "epoch": 7.276227141482194,
+      "grad_norm": 0.32999253273010254,
+      "learning_rate": 4.169586594665048e-05,
+      "loss": 0.6433,
+      "step": 3780
+    },
+    {
+      "epoch": 7.285851780558229,
+      "grad_norm": 0.3411442041397095,
+      "learning_rate": 4.142298233562664e-05,
+      "loss": 0.6422,
+      "step": 3785
+    },
+    {
+      "epoch": 7.295476419634264,
+      "grad_norm": 0.3550765812397003,
+      "learning_rate": 4.115076116508837e-05,
+      "loss": 0.6458,
+      "step": 3790
+    },
+    {
+      "epoch": 7.3051010587102985,
+      "grad_norm": 0.3416723608970642,
+      "learning_rate": 4.08792055135524e-05,
+      "loss": 0.6456,
+      "step": 3795
+    },
+    {
+      "epoch": 7.314725697786333,
+      "grad_norm": 0.35609087347984314,
+      "learning_rate": 4.0608318452009e-05,
+      "loss": 0.6533,
+      "step": 3800
+    },
+    {
+      "epoch": 7.324350336862367,
+      "grad_norm": 0.332507461309433,
+      "learning_rate": 4.033810304388759e-05,
+      "loss": 0.6282,
+      "step": 3805
+    },
+    {
+      "epoch": 7.3339749759384025,
+      "grad_norm": 0.34344714879989624,
+      "learning_rate": 4.006856234502191e-05,
+      "loss": 0.633,
+      "step": 3810
+    },
+    {
+      "epoch": 7.343599615014437,
+      "grad_norm": 0.3543119430541992,
+      "learning_rate": 3.9799699403615457e-05,
+      "loss": 0.6417,
+      "step": 3815
+    },
+    {
+      "epoch": 7.353224254090471,
+      "grad_norm": 0.3393097221851349,
+      "learning_rate": 3.953151726020713e-05,
+      "loss": 0.6337,
+      "step": 3820
+    },
+    {
+      "epoch": 7.362848893166507,
+      "grad_norm": 0.34601929783821106,
+      "learning_rate": 3.926401894763663e-05,
+      "loss": 0.6514,
+      "step": 3825
+    },
+    {
+      "epoch": 7.372473532242541,
+      "grad_norm": 0.3476494550704956,
+      "learning_rate": 3.89972074910104e-05,
+      "loss": 0.6381,
+      "step": 3830
+    },
+    {
+      "epoch": 7.382098171318575,
+      "grad_norm": 0.3308873474597931,
+      "learning_rate": 3.8731085907667345e-05,
+      "loss": 0.6523,
+      "step": 3835
+    },
+    {
+      "epoch": 7.39172281039461,
+      "grad_norm": 0.33746767044067383,
+      "learning_rate": 3.846565720714451e-05,
+      "loss": 0.6386,
+      "step": 3840
+    },
+    {
+      "epoch": 7.401347449470645,
+      "grad_norm": 0.33146432042121887,
+      "learning_rate": 3.820092439114339e-05,
+      "loss": 0.6505,
+      "step": 3845
+    },
+    {
+      "epoch": 7.410972088546679,
+      "grad_norm": 0.34075871109962463,
+      "learning_rate": 3.793689045349575e-05,
+      "loss": 0.6292,
+      "step": 3850
+    },
+    {
+      "epoch": 7.420596727622714,
+      "grad_norm": 0.3384300172328949,
+      "learning_rate": 3.7673558380129735e-05,
+      "loss": 0.649,
+      "step": 3855
+    },
+    {
+      "epoch": 7.430221366698749,
+      "grad_norm": 0.35409146547317505,
+      "learning_rate": 3.741093114903631e-05,
+      "loss": 0.6401,
+      "step": 3860
+    },
+    {
+      "epoch": 7.4398460057747835,
+      "grad_norm": 0.3388952314853668,
+      "learning_rate": 3.7149011730235394e-05,
+      "loss": 0.646,
+      "step": 3865
+    },
+    {
+      "epoch": 7.449470644850818,
+      "grad_norm": 0.3542778789997101,
+      "learning_rate": 3.688780308574238e-05,
+      "loss": 0.6367,
+      "step": 3870
+    },
+    {
+      "epoch": 7.459095283926853,
+      "grad_norm": 0.33730167150497437,
+      "learning_rate": 3.66273081695346e-05,
+      "loss": 0.655,
+      "step": 3875
+    },
+    {
+      "epoch": 7.468719923002888,
+      "grad_norm": 0.3402201533317566,
+      "learning_rate": 3.6367529927517855e-05,
+      "loss": 0.6327,
+      "step": 3880
+    },
+    {
+      "epoch": 7.478344562078922,
+      "grad_norm": 0.3543342649936676,
+      "learning_rate": 3.610847129749323e-05,
+      "loss": 0.6534,
+      "step": 3885
+    },
+    {
+      "epoch": 7.487969201154956,
+      "grad_norm": 0.3624216914176941,
+      "learning_rate": 3.585013520912377e-05,
+      "loss": 0.6393,
+      "step": 3890
+    },
+    {
+      "epoch": 7.497593840230992,
+      "grad_norm": 0.3448854386806488,
+      "learning_rate": 3.559252458390142e-05,
+      "loss": 0.6473,
+      "step": 3895
+    },
+    {
+      "epoch": 7.507218479307026,
+      "grad_norm": 0.3260321021080017,
+      "learning_rate": 3.533564233511394e-05,
+      "loss": 0.635,
+      "step": 3900
+    },
+    {
+      "epoch": 7.51684311838306,
+      "grad_norm": 0.36959561705589294,
+      "learning_rate": 3.507949136781189e-05,
+      "loss": 0.6454,
+      "step": 3905
+    },
+    {
+      "epoch": 7.526467757459095,
+      "grad_norm": 0.3395916223526001,
+      "learning_rate": 3.482407457877598e-05,
+      "loss": 0.6491,
+      "step": 3910
+    },
+    {
+      "epoch": 7.53609239653513,
+      "grad_norm": 0.3479905426502228,
+      "learning_rate": 3.456939485648406e-05,
+      "loss": 0.638,
+      "step": 3915
+    },
+    {
+      "epoch": 7.5457170356111645,
+      "grad_norm": 0.3783397674560547,
+      "learning_rate": 3.4315455081078696e-05,
+      "loss": 0.6446,
+      "step": 3920
+    },
+    {
+      "epoch": 7.555341674687199,
+      "grad_norm": 0.34621936082839966,
+      "learning_rate": 3.4062258124334434e-05,
+      "loss": 0.64,
+      "step": 3925
+    },
+    {
+      "epoch": 7.564966313763234,
+      "grad_norm": 0.34806111454963684,
+      "learning_rate": 3.3809806849625314e-05,
+      "loss": 0.641,
+      "step": 3930
+    },
+    {
+      "epoch": 7.5745909528392685,
+      "grad_norm": 0.33737459778785706,
+      "learning_rate": 3.355810411189264e-05,
+      "loss": 0.6389,
+      "step": 3935
+    },
+    {
+      "epoch": 7.584215591915303,
+      "grad_norm": 0.36518171429634094,
+      "learning_rate": 3.330715275761257e-05,
+      "loss": 0.6448,
+      "step": 3940
+    },
+    {
+      "epoch": 7.593840230991338,
+      "grad_norm": 0.3364472985267639,
+      "learning_rate": 3.305695562476393e-05,
+      "loss": 0.6378,
+      "step": 3945
+    },
+    {
+      "epoch": 7.603464870067373,
+      "grad_norm": 0.345920592546463,
+      "learning_rate": 3.280751554279622e-05,
+      "loss": 0.634,
+      "step": 3950
+    },
+    {
+      "epoch": 7.613089509143407,
+      "grad_norm": 0.33815324306488037,
+      "learning_rate": 3.255883533259741e-05,
+      "loss": 0.6452,
+      "step": 3955
+    },
+    {
+      "epoch": 7.622714148219442,
+      "grad_norm": 0.34798070788383484,
+      "learning_rate": 3.2310917806462274e-05,
+      "loss": 0.6433,
+      "step": 3960
+    },
+    {
+      "epoch": 7.632338787295477,
+      "grad_norm": 0.34050893783569336,
+      "learning_rate": 3.2063765768060475e-05,
+      "loss": 0.6505,
+      "step": 3965
+    },
+    {
+      "epoch": 7.641963426371511,
+      "grad_norm": 0.3409608006477356,
+      "learning_rate": 3.1817382012404854e-05,
+      "loss": 0.6515,
+      "step": 3970
+    },
+    {
+      "epoch": 7.651588065447545,
+      "grad_norm": 0.3448992371559143,
+      "learning_rate": 3.157176932581983e-05,
+      "loss": 0.6355,
+      "step": 3975
+    },
+    {
+      "epoch": 7.661212704523581,
+      "grad_norm": 0.3314208984375,
+      "learning_rate": 3.132693048590988e-05,
+      "loss": 0.647,
+      "step": 3980
+    },
+    {
+      "epoch": 7.670837343599615,
+      "grad_norm": 0.34806132316589355,
+      "learning_rate": 3.108286826152818e-05,
+      "loss": 0.6377,
+      "step": 3985
+    },
+    {
+      "epoch": 7.6804619826756495,
+      "grad_norm": 0.3525891900062561,
+      "learning_rate": 3.083958541274518e-05,
+      "loss": 0.6326,
+      "step": 3990
+    },
+    {
+      "epoch": 7.690086621751684,
+      "grad_norm": 0.36846107244491577,
+      "learning_rate": 3.059708469081754e-05,
+      "loss": 0.6327,
+      "step": 3995
+    },
+    {
+      "epoch": 7.699711260827719,
+      "grad_norm": 0.33311864733695984,
+      "learning_rate": 3.035536883815696e-05,
+      "loss": 0.6379,
+      "step": 4000
+    },
+    {
+      "epoch": 7.709335899903754,
+      "grad_norm": 0.3615313172340393,
+      "learning_rate": 3.0114440588299033e-05,
+      "loss": 0.6522,
+      "step": 4005
+    },
+    {
+      "epoch": 7.718960538979788,
+      "grad_norm": 0.33901557326316833,
+      "learning_rate": 2.9874302665872544e-05,
+      "loss": 0.6495,
+      "step": 4010
+    },
+    {
+      "epoch": 7.728585178055823,
+      "grad_norm": 0.3336678743362427,
+      "learning_rate": 2.963495778656853e-05,
+      "loss": 0.6583,
+      "step": 4015
+    },
+    {
+      "epoch": 7.738209817131858,
+      "grad_norm": 0.38028064370155334,
+      "learning_rate": 2.9396408657109608e-05,
+      "loss": 0.6365,
+      "step": 4020
+    },
+    {
+      "epoch": 7.747834456207892,
+      "grad_norm": 0.3507869243621826,
+      "learning_rate": 2.9158657975219385e-05,
+      "loss": 0.6466,
+      "step": 4025
+    },
+    {
+      "epoch": 7.757459095283927,
+      "grad_norm": 0.3580639660358429,
+      "learning_rate": 2.8921708429591797e-05,
+      "loss": 0.6472,
+      "step": 4030
+    },
+    {
+      "epoch": 7.767083734359962,
+      "grad_norm": 0.3309887945652008,
+      "learning_rate": 2.8685562699860957e-05,
+      "loss": 0.6476,
+      "step": 4035
+    },
+    {
+      "epoch": 7.776708373435996,
+      "grad_norm": 0.3457421064376831,
+      "learning_rate": 2.8450223456570668e-05,
+      "loss": 0.6414,
+      "step": 4040
+    },
+    {
+      "epoch": 7.7863330125120305,
+      "grad_norm": 0.33013686537742615,
+      "learning_rate": 2.8215693361144324e-05,
+      "loss": 0.6535,
+      "step": 4045
+    },
+    {
+      "epoch": 7.795957651588066,
+      "grad_norm": 0.32177311182022095,
+      "learning_rate": 2.798197506585464e-05,
+      "loss": 0.6487,
+      "step": 4050
+    },
+    {
+      "epoch": 7.8055822906641,
+      "grad_norm": 0.3439447283744812,
+      "learning_rate": 2.774907121379393e-05,
+      "loss": 0.6354,
+      "step": 4055
+    },
+    {
+      "epoch": 7.8152069297401345,
+      "grad_norm": 0.34718647599220276,
+      "learning_rate": 2.751698443884394e-05,
+      "loss": 0.6504,
+      "step": 4060
+    },
+    {
+      "epoch": 7.824831568816169,
+      "grad_norm": 0.34381964802742004,
+      "learning_rate": 2.7285717365646256e-05,
+      "loss": 0.6453,
+      "step": 4065
+    },
+    {
+      "epoch": 7.834456207892204,
+      "grad_norm": 0.34925544261932373,
+      "learning_rate": 2.7055272609572568e-05,
+      "loss": 0.6484,
+      "step": 4070
+    },
+    {
+      "epoch": 7.844080846968239,
+      "grad_norm": 0.34031766653060913,
+      "learning_rate": 2.6825652776695076e-05,
+      "loss": 0.6462,
+      "step": 4075
+    },
+    {
+      "epoch": 7.853705486044273,
+      "grad_norm": 0.3397299349308014,
+      "learning_rate": 2.6596860463756935e-05,
+      "loss": 0.6444,
+      "step": 4080
+    },
+    {
+      "epoch": 7.863330125120308,
+      "grad_norm": 0.348021537065506,
+      "learning_rate": 2.636889825814307e-05,
+      "loss": 0.6389,
+      "step": 4085
+    },
+    {
+      "epoch": 7.872954764196343,
+      "grad_norm": 0.3368039131164551,
+      "learning_rate": 2.6141768737850814e-05,
+      "loss": 0.6453,
+      "step": 4090
+    },
+    {
+      "epoch": 7.882579403272377,
+      "grad_norm": 0.34815698862075806,
+      "learning_rate": 2.5915474471460732e-05,
+      "loss": 0.6474,
+      "step": 4095
+    },
+    {
+      "epoch": 7.892204042348412,
+      "grad_norm": 0.3499961793422699,
+      "learning_rate": 2.5690018018107642e-05,
+      "loss": 0.6436,
+      "step": 4100
+    },
+    {
+      "epoch": 7.901828681424447,
+      "grad_norm": 0.3426460921764374,
+      "learning_rate": 2.5465401927451537e-05,
+      "loss": 0.6437,
+      "step": 4105
+    },
+    {
+      "epoch": 7.911453320500481,
+      "grad_norm": 0.3375738561153412,
+      "learning_rate": 2.524162873964896e-05,
+      "loss": 0.6394,
+      "step": 4110
+    },
+    {
+      "epoch": 7.9210779595765155,
+      "grad_norm": 0.34224507212638855,
+      "learning_rate": 2.501870098532412e-05,
+      "loss": 0.6524,
+      "step": 4115
+    },
+    {
+      "epoch": 7.930702598652551,
+      "grad_norm": 0.3286498785018921,
+      "learning_rate": 2.4796621185540348e-05,
+      "loss": 0.6507,
+      "step": 4120
+    },
+    {
+      "epoch": 7.940327237728585,
+      "grad_norm": 0.36504673957824707,
+      "learning_rate": 2.4575391851771477e-05,
+      "loss": 0.6389,
+      "step": 4125
+    },
+    {
+      "epoch": 7.94995187680462,
+      "grad_norm": 0.3325868546962738,
+      "learning_rate": 2.4355015485873644e-05,
+      "loss": 0.6402,
+      "step": 4130
+    },
+    {
+      "epoch": 7.959576515880655,
+      "grad_norm": 0.35220691561698914,
+      "learning_rate": 2.4135494580056737e-05,
+      "loss": 0.6553,
+      "step": 4135
+    },
+    {
+      "epoch": 7.969201154956689,
+      "grad_norm": 0.3708426058292389,
+      "learning_rate": 2.3916831616856473e-05,
+      "loss": 0.6518,
+      "step": 4140
+    },
+    {
+      "epoch": 7.978825794032724,
+      "grad_norm": 0.34426939487457275,
+      "learning_rate": 2.3699029069106115e-05,
+      "loss": 0.6505,
+      "step": 4145
+    },
+    {
+      "epoch": 7.988450433108758,
+      "grad_norm": 0.3554341793060303,
+      "learning_rate": 2.348208939990866e-05,
+      "loss": 0.6497,
+      "step": 4150
+    },
+    {
+      "epoch": 7.998075072184793,
+      "grad_norm": 0.3434050381183624,
+      "learning_rate": 2.3266015062608838e-05,
+      "loss": 0.6466,
+      "step": 4155
+    },
+    {
+      "epoch": 8.0,
+      "eval_loss": 3.694774627685547,
+      "eval_runtime": 0.7787,
+      "eval_samples_per_second": 14.127,
+      "eval_steps_per_second": 2.569,
+      "step": 4156
+    },
+    {
+      "epoch": 8.007699711260829,
+      "grad_norm": 0.28248271346092224,
+      "learning_rate": 2.3050808500765487e-05,
+      "loss": 0.6121,
+      "step": 4160
+    },
+    {
+      "epoch": 8.017324350336862,
+      "grad_norm": 0.36666032671928406,
+      "learning_rate": 2.2836472148123878e-05,
+      "loss": 0.6176,
+      "step": 4165
+    },
+    {
+      "epoch": 8.026948989412897,
+      "grad_norm": 0.32897964119911194,
+      "learning_rate": 2.2623008428588177e-05,
+      "loss": 0.6079,
+      "step": 4170
+    },
+    {
+      "epoch": 8.03657362848893,
+      "grad_norm": 0.32618117332458496,
+      "learning_rate": 2.24104197561941e-05,
+      "loss": 0.6043,
+      "step": 4175
+    },
+    {
+      "epoch": 8.046198267564966,
+      "grad_norm": 0.3435162305831909,
+      "learning_rate": 2.2198708535081446e-05,
+      "loss": 0.6082,
+      "step": 4180
+    },
+    {
+      "epoch": 8.055822906641001,
+      "grad_norm": 0.3350038528442383,
+      "learning_rate": 2.198787715946712e-05,
+      "loss": 0.6098,
+      "step": 4185
+    },
+    {
+      "epoch": 8.065447545717035,
+      "grad_norm": 0.3771952986717224,
+      "learning_rate": 2.1777928013617908e-05,
+      "loss": 0.6137,
+      "step": 4190
+    },
+    {
+      "epoch": 8.07507218479307,
+      "grad_norm": 0.3174493610858917,
+      "learning_rate": 2.1568863471823642e-05,
+      "loss": 0.6169,
+      "step": 4195
+    },
+    {
+      "epoch": 8.084696823869105,
+      "grad_norm": 0.33214735984802246,
+      "learning_rate": 2.1360685898370146e-05,
+      "loss": 0.6066,
+      "step": 4200
+    },
+    {
+      "epoch": 8.094321462945139,
+      "grad_norm": 0.3336653411388397,
+      "learning_rate": 2.1153397647512763e-05,
+      "loss": 0.6073,
+      "step": 4205
+    },
+    {
+      "epoch": 8.103946102021174,
+      "grad_norm": 0.32206472754478455,
+      "learning_rate": 2.0947001063449457e-05,
+      "loss": 0.6,
+      "step": 4210
+    },
+    {
+      "epoch": 8.11357074109721,
+      "grad_norm": 0.3184707760810852,
+      "learning_rate": 2.074149848029453e-05,
+      "loss": 0.6065,
+      "step": 4215
+    },
+    {
+      "epoch": 8.123195380173243,
+      "grad_norm": 0.3209008276462555,
+      "learning_rate": 2.0536892222052128e-05,
+      "loss": 0.608,
+      "step": 4220
+    },
+    {
+      "epoch": 8.132820019249278,
+      "grad_norm": 0.34929510951042175,
+      "learning_rate": 2.0333184602589962e-05,
+      "loss": 0.6125,
+      "step": 4225
+    },
+    {
+      "epoch": 8.142444658325314,
+      "grad_norm": 0.34042608737945557,
+      "learning_rate": 2.01303779256131e-05,
+      "loss": 0.6094,
+      "step": 4230
+    },
+    {
+      "epoch": 8.152069297401347,
+      "grad_norm": 0.33042535185813904,
+      "learning_rate": 1.992847448463798e-05,
+      "loss": 0.6122,
+      "step": 4235
+    },
+    {
+      "epoch": 8.161693936477382,
+      "grad_norm": 0.3154657781124115,
+      "learning_rate": 1.9727476562966508e-05,
+      "loss": 0.6141,
+      "step": 4240
+    },
+    {
+      "epoch": 8.171318575553416,
+      "grad_norm": 0.33518335223197937,
+      "learning_rate": 1.952738643366011e-05,
+      "loss": 0.6139,
+      "step": 4245
+    },
+    {
+      "epoch": 8.180943214629451,
+      "grad_norm": 0.3391817510128021,
+      "learning_rate": 1.9328206359514155e-05,
+      "loss": 0.6106,
+      "step": 4250
+    },
+    {
+      "epoch": 8.190567853705486,
+      "grad_norm": 0.33157217502593994,
+      "learning_rate": 1.9129938593032227e-05,
+      "loss": 0.6051,
+      "step": 4255
+    },
+    {
+      "epoch": 8.20019249278152,
+      "grad_norm": 0.3601199686527252,
+      "learning_rate": 1.8932585376400803e-05,
+      "loss": 0.6127,
+      "step": 4260
+    },
+    {
+      "epoch": 8.209817131857555,
+      "grad_norm": 0.3452966511249542,
+      "learning_rate": 1.8736148941463795e-05,
+      "loss": 0.6162,
+      "step": 4265
+    },
+    {
+      "epoch": 8.21944177093359,
+      "grad_norm": 0.3637758791446686,
+      "learning_rate": 1.854063150969737e-05,
+      "loss": 0.6232,
+      "step": 4270
+    },
+    {
+      "epoch": 8.229066410009624,
+      "grad_norm": 0.3771421015262604,
+      "learning_rate": 1.834603529218475e-05,
+      "loss": 0.6066,
+      "step": 4275
+    },
+    {
+      "epoch": 8.23869104908566,
+      "grad_norm": 0.338925302028656,
+      "learning_rate": 1.81523624895913e-05,
+      "loss": 0.6155,
+      "step": 4280
+    },
+    {
+      "epoch": 8.248315688161695,
+      "grad_norm": 0.3534870147705078,
+      "learning_rate": 1.7959615292139544e-05,
+      "loss": 0.614,
+      "step": 4285
+    },
+    {
+      "epoch": 8.257940327237728,
+      "grad_norm": 0.33125004172325134,
+      "learning_rate": 1.7767795879584504e-05,
+      "loss": 0.6175,
+      "step": 4290
+    },
+    {
+      "epoch": 8.267564966313763,
+      "grad_norm": 0.3411141037940979,
+      "learning_rate": 1.7576906421188967e-05,
+      "loss": 0.6114,
+      "step": 4295
+    },
+    {
+      "epoch": 8.277189605389799,
+      "grad_norm": 0.3340323865413666,
+      "learning_rate": 1.738694907569901e-05,
+      "loss": 0.6233,
+      "step": 4300
+    },
+    {
+      "epoch": 8.286814244465832,
+      "grad_norm": 0.3233914375305176,
+      "learning_rate": 1.7197925991319486e-05,
+      "loss": 0.6082,
+      "step": 4305
+    },
+    {
+      "epoch": 8.296438883541867,
+      "grad_norm": 0.3364531099796295,
+      "learning_rate": 1.7009839305689855e-05,
+      "loss": 0.6049,
+      "step": 4310
+    },
+    {
+      "epoch": 8.306063522617901,
+      "grad_norm": 0.34157273173332214,
+      "learning_rate": 1.682269114585996e-05,
+      "loss": 0.6141,
+      "step": 4315
+    },
+    {
+      "epoch": 8.315688161693936,
+      "grad_norm": 0.33447617292404175,
+      "learning_rate": 1.6636483628265942e-05,
+      "loss": 0.6093,
+      "step": 4320
+    },
+    {
+      "epoch": 8.325312800769971,
+      "grad_norm": 0.33221328258514404,
+      "learning_rate": 1.6451218858706374e-05,
+      "loss": 0.6073,
+      "step": 4325
+    },
+    {
+      "epoch": 8.334937439846005,
+      "grad_norm": 0.32823801040649414,
+      "learning_rate": 1.626689893231832e-05,
+      "loss": 0.6069,
+      "step": 4330
+    },
+    {
+      "epoch": 8.34456207892204,
+      "grad_norm": 0.3583478629589081,
+      "learning_rate": 1.60835259335538e-05,
+      "loss": 0.6171,
+      "step": 4335
+    },
+    {
+      "epoch": 8.354186717998076,
+      "grad_norm": 0.33178088068962097,
+      "learning_rate": 1.5901101936156136e-05,
+      "loss": 0.6066,
+      "step": 4340
+    },
+    {
+      "epoch": 8.363811357074109,
+      "grad_norm": 0.3466804623603821,
+      "learning_rate": 1.5719629003136506e-05,
+      "loss": 0.6023,
+      "step": 4345
+    },
+    {
+      "epoch": 8.373435996150144,
+      "grad_norm": 0.357316792011261,
+      "learning_rate": 1.5539109186750544e-05,
+      "loss": 0.6059,
+      "step": 4350
+    },
+    {
+      "epoch": 8.38306063522618,
+      "grad_norm": 0.3246915340423584,
+      "learning_rate": 1.5359544528475323e-05,
+      "loss": 0.6231,
+      "step": 4355
+    },
+    {
+      "epoch": 8.392685274302213,
+      "grad_norm": 0.3579736649990082,
+      "learning_rate": 1.5180937058986033e-05,
+      "loss": 0.617,
+      "step": 4360
+    },
+    {
+      "epoch": 8.402309913378248,
+      "grad_norm": 0.33767664432525635,
+      "learning_rate": 1.5003288798133198e-05,
+      "loss": 0.6135,
+      "step": 4365
+    },
+    {
+      "epoch": 8.411934552454284,
+      "grad_norm": 0.34384191036224365,
+      "learning_rate": 1.4826601754919755e-05,
+      "loss": 0.6045,
+      "step": 4370
+    },
+    {
+      "epoch": 8.421559191530317,
+      "grad_norm": 0.34475091099739075,
+      "learning_rate": 1.4650877927478357e-05,
+      "loss": 0.611,
+      "step": 4375
+    },
+    {
+      "epoch": 8.431183830606352,
+      "grad_norm": 0.3544045686721802,
+      "learning_rate": 1.4476119303048707e-05,
+      "loss": 0.6048,
+      "step": 4380
+    },
+    {
+      "epoch": 8.440808469682388,
+      "grad_norm": 0.3278457820415497,
+      "learning_rate": 1.43023278579552e-05,
+      "loss": 0.6216,
+      "step": 4385
+    },
+    {
+      "epoch": 8.450433108758421,
+      "grad_norm": 0.33195823431015015,
+      "learning_rate": 1.4129505557584511e-05,
+      "loss": 0.6106,
+      "step": 4390
+    },
+    {
+      "epoch": 8.460057747834457,
+      "grad_norm": 0.32435399293899536,
+      "learning_rate": 1.3957654356363349e-05,
+      "loss": 0.6142,
+      "step": 4395
+    },
+    {
+      "epoch": 8.46968238691049,
+      "grad_norm": 0.34540995955467224,
+      "learning_rate": 1.3786776197736417e-05,
+      "loss": 0.6112,
+      "step": 4400
+    },
+    {
+      "epoch": 8.479307025986525,
+      "grad_norm": 0.3274092972278595,
+      "learning_rate": 1.3616873014144327e-05,
+      "loss": 0.6151,
+      "step": 4405
+    },
+    {
+      "epoch": 8.48893166506256,
+      "grad_norm": 0.3616076409816742,
+      "learning_rate": 1.3447946727001881e-05,
+      "loss": 0.6167,
+      "step": 4410
+    },
+    {
+      "epoch": 8.498556304138594,
+      "grad_norm": 0.32997846603393555,
+      "learning_rate": 1.3279999246676256e-05,
+      "loss": 0.611,
+      "step": 4415
+    },
+    {
+      "epoch": 8.50818094321463,
+      "grad_norm": 0.34430432319641113,
+      "learning_rate": 1.3113032472465426e-05,
+      "loss": 0.613,
+      "step": 4420
+    },
+    {
+      "epoch": 8.517805582290665,
+      "grad_norm": 0.35246655344963074,
+      "learning_rate": 1.2947048292576636e-05,
+      "loss": 0.6133,
+      "step": 4425
+    },
+    {
+      "epoch": 8.527430221366698,
+      "grad_norm": 0.3330981433391571,
+      "learning_rate": 1.2782048584105166e-05,
+      "loss": 0.615,
+      "step": 4430
+    },
+    {
+      "epoch": 8.537054860442733,
+      "grad_norm": 0.33830517530441284,
+      "learning_rate": 1.2618035213012924e-05,
+      "loss": 0.6175,
+      "step": 4435
+    },
+    {
+      "epoch": 8.546679499518769,
+      "grad_norm": 0.3427278399467468,
+      "learning_rate": 1.2455010034107527e-05,
+      "loss": 0.6111,
+      "step": 4440
+    },
+    {
+      "epoch": 8.556304138594802,
+      "grad_norm": 0.3526034355163574,
+      "learning_rate": 1.2292974891021236e-05,
+      "loss": 0.6135,
+      "step": 4445
+    },
+    {
+      "epoch": 8.565928777670837,
+      "grad_norm": 0.3584502935409546,
+      "learning_rate": 1.2131931616190118e-05,
+      "loss": 0.6143,
+      "step": 4450
+    },
+    {
+      "epoch": 8.575553416746873,
+      "grad_norm": 0.32676076889038086,
+      "learning_rate": 1.1971882030833248e-05,
+      "loss": 0.6092,
+      "step": 4455
+    },
+    {
+      "epoch": 8.585178055822906,
+      "grad_norm": 0.3570641279220581,
+      "learning_rate": 1.181282794493227e-05,
+      "loss": 0.6101,
+      "step": 4460
+    },
+    {
+      "epoch": 8.594802694898942,
+      "grad_norm": 0.35699462890625,
+      "learning_rate": 1.165477115721083e-05,
+      "loss": 0.6116,
+      "step": 4465
+    },
+    {
+      "epoch": 8.604427333974975,
+      "grad_norm": 0.3642681837081909,
+      "learning_rate": 1.1497713455114212e-05,
+      "loss": 0.6204,
+      "step": 4470
+    },
+    {
+      "epoch": 8.61405197305101,
+      "grad_norm": 0.34195858240127563,
+      "learning_rate": 1.1341656614789208e-05,
+      "loss": 0.6105,
+      "step": 4475
+    },
+    {
+      "epoch": 8.623676612127046,
+      "grad_norm": 0.3449951410293579,
+      "learning_rate": 1.1186602401063917e-05,
+      "loss": 0.6061,
+      "step": 4480
+    },
+    {
+      "epoch": 8.63330125120308,
+      "grad_norm": 0.3435938358306885,
+      "learning_rate": 1.1032552567427912e-05,
+      "loss": 0.6097,
+      "step": 4485
+    },
+    {
+      "epoch": 8.642925890279114,
+      "grad_norm": 0.3187827169895172,
+      "learning_rate": 1.0879508856012366e-05,
+      "loss": 0.6022,
+      "step": 4490
+    },
+    {
+      "epoch": 8.65255052935515,
+      "grad_norm": 0.3434700667858124,
+      "learning_rate": 1.0727472997570243e-05,
+      "loss": 0.6116,
+      "step": 4495
+    },
+    {
+      "epoch": 8.662175168431183,
+      "grad_norm": 0.34856435656547546,
+      "learning_rate": 1.0576446711456933e-05,
+      "loss": 0.605,
+      "step": 4500
+    },
+    {
+      "epoch": 8.671799807507218,
+      "grad_norm": 0.3594229817390442,
+      "learning_rate": 1.0426431705610606e-05,
+      "loss": 0.6133,
+      "step": 4505
+    },
+    {
+      "epoch": 8.681424446583254,
+      "grad_norm": 0.3380817174911499,
+      "learning_rate": 1.0277429676533023e-05,
+      "loss": 0.6073,
+      "step": 4510
+    },
+    {
+      "epoch": 8.691049085659287,
+      "grad_norm": 0.3276160955429077,
+      "learning_rate": 1.012944230927031e-05,
+      "loss": 0.6021,
+      "step": 4515
+    },
+    {
+      "epoch": 8.700673724735323,
+      "grad_norm": 0.34987348318099976,
+      "learning_rate": 9.9824712773939e-06,
+      "loss": 0.617,
+      "step": 4520
+    },
+    {
+      "epoch": 8.710298363811358,
+      "grad_norm": 0.3415302336215973,
+      "learning_rate": 9.83651824298164e-06,
+      "loss": 0.6111,
+      "step": 4525
+    },
+    {
+      "epoch": 8.719923002887391,
+      "grad_norm": 0.34866005182266235,
+      "learning_rate": 9.69158485659889e-06,
+      "loss": 0.603,
+      "step": 4530
+    },
+    {
+      "epoch": 8.729547641963427,
+      "grad_norm": 0.36085546016693115,
+      "learning_rate": 9.547672757280001e-06,
+      "loss": 0.6042,
+      "step": 4535
+    },
+    {
+      "epoch": 8.739172281039462,
+      "grad_norm": 0.36267852783203125,
+      "learning_rate": 9.40478357250969e-06,
+      "loss": 0.6127,
+      "step": 4540
+    },
+    {
+      "epoch": 8.748796920115495,
+      "grad_norm": 0.36462917923927307,
+      "learning_rate": 9.262918918204643e-06,
+      "loss": 0.6123,
+      "step": 4545
+    },
+    {
+      "epoch": 8.75842155919153,
+      "grad_norm": 0.34768378734588623,
+      "learning_rate": 9.122080398695299e-06,
+      "loss": 0.6048,
+      "step": 4550
+    },
+    {
+      "epoch": 8.768046198267564,
+      "grad_norm": 0.330387681722641,
+      "learning_rate": 8.982269606707593e-06,
+      "loss": 0.6165,
+      "step": 4555
+    },
+    {
+      "epoch": 8.7776708373436,
+      "grad_norm": 0.3596397936344147,
+      "learning_rate": 8.843488123345044e-06,
+      "loss": 0.6072,
+      "step": 4560
+    },
+    {
+      "epoch": 8.787295476419635,
+      "grad_norm": 0.35082703828811646,
+      "learning_rate": 8.705737518070888e-06,
+      "loss": 0.6185,
+      "step": 4565
+    },
+    {
+      "epoch": 8.796920115495668,
+      "grad_norm": 0.33255165815353394,
+      "learning_rate": 8.569019348690189e-06,
+      "loss": 0.6099,
+      "step": 4570
+    },
+    {
+      "epoch": 8.806544754571703,
+      "grad_norm": 0.3488062620162964,
+      "learning_rate": 8.433335161332412e-06,
+      "loss": 0.6056,
+      "step": 4575
+    },
+    {
+      "epoch": 8.816169393647739,
+      "grad_norm": 0.35131949186325073,
+      "learning_rate": 8.298686490433771e-06,
+      "loss": 0.6102,
+      "step": 4580
+    },
+    {
+      "epoch": 8.825794032723772,
+      "grad_norm": 0.37358999252319336,
+      "learning_rate": 8.165074858719989e-06,
+      "loss": 0.6103,
+      "step": 4585
+    },
+    {
+      "epoch": 8.835418671799808,
+      "grad_norm": 0.35089996457099915,
+      "learning_rate": 8.032501777189017e-06,
+      "loss": 0.6112,
+      "step": 4590
+    },
+    {
+      "epoch": 8.845043310875843,
+      "grad_norm": 0.35341012477874756,
+      "learning_rate": 7.900968745093996e-06,
+      "loss": 0.6089,
+      "step": 4595
+    },
+    {
+      "epoch": 8.854667949951876,
+      "grad_norm": 0.3873613178730011,
+      "learning_rate": 7.770477249926256e-06,
+      "loss": 0.6111,
+      "step": 4600
+    },
+    {
+      "epoch": 8.864292589027912,
+      "grad_norm": 0.34750309586524963,
+      "learning_rate": 7.641028767398472e-06,
+      "loss": 0.616,
+      "step": 4605
+    },
+    {
+      "epoch": 8.873917228103947,
+      "grad_norm": 0.32477355003356934,
+      "learning_rate": 7.512624761428066e-06,
+      "loss": 0.6089,
+      "step": 4610
+    },
+    {
+      "epoch": 8.88354186717998,
+      "grad_norm": 0.35710757970809937,
+      "learning_rate": 7.385266684120573e-06,
+      "loss": 0.61,
+      "step": 4615
+    },
+    {
+      "epoch": 8.893166506256016,
+      "grad_norm": 0.34388595819473267,
+      "learning_rate": 7.258955975753279e-06,
+      "loss": 0.6076,
+      "step": 4620
+    },
+    {
+      "epoch": 8.90279114533205,
+      "grad_norm": 0.32944580912590027,
+      "learning_rate": 7.133694064758867e-06,
+      "loss": 0.606,
+      "step": 4625
+    },
+    {
+      "epoch": 8.912415784408084,
+      "grad_norm": 0.3470548093318939,
+      "learning_rate": 7.0094823677092856e-06,
+      "loss": 0.6015,
+      "step": 4630
+    },
+    {
+      "epoch": 8.92204042348412,
+      "grad_norm": 0.3423613905906677,
+      "learning_rate": 6.886322289299763e-06,
+      "loss": 0.6155,
+      "step": 4635
+    },
+    {
+      "epoch": 8.931665062560153,
+      "grad_norm": 0.35634317994117737,
+      "learning_rate": 6.764215222332914e-06,
+      "loss": 0.6146,
+      "step": 4640
+    },
+    {
+      "epoch": 8.941289701636189,
+      "grad_norm": 0.33485671877861023,
+      "learning_rate": 6.643162547702931e-06,
+      "loss": 0.6135,
+      "step": 4645
+    },
+    {
+      "epoch": 8.950914340712224,
+      "grad_norm": 0.35238829255104065,
+      "learning_rate": 6.523165634380046e-06,
+      "loss": 0.6044,
+      "step": 4650
+    },
+    {
+      "epoch": 8.960538979788257,
+      "grad_norm": 0.3438652753829956,
+      "learning_rate": 6.404225839394973e-06,
+      "loss": 0.6107,
+      "step": 4655
+    },
+    {
+      "epoch": 8.970163618864293,
+      "grad_norm": 0.352061003446579,
+      "learning_rate": 6.286344507823638e-06,
+      "loss": 0.6164,
+      "step": 4660
+    },
+    {
+      "epoch": 8.979788257940328,
+      "grad_norm": 0.3431857228279114,
+      "learning_rate": 6.169522972771924e-06,
+      "loss": 0.6144,
+      "step": 4665
+    },
+    {
+      "epoch": 8.989412897016361,
+      "grad_norm": 0.32378876209259033,
+      "learning_rate": 6.053762555360587e-06,
+      "loss": 0.6162,
+      "step": 4670
+    },
+    {
+      "epoch": 8.999037536092397,
+      "grad_norm": 0.36266306042671204,
+      "learning_rate": 5.939064564710373e-06,
+      "loss": 0.6132,
+      "step": 4675
+    },
+    {
+      "epoch": 8.999037536092397,
+      "eval_loss": 3.9708144664764404,
+      "eval_runtime": 0.7877,
+      "eval_samples_per_second": 13.964,
+      "eval_steps_per_second": 2.539,
+      "step": 4675
+    },
+    {
+      "epoch": 9.008662175168432,
+      "grad_norm": 0.3068545162677765,
+      "learning_rate": 5.825430297927092e-06,
+      "loss": 0.5915,
+      "step": 4680
+    },
+    {
+      "epoch": 9.018286814244465,
+      "grad_norm": 0.3031752407550812,
+      "learning_rate": 5.712861040087092e-06,
+      "loss": 0.586,
+      "step": 4685
+    },
+    {
+      "epoch": 9.0279114533205,
+      "grad_norm": 0.33787086606025696,
+      "learning_rate": 5.601358064222639e-06,
+      "loss": 0.5911,
+      "step": 4690
+    },
+    {
+      "epoch": 9.037536092396536,
+      "grad_norm": 0.35586461424827576,
+      "learning_rate": 5.49092263130756e-06,
+      "loss": 0.5828,
+      "step": 4695
+    },
+    {
+      "epoch": 9.04716073147257,
+      "grad_norm": 0.3516261875629425,
+      "learning_rate": 5.381555990242959e-06,
+      "loss": 0.5847,
+      "step": 4700
+    },
+    {
+      "epoch": 9.056785370548605,
+      "grad_norm": 0.34338730573654175,
+      "learning_rate": 5.273259377843087e-06,
+      "loss": 0.6036,
+      "step": 4705
+    },
+    {
+      "epoch": 9.066410009624638,
+      "grad_norm": 0.3557838499546051,
+      "learning_rate": 5.166034018821364e-06,
+      "loss": 0.5939,
+      "step": 4710
+    },
+    {
+      "epoch": 9.076034648700674,
+      "grad_norm": 0.31932586431503296,
+      "learning_rate": 5.059881125776589e-06,
+      "loss": 0.6016,
+      "step": 4715
+    },
+    {
+      "epoch": 9.085659287776709,
+      "grad_norm": 0.3272048532962799,
+      "learning_rate": 4.9548018991790846e-06,
+      "loss": 0.5909,
+      "step": 4720
+    },
+    {
+      "epoch": 9.095283926852742,
+      "grad_norm": 0.3446064889431,
+      "learning_rate": 4.850797527357287e-06,
+      "loss": 0.5827,
+      "step": 4725
+    },
+    {
+      "epoch": 9.104908565928778,
+      "grad_norm": 0.32635557651519775,
+      "learning_rate": 4.747869186484177e-06,
+      "loss": 0.5921,
+      "step": 4730
+    },
+    {
+      "epoch": 9.114533205004813,
+      "grad_norm": 0.31974223256111145,
+      "learning_rate": 4.64601804056406e-06,
+      "loss": 0.5932,
+      "step": 4735
+    },
+    {
+      "epoch": 9.124157844080846,
+      "grad_norm": 0.3654205799102783,
+      "learning_rate": 4.545245241419349e-06,
+      "loss": 0.5995,
+      "step": 4740
+    },
+    {
+      "epoch": 9.133782483156882,
+      "grad_norm": 0.35849812626838684,
+      "learning_rate": 4.445551928677594e-06,
+      "loss": 0.5995,
+      "step": 4745
+    },
+    {
+      "epoch": 9.143407122232917,
+      "grad_norm": 0.3359050750732422,
+      "learning_rate": 4.346939229758529e-06,
+      "loss": 0.5982,
+      "step": 4750
+    },
+    {
+      "epoch": 9.15303176130895,
+      "grad_norm": 0.33533555269241333,
+      "learning_rate": 4.2494082598613875e-06,
+      "loss": 0.6007,
+      "step": 4755
+    },
+    {
+      "epoch": 9.162656400384986,
+      "grad_norm": 0.3292589783668518,
+      "learning_rate": 4.152960121952209e-06,
+      "loss": 0.5974,
+      "step": 4760
+    },
+    {
+      "epoch": 9.172281039461021,
+      "grad_norm": 0.34592679142951965,
+      "learning_rate": 4.057595906751466e-06,
+      "loss": 0.5922,
+      "step": 4765
+    },
+    {
+      "epoch": 9.181905678537055,
+      "grad_norm": 0.34907424449920654,
+      "learning_rate": 3.963316692721663e-06,
+      "loss": 0.6007,
+      "step": 4770
+    },
+    {
+      "epoch": 9.19153031761309,
+      "grad_norm": 0.3478921949863434,
+      "learning_rate": 3.870123546055149e-06,
+      "loss": 0.5882,
+      "step": 4775
+    },
+    {
+      "epoch": 9.201154956689123,
+      "grad_norm": 0.3408016860485077,
+      "learning_rate": 3.7780175206620915e-06,
+      "loss": 0.595,
+      "step": 4780
+    },
+    {
+      "epoch": 9.210779595765159,
+      "grad_norm": 0.33491307497024536,
+      "learning_rate": 3.686999658158474e-06,
+      "loss": 0.5951,
+      "step": 4785
+    },
+    {
+      "epoch": 9.220404234841194,
+      "grad_norm": 0.3383229672908783,
+      "learning_rate": 3.597070987854456e-06,
+      "loss": 0.5966,
+      "step": 4790
+    },
+    {
+      "epoch": 9.230028873917227,
+      "grad_norm": 0.3315028250217438,
+      "learning_rate": 3.508232526742583e-06,
+      "loss": 0.5959,
+      "step": 4795
+    },
+    {
+      "epoch": 9.239653512993263,
+      "grad_norm": 0.30691462755203247,
+      "learning_rate": 3.420485279486385e-06,
+      "loss": 0.5853,
+      "step": 4800
+    },
+    {
+      "epoch": 9.249278152069298,
+      "grad_norm": 0.34303727746009827,
+      "learning_rate": 3.333830238409019e-06,
+      "loss": 0.5973,
+      "step": 4805
+    },
+    {
+      "epoch": 9.258902791145331,
+      "grad_norm": 0.3458213210105896,
+      "learning_rate": 3.248268383481934e-06,
+      "loss": 0.5978,
+      "step": 4810
+    },
+    {
+      "epoch": 9.268527430221367,
+      "grad_norm": 0.3539816737174988,
+      "learning_rate": 3.163800682313933e-06,
+      "loss": 0.5958,
+      "step": 4815
+    },
+    {
+      "epoch": 9.278152069297402,
+      "grad_norm": 0.3442062735557556,
+      "learning_rate": 3.080428090140142e-06,
+      "loss": 0.6022,
+      "step": 4820
+    },
+    {
+      "epoch": 9.287776708373435,
+      "grad_norm": 0.3180767893791199,
+      "learning_rate": 2.9981515498112456e-06,
+      "loss": 0.5955,
+      "step": 4825
+    },
+    {
+      "epoch": 9.29740134744947,
+      "grad_norm": 0.34698548913002014,
+      "learning_rate": 2.91697199178278e-06,
+      "loss": 0.5947,
+      "step": 4830
+    },
+    {
+      "epoch": 9.307025986525506,
+      "grad_norm": 0.3273780047893524,
+      "learning_rate": 2.8368903341046583e-06,
+      "loss": 0.5998,
+      "step": 4835
+    },
+    {
+      "epoch": 9.31665062560154,
+      "grad_norm": 0.31761637330055237,
+      "learning_rate": 2.757907482410771e-06,
+      "loss": 0.5841,
+      "step": 4840
+    },
+    {
+      "epoch": 9.326275264677575,
+      "grad_norm": 0.3708135783672333,
+      "learning_rate": 2.680024329908737e-06,
+      "loss": 0.5953,
+      "step": 4845
+    },
+    {
+      "epoch": 9.33589990375361,
+      "grad_norm": 0.309467613697052,
+      "learning_rate": 2.603241757369812e-06,
+      "loss": 0.5969,
+      "step": 4850
+    },
+    {
+      "epoch": 9.345524542829644,
+      "grad_norm": 0.32634660601615906,
+      "learning_rate": 2.5275606331189416e-06,
+      "loss": 0.602,
+      "step": 4855
+    },
+    {
+      "epoch": 9.355149181905679,
+      "grad_norm": 0.33582308888435364,
+      "learning_rate": 2.452981813024868e-06,
+      "loss": 0.5875,
+      "step": 4860
+    },
+    {
+      "epoch": 9.364773820981712,
+      "grad_norm": 0.3333386182785034,
+      "learning_rate": 2.379506140490595e-06,
+      "loss": 0.5986,
+      "step": 4865
+    },
+    {
+      "epoch": 9.374398460057748,
+      "grad_norm": 0.35826408863067627,
+      "learning_rate": 2.3071344464436595e-06,
+      "loss": 0.6015,
+      "step": 4870
+    },
+    {
+      "epoch": 9.384023099133783,
+      "grad_norm": 0.334588885307312,
+      "learning_rate": 2.235867549326931e-06,
+      "loss": 0.5942,
+      "step": 4875
+    },
+    {
+      "epoch": 9.393647738209816,
+      "grad_norm": 0.3338033854961395,
+      "learning_rate": 2.165706255089217e-06,
+      "loss": 0.5991,
+      "step": 4880
+    },
+    {
+      "epoch": 9.403272377285852,
+      "grad_norm": 0.3354242742061615,
+      "learning_rate": 2.0966513571761827e-06,
+      "loss": 0.5991,
+      "step": 4885
+    },
+    {
+      "epoch": 9.412897016361887,
+      "grad_norm": 0.34545251727104187,
+      "learning_rate": 2.028703636521434e-06,
+      "loss": 0.6058,
+      "step": 4890
+    },
+    {
+      "epoch": 9.42252165543792,
+      "grad_norm": 0.33035480976104736,
+      "learning_rate": 1.961863861537594e-06,
+      "loss": 0.5981,
+      "step": 4895
+    },
+    {
+      "epoch": 9.432146294513956,
+      "grad_norm": 0.33753854036331177,
+      "learning_rate": 1.8961327881076963e-06,
+      "loss": 0.5944,
+      "step": 4900
+    },
+    {
+      "epoch": 9.441770933589991,
+      "grad_norm": 0.34246233105659485,
+      "learning_rate": 1.8315111595765932e-06,
+      "loss": 0.5931,
+      "step": 4905
+    },
+    {
+      "epoch": 9.451395572666025,
+      "grad_norm": 0.33052095770835876,
+      "learning_rate": 1.767999706742529e-06,
+      "loss": 0.5986,
+      "step": 4910
+    },
+    {
+      "epoch": 9.46102021174206,
+      "grad_norm": 0.35342252254486084,
+      "learning_rate": 1.7055991478489464e-06,
+      "loss": 0.5938,
+      "step": 4915
+    },
+    {
+      "epoch": 9.470644850818095,
+      "grad_norm": 0.33293551206588745,
+      "learning_rate": 1.6443101885762812e-06,
+      "loss": 0.5917,
+      "step": 4920
+    },
+    {
+      "epoch": 9.480269489894129,
+      "grad_norm": 0.3331868648529053,
+      "learning_rate": 1.5841335220340593e-06,
+      "loss": 0.5951,
+      "step": 4925
+    },
+    {
+      "epoch": 9.489894128970164,
+      "grad_norm": 0.35304731130599976,
+      "learning_rate": 1.525069828753012e-06,
+      "loss": 0.602,
+      "step": 4930
+    },
+    {
+      "epoch": 9.499518768046197,
+      "grad_norm": 0.3421652019023895,
+      "learning_rate": 1.4671197766773615e-06,
+      "loss": 0.5966,
+      "step": 4935
+    },
+    {
+      "epoch": 9.509143407122233,
+      "grad_norm": 0.3255125880241394,
+      "learning_rate": 1.4102840211573264e-06,
+      "loss": 0.5944,
+      "step": 4940
+    },
+    {
+      "epoch": 9.518768046198268,
+      "grad_norm": 0.34258726239204407,
+      "learning_rate": 1.3545632049416502e-06,
+      "loss": 0.5889,
+      "step": 4945
+    },
+    {
+      "epoch": 9.528392685274301,
+      "grad_norm": 0.3264661729335785,
+      "learning_rate": 1.2999579581703947e-06,
+      "loss": 0.5954,
+      "step": 4950
+    },
+    {
+      "epoch": 9.538017324350337,
+      "grad_norm": 0.3256395161151886,
+      "learning_rate": 1.2464688983677697e-06,
+      "loss": 0.5907,
+      "step": 4955
+    },
+    {
+      "epoch": 9.547641963426372,
+      "grad_norm": 0.32232365012168884,
+      "learning_rate": 1.1940966304351265e-06,
+      "loss": 0.5949,
+      "step": 4960
+    },
+    {
+      "epoch": 9.557266602502406,
+      "grad_norm": 0.32586029171943665,
+      "learning_rate": 1.1428417466442076e-06,
+      "loss": 0.5885,
+      "step": 4965
+    },
+    {
+      "epoch": 9.56689124157844,
+      "grad_norm": 0.3531622886657715,
+      "learning_rate": 1.0927048266303419e-06,
+      "loss": 0.6064,
+      "step": 4970
+    },
+    {
+      "epoch": 9.576515880654476,
+      "grad_norm": 0.34918224811553955,
+      "learning_rate": 1.0436864373859712e-06,
+      "loss": 0.6043,
+      "step": 4975
+    },
+    {
+      "epoch": 9.58614051973051,
+      "grad_norm": 0.3377608358860016,
+      "learning_rate": 9.95787133254189e-07,
+      "loss": 0.5869,
+      "step": 4980
+    },
+    {
+      "epoch": 9.595765158806545,
+      "grad_norm": 0.32988688349723816,
+      "learning_rate": 9.490074559225015e-07,
+      "loss": 0.5957,
+      "step": 4985
+    },
+    {
+      "epoch": 9.60538979788258,
+      "grad_norm": 0.3335455656051636,
+      "learning_rate": 9.033479344166873e-07,
+      "loss": 0.5901,
+      "step": 4990
+    },
+    {
+      "epoch": 9.615014436958614,
+      "grad_norm": 0.34015801548957825,
+      "learning_rate": 8.588090850948027e-07,
+      "loss": 0.5956,
+      "step": 4995
+    },
+    {
+      "epoch": 9.624639076034649,
+      "grad_norm": 0.32440024614334106,
+      "learning_rate": 8.153914116413752e-07,
+      "loss": 0.6035,
+      "step": 5000
+    },
+    {
+      "epoch": 9.634263715110684,
+      "grad_norm": 0.33188602328300476,
+      "learning_rate": 7.730954050616746e-07,
+      "loss": 0.6025,
+      "step": 5005
+    },
+    {
+      "epoch": 9.643888354186718,
+      "grad_norm": 0.33264580368995667,
+      "learning_rate": 7.319215436761839e-07,
+      "loss": 0.5973,
+      "step": 5010
+    },
+    {
+      "epoch": 9.653512993262753,
+      "grad_norm": 0.342488557100296,
+      "learning_rate": 6.918702931151711e-07,
+      "loss": 0.5914,
+      "step": 5015
+    },
+    {
+      "epoch": 9.663137632338787,
+      "grad_norm": 0.33260515332221985,
+      "learning_rate": 6.529421063134478e-07,
+      "loss": 0.5964,
+      "step": 5020
+    },
+    {
+      "epoch": 9.672762271414822,
+      "grad_norm": 0.358557790517807,
+      "learning_rate": 6.151374235051966e-07,
+      "loss": 0.6021,
+      "step": 5025
+    },
+    {
+      "epoch": 9.682386910490857,
+      "grad_norm": 0.341327965259552,
+      "learning_rate": 5.784566722190965e-07,
+      "loss": 0.5911,
+      "step": 5030
+    },
+    {
+      "epoch": 9.69201154956689,
+      "grad_norm": 0.31675535440444946,
+      "learning_rate": 5.429002672733274e-07,
+      "loss": 0.6015,
+      "step": 5035
+    },
+    {
+      "epoch": 9.701636188642926,
+      "grad_norm": 0.31824976205825806,
+      "learning_rate": 5.084686107710513e-07,
+      "loss": 0.599,
+      "step": 5040
+    },
+    {
+      "epoch": 9.711260827718961,
+      "grad_norm": 0.3493671715259552,
+      "learning_rate": 4.751620920957489e-07,
+      "loss": 0.596,
+      "step": 5045
+    },
+    {
+      "epoch": 9.720885466794995,
+      "grad_norm": 0.34269365668296814,
+      "learning_rate": 4.429810879068463e-07,
+      "loss": 0.5969,
+      "step": 5050
+    },
+    {
+      "epoch": 9.73051010587103,
+      "grad_norm": 0.3367815613746643,
+      "learning_rate": 4.1192596213548427e-07,
+      "loss": 0.5885,
+      "step": 5055
+    },
+    {
+      "epoch": 9.740134744947065,
+      "grad_norm": 0.34025177359580994,
+      "learning_rate": 3.81997065980344e-07,
+      "loss": 0.6051,
+      "step": 5060
+    },
+    {
+      "epoch": 9.749759384023099,
+      "grad_norm": 0.3241323232650757,
+      "learning_rate": 3.5319473790373924e-07,
+      "loss": 0.5914,
+      "step": 5065
+    },
+    {
+      "epoch": 9.759384023099134,
+      "grad_norm": 0.3496091961860657,
+      "learning_rate": 3.2551930362776373e-07,
+      "loss": 0.5962,
+      "step": 5070
+    },
+    {
+      "epoch": 9.769008662175168,
+      "grad_norm": 0.38736647367477417,
+      "learning_rate": 2.989710761305942e-07,
+      "loss": 0.5941,
+      "step": 5075
+    },
+    {
+      "epoch": 9.778633301251203,
+      "grad_norm": 0.33493003249168396,
+      "learning_rate": 2.7355035564294865e-07,
+      "loss": 0.5971,
+      "step": 5080
+    },
+    {
+      "epoch": 9.788257940327238,
+      "grad_norm": 0.3347594738006592,
+      "learning_rate": 2.4925742964471144e-07,
+      "loss": 0.605,
+      "step": 5085
+    },
+    {
+      "epoch": 9.797882579403272,
+      "grad_norm": 0.3406401574611664,
+      "learning_rate": 2.2609257286169138e-07,
+      "loss": 0.5876,
+      "step": 5090
+    },
+    {
+      "epoch": 9.807507218479307,
+      "grad_norm": 0.34672555327415466,
+      "learning_rate": 2.0405604726246864e-07,
+      "loss": 0.5979,
+      "step": 5095
+    },
+    {
+      "epoch": 9.817131857555342,
+      "grad_norm": 0.3294496238231659,
+      "learning_rate": 1.8314810205547483e-07,
+      "loss": 0.584,
+      "step": 5100
+    },
+    {
+      "epoch": 9.826756496631376,
+      "grad_norm": 0.33348360657691956,
+      "learning_rate": 1.633689736861732e-07,
+      "loss": 0.5945,
+      "step": 5105
+    },
+    {
+      "epoch": 9.836381135707411,
+      "grad_norm": 0.32033050060272217,
+      "learning_rate": 1.4471888583436067e-07,
+      "loss": 0.5952,
+      "step": 5110
+    },
+    {
+      "epoch": 9.846005774783446,
+      "grad_norm": 0.35708528757095337,
+      "learning_rate": 1.2719804941163648e-07,
+      "loss": 0.5936,
+      "step": 5115
+    },
+    {
+      "epoch": 9.85563041385948,
+      "grad_norm": 0.34551671147346497,
+      "learning_rate": 1.108066625590487e-07,
+      "loss": 0.5902,
+      "step": 5120
+    },
+    {
+      "epoch": 9.865255052935515,
+      "grad_norm": 0.32259657979011536,
+      "learning_rate": 9.554491064484028e-08,
+      "loss": 0.5976,
+      "step": 5125
+    },
+    {
+      "epoch": 9.87487969201155,
+      "grad_norm": 0.3397790491580963,
+      "learning_rate": 8.141296626231754e-08,
+      "loss": 0.6072,
+      "step": 5130
+    },
+    {
+      "epoch": 9.884504331087584,
+      "grad_norm": 0.3509461283683777,
+      "learning_rate": 6.841098922797384e-08,
+      "loss": 0.6013,
+      "step": 5135
+    },
+    {
+      "epoch": 9.894128970163619,
+      "grad_norm": 0.3350575268268585,
+      "learning_rate": 5.653912657959115e-08,
+      "loss": 0.6012,
+      "step": 5140
+    },
+    {
+      "epoch": 9.903753609239654,
+      "grad_norm": 0.3318527042865753,
+      "learning_rate": 4.579751257466347e-08,
+      "loss": 0.6048,
+      "step": 5145
+    },
+    {
+      "epoch": 9.913378248315688,
+      "grad_norm": 0.37916940450668335,
+      "learning_rate": 3.618626868879815e-08,
+      "loss": 0.6068,
+      "step": 5150
+    },
+    {
+      "epoch": 9.923002887391723,
+      "grad_norm": 0.3555992841720581,
+      "learning_rate": 2.7705503614416928e-08,
+      "loss": 0.588,
+      "step": 5155
+    },
+    {
+      "epoch": 9.932627526467758,
+      "grad_norm": 0.32007142901420593,
+      "learning_rate": 2.0355313259468046e-08,
+      "loss": 0.5954,
+      "step": 5160
+    },
+    {
+      "epoch": 9.942252165543792,
+      "grad_norm": 0.3455217182636261,
+      "learning_rate": 1.4135780746382665e-08,
+      "loss": 0.5917,
+      "step": 5165
+    },
+    {
+      "epoch": 9.951876804619827,
+      "grad_norm": 0.32252103090286255,
+      "learning_rate": 9.046976411108965e-09,
+      "loss": 0.5889,
+      "step": 5170
+    },
+    {
+      "epoch": 9.96150144369586,
+      "grad_norm": 0.3604857921600342,
+      "learning_rate": 5.0889578023238794e-09,
+      "loss": 0.5959,
+      "step": 5175
+    },
+    {
+      "epoch": 9.971126082771896,
+      "grad_norm": 0.33323296904563904,
+      "learning_rate": 2.261769680789172e-09,
+      "loss": 0.5918,
+      "step": 5180
+    },
+    {
+      "epoch": 9.980750721847931,
+      "grad_norm": 0.33578982949256897,
+      "learning_rate": 5.654440188296306e-10,
+      "loss": 0.6011,
+      "step": 5185
+    },
+    {
+      "epoch": 9.990375360923965,
+      "grad_norm": 0.34376034140586853,
+      "learning_rate": 0.0,
+      "loss": 0.5965,
+      "step": 5190
+    },
+    {
+      "epoch": 9.990375360923965,
+      "eval_loss": 4.085933685302734,
+      "eval_runtime": 0.8044,
+      "eval_samples_per_second": 13.675,
+      "eval_steps_per_second": 2.486,
+      "step": 5190
+    },
+    {
+      "epoch": 9.990375360923965,
+      "step": 5190,
+      "total_flos": 7.743588771836199e+18,
+      "train_loss": 0.8018066772835792,
+      "train_runtime": 21791.6644,
+      "train_samples_per_second": 7.627,
+      "train_steps_per_second": 0.238
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5190,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 7.743588771836199e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}