Model save

Browse files

Files changed (4) hide show

README.md +76 -0
all_results.json +9 -0
train_results.json +9 -0
trainer_state.json +2952 -0

README.md ADDED Viewed

	@@ -0,0 +1,76 @@

+---
+license: apache-2.0
+library_name: peft
+tags:
+- trl
+- sft
+- generated_from_trainer
+base_model: mistralai/Mistral-7B-Instruct-v0.2
+model-index:
+- name: Mistral-7B-Instruct-v0.2-miracl-raft-sft-v2.0
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# Mistral-7B-Instruct-v0.2-miracl-raft-sft-v2.0
+This model is a fine-tuned version of [mistralai/Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) on a dataset that was not recorded in the metadata available to the Trainer when this card was generated.
+It achieves the following results on the evaluation set (validation loss at the last evaluation, step 2000):
+- Loss: 1.2086
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 1e-05
+- train_batch_size: 4
+- eval_batch_size: 4
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 3
+- gradient_accumulation_steps: 2
+- total_train_batch_size: 24
+- total_eval_batch_size: 12
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.1
+- num_epochs: 1
+### Training results
+| Training Loss | Epoch  | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 1.3095        | 0.0987 | 200  | 1.2800          |
+| 1.249         | 0.1975 | 400  | 1.2516          |
+| 1.2514        | 0.2962 | 600  | 1.2369          |
+| 1.275         | 0.3950 | 800  | 1.2263          |
+| 1.1984        | 0.4937 | 1000 | 1.2197          |
+| 1.1556        | 0.5924 | 1200 | 1.2149          |
+| 1.2386        | 0.6912 | 1400 | 1.2116          |
+| 1.2661        | 0.7899 | 1600 | 1.2096          |
+| 1.2752        | 0.8887 | 1800 | 1.2088          |
+| 1.2701        | 0.9874 | 2000 | 1.2086          |
+### Framework versions
+- PEFT 0.7.1
+- Transformers 4.40.1
+- Pytorch 2.2.1+cu121
+- Datasets 2.19.0
+- Tokenizers 0.19.1

all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 0.9997531473710195,
+    "total_flos": 4526278881050624.0,
+    "train_loss": 1.270192005722611,
+    "train_runtime": 113933.3906,
+    "train_samples": 48610,
+    "train_samples_per_second": 0.427,
+    "train_steps_per_second": 0.018
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 0.9997531473710195,
+    "total_flos": 4526278881050624.0,
+    "train_loss": 1.270192005722611,
+    "train_runtime": 113933.3906,
+    "train_samples": 48610,
+    "train_samples_per_second": 0.427,
+    "train_steps_per_second": 0.018
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,2952 @@

+{
+  "best_metric": 1.208633542060852,
+  "best_model_checkpoint": "/mnt/users/n3thakur/vectara/huggingface-dpo/trained_models/v3/Mistral-7B-Instruct-v0.2-miracl-raft-sft-v2.0/checkpoint-2000",
+  "epoch": 0.9997531473710195,
+  "eval_steps": 200,
+  "global_step": 2025,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0004937052579609973,
+      "grad_norm": 3.3225639243513934,
+      "learning_rate": 4.926108374384237e-08,
+      "loss": 1.6639,
+      "step": 1
+    },
+    {
+      "epoch": 0.0024685262898049864,
+      "grad_norm": 2.953462068009019,
+      "learning_rate": 2.4630541871921185e-07,
+      "loss": 1.7388,
+      "step": 5
+    },
+    {
+      "epoch": 0.004937052579609973,
+      "grad_norm": 2.4057426602070646,
+      "learning_rate": 4.926108374384237e-07,
+      "loss": 1.5798,
+      "step": 10
+    },
+    {
+      "epoch": 0.00740557886941496,
+      "grad_norm": 3.186500367639329,
+      "learning_rate": 7.389162561576356e-07,
+      "loss": 1.659,
+      "step": 15
+    },
+    {
+      "epoch": 0.009874105159219946,
+      "grad_norm": 2.396847476828635,
+      "learning_rate": 9.852216748768474e-07,
+      "loss": 1.6374,
+      "step": 20
+    },
+    {
+      "epoch": 0.012342631449024932,
+      "grad_norm": 2.5391731431636106,
+      "learning_rate": 1.2315270935960593e-06,
+      "loss": 1.6314,
+      "step": 25
+    },
+    {
+      "epoch": 0.01481115773882992,
+      "grad_norm": 2.2574409610476462,
+      "learning_rate": 1.4778325123152712e-06,
+      "loss": 1.5888,
+      "step": 30
+    },
+    {
+      "epoch": 0.017279684028634903,
+      "grad_norm": 1.8342813542038656,
+      "learning_rate": 1.724137931034483e-06,
+      "loss": 1.5412,
+      "step": 35
+    },
+    {
+      "epoch": 0.01974821031843989,
+      "grad_norm": 1.8380484598711315,
+      "learning_rate": 1.970443349753695e-06,
+      "loss": 1.4889,
+      "step": 40
+    },
+    {
+      "epoch": 0.02221673660824488,
+      "grad_norm": 1.613058249947001,
+      "learning_rate": 2.2167487684729067e-06,
+      "loss": 1.5403,
+      "step": 45
+    },
+    {
+      "epoch": 0.024685262898049863,
+      "grad_norm": 1.8920016704605567,
+      "learning_rate": 2.4630541871921186e-06,
+      "loss": 1.4831,
+      "step": 50
+    },
+    {
+      "epoch": 0.02715378918785485,
+      "grad_norm": 1.2203038504329438,
+      "learning_rate": 2.70935960591133e-06,
+      "loss": 1.4233,
+      "step": 55
+    },
+    {
+      "epoch": 0.02962231547765984,
+      "grad_norm": 1.095184752565883,
+      "learning_rate": 2.9556650246305424e-06,
+      "loss": 1.4255,
+      "step": 60
+    },
+    {
+      "epoch": 0.03209084176746482,
+      "grad_norm": 0.8448900941377993,
+      "learning_rate": 3.201970443349754e-06,
+      "loss": 1.4892,
+      "step": 65
+    },
+    {
+      "epoch": 0.03455936805726981,
+      "grad_norm": 0.9405655862570454,
+      "learning_rate": 3.448275862068966e-06,
+      "loss": 1.4673,
+      "step": 70
+    },
+    {
+      "epoch": 0.0370278943470748,
+      "grad_norm": 0.6713923929675227,
+      "learning_rate": 3.6945812807881777e-06,
+      "loss": 1.4148,
+      "step": 75
+    },
+    {
+      "epoch": 0.03949642063687978,
+      "grad_norm": 0.7755902373813679,
+      "learning_rate": 3.94088669950739e-06,
+      "loss": 1.4867,
+      "step": 80
+    },
+    {
+      "epoch": 0.04196494692668477,
+      "grad_norm": 0.7400218273582495,
+      "learning_rate": 4.1871921182266015e-06,
+      "loss": 1.3834,
+      "step": 85
+    },
+    {
+      "epoch": 0.04443347321648976,
+      "grad_norm": 0.7245551973919236,
+      "learning_rate": 4.4334975369458135e-06,
+      "loss": 1.44,
+      "step": 90
+    },
+    {
+      "epoch": 0.04690199950629474,
+      "grad_norm": 0.6731482962094358,
+      "learning_rate": 4.6798029556650245e-06,
+      "loss": 1.362,
+      "step": 95
+    },
+    {
+      "epoch": 0.049370525796099726,
+      "grad_norm": 0.7105341248736622,
+      "learning_rate": 4.926108374384237e-06,
+      "loss": 1.3716,
+      "step": 100
+    },
+    {
+      "epoch": 0.05183905208590472,
+      "grad_norm": 0.6774223469533757,
+      "learning_rate": 5.172413793103449e-06,
+      "loss": 1.4056,
+      "step": 105
+    },
+    {
+      "epoch": 0.0543075783757097,
+      "grad_norm": 0.6745745206164803,
+      "learning_rate": 5.41871921182266e-06,
+      "loss": 1.337,
+      "step": 110
+    },
+    {
+      "epoch": 0.056776104665514686,
+      "grad_norm": 0.5935854583319804,
+      "learning_rate": 5.665024630541872e-06,
+      "loss": 1.3615,
+      "step": 115
+    },
+    {
+      "epoch": 0.05924463095531968,
+      "grad_norm": 0.556924082351685,
+      "learning_rate": 5.911330049261085e-06,
+      "loss": 1.4248,
+      "step": 120
+    },
+    {
+      "epoch": 0.06171315724512466,
+      "grad_norm": 0.5427807889259738,
+      "learning_rate": 6.157635467980296e-06,
+      "loss": 1.3286,
+      "step": 125
+    },
+    {
+      "epoch": 0.06418168353492965,
+      "grad_norm": 0.6063519166723843,
+      "learning_rate": 6.403940886699508e-06,
+      "loss": 1.3176,
+      "step": 130
+    },
+    {
+      "epoch": 0.06665020982473463,
+      "grad_norm": 0.5670363529677274,
+      "learning_rate": 6.65024630541872e-06,
+      "loss": 1.3273,
+      "step": 135
+    },
+    {
+      "epoch": 0.06911873611453961,
+      "grad_norm": 0.5846835420330245,
+      "learning_rate": 6.896551724137932e-06,
+      "loss": 1.326,
+      "step": 140
+    },
+    {
+      "epoch": 0.07158726240434461,
+      "grad_norm": 0.5686293276495719,
+      "learning_rate": 7.1428571428571436e-06,
+      "loss": 1.376,
+      "step": 145
+    },
+    {
+      "epoch": 0.0740557886941496,
+      "grad_norm": 0.5275299029056365,
+      "learning_rate": 7.3891625615763555e-06,
+      "loss": 1.3364,
+      "step": 150
+    },
+    {
+      "epoch": 0.07652431498395458,
+      "grad_norm": 0.5464387846115857,
+      "learning_rate": 7.635467980295567e-06,
+      "loss": 1.3654,
+      "step": 155
+    },
+    {
+      "epoch": 0.07899284127375956,
+      "grad_norm": 0.5229203264129956,
+      "learning_rate": 7.88177339901478e-06,
+      "loss": 1.3009,
+      "step": 160
+    },
+    {
+      "epoch": 0.08146136756356455,
+      "grad_norm": 0.5746356087172889,
+      "learning_rate": 8.12807881773399e-06,
+      "loss": 1.2611,
+      "step": 165
+    },
+    {
+      "epoch": 0.08392989385336953,
+      "grad_norm": 0.5922232695792946,
+      "learning_rate": 8.374384236453203e-06,
+      "loss": 1.3643,
+      "step": 170
+    },
+    {
+      "epoch": 0.08639842014317453,
+      "grad_norm": 0.5295655281983137,
+      "learning_rate": 8.620689655172414e-06,
+      "loss": 1.3165,
+      "step": 175
+    },
+    {
+      "epoch": 0.08886694643297952,
+      "grad_norm": 0.5850731545805168,
+      "learning_rate": 8.866995073891627e-06,
+      "loss": 1.3105,
+      "step": 180
+    },
+    {
+      "epoch": 0.0913354727227845,
+      "grad_norm": 0.5551320012809824,
+      "learning_rate": 9.113300492610838e-06,
+      "loss": 1.278,
+      "step": 185
+    },
+    {
+      "epoch": 0.09380399901258948,
+      "grad_norm": 0.5711739398485313,
+      "learning_rate": 9.359605911330049e-06,
+      "loss": 1.3197,
+      "step": 190
+    },
+    {
+      "epoch": 0.09627252530239447,
+      "grad_norm": 0.5559894427552352,
+      "learning_rate": 9.605911330049262e-06,
+      "loss": 1.3409,
+      "step": 195
+    },
+    {
+      "epoch": 0.09874105159219945,
+      "grad_norm": 0.5671580477100892,
+      "learning_rate": 9.852216748768475e-06,
+      "loss": 1.3095,
+      "step": 200
+    },
+    {
+      "epoch": 0.09874105159219945,
+      "eval_loss": 1.2800045013427734,
+      "eval_runtime": 2727.7043,
+      "eval_samples_per_second": 1.466,
+      "eval_steps_per_second": 0.122,
+      "step": 200
+    },
+    {
+      "epoch": 0.10120957788200444,
+      "grad_norm": 0.5279833943359168,
+      "learning_rate": 9.999970269475589e-06,
+      "loss": 1.2966,
+      "step": 205
+    },
+    {
+      "epoch": 0.10367810417180943,
+      "grad_norm": 0.5644566149733632,
+      "learning_rate": 9.99963580513638e-06,
+      "loss": 1.2874,
+      "step": 210
+    },
+    {
+      "epoch": 0.10614663046161442,
+      "grad_norm": 0.6258961143244912,
+      "learning_rate": 9.998929738244678e-06,
+      "loss": 1.3209,
+      "step": 215
+    },
+    {
+      "epoch": 0.1086151567514194,
+      "grad_norm": 0.4834621448531187,
+      "learning_rate": 9.997852121279563e-06,
+      "loss": 1.3313,
+      "step": 220
+    },
+    {
+      "epoch": 0.11108368304122439,
+      "grad_norm": 0.5481752837030147,
+      "learning_rate": 9.996403034335912e-06,
+      "loss": 1.2738,
+      "step": 225
+    },
+    {
+      "epoch": 0.11355220933102937,
+      "grad_norm": 0.5886589355414898,
+      "learning_rate": 9.994582585118449e-06,
+      "loss": 1.2758,
+      "step": 230
+    },
+    {
+      "epoch": 0.11602073562083436,
+      "grad_norm": 0.5757139355018718,
+      "learning_rate": 9.992390908933746e-06,
+      "loss": 1.3187,
+      "step": 235
+    },
+    {
+      "epoch": 0.11848926191063935,
+      "grad_norm": 0.5464825333851621,
+      "learning_rate": 9.989828168680164e-06,
+      "loss": 1.3677,
+      "step": 240
+    },
+    {
+      "epoch": 0.12095778820044434,
+      "grad_norm": 0.6372982363888493,
+      "learning_rate": 9.986894554835735e-06,
+      "loss": 1.2668,
+      "step": 245
+    },
+    {
+      "epoch": 0.12342631449024932,
+      "grad_norm": 0.5445141174147589,
+      "learning_rate": 9.983590285444025e-06,
+      "loss": 1.2917,
+      "step": 250
+    },
+    {
+      "epoch": 0.1258948407800543,
+      "grad_norm": 0.6832031232821291,
+      "learning_rate": 9.979915606097907e-06,
+      "loss": 1.2675,
+      "step": 255
+    },
+    {
+      "epoch": 0.1283633670698593,
+      "grad_norm": 0.62128138673847,
+      "learning_rate": 9.975870789921322e-06,
+      "loss": 1.3187,
+      "step": 260
+    },
+    {
+      "epoch": 0.13083189335966428,
+      "grad_norm": 0.5161196413352727,
+      "learning_rate": 9.971456137548971e-06,
+      "loss": 1.3031,
+      "step": 265
+    },
+    {
+      "epoch": 0.13330041964946926,
+      "grad_norm": 0.5524745641605668,
+      "learning_rate": 9.966671977103972e-06,
+      "loss": 1.2749,
+      "step": 270
+    },
+    {
+      "epoch": 0.13576894593927424,
+      "grad_norm": 0.6669242272051678,
+      "learning_rate": 9.961518664173473e-06,
+      "loss": 1.3409,
+      "step": 275
+    },
+    {
+      "epoch": 0.13823747222907923,
+      "grad_norm": 0.5555562003933405,
+      "learning_rate": 9.955996581782218e-06,
+      "loss": 1.2468,
+      "step": 280
+    },
+    {
+      "epoch": 0.14070599851888424,
+      "grad_norm": 0.6244202172570701,
+      "learning_rate": 9.950106140364089e-06,
+      "loss": 1.3318,
+      "step": 285
+    },
+    {
+      "epoch": 0.14317452480868922,
+      "grad_norm": 0.5100271270558925,
+      "learning_rate": 9.943847777731584e-06,
+      "loss": 1.2522,
+      "step": 290
+    },
+    {
+      "epoch": 0.1456430510984942,
+      "grad_norm": 0.5482368116306139,
+      "learning_rate": 9.937221959043294e-06,
+      "loss": 1.3044,
+      "step": 295
+    },
+    {
+      "epoch": 0.1481115773882992,
+      "grad_norm": 0.5919271032213149,
+      "learning_rate": 9.93022917676932e-06,
+      "loss": 1.3131,
+      "step": 300
+    },
+    {
+      "epoch": 0.15058010367810418,
+      "grad_norm": 0.5428829828459178,
+      "learning_rate": 9.922869950654662e-06,
+      "loss": 1.2306,
+      "step": 305
+    },
+    {
+      "epoch": 0.15304862996790916,
+      "grad_norm": 0.5461192699131175,
+      "learning_rate": 9.915144827680606e-06,
+      "loss": 1.3151,
+      "step": 310
+    },
+    {
+      "epoch": 0.15551715625771415,
+      "grad_norm": 0.5113904915941117,
+      "learning_rate": 9.907054382024058e-06,
+      "loss": 1.2813,
+      "step": 315
+    },
+    {
+      "epoch": 0.15798568254751913,
+      "grad_norm": 0.6272053783824121,
+      "learning_rate": 9.898599215014868e-06,
+      "loss": 1.3064,
+      "step": 320
+    },
+    {
+      "epoch": 0.1604542088373241,
+      "grad_norm": 0.5671094073178861,
+      "learning_rate": 9.889779955091142e-06,
+      "loss": 1.2734,
+      "step": 325
+    },
+    {
+      "epoch": 0.1629227351271291,
+      "grad_norm": 0.582371136771928,
+      "learning_rate": 9.880597257752522e-06,
+      "loss": 1.3075,
+      "step": 330
+    },
+    {
+      "epoch": 0.16539126141693408,
+      "grad_norm": 0.5520015589132342,
+      "learning_rate": 9.87105180551148e-06,
+      "loss": 1.2802,
+      "step": 335
+    },
+    {
+      "epoch": 0.16785978770673907,
+      "grad_norm": 0.5937587353133906,
+      "learning_rate": 9.861144307842574e-06,
+      "loss": 1.2893,
+      "step": 340
+    },
+    {
+      "epoch": 0.17032831399654405,
+      "grad_norm": 0.5371728696508287,
+      "learning_rate": 9.850875501129726e-06,
+      "loss": 1.219,
+      "step": 345
+    },
+    {
+      "epoch": 0.17279684028634906,
+      "grad_norm": 0.5892603164875664,
+      "learning_rate": 9.840246148611485e-06,
+      "loss": 1.3094,
+      "step": 350
+    },
+    {
+      "epoch": 0.17526536657615405,
+      "grad_norm": 0.5502008403202052,
+      "learning_rate": 9.829257040324308e-06,
+      "loss": 1.2543,
+      "step": 355
+    },
+    {
+      "epoch": 0.17773389286595903,
+      "grad_norm": 0.6273336128612022,
+      "learning_rate": 9.817908993043819e-06,
+      "loss": 1.3107,
+      "step": 360
+    },
+    {
+      "epoch": 0.18020241915576402,
+      "grad_norm": 0.5761032807193177,
+      "learning_rate": 9.806202850224123e-06,
+      "loss": 1.2657,
+      "step": 365
+    },
+    {
+      "epoch": 0.182670945445569,
+      "grad_norm": 0.5628854954179761,
+      "learning_rate": 9.794139481935108e-06,
+      "loss": 1.258,
+      "step": 370
+    },
+    {
+      "epoch": 0.18513947173537398,
+      "grad_norm": 0.5637909618250402,
+      "learning_rate": 9.781719784797773e-06,
+      "loss": 1.2406,
+      "step": 375
+    },
+    {
+      "epoch": 0.18760799802517897,
+      "grad_norm": 0.5212794091813217,
+      "learning_rate": 9.768944681917582e-06,
+      "loss": 1.2391,
+      "step": 380
+    },
+    {
+      "epoch": 0.19007652431498395,
+      "grad_norm": 0.6416799620777229,
+      "learning_rate": 9.755815122815871e-06,
+      "loss": 1.3188,
+      "step": 385
+    },
+    {
+      "epoch": 0.19254505060478894,
+      "grad_norm": 0.5487444911675088,
+      "learning_rate": 9.742332083359252e-06,
+      "loss": 1.2884,
+      "step": 390
+    },
+    {
+      "epoch": 0.19501357689459392,
+      "grad_norm": 0.5697317991057302,
+      "learning_rate": 9.728496565687096e-06,
+      "loss": 1.2798,
+      "step": 395
+    },
+    {
+      "epoch": 0.1974821031843989,
+      "grad_norm": 0.6703007559314436,
+      "learning_rate": 9.714309598137045e-06,
+      "loss": 1.249,
+      "step": 400
+    },
+    {
+      "epoch": 0.1974821031843989,
+      "eval_loss": 1.2516121864318848,
+      "eval_runtime": 2575.7168,
+      "eval_samples_per_second": 1.553,
+      "eval_steps_per_second": 0.13,
+      "step": 400
+    },
+    {
+      "epoch": 0.1999506294742039,
+      "grad_norm": 0.526231295870319,
+      "learning_rate": 9.699772235168572e-06,
+      "loss": 1.2554,
+      "step": 405
+    },
+    {
+      "epoch": 0.20241915576400887,
+      "grad_norm": 0.5513334850915074,
+      "learning_rate": 9.68488555728462e-06,
+      "loss": 1.2753,
+      "step": 410
+    },
+    {
+      "epoch": 0.20488768205381389,
+      "grad_norm": 0.5979774809603526,
+      "learning_rate": 9.669650670951282e-06,
+      "loss": 1.2562,
+      "step": 415
+    },
+    {
+      "epoch": 0.20735620834361887,
+      "grad_norm": 0.5596269907913185,
+      "learning_rate": 9.654068708515564e-06,
+      "loss": 1.2829,
+      "step": 420
+    },
+    {
+      "epoch": 0.20982473463342385,
+      "grad_norm": 0.5593282633769885,
+      "learning_rate": 9.638140828121232e-06,
+      "loss": 1.2843,
+      "step": 425
+    },
+    {
+      "epoch": 0.21229326092322884,
+      "grad_norm": 0.5775937654131708,
+      "learning_rate": 9.621868213622713e-06,
+      "loss": 1.3001,
+      "step": 430
+    },
+    {
+      "epoch": 0.21476178721303382,
+      "grad_norm": 0.5661901033745343,
+      "learning_rate": 9.605252074497125e-06,
+      "loss": 1.3038,
+      "step": 435
+    },
+    {
+      "epoch": 0.2172303135028388,
+      "grad_norm": 0.6132749209816828,
+      "learning_rate": 9.588293645754363e-06,
+      "loss": 1.2843,
+      "step": 440
+    },
+    {
+      "epoch": 0.2196988397926438,
+      "grad_norm": 0.5624360623535388,
+      "learning_rate": 9.570994187845323e-06,
+      "loss": 1.2342,
+      "step": 445
+    },
+    {
+      "epoch": 0.22216736608244878,
+      "grad_norm": 0.5567610470805882,
+      "learning_rate": 9.553354986568201e-06,
+      "loss": 1.2955,
+      "step": 450
+    },
+    {
+      "epoch": 0.22463589237225376,
+      "grad_norm": 0.6255724221196046,
+      "learning_rate": 9.53537735297294e-06,
+      "loss": 1.2921,
+      "step": 455
+    },
+    {
+      "epoch": 0.22710441866205874,
+      "grad_norm": 0.5322242379012073,
+      "learning_rate": 9.517062623263768e-06,
+      "loss": 1.3011,
+      "step": 460
+    },
+    {
+      "epoch": 0.22957294495186373,
+      "grad_norm": 0.5444205798338807,
+      "learning_rate": 9.498412158699905e-06,
+      "loss": 1.2733,
+      "step": 465
+    },
+    {
+      "epoch": 0.2320414712416687,
+      "grad_norm": 0.5426713243893322,
+      "learning_rate": 9.479427345494366e-06,
+      "loss": 1.2312,
+      "step": 470
+    },
+    {
+      "epoch": 0.23450999753147372,
+      "grad_norm": 0.5871783813919782,
+      "learning_rate": 9.460109594710942e-06,
+      "loss": 1.3655,
+      "step": 475
+    },
+    {
+      "epoch": 0.2369785238212787,
+      "grad_norm": 0.574852380091512,
+      "learning_rate": 9.440460342159314e-06,
+      "loss": 1.2915,
+      "step": 480
+    },
+    {
+      "epoch": 0.2394470501110837,
+      "grad_norm": 0.5336092545421678,
+      "learning_rate": 9.42048104828834e-06,
+      "loss": 1.2963,
+      "step": 485
+    },
+    {
+      "epoch": 0.24191557640088868,
+      "grad_norm": 0.5998428802300876,
+      "learning_rate": 9.40017319807751e-06,
+      "loss": 1.3058,
+      "step": 490
+    },
+    {
+      "epoch": 0.24438410269069366,
+      "grad_norm": 0.5421507806800733,
+      "learning_rate": 9.379538300926553e-06,
+      "loss": 1.2881,
+      "step": 495
+    },
+    {
+      "epoch": 0.24685262898049865,
+      "grad_norm": 0.5358621498972941,
+      "learning_rate": 9.358577890543277e-06,
+      "loss": 1.2602,
+      "step": 500
+    },
+    {
+      "epoch": 0.24932115527030363,
+      "grad_norm": 0.564112204428148,
+      "learning_rate": 9.33729352482956e-06,
+      "loss": 1.279,
+      "step": 505
+    },
+    {
+      "epoch": 0.2517896815601086,
+      "grad_norm": 0.6382679375882034,
+      "learning_rate": 9.315686785765556e-06,
+      "loss": 1.2534,
+      "step": 510
+    },
+    {
+      "epoch": 0.2542582078499136,
+      "grad_norm": 0.5744585475791394,
+      "learning_rate": 9.293759279292116e-06,
+      "loss": 1.2744,
+      "step": 515
+    },
+    {
+      "epoch": 0.2567267341397186,
+      "grad_norm": 0.615942623926986,
+      "learning_rate": 9.271512635191427e-06,
+      "loss": 1.3055,
+      "step": 520
+    },
+    {
+      "epoch": 0.25919526042952357,
+      "grad_norm": 0.5780670121734512,
+      "learning_rate": 9.248948506965877e-06,
+      "loss": 1.3175,
+      "step": 525
+    },
+    {
+      "epoch": 0.26166378671932855,
+      "grad_norm": 0.5777138377025286,
+      "learning_rate": 9.22606857171515e-06,
+      "loss": 1.2869,
+      "step": 530
+    },
+    {
+      "epoch": 0.26413231300913353,
+      "grad_norm": 0.5611724611846367,
+      "learning_rate": 9.202874530011583e-06,
+      "loss": 1.3199,
+      "step": 535
+    },
+    {
+      "epoch": 0.2666008392989385,
+      "grad_norm": 0.540794710590132,
+      "learning_rate": 9.179368105773768e-06,
+      "loss": 1.208,
+      "step": 540
+    },
+    {
+      "epoch": 0.2690693655887435,
+      "grad_norm": 0.5581497544995145,
+      "learning_rate": 9.155551046138408e-06,
+      "loss": 1.2638,
+      "step": 545
+    },
+    {
+      "epoch": 0.2715378918785485,
+      "grad_norm": 0.560865648598851,
+      "learning_rate": 9.131425121330477e-06,
+      "loss": 1.2629,
+      "step": 550
+    },
+    {
+      "epoch": 0.27400641816835347,
+      "grad_norm": 0.5458754463390333,
+      "learning_rate": 9.10699212453164e-06,
+      "loss": 1.2578,
+      "step": 555
+    },
+    {
+      "epoch": 0.27647494445815846,
+      "grad_norm": 0.5468153448281193,
+      "learning_rate": 9.082253871746962e-06,
+      "loss": 1.2488,
+      "step": 560
+    },
+    {
+      "epoch": 0.27894347074796344,
+      "grad_norm": 0.6168084406611584,
+      "learning_rate": 9.057212201669952e-06,
+      "loss": 1.2931,
+      "step": 565
+    },
+    {
+      "epoch": 0.2814119970377685,
+      "grad_norm": 0.5767023372783159,
+      "learning_rate": 9.031868975545884e-06,
+      "loss": 1.2267,
+      "step": 570
+    },
+    {
+      "epoch": 0.28388052332757346,
+      "grad_norm": 0.5315895904457054,
+      "learning_rate": 9.006226077033464e-06,
+      "loss": 1.2463,
+      "step": 575
+    },
+    {
+      "epoch": 0.28634904961737845,
+      "grad_norm": 0.5616058952533509,
+      "learning_rate": 8.980285412064827e-06,
+      "loss": 1.287,
+      "step": 580
+    },
+    {
+      "epoch": 0.28881757590718343,
+      "grad_norm": 0.5746998443271042,
+      "learning_rate": 8.954048908703873e-06,
+      "loss": 1.2929,
+      "step": 585
+    },
+    {
+      "epoch": 0.2912861021969884,
+      "grad_norm": 0.5551746835964705,
+      "learning_rate": 8.92751851700297e-06,
+      "loss": 1.298,
+      "step": 590
+    },
+    {
+      "epoch": 0.2937546284867934,
+      "grad_norm": 0.578564867995815,
+      "learning_rate": 8.900696208857996e-06,
+      "loss": 1.2973,
+      "step": 595
+    },
+    {
+      "epoch": 0.2962231547765984,
+      "grad_norm": 0.5925663520696334,
+      "learning_rate": 8.873583977861802e-06,
+      "loss": 1.2514,
+      "step": 600
+    },
+    {
+      "epoch": 0.2962231547765984,
+      "eval_loss": 1.2368682622909546,
+      "eval_runtime": 2566.7596,
+      "eval_samples_per_second": 1.558,
+      "eval_steps_per_second": 0.13,
+      "step": 600
+    },
+    {
+      "epoch": 0.29869168106640337,
+      "grad_norm": 0.5605310856508363,
+      "learning_rate": 8.846183839156015e-06,
+      "loss": 1.286,
+      "step": 605
+    },
+    {
+      "epoch": 0.30116020735620835,
+      "grad_norm": 0.6632798685747615,
+      "learning_rate": 8.818497829281272e-06,
+      "loss": 1.2916,
+      "step": 610
+    },
+    {
+      "epoch": 0.30362873364601334,
+      "grad_norm": 0.6145012170463651,
+      "learning_rate": 8.790528006025848e-06,
+      "loss": 1.2788,
+      "step": 615
+    },
+    {
+      "epoch": 0.3060972599358183,
+      "grad_norm": 0.6017170291600934,
+      "learning_rate": 8.762276448272709e-06,
+      "loss": 1.3156,
+      "step": 620
+    },
+    {
+      "epoch": 0.3085657862256233,
+      "grad_norm": 0.5728547538871892,
+      "learning_rate": 8.733745255844996e-06,
+      "loss": 1.2592,
+      "step": 625
+    },
+    {
+      "epoch": 0.3110343125154283,
+      "grad_norm": 0.558142508046803,
+      "learning_rate": 8.70493654934996e-06,
+      "loss": 1.309,
+      "step": 630
+    },
+    {
+      "epoch": 0.3135028388052333,
+      "grad_norm": 0.5596812007471911,
+      "learning_rate": 8.675852470021344e-06,
+      "loss": 1.2746,
+      "step": 635
+    },
+    {
+      "epoch": 0.31597136509503826,
+      "grad_norm": 0.5909265132847957,
+      "learning_rate": 8.646495179560221e-06,
+      "loss": 1.2686,
+      "step": 640
+    },
+    {
+      "epoch": 0.31843989138484324,
+      "grad_norm": 0.6185942591784858,
+      "learning_rate": 8.616866859974344e-06,
+      "loss": 1.2759,
+      "step": 645
+    },
+    {
+      "epoch": 0.3209084176746482,
+      "grad_norm": 0.6157204431679958,
+      "learning_rate": 8.586969713415949e-06,
+      "loss": 1.2957,
+      "step": 650
+    },
+    {
+      "epoch": 0.3233769439644532,
+      "grad_norm": 0.5974197754755597,
+      "learning_rate": 8.556805962018091e-06,
+      "loss": 1.27,
+      "step": 655
+    },
+    {
+      "epoch": 0.3258454702542582,
+      "grad_norm": 0.5389440161380957,
+      "learning_rate": 8.526377847729475e-06,
+      "loss": 1.2925,
+      "step": 660
+    },
+    {
+      "epoch": 0.3283139965440632,
+      "grad_norm": 0.5370983741740369,
+      "learning_rate": 8.495687632147817e-06,
+      "loss": 1.2522,
+      "step": 665
+    },
+    {
+      "epoch": 0.33078252283386816,
+      "grad_norm": 0.5639132359450145,
+      "learning_rate": 8.46473759635176e-06,
+      "loss": 1.2595,
+      "step": 670
+    },
+    {
+      "epoch": 0.33325104912367315,
+      "grad_norm": 0.5598705018251675,
+      "learning_rate": 8.433530040731321e-06,
+      "loss": 1.2746,
+      "step": 675
+    },
+    {
+      "epoch": 0.33571957541347813,
+      "grad_norm": 0.6303186487688077,
+      "learning_rate": 8.402067284816919e-06,
+      "loss": 1.2701,
+      "step": 680
+    },
+    {
+      "epoch": 0.3381881017032831,
+      "grad_norm": 0.562747309348665,
+      "learning_rate": 8.370351667106969e-06,
+      "loss": 1.2305,
+      "step": 685
+    },
+    {
+      "epoch": 0.3406566279930881,
+      "grad_norm": 0.5720387765798051,
+      "learning_rate": 8.338385544894073e-06,
+      "loss": 1.2047,
+      "step": 690
+    },
+    {
+      "epoch": 0.3431251542828931,
+      "grad_norm": 0.5465830505695308,
+      "learning_rate": 8.306171294089808e-06,
+      "loss": 1.2507,
+      "step": 695
+    },
+    {
+      "epoch": 0.3455936805726981,
+      "grad_norm": 0.5572297207326813,
+      "learning_rate": 8.273711309048145e-06,
+      "loss": 1.2599,
+      "step": 700
+    },
+    {
+      "epoch": 0.3480622068625031,
+      "grad_norm": 0.5916945311296786,
+      "learning_rate": 8.241008002387474e-06,
+      "loss": 1.2615,
+      "step": 705
+    },
+    {
+      "epoch": 0.3505307331523081,
+      "grad_norm": 0.6326075200444886,
+      "learning_rate": 8.208063804811293e-06,
+      "loss": 1.2559,
+      "step": 710
+    },
+    {
+      "epoch": 0.3529992594421131,
+      "grad_norm": 0.6229843020575793,
+      "learning_rate": 8.174881164927535e-06,
+      "loss": 1.2652,
+      "step": 715
+    },
+    {
+      "epoch": 0.35546778573191806,
+      "grad_norm": 0.5926153932237264,
+      "learning_rate": 8.141462549066581e-06,
+      "loss": 1.2423,
+      "step": 720
+    },
+    {
+      "epoch": 0.35793631202172305,
+      "grad_norm": 0.5293071287095781,
+      "learning_rate": 8.107810441097948e-06,
+      "loss": 1.2185,
+      "step": 725
+    },
+    {
+      "epoch": 0.36040483831152803,
+      "grad_norm": 0.5950082298726722,
+      "learning_rate": 8.073927342245663e-06,
+      "loss": 1.2458,
+      "step": 730
+    },
+    {
+      "epoch": 0.362873364601333,
+      "grad_norm": 0.5437872955630408,
+      "learning_rate": 8.039815770902368e-06,
+      "loss": 1.2699,
+      "step": 735
+    },
+    {
+      "epoch": 0.365341890891138,
+      "grad_norm": 0.5842632003875607,
+      "learning_rate": 8.005478262442132e-06,
+      "loss": 1.2489,
+      "step": 740
+    },
+    {
+      "epoch": 0.367810417180943,
+      "grad_norm": 0.5957543279120926,
+      "learning_rate": 7.970917369032011e-06,
+      "loss": 1.2808,
+      "step": 745
+    },
+    {
+      "epoch": 0.37027894347074797,
+      "grad_norm": 0.5573632520708609,
+      "learning_rate": 7.936135659442355e-06,
+      "loss": 1.2394,
+      "step": 750
+    },
+    {
+      "epoch": 0.37274746976055295,
+      "grad_norm": 0.5383442104756702,
+      "learning_rate": 7.901135718855877e-06,
+      "loss": 1.2584,
+      "step": 755
+    },
+    {
+      "epoch": 0.37521599605035794,
+      "grad_norm": 0.5269547291918393,
+      "learning_rate": 7.86592014867551e-06,
+      "loss": 1.32,
+      "step": 760
+    },
+    {
+      "epoch": 0.3776845223401629,
+      "grad_norm": 0.6059173481615415,
+      "learning_rate": 7.830491566331063e-06,
+      "loss": 1.2705,
+      "step": 765
+    },
+    {
+      "epoch": 0.3801530486299679,
+      "grad_norm": 0.5905241537228486,
+      "learning_rate": 7.794852605084661e-06,
+      "loss": 1.2661,
+      "step": 770
+    },
+    {
+      "epoch": 0.3826215749197729,
+      "grad_norm": 0.6119492506708828,
+      "learning_rate": 7.759005913835048e-06,
+      "loss": 1.2573,
+      "step": 775
+    },
+    {
+      "epoch": 0.3850901012095779,
+      "grad_norm": 0.6449864393640712,
+      "learning_rate": 7.722954156920675e-06,
+      "loss": 1.2681,
+      "step": 780
+    },
+    {
+      "epoch": 0.38755862749938286,
+      "grad_norm": 0.5777516112864801,
+      "learning_rate": 7.686700013921704e-06,
+      "loss": 1.2999,
+      "step": 785
+    },
+    {
+      "epoch": 0.39002715378918784,
+      "grad_norm": 0.5818063096150684,
+      "learning_rate": 7.650246179460826e-06,
+      "loss": 1.2842,
+      "step": 790
+    },
+    {
+      "epoch": 0.3924956800789928,
+      "grad_norm": 0.5844315528318011,
+      "learning_rate": 7.613595363002977e-06,
+      "loss": 1.2995,
+      "step": 795
+    },
+    {
+      "epoch": 0.3949642063687978,
+      "grad_norm": 0.5560255613889942,
+      "learning_rate": 7.57675028865397e-06,
+      "loss": 1.275,
+      "step": 800
+    },
+    {
+      "epoch": 0.3949642063687978,
+      "eval_loss": 1.2263342142105103,
+      "eval_runtime": 2463.6634,
+      "eval_samples_per_second": 1.624,
+      "eval_steps_per_second": 0.136,
+      "step": 800
+    },
+    {
+      "epoch": 0.3974327326586028,
+      "grad_norm": 0.5523940138743026,
+      "learning_rate": 7.539713694958013e-06,
+      "loss": 1.2202,
+      "step": 805
+    },
+    {
+      "epoch": 0.3999012589484078,
+      "grad_norm": 0.5936001183365429,
+      "learning_rate": 7.502488334694167e-06,
+      "loss": 1.2444,
+      "step": 810
+    },
+    {
+      "epoch": 0.40236978523821276,
+      "grad_norm": 0.6143038376732798,
+      "learning_rate": 7.465076974671739e-06,
+      "loss": 1.2032,
+      "step": 815
+    },
+    {
+      "epoch": 0.40483831152801775,
+      "grad_norm": 0.5865451493919344,
+      "learning_rate": 7.427482395524646e-06,
+      "loss": 1.2733,
+      "step": 820
+    },
+    {
+      "epoch": 0.4073068378178228,
+      "grad_norm": 0.5980943581114722,
+      "learning_rate": 7.389707391504728e-06,
+      "loss": 1.2732,
+      "step": 825
+    },
+    {
+      "epoch": 0.40977536410762777,
+      "grad_norm": 0.6323487686008166,
+      "learning_rate": 7.35175477027408e-06,
+      "loss": 1.244,
+      "step": 830
+    },
+    {
+      "epoch": 0.41224389039743276,
+      "grad_norm": 0.6562081554973773,
+      "learning_rate": 7.313627352696353e-06,
+      "loss": 1.2642,
+      "step": 835
+    },
+    {
+      "epoch": 0.41471241668723774,
+      "grad_norm": 0.5554470118072983,
+      "learning_rate": 7.2753279726271e-06,
+      "loss": 1.2556,
+      "step": 840
+    },
+    {
+      "epoch": 0.4171809429770427,
+      "grad_norm": 0.5740654163988275,
+      "learning_rate": 7.236859476703148e-06,
+      "loss": 1.2292,
+      "step": 845
+    },
+    {
+      "epoch": 0.4196494692668477,
+      "grad_norm": 0.6062582969566837,
+      "learning_rate": 7.198224724131012e-06,
+      "loss": 1.235,
+      "step": 850
+    },
+    {
+      "epoch": 0.4221179955566527,
+      "grad_norm": 0.5434614048201878,
+      "learning_rate": 7.159426586474388e-06,
+      "loss": 1.2224,
+      "step": 855
+    },
+    {
+      "epoch": 0.4245865218464577,
+      "grad_norm": 0.5254561702235886,
+      "learning_rate": 7.120467947440719e-06,
+      "loss": 1.2557,
+      "step": 860
+    },
+    {
+      "epoch": 0.42705504813626266,
+      "grad_norm": 0.5713031391494172,
+      "learning_rate": 7.081351702666863e-06,
+      "loss": 1.2063,
+      "step": 865
+    },
+    {
+      "epoch": 0.42952357442606764,
+      "grad_norm": 0.5969980245366532,
+      "learning_rate": 7.042080759503866e-06,
+      "loss": 1.2418,
+      "step": 870
+    },
+    {
+      "epoch": 0.43199210071587263,
+      "grad_norm": 0.5718940130718101,
+      "learning_rate": 7.00265803680088e-06,
+      "loss": 1.2108,
+      "step": 875
+    },
+    {
+      "epoch": 0.4344606270056776,
+      "grad_norm": 0.6045555591926912,
+      "learning_rate": 6.963086464688209e-06,
+      "loss": 1.2597,
+      "step": 880
+    },
+    {
+      "epoch": 0.4369291532954826,
+      "grad_norm": 0.5566709780037437,
+      "learning_rate": 6.923368984359526e-06,
+      "loss": 1.2174,
+      "step": 885
+    },
+    {
+      "epoch": 0.4393976795852876,
+      "grad_norm": 0.5630200258106689,
+      "learning_rate": 6.883508547853268e-06,
+      "loss": 1.2244,
+      "step": 890
+    },
+    {
+      "epoch": 0.44186620587509257,
+      "grad_norm": 0.5348314552481888,
+      "learning_rate": 6.843508117833224e-06,
+      "loss": 1.2687,
+      "step": 895
+    },
+    {
+      "epoch": 0.44433473216489755,
+      "grad_norm": 0.49625311943608336,
+      "learning_rate": 6.8033706673683276e-06,
+      "loss": 1.1986,
+      "step": 900
+    },
+    {
+      "epoch": 0.44680325845470253,
+      "grad_norm": 0.5542218838145379,
+      "learning_rate": 6.763099179711685e-06,
+      "loss": 1.2286,
+      "step": 905
+    },
+    {
+      "epoch": 0.4492717847445075,
+      "grad_norm": 0.594098893943127,
+      "learning_rate": 6.722696648078838e-06,
+      "loss": 1.2335,
+      "step": 910
+    },
+    {
+      "epoch": 0.4517403110343125,
+      "grad_norm": 0.5478077068384012,
+      "learning_rate": 6.682166075425298e-06,
+      "loss": 1.264,
+      "step": 915
+    },
+    {
+      "epoch": 0.4542088373241175,
+      "grad_norm": 0.5727528301850252,
+      "learning_rate": 6.641510474223338e-06,
+      "loss": 1.226,
+      "step": 920
+    },
+    {
+      "epoch": 0.45667736361392247,
+      "grad_norm": 0.5888269073825134,
+      "learning_rate": 6.600732866238097e-06,
+      "loss": 1.212,
+      "step": 925
+    },
+    {
+      "epoch": 0.45914588990372746,
+      "grad_norm": 0.5736288265128395,
+      "learning_rate": 6.559836282302984e-06,
+      "loss": 1.25,
+      "step": 930
+    },
+    {
+      "epoch": 0.46161441619353244,
+      "grad_norm": 0.6651036803926929,
+      "learning_rate": 6.5188237620943965e-06,
+      "loss": 1.2672,
+      "step": 935
+    },
+    {
+      "epoch": 0.4640829424833374,
+      "grad_norm": 0.5547382454730273,
+      "learning_rate": 6.477698353905808e-06,
+      "loss": 1.2887,
+      "step": 940
+    },
+    {
+      "epoch": 0.4665514687731424,
+      "grad_norm": 0.5627833712727636,
+      "learning_rate": 6.436463114421199e-06,
+      "loss": 1.2674,
+      "step": 945
+    },
+    {
+      "epoch": 0.46901999506294745,
+      "grad_norm": 0.5562108977867529,
+      "learning_rate": 6.395121108487855e-06,
+      "loss": 1.2973,
+      "step": 950
+    },
+    {
+      "epoch": 0.47148852135275243,
+      "grad_norm": 0.5940300188918287,
+      "learning_rate": 6.353675408888582e-06,
+      "loss": 1.278,
+      "step": 955
+    },
+    {
+      "epoch": 0.4739570476425574,
+      "grad_norm": 0.6499724681591359,
+      "learning_rate": 6.312129096113313e-06,
+      "loss": 1.242,
+      "step": 960
+    },
+    {
+      "epoch": 0.4764255739323624,
+      "grad_norm": 0.5794092582819724,
+      "learning_rate": 6.270485258130146e-06,
+      "loss": 1.2263,
+      "step": 965
+    },
+    {
+      "epoch": 0.4788941002221674,
+      "grad_norm": 0.5810005883829364,
+      "learning_rate": 6.228746990155831e-06,
+      "loss": 1.2166,
+      "step": 970
+    },
+    {
+      "epoch": 0.48136262651197237,
+      "grad_norm": 0.5523321758038612,
+      "learning_rate": 6.186917394425715e-06,
+      "loss": 1.2666,
+      "step": 975
+    },
+    {
+      "epoch": 0.48383115280177735,
+      "grad_norm": 0.5353766340095819,
+      "learning_rate": 6.144999579963164e-06,
+      "loss": 1.2332,
+      "step": 980
+    },
+    {
+      "epoch": 0.48629967909158234,
+      "grad_norm": 0.5962559333577797,
+      "learning_rate": 6.102996662348485e-06,
+      "loss": 1.2985,
+      "step": 985
+    },
+    {
+      "epoch": 0.4887682053813873,
+      "grad_norm": 0.573508927377536,
+      "learning_rate": 6.060911763487353e-06,
+      "loss": 1.2353,
+      "step": 990
+    },
+    {
+      "epoch": 0.4912367316711923,
+      "grad_norm": 0.6190411186907346,
+      "learning_rate": 6.0187480113787765e-06,
+      "loss": 1.2668,
+      "step": 995
+    },
+    {
+      "epoch": 0.4937052579609973,
+      "grad_norm": 0.537107101144104,
+      "learning_rate": 5.976508539882604e-06,
+      "loss": 1.1984,
+      "step": 1000
+    },
+    {
+      "epoch": 0.4937052579609973,
+      "eval_loss": 1.2196881771087646,
+      "eval_runtime": 2373.8686,
+      "eval_samples_per_second": 1.685,
+      "eval_steps_per_second": 0.141,
+      "step": 1000
+    },
+    {
+      "epoch": 0.4961737842508023,
+      "grad_norm": 0.5673334311067016,
+      "learning_rate": 5.934196488486594e-06,
+      "loss": 1.2573,
+      "step": 1005
+    },
+    {
+      "epoch": 0.49864231054060726,
+      "grad_norm": 0.6141102747872601,
+      "learning_rate": 5.891815002073081e-06,
+      "loss": 1.2776,
+      "step": 1010
+    },
+    {
+      "epoch": 0.5011108368304122,
+      "grad_norm": 0.5866475421501153,
+      "learning_rate": 5.849367230685214e-06,
+      "loss": 1.2139,
+      "step": 1015
+    },
+    {
+      "epoch": 0.5035793631202172,
+      "grad_norm": 0.5973223110810923,
+      "learning_rate": 5.806856329292839e-06,
+      "loss": 1.2809,
+      "step": 1020
+    },
+    {
+      "epoch": 0.5060478894100222,
+      "grad_norm": 0.6385978269750231,
+      "learning_rate": 5.764285457557994e-06,
+      "loss": 1.2511,
+      "step": 1025
+    },
+    {
+      "epoch": 0.5085164156998272,
+      "grad_norm": 0.5607340345191899,
+      "learning_rate": 5.721657779600071e-06,
+      "loss": 1.2421,
+      "step": 1030
+    },
+    {
+      "epoch": 0.5109849419896322,
+      "grad_norm": 0.5444555426859482,
+      "learning_rate": 5.678976463760635e-06,
+      "loss": 1.2561,
+      "step": 1035
+    },
+    {
+      "epoch": 0.5134534682794372,
+      "grad_norm": 0.5663913305474535,
+      "learning_rate": 5.636244682367937e-06,
+      "loss": 1.2324,
+      "step": 1040
+    },
+    {
+      "epoch": 0.5159219945692421,
+      "grad_norm": 0.6001697304401695,
+      "learning_rate": 5.593465611501127e-06,
+      "loss": 1.2206,
+      "step": 1045
+    },
+    {
+      "epoch": 0.5183905208590471,
+      "grad_norm": 0.5922209574486257,
+      "learning_rate": 5.5506424307541895e-06,
+      "loss": 1.2777,
+      "step": 1050
+    },
+    {
+      "epoch": 0.5208590471488521,
+      "grad_norm": 0.5810845811643376,
+      "learning_rate": 5.507778322999615e-06,
+      "loss": 1.2186,
+      "step": 1055
+    },
+    {
+      "epoch": 0.5233275734386571,
+      "grad_norm": 0.5661815755139697,
+      "learning_rate": 5.464876474151835e-06,
+      "loss": 1.2465,
+      "step": 1060
+    },
+    {
+      "epoch": 0.5257960997284621,
+      "grad_norm": 0.6016645517449551,
+      "learning_rate": 5.421940072930415e-06,
+      "loss": 1.2269,
+      "step": 1065
+    },
+    {
+      "epoch": 0.5282646260182671,
+      "grad_norm": 0.6268744087157316,
+      "learning_rate": 5.3789723106230675e-06,
+      "loss": 1.2089,
+      "step": 1070
+    },
+    {
+      "epoch": 0.530733152308072,
+      "grad_norm": 0.5374231313658383,
+      "learning_rate": 5.3359763808484396e-06,
+      "loss": 1.2371,
+      "step": 1075
+    },
+    {
+      "epoch": 0.533201678597877,
+      "grad_norm": 0.5696825743006079,
+      "learning_rate": 5.292955479318756e-06,
+      "loss": 1.2288,
+      "step": 1080
+    },
+    {
+      "epoch": 0.535670204887682,
+      "grad_norm": 0.5474403893705062,
+      "learning_rate": 5.249912803602287e-06,
+      "loss": 1.2631,
+      "step": 1085
+    },
+    {
+      "epoch": 0.538138731177487,
+      "grad_norm": 0.611438366860115,
+      "learning_rate": 5.206851552885691e-06,
+      "loss": 1.2395,
+      "step": 1090
+    },
+    {
+      "epoch": 0.540607257467292,
+      "grad_norm": 0.6437738368971478,
+      "learning_rate": 5.163774927736228e-06,
+      "loss": 1.3132,
+      "step": 1095
+    },
+    {
+      "epoch": 0.543075783757097,
+      "grad_norm": 0.5438676695949717,
+      "learning_rate": 5.120686129863882e-06,
+      "loss": 1.2807,
+      "step": 1100
+    },
+    {
+      "epoch": 0.545544310046902,
+      "grad_norm": 0.6135072081701597,
+      "learning_rate": 5.077588361883379e-06,
+      "loss": 1.2239,
+      "step": 1105
+    },
+    {
+      "epoch": 0.5480128363367069,
+      "grad_norm": 0.546701645842348,
+      "learning_rate": 5.0344848270761635e-06,
+      "loss": 1.2121,
+      "step": 1110
+    },
+    {
+      "epoch": 0.5504813626265119,
+      "grad_norm": 0.6153049309551597,
+      "learning_rate": 4.9913787291523e-06,
+      "loss": 1.2832,
+      "step": 1115
+    },
+    {
+      "epoch": 0.5529498889163169,
+      "grad_norm": 0.6148368644966669,
+      "learning_rate": 4.948273272012363e-06,
+      "loss": 1.2536,
+      "step": 1120
+    },
+    {
+      "epoch": 0.5554184152061219,
+      "grad_norm": 0.5911800001869699,
+      "learning_rate": 4.905171659509294e-06,
+      "loss": 1.2789,
+      "step": 1125
+    },
+    {
+      "epoch": 0.5578869414959269,
+      "grad_norm": 0.5450128065258734,
+      "learning_rate": 4.862077095210284e-06,
+      "loss": 1.1595,
+      "step": 1130
+    },
+    {
+      "epoch": 0.5603554677857319,
+      "grad_norm": 0.5629093671549396,
+      "learning_rate": 4.818992782158658e-06,
+      "loss": 1.2854,
+      "step": 1135
+    },
+    {
+      "epoch": 0.562823994075537,
+      "grad_norm": 0.6634778146032412,
+      "learning_rate": 4.775921922635806e-06,
+      "loss": 1.2405,
+      "step": 1140
+    },
+    {
+      "epoch": 0.5652925203653419,
+      "grad_norm": 0.5439361692157106,
+      "learning_rate": 4.732867717923174e-06,
+      "loss": 1.265,
+      "step": 1145
+    },
+    {
+      "epoch": 0.5677610466551469,
+      "grad_norm": 0.5860651769650387,
+      "learning_rate": 4.689833368064326e-06,
+      "loss": 1.2511,
+      "step": 1150
+    },
+    {
+      "epoch": 0.5702295729449519,
+      "grad_norm": 0.627265270599233,
+      "learning_rate": 4.646822071627089e-06,
+      "loss": 1.2813,
+      "step": 1155
+    },
+    {
+      "epoch": 0.5726980992347569,
+      "grad_norm": 0.5634927900565491,
+      "learning_rate": 4.603837025465829e-06,
+      "loss": 1.22,
+      "step": 1160
+    },
+    {
+      "epoch": 0.5751666255245619,
+      "grad_norm": 0.6482363315867818,
+      "learning_rate": 4.560881424483833e-06,
+      "loss": 1.3095,
+      "step": 1165
+    },
+    {
+      "epoch": 0.5776351518143669,
+      "grad_norm": 0.4805380958857345,
+      "learning_rate": 4.517958461395846e-06,
+      "loss": 1.2737,
+      "step": 1170
+    },
+    {
+      "epoch": 0.5801036781041718,
+      "grad_norm": 0.5854150858325277,
+      "learning_rate": 4.475071326490781e-06,
+      "loss": 1.2282,
+      "step": 1175
+    },
+    {
+      "epoch": 0.5825722043939768,
+      "grad_norm": 0.554230131541799,
+      "learning_rate": 4.432223207394577e-06,
+      "loss": 1.178,
+      "step": 1180
+    },
+    {
+      "epoch": 0.5850407306837818,
+      "grad_norm": 0.6930360615517788,
+      "learning_rate": 4.389417288833292e-06,
+      "loss": 1.2781,
+      "step": 1185
+    },
+    {
+      "epoch": 0.5875092569735868,
+      "grad_norm": 0.6042088339838697,
+      "learning_rate": 4.346656752396388e-06,
+      "loss": 1.2813,
+      "step": 1190
+    },
+    {
+      "epoch": 0.5899777832633918,
+      "grad_norm": 0.6280387565672664,
+      "learning_rate": 4.303944776300262e-06,
+      "loss": 1.2433,
+      "step": 1195
+    },
+    {
+      "epoch": 0.5924463095531968,
+      "grad_norm": 0.5502891803034431,
+      "learning_rate": 4.261284535152016e-06,
+      "loss": 1.1556,
+      "step": 1200
+    },
+    {
+      "epoch": 0.5924463095531968,
+      "eval_loss": 1.2148913145065308,
+      "eval_runtime": 2558.7024,
+      "eval_samples_per_second": 1.563,
+      "eval_steps_per_second": 0.131,
+      "step": 1200
+    },
+    {
+      "epoch": 0.5949148358430018,
+      "grad_norm": 0.5429417971755677,
+      "learning_rate": 4.218679199713505e-06,
+      "loss": 1.2398,
+      "step": 1205
+    },
+    {
+      "epoch": 0.5973833621328067,
+      "grad_norm": 0.5573592415141271,
+      "learning_rate": 4.176131936665669e-06,
+      "loss": 1.2348,
+      "step": 1210
+    },
+    {
+      "epoch": 0.5998518884226117,
+      "grad_norm": 0.5662130620287456,
+      "learning_rate": 4.133645908373159e-06,
+      "loss": 1.1894,
+      "step": 1215
+    },
+    {
+      "epoch": 0.6023204147124167,
+      "grad_norm": 0.5330337777111593,
+      "learning_rate": 4.0912242726493e-06,
+      "loss": 1.267,
+      "step": 1220
+    },
+    {
+      "epoch": 0.6047889410022217,
+      "grad_norm": 0.589763462299109,
+      "learning_rate": 4.048870182521374e-06,
+      "loss": 1.2461,
+      "step": 1225
+    },
+    {
+      "epoch": 0.6072574672920267,
+      "grad_norm": 0.5798241574940401,
+      "learning_rate": 4.006586785996285e-06,
+      "loss": 1.2503,
+      "step": 1230
+    },
+    {
+      "epoch": 0.6097259935818317,
+      "grad_norm": 0.5714021679563045,
+      "learning_rate": 3.96437722582656e-06,
+      "loss": 1.2322,
+      "step": 1235
+    },
+    {
+      "epoch": 0.6121945198716366,
+      "grad_norm": 0.5926307509257247,
+      "learning_rate": 3.922244639276773e-06,
+      "loss": 1.2692,
+      "step": 1240
+    },
+    {
+      "epoch": 0.6146630461614416,
+      "grad_norm": 0.6016557090563102,
+      "learning_rate": 3.880192157890365e-06,
+      "loss": 1.2642,
+      "step": 1245
+    },
+    {
+      "epoch": 0.6171315724512466,
+      "grad_norm": 0.5454381088492659,
+      "learning_rate": 3.838222907256884e-06,
+      "loss": 1.239,
+      "step": 1250
+    },
+    {
+      "epoch": 0.6196000987410516,
+      "grad_norm": 0.5582749852816064,
+      "learning_rate": 3.7963400067796774e-06,
+      "loss": 1.2851,
+      "step": 1255
+    },
+    {
+      "epoch": 0.6220686250308566,
+      "grad_norm": 0.5562967849735465,
+      "learning_rate": 3.7545465694440363e-06,
+      "loss": 1.2432,
+      "step": 1260
+    },
+    {
+      "epoch": 0.6245371513206616,
+      "grad_norm": 0.5419669962437569,
+      "learning_rate": 3.7128457015858198e-06,
+      "loss": 1.2103,
+      "step": 1265
+    },
+    {
+      "epoch": 0.6270056776104665,
+      "grad_norm": 0.558873424565738,
+      "learning_rate": 3.6712405026605792e-06,
+      "loss": 1.2388,
+      "step": 1270
+    },
+    {
+      "epoch": 0.6294742039002715,
+      "grad_norm": 0.5712282397945332,
+      "learning_rate": 3.6297340650131785e-06,
+      "loss": 1.2819,
+      "step": 1275
+    },
+    {
+      "epoch": 0.6319427301900765,
+      "grad_norm": 0.5643697726223241,
+      "learning_rate": 3.5883294736479612e-06,
+      "loss": 1.2386,
+      "step": 1280
+    },
+    {
+      "epoch": 0.6344112564798815,
+      "grad_norm": 0.6332020317807455,
+      "learning_rate": 3.5470298059994545e-06,
+      "loss": 1.2677,
+      "step": 1285
+    },
+    {
+      "epoch": 0.6368797827696865,
+      "grad_norm": 0.6276157822500693,
+      "learning_rate": 3.5058381317036285e-06,
+      "loss": 1.2137,
+      "step": 1290
+    },
+    {
+      "epoch": 0.6393483090594915,
+      "grad_norm": 0.5139753708360036,
+      "learning_rate": 3.46475751236975e-06,
+      "loss": 1.2436,
+      "step": 1295
+    },
+    {
+      "epoch": 0.6418168353492965,
+      "grad_norm": 0.5868933304811402,
+      "learning_rate": 3.423791001352823e-06,
+      "loss": 1.1681,
+      "step": 1300
+    },
+    {
+      "epoch": 0.6442853616391014,
+      "grad_norm": 0.5592137564928078,
+      "learning_rate": 3.382941643526644e-06,
+      "loss": 1.2443,
+      "step": 1305
+    },
+    {
+      "epoch": 0.6467538879289064,
+      "grad_norm": 0.567548616583169,
+      "learning_rate": 3.3422124750574902e-06,
+      "loss": 1.2604,
+      "step": 1310
+    },
+    {
+      "epoch": 0.6492224142187114,
+      "grad_norm": 0.568882999500645,
+      "learning_rate": 3.3016065231784587e-06,
+      "loss": 1.1595,
+      "step": 1315
+    },
+    {
+      "epoch": 0.6516909405085164,
+      "grad_norm": 0.628304707671549,
+      "learning_rate": 3.2611268059644535e-06,
+      "loss": 1.2841,
+      "step": 1320
+    },
+    {
+      "epoch": 0.6541594667983214,
+      "grad_norm": 0.5686219665932154,
+      "learning_rate": 3.2207763321078737e-06,
+      "loss": 1.2347,
+      "step": 1325
+    },
+    {
+      "epoch": 0.6566279930881264,
+      "grad_norm": 0.6424587872522304,
+      "learning_rate": 3.1805581006949856e-06,
+      "loss": 1.2329,
+      "step": 1330
+    },
+    {
+      "epoch": 0.6590965193779313,
+      "grad_norm": 0.6654374856920555,
+      "learning_rate": 3.1404751009830124e-06,
+      "loss": 1.2423,
+      "step": 1335
+    },
+    {
+      "epoch": 0.6615650456677363,
+      "grad_norm": 0.5206675422652753,
+      "learning_rate": 3.100530312177956e-06,
+      "loss": 1.2329,
+      "step": 1340
+    },
+    {
+      "epoch": 0.6640335719575413,
+      "grad_norm": 0.6656795155578475,
+      "learning_rate": 3.0607267032131704e-06,
+      "loss": 1.3062,
+      "step": 1345
+    },
+    {
+      "epoch": 0.6665020982473463,
+      "grad_norm": 0.6071844948708964,
+      "learning_rate": 3.0210672325286806e-06,
+      "loss": 1.2656,
+      "step": 1350
+    },
+    {
+      "epoch": 0.6689706245371513,
+      "grad_norm": 0.6211025479318184,
+      "learning_rate": 2.9815548478513034e-06,
+      "loss": 1.2167,
+      "step": 1355
+    },
+    {
+      "epoch": 0.6714391508269563,
+      "grad_norm": 0.5801456765244887,
+      "learning_rate": 2.9421924859755525e-06,
+      "loss": 1.2249,
+      "step": 1360
+    },
+    {
+      "epoch": 0.6739076771167613,
+      "grad_norm": 0.564862030285346,
+      "learning_rate": 2.9029830725453545e-06,
+      "loss": 1.2414,
+      "step": 1365
+    },
+    {
+      "epoch": 0.6763762034065662,
+      "grad_norm": 0.5538133203567932,
+      "learning_rate": 2.8639295218366115e-06,
+      "loss": 1.2191,
+      "step": 1370
+    },
+    {
+      "epoch": 0.6788447296963712,
+      "grad_norm": 0.5925104037633543,
+      "learning_rate": 2.8250347365405737e-06,
+      "loss": 1.2318,
+      "step": 1375
+    },
+    {
+      "epoch": 0.6813132559861762,
+      "grad_norm": 0.6173909875052214,
+      "learning_rate": 2.78630160754811e-06,
+      "loss": 1.2555,
+      "step": 1380
+    },
+    {
+      "epoch": 0.6837817822759812,
+      "grad_norm": 0.6579800769123958,
+      "learning_rate": 2.747733013734835e-06,
+      "loss": 1.2553,
+      "step": 1385
+    },
+    {
+      "epoch": 0.6862503085657862,
+      "grad_norm": 0.6097488788659552,
+      "learning_rate": 2.709331821747133e-06,
+      "loss": 1.2482,
+      "step": 1390
+    },
+    {
+      "epoch": 0.6887188348555913,
+      "grad_norm": 0.5717544066297715,
+      "learning_rate": 2.6711008857890928e-06,
+      "loss": 1.2477,
+      "step": 1395
+    },
+    {
+      "epoch": 0.6911873611453963,
+      "grad_norm": 0.5675063300875494,
+      "learning_rate": 2.63304304741037e-06,
+      "loss": 1.2386,
+      "step": 1400
+    },
+    {
+      "epoch": 0.6911873611453963,
+      "eval_loss": 1.211606740951538,
+      "eval_runtime": 2914.6181,
+      "eval_samples_per_second": 1.372,
+      "eval_steps_per_second": 0.115,
+      "step": 1400
+    },
+    {
+      "epoch": 0.6936558874352012,
+      "grad_norm": 0.623871781326139,
+      "learning_rate": 2.595161135294978e-06,
+      "loss": 1.2484,
+      "step": 1405
+    },
+    {
+      "epoch": 0.6961244137250062,
+      "grad_norm": 0.5967791678571923,
+      "learning_rate": 2.55745796505105e-06,
+      "loss": 1.2816,
+      "step": 1410
+    },
+    {
+      "epoch": 0.6985929400148112,
+      "grad_norm": 0.5958918786737188,
+      "learning_rate": 2.5199363390015645e-06,
+      "loss": 1.2518,
+      "step": 1415
+    },
+    {
+      "epoch": 0.7010614663046162,
+      "grad_norm": 0.5716469845277612,
+      "learning_rate": 2.482599045976059e-06,
+      "loss": 1.2518,
+      "step": 1420
+    },
+    {
+      "epoch": 0.7035299925944212,
+      "grad_norm": 0.5601354887821722,
+      "learning_rate": 2.445448861103348e-06,
+      "loss": 1.2114,
+      "step": 1425
+    },
+    {
+      "epoch": 0.7059985188842262,
+      "grad_norm": 0.5783618487395104,
+      "learning_rate": 2.408488545605265e-06,
+      "loss": 1.2801,
+      "step": 1430
+    },
+    {
+      "epoch": 0.7084670451740311,
+      "grad_norm": 0.600120666255256,
+      "learning_rate": 2.3717208465914193e-06,
+      "loss": 1.2928,
+      "step": 1435
+    },
+    {
+      "epoch": 0.7109355714638361,
+      "grad_norm": 0.6823362059514299,
+      "learning_rate": 2.3351484968550264e-06,
+      "loss": 1.2306,
+      "step": 1440
+    },
+    {
+      "epoch": 0.7134040977536411,
+      "grad_norm": 0.5869728269343567,
+      "learning_rate": 2.298774214669785e-06,
+      "loss": 1.2417,
+      "step": 1445
+    },
+    {
+      "epoch": 0.7158726240434461,
+      "grad_norm": 0.597629982893601,
+      "learning_rate": 2.2626007035878377e-06,
+      "loss": 1.1912,
+      "step": 1450
+    },
+    {
+      "epoch": 0.7183411503332511,
+      "grad_norm": 0.6222473980576229,
+      "learning_rate": 2.226630652238836e-06,
+      "loss": 1.2083,
+      "step": 1455
+    },
+    {
+      "epoch": 0.7208096766230561,
+      "grad_norm": 0.5978767327421509,
+      "learning_rate": 2.1908667341300923e-06,
+      "loss": 1.2577,
+      "step": 1460
+    },
+    {
+      "epoch": 0.723278202912861,
+      "grad_norm": 0.6156905912164004,
+      "learning_rate": 2.155311607447877e-06,
+      "loss": 1.2922,
+      "step": 1465
+    },
+    {
+      "epoch": 0.725746729202666,
+      "grad_norm": 0.6341472520929511,
+      "learning_rate": 2.1199679148598434e-06,
+      "loss": 1.2667,
+      "step": 1470
+    },
+    {
+      "epoch": 0.728215255492471,
+      "grad_norm": 0.5655996654676207,
+      "learning_rate": 2.084838283318616e-06,
+      "loss": 1.1939,
+      "step": 1475
+    },
+    {
+      "epoch": 0.730683781782276,
+      "grad_norm": 0.5824088027115487,
+      "learning_rate": 2.0499253238665284e-06,
+      "loss": 1.242,
+      "step": 1480
+    },
+    {
+      "epoch": 0.733152308072081,
+      "grad_norm": 0.6063388402546945,
+      "learning_rate": 2.0152316314415602e-06,
+      "loss": 1.2482,
+      "step": 1485
+    },
+    {
+      "epoch": 0.735620834361886,
+      "grad_norm": 0.6226805122487513,
+      "learning_rate": 1.9807597846844737e-06,
+      "loss": 1.255,
+      "step": 1490
+    },
+    {
+      "epoch": 0.738089360651691,
+      "grad_norm": 0.5854379294811827,
+      "learning_rate": 1.9465123457471395e-06,
+      "loss": 1.1786,
+      "step": 1495
+    },
+    {
+      "epoch": 0.7405578869414959,
+      "grad_norm": 0.5577052246580572,
+      "learning_rate": 1.9124918601021124e-06,
+      "loss": 1.2358,
+      "step": 1500
+    },
+    {
+      "epoch": 0.7430264132313009,
+      "grad_norm": 0.5754079743445688,
+      "learning_rate": 1.8787008563534326e-06,
+      "loss": 1.1945,
+      "step": 1505
+    },
+    {
+      "epoch": 0.7454949395211059,
+      "grad_norm": 0.6099556355269008,
+      "learning_rate": 1.845141846048691e-06,
+      "loss": 1.2379,
+      "step": 1510
+    },
+    {
+      "epoch": 0.7479634658109109,
+      "grad_norm": 0.5782704010521243,
+      "learning_rate": 1.8118173234923447e-06,
+      "loss": 1.2542,
+      "step": 1515
+    },
+    {
+      "epoch": 0.7504319921007159,
+      "grad_norm": 0.5382858254483444,
+      "learning_rate": 1.778729765560337e-06,
+      "loss": 1.2327,
+      "step": 1520
+    },
+    {
+      "epoch": 0.7529005183905209,
+      "grad_norm": 0.6082642317550977,
+      "learning_rate": 1.7458816315159937e-06,
+      "loss": 1.2631,
+      "step": 1525
+    },
+    {
+      "epoch": 0.7553690446803258,
+      "grad_norm": 0.6120502232540203,
+      "learning_rate": 1.7132753628272403e-06,
+      "loss": 1.2687,
+      "step": 1530
+    },
+    {
+      "epoch": 0.7578375709701308,
+      "grad_norm": 0.5800190917782422,
+      "learning_rate": 1.6809133829851344e-06,
+      "loss": 1.1809,
+      "step": 1535
+    },
+    {
+      "epoch": 0.7603060972599358,
+      "grad_norm": 0.6248767795672576,
+      "learning_rate": 1.6487980973237434e-06,
+      "loss": 1.2102,
+      "step": 1540
+    },
+    {
+      "epoch": 0.7627746235497408,
+      "grad_norm": 0.6214869106372124,
+      "learning_rate": 1.6169318928413574e-06,
+      "loss": 1.3183,
+      "step": 1545
+    },
+    {
+      "epoch": 0.7652431498395458,
+      "grad_norm": 0.6509287986960063,
+      "learning_rate": 1.5853171380230791e-06,
+      "loss": 1.2394,
+      "step": 1550
+    },
+    {
+      "epoch": 0.7677116761293508,
+      "grad_norm": 0.5548564286839581,
+      "learning_rate": 1.5539561826647832e-06,
+      "loss": 1.2278,
+      "step": 1555
+    },
+    {
+      "epoch": 0.7701802024191557,
+      "grad_norm": 0.5873399173100068,
+      "learning_rate": 1.5228513576984633e-06,
+      "loss": 1.2419,
+      "step": 1560
+    },
+    {
+      "epoch": 0.7726487287089607,
+      "grad_norm": 0.5698526241039991,
+      "learning_rate": 1.4920049750189852e-06,
+      "loss": 1.2134,
+      "step": 1565
+    },
+    {
+      "epoch": 0.7751172549987657,
+      "grad_norm": 0.5462525752885333,
+      "learning_rate": 1.4614193273122562e-06,
+      "loss": 1.2013,
+      "step": 1570
+    },
+    {
+      "epoch": 0.7775857812885707,
+      "grad_norm": 0.5604406125512932,
+      "learning_rate": 1.4310966878848116e-06,
+      "loss": 1.2319,
+      "step": 1575
+    },
+    {
+      "epoch": 0.7800543075783757,
+      "grad_norm": 0.5512496837811336,
+      "learning_rate": 1.401039310494855e-06,
+      "loss": 1.2436,
+      "step": 1580
+    },
+    {
+      "epoch": 0.7825228338681807,
+      "grad_norm": 0.6804998312407946,
+      "learning_rate": 1.3712494291847416e-06,
+      "loss": 1.2567,
+      "step": 1585
+    },
+    {
+      "epoch": 0.7849913601579857,
+      "grad_norm": 0.6655723000722049,
+      "learning_rate": 1.3417292581149388e-06,
+      "loss": 1.2682,
+      "step": 1590
+    },
+    {
+      "epoch": 0.7874598864477906,
+      "grad_norm": 0.539222744257867,
+      "learning_rate": 1.3124809913994458e-06,
+      "loss": 1.2009,
+      "step": 1595
+    },
+    {
+      "epoch": 0.7899284127375956,
+      "grad_norm": 0.622721298212167,
+      "learning_rate": 1.2835068029427188e-06,
+      "loss": 1.2661,
+      "step": 1600
+    },
+    {
+      "epoch": 0.7899284127375956,
+      "eval_loss": 1.2096235752105713,
+      "eval_runtime": 2576.8943,
+      "eval_samples_per_second": 1.552,
+      "eval_steps_per_second": 0.13,
+      "step": 1600
+    },
+    {
+      "epoch": 0.7923969390274006,
+      "grad_norm": 0.5470842930259888,
+      "learning_rate": 1.2548088462781006e-06,
+      "loss": 1.2244,
+      "step": 1605
+    },
+    {
+      "epoch": 0.7948654653172056,
+      "grad_norm": 0.5718801309412294,
+      "learning_rate": 1.2263892544077439e-06,
+      "loss": 1.2498,
+      "step": 1610
+    },
+    {
+      "epoch": 0.7973339916070106,
+      "grad_norm": 0.5818869817428877,
+      "learning_rate": 1.1982501396440831e-06,
+      "loss": 1.2044,
+      "step": 1615
+    },
+    {
+      "epoch": 0.7998025178968156,
+      "grad_norm": 0.5534354350847027,
+      "learning_rate": 1.1703935934528327e-06,
+      "loss": 1.2328,
+      "step": 1620
+    },
+    {
+      "epoch": 0.8022710441866205,
+      "grad_norm": 0.5862274808604895,
+      "learning_rate": 1.1428216862975383e-06,
+      "loss": 1.2741,
+      "step": 1625
+    },
+    {
+      "epoch": 0.8047395704764255,
+      "grad_norm": 0.5781950796979888,
+      "learning_rate": 1.1155364674856834e-06,
+      "loss": 1.2679,
+      "step": 1630
+    },
+    {
+      "epoch": 0.8072080967662305,
+      "grad_norm": 0.5751302301159884,
+      "learning_rate": 1.088539965016377e-06,
+      "loss": 1.2153,
+      "step": 1635
+    },
+    {
+      "epoch": 0.8096766230560355,
+      "grad_norm": 0.6150065644184977,
+      "learning_rate": 1.0618341854296176e-06,
+      "loss": 1.2245,
+      "step": 1640
+    },
+    {
+      "epoch": 0.8121451493458405,
+      "grad_norm": 0.5893743060234344,
+      "learning_rate": 1.0354211136571586e-06,
+      "loss": 1.2091,
+      "step": 1645
+    },
+    {
+      "epoch": 0.8146136756356456,
+      "grad_norm": 0.554001627193442,
+      "learning_rate": 1.0093027128749722e-06,
+      "loss": 1.22,
+      "step": 1650
+    },
+    {
+      "epoch": 0.8170822019254506,
+      "grad_norm": 0.5554016650617593,
+      "learning_rate": 9.834809243573406e-07,
+      "loss": 1.2736,
+      "step": 1655
+    },
+    {
+      "epoch": 0.8195507282152555,
+      "grad_norm": 0.6467820952863279,
+      "learning_rate": 9.57957667332562e-07,
+      "loss": 1.2504,
+      "step": 1660
+    },
+    {
+      "epoch": 0.8220192545050605,
+      "grad_norm": 0.5388841867240308,
+      "learning_rate": 9.327348388403063e-07,
+      "loss": 1.2134,
+      "step": 1665
+    },
+    {
+      "epoch": 0.8244877807948655,
+      "grad_norm": 0.5511949198965124,
+      "learning_rate": 9.078143135906154e-07,
+      "loss": 1.2373,
+      "step": 1670
+    },
+    {
+      "epoch": 0.8269563070846705,
+      "grad_norm": 0.5662492648467455,
+      "learning_rate": 8.831979438245619e-07,
+      "loss": 1.2379,
+      "step": 1675
+    },
+    {
+      "epoch": 0.8294248333744755,
+      "grad_norm": 0.6308948625824087,
+      "learning_rate": 8.588875591765838e-07,
+      "loss": 1.1868,
+      "step": 1680
+    },
+    {
+      "epoch": 0.8318933596642805,
+      "grad_norm": 0.576660126030343,
+      "learning_rate": 8.348849665384906e-07,
+      "loss": 1.2891,
+      "step": 1685
+    },
+    {
+      "epoch": 0.8343618859540854,
+      "grad_norm": 0.556606789107177,
+      "learning_rate": 8.111919499251653e-07,
+      "loss": 1.2021,
+      "step": 1690
+    },
+    {
+      "epoch": 0.8368304122438904,
+      "grad_norm": 0.5661740275037651,
+      "learning_rate": 7.878102703419683e-07,
+      "loss": 1.2536,
+      "step": 1695
+    },
+    {
+      "epoch": 0.8392989385336954,
+      "grad_norm": 0.5967205392911274,
+      "learning_rate": 7.647416656538464e-07,
+      "loss": 1.2373,
+      "step": 1700
+    },
+    {
+      "epoch": 0.8417674648235004,
+      "grad_norm": 0.5528061162446166,
+      "learning_rate": 7.419878504561651e-07,
+      "loss": 1.2199,
+      "step": 1705
+    },
+    {
+      "epoch": 0.8442359911133054,
+      "grad_norm": 0.6479872928308008,
+      "learning_rate": 7.195505159472726e-07,
+      "loss": 1.2368,
+      "step": 1710
+    },
+    {
+      "epoch": 0.8467045174031104,
+      "grad_norm": 0.594834011459554,
+      "learning_rate": 6.974313298027946e-07,
+      "loss": 1.1997,
+      "step": 1715
+    },
+    {
+      "epoch": 0.8491730436929154,
+      "grad_norm": 0.5442970599231537,
+      "learning_rate": 6.756319360516856e-07,
+      "loss": 1.2037,
+      "step": 1720
+    },
+    {
+      "epoch": 0.8516415699827203,
+      "grad_norm": 0.6655980946948994,
+      "learning_rate": 6.541539549540383e-07,
+      "loss": 1.3013,
+      "step": 1725
+    },
+    {
+      "epoch": 0.8541100962725253,
+      "grad_norm": 0.599651741019629,
+      "learning_rate": 6.329989828806482e-07,
+      "loss": 1.2454,
+      "step": 1730
+    },
+    {
+      "epoch": 0.8565786225623303,
+      "grad_norm": 0.7507415296204425,
+      "learning_rate": 6.121685921943688e-07,
+      "loss": 1.2347,
+      "step": 1735
+    },
+    {
+      "epoch": 0.8590471488521353,
+      "grad_norm": 0.5883088948787556,
+      "learning_rate": 5.916643311332438e-07,
+      "loss": 1.2566,
+      "step": 1740
+    },
+    {
+      "epoch": 0.8615156751419403,
+      "grad_norm": 0.5844649067792757,
+      "learning_rate": 5.71487723695427e-07,
+      "loss": 1.2176,
+      "step": 1745
+    },
+    {
+      "epoch": 0.8639842014317453,
+      "grad_norm": 0.570757598339604,
+      "learning_rate": 5.516402695259165e-07,
+      "loss": 1.2111,
+      "step": 1750
+    },
+    {
+      "epoch": 0.8664527277215502,
+      "grad_norm": 0.6101964731318252,
+      "learning_rate": 5.321234438050893e-07,
+      "loss": 1.2552,
+      "step": 1755
+    },
+    {
+      "epoch": 0.8689212540113552,
+      "grad_norm": 0.6114031483570134,
+      "learning_rate": 5.12938697139056e-07,
+      "loss": 1.2339,
+      "step": 1760
+    },
+    {
+      "epoch": 0.8713897803011602,
+      "grad_norm": 0.5640524033820485,
+      "learning_rate": 4.940874554518465e-07,
+      "loss": 1.2594,
+      "step": 1765
+    },
+    {
+      "epoch": 0.8738583065909652,
+      "grad_norm": 0.6433079417694005,
+      "learning_rate": 4.755711198794233e-07,
+      "loss": 1.2854,
+      "step": 1770
+    },
+    {
+      "epoch": 0.8763268328807702,
+      "grad_norm": 0.604973387553276,
+      "learning_rate": 4.573910666655429e-07,
+      "loss": 1.3237,
+      "step": 1775
+    },
+    {
+      "epoch": 0.8787953591705752,
+      "grad_norm": 0.5628418770325067,
+      "learning_rate": 4.395486470594645e-07,
+      "loss": 1.1982,
+      "step": 1780
+    },
+    {
+      "epoch": 0.8812638854603801,
+      "grad_norm": 0.6659219563445046,
+      "learning_rate": 4.220451872155179e-07,
+      "loss": 1.2309,
+      "step": 1785
+    },
+    {
+      "epoch": 0.8837324117501851,
+      "grad_norm": 0.5361789546629312,
+      "learning_rate": 4.048819880945337e-07,
+      "loss": 1.199,
+      "step": 1790
+    },
+    {
+      "epoch": 0.8862009380399901,
+      "grad_norm": 0.5558192723511216,
+      "learning_rate": 3.880603253671522e-07,
+      "loss": 1.2263,
+      "step": 1795
+    },
+    {
+      "epoch": 0.8886694643297951,
+      "grad_norm": 0.5634804859248715,
+      "learning_rate": 3.7158144931900395e-07,
+      "loss": 1.2752,
+      "step": 1800
+    },
+    {
+      "epoch": 0.8886694643297951,
+      "eval_loss": 1.2087970972061157,
+      "eval_runtime": 2557.7862,
+      "eval_samples_per_second": 1.564,
+      "eval_steps_per_second": 0.131,
+      "step": 1800
+    },
+    {
+      "epoch": 0.8911379906196001,
+      "grad_norm": 0.6032610406878897,
+      "learning_rate": 3.5544658475778317e-07,
+      "loss": 1.1999,
+      "step": 1805
+    },
+    {
+      "epoch": 0.8936065169094051,
+      "grad_norm": 0.6216254522630721,
+      "learning_rate": 3.396569309222114e-07,
+      "loss": 1.2339,
+      "step": 1810
+    },
+    {
+      "epoch": 0.89607504319921,
+      "grad_norm": 0.5807256981071689,
+      "learning_rate": 3.2421366139290423e-07,
+      "loss": 1.3057,
+      "step": 1815
+    },
+    {
+      "epoch": 0.898543569489015,
+      "grad_norm": 0.5211008570948544,
+      "learning_rate": 3.091179240051462e-07,
+      "loss": 1.2022,
+      "step": 1820
+    },
+    {
+      "epoch": 0.90101209577882,
+      "grad_norm": 0.5525058863296126,
+      "learning_rate": 2.943708407635704e-07,
+      "loss": 1.2048,
+      "step": 1825
+    },
+    {
+      "epoch": 0.903480622068625,
+      "grad_norm": 0.6377145176064325,
+      "learning_rate": 2.799735077587695e-07,
+      "loss": 1.213,
+      "step": 1830
+    },
+    {
+      "epoch": 0.90594914835843,
+      "grad_norm": 0.5813161900855606,
+      "learning_rate": 2.659269950858273e-07,
+      "loss": 1.33,
+      "step": 1835
+    },
+    {
+      "epoch": 0.908417674648235,
+      "grad_norm": 0.6256712692686102,
+      "learning_rate": 2.5223234676478193e-07,
+      "loss": 1.2418,
+      "step": 1840
+    },
+    {
+      "epoch": 0.91088620093804,
+      "grad_norm": 0.598042344925788,
+      "learning_rate": 2.3889058066302873e-07,
+      "loss": 1.2928,
+      "step": 1845
+    },
+    {
+      "epoch": 0.9133547272278449,
+      "grad_norm": 0.6144058961581507,
+      "learning_rate": 2.2590268841966357e-07,
+      "loss": 1.2522,
+      "step": 1850
+    },
+    {
+      "epoch": 0.9158232535176499,
+      "grad_norm": 0.6086868817654493,
+      "learning_rate": 2.132696353717839e-07,
+      "loss": 1.2275,
+      "step": 1855
+    },
+    {
+      "epoch": 0.9182917798074549,
+      "grad_norm": 0.6193803813904503,
+      "learning_rate": 2.0099236048273407e-07,
+      "loss": 1.2102,
+      "step": 1860
+    },
+    {
+      "epoch": 0.9207603060972599,
+      "grad_norm": 0.6206660621687174,
+      "learning_rate": 1.890717762723182e-07,
+      "loss": 1.2413,
+      "step": 1865
+    },
+    {
+      "epoch": 0.9232288323870649,
+      "grad_norm": 0.5195254310690817,
+      "learning_rate": 1.7750876874897627e-07,
+      "loss": 1.2536,
+      "step": 1870
+    },
+    {
+      "epoch": 0.9256973586768699,
+      "grad_norm": 0.6172193600635592,
+      "learning_rate": 1.6630419734393e-07,
+      "loss": 1.1877,
+      "step": 1875
+    },
+    {
+      "epoch": 0.9281658849666748,
+      "grad_norm": 0.5854056073690375,
+      "learning_rate": 1.554588948473068e-07,
+      "loss": 1.2694,
+      "step": 1880
+    },
+    {
+      "epoch": 0.9306344112564798,
+      "grad_norm": 0.5939692455470944,
+      "learning_rate": 1.4497366734623874e-07,
+      "loss": 1.2223,
+      "step": 1885
+    },
+    {
+      "epoch": 0.9331029375462848,
+      "grad_norm": 0.558716522853661,
+      "learning_rate": 1.3484929416495096e-07,
+      "loss": 1.1465,
+      "step": 1890
+    },
+    {
+      "epoch": 0.9355714638360898,
+      "grad_norm": 0.601778856283905,
+      "learning_rate": 1.2508652780683916e-07,
+      "loss": 1.2618,
+      "step": 1895
+    },
+    {
+      "epoch": 0.9380399901258949,
+      "grad_norm": 0.5724230357863298,
+      "learning_rate": 1.1568609389853546e-07,
+      "loss": 1.199,
+      "step": 1900
+    },
+    {
+      "epoch": 0.9405085164156999,
+      "grad_norm": 0.5858685464797397,
+      "learning_rate": 1.0664869113598097e-07,
+      "loss": 1.2416,
+      "step": 1905
+    },
+    {
+      "epoch": 0.9429770427055049,
+      "grad_norm": 0.5955002776535666,
+      "learning_rate": 9.7974991232489e-08,
+      "loss": 1.2621,
+      "step": 1910
+    },
+    {
+      "epoch": 0.9454455689953098,
+      "grad_norm": 0.6031053768787782,
+      "learning_rate": 8.966563886882107e-08,
+      "loss": 1.2966,
+      "step": 1915
+    },
+    {
+      "epoch": 0.9479140952851148,
+      "grad_norm": 0.5626513433181811,
+      "learning_rate": 8.172125164527312e-08,
+      "loss": 1.197,
+      "step": 1920
+    },
+    {
+      "epoch": 0.9503826215749198,
+      "grad_norm": 0.6147790631492948,
+      "learning_rate": 7.414242003576876e-08,
+      "loss": 1.2476,
+      "step": 1925
+    },
+    {
+      "epoch": 0.9528511478647248,
+      "grad_norm": 0.6387128598756113,
+      "learning_rate": 6.692970734397176e-08,
+      "loss": 1.2717,
+      "step": 1930
+    },
+    {
+      "epoch": 0.9553196741545298,
+      "grad_norm": 0.58519229057596,
+      "learning_rate": 6.0083649661421e-08,
+      "loss": 1.2427,
+      "step": 1935
+    },
+    {
+      "epoch": 0.9577882004443348,
+      "grad_norm": 0.5732049204953203,
+      "learning_rate": 5.360475582768088e-08,
+      "loss": 1.2499,
+      "step": 1940
+    },
+    {
+      "epoch": 0.9602567267341398,
+      "grad_norm": 0.5510115335869762,
+      "learning_rate": 4.7493507392524226e-08,
+      "loss": 1.1837,
+      "step": 1945
+    },
+    {
+      "epoch": 0.9627252530239447,
+      "grad_norm": 0.5959129330379044,
+      "learning_rate": 4.175035858013987e-08,
+      "loss": 1.2595,
+      "step": 1950
+    },
+    {
+      "epoch": 0.9651937793137497,
+      "grad_norm": 0.6525575790551825,
+      "learning_rate": 3.637573625537183e-08,
+      "loss": 1.3283,
+      "step": 1955
+    },
+    {
+      "epoch": 0.9676623056035547,
+      "grad_norm": 0.6761446719619785,
+      "learning_rate": 3.13700398919925e-08,
+      "loss": 1.2633,
+      "step": 1960
+    },
+    {
+      "epoch": 0.9701308318933597,
+      "grad_norm": 0.5705669812908541,
+      "learning_rate": 2.673364154301028e-08,
+      "loss": 1.2446,
+      "step": 1965
+    },
+    {
+      "epoch": 0.9725993581831647,
+      "grad_norm": 0.6197155608101478,
+      "learning_rate": 2.2466885813018925e-08,
+      "loss": 1.2492,
+      "step": 1970
+    },
+    {
+      "epoch": 0.9750678844729697,
+      "grad_norm": 0.5667304098455904,
+      "learning_rate": 1.857008983258135e-08,
+      "loss": 1.2485,
+      "step": 1975
+    },
+    {
+      "epoch": 0.9775364107627746,
+      "grad_norm": 0.6113665999543747,
+      "learning_rate": 1.504354323466073e-08,
+      "loss": 1.2573,
+      "step": 1980
+    },
+    {
+      "epoch": 0.9800049370525796,
+      "grad_norm": 0.5726714283406965,
+      "learning_rate": 1.188750813309214e-08,
+      "loss": 1.2264,
+      "step": 1985
+    },
+    {
+      "epoch": 0.9824734633423846,
+      "grad_norm": 0.5521047354644366,
+      "learning_rate": 9.102219103103161e-09,
+      "loss": 1.2194,
+      "step": 1990
+    },
+    {
+      "epoch": 0.9849419896321896,
+      "grad_norm": 0.6819693929722572,
+      "learning_rate": 6.687883163873921e-09,
+      "loss": 1.244,
+      "step": 1995
+    },
+    {
+      "epoch": 0.9874105159219946,
+      "grad_norm": 0.6016814387388122,
+      "learning_rate": 4.644679763155524e-09,
+      "loss": 1.2701,
+      "step": 2000
+    },
+    {
+      "epoch": 0.9874105159219946,
+      "eval_loss": 1.208633542060852,
+      "eval_runtime": 2553.7159,
+      "eval_samples_per_second": 1.566,
+      "eval_steps_per_second": 0.131,
+      "step": 2000
+    },
+    {
+      "epoch": 0.9898790422117996,
+      "grad_norm": 0.5854483828292536,
+      "learning_rate": 2.97276076392905e-09,
+      "loss": 1.2735,
+      "step": 2005
+    },
+    {
+      "epoch": 0.9923475685016045,
+      "grad_norm": 0.6149856349841143,
+      "learning_rate": 1.6722504331195822e-09,
+      "loss": 1.1829,
+      "step": 2010
+    },
+    {
+      "epoch": 0.9948160947914095,
+      "grad_norm": 0.5776580228856067,
+      "learning_rate": 7.432454323597071e-10,
+      "loss": 1.2584,
+      "step": 2015
+    },
+    {
+      "epoch": 0.9972846210812145,
+      "grad_norm": 0.5955477076581019,
+      "learning_rate": 1.8581481080415242e-10,
+      "loss": 1.1737,
+      "step": 2020
+    },
+    {
+      "epoch": 0.9997531473710195,
+      "grad_norm": 0.6070167910291095,
+      "learning_rate": 0.0,
+      "loss": 1.1858,
+      "step": 2025
+    },
+    {
+      "epoch": 0.9997531473710195,
+      "step": 2025,
+      "total_flos": 4526278881050624.0,
+      "train_loss": 1.270192005722611,
+      "train_runtime": 113933.3906,
+      "train_samples_per_second": 0.427,
+      "train_steps_per_second": 0.018
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 2025,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 200,
+  "total_flos": 4526278881050624.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}