li-muyang committed on
Commit 5cf725b · verified · 1 Parent(s): 40d6bef

Model save

Files changed (4)
  1. README.md +26 -15
  2. all_results.json +6 -11
  3. train_results.json +6 -6
  4. trainer_state.json +1769 -886
README.md CHANGED
@@ -3,15 +3,11 @@ library_name: transformers
3
  license: apache-2.0
4
  base_model: mistralai/Mistral-7B-v0.1
5
  tags:
6
- - alignment-handbook
7
- - trl
8
- - sft
9
- - generated_from_trainer
10
  - trl
11
  - sft
12
  - generated_from_trainer
13
  datasets:
14
- - HuggingFaceH4/ultrachat_200k
15
  model-index:
16
  - name: zephyr-7b-sft-full
17
  results: []
@@ -22,9 +18,9 @@ should probably proofread and complete it, then remove this comment. -->
22
 
23
  # zephyr-7b-sft-full
24
 
25
- This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the HuggingFaceH4/ultrachat_200k dataset.
26
  It achieves the following results on the evaluation set:
27
- - Loss: 0.9420
28
 
29
  ## Model description
30
 
@@ -45,22 +41,37 @@ More information needed
45
  The following hyperparameters were used during training:
46
  - learning_rate: 2e-05
47
  - train_batch_size: 16
48
- - eval_batch_size: 8
49
  - seed: 42
50
  - distributed_type: multi-GPU
51
- - num_devices: 8
52
- - total_train_batch_size: 128
53
- - total_eval_batch_size: 64
54
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
55
  - lr_scheduler_type: cosine
56
  - lr_scheduler_warmup_ratio: 0.1
57
- - num_epochs: 1
58
 
59
  ### Training results
60
 
61
- | Training Loss | Epoch | Step | Validation Loss |
62
- |:-------------:|:-----:|:----:|:---------------:|
63
- | 0.9183 | 1.0 | 1084 | 0.9420 |
64
 
65
 
66
  ### Framework versions
 
3
  license: apache-2.0
4
  base_model: mistralai/Mistral-7B-v0.1
5
  tags:
6
  - trl
7
  - sft
8
  - generated_from_trainer
9
  datasets:
10
+ - generator
11
  model-index:
12
  - name: zephyr-7b-sft-full
13
  results: []
 
18
 
19
  # zephyr-7b-sft-full
20
 
21
+ This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 0.9934
24
 
25
  ## Model description
26
 
 
41
  The following hyperparameters were used during training:
42
  - learning_rate: 2e-05
43
  - train_batch_size: 16
44
+ - eval_batch_size: 16
45
  - seed: 42
46
  - distributed_type: multi-GPU
47
+ - num_devices: 16
48
+ - total_train_batch_size: 256
49
+ - total_eval_batch_size: 256
50
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
51
  - lr_scheduler_type: cosine
52
  - lr_scheduler_warmup_ratio: 0.1
53
+ - num_epochs: 3.0
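The effective batch sizes above follow from the per-device settings and device count; a quick sanity check (a sketch only, assuming the usual Trainer arithmetic of per-device batch × number of devices, with gradient accumulation assumed to be 1 since none is listed) reproduces the reported totals:

```python
# Sketch: verify the effective batch sizes reported in the card.
per_device_train_batch_size = 16
per_device_eval_batch_size = 16
num_devices = 16
gradient_accumulation_steps = 1  # assumption: not listed in the card

total_train_batch_size = per_device_train_batch_size * num_devices * gradient_accumulation_steps
total_eval_batch_size = per_device_eval_batch_size * num_devices

print(total_train_batch_size)  # 256, as reported
print(total_eval_batch_size)   # 256, as reported
```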
54
 
55
  ### Training results
56
 
57
+ | Training Loss | Epoch | Step | Validation Loss |
58
+ |:-------------:|:------:|:----:|:---------------:|
59
+ | 0.9681 | 0.1845 | 100 | 0.9788 |
60
+ | 0.9962 | 0.3690 | 200 | 1.0030 |
61
+ | 0.9917 | 0.5535 | 300 | 1.0008 |
62
+ | 0.9652 | 0.7380 | 400 | 0.9939 |
63
+ | 0.9666 | 0.9225 | 500 | 0.9816 |
64
+ | 0.7366 | 1.1070 | 600 | 0.9852 |
65
+ | 0.7228 | 1.2915 | 700 | 0.9835 |
66
+ | 0.7319 | 1.4760 | 800 | 0.9644 |
67
+ | 0.7177 | 1.6605 | 900 | 0.9529 |
68
+ | 0.7095 | 1.8450 | 1000 | 0.9394 |
69
+ | 0.4465 | 2.0295 | 1100 | 0.9917 |
70
+ | 0.4341 | 2.2140 | 1200 | 0.9979 |
71
+ | 0.432 | 2.3985 | 1300 | 0.9954 |
72
+ | 0.4301 | 2.5830 | 1400 | 0.9943 |
73
+ | 0.4361 | 2.7675 | 1500 | 0.9931 |
74
+ | 0.4256 | 2.9520 | 1600 | 0.9934 |
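The Epoch column follows directly from the run length: 1,626 optimizer steps over 3 epochs is 542 steps per epoch, and evaluation fires every 100 steps, so step 100 lands at epoch ≈ 0.1845 and step 1600 at ≈ 2.952. A small illustrative sketch reproduces the column:

```python
# Sketch: reproduce the Epoch column of the results table.
total_steps, num_epochs, eval_every = 1626, 3, 100
steps_per_epoch = total_steps / num_epochs  # 542.0

for step in range(eval_every, total_steps + 1, eval_every):
    print(step, round(step / steps_per_epoch, 4))
# 100 0.1845, 200 0.369, ..., 1600 2.952 -- matching the table
```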
75
 
76
 
77
  ### Framework versions
all_results.json CHANGED
@@ -1,14 +1,9 @@
1
  {
2
- "epoch": 1.0,
3
- "eval_loss": 0.9419716000556946,
4
- "eval_runtime": 916.5084,
5
- "eval_samples": 23109,
6
- "eval_samples_per_second": 16.748,
7
- "eval_steps_per_second": 0.262,
8
- "total_flos": 453935093514240.0,
9
- "train_loss": 0.9848188322408613,
10
- "train_runtime": 36728.3484,
11
  "train_samples": 207864,
12
- "train_samples_per_second": 3.776,
13
- "train_steps_per_second": 0.03
14
  }
 
1
  {
2
+ "epoch": 3.0,
3
+ "total_flos": 1361805280542720.0,
4
+ "train_loss": 0.713569560815634,
5
+ "train_runtime": 59769.2599,
6
  "train_samples": 207864,
7
+ "train_samples_per_second": 6.961,
8
+ "train_steps_per_second": 0.027
9
  }
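The new throughput figures are consistent with the step count and effective batch size: 1,626 steps in 59,769 s gives the reported 0.027 steps/s, and 1,626 × 256 samples over the same runtime gives roughly the reported 6.961 samples/s (the small gap is expected from partially filled final batches). A sketch of the cross-check, using only values from this commit:

```python
# Sketch: cross-check the reported throughput against the run statistics.
global_steps = 1626
total_train_batch_size = 256
train_runtime_s = 59769.2599

steps_per_second = global_steps / train_runtime_s                             # ~0.0272 (reported 0.027)
samples_per_second = global_steps * total_train_batch_size / train_runtime_s  # ~6.96  (reported 6.961)
print(round(steps_per_second, 3), round(samples_per_second, 3))
```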
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 1.0,
3
- "total_flos": 453935093514240.0,
4
- "train_loss": 0.9848188322408613,
5
- "train_runtime": 36728.3484,
6
  "train_samples": 207864,
7
- "train_samples_per_second": 3.776,
8
- "train_steps_per_second": 0.03
9
  }
 
1
  {
2
+ "epoch": 3.0,
3
+ "total_flos": 1361805280542720.0,
4
+ "train_loss": 0.713569560815634,
5
+ "train_runtime": 59769.2599,
6
  "train_samples": 207864,
7
+ "train_samples_per_second": 6.961,
8
+ "train_steps_per_second": 0.027
9
  }
trainer_state.json CHANGED
@@ -1,1554 +1,2437 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.0,
5
- "eval_steps": 100.0,
6
- "global_step": 1084,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0009225092250922509,
13
- "grad_norm": 9.199869276397154,
14
- "learning_rate": 1.8348623853211012e-07,
15
  "loss": 1.1392,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.004612546125461255,
20
- "grad_norm": 7.233201206284356,
21
- "learning_rate": 9.174311926605506e-07,
22
- "loss": 1.1291,
23
  "step": 5
24
  },
25
  {
26
- "epoch": 0.00922509225092251,
27
- "grad_norm": 4.144472619198066,
28
- "learning_rate": 1.8348623853211011e-06,
29
- "loss": 1.0628,
30
  "step": 10
31
  },
32
  {
33
- "epoch": 0.013837638376383764,
34
- "grad_norm": 2.6527022539947374,
35
- "learning_rate": 2.7522935779816517e-06,
36
- "loss": 1.0102,
37
  "step": 15
38
  },
39
  {
40
- "epoch": 0.01845018450184502,
41
- "grad_norm": 2.7475157939980743,
42
- "learning_rate": 3.6697247706422022e-06,
43
- "loss": 1.0126,
44
  "step": 20
45
  },
46
  {
47
- "epoch": 0.023062730627306273,
48
- "grad_norm": 2.3644745727694243,
49
- "learning_rate": 4.587155963302753e-06,
50
- "loss": 1.0031,
51
  "step": 25
52
  },
53
  {
54
- "epoch": 0.027675276752767528,
55
- "grad_norm": 2.2459942557783146,
56
- "learning_rate": 5.504587155963303e-06,
57
- "loss": 0.9714,
58
  "step": 30
59
  },
60
  {
61
- "epoch": 0.03228782287822878,
62
- "grad_norm": 2.8633038603162237,
63
- "learning_rate": 6.422018348623854e-06,
64
- "loss": 0.998,
65
  "step": 35
66
  },
67
  {
68
- "epoch": 0.03690036900369004,
69
- "grad_norm": 3.2535687429097493,
70
- "learning_rate": 7.3394495412844045e-06,
71
- "loss": 1.0051,
72
  "step": 40
73
  },
74
  {
75
- "epoch": 0.04151291512915129,
76
- "grad_norm": 3.2137144467730088,
77
- "learning_rate": 8.256880733944956e-06,
78
- "loss": 0.9837,
79
  "step": 45
80
  },
81
  {
82
- "epoch": 0.046125461254612546,
83
- "grad_norm": 3.229126865982439,
84
- "learning_rate": 9.174311926605506e-06,
85
- "loss": 0.9808,
86
  "step": 50
87
  },
88
  {
89
- "epoch": 0.0507380073800738,
90
- "grad_norm": 2.559458190466229,
91
- "learning_rate": 1.0091743119266055e-05,
92
- "loss": 0.9816,
93
  "step": 55
94
  },
95
  {
96
- "epoch": 0.055350553505535055,
97
- "grad_norm": 2.2546043237809603,
98
- "learning_rate": 1.1009174311926607e-05,
99
- "loss": 0.9956,
100
  "step": 60
101
  },
102
  {
103
- "epoch": 0.05996309963099631,
104
- "grad_norm": 1.9138564480742573,
105
- "learning_rate": 1.1926605504587156e-05,
106
- "loss": 0.9995,
107
  "step": 65
108
  },
109
  {
110
- "epoch": 0.06457564575645756,
111
- "grad_norm": 2.5216610932942256,
112
- "learning_rate": 1.2844036697247708e-05,
113
- "loss": 0.9789,
114
  "step": 70
115
  },
116
  {
117
- "epoch": 0.06918819188191883,
118
- "grad_norm": 2.3850758267877654,
119
- "learning_rate": 1.3761467889908258e-05,
120
- "loss": 0.9904,
121
  "step": 75
122
  },
123
  {
124
- "epoch": 0.07380073800738007,
125
- "grad_norm": 2.4735172195856485,
126
- "learning_rate": 1.4678899082568809e-05,
127
- "loss": 0.9855,
128
  "step": 80
129
  },
130
  {
131
- "epoch": 0.07841328413284133,
132
- "grad_norm": 2.534406745674543,
133
- "learning_rate": 1.559633027522936e-05,
134
- "loss": 1.0087,
135
  "step": 85
136
  },
137
  {
138
- "epoch": 0.08302583025830258,
139
- "grad_norm": 2.0301929785147577,
140
- "learning_rate": 1.6513761467889912e-05,
141
- "loss": 1.0092,
142
  "step": 90
143
  },
144
  {
145
- "epoch": 0.08763837638376384,
146
- "grad_norm": 2.440748895363732,
147
- "learning_rate": 1.743119266055046e-05,
148
- "loss": 1.0159,
149
  "step": 95
150
  },
151
  {
152
- "epoch": 0.09225092250922509,
153
- "grad_norm": 1.8603714211581104,
154
- "learning_rate": 1.834862385321101e-05,
155
- "loss": 1.0225,
156
  "step": 100
157
  },
158
  {
159
- "epoch": 0.09686346863468635,
160
- "grad_norm": 2.056122625713521,
161
- "learning_rate": 1.9266055045871563e-05,
162
- "loss": 1.0501,
163
  "step": 105
164
  },
165
  {
166
- "epoch": 0.1014760147601476,
167
- "grad_norm": 1.8374568202095534,
168
- "learning_rate": 1.9999948088910656e-05,
169
- "loss": 1.0353,
170
  "step": 110
171
  },
172
  {
173
- "epoch": 0.10608856088560886,
174
- "grad_norm": 2.1690401471869323,
175
- "learning_rate": 1.9998131257372878e-05,
176
- "loss": 1.0457,
177
  "step": 115
178
  },
179
  {
180
- "epoch": 0.11070110701107011,
181
- "grad_norm": 2.333795049697145,
182
- "learning_rate": 1.999371941029485e-05,
183
- "loss": 1.0262,
184
  "step": 120
185
  },
186
  {
187
- "epoch": 0.11531365313653137,
188
- "grad_norm": 2.193045375950034,
189
- "learning_rate": 1.9986713692771732e-05,
190
- "loss": 1.0522,
191
  "step": 125
192
  },
193
  {
194
- "epoch": 0.11992619926199262,
195
- "grad_norm": 2.090844050250459,
196
- "learning_rate": 1.9977115923137912e-05,
197
- "loss": 1.0223,
198
  "step": 130
199
  },
200
  {
201
- "epoch": 0.12453874538745388,
202
- "grad_norm": 1.9519590219761234,
203
- "learning_rate": 1.9964928592495046e-05,
204
- "loss": 1.0536,
205
  "step": 135
206
  },
207
  {
208
- "epoch": 0.12915129151291513,
209
- "grad_norm": 1.9148433671554628,
210
- "learning_rate": 1.9950154864065497e-05,
211
- "loss": 1.0495,
212
  "step": 140
213
  },
214
  {
215
- "epoch": 0.13376383763837638,
216
- "grad_norm": 1.8328907186676047,
217
- "learning_rate": 1.993279857237133e-05,
218
- "loss": 1.029,
219
  "step": 145
220
  },
221
  {
222
- "epoch": 0.13837638376383765,
223
- "grad_norm": 1.890355857976019,
224
- "learning_rate": 1.9912864222239045e-05,
225
- "loss": 1.0171,
226
  "step": 150
227
  },
228
  {
229
- "epoch": 0.1429889298892989,
230
- "grad_norm": 1.883026281091422,
231
- "learning_rate": 1.9890356987630362e-05,
232
- "loss": 1.0687,
233
  "step": 155
234
  },
235
  {
236
- "epoch": 0.14760147601476015,
237
- "grad_norm": 2.0771293885484727,
238
- "learning_rate": 1.986528271029931e-05,
239
- "loss": 1.0274,
240
  "step": 160
241
  },
242
  {
243
- "epoch": 0.1522140221402214,
244
- "grad_norm": 2.359666912377762,
245
- "learning_rate": 1.9837647898276008e-05,
246
- "loss": 1.0406,
247
  "step": 165
248
  },
249
  {
250
- "epoch": 0.15682656826568267,
251
- "grad_norm": 2.3232785879151168,
252
- "learning_rate": 1.9807459724177497e-05,
253
- "loss": 1.0415,
254
  "step": 170
255
  },
256
  {
257
- "epoch": 0.16143911439114392,
258
- "grad_norm": 2.0936905877765186,
259
- "learning_rate": 1.977472602334609e-05,
260
- "loss": 1.033,
261
  "step": 175
262
  },
263
  {
264
- "epoch": 0.16605166051660517,
265
- "grad_norm": 1.8878158554896975,
266
- "learning_rate": 1.973945529181572e-05,
267
- "loss": 1.0364,
268
  "step": 180
269
  },
270
  {
271
- "epoch": 0.1706642066420664,
272
- "grad_norm": 1.9593900450172317,
273
- "learning_rate": 1.9701656684106764e-05,
274
- "loss": 1.0367,
275
  "step": 185
276
  },
277
  {
278
- "epoch": 0.1752767527675277,
279
- "grad_norm": 1.5858821175634874,
280
- "learning_rate": 1.9661340010850025e-05,
281
- "loss": 1.0214,
282
  "step": 190
283
  },
284
  {
285
- "epoch": 0.17988929889298894,
286
- "grad_norm": 1.6698918973649843,
287
- "learning_rate": 1.9618515736240353e-05,
288
- "loss": 1.0275,
289
  "step": 195
290
  },
291
  {
292
- "epoch": 0.18450184501845018,
293
- "grad_norm": 2.4565319786652853,
294
- "learning_rate": 1.9573194975320672e-05,
295
- "loss": 1.0387,
296
  "step": 200
297
  },
298
  {
299
- "epoch": 0.18911439114391143,
300
- "grad_norm": 1.8022901836910148,
301
- "learning_rate": 1.952538949109708e-05,
302
- "loss": 1.0283,
303
  "step": 205
304
  },
305
  {
306
- "epoch": 0.1937269372693727,
307
- "grad_norm": 1.8754516081940102,
308
- "learning_rate": 1.9475111691485737e-05,
309
- "loss": 1.0091,
310
  "step": 210
311
  },
312
  {
313
- "epoch": 0.19833948339483395,
314
- "grad_norm": 1.7377386091028952,
315
- "learning_rate": 1.9422374626092414e-05,
316
- "loss": 1.0196,
317
  "step": 215
318
  },
319
  {
320
- "epoch": 0.2029520295202952,
321
- "grad_norm": 1.6554468249518657,
322
- "learning_rate": 1.936719198282545e-05,
323
- "loss": 1.04,
324
  "step": 220
325
  },
326
  {
327
- "epoch": 0.20756457564575645,
328
- "grad_norm": 1.5846514749193226,
329
- "learning_rate": 1.930957808434307e-05,
330
- "loss": 1.0456,
331
  "step": 225
332
  },
333
  {
334
- "epoch": 0.21217712177121772,
335
- "grad_norm": 1.8827810885248515,
336
- "learning_rate": 1.9249547884335917e-05,
337
- "loss": 1.0264,
338
  "step": 230
339
  },
340
  {
341
- "epoch": 0.21678966789667897,
342
- "grad_norm": 1.7322775011210476,
343
- "learning_rate": 1.9187116963645845e-05,
344
- "loss": 1.0454,
345
  "step": 235
346
  },
347
  {
348
- "epoch": 0.22140221402214022,
349
- "grad_norm": 1.5658002998985245,
350
- "learning_rate": 1.912230152622189e-05,
351
- "loss": 1.0343,
352
  "step": 240
353
  },
354
  {
355
- "epoch": 0.22601476014760147,
356
- "grad_norm": 1.725101976536917,
357
- "learning_rate": 1.9055118394914545e-05,
358
- "loss": 1.0343,
359
  "step": 245
360
  },
361
  {
362
- "epoch": 0.23062730627306274,
363
- "grad_norm": 2.1904906773576087,
364
- "learning_rate": 1.898558500710939e-05,
365
- "loss": 1.0318,
366
  "step": 250
367
  },
368
  {
369
- "epoch": 0.235239852398524,
370
- "grad_norm": 1.8347760248935339,
371
- "learning_rate": 1.891371941020121e-05,
372
- "loss": 1.0389,
373
  "step": 255
374
  },
375
  {
376
- "epoch": 0.23985239852398524,
377
- "grad_norm": 1.857980454997456,
378
- "learning_rate": 1.88395402569098e-05,
379
- "loss": 1.0476,
380
  "step": 260
381
  },
382
  {
383
- "epoch": 0.2444649446494465,
384
- "grad_norm": 2.290390982017683,
385
- "learning_rate": 1.8763066800438638e-05,
386
- "loss": 1.0509,
387
  "step": 265
388
  },
389
  {
390
- "epoch": 0.24907749077490776,
391
- "grad_norm": 1.820494807037894,
392
- "learning_rate": 1.868431888947773e-05,
393
- "loss": 1.0473,
394
  "step": 270
395
  },
396
  {
397
- "epoch": 0.253690036900369,
398
- "grad_norm": 1.6939194646151865,
399
- "learning_rate": 1.860331696305188e-05,
400
- "loss": 1.0259,
401
  "step": 275
402
  },
403
  {
404
- "epoch": 0.25830258302583026,
405
- "grad_norm": 1.8537886490677649,
406
- "learning_rate": 1.852008204521572e-05,
407
- "loss": 1.0352,
408
  "step": 280
409
  },
410
  {
411
- "epoch": 0.2629151291512915,
412
- "grad_norm": 1.6085016709020972,
413
- "learning_rate": 1.8434635739596945e-05,
414
- "loss": 1.0253,
415
  "step": 285
416
  },
417
  {
418
- "epoch": 0.26752767527675275,
419
- "grad_norm": 1.6014838440490224,
420
- "learning_rate": 1.834700022378907e-05,
421
- "loss": 1.0361,
422
  "step": 290
423
  },
424
  {
425
- "epoch": 0.272140221402214,
426
- "grad_norm": 1.7338858274869917,
427
- "learning_rate": 1.825719824359524e-05,
428
- "loss": 1.0272,
429
  "step": 295
430
  },
431
  {
432
- "epoch": 0.2767527675276753,
433
- "grad_norm": 1.5899229118868345,
434
- "learning_rate": 1.816525310712456e-05,
435
- "loss": 1.0341,
436
  "step": 300
437
  },
438
  {
439
- "epoch": 0.28136531365313655,
440
- "grad_norm": 2.348086425746126,
441
- "learning_rate": 1.8071188678742457e-05,
442
- "loss": 1.0104,
443
  "step": 305
444
  },
445
  {
446
- "epoch": 0.2859778597785978,
447
- "grad_norm": 1.7668549809740794,
448
- "learning_rate": 1.7975029372876706e-05,
449
- "loss": 1.0333,
450
  "step": 310
451
  },
452
  {
453
- "epoch": 0.29059040590405905,
454
- "grad_norm": 1.6328725360023482,
455
- "learning_rate": 1.787680014768065e-05,
456
- "loss": 1.0221,
457
  "step": 315
458
  },
459
  {
460
- "epoch": 0.2952029520295203,
461
- "grad_norm": 1.6065778746306763,
462
- "learning_rate": 1.777652649855531e-05,
463
- "loss": 1.0219,
464
  "step": 320
465
  },
466
  {
467
- "epoch": 0.29981549815498154,
468
- "grad_norm": 1.6424952210499195,
469
- "learning_rate": 1.7674234451532065e-05,
470
- "loss": 1.0315,
471
  "step": 325
472
  },
473
  {
474
- "epoch": 0.3044280442804428,
475
- "grad_norm": 1.644109341241818,
476
- "learning_rate": 1.7569950556517566e-05,
477
- "loss": 1.0441,
478
  "step": 330
479
  },
480
  {
481
- "epoch": 0.30904059040590404,
482
- "grad_norm": 1.679640047213138,
483
- "learning_rate": 1.7463701880402738e-05,
484
- "loss": 1.0393,
485
  "step": 335
486
  },
487
  {
488
- "epoch": 0.31365313653136534,
489
- "grad_norm": 1.6684149890786646,
490
- "learning_rate": 1.7355516000037555e-05,
491
- "loss": 1.0293,
492
  "step": 340
493
  },
494
  {
495
- "epoch": 0.3182656826568266,
496
- "grad_norm": 1.5914184889923237,
497
- "learning_rate": 1.7245420995073453e-05,
498
- "loss": 1.0378,
499
  "step": 345
500
  },
501
  {
502
- "epoch": 0.32287822878228783,
503
- "grad_norm": 1.5692073860267395,
504
- "learning_rate": 1.7133445440675268e-05,
505
- "loss": 1.0143,
506
  "step": 350
507
  },
508
  {
509
- "epoch": 0.3274907749077491,
510
- "grad_norm": 1.5995342549553517,
511
- "learning_rate": 1.7019618400104572e-05,
512
- "loss": 1.0238,
513
  "step": 355
514
  },
515
  {
516
- "epoch": 0.33210332103321033,
517
- "grad_norm": 1.9464601435555215,
518
- "learning_rate": 1.6903969417176244e-05,
519
- "loss": 1.0288,
520
  "step": 360
521
  },
522
  {
523
- "epoch": 0.3367158671586716,
524
- "grad_norm": 1.57749745993734,
525
- "learning_rate": 1.6786528508590436e-05,
526
- "loss": 1.0185,
527
  "step": 365
528
  },
529
  {
530
- "epoch": 0.3413284132841328,
531
- "grad_norm": 3.1064222839234703,
532
- "learning_rate": 1.666732615614169e-05,
533
- "loss": 1.042,
534
  "step": 370
535
  },
536
  {
537
- "epoch": 0.3459409594095941,
538
- "grad_norm": 1.6220955769831102,
539
- "learning_rate": 1.6546393298807405e-05,
540
- "loss": 1.0267,
541
  "step": 375
542
  },
543
  {
544
- "epoch": 0.3505535055350554,
545
- "grad_norm": 1.4978378904701755,
546
- "learning_rate": 1.6423761324717636e-05,
547
- "loss": 1.0183,
548
  "step": 380
549
  },
550
  {
551
- "epoch": 0.3551660516605166,
552
- "grad_norm": 1.5484493050267714,
553
- "learning_rate": 1.6299462063008272e-05,
554
- "loss": 0.999,
555
  "step": 385
556
  },
557
  {
558
- "epoch": 0.35977859778597787,
559
- "grad_norm": 1.591209884115851,
560
- "learning_rate": 1.61735277755598e-05,
561
- "loss": 1.0099,
562
  "step": 390
563
  },
564
  {
565
- "epoch": 0.3643911439114391,
566
- "grad_norm": 1.5725700106534855,
567
- "learning_rate": 1.6045991148623752e-05,
568
- "loss": 1.03,
569
  "step": 395
570
  },
571
  {
572
- "epoch": 0.36900369003690037,
573
- "grad_norm": 1.605757495194556,
574
- "learning_rate": 1.5916885284338937e-05,
575
- "loss": 1.0104,
576
  "step": 400
577
  },
578
  {
579
- "epoch": 0.3736162361623616,
580
- "grad_norm": 1.5888640482959957,
581
- "learning_rate": 1.5786243692139826e-05,
582
- "loss": 1.0178,
583
  "step": 405
584
  },
585
  {
586
- "epoch": 0.37822878228782286,
587
- "grad_norm": 1.500246112968274,
588
- "learning_rate": 1.5654100280059155e-05,
589
- "loss": 1.0043,
590
  "step": 410
591
  },
592
  {
593
- "epoch": 0.3828413284132841,
594
- "grad_norm": 1.5809504761491522,
595
- "learning_rate": 1.5520489345927095e-05,
596
- "loss": 0.9976,
597
  "step": 415
598
  },
599
  {
600
- "epoch": 0.3874538745387454,
601
- "grad_norm": 1.8367787118836345,
602
- "learning_rate": 1.538544556846925e-05,
603
- "loss": 1.0417,
604
  "step": 420
605
  },
606
  {
607
- "epoch": 0.39206642066420666,
608
- "grad_norm": 1.492213176709814,
609
- "learning_rate": 1.5249003998305787e-05,
610
- "loss": 1.0099,
611
  "step": 425
612
  },
613
  {
614
- "epoch": 0.3966789667896679,
615
- "grad_norm": 1.5368549913004959,
616
- "learning_rate": 1.5111200048854055e-05,
617
- "loss": 1.0144,
618
  "step": 430
619
  },
620
  {
621
- "epoch": 0.40129151291512916,
622
- "grad_norm": 1.3456499423014299,
623
- "learning_rate": 1.4972069487137024e-05,
624
- "loss": 0.9951,
625
  "step": 435
626
  },
627
  {
628
- "epoch": 0.4059040590405904,
629
- "grad_norm": 1.4852289451988536,
630
- "learning_rate": 1.4831648424499953e-05,
631
- "loss": 1.0113,
632
  "step": 440
633
  },
634
  {
635
- "epoch": 0.41051660516605165,
636
- "grad_norm": 1.5328177858690297,
637
- "learning_rate": 1.4689973307237687e-05,
638
- "loss": 1.0115,
639
  "step": 445
640
  },
641
  {
642
- "epoch": 0.4151291512915129,
643
- "grad_norm": 1.5547653083379547,
644
- "learning_rate": 1.4547080907135024e-05,
645
- "loss": 1.0186,
646
  "step": 450
647
  },
648
  {
649
- "epoch": 0.41974169741697415,
650
- "grad_norm": 1.6012528016619334,
651
- "learning_rate": 1.4403008311922593e-05,
652
- "loss": 0.9945,
653
  "step": 455
654
  },
655
  {
656
- "epoch": 0.42435424354243545,
657
- "grad_norm": 1.455131706606304,
658
- "learning_rate": 1.4257792915650728e-05,
659
- "loss": 0.9964,
660
  "step": 460
661
  },
662
  {
663
- "epoch": 0.4289667896678967,
664
- "grad_norm": 1.487042234387078,
665
- "learning_rate": 1.4111472408983843e-05,
666
- "loss": 1.0065,
667
  "step": 465
668
  },
669
  {
670
- "epoch": 0.43357933579335795,
671
- "grad_norm": 1.5691743840811192,
672
- "learning_rate": 1.3964084769417823e-05,
673
- "loss": 1.02,
674
  "step": 470
675
  },
676
  {
677
- "epoch": 0.4381918819188192,
678
- "grad_norm": 1.5181505967295013,
679
- "learning_rate": 1.3815668251422953e-05,
680
- "loss": 1.0144,
681
  "step": 475
682
  },
683
  {
684
- "epoch": 0.44280442804428044,
685
- "grad_norm": 1.5130152153427416,
686
- "learning_rate": 1.3666261376514978e-05,
687
- "loss": 1.0013,
688
  "step": 480
689
  },
690
  {
691
- "epoch": 0.4474169741697417,
692
- "grad_norm": 1.4674728937674146,
693
- "learning_rate": 1.3515902923256832e-05,
694
- "loss": 1.0205,
695
  "step": 485
696
  },
697
  {
698
- "epoch": 0.45202952029520294,
699
- "grad_norm": 1.8760830941169657,
700
- "learning_rate": 1.3364631917193671e-05,
701
- "loss": 0.9969,
702
  "step": 490
703
  },
704
  {
705
- "epoch": 0.4566420664206642,
706
- "grad_norm": 1.4141733000626033,
707
- "learning_rate": 1.321248762072377e-05,
708
- "loss": 0.9836,
709
  "step": 495
710
  },
711
  {
712
- "epoch": 0.4612546125461255,
713
- "grad_norm": 1.6973965594511555,
714
- "learning_rate": 1.3059509522907998e-05,
715
- "loss": 1.0202,
716
  "step": 500
717
  },
718
  {
719
- "epoch": 0.46586715867158673,
720
- "grad_norm": 1.5269161840167254,
721
- "learning_rate": 1.2905737329220394e-05,
722
- "loss": 0.993,
723
  "step": 505
724
  },
725
  {
726
- "epoch": 0.470479704797048,
727
- "grad_norm": 1.4909374182409396,
728
- "learning_rate": 1.2751210951242636e-05,
729
- "loss": 1.0086,
730
  "step": 510
731
  },
732
  {
733
- "epoch": 0.47509225092250923,
734
- "grad_norm": 1.4153798189966285,
735
- "learning_rate": 1.2595970496304975e-05,
736
- "loss": 1.0111,
737
  "step": 515
738
  },
739
  {
740
- "epoch": 0.4797047970479705,
741
- "grad_norm": 1.5032582612787087,
742
- "learning_rate": 1.2440056257076376e-05,
743
- "loss": 1.0006,
744
  "step": 520
745
  },
746
  {
747
- "epoch": 0.4843173431734317,
748
- "grad_norm": 1.5509414258002427,
749
- "learning_rate": 1.2283508701106559e-05,
750
- "loss": 0.9802,
751
  "step": 525
752
  },
753
  {
754
- "epoch": 0.488929889298893,
755
- "grad_norm": 1.4222240180884669,
756
- "learning_rate": 1.2126368460322637e-05,
757
- "loss": 0.9947,
758
  "step": 530
759
  },
760
  {
761
- "epoch": 0.4935424354243542,
762
- "grad_norm": 1.4826692820488787,
763
- "learning_rate": 1.1968676320483103e-05,
764
- "loss": 0.9787,
765
  "step": 535
766
  },
767
  {
768
- "epoch": 0.4981549815498155,
769
- "grad_norm": 1.6379227632536115,
770
- "learning_rate": 1.1810473210591882e-05,
771
- "loss": 0.9932,
772
  "step": 540
773
  },
774
  {
775
- "epoch": 0.5027675276752768,
776
- "grad_norm": 1.4409220105845035,
777
- "learning_rate": 1.1651800192275197e-05,
778
- "loss": 0.9823,
779
  "step": 545
780
  },
781
  {
782
- "epoch": 0.507380073800738,
783
- "grad_norm": 1.5563205928244186,
784
- "learning_rate": 1.1492698449124042e-05,
785
- "loss": 0.9902,
786
  "step": 550
787
  },
788
  {
789
- "epoch": 0.5119926199261993,
790
- "grad_norm": 1.5195778886181963,
791
- "learning_rate": 1.1333209276004959e-05,
792
- "loss": 0.9963,
793
  "step": 555
794
  },
795
  {
796
- "epoch": 0.5166051660516605,
797
- "grad_norm": 1.5087175941528819,
798
- "learning_rate": 1.1173374068341962e-05,
799
- "loss": 0.9862,
800
  "step": 560
801
  },
802
  {
803
- "epoch": 0.5212177121771218,
804
- "grad_norm": 1.4673889879261985,
805
- "learning_rate": 1.1013234311372353e-05,
806
- "loss": 0.9816,
807
  "step": 565
808
  },
809
  {
810
- "epoch": 0.525830258302583,
811
- "grad_norm": 1.435236236989395,
812
- "learning_rate": 1.0852831569379217e-05,
813
- "loss": 0.9793,
814
  "step": 570
815
  },
816
  {
817
- "epoch": 0.5304428044280443,
818
- "grad_norm": 1.4114527764890656,
819
- "learning_rate": 1.0692207474903421e-05,
820
- "loss": 0.9791,
821
  "step": 575
822
  },
823
  {
824
- "epoch": 0.5350553505535055,
825
- "grad_norm": 1.420567074345227,
826
- "learning_rate": 1.0531403717937888e-05,
827
- "loss": 0.9773,
828
  "step": 580
829
  },
830
  {
831
- "epoch": 0.5396678966789668,
832
- "grad_norm": 1.4586967012029641,
833
- "learning_rate": 1.037046203510694e-05,
834
- "loss": 0.9769,
835
  "step": 585
836
  },
837
  {
838
- "epoch": 0.544280442804428,
839
- "grad_norm": 1.366063014582902,
840
- "learning_rate": 1.0209424198833571e-05,
841
- "loss": 0.9675,
842
  "step": 590
843
  },
844
  {
845
- "epoch": 0.5488929889298892,
846
- "grad_norm": 1.45679701107198,
847
- "learning_rate": 1.0048332006497406e-05,
848
- "loss": 0.9955,
849
  "step": 595
850
  },
851
  {
852
- "epoch": 0.5535055350553506,
853
- "grad_norm": 1.3265263093097257,
854
- "learning_rate": 9.887227269586184e-06,
855
- "loss": 0.9734,
856
  "step": 600
857
  },
858
  {
859
- "epoch": 0.5581180811808119,
860
- "grad_norm": 1.4378603761053164,
861
- "learning_rate": 9.7261518028436e-06,
862
- "loss": 0.9793,
863
  "step": 605
864
  },
865
  {
866
- "epoch": 0.5627306273062731,
867
- "grad_norm": 1.46938788585248,
868
- "learning_rate": 9.565147413416266e-06,
869
- "loss": 0.989,
870
  "step": 610
871
  },
872
  {
873
- "epoch": 0.5673431734317343,
874
- "grad_norm": 1.3602627962743308,
875
- "learning_rate": 9.404255890002677e-06,
876
- "loss": 0.9739,
877
  "step": 615
878
  },
879
  {
880
- "epoch": 0.5719557195571956,
881
- "grad_norm": 1.3509149354182792,
882
- "learning_rate": 9.243518992006944e-06,
883
- "loss": 0.9811,
884
  "step": 620
885
  },
886
  {
887
- "epoch": 0.5765682656826568,
888
- "grad_norm": 1.3289296656282001,
889
- "learning_rate": 9.082978438700138e-06,
890
- "loss": 0.969,
891
  "step": 625
892
  },
893
  {
894
- "epoch": 0.5811808118081181,
895
- "grad_norm": 1.3748603146537095,
896
- "learning_rate": 8.922675898392072e-06,
897
- "loss": 0.9783,
898
  "step": 630
899
  },
900
  {
901
- "epoch": 0.5857933579335793,
902
- "grad_norm": 1.4054407615389235,
903
- "learning_rate": 8.762652977616258e-06,
904
- "loss": 0.9872,
905
  "step": 635
906
  },
907
  {
908
- "epoch": 0.5904059040590406,
909
- "grad_norm": 1.4266146283393177,
910
- "learning_rate": 8.602951210330942e-06,
911
- "loss": 0.9875,
912
  "step": 640
913
  },
914
  {
915
- "epoch": 0.5950184501845018,
916
- "grad_norm": 1.3972501984892904,
917
- "learning_rate": 8.443612047138965e-06,
918
- "loss": 0.9622,
919
  "step": 645
920
  },
921
  {
922
- "epoch": 0.5996309963099631,
923
- "grad_norm": 1.3682517939432157,
924
- "learning_rate": 8.284676844529258e-06,
925
- "loss": 0.9803,
926
  "step": 650
927
  },
928
  {
929
- "epoch": 0.6042435424354243,
930
- "grad_norm": 1.3429050015886888,
931
- "learning_rate": 8.126186854142752e-06,
932
- "loss": 0.9712,
933
  "step": 655
934
  },
935
  {
936
- "epoch": 0.6088560885608856,
937
- "grad_norm": 1.4594824193372815,
938
- "learning_rate": 7.968183212065537e-06,
939
- "loss": 0.9622,
940
  "step": 660
941
  },
942
  {
943
- "epoch": 0.6134686346863468,
944
- "grad_norm": 1.3448226541798638,
945
- "learning_rate": 7.81070692815195e-06,
946
- "loss": 0.9722,
947
  "step": 665
948
  },
949
  {
950
- "epoch": 0.6180811808118081,
951
- "grad_norm": 1.3271486276830222,
952
- "learning_rate": 7.6537988753805e-06,
953
- "loss": 0.9757,
954
  "step": 670
955
  },
956
  {
957
- "epoch": 0.6226937269372693,
958
- "grad_norm": 1.3444906745257086,
959
- "learning_rate": 7.497499779245268e-06,
960
- "loss": 0.9727,
961
  "step": 675
962
  },
963
  {
964
- "epoch": 0.6273062730627307,
965
- "grad_norm": 1.3631286023111704,
966
- "learning_rate": 7.3418502071856004e-06,
967
- "loss": 0.966,
968
  "step": 680
969
  },
970
  {
971
- "epoch": 0.6319188191881919,
972
- "grad_norm": 1.4595389618690304,
973
- "learning_rate": 7.186890558056836e-06,
974
- "loss": 0.9646,
975
  "step": 685
976
  },
977
  {
978
- "epoch": 0.6365313653136532,
979
- "grad_norm": 1.3472290480867384,
980
- "learning_rate": 7.0326610516447825e-06,
981
- "loss": 0.9619,
982
  "step": 690
983
  },
984
  {
985
- "epoch": 0.6411439114391144,
986
- "grad_norm": 1.8007090081101473,
987
- "learning_rate": 6.879201718226658e-06,
988
- "loss": 0.9771,
989
  "step": 695
990
  },
991
  {
992
- "epoch": 0.6457564575645757,
993
- "grad_norm": 1.3774637805586714,
994
- "learning_rate": 6.7265523881812335e-06,
995
- "loss": 0.9421,
996
  "step": 700
997
  },
998
  {
999
- "epoch": 0.6503690036900369,
1000
- "grad_norm": 1.346269997716162,
1001
- "learning_rate": 6.574752681650864e-06,
1002
- "loss": 0.9418,
1003
  "step": 705
1004
  },
1005
  {
1006
- "epoch": 0.6549815498154982,
1007
- "grad_norm": 1.2742494405200788,
1008
- "learning_rate": 6.423841998258069e-06,
1009
- "loss": 0.9475,
1010
  "step": 710
1011
  },
1012
  {
1013
- "epoch": 0.6595940959409594,
1014
- "grad_norm": 1.3211417146326536,
1015
- "learning_rate": 6.273859506879365e-06,
1016
- "loss": 0.9624,
1017
  "step": 715
1018
  },
1019
  {
1020
- "epoch": 0.6642066420664207,
1021
- "grad_norm": 1.3166822514896896,
1022
- "learning_rate": 6.124844135478971e-06,
1023
- "loss": 0.9627,
1024
  "step": 720
1025
  },
1026
  {
1027
- "epoch": 0.6688191881918819,
1028
- "grad_norm": 1.3120951075995704,
1029
- "learning_rate": 5.976834561005069e-06,
1030
- "loss": 0.9508,
1031
  "step": 725
1032
  },
1033
  {
1034
- "epoch": 0.6734317343173432,
1035
- "grad_norm": 1.3290407324477753,
1036
- "learning_rate": 5.829869199351188e-06,
1037
- "loss": 0.9504,
1038
  "step": 730
1039
  },
1040
  {
1041
- "epoch": 0.6780442804428044,
1042
- "grad_norm": 1.297841981615856,
1043
- "learning_rate": 5.68398619538536e-06,
1044
- "loss": 0.9528,
1045
  "step": 735
1046
  },
1047
  {
1048
- "epoch": 0.6826568265682657,
1049
- "grad_norm": 1.3297326740637123,
1050
- "learning_rate": 5.53922341304961e-06,
1051
- "loss": 0.953,
1052
  "step": 740
1053
  },
1054
  {
1055
- "epoch": 0.6872693726937269,
1056
- "grad_norm": 1.270375299309487,
1057
- "learning_rate": 5.39561842553239e-06,
1058
- "loss": 0.9556,
1059
  "step": 745
1060
  },
1061
  {
1062
- "epoch": 0.6918819188191881,
1063
- "grad_norm": 1.3325965856351196,
1064
- "learning_rate": 5.2532085055164205e-06,
1065
- "loss": 0.9466,
1066
  "step": 750
1067
  },
1068
  {
1069
- "epoch": 0.6964944649446494,
1070
- "grad_norm": 1.384058443252621,
1071
- "learning_rate": 5.112030615504601e-06,
1072
- "loss": 0.9568,
1073
  "step": 755
1074
  },
1075
  {
1076
- "epoch": 0.7011070110701108,
1077
- "grad_norm": 1.3367666302254895,
1078
- "learning_rate": 4.972121398226371e-06,
1079
- "loss": 0.9515,
1080
  "step": 760
1081
  },
1082
  {
1083
- "epoch": 0.705719557195572,
1084
- "grad_norm": 1.2932861344083342,
1085
- "learning_rate": 4.833517167127077e-06,
1086
- "loss": 0.9542,
1087
  "step": 765
1088
  },
1089
  {
1090
- "epoch": 0.7103321033210332,
1091
- "grad_norm": 1.3002291268448138,
1092
- "learning_rate": 4.6962538969428416e-06,
1093
- "loss": 0.9493,
1094
  "step": 770
1095
  },
1096
  {
1097
- "epoch": 0.7149446494464945,
1098
- "grad_norm": 1.3319825264438672,
1099
- "learning_rate": 4.560367214363295e-06,
1100
- "loss": 0.9402,
1101
  "step": 775
1102
  },
1103
  {
1104
- "epoch": 0.7195571955719557,
1105
- "grad_norm": 1.3114581144726238,
1106
- "learning_rate": 4.425892388784681e-06,
1107
- "loss": 0.9418,
1108
  "step": 780
1109
  },
1110
  {
1111
- "epoch": 0.724169741697417,
1112
- "grad_norm": 1.2845316321533105,
1113
- "learning_rate": 4.292864323155684e-06,
1114
- "loss": 0.941,
1115
  "step": 785
1116
  },
1117
  {
1118
- "epoch": 0.7287822878228782,
1119
- "grad_norm": 1.3576670016030674,
1120
- "learning_rate": 4.161317544918345e-06,
1121
- "loss": 0.9514,
1122
  "step": 790
1123
  },
1124
  {
1125
- "epoch": 0.7333948339483395,
1126
- "grad_norm": 1.3650798187979218,
1127
- "learning_rate": 4.031286197046493e-06,
1128
- "loss": 0.9358,
1129
  "step": 795
1130
  },
1131
  {
1132
- "epoch": 0.7380073800738007,
1133
- "grad_norm": 1.2681901564602476,
1134
- "learning_rate": 3.902804029183907e-06,
1135
- "loss": 0.9258,
1136
  "step": 800
1137
  },
1138
  {
1139
- "epoch": 0.742619926199262,
1140
- "grad_norm": 1.3091859558198018,
1141
- "learning_rate": 3.775904388884618e-06,
1142
- "loss": 0.9597,
1143
  "step": 805
1144
  },
1145
  {
1146
- "epoch": 0.7472324723247232,
1147
- "grad_norm": 1.2731852848090845,
1148
- "learning_rate": 3.650620212957524e-06,
1149
- "loss": 0.9791,
1150
  "step": 810
1151
  },
1152
  {
1153
- "epoch": 0.7518450184501845,
1154
- "grad_norm": 1.4443358599132654,
1155
- "learning_rate": 3.5269840189176616e-06,
1156
- "loss": 0.9559,
1157
  "step": 815
1158
  },
1159
  {
1160
- "epoch": 0.7564575645756457,
1161
- "grad_norm": 1.2637245159696282,
1162
- "learning_rate": 3.405027896546277e-06,
1163
- "loss": 0.9522,
1164
  "step": 820
1165
  },
1166
  {
1167
- "epoch": 0.761070110701107,
1168
- "grad_norm": 1.2944529056122984,
1169
- "learning_rate": 3.2847834995619067e-06,
1170
- "loss": 0.9334,
1171
  "step": 825
1172
  },
1173
  {
1174
- "epoch": 0.7656826568265682,
1175
- "grad_norm": 1.2996833234879939,
1176
- "learning_rate": 3.1662820374046776e-06,
1177
- "loss": 0.9406,
1178
  "step": 830
1179
  },
1180
  {
1181
- "epoch": 0.7702952029520295,
1182
- "grad_norm": 1.3668529549836468,
1183
- "learning_rate": 3.0495542671358745e-06,
1184
- "loss": 0.9494,
1185
  "step": 835
1186
  },
1187
  {
1188
- "epoch": 0.7749077490774908,
1189
- "grad_norm": 1.271209707662383,
1190
- "learning_rate": 2.934630485454948e-06,
1191
- "loss": 0.9587,
1192
  "step": 840
1193
  },
1194
  {
1195
- "epoch": 0.7795202952029521,
1196
- "grad_norm": 1.373724729262656,
1197
- "learning_rate": 2.8215405208360237e-06,
1198
- "loss": 0.9267,
1199
  "step": 845
1200
  },
1201
  {
1202
- "epoch": 0.7841328413284133,
1203
- "grad_norm": 1.2858397081440855,
1204
- "learning_rate": 2.7103137257858867e-06,
1205
- "loss": 0.9368,
1206
  "step": 850
1207
  },
1208
  {
1209
- "epoch": 0.7887453874538746,
1210
- "grad_norm": 1.3265877626226328,
1211
- "learning_rate": 2.600978969225558e-06,
1212
- "loss": 0.9363,
1213
  "step": 855
1214
  },
1215
  {
1216
- "epoch": 0.7933579335793358,
1217
- "grad_norm": 1.3059330060904089,
1218
- "learning_rate": 2.493564628997369e-06,
1219
- "loss": 0.9331,
1220
  "step": 860
1221
  },
1222
  {
1223
- "epoch": 0.7979704797047971,
1224
- "grad_norm": 1.2933907163136256,
1225
- "learning_rate": 2.3880985844994674e-06,
1226
- "loss": 0.9315,
1227
  "step": 865
1228
  },
1229
  {
1230
- "epoch": 0.8025830258302583,
1231
- "grad_norm": 1.3825698113730438,
1232
- "learning_rate": 2.284608209449746e-06,
1233
- "loss": 0.9379,
1234
  "step": 870
1235
  },
1236
  {
1237
- "epoch": 0.8071955719557196,
1238
- "grad_norm": 1.2868105668821606,
1239
- "learning_rate": 2.183120364780975e-06,
1240
- "loss": 0.9371,
1241
  "step": 875
1242
  },
1243
  {
1244
- "epoch": 0.8118081180811808,
1245
- "grad_norm": 1.313632486404665,
1246
- "learning_rate": 2.083661391669043e-06,
1247
- "loss": 0.9338,
1248
  "step": 880
1249
  },
1250
  {
1251
- "epoch": 0.816420664206642,
1252
- "grad_norm": 1.2863066943500752,
1253
- "learning_rate": 1.986257104696121e-06,
1254
- "loss": 0.933,
1255
  "step": 885
1256
  },
1257
  {
1258
- "epoch": 0.8210332103321033,
1259
- "grad_norm": 1.3414781016896056,
1260
- "learning_rate": 1.8909327851504633e-06,
1261
- "loss": 0.9298,
1262
  "step": 890
1263
  },
1264
  {
1265
- "epoch": 0.8256457564575646,
1266
- "grad_norm": 1.3005106720291775,
1267
- "learning_rate": 1.7977131744646724e-06,
1268
- "loss": 0.949,
1269
  "step": 895
1270
  },
1271
  {
1272
- "epoch": 0.8302583025830258,
1273
- "grad_norm": 1.2541123587239678,
1274
- "learning_rate": 1.7066224677940313e-06,
1275
- "loss": 0.9364,
1276
  "step": 900
1277
  },
1278
  {
1279
- "epoch": 0.834870848708487,
1280
- "grad_norm": 1.3443378836164728,
1281
- "learning_rate": 1.6176843077366755e-06,
1282
- "loss": 0.9341,
1283
  "step": 905
1284
  },
1285
  {
1286
- "epoch": 0.8394833948339483,
1287
- "grad_norm": 1.3177278720111654,
1288
- "learning_rate": 1.5309217781971419e-06,
1289
- "loss": 0.9237,
1290
  "step": 910
1291
  },
1292
  {
1293
- "epoch": 0.8440959409594095,
1294
- "grad_norm": 1.2988630794091358,
1295
- "learning_rate": 1.446357398394934e-06,
1296
- "loss": 0.9375,
1297
  "step": 915
1298
  },
1299
  {
1300
- "epoch": 0.8487084870848709,
1301
- "grad_norm": 1.235110744813047,
1302
- "learning_rate": 1.3640131170196758e-06,
1303
- "loss": 0.9289,
1304
  "step": 920
1305
  },
1306
  {
1307
- "epoch": 0.8533210332103321,
1308
- "grad_norm": 1.2841260381250454,
1309
- "learning_rate": 1.2839103065343084e-06,
1310
- "loss": 0.9376,
1311
  "step": 925
1312
  },
1313
  {
1314
- "epoch": 0.8579335793357934,
1315
- "grad_norm": 1.3338182477923755,
1316
- "learning_rate": 1.2060697576278812e-06,
1317
- "loss": 0.9295,
1318
  "step": 930
1319
  },
1320
  {
1321
- "epoch": 0.8625461254612546,
1322
- "grad_norm": 1.2483887496253951,
1323
- "learning_rate": 1.1305116738193211e-06,
1324
- "loss": 0.9191,
1325
  "step": 935
1326
  },
1327
  {
1328
- "epoch": 0.8671586715867159,
1329
- "grad_norm": 1.3430815382294408,
1330
- "learning_rate": 1.0572556662136036e-06,
1331
- "loss": 0.9152,
1332
  "step": 940
1333
  },
1334
  {
1335
- "epoch": 0.8717712177121771,
1336
- "grad_norm": 1.2731245020834576,
1337
- "learning_rate": 9.863207484116987e-07,
1338
- "loss": 0.9396,
1339
  "step": 945
1340
  },
1341
  {
1342
- "epoch": 0.8763837638376384,
1343
- "grad_norm": 1.2635025334684167,
1344
- "learning_rate": 9.177253315755796e-07,
1345
- "loss": 0.9425,
1346
  "step": 950
1347
  },
1348
  {
1349
- "epoch": 0.8809963099630996,
1350
- "grad_norm": 1.2447698053780658,
1351
- "learning_rate": 8.514872196496182e-07,
1352
- "loss": 0.9144,
1353
  "step": 955
1354
  },
1355
  {
1356
- "epoch": 0.8856088560885609,
1357
- "grad_norm": 1.2339433320268876,
1358
- "learning_rate": 7.876236047395525e-07,
1359
- "loss": 0.9314,
1360
  "step": 960
1361
  },
1362
  {
1363
- "epoch": 0.8902214022140221,
1364
- "grad_norm": 1.2899193810519798,
1365
- "learning_rate": 7.26151062650291e-07,
1366
- "loss": 0.9339,
1367
  "step": 965
1368
  },
1369
  {
1370
- "epoch": 0.8948339483394834,
1371
- "grad_norm": 1.3045072453764868,
1372
- "learning_rate": 6.670855485836525e-07,
1373
- "loss": 0.9362,
1374
  "step": 970
1375
  },
1376
  {
1377
- "epoch": 0.8994464944649446,
1378
- "grad_norm": 1.2294171724462792,
1379
- "learning_rate": 6.104423929971948e-07,
1380
- "loss": 0.9179,
1381
  "step": 975
1382
  },
1383
  {
1384
- "epoch": 0.9040590405904059,
1385
- "grad_norm": 1.2969444296025832,
1386
- "learning_rate": 5.562362976251901e-07,
1387
- "loss": 0.9386,
1388
  "step": 980
1389
  },
1390
  {
1391
- "epoch": 0.9086715867158671,
1392
- "grad_norm": 1.3563048623700384,
1393
- "learning_rate": 5.044813316627994e-07,
1394
- "loss": 0.9293,
1395
  "step": 985
1396
  },
1397
  {
1398
- "epoch": 0.9132841328413284,
1399
- "grad_norm": 1.3602985297748236,
1400
- "learning_rate": 4.5519092811439627e-07,
1401
- "loss": 0.9325,
1402
  "step": 990
1403
  },
1404
  {
1405
- "epoch": 0.9178966789667896,
1406
- "grad_norm": 1.2576795558330498,
1407
- "learning_rate": 4.083778803070504e-07,
1408
- "loss": 0.9384,
1409
  "step": 995
1410
  },
1411
  {
1412
- "epoch": 0.922509225092251,
1413
- "grad_norm": 1.2395476578848061,
1414
- "learning_rate": 3.6405433856999684e-07,
1415
- "loss": 0.9195,
1416
  "step": 1000
1417
  },
1418
  {
1419
- "epoch": 0.9271217712177122,
1420
- "grad_norm": 1.2838613340485654,
1421
- "learning_rate": 3.2223180708102933e-07,
1422
- "loss": 0.9372,
1423
  "step": 1005
1424
  },
1425
  {
1426
- "epoch": 0.9317343173431735,
1427
- "grad_norm": 1.3034379499491984,
1428
- "learning_rate": 2.829211408805932e-07,
1429
- "loss": 0.9383,
1430
  "step": 1010
1431
  },
1432
  {
1433
- "epoch": 0.9363468634686347,
1434
- "grad_norm": 1.277753433076976,
1435
- "learning_rate": 2.461325430543482e-07,
1436
- "loss": 0.9203,
1437
  "step": 1015
1438
  },
1439
  {
1440
- "epoch": 0.940959409594096,
1441
- "grad_norm": 1.2622861558556866,
1442
- "learning_rate": 2.1187556208496885e-07,
1443
- "loss": 0.9231,
1444
  "step": 1020
1445
  },
1446
  {
1447
- "epoch": 0.9455719557195572,
1448
- "grad_norm": 1.26550227170926,
1449
- "learning_rate": 1.8015908937382587e-07,
1450
- "loss": 0.9314,
1451
  "step": 1025
1452
  },
1453
  {
1454
- "epoch": 0.9501845018450185,
1455
- "grad_norm": 1.293318597446816,
1456
- "learning_rate": 1.5099135693322776e-07,
1457
- "loss": 0.9153,
1458
  "step": 1030
1459
  },
1460
  {
1461
- "epoch": 0.9547970479704797,
1462
- "grad_norm": 1.5032902478334003,
1463
- "learning_rate": 1.2437993524979984e-07,
1464
- "loss": 0.9369,
1465
  "step": 1035
1466
  },
1467
  {
1468
- "epoch": 0.959409594095941,
1469
- "grad_norm": 1.2642728343874239,
1470
- "learning_rate": 1.0033173131956175e-07,
1471
- "loss": 0.9155,
1472
  "step": 1040
1473
  },
1474
  {
1475
- "epoch": 0.9640221402214022,
1476
- "grad_norm": 1.263191604346459,
1477
- "learning_rate": 7.885298685522235e-08,
1478
- "loss": 0.9309,
1479
  "step": 1045
1480
  },
1481
  {
1482
- "epoch": 0.9686346863468634,
1483
- "grad_norm": 1.2999256804979131,
1484
- "learning_rate": 5.99492766661347e-08,
1485
- "loss": 0.9399,
1486
  "step": 1050
1487
  },
1488
  {
1489
- "epoch": 0.9732472324723247,
1490
- "grad_norm": 1.25468458833374,
1491
- "learning_rate": 4.362550721136338e-08,
1492
- "loss": 0.9387,
1493
  "step": 1055
1494
  },
1495
  {
1496
- "epoch": 0.977859778597786,
1497
- "grad_norm": 1.2891226224049415,
1498
- "learning_rate": 2.988591532620322e-08,
1499
- "loss": 0.9259,
1500
  "step": 1060
1501
  },
1502
  {
1503
- "epoch": 0.9824723247232472,
1504
- "grad_norm": 1.2367296717631957,
1505
- "learning_rate": 1.8734067122514464e-08,
1506
- "loss": 0.9255,
1507
  "step": 1065
1508
  },
1509
  {
1510
- "epoch": 0.9870848708487084,
1511
- "grad_norm": 1.2584553913497565,
1512
- "learning_rate": 1.0172857063137643e-08,
1513
- "loss": 0.9337,
1514
  "step": 1070
1515
  },
1516
  {
1517
- "epoch": 0.9916974169741697,
1518
- "grad_norm": 1.2509146926461707,
1519
- "learning_rate": 4.204507210633368e-09,
1520
- "loss": 0.9295,
1521
  "step": 1075
1522
  },
1523
  {
1524
- "epoch": 0.996309963099631,
1525
- "grad_norm": 1.2915538030798408,
1526
- "learning_rate": 8.30566650548148e-10,
1527
- "loss": 0.9183,
1528
  "step": 1080
1529
  },
1530
  {
1531
- "epoch": 1.0,
1532
- "eval_loss": 0.9419716000556946,
1533
- "eval_runtime": 1011.6018,
1534
- "eval_samples_per_second": 15.174,
1535
- "eval_steps_per_second": 0.237,
1536
- "step": 1084
1537
  },
1538
  {
1539
- "epoch": 1.0,
1540
- "step": 1084,
1541
- "total_flos": 453935093514240.0,
1542
- "train_loss": 0.9848188322408613,
1543
- "train_runtime": 36728.3484,
1544
- "train_samples_per_second": 3.776,
1545
- "train_steps_per_second": 0.03
1546
  }
1547
  ],
1548
  "logging_steps": 5,
1549
- "max_steps": 1084,
1550
  "num_input_tokens_seen": 0,
1551
- "num_train_epochs": 1,
1552
  "save_steps": 100,
1553
  "stateful_callbacks": {
1554
  "TrainerControl": {
@@ -1562,7 +2445,7 @@
1562
  "attributes": {}
1563
  }
1564
  },
1565
- "total_flos": 453935093514240.0,
1566
  "train_batch_size": 16,
1567
  "trial_name": null,
1568
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
+ "eval_steps": 100,
6
+ "global_step": 1626,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0018450184501845018,
13
+ "grad_norm": 9.194052941983164,
14
+ "learning_rate": 1.226993865030675e-07,
15
  "loss": 1.1392,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.00922509225092251,
20
+ "grad_norm": 8.728469464225432,
21
+ "learning_rate": 6.134969325153375e-07,
22
+ "loss": 1.1321,
23
  "step": 5
24
  },
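The learning_rate values in the new log trace the configured schedule: linear warmup over roughly the first 10% of the 1,626 steps (163 steps, so step 1 logs 2e-5/163 ≈ 1.227e-7), then cosine decay. A minimal sketch, assuming the standard linear-warmup-plus-cosine formula, reproduces the early entries:

```python
import math

# Sketch: reproduce the logged learning rates
# (assumes linear warmup for warmup_ratio * max_steps steps, then cosine decay to 0).
peak_lr, max_steps, warmup_ratio = 2e-5, 1626, 0.1
warmup_steps = math.ceil(warmup_ratio * max_steps)  # 163

def lr_at(step):
    if step < warmup_steps:
        return peak_lr * step / warmup_steps
    progress = (step - warmup_steps) / (max_steps - warmup_steps)
    return peak_lr * 0.5 * (1 + math.cos(math.pi * progress))

print(lr_at(1))   # ~1.2270e-07, matching the first logged entry
print(lr_at(5))   # ~6.1350e-07
print(lr_at(50))  # ~6.1350e-06
```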
25
  {
26
+ "epoch": 0.01845018450184502,
27
+ "grad_norm": 5.066035045474869,
28
+ "learning_rate": 1.226993865030675e-06,
29
+ "loss": 1.0802,
30
  "step": 10
31
  },
32
  {
33
+ "epoch": 0.027675276752767528,
34
+ "grad_norm": 6.151048691792626,
35
+ "learning_rate": 1.8404907975460124e-06,
36
+ "loss": 1.0186,
37
  "step": 15
38
  },
39
  {
40
+ "epoch": 0.03690036900369004,
41
+ "grad_norm": 2.030218046940431,
42
+ "learning_rate": 2.45398773006135e-06,
43
+ "loss": 1.0181,
44
  "step": 20
45
  },
46
  {
47
+ "epoch": 0.046125461254612546,
48
+ "grad_norm": 1.7169054577646434,
49
+ "learning_rate": 3.0674846625766875e-06,
50
+ "loss": 0.9867,
51
  "step": 25
52
  },
53
  {
54
+ "epoch": 0.055350553505535055,
55
+ "grad_norm": 1.414702551784086,
56
+ "learning_rate": 3.680981595092025e-06,
57
+ "loss": 0.9848,
58
  "step": 30
59
  },
60
  {
61
+ "epoch": 0.06457564575645756,
62
+ "grad_norm": 1.471062511929668,
63
+ "learning_rate": 4.294478527607362e-06,
64
+ "loss": 0.975,
65
  "step": 35
66
  },
67
  {
68
+ "epoch": 0.07380073800738007,
69
+ "grad_norm": 1.9876641303020315,
70
+ "learning_rate": 4.9079754601227e-06,
71
+ "loss": 0.9616,
72
  "step": 40
73
  },
74
  {
75
+ "epoch": 0.08302583025830258,
76
+ "grad_norm": 1.7086605102377759,
77
+ "learning_rate": 5.521472392638038e-06,
78
+ "loss": 0.9716,
79
  "step": 45
80
  },
81
  {
82
+ "epoch": 0.09225092250922509,
83
+ "grad_norm": 2.202769359683669,
84
+ "learning_rate": 6.134969325153375e-06,
85
+ "loss": 0.9766,
86
  "step": 50
87
  },
88
  {
89
+ "epoch": 0.1014760147601476,
90
+ "grad_norm": 1.6222357117334487,
91
+ "learning_rate": 6.748466257668712e-06,
92
+ "loss": 0.9929,
93
  "step": 55
94
  },
95
  {
96
+ "epoch": 0.11070110701107011,
97
+ "grad_norm": 2.161648398755977,
98
+ "learning_rate": 7.36196319018405e-06,
99
+ "loss": 0.9774,
100
  "step": 60
101
  },
102
  {
103
+ "epoch": 0.11992619926199262,
104
+ "grad_norm": 1.7198404521131392,
105
+ "learning_rate": 7.975460122699386e-06,
106
+ "loss": 0.9743,
107
  "step": 65
108
  },
109
  {
110
+ "epoch": 0.12915129151291513,
111
+ "grad_norm": 2.5936580446065594,
112
+ "learning_rate": 8.588957055214725e-06,
113
+ "loss": 0.9878,
114
  "step": 70
115
  },
116
  {
117
+ "epoch": 0.13837638376383765,
118
+ "grad_norm": 2.188257188915145,
119
+ "learning_rate": 9.202453987730062e-06,
120
+ "loss": 0.9568,
121
  "step": 75
122
  },
123
  {
124
+ "epoch": 0.14760147601476015,
125
+ "grad_norm": 1.7531151641523148,
126
+ "learning_rate": 9.8159509202454e-06,
127
+ "loss": 0.9789,
128
  "step": 80
129
  },
130
  {
131
+ "epoch": 0.15682656826568267,
132
+ "grad_norm": 1.8091240872427208,
133
+ "learning_rate": 1.0429447852760737e-05,
134
+ "loss": 0.9678,
135
  "step": 85
136
  },
137
  {
138
+ "epoch": 0.16605166051660517,
139
+ "grad_norm": 2.098514635540621,
140
+ "learning_rate": 1.1042944785276076e-05,
141
+ "loss": 0.9617,
142
  "step": 90
143
  },
144
  {
145
+ "epoch": 0.1752767527675277,
146
+ "grad_norm": 2.4275494428488607,
147
+ "learning_rate": 1.1656441717791411e-05,
148
+ "loss": 0.9676,
149
  "step": 95
150
  },
151
  {
152
+ "epoch": 0.18450184501845018,
153
+ "grad_norm": 2.0637923302738095,
154
+ "learning_rate": 1.226993865030675e-05,
155
+ "loss": 0.9681,
156
+ "step": 100
157
+ },
158
+ {
159
+ "epoch": 0.18450184501845018,
160
+ "eval_loss": 0.9788174629211426,
161
+ "eval_runtime": 515.1712,
162
+ "eval_samples_per_second": 29.796,
163
+ "eval_steps_per_second": 0.116,
164
  "step": 100
165
  },
166
  {
167
+ "epoch": 0.1937269372693727,
168
+ "grad_norm": 2.069416549180579,
169
+ "learning_rate": 1.2883435582822085e-05,
170
+ "loss": 0.9528,
171
  "step": 105
172
  },
173
  {
174
+ "epoch": 0.2029520295202952,
175
+ "grad_norm": 2.2916715973700024,
176
+ "learning_rate": 1.3496932515337424e-05,
177
+ "loss": 0.9696,
178
  "step": 110
179
  },
180
  {
181
+ "epoch": 0.21217712177121772,
182
+ "grad_norm": 2.062468142825091,
183
+ "learning_rate": 1.4110429447852763e-05,
184
+ "loss": 0.9747,
185
  "step": 115
186
  },
187
  {
188
+ "epoch": 0.22140221402214022,
189
+ "grad_norm": 1.7271367882138293,
190
+ "learning_rate": 1.47239263803681e-05,
191
+ "loss": 0.9786,
192
  "step": 120
193
  },
194
  {
195
+ "epoch": 0.23062730627306274,
196
+ "grad_norm": 1.9545058702706481,
197
+ "learning_rate": 1.5337423312883436e-05,
198
+ "loss": 0.9758,
199
  "step": 125
200
  },
201
  {
202
+ "epoch": 0.23985239852398524,
203
+ "grad_norm": 1.9400595646067775,
204
+ "learning_rate": 1.5950920245398772e-05,
205
+ "loss": 0.9829,
206
  "step": 130
207
  },
208
  {
209
+ "epoch": 0.24907749077490776,
210
+ "grad_norm": 1.865861850010034,
211
+ "learning_rate": 1.656441717791411e-05,
212
+ "loss": 0.9915,
213
  "step": 135
214
  },
215
  {
216
+ "epoch": 0.25830258302583026,
217
+ "grad_norm": 1.9529698824708406,
218
+ "learning_rate": 1.717791411042945e-05,
219
+ "loss": 0.9831,
220
  "step": 140
221
  },
222
  {
223
+ "epoch": 0.26752767527675275,
224
+ "grad_norm": 1.8749039852563243,
225
+ "learning_rate": 1.7791411042944788e-05,
226
+ "loss": 0.9842,
227
  "step": 145
228
  },
229
  {
230
+ "epoch": 0.2767527675276753,
231
+ "grad_norm": 1.4867806820095497,
232
+ "learning_rate": 1.8404907975460123e-05,
233
+ "loss": 0.9859,
234
  "step": 150
235
  },
236
  {
237
+ "epoch": 0.2859778597785978,
238
+ "grad_norm": 2.1169911338934644,
239
+ "learning_rate": 1.9018404907975462e-05,
240
+ "loss": 0.9771,
241
  "step": 155
242
  },
243
  {
244
+ "epoch": 0.2952029520295203,
245
+ "grad_norm": 1.5398155481235816,
246
+ "learning_rate": 1.96319018404908e-05,
247
+ "loss": 0.9817,
248
  "step": 160
249
  },
250
  {
251
+ "epoch": 0.3044280442804428,
252
+ "grad_norm": 1.7130250807487832,
253
+ "learning_rate": 1.9999907776750355e-05,
254
+ "loss": 0.9997,
255
  "step": 165
256
  },
257
  {
258
+ "epoch": 0.31365313653136534,
259
+ "grad_norm": 2.1366171045520383,
260
+ "learning_rate": 1.9998870284726968e-05,
261
+ "loss": 1.0004,
262
  "step": 170
263
  },
264
  {
265
+ "epoch": 0.32287822878228783,
266
+ "grad_norm": 2.01400597362679,
267
+ "learning_rate": 1.9996680141616956e-05,
268
+ "loss": 0.9937,
269
  "step": 175
270
  },
271
  {
272
+ "epoch": 0.33210332103321033,
273
+ "grad_norm": 1.925569839756876,
274
+ "learning_rate": 1.9993337599895925e-05,
275
+ "loss": 0.9939,
276
  "step": 180
277
  },
278
  {
279
+ "epoch": 0.3413284132841328,
280
+ "grad_norm": 1.4590251035585917,
281
+ "learning_rate": 1.998884304488584e-05,
282
+ "loss": 0.9982,
283
  "step": 185
284
  },
285
  {
286
+ "epoch": 0.3505535055350554,
287
+ "grad_norm": 1.531094729781709,
288
+ "learning_rate": 1.998319699471061e-05,
289
+ "loss": 0.9925,
290
  "step": 190
291
  },
292
  {
293
+ "epoch": 0.35977859778597787,
294
+ "grad_norm": 1.9624667257758441,
295
+ "learning_rate": 1.997640010023634e-05,
296
+ "loss": 0.9765,
297
  "step": 195
298
  },
299
  {
300
+ "epoch": 0.36900369003690037,
301
+ "grad_norm": 1.6943282928766075,
302
+ "learning_rate": 1.9968453144996345e-05,
303
+ "loss": 0.9962,
304
+ "step": 200
305
+ },
306
+ {
307
+ "epoch": 0.36900369003690037,
308
+ "eval_loss": 1.0030262470245361,
309
+ "eval_runtime": 518.1283,
310
+ "eval_samples_per_second": 29.626,
311
+ "eval_steps_per_second": 0.116,
312
  "step": 200
313
  },
314
  {
315
+ "epoch": 0.37822878228782286,
316
+ "grad_norm": 3.0469311990676857,
317
+ "learning_rate": 1.9959357045100764e-05,
318
+ "loss": 0.9947,
319
  "step": 205
320
  },
321
  {
322
+ "epoch": 0.3874538745387454,
323
+ "grad_norm": 2.2551326104892864,
324
+ "learning_rate": 1.9949112849131005e-05,
325
+ "loss": 1.0023,
326
  "step": 210
327
  },
328
  {
329
+ "epoch": 0.3966789667896679,
330
+ "grad_norm": 1.8771683509279502,
331
+ "learning_rate": 1.993772173801884e-05,
332
+ "loss": 0.9934,
333
  "step": 215
334
  },
335
  {
336
+ "epoch": 0.4059040590405904,
337
+ "grad_norm": 1.8016877222967922,
338
+ "learning_rate": 1.992518502491028e-05,
339
+ "loss": 0.9807,
340
  "step": 220
341
  },
342
  {
343
+ "epoch": 0.4151291512915129,
344
+ "grad_norm": 1.4456497466009737,
345
+ "learning_rate": 1.9911504155014187e-05,
346
+ "loss": 0.9926,
347
  "step": 225
348
  },
349
  {
350
+ "epoch": 0.42435424354243545,
351
+ "grad_norm": 1.5156073716841811,
352
+ "learning_rate": 1.989668070543569e-05,
353
+ "loss": 0.9766,
354
  "step": 230
355
  },
356
  {
357
+ "epoch": 0.43357933579335795,
358
+ "grad_norm": 1.3959824735787207,
359
+ "learning_rate": 1.9880716384994355e-05,
360
+ "loss": 0.9964,
361
  "step": 235
362
  },
363
  {
364
+ "epoch": 0.44280442804428044,
365
+ "grad_norm": 1.4724192694561282,
366
+ "learning_rate": 1.9863613034027224e-05,
367
+ "loss": 0.9942,
368
  "step": 240
369
  },
370
  {
371
+ "epoch": 0.45202952029520294,
372
+ "grad_norm": 2.064409139190994,
373
+ "learning_rate": 1.9845372624176646e-05,
374
+ "loss": 1.0103,
375
  "step": 245
376
  },
377
  {
378
+ "epoch": 0.4612546125461255,
379
+ "grad_norm": 2.190902421105104,
380
+ "learning_rate": 1.982599725816299e-05,
381
+ "loss": 1.0075,
382
  "step": 250
383
  },
384
  {
385
+ "epoch": 0.470479704797048,
386
+ "grad_norm": 1.9443583169417478,
387
+ "learning_rate": 1.9805489169542245e-05,
388
+ "loss": 0.9971,
389
  "step": 255
390
  },
391
  {
392
+ "epoch": 0.4797047970479705,
393
+ "grad_norm": 1.553791831408308,
394
+ "learning_rate": 1.978385072244857e-05,
395
+ "loss": 0.9992,
396
  "step": 260
397
  },
398
  {
399
+ "epoch": 0.488929889298893,
400
+ "grad_norm": 1.4174068635451451,
401
+ "learning_rate": 1.9761084411321706e-05,
402
+ "loss": 0.9793,
403
  "step": 265
404
  },
405
  {
406
+ "epoch": 0.4981549815498155,
407
+ "grad_norm": 1.4969414214930414,
408
+ "learning_rate": 1.9737192860619477e-05,
409
+ "loss": 0.9791,
410
  "step": 270
411
  },
412
  {
413
+ "epoch": 0.507380073800738,
414
+ "grad_norm": 1.4025421975340602,
415
+ "learning_rate": 1.971217882451521e-05,
416
+ "loss": 0.9796,
417
  "step": 275
418
  },
419
  {
420
+ "epoch": 0.5166051660516605,
421
+ "grad_norm": 1.4448369862138994,
422
+ "learning_rate": 1.9686045186580258e-05,
423
+ "loss": 0.9884,
424
  "step": 280
425
  },
426
  {
427
+ "epoch": 0.525830258302583,
428
+ "grad_norm": 2.0639483249182464,
429
+ "learning_rate": 1.9658794959451583e-05,
430
+ "loss": 0.9831,
431
  "step": 285
432
  },
433
  {
434
+ "epoch": 0.5350553505535055,
435
+ "grad_norm": 1.6048970102781592,
436
+ "learning_rate": 1.9630431284484447e-05,
437
+ "loss": 0.9849,
438
  "step": 290
439
  },
440
  {
441
+ "epoch": 0.544280442804428,
442
+ "grad_norm": 1.4540480684938577,
443
+ "learning_rate": 1.960095743139033e-05,
444
+ "loss": 0.9796,
445
  "step": 295
446
  },
447
  {
448
+ "epoch": 0.5535055350553506,
449
+ "grad_norm": 1.424947900669971,
450
+ "learning_rate": 1.957037679785994e-05,
451
+ "loss": 0.9917,
452
+ "step": 300
453
+ },
454
+ {
+ "epoch": 0.5535055350553506,
+ "eval_loss": 1.0008341073989868,
+ "eval_runtime": 513.1068,
+ "eval_samples_per_second": 29.916,
+ "eval_steps_per_second": 0.117,
  "step": 300
  },
462
  {
463
+ "epoch": 0.5627306273062731,
464
+ "grad_norm": 1.2480517696242786,
465
+ "learning_rate": 1.953869290917158e-05,
466
+ "loss": 0.9943,
467
  "step": 305
468
  },
469
  {
470
+ "epoch": 0.5719557195571956,
471
+ "grad_norm": 1.191133450390735,
472
+ "learning_rate": 1.9505909417784758e-05,
473
+ "loss": 0.9899,
474
  "step": 310
475
  },
476
  {
477
+ "epoch": 0.5811808118081181,
478
+ "grad_norm": 1.1766418475997753,
479
+ "learning_rate": 1.9472030102919102e-05,
480
+ "loss": 0.9883,
481
  "step": 315
482
  },
483
  {
484
+ "epoch": 0.5904059040590406,
485
+ "grad_norm": 1.2121897211885717,
486
+ "learning_rate": 1.9437058870118745e-05,
487
+ "loss": 1.0037,
488
  "step": 320
489
  },
490
  {
491
+ "epoch": 0.5996309963099631,
492
+ "grad_norm": 1.2903187102851559,
493
+ "learning_rate": 1.940099975080207e-05,
494
+ "loss": 0.9892,
495
  "step": 325
496
  },
497
  {
498
+ "epoch": 0.6088560885608856,
499
+ "grad_norm": 1.4260318993897811,
500
+ "learning_rate": 1.9363856901796984e-05,
501
+ "loss": 0.9896,
502
  "step": 330
503
  },
504
  {
505
+ "epoch": 0.6180811808118081,
506
+ "grad_norm": 1.324489901337969,
507
+ "learning_rate": 1.9325634604861728e-05,
508
+ "loss": 0.9978,
509
  "step": 335
510
  },
511
  {
512
+ "epoch": 0.6273062730627307,
513
+ "grad_norm": 1.275426852454915,
514
+ "learning_rate": 1.9286337266191295e-05,
515
+ "loss": 0.993,
516
  "step": 340
517
  },
518
  {
519
+ "epoch": 0.6365313653136532,
520
+ "grad_norm": 1.329213272796139,
521
+ "learning_rate": 1.9245969415909464e-05,
522
+ "loss": 0.9879,
523
  "step": 345
524
  },
525
  {
526
+ "epoch": 0.6457564575645757,
527
+ "grad_norm": 1.4085398606096227,
528
+ "learning_rate": 1.9204535707546602e-05,
529
+ "loss": 0.9869,
530
  "step": 350
531
  },
532
  {
533
+ "epoch": 0.6549815498154982,
534
+ "grad_norm": 1.1848936755869721,
535
+ "learning_rate": 1.916204091750321e-05,
536
+ "loss": 0.9726,
537
  "step": 355
538
  },
539
  {
540
+ "epoch": 0.6642066420664207,
541
+ "grad_norm": 1.2968309154541056,
542
+ "learning_rate": 1.9118489944499287e-05,
543
+ "loss": 0.9902,
544
  "step": 360
545
  },
546
  {
547
+ "epoch": 0.6734317343173432,
548
+ "grad_norm": 1.2286246913756114,
549
+ "learning_rate": 1.907388780900964e-05,
550
+ "loss": 0.9811,
551
  "step": 365
552
  },
553
  {
554
+ "epoch": 0.6826568265682657,
555
+ "grad_norm": 1.2591567733071325,
556
+ "learning_rate": 1.902823965268513e-05,
557
+ "loss": 0.9858,
558
  "step": 370
559
  },
560
  {
561
+ "epoch": 0.6918819188191881,
562
+ "grad_norm": 1.4378514619406175,
563
+ "learning_rate": 1.8981550737759932e-05,
564
+ "loss": 0.9828,
565
  "step": 375
566
  },
567
  {
568
+ "epoch": 0.7011070110701108,
569
+ "grad_norm": 1.497308547977116,
570
+ "learning_rate": 1.8933826446444933e-05,
571
+ "loss": 0.9892,
572
  "step": 380
573
  },
574
  {
575
+ "epoch": 0.7103321033210332,
576
+ "grad_norm": 1.1745393096620436,
577
+ "learning_rate": 1.888507228030729e-05,
578
+ "loss": 0.9859,
579
  "step": 385
580
  },
581
  {
582
+ "epoch": 0.7195571955719557,
583
+ "grad_norm": 1.2233160586824499,
584
+ "learning_rate": 1.8835293859636177e-05,
585
+ "loss": 0.9763,
586
  "step": 390
587
  },
588
  {
589
+ "epoch": 0.7287822878228782,
590
+ "grad_norm": 1.3127902536541989,
591
+ "learning_rate": 1.8784496922794947e-05,
592
+ "loss": 0.981,
593
  "step": 395
594
  },
595
  {
596
+ "epoch": 0.7380073800738007,
597
+ "grad_norm": 1.3089866347753676,
598
+ "learning_rate": 1.873268732555957e-05,
599
+ "loss": 0.9652,
600
+ "step": 400
601
+ },
602
+ {
+ "epoch": 0.7380073800738007,
+ "eval_loss": 0.993894636631012,
+ "eval_runtime": 513.7147,
+ "eval_samples_per_second": 29.88,
+ "eval_steps_per_second": 0.117,
  "step": 400
  },
610
  {
611
+ "epoch": 0.7472324723247232,
612
+ "grad_norm": 1.3088189271034285,
613
+ "learning_rate": 1.8679871040443632e-05,
614
+ "loss": 1.0048,
615
  "step": 405
616
  },
617
  {
618
+ "epoch": 0.7564575645756457,
619
+ "grad_norm": 1.2954577066196238,
620
+ "learning_rate": 1.8626054156009807e-05,
621
+ "loss": 0.9927,
622
  "step": 410
623
  },
624
  {
625
+ "epoch": 0.7656826568265682,
626
+ "grad_norm": 1.317981053662398,
627
+ "learning_rate": 1.8571242876167995e-05,
628
+ "loss": 0.9752,
629
  "step": 415
630
  },
631
  {
632
+ "epoch": 0.7749077490774908,
633
+ "grad_norm": 1.4156756831610378,
634
+ "learning_rate": 1.851544351946014e-05,
635
+ "loss": 0.9945,
636
  "step": 420
637
  },
638
  {
639
+ "epoch": 0.7841328413284133,
640
+ "grad_norm": 1.1285773664771428,
641
+ "learning_rate": 1.845866251833183e-05,
642
+ "loss": 0.9708,
643
  "step": 425
644
  },
645
  {
646
+ "epoch": 0.7933579335793358,
647
+ "grad_norm": 1.2640468813011223,
648
+ "learning_rate": 1.8400906418390808e-05,
649
+ "loss": 0.9757,
650
  "step": 430
651
  },
652
  {
653
+ "epoch": 0.8025830258302583,
654
+ "grad_norm": 1.288546177133416,
655
+ "learning_rate": 1.834218187765237e-05,
656
+ "loss": 0.976,
657
  "step": 435
658
  },
659
  {
660
+ "epoch": 0.8118081180811808,
661
+ "grad_norm": 1.3086160465192265,
662
+ "learning_rate": 1.8282495665771864e-05,
663
+ "loss": 0.9761,
664
  "step": 440
665
  },
666
  {
667
+ "epoch": 0.8210332103321033,
668
+ "grad_norm": 1.1919282548241303,
669
+ "learning_rate": 1.8221854663264294e-05,
670
+ "loss": 0.9718,
671
  "step": 445
672
  },
673
  {
674
+ "epoch": 0.8302583025830258,
675
+ "grad_norm": 1.2454331164701038,
676
+ "learning_rate": 1.8160265860711134e-05,
677
+ "loss": 0.9842,
678
  "step": 450
679
  },
680
  {
681
+ "epoch": 0.8394833948339483,
682
+ "grad_norm": 1.183454477783249,
683
+ "learning_rate": 1.8097736357954487e-05,
684
+ "loss": 0.9705,
685
  "step": 455
686
  },
687
  {
688
+ "epoch": 0.8487084870848709,
689
+ "grad_norm": 1.1394535207411802,
690
+ "learning_rate": 1.8034273363278615e-05,
691
+ "loss": 0.9751,
692
  "step": 460
693
  },
694
  {
695
+ "epoch": 0.8579335793357934,
696
+ "grad_norm": 1.1866949984179949,
697
+ "learning_rate": 1.7969884192578977e-05,
698
+ "loss": 0.9749,
699
  "step": 465
700
  },
701
  {
702
+ "epoch": 0.8671586715867159,
703
+ "grad_norm": 1.299660479182102,
704
+ "learning_rate": 1.7904576268518886e-05,
705
+ "loss": 0.9598,
706
  "step": 470
707
  },
708
  {
709
+ "epoch": 0.8763837638376384,
710
+ "grad_norm": 1.2221383874437446,
711
+ "learning_rate": 1.783835711967382e-05,
712
+ "loss": 0.9842,
713
  "step": 475
714
  },
715
  {
716
+ "epoch": 0.8856088560885609,
717
+ "grad_norm": 1.2535423952991984,
718
+ "learning_rate": 1.7771234379663545e-05,
719
+ "loss": 0.9641,
720
  "step": 480
721
  },
722
  {
723
+ "epoch": 0.8948339483394834,
724
+ "grad_norm": 1.4654400132426395,
725
+ "learning_rate": 1.770321578627213e-05,
726
+ "loss": 0.9784,
727
  "step": 485
728
  },
729
  {
730
+ "epoch": 0.9040590405904059,
731
+ "grad_norm": 1.3747052246285973,
732
+ "learning_rate": 1.763430918055595e-05,
733
+ "loss": 0.9694,
734
  "step": 490
735
  },
736
  {
737
+ "epoch": 0.9132841328413284,
738
+ "grad_norm": 1.1551950486505687,
739
+ "learning_rate": 1.756452250593979e-05,
740
+ "loss": 0.9727,
741
  "step": 495
742
  },
743
  {
744
+ "epoch": 0.922509225092251,
745
+ "grad_norm": 1.128236535385729,
746
+ "learning_rate": 1.7493863807301116e-05,
747
+ "loss": 0.9666,
748
  "step": 500
749
  },
750
  {
+ "epoch": 0.922509225092251,
+ "eval_loss": 0.9816026091575623,
+ "eval_runtime": 517.2137,
+ "eval_samples_per_second": 29.678,
+ "eval_steps_per_second": 0.116,
+ "step": 500
+ },
758
+ {
759
+ "epoch": 0.9317343173431735,
760
+ "grad_norm": 1.230218009681161,
761
+ "learning_rate": 1.74223412300427e-05,
762
+ "loss": 0.9769,
763
  "step": 505
764
  },
765
  {
766
+ "epoch": 0.940959409594096,
767
+ "grad_norm": 1.1847589898088133,
768
+ "learning_rate": 1.7349963019153638e-05,
769
+ "loss": 0.9628,
770
  "step": 510
771
  },
772
  {
773
+ "epoch": 0.9501845018450185,
774
+ "grad_norm": 1.2246308831747907,
775
+ "learning_rate": 1.7276737518258865e-05,
776
+ "loss": 0.9602,
777
  "step": 515
778
  },
779
  {
780
+ "epoch": 0.959409594095941,
781
+ "grad_norm": 1.1390750572317663,
782
+ "learning_rate": 1.7202673168657318e-05,
783
+ "loss": 0.9627,
784
  "step": 520
785
  },
786
  {
787
+ "epoch": 0.9686346863468634,
788
+ "grad_norm": 1.1728205351456946,
789
+ "learning_rate": 1.7127778508348858e-05,
790
+ "loss": 0.9714,
791
  "step": 525
792
  },
793
  {
794
+ "epoch": 0.977859778597786,
795
+ "grad_norm": 1.2796699310011739,
796
+ "learning_rate": 1.7052062171050008e-05,
797
+ "loss": 0.967,
798
  "step": 530
799
  },
800
  {
801
+ "epoch": 0.9870848708487084,
802
+ "grad_norm": 1.1205342517216532,
803
+ "learning_rate": 1.6975532885198678e-05,
804
+ "loss": 0.9663,
805
  "step": 535
806
  },
807
  {
808
+ "epoch": 0.996309963099631,
809
+ "grad_norm": 1.185279277131673,
810
+ "learning_rate": 1.6898199472947972e-05,
811
+ "loss": 0.9581,
812
  "step": 540
813
  },
814
  {
815
+ "epoch": 1.0055350553505535,
816
+ "grad_norm": 3.007398366081561,
817
+ "learning_rate": 1.6820070849149174e-05,
818
+ "loss": 0.8519,
819
  "step": 545
820
  },
821
  {
822
+ "epoch": 1.014760147601476,
823
+ "grad_norm": 2.1038299784593337,
824
+ "learning_rate": 1.6741156020324086e-05,
825
+ "loss": 0.7509,
826
  "step": 550
827
  },
828
  {
829
+ "epoch": 1.0239852398523985,
830
+ "grad_norm": 1.5701183943228265,
831
+ "learning_rate": 1.6661464083626734e-05,
832
+ "loss": 0.7453,
833
  "step": 555
834
  },
835
  {
836
+ "epoch": 1.033210332103321,
837
+ "grad_norm": 1.2911074026361753,
838
+ "learning_rate": 1.6581004225794715e-05,
839
+ "loss": 0.7391,
840
  "step": 560
841
  },
842
  {
843
+ "epoch": 1.0424354243542435,
844
+ "grad_norm": 1.5938907876285198,
845
+ "learning_rate": 1.649978572209012e-05,
846
+ "loss": 0.7347,
847
  "step": 565
848
  },
849
  {
850
+ "epoch": 1.051660516605166,
851
+ "grad_norm": 1.3495506131008623,
852
+ "learning_rate": 1.6417817935230318e-05,
853
+ "loss": 0.7396,
854
  "step": 570
855
  },
856
  {
857
+ "epoch": 1.0608856088560885,
858
+ "grad_norm": 1.2781771587882627,
859
+ "learning_rate": 1.6335110314308654e-05,
860
+ "loss": 0.7305,
861
  "step": 575
862
  },
863
  {
864
+ "epoch": 1.070110701107011,
865
+ "grad_norm": 1.5798733908227265,
866
+ "learning_rate": 1.6251672393705155e-05,
867
+ "loss": 0.7365,
868
  "step": 580
869
  },
870
  {
871
+ "epoch": 1.0793357933579335,
872
+ "grad_norm": 1.416304183876239,
873
+ "learning_rate": 1.6167513791987423e-05,
874
+ "loss": 0.7373,
875
  "step": 585
876
  },
877
  {
878
+ "epoch": 1.088560885608856,
879
+ "grad_norm": 1.3677150489575043,
880
+ "learning_rate": 1.6082644210801846e-05,
881
+ "loss": 0.7299,
882
  "step": 590
883
  },
884
  {
885
+ "epoch": 1.0977859778597785,
886
+ "grad_norm": 1.3506677105351055,
887
+ "learning_rate": 1.5997073433755187e-05,
888
+ "loss": 0.7426,
889
  "step": 595
890
  },
891
  {
892
+ "epoch": 1.1070110701107012,
893
+ "grad_norm": 1.461155474048458,
894
+ "learning_rate": 1.5910811325286768e-05,
895
+ "loss": 0.7366,
896
  "step": 600
897
  },
898
  {
+ "epoch": 1.1070110701107012,
+ "eval_loss": 0.9852360486984253,
+ "eval_runtime": 516.2338,
+ "eval_samples_per_second": 29.735,
+ "eval_steps_per_second": 0.116,
+ "step": 600
+ },
906
+ {
907
+ "epoch": 1.1162361623616237,
908
+ "grad_norm": 1.2999195127889172,
909
+ "learning_rate": 1.582386782953129e-05,
910
+ "loss": 0.7351,
911
  "step": 605
912
  },
913
  {
914
+ "epoch": 1.1254612546125462,
915
+ "grad_norm": 1.5599221554130673,
916
+ "learning_rate": 1.5736252969172522e-05,
917
+ "loss": 0.7335,
918
  "step": 610
919
  },
920
  {
921
+ "epoch": 1.1346863468634687,
922
+ "grad_norm": 1.30824219510555,
923
+ "learning_rate": 1.5647976844287884e-05,
924
+ "loss": 0.7321,
925
  "step": 615
926
  },
927
  {
928
+ "epoch": 1.1439114391143912,
929
+ "grad_norm": 1.3590431139669035,
930
+ "learning_rate": 1.5559049631184136e-05,
931
+ "loss": 0.7294,
932
  "step": 620
933
  },
934
  {
935
+ "epoch": 1.1531365313653137,
936
+ "grad_norm": 1.5685872513743657,
937
+ "learning_rate": 1.5469481581224274e-05,
938
+ "loss": 0.7372,
939
  "step": 625
940
  },
941
  {
942
+ "epoch": 1.1623616236162362,
943
+ "grad_norm": 1.4194329169102744,
944
+ "learning_rate": 1.5379283019645757e-05,
945
+ "loss": 0.7423,
946
  "step": 630
947
  },
948
  {
949
+ "epoch": 1.1715867158671587,
950
+ "grad_norm": 1.8516238628155155,
951
+ "learning_rate": 1.5288464344370267e-05,
952
+ "loss": 0.7389,
953
  "step": 635
954
  },
955
  {
956
+ "epoch": 1.1808118081180812,
957
+ "grad_norm": 1.3787465939384576,
958
+ "learning_rate": 1.5197036024805018e-05,
959
+ "loss": 0.7277,
960
  "step": 640
961
  },
962
  {
963
+ "epoch": 1.1900369003690037,
964
+ "grad_norm": 1.2679935699299498,
965
+ "learning_rate": 1.5105008600635888e-05,
966
+ "loss": 0.7251,
967
  "step": 645
968
  },
969
  {
970
+ "epoch": 1.1992619926199262,
971
+ "grad_norm": 1.3661565990701046,
972
+ "learning_rate": 1.5012392680612408e-05,
973
+ "loss": 0.7348,
974
  "step": 650
975
  },
976
  {
977
+ "epoch": 1.2084870848708487,
978
+ "grad_norm": 1.380476117633752,
979
+ "learning_rate": 1.4919198941324813e-05,
980
+ "loss": 0.733,
981
  "step": 655
982
  },
983
  {
984
+ "epoch": 1.2177121771217712,
985
+ "grad_norm": 1.301175007422796,
986
+ "learning_rate": 1.4825438125973263e-05,
987
+ "loss": 0.7331,
988
  "step": 660
989
  },
990
  {
991
+ "epoch": 1.2269372693726937,
992
+ "grad_norm": 1.3531205842843421,
993
+ "learning_rate": 1.4731121043129392e-05,
994
+ "loss": 0.7379,
995
  "step": 665
996
  },
997
  {
998
+ "epoch": 1.2361623616236161,
999
+ "grad_norm": 1.444864127952419,
1000
+ "learning_rate": 1.4636258565490304e-05,
1001
+ "loss": 0.739,
1002
  "step": 670
1003
  },
1004
  {
1005
+ "epoch": 1.2453874538745389,
1006
+ "grad_norm": 1.2863648775710423,
1007
+ "learning_rate": 1.4540861628625207e-05,
1008
+ "loss": 0.7368,
1009
  "step": 675
1010
  },
1011
  {
1012
+ "epoch": 1.2546125461254611,
1013
+ "grad_norm": 1.2200332099647682,
1014
+ "learning_rate": 1.444494122971476e-05,
1015
+ "loss": 0.7343,
1016
  "step": 680
1017
  },
1018
  {
1019
+ "epoch": 1.2638376383763839,
1020
+ "grad_norm": 1.3714375121406106,
1021
+ "learning_rate": 1.4348508426283342e-05,
1022
+ "loss": 0.7391,
1023
  "step": 685
1024
  },
1025
  {
1026
+ "epoch": 1.2730627306273063,
1027
+ "grad_norm": 1.2638691361743832,
1028
+ "learning_rate": 1.4251574334924395e-05,
1029
+ "loss": 0.7397,
1030
  "step": 690
1031
  },
1032
  {
1033
+ "epoch": 1.2822878228782288,
1034
+ "grad_norm": 1.4011111864399106,
1035
+ "learning_rate": 1.4154150130018867e-05,
1036
+ "loss": 0.7374,
1037
  "step": 695
1038
  },
1039
  {
1040
+ "epoch": 1.2915129151291513,
1041
+ "grad_norm": 1.2912923761278596,
1042
+ "learning_rate": 1.4056247042447096e-05,
1043
+ "loss": 0.7228,
1044
+ "step": 700
1045
+ },
1046
+ {
+ "epoch": 1.2915129151291513,
+ "eval_loss": 0.9835454225540161,
+ "eval_runtime": 517.9285,
+ "eval_samples_per_second": 29.637,
+ "eval_steps_per_second": 0.116,
  "step": 700
  },
1054
  {
1055
+ "epoch": 1.3007380073800738,
1056
+ "grad_norm": 1.5854901671726367,
1057
+ "learning_rate": 1.3957876358294115e-05,
1058
+ "loss": 0.7296,
1059
  "step": 705
1060
  },
1061
  {
1062
+ "epoch": 1.3099630996309963,
1063
+ "grad_norm": 1.38846996136312,
1064
+ "learning_rate": 1.385904941754862e-05,
1065
+ "loss": 0.7257,
1066
  "step": 710
1067
  },
1068
  {
1069
+ "epoch": 1.3191881918819188,
1070
+ "grad_norm": 1.5297133474564781,
1071
+ "learning_rate": 1.375977761279571e-05,
1072
+ "loss": 0.7352,
1073
  "step": 715
1074
  },
1075
  {
1076
+ "epoch": 1.3284132841328413,
1077
+ "grad_norm": 1.287259224142701,
1078
+ "learning_rate": 1.366007238790358e-05,
1079
+ "loss": 0.7301,
1080
  "step": 720
1081
  },
1082
  {
1083
+ "epoch": 1.3376383763837638,
1084
+ "grad_norm": 1.2884194224179173,
1085
+ "learning_rate": 1.3559945236704286e-05,
1086
+ "loss": 0.7383,
1087
  "step": 725
1088
  },
1089
  {
1090
+ "epoch": 1.3468634686346863,
1091
+ "grad_norm": 1.3779553004575515,
1092
+ "learning_rate": 1.3459407701668762e-05,
1093
+ "loss": 0.7313,
1094
  "step": 730
1095
  },
1096
  {
1097
+ "epoch": 1.3560885608856088,
1098
+ "grad_norm": 1.5349656095564503,
1099
+ "learning_rate": 1.3358471372576229e-05,
1100
+ "loss": 0.7334,
1101
  "step": 735
1102
  },
1103
  {
1104
+ "epoch": 1.3653136531365313,
1105
+ "grad_norm": 1.3570612666553503,
1106
+ "learning_rate": 1.3257147885178125e-05,
1107
+ "loss": 0.7253,
1108
  "step": 740
1109
  },
1110
  {
1111
+ "epoch": 1.3745387453874538,
1112
+ "grad_norm": 1.3514442377769267,
1113
+ "learning_rate": 1.3155448919856792e-05,
1114
+ "loss": 0.7375,
1115
  "step": 745
1116
  },
1117
  {
1118
+ "epoch": 1.3837638376383765,
1119
+ "grad_norm": 1.338752928401098,
1120
+ "learning_rate": 1.3053386200278963e-05,
1121
+ "loss": 0.7349,
1122
  "step": 750
1123
  },
1124
  {
1125
+ "epoch": 1.3929889298892988,
1126
+ "grad_norm": 1.3943704063449442,
1127
+ "learning_rate": 1.2950971492044272e-05,
1128
+ "loss": 0.7338,
1129
  "step": 755
1130
  },
1131
  {
1132
+ "epoch": 1.4022140221402215,
1133
+ "grad_norm": 1.3567491078204894,
1134
+ "learning_rate": 1.2848216601328958e-05,
1135
+ "loss": 0.7385,
1136
  "step": 760
1137
  },
1138
  {
1139
+ "epoch": 1.4114391143911438,
1140
+ "grad_norm": 1.2556919848553412,
1141
+ "learning_rate": 1.2745133373524855e-05,
1142
+ "loss": 0.7457,
1143
  "step": 765
1144
  },
1145
  {
1146
+ "epoch": 1.4206642066420665,
1147
+ "grad_norm": 1.3027608934231716,
1148
+ "learning_rate": 1.2641733691873884e-05,
1149
+ "loss": 0.7342,
1150
  "step": 770
1151
  },
1152
  {
1153
+ "epoch": 1.429889298892989,
1154
+ "grad_norm": 1.2668132369825373,
1155
+ "learning_rate": 1.2538029476098175e-05,
1156
+ "loss": 0.7317,
1157
  "step": 775
1158
  },
1159
  {
1160
+ "epoch": 1.4391143911439115,
1161
+ "grad_norm": 1.2498842281077402,
1162
+ "learning_rate": 1.2434032681025986e-05,
1163
+ "loss": 0.732,
1164
  "step": 780
1165
  },
1166
  {
1167
+ "epoch": 1.448339483394834,
1168
+ "grad_norm": 1.221148464370588,
1169
+ "learning_rate": 1.2329755295213568e-05,
1170
+ "loss": 0.7168,
1171
  "step": 785
1172
  },
1173
  {
1174
+ "epoch": 1.4575645756457565,
1175
+ "grad_norm": 1.2029873246463332,
1176
+ "learning_rate": 1.2225209339563144e-05,
1177
+ "loss": 0.7299,
1178
  "step": 790
1179
  },
1180
  {
1181
+ "epoch": 1.466789667896679,
1182
+ "grad_norm": 1.2769506053242343,
1183
+ "learning_rate": 1.2120406865937174e-05,
1184
+ "loss": 0.7385,
1185
  "step": 795
1186
  },
1187
  {
1188
+ "epoch": 1.4760147601476015,
1189
+ "grad_norm": 1.5254063393209267,
1190
+ "learning_rate": 1.2015359955769021e-05,
1191
+ "loss": 0.7319,
1192
+ "step": 800
1193
+ },
1194
+ {
+ "epoch": 1.4760147601476015,
+ "eval_loss": 0.9644125699996948,
+ "eval_runtime": 512.8317,
+ "eval_samples_per_second": 29.932,
+ "eval_steps_per_second": 0.117,
  "step": 800
  },
1202
  {
1203
+ "epoch": 1.485239852398524,
1204
+ "grad_norm": 1.4657220418578245,
1205
+ "learning_rate": 1.1910080718670246e-05,
1206
+ "loss": 0.7234,
1207
  "step": 805
1208
  },
1209
  {
1210
+ "epoch": 1.4944649446494465,
1211
+ "grad_norm": 1.3333083489866098,
1212
+ "learning_rate": 1.1804581291034615e-05,
1213
+ "loss": 0.7314,
1214
  "step": 810
1215
  },
1216
  {
1217
+ "epoch": 1.503690036900369,
1218
+ "grad_norm": 1.3111534531304956,
1219
+ "learning_rate": 1.169887383463906e-05,
1220
+ "loss": 0.7212,
1221
  "step": 815
1222
  },
1223
  {
1224
+ "epoch": 1.5129151291512914,
1225
+ "grad_norm": 1.2536260067392955,
1226
+ "learning_rate": 1.1592970535241668e-05,
1227
+ "loss": 0.723,
1228
  "step": 820
1229
  },
1230
  {
1231
+ "epoch": 1.5221402214022142,
1232
+ "grad_norm": 1.239943596383526,
1233
+ "learning_rate": 1.1486883601176944e-05,
1234
+ "loss": 0.7315,
1235
  "step": 825
1236
  },
1237
  {
1238
+ "epoch": 1.5313653136531364,
1239
+ "grad_norm": 1.188861248391431,
1240
+ "learning_rate": 1.1380625261948458e-05,
1241
+ "loss": 0.7301,
1242
  "step": 830
1243
  },
1244
  {
1245
+ "epoch": 1.5405904059040592,
1246
+ "grad_norm": 1.247650108627454,
1247
+ "learning_rate": 1.127420776681905e-05,
1248
+ "loss": 0.7202,
1249
  "step": 835
1250
  },
1251
  {
1252
+ "epoch": 1.5498154981549814,
1253
+ "grad_norm": 1.4048683840262912,
1254
+ "learning_rate": 1.1167643383398746e-05,
1255
+ "loss": 0.7247,
1256
  "step": 840
1257
  },
1258
  {
1259
+ "epoch": 1.5590405904059041,
1260
+ "grad_norm": 1.2897015340446114,
1261
+ "learning_rate": 1.1060944396230583e-05,
1262
+ "loss": 0.7311,
1263
  "step": 845
1264
  },
1265
  {
1266
+ "epoch": 1.5682656826568264,
1267
+ "grad_norm": 1.21939417183643,
1268
+ "learning_rate": 1.0954123105374468e-05,
1269
+ "loss": 0.7249,
1270
  "step": 850
1271
  },
1272
  {
1273
+ "epoch": 1.5774907749077491,
1274
+ "grad_norm": 1.2309319468475195,
1275
+ "learning_rate": 1.0847191824989252e-05,
1276
+ "loss": 0.7298,
1277
  "step": 855
1278
  },
1279
  {
1280
+ "epoch": 1.5867158671586716,
1281
+ "grad_norm": 1.2218109998078897,
1282
+ "learning_rate": 1.0740162881913165e-05,
1283
+ "loss": 0.7223,
1284
  "step": 860
1285
  },
1286
  {
1287
+ "epoch": 1.5959409594095941,
1288
+ "grad_norm": 1.4183791452745522,
1289
+ "learning_rate": 1.0633048614242817e-05,
1290
+ "loss": 0.7359,
1291
  "step": 865
1292
  },
1293
  {
1294
+ "epoch": 1.6051660516605166,
1295
+ "grad_norm": 1.2210289040303786,
1296
+ "learning_rate": 1.0525861369910877e-05,
1297
+ "loss": 0.7302,
1298
  "step": 870
1299
  },
1300
  {
1301
+ "epoch": 1.6143911439114391,
1302
+ "grad_norm": 1.3175608261808258,
1303
+ "learning_rate": 1.0418613505262623e-05,
1304
+ "loss": 0.7226,
1305
  "step": 875
1306
  },
1307
  {
1308
+ "epoch": 1.6236162361623616,
1309
+ "grad_norm": 1.3018239201611663,
1310
+ "learning_rate": 1.0311317383631532e-05,
1311
+ "loss": 0.7227,
1312
  "step": 880
1313
  },
1314
  {
1315
+ "epoch": 1.632841328413284,
1316
+ "grad_norm": 1.1647552351758403,
1317
+ "learning_rate": 1.0203985373914056e-05,
1318
+ "loss": 0.7204,
1319
  "step": 885
1320
  },
1321
  {
1322
+ "epoch": 1.6420664206642066,
1323
+ "grad_norm": 1.210717925144679,
1324
+ "learning_rate": 1.0096629849143757e-05,
1325
+ "loss": 0.7115,
1326
  "step": 890
1327
  },
1328
  {
1329
+ "epoch": 1.651291512915129,
1330
+ "grad_norm": 1.1959081633999162,
1331
+ "learning_rate": 9.989263185064974e-06,
1332
+ "loss": 0.7164,
1333
  "step": 895
1334
  },
1335
  {
1336
+ "epoch": 1.6605166051660518,
1337
+ "grad_norm": 1.1679984043624778,
1338
+ "learning_rate": 9.881897758706155e-06,
1339
+ "loss": 0.7177,
1340
  "step": 900
1341
  },
1342
  {
+ "epoch": 1.6605166051660518,
+ "eval_loss": 0.9529369473457336,
+ "eval_runtime": 516.4151,
+ "eval_samples_per_second": 29.724,
+ "eval_steps_per_second": 0.116,
+ "step": 900
+ },
1350
+ {
1351
+ "epoch": 1.669741697416974,
1352
+ "grad_norm": 1.1784785526719634,
1353
+ "learning_rate": 9.77454594695308e-06,
1354
+ "loss": 0.7274,
1355
  "step": 905
1356
  },
1357
  {
1358
+ "epoch": 1.6789667896678968,
1359
+ "grad_norm": 1.1964871209199903,
1360
+ "learning_rate": 9.667220125122044e-06,
1361
+ "loss": 0.7119,
1362
  "step": 910
1363
  },
1364
  {
1365
+ "epoch": 1.688191881918819,
1366
+ "grad_norm": 1.173031357661576,
1367
+ "learning_rate": 9.559932665533291e-06,
1368
+ "loss": 0.7134,
1369
  "step": 915
1370
  },
1371
  {
1372
+ "epoch": 1.6974169741697418,
1373
+ "grad_norm": 1.2312863536042935,
1374
+ "learning_rate": 9.452695936084728e-06,
1375
+ "loss": 0.7144,
1376
  "step": 920
1377
  },
1378
  {
1379
+ "epoch": 1.706642066420664,
1380
+ "grad_norm": 1.2013984113686338,
1381
+ "learning_rate": 9.345522298826177e-06,
1382
+ "loss": 0.7146,
1383
  "step": 925
1384
  },
1385
  {
1386
+ "epoch": 1.7158671586715868,
1387
+ "grad_norm": 1.1285995450468198,
1388
+ "learning_rate": 9.238424108534333e-06,
1389
+ "loss": 0.7126,
1390
  "step": 930
1391
  },
1392
  {
1393
+ "epoch": 1.725092250922509,
1394
+ "grad_norm": 1.1727971825533714,
1395
+ "learning_rate": 9.131413711288485e-06,
1396
+ "loss": 0.7173,
1397
  "step": 935
1398
  },
1399
  {
1400
+ "epoch": 1.7343173431734318,
1401
+ "grad_norm": 1.198238879588798,
1402
+ "learning_rate": 9.024503443047318e-06,
1403
+ "loss": 0.7186,
1404
  "step": 940
1405
  },
1406
  {
1407
+ "epoch": 1.7435424354243543,
1408
+ "grad_norm": 1.2092538734459182,
1409
+ "learning_rate": 8.917705628226823e-06,
1410
+ "loss": 0.7064,
1411
  "step": 945
1412
  },
1413
  {
1414
+ "epoch": 1.7527675276752768,
1415
+ "grad_norm": 1.1850959753551464,
1416
+ "learning_rate": 8.81103257827957e-06,
1417
+ "loss": 0.7196,
1418
  "step": 950
1419
  },
1420
  {
1421
+ "epoch": 1.7619926199261993,
1422
+ "grad_norm": 1.1849846233150378,
1423
+ "learning_rate": 8.704496590275479e-06,
1424
+ "loss": 0.7181,
1425
  "step": 955
1426
  },
1427
  {
1428
+ "epoch": 1.7712177121771218,
1429
+ "grad_norm": 1.1192440025321218,
1430
+ "learning_rate": 8.598109945484208e-06,
1431
+ "loss": 0.7127,
1432
  "step": 960
1433
  },
1434
  {
1435
+ "epoch": 1.7804428044280443,
1436
+ "grad_norm": 1.185810311236685,
1437
+ "learning_rate": 8.491884907959426e-06,
1438
+ "loss": 0.7092,
1439
  "step": 965
1440
  },
1441
  {
1442
+ "epoch": 1.7896678966789668,
1443
+ "grad_norm": 1.1653670987242044,
1444
+ "learning_rate": 8.385833723125006e-06,
1445
+ "loss": 0.7115,
1446
  "step": 970
1447
  },
1448
  {
1449
+ "epoch": 1.7988929889298892,
1450
+ "grad_norm": 1.2928934171032893,
1451
+ "learning_rate": 8.279968616363417e-06,
1452
+ "loss": 0.7116,
1453
  "step": 975
1454
  },
1455
  {
1456
+ "epoch": 1.8081180811808117,
1457
+ "grad_norm": 1.1749460908752425,
1458
+ "learning_rate": 8.174301791606384e-06,
1459
+ "loss": 0.7159,
1460
  "step": 980
1461
  },
1462
  {
1463
+ "epoch": 1.8173431734317345,
1464
+ "grad_norm": 1.2968530721458553,
1465
+ "learning_rate": 8.06884542992806e-06,
1466
+ "loss": 0.7022,
1467
  "step": 985
1468
  },
1469
  {
1470
+ "epoch": 1.8265682656826567,
1471
+ "grad_norm": 1.214409149915767,
1472
+ "learning_rate": 7.963611688140814e-06,
1473
+ "loss": 0.705,
1474
  "step": 990
1475
  },
1476
  {
1477
+ "epoch": 1.8357933579335795,
1478
+ "grad_norm": 1.1751136227927774,
1479
+ "learning_rate": 7.858612697393792e-06,
1480
+ "loss": 0.7166,
1481
  "step": 995
1482
  },
1483
  {
1484
+ "epoch": 1.8450184501845017,
1485
+ "grad_norm": 1.2707314516132002,
1486
+ "learning_rate": 7.753860561774495e-06,
1487
+ "loss": 0.7095,
1488
+ "step": 1000
1489
+ },
1490
+ {
+ "epoch": 1.8450184501845017,
+ "eval_loss": 0.9393758773803711,
+ "eval_runtime": 524.5955,
+ "eval_samples_per_second": 29.261,
+ "eval_steps_per_second": 0.114,
  "step": 1000
  },
1498
  {
1499
+ "epoch": 1.8542435424354244,
1500
+ "grad_norm": 1.2737022554438457,
1501
+ "learning_rate": 7.649367356913422e-06,
1502
+ "loss": 0.7133,
1503
  "step": 1005
1504
  },
1505
  {
1506
+ "epoch": 1.8634686346863467,
1507
+ "grad_norm": 1.2146494230865963,
1508
+ "learning_rate": 7.545145128592009e-06,
1509
+ "loss": 0.7162,
1510
  "step": 1010
1511
  },
1512
  {
1513
+ "epoch": 1.8726937269372694,
1514
+ "grad_norm": 1.2563305762066708,
1515
+ "learning_rate": 7.441205891354037e-06,
1516
+ "loss": 0.7128,
1517
  "step": 1015
1518
  },
1519
  {
1520
+ "epoch": 1.881918819188192,
1521
+ "grad_norm": 1.2400110075293442,
1522
+ "learning_rate": 7.337561627120591e-06,
1523
+ "loss": 0.7059,
1524
  "step": 1020
1525
  },
1526
  {
1527
+ "epoch": 1.8911439114391144,
1528
+ "grad_norm": 1.2653437150866325,
1529
+ "learning_rate": 7.234224283808832e-06,
1530
+ "loss": 0.7058,
1531
  "step": 1025
1532
  },
1533
  {
1534
+ "epoch": 1.900369003690037,
1535
+ "grad_norm": 1.1646651085367645,
1536
+ "learning_rate": 7.131205773954636e-06,
1537
+ "loss": 0.706,
1538
  "step": 1030
1539
  },
1540
  {
1541
+ "epoch": 1.9095940959409594,
1542
+ "grad_norm": 1.1518551233990397,
1543
+ "learning_rate": 7.028517973339361e-06,
1544
+ "loss": 0.7138,
1545
  "step": 1035
1546
  },
1547
  {
1548
+ "epoch": 1.918819188191882,
1549
+ "grad_norm": 1.223360815231687,
1550
+ "learning_rate": 6.926172719620827e-06,
1551
+ "loss": 0.697,
1552
  "step": 1040
1553
  },
1554
  {
1555
+ "epoch": 1.9280442804428044,
1556
+ "grad_norm": 1.2198079984824493,
1557
+ "learning_rate": 6.824181810968675e-06,
1558
+ "loss": 0.7004,
1559
  "step": 1045
1560
  },
1561
  {
1562
+ "epoch": 1.937269372693727,
1563
+ "grad_norm": 1.176959664107674,
1564
+ "learning_rate": 6.722557004704322e-06,
1565
+ "loss": 0.7082,
1566
  "step": 1050
1567
  },
1568
  {
1569
+ "epoch": 1.9464944649446494,
1570
+ "grad_norm": 1.1844320699248965,
1571
+ "learning_rate": 6.62131001594558e-06,
1572
+ "loss": 0.7043,
1573
  "step": 1055
1574
  },
1575
  {
1576
+ "epoch": 1.9557195571955721,
1577
+ "grad_norm": 1.148753422424237,
1578
+ "learning_rate": 6.520452516256157e-06,
1579
+ "loss": 0.6949,
1580
  "step": 1060
1581
  },
1582
  {
1583
+ "epoch": 1.9649446494464944,
1584
+ "grad_norm": 1.1572577267352544,
1585
+ "learning_rate": 6.419996132300203e-06,
1586
+ "loss": 0.7071,
1587
  "step": 1065
1588
  },
1589
  {
1590
+ "epoch": 1.974169741697417,
1591
+ "grad_norm": 1.2001014830908205,
1592
+ "learning_rate": 6.319952444501984e-06,
1593
+ "loss": 0.7103,
1594
  "step": 1070
1595
  },
1596
  {
1597
+ "epoch": 1.9833948339483394,
1598
+ "grad_norm": 1.4841715888010063,
1599
+ "learning_rate": 6.220332985710936e-06,
1600
+ "loss": 0.694,
1601
  "step": 1075
1602
  },
1603
  {
1604
+ "epoch": 1.992619926199262,
1605
+ "grad_norm": 1.4256755997357629,
1606
+ "learning_rate": 6.121149239872151e-06,
1607
+ "loss": 0.6964,
1608
  "step": 1080
1609
  },
1610
  {
1611
+ "epoch": 2.0018450184501844,
1612
+ "grad_norm": 4.270149025567802,
1613
+ "learning_rate": 6.0224126407025616e-06,
1614
+ "loss": 0.6543,
1615
+ "step": 1085
1616
+ },
1617
+ {
1618
+ "epoch": 2.011070110701107,
1619
+ "grad_norm": 2.6490744221351044,
1620
+ "learning_rate": 5.924134570372863e-06,
1621
+ "loss": 0.4529,
1622
+ "step": 1090
1623
+ },
1624
+ {
1625
+ "epoch": 2.0202952029520294,
1626
+ "grad_norm": 2.2645999605838227,
1627
+ "learning_rate": 5.826326358195391e-06,
1628
+ "loss": 0.4559,
1629
+ "step": 1095
1630
+ },
1631
+ {
1632
+ "epoch": 2.029520295202952,
1633
+ "grad_norm": 1.5705400512864462,
1634
+ "learning_rate": 5.728999279318131e-06,
1635
+ "loss": 0.4465,
1636
+ "step": 1100
1637
+ },
1638
+ {
+ "epoch": 2.029520295202952,
+ "eval_loss": 0.9917108416557312,
+ "eval_runtime": 517.7798,
+ "eval_samples_per_second": 29.646,
+ "eval_steps_per_second": 0.116,
+ "step": 1100
+ },
1646
+ {
1647
+ "epoch": 2.0387453874538743,
1648
+ "grad_norm": 1.6254518927847355,
1649
+ "learning_rate": 5.632164553424904e-06,
1650
+ "loss": 0.4353,
1651
+ "step": 1105
1652
+ },
1653
+ {
1654
+ "epoch": 2.047970479704797,
1655
+ "grad_norm": 14.583137561537578,
1656
+ "learning_rate": 5.5358333434420054e-06,
1657
+ "loss": 0.4424,
1658
+ "step": 1110
1659
+ },
1660
+ {
1661
+ "epoch": 2.0571955719557193,
1662
+ "grad_norm": 1.447005279720627,
1663
+ "learning_rate": 5.440016754251364e-06,
1664
+ "loss": 0.4423,
1665
+ "step": 1115
1666
+ },
1667
+ {
1668
+ "epoch": 2.066420664206642,
1669
+ "grad_norm": 1.4595204240426687,
1670
+ "learning_rate": 5.344725831410369e-06,
1671
+ "loss": 0.4384,
1672
+ "step": 1120
1673
+ },
1674
+ {
1675
+ "epoch": 2.0756457564575648,
1676
+ "grad_norm": 1.3190598016289843,
1677
+ "learning_rate": 5.24997155987859e-06,
1678
+ "loss": 0.4368,
1679
+ "step": 1125
1680
+ },
1681
+ {
1682
+ "epoch": 2.084870848708487,
1683
+ "grad_norm": 1.322338946677976,
1684
+ "learning_rate": 5.155764862751427e-06,
1685
+ "loss": 0.4392,
1686
+ "step": 1130
1687
+ },
1688
+ {
1689
+ "epoch": 2.0940959409594098,
1690
+ "grad_norm": 1.3472757392525208,
1691
+ "learning_rate": 5.062116600000933e-06,
1692
+ "loss": 0.4297,
1693
+ "step": 1135
1694
+ },
1695
+ {
1696
+ "epoch": 2.103321033210332,
1697
+ "grad_norm": 1.2895577097092337,
1698
+ "learning_rate": 4.969037567223881e-06,
1699
+ "loss": 0.4413,
1700
+ "step": 1140
1701
+ },
1702
+ {
1703
+ "epoch": 2.1125461254612548,
1704
+ "grad_norm": 1.3471090116973288,
1705
+ "learning_rate": 4.876538494397274e-06,
1706
+ "loss": 0.4317,
1707
+ "step": 1145
1708
+ },
1709
+ {
1710
+ "epoch": 2.121771217712177,
1711
+ "grad_norm": 1.3092628602239211,
1712
+ "learning_rate": 4.784630044641435e-06,
1713
+ "loss": 0.4509,
1714
+ "step": 1150
1715
+ },
1716
+ {
1717
+ "epoch": 2.1309963099630997,
1718
+ "grad_norm": 1.344809966917295,
1719
+ "learning_rate": 4.6933228129907395e-06,
1720
+ "loss": 0.4375,
1721
+ "step": 1155
1722
+ },
1723
+ {
1724
+ "epoch": 2.140221402214022,
1725
+ "grad_norm": 1.3014430618254322,
1726
+ "learning_rate": 4.602627325172279e-06,
1727
+ "loss": 0.4424,
1728
+ "step": 1160
1729
+ },
1730
+ {
1731
+ "epoch": 2.1494464944649447,
1732
+ "grad_norm": 1.3672933559982345,
1733
+ "learning_rate": 4.512554036392448e-06,
1734
+ "loss": 0.4419,
1735
+ "step": 1165
1736
+ },
1737
+ {
1738
+ "epoch": 2.158671586715867,
1739
+ "grad_norm": 1.3446667993737584,
1740
+ "learning_rate": 4.423113330131708e-06,
1741
+ "loss": 0.4303,
1742
+ "step": 1170
1743
+ },
1744
+ {
1745
+ "epoch": 2.1678966789667897,
1746
+ "grad_norm": 1.3257443131859206,
1747
+ "learning_rate": 4.33431551694758e-06,
1748
+ "loss": 0.4369,
1749
+ "step": 1175
1750
+ },
1751
+ {
1752
+ "epoch": 2.177121771217712,
1753
+ "grad_norm": 1.3655737456565726,
1754
+ "learning_rate": 4.246170833286075e-06,
1755
+ "loss": 0.4293,
1756
+ "step": 1180
1757
+ },
1758
+ {
1759
+ "epoch": 2.1863468634686347,
1760
+ "grad_norm": 1.3298593125645854,
1761
+ "learning_rate": 4.1586894403016576e-06,
1762
+ "loss": 0.439,
1763
+ "step": 1185
1764
+ },
1765
+ {
1766
+ "epoch": 2.195571955719557,
1767
+ "grad_norm": 1.32505780264794,
1768
+ "learning_rate": 4.071881422685877e-06,
1769
+ "loss": 0.4285,
1770
+ "step": 1190
1771
+ },
1772
+ {
1773
+ "epoch": 2.2047970479704797,
1774
+ "grad_norm": 1.3004312804341762,
1775
+ "learning_rate": 3.985756787504837e-06,
1776
+ "loss": 0.4353,
1777
+ "step": 1195
1778
+ },
1779
+ {
1780
+ "epoch": 2.2140221402214024,
1781
+ "grad_norm": 1.3177561620055287,
1782
+ "learning_rate": 3.9003254630455775e-06,
1783
+ "loss": 0.4341,
1784
+ "step": 1200
1785
+ },
1786
+ {
+ "epoch": 2.2140221402214024,
+ "eval_loss": 0.9978848695755005,
+ "eval_runtime": 514.7843,
+ "eval_samples_per_second": 29.818,
+ "eval_steps_per_second": 0.117,
+ "step": 1200
+ },
1794
+ {
1795
+ "epoch": 2.2232472324723247,
1796
+ "grad_norm": 1.3438896554856818,
1797
+ "learning_rate": 3.815597297671578e-06,
1798
+ "loss": 0.4336,
1799
+ "step": 1205
1800
+ },
1801
+ {
1802
+ "epoch": 2.2324723247232474,
1803
+ "grad_norm": 1.2896295540334282,
1804
+ "learning_rate": 3.731582058687462e-06,
1805
+ "loss": 0.435,
1806
+ "step": 1210
1807
+ },
1808
+ {
1809
+ "epoch": 2.2416974169741697,
1810
+ "grad_norm": 1.358035688644123,
1811
+ "learning_rate": 3.6482894312130146e-06,
1812
+ "loss": 0.4324,
1813
+ "step": 1215
1814
+ },
1815
+ {
1816
+ "epoch": 2.2509225092250924,
1817
+ "grad_norm": 1.312197292051631,
1818
+ "learning_rate": 3.565729017066729e-06,
1819
+ "loss": 0.4315,
1820
+ "step": 1220
1821
+ },
1822
+ {
1823
+ "epoch": 2.2601476014760147,
1824
+ "grad_norm": 1.3227121347141655,
1825
+ "learning_rate": 3.483910333658913e-06,
1826
+ "loss": 0.4364,
1827
+ "step": 1225
1828
+ },
1829
+ {
1830
+ "epoch": 2.2693726937269374,
1831
+ "grad_norm": 1.3256090212374516,
1832
+ "learning_rate": 3.402842812894529e-06,
1833
+ "loss": 0.4356,
1834
+ "step": 1230
1835
+ },
1836
+ {
1837
+ "epoch": 2.2785977859778597,
1838
+ "grad_norm": 1.317549750635349,
1839
+ "learning_rate": 3.3225358000859287e-06,
1840
+ "loss": 0.4349,
1841
+ "step": 1235
1842
+ },
1843
+ {
1844
+ "epoch": 2.2878228782287824,
1845
+ "grad_norm": 1.2612830347481554,
1846
+ "learning_rate": 3.2429985528755127e-06,
1847
+ "loss": 0.4306,
1848
+ "step": 1240
1849
+ },
1850
+ {
1851
+ "epoch": 2.2970479704797047,
1852
+ "grad_norm": 1.3450073317730427,
1853
+ "learning_rate": 3.1642402401685557e-06,
1854
+ "loss": 0.4361,
1855
+ "step": 1245
1856
+ },
1857
+ {
1858
+ "epoch": 2.3062730627306274,
1859
+ "grad_norm": 1.3431835139445107,
1860
+ "learning_rate": 3.0862699410762043e-06,
1861
+ "loss": 0.4393,
1862
+ "step": 1250
1863
+ },
1864
+ {
1865
+ "epoch": 2.3154981549815496,
1866
+ "grad_norm": 1.3379126436430948,
1867
+ "learning_rate": 3.0090966438688774e-06,
1868
+ "loss": 0.4306,
1869
+ "step": 1255
1870
+ },
1871
+ {
1872
+ "epoch": 2.3247232472324724,
1873
+ "grad_norm": 1.2809064467748859,
1874
+ "learning_rate": 2.9327292449401067e-06,
1875
+ "loss": 0.4416,
1876
+ "step": 1260
1877
+ },
1878
+ {
1879
+ "epoch": 2.3339483394833946,
1880
+ "grad_norm": 1.3548015164880183,
1881
+ "learning_rate": 2.8571765477809645e-06,
1882
+ "loss": 0.4338,
1883
+ "step": 1265
1884
+ },
1885
+ {
1886
+ "epoch": 2.3431734317343174,
1887
+ "grad_norm": 1.320665427008479,
1888
+ "learning_rate": 2.7824472619652386e-06,
1889
+ "loss": 0.4361,
1890
+ "step": 1270
1891
+ },
1892
+ {
1893
+ "epoch": 2.35239852398524,
1894
+ "grad_norm": 1.3096646770487193,
1895
+ "learning_rate": 2.7085500021453838e-06,
1896
+ "loss": 0.4294,
1897
+ "step": 1275
1898
+ },
1899
+ {
1900
+ "epoch": 2.3616236162361623,
1901
+ "grad_norm": 1.2800372167523524,
1902
+ "learning_rate": 2.635493287059464e-06,
1903
+ "loss": 0.4299,
1904
+ "step": 1280
1905
+ },
1906
+ {
1907
+ "epoch": 2.3708487084870846,
1908
+ "grad_norm": 1.303993086907089,
1909
+ "learning_rate": 2.563285538549104e-06,
1910
+ "loss": 0.4361,
1911
+ "step": 1285
1912
+ },
1913
+ {
1914
+ "epoch": 2.3800738007380073,
1915
+ "grad_norm": 1.2720280407092956,
1916
+ "learning_rate": 2.491935080588658e-06,
1917
+ "loss": 0.4384,
1918
+ "step": 1290
1919
+ },
1920
+ {
1921
+ "epoch": 2.38929889298893,
1922
+ "grad_norm": 1.2941980810201439,
1923
+ "learning_rate": 2.421450138325625e-06,
1924
+ "loss": 0.4306,
1925
+ "step": 1295
1926
+ },
1927
+ {
1928
+ "epoch": 2.3985239852398523,
1929
+ "grad_norm": 1.2949495993502738,
1930
+ "learning_rate": 2.351838837132464e-06,
1931
+ "loss": 0.432,
1932
+ "step": 1300
1933
+ },
1934
+ {
+ "epoch": 2.3985239852398523,
+ "eval_loss": 0.9954376816749573,
+ "eval_runtime": 519.9495,
+ "eval_samples_per_second": 29.522,
+ "eval_steps_per_second": 0.115,
+ "step": 1300
+ },
1942
+ {
1943
+ "epoch": 2.407749077490775,
1944
+ "grad_norm": 1.3018815365771563,
1945
+ "learning_rate": 2.283109201669936e-06,
1946
+ "loss": 0.4357,
1947
+ "step": 1305
1948
+ },
1949
+ {
1950
+ "epoch": 2.4169741697416973,
1951
+ "grad_norm": 1.2956106687686837,
1952
+ "learning_rate": 2.2152691549620155e-06,
1953
+ "loss": 0.4283,
1954
+ "step": 1310
1955
+ },
1956
+ {
1957
+ "epoch": 2.42619926199262,
1958
+ "grad_norm": 1.287230882437174,
1959
+ "learning_rate": 2.148326517482543e-06,
1960
+ "loss": 0.4303,
1961
+ "step": 1315
1962
+ },
1963
+ {
1964
+ "epoch": 2.4354243542435423,
1965
+ "grad_norm": 1.2592322120333668,
1966
+ "learning_rate": 2.0822890062537106e-06,
1967
+ "loss": 0.4366,
1968
+ "step": 1320
1969
+ },
1970
+ {
1971
+ "epoch": 2.444649446494465,
1972
+ "grad_norm": 1.3039469988205457,
1973
+ "learning_rate": 2.01716423395644e-06,
1974
+ "loss": 0.4317,
1975
+ "step": 1325
1976
+ },
1977
+ {
1978
+ "epoch": 2.4538745387453873,
1979
+ "grad_norm": 1.282772824972497,
1980
+ "learning_rate": 1.9529597080528207e-06,
1981
+ "loss": 0.4272,
1982
+ "step": 1330
1983
+ },
1984
+ {
1985
+ "epoch": 2.46309963099631,
1986
+ "grad_norm": 1.3227463435260074,
1987
+ "learning_rate": 1.8896828299206494e-06,
1988
+ "loss": 0.4256,
1989
+ "step": 1335
1990
+ },
1991
+ {
1992
+ "epoch": 2.4723247232472323,
1993
+ "grad_norm": 1.3607936617452498,
1994
+ "learning_rate": 1.8273408940002202e-06,
1995
+ "loss": 0.4389,
1996
+ "step": 1340
1997
+ },
1998
+ {
1999
+ "epoch": 2.481549815498155,
2000
+ "grad_norm": 1.2740801988744865,
2001
+ "learning_rate": 1.7659410869534466e-06,
2002
+ "loss": 0.4247,
2003
+ "step": 1345
2004
+ },
2005
+ {
2006
+ "epoch": 2.4907749077490777,
2007
+ "grad_norm": 1.2544315701192987,
2008
+ "learning_rate": 1.7054904868353717e-06,
2009
+ "loss": 0.4256,
2010
+ "step": 1350
2011
+ },
2012
+ {
2013
+ "epoch": 2.5,
2014
+ "grad_norm": 1.31550558585801,
2015
+ "learning_rate": 1.6459960622782466e-06,
2016
+ "loss": 0.428,
2017
+ "step": 1355
2018
+ },
2019
+ {
2020
+ "epoch": 2.5092250922509223,
2021
+ "grad_norm": 1.3030144767834306,
2022
+ "learning_rate": 1.587464671688187e-06,
2023
+ "loss": 0.4217,
2024
+ "step": 1360
2025
+ },
2026
+ {
2027
+ "epoch": 2.518450184501845,
2028
+ "grad_norm": 1.261812680015863,
2029
+ "learning_rate": 1.5299030624545563e-06,
2030
+ "loss": 0.4381,
2031
+ "step": 1365
2032
+ },
2033
+ {
2034
+ "epoch": 2.5276752767527677,
2035
+ "grad_norm": 1.3015065571944802,
2036
+ "learning_rate": 1.4733178701721262e-06,
2037
+ "loss": 0.4337,
2038
+ "step": 1370
2039
+ },
2040
+ {
2041
+ "epoch": 2.53690036900369,
2042
+ "grad_norm": 1.2805139778312684,
2043
+ "learning_rate": 1.4177156178761508e-06,
2044
+ "loss": 0.4313,
2045
+ "step": 1375
2046
+ },
2047
+ {
2048
+ "epoch": 2.5461254612546127,
2049
+ "grad_norm": 1.3271791125805354,
2050
+ "learning_rate": 1.363102715290402e-06,
2051
+ "loss": 0.4314,
2052
+ "step": 1380
2053
+ },
2054
+ {
2055
+ "epoch": 2.555350553505535,
2056
+ "grad_norm": 1.3155240192251205,
2057
+ "learning_rate": 1.3094854580882599e-06,
2058
+ "loss": 0.4298,
2059
+ "step": 1385
2060
+ },
2061
+ {
2062
+ "epoch": 2.5645756457564577,
2063
+ "grad_norm": 1.2884517504542843,
2064
+ "learning_rate": 1.2568700271669676e-06,
2065
+ "loss": 0.4315,
2066
+ "step": 1390
2067
+ },
2068
+ {
2069
+ "epoch": 2.57380073800738,
2070
+ "grad_norm": 1.2601572769871257,
2071
+ "learning_rate": 1.2052624879351105e-06,
2072
+ "loss": 0.4341,
2073
+ "step": 1395
2074
+ },
2075
+ {
2076
+ "epoch": 2.5830258302583027,
2077
+ "grad_norm": 1.283042988722646,
2078
+ "learning_rate": 1.1546687896133924e-06,
2079
+ "loss": 0.4301,
2080
+ "step": 1400
2081
+ },
2082
+ {
+ "epoch": 2.5830258302583027,
+ "eval_loss": 0.9943162798881531,
+ "eval_runtime": 513.9906,
+ "eval_samples_per_second": 29.864,
+ "eval_steps_per_second": 0.117,
+ "step": 1400
+ },
2090
+ {
2091
+ "epoch": 2.592250922509225,
2092
+ "grad_norm": 1.269448040169663,
2093
+ "learning_rate": 1.1050947645488419e-06,
2094
+ "loss": 0.424,
2095
+ "step": 1405
2096
+ },
2097
+ {
2098
+ "epoch": 2.6014760147601477,
2099
+ "grad_norm": 1.291108826010762,
2100
+ "learning_rate": 1.0565461275424504e-06,
2101
+ "loss": 0.4288,
2102
+ "step": 1410
2103
+ },
2104
+ {
2105
+ "epoch": 2.61070110701107,
2106
+ "grad_norm": 1.246075371329031,
2107
+ "learning_rate": 1.0090284751903989e-06,
2108
+ "loss": 0.4308,
2109
+ "step": 1415
2110
+ },
2111
+ {
2112
+ "epoch": 2.6199261992619927,
2113
+ "grad_norm": 1.268331381912208,
2114
+ "learning_rate": 9.625472852388739e-07,
2115
+ "loss": 0.4274,
2116
+ "step": 1420
2117
+ },
2118
+ {
2119
+ "epoch": 2.6291512915129154,
2120
+ "grad_norm": 1.2558980878489436,
2121
+ "learning_rate": 9.171079159526186e-07,
2122
+ "loss": 0.4263,
2123
+ "step": 1425
2124
+ },
2125
+ {
2126
+ "epoch": 2.6383763837638377,
2127
+ "grad_norm": 1.2507458001549574,
2128
+ "learning_rate": 8.727156054972374e-07,
2129
+ "loss": 0.4364,
2130
+ "step": 1430
2131
+ },
2132
+ {
2133
+ "epoch": 2.64760147601476,
2134
+ "grad_norm": 1.2344093421817917,
2135
+ "learning_rate": 8.29375471335343e-07,
2136
+ "loss": 0.43,
2137
+ "step": 1435
2138
+ },
2139
+ {
2140
+ "epoch": 2.6568265682656826,
2141
+ "grad_norm": 1.2520176453134155,
2142
+ "learning_rate": 7.870925096366366e-07,
2143
+ "loss": 0.4298,
2144
+ "step": 1440
2145
+ },
2146
+ {
2147
+ "epoch": 2.6660516605166054,
2148
+ "grad_norm": 1.2874930933327957,
2149
+ "learning_rate": 7.458715947019468e-07,
2150
+ "loss": 0.4262,
2151
+ "step": 1445
2152
+ },
2153
+ {
2154
+ "epoch": 2.6752767527675276,
2155
+ "grad_norm": 1.2682188739552445,
2156
+ "learning_rate": 7.057174784013432e-07,
2157
+ "loss": 0.4339,
2158
+ "step": 1450
2159
+ },
2160
+ {
2161
+ "epoch": 2.6845018450184504,
2162
+ "grad_norm": 1.2828645340804818,
2163
+ "learning_rate": 6.666347896263326e-07,
2164
+ "loss": 0.4274,
2165
+ "step": 1455
2166
+ },
2167
+ {
2168
+ "epoch": 2.6937269372693726,
2169
+ "grad_norm": 1.2595258026091076,
2170
+ "learning_rate": 6.286280337562656e-07,
2171
+ "loss": 0.4303,
2172
+ "step": 1460
2173
+ },
2174
+ {
2175
+ "epoch": 2.7029520295202953,
2176
+ "grad_norm": 1.24521822647123,
2177
+ "learning_rate": 5.917015921389569e-07,
2178
+ "loss": 0.4288,
2179
+ "step": 1465
2180
+ },
2181
+ {
2182
+ "epoch": 2.7121771217712176,
2183
+ "grad_norm": 1.232445478302712,
2184
+ "learning_rate": 5.558597215856065e-07,
2185
+ "loss": 0.4285,
2186
+ "step": 1470
2187
+ },
2188
+ {
2189
+ "epoch": 2.7214022140221403,
2190
+ "grad_norm": 1.216057817991593,
2191
+ "learning_rate": 5.211065538800952e-07,
2192
+ "loss": 0.4208,
2193
+ "step": 1475
2194
+ },
2195
+ {
2196
+ "epoch": 2.7306273062730626,
2197
+ "grad_norm": 1.288524367589534,
2198
+ "learning_rate": 4.874460953026705e-07,
2199
+ "loss": 0.4255,
2200
+ "step": 1480
2201
+ },
2202
+ {
2203
+ "epoch": 2.7398523985239853,
2204
+ "grad_norm": 1.2332155213343263,
2205
+ "learning_rate": 4.548822261681107e-07,
2206
+ "loss": 0.423,
2207
+ "step": 1485
2208
+ },
2209
+ {
2210
+ "epoch": 2.7490774907749076,
2211
+ "grad_norm": 1.2278878382563285,
2212
+ "learning_rate": 4.2341870037841516e-07,
2213
+ "loss": 0.4291,
2214
+ "step": 1490
2215
+ },
2216
+ {
2217
+ "epoch": 2.7583025830258303,
2218
+ "grad_norm": 1.262898121860552,
2219
+ "learning_rate": 3.930591449900578e-07,
2220
+ "loss": 0.4247,
2221
+ "step": 1495
2222
+ },
2223
+ {
2224
+ "epoch": 2.767527675276753,
2225
+ "grad_norm": 1.2437619506416164,
2226
+ "learning_rate": 3.638070597958665e-07,
2227
+ "loss": 0.4361,
2228
+ "step": 1500
2229
+ },
2230
+ {
+ "epoch": 2.767527675276753,
+ "eval_loss": 0.9930853247642517,
+ "eval_runtime": 516.1928,
+ "eval_samples_per_second": 29.737,
+ "eval_steps_per_second": 0.116,
+ "step": 1500
+ },
2238
+ {
2239
+ "epoch": 2.7767527675276753,
2240
+ "grad_norm": 1.2468366513522777,
2241
+ "learning_rate": 3.356658169215743e-07,
2242
+ "loss": 0.4282,
2243
+ "step": 1505
2244
+ },
2245
+ {
2246
+ "epoch": 2.7859778597785976,
2247
+ "grad_norm": 1.2336029324910027,
2248
+ "learning_rate": 3.0863866043708393e-07,
2249
+ "loss": 0.4267,
2250
+ "step": 1510
2251
+ },
2252
+ {
2253
+ "epoch": 2.7952029520295203,
2254
+ "grad_norm": 1.3330748292636831,
2255
+ "learning_rate": 2.8272870598250677e-07,
2256
+ "loss": 0.4281,
2257
+ "step": 1515
2258
+ },
2259
+ {
2260
+ "epoch": 2.804428044280443,
2261
+ "grad_norm": 1.2486193575900169,
2262
+ "learning_rate": 2.5793894040898384e-07,
2263
+ "loss": 0.4224,
2264
+ "step": 1520
2265
+ },
2266
+ {
2267
+ "epoch": 2.8136531365313653,
2268
+ "grad_norm": 1.235394179484528,
2269
+ "learning_rate": 2.3427222143438065e-07,
2270
+ "loss": 0.4184,
2271
+ "step": 1525
2272
+ },
2273
+ {
2274
+ "epoch": 2.8228782287822876,
2275
+ "grad_norm": 1.2913244981868073,
2276
+ "learning_rate": 2.117312773138458e-07,
2277
+ "loss": 0.4238,
2278
+ "step": 1530
2279
+ },
2280
+ {
2281
+ "epoch": 2.8321033210332103,
2282
+ "grad_norm": 1.2580451640594703,
2283
+ "learning_rate": 1.903187065253076e-07,
2284
+ "loss": 0.4274,
2285
+ "step": 1535
2286
+ },
2287
+ {
2288
+ "epoch": 2.841328413284133,
2289
+ "grad_norm": 1.262849856657073,
2290
+ "learning_rate": 1.7003697746992398e-07,
2291
+ "loss": 0.4242,
2292
+ "step": 1540
2293
+ },
2294
+ {
2295
+ "epoch": 2.8505535055350553,
2296
+ "grad_norm": 1.2336423601103856,
2297
+ "learning_rate": 1.5088842818752892e-07,
2298
+ "loss": 0.4338,
2299
+ "step": 1545
2300
+ },
2301
+ {
2302
+ "epoch": 2.859778597785978,
2303
+ "grad_norm": 1.279029201429549,
2304
+ "learning_rate": 1.3287526608711132e-07,
2305
+ "loss": 0.4247,
2306
+ "step": 1550
2307
+ },
2308
+ {
2309
+ "epoch": 2.8690036900369003,
2310
+ "grad_norm": 1.2569044993333771,
2311
+ "learning_rate": 1.1599956769234533e-07,
2312
+ "loss": 0.4167,
2313
+ "step": 1555
2314
+ },
2315
+ {
2316
+ "epoch": 2.878228782287823,
2317
+ "grad_norm": 1.229520630461672,
2318
+ "learning_rate": 1.0026327840221728e-07,
2319
+ "loss": 0.4182,
2320
+ "step": 1560
2321
+ },
2322
+ {
2323
+ "epoch": 2.8874538745387452,
2324
+ "grad_norm": 1.255986608003343,
2325
+ "learning_rate": 8.566821226675514e-08,
2326
+ "loss": 0.4294,
2327
+ "step": 1565
2328
+ },
2329
+ {
2330
+ "epoch": 2.896678966789668,
2331
+ "grad_norm": 1.2895814979486142,
2332
+ "learning_rate": 7.22160517779169e-08,
2333
+ "loss": 0.429,
2334
+ "step": 1570
2335
+ },
2336
+ {
2337
+ "epoch": 2.9059040590405907,
2338
+ "grad_norm": 1.2790109206046127,
2339
+ "learning_rate": 5.99083476756357e-08,
2340
+ "loss": 0.4261,
2341
+ "step": 1575
2342
+ },
2343
+ {
2344
+ "epoch": 2.915129151291513,
2345
+ "grad_norm": 1.2194809596900478,
2346
+ "learning_rate": 4.87465187690439e-08,
2347
+ "loss": 0.4211,
2348
+ "step": 1580
2349
+ },
2350
+ {
2351
+ "epoch": 2.9243542435424352,
2352
+ "grad_norm": 1.2665552740156838,
2353
+ "learning_rate": 3.873185177292737e-08,
2354
+ "loss": 0.4251,
2355
+ "step": 1585
2356
+ },
2357
+ {
2358
+ "epoch": 2.933579335793358,
2359
+ "grad_norm": 1.2812371627035533,
2360
+ "learning_rate": 2.9865501159387355e-08,
2361
+ "loss": 0.4282,
2362
+ "step": 1590
2363
+ },
2364
+ {
2365
+ "epoch": 2.9428044280442807,
2366
+ "grad_norm": 1.2399165066075877,
2367
+ "learning_rate": 2.214848902475808e-08,
2368
+ "loss": 0.4341,
2369
+ "step": 1595
2370
+ },
2371
+ {
2372
+ "epoch": 2.952029520295203,
2373
+ "grad_norm": 1.2154194504631015,
2374
+ "learning_rate": 1.558170497178213e-08,
2375
+ "loss": 0.4256,
2376
+ "step": 1600
2377
+ },
2378
+ {
+ "epoch": 2.952029520295203,
+ "eval_loss": 0.9934021830558777,
+ "eval_runtime": 525.4852,
+ "eval_samples_per_second": 29.211,
+ "eval_steps_per_second": 0.114,
+ "step": 1600
+ },
2386
+ {
2387
+ "epoch": 2.961254612546125,
2388
+ "grad_norm": 1.2717521820081574,
2389
+ "learning_rate": 1.0165906007056914e-08,
2390
+ "loss": 0.4323,
2391
+ "step": 1605
2392
+ },
2393
+ {
2394
+ "epoch": 2.970479704797048,
2395
+ "grad_norm": 1.2491830905746684,
2396
+ "learning_rate": 5.901716453770023e-09,
2397
+ "loss": 0.4271,
2398
+ "step": 1610
2399
+ },
2400
+ {
2401
+ "epoch": 2.9797047970479706,
2402
+ "grad_norm": 1.2521953436091506,
2403
+ "learning_rate": 2.7896278797256983e-09,
2404
+ "loss": 0.4256,
2405
+ "step": 1615
2406
+ },
2407
+ {
2408
+ "epoch": 2.988929889298893,
2409
+ "grad_norm": 1.2335508198968657,
2410
+ "learning_rate": 8.299990406823721e-10,
2411
+ "loss": 0.4342,
2412
+ "step": 1620
2413
+ },
2414
+ {
2415
+ "epoch": 2.9981549815498156,
2416
+ "grad_norm": 1.2480273735451688,
2417
+ "learning_rate": 2.3055838990204693e-11,
2418
+ "loss": 0.4266,
2419
+ "step": 1625
2420
  },
  {
+ "epoch": 3.0,
+ "step": 1626,
+ "total_flos": 1361805280542720.0,
+ "train_loss": 0.713569560815634,
+ "train_runtime": 59769.2599,
+ "train_samples_per_second": 6.961,
+ "train_steps_per_second": 0.027
  }
  ],
  "logging_steps": 5,
+ "max_steps": 1626,
  "num_input_tokens_seen": 0,
+ "num_train_epochs": 3,
  "save_steps": 100,
  "stateful_callbacks": {
  "TrainerControl": {

  "attributes": {}
  }
  },
+ "total_flos": 1361805280542720.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null