chansung committed
Commit b2cbe68
1 Parent(s): 9494e6d

Model save

README.md CHANGED
@@ -2,13 +2,12 @@
 license: gemma
 library_name: peft
 tags:
-- alignment-handbook
 - trl
 - sft
 - generated_from_trainer
 base_model: google/gemma-7b
 datasets:
-- llama-duo/synth_summarize_dataset
+- generator
 model-index:
 - name: gemma7b-summarize-gpt4o-30k
   results: []
@@ -17,12 +16,12 @@ model-index:
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 should probably proofread and complete it, then remove this comment. -->
 
-[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/chansung18/huggingface/runs/ddvw2m8z)
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/chansung18/huggingface/runs/gtgsbwvu)
 # gemma7b-summarize-gpt4o-30k
 
-This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the llama-duo/synth_summarize_dataset dataset.
+This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the generator dataset.
 It achieves the following results on the evaluation set:
-- Loss: 2.3811
+- Loss: 3.2430
 
 ## Model description
 
@@ -53,17 +52,22 @@ The following hyperparameters were used during training:
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_ratio: 0.1
-- num_epochs: 5
+- num_epochs: 10
 
 ### Training results
 
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:-----:|:----:|:---------------:|
-| 0.9712        | 1.0   | 137  | 2.3077          |
-| 0.8675        | 2.0   | 274  | 2.2479          |
-| 0.7623        | 3.0   | 411  | 2.2756          |
-| 0.709         | 4.0   | 548  | 2.3417          |
-| 0.6601        | 5.0   | 685  | 2.3811          |
+| 1.1572        | 1.0   | 111  | 2.3072          |
+| 0.9296        | 2.0   | 222  | 2.1789          |
+| 0.8273        | 3.0   | 333  | 2.1709          |
+| 0.7586        | 4.0   | 444  | 2.2164          |
+| 0.6613        | 5.0   | 555  | 2.3182          |
+| 0.577         | 6.0   | 666  | 2.4774          |
+| 0.4958        | 7.0   | 777  | 2.7036          |
+| 0.4205        | 8.0   | 888  | 2.9689          |
+| 0.382         | 9.0   | 999  | 3.2252          |
+| 0.372         | 10.0  | 1110 | 3.2430          |
 
 
 ### Framework versions
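The updated card describes a PEFT adapter for [google/gemma-7b](https://huggingface.co/google/gemma-7b); the adapter weights themselves are the `adapter_model.safetensors` file changed below. A minimal loading sketch, assuming a standard PEFT layout and a hypothetical hub id `llama-duo/gemma7b-summarize-gpt4o-30k` (the actual repo id is not stated in this commit):

```python
# Sketch only: attach the saved PEFT adapter to the base model for inference.
# "llama-duo/gemma7b-summarize-gpt4o-30k" is an assumed adapter repo id.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_id = "google/gemma-7b"                            # base model named in the card
adapter_id = "llama-duo/gemma7b-summarize-gpt4o-30k"   # assumption, not confirmed here

tokenizer = AutoTokenizer.from_pretrained(base_id)
base = AutoModelForCausalLM.from_pretrained(base_id, torch_dtype=torch.bfloat16, device_map="auto")
model = PeftModel.from_pretrained(base, adapter_id)    # loads adapter_model.safetensors

inputs = tokenizer("Summarize the following text:\n...", return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```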
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1578c9aa1d019a32f4f14559226badaa97bb33080e583876456fc629835a8cb8
+oid sha256:32b4355c727acdb0c6029f34cd21b7f1e40baf4881b93221c8019898e95b873f
 size 50056096
all_results.json CHANGED
@@ -1,14 +1,9 @@
 {
-    "epoch": 5.0,
-    "eval_loss": 2.3811252117156982,
-    "eval_runtime": 1.024,
-    "eval_samples": 25,
-    "eval_samples_per_second": 4.883,
-    "eval_steps_per_second": 1.953,
-    "total_flos": 1.0472781231601746e+18,
-    "train_loss": 2.151051264783762,
-    "train_runtime": 5341.9856,
-    "train_samples": 29787,
-    "train_samples_per_second": 2.052,
-    "train_steps_per_second": 0.128
+    "epoch": 10.0,
+    "total_flos": 1.697049221804327e+18,
+    "train_loss": 1.8630313719715084,
+    "train_runtime": 9058.6901,
+    "train_samples": 32782,
+    "train_samples_per_second": 1.957,
+    "train_steps_per_second": 0.123
 }
runs/May21_04-02-00_deep-diver-main-tough-snake-1-0-0/events.out.tfevents.1716278664.deep-diver-main-tough-snake-1-0-0.385.0 CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b3a789b1bcb5c469f2d0771d5392b716d51c212add7fcc76c2f0831c7fa8c9fd
-size 54552
+oid sha256:7fb3394383ff9d99394f680955b2c1d92f3e2570009d97a580da145d58da55e7
+size 55599
train_results.json CHANGED
@@ -1,9 +1,9 @@
 {
-    "epoch": 5.0,
-    "total_flos": 1.0472781231601746e+18,
-    "train_loss": 2.151051264783762,
-    "train_runtime": 5341.9856,
-    "train_samples": 29787,
-    "train_samples_per_second": 2.052,
-    "train_steps_per_second": 0.128
+    "epoch": 10.0,
+    "total_flos": 1.697049221804327e+18,
+    "train_loss": 1.8630313719715084,
+    "train_runtime": 9058.6901,
+    "train_samples": 32782,
+    "train_samples_per_second": 1.957,
+    "train_steps_per_second": 0.123
 }
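The learning-rate values logged in the `trainer_state.json` diff below follow the schedule declared in the model card: cosine decay with a 0.1 warmup ratio over 1110 optimizer steps, peaking near 2e-4 (the peak rate is inferred from the logged values, not stated explicitly). A minimal sketch of that schedule with `transformers`, using a placeholder parameter in place of the real model:

```python
# Sketch: cosine LR schedule with 10% warmup over the 1110 steps recorded in
# trainer_state.json. Peak LR of 2e-4 is an inference from the logged values.
import torch
from transformers import get_cosine_schedule_with_warmup

params = [torch.nn.Parameter(torch.zeros(1))]   # placeholder instead of the real model
optimizer = torch.optim.AdamW(params, lr=2e-4)
total_steps = 1110
warmup_steps = int(0.1 * total_steps)           # = 111, matching the warmup ratio

scheduler = get_cosine_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
)

for step in range(total_steps):
    optimizer.step()
    scheduler.step()
    if step in (0, warmup_steps - 1, total_steps - 1):
        print(step, scheduler.get_last_lr()[0])  # ramps up to ~2e-4, then decays toward 0
```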
trainer_state.json CHANGED
@@ -1,1033 +1,1668 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 5.0,
5
  "eval_steps": 500,
6
- "global_step": 685,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0072992700729927005,
13
- "grad_norm": 708.0,
14
- "learning_rate": 2.898550724637681e-06,
15
- "loss": 56.8346,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.0364963503649635,
20
- "grad_norm": 604.0,
21
- "learning_rate": 1.4492753623188407e-05,
22
- "loss": 52.9742,
23
  "step": 5
24
  },
25
  {
26
- "epoch": 0.072992700729927,
27
- "grad_norm": 340.0,
28
- "learning_rate": 2.8985507246376814e-05,
29
- "loss": 39.0746,
30
  "step": 10
31
  },
32
  {
33
- "epoch": 0.10948905109489052,
34
- "grad_norm": 40.25,
35
- "learning_rate": 4.347826086956522e-05,
36
- "loss": 20.8099,
37
  "step": 15
38
  },
39
  {
40
- "epoch": 0.145985401459854,
41
- "grad_norm": 25.5,
42
- "learning_rate": 5.797101449275363e-05,
43
- "loss": 17.6144,
44
  "step": 20
45
  },
46
  {
47
- "epoch": 0.18248175182481752,
48
- "grad_norm": 7.78125,
49
- "learning_rate": 7.246376811594203e-05,
50
- "loss": 15.3803,
51
  "step": 25
52
  },
53
  {
54
- "epoch": 0.21897810218978103,
55
- "grad_norm": 6.40625,
56
- "learning_rate": 8.695652173913044e-05,
57
- "loss": 14.0798,
58
  "step": 30
59
  },
60
  {
61
- "epoch": 0.25547445255474455,
62
- "grad_norm": 13.4375,
63
- "learning_rate": 0.00010144927536231885,
64
- "loss": 13.4032,
65
  "step": 35
66
  },
67
  {
68
- "epoch": 0.291970802919708,
69
- "grad_norm": 41.0,
70
- "learning_rate": 0.00011594202898550725,
71
- "loss": 10.8827,
72
  "step": 40
73
  },
74
  {
75
- "epoch": 0.3284671532846715,
76
- "grad_norm": 13.1875,
77
- "learning_rate": 0.00013043478260869567,
78
- "loss": 4.5915,
79
  "step": 45
80
  },
81
  {
82
- "epoch": 0.36496350364963503,
83
- "grad_norm": 4.09375,
84
- "learning_rate": 0.00014492753623188405,
85
- "loss": 1.9,
86
  "step": 50
87
  },
88
  {
89
- "epoch": 0.40145985401459855,
90
- "grad_norm": 2.28125,
91
- "learning_rate": 0.00015942028985507247,
92
- "loss": 1.6474,
93
  "step": 55
94
  },
95
  {
96
- "epoch": 0.43795620437956206,
97
- "grad_norm": 3.5,
98
- "learning_rate": 0.00017391304347826088,
99
- "loss": 1.477,
100
  "step": 60
101
  },
102
  {
103
- "epoch": 0.4744525547445255,
104
- "grad_norm": 2.28125,
105
- "learning_rate": 0.00018840579710144927,
106
- "loss": 1.3309,
107
  "step": 65
108
  },
109
  {
110
- "epoch": 0.5109489051094891,
111
- "grad_norm": 1.6171875,
112
- "learning_rate": 0.00019999869950890106,
113
- "loss": 1.2538,
114
  "step": 70
115
  },
116
  {
117
- "epoch": 0.5474452554744526,
118
- "grad_norm": 5.9375,
119
- "learning_rate": 0.0001999531858720213,
120
- "loss": 1.224,
121
  "step": 75
122
  },
123
  {
124
- "epoch": 0.583941605839416,
125
- "grad_norm": 2.25,
126
- "learning_rate": 0.00019984268150178167,
127
- "loss": 1.1823,
128
  "step": 80
129
  },
130
  {
131
- "epoch": 0.6204379562043796,
132
- "grad_norm": 2.078125,
133
- "learning_rate": 0.00019966725824941932,
134
- "loss": 1.1279,
135
  "step": 85
136
  },
137
  {
138
- "epoch": 0.656934306569343,
139
- "grad_norm": 3.0625,
140
- "learning_rate": 0.00019942703017718975,
141
- "loss": 1.127,
142
  "step": 90
143
  },
144
  {
145
- "epoch": 0.6934306569343066,
146
- "grad_norm": 1.75,
147
- "learning_rate": 0.000199122153484202,
148
- "loss": 1.1284,
149
  "step": 95
150
  },
151
  {
152
- "epoch": 0.7299270072992701,
153
- "grad_norm": 1.5625,
154
- "learning_rate": 0.00019875282640485645,
155
- "loss": 1.0566,
156
  "step": 100
157
  },
158
  {
159
- "epoch": 0.7664233576642335,
160
- "grad_norm": 4.53125,
161
- "learning_rate": 0.0001983192890799503,
162
- "loss": 1.0361,
163
  "step": 105
164
  },
165
  {
166
- "epoch": 0.8029197080291971,
167
- "grad_norm": 2.5,
168
- "learning_rate": 0.0001978218234005352,
169
- "loss": 1.0371,
170
  "step": 110
171
  },
172
  {
173
- "epoch": 0.8394160583941606,
174
- "grad_norm": 1.890625,
175
- "learning_rate": 0.00019726075282462845,
176
- "loss": 1.0235,
177
  "step": 115
178
  },
179
  {
180
- "epoch": 0.8759124087591241,
181
- "grad_norm": 0.67578125,
182
- "learning_rate": 0.00019663644216689683,
183
- "loss": 0.996,
184
  "step": 120
185
  },
186
  {
187
- "epoch": 0.9124087591240876,
188
- "grad_norm": 1.2421875,
189
- "learning_rate": 0.00019594929736144976,
190
- "loss": 0.9734,
191
  "step": 125
192
  },
193
  {
194
- "epoch": 0.948905109489051,
195
- "grad_norm": 1.5625,
196
- "learning_rate": 0.00019519976519789616,
197
- "loss": 0.978,
198
  "step": 130
199
  },
200
  {
201
- "epoch": 0.9854014598540146,
202
- "grad_norm": 0.95703125,
203
- "learning_rate": 0.00019438833303083678,
204
- "loss": 0.9712,
205
  "step": 135
206
  },
207
  {
208
- "epoch": 1.0,
209
- "eval_loss": 2.307734489440918,
210
- "eval_runtime": 0.9962,
211
- "eval_samples_per_second": 5.019,
212
- "eval_steps_per_second": 2.008,
213
- "step": 137
214
- },
215
- {
216
- "epoch": 1.0218978102189782,
217
- "grad_norm": 2.125,
218
- "learning_rate": 0.00019351552846298025,
219
- "loss": 0.9374,
220
  "step": 140
221
  },
222
  {
223
- "epoch": 1.0583941605839415,
224
- "grad_norm": 2.265625,
225
- "learning_rate": 0.0001925819190020898,
226
- "loss": 0.9173,
227
  "step": 145
228
  },
229
  {
230
- "epoch": 1.094890510948905,
231
- "grad_norm": 0.828125,
232
- "learning_rate": 0.00019158811169198313,
233
- "loss": 0.8916,
234
  "step": 150
235
  },
236
  {
237
- "epoch": 1.1313868613138687,
238
- "grad_norm": 1.0703125,
239
- "learning_rate": 0.0001905347527178252,
240
- "loss": 0.9418,
241
  "step": 155
242
  },
243
  {
244
- "epoch": 1.167883211678832,
245
- "grad_norm": 0.9140625,
246
- "learning_rate": 0.00018942252698597113,
247
- "loss": 0.9054,
248
  "step": 160
249
  },
250
  {
251
- "epoch": 1.2043795620437956,
252
- "grad_norm": 2.0625,
253
- "learning_rate": 0.00018825215767863214,
254
- "loss": 0.9039,
255
  "step": 165
256
  },
257
  {
258
- "epoch": 1.2408759124087592,
259
- "grad_norm": 1.5859375,
260
- "learning_rate": 0.00018702440578365387,
261
- "loss": 0.9146,
262
  "step": 170
263
  },
264
  {
265
- "epoch": 1.2773722627737225,
266
- "grad_norm": 1.3515625,
267
- "learning_rate": 0.00018574006959971333,
268
- "loss": 0.8896,
269
  "step": 175
270
  },
271
  {
272
- "epoch": 1.313868613138686,
273
- "grad_norm": 2.09375,
274
- "learning_rate": 0.00018439998421725554,
275
- "loss": 0.8947,
276
  "step": 180
277
  },
278
  {
279
- "epoch": 1.3503649635036497,
280
- "grad_norm": 0.80078125,
281
- "learning_rate": 0.00018300502097550806,
282
- "loss": 0.881,
283
  "step": 185
284
  },
285
  {
286
- "epoch": 1.3868613138686132,
287
- "grad_norm": 0.80078125,
288
- "learning_rate": 0.00018155608689592604,
289
- "loss": 0.8906,
290
  "step": 190
291
  },
292
  {
293
- "epoch": 1.4233576642335766,
294
- "grad_norm": 0.80859375,
295
- "learning_rate": 0.00018005412409243606,
296
- "loss": 0.8939,
297
  "step": 195
298
  },
299
  {
300
- "epoch": 1.4598540145985401,
301
- "grad_norm": 1.0234375,
302
- "learning_rate": 0.0001785001091588628,
303
- "loss": 0.9016,
304
  "step": 200
305
  },
306
  {
307
- "epoch": 1.4963503649635037,
308
- "grad_norm": 0.70703125,
309
- "learning_rate": 0.0001768950525339362,
310
- "loss": 0.8943,
311
  "step": 205
312
  },
313
  {
314
- "epoch": 1.5328467153284673,
315
- "grad_norm": 1.2109375,
316
- "learning_rate": 0.00017523999784429238,
317
- "loss": 0.8614,
318
  "step": 210
319
  },
320
  {
321
- "epoch": 1.5693430656934306,
322
- "grad_norm": 0.7734375,
323
- "learning_rate": 0.00017353602122589527,
324
- "loss": 0.8788,
325
  "step": 215
326
  },
327
  {
328
- "epoch": 1.6058394160583942,
329
- "grad_norm": 0.82421875,
330
- "learning_rate": 0.0001717842306243205,
331
- "loss": 0.8833,
332
  "step": 220
333
  },
334
  {
335
- "epoch": 1.6423357664233578,
336
- "grad_norm": 0.84765625,
337
- "learning_rate": 0.00016998576507435618,
338
- "loss": 0.8713,
339
  "step": 225
340
  },
341
  {
342
- "epoch": 1.6788321167883211,
343
- "grad_norm": 1.234375,
344
- "learning_rate": 0.00016814179395938913,
345
- "loss": 0.8661,
346
  "step": 230
347
  },
348
  {
349
- "epoch": 1.7153284671532847,
350
- "grad_norm": 0.91015625,
351
- "learning_rate": 0.00016625351625105796,
352
- "loss": 0.8413,
353
  "step": 235
354
  },
355
  {
356
- "epoch": 1.7518248175182483,
357
- "grad_norm": 0.63671875,
358
- "learning_rate": 0.0001643221597296679,
359
- "loss": 0.8741,
360
  "step": 240
361
  },
362
  {
363
- "epoch": 1.7883211678832116,
364
- "grad_norm": 0.73046875,
365
- "learning_rate": 0.00016234898018587337,
366
- "loss": 0.8744,
367
  "step": 245
368
  },
369
  {
370
- "epoch": 1.8248175182481752,
371
- "grad_norm": 0.671875,
372
- "learning_rate": 0.00016033526060414842,
373
- "loss": 0.8517,
374
  "step": 250
375
  },
376
  {
377
- "epoch": 1.8613138686131387,
378
- "grad_norm": 1.0234375,
379
- "learning_rate": 0.00015828231032857503,
380
- "loss": 0.8899,
381
  "step": 255
382
  },
383
  {
384
- "epoch": 1.897810218978102,
385
- "grad_norm": 0.66796875,
386
- "learning_rate": 0.00015619146421149232,
387
- "loss": 0.8537,
388
  "step": 260
389
  },
390
  {
391
- "epoch": 1.9343065693430657,
392
- "grad_norm": 0.7109375,
393
- "learning_rate": 0.00015406408174555976,
394
- "loss": 0.8329,
395
  "step": 265
396
  },
397
  {
398
- "epoch": 1.9708029197080292,
399
- "grad_norm": 0.71875,
400
- "learning_rate": 0.00015190154617979938,
401
- "loss": 0.8675,
402
  "step": 270
403
  },
404
  {
405
- "epoch": 2.0,
406
- "eval_loss": 2.247941017150879,
407
- "eval_runtime": 0.9979,
408
- "eval_samples_per_second": 5.01,
409
- "eval_steps_per_second": 2.004,
410
- "step": 274
411
- },
412
- {
413
- "epoch": 2.0072992700729926,
414
- "grad_norm": 0.80859375,
415
- "learning_rate": 0.00014970526362019079,
416
- "loss": 0.8435,
417
  "step": 275
418
  },
419
  {
420
- "epoch": 2.0437956204379564,
421
- "grad_norm": 1.515625,
422
- "learning_rate": 0.00014747666211540459,
423
- "loss": 0.7774,
424
  "step": 280
425
  },
426
  {
427
- "epoch": 2.0802919708029197,
428
- "grad_norm": 1.0859375,
429
- "learning_rate": 0.00014521719072826858,
430
- "loss": 0.79,
431
  "step": 285
432
  },
433
  {
434
- "epoch": 2.116788321167883,
435
- "grad_norm": 0.498046875,
436
- "learning_rate": 0.00014292831859356997,
437
- "loss": 0.7929,
438
  "step": 290
439
  },
440
  {
441
- "epoch": 2.153284671532847,
442
- "grad_norm": 1.59375,
443
- "learning_rate": 0.00014061153396280674,
444
- "loss": 0.8032,
445
  "step": 295
446
  },
447
  {
448
- "epoch": 2.18978102189781,
449
- "grad_norm": 0.83203125,
450
- "learning_rate": 0.000138268343236509,
451
- "loss": 0.7932,
452
  "step": 300
453
  },
454
  {
455
- "epoch": 2.2262773722627736,
456
- "grad_norm": 0.734375,
457
- "learning_rate": 0.00013590026998475986,
458
- "loss": 0.7657,
459
  "step": 305
460
  },
461
  {
462
- "epoch": 2.2627737226277373,
463
- "grad_norm": 0.609375,
464
- "learning_rate": 0.0001335088539565523,
465
- "loss": 0.783,
466
  "step": 310
467
  },
468
  {
469
- "epoch": 2.2992700729927007,
470
- "grad_norm": 0.71484375,
471
- "learning_rate": 0.00013109565007862596,
472
- "loss": 0.7755,
473
  "step": 315
474
  },
475
  {
476
- "epoch": 2.335766423357664,
477
- "grad_norm": 0.609375,
478
- "learning_rate": 0.0001286622274444361,
479
- "loss": 0.7723,
480
  "step": 320
481
  },
482
  {
483
- "epoch": 2.372262773722628,
484
- "grad_norm": 1.3359375,
485
- "learning_rate": 0.00012621016829391022,
486
- "loss": 0.7739,
487
  "step": 325
488
  },
489
  {
490
- "epoch": 2.408759124087591,
491
- "grad_norm": 1.1328125,
492
- "learning_rate": 0.00012374106698465732,
493
- "loss": 0.7821,
494
  "step": 330
495
  },
496
  {
497
- "epoch": 2.445255474452555,
498
- "grad_norm": 0.91015625,
499
- "learning_rate": 0.00012125652895529766,
500
- "loss": 0.7852,
501
  "step": 335
502
  },
503
  {
504
- "epoch": 2.4817518248175183,
505
- "grad_norm": 0.74609375,
506
- "learning_rate": 0.00011875816968158815,
507
- "loss": 0.7792,
508
  "step": 340
509
  },
510
  {
511
- "epoch": 2.5182481751824817,
512
- "grad_norm": 0.625,
513
- "learning_rate": 0.00011624761362602061,
514
- "loss": 0.7799,
515
  "step": 345
516
  },
517
  {
518
- "epoch": 2.554744525547445,
519
- "grad_norm": 0.81640625,
520
- "learning_rate": 0.00011372649318157749,
521
- "loss": 0.7914,
522
  "step": 350
523
  },
524
  {
525
- "epoch": 2.591240875912409,
526
- "grad_norm": 0.80078125,
527
- "learning_rate": 0.00011119644761033078,
528
- "loss": 0.7847,
529
  "step": 355
530
  },
531
  {
532
- "epoch": 2.627737226277372,
533
- "grad_norm": 0.984375,
534
- "learning_rate": 0.0001086591219775746,
535
- "loss": 0.8049,
536
  "step": 360
537
  },
538
  {
539
- "epoch": 2.664233576642336,
540
- "grad_norm": 0.81640625,
541
- "learning_rate": 0.00010611616608218429,
542
- "loss": 0.7865,
543
  "step": 365
544
  },
545
  {
546
- "epoch": 2.7007299270072993,
547
- "grad_norm": 0.51953125,
548
- "learning_rate": 0.00010356923338389806,
549
- "loss": 0.7908,
550
  "step": 370
551
  },
552
  {
553
- "epoch": 2.7372262773722627,
554
- "grad_norm": 0.53125,
555
- "learning_rate": 0.00010101997992821797,
556
- "loss": 0.7925,
557
  "step": 375
558
  },
559
  {
560
- "epoch": 2.7737226277372264,
561
- "grad_norm": 0.49609375,
562
- "learning_rate": 9.847006326962974e-05,
563
- "loss": 0.799,
564
  "step": 380
565
  },
566
  {
567
- "epoch": 2.81021897810219,
568
- "grad_norm": 0.51171875,
569
- "learning_rate": 9.592114139384145e-05,
570
- "loss": 0.7832,
571
  "step": 385
572
  },
573
  {
574
- "epoch": 2.846715328467153,
575
- "grad_norm": 0.7109375,
576
- "learning_rate": 9.337487163974164e-05,
577
- "loss": 0.7796,
578
  "step": 390
579
  },
580
  {
581
- "epoch": 2.883211678832117,
582
- "grad_norm": 0.6328125,
583
- "learning_rate": 9.083290962177828e-05,
584
- "loss": 0.7839,
585
  "step": 395
586
  },
587
  {
588
- "epoch": 2.9197080291970803,
589
- "grad_norm": 0.59765625,
590
- "learning_rate": 8.829690815345886e-05,
591
- "loss": 0.7781,
592
  "step": 400
593
  },
594
  {
595
- "epoch": 2.9562043795620436,
596
- "grad_norm": 0.58203125,
597
- "learning_rate": 8.57685161726715e-05,
598
- "loss": 0.7457,
599
  "step": 405
600
  },
601
  {
602
- "epoch": 2.9927007299270074,
603
- "grad_norm": 0.6171875,
604
- "learning_rate": 8.324937766952638e-05,
605
- "loss": 0.7623,
606
  "step": 410
607
  },
608
  {
609
- "epoch": 3.0,
610
- "eval_loss": 2.275648355484009,
611
- "eval_runtime": 0.9945,
612
- "eval_samples_per_second": 5.028,
613
- "eval_steps_per_second": 2.011,
614
- "step": 411
615
- },
616
- {
617
- "epoch": 3.0291970802919708,
618
- "grad_norm": 0.8359375,
619
- "learning_rate": 8.074113061741397e-05,
620
- "loss": 0.7329,
621
  "step": 415
622
  },
623
  {
624
- "epoch": 3.065693430656934,
625
- "grad_norm": 0.50390625,
626
- "learning_rate": 7.824540590797568e-05,
627
- "loss": 0.7052,
628
  "step": 420
629
  },
630
  {
631
- "epoch": 3.102189781021898,
632
- "grad_norm": 0.5703125,
633
- "learning_rate": 7.576382629067877e-05,
634
- "loss": 0.7015,
635
  "step": 425
636
  },
637
  {
638
- "epoch": 3.1386861313868613,
639
- "grad_norm": 0.6015625,
640
- "learning_rate": 7.329800531768584e-05,
641
- "loss": 0.696,
642
  "step": 430
643
  },
644
  {
645
- "epoch": 3.1751824817518246,
646
- "grad_norm": 0.55078125,
647
- "learning_rate": 7.084954629470417e-05,
648
- "loss": 0.7154,
649
  "step": 435
650
  },
651
  {
652
- "epoch": 3.2116788321167884,
653
- "grad_norm": 0.59765625,
654
- "learning_rate": 6.842004123849752e-05,
655
- "loss": 0.7113,
656
  "step": 440
657
  },
658
  {
659
- "epoch": 3.2481751824817517,
660
- "grad_norm": 0.5625,
661
- "learning_rate": 6.601106984173835e-05,
662
- "loss": 0.7139,
663
  "step": 445
664
  },
665
  {
666
- "epoch": 3.2846715328467155,
667
- "grad_norm": 0.59765625,
668
- "learning_rate": 6.362419844587287e-05,
669
- "loss": 0.6967,
670
  "step": 450
671
  },
672
  {
673
- "epoch": 3.321167883211679,
674
- "grad_norm": 0.52734375,
675
- "learning_rate": 6.126097902266772e-05,
676
- "loss": 0.7073,
677
  "step": 455
678
  },
679
  {
680
- "epoch": 3.3576642335766422,
681
- "grad_norm": 0.5625,
682
- "learning_rate": 5.8922948165099524e-05,
683
- "loss": 0.6857,
684
  "step": 460
685
  },
686
  {
687
- "epoch": 3.394160583941606,
688
- "grad_norm": 0.55859375,
689
- "learning_rate": 5.6611626088244194e-05,
690
- "loss": 0.7199,
691
  "step": 465
692
  },
693
  {
694
- "epoch": 3.4306569343065694,
695
- "grad_norm": 0.58203125,
696
- "learning_rate": 5.432851564081534e-05,
697
- "loss": 0.7075,
698
  "step": 470
699
  },
700
  {
701
- "epoch": 3.4671532846715327,
702
- "grad_norm": 0.52734375,
703
- "learning_rate": 5.207510132799436e-05,
704
- "loss": 0.7006,
705
  "step": 475
706
  },
707
  {
708
- "epoch": 3.5036496350364965,
709
- "grad_norm": 0.53515625,
710
- "learning_rate": 4.9852848346187566e-05,
711
- "loss": 0.7151,
712
  "step": 480
713
  },
714
  {
715
- "epoch": 3.54014598540146,
716
- "grad_norm": 0.546875,
717
- "learning_rate": 4.7663201630338816e-05,
718
- "loss": 0.7129,
719
  "step": 485
720
  },
721
  {
722
- "epoch": 3.576642335766423,
723
- "grad_norm": 0.5859375,
724
- "learning_rate": 4.550758491441526e-05,
725
- "loss": 0.7139,
726
  "step": 490
727
  },
728
  {
729
- "epoch": 3.613138686131387,
730
- "grad_norm": 0.51953125,
731
- "learning_rate": 4.3387399805679255e-05,
732
- "loss": 0.7162,
733
  "step": 495
734
  },
735
  {
736
- "epoch": 3.6496350364963503,
737
- "grad_norm": 0.55859375,
738
- "learning_rate": 4.1304024873346705e-05,
739
- "loss": 0.7132,
740
  "step": 500
741
  },
742
  {
743
- "epoch": 3.686131386861314,
744
- "grad_norm": 0.57421875,
745
- "learning_rate": 3.9258814752225284e-05,
746
- "loss": 0.7007,
747
  "step": 505
748
  },
749
  {
750
- "epoch": 3.7226277372262775,
751
- "grad_norm": 0.546875,
752
- "learning_rate": 3.725309926191479e-05,
753
- "loss": 0.7037,
754
  "step": 510
755
  },
756
  {
757
- "epoch": 3.759124087591241,
758
- "grad_norm": 0.73828125,
759
- "learning_rate": 3.528818254214329e-05,
760
- "loss": 0.7255,
761
  "step": 515
762
  },
763
  {
764
- "epoch": 3.795620437956204,
765
- "grad_norm": 0.52734375,
766
- "learning_rate": 3.336534220479961e-05,
767
- "loss": 0.6966,
768
  "step": 520
769
  },
770
  {
771
- "epoch": 3.832116788321168,
772
- "grad_norm": 0.5078125,
773
- "learning_rate": 3.1485828503215585e-05,
774
- "loss": 0.7143,
775
  "step": 525
776
  },
777
  {
778
- "epoch": 3.8686131386861313,
779
- "grad_norm": 0.6328125,
780
- "learning_rate": 2.9650863519236418e-05,
781
- "loss": 0.7005,
782
  "step": 530
783
  },
784
  {
785
- "epoch": 3.905109489051095,
786
- "grad_norm": 0.5703125,
787
- "learning_rate": 2.7861640368608844e-05,
788
- "loss": 0.7005,
789
  "step": 535
790
  },
791
  {
792
- "epoch": 3.9416058394160585,
793
- "grad_norm": 0.53125,
794
- "learning_rate": 2.6119322425203197e-05,
795
- "loss": 0.7139,
796
  "step": 540
797
  },
798
  {
799
- "epoch": 3.978102189781022,
800
- "grad_norm": 0.51953125,
801
- "learning_rate": 2.4425042564574184e-05,
802
- "loss": 0.709,
803
  "step": 545
804
  },
805
  {
806
- "epoch": 4.0,
807
- "eval_loss": 2.341665267944336,
808
- "eval_runtime": 0.9977,
809
- "eval_samples_per_second": 5.012,
810
- "eval_steps_per_second": 2.005,
811
- "step": 548
812
  },
813
  {
814
- "epoch": 4.014598540145985,
815
- "grad_norm": 0.53515625,
816
- "learning_rate": 2.277990242735185e-05,
817
- "loss": 0.6801,
818
- "step": 550
819
  },
820
  {
821
- "epoch": 4.0510948905109485,
822
- "grad_norm": 0.52734375,
823
- "learning_rate": 2.118497170294195e-05,
824
- "loss": 0.6495,
 
825
  "step": 555
826
  },
827
  {
828
- "epoch": 4.087591240875913,
829
- "grad_norm": 0.5625,
830
- "learning_rate": 1.9641287434001355e-05,
831
- "loss": 0.672,
832
  "step": 560
833
  },
834
  {
835
- "epoch": 4.124087591240876,
836
- "grad_norm": 0.55078125,
837
- "learning_rate": 1.8149853342140645e-05,
838
- "loss": 0.6611,
839
  "step": 565
840
  },
841
  {
842
- "epoch": 4.160583941605839,
843
- "grad_norm": 0.59375,
844
- "learning_rate": 1.671163917529285e-05,
845
- "loss": 0.662,
846
  "step": 570
847
  },
848
  {
849
- "epoch": 4.197080291970803,
850
- "grad_norm": 0.51171875,
851
- "learning_rate": 1.5327580077171587e-05,
852
- "loss": 0.6635,
853
  "step": 575
854
  },
855
  {
856
- "epoch": 4.233576642335766,
857
- "grad_norm": 0.54296875,
858
- "learning_rate": 1.3998575979229944e-05,
859
- "loss": 0.6624,
860
  "step": 580
861
  },
862
  {
863
- "epoch": 4.2700729927007295,
864
- "grad_norm": 0.50390625,
865
- "learning_rate": 1.272549101551438e-05,
866
- "loss": 0.6523,
867
  "step": 585
868
  },
869
  {
870
- "epoch": 4.306569343065694,
871
- "grad_norm": 0.51171875,
872
- "learning_rate": 1.1509152960794666e-05,
873
- "loss": 0.6607,
874
  "step": 590
875
  },
876
  {
877
- "epoch": 4.343065693430657,
878
- "grad_norm": 0.546875,
879
- "learning_rate": 1.035035269233493e-05,
880
- "loss": 0.6626,
881
  "step": 595
882
  },
883
  {
884
- "epoch": 4.37956204379562,
885
- "grad_norm": 0.54296875,
886
- "learning_rate": 9.249843675656212e-06,
887
- "loss": 0.678,
888
  "step": 600
889
  },
890
  {
891
- "epoch": 4.416058394160584,
892
- "grad_norm": 0.5234375,
893
- "learning_rate": 8.208341474624071e-06,
894
- "loss": 0.6783,
895
  "step": 605
896
  },
897
  {
898
- "epoch": 4.452554744525547,
899
- "grad_norm": 0.53515625,
900
- "learning_rate": 7.226523286180776e-06,
901
- "loss": 0.6699,
902
  "step": 610
903
  },
904
  {
905
- "epoch": 4.489051094890511,
906
- "grad_norm": 0.5703125,
907
- "learning_rate": 6.3050275000238414e-06,
908
- "loss": 0.6607,
909
  "step": 615
910
  },
911
  {
912
- "epoch": 4.525547445255475,
913
- "grad_norm": 0.5234375,
914
- "learning_rate": 5.4444532835175144e-06,
915
- "loss": 0.6702,
916
  "step": 620
917
  },
918
  {
919
- "epoch": 4.562043795620438,
920
- "grad_norm": 0.5234375,
921
- "learning_rate": 4.6453601921072395e-06,
922
- "loss": 0.6793,
923
  "step": 625
924
  },
925
  {
926
- "epoch": 4.598540145985401,
927
- "grad_norm": 0.5234375,
928
- "learning_rate": 3.908267805490051e-06,
929
- "loss": 0.6622,
930
  "step": 630
931
  },
932
  {
933
- "epoch": 4.635036496350365,
934
- "grad_norm": 0.54296875,
935
- "learning_rate": 3.233655389777801e-06,
936
- "loss": 0.677,
937
  "step": 635
938
  },
939
  {
940
- "epoch": 4.671532846715328,
941
- "grad_norm": 0.5234375,
942
- "learning_rate": 2.62196158587269e-06,
943
- "loss": 0.6588,
944
  "step": 640
945
  },
946
  {
947
- "epoch": 4.708029197080292,
948
- "grad_norm": 0.5234375,
949
- "learning_rate": 2.073584124257899e-06,
950
- "loss": 0.6621,
951
  "step": 645
952
  },
953
  {
954
- "epoch": 4.744525547445256,
955
- "grad_norm": 0.53515625,
956
- "learning_rate": 1.5888795663883904e-06,
957
- "loss": 0.6655,
958
  "step": 650
959
  },
960
  {
961
- "epoch": 4.781021897810219,
962
- "grad_norm": 0.515625,
963
- "learning_rate": 1.1681630728506699e-06,
964
- "loss": 0.6653,
965
  "step": 655
966
  },
967
  {
968
- "epoch": 4.817518248175182,
969
- "grad_norm": 0.52734375,
970
- "learning_rate": 8.117081984415298e-07,
971
- "loss": 0.6734,
972
  "step": 660
973
  },
974
  {
975
- "epoch": 4.854014598540146,
976
- "grad_norm": 0.5390625,
977
- "learning_rate": 5.19746714299596e-07,
978
- "loss": 0.6541,
979
  "step": 665
980
  },
981
  {
982
- "epoch": 4.89051094890511,
983
- "grad_norm": 0.5390625,
984
- "learning_rate": 2.9246845720496407e-07,
985
- "loss": 0.6722,
986
  "step": 670
987
  },
988
  {
989
- "epoch": 4.927007299270073,
990
- "grad_norm": 0.55859375,
991
- "learning_rate": 1.300212061451367e-07,
992
- "loss": 0.6472,
993
  "step": 675
994
  },
995
  {
996
- "epoch": 4.963503649635037,
997
- "grad_norm": 0.51953125,
998
- "learning_rate": 3.251058622737446e-08,
999
- "loss": 0.667,
1000
  "step": 680
1001
  },
1002
  {
1003
- "epoch": 5.0,
1004
- "grad_norm": 0.52734375,
1005
- "learning_rate": 0.0,
1006
- "loss": 0.6601,
1007
  "step": 685
1008
  },
1009
  {
1010
- "epoch": 5.0,
1011
- "eval_loss": 2.3811252117156982,
1012
- "eval_runtime": 0.9953,
1013
- "eval_samples_per_second": 5.024,
1014
- "eval_steps_per_second": 2.01,
1015
- "step": 685
1016
  },
1017
  {
1018
- "epoch": 5.0,
1019
- "step": 685,
1020
- "total_flos": 1.0472781231601746e+18,
1021
- "train_loss": 2.151051264783762,
1022
- "train_runtime": 5341.9856,
1023
- "train_samples_per_second": 2.052,
1024
- "train_steps_per_second": 0.128
1025
  }
1026
  ],
1027
  "logging_steps": 5,
1028
- "max_steps": 685,
1029
  "num_input_tokens_seen": 0,
1030
- "num_train_epochs": 5,
1031
  "save_steps": 100,
1032
  "stateful_callbacks": {
1033
  "TrainerControl": {
@@ -1041,7 +1676,7 @@
1041
  "attributes": {}
1042
  }
1043
  },
1044
- "total_flos": 1.0472781231601746e+18,
1045
  "train_batch_size": 4,
1046
  "trial_name": null,
1047
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 10.0,
5
  "eval_steps": 500,
6
+ "global_step": 1110,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.009009009009009009,
13
+ "grad_norm": 608.0,
14
+ "learning_rate": 1.801801801801802e-06,
15
+ "loss": 58.5641,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.04504504504504504,
20
+ "grad_norm": 532.0,
21
+ "learning_rate": 9.00900900900901e-06,
22
+ "loss": 54.6181,
23
  "step": 5
24
  },
25
  {
26
+ "epoch": 0.09009009009009009,
27
+ "grad_norm": 446.0,
28
+ "learning_rate": 1.801801801801802e-05,
29
+ "loss": 50.0236,
30
  "step": 10
31
  },
32
  {
33
+ "epoch": 0.13513513513513514,
34
+ "grad_norm": 193.0,
35
+ "learning_rate": 2.702702702702703e-05,
36
+ "loss": 33.1549,
37
  "step": 15
38
  },
39
  {
40
+ "epoch": 0.18018018018018017,
41
+ "grad_norm": 44.5,
42
+ "learning_rate": 3.603603603603604e-05,
43
+ "loss": 25.2428,
44
  "step": 20
45
  },
46
  {
47
+ "epoch": 0.22522522522522523,
48
+ "grad_norm": 26.625,
49
+ "learning_rate": 4.5045045045045046e-05,
50
+ "loss": 22.4735,
51
  "step": 25
52
  },
53
  {
54
+ "epoch": 0.2702702702702703,
55
+ "grad_norm": 17.25,
56
+ "learning_rate": 5.405405405405406e-05,
57
+ "loss": 20.4661,
58
  "step": 30
59
  },
60
  {
61
+ "epoch": 0.3153153153153153,
62
+ "grad_norm": 7.6875,
63
+ "learning_rate": 6.306306306306306e-05,
64
+ "loss": 19.1401,
65
  "step": 35
66
  },
67
  {
68
+ "epoch": 0.36036036036036034,
69
+ "grad_norm": 11.6875,
70
+ "learning_rate": 7.207207207207208e-05,
71
+ "loss": 18.3188,
72
  "step": 40
73
  },
74
  {
75
+ "epoch": 0.40540540540540543,
76
+ "grad_norm": 23.5,
77
+ "learning_rate": 8.108108108108109e-05,
78
+ "loss": 16.7622,
79
  "step": 45
80
  },
81
  {
82
+ "epoch": 0.45045045045045046,
83
+ "grad_norm": 56.25,
84
+ "learning_rate": 9.009009009009009e-05,
85
+ "loss": 12.6183,
86
  "step": 50
87
  },
88
  {
89
+ "epoch": 0.4954954954954955,
90
+ "grad_norm": 13.0,
91
+ "learning_rate": 9.90990990990991e-05,
92
+ "loss": 4.3593,
93
  "step": 55
94
  },
95
  {
96
+ "epoch": 0.5405405405405406,
97
+ "grad_norm": 3.875,
98
+ "learning_rate": 0.00010810810810810812,
99
+ "loss": 2.18,
100
  "step": 60
101
  },
102
  {
103
+ "epoch": 0.5855855855855856,
104
+ "grad_norm": 2.421875,
105
+ "learning_rate": 0.00011711711711711712,
106
+ "loss": 1.8179,
107
  "step": 65
108
  },
109
  {
110
+ "epoch": 0.6306306306306306,
111
+ "grad_norm": 3.265625,
112
+ "learning_rate": 0.00012612612612612612,
113
+ "loss": 1.5974,
114
  "step": 70
115
  },
116
  {
117
+ "epoch": 0.6756756756756757,
118
+ "grad_norm": 2.375,
119
+ "learning_rate": 0.00013513513513513514,
120
+ "loss": 1.486,
121
  "step": 75
122
  },
123
  {
124
+ "epoch": 0.7207207207207207,
125
+ "grad_norm": 1.5078125,
126
+ "learning_rate": 0.00014414414414414415,
127
+ "loss": 1.361,
128
  "step": 80
129
  },
130
  {
131
+ "epoch": 0.7657657657657657,
132
+ "grad_norm": 3.890625,
133
+ "learning_rate": 0.00015315315315315314,
134
+ "loss": 1.3001,
135
  "step": 85
136
  },
137
  {
138
+ "epoch": 0.8108108108108109,
139
+ "grad_norm": 4.46875,
140
+ "learning_rate": 0.00016216216216216218,
141
+ "loss": 1.261,
142
  "step": 90
143
  },
144
  {
145
+ "epoch": 0.8558558558558559,
146
+ "grad_norm": 5.03125,
147
+ "learning_rate": 0.0001711711711711712,
148
+ "loss": 1.2015,
149
  "step": 95
150
  },
151
  {
152
+ "epoch": 0.9009009009009009,
153
+ "grad_norm": 32.25,
154
+ "learning_rate": 0.00018018018018018018,
155
+ "loss": 1.1886,
156
  "step": 100
157
  },
158
  {
159
+ "epoch": 0.9459459459459459,
160
+ "grad_norm": 1.703125,
161
+ "learning_rate": 0.0001891891891891892,
162
+ "loss": 1.1679,
163
  "step": 105
164
  },
165
  {
166
+ "epoch": 0.990990990990991,
167
+ "grad_norm": 2.984375,
168
+ "learning_rate": 0.0001981981981981982,
169
+ "loss": 1.1572,
170
  "step": 110
171
  },
172
  {
173
+ "epoch": 1.0,
174
+ "eval_loss": 2.307225465774536,
175
+ "eval_runtime": 1.0056,
176
+ "eval_samples_per_second": 4.972,
177
+ "eval_steps_per_second": 1.989,
178
+ "step": 111
179
+ },
180
+ {
181
+ "epoch": 1.0360360360360361,
182
+ "grad_norm": 1.546875,
183
+ "learning_rate": 0.00019999208860571255,
184
+ "loss": 1.0473,
185
  "step": 115
186
  },
187
  {
188
+ "epoch": 1.0810810810810811,
189
+ "grad_norm": 1.546875,
190
+ "learning_rate": 0.0001999599507118322,
191
+ "loss": 1.0618,
192
  "step": 120
193
  },
194
  {
195
+ "epoch": 1.1261261261261262,
196
+ "grad_norm": 10.0,
197
+ "learning_rate": 0.00019990309979553045,
198
+ "loss": 1.0458,
199
  "step": 125
200
  },
201
  {
202
+ "epoch": 1.1711711711711712,
203
+ "grad_norm": 8.4375,
204
+ "learning_rate": 0.00019982154991201608,
205
+ "loss": 1.0364,
206
  "step": 130
207
  },
208
  {
209
+ "epoch": 1.2162162162162162,
210
+ "grad_norm": 2.0,
211
+ "learning_rate": 0.00019971532122280464,
212
+ "loss": 1.0457,
213
  "step": 135
214
  },
215
  {
216
+ "epoch": 1.2612612612612613,
217
+ "grad_norm": 1.4453125,
218
+ "learning_rate": 0.00019958443999073397,
219
+ "loss": 0.9906,
220
  "step": 140
221
  },
222
  {
223
+ "epoch": 1.3063063063063063,
224
+ "grad_norm": 18.25,
225
+ "learning_rate": 0.00019942893857347128,
226
+ "loss": 0.9911,
227
  "step": 145
228
  },
229
  {
230
+ "epoch": 1.3513513513513513,
231
+ "grad_norm": 2.578125,
232
+ "learning_rate": 0.0001992488554155135,
233
+ "loss": 0.9996,
234
  "step": 150
235
  },
236
  {
237
+ "epoch": 1.3963963963963963,
238
+ "grad_norm": 1.7734375,
239
+ "learning_rate": 0.00019904423503868247,
240
+ "loss": 0.9656,
241
  "step": 155
242
  },
243
  {
244
+ "epoch": 1.4414414414414414,
245
+ "grad_norm": 5.65625,
246
+ "learning_rate": 0.00019881512803111796,
247
+ "loss": 0.9753,
248
  "step": 160
249
  },
250
  {
251
+ "epoch": 1.4864864864864864,
252
+ "grad_norm": 3.78125,
253
+ "learning_rate": 0.00019856159103477086,
254
+ "loss": 0.9239,
255
  "step": 165
256
  },
257
  {
258
+ "epoch": 1.5315315315315314,
259
+ "grad_norm": 0.86328125,
260
+ "learning_rate": 0.00019828368673139947,
261
+ "loss": 0.9428,
262
  "step": 170
263
  },
264
  {
265
+ "epoch": 1.5765765765765765,
266
+ "grad_norm": 0.7265625,
267
+ "learning_rate": 0.00019798148382707296,
268
+ "loss": 0.9455,
269
  "step": 175
270
  },
271
  {
272
+ "epoch": 1.6216216216216215,
273
+ "grad_norm": 1.8125,
274
+ "learning_rate": 0.00019765505703518496,
275
+ "loss": 0.9373,
276
  "step": 180
277
  },
278
  {
279
+ "epoch": 1.6666666666666665,
280
+ "grad_norm": 1.0078125,
281
+ "learning_rate": 0.00019730448705798239,
282
+ "loss": 0.9659,
283
  "step": 185
284
  },
285
  {
286
+ "epoch": 1.7117117117117115,
287
+ "grad_norm": 3.859375,
288
+ "learning_rate": 0.00019692986056661356,
289
+ "loss": 0.9271,
290
  "step": 190
291
  },
292
  {
293
+ "epoch": 1.7567567567567568,
294
+ "grad_norm": 3.34375,
295
+ "learning_rate": 0.00019653127017970034,
296
+ "loss": 0.9303,
297
  "step": 195
298
  },
299
  {
300
+ "epoch": 1.8018018018018018,
301
+ "grad_norm": 0.91796875,
302
+ "learning_rate": 0.0001961088144404403,
303
+ "loss": 0.9333,
304
  "step": 200
305
  },
306
  {
307
+ "epoch": 1.8468468468468469,
308
+ "grad_norm": 1.4453125,
309
+ "learning_rate": 0.00019566259779224378,
310
+ "loss": 0.8923,
311
  "step": 205
312
  },
313
  {
314
+ "epoch": 1.8918918918918919,
315
+ "grad_norm": 3.171875,
316
+ "learning_rate": 0.00019519273055291266,
317
+ "loss": 0.9,
318
  "step": 210
319
  },
320
  {
321
+ "epoch": 1.936936936936937,
322
+ "grad_norm": 14.3125,
323
+ "learning_rate": 0.00019469932888736632,
324
+ "loss": 0.8988,
325
  "step": 215
326
  },
327
  {
328
+ "epoch": 1.981981981981982,
329
+ "grad_norm": 3.46875,
330
+ "learning_rate": 0.0001941825147789225,
331
+ "loss": 0.9296,
332
  "step": 220
333
  },
334
  {
335
+ "epoch": 2.0,
336
+ "eval_loss": 2.178852081298828,
337
+ "eval_runtime": 1.0053,
338
+ "eval_samples_per_second": 4.973,
339
+ "eval_steps_per_second": 1.989,
340
+ "step": 222
341
+ },
342
+ {
343
+ "epoch": 2.027027027027027,
344
+ "grad_norm": 1.4921875,
345
+ "learning_rate": 0.00019364241599913924,
346
+ "loss": 0.8696,
347
  "step": 225
348
  },
349
  {
350
+ "epoch": 2.0720720720720722,
351
+ "grad_norm": 2.65625,
352
+ "learning_rate": 0.0001930791660762262,
353
+ "loss": 0.8363,
354
  "step": 230
355
  },
356
  {
357
+ "epoch": 2.1171171171171173,
358
+ "grad_norm": 1.265625,
359
+ "learning_rate": 0.00019249290426203252,
360
+ "loss": 0.821,
361
  "step": 235
362
  },
363
  {
364
+ "epoch": 2.1621621621621623,
365
+ "grad_norm": 2.546875,
366
+ "learning_rate": 0.00019188377549761963,
367
+ "loss": 0.8511,
368
  "step": 240
369
  },
370
  {
371
+ "epoch": 2.2072072072072073,
372
+ "grad_norm": 0.828125,
373
+ "learning_rate": 0.0001912519303774276,
374
+ "loss": 0.8231,
375
  "step": 245
376
  },
377
  {
378
+ "epoch": 2.2522522522522523,
379
+ "grad_norm": 0.73046875,
380
+ "learning_rate": 0.000190597525112044,
381
+ "loss": 0.8496,
382
  "step": 250
383
  },
384
  {
385
+ "epoch": 2.2972972972972974,
386
+ "grad_norm": 1.2421875,
387
+ "learning_rate": 0.00018992072148958368,
388
+ "loss": 0.852,
389
  "step": 255
390
  },
391
  {
392
+ "epoch": 2.3423423423423424,
393
+ "grad_norm": 1.578125,
394
+ "learning_rate": 0.0001892216868356904,
395
+ "loss": 0.8131,
396
  "step": 260
397
  },
398
  {
399
+ "epoch": 2.3873873873873874,
400
+ "grad_norm": 1.5078125,
401
+ "learning_rate": 0.00018850059397216876,
402
+ "loss": 0.8483,
403
  "step": 265
404
  },
405
  {
406
+ "epoch": 2.4324324324324325,
407
+ "grad_norm": 1.125,
408
+ "learning_rate": 0.00018775762117425777,
409
+ "loss": 0.8432,
410
  "step": 270
411
  },
412
  {
413
+ "epoch": 2.4774774774774775,
414
+ "grad_norm": 0.6015625,
415
+ "learning_rate": 0.00018699295212655596,
416
+ "loss": 0.8493,
417
  "step": 275
418
  },
419
  {
420
+ "epoch": 2.5225225225225225,
421
+ "grad_norm": 0.8828125,
422
+ "learning_rate": 0.00018620677587760916,
423
+ "loss": 0.7998,
424
  "step": 280
425
  },
426
  {
427
+ "epoch": 2.5675675675675675,
428
+ "grad_norm": 0.73046875,
429
+ "learning_rate": 0.0001853992867931721,
430
+ "loss": 0.8256,
431
  "step": 285
432
  },
433
  {
434
+ "epoch": 2.6126126126126126,
435
+ "grad_norm": 0.6796875,
436
+ "learning_rate": 0.00018457068450815562,
437
+ "loss": 0.8162,
438
  "step": 290
439
  },
440
  {
441
+ "epoch": 2.6576576576576576,
442
+ "grad_norm": 0.671875,
443
+ "learning_rate": 0.0001837211738772711,
444
+ "loss": 0.8338,
445
  "step": 295
446
  },
447
  {
448
+ "epoch": 2.7027027027027026,
449
+ "grad_norm": 0.9140625,
450
+ "learning_rate": 0.00018285096492438424,
451
+ "loss": 0.8279,
452
  "step": 300
453
  },
454
  {
455
+ "epoch": 2.7477477477477477,
456
+ "grad_norm": 0.60546875,
457
+ "learning_rate": 0.00018196027279059117,
458
+ "loss": 0.7962,
459
  "step": 305
460
  },
461
  {
462
+ "epoch": 2.7927927927927927,
463
+ "grad_norm": 2.78125,
464
+ "learning_rate": 0.0001810493176810292,
465
+ "loss": 0.8192,
466
  "step": 310
467
  },
468
  {
469
+ "epoch": 2.8378378378378377,
470
+ "grad_norm": 0.63671875,
471
+ "learning_rate": 0.00018011832481043576,
472
+ "loss": 0.8147,
473
  "step": 315
474
  },
475
  {
476
+ "epoch": 2.8828828828828827,
477
+ "grad_norm": 0.56640625,
478
+ "learning_rate": 0.00017916752434746856,
479
+ "loss": 0.8255,
480
  "step": 320
481
  },
482
  {
483
+ "epoch": 2.9279279279279278,
484
+ "grad_norm": 1.8046875,
485
+ "learning_rate": 0.0001781971513578013,
486
+ "loss": 0.8059,
487
  "step": 325
488
  },
489
  {
490
+ "epoch": 2.972972972972973,
491
+ "grad_norm": 1.1640625,
492
+ "learning_rate": 0.00017720744574600863,
493
+ "loss": 0.8273,
494
  "step": 330
495
  },
496
  {
497
+ "epoch": 3.0,
498
+ "eval_loss": 2.1709225177764893,
499
+ "eval_runtime": 1.0054,
500
+ "eval_samples_per_second": 4.973,
501
+ "eval_steps_per_second": 1.989,
502
+ "step": 333
503
+ },
504
+ {
505
+ "epoch": 3.018018018018018,
506
+ "grad_norm": 0.83984375,
507
+ "learning_rate": 0.00017619865219625452,
508
+ "loss": 0.7934,
509
  "step": 335
510
  },
511
  {
512
+ "epoch": 3.063063063063063,
513
+ "grad_norm": 1.5859375,
514
+ "learning_rate": 0.00017517102011179933,
515
+ "loss": 0.7096,
516
  "step": 340
517
  },
518
  {
519
+ "epoch": 3.108108108108108,
520
+ "grad_norm": 0.91796875,
521
+ "learning_rate": 0.00017412480355334005,
522
+ "loss": 0.7203,
523
  "step": 345
524
  },
525
  {
526
+ "epoch": 3.153153153153153,
527
+ "grad_norm": 1.9296875,
528
+ "learning_rate": 0.00017306026117619889,
529
+ "loss": 0.7237,
530
  "step": 350
531
  },
532
  {
533
+ "epoch": 3.1981981981981984,
534
+ "grad_norm": 1.6328125,
535
+ "learning_rate": 0.00017197765616637636,
536
+ "loss": 0.738,
537
  "step": 355
538
  },
539
  {
540
+ "epoch": 3.2432432432432434,
541
+ "grad_norm": 2.0625,
542
+ "learning_rate": 0.00017087725617548385,
543
+ "loss": 0.7214,
544
  "step": 360
545
  },
546
  {
547
+ "epoch": 3.2882882882882885,
548
+ "grad_norm": 3.53125,
549
+ "learning_rate": 0.0001697593332545723,
550
+ "loss": 0.7549,
551
  "step": 365
552
  },
553
  {
554
+ "epoch": 3.3333333333333335,
555
+ "grad_norm": 4.25,
556
+ "learning_rate": 0.0001686241637868734,
557
+ "loss": 0.7575,
558
  "step": 370
559
  },
560
  {
561
+ "epoch": 3.3783783783783785,
562
+ "grad_norm": 1.6484375,
563
+ "learning_rate": 0.00016747202841946928,
564
+ "loss": 0.7392,
565
  "step": 375
566
  },
567
  {
568
+ "epoch": 3.4234234234234235,
569
+ "grad_norm": 1.6171875,
570
+ "learning_rate": 0.00016630321199390867,
571
+ "loss": 0.7251,
572
  "step": 380
573
  },
574
  {
575
+ "epoch": 3.4684684684684686,
576
+ "grad_norm": 1.5546875,
577
+ "learning_rate": 0.0001651180034757856,
578
+ "loss": 0.7285,
579
  "step": 385
580
  },
581
  {
582
+ "epoch": 3.5135135135135136,
583
+ "grad_norm": 1.0078125,
584
+ "learning_rate": 0.0001639166958832985,
585
+ "loss": 0.7114,
586
  "step": 390
587
  },
588
  {
589
+ "epoch": 3.5585585585585586,
590
+ "grad_norm": 1.2421875,
591
+ "learning_rate": 0.00016269958621480788,
592
+ "loss": 0.7223,
593
  "step": 395
594
  },
595
  {
596
+ "epoch": 3.6036036036036037,
597
+ "grad_norm": 0.61328125,
598
+ "learning_rate": 0.00016146697537540924,
599
+ "loss": 0.7273,
600
  "step": 400
601
  },
602
  {
603
+ "epoch": 3.6486486486486487,
604
+ "grad_norm": 0.67578125,
605
+ "learning_rate": 0.00016021916810254097,
606
+ "loss": 0.7328,
607
  "step": 405
608
  },
609
  {
610
+ "epoch": 3.6936936936936937,
611
+ "grad_norm": 1.1015625,
612
+ "learning_rate": 0.00015895647289064396,
613
+ "loss": 0.7409,
614
  "step": 410
615
  },
616
  {
617
+ "epoch": 3.7387387387387387,
618
+ "grad_norm": 0.7578125,
619
+ "learning_rate": 0.000157679201914893,
620
+ "loss": 0.7247,
621
  "step": 415
622
  },
623
  {
624
+ "epoch": 3.7837837837837838,
625
+ "grad_norm": 1.890625,
626
+ "learning_rate": 0.0001563876709540178,
627
+ "loss": 0.7446,
628
  "step": 420
629
  },
630
  {
631
+ "epoch": 3.828828828828829,
632
+ "grad_norm": 0.7109375,
633
+ "learning_rate": 0.0001550821993122334,
634
+ "loss": 0.7421,
635
  "step": 425
636
  },
637
  {
638
+ "epoch": 3.873873873873874,
639
+ "grad_norm": 0.73046875,
640
+ "learning_rate": 0.00015376310974029873,
641
+ "loss": 0.7362,
642
  "step": 430
643
  },
644
  {
645
+ "epoch": 3.918918918918919,
646
+ "grad_norm": 0.66015625,
647
+ "learning_rate": 0.00015243072835572318,
648
+ "loss": 0.7398,
649
  "step": 435
650
  },
651
  {
652
+ "epoch": 3.963963963963964,
653
+ "grad_norm": 0.69921875,
654
+ "learning_rate": 0.0001510853845621409,
655
+ "loss": 0.7586,
656
  "step": 440
657
  },
658
  {
659
+ "epoch": 4.0,
660
+ "eval_loss": 2.2163968086242676,
661
+ "eval_runtime": 1.0061,
662
+ "eval_samples_per_second": 4.97,
663
+ "eval_steps_per_second": 1.988,
664
+ "step": 444
665
+ },
666
+ {
667
+ "epoch": 4.009009009009009,
668
+ "grad_norm": 0.58203125,
669
+ "learning_rate": 0.00014972741096787242,
670
+ "loss": 0.7128,
671
  "step": 445
672
  },
673
  {
674
+ "epoch": 4.054054054054054,
675
+ "grad_norm": 0.75,
676
+ "learning_rate": 0.00014835714330369446,
677
+ "loss": 0.6463,
678
  "step": 450
679
  },
680
  {
681
+ "epoch": 4.099099099099099,
682
+ "grad_norm": 0.83203125,
683
+ "learning_rate": 0.00014697492033983707,
684
+ "loss": 0.6453,
685
  "step": 455
686
  },
687
  {
688
+ "epoch": 4.1441441441441444,
689
+ "grad_norm": 0.55859375,
690
+ "learning_rate": 0.00014558108380223012,
691
+ "loss": 0.647,
692
  "step": 460
693
  },
694
  {
695
+ "epoch": 4.1891891891891895,
696
+ "grad_norm": 1.3125,
697
+ "learning_rate": 0.00014417597828801832,
698
+ "loss": 0.626,
699
  "step": 465
700
  },
701
  {
702
+ "epoch": 4.2342342342342345,
703
+ "grad_norm": 0.85546875,
704
+ "learning_rate": 0.00014275995118036693,
705
+ "loss": 0.6334,
706
  "step": 470
707
  },
708
  {
709
+ "epoch": 4.2792792792792795,
710
+ "grad_norm": 0.69921875,
711
+ "learning_rate": 0.0001413333525625784,
712
+ "loss": 0.6435,
713
  "step": 475
714
  },
715
  {
716
+ "epoch": 4.324324324324325,
717
+ "grad_norm": 0.8046875,
718
+ "learning_rate": 0.00013989653513154165,
719
+ "loss": 0.6439,
720
  "step": 480
721
  },
722
  {
723
+ "epoch": 4.36936936936937,
724
+ "grad_norm": 1.0859375,
725
+ "learning_rate": 0.00013844985411053492,
726
+ "loss": 0.6559,
727
  "step": 485
728
  },
729
  {
730
+ "epoch": 4.414414414414415,
731
+ "grad_norm": 1.3359375,
732
+ "learning_rate": 0.00013699366716140435,
733
+ "loss": 0.6654,
734
  "step": 490
735
  },
736
  {
737
+ "epoch": 4.45945945945946,
738
+ "grad_norm": 0.80859375,
739
+ "learning_rate": 0.00013552833429613938,
740
+ "loss": 0.6783,
741
  "step": 495
742
  },
743
  {
744
+ "epoch": 4.504504504504505,
745
+ "grad_norm": 0.6875,
746
+ "learning_rate": 0.00013405421778786737,
747
+ "loss": 0.6543,
748
  "step": 500
749
  },
750
  {
751
+ "epoch": 4.54954954954955,
752
+ "grad_norm": 0.62890625,
753
+ "learning_rate": 0.00013257168208128908,
754
+ "loss": 0.6608,
755
  "step": 505
756
  },
757
  {
758
+ "epoch": 4.594594594594595,
759
+ "grad_norm": 0.60546875,
760
+ "learning_rate": 0.00013108109370257712,
761
+ "loss": 0.6621,
762
  "step": 510
763
  },
764
  {
765
+ "epoch": 4.63963963963964,
766
+ "grad_norm": 0.67578125,
767
+ "learning_rate": 0.00012958282116876026,
768
+ "loss": 0.656,
769
  "step": 515
770
  },
771
  {
772
+ "epoch": 4.684684684684685,
773
+ "grad_norm": 0.65234375,
774
+ "learning_rate": 0.00012807723489661495,
775
+ "loss": 0.6505,
776
  "step": 520
777
  },
778
  {
779
+ "epoch": 4.72972972972973,
780
+ "grad_norm": 0.921875,
781
+ "learning_rate": 0.00012656470711108764,
782
+ "loss": 0.6789,
783
  "step": 525
784
  },
785
  {
786
+ "epoch": 4.774774774774775,
787
+ "grad_norm": 0.61328125,
788
+ "learning_rate": 0.00012504561175326985,
789
+ "loss": 0.6588,
790
  "step": 530
791
  },
792
  {
793
+ "epoch": 4.81981981981982,
794
+ "grad_norm": 0.703125,
795
+ "learning_rate": 0.00012352032438794902,
796
+ "loss": 0.6534,
797
  "step": 535
798
  },
799
  {
800
+ "epoch": 4.864864864864865,
801
+ "grad_norm": 0.74609375,
802
+ "learning_rate": 0.00012198922211075778,
803
+ "loss": 0.6482,
804
  "step": 540
805
  },
806
  {
807
+ "epoch": 4.90990990990991,
808
+ "grad_norm": 0.94140625,
809
+ "learning_rate": 0.00012045268345494511,
810
+ "loss": 0.6595,
811
  "step": 545
812
  },
813
  {
814
+ "epoch": 4.954954954954955,
815
+ "grad_norm": 0.59765625,
816
+ "learning_rate": 0.00011891108829779165,
817
+ "loss": 0.6624,
818
+ "step": 550
 
819
  },
820
  {
821
+ "epoch": 5.0,
822
+ "grad_norm": 0.578125,
823
+ "learning_rate": 0.00011736481776669306,
824
+ "loss": 0.6613,
825
+ "step": 555
826
  },
827
  {
828
+ "epoch": 5.0,
829
+ "eval_loss": 2.3182225227355957,
830
+ "eval_runtime": 1.0028,
831
+ "eval_samples_per_second": 4.986,
832
+ "eval_steps_per_second": 1.994,
833
  "step": 555
834
  },
835
  {
836
+ "epoch": 5.045045045045045,
837
+ "grad_norm": 0.98046875,
838
+ "learning_rate": 0.0001158142541449341,
839
+ "loss": 0.5564,
840
  "step": 560
841
  },
842
  {
843
+ "epoch": 5.09009009009009,
844
+ "grad_norm": 0.69140625,
845
+ "learning_rate": 0.00011425978077717709,
846
+ "loss": 0.5273,
847
  "step": 565
848
  },
849
  {
850
+ "epoch": 5.135135135135135,
851
+ "grad_norm": 0.69921875,
852
+ "learning_rate": 0.00011270178197468789,
853
+ "loss": 0.5589,
854
  "step": 570
855
  },
856
  {
857
+ "epoch": 5.18018018018018,
858
+ "grad_norm": 0.6171875,
859
+ "learning_rate": 0.00011114064292032282,
860
+ "loss": 0.5593,
861
  "step": 575
862
  },
863
  {
864
+ "epoch": 5.225225225225225,
865
+ "grad_norm": 0.69921875,
866
+ "learning_rate": 0.00010957674957330042,
867
+ "loss": 0.5672,
868
  "step": 580
869
  },
870
  {
871
+ "epoch": 5.27027027027027,
872
+ "grad_norm": 0.69140625,
873
+ "learning_rate": 0.00010801048857378071,
874
+ "loss": 0.5444,
875
  "step": 585
876
  },
877
  {
878
+ "epoch": 5.315315315315315,
879
+ "grad_norm": 0.66796875,
880
+ "learning_rate": 0.00010644224714727681,
881
+ "loss": 0.5747,
882
  "step": 590
883
  },
884
  {
885
+ "epoch": 5.36036036036036,
886
+ "grad_norm": 0.68359375,
887
+ "learning_rate": 0.0001048724130089212,
888
+ "loss": 0.5609,
889
  "step": 595
890
  },
891
  {
892
+ "epoch": 5.405405405405405,
893
+ "grad_norm": 0.8984375,
894
+ "learning_rate": 0.00010330137426761135,
895
+ "loss": 0.5625,
896
  "step": 600
897
  },
898
  {
899
+ "epoch": 5.45045045045045,
900
+ "grad_norm": 0.76171875,
901
+ "learning_rate": 0.00010172951933005775,
902
+ "loss": 0.5671,
903
  "step": 605
904
  },
905
  {
906
+ "epoch": 5.495495495495495,
907
+ "grad_norm": 0.80859375,
908
+ "learning_rate": 0.00010015723680475846,
909
+ "loss": 0.564,
910
  "step": 610
911
  },
912
  {
913
+ "epoch": 5.54054054054054,
914
+ "grad_norm": 0.76171875,
915
+ "learning_rate": 9.858491540592382e-05,
916
+ "loss": 0.5784,
917
  "step": 615
918
  },
919
  {
920
+ "epoch": 5.585585585585585,
921
+ "grad_norm": 0.7265625,
922
+ "learning_rate": 9.70129438573747e-05,
923
+ "loss": 0.5672,
924
  "step": 620
925
  },
926
  {
927
+ "epoch": 5.63063063063063,
928
+ "grad_norm": 0.75390625,
929
+ "learning_rate": 9.54417107964389e-05,
930
+ "loss": 0.5592,
931
  "step": 625
932
  },
933
  {
934
+ "epoch": 5.675675675675675,
935
+ "grad_norm": 0.734375,
936
+ "learning_rate": 9.38716046778684e-05,
937
+ "loss": 0.5634,
938
  "step": 630
939
  },
940
  {
941
+ "epoch": 5.7207207207207205,
942
+ "grad_norm": 0.6640625,
943
+ "learning_rate": 9.230301367780208e-05,
944
+ "loss": 0.5691,
945
  "step": 635
946
  },
947
  {
948
+ "epoch": 5.7657657657657655,
949
+ "grad_norm": 0.6875,
950
+ "learning_rate": 9.07363255977973e-05,
951
+ "loss": 0.5722,
952
  "step": 640
953
  },
954
  {
955
+ "epoch": 5.8108108108108105,
956
+ "grad_norm": 0.76953125,
957
+ "learning_rate": 8.917192776895382e-05,
958
+ "loss": 0.5827,
959
  "step": 645
960
  },
961
  {
962
+ "epoch": 5.8558558558558556,
963
+ "grad_norm": 0.83203125,
964
+ "learning_rate": 8.76102069561545e-05,
965
+ "loss": 0.5745,
966
  "step": 650
967
  },
968
  {
969
+ "epoch": 5.900900900900901,
970
+ "grad_norm": 0.7265625,
971
+ "learning_rate": 8.605154926244543e-05,
972
+ "loss": 0.5614,
973
  "step": 655
974
  },
975
  {
976
+ "epoch": 5.945945945945946,
977
+ "grad_norm": 0.65625,
978
+ "learning_rate": 8.449634003358022e-05,
979
+ "loss": 0.5731,
980
  "step": 660
981
  },
982
  {
983
+ "epoch": 5.990990990990991,
984
+ "grad_norm": 0.8828125,
985
+ "learning_rate": 8.294496376275104e-05,
986
+ "loss": 0.577,
987
  "step": 665
988
  },
989
  {
990
+ "epoch": 6.0,
991
+ "eval_loss": 2.4773526191711426,
992
+ "eval_runtime": 1.0034,
993
+ "eval_samples_per_second": 4.983,
994
+ "eval_steps_per_second": 1.993,
995
+ "step": 666
996
+ },
997
+ {
998
+ "epoch": 6.036036036036036,
999
+ "grad_norm": 0.8984375,
1000
+ "learning_rate": 8.13978039955308e-05,
1001
+ "loss": 0.5142,
1002
  "step": 670
1003
  },
1004
  {
1005
+ "epoch": 6.081081081081081,
1006
+ "grad_norm": 0.8359375,
1007
+ "learning_rate": 7.985524323504948e-05,
1008
+ "loss": 0.4725,
1009
  "step": 675
1010
  },
1011
  {
1012
+ "epoch": 6.126126126126126,
1013
+ "grad_norm": 0.7734375,
1014
+ "learning_rate": 7.831766284742807e-05,
1015
+ "loss": 0.4671,
1016
  "step": 680
1017
  },
1018
  {
1019
+ "epoch": 6.171171171171171,
1020
+ "grad_norm": 0.7578125,
1021
+ "learning_rate": 7.678544296749384e-05,
1022
+ "loss": 0.4804,
1023
  "step": 685
1024
  },
1025
  {
1026
+ "epoch": 6.216216216216216,
1027
+ "grad_norm": 0.82421875,
1028
+ "learning_rate": 7.525896240479976e-05,
1029
+ "loss": 0.4704,
1030
+ "step": 690
 
1031
  },
1032
  {
+ "epoch": 6.261261261261261,
+ "grad_norm": 0.75,
+ "learning_rate": 7.37385985499718e-05,
+ "loss": 0.4659,
+ "step": 695
+ },
+ {
+ "epoch": 6.306306306306306,
+ "grad_norm": 0.71484375,
+ "learning_rate": 7.222472728140695e-05,
+ "loss": 0.4697,
+ "step": 700
+ },
+ {
+ "epoch": 6.351351351351352,
+ "grad_norm": 0.79296875,
+ "learning_rate": 7.071772287234497e-05,
+ "loss": 0.4912,
+ "step": 705
+ },
+ {
+ "epoch": 6.396396396396397,
+ "grad_norm": 0.76953125,
+ "learning_rate": 6.921795789833723e-05,
+ "loss": 0.4689,
+ "step": 710
+ },
+ {
+ "epoch": 6.441441441441442,
+ "grad_norm": 0.66796875,
+ "learning_rate": 6.772580314513508e-05,
+ "loss": 0.4753,
+ "step": 715
+ },
+ {
+ "epoch": 6.486486486486487,
+ "grad_norm": 0.75,
+ "learning_rate": 6.624162751702076e-05,
+ "loss": 0.4759,
+ "step": 720
+ },
+ {
+ "epoch": 6.531531531531532,
+ "grad_norm": 0.70703125,
+ "learning_rate": 6.476579794560356e-05,
+ "loss": 0.489,
+ "step": 725
+ },
+ {
+ "epoch": 6.576576576576577,
+ "grad_norm": 0.7265625,
+ "learning_rate": 6.329867929910347e-05,
+ "loss": 0.473,
+ "step": 730
+ },
+ {
+ "epoch": 6.621621621621622,
+ "grad_norm": 0.7109375,
+ "learning_rate": 6.184063429214515e-05,
+ "loss": 0.4793,
+ "step": 735
+ },
+ {
+ "epoch": 6.666666666666667,
+ "grad_norm": 0.76171875,
+ "learning_rate": 6.039202339608432e-05,
+ "loss": 0.5071,
+ "step": 740
+ },
+ {
+ "epoch": 6.711711711711712,
+ "grad_norm": 0.69921875,
+ "learning_rate": 5.895320474988864e-05,
+ "loss": 0.4741,
+ "step": 745
+ },
+ {
+ "epoch": 6.756756756756757,
+ "grad_norm": 0.69921875,
+ "learning_rate": 5.752453407159522e-05,
+ "loss": 0.4799,
+ "step": 750
+ },
+ {
+ "epoch": 6.801801801801802,
+ "grad_norm": 0.7578125,
+ "learning_rate": 5.610636457036693e-05,
+ "loss": 0.4901,
+ "step": 755
+ },
+ {
+ "epoch": 6.846846846846847,
+ "grad_norm": 0.6953125,
+ "learning_rate": 5.469904685916861e-05,
+ "loss": 0.4858,
+ "step": 760
+ },
+ {
+ "epoch": 6.891891891891892,
+ "grad_norm": 0.76953125,
+ "learning_rate": 5.33029288680852e-05,
+ "loss": 0.4895,
+ "step": 765
+ },
+ {
+ "epoch": 6.936936936936937,
+ "grad_norm": 0.70703125,
+ "learning_rate": 5.191835575830352e-05,
+ "loss": 0.4935,
+ "step": 770
+ },
+ {
+ "epoch": 6.981981981981982,
+ "grad_norm": 0.69921875,
+ "learning_rate": 5.0545669836778144e-05,
+ "loss": 0.4958,
+ "step": 775
+ },
+ {
+ "epoch": 7.0,
+ "eval_loss": 2.7035882472991943,
+ "eval_runtime": 1.0058,
+ "eval_samples_per_second": 4.971,
+ "eval_steps_per_second": 1.988,
+ "step": 777
+ },
+ {
+ "epoch": 7.027027027027027,
+ "grad_norm": 0.6875,
+ "learning_rate": 4.918521047160308e-05,
+ "loss": 0.4443,
+ "step": 780
+ },
+ {
+ "epoch": 7.072072072072072,
+ "grad_norm": 0.7734375,
+ "learning_rate": 4.783731400811022e-05,
+ "loss": 0.4139,
+ "step": 785
+ },
+ {
+ "epoch": 7.117117117117117,
+ "grad_norm": 0.734375,
+ "learning_rate": 4.650231368571486e-05,
+ "loss": 0.41,
+ "step": 790
+ },
+ {
+ "epoch": 7.162162162162162,
+ "grad_norm": 0.90625,
+ "learning_rate": 4.518053955552903e-05,
+ "loss": 0.4291,
+ "step": 795
+ },
+ {
+ "epoch": 7.207207207207207,
+ "grad_norm": 0.71875,
+ "learning_rate": 4.387231839876349e-05,
+ "loss": 0.4141,
+ "step": 800
+ },
+ {
+ "epoch": 7.252252252252252,
+ "grad_norm": 0.7265625,
+ "learning_rate": 4.2577973645937674e-05,
+ "loss": 0.4139,
+ "step": 805
+ },
+ {
+ "epoch": 7.297297297297297,
+ "grad_norm": 0.76171875,
+ "learning_rate": 4.129782529691815e-05,
+ "loss": 0.4278,
+ "step": 810
+ },
+ {
+ "epoch": 7.342342342342342,
+ "grad_norm": 0.73046875,
+ "learning_rate": 4.003218984180552e-05,
+ "loss": 0.4148,
+ "step": 815
+ },
+ {
+ "epoch": 7.387387387387387,
+ "grad_norm": 0.79296875,
+ "learning_rate": 3.878138018268866e-05,
+ "loss": 0.4168,
+ "step": 820
+ },
+ {
+ "epoch": 7.4324324324324325,
+ "grad_norm": 0.82421875,
+ "learning_rate": 3.7545705556286126e-05,
+ "loss": 0.4182,
+ "step": 825
+ },
+ {
+ "epoch": 7.4774774774774775,
+ "grad_norm": 0.70703125,
+ "learning_rate": 3.632547145749395e-05,
+ "loss": 0.4239,
+ "step": 830
+ },
+ {
+ "epoch": 7.5225225225225225,
+ "grad_norm": 0.78515625,
+ "learning_rate": 3.5120979563858266e-05,
+ "loss": 0.4137,
+ "step": 835
+ },
+ {
+ "epoch": 7.5675675675675675,
+ "grad_norm": 0.73828125,
+ "learning_rate": 3.393252766099187e-05,
+ "loss": 0.4111,
+ "step": 840
+ },
+ {
+ "epoch": 7.612612612612613,
+ "grad_norm": 0.7421875,
+ "learning_rate": 3.2760409568952766e-05,
+ "loss": 0.4179,
+ "step": 845
+ },
+ {
+ "epoch": 7.657657657657658,
+ "grad_norm": 0.76171875,
+ "learning_rate": 3.1604915069603436e-05,
+ "loss": 0.429,
+ "step": 850
+ },
+ {
+ "epoch": 7.702702702702703,
+ "grad_norm": 0.75,
+ "learning_rate": 3.0466329834968233e-05,
+ "loss": 0.4118,
+ "step": 855
+ },
+ {
+ "epoch": 7.747747747747748,
+ "grad_norm": 0.71484375,
+ "learning_rate": 2.9344935356606773e-05,
+ "loss": 0.4049,
+ "step": 860
+ },
+ {
+ "epoch": 7.792792792792793,
+ "grad_norm": 0.74609375,
+ "learning_rate": 2.8241008876021215e-05,
+ "loss": 0.413,
+ "step": 865
+ },
+ {
+ "epoch": 7.837837837837838,
+ "grad_norm": 0.72265625,
+ "learning_rate": 2.7154823316113932e-05,
+ "loss": 0.4071,
+ "step": 870
+ },
+ {
+ "epoch": 7.882882882882883,
+ "grad_norm": 0.734375,
+ "learning_rate": 2.60866472137129e-05,
+ "loss": 0.4073,
+ "step": 875
+ },
+ {
+ "epoch": 7.927927927927928,
+ "grad_norm": 0.71875,
+ "learning_rate": 2.5036744653181753e-05,
+ "loss": 0.4124,
+ "step": 880
+ },
+ {
+ "epoch": 7.972972972972973,
+ "grad_norm": 0.7578125,
+ "learning_rate": 2.4005375201130274e-05,
+ "loss": 0.4205,
+ "step": 885
+ },
+ {
+ "epoch": 8.0,
+ "eval_loss": 2.9689488410949707,
+ "eval_runtime": 1.0053,
+ "eval_samples_per_second": 4.973,
+ "eval_steps_per_second": 1.989,
+ "step": 888
+ },
+ {
+ "epoch": 8.018018018018019,
+ "grad_norm": 0.68359375,
+ "learning_rate": 2.29927938422419e-05,
+ "loss": 0.4012,
+ "step": 890
+ },
+ {
+ "epoch": 8.063063063063064,
+ "grad_norm": 1.0078125,
+ "learning_rate": 2.199925091623418e-05,
+ "loss": 0.3781,
+ "step": 895
+ },
+ {
+ "epoch": 8.108108108108109,
+ "grad_norm": 0.8671875,
+ "learning_rate": 2.102499205596743e-05,
+ "loss": 0.3809,
+ "step": 900
+ },
+ {
+ "epoch": 8.153153153153154,
+ "grad_norm": 0.70703125,
+ "learning_rate": 2.0070258126717e-05,
+ "loss": 0.3699,
+ "step": 905
+ },
+ {
+ "epoch": 8.198198198198199,
+ "grad_norm": 0.6875,
+ "learning_rate": 1.913528516662452e-05,
+ "loss": 0.3742,
+ "step": 910
+ },
+ {
+ "epoch": 8.243243243243244,
+ "grad_norm": 0.70703125,
+ "learning_rate": 1.8220304328342252e-05,
+ "loss": 0.378,
+ "step": 915
+ },
+ {
+ "epoch": 8.288288288288289,
+ "grad_norm": 0.70703125,
+ "learning_rate": 1.7325541821885384e-05,
+ "loss": 0.3842,
+ "step": 920
+ },
+ {
+ "epoch": 8.333333333333334,
+ "grad_norm": 0.75390625,
+ "learning_rate": 1.6451218858706374e-05,
+ "loss": 0.3894,
+ "step": 925
+ },
+ {
+ "epoch": 8.378378378378379,
+ "grad_norm": 0.71875,
+ "learning_rate": 1.5597551597004966e-05,
+ "loss": 0.3758,
+ "step": 930
+ },
+ {
+ "epoch": 8.423423423423424,
+ "grad_norm": 0.671875,
+ "learning_rate": 1.476475108828762e-05,
+ "loss": 0.3717,
+ "step": 935
+ },
+ {
+ "epoch": 8.468468468468469,
+ "grad_norm": 0.703125,
+ "learning_rate": 1.3953023225189243e-05,
+ "loss": 0.3771,
+ "step": 940
+ },
+ {
+ "epoch": 8.513513513513514,
+ "grad_norm": 0.71875,
+ "learning_rate": 1.3162568690570743e-05,
+ "loss": 0.3759,
+ "step": 945
+ },
+ {
+ "epoch": 8.558558558558559,
+ "grad_norm": 0.74609375,
+ "learning_rate": 1.23935829079042e-05,
+ "loss": 0.3786,
+ "step": 950
+ },
+ {
+ "epoch": 8.603603603603604,
+ "grad_norm": 0.7109375,
+ "learning_rate": 1.1646255992958466e-05,
+ "loss": 0.3734,
+ "step": 955
+ },
+ {
+ "epoch": 8.64864864864865,
+ "grad_norm": 0.7265625,
+ "learning_rate": 1.0920772706797167e-05,
+ "loss": 0.3809,
+ "step": 960
+ },
+ {
+ "epoch": 8.693693693693694,
+ "grad_norm": 0.7109375,
+ "learning_rate": 1.0217312410100089e-05,
+ "loss": 0.3767,
+ "step": 965
+ },
+ {
+ "epoch": 8.73873873873874,
+ "grad_norm": 0.68359375,
+ "learning_rate": 9.536049018820192e-06,
+ "loss": 0.3786,
+ "step": 970
+ },
+ {
+ "epoch": 8.783783783783784,
+ "grad_norm": 0.71875,
+ "learning_rate": 8.87715096118642e-06,
+ "loss": 0.3786,
+ "step": 975
+ },
+ {
+ "epoch": 8.82882882882883,
+ "grad_norm": 0.74609375,
+ "learning_rate": 8.240781136063346e-06,
+ "loss": 0.3868,
+ "step": 980
+ },
+ {
+ "epoch": 8.873873873873874,
+ "grad_norm": 0.72265625,
+ "learning_rate": 7.6270968726777414e-06,
+ "loss": 0.3767,
+ "step": 985
+ },
+ {
+ "epoch": 8.91891891891892,
+ "grad_norm": 0.7578125,
+ "learning_rate": 7.03624989172228e-06,
+ "loss": 0.3791,
+ "step": 990
+ },
+ {
+ "epoch": 8.963963963963964,
+ "grad_norm": 0.71875,
+ "learning_rate": 6.468386267845717e-06,
+ "loss": 0.382,
+ "step": 995
+ },
+ {
+ "epoch": 9.0,
+ "eval_loss": 3.2251663208007812,
+ "eval_runtime": 1.0069,
+ "eval_samples_per_second": 4.966,
+ "eval_steps_per_second": 1.986,
+ "step": 999
+ },
+ {
+ "epoch": 9.00900900900901,
+ "grad_norm": 0.71484375,
+ "learning_rate": 5.9236463935389065e-06,
+ "loss": 0.3794,
+ "step": 1000
+ },
+ {
+ "epoch": 9.054054054054054,
+ "grad_norm": 0.671875,
+ "learning_rate": 5.402164944425758e-06,
+ "loss": 0.3777,
+ "step": 1005
+ },
+ {
+ "epoch": 9.0990990990991,
+ "grad_norm": 0.734375,
+ "learning_rate": 4.904070845967468e-06,
+ "loss": 0.3779,
+ "step": 1010
+ },
+ {
+ "epoch": 9.144144144144144,
+ "grad_norm": 0.703125,
+ "learning_rate": 4.429487241588304e-06,
+ "loss": 0.3744,
+ "step": 1015
+ },
+ {
+ "epoch": 9.18918918918919,
+ "grad_norm": 0.6953125,
+ "learning_rate": 3.9785314622310495e-06,
+ "loss": 0.3694,
+ "step": 1020
+ },
+ {
+ "epoch": 9.234234234234235,
+ "grad_norm": 0.71875,
+ "learning_rate": 3.5513149973492976e-06,
+ "loss": 0.3751,
+ "step": 1025
+ },
+ {
+ "epoch": 9.27927927927928,
+ "grad_norm": 0.7265625,
+ "learning_rate": 3.1479434673440167e-06,
+ "loss": 0.3685,
+ "step": 1030
+ },
+ {
+ "epoch": 9.324324324324325,
+ "grad_norm": 0.68359375,
+ "learning_rate": 2.7685165974510986e-06,
+ "loss": 0.3653,
+ "step": 1035
+ },
+ {
+ "epoch": 9.36936936936937,
+ "grad_norm": 0.71484375,
+ "learning_rate": 2.4131281930864002e-06,
+ "loss": 0.3728,
+ "step": 1040
+ },
+ {
+ "epoch": 9.414414414414415,
+ "grad_norm": 0.734375,
+ "learning_rate": 2.0818661166542074e-06,
+ "loss": 0.3693,
+ "step": 1045
+ },
+ {
+ "epoch": 9.45945945945946,
+ "grad_norm": 0.6875,
+ "learning_rate": 1.7748122658251876e-06,
+ "loss": 0.3764,
+ "step": 1050
+ },
+ {
+ "epoch": 9.504504504504505,
+ "grad_norm": 0.7265625,
+ "learning_rate": 1.4920425532888526e-06,
+ "loss": 0.3654,
+ "step": 1055
+ },
+ {
+ "epoch": 9.54954954954955,
+ "grad_norm": 0.66796875,
+ "learning_rate": 1.2336268879856727e-06,
+ "loss": 0.3747,
+ "step": 1060
+ },
+ {
+ "epoch": 9.594594594594595,
+ "grad_norm": 0.69140625,
+ "learning_rate": 9.996291578236228e-07,
+ "loss": 0.3711,
+ "step": 1065
+ },
+ {
+ "epoch": 9.63963963963964,
+ "grad_norm": 0.71484375,
+ "learning_rate": 7.901072138831511e-07,
+ "loss": 0.3722,
+ "step": 1070
+ },
+ {
+ "epoch": 9.684684684684685,
+ "grad_norm": 0.7109375,
+ "learning_rate": 6.051128561147756e-07,
+ "loss": 0.3612,
+ "step": 1075
+ },
+ {
+ "epoch": 9.72972972972973,
+ "grad_norm": 0.74609375,
+ "learning_rate": 4.44691820532539e-07,
+ "loss": 0.3647,
+ "step": 1080
+ },
+ {
+ "epoch": 9.774774774774775,
+ "grad_norm": 0.6875,
+ "learning_rate": 3.0888376790679795e-07,
+ "loss": 0.3672,
+ "step": 1085
+ },
+ {
+ "epoch": 9.81981981981982,
+ "grad_norm": 0.6484375,
+ "learning_rate": 1.977222739588891e-07,
+ "loss": 0.3659,
+ "step": 1090
+ },
+ {
+ "epoch": 9.864864864864865,
+ "grad_norm": 0.67578125,
+ "learning_rate": 1.1123482106021322e-07,
+ "loss": 0.3692,
+ "step": 1095
+ },
+ {
+ "epoch": 9.90990990990991,
+ "grad_norm": 0.6875,
+ "learning_rate": 4.9442791437848136e-08,
+ "loss": 0.3663,
+ "step": 1100
+ },
+ {
+ "epoch": 9.954954954954955,
+ "grad_norm": 0.6875,
+ "learning_rate": 1.2361461888166226e-08,
+ "loss": 0.3673,
+ "step": 1105
+ },
+ {
+ "epoch": 10.0,
+ "grad_norm": 0.6640625,
+ "learning_rate": 0.0,
+ "loss": 0.372,
+ "step": 1110
+ },
+ {
+ "epoch": 10.0,
+ "eval_loss": 3.242992401123047,
+ "eval_runtime": 1.0031,
+ "eval_samples_per_second": 4.984,
+ "eval_steps_per_second": 1.994,
+ "step": 1110
+ },
+ {
+ "epoch": 10.0,
+ "step": 1110,
+ "total_flos": 1.697049221804327e+18,
+ "train_loss": 1.8630313719715084,
+ "train_runtime": 9058.6901,
+ "train_samples_per_second": 1.957,
+ "train_steps_per_second": 0.123
  }
  ],
  "logging_steps": 5,
+ "max_steps": 1110,
  "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
  "save_steps": 100,
  "stateful_callbacks": {
  "TrainerControl": {

  "attributes": {}
  }
  },
+ "total_flos": 1.697049221804327e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null