Qin Liu committed
Commit acc2564
1 Parent(s): 6dca8e1

Model save
README.md CHANGED
@@ -1,11 +1,8 @@
 ---
 base_model: meta-llama/Meta-Llama-3-8B
- datasets:
- - HuggingFaceH4/ultrachat_200k
 library_name: peft
 license: llama3
 tags:
- - alignment-handbook
 - trl
 - sft
 - generated_from_trainer
@@ -19,9 +16,9 @@ should probably proofread and complete it, then remove this comment. -->
 
 # llama3-sudo-sanity
 
- This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on the HuggingFaceH4/ultrachat_200k dataset.
+ This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on an unknown dataset.
 It achieves the following results on the evaluation set:
- - Loss: 1.7491
+ - Loss: 1.1030
 
 ## Model description
 
@@ -52,15 +49,22 @@ The following hyperparameters were used during training:
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_ratio: 0.1
- - num_epochs: 3
+ - num_epochs: 10
 
 ### Training results
 
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:------:|:----:|:---------------:|
- | 1.8955 | 0.9899 | 49 | 1.8463 |
- | 1.8698 | 2.0 | 99 | 1.7678 |
- | 1.8282 | 2.9697 | 147 | 1.7491 |
+ | 1.8735 | 0.9899 | 49 | 1.8325 |
+ | 1.8231 | 2.0 | 99 | 1.7239 |
+ | 1.7516 | 2.9899 | 148 | 1.6330 |
+ | 1.6586 | 4.0 | 198 | 1.5280 |
+ | 1.5571 | 4.9899 | 247 | 1.4166 |
+ | 1.4677 | 6.0 | 297 | 1.3068 |
+ | 1.3422 | 6.9899 | 346 | 1.2082 |
+ | 1.2609 | 8.0 | 396 | 1.1378 |
+ | 1.1647 | 8.9899 | 445 | 1.1074 |
+ | 1.1571 | 9.8990 | 490 | 1.1030 |


 ### Framework versions
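
The card's hyperparameters (Adam with betas (0.9, 0.999) and epsilon 1e-8, cosine schedule, 0.1 warmup ratio, batch size 8, now 10 epochs, peak learning rate around 2e-4 judging from the trainer_state log) correspond to a fairly standard TRL SFT + PEFT run. The sketch below is only an illustration of how such a run could be configured: the actual training script, dataset, and adapter settings are not part of this commit, so the dataset path, LoRA rank/alpha, and target modules here are assumptions, and exact SFTTrainer keyword arguments vary between TRL versions.

```python
# Illustrative sketch only; not the script that produced this commit.
from datasets import load_dataset
from peft import LoraConfig
from transformers import TrainingArguments
from trl import SFTTrainer

# Placeholder dataset: the card no longer names one.
dataset = load_dataset("json", data_files="train.json", split="train")

peft_config = LoraConfig(                  # hypothetical adapter settings
    r=16, lora_alpha=32, lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    task_type="CAUSAL_LM",
)

args = TrainingArguments(                  # values taken from the card / trainer_state
    output_dir="llama3-sudo-sanity",
    per_device_train_batch_size=8,
    num_train_epochs=10,
    learning_rate=2e-4,                    # peak LR inferred from the logged schedule
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-8,
    logging_steps=5,
    save_steps=25,
    bf16=True,                             # assumed precision
)

trainer = SFTTrainer(
    model="meta-llama/Meta-Llama-3-8B",
    args=args,
    train_dataset=dataset,
    peft_config=peft_config,
)
trainer.train()
```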
all_results.json CHANGED
@@ -1,9 +1,9 @@
 {
- "epoch": 2.9696969696969697,
- "total_flos": 706816481427456.0,
- "train_loss": 1.9096906704156578,
- "train_runtime": 1619.2688,
+ "epoch": 9.8989898989899,
+ "total_flos": 2344635780825088.0,
+ "train_loss": 1.5290005391957808,
+ "train_runtime": 5413.3076,
 "train_samples": 6321,
- "train_samples_per_second": 11.711,
+ "train_samples_per_second": 11.677,
 "train_steps_per_second": 0.091
 }
runs/Aug21_02-51-44_ip-172-31-10-237/events.out.tfevents.1724208715.ip-172-31-10-237.960579.0 CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:2e5ed61f02b2dde45022f0805f6821c207a3231eb7876c3663dc764a5e92cbc8
- size 28793
+ oid sha256:658ebebff6d9c7b4b26c19b8f53502f8bcef7731700fb8153bb4498d3dfd5fb4
+ size 29418
train_results.json CHANGED
@@ -1,9 +1,9 @@
 {
- "epoch": 2.9696969696969697,
- "total_flos": 706816481427456.0,
- "train_loss": 1.9096906704156578,
- "train_runtime": 1619.2688,
+ "epoch": 9.8989898989899,
+ "total_flos": 2344635780825088.0,
+ "train_loss": 1.5290005391957808,
+ "train_runtime": 5413.3076,
 "train_samples": 6321,
- "train_samples_per_second": 11.711,
+ "train_samples_per_second": 11.677,
 "train_steps_per_second": 0.091
 }
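
As a quick consistency check (assuming the Hugging Face Trainer convention that train_samples_per_second is the total scheduled samples divided by wall-clock time), the new numbers line up:

```python
# samples/sec ≈ train_samples * num_train_epochs / train_runtime
print(6321 * 10 / 5413.3076)   # ≈ 11.68, vs. the reported 11.677
```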
trainer_state.json CHANGED
@@ -1,261 +1,800 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 2.9696969696969697,
5
  "eval_steps": 500,
6
- "global_step": 147,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.020202020202020204,
13
- "grad_norm": 1.08310938404347,
14
- "learning_rate": 1.3333333333333333e-05,
15
- "loss": 2.5976,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.10101010101010101,
20
- "grad_norm": 1.2015655639458453,
21
- "learning_rate": 6.666666666666667e-05,
22
- "loss": 2.5737,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.20202020202020202,
27
- "grad_norm": 0.5529484610069302,
28
- "learning_rate": 0.00013333333333333334,
29
- "loss": 2.4242,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.30303030303030304,
34
- "grad_norm": 0.44235368767500005,
35
- "learning_rate": 0.0002,
36
- "loss": 2.2287,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.40404040404040403,
41
- "grad_norm": 0.34186780209717693,
42
- "learning_rate": 0.00019929278846732884,
43
- "loss": 2.1199,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.5050505050505051,
48
- "grad_norm": 0.2690393065112005,
49
- "learning_rate": 0.00019718115683235417,
50
- "loss": 1.963,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.6060606060606061,
55
- "grad_norm": 0.2895526654213361,
56
- "learning_rate": 0.0001936949724999762,
57
- "loss": 1.9819,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.7070707070707071,
62
- "grad_norm": 0.21284939456784369,
63
- "learning_rate": 0.00018888354486549237,
64
- "loss": 1.9367,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.8080808080808081,
69
- "grad_norm": 0.220661039014496,
70
- "learning_rate": 0.00018281492787113708,
71
- "loss": 1.9123,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.9090909090909091,
76
- "grad_norm": 0.25800435226254403,
77
- "learning_rate": 0.00017557495743542585,
78
- "loss": 1.8955,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.98989898989899,
83
- "eval_loss": 1.8462597131729126,
84
- "eval_runtime": 164.2887,
85
- "eval_samples_per_second": 38.475,
86
- "eval_steps_per_second": 2.41,
87
  "step": 49
88
  },
89
  {
90
  "epoch": 1.0101010101010102,
91
- "grad_norm": 0.21280470872825646,
92
- "learning_rate": 0.00016726603737012529,
93
- "loss": 1.9023,
94
  "step": 50
95
  },
96
  {
97
  "epoch": 1.1111111111111112,
98
- "grad_norm": 0.2281964993767009,
99
- "learning_rate": 0.00015800569095711982,
100
- "loss": 1.8291,
101
  "step": 55
102
  },
103
  {
104
  "epoch": 1.2121212121212122,
105
- "grad_norm": 0.2468620521645798,
106
- "learning_rate": 0.0001479248986720057,
107
- "loss": 1.8448,
108
  "step": 60
109
  },
110
  {
111
  "epoch": 1.3131313131313131,
112
- "grad_norm": 0.24256005211247963,
113
- "learning_rate": 0.00013716624556603274,
114
- "loss": 1.8667,
115
  "step": 65
116
  },
117
  {
118
  "epoch": 1.4141414141414141,
119
- "grad_norm": 0.22252607581763362,
120
- "learning_rate": 0.00012588190451025207,
121
- "loss": 1.8213,
122
  "step": 70
123
  },
124
  {
125
  "epoch": 1.5151515151515151,
126
- "grad_norm": 0.2742840702900069,
127
- "learning_rate": 0.00011423148382732853,
128
- "loss": 1.8122,
129
  "step": 75
130
  },
131
  {
132
  "epoch": 1.6161616161616161,
133
- "grad_norm": 0.25494953140215326,
134
- "learning_rate": 0.00010237976975461075,
135
- "loss": 1.8726,
136
  "step": 80
137
  },
138
  {
139
  "epoch": 1.7171717171717171,
140
- "grad_norm": 0.2657201331656562,
141
- "learning_rate": 9.049439566958175e-05,
142
- "loss": 1.8425,
143
  "step": 85
144
  },
145
  {
146
  "epoch": 1.8181818181818183,
147
- "grad_norm": 0.2700733549650734,
148
- "learning_rate": 7.874347104470234e-05,
149
- "loss": 1.8205,
150
  "step": 90
151
  },
152
  {
153
  "epoch": 1.9191919191919191,
154
- "grad_norm": 0.28284460248100685,
155
- "learning_rate": 6.729320366825784e-05,
156
- "loss": 1.8698,
157
  "step": 95
158
  },
159
  {
160
  "epoch": 2.0,
161
- "eval_loss": 1.7677603960037231,
162
- "eval_runtime": 177.7383,
163
- "eval_samples_per_second": 35.564,
164
- "eval_steps_per_second": 2.228,
165
  "step": 99
166
  },
167
  {
168
  "epoch": 2.0202020202020203,
169
- "grad_norm": 0.28052694585968674,
170
- "learning_rate": 5.630554876306407e-05,
171
- "loss": 1.8412,
172
  "step": 100
173
  },
174
  {
175
  "epoch": 2.121212121212121,
176
- "grad_norm": 0.30253313940579424,
177
- "learning_rate": 4.593591825444028e-05,
178
- "loss": 1.7843,
179
  "step": 105
180
  },
181
  {
182
  "epoch": 2.2222222222222223,
183
- "grad_norm": 0.2919901233198437,
184
- "learning_rate": 3.6330982588091186e-05,
185
- "loss": 1.8085,
186
  "step": 110
187
  },
188
  {
189
  "epoch": 2.323232323232323,
190
- "grad_norm": 0.2985415087687047,
191
- "learning_rate": 2.7626596189492983e-05,
192
- "loss": 1.7548,
193
  "step": 115
194
  },
195
  {
196
  "epoch": 2.4242424242424243,
197
- "grad_norm": 0.31809894547826195,
198
- "learning_rate": 1.994587590756397e-05,
199
- "loss": 1.758,
200
  "step": 120
201
  },
202
  {
203
  "epoch": 2.525252525252525,
204
- "grad_norm": 0.31151542636398494,
205
- "learning_rate": 1.339745962155613e-05,
206
- "loss": 1.7844,
207
  "step": 125
208
  },
209
  {
210
  "epoch": 2.6262626262626263,
211
- "grad_norm": 0.3034143797714973,
212
- "learning_rate": 8.073969641833445e-06,
213
- "loss": 1.7975,
214
  "step": 130
215
  },
216
  {
217
  "epoch": 2.7272727272727275,
218
- "grad_norm": 0.28778283893396456,
219
- "learning_rate": 4.050702638550275e-06,
220
- "loss": 1.7661,
221
  "step": 135
222
  },
223
  {
224
  "epoch": 2.8282828282828283,
225
- "grad_norm": 0.2907166952704868,
226
- "learning_rate": 1.3845646281813507e-06,
227
- "loss": 1.7619,
228
  "step": 140
229
  },
230
  {
231
  "epoch": 2.929292929292929,
232
- "grad_norm": 0.31772634861484544,
233
- "learning_rate": 1.1326608169920372e-07,
234
- "loss": 1.8282,
235
  "step": 145
236
  },
237
  {
238
- "epoch": 2.9696969696969697,
239
- "eval_loss": 1.7491472959518433,
240
- "eval_runtime": 161.8737,
241
- "eval_samples_per_second": 39.049,
242
- "eval_steps_per_second": 2.446,
243
- "step": 147
244
  },
245
  {
246
- "epoch": 2.9696969696969697,
247
- "step": 147,
248
- "total_flos": 706816481427456.0,
249
- "train_loss": 1.9096906704156578,
250
- "train_runtime": 1619.2688,
251
- "train_samples_per_second": 11.711,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
  "train_steps_per_second": 0.091
253
  }
254
  ],
255
  "logging_steps": 5,
256
- "max_steps": 147,
257
  "num_input_tokens_seen": 0,
258
- "num_train_epochs": 3,
259
  "save_steps": 25,
260
  "stateful_callbacks": {
261
  "TrainerControl": {
@@ -269,7 +808,7 @@
269
  "attributes": {}
270
  }
271
  },
272
- "total_flos": 706816481427456.0,
273
  "train_batch_size": 8,
274
  "trial_name": null,
275
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 9.8989898989899,
5
  "eval_steps": 500,
6
+ "global_step": 490,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.020202020202020204,
13
+ "grad_norm": 1.1391378054420909,
14
+ "learning_rate": 4.081632653061224e-06,
15
+ "loss": 2.5995,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.10101010101010101,
20
+ "grad_norm": 1.006731604503432,
21
+ "learning_rate": 2.0408163265306123e-05,
22
+ "loss": 2.5925,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.20202020202020202,
27
+ "grad_norm": 1.3965084950072466,
28
+ "learning_rate": 4.0816326530612245e-05,
29
+ "loss": 2.546,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.30303030303030304,
34
+ "grad_norm": 0.59095362852847,
35
+ "learning_rate": 6.122448979591838e-05,
36
+ "loss": 2.396,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.40404040404040403,
41
+ "grad_norm": 0.33359551466461584,
42
+ "learning_rate": 8.163265306122449e-05,
43
+ "loss": 2.2744,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.5050505050505051,
48
+ "grad_norm": 0.3767673243983956,
49
+ "learning_rate": 0.00010204081632653062,
50
+ "loss": 2.1608,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.6060606060606061,
55
+ "grad_norm": 0.3530777092096336,
56
+ "learning_rate": 0.00012244897959183676,
57
+ "loss": 2.0261,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.7070707070707071,
62
+ "grad_norm": 0.36168305575388426,
63
+ "learning_rate": 0.00014285714285714287,
64
+ "loss": 2.0091,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.8080808080808081,
69
+ "grad_norm": 0.2764545304686734,
70
+ "learning_rate": 0.00016326530612244898,
71
+ "loss": 1.9434,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.9090909090909091,
76
+ "grad_norm": 0.2653033039849191,
77
+ "learning_rate": 0.00018367346938775512,
78
+ "loss": 1.8735,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.98989898989899,
83
+ "eval_loss": 1.8325327634811401,
84
+ "eval_runtime": 177.2293,
85
+ "eval_samples_per_second": 35.666,
86
+ "eval_steps_per_second": 2.234,
87
  "step": 49
88
  },
89
  {
90
  "epoch": 1.0101010101010102,
91
+ "grad_norm": 0.2170406034599688,
92
+ "learning_rate": 0.00019999746258949147,
93
+ "loss": 1.8679,
94
  "step": 50
95
  },
96
  {
97
  "epoch": 1.1111111111111112,
98
+ "grad_norm": 0.2414932638214735,
99
+ "learning_rate": 0.00019990866674170983,
100
+ "loss": 1.8705,
101
  "step": 55
102
  },
103
  {
104
  "epoch": 1.2121212121212122,
105
+ "grad_norm": 0.280526285390802,
106
+ "learning_rate": 0.00019969312910817183,
107
+ "loss": 1.8428,
108
  "step": 60
109
  },
110
  {
111
  "epoch": 1.3131313131313131,
112
+ "grad_norm": 0.5716516166032067,
113
+ "learning_rate": 0.000199351123114852,
114
+ "loss": 1.8267,
115
  "step": 65
116
  },
117
  {
118
  "epoch": 1.4141414141414141,
119
+ "grad_norm": 0.22548657696350605,
120
+ "learning_rate": 0.00019888308262251285,
121
+ "loss": 1.7959,
122
  "step": 70
123
  },
124
  {
125
  "epoch": 1.5151515151515151,
126
+ "grad_norm": 0.2400943754848952,
127
+ "learning_rate": 0.00019828960137631928,
128
+ "loss": 1.8328,
129
  "step": 75
130
  },
131
  {
132
  "epoch": 1.6161616161616161,
133
+ "grad_norm": 0.24338845667140263,
134
+ "learning_rate": 0.00019757143225262728,
135
+ "loss": 1.8287,
136
  "step": 80
137
  },
138
  {
139
  "epoch": 1.7171717171717171,
140
+ "grad_norm": 0.23391106368008455,
141
+ "learning_rate": 0.00019672948630390294,
142
+ "loss": 1.8345,
143
  "step": 85
144
  },
145
  {
146
  "epoch": 1.8181818181818183,
147
+ "grad_norm": 0.27223367547235244,
148
+ "learning_rate": 0.00019576483160298246,
149
+ "loss": 1.7731,
150
  "step": 90
151
  },
152
  {
153
  "epoch": 1.9191919191919191,
154
+ "grad_norm": 0.25522514087403636,
155
+ "learning_rate": 0.00019467869188814023,
156
+ "loss": 1.8231,
157
  "step": 95
158
  },
159
  {
160
  "epoch": 2.0,
161
+ "eval_loss": 1.7238675355911255,
162
+ "eval_runtime": 175.3209,
163
+ "eval_samples_per_second": 36.054,
164
+ "eval_steps_per_second": 2.259,
165
  "step": 99
166
  },
167
  {
168
  "epoch": 2.0202020202020203,
169
+ "grad_norm": 0.24156163537353126,
170
+ "learning_rate": 0.00019347244501068312,
171
+ "loss": 1.8199,
172
  "step": 100
173
  },
174
  {
175
  "epoch": 2.121212121212121,
176
+ "grad_norm": 0.26738911600323706,
177
+ "learning_rate": 0.00019214762118704076,
178
+ "loss": 1.7554,
179
  "step": 105
180
  },
181
  {
182
  "epoch": 2.2222222222222223,
183
+ "grad_norm": 0.2974142479913874,
184
+ "learning_rate": 0.000190705901057569,
185
+ "loss": 1.7693,
186
  "step": 110
187
  },
188
  {
189
  "epoch": 2.323232323232323,
190
+ "grad_norm": 0.32489025521491743,
191
+ "learning_rate": 0.00018914911355452895,
192
+ "loss": 1.7036,
193
  "step": 115
194
  },
195
  {
196
  "epoch": 2.4242424242424243,
197
+ "grad_norm": 0.34564309759210704,
198
+ "learning_rate": 0.00018747923358194662,
199
+ "loss": 1.7449,
200
  "step": 120
201
  },
202
  {
203
  "epoch": 2.525252525252525,
204
+ "grad_norm": 0.33251849318291843,
205
+ "learning_rate": 0.00018569837951029595,
206
+ "loss": 1.7556,
207
  "step": 125
208
  },
209
  {
210
  "epoch": 2.6262626262626263,
211
+ "grad_norm": 0.33565250510381917,
212
+ "learning_rate": 0.00018380881048918405,
213
+ "loss": 1.744,
214
  "step": 130
215
  },
216
  {
217
  "epoch": 2.7272727272727275,
218
+ "grad_norm": 0.3427001935365706,
219
+ "learning_rate": 0.00018181292358144703,
220
+ "loss": 1.7234,
221
  "step": 135
222
  },
223
  {
224
  "epoch": 2.8282828282828283,
225
+ "grad_norm": 0.32846296076937864,
226
+ "learning_rate": 0.00017971325072229226,
227
+ "loss": 1.7274,
228
  "step": 140
229
  },
230
  {
231
  "epoch": 2.929292929292929,
232
+ "grad_norm": 0.3446715050022125,
233
+ "learning_rate": 0.0001775124555073452,
234
+ "loss": 1.7516,
235
  "step": 145
236
  },
237
  {
238
+ "epoch": 2.98989898989899,
239
+ "eval_loss": 1.6329888105392456,
240
+ "eval_runtime": 174.9738,
241
+ "eval_samples_per_second": 36.125,
242
+ "eval_steps_per_second": 2.263,
243
+ "step": 148
244
  },
245
  {
246
+ "epoch": 3.0303030303030303,
247
+ "grad_norm": 0.34753378836184123,
248
+ "learning_rate": 0.0001752133298136744,
249
+ "loss": 1.7442,
250
+ "step": 150
251
+ },
252
+ {
253
+ "epoch": 3.1313131313131315,
254
+ "grad_norm": 0.3899145091665638,
255
+ "learning_rate": 0.0001728187902580819,
256
+ "loss": 1.6414,
257
+ "step": 155
258
+ },
259
+ {
260
+ "epoch": 3.2323232323232323,
261
+ "grad_norm": 0.3969944429695798,
262
+ "learning_rate": 0.00017033187449715196,
263
+ "loss": 1.6411,
264
+ "step": 160
265
+ },
266
+ {
267
+ "epoch": 3.3333333333333335,
268
+ "grad_norm": 0.4463802224093316,
269
+ "learning_rate": 0.00016775573737375096,
270
+ "loss": 1.6955,
271
+ "step": 165
272
+ },
273
+ {
274
+ "epoch": 3.4343434343434343,
275
+ "grad_norm": 0.4873799041554826,
276
+ "learning_rate": 0.0001650936469148681,
277
+ "loss": 1.6812,
278
+ "step": 170
279
+ },
280
+ {
281
+ "epoch": 3.5353535353535355,
282
+ "grad_norm": 0.5138644486787001,
283
+ "learning_rate": 0.00016234898018587337,
284
+ "loss": 1.6455,
285
+ "step": 175
286
+ },
287
+ {
288
+ "epoch": 3.6363636363636362,
289
+ "grad_norm": 0.4441989179255284,
290
+ "learning_rate": 0.00015952521900645144,
291
+ "loss": 1.6537,
292
+ "step": 180
293
+ },
294
+ {
295
+ "epoch": 3.7373737373737375,
296
+ "grad_norm": 0.45397642246696135,
297
+ "learning_rate": 0.0001566259455336474,
298
+ "loss": 1.6384,
299
+ "step": 185
300
+ },
301
+ {
302
+ "epoch": 3.8383838383838382,
303
+ "grad_norm": 0.48522658034874977,
304
+ "learning_rate": 0.0001536548377176263,
305
+ "loss": 1.6292,
306
+ "step": 190
307
+ },
308
+ {
309
+ "epoch": 3.9393939393939394,
310
+ "grad_norm": 0.43244857762556244,
311
+ "learning_rate": 0.0001506156646359123,
312
+ "loss": 1.6586,
313
+ "step": 195
314
+ },
315
+ {
316
+ "epoch": 4.0,
317
+ "eval_loss": 1.5279655456542969,
318
+ "eval_runtime": 175.1053,
319
+ "eval_samples_per_second": 36.098,
320
+ "eval_steps_per_second": 2.261,
321
+ "step": 198
322
+ },
323
+ {
324
+ "epoch": 4.040404040404041,
325
+ "grad_norm": 0.48231588190167396,
326
+ "learning_rate": 0.0001475122817120253,
327
+ "loss": 1.6137,
328
+ "step": 200
329
+ },
330
+ {
331
+ "epoch": 4.141414141414141,
332
+ "grad_norm": 0.5842989060014684,
333
+ "learning_rate": 0.00014434862582458135,
334
+ "loss": 1.5082,
335
+ "step": 205
336
+ },
337
+ {
338
+ "epoch": 4.242424242424242,
339
+ "grad_norm": 0.5870996767325578,
340
+ "learning_rate": 0.00014112871031306119,
341
+ "loss": 1.5382,
342
+ "step": 210
343
+ },
344
+ {
345
+ "epoch": 4.343434343434343,
346
+ "grad_norm": 0.6294490520638103,
347
+ "learning_rate": 0.0001378566198865818,
348
+ "loss": 1.5738,
349
+ "step": 215
350
+ },
351
+ {
352
+ "epoch": 4.444444444444445,
353
+ "grad_norm": 0.6361554344671604,
354
+ "learning_rate": 0.00013453650544213076,
355
+ "loss": 1.5609,
356
+ "step": 220
357
+ },
358
+ {
359
+ "epoch": 4.545454545454545,
360
+ "grad_norm": 0.5845910737228225,
361
+ "learning_rate": 0.00013117257879883583,
362
+ "loss": 1.5832,
363
+ "step": 225
364
+ },
365
+ {
366
+ "epoch": 4.646464646464646,
367
+ "grad_norm": 0.6362570401491278,
368
+ "learning_rate": 0.00012776910735495003,
369
+ "loss": 1.5386,
370
+ "step": 230
371
+ },
372
+ {
373
+ "epoch": 4.747474747474747,
374
+ "grad_norm": 0.6079381787055775,
375
+ "learning_rate": 0.0001243304086743309,
376
+ "loss": 1.5408,
377
+ "step": 235
378
+ },
379
+ {
380
+ "epoch": 4.848484848484849,
381
+ "grad_norm": 0.5955494164961348,
382
+ "learning_rate": 0.0001208608450092801,
383
+ "loss": 1.5767,
384
+ "step": 240
385
+ },
386
+ {
387
+ "epoch": 4.94949494949495,
388
+ "grad_norm": 0.5941973746172844,
389
+ "learning_rate": 0.00011736481776669306,
390
+ "loss": 1.5571,
391
+ "step": 245
392
+ },
393
+ {
394
+ "epoch": 4.98989898989899,
395
+ "eval_loss": 1.4166467189788818,
396
+ "eval_runtime": 174.6193,
397
+ "eval_samples_per_second": 36.199,
398
+ "eval_steps_per_second": 2.268,
399
+ "step": 247
400
+ },
401
+ {
402
+ "epoch": 5.05050505050505,
403
+ "grad_norm": 0.6955112160268645,
404
+ "learning_rate": 0.0001138467619245374,
405
+ "loss": 1.5011,
406
+ "step": 250
407
+ },
408
+ {
409
+ "epoch": 5.151515151515151,
410
+ "grad_norm": 0.7116916227562953,
411
+ "learning_rate": 0.00011031114040574437,
412
+ "loss": 1.4537,
413
+ "step": 255
414
+ },
415
+ {
416
+ "epoch": 5.252525252525253,
417
+ "grad_norm": 0.8295579161716972,
418
+ "learning_rate": 0.0001067624384166495,
419
+ "loss": 1.398,
420
+ "step": 260
421
+ },
422
+ {
423
+ "epoch": 5.353535353535354,
424
+ "grad_norm": 0.7415551092257379,
425
+ "learning_rate": 0.00010320515775716555,
426
+ "loss": 1.4474,
427
+ "step": 265
428
+ },
429
+ {
430
+ "epoch": 5.454545454545454,
431
+ "grad_norm": 0.7957507416152227,
432
+ "learning_rate": 9.96438111099047e-05,
433
+ "loss": 1.4459,
434
+ "step": 270
435
+ },
436
+ {
437
+ "epoch": 5.555555555555555,
438
+ "grad_norm": 0.8098108632452509,
439
+ "learning_rate": 9.608291631549574e-05,
440
+ "loss": 1.4266,
441
+ "step": 275
442
+ },
443
+ {
444
+ "epoch": 5.656565656565657,
445
+ "grad_norm": 0.8498743613190896,
446
+ "learning_rate": 9.252699064135758e-05,
447
+ "loss": 1.3931,
448
+ "step": 280
449
+ },
450
+ {
451
+ "epoch": 5.757575757575758,
452
+ "grad_norm": 0.7968761297367668,
453
+ "learning_rate": 8.898054505119989e-05,
454
+ "loss": 1.4628,
455
+ "step": 285
456
+ },
457
+ {
458
+ "epoch": 5.858585858585858,
459
+ "grad_norm": 0.8166566096084199,
460
+ "learning_rate": 8.54480784825207e-05,
461
+ "loss": 1.4777,
462
+ "step": 290
463
+ },
464
+ {
465
+ "epoch": 5.959595959595959,
466
+ "grad_norm": 0.7564583944169918,
467
+ "learning_rate": 8.193407213936012e-05,
468
+ "loss": 1.4677,
469
+ "step": 295
470
+ },
471
+ {
472
+ "epoch": 6.0,
473
+ "eval_loss": 1.3067700862884521,
474
+ "eval_runtime": 175.1357,
475
+ "eval_samples_per_second": 36.092,
476
+ "eval_steps_per_second": 2.261,
477
+ "step": 297
478
+ },
479
+ {
480
+ "epoch": 6.0606060606060606,
481
+ "grad_norm": 0.8369827825158762,
482
+ "learning_rate": 7.844298380755003e-05,
483
+ "loss": 1.375,
484
+ "step": 300
485
+ },
486
+ {
487
+ "epoch": 6.161616161616162,
488
+ "grad_norm": 0.9204188282340791,
489
+ "learning_rate": 7.497924219967209e-05,
490
+ "loss": 1.2999,
491
+ "step": 305
492
+ },
493
+ {
494
+ "epoch": 6.262626262626263,
495
+ "grad_norm": 0.9559880600184892,
496
+ "learning_rate": 7.154724133689677e-05,
497
+ "loss": 1.3084,
498
+ "step": 310
499
+ },
500
+ {
501
+ "epoch": 6.363636363636363,
502
+ "grad_norm": 0.9272296702059781,
503
+ "learning_rate": 6.815133497483157e-05,
504
+ "loss": 1.3405,
505
+ "step": 315
506
+ },
507
+ {
508
+ "epoch": 6.4646464646464645,
509
+ "grad_norm": 1.0203421193696094,
510
+ "learning_rate": 6.479583108044899e-05,
511
+ "loss": 1.3165,
512
+ "step": 320
513
+ },
514
+ {
515
+ "epoch": 6.565656565656566,
516
+ "grad_norm": 0.8932381508297077,
517
+ "learning_rate": 6.148498636710092e-05,
518
+ "loss": 1.3641,
519
+ "step": 325
520
+ },
521
+ {
522
+ "epoch": 6.666666666666667,
523
+ "grad_norm": 0.9527012454845684,
524
+ "learning_rate": 5.822300089455211e-05,
525
+ "loss": 1.3179,
526
+ "step": 330
527
+ },
528
+ {
529
+ "epoch": 6.767676767676767,
530
+ "grad_norm": 0.9644270167383292,
531
+ "learning_rate": 5.5014012740883115e-05,
532
+ "loss": 1.3295,
533
+ "step": 335
534
+ },
535
+ {
536
+ "epoch": 6.8686868686868685,
537
+ "grad_norm": 0.9489303492473159,
538
+ "learning_rate": 5.1862092753021754e-05,
539
+ "loss": 1.3482,
540
+ "step": 340
541
+ },
542
+ {
543
+ "epoch": 6.96969696969697,
544
+ "grad_norm": 0.9417366559193787,
545
+ "learning_rate": 4.8771239382562287e-05,
546
+ "loss": 1.3422,
547
+ "step": 345
548
+ },
549
+ {
550
+ "epoch": 6.98989898989899,
551
+ "eval_loss": 1.2082042694091797,
552
+ "eval_runtime": 174.4886,
553
+ "eval_samples_per_second": 36.226,
554
+ "eval_steps_per_second": 2.269,
555
+ "step": 346
556
+ },
557
+ {
558
+ "epoch": 7.070707070707071,
559
+ "grad_norm": 1.0189885797060951,
560
+ "learning_rate": 4.574537361342407e-05,
561
+ "loss": 1.2447,
562
+ "step": 350
563
+ },
564
+ {
565
+ "epoch": 7.171717171717171,
566
+ "grad_norm": 1.0456878645941505,
567
+ "learning_rate": 4.278833398778306e-05,
568
+ "loss": 1.2438,
569
+ "step": 355
570
+ },
571
+ {
572
+ "epoch": 7.2727272727272725,
573
+ "grad_norm": 1.0906515200546398,
574
+ "learning_rate": 3.990387173658774e-05,
575
+ "loss": 1.2135,
576
+ "step": 360
577
+ },
578
+ {
579
+ "epoch": 7.373737373737374,
580
+ "grad_norm": 1.1138045736907602,
581
+ "learning_rate": 3.7095646020835754e-05,
582
+ "loss": 1.2152,
583
+ "step": 365
584
+ },
585
+ {
586
+ "epoch": 7.474747474747475,
587
+ "grad_norm": 1.1333935442018617,
588
+ "learning_rate": 3.436721928964819e-05,
589
+ "loss": 1.2004,
590
+ "step": 370
591
+ },
592
+ {
593
+ "epoch": 7.575757575757576,
594
+ "grad_norm": 1.0135992096066218,
595
+ "learning_rate": 3.172205276103033e-05,
596
+ "loss": 1.1904,
597
+ "step": 375
598
+ },
599
+ {
600
+ "epoch": 7.6767676767676765,
601
+ "grad_norm": 1.0811091792166911,
602
+ "learning_rate": 2.916350203105207e-05,
603
+ "loss": 1.2475,
604
+ "step": 380
605
+ },
606
+ {
607
+ "epoch": 7.777777777777778,
608
+ "grad_norm": 1.1722915377984628,
609
+ "learning_rate": 2.669481281701739e-05,
610
+ "loss": 1.2273,
611
+ "step": 385
612
+ },
613
+ {
614
+ "epoch": 7.878787878787879,
615
+ "grad_norm": 1.0151820296117031,
616
+ "learning_rate": 2.4319116840023813e-05,
617
+ "loss": 1.2462,
618
+ "step": 390
619
+ },
620
+ {
621
+ "epoch": 7.97979797979798,
622
+ "grad_norm": 1.0359433658999384,
623
+ "learning_rate": 2.2039427852134788e-05,
624
+ "loss": 1.2609,
625
+ "step": 395
626
+ },
627
+ {
628
+ "epoch": 8.0,
629
+ "eval_loss": 1.137781023979187,
630
+ "eval_runtime": 163.6859,
631
+ "eval_samples_per_second": 38.617,
632
+ "eval_steps_per_second": 2.419,
633
+ "step": 396
634
+ },
635
+ {
636
+ "epoch": 8.080808080808081,
637
+ "grad_norm": 1.0914414519615843,
638
+ "learning_rate": 1.985863781320435e-05,
639
+ "loss": 1.1457,
640
+ "step": 400
641
+ },
642
+ {
643
+ "epoch": 8.181818181818182,
644
+ "grad_norm": 1.2849174669504693,
645
+ "learning_rate": 1.777951322220508e-05,
646
+ "loss": 1.1925,
647
+ "step": 405
648
+ },
649
+ {
650
+ "epoch": 8.282828282828282,
651
+ "grad_norm": 1.0562037397284274,
652
+ "learning_rate": 1.580469160771253e-05,
653
+ "loss": 1.1653,
654
+ "step": 410
655
+ },
656
+ {
657
+ "epoch": 8.383838383838384,
658
+ "grad_norm": 1.1942325172166053,
659
+ "learning_rate": 1.3936678181998374e-05,
660
+ "loss": 1.1451,
661
+ "step": 415
662
+ },
663
+ {
664
+ "epoch": 8.484848484848484,
665
+ "grad_norm": 1.2292184186394104,
666
+ "learning_rate": 1.2177842662977135e-05,
667
+ "loss": 1.1432,
668
+ "step": 420
669
+ },
670
+ {
671
+ "epoch": 8.585858585858587,
672
+ "grad_norm": 1.1449254109310076,
673
+ "learning_rate": 1.0530416268037702e-05,
674
+ "loss": 1.1459,
675
+ "step": 425
676
+ },
677
+ {
678
+ "epoch": 8.686868686868687,
679
+ "grad_norm": 1.1159137674762092,
680
+ "learning_rate": 8.99648888357335e-06,
681
+ "loss": 1.1889,
682
+ "step": 430
683
+ },
684
+ {
685
+ "epoch": 8.787878787878787,
686
+ "grad_norm": 1.1893818134430183,
687
+ "learning_rate": 7.578006413801075e-06,
688
+ "loss": 1.1809,
689
+ "step": 435
690
+ },
691
+ {
692
+ "epoch": 8.88888888888889,
693
+ "grad_norm": 1.131890459862098,
694
+ "learning_rate": 6.276768312233228e-06,
695
+ "loss": 1.1806,
696
+ "step": 440
697
+ },
698
+ {
699
+ "epoch": 8.98989898989899,
700
+ "grad_norm": 1.0926795618787801,
701
+ "learning_rate": 5.094425298933136e-06,
702
+ "loss": 1.1647,
703
+ "step": 445
704
+ },
705
+ {
706
+ "epoch": 8.98989898989899,
707
+ "eval_loss": 1.107386827468872,
708
+ "eval_runtime": 163.223,
709
+ "eval_samples_per_second": 38.726,
710
+ "eval_steps_per_second": 2.426,
711
+ "step": 445
712
+ },
713
+ {
714
+ "epoch": 9.090909090909092,
715
+ "grad_norm": 1.1051037332814697,
716
+ "learning_rate": 4.0324772664503296e-06,
717
+ "loss": 1.1438,
718
+ "step": 450
719
+ },
720
+ {
721
+ "epoch": 9.191919191919192,
722
+ "grad_norm": 1.164376038164282,
723
+ "learning_rate": 3.092271377092215e-06,
724
+ "loss": 1.1481,
725
+ "step": 455
726
+ },
727
+ {
728
+ "epoch": 9.292929292929292,
729
+ "grad_norm": 1.2303081966513765,
730
+ "learning_rate": 2.2750003539455998e-06,
731
+ "loss": 1.1202,
732
+ "step": 460
733
+ },
734
+ {
735
+ "epoch": 9.393939393939394,
736
+ "grad_norm": 1.2010643624794166,
737
+ "learning_rate": 1.5817009678162685e-06,
738
+ "loss": 1.142,
739
+ "step": 465
740
+ },
741
+ {
742
+ "epoch": 9.494949494949495,
743
+ "grad_norm": 1.2423558300638782,
744
+ "learning_rate": 1.013252722005842e-06,
745
+ "loss": 1.1842,
746
+ "step": 470
747
+ },
748
+ {
749
+ "epoch": 9.595959595959595,
750
+ "grad_norm": 1.1980002179676799,
751
+ "learning_rate": 5.703767365946466e-07,
752
+ "loss": 1.1236,
753
+ "step": 475
754
+ },
755
+ {
756
+ "epoch": 9.696969696969697,
757
+ "grad_norm": 1.2034300162155251,
758
+ "learning_rate": 2.536348336456551e-07,
759
+ "loss": 1.1168,
760
+ "step": 480
761
+ },
762
+ {
763
+ "epoch": 9.797979797979798,
764
+ "grad_norm": 1.2022297984086214,
765
+ "learning_rate": 6.342882449029696e-08,
766
+ "loss": 1.1133,
767
+ "step": 485
768
+ },
769
+ {
770
+ "epoch": 9.8989898989899,
771
+ "grad_norm": 1.0699752057421905,
772
+ "learning_rate": 0.0,
773
+ "loss": 1.1571,
774
+ "step": 490
775
+ },
776
+ {
777
+ "epoch": 9.8989898989899,
778
+ "eval_loss": 1.1029597520828247,
779
+ "eval_runtime": 163.4283,
780
+ "eval_samples_per_second": 38.678,
781
+ "eval_steps_per_second": 2.423,
782
+ "step": 490
783
+ },
784
+ {
785
+ "epoch": 9.8989898989899,
786
+ "step": 490,
787
+ "total_flos": 2344635780825088.0,
788
+ "train_loss": 1.5290005391957808,
789
+ "train_runtime": 5413.3076,
790
+ "train_samples_per_second": 11.677,
791
  "train_steps_per_second": 0.091
792
  }
793
  ],
794
  "logging_steps": 5,
795
+ "max_steps": 490,
796
  "num_input_tokens_seen": 0,
797
+ "num_train_epochs": 10,
798
  "save_steps": 25,
799
  "stateful_callbacks": {
800
  "TrainerControl": {
 
808
  "attributes": {}
809
  }
810
  },
811
+ "total_flos": 2344635780825088.0,
812
  "train_batch_size": 8,
813
  "trial_name": null,
814
  "trial_params": null