Upload model finetuned on codet5p-220m using strategy src_fm_fc_ms_ff
- README.md +50 -3
- added_tokens.json +8 -0
- config.json +33 -0
- generation_config.json +8 -0
- merges.txt +0 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +59 -0
- tokenizer.json +0 -0
- tokenizer_config.json +113 -0
- trainer_state.json +1603 -0
- training_args.bin +3 -0
- vocab.json +0 -0
README.md
CHANGED
@@ -1,3 +1,50 @@
- ---
- license:
- ---
+ ---
+ license: bsd-3-clause
+ base_model: Salesforce/codet5p-220m
+ tags:
+ - generated_from_trainer
+ model-index:
+ - name: finetuning_02_codet5p_src_fm_fc_ms_ff
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # finetuning_02_codet5p_src_fm_fc_ms_ff
+
+ This model is a fine-tuned version of [Salesforce/codet5p-220m](https://huggingface.co/Salesforce/codet5p-220m) on an unknown dataset.
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 5e-05
+ - train_batch_size: 32
+ - eval_batch_size: 64
+ - seed: 42
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: linear
+ - lr_scheduler_warmup_steps: 800
+ - num_epochs: 5
+ - mixed_precision_training: Native AMP
+
+ ### Framework versions
+
+ - Transformers 4.38.2
+ - Pytorch 2.1.0
+ - Datasets 2.20.0
+ - Tokenizers 0.15.2
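The hyperparameters listed in the card above map onto a `Seq2SeqTrainingArguments` configuration roughly like the sketch below. The output directory, evaluation strategy, logging and save cadence are assumptions inferred from the model name and from `trainer_state.json` further down (one evaluation per epoch, `logging_steps: 400`, `save_steps: 500`); only the numeric values come from the card, and the actual training script is not part of this upload.

```python
# Sketch of a Trainer setup matching the hyperparameters in the model card (assumed, not the original script).
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="finetuning_02_codet5p_src_fm_fc_ms_ff",  # assumed; matches the model name
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    seed=42,
    lr_scheduler_type="linear",
    warmup_steps=800,
    num_train_epochs=5,
    fp16=True,                    # "Native AMP" mixed-precision training
    evaluation_strategy="epoch",  # trainer_state.json shows one eval per epoch
    logging_steps=400,            # matches "logging_steps" in trainer_state.json
    save_steps=500,               # matches "save_steps" in trainer_state.json
)
```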
added_tokens.json
ADDED
@@ -0,0 +1,8 @@
+ {
+   "</ECTX>": 32103,
+   "</FCTX>": 32101,
+   "</PRIVATE_FCTX>": 32105,
+   "<ECTX>": 32102,
+   "<FCTX>": 32100,
+   "<PRIVATE_FCTX>": 32104
+ }
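This token-to-id mapping is what `tokenizer.add_special_tokens` produces when the six context markers are appended to the base CodeT5+ vocabulary: the base tokenizer has 32,100 entries, so the new tokens land at ids 32100-32105, and `config.json` below reports `vocab_size: 32106`. A minimal sketch of how such a mapping is typically generated (the exact preprocessing script is not included in this upload):

```python
# Sketch: reproducing the added_tokens.json mapping and the enlarged vocab_size of 32106.
from transformers import AutoTokenizer, T5ForConditionalGeneration

tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5p-220m")
model = T5ForConditionalGeneration.from_pretrained("Salesforce/codet5p-220m")

markers = ["<FCTX>", "</FCTX>", "<ECTX>", "</ECTX>", "<PRIVATE_FCTX>", "</PRIVATE_FCTX>"]
tokenizer.add_special_tokens({"additional_special_tokens": markers})

# New ids follow the original 32,100-entry vocabulary: 32100 ... 32105.
print(tokenizer.convert_tokens_to_ids(markers))

# The embedding matrix must grow to match, giving the vocab_size of 32106 seen in config.json.
model.resize_token_embeddings(len(tokenizer))
```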
config.json
ADDED
@@ -0,0 +1,33 @@
+ {
+   "_name_or_path": "Salesforce/codet5p-220m",
+   "architectures": [
+     "T5ForConditionalGeneration"
+   ],
+   "bos_token_id": 1,
+   "classifier_dropout": 0.0,
+   "d_ff": 3072,
+   "d_kv": 64,
+   "d_model": 768,
+   "decoder_start_token_id": 0,
+   "dense_act_fn": "relu",
+   "dropout_rate": 0.1,
+   "eos_token_id": 2,
+   "feed_forward_proj": "relu",
+   "initializer_factor": 1.0,
+   "is_encoder_decoder": true,
+   "is_gated_act": false,
+   "layer_norm_epsilon": 1e-06,
+   "model_type": "t5",
+   "n_positions": 512,
+   "num_decoder_layers": 12,
+   "num_heads": 12,
+   "num_layers": 12,
+   "output_past": true,
+   "pad_token_id": 0,
+   "relative_attention_max_distance": 128,
+   "relative_attention_num_buckets": 32,
+   "torch_dtype": "float32",
+   "transformers_version": "4.38.2",
+   "use_cache": true,
+   "vocab_size": 32106
+ }
generation_config.json
ADDED
@@ -0,0 +1,8 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 1,
+   "decoder_start_token_id": 0,
+   "eos_token_id": 2,
+   "pad_token_id": 0,
+   "transformers_version": "4.38.2"
+ }
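With the config, generation config, tokenizer files and weights in place, the checkpoint can be loaded straight from this repository. In the sketch below the repository id is a placeholder for wherever this upload lives, and the input string is only an illustration, since the exact src_fm_fc_ms_ff input format is not documented in this card.

```python
# Sketch: loading the fine-tuned checkpoint for generation.
# "your-namespace/finetuning_02_codet5p_src_fm_fc_ms_ff" is a placeholder repo id.
from transformers import AutoTokenizer, T5ForConditionalGeneration

repo_id = "your-namespace/finetuning_02_codet5p_src_fm_fc_ms_ff"
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = T5ForConditionalGeneration.from_pretrained(repo_id)

# Illustrative input only: the focal-context markup used during fine-tuning is an assumption here.
source = "public int add(int a, int b) { return a + b; } <FCTX> class Calculator { ... } </FCTX>"
inputs = tokenizer(source, return_tensors="pt", truncation=True, max_length=512)

# generation_config.json supplies the bos/eos/pad and decoder_start token ids automatically.
outputs = model.generate(**inputs, max_new_tokens=256)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```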
merges.txt
ADDED
The diff for this file is too large to render.
See raw diff
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c3c450de6ca234b9791cabe32b7c7416f795cff2c18f4cbcaffde3c6c2df5286
+ size 891635790
special_tokens_map.json
ADDED
@@ -0,0 +1,59 @@
+ {
+   "additional_special_tokens": [
+     "<FCTX>",
+     "</FCTX>",
+     "<ECTX>",
+     "</ECTX>",
+     "<PRIVATE_FCTX>",
+     "</PRIVATE_FCTX>"
+   ],
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
tokenizer_config.json
ADDED
@@ -0,0 +1,113 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "<mask>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "32100": {
+       "content": "<FCTX>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "32101": {
+       "content": "</FCTX>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "32102": {
+       "content": "<ECTX>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "32103": {
+       "content": "</ECTX>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "32104": {
+       "content": "<PRIVATE_FCTX>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "32105": {
+       "content": "</PRIVATE_FCTX>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "<FCTX>",
+     "</FCTX>",
+     "<ECTX>",
+     "</ECTX>",
+     "<PRIVATE_FCTX>",
+     "</PRIVATE_FCTX>"
+   ],
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "<s>",
+   "eos_token": "</s>",
+   "errors": "replace",
+   "mask_token": "<mask>",
+   "model_max_length": 512,
+   "pad_token": "<pad>",
+   "sep_token": "</s>",
+   "tokenizer_class": "RobertaTokenizer",
+   "trim_offsets": true,
+   "unk_token": "<unk>"
+ }
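Because the six context markers are registered in the tokenizer config with `special: true` and `normalized: false`, the Roberta-style BPE tokenizer keeps each of them as a single token instead of splitting it into subwords. A quick check, assuming the same placeholder repository id as above:

```python
# Sketch: verifying the context markers survive tokenization as single ids (32100 and 32101 here).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("your-namespace/finetuning_02_codet5p_src_fm_fc_ms_ff")

ids = tokenizer("<FCTX> int x = 0; </FCTX>")["input_ids"]
print(tokenizer.convert_ids_to_tokens(ids))  # '<FCTX>' and '</FCTX>' should appear unsplit
```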
trainer_state.json
ADDED
@@ -0,0 +1,1603 @@
+ {
+   "best_metric": 1.0753824710845947,
+   "best_model_checkpoint": "/root/finetuning_executions/finetuning_02_codet5p_src_fm_fc_ms_ff/checkpoint-17548",
+   "epoch": 5.0,
+   "eval_steps": 500,
+   "global_step": 87740,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
11 |
+
{
|
12 |
+
"epoch": 0.02,
|
13 |
+
"grad_norm": 1.5287591218948364,
|
14 |
+
"learning_rate": 2.4687500000000004e-05,
|
15 |
+
"loss": 1.4862,
|
16 |
+
"step": 400
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"epoch": 0.05,
|
20 |
+
"grad_norm": 1.919360876083374,
|
21 |
+
"learning_rate": 4.96875e-05,
|
22 |
+
"loss": 1.1714,
|
23 |
+
"step": 800
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"epoch": 0.07,
|
27 |
+
"grad_norm": 1.1510220766067505,
|
28 |
+
"learning_rate": 4.977283183804923e-05,
|
29 |
+
"loss": 1.1326,
|
30 |
+
"step": 1200
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"epoch": 0.09,
|
34 |
+
"grad_norm": 1.1849422454833984,
|
35 |
+
"learning_rate": 4.9542788129744654e-05,
|
36 |
+
"loss": 1.1176,
|
37 |
+
"step": 1600
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"epoch": 0.11,
|
41 |
+
"grad_norm": 1.052920937538147,
|
42 |
+
"learning_rate": 4.931274442144008e-05,
|
43 |
+
"loss": 1.0981,
|
44 |
+
"step": 2000
|
45 |
+
},
|
46 |
+
{
|
47 |
+
"epoch": 0.14,
|
48 |
+
"grad_norm": 1.174275517463684,
|
49 |
+
"learning_rate": 4.90827007131355e-05,
|
50 |
+
"loss": 1.0811,
|
51 |
+
"step": 2400
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"epoch": 0.16,
|
55 |
+
"grad_norm": 1.0344840288162231,
|
56 |
+
"learning_rate": 4.885265700483092e-05,
|
57 |
+
"loss": 1.065,
|
58 |
+
"step": 2800
|
59 |
+
},
|
60 |
+
{
|
61 |
+
"epoch": 0.18,
|
62 |
+
"grad_norm": 1.2671674489974976,
|
63 |
+
"learning_rate": 4.862261329652634e-05,
|
64 |
+
"loss": 1.0578,
|
65 |
+
"step": 3200
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"epoch": 0.21,
|
69 |
+
"grad_norm": 1.1277002096176147,
|
70 |
+
"learning_rate": 4.839256958822176e-05,
|
71 |
+
"loss": 1.0421,
|
72 |
+
"step": 3600
|
73 |
+
},
|
74 |
+
{
|
75 |
+
"epoch": 0.23,
|
76 |
+
"grad_norm": 1.1894861459732056,
|
77 |
+
"learning_rate": 4.8162525879917186e-05,
|
78 |
+
"loss": 1.031,
|
79 |
+
"step": 4000
|
80 |
+
},
|
81 |
+
{
|
82 |
+
"epoch": 0.25,
|
83 |
+
"grad_norm": 1.2189041376113892,
|
84 |
+
"learning_rate": 4.793248217161261e-05,
|
85 |
+
"loss": 1.0322,
|
86 |
+
"step": 4400
|
87 |
+
},
|
88 |
+
{
|
89 |
+
"epoch": 0.27,
|
90 |
+
"grad_norm": 1.2372210025787354,
|
91 |
+
"learning_rate": 4.770243846330803e-05,
|
92 |
+
"loss": 1.0155,
|
93 |
+
"step": 4800
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"epoch": 0.3,
|
97 |
+
"grad_norm": 1.2500073909759521,
|
98 |
+
"learning_rate": 4.7472394755003454e-05,
|
99 |
+
"loss": 1.0211,
|
100 |
+
"step": 5200
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"epoch": 0.32,
|
104 |
+
"grad_norm": 0.9148824214935303,
|
105 |
+
"learning_rate": 4.724235104669887e-05,
|
106 |
+
"loss": 1.0001,
|
107 |
+
"step": 5600
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"epoch": 0.34,
|
111 |
+
"grad_norm": 1.1473156213760376,
|
112 |
+
"learning_rate": 4.7012307338394294e-05,
|
113 |
+
"loss": 0.9869,
|
114 |
+
"step": 6000
|
115 |
+
},
|
116 |
+
{
|
117 |
+
"epoch": 0.36,
|
118 |
+
"grad_norm": 1.1870834827423096,
|
119 |
+
"learning_rate": 4.6782263630089717e-05,
|
120 |
+
"loss": 0.9799,
|
121 |
+
"step": 6400
|
122 |
+
},
|
123 |
+
{
|
124 |
+
"epoch": 0.39,
|
125 |
+
"grad_norm": 1.1499440670013428,
|
126 |
+
"learning_rate": 4.655221992178514e-05,
|
127 |
+
"loss": 0.9745,
|
128 |
+
"step": 6800
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"epoch": 0.41,
|
132 |
+
"grad_norm": 1.0729453563690186,
|
133 |
+
"learning_rate": 4.632217621348056e-05,
|
134 |
+
"loss": 0.9871,
|
135 |
+
"step": 7200
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"epoch": 0.43,
|
139 |
+
"grad_norm": 1.3007827997207642,
|
140 |
+
"learning_rate": 4.6092132505175986e-05,
|
141 |
+
"loss": 0.9612,
|
142 |
+
"step": 7600
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"epoch": 0.46,
|
146 |
+
"grad_norm": 1.1860408782958984,
|
147 |
+
"learning_rate": 4.586208879687141e-05,
|
148 |
+
"loss": 0.9636,
|
149 |
+
"step": 8000
|
150 |
+
},
|
151 |
+
{
|
152 |
+
"epoch": 0.48,
|
153 |
+
"grad_norm": 1.0349955558776855,
|
154 |
+
"learning_rate": 4.5632045088566825e-05,
|
155 |
+
"loss": 0.9645,
|
156 |
+
"step": 8400
|
157 |
+
},
|
158 |
+
{
|
159 |
+
"epoch": 0.5,
|
160 |
+
"grad_norm": 1.3005322217941284,
|
161 |
+
"learning_rate": 4.5402001380262254e-05,
|
162 |
+
"loss": 0.9536,
|
163 |
+
"step": 8800
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"epoch": 0.52,
|
167 |
+
"grad_norm": 1.2307965755462646,
|
168 |
+
"learning_rate": 4.517195767195768e-05,
|
169 |
+
"loss": 0.9474,
|
170 |
+
"step": 9200
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"epoch": 0.55,
|
174 |
+
"grad_norm": 1.0385469198226929,
|
175 |
+
"learning_rate": 4.49419139636531e-05,
|
176 |
+
"loss": 0.9402,
|
177 |
+
"step": 9600
|
178 |
+
},
|
179 |
+
{
|
180 |
+
"epoch": 0.57,
|
181 |
+
"grad_norm": 1.1734727621078491,
|
182 |
+
"learning_rate": 4.471187025534852e-05,
|
183 |
+
"loss": 0.9321,
|
184 |
+
"step": 10000
|
185 |
+
},
|
186 |
+
{
|
187 |
+
"epoch": 0.59,
|
188 |
+
"grad_norm": 1.3363800048828125,
|
189 |
+
"learning_rate": 4.448182654704394e-05,
|
190 |
+
"loss": 0.9192,
|
191 |
+
"step": 10400
|
192 |
+
},
|
193 |
+
{
|
194 |
+
"epoch": 0.62,
|
195 |
+
"grad_norm": 1.073585033416748,
|
196 |
+
"learning_rate": 4.425178283873936e-05,
|
197 |
+
"loss": 0.9378,
|
198 |
+
"step": 10800
|
199 |
+
},
|
200 |
+
{
|
201 |
+
"epoch": 0.64,
|
202 |
+
"grad_norm": 1.0610324144363403,
|
203 |
+
"learning_rate": 4.4021739130434786e-05,
|
204 |
+
"loss": 0.9187,
|
205 |
+
"step": 11200
|
206 |
+
},
|
207 |
+
{
|
208 |
+
"epoch": 0.66,
|
209 |
+
"grad_norm": 1.039048194885254,
|
210 |
+
"learning_rate": 4.379169542213021e-05,
|
211 |
+
"loss": 0.9191,
|
212 |
+
"step": 11600
|
213 |
+
},
|
214 |
+
{
|
215 |
+
"epoch": 0.68,
|
216 |
+
"grad_norm": 1.0391401052474976,
|
217 |
+
"learning_rate": 4.356165171382563e-05,
|
218 |
+
"loss": 0.91,
|
219 |
+
"step": 12000
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"epoch": 0.71,
|
223 |
+
"grad_norm": 1.082083821296692,
|
224 |
+
"learning_rate": 4.3331608005521054e-05,
|
225 |
+
"loss": 0.9166,
|
226 |
+
"step": 12400
|
227 |
+
},
|
228 |
+
{
|
229 |
+
"epoch": 0.73,
|
230 |
+
"grad_norm": 1.0464677810668945,
|
231 |
+
"learning_rate": 4.310156429721647e-05,
|
232 |
+
"loss": 0.9234,
|
233 |
+
"step": 12800
|
234 |
+
},
|
235 |
+
{
|
236 |
+
"epoch": 0.75,
|
237 |
+
"grad_norm": 1.0795680284500122,
|
238 |
+
"learning_rate": 4.2871520588911894e-05,
|
239 |
+
"loss": 0.9004,
|
240 |
+
"step": 13200
|
241 |
+
},
|
242 |
+
{
|
243 |
+
"epoch": 0.78,
|
244 |
+
"grad_norm": 1.2177696228027344,
|
245 |
+
"learning_rate": 4.2641476880607317e-05,
|
246 |
+
"loss": 0.8991,
|
247 |
+
"step": 13600
|
248 |
+
},
|
249 |
+
{
|
250 |
+
"epoch": 0.8,
|
251 |
+
"grad_norm": 0.9279542565345764,
|
252 |
+
"learning_rate": 4.241143317230274e-05,
|
253 |
+
"loss": 0.901,
|
254 |
+
"step": 14000
|
255 |
+
},
|
256 |
+
{
|
257 |
+
"epoch": 0.82,
|
258 |
+
"grad_norm": 1.2393149137496948,
|
259 |
+
"learning_rate": 4.218138946399816e-05,
|
260 |
+
"loss": 0.8898,
|
261 |
+
"step": 14400
|
262 |
+
},
|
263 |
+
{
|
264 |
+
"epoch": 0.84,
|
265 |
+
"grad_norm": 1.2811025381088257,
|
266 |
+
"learning_rate": 4.1951920864964345e-05,
|
267 |
+
"loss": 0.8975,
|
268 |
+
"step": 14800
|
269 |
+
},
|
270 |
+
{
|
271 |
+
"epoch": 0.87,
|
272 |
+
"grad_norm": 1.0508288145065308,
|
273 |
+
"learning_rate": 4.172187715665977e-05,
|
274 |
+
"loss": 0.897,
|
275 |
+
"step": 15200
|
276 |
+
},
|
277 |
+
{
|
278 |
+
"epoch": 0.89,
|
279 |
+
"grad_norm": 0.962242066860199,
|
280 |
+
"learning_rate": 4.149183344835519e-05,
|
281 |
+
"loss": 0.8776,
|
282 |
+
"step": 15600
|
283 |
+
},
|
284 |
+
{
|
285 |
+
"epoch": 0.91,
|
286 |
+
"grad_norm": 0.9615252017974854,
|
287 |
+
"learning_rate": 4.126178974005061e-05,
|
288 |
+
"loss": 0.873,
|
289 |
+
"step": 16000
|
290 |
+
},
|
291 |
+
{
|
292 |
+
"epoch": 0.93,
|
293 |
+
"grad_norm": 1.040337324142456,
|
294 |
+
"learning_rate": 4.103174603174603e-05,
|
295 |
+
"loss": 0.8831,
|
296 |
+
"step": 16400
|
297 |
+
},
|
298 |
+
{
|
299 |
+
"epoch": 0.96,
|
300 |
+
"grad_norm": 1.0600088834762573,
|
301 |
+
"learning_rate": 4.0801702323441453e-05,
|
302 |
+
"loss": 0.8759,
|
303 |
+
"step": 16800
|
304 |
+
},
|
305 |
+
{
|
306 |
+
"epoch": 0.98,
|
307 |
+
"grad_norm": 0.9814367890357971,
|
308 |
+
"learning_rate": 4.0572233724407636e-05,
|
309 |
+
"loss": 0.8732,
|
310 |
+
"step": 17200
|
311 |
+
},
|
312 |
+
{
|
313 |
+
"epoch": 1.0,
|
314 |
+
"eval_loss": 1.0753824710845947,
|
315 |
+
"eval_runtime": 239.6966,
|
316 |
+
"eval_samples_per_second": 251.464,
|
317 |
+
"eval_steps_per_second": 3.93,
|
318 |
+
"step": 17548
|
319 |
+
},
|
320 |
+
{
|
321 |
+
"epoch": 1.0,
|
322 |
+
"grad_norm": 1.055283784866333,
|
323 |
+
"learning_rate": 4.034219001610306e-05,
|
324 |
+
"loss": 0.8697,
|
325 |
+
"step": 17600
|
326 |
+
},
|
327 |
+
{
|
328 |
+
"epoch": 1.03,
|
329 |
+
"grad_norm": 1.1038569211959839,
|
330 |
+
"learning_rate": 4.011272141706924e-05,
|
331 |
+
"loss": 0.8246,
|
332 |
+
"step": 18000
|
333 |
+
},
|
334 |
+
{
|
335 |
+
"epoch": 1.05,
|
336 |
+
"grad_norm": 0.9692428708076477,
|
337 |
+
"learning_rate": 3.9882677708764665e-05,
|
338 |
+
"loss": 0.8284,
|
339 |
+
"step": 18400
|
340 |
+
},
|
341 |
+
{
|
342 |
+
"epoch": 1.07,
|
343 |
+
"grad_norm": 1.093485951423645,
|
344 |
+
"learning_rate": 3.965263400046009e-05,
|
345 |
+
"loss": 0.8271,
|
346 |
+
"step": 18800
|
347 |
+
},
|
348 |
+
{
|
349 |
+
"epoch": 1.09,
|
350 |
+
"grad_norm": 1.1435869932174683,
|
351 |
+
"learning_rate": 3.942259029215551e-05,
|
352 |
+
"loss": 0.8198,
|
353 |
+
"step": 19200
|
354 |
+
},
|
355 |
+
{
|
356 |
+
"epoch": 1.12,
|
357 |
+
"grad_norm": 1.389695644378662,
|
358 |
+
"learning_rate": 3.9192546583850934e-05,
|
359 |
+
"loss": 0.8223,
|
360 |
+
"step": 19600
|
361 |
+
},
|
362 |
+
{
|
363 |
+
"epoch": 1.14,
|
364 |
+
"grad_norm": 1.081563949584961,
|
365 |
+
"learning_rate": 3.896307798481712e-05,
|
366 |
+
"loss": 0.8078,
|
367 |
+
"step": 20000
|
368 |
+
},
|
369 |
+
{
|
370 |
+
"epoch": 1.16,
|
371 |
+
"grad_norm": 1.20356023311615,
|
372 |
+
"learning_rate": 3.873303427651253e-05,
|
373 |
+
"loss": 0.8216,
|
374 |
+
"step": 20400
|
375 |
+
},
|
376 |
+
{
|
377 |
+
"epoch": 1.19,
|
378 |
+
"grad_norm": 1.2045621871948242,
|
379 |
+
"learning_rate": 3.850299056820796e-05,
|
380 |
+
"loss": 0.8222,
|
381 |
+
"step": 20800
|
382 |
+
},
|
383 |
+
{
|
384 |
+
"epoch": 1.21,
|
385 |
+
"grad_norm": 0.969454824924469,
|
386 |
+
"learning_rate": 3.8272946859903386e-05,
|
387 |
+
"loss": 0.803,
|
388 |
+
"step": 21200
|
389 |
+
},
|
390 |
+
{
|
391 |
+
"epoch": 1.23,
|
392 |
+
"grad_norm": 1.2209794521331787,
|
393 |
+
"learning_rate": 3.804290315159881e-05,
|
394 |
+
"loss": 0.8115,
|
395 |
+
"step": 21600
|
396 |
+
},
|
397 |
+
{
|
398 |
+
"epoch": 1.25,
|
399 |
+
"grad_norm": 1.0688341856002808,
|
400 |
+
"learning_rate": 3.781285944329423e-05,
|
401 |
+
"loss": 0.8051,
|
402 |
+
"step": 22000
|
403 |
+
},
|
404 |
+
{
|
405 |
+
"epoch": 1.28,
|
406 |
+
"grad_norm": 1.1031506061553955,
|
407 |
+
"learning_rate": 3.7582815734989655e-05,
|
408 |
+
"loss": 0.8059,
|
409 |
+
"step": 22400
|
410 |
+
},
|
411 |
+
{
|
412 |
+
"epoch": 1.3,
|
413 |
+
"grad_norm": 0.9878343939781189,
|
414 |
+
"learning_rate": 3.735277202668507e-05,
|
415 |
+
"loss": 0.8054,
|
416 |
+
"step": 22800
|
417 |
+
},
|
418 |
+
{
|
419 |
+
"epoch": 1.32,
|
420 |
+
"grad_norm": 1.327987790107727,
|
421 |
+
"learning_rate": 3.7122728318380494e-05,
|
422 |
+
"loss": 0.8131,
|
423 |
+
"step": 23200
|
424 |
+
},
|
425 |
+
{
|
426 |
+
"epoch": 1.34,
|
427 |
+
"grad_norm": 1.0833244323730469,
|
428 |
+
"learning_rate": 3.689268461007592e-05,
|
429 |
+
"loss": 0.7936,
|
430 |
+
"step": 23600
|
431 |
+
},
|
432 |
+
{
|
433 |
+
"epoch": 1.37,
|
434 |
+
"grad_norm": 1.1618777513504028,
|
435 |
+
"learning_rate": 3.666264090177134e-05,
|
436 |
+
"loss": 0.7991,
|
437 |
+
"step": 24000
|
438 |
+
},
|
439 |
+
{
|
440 |
+
"epoch": 1.39,
|
441 |
+
"grad_norm": 1.022359013557434,
|
442 |
+
"learning_rate": 3.643259719346676e-05,
|
443 |
+
"loss": 0.8002,
|
444 |
+
"step": 24400
|
445 |
+
},
|
446 |
+
{
|
447 |
+
"epoch": 1.41,
|
448 |
+
"grad_norm": 1.2475693225860596,
|
449 |
+
"learning_rate": 3.6202553485162186e-05,
|
450 |
+
"loss": 0.8001,
|
451 |
+
"step": 24800
|
452 |
+
},
|
453 |
+
{
|
454 |
+
"epoch": 1.44,
|
455 |
+
"grad_norm": 1.1127784252166748,
|
456 |
+
"learning_rate": 3.59725097768576e-05,
|
457 |
+
"loss": 0.7865,
|
458 |
+
"step": 25200
|
459 |
+
},
|
460 |
+
{
|
461 |
+
"epoch": 1.46,
|
462 |
+
"grad_norm": 1.2091097831726074,
|
463 |
+
"learning_rate": 3.5742466068553025e-05,
|
464 |
+
"loss": 0.7899,
|
465 |
+
"step": 25600
|
466 |
+
},
|
467 |
+
{
|
468 |
+
"epoch": 1.48,
|
469 |
+
"grad_norm": 0.9588549733161926,
|
470 |
+
"learning_rate": 3.551242236024845e-05,
|
471 |
+
"loss": 0.7942,
|
472 |
+
"step": 26000
|
473 |
+
},
|
474 |
+
{
|
475 |
+
"epoch": 1.5,
|
476 |
+
"grad_norm": 1.195241093635559,
|
477 |
+
"learning_rate": 3.528237865194387e-05,
|
478 |
+
"loss": 0.7813,
|
479 |
+
"step": 26400
|
480 |
+
},
|
481 |
+
{
|
482 |
+
"epoch": 1.53,
|
483 |
+
"grad_norm": 0.9788525700569153,
|
484 |
+
"learning_rate": 3.5052334943639294e-05,
|
485 |
+
"loss": 0.7805,
|
486 |
+
"step": 26800
|
487 |
+
},
|
488 |
+
{
|
489 |
+
"epoch": 1.55,
|
490 |
+
"grad_norm": 1.2794181108474731,
|
491 |
+
"learning_rate": 3.482286634460548e-05,
|
492 |
+
"loss": 0.7763,
|
493 |
+
"step": 27200
|
494 |
+
},
|
495 |
+
{
|
496 |
+
"epoch": 1.57,
|
497 |
+
"grad_norm": 0.9700046181678772,
|
498 |
+
"learning_rate": 3.45928226363009e-05,
|
499 |
+
"loss": 0.7801,
|
500 |
+
"step": 27600
|
501 |
+
},
|
502 |
+
{
|
503 |
+
"epoch": 1.6,
|
504 |
+
"grad_norm": 1.2326452732086182,
|
505 |
+
"learning_rate": 3.436335403726708e-05,
|
506 |
+
"loss": 0.7864,
|
507 |
+
"step": 28000
|
508 |
+
},
|
509 |
+
{
|
510 |
+
"epoch": 1.62,
|
511 |
+
"grad_norm": 1.2367639541625977,
|
512 |
+
"learning_rate": 3.4133310328962506e-05,
|
513 |
+
"loss": 0.7845,
|
514 |
+
"step": 28400
|
515 |
+
},
|
516 |
+
{
|
517 |
+
"epoch": 1.64,
|
518 |
+
"grad_norm": 1.077854871749878,
|
519 |
+
"learning_rate": 3.390326662065793e-05,
|
520 |
+
"loss": 0.7869,
|
521 |
+
"step": 28800
|
522 |
+
},
|
523 |
+
{
|
524 |
+
"epoch": 1.66,
|
525 |
+
"grad_norm": 1.0575716495513916,
|
526 |
+
"learning_rate": 3.3673222912353345e-05,
|
527 |
+
"loss": 0.7838,
|
528 |
+
"step": 29200
|
529 |
+
},
|
530 |
+
{
|
531 |
+
"epoch": 1.69,
|
532 |
+
"grad_norm": 1.1674555540084839,
|
533 |
+
"learning_rate": 3.344317920404877e-05,
|
534 |
+
"loss": 0.7827,
|
535 |
+
"step": 29600
|
536 |
+
},
|
537 |
+
{
|
538 |
+
"epoch": 1.71,
|
539 |
+
"grad_norm": 1.148335337638855,
|
540 |
+
"learning_rate": 3.321313549574419e-05,
|
541 |
+
"loss": 0.7781,
|
542 |
+
"step": 30000
|
543 |
+
},
|
544 |
+
{
|
545 |
+
"epoch": 1.73,
|
546 |
+
"grad_norm": 1.0287448167800903,
|
547 |
+
"learning_rate": 3.2983091787439614e-05,
|
548 |
+
"loss": 0.7652,
|
549 |
+
"step": 30400
|
550 |
+
},
|
551 |
+
{
|
552 |
+
"epoch": 1.76,
|
553 |
+
"grad_norm": 1.2461556196212769,
|
554 |
+
"learning_rate": 3.275304807913504e-05,
|
555 |
+
"loss": 0.7773,
|
556 |
+
"step": 30800
|
557 |
+
},
|
558 |
+
{
|
559 |
+
"epoch": 1.78,
|
560 |
+
"grad_norm": 1.1946007013320923,
|
561 |
+
"learning_rate": 3.252357948010122e-05,
|
562 |
+
"loss": 0.7694,
|
563 |
+
"step": 31200
|
564 |
+
},
|
565 |
+
{
|
566 |
+
"epoch": 1.8,
|
567 |
+
"grad_norm": 1.019499659538269,
|
568 |
+
"learning_rate": 3.229353577179664e-05,
|
569 |
+
"loss": 0.7803,
|
570 |
+
"step": 31600
|
571 |
+
},
|
572 |
+
{
|
573 |
+
"epoch": 1.82,
|
574 |
+
"grad_norm": 1.3375366926193237,
|
575 |
+
"learning_rate": 3.2063492063492065e-05,
|
576 |
+
"loss": 0.7684,
|
577 |
+
"step": 32000
|
578 |
+
},
|
579 |
+
{
|
580 |
+
"epoch": 1.85,
|
581 |
+
"grad_norm": 1.2477443218231201,
|
582 |
+
"learning_rate": 3.183344835518749e-05,
|
583 |
+
"loss": 0.7657,
|
584 |
+
"step": 32400
|
585 |
+
},
|
586 |
+
{
|
587 |
+
"epoch": 1.87,
|
588 |
+
"grad_norm": 1.1749552488327026,
|
589 |
+
"learning_rate": 3.160340464688291e-05,
|
590 |
+
"loss": 0.767,
|
591 |
+
"step": 32800
|
592 |
+
},
|
593 |
+
{
|
594 |
+
"epoch": 1.89,
|
595 |
+
"grad_norm": 1.0863006114959717,
|
596 |
+
"learning_rate": 3.1373360938578334e-05,
|
597 |
+
"loss": 0.767,
|
598 |
+
"step": 33200
|
599 |
+
},
|
600 |
+
{
|
601 |
+
"epoch": 1.91,
|
602 |
+
"grad_norm": 0.9976168870925903,
|
603 |
+
"learning_rate": 3.114389233954452e-05,
|
604 |
+
"loss": 0.7536,
|
605 |
+
"step": 33600
|
606 |
+
},
|
607 |
+
{
|
608 |
+
"epoch": 1.94,
|
609 |
+
"grad_norm": 1.1924540996551514,
|
610 |
+
"learning_rate": 3.09144237405107e-05,
|
611 |
+
"loss": 0.7622,
|
612 |
+
"step": 34000
|
613 |
+
},
|
614 |
+
{
|
615 |
+
"epoch": 1.96,
|
616 |
+
"grad_norm": 1.0996850728988647,
|
617 |
+
"learning_rate": 3.068438003220612e-05,
|
618 |
+
"loss": 0.7569,
|
619 |
+
"step": 34400
|
620 |
+
},
|
621 |
+
{
|
622 |
+
"epoch": 1.98,
|
623 |
+
"grad_norm": 1.2163282632827759,
|
624 |
+
"learning_rate": 3.0454336323901546e-05,
|
625 |
+
"loss": 0.7667,
|
626 |
+
"step": 34800
|
627 |
+
},
|
628 |
+
{
|
629 |
+
"epoch": 2.0,
|
630 |
+
"eval_loss": 1.0829898118972778,
|
631 |
+
"eval_runtime": 239.7954,
|
632 |
+
"eval_samples_per_second": 251.36,
|
633 |
+
"eval_steps_per_second": 3.928,
|
634 |
+
"step": 35096
|
635 |
+
},
|
636 |
+
{
|
637 |
+
"epoch": 2.01,
|
638 |
+
"grad_norm": 1.1651737689971924,
|
639 |
+
"learning_rate": 3.0224292615596966e-05,
|
640 |
+
"loss": 0.7442,
|
641 |
+
"step": 35200
|
642 |
+
},
|
643 |
+
{
|
644 |
+
"epoch": 2.03,
|
645 |
+
"grad_norm": 1.1764894723892212,
|
646 |
+
"learning_rate": 2.999424890729239e-05,
|
647 |
+
"loss": 0.714,
|
648 |
+
"step": 35600
|
649 |
+
},
|
650 |
+
{
|
651 |
+
"epoch": 2.05,
|
652 |
+
"grad_norm": 1.1951353549957275,
|
653 |
+
"learning_rate": 2.976420519898781e-05,
|
654 |
+
"loss": 0.7076,
|
655 |
+
"step": 36000
|
656 |
+
},
|
657 |
+
{
|
658 |
+
"epoch": 2.07,
|
659 |
+
"grad_norm": 1.1282097101211548,
|
660 |
+
"learning_rate": 2.953416149068323e-05,
|
661 |
+
"loss": 0.7105,
|
662 |
+
"step": 36400
|
663 |
+
},
|
664 |
+
{
|
665 |
+
"epoch": 2.1,
|
666 |
+
"grad_norm": 1.3397319316864014,
|
667 |
+
"learning_rate": 2.9304117782378654e-05,
|
668 |
+
"loss": 0.7023,
|
669 |
+
"step": 36800
|
670 |
+
},
|
671 |
+
{
|
672 |
+
"epoch": 2.12,
|
673 |
+
"grad_norm": 1.1150188446044922,
|
674 |
+
"learning_rate": 2.9074074074074077e-05,
|
675 |
+
"loss": 0.7035,
|
676 |
+
"step": 37200
|
677 |
+
},
|
678 |
+
{
|
679 |
+
"epoch": 2.14,
|
680 |
+
"grad_norm": 1.2119678258895874,
|
681 |
+
"learning_rate": 2.8844030365769497e-05,
|
682 |
+
"loss": 0.7168,
|
683 |
+
"step": 37600
|
684 |
+
},
|
685 |
+
{
|
686 |
+
"epoch": 2.17,
|
687 |
+
"grad_norm": 1.167506456375122,
|
688 |
+
"learning_rate": 2.861398665746492e-05,
|
689 |
+
"loss": 0.7125,
|
690 |
+
"step": 38000
|
691 |
+
},
|
692 |
+
{
|
693 |
+
"epoch": 2.19,
|
694 |
+
"grad_norm": 1.0915708541870117,
|
695 |
+
"learning_rate": 2.8384518058431102e-05,
|
696 |
+
"loss": 0.7101,
|
697 |
+
"step": 38400
|
698 |
+
},
|
699 |
+
{
|
700 |
+
"epoch": 2.21,
|
701 |
+
"grad_norm": 1.135021686553955,
|
702 |
+
"learning_rate": 2.8154474350126525e-05,
|
703 |
+
"loss": 0.7145,
|
704 |
+
"step": 38800
|
705 |
+
},
|
706 |
+
{
|
707 |
+
"epoch": 2.23,
|
708 |
+
"grad_norm": 1.3739718198776245,
|
709 |
+
"learning_rate": 2.792443064182195e-05,
|
710 |
+
"loss": 0.7096,
|
711 |
+
"step": 39200
|
712 |
+
},
|
713 |
+
{
|
714 |
+
"epoch": 2.26,
|
715 |
+
"grad_norm": 1.1629129648208618,
|
716 |
+
"learning_rate": 2.7694386933517368e-05,
|
717 |
+
"loss": 0.7053,
|
718 |
+
"step": 39600
|
719 |
+
},
|
720 |
+
{
|
721 |
+
"epoch": 2.28,
|
722 |
+
"grad_norm": 0.9963687062263489,
|
723 |
+
"learning_rate": 2.746434322521279e-05,
|
724 |
+
"loss": 0.7012,
|
725 |
+
"step": 40000
|
726 |
+
},
|
727 |
+
{
|
728 |
+
"epoch": 2.3,
|
729 |
+
"grad_norm": 1.0318909883499146,
|
730 |
+
"learning_rate": 2.7234874626178974e-05,
|
731 |
+
"loss": 0.713,
|
732 |
+
"step": 40400
|
733 |
+
},
|
734 |
+
{
|
735 |
+
"epoch": 2.33,
|
736 |
+
"grad_norm": 1.0613532066345215,
|
737 |
+
"learning_rate": 2.7004830917874397e-05,
|
738 |
+
"loss": 0.704,
|
739 |
+
"step": 40800
|
740 |
+
},
|
741 |
+
{
|
742 |
+
"epoch": 2.35,
|
743 |
+
"grad_norm": 1.1298637390136719,
|
744 |
+
"learning_rate": 2.677478720956982e-05,
|
745 |
+
"loss": 0.708,
|
746 |
+
"step": 41200
|
747 |
+
},
|
748 |
+
{
|
749 |
+
"epoch": 2.37,
|
750 |
+
"grad_norm": 1.1079801321029663,
|
751 |
+
"learning_rate": 2.654474350126524e-05,
|
752 |
+
"loss": 0.6975,
|
753 |
+
"step": 41600
|
754 |
+
},
|
755 |
+
{
|
756 |
+
"epoch": 2.39,
|
757 |
+
"grad_norm": 1.0751113891601562,
|
758 |
+
"learning_rate": 2.6314699792960662e-05,
|
759 |
+
"loss": 0.6999,
|
760 |
+
"step": 42000
|
761 |
+
},
|
762 |
+
{
|
763 |
+
"epoch": 2.42,
|
764 |
+
"grad_norm": 1.1240077018737793,
|
765 |
+
"learning_rate": 2.6085231193926845e-05,
|
766 |
+
"loss": 0.7055,
|
767 |
+
"step": 42400
|
768 |
+
},
|
769 |
+
{
|
770 |
+
"epoch": 2.44,
|
771 |
+
"grad_norm": 1.0788402557373047,
|
772 |
+
"learning_rate": 2.5855187485622268e-05,
|
773 |
+
"loss": 0.7055,
|
774 |
+
"step": 42800
|
775 |
+
},
|
776 |
+
{
|
777 |
+
"epoch": 2.46,
|
778 |
+
"grad_norm": 1.00369131565094,
|
779 |
+
"learning_rate": 2.562514377731769e-05,
|
780 |
+
"loss": 0.6949,
|
781 |
+
"step": 43200
|
782 |
+
},
|
783 |
+
{
|
784 |
+
"epoch": 2.48,
|
785 |
+
"grad_norm": 1.1382017135620117,
|
786 |
+
"learning_rate": 2.539510006901311e-05,
|
787 |
+
"loss": 0.7093,
|
788 |
+
"step": 43600
|
789 |
+
},
|
790 |
+
{
|
791 |
+
"epoch": 2.51,
|
792 |
+
"grad_norm": 1.0273314714431763,
|
793 |
+
"learning_rate": 2.5165056360708534e-05,
|
794 |
+
"loss": 0.7066,
|
795 |
+
"step": 44000
|
796 |
+
},
|
797 |
+
{
|
798 |
+
"epoch": 2.53,
|
799 |
+
"grad_norm": 1.331964373588562,
|
800 |
+
"learning_rate": 2.4935012652403957e-05,
|
801 |
+
"loss": 0.7037,
|
802 |
+
"step": 44400
|
803 |
+
},
|
804 |
+
{
|
805 |
+
"epoch": 2.55,
|
806 |
+
"grad_norm": 1.102133870124817,
|
807 |
+
"learning_rate": 2.470496894409938e-05,
|
808 |
+
"loss": 0.7028,
|
809 |
+
"step": 44800
|
810 |
+
},
|
811 |
+
{
|
812 |
+
"epoch": 2.58,
|
813 |
+
"grad_norm": 1.131090521812439,
|
814 |
+
"learning_rate": 2.4474925235794803e-05,
|
815 |
+
"loss": 0.6871,
|
816 |
+
"step": 45200
|
817 |
+
},
|
818 |
+
{
|
819 |
+
"epoch": 2.6,
|
820 |
+
"grad_norm": 1.1939336061477661,
|
821 |
+
"learning_rate": 2.4244881527490222e-05,
|
822 |
+
"loss": 0.6966,
|
823 |
+
"step": 45600
|
824 |
+
},
|
825 |
+
{
|
826 |
+
"epoch": 2.62,
|
827 |
+
"grad_norm": 1.344831109046936,
|
828 |
+
"learning_rate": 2.4014837819185645e-05,
|
829 |
+
"loss": 0.6933,
|
830 |
+
"step": 46000
|
831 |
+
},
|
832 |
+
{
|
833 |
+
"epoch": 2.64,
|
834 |
+
"grad_norm": 0.9559622406959534,
|
835 |
+
"learning_rate": 2.3784794110881068e-05,
|
836 |
+
"loss": 0.6916,
|
837 |
+
"step": 46400
|
838 |
+
},
|
839 |
+
{
|
840 |
+
"epoch": 2.67,
|
841 |
+
"grad_norm": 1.182010293006897,
|
842 |
+
"learning_rate": 2.355475040257649e-05,
|
843 |
+
"loss": 0.6903,
|
844 |
+
"step": 46800
|
845 |
+
},
|
846 |
+
{
|
847 |
+
"epoch": 2.69,
|
848 |
+
"grad_norm": 1.080712080001831,
|
849 |
+
"learning_rate": 2.3325281803542674e-05,
|
850 |
+
"loss": 0.6965,
|
851 |
+
"step": 47200
|
852 |
+
},
|
853 |
+
{
|
854 |
+
"epoch": 2.71,
|
855 |
+
"grad_norm": 1.2468616962432861,
|
856 |
+
"learning_rate": 2.3095238095238097e-05,
|
857 |
+
"loss": 0.6906,
|
858 |
+
"step": 47600
|
859 |
+
},
|
860 |
+
{
|
861 |
+
"epoch": 2.74,
|
862 |
+
"grad_norm": 1.0585706233978271,
|
863 |
+
"learning_rate": 2.286519438693352e-05,
|
864 |
+
"loss": 0.6966,
|
865 |
+
"step": 48000
|
866 |
+
},
|
867 |
+
{
|
868 |
+
"epoch": 2.76,
|
869 |
+
"grad_norm": 1.2725940942764282,
|
870 |
+
"learning_rate": 2.2635150678628943e-05,
|
871 |
+
"loss": 0.6894,
|
872 |
+
"step": 48400
|
873 |
+
},
|
874 |
+
{
|
875 |
+
"epoch": 2.78,
|
876 |
+
"grad_norm": 1.1753593683242798,
|
877 |
+
"learning_rate": 2.2405106970324362e-05,
|
878 |
+
"loss": 0.6806,
|
879 |
+
"step": 48800
|
880 |
+
},
|
881 |
+
{
|
882 |
+
"epoch": 2.8,
|
883 |
+
"grad_norm": 1.117319941520691,
|
884 |
+
"learning_rate": 2.2175063262019785e-05,
|
885 |
+
"loss": 0.6879,
|
886 |
+
"step": 49200
|
887 |
+
},
|
888 |
+
{
|
889 |
+
"epoch": 2.83,
|
890 |
+
"grad_norm": 1.2521744966506958,
|
891 |
+
"learning_rate": 2.194501955371521e-05,
|
892 |
+
"loss": 0.6808,
|
893 |
+
"step": 49600
|
894 |
+
},
|
895 |
+
{
|
896 |
+
"epoch": 2.85,
|
897 |
+
"grad_norm": 1.396971344947815,
|
898 |
+
"learning_rate": 2.1714975845410628e-05,
|
899 |
+
"loss": 0.6798,
|
900 |
+
"step": 50000
|
901 |
+
},
|
902 |
+
{
|
903 |
+
"epoch": 2.87,
|
904 |
+
"grad_norm": 1.0855846405029297,
|
905 |
+
"learning_rate": 2.148493213710605e-05,
|
906 |
+
"loss": 0.6978,
|
907 |
+
"step": 50400
|
908 |
+
},
|
909 |
+
{
|
910 |
+
"epoch": 2.89,
|
911 |
+
"grad_norm": 1.199013113975525,
|
912 |
+
"learning_rate": 2.1254888428801474e-05,
|
913 |
+
"loss": 0.6882,
|
914 |
+
"step": 50800
|
915 |
+
},
|
916 |
+
{
|
917 |
+
"epoch": 2.92,
|
918 |
+
"grad_norm": 1.366407871246338,
|
919 |
+
"learning_rate": 2.1024844720496894e-05,
|
920 |
+
"loss": 0.6882,
|
921 |
+
"step": 51200
|
922 |
+
},
|
923 |
+
{
|
924 |
+
"epoch": 2.94,
|
925 |
+
"grad_norm": 1.1709498167037964,
|
926 |
+
"learning_rate": 2.0794801012192317e-05,
|
927 |
+
"loss": 0.6907,
|
928 |
+
"step": 51600
|
929 |
+
},
|
930 |
+
{
|
931 |
+
"epoch": 2.96,
|
932 |
+
"grad_norm": 1.1881307363510132,
|
933 |
+
"learning_rate": 2.05653324131585e-05,
|
934 |
+
"loss": 0.6883,
|
935 |
+
"step": 52000
|
936 |
+
},
|
937 |
+
{
|
938 |
+
"epoch": 2.99,
|
939 |
+
"grad_norm": 1.4105783700942993,
|
940 |
+
"learning_rate": 2.0335288704853922e-05,
|
941 |
+
"loss": 0.6833,
|
942 |
+
"step": 52400
|
943 |
+
},
|
944 |
+
{
|
945 |
+
"epoch": 3.0,
|
946 |
+
"eval_loss": 1.0844900608062744,
|
947 |
+
"eval_runtime": 239.8565,
|
948 |
+
"eval_samples_per_second": 251.296,
|
949 |
+
"eval_steps_per_second": 3.927,
|
950 |
+
"step": 52644
|
951 |
+
},
|
952 |
+
{
|
953 |
+
"epoch": 3.01,
|
954 |
+
"grad_norm": 1.4675981998443604,
|
955 |
+
"learning_rate": 2.0105244996549345e-05,
|
956 |
+
"loss": 0.6679,
|
957 |
+
"step": 52800
|
958 |
+
},
|
959 |
+
{
|
960 |
+
"epoch": 3.03,
|
961 |
+
"grad_norm": 1.151491403579712,
|
962 |
+
"learning_rate": 1.9875201288244768e-05,
|
963 |
+
"loss": 0.6501,
|
964 |
+
"step": 53200
|
965 |
+
},
|
966 |
+
{
|
967 |
+
"epoch": 3.05,
|
968 |
+
"grad_norm": 1.0938260555267334,
|
969 |
+
"learning_rate": 1.964515757994019e-05,
|
970 |
+
"loss": 0.6396,
|
971 |
+
"step": 53600
|
972 |
+
},
|
973 |
+
{
|
974 |
+
"epoch": 3.08,
|
975 |
+
"grad_norm": 1.055185317993164,
|
976 |
+
"learning_rate": 1.941511387163561e-05,
|
977 |
+
"loss": 0.6442,
|
978 |
+
"step": 54000
|
979 |
+
},
|
980 |
+
{
|
981 |
+
"epoch": 3.1,
|
982 |
+
"grad_norm": 1.0307785272598267,
|
983 |
+
"learning_rate": 1.9185645272601797e-05,
|
984 |
+
"loss": 0.6489,
|
985 |
+
"step": 54400
|
986 |
+
},
|
987 |
+
{
|
988 |
+
"epoch": 3.12,
|
989 |
+
"grad_norm": 1.184102177619934,
|
990 |
+
"learning_rate": 1.8955601564297217e-05,
|
991 |
+
"loss": 0.6454,
|
992 |
+
"step": 54800
|
993 |
+
},
|
994 |
+
{
|
995 |
+
"epoch": 3.15,
|
996 |
+
"grad_norm": 1.1798542737960815,
|
997 |
+
"learning_rate": 1.872555785599264e-05,
|
998 |
+
"loss": 0.6552,
|
999 |
+
"step": 55200
|
1000 |
+
},
|
1001 |
+
{
|
1002 |
+
"epoch": 3.17,
|
1003 |
+
"grad_norm": 1.1375089883804321,
|
1004 |
+
"learning_rate": 1.8496089256958822e-05,
|
1005 |
+
"loss": 0.6359,
|
1006 |
+
"step": 55600
|
1007 |
+
},
|
1008 |
+
{
|
1009 |
+
"epoch": 3.19,
|
1010 |
+
"grad_norm": 1.0475974082946777,
|
1011 |
+
"learning_rate": 1.8266045548654245e-05,
|
1012 |
+
"loss": 0.6374,
|
1013 |
+
"step": 56000
|
1014 |
+
},
|
1015 |
+
{
|
1016 |
+
"epoch": 3.21,
|
1017 |
+
"grad_norm": 1.0948106050491333,
|
1018 |
+
"learning_rate": 1.803600184034967e-05,
|
1019 |
+
"loss": 0.6431,
|
1020 |
+
"step": 56400
|
1021 |
+
},
|
1022 |
+
{
|
1023 |
+
"epoch": 3.24,
|
1024 |
+
"grad_norm": 1.1488378047943115,
|
1025 |
+
"learning_rate": 1.7805958132045088e-05,
|
1026 |
+
"loss": 0.646,
|
1027 |
+
"step": 56800
|
1028 |
+
},
|
1029 |
+
{
|
1030 |
+
"epoch": 3.26,
|
1031 |
+
"grad_norm": 1.1257692575454712,
|
1032 |
+
"learning_rate": 1.757591442374051e-05,
|
1033 |
+
"loss": 0.6408,
|
1034 |
+
"step": 57200
|
1035 |
+
},
|
1036 |
+
{
|
1037 |
+
"epoch": 3.28,
|
1038 |
+
"grad_norm": 1.1101455688476562,
|
1039 |
+
"learning_rate": 1.7345870715435934e-05,
|
1040 |
+
"loss": 0.6389,
|
1041 |
+
"step": 57600
|
1042 |
+
},
|
1043 |
+
{
|
1044 |
+
"epoch": 3.31,
|
1045 |
+
"grad_norm": 1.329904556274414,
|
1046 |
+
"learning_rate": 1.7115827007131354e-05,
|
1047 |
+
"loss": 0.6399,
|
1048 |
+
"step": 58000
|
1049 |
+
},
|
1050 |
+
{
|
1051 |
+
"epoch": 3.33,
|
1052 |
+
"grad_norm": 1.2944815158843994,
|
1053 |
+
"learning_rate": 1.6885783298826777e-05,
|
1054 |
+
"loss": 0.6421,
|
1055 |
+
"step": 58400
|
1056 |
+
},
|
1057 |
+
{
|
1058 |
+
"epoch": 3.35,
|
1059 |
+
"grad_norm": 1.1607027053833008,
|
1060 |
+
"learning_rate": 1.6655739590522203e-05,
|
1061 |
+
"loss": 0.637,
|
1062 |
+
"step": 58800
|
1063 |
+
},
|
1064 |
+
{
|
1065 |
+
"epoch": 3.37,
|
1066 |
+
"grad_norm": 1.0392543077468872,
|
1067 |
+
"learning_rate": 1.6426270991488382e-05,
|
1068 |
+
"loss": 0.6411,
|
1069 |
+
"step": 59200
|
1070 |
+
},
|
1071 |
+
{
|
1072 |
+
"epoch": 3.4,
|
1073 |
+
"grad_norm": 1.3244273662567139,
|
1074 |
+
"learning_rate": 1.6196227283183805e-05,
|
1075 |
+
"loss": 0.6473,
|
1076 |
+
"step": 59600
|
1077 |
+
},
|
1078 |
+
{
|
1079 |
+
"epoch": 3.42,
|
1080 |
+
"grad_norm": 1.1351373195648193,
|
1081 |
+
"learning_rate": 1.5966183574879228e-05,
|
1082 |
+
"loss": 0.6298,
|
1083 |
+
"step": 60000
|
1084 |
+
},
|
1085 |
+
{
|
1086 |
+
"epoch": 3.44,
|
1087 |
+
"grad_norm": 1.1698590517044067,
|
1088 |
+
"learning_rate": 1.573613986657465e-05,
|
1089 |
+
"loss": 0.6355,
|
1090 |
+
"step": 60400
|
1091 |
+
},
|
1092 |
+
{
|
1093 |
+
"epoch": 3.46,
|
1094 |
+
"grad_norm": 1.2005553245544434,
|
1095 |
+
"learning_rate": 1.5506671267540834e-05,
|
1096 |
+
"loss": 0.6395,
|
1097 |
+
"step": 60800
|
1098 |
+
},
|
1099 |
+
{
|
1100 |
+
"epoch": 3.49,
|
1101 |
+
"grad_norm": 0.97503662109375,
|
1102 |
+
"learning_rate": 1.5276627559236257e-05,
|
1103 |
+
"loss": 0.6437,
|
1104 |
+
"step": 61200
|
1105 |
+
},
|
1106 |
+
{
|
1107 |
+
"epoch": 3.51,
|
1108 |
+
"grad_norm": 1.2518908977508545,
|
1109 |
+
"learning_rate": 1.5046583850931678e-05,
|
1110 |
+
"loss": 0.6385,
|
1111 |
+
"step": 61600
|
1112 |
+
},
|
1113 |
+
{
|
1114 |
+
"epoch": 3.53,
|
1115 |
+
"grad_norm": 1.2661454677581787,
|
1116 |
+
"learning_rate": 1.48165401426271e-05,
|
1117 |
+
"loss": 0.6403,
|
1118 |
+
"step": 62000
|
1119 |
+
},
|
1120 |
+
{
|
1121 |
+
"epoch": 3.56,
|
1122 |
+
"grad_norm": 1.2612046003341675,
|
1123 |
+
"learning_rate": 1.4586496434322523e-05,
|
1124 |
+
"loss": 0.6442,
|
1125 |
+
"step": 62400
|
1126 |
+
},
|
1127 |
+
{
|
1128 |
+
"epoch": 3.58,
|
1129 |
+
"grad_norm": 1.1942335367202759,
|
1130 |
+
"learning_rate": 1.4356452726017944e-05,
|
1131 |
+
"loss": 0.6383,
|
1132 |
+
"step": 62800
|
1133 |
+
},
|
1134 |
+
{
|
1135 |
+
"epoch": 3.6,
|
1136 |
+
"grad_norm": 1.1030133962631226,
|
1137 |
+
"learning_rate": 1.4126409017713365e-05,
|
1138 |
+
"loss": 0.6277,
|
1139 |
+
"step": 63200
|
1140 |
+
},
|
1141 |
+
{
|
1142 |
+
"epoch": 3.62,
|
1143 |
+
"grad_norm": 1.2485852241516113,
|
1144 |
+
"learning_rate": 1.3896365309408788e-05,
|
1145 |
+
"loss": 0.6414,
|
1146 |
+
"step": 63600
|
1147 |
+
},
|
1148 |
+
{
|
1149 |
+
"epoch": 3.65,
|
1150 |
+
"grad_norm": 0.9925839900970459,
|
1151 |
+
"learning_rate": 1.366632160110421e-05,
|
1152 |
+
"loss": 0.6337,
|
1153 |
+
"step": 64000
|
1154 |
+
},
|
1155 |
+
{
|
1156 |
+
"epoch": 3.67,
|
1157 |
+
"grad_norm": 1.3896905183792114,
|
1158 |
+
"learning_rate": 1.343627789279963e-05,
|
1159 |
+
"loss": 0.6314,
|
1160 |
+
"step": 64400
|
1161 |
+
},
|
1162 |
+
{
|
1163 |
+
"epoch": 3.69,
|
1164 |
+
"grad_norm": 1.1392475366592407,
|
1165 |
+
"learning_rate": 1.3206809293765815e-05,
|
1166 |
+
"loss": 0.6312,
|
1167 |
+
"step": 64800
|
1168 |
+
},
|
1169 |
+
{
|
1170 |
+
"epoch": 3.72,
|
1171 |
+
"grad_norm": 1.2051880359649658,
|
1172 |
+
"learning_rate": 1.2976765585461237e-05,
|
1173 |
+
"loss": 0.6198,
|
1174 |
+
"step": 65200
|
1175 |
+
},
|
1176 |
+
{
|
1177 |
+
"epoch": 3.74,
|
1178 |
+
"grad_norm": 1.3581410646438599,
|
1179 |
+
"learning_rate": 1.2746721877156661e-05,
|
1180 |
+
"loss": 0.634,
|
1181 |
+
"step": 65600
|
1182 |
+
},
|
1183 |
+
{
|
1184 |
+
"epoch": 3.76,
|
1185 |
+
"grad_norm": 1.4071406126022339,
|
1186 |
+
"learning_rate": 1.2516678168852084e-05,
|
1187 |
+
"loss": 0.633,
|
1188 |
+
"step": 66000
|
1189 |
+
},
|
1190 |
+
{
|
1191 |
+
"epoch": 3.78,
|
1192 |
+
"grad_norm": 1.1921656131744385,
|
1193 |
+
"learning_rate": 1.2286634460547504e-05,
|
1194 |
+
"loss": 0.6206,
|
1195 |
+
"step": 66400
|
1196 |
+
},
|
1197 |
+
{
|
1198 |
+
"epoch": 3.81,
|
1199 |
+
"grad_norm": 1.4039461612701416,
|
1200 |
+
"learning_rate": 1.2056590752242927e-05,
|
1201 |
+
"loss": 0.6341,
|
1202 |
+
"step": 66800
|
1203 |
+
},
|
1204 |
+
{
|
1205 |
+
"epoch": 3.83,
|
1206 |
+
"grad_norm": 1.3369255065917969,
|
1207 |
+
"learning_rate": 1.182654704393835e-05,
|
1208 |
+
"loss": 0.6427,
|
1209 |
+
"step": 67200
|
1210 |
+
},
|
1211 |
+
{
|
1212 |
+
"epoch": 3.85,
|
1213 |
+
"grad_norm": 1.2129446268081665,
|
1214 |
+
"learning_rate": 1.1596503335633771e-05,
|
1215 |
+
"loss": 0.6293,
|
1216 |
+
"step": 67600
|
1217 |
+
},
|
1218 |
+
{
|
1219 |
+
"epoch": 3.88,
|
1220 |
+
"grad_norm": 1.264256238937378,
|
1221 |
+
"learning_rate": 1.1366459627329192e-05,
|
1222 |
+
"loss": 0.6282,
|
1223 |
+
"step": 68000
|
1224 |
+
},
|
1225 |
+
{
|
1226 |
+
"epoch": 3.9,
|
1227 |
+
"grad_norm": 1.1778966188430786,
|
1228 |
+
"learning_rate": 1.1136415919024615e-05,
|
1229 |
+
"loss": 0.6383,
|
1230 |
+
"step": 68400
|
1231 |
+
},
|
1232 |
+
{
|
1233 |
+
"epoch": 3.92,
|
1234 |
+
"grad_norm": 1.045240044593811,
|
1235 |
+
"learning_rate": 1.0906372210720037e-05,
|
1236 |
+
"loss": 0.6315,
|
1237 |
+
"step": 68800
|
1238 |
+
},
|
1239 |
+
{
|
1240 |
+
"epoch": 3.94,
|
1241 |
+
"grad_norm": 1.2942785024642944,
|
1242 |
+
"learning_rate": 1.0676903611686221e-05,
|
1243 |
+
"loss": 0.6276,
|
1244 |
+
"step": 69200
|
1245 |
+
},
|
1246 |
+
{
|
1247 |
+
"epoch": 3.97,
|
1248 |
+
"grad_norm": 1.2519258260726929,
|
1249 |
+
"learning_rate": 1.0446859903381644e-05,
|
1250 |
+
"loss": 0.6228,
|
1251 |
+
"step": 69600
|
1252 |
+
},
|
1253 |
+
{
|
1254 |
+
"epoch": 3.99,
|
1255 |
+
"grad_norm": 1.2884622812271118,
|
1256 |
+
"learning_rate": 1.0216816195077065e-05,
|
1257 |
+
"loss": 0.6234,
|
1258 |
+
"step": 70000
|
1259 |
+
},
|
1260 |
+
{
|
1261 |
+
"epoch": 4.0,
|
1262 |
+
"eval_loss": 1.09929621219635,
|
1263 |
+
"eval_runtime": 239.9825,
|
1264 |
+
"eval_samples_per_second": 251.164,
|
1265 |
+
"eval_steps_per_second": 3.925,
|
1266 |
+
"step": 70192
|
1267 |
+
},
|
1268 |
+
{
|
1269 |
+
"epoch": 4.01,
|
1270 |
+
"grad_norm": 1.029523253440857,
|
1271 |
+
"learning_rate": 9.986772486772487e-06,
|
1272 |
+
"loss": 0.607,
|
1273 |
+
"step": 70400
|
1274 |
+
},
|
1275 |
+
{
|
1276 |
+
"epoch": 4.03,
|
1277 |
+
"grad_norm": 1.1874665021896362,
|
1278 |
+
"learning_rate": 9.75672877846791e-06,
|
1279 |
+
"loss": 0.5992,
|
1280 |
+
"step": 70800
|
1281 |
+
},
|
1282 |
+
{
|
1283 |
+
"epoch": 4.06,
|
1284 |
+
"grad_norm": 1.3719263076782227,
|
1285 |
+
"learning_rate": 9.526685070163331e-06,
|
1286 |
+
"loss": 0.5932,
|
1287 |
+
"step": 71200
|
1288 |
+
},
|
1289 |
+
{
|
1290 |
+
"epoch": 4.08,
|
1291 |
+
"grad_norm": 1.1106728315353394,
|
1292 |
+
"learning_rate": 9.296641361858754e-06,
|
1293 |
+
"loss": 0.6082,
|
1294 |
+
"step": 71600
|
1295 |
+
},
|
1296 |
+
{
|
1297 |
+
"epoch": 4.1,
|
1298 |
+
"grad_norm": 1.1333997249603271,
|
1299 |
+
"learning_rate": 9.066597653554177e-06,
|
1300 |
+
"loss": 0.5958,
|
1301 |
+
"step": 72000
|
1302 |
+
},
|
1303 |
+
{
|
1304 |
+
"epoch": 4.13,
|
1305 |
+
"grad_norm": 1.2606267929077148,
|
1306 |
+
"learning_rate": 8.837129054520358e-06,
|
1307 |
+
"loss": 0.6015,
|
1308 |
+
"step": 72400
|
1309 |
+
},
|
1310 |
+
{
|
1311 |
+
"epoch": 4.15,
|
1312 |
+
"grad_norm": 1.123744249343872,
|
1313 |
+
"learning_rate": 8.607085346215783e-06,
|
1314 |
+
"loss": 0.6011,
|
1315 |
+
"step": 72800
|
1316 |
+
},
|
1317 |
+
{
|
1318 |
+
"epoch": 4.17,
|
1319 |
+
"grad_norm": 1.155521273612976,
|
1320 |
+
"learning_rate": 8.377041637911204e-06,
|
1321 |
+
"loss": 0.5973,
|
1322 |
+
"step": 73200
|
1323 |
+
},
|
1324 |
+
{
|
1325 |
+
"epoch": 4.19,
|
1326 |
+
"grad_norm": 1.1591954231262207,
|
1327 |
+
"learning_rate": 8.146997929606625e-06,
|
1328 |
+
"loss": 0.5924,
|
1329 |
+
"step": 73600
|
1330 |
+
},
|
1331 |
+
{
|
1332 |
+
"epoch": 4.22,
|
1333 |
+
"grad_norm": 1.3380868434906006,
|
1334 |
+
"learning_rate": 7.916954221302048e-06,
|
1335 |
+
"loss": 0.5983,
|
1336 |
+
"step": 74000
|
1337 |
+
},
|
1338 |
+
{
|
1339 |
+
"epoch": 4.24,
|
1340 |
+
"grad_norm": 1.2216105461120605,
|
1341 |
+
"learning_rate": 7.68691051299747e-06,
|
1342 |
+
"loss": 0.5947,
|
1343 |
+
"step": 74400
|
1344 |
+
},
|
1345 |
+
{
|
1346 |
+
"epoch": 4.26,
|
1347 |
+
"grad_norm": 1.0791873931884766,
|
1348 |
+
"learning_rate": 7.4568668046928916e-06,
|
1349 |
+
"loss": 0.6054,
|
1350 |
+
"step": 74800
|
1351 |
+
},
|
1352 |
+
{
|
1353 |
+
"epoch": 4.29,
|
1354 |
+
"grad_norm": 1.1365481615066528,
|
1355 |
+
"learning_rate": 7.2268230963883145e-06,
|
1356 |
+
"loss": 0.6092,
|
1357 |
+
"step": 75200
|
1358 |
+
},
|
1359 |
+
{
|
1360 |
+
"epoch": 4.31,
|
1361 |
+
"grad_norm": 1.1376712322235107,
|
1362 |
+
"learning_rate": 6.997354497354498e-06,
|
1363 |
+
"loss": 0.5942,
|
1364 |
+
"step": 75600
|
1365 |
+
},
|
1366 |
+
{
|
1367 |
+
"epoch": 4.33,
|
1368 |
+
"grad_norm": 1.1192513704299927,
|
1369 |
+
"learning_rate": 6.76731078904992e-06,
|
1370 |
+
"loss": 0.5955,
|
1371 |
+
"step": 76000
|
1372 |
+
},
|
1373 |
+
{
|
1374 |
+
"epoch": 4.35,
|
1375 |
+
"grad_norm": 1.1927390098571777,
|
1376 |
+
"learning_rate": 6.537267080745342e-06,
|
1377 |
+
"loss": 0.5901,
|
1378 |
+
"step": 76400
|
1379 |
+
},
|
1380 |
+
{
|
1381 |
+
"epoch": 4.38,
|
1382 |
+
"grad_norm": 1.236060619354248,
|
1383 |
+
"learning_rate": 6.307223372440764e-06,
|
1384 |
+
"loss": 0.6,
|
1385 |
+
"step": 76800
|
1386 |
+
},
|
1387 |
+
{
|
1388 |
+
"epoch": 4.4,
|
1389 |
+
"grad_norm": 1.077643871307373,
|
1390 |
+
"learning_rate": 6.077179664136186e-06,
|
1391 |
+
"loss": 0.6088,
|
1392 |
+
"step": 77200
|
1393 |
+
},
|
1394 |
+
{
|
1395 |
+
"epoch": 4.42,
|
1396 |
+
"grad_norm": 1.3172234296798706,
|
1397 |
+
"learning_rate": 5.847135955831608e-06,
|
1398 |
+
"loss": 0.5944,
|
1399 |
+
"step": 77600
|
1400 |
+
},
|
1401 |
+
{
|
1402 |
+
"epoch": 4.44,
|
1403 |
+
"grad_norm": 1.2222837209701538,
|
1404 |
+
"learning_rate": 5.61709224752703e-06,
|
1405 |
+
"loss": 0.5976,
|
1406 |
+
"step": 78000
|
1407 |
+
},
|
1408 |
+
{
|
1409 |
+
"epoch": 4.47,
|
1410 |
+
"grad_norm": 1.2887938022613525,
|
1411 |
+
"learning_rate": 5.387048539222452e-06,
|
1412 |
+
"loss": 0.6023,
|
1413 |
+
"step": 78400
|
1414 |
+
},
|
1415 |
+
{
|
1416 |
+
"epoch": 4.49,
|
1417 |
+
"grad_norm": 1.1380060911178589,
|
1418 |
+
"learning_rate": 5.157579940188636e-06,
|
1419 |
+
"loss": 0.5938,
|
1420 |
+
"step": 78800
|
1421 |
+
},
|
1422 |
+
{
|
1423 |
+
"epoch": 4.51,
|
1424 |
+
"grad_norm": 1.2178806066513062,
|
1425 |
+
"learning_rate": 4.927536231884058e-06,
|
1426 |
+
"loss": 0.5916,
|
1427 |
+
"step": 79200
|
1428 |
+
},
|
1429 |
+
{
|
1430 |
+
"epoch": 4.54,
|
1431 |
+
"grad_norm": 1.2010163068771362,
|
1432 |
+
"learning_rate": 4.69749252357948e-06,
|
1433 |
+
"loss": 0.5891,
|
1434 |
+
"step": 79600
|
1435 |
+
},
|
1436 |
+
{
|
1437 |
+
"epoch": 4.56,
|
1438 |
+
"grad_norm": 1.2172470092773438,
|
1439 |
+
"learning_rate": 4.467448815274902e-06,
|
1440 |
+
"loss": 0.6019,
|
1441 |
+
"step": 80000
|
1442 |
+
},
|
1443 |
+
{
|
1444 |
+
"epoch": 4.58,
|
1445 |
+
"grad_norm": 1.2008330821990967,
|
1446 |
+
"learning_rate": 4.2374051069703245e-06,
|
1447 |
+
"loss": 0.596,
|
1448 |
+
"step": 80400
|
1449 |
+
},
|
1450 |
+
{
|
1451 |
+
"epoch": 4.6,
|
1452 |
+
"grad_norm": 1.3656328916549683,
|
1453 |
+
"learning_rate": 4.007936507936508e-06,
|
1454 |
+
"loss": 0.6001,
|
1455 |
+
"step": 80800
|
1456 |
+
},
|
1457 |
+
{
|
1458 |
+
"epoch": 4.63,
|
1459 |
+
"grad_norm": 1.336308240890503,
|
1460 |
+
"learning_rate": 3.7778927996319303e-06,
|
1461 |
+
"loss": 0.5912,
|
1462 |
+
"step": 81200
|
1463 |
+
},
|
1464 |
+
{
|
1465 |
+
"epoch": 4.65,
|
1466 |
+
"grad_norm": 1.1399625539779663,
|
1467 |
+
"learning_rate": 3.5478490913273524e-06,
|
1468 |
+
"loss": 0.5962,
|
1469 |
+
"step": 81600
|
1470 |
+
},
|
1471 |
+
{
|
1472 |
+
"epoch": 4.67,
|
1473 |
+
"grad_norm": 1.237598180770874,
|
1474 |
+
"learning_rate": 3.317805383022775e-06,
|
1475 |
+
"loss": 0.5869,
|
1476 |
+
"step": 82000
|
1477 |
+
},
|
1478 |
+
{
|
1479 |
+
"epoch": 4.7,
|
1480 |
+
"grad_norm": 1.1215174198150635,
|
1481 |
+
"learning_rate": 3.0877616747181967e-06,
|
1482 |
+
"loss": 0.5927,
|
1483 |
+
"step": 82400
|
1484 |
+
},
|
1485 |
+
{
|
1486 |
+
"epoch": 4.72,
|
1487 |
+
"grad_norm": 1.3274859189987183,
|
1488 |
+
"learning_rate": 2.857717966413619e-06,
|
1489 |
+
"loss": 0.6066,
|
1490 |
+
"step": 82800
|
1491 |
+
},
|
1492 |
+
{
|
1493 |
+
"epoch": 4.74,
|
1494 |
+
"grad_norm": 1.276289463043213,
|
1495 |
+
"learning_rate": 2.628249367379802e-06,
|
1496 |
+
"loss": 0.5994,
|
1497 |
+
"step": 83200
|
1498 |
+
},
|
1499 |
+
{
|
1500 |
+
"epoch": 4.76,
|
1501 |
+
"grad_norm": 1.154296636581421,
|
1502 |
+
"learning_rate": 2.3982056590752246e-06,
|
1503 |
+
"loss": 0.5907,
|
1504 |
+
"step": 83600
|
1505 |
+
},
|
1506 |
+
{
|
1507 |
+
"epoch": 4.79,
|
1508 |
+
"grad_norm": 1.1015737056732178,
|
1509 |
+
"learning_rate": 2.1681619507706463e-06,
|
1510 |
+
"loss": 0.6043,
|
1511 |
+
"step": 84000
|
1512 |
+
},
|
1513 |
+
{
|
1514 |
+
"epoch": 4.81,
|
1515 |
+
"grad_norm": 1.356696367263794,
|
1516 |
+
"learning_rate": 1.9381182424660685e-06,
|
1517 |
+
"loss": 0.5883,
|
1518 |
+
"step": 84400
|
1519 |
+
},
|
1520 |
+
{
|
1521 |
+
"epoch": 4.83,
|
1522 |
+
"grad_norm": 1.1508524417877197,
|
1523 |
+
"learning_rate": 1.7080745341614908e-06,
|
1524 |
+
"loss": 0.5899,
|
1525 |
+
"step": 84800
|
1526 |
+
},
|
1527 |
+
{
|
1528 |
+
"epoch": 4.86,
|
1529 |
+
"grad_norm": 1.1132149696350098,
|
1530 |
+
"learning_rate": 1.478030825856913e-06,
|
1531 |
+
"loss": 0.5994,
|
1532 |
+
"step": 85200
|
1533 |
+
},
|
1534 |
+
{
|
1535 |
+
"epoch": 4.88,
|
1536 |
+
"grad_norm": 1.306624174118042,
|
1537 |
+
"learning_rate": 1.2485622268230964e-06,
|
1538 |
+
"loss": 0.5891,
|
1539 |
+
"step": 85600
|
1540 |
+
},
|
1541 |
+
{
|
1542 |
+
"epoch": 4.9,
|
1543 |
+
"grad_norm": 1.234307050704956,
|
1544 |
+
"learning_rate": 1.0185185185185188e-06,
|
1545 |
+
"loss": 0.5916,
|
1546 |
+
"step": 86000
|
1547 |
+
},
|
1548 |
+
{
|
1549 |
+
"epoch": 4.92,
|
1550 |
+
"grad_norm": 1.0994372367858887,
|
1551 |
+
"learning_rate": 7.884748102139407e-07,
|
1552 |
+
"loss": 0.5908,
|
1553 |
+
"step": 86400
|
1554 |
+
},
|
1555 |
+
{
|
1556 |
+
"epoch": 4.95,
|
1557 |
+
"grad_norm": 1.2712494134902954,
|
1558 |
+
"learning_rate": 5.584311019093628e-07,
|
1559 |
+
"loss": 0.5963,
|
1560 |
+
"step": 86800
|
1561 |
+
},
|
1562 |
+
{
|
1563 |
+
"epoch": 4.97,
|
1564 |
+
"grad_norm": 1.2190104722976685,
|
1565 |
+
"learning_rate": 3.283873936047849e-07,
|
1566 |
+
"loss": 0.5902,
|
1567 |
+
"step": 87200
|
1568 |
+
},
|
1569 |
+
{
|
1570 |
+
"epoch": 4.99,
|
1571 |
+
"grad_norm": 1.3301359415054321,
|
1572 |
+
"learning_rate": 9.891879457096849e-08,
|
1573 |
+
"loss": 0.591,
|
1574 |
+
"step": 87600
|
1575 |
+
},
|
+     {
+       "epoch": 5.0,
+       "eval_loss": 1.1109092235565186,
+       "eval_runtime": 240.0172,
+       "eval_samples_per_second": 251.128,
+       "eval_steps_per_second": 3.925,
+       "step": 87740
+     },
+     {
+       "epoch": 5.0,
+       "step": 87740,
+       "total_flos": 1.7097588901675008e+18,
+       "train_loss": 0.7401387438351292,
+       "train_runtime": 31448.3717,
+       "train_samples_per_second": 89.283,
+       "train_steps_per_second": 2.79
+     }
+   ],
+   "logging_steps": 400,
+   "max_steps": 87740,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 5,
+   "save_steps": 500,
+   "total_flos": 1.7097588901675008e+18,
+   "train_batch_size": 32,
+   "trial_name": null,
+   "trial_params": null
+ }
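`trainer_state.json` keeps the full training log. The entries carrying an `eval_loss` key show the validation loss rising from 1.075 after epoch 1 to 1.111 after epoch 5 while the training loss keeps falling, which is why `best_model_checkpoint` points at the epoch-1 checkpoint (`checkpoint-17548`). A small sketch for pulling those numbers back out of the file:

```python
# Sketch: extracting the per-epoch evaluation losses from trainer_state.json.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

evals = [(e["epoch"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]
for epoch, loss in evals:
    print(f"epoch {epoch:.0f}: eval_loss {loss:.4f}")

print("best:", state["best_metric"], "at", state["best_model_checkpoint"])
```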
training_args.bin
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:621a59c1fa7abfb2989a3fb9a8fe10944574d433fca08f964e9d5588799068a9
+ size 5176
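`training_args.bin` is the pickled `TrainingArguments` object the `Trainer` saves alongside its checkpoints; it can be inspected with `torch.load` to recover the exact run configuration. The attribute names below are standard `TrainingArguments` fields, shown as an illustration:

```python
# Sketch: inspecting the serialized training arguments.
import torch

# training_args.bin is a pickled TrainingArguments object, so a plain torch.load recovers it.
args = torch.load("training_args.bin")
print(args.learning_rate, args.num_train_epochs, args.per_device_train_batch_size)
```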
vocab.json
ADDED
The diff for this file is too large to render.
See raw diff