Upload folder using huggingface_hub

Browse files

Files changed (15) hide show

README.md +50 -0
config.json +30 -0
generation_config.json +6 -0
optimizer.pt +3 -0
pytorch_model-00001-of-00003.bin +3 -0
pytorch_model-00002-of-00003.bin +3 -0
pytorch_model-00003-of-00003.bin +3 -0
pytorch_model.bin.index.json +531 -0
rng_state.pth +3 -0
scheduler.pt +3 -0
special_tokens_map.json +11 -0
tokenizer.json +0 -0
tokenizer_config.json +7 -0
trainer_state.json +2713 -0
training_args.bin +3 -0

README.md ADDED Viewed

	@@ -0,0 +1,50 @@

+---
+license: apache-2.0
+base_model: EleutherAI/polyglot-ko-12.8b
+tags:
+- generated_from_trainer
+model-index:
+- name: gridone-ko-llm-12.8b-v1.1d
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# gridone-ko-llm-12.8b-v1.1d
+This model is a fine-tuned version of [EleutherAI/polyglot-ko-12.8b](https://huggingface.co/EleutherAI/polyglot-ko-12.8b) on an unknown dataset.
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 5e-05
+- train_batch_size: 2
+- eval_batch_size: 8
+- seed: 42
+- gradient_accumulation_steps: 8
+- total_train_batch_size: 16
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: linear
+- num_epochs: 8
+### Framework versions
+- Transformers 4.32.0.dev0
+- Pytorch 2.0.0+cu117
+- Datasets 2.11.0
+- Tokenizers 0.13.3

config.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "_name_or_path": "EleutherAI/polyglot-ko-12.8b",
+  "architectures": [
+    "GPTNeoXForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "classifier_dropout": 0.1,
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_dropout": 0.0,
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "intermediate_size": 20480,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 2048,
+  "model_type": "gpt_neox",
+  "num_attention_heads": 40,
+  "num_hidden_layers": 40,
+  "num_steps": "global_step301000",
+  "rope_scaling": null,
+  "rotary_emb_base": 10000,
+  "rotary_pct": 0.5,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float16",
+  "transformers_version": "4.32.0.dev0",
+  "use_cache": true,
+  "use_parallel_residual": true,
+  "vocab_size": 30080
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": 2,
+  "transformers_version": "4.32.0.dev0"
+}

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6e0fb9bcae2d6d87c41d93ed7644fb4792c73655c348a1528d0edc63a6597bc3
+size 24400263

pytorch_model-00001-of-00003.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cef5ac08250f925f719e5145cfe42b00f7bd45adb70639259cf59699e122bf71
+size 9957073034

pytorch_model-00002-of-00003.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:176324a1173af6a4589ad0e14d23dc782d865d797fae87d226bcfdd818f9ca74
+size 9858779099

pytorch_model-00003-of-00003.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:61cdbf74d8266e06d4e9dde0e4614146cef11e8ee2f72df7c69199970cc23957
+size 5971549140

pytorch_model.bin.index.json ADDED Viewed

	@@ -0,0 +1,531 @@

+{
+  "metadata": {
+    "total_size": 25787212800
+  },
+  "weight_map": {
+    "embed_out.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.embed_in.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.final_layer_norm.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.final_layer_norm.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.0.attention.dense.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.0.attention.dense.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.0.attention.query_key_value.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.0.attention.query_key_value.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.0.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.0.input_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.0.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.0.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.0.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.0.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.0.post_attention_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.1.attention.dense.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.1.attention.dense.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.1.attention.query_key_value.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.1.attention.query_key_value.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.1.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.1.input_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.1.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.1.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.1.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.1.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.1.post_attention_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.10.attention.dense.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.10.attention.dense.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.10.attention.query_key_value.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.10.attention.query_key_value.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.10.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.10.input_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.10.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.10.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.10.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.10.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.10.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.10.post_attention_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.10.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.11.attention.dense.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.11.attention.dense.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.11.attention.query_key_value.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.11.attention.query_key_value.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.11.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.11.input_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.11.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.11.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.11.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.11.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.11.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.11.post_attention_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.11.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.12.attention.dense.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.12.attention.dense.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.12.attention.query_key_value.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.12.attention.query_key_value.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.12.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.12.input_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.12.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.12.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.12.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.12.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.12.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.12.post_attention_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.12.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.13.attention.dense.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.13.attention.dense.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.13.attention.query_key_value.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.13.attention.query_key_value.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.13.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.13.input_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.13.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.13.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.13.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.13.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.13.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.13.post_attention_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.13.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.14.attention.dense.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.14.attention.dense.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.14.attention.query_key_value.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.14.attention.query_key_value.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.14.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.14.input_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.14.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.14.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.14.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.14.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.14.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.14.post_attention_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.14.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.15.attention.dense.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.15.attention.dense.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.15.attention.query_key_value.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.15.attention.query_key_value.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.15.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.15.input_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.15.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.15.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.15.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.15.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.15.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.15.post_attention_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.15.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.16.attention.dense.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.16.attention.dense.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.16.attention.query_key_value.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.16.attention.query_key_value.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.16.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.16.input_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.16.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.16.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.16.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.16.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.16.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.16.post_attention_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.16.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.17.attention.dense.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.17.attention.dense.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.17.attention.query_key_value.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.17.attention.query_key_value.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.17.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.17.input_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.17.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.17.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.17.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.17.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.17.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.17.post_attention_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.17.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.18.attention.dense.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.18.attention.dense.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.18.attention.query_key_value.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.18.attention.query_key_value.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.18.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.18.input_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.18.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.18.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.18.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.18.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.18.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.18.post_attention_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.18.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.19.attention.dense.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.19.attention.dense.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.19.attention.query_key_value.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.19.attention.query_key_value.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.19.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.19.input_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.19.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.19.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.19.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.19.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.19.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.19.post_attention_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.19.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.2.attention.dense.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.2.attention.dense.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.2.attention.query_key_value.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.2.attention.query_key_value.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.2.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.2.input_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.2.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.2.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.2.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.2.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.2.post_attention_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.20.attention.dense.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.20.attention.dense.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.20.attention.query_key_value.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.20.attention.query_key_value.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.20.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.20.input_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.20.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.20.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.20.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.20.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.20.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.20.post_attention_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.20.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.21.attention.dense.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.21.attention.dense.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.21.attention.query_key_value.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.21.attention.query_key_value.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.21.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.21.input_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.21.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.21.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.21.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.21.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.21.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.21.post_attention_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.21.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.22.attention.dense.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.22.attention.dense.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.22.attention.query_key_value.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.22.attention.query_key_value.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.22.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.22.input_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.22.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.22.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.22.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.22.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.22.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.22.post_attention_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.22.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.23.attention.dense.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.23.attention.dense.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.23.attention.query_key_value.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.23.attention.query_key_value.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.23.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.23.input_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.23.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.23.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.23.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.23.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.23.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.23.post_attention_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.23.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.24.attention.dense.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.24.attention.dense.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.24.attention.query_key_value.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.24.attention.query_key_value.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.24.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.24.input_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.24.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.24.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.24.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.24.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.24.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.24.post_attention_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.24.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.25.attention.dense.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.25.attention.dense.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.25.attention.query_key_value.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.25.attention.query_key_value.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.25.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.25.input_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.25.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.25.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.25.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.25.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.25.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.25.post_attention_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.25.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.26.attention.dense.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.26.attention.dense.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.26.attention.query_key_value.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.26.attention.query_key_value.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.26.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.26.input_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.26.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.26.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.26.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.26.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.26.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.26.post_attention_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.26.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.27.attention.dense.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.27.attention.dense.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.27.attention.query_key_value.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.27.attention.query_key_value.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.27.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.27.input_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.27.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.27.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.27.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.27.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.27.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.27.post_attention_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.27.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.28.attention.dense.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.28.attention.dense.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.28.attention.query_key_value.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.28.attention.query_key_value.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.28.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.28.input_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.28.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.28.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.28.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.28.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.28.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.28.post_attention_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.28.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.29.attention.dense.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.29.attention.dense.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.29.attention.query_key_value.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.29.attention.query_key_value.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.29.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.29.input_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.29.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.29.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.29.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.29.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.29.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.29.post_attention_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.29.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.3.attention.dense.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.3.attention.dense.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.3.attention.query_key_value.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.3.attention.query_key_value.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.3.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.3.input_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.3.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.3.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.3.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.3.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.3.post_attention_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.30.attention.dense.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.30.attention.dense.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.30.attention.query_key_value.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.30.attention.query_key_value.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.30.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.30.input_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.30.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.30.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.30.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.30.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.30.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.30.post_attention_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.30.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.31.attention.dense.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.31.attention.dense.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.31.attention.query_key_value.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.31.attention.query_key_value.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.31.attention.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.31.input_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.31.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.31.mlp.dense_4h_to_h.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.31.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.31.mlp.dense_h_to_4h.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.31.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.31.post_attention_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.31.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+    "gpt_neox.layers.32.attention.dense.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.32.attention.dense.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.32.attention.query_key_value.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.32.attention.query_key_value.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.32.attention.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.32.input_layernorm.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.32.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.32.mlp.dense_4h_to_h.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.32.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.32.mlp.dense_h_to_4h.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.32.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.32.post_attention_layernorm.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.32.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.33.attention.dense.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.33.attention.dense.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.33.attention.query_key_value.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.33.attention.query_key_value.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.33.attention.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.33.input_layernorm.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.33.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.33.mlp.dense_4h_to_h.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.33.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.33.mlp.dense_h_to_4h.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.33.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.33.post_attention_layernorm.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.33.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.34.attention.dense.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.34.attention.dense.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.34.attention.query_key_value.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.34.attention.query_key_value.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.34.attention.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.34.input_layernorm.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.34.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.34.mlp.dense_4h_to_h.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.34.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.34.mlp.dense_h_to_4h.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.34.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.34.post_attention_layernorm.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.34.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.35.attention.dense.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.35.attention.dense.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.35.attention.query_key_value.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.35.attention.query_key_value.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.35.attention.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.35.input_layernorm.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.35.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.35.mlp.dense_4h_to_h.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.35.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.35.mlp.dense_h_to_4h.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.35.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.35.post_attention_layernorm.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.35.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.36.attention.dense.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.36.attention.dense.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.36.attention.query_key_value.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.36.attention.query_key_value.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.36.attention.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.36.input_layernorm.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.36.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.36.mlp.dense_4h_to_h.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.36.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.36.mlp.dense_h_to_4h.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.36.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.36.post_attention_layernorm.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.36.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.37.attention.dense.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.37.attention.dense.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.37.attention.query_key_value.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.37.attention.query_key_value.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.37.attention.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.37.input_layernorm.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.37.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.37.mlp.dense_4h_to_h.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.37.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.37.mlp.dense_h_to_4h.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.37.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.37.post_attention_layernorm.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.37.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.38.attention.dense.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.38.attention.dense.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.38.attention.query_key_value.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.38.attention.query_key_value.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.38.attention.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.38.input_layernorm.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.38.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.38.mlp.dense_4h_to_h.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.38.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.38.mlp.dense_h_to_4h.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.38.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.38.post_attention_layernorm.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.38.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.39.attention.dense.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.39.attention.dense.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.39.attention.query_key_value.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.39.attention.query_key_value.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.39.attention.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.39.input_layernorm.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.39.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.39.mlp.dense_4h_to_h.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.39.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.39.mlp.dense_h_to_4h.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.39.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.39.post_attention_layernorm.bias": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.39.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
+    "gpt_neox.layers.4.attention.dense.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.4.attention.dense.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.4.attention.query_key_value.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.4.attention.query_key_value.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.4.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.4.input_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.4.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.4.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.4.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.4.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.4.post_attention_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.5.attention.dense.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.5.attention.dense.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.5.attention.query_key_value.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.5.attention.query_key_value.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.5.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.5.input_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.5.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.5.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.5.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.5.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.5.post_attention_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.6.attention.dense.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.6.attention.dense.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.6.attention.query_key_value.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.6.attention.query_key_value.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.6.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.6.input_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.6.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.6.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.6.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.6.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.6.post_attention_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.7.attention.dense.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.7.attention.dense.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.7.attention.query_key_value.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.7.attention.query_key_value.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.7.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.7.input_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.7.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.7.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.7.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.7.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.7.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.7.post_attention_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.7.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.8.attention.dense.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.8.attention.dense.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.8.attention.query_key_value.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.8.attention.query_key_value.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.8.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.8.input_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.8.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.8.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.8.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.8.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.8.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.8.post_attention_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.8.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.9.attention.dense.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.9.attention.dense.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.9.attention.query_key_value.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.9.attention.query_key_value.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.9.attention.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.9.input_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.9.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.9.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.9.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.9.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.9.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.9.post_attention_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+    "gpt_neox.layers.9.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin"
+  }
+}

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7508d4b8dd267de5cc58e972da25236687927651336a28f292c92f7f23951475
+size 14575

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6b02be2c961050591e39e048540caca52c7186951e99ec479ac50c29b65770a6
+size 627

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<|sep|>",
+    "<|acc|>",
+    "<|tel|>",
+    "<|rrn|>"
+  ],
+  "eos_token": "<|endoftext|>",
+  "pad_token": "<|endoftext|>"
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<|endoftext|>",
+  "tokenizer_class": "PreTrainedTokenizerFast"
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,2713 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.998330550918197,
+  "eval_steps": 500,
+  "global_step": 449,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.01,
+      "learning_rate": 5e-05,
+      "loss": 2.3424,
+      "step": 1
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 5e-05,
+      "loss": 2.3416,
+      "step": 2
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 4.995805369127517e-05,
+      "loss": 2.3702,
+      "step": 3
+    },
+    {
+      "epoch": 0.03,
+      "learning_rate": 4.9916107382550336e-05,
+      "loss": 2.1306,
+      "step": 4
+    },
+    {
+      "epoch": 0.03,
+      "learning_rate": 4.9874161073825505e-05,
+      "loss": 2.3988,
+      "step": 5
+    },
+    {
+      "epoch": 0.04,
+      "learning_rate": 4.9874161073825505e-05,
+      "loss": 2.6274,
+      "step": 6
+    },
+    {
+      "epoch": 0.05,
+      "learning_rate": 4.9874161073825505e-05,
+      "loss": 2.693,
+      "step": 7
+    },
+    {
+      "epoch": 0.05,
+      "learning_rate": 4.983221476510067e-05,
+      "loss": 2.6681,
+      "step": 8
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 4.9790268456375845e-05,
+      "loss": 2.3159,
+      "step": 9
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 4.974832214765101e-05,
+      "loss": 2.2781,
+      "step": 10
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 4.970637583892618e-05,
+      "loss": 2.328,
+      "step": 11
+    },
+    {
+      "epoch": 0.08,
+      "learning_rate": 4.966442953020135e-05,
+      "loss": 2.1489,
+      "step": 12
+    },
+    {
+      "epoch": 0.09,
+      "learning_rate": 4.962248322147651e-05,
+      "loss": 2.0534,
+      "step": 13
+    },
+    {
+      "epoch": 0.09,
+      "learning_rate": 4.958053691275168e-05,
+      "loss": 2.1968,
+      "step": 14
+    },
+    {
+      "epoch": 0.1,
+      "learning_rate": 4.9538590604026845e-05,
+      "loss": 1.99,
+      "step": 15
+    },
+    {
+      "epoch": 0.11,
+      "learning_rate": 4.9496644295302015e-05,
+      "loss": 1.9691,
+      "step": 16
+    },
+    {
+      "epoch": 0.11,
+      "learning_rate": 4.945469798657718e-05,
+      "loss": 2.0833,
+      "step": 17
+    },
+    {
+      "epoch": 0.12,
+      "learning_rate": 4.9412751677852355e-05,
+      "loss": 1.9973,
+      "step": 18
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 4.937080536912752e-05,
+      "loss": 1.9415,
+      "step": 19
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 4.932885906040269e-05,
+      "loss": 2.0112,
+      "step": 20
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 4.928691275167786e-05,
+      "loss": 2.1733,
+      "step": 21
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 4.924496644295302e-05,
+      "loss": 2.4138,
+      "step": 22
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 4.920302013422819e-05,
+      "loss": 2.1533,
+      "step": 23
+    },
+    {
+      "epoch": 0.16,
+      "learning_rate": 4.9161073825503354e-05,
+      "loss": 2.0635,
+      "step": 24
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 4.9119127516778524e-05,
+      "loss": 1.9881,
+      "step": 25
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 4.9077181208053694e-05,
+      "loss": 1.9963,
+      "step": 26
+    },
+    {
+      "epoch": 0.18,
+      "learning_rate": 4.9035234899328864e-05,
+      "loss": 2.0764,
+      "step": 27
+    },
+    {
+      "epoch": 0.19,
+      "learning_rate": 4.8993288590604034e-05,
+      "loss": 1.959,
+      "step": 28
+    },
+    {
+      "epoch": 0.19,
+      "learning_rate": 4.89513422818792e-05,
+      "loss": 2.0066,
+      "step": 29
+    },
+    {
+      "epoch": 0.2,
+      "learning_rate": 4.890939597315437e-05,
+      "loss": 1.9056,
+      "step": 30
+    },
+    {
+      "epoch": 0.21,
+      "learning_rate": 4.886744966442953e-05,
+      "loss": 1.9512,
+      "step": 31
+    },
+    {
+      "epoch": 0.21,
+      "learning_rate": 4.88255033557047e-05,
+      "loss": 1.962,
+      "step": 32
+    },
+    {
+      "epoch": 0.22,
+      "learning_rate": 4.878355704697986e-05,
+      "loss": 1.8714,
+      "step": 33
+    },
+    {
+      "epoch": 0.23,
+      "learning_rate": 4.874161073825503e-05,
+      "loss": 1.8087,
+      "step": 34
+    },
+    {
+      "epoch": 0.23,
+      "learning_rate": 4.86996644295302e-05,
+      "loss": 1.9392,
+      "step": 35
+    },
+    {
+      "epoch": 0.24,
+      "learning_rate": 4.865771812080537e-05,
+      "loss": 1.9197,
+      "step": 36
+    },
+    {
+      "epoch": 0.25,
+      "learning_rate": 4.861577181208054e-05,
+      "loss": 1.9624,
+      "step": 37
+    },
+    {
+      "epoch": 0.25,
+      "learning_rate": 4.8573825503355706e-05,
+      "loss": 1.9549,
+      "step": 38
+    },
+    {
+      "epoch": 0.26,
+      "learning_rate": 4.8531879194630876e-05,
+      "loss": 1.8778,
+      "step": 39
+    },
+    {
+      "epoch": 0.27,
+      "learning_rate": 4.848993288590604e-05,
+      "loss": 1.9317,
+      "step": 40
+    },
+    {
+      "epoch": 0.27,
+      "learning_rate": 4.844798657718121e-05,
+      "loss": 1.9082,
+      "step": 41
+    },
+    {
+      "epoch": 0.28,
+      "learning_rate": 4.840604026845638e-05,
+      "loss": 1.8862,
+      "step": 42
+    },
+    {
+      "epoch": 0.29,
+      "learning_rate": 4.836409395973154e-05,
+      "loss": 1.9192,
+      "step": 43
+    },
+    {
+      "epoch": 0.29,
+      "learning_rate": 4.832214765100672e-05,
+      "loss": 1.9409,
+      "step": 44
+    },
+    {
+      "epoch": 0.3,
+      "learning_rate": 4.828020134228188e-05,
+      "loss": 1.8827,
+      "step": 45
+    },
+    {
+      "epoch": 0.31,
+      "learning_rate": 4.823825503355705e-05,
+      "loss": 1.8866,
+      "step": 46
+    },
+    {
+      "epoch": 0.31,
+      "learning_rate": 4.8196308724832215e-05,
+      "loss": 1.9024,
+      "step": 47
+    },
+    {
+      "epoch": 0.32,
+      "learning_rate": 4.8154362416107385e-05,
+      "loss": 1.8225,
+      "step": 48
+    },
+    {
+      "epoch": 0.33,
+      "learning_rate": 4.8112416107382555e-05,
+      "loss": 1.846,
+      "step": 49
+    },
+    {
+      "epoch": 0.33,
+      "learning_rate": 4.807046979865772e-05,
+      "loss": 1.9671,
+      "step": 50
+    },
+    {
+      "epoch": 0.34,
+      "learning_rate": 4.802852348993289e-05,
+      "loss": 1.9424,
+      "step": 51
+    },
+    {
+      "epoch": 0.35,
+      "learning_rate": 4.798657718120805e-05,
+      "loss": 1.8501,
+      "step": 52
+    },
+    {
+      "epoch": 0.35,
+      "learning_rate": 4.794463087248323e-05,
+      "loss": 1.866,
+      "step": 53
+    },
+    {
+      "epoch": 0.36,
+      "learning_rate": 4.790268456375839e-05,
+      "loss": 1.8838,
+      "step": 54
+    },
+    {
+      "epoch": 0.37,
+      "learning_rate": 4.786073825503356e-05,
+      "loss": 1.8818,
+      "step": 55
+    },
+    {
+      "epoch": 0.37,
+      "learning_rate": 4.7818791946308725e-05,
+      "loss": 1.879,
+      "step": 56
+    },
+    {
+      "epoch": 0.38,
+      "learning_rate": 4.7776845637583895e-05,
+      "loss": 1.8943,
+      "step": 57
+    },
+    {
+      "epoch": 0.39,
+      "learning_rate": 4.7734899328859064e-05,
+      "loss": 1.8722,
+      "step": 58
+    },
+    {
+      "epoch": 0.39,
+      "learning_rate": 4.769295302013423e-05,
+      "loss": 1.8072,
+      "step": 59
+    },
+    {
+      "epoch": 0.4,
+      "learning_rate": 4.76510067114094e-05,
+      "loss": 1.8448,
+      "step": 60
+    },
+    {
+      "epoch": 0.41,
+      "learning_rate": 4.760906040268457e-05,
+      "loss": 1.87,
+      "step": 61
+    },
+    {
+      "epoch": 0.41,
+      "learning_rate": 4.756711409395974e-05,
+      "loss": 1.9172,
+      "step": 62
+    },
+    {
+      "epoch": 0.42,
+      "learning_rate": 4.75251677852349e-05,
+      "loss": 1.8836,
+      "step": 63
+    },
+    {
+      "epoch": 0.43,
+      "learning_rate": 4.748322147651007e-05,
+      "loss": 1.8625,
+      "step": 64
+    },
+    {
+      "epoch": 0.43,
+      "learning_rate": 4.744127516778524e-05,
+      "loss": 1.8304,
+      "step": 65
+    },
+    {
+      "epoch": 0.44,
+      "learning_rate": 4.7399328859060404e-05,
+      "loss": 1.8414,
+      "step": 66
+    },
+    {
+      "epoch": 0.45,
+      "learning_rate": 4.7357382550335574e-05,
+      "loss": 1.9091,
+      "step": 67
+    },
+    {
+      "epoch": 0.45,
+      "learning_rate": 4.731543624161074e-05,
+      "loss": 1.8627,
+      "step": 68
+    },
+    {
+      "epoch": 0.46,
+      "learning_rate": 4.727348993288591e-05,
+      "loss": 1.881,
+      "step": 69
+    },
+    {
+      "epoch": 0.47,
+      "learning_rate": 4.723154362416108e-05,
+      "loss": 1.7762,
+      "step": 70
+    },
+    {
+      "epoch": 0.47,
+      "learning_rate": 4.718959731543625e-05,
+      "loss": 1.8732,
+      "step": 71
+    },
+    {
+      "epoch": 0.48,
+      "learning_rate": 4.714765100671141e-05,
+      "loss": 1.858,
+      "step": 72
+    },
+    {
+      "epoch": 0.49,
+      "learning_rate": 4.710570469798658e-05,
+      "loss": 1.8388,
+      "step": 73
+    },
+    {
+      "epoch": 0.49,
+      "learning_rate": 4.706375838926175e-05,
+      "loss": 1.8227,
+      "step": 74
+    },
+    {
+      "epoch": 0.5,
+      "learning_rate": 4.702181208053691e-05,
+      "loss": 1.7938,
+      "step": 75
+    },
+    {
+      "epoch": 0.51,
+      "learning_rate": 4.697986577181208e-05,
+      "loss": 1.8297,
+      "step": 76
+    },
+    {
+      "epoch": 0.51,
+      "learning_rate": 4.6937919463087246e-05,
+      "loss": 1.8499,
+      "step": 77
+    },
+    {
+      "epoch": 0.52,
+      "learning_rate": 4.6895973154362416e-05,
+      "loss": 1.8268,
+      "step": 78
+    },
+    {
+      "epoch": 0.53,
+      "learning_rate": 4.6854026845637586e-05,
+      "loss": 1.8147,
+      "step": 79
+    },
+    {
+      "epoch": 0.53,
+      "learning_rate": 4.6812080536912756e-05,
+      "loss": 1.8521,
+      "step": 80
+    },
+    {
+      "epoch": 0.54,
+      "learning_rate": 4.6770134228187926e-05,
+      "loss": 1.851,
+      "step": 81
+    },
+    {
+      "epoch": 0.55,
+      "learning_rate": 4.672818791946309e-05,
+      "loss": 1.8616,
+      "step": 82
+    },
+    {
+      "epoch": 0.55,
+      "learning_rate": 4.668624161073826e-05,
+      "loss": 1.8746,
+      "step": 83
+    },
+    {
+      "epoch": 0.56,
+      "learning_rate": 4.664429530201342e-05,
+      "loss": 1.8594,
+      "step": 84
+    },
+    {
+      "epoch": 0.57,
+      "learning_rate": 4.660234899328859e-05,
+      "loss": 1.8699,
+      "step": 85
+    },
+    {
+      "epoch": 0.57,
+      "learning_rate": 4.6560402684563755e-05,
+      "loss": 1.8696,
+      "step": 86
+    },
+    {
+      "epoch": 0.58,
+      "learning_rate": 4.6518456375838925e-05,
+      "loss": 1.8658,
+      "step": 87
+    },
+    {
+      "epoch": 0.59,
+      "learning_rate": 4.6476510067114095e-05,
+      "loss": 1.8581,
+      "step": 88
+    },
+    {
+      "epoch": 0.59,
+      "learning_rate": 4.6434563758389265e-05,
+      "loss": 1.8231,
+      "step": 89
+    },
+    {
+      "epoch": 0.6,
+      "learning_rate": 4.6392617449664435e-05,
+      "loss": 1.8652,
+      "step": 90
+    },
+    {
+      "epoch": 0.61,
+      "learning_rate": 4.63506711409396e-05,
+      "loss": 1.8778,
+      "step": 91
+    },
+    {
+      "epoch": 0.61,
+      "learning_rate": 4.630872483221477e-05,
+      "loss": 1.8551,
+      "step": 92
+    },
+    {
+      "epoch": 0.62,
+      "learning_rate": 4.626677852348993e-05,
+      "loss": 1.8503,
+      "step": 93
+    },
+    {
+      "epoch": 0.63,
+      "learning_rate": 4.62248322147651e-05,
+      "loss": 1.8302,
+      "step": 94
+    },
+    {
+      "epoch": 0.63,
+      "learning_rate": 4.618288590604027e-05,
+      "loss": 1.8571,
+      "step": 95
+    },
+    {
+      "epoch": 0.64,
+      "learning_rate": 4.6140939597315434e-05,
+      "loss": 1.8549,
+      "step": 96
+    },
+    {
+      "epoch": 0.65,
+      "learning_rate": 4.609899328859061e-05,
+      "loss": 1.8925,
+      "step": 97
+    },
+    {
+      "epoch": 0.65,
+      "learning_rate": 4.6057046979865774e-05,
+      "loss": 1.8608,
+      "step": 98
+    },
+    {
+      "epoch": 0.66,
+      "learning_rate": 4.6015100671140944e-05,
+      "loss": 1.8521,
+      "step": 99
+    },
+    {
+      "epoch": 0.67,
+      "learning_rate": 4.597315436241611e-05,
+      "loss": 1.8835,
+      "step": 100
+    },
+    {
+      "epoch": 0.67,
+      "learning_rate": 4.593120805369128e-05,
+      "loss": 1.7923,
+      "step": 101
+    },
+    {
+      "epoch": 0.68,
+      "learning_rate": 4.588926174496645e-05,
+      "loss": 1.7976,
+      "step": 102
+    },
+    {
+      "epoch": 0.69,
+      "learning_rate": 4.584731543624161e-05,
+      "loss": 1.8535,
+      "step": 103
+    },
+    {
+      "epoch": 0.69,
+      "learning_rate": 4.580536912751678e-05,
+      "loss": 1.829,
+      "step": 104
+    },
+    {
+      "epoch": 0.7,
+      "learning_rate": 4.576342281879195e-05,
+      "loss": 1.8247,
+      "step": 105
+    },
+    {
+      "epoch": 0.71,
+      "learning_rate": 4.572147651006712e-05,
+      "loss": 1.8068,
+      "step": 106
+    },
+    {
+      "epoch": 0.71,
+      "learning_rate": 4.5679530201342284e-05,
+      "loss": 1.7786,
+      "step": 107
+    },
+    {
+      "epoch": 0.72,
+      "learning_rate": 4.5637583892617453e-05,
+      "loss": 1.8292,
+      "step": 108
+    },
+    {
+      "epoch": 0.73,
+      "learning_rate": 4.559563758389262e-05,
+      "loss": 1.8265,
+      "step": 109
+    },
+    {
+      "epoch": 0.73,
+      "learning_rate": 4.5553691275167787e-05,
+      "loss": 1.8142,
+      "step": 110
+    },
+    {
+      "epoch": 0.74,
+      "learning_rate": 4.5511744966442957e-05,
+      "loss": 1.8448,
+      "step": 111
+    },
+    {
+      "epoch": 0.75,
+      "learning_rate": 4.546979865771812e-05,
+      "loss": 1.7959,
+      "step": 112
+    },
+    {
+      "epoch": 0.75,
+      "learning_rate": 4.542785234899329e-05,
+      "loss": 1.8464,
+      "step": 113
+    },
+    {
+      "epoch": 0.76,
+      "learning_rate": 4.538590604026846e-05,
+      "loss": 1.8974,
+      "step": 114
+    },
+    {
+      "epoch": 0.77,
+      "learning_rate": 4.534395973154363e-05,
+      "loss": 1.8465,
+      "step": 115
+    },
+    {
+      "epoch": 0.77,
+      "learning_rate": 4.530201342281879e-05,
+      "loss": 1.8479,
+      "step": 116
+    },
+    {
+      "epoch": 0.78,
+      "learning_rate": 4.526006711409396e-05,
+      "loss": 1.8569,
+      "step": 117
+    },
+    {
+      "epoch": 0.79,
+      "learning_rate": 4.521812080536913e-05,
+      "loss": 1.8481,
+      "step": 118
+    },
+    {
+      "epoch": 0.79,
+      "learning_rate": 4.5176174496644296e-05,
+      "loss": 1.8177,
+      "step": 119
+    },
+    {
+      "epoch": 0.8,
+      "learning_rate": 4.5134228187919466e-05,
+      "loss": 1.8628,
+      "step": 120
+    },
+    {
+      "epoch": 0.81,
+      "learning_rate": 4.509228187919463e-05,
+      "loss": 1.8455,
+      "step": 121
+    },
+    {
+      "epoch": 0.81,
+      "learning_rate": 4.50503355704698e-05,
+      "loss": 1.8556,
+      "step": 122
+    },
+    {
+      "epoch": 0.82,
+      "learning_rate": 4.500838926174497e-05,
+      "loss": 1.7393,
+      "step": 123
+    },
+    {
+      "epoch": 0.83,
+      "learning_rate": 4.496644295302014e-05,
+      "loss": 1.8481,
+      "step": 124
+    },
+    {
+      "epoch": 0.83,
+      "learning_rate": 4.49244966442953e-05,
+      "loss": 1.8426,
+      "step": 125
+    },
+    {
+      "epoch": 0.84,
+      "learning_rate": 4.488255033557047e-05,
+      "loss": 1.8246,
+      "step": 126
+    },
+    {
+      "epoch": 0.85,
+      "learning_rate": 4.484060402684564e-05,
+      "loss": 1.8377,
+      "step": 127
+    },
+    {
+      "epoch": 0.85,
+      "learning_rate": 4.4798657718120805e-05,
+      "loss": 1.7963,
+      "step": 128
+    },
+    {
+      "epoch": 0.86,
+      "learning_rate": 4.4756711409395975e-05,
+      "loss": 1.8088,
+      "step": 129
+    },
+    {
+      "epoch": 0.87,
+      "learning_rate": 4.471476510067114e-05,
+      "loss": 1.8482,
+      "step": 130
+    },
+    {
+      "epoch": 0.87,
+      "learning_rate": 4.467281879194631e-05,
+      "loss": 1.8094,
+      "step": 131
+    },
+    {
+      "epoch": 0.88,
+      "learning_rate": 4.463087248322148e-05,
+      "loss": 1.8875,
+      "step": 132
+    },
+    {
+      "epoch": 0.89,
+      "learning_rate": 4.458892617449665e-05,
+      "loss": 1.7656,
+      "step": 133
+    },
+    {
+      "epoch": 0.89,
+      "learning_rate": 4.454697986577182e-05,
+      "loss": 1.8591,
+      "step": 134
+    },
+    {
+      "epoch": 0.9,
+      "learning_rate": 4.450503355704698e-05,
+      "loss": 1.8469,
+      "step": 135
+    },
+    {
+      "epoch": 0.91,
+      "learning_rate": 4.446308724832215e-05,
+      "loss": 1.7949,
+      "step": 136
+    },
+    {
+      "epoch": 0.91,
+      "learning_rate": 4.4421140939597314e-05,
+      "loss": 1.8503,
+      "step": 137
+    },
+    {
+      "epoch": 0.92,
+      "learning_rate": 4.4379194630872484e-05,
+      "loss": 1.7922,
+      "step": 138
+    },
+    {
+      "epoch": 0.93,
+      "learning_rate": 4.4337248322147654e-05,
+      "loss": 1.8318,
+      "step": 139
+    },
+    {
+      "epoch": 0.93,
+      "learning_rate": 4.4295302013422824e-05,
+      "loss": 1.8904,
+      "step": 140
+    },
+    {
+      "epoch": 0.94,
+      "learning_rate": 4.4253355704697994e-05,
+      "loss": 1.7657,
+      "step": 141
+    },
+    {
+      "epoch": 0.95,
+      "learning_rate": 4.421140939597316e-05,
+      "loss": 1.8428,
+      "step": 142
+    },
+    {
+      "epoch": 0.95,
+      "learning_rate": 4.416946308724833e-05,
+      "loss": 1.8724,
+      "step": 143
+    },
+    {
+      "epoch": 0.96,
+      "learning_rate": 4.412751677852349e-05,
+      "loss": 1.8586,
+      "step": 144
+    },
+    {
+      "epoch": 0.97,
+      "learning_rate": 4.408557046979866e-05,
+      "loss": 1.8459,
+      "step": 145
+    },
+    {
+      "epoch": 0.97,
+      "learning_rate": 4.4043624161073823e-05,
+      "loss": 1.8466,
+      "step": 146
+    },
+    {
+      "epoch": 0.98,
+      "learning_rate": 4.4001677852348993e-05,
+      "loss": 1.8361,
+      "step": 147
+    },
+    {
+      "epoch": 0.99,
+      "learning_rate": 4.395973154362416e-05,
+      "loss": 1.7921,
+      "step": 148
+    },
+    {
+      "epoch": 0.99,
+      "learning_rate": 4.391778523489933e-05,
+      "loss": 1.821,
+      "step": 149
+    },
+    {
+      "epoch": 1.0,
+      "learning_rate": 4.38758389261745e-05,
+      "loss": 1.6893,
+      "step": 150
+    },
+    {
+      "epoch": 1.01,
+      "learning_rate": 4.3833892617449666e-05,
+      "loss": 1.1744,
+      "step": 151
+    },
+    {
+      "epoch": 1.02,
+      "learning_rate": 4.3791946308724836e-05,
+      "loss": 1.1708,
+      "step": 152
+    },
+    {
+      "epoch": 1.02,
+      "learning_rate": 4.375e-05,
+      "loss": 1.0879,
+      "step": 153
+    },
+    {
+      "epoch": 1.03,
+      "learning_rate": 4.370805369127517e-05,
+      "loss": 1.1454,
+      "step": 154
+    },
+    {
+      "epoch": 1.04,
+      "learning_rate": 4.366610738255034e-05,
+      "loss": 1.0553,
+      "step": 155
+    },
+    {
+      "epoch": 1.04,
+      "learning_rate": 4.36241610738255e-05,
+      "loss": 1.0153,
+      "step": 156
+    },
+    {
+      "epoch": 1.05,
+      "learning_rate": 4.358221476510067e-05,
+      "loss": 1.0504,
+      "step": 157
+    },
+    {
+      "epoch": 1.06,
+      "learning_rate": 4.354026845637584e-05,
+      "loss": 1.0519,
+      "step": 158
+    },
+    {
+      "epoch": 1.06,
+      "learning_rate": 4.349832214765101e-05,
+      "loss": 1.029,
+      "step": 159
+    },
+    {
+      "epoch": 1.07,
+      "learning_rate": 4.3456375838926176e-05,
+      "loss": 1.0393,
+      "step": 160
+    },
+    {
+      "epoch": 1.08,
+      "learning_rate": 4.3414429530201346e-05,
+      "loss": 1.0251,
+      "step": 161
+    },
+    {
+      "epoch": 1.08,
+      "learning_rate": 4.337248322147651e-05,
+      "loss": 1.0165,
+      "step": 162
+    },
+    {
+      "epoch": 1.09,
+      "learning_rate": 4.333053691275168e-05,
+      "loss": 1.1326,
+      "step": 163
+    },
+    {
+      "epoch": 1.1,
+      "learning_rate": 4.328859060402685e-05,
+      "loss": 1.0326,
+      "step": 164
+    },
+    {
+      "epoch": 1.1,
+      "learning_rate": 4.324664429530201e-05,
+      "loss": 1.0497,
+      "step": 165
+    },
+    {
+      "epoch": 1.11,
+      "learning_rate": 4.320469798657718e-05,
+      "loss": 0.9533,
+      "step": 166
+    },
+    {
+      "epoch": 1.12,
+      "learning_rate": 4.316275167785235e-05,
+      "loss": 1.039,
+      "step": 167
+    },
+    {
+      "epoch": 1.12,
+      "learning_rate": 4.312080536912752e-05,
+      "loss": 1.1064,
+      "step": 168
+    },
+    {
+      "epoch": 1.13,
+      "learning_rate": 4.3078859060402685e-05,
+      "loss": 1.0939,
+      "step": 169
+    },
+    {
+      "epoch": 1.14,
+      "learning_rate": 4.3036912751677855e-05,
+      "loss": 1.0078,
+      "step": 170
+    },
+    {
+      "epoch": 1.14,
+      "learning_rate": 4.2994966442953025e-05,
+      "loss": 1.0189,
+      "step": 171
+    },
+    {
+      "epoch": 1.15,
+      "learning_rate": 4.295302013422819e-05,
+      "loss": 0.9982,
+      "step": 172
+    },
+    {
+      "epoch": 1.16,
+      "learning_rate": 4.291107382550336e-05,
+      "loss": 0.9927,
+      "step": 173
+    },
+    {
+      "epoch": 1.16,
+      "learning_rate": 4.286912751677852e-05,
+      "loss": 0.9942,
+      "step": 174
+    },
+    {
+      "epoch": 1.17,
+      "learning_rate": 4.28271812080537e-05,
+      "loss": 1.0255,
+      "step": 175
+    },
+    {
+      "epoch": 1.18,
+      "learning_rate": 4.278523489932886e-05,
+      "loss": 1.0477,
+      "step": 176
+    },
+    {
+      "epoch": 1.18,
+      "learning_rate": 4.274328859060403e-05,
+      "loss": 0.952,
+      "step": 177
+    },
+    {
+      "epoch": 1.19,
+      "learning_rate": 4.27013422818792e-05,
+      "loss": 1.01,
+      "step": 178
+    },
+    {
+      "epoch": 1.2,
+      "learning_rate": 4.2659395973154364e-05,
+      "loss": 1.0382,
+      "step": 179
+    },
+    {
+      "epoch": 1.2,
+      "learning_rate": 4.2617449664429534e-05,
+      "loss": 1.0118,
+      "step": 180
+    },
+    {
+      "epoch": 1.21,
+      "learning_rate": 4.25755033557047e-05,
+      "loss": 1.0751,
+      "step": 181
+    },
+    {
+      "epoch": 1.22,
+      "learning_rate": 4.253355704697987e-05,
+      "loss": 1.006,
+      "step": 182
+    },
+    {
+      "epoch": 1.22,
+      "learning_rate": 4.249161073825503e-05,
+      "loss": 1.0463,
+      "step": 183
+    },
+    {
+      "epoch": 1.23,
+      "learning_rate": 4.244966442953021e-05,
+      "loss": 1.0699,
+      "step": 184
+    },
+    {
+      "epoch": 1.24,
+      "learning_rate": 4.240771812080537e-05,
+      "loss": 1.0076,
+      "step": 185
+    },
+    {
+      "epoch": 1.24,
+      "learning_rate": 4.236577181208054e-05,
+      "loss": 0.9696,
+      "step": 186
+    },
+    {
+      "epoch": 1.25,
+      "learning_rate": 4.232382550335571e-05,
+      "loss": 0.9489,
+      "step": 187
+    },
+    {
+      "epoch": 1.26,
+      "learning_rate": 4.228187919463087e-05,
+      "loss": 0.9653,
+      "step": 188
+    },
+    {
+      "epoch": 1.26,
+      "learning_rate": 4.223993288590604e-05,
+      "loss": 1.1345,
+      "step": 189
+    },
+    {
+      "epoch": 1.27,
+      "learning_rate": 4.2197986577181206e-05,
+      "loss": 1.0358,
+      "step": 190
+    },
+    {
+      "epoch": 1.28,
+      "learning_rate": 4.2156040268456376e-05,
+      "loss": 1.0029,
+      "step": 191
+    },
+    {
+      "epoch": 1.28,
+      "learning_rate": 4.2114093959731546e-05,
+      "loss": 0.9662,
+      "step": 192
+    },
+    {
+      "epoch": 1.29,
+      "learning_rate": 4.2072147651006716e-05,
+      "loss": 0.9887,
+      "step": 193
+    },
+    {
+      "epoch": 1.3,
+      "learning_rate": 4.2030201342281886e-05,
+      "loss": 0.9726,
+      "step": 194
+    },
+    {
+      "epoch": 1.3,
+      "learning_rate": 4.198825503355705e-05,
+      "loss": 1.064,
+      "step": 195
+    },
+    {
+      "epoch": 1.31,
+      "learning_rate": 4.194630872483222e-05,
+      "loss": 1.0207,
+      "step": 196
+    },
+    {
+      "epoch": 1.32,
+      "learning_rate": 4.190436241610738e-05,
+      "loss": 1.0091,
+      "step": 197
+    },
+    {
+      "epoch": 1.32,
+      "learning_rate": 4.186241610738255e-05,
+      "loss": 1.0282,
+      "step": 198
+    },
+    {
+      "epoch": 1.33,
+      "learning_rate": 4.1820469798657716e-05,
+      "loss": 1.0127,
+      "step": 199
+    },
+    {
+      "epoch": 1.34,
+      "learning_rate": 4.1778523489932886e-05,
+      "loss": 1.0243,
+      "step": 200
+    },
+    {
+      "epoch": 1.34,
+      "learning_rate": 4.1736577181208055e-05,
+      "loss": 1.0646,
+      "step": 201
+    },
+    {
+      "epoch": 1.35,
+      "learning_rate": 4.1694630872483225e-05,
+      "loss": 0.9856,
+      "step": 202
+    },
+    {
+      "epoch": 1.36,
+      "learning_rate": 4.1652684563758395e-05,
+      "loss": 1.0238,
+      "step": 203
+    },
+    {
+      "epoch": 1.36,
+      "learning_rate": 4.161073825503356e-05,
+      "loss": 1.09,
+      "step": 204
+    },
+    {
+      "epoch": 1.37,
+      "learning_rate": 4.156879194630873e-05,
+      "loss": 0.9666,
+      "step": 205
+    },
+    {
+      "epoch": 1.38,
+      "learning_rate": 4.152684563758389e-05,
+      "loss": 1.0156,
+      "step": 206
+    },
+    {
+      "epoch": 1.38,
+      "learning_rate": 4.148489932885906e-05,
+      "loss": 1.0441,
+      "step": 207
+    },
+    {
+      "epoch": 1.39,
+      "learning_rate": 4.144295302013423e-05,
+      "loss": 1.1158,
+      "step": 208
+    },
+    {
+      "epoch": 1.4,
+      "learning_rate": 4.1401006711409395e-05,
+      "loss": 1.1311,
+      "step": 209
+    },
+    {
+      "epoch": 1.4,
+      "learning_rate": 4.135906040268457e-05,
+      "loss": 1.1143,
+      "step": 210
+    },
+    {
+      "epoch": 1.41,
+      "learning_rate": 4.1317114093959735e-05,
+      "loss": 1.0636,
+      "step": 211
+    },
+    {
+      "epoch": 1.42,
+      "learning_rate": 4.1275167785234905e-05,
+      "loss": 1.0763,
+      "step": 212
+    },
+    {
+      "epoch": 1.42,
+      "learning_rate": 4.123322147651007e-05,
+      "loss": 0.94,
+      "step": 213
+    },
+    {
+      "epoch": 1.43,
+      "learning_rate": 4.119127516778524e-05,
+      "loss": 1.0561,
+      "step": 214
+    },
+    {
+      "epoch": 1.44,
+      "learning_rate": 4.11493288590604e-05,
+      "loss": 1.0693,
+      "step": 215
+    },
+    {
+      "epoch": 1.44,
+      "learning_rate": 4.110738255033557e-05,
+      "loss": 1.1091,
+      "step": 216
+    },
+    {
+      "epoch": 1.45,
+      "learning_rate": 4.106543624161074e-05,
+      "loss": 0.9472,
+      "step": 217
+    },
+    {
+      "epoch": 1.46,
+      "learning_rate": 4.1023489932885904e-05,
+      "loss": 1.0918,
+      "step": 218
+    },
+    {
+      "epoch": 1.46,
+      "learning_rate": 4.098154362416108e-05,
+      "loss": 1.135,
+      "step": 219
+    },
+    {
+      "epoch": 1.47,
+      "learning_rate": 4.0939597315436244e-05,
+      "loss": 1.215,
+      "step": 220
+    },
+    {
+      "epoch": 1.48,
+      "learning_rate": 4.0897651006711414e-05,
+      "loss": 1.0823,
+      "step": 221
+    },
+    {
+      "epoch": 1.48,
+      "learning_rate": 4.085570469798658e-05,
+      "loss": 1.0458,
+      "step": 222
+    },
+    {
+      "epoch": 1.49,
+      "learning_rate": 4.081375838926175e-05,
+      "loss": 1.0106,
+      "step": 223
+    },
+    {
+      "epoch": 1.5,
+      "learning_rate": 4.077181208053692e-05,
+      "loss": 1.0731,
+      "step": 224
+    },
+    {
+      "epoch": 1.5,
+      "learning_rate": 4.072986577181208e-05,
+      "loss": 1.0038,
+      "step": 225
+    },
+    {
+      "epoch": 1.51,
+      "learning_rate": 4.068791946308725e-05,
+      "loss": 0.9899,
+      "step": 226
+    },
+    {
+      "epoch": 1.52,
+      "learning_rate": 4.064597315436241e-05,
+      "loss": 1.0594,
+      "step": 227
+    },
+    {
+      "epoch": 1.52,
+      "learning_rate": 4.060402684563759e-05,
+      "loss": 1.0711,
+      "step": 228
+    },
+    {
+      "epoch": 1.53,
+      "learning_rate": 4.056208053691275e-05,
+      "loss": 1.0418,
+      "step": 229
+    },
+    {
+      "epoch": 1.54,
+      "learning_rate": 4.052013422818792e-05,
+      "loss": 1.126,
+      "step": 230
+    },
+    {
+      "epoch": 1.54,
+      "learning_rate": 4.047818791946309e-05,
+      "loss": 0.9917,
+      "step": 231
+    },
+    {
+      "epoch": 1.55,
+      "learning_rate": 4.0436241610738256e-05,
+      "loss": 1.0509,
+      "step": 232
+    },
+    {
+      "epoch": 1.56,
+      "learning_rate": 4.0394295302013426e-05,
+      "loss": 1.0557,
+      "step": 233
+    },
+    {
+      "epoch": 1.56,
+      "learning_rate": 4.035234899328859e-05,
+      "loss": 1.0436,
+      "step": 234
+    },
+    {
+      "epoch": 1.57,
+      "learning_rate": 4.031040268456376e-05,
+      "loss": 0.9919,
+      "step": 235
+    },
+    {
+      "epoch": 1.58,
+      "learning_rate": 4.026845637583892e-05,
+      "loss": 1.0404,
+      "step": 236
+    },
+    {
+      "epoch": 1.58,
+      "learning_rate": 4.02265100671141e-05,
+      "loss": 0.9761,
+      "step": 237
+    },
+    {
+      "epoch": 1.59,
+      "learning_rate": 4.018456375838926e-05,
+      "loss": 1.0817,
+      "step": 238
+    },
+    {
+      "epoch": 1.6,
+      "learning_rate": 4.014261744966443e-05,
+      "loss": 1.0359,
+      "step": 239
+    },
+    {
+      "epoch": 1.6,
+      "learning_rate": 4.01006711409396e-05,
+      "loss": 1.1567,
+      "step": 240
+    },
+    {
+      "epoch": 1.61,
+      "learning_rate": 4.0058724832214765e-05,
+      "loss": 1.1174,
+      "step": 241
+    },
+    {
+      "epoch": 1.62,
+      "learning_rate": 4.0016778523489935e-05,
+      "loss": 1.0707,
+      "step": 242
+    },
+    {
+      "epoch": 1.62,
+      "learning_rate": 3.99748322147651e-05,
+      "loss": 1.0341,
+      "step": 243
+    },
+    {
+      "epoch": 1.63,
+      "learning_rate": 3.993288590604027e-05,
+      "loss": 1.0668,
+      "step": 244
+    },
+    {
+      "epoch": 1.64,
+      "learning_rate": 3.989093959731544e-05,
+      "loss": 1.1055,
+      "step": 245
+    },
+    {
+      "epoch": 1.64,
+      "learning_rate": 3.984899328859061e-05,
+      "loss": 0.9733,
+      "step": 246
+    },
+    {
+      "epoch": 1.65,
+      "learning_rate": 3.980704697986578e-05,
+      "loss": 1.0354,
+      "step": 247
+    },
+    {
+      "epoch": 1.66,
+      "learning_rate": 3.976510067114094e-05,
+      "loss": 1.0075,
+      "step": 248
+    },
+    {
+      "epoch": 1.66,
+      "learning_rate": 3.972315436241611e-05,
+      "loss": 1.0697,
+      "step": 249
+    },
+    {
+      "epoch": 1.67,
+      "learning_rate": 3.9681208053691275e-05,
+      "loss": 1.0748,
+      "step": 250
+    },
+    {
+      "epoch": 1.68,
+      "learning_rate": 3.9639261744966445e-05,
+      "loss": 1.0527,
+      "step": 251
+    },
+    {
+      "epoch": 1.68,
+      "learning_rate": 3.959731543624161e-05,
+      "loss": 1.0426,
+      "step": 252
+    },
+    {
+      "epoch": 1.69,
+      "learning_rate": 3.955536912751678e-05,
+      "loss": 0.9371,
+      "step": 253
+    },
+    {
+      "epoch": 1.7,
+      "learning_rate": 3.951342281879195e-05,
+      "loss": 1.0303,
+      "step": 254
+    },
+    {
+      "epoch": 1.7,
+      "learning_rate": 3.947147651006712e-05,
+      "loss": 1.0596,
+      "step": 255
+    },
+    {
+      "epoch": 1.71,
+      "learning_rate": 3.942953020134229e-05,
+      "loss": 1.1421,
+      "step": 256
+    },
+    {
+      "epoch": 1.72,
+      "learning_rate": 3.938758389261745e-05,
+      "loss": 1.0909,
+      "step": 257
+    },
+    {
+      "epoch": 1.72,
+      "learning_rate": 3.934563758389262e-05,
+      "loss": 1.0902,
+      "step": 258
+    },
+    {
+      "epoch": 1.73,
+      "learning_rate": 3.9303691275167784e-05,
+      "loss": 1.1381,
+      "step": 259
+    },
+    {
+      "epoch": 1.74,
+      "learning_rate": 3.9261744966442954e-05,
+      "loss": 1.1158,
+      "step": 260
+    },
+    {
+      "epoch": 1.74,
+      "learning_rate": 3.9219798657718124e-05,
+      "loss": 1.0368,
+      "step": 261
+    },
+    {
+      "epoch": 1.75,
+      "learning_rate": 3.917785234899329e-05,
+      "loss": 1.0452,
+      "step": 262
+    },
+    {
+      "epoch": 1.76,
+      "learning_rate": 3.9135906040268464e-05,
+      "loss": 1.1253,
+      "step": 263
+    },
+    {
+      "epoch": 1.76,
+      "learning_rate": 3.909395973154363e-05,
+      "loss": 1.066,
+      "step": 264
+    },
+    {
+      "epoch": 1.77,
+      "learning_rate": 3.90520134228188e-05,
+      "loss": 0.997,
+      "step": 265
+    },
+    {
+      "epoch": 1.78,
+      "learning_rate": 3.901006711409396e-05,
+      "loss": 1.168,
+      "step": 266
+    },
+    {
+      "epoch": 1.78,
+      "learning_rate": 3.896812080536913e-05,
+      "loss": 1.0828,
+      "step": 267
+    },
+    {
+      "epoch": 1.79,
+      "learning_rate": 3.89261744966443e-05,
+      "loss": 0.9894,
+      "step": 268
+    },
+    {
+      "epoch": 1.8,
+      "learning_rate": 3.888422818791946e-05,
+      "loss": 1.0228,
+      "step": 269
+    },
+    {
+      "epoch": 1.8,
+      "learning_rate": 3.884228187919463e-05,
+      "loss": 1.0008,
+      "step": 270
+    },
+    {
+      "epoch": 1.81,
+      "learning_rate": 3.8800335570469796e-05,
+      "loss": 1.1658,
+      "step": 271
+    },
+    {
+      "epoch": 1.82,
+      "learning_rate": 3.875838926174497e-05,
+      "loss": 1.0294,
+      "step": 272
+    },
+    {
+      "epoch": 1.82,
+      "learning_rate": 3.8716442953020136e-05,
+      "loss": 1.0961,
+      "step": 273
+    },
+    {
+      "epoch": 1.83,
+      "learning_rate": 3.8674496644295306e-05,
+      "loss": 1.056,
+      "step": 274
+    },
+    {
+      "epoch": 1.84,
+      "learning_rate": 3.863255033557047e-05,
+      "loss": 1.1555,
+      "step": 275
+    },
+    {
+      "epoch": 1.84,
+      "learning_rate": 3.859060402684564e-05,
+      "loss": 1.0154,
+      "step": 276
+    },
+    {
+      "epoch": 1.85,
+      "learning_rate": 3.854865771812081e-05,
+      "loss": 1.1223,
+      "step": 277
+    },
+    {
+      "epoch": 1.86,
+      "learning_rate": 3.850671140939597e-05,
+      "loss": 1.024,
+      "step": 278
+    },
+    {
+      "epoch": 1.86,
+      "learning_rate": 3.846476510067114e-05,
+      "loss": 1.1126,
+      "step": 279
+    },
+    {
+      "epoch": 1.87,
+      "learning_rate": 3.8422818791946305e-05,
+      "loss": 1.0596,
+      "step": 280
+    },
+    {
+      "epoch": 1.88,
+      "learning_rate": 3.838087248322148e-05,
+      "loss": 0.9871,
+      "step": 281
+    },
+    {
+      "epoch": 1.88,
+      "learning_rate": 3.8338926174496645e-05,
+      "loss": 1.0154,
+      "step": 282
+    },
+    {
+      "epoch": 1.89,
+      "learning_rate": 3.8296979865771815e-05,
+      "loss": 1.0347,
+      "step": 283
+    },
+    {
+      "epoch": 1.9,
+      "learning_rate": 3.8255033557046985e-05,
+      "loss": 1.0645,
+      "step": 284
+    },
+    {
+      "epoch": 1.9,
+      "learning_rate": 3.821308724832215e-05,
+      "loss": 1.1103,
+      "step": 285
+    },
+    {
+      "epoch": 1.91,
+      "learning_rate": 3.817114093959732e-05,
+      "loss": 1.0555,
+      "step": 286
+    },
+    {
+      "epoch": 1.92,
+      "learning_rate": 3.812919463087248e-05,
+      "loss": 1.0087,
+      "step": 287
+    },
+    {
+      "epoch": 1.92,
+      "learning_rate": 3.808724832214765e-05,
+      "loss": 1.0094,
+      "step": 288
+    },
+    {
+      "epoch": 1.93,
+      "learning_rate": 3.804530201342282e-05,
+      "loss": 1.0472,
+      "step": 289
+    },
+    {
+      "epoch": 1.94,
+      "learning_rate": 3.800335570469799e-05,
+      "loss": 1.0593,
+      "step": 290
+    },
+    {
+      "epoch": 1.94,
+      "learning_rate": 3.7961409395973154e-05,
+      "loss": 0.9796,
+      "step": 291
+    },
+    {
+      "epoch": 1.95,
+      "learning_rate": 3.7919463087248324e-05,
+      "loss": 1.095,
+      "step": 292
+    },
+    {
+      "epoch": 1.96,
+      "learning_rate": 3.7877516778523494e-05,
+      "loss": 1.1117,
+      "step": 293
+    },
+    {
+      "epoch": 1.96,
+      "learning_rate": 3.783557046979866e-05,
+      "loss": 1.0029,
+      "step": 294
+    },
+    {
+      "epoch": 1.97,
+      "learning_rate": 3.779362416107383e-05,
+      "loss": 1.0608,
+      "step": 295
+    },
+    {
+      "epoch": 1.98,
+      "learning_rate": 3.775167785234899e-05,
+      "loss": 0.9456,
+      "step": 296
+    },
+    {
+      "epoch": 1.98,
+      "learning_rate": 3.770973154362416e-05,
+      "loss": 1.0542,
+      "step": 297
+    },
+    {
+      "epoch": 1.99,
+      "learning_rate": 3.766778523489933e-05,
+      "loss": 1.1852,
+      "step": 298
+    },
+    {
+      "epoch": 2.0,
+      "learning_rate": 3.76258389261745e-05,
+      "loss": 1.1161,
+      "step": 299
+    },
+    {
+      "epoch": 2.0,
+      "learning_rate": 3.758389261744967e-05,
+      "loss": 0.6874,
+      "step": 300
+    },
+    {
+      "epoch": 2.01,
+      "learning_rate": 3.7541946308724834e-05,
+      "loss": 0.2995,
+      "step": 301
+    },
+    {
+      "epoch": 2.02,
+      "learning_rate": 3.7500000000000003e-05,
+      "loss": 0.3496,
+      "step": 302
+    },
+    {
+      "epoch": 2.02,
+      "learning_rate": 3.745805369127517e-05,
+      "loss": 0.3164,
+      "step": 303
+    },
+    {
+      "epoch": 2.03,
+      "learning_rate": 3.741610738255034e-05,
+      "loss": 0.3035,
+      "step": 304
+    },
+    {
+      "epoch": 2.04,
+      "learning_rate": 3.7374161073825507e-05,
+      "loss": 0.2868,
+      "step": 305
+    },
+    {
+      "epoch": 2.04,
+      "learning_rate": 3.733221476510067e-05,
+      "loss": 0.3521,
+      "step": 306
+    },
+    {
+      "epoch": 2.05,
+      "learning_rate": 3.7290268456375846e-05,
+      "loss": 0.3126,
+      "step": 307
+    },
+    {
+      "epoch": 2.06,
+      "learning_rate": 3.724832214765101e-05,
+      "loss": 0.3399,
+      "step": 308
+    },
+    {
+      "epoch": 2.06,
+      "learning_rate": 3.720637583892618e-05,
+      "loss": 0.2909,
+      "step": 309
+    },
+    {
+      "epoch": 2.07,
+      "learning_rate": 3.716442953020134e-05,
+      "loss": 0.3073,
+      "step": 310
+    },
+    {
+      "epoch": 2.08,
+      "learning_rate": 3.712248322147651e-05,
+      "loss": 0.2761,
+      "step": 311
+    },
+    {
+      "epoch": 2.08,
+      "learning_rate": 3.7080536912751676e-05,
+      "loss": 0.3051,
+      "step": 312
+    },
+    {
+      "epoch": 2.09,
+      "learning_rate": 3.7038590604026846e-05,
+      "loss": 0.3028,
+      "step": 313
+    },
+    {
+      "epoch": 2.1,
+      "learning_rate": 3.6996644295302016e-05,
+      "loss": 0.2823,
+      "step": 314
+    },
+    {
+      "epoch": 2.1,
+      "learning_rate": 3.695469798657718e-05,
+      "loss": 0.2829,
+      "step": 315
+    },
+    {
+      "epoch": 2.11,
+      "learning_rate": 3.6912751677852356e-05,
+      "loss": 0.2801,
+      "step": 316
+    },
+    {
+      "epoch": 2.12,
+      "learning_rate": 3.687080536912752e-05,
+      "loss": 0.3159,
+      "step": 317
+    },
+    {
+      "epoch": 2.12,
+      "learning_rate": 3.682885906040269e-05,
+      "loss": 0.3164,
+      "step": 318
+    },
+    {
+      "epoch": 2.13,
+      "learning_rate": 3.678691275167785e-05,
+      "loss": 0.2789,
+      "step": 319
+    },
+    {
+      "epoch": 2.14,
+      "learning_rate": 3.674496644295302e-05,
+      "loss": 0.3364,
+      "step": 320
+    },
+    {
+      "epoch": 2.14,
+      "learning_rate": 3.670302013422819e-05,
+      "loss": 0.3064,
+      "step": 321
+    },
+    {
+      "epoch": 2.15,
+      "learning_rate": 3.6661073825503355e-05,
+      "loss": 0.3183,
+      "step": 322
+    },
+    {
+      "epoch": 2.16,
+      "learning_rate": 3.6619127516778525e-05,
+      "loss": 0.3108,
+      "step": 323
+    },
+    {
+      "epoch": 2.16,
+      "learning_rate": 3.6577181208053695e-05,
+      "loss": 0.2877,
+      "step": 324
+    },
+    {
+      "epoch": 2.17,
+      "learning_rate": 3.6535234899328865e-05,
+      "loss": 0.2948,
+      "step": 325
+    },
+    {
+      "epoch": 2.18,
+      "learning_rate": 3.649328859060403e-05,
+      "loss": 0.3345,
+      "step": 326
+    },
+    {
+      "epoch": 2.18,
+      "learning_rate": 3.64513422818792e-05,
+      "loss": 0.3397,
+      "step": 327
+    },
+    {
+      "epoch": 2.19,
+      "learning_rate": 3.640939597315436e-05,
+      "loss": 0.2967,
+      "step": 328
+    },
+    {
+      "epoch": 2.2,
+      "learning_rate": 3.636744966442953e-05,
+      "loss": 0.2848,
+      "step": 329
+    },
+    {
+      "epoch": 2.2,
+      "learning_rate": 3.63255033557047e-05,
+      "loss": 0.3183,
+      "step": 330
+    },
+    {
+      "epoch": 2.21,
+      "learning_rate": 3.6283557046979864e-05,
+      "loss": 0.3042,
+      "step": 331
+    },
+    {
+      "epoch": 2.22,
+      "learning_rate": 3.6241610738255034e-05,
+      "loss": 0.3128,
+      "step": 332
+    },
+    {
+      "epoch": 2.22,
+      "learning_rate": 3.6199664429530204e-05,
+      "loss": 0.3144,
+      "step": 333
+    },
+    {
+      "epoch": 2.23,
+      "learning_rate": 3.6157718120805374e-05,
+      "loss": 0.288,
+      "step": 334
+    },
+    {
+      "epoch": 2.24,
+      "learning_rate": 3.611577181208054e-05,
+      "loss": 0.305,
+      "step": 335
+    },
+    {
+      "epoch": 2.24,
+      "learning_rate": 3.607382550335571e-05,
+      "loss": 0.3115,
+      "step": 336
+    },
+    {
+      "epoch": 2.25,
+      "learning_rate": 3.603187919463088e-05,
+      "loss": 0.3321,
+      "step": 337
+    },
+    {
+      "epoch": 2.26,
+      "learning_rate": 3.598993288590604e-05,
+      "loss": 0.2799,
+      "step": 338
+    },
+    {
+      "epoch": 2.26,
+      "learning_rate": 3.594798657718121e-05,
+      "loss": 0.2965,
+      "step": 339
+    },
+    {
+      "epoch": 2.27,
+      "learning_rate": 3.5906040268456373e-05,
+      "loss": 0.3046,
+      "step": 340
+    },
+    {
+      "epoch": 2.28,
+      "learning_rate": 3.5864093959731543e-05,
+      "loss": 0.2886,
+      "step": 341
+    },
+    {
+      "epoch": 2.28,
+      "learning_rate": 3.582214765100671e-05,
+      "loss": 0.3059,
+      "step": 342
+    },
+    {
+      "epoch": 2.29,
+      "learning_rate": 3.578020134228188e-05,
+      "loss": 0.3201,
+      "step": 343
+    },
+    {
+      "epoch": 2.3,
+      "learning_rate": 3.5738255033557046e-05,
+      "loss": 0.2766,
+      "step": 344
+    },
+    {
+      "epoch": 2.3,
+      "learning_rate": 3.5696308724832216e-05,
+      "loss": 0.2934,
+      "step": 345
+    },
+    {
+      "epoch": 2.31,
+      "learning_rate": 3.5654362416107386e-05,
+      "loss": 0.2731,
+      "step": 346
+    },
+    {
+      "epoch": 2.32,
+      "learning_rate": 3.561241610738255e-05,
+      "loss": 0.2676,
+      "step": 347
+    },
+    {
+      "epoch": 2.32,
+      "learning_rate": 3.557046979865772e-05,
+      "loss": 0.2982,
+      "step": 348
+    },
+    {
+      "epoch": 2.33,
+      "learning_rate": 3.552852348993288e-05,
+      "loss": 0.2929,
+      "step": 349
+    },
+    {
+      "epoch": 2.34,
+      "learning_rate": 3.548657718120805e-05,
+      "loss": 0.2891,
+      "step": 350
+    },
+    {
+      "epoch": 2.34,
+      "learning_rate": 3.544463087248322e-05,
+      "loss": 0.2792,
+      "step": 351
+    },
+    {
+      "epoch": 2.35,
+      "learning_rate": 3.540268456375839e-05,
+      "loss": 0.328,
+      "step": 352
+    },
+    {
+      "epoch": 2.36,
+      "learning_rate": 3.536073825503356e-05,
+      "loss": 0.2943,
+      "step": 353
+    },
+    {
+      "epoch": 2.36,
+      "learning_rate": 3.5318791946308726e-05,
+      "loss": 0.2564,
+      "step": 354
+    },
+    {
+      "epoch": 2.37,
+      "learning_rate": 3.5276845637583896e-05,
+      "loss": 0.2757,
+      "step": 355
+    },
+    {
+      "epoch": 2.38,
+      "learning_rate": 3.523489932885906e-05,
+      "loss": 0.3344,
+      "step": 356
+    },
+    {
+      "epoch": 2.38,
+      "learning_rate": 3.519295302013423e-05,
+      "loss": 0.2906,
+      "step": 357
+    },
+    {
+      "epoch": 2.39,
+      "learning_rate": 3.51510067114094e-05,
+      "loss": 0.2798,
+      "step": 358
+    },
+    {
+      "epoch": 2.4,
+      "learning_rate": 3.510906040268457e-05,
+      "loss": 0.2937,
+      "step": 359
+    },
+    {
+      "epoch": 2.4,
+      "learning_rate": 3.506711409395974e-05,
+      "loss": 0.2893,
+      "step": 360
+    },
+    {
+      "epoch": 2.41,
+      "learning_rate": 3.50251677852349e-05,
+      "loss": 0.299,
+      "step": 361
+    },
+    {
+      "epoch": 2.42,
+      "learning_rate": 3.498322147651007e-05,
+      "loss": 0.3113,
+      "step": 362
+    },
+    {
+      "epoch": 2.42,
+      "learning_rate": 3.4941275167785235e-05,
+      "loss": 0.3086,
+      "step": 363
+    },
+    {
+      "epoch": 2.43,
+      "learning_rate": 3.4899328859060405e-05,
+      "loss": 0.3392,
+      "step": 364
+    },
+    {
+      "epoch": 2.44,
+      "learning_rate": 3.485738255033557e-05,
+      "loss": 0.2652,
+      "step": 365
+    },
+    {
+      "epoch": 2.44,
+      "learning_rate": 3.481543624161074e-05,
+      "loss": 0.2882,
+      "step": 366
+    },
+    {
+      "epoch": 2.45,
+      "learning_rate": 3.477348993288591e-05,
+      "loss": 0.2948,
+      "step": 367
+    },
+    {
+      "epoch": 2.46,
+      "learning_rate": 3.473154362416108e-05,
+      "loss": 0.2739,
+      "step": 368
+    },
+    {
+      "epoch": 2.46,
+      "learning_rate": 3.468959731543625e-05,
+      "loss": 0.2882,
+      "step": 369
+    },
+    {
+      "epoch": 2.47,
+      "learning_rate": 3.464765100671141e-05,
+      "loss": 0.3521,
+      "step": 370
+    },
+    {
+      "epoch": 2.48,
+      "learning_rate": 3.460570469798658e-05,
+      "loss": 0.2963,
+      "step": 371
+    },
+    {
+      "epoch": 2.48,
+      "learning_rate": 3.4563758389261744e-05,
+      "loss": 0.2624,
+      "step": 372
+    },
+    {
+      "epoch": 2.49,
+      "learning_rate": 3.4521812080536914e-05,
+      "loss": 0.2858,
+      "step": 373
+    },
+    {
+      "epoch": 2.5,
+      "learning_rate": 3.4479865771812084e-05,
+      "loss": 0.3327,
+      "step": 374
+    },
+    {
+      "epoch": 2.5,
+      "learning_rate": 3.443791946308725e-05,
+      "loss": 0.3024,
+      "step": 375
+    },
+    {
+      "epoch": 2.51,
+      "learning_rate": 3.439597315436242e-05,
+      "loss": 0.308,
+      "step": 376
+    },
+    {
+      "epoch": 2.52,
+      "learning_rate": 3.435402684563759e-05,
+      "loss": 0.2932,
+      "step": 377
+    },
+    {
+      "epoch": 2.52,
+      "learning_rate": 3.431208053691276e-05,
+      "loss": 0.324,
+      "step": 378
+    },
+    {
+      "epoch": 2.53,
+      "learning_rate": 3.427013422818792e-05,
+      "loss": 0.325,
+      "step": 379
+    },
+    {
+      "epoch": 2.54,
+      "learning_rate": 3.422818791946309e-05,
+      "loss": 0.3378,
+      "step": 380
+    },
+    {
+      "epoch": 2.54,
+      "learning_rate": 3.418624161073825e-05,
+      "loss": 0.3061,
+      "step": 381
+    },
+    {
+      "epoch": 2.55,
+      "learning_rate": 3.414429530201342e-05,
+      "loss": 0.2856,
+      "step": 382
+    },
+    {
+      "epoch": 2.56,
+      "learning_rate": 3.410234899328859e-05,
+      "loss": 0.3567,
+      "step": 383
+    },
+    {
+      "epoch": 2.56,
+      "learning_rate": 3.4060402684563756e-05,
+      "loss": 0.3063,
+      "step": 384
+    },
+    {
+      "epoch": 2.57,
+      "learning_rate": 3.4018456375838926e-05,
+      "loss": 0.2806,
+      "step": 385
+    },
+    {
+      "epoch": 2.58,
+      "learning_rate": 3.3976510067114096e-05,
+      "loss": 0.3239,
+      "step": 386
+    },
+    {
+      "epoch": 2.58,
+      "learning_rate": 3.3934563758389266e-05,
+      "loss": 0.3089,
+      "step": 387
+    },
+    {
+      "epoch": 2.59,
+      "learning_rate": 3.389261744966443e-05,
+      "loss": 0.3388,
+      "step": 388
+    },
+    {
+      "epoch": 2.6,
+      "learning_rate": 3.38506711409396e-05,
+      "loss": 0.2863,
+      "step": 389
+    },
+    {
+      "epoch": 2.6,
+      "learning_rate": 3.380872483221477e-05,
+      "loss": 0.2794,
+      "step": 390
+    },
+    {
+      "epoch": 2.61,
+      "learning_rate": 3.376677852348993e-05,
+      "loss": 0.3331,
+      "step": 391
+    },
+    {
+      "epoch": 2.62,
+      "learning_rate": 3.37248322147651e-05,
+      "loss": 0.3712,
+      "step": 392
+    },
+    {
+      "epoch": 2.62,
+      "learning_rate": 3.3682885906040266e-05,
+      "loss": 0.2779,
+      "step": 393
+    },
+    {
+      "epoch": 2.63,
+      "learning_rate": 3.3640939597315436e-05,
+      "loss": 0.3076,
+      "step": 394
+    },
+    {
+      "epoch": 2.64,
+      "learning_rate": 3.3598993288590605e-05,
+      "loss": 0.3019,
+      "step": 395
+    },
+    {
+      "epoch": 2.64,
+      "learning_rate": 3.3557046979865775e-05,
+      "loss": 0.3249,
+      "step": 396
+    },
+    {
+      "epoch": 2.65,
+      "learning_rate": 3.3515100671140945e-05,
+      "loss": 0.3447,
+      "step": 397
+    },
+    {
+      "epoch": 2.66,
+      "learning_rate": 3.347315436241611e-05,
+      "loss": 0.3528,
+      "step": 398
+    },
+    {
+      "epoch": 2.66,
+      "learning_rate": 3.343120805369128e-05,
+      "loss": 0.3519,
+      "step": 399
+    },
+    {
+      "epoch": 2.67,
+      "learning_rate": 3.338926174496644e-05,
+      "loss": 0.3262,
+      "step": 400
+    },
+    {
+      "epoch": 2.68,
+      "learning_rate": 3.334731543624161e-05,
+      "loss": 0.3264,
+      "step": 401
+    },
+    {
+      "epoch": 2.68,
+      "learning_rate": 3.3305369127516775e-05,
+      "loss": 0.295,
+      "step": 402
+    },
+    {
+      "epoch": 2.69,
+      "learning_rate": 3.326342281879195e-05,
+      "loss": 0.2763,
+      "step": 403
+    },
+    {
+      "epoch": 2.7,
+      "learning_rate": 3.3221476510067115e-05,
+      "loss": 0.2954,
+      "step": 404
+    },
+    {
+      "epoch": 2.7,
+      "learning_rate": 3.3179530201342285e-05,
+      "loss": 0.3035,
+      "step": 405
+    },
+    {
+      "epoch": 2.71,
+      "learning_rate": 3.3137583892617455e-05,
+      "loss": 0.3123,
+      "step": 406
+    },
+    {
+      "epoch": 2.72,
+      "learning_rate": 3.309563758389262e-05,
+      "loss": 0.3183,
+      "step": 407
+    },
+    {
+      "epoch": 2.72,
+      "learning_rate": 3.305369127516779e-05,
+      "loss": 0.3119,
+      "step": 408
+    },
+    {
+      "epoch": 2.73,
+      "learning_rate": 3.301174496644295e-05,
+      "loss": 0.3318,
+      "step": 409
+    },
+    {
+      "epoch": 2.74,
+      "learning_rate": 3.296979865771812e-05,
+      "loss": 0.355,
+      "step": 410
+    },
+    {
+      "epoch": 2.74,
+      "learning_rate": 3.292785234899329e-05,
+      "loss": 0.2798,
+      "step": 411
+    },
+    {
+      "epoch": 2.75,
+      "learning_rate": 3.288590604026846e-05,
+      "loss": 0.3053,
+      "step": 412
+    },
+    {
+      "epoch": 2.76,
+      "learning_rate": 3.284395973154363e-05,
+      "loss": 0.3159,
+      "step": 413
+    },
+    {
+      "epoch": 2.76,
+      "learning_rate": 3.2802013422818794e-05,
+      "loss": 0.2852,
+      "step": 414
+    },
+    {
+      "epoch": 2.77,
+      "learning_rate": 3.2760067114093964e-05,
+      "loss": 0.2801,
+      "step": 415
+    },
+    {
+      "epoch": 2.78,
+      "learning_rate": 3.271812080536913e-05,
+      "loss": 0.2946,
+      "step": 416
+    },
+    {
+      "epoch": 2.78,
+      "learning_rate": 3.26761744966443e-05,
+      "loss": 0.2972,
+      "step": 417
+    },
+    {
+      "epoch": 2.79,
+      "learning_rate": 3.263422818791946e-05,
+      "loss": 0.3313,
+      "step": 418
+    },
+    {
+      "epoch": 2.8,
+      "learning_rate": 3.259228187919463e-05,
+      "loss": 0.3031,
+      "step": 419
+    },
+    {
+      "epoch": 2.8,
+      "learning_rate": 3.25503355704698e-05,
+      "loss": 0.2732,
+      "step": 420
+    },
+    {
+      "epoch": 2.81,
+      "learning_rate": 3.250838926174497e-05,
+      "loss": 0.2799,
+      "step": 421
+    },
+    {
+      "epoch": 2.82,
+      "learning_rate": 3.246644295302014e-05,
+      "loss": 0.3164,
+      "step": 422
+    },
+    {
+      "epoch": 2.82,
+      "learning_rate": 3.24244966442953e-05,
+      "loss": 0.3081,
+      "step": 423
+    },
+    {
+      "epoch": 2.83,
+      "learning_rate": 3.238255033557047e-05,
+      "loss": 0.311,
+      "step": 424
+    },
+    {
+      "epoch": 2.84,
+      "learning_rate": 3.2340604026845636e-05,
+      "loss": 0.2908,
+      "step": 425
+    },
+    {
+      "epoch": 2.84,
+      "learning_rate": 3.2298657718120806e-05,
+      "loss": 0.3139,
+      "step": 426
+    },
+    {
+      "epoch": 2.85,
+      "learning_rate": 3.2256711409395976e-05,
+      "loss": 0.3179,
+      "step": 427
+    },
+    {
+      "epoch": 2.86,
+      "learning_rate": 3.221476510067114e-05,
+      "loss": 0.2754,
+      "step": 428
+    },
+    {
+      "epoch": 2.86,
+      "learning_rate": 3.217281879194631e-05,
+      "loss": 0.2746,
+      "step": 429
+    },
+    {
+      "epoch": 2.87,
+      "learning_rate": 3.213087248322148e-05,
+      "loss": 0.3103,
+      "step": 430
+    },
+    {
+      "epoch": 2.88,
+      "learning_rate": 3.208892617449665e-05,
+      "loss": 0.2891,
+      "step": 431
+    },
+    {
+      "epoch": 2.88,
+      "learning_rate": 3.204697986577181e-05,
+      "loss": 0.3248,
+      "step": 432
+    },
+    {
+      "epoch": 2.89,
+      "learning_rate": 3.200503355704698e-05,
+      "loss": 0.3306,
+      "step": 433
+    },
+    {
+      "epoch": 2.9,
+      "learning_rate": 3.196308724832215e-05,
+      "loss": 0.3585,
+      "step": 434
+    },
+    {
+      "epoch": 2.9,
+      "learning_rate": 3.1921140939597315e-05,
+      "loss": 0.3127,
+      "step": 435
+    },
+    {
+      "epoch": 2.91,
+      "learning_rate": 3.1879194630872485e-05,
+      "loss": 0.2553,
+      "step": 436
+    },
+    {
+      "epoch": 2.92,
+      "learning_rate": 3.183724832214765e-05,
+      "loss": 0.3163,
+      "step": 437
+    },
+    {
+      "epoch": 2.92,
+      "learning_rate": 3.1795302013422825e-05,
+      "loss": 0.2985,
+      "step": 438
+    },
+    {
+      "epoch": 2.93,
+      "learning_rate": 3.175335570469799e-05,
+      "loss": 0.2864,
+      "step": 439
+    },
+    {
+      "epoch": 2.94,
+      "learning_rate": 3.171140939597316e-05,
+      "loss": 0.2892,
+      "step": 440
+    },
+    {
+      "epoch": 2.94,
+      "learning_rate": 3.166946308724832e-05,
+      "loss": 0.2679,
+      "step": 441
+    },
+    {
+      "epoch": 2.95,
+      "learning_rate": 3.162751677852349e-05,
+      "loss": 0.2818,
+      "step": 442
+    },
+    {
+      "epoch": 2.96,
+      "learning_rate": 3.158557046979866e-05,
+      "loss": 0.3748,
+      "step": 443
+    },
+    {
+      "epoch": 2.96,
+      "learning_rate": 3.1543624161073825e-05,
+      "loss": 0.3409,
+      "step": 444
+    },
+    {
+      "epoch": 2.97,
+      "learning_rate": 3.1501677852348995e-05,
+      "loss": 0.2921,
+      "step": 445
+    },
+    {
+      "epoch": 2.98,
+      "learning_rate": 3.145973154362416e-05,
+      "loss": 0.3034,
+      "step": 446
+    },
+    {
+      "epoch": 2.98,
+      "learning_rate": 3.1417785234899334e-05,
+      "loss": 0.3328,
+      "step": 447
+    },
+    {
+      "epoch": 2.99,
+      "learning_rate": 3.13758389261745e-05,
+      "loss": 0.3351,
+      "step": 448
+    },
+    {
+      "epoch": 3.0,
+      "learning_rate": 3.133389261744967e-05,
+      "loss": 0.3032,
+      "step": 449
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1192,
+  "num_train_epochs": 8,
+  "save_steps": 32,
+  "total_flos": 1.1252392353438106e+18,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e958a56f84f43fcb3a0dd6ceff97b22bbd87dcdac1c403dc3affa48704646e3e
+size 4091