dophys committed on Sep 22

Commit

83f6dbf

•

1 Parent(s): cdc9691

Upload folder using huggingface_hub

Browse files

Files changed (39) hide show

.gitattributes +2 -0
1_Pooling/config.json +10 -0
README.md +144 -0
checkpoint-853/1_Pooling/config.json +10 -0
checkpoint-853/README.md +144 -0
checkpoint-853/config.json +28 -0
checkpoint-853/config_sentence_transformers.json +10 -0
checkpoint-853/global_step853/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
checkpoint-853/global_step853/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
checkpoint-853/global_step853/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
checkpoint-853/global_step853/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
checkpoint-853/global_step853/mp_rank_00_model_states.pt +3 -0
checkpoint-853/latest +1 -0
checkpoint-853/model.safetensors +3 -0
checkpoint-853/modules.json +20 -0
checkpoint-853/rng_state_0.pth +3 -0
checkpoint-853/rng_state_1.pth +3 -0
checkpoint-853/rng_state_2.pth +3 -0
checkpoint-853/rng_state_3.pth +3 -0
checkpoint-853/sentence_bert_config.json +4 -0
checkpoint-853/sentencepiece.bpe.model +3 -0
checkpoint-853/special_tokens_map.json +51 -0
checkpoint-853/tokenizer.json +3 -0
checkpoint-853/tokenizer_config.json +55 -0
checkpoint-853/trainer_state.json +3015 -0
checkpoint-853/training_args.bin +3 -0
checkpoint-853/zero_to_fp32.py +604 -0
config.json +28 -0
config_sentence_transformers.json +10 -0
model.safetensors +3 -0
modules.json +20 -0
runs/Aug22_17-17-24_autodl-container-c024408f5d-9bcd732d/events.out.tfevents.1724318254.autodl-container-c024408f5d-9bcd732d.5345.0 +3 -0
runs/Aug22_17-18-40_autodl-container-c024408f5d-9bcd732d/events.out.tfevents.1724318333.autodl-container-c024408f5d-9bcd732d.6318.0 +3 -0
sentence_bert_config.json +4 -0
sentencepiece.bpe.model +3 -0
special_tokens_map.json +51 -0
tokenizer.json +3 -0
tokenizer_config.json +55 -0
training_args.bin +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+checkpoint-853/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

1_Pooling/config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "word_embedding_dimension": 1024,
+  "pooling_mode_cls_token": true,
+  "pooling_mode_mean_tokens": false,
+  "pooling_mode_max_tokens": false,
+  "pooling_mode_mean_sqrt_len_tokens": false,
+  "pooling_mode_weightedmean_tokens": false,
+  "pooling_mode_lasttoken": false,
+  "include_prompt": true
+}

README.md ADDED Viewed

	@@ -0,0 +1,144 @@

+---
+datasets: []
+language: []
+library_name: sentence-transformers
+pipeline_tag: sentence-similarity
+tags:
+- sentence-transformers
+- sentence-similarity
+- feature-extraction
+widget: []
+---
+# SentenceTransformer
+This is a trained [sentence-transformers](https://www.SBERT.net) model. It maps sentences & paragraphs to a 1024-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
+## Model Details
+### Model Description
+- **Model Type:** Sentence Transformer
+<!-- - **Base model:** [Unknown](https://huggingface.co/unknown) -->
+- **Maximum Sequence Length:** 8192 tokens
+- **Output Dimensionality:** 1024 dimensions
+- **Similarity Function:** Cosine Similarity
+<!-- - **Training Dataset:** Unknown -->
+<!-- - **Language:** Unknown -->
+<!-- - **License:** Unknown -->
+### Model Sources
+- **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
+- **Repository:** [Sentence Transformers on GitHub](https://github.com/UKPLab/sentence-transformers)
+- **Hugging Face:** [Sentence Transformers on Hugging Face](https://huggingface.co/models?library=sentence-transformers)
+### Full Model Architecture
+```
+SentenceTransformer(
+  (0): Transformer({'max_seq_length': 8192, 'do_lower_case': False}) with Transformer model: XLMRobertaModel
+  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
+  (2): Normalize()
+)
+```
+## Usage
+### Direct Usage (Sentence Transformers)
+First install the Sentence Transformers library:
+```bash
+pip install -U sentence-transformers
+```
+Then you can load this model and run inference.
+```python
+from sentence_transformers import SentenceTransformer
+# Download from the 🤗 Hub
+model = SentenceTransformer("sentence_transformers_model_id")
+# Run inference
+sentences = [
+    'The weather is lovely today.',
+    "It's so sunny outside!",
+    'He drove to the stadium.',
+]
+embeddings = model.encode(sentences)
+print(embeddings.shape)
+# [3, 1024]
+# Get the similarity scores for the embeddings
+similarities = model.similarity(embeddings, embeddings)
+print(similarities.shape)
+# [3, 3]
+```
+<!--
+### Direct Usage (Transformers)
+<details><summary>Click to see the direct usage in Transformers</summary>
+</details>
+-->
+<!--
+### Downstream Usage (Sentence Transformers)
+You can finetune this model on your own dataset.
+<details><summary>Click to expand</summary>
+</details>
+-->
+<!--
+### Out-of-Scope Use
+*List how the model may foreseeably be misused and address what users ought not to do with the model.*
+-->
+<!--
+## Bias, Risks and Limitations
+*What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
+-->
+<!--
+### Recommendations
+*What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
+-->
+## Training Details
+### Framework Versions
+- Python: 3.12.3
+- Sentence Transformers: 3.0.1
+- Transformers: 4.42.1
+- PyTorch: 2.3.0+cu121
+- Accelerate: 0.31.0
+- Datasets: 2.20.0
+- Tokenizers: 0.19.1
+## Citation
+### BibTeX
+<!--
+## Glossary
+*Clearly define terms in order to be accessible across audiences.*
+-->
+<!--
+## Model Card Authors
+*Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
+-->
+<!--
+## Model Card Contact
+*Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
+-->

checkpoint-853/1_Pooling/config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "word_embedding_dimension": 1024,
+  "pooling_mode_cls_token": true,
+  "pooling_mode_mean_tokens": false,
+  "pooling_mode_max_tokens": false,
+  "pooling_mode_mean_sqrt_len_tokens": false,
+  "pooling_mode_weightedmean_tokens": false,
+  "pooling_mode_lasttoken": false,
+  "include_prompt": true
+}

checkpoint-853/README.md ADDED Viewed

	@@ -0,0 +1,144 @@

+---
+datasets: []
+language: []
+library_name: sentence-transformers
+pipeline_tag: sentence-similarity
+tags:
+- sentence-transformers
+- sentence-similarity
+- feature-extraction
+widget: []
+---
+# SentenceTransformer
+This is a trained [sentence-transformers](https://www.SBERT.net) model. It maps sentences & paragraphs to a 1024-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
+## Model Details
+### Model Description
+- **Model Type:** Sentence Transformer
+<!-- - **Base model:** [Unknown](https://huggingface.co/unknown) -->
+- **Maximum Sequence Length:** 8192 tokens
+- **Output Dimensionality:** 1024 dimensions
+- **Similarity Function:** Cosine Similarity
+<!-- - **Training Dataset:** Unknown -->
+<!-- - **Language:** Unknown -->
+<!-- - **License:** Unknown -->
+### Model Sources
+- **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
+- **Repository:** [Sentence Transformers on GitHub](https://github.com/UKPLab/sentence-transformers)
+- **Hugging Face:** [Sentence Transformers on Hugging Face](https://huggingface.co/models?library=sentence-transformers)
+### Full Model Architecture
+```
+SentenceTransformer(
+  (0): Transformer({'max_seq_length': 8192, 'do_lower_case': False}) with Transformer model: XLMRobertaModel
+  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
+  (2): Normalize()
+)
+```
+## Usage
+### Direct Usage (Sentence Transformers)
+First install the Sentence Transformers library:
+```bash
+pip install -U sentence-transformers
+```
+Then you can load this model and run inference.
+```python
+from sentence_transformers import SentenceTransformer
+# Download from the 🤗 Hub
+model = SentenceTransformer("sentence_transformers_model_id")
+# Run inference
+sentences = [
+    'The weather is lovely today.',
+    "It's so sunny outside!",
+    'He drove to the stadium.',
+]
+embeddings = model.encode(sentences)
+print(embeddings.shape)
+# [3, 1024]
+# Get the similarity scores for the embeddings
+similarities = model.similarity(embeddings, embeddings)
+print(similarities.shape)
+# [3, 3]
+```
+<!--
+### Direct Usage (Transformers)
+<details><summary>Click to see the direct usage in Transformers</summary>
+</details>
+-->
+<!--
+### Downstream Usage (Sentence Transformers)
+You can finetune this model on your own dataset.
+<details><summary>Click to expand</summary>
+</details>
+-->
+<!--
+### Out-of-Scope Use
+*List how the model may foreseeably be misused and address what users ought not to do with the model.*
+-->
+<!--
+## Bias, Risks and Limitations
+*What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
+-->
+<!--
+### Recommendations
+*What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
+-->
+## Training Details
+### Framework Versions
+- Python: 3.12.3
+- Sentence Transformers: 3.0.1
+- Transformers: 4.42.1
+- PyTorch: 2.3.0+cu121
+- Accelerate: 0.31.0
+- Datasets: 2.20.0
+- Tokenizers: 0.19.1
+## Citation
+### BibTeX
+<!--
+## Glossary
+*Clearly define terms in order to be accessible across audiences.*
+-->
+<!--
+## Model Card Authors
+*Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
+-->
+<!--
+## Model Card Contact
+*Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
+-->

checkpoint-853/config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "_name_or_path": "/root/autodl-tmp/bge-m3_r4/checkpoint-853",
+  "architectures": [
+    "XLMRobertaModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 8194,
+  "model_type": "xlm-roberta",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "output_past": true,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.42.1",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 250002
+}

checkpoint-853/config_sentence_transformers.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "__version__": {
+    "sentence_transformers": "3.0.1",
+    "transformers": "4.42.1",
+    "pytorch": "2.3.0+cu121"
+  },
+  "prompts": {},
+  "default_prompt_name": null,
+  "similarity_fn_name": null
+}

checkpoint-853/global_step853/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ac25212b0dd7c1c33af9d13534f23a2f21eab85bf4c9e2e0f1128572d543b424
+size 1703267856

checkpoint-853/global_step853/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8bdff9ef70123a1f1f4dad2085d3a4b06230abdf3586968517c8dc6aa38cbf9f
+size 1703270288

checkpoint-853/global_step853/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f1c03c434680e35f1b5c8e28cdcf1cd0576573eaca6204bf2443374bb66ee115
+size 1703283152

checkpoint-853/global_step853/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9eb0bf36954e8652094b0ae7bddbb405e2158e63dbeaad5c4b3db557401ede14
+size 1703283472

checkpoint-853/global_step853/mp_rank_00_model_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:85e1bc9c6dfde3f944cdedf1fa2ff240aaba02f760d9b2d19a313a37ad62e470
+size 1135627884

checkpoint-853/latest ADDED Viewed

	@@ -0,0 +1 @@


1	+ global_step853

checkpoint-853/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5ff2b3be09c7552fc58248f097a32771e376f56eb50737f93e0f41cef389d71d
+size 2271064456

checkpoint-853/modules.json ADDED Viewed

	@@ -0,0 +1,20 @@

+[
+  {
+    "idx": 0,
+    "name": "0",
+    "path": "",
+    "type": "sentence_transformers.models.Transformer"
+  },
+  {
+    "idx": 1,
+    "name": "1",
+    "path": "1_Pooling",
+    "type": "sentence_transformers.models.Pooling"
+  },
+  {
+    "idx": 2,
+    "name": "2",
+    "path": "2_Normalize",
+    "type": "sentence_transformers.models.Normalize"
+  }
+]

checkpoint-853/rng_state_0.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ea33f4d3831c1aa0fff107f94bdb1e3f44f9720a90a553c21c0e244b5b8e09f0
+size 14960

checkpoint-853/rng_state_1.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:605ff35753da8ab8637f8d7e383db916a3b20e716fe7662e9f25bf8f312a8a16
+size 14960

checkpoint-853/rng_state_2.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b8113ead6f45e448461169ca78aec213478a5414d0f423bf84379df9f5363aea
+size 14960

checkpoint-853/rng_state_3.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:86bee34831933d6a7c1c2561a8aedecde5c4cd60815ddd9e2a1b33907cc7843f
+size 14960

checkpoint-853/sentence_bert_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "max_seq_length": 8192,
+  "do_lower_case": false
+}

checkpoint-853/sentencepiece.bpe.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
+size 5069051

checkpoint-853/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,51 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-853/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b74659c780d49afad7a7b9799868f75cbd3014fb6c34956e85a793028d38094a
+size 17098251

checkpoint-853/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,55 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "250001": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": "<mask>",
+  "model_max_length": 8192,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "XLMRobertaTokenizer",
+  "unk_token": "<unk>"
+}

checkpoint-853/trainer_state.json ADDED Viewed

	@@ -0,0 +1,3015 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9994141769185706,
+  "eval_steps": 500,
+  "global_step": 853,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0023432923257176333,
+      "grad_norm": 0.00011052378977183253,
+      "learning_rate": 5e-06,
+      "loss": 0.0,
+      "step": 2
+    },
+    {
+      "epoch": 0.0046865846514352666,
+      "grad_norm": 0.00020697808940894902,
+      "learning_rate": 4.9941245593419514e-06,
+      "loss": 0.0,
+      "step": 4
+    },
+    {
+      "epoch": 0.007029876977152899,
+      "grad_norm": 0.0012532881228253245,
+      "learning_rate": 4.982373678025853e-06,
+      "loss": 0.0,
+      "step": 6
+    },
+    {
+      "epoch": 0.009373169302870533,
+      "grad_norm": 0.0008086035377345979,
+      "learning_rate": 4.970622796709754e-06,
+      "loss": 0.0,
+      "step": 8
+    },
+    {
+      "epoch": 0.011716461628588167,
+      "grad_norm": 0.0021155672147870064,
+      "learning_rate": 4.958871915393655e-06,
+      "loss": 0.0,
+      "step": 10
+    },
+    {
+      "epoch": 0.014059753954305799,
+      "grad_norm": 0.0012233309680595994,
+      "learning_rate": 4.947121034077556e-06,
+      "loss": 0.0,
+      "step": 12
+    },
+    {
+      "epoch": 0.016403046280023433,
+      "grad_norm": 0.0027737286873161793,
+      "learning_rate": 4.9353701527614576e-06,
+      "loss": 0.0,
+      "step": 14
+    },
+    {
+      "epoch": 0.018746338605741066,
+      "grad_norm": 0.0042906939052045345,
+      "learning_rate": 4.923619271445359e-06,
+      "loss": 0.0,
+      "step": 16
+    },
+    {
+      "epoch": 0.0210896309314587,
+      "grad_norm": 0.0005172386299818754,
+      "learning_rate": 4.91186839012926e-06,
+      "loss": 0.0,
+      "step": 18
+    },
+    {
+      "epoch": 0.023432923257176334,
+      "grad_norm": 0.002410772955045104,
+      "learning_rate": 4.900117508813161e-06,
+      "loss": 0.0,
+      "step": 20
+    },
+    {
+      "epoch": 0.025776215582893967,
+      "grad_norm": 0.6443753242492676,
+      "learning_rate": 4.8883666274970625e-06,
+      "loss": 0.0027,
+      "step": 22
+    },
+    {
+      "epoch": 0.028119507908611598,
+      "grad_norm": 0.004394118674099445,
+      "learning_rate": 4.876615746180964e-06,
+      "loss": 0.0001,
+      "step": 24
+    },
+    {
+      "epoch": 0.03046280023432923,
+      "grad_norm": 0.006466630846261978,
+      "learning_rate": 4.864864864864866e-06,
+      "loss": 0.0001,
+      "step": 26
+    },
+    {
+      "epoch": 0.032806092560046865,
+      "grad_norm": 0.011924203485250473,
+      "learning_rate": 4.853113983548767e-06,
+      "loss": 0.0001,
+      "step": 28
+    },
+    {
+      "epoch": 0.0351493848857645,
+      "grad_norm": 0.23746930062770844,
+      "learning_rate": 4.841363102232668e-06,
+      "loss": 0.0001,
+      "step": 30
+    },
+    {
+      "epoch": 0.03749267721148213,
+      "grad_norm": 0.0031001348979771137,
+      "learning_rate": 4.8296122209165694e-06,
+      "loss": 0.0,
+      "step": 32
+    },
+    {
+      "epoch": 0.03983596953719976,
+      "grad_norm": 0.0029028633143752813,
+      "learning_rate": 4.817861339600471e-06,
+      "loss": 0.0,
+      "step": 34
+    },
+    {
+      "epoch": 0.0421792618629174,
+      "grad_norm": 0.014626468531787395,
+      "learning_rate": 4.806110458284372e-06,
+      "loss": 0.0001,
+      "step": 36
+    },
+    {
+      "epoch": 0.04452255418863503,
+      "grad_norm": 0.001155451056547463,
+      "learning_rate": 4.794359576968273e-06,
+      "loss": 0.0,
+      "step": 38
+    },
+    {
+      "epoch": 0.04686584651435267,
+      "grad_norm": 0.003476829966530204,
+      "learning_rate": 4.782608695652174e-06,
+      "loss": 0.0,
+      "step": 40
+    },
+    {
+      "epoch": 0.0492091388400703,
+      "grad_norm": 0.0002227002551080659,
+      "learning_rate": 4.7708578143360756e-06,
+      "loss": 0.0,
+      "step": 42
+    },
+    {
+      "epoch": 0.051552431165787935,
+      "grad_norm": 0.0001427282695658505,
+      "learning_rate": 4.759106933019977e-06,
+      "loss": 0.0,
+      "step": 44
+    },
+    {
+      "epoch": 0.053895723491505565,
+      "grad_norm": 0.0027408564928919077,
+      "learning_rate": 4.747356051703878e-06,
+      "loss": 0.0002,
+      "step": 46
+    },
+    {
+      "epoch": 0.056239015817223195,
+      "grad_norm": 0.0020253027323633432,
+      "learning_rate": 4.735605170387779e-06,
+      "loss": 0.0,
+      "step": 48
+    },
+    {
+      "epoch": 0.05858230814294083,
+      "grad_norm": 0.001760220737196505,
+      "learning_rate": 4.723854289071681e-06,
+      "loss": 0.0,
+      "step": 50
+    },
+    {
+      "epoch": 0.06092560046865846,
+      "grad_norm": 0.0010492791188880801,
+      "learning_rate": 4.7121034077555825e-06,
+      "loss": 0.0,
+      "step": 52
+    },
+    {
+      "epoch": 0.0632688927943761,
+      "grad_norm": 0.002001305343583226,
+      "learning_rate": 4.700352526439484e-06,
+      "loss": 0.0,
+      "step": 54
+    },
+    {
+      "epoch": 0.06561218512009373,
+      "grad_norm": 0.18566887080669403,
+      "learning_rate": 4.688601645123384e-06,
+      "loss": 0.0009,
+      "step": 56
+    },
+    {
+      "epoch": 0.06795547744581136,
+      "grad_norm": 0.0009072807151824236,
+      "learning_rate": 4.676850763807285e-06,
+      "loss": 0.0,
+      "step": 58
+    },
+    {
+      "epoch": 0.070298769771529,
+      "grad_norm": 0.003983665257692337,
+      "learning_rate": 4.665099882491187e-06,
+      "loss": 0.0006,
+      "step": 60
+    },
+    {
+      "epoch": 0.07264206209724663,
+      "grad_norm": 0.01946200616657734,
+      "learning_rate": 4.653349001175089e-06,
+      "loss": 0.0001,
+      "step": 62
+    },
+    {
+      "epoch": 0.07498535442296426,
+      "grad_norm": 0.004048655740916729,
+      "learning_rate": 4.64159811985899e-06,
+      "loss": 0.0,
+      "step": 64
+    },
+    {
+      "epoch": 0.0773286467486819,
+      "grad_norm": 0.0005872617475688457,
+      "learning_rate": 4.629847238542891e-06,
+      "loss": 0.0001,
+      "step": 66
+    },
+    {
+      "epoch": 0.07967193907439953,
+      "grad_norm": 0.008831903338432312,
+      "learning_rate": 4.618096357226792e-06,
+      "loss": 0.0001,
+      "step": 68
+    },
+    {
+      "epoch": 0.08201523140011717,
+      "grad_norm": 0.006819219794124365,
+      "learning_rate": 4.6063454759106936e-06,
+      "loss": 0.0001,
+      "step": 70
+    },
+    {
+      "epoch": 0.0843585237258348,
+      "grad_norm": 0.0007863900391384959,
+      "learning_rate": 4.594594594594596e-06,
+      "loss": 0.0,
+      "step": 72
+    },
+    {
+      "epoch": 0.08670181605155243,
+      "grad_norm": 0.032210394740104675,
+      "learning_rate": 4.582843713278496e-06,
+      "loss": 0.0001,
+      "step": 74
+    },
+    {
+      "epoch": 0.08904510837727006,
+      "grad_norm": 0.2614983916282654,
+      "learning_rate": 4.571092831962397e-06,
+      "loss": 0.0008,
+      "step": 76
+    },
+    {
+      "epoch": 0.0913884007029877,
+      "grad_norm": 0.0012551415711641312,
+      "learning_rate": 4.5593419506462985e-06,
+      "loss": 0.0,
+      "step": 78
+    },
+    {
+      "epoch": 0.09373169302870533,
+      "grad_norm": 0.0019108065171167254,
+      "learning_rate": 4.5475910693302e-06,
+      "loss": 0.0,
+      "step": 80
+    },
+    {
+      "epoch": 0.09607498535442296,
+      "grad_norm": 0.02294810675084591,
+      "learning_rate": 4.535840188014101e-06,
+      "loss": 0.0001,
+      "step": 82
+    },
+    {
+      "epoch": 0.0984182776801406,
+      "grad_norm": 0.0012388118775561452,
+      "learning_rate": 4.524089306698003e-06,
+      "loss": 0.0,
+      "step": 84
+    },
+    {
+      "epoch": 0.10076157000585823,
+      "grad_norm": 0.001227575121447444,
+      "learning_rate": 4.512338425381904e-06,
+      "loss": 0.0001,
+      "step": 86
+    },
+    {
+      "epoch": 0.10310486233157587,
+      "grad_norm": 0.004755712114274502,
+      "learning_rate": 4.5005875440658054e-06,
+      "loss": 0.0001,
+      "step": 88
+    },
+    {
+      "epoch": 0.1054481546572935,
+      "grad_norm": 0.00837083999067545,
+      "learning_rate": 4.488836662749707e-06,
+      "loss": 0.0001,
+      "step": 90
+    },
+    {
+      "epoch": 0.10779144698301113,
+      "grad_norm": 0.48219314217567444,
+      "learning_rate": 4.477085781433608e-06,
+      "loss": 0.0017,
+      "step": 92
+    },
+    {
+      "epoch": 0.11013473930872876,
+      "grad_norm": 0.022060217335820198,
+      "learning_rate": 4.465334900117509e-06,
+      "loss": 0.0001,
+      "step": 94
+    },
+    {
+      "epoch": 0.11247803163444639,
+      "grad_norm": 0.0019385352497920394,
+      "learning_rate": 4.45358401880141e-06,
+      "loss": 0.0,
+      "step": 96
+    },
+    {
+      "epoch": 0.11482132396016403,
+      "grad_norm": 0.01225442998111248,
+      "learning_rate": 4.4418331374853116e-06,
+      "loss": 0.0001,
+      "step": 98
+    },
+    {
+      "epoch": 0.11716461628588166,
+      "grad_norm": 0.0005759520572610199,
+      "learning_rate": 4.430082256169213e-06,
+      "loss": 0.0,
+      "step": 100
+    },
+    {
+      "epoch": 0.1195079086115993,
+      "grad_norm": 0.02452813647687435,
+      "learning_rate": 4.418331374853114e-06,
+      "loss": 0.0001,
+      "step": 102
+    },
+    {
+      "epoch": 0.12185120093731693,
+      "grad_norm": 0.0078084710985422134,
+      "learning_rate": 4.406580493537015e-06,
+      "loss": 0.0001,
+      "step": 104
+    },
+    {
+      "epoch": 0.12419449326303457,
+      "grad_norm": 0.004263446666300297,
+      "learning_rate": 4.394829612220917e-06,
+      "loss": 0.0001,
+      "step": 106
+    },
+    {
+      "epoch": 0.1265377855887522,
+      "grad_norm": 0.0016304058954119682,
+      "learning_rate": 4.3830787309048185e-06,
+      "loss": 0.0001,
+      "step": 108
+    },
+    {
+      "epoch": 0.12888107791446984,
+      "grad_norm": 0.011672005988657475,
+      "learning_rate": 4.37132784958872e-06,
+      "loss": 0.0002,
+      "step": 110
+    },
+    {
+      "epoch": 0.13122437024018746,
+      "grad_norm": 0.002603155327960849,
+      "learning_rate": 4.359576968272621e-06,
+      "loss": 0.0,
+      "step": 112
+    },
+    {
+      "epoch": 0.1335676625659051,
+      "grad_norm": 0.005059251096099615,
+      "learning_rate": 4.347826086956522e-06,
+      "loss": 0.0001,
+      "step": 114
+    },
+    {
+      "epoch": 0.13591095489162272,
+      "grad_norm": 0.0005816388293169439,
+      "learning_rate": 4.3360752056404234e-06,
+      "loss": 0.0001,
+      "step": 116
+    },
+    {
+      "epoch": 0.13825424721734036,
+      "grad_norm": 0.019756818190217018,
+      "learning_rate": 4.324324324324325e-06,
+      "loss": 0.0001,
+      "step": 118
+    },
+    {
+      "epoch": 0.140597539543058,
+      "grad_norm": 0.0023519208189100027,
+      "learning_rate": 4.312573443008226e-06,
+      "loss": 0.0,
+      "step": 120
+    },
+    {
+      "epoch": 0.14294083186877563,
+      "grad_norm": 0.0028086318634450436,
+      "learning_rate": 4.300822561692127e-06,
+      "loss": 0.0,
+      "step": 122
+    },
+    {
+      "epoch": 0.14528412419449327,
+      "grad_norm": 0.0022307527251541615,
+      "learning_rate": 4.289071680376028e-06,
+      "loss": 0.0,
+      "step": 124
+    },
+    {
+      "epoch": 0.14762741652021089,
+      "grad_norm": 0.014247684739530087,
+      "learning_rate": 4.2773207990599296e-06,
+      "loss": 0.0001,
+      "step": 126
+    },
+    {
+      "epoch": 0.14997070884592853,
+      "grad_norm": 0.00011139630805701017,
+      "learning_rate": 4.265569917743831e-06,
+      "loss": 0.0,
+      "step": 128
+    },
+    {
+      "epoch": 0.15231400117164617,
+      "grad_norm": 0.000514341751113534,
+      "learning_rate": 4.253819036427733e-06,
+      "loss": 0.0,
+      "step": 130
+    },
+    {
+      "epoch": 0.1546572934973638,
+      "grad_norm": 0.002176255453377962,
+      "learning_rate": 4.242068155111634e-06,
+      "loss": 0.0001,
+      "step": 132
+    },
+    {
+      "epoch": 0.15700058582308143,
+      "grad_norm": 0.018497969955205917,
+      "learning_rate": 4.230317273795535e-06,
+      "loss": 0.0001,
+      "step": 134
+    },
+    {
+      "epoch": 0.15934387814879905,
+      "grad_norm": 0.013157431036233902,
+      "learning_rate": 4.2185663924794365e-06,
+      "loss": 0.0001,
+      "step": 136
+    },
+    {
+      "epoch": 0.1616871704745167,
+      "grad_norm": 0.007630129344761372,
+      "learning_rate": 4.206815511163338e-06,
+      "loss": 0.0,
+      "step": 138
+    },
+    {
+      "epoch": 0.16403046280023434,
+      "grad_norm": 0.0008055138750933111,
+      "learning_rate": 4.195064629847239e-06,
+      "loss": 0.0001,
+      "step": 140
+    },
+    {
+      "epoch": 0.16637375512595196,
+      "grad_norm": 0.006306421477347612,
+      "learning_rate": 4.18331374853114e-06,
+      "loss": 0.0,
+      "step": 142
+    },
+    {
+      "epoch": 0.1687170474516696,
+      "grad_norm": 0.020266445353627205,
+      "learning_rate": 4.1715628672150414e-06,
+      "loss": 0.0001,
+      "step": 144
+    },
+    {
+      "epoch": 0.17106033977738722,
+      "grad_norm": 0.00037427974166348577,
+      "learning_rate": 4.159811985898943e-06,
+      "loss": 0.0,
+      "step": 146
+    },
+    {
+      "epoch": 0.17340363210310486,
+      "grad_norm": 0.004259356763213873,
+      "learning_rate": 4.148061104582844e-06,
+      "loss": 0.0001,
+      "step": 148
+    },
+    {
+      "epoch": 0.1757469244288225,
+      "grad_norm": 0.0010232679778710008,
+      "learning_rate": 4.136310223266745e-06,
+      "loss": 0.0001,
+      "step": 150
+    },
+    {
+      "epoch": 0.17809021675454012,
+      "grad_norm": 0.003952402155846357,
+      "learning_rate": 4.124559341950647e-06,
+      "loss": 0.0,
+      "step": 152
+    },
+    {
+      "epoch": 0.18043350908025776,
+      "grad_norm": 0.0013295585522428155,
+      "learning_rate": 4.112808460634548e-06,
+      "loss": 0.0,
+      "step": 154
+    },
+    {
+      "epoch": 0.1827768014059754,
+      "grad_norm": 0.013831949792802334,
+      "learning_rate": 4.10105757931845e-06,
+      "loss": 0.0001,
+      "step": 156
+    },
+    {
+      "epoch": 0.18512009373169303,
+      "grad_norm": 0.0036904062144458294,
+      "learning_rate": 4.089306698002351e-06,
+      "loss": 0.0,
+      "step": 158
+    },
+    {
+      "epoch": 0.18746338605741067,
+      "grad_norm": 0.002993196714669466,
+      "learning_rate": 4.077555816686252e-06,
+      "loss": 0.0,
+      "step": 160
+    },
+    {
+      "epoch": 0.18980667838312829,
+      "grad_norm": 0.0016740068094804883,
+      "learning_rate": 4.0658049353701525e-06,
+      "loss": 0.0001,
+      "step": 162
+    },
+    {
+      "epoch": 0.19214997070884593,
+      "grad_norm": 0.012307717464864254,
+      "learning_rate": 4.0540540540540545e-06,
+      "loss": 0.0001,
+      "step": 164
+    },
+    {
+      "epoch": 0.19449326303456357,
+      "grad_norm": 0.0012654109159484506,
+      "learning_rate": 4.042303172737956e-06,
+      "loss": 0.0,
+      "step": 166
+    },
+    {
+      "epoch": 0.1968365553602812,
+      "grad_norm": 0.12437883019447327,
+      "learning_rate": 4.030552291421857e-06,
+      "loss": 0.0006,
+      "step": 168
+    },
+    {
+      "epoch": 0.19917984768599883,
+      "grad_norm": 8.974138472694904e-05,
+      "learning_rate": 4.018801410105758e-06,
+      "loss": 0.0,
+      "step": 170
+    },
+    {
+      "epoch": 0.20152314001171645,
+      "grad_norm": 0.0011903212871402502,
+      "learning_rate": 4.007050528789659e-06,
+      "loss": 0.0001,
+      "step": 172
+    },
+    {
+      "epoch": 0.2038664323374341,
+      "grad_norm": 0.012350277975201607,
+      "learning_rate": 3.995299647473561e-06,
+      "loss": 0.0001,
+      "step": 174
+    },
+    {
+      "epoch": 0.20620972466315174,
+      "grad_norm": 0.01664598099887371,
+      "learning_rate": 3.983548766157463e-06,
+      "loss": 0.0001,
+      "step": 176
+    },
+    {
+      "epoch": 0.20855301698886936,
+      "grad_norm": 0.0064240009523928165,
+      "learning_rate": 3.971797884841364e-06,
+      "loss": 0.0001,
+      "step": 178
+    },
+    {
+      "epoch": 0.210896309314587,
+      "grad_norm": 0.0031362581066787243,
+      "learning_rate": 3.960047003525264e-06,
+      "loss": 0.0,
+      "step": 180
+    },
+    {
+      "epoch": 0.21323960164030462,
+      "grad_norm": 0.00012566300574690104,
+      "learning_rate": 3.9482961222091655e-06,
+      "loss": 0.0001,
+      "step": 182
+    },
+    {
+      "epoch": 0.21558289396602226,
+      "grad_norm": 0.0018261070363223553,
+      "learning_rate": 3.936545240893067e-06,
+      "loss": 0.0,
+      "step": 184
+    },
+    {
+      "epoch": 0.2179261862917399,
+      "grad_norm": 0.0010897299507632852,
+      "learning_rate": 3.924794359576969e-06,
+      "loss": 0.0,
+      "step": 186
+    },
+    {
+      "epoch": 0.22026947861745752,
+      "grad_norm": 0.006528445053845644,
+      "learning_rate": 3.91304347826087e-06,
+      "loss": 0.0,
+      "step": 188
+    },
+    {
+      "epoch": 0.22261277094317516,
+      "grad_norm": 0.4626096785068512,
+      "learning_rate": 3.901292596944771e-06,
+      "loss": 0.0009,
+      "step": 190
+    },
+    {
+      "epoch": 0.22495606326889278,
+      "grad_norm": 0.002359338803216815,
+      "learning_rate": 3.8895417156286725e-06,
+      "loss": 0.0,
+      "step": 192
+    },
+    {
+      "epoch": 0.22729935559461042,
+      "grad_norm": 0.004821418318897486,
+      "learning_rate": 3.877790834312574e-06,
+      "loss": 0.0,
+      "step": 194
+    },
+    {
+      "epoch": 0.22964264792032807,
+      "grad_norm": 0.0011465001152828336,
+      "learning_rate": 3.866039952996475e-06,
+      "loss": 0.0008,
+      "step": 196
+    },
+    {
+      "epoch": 0.23198594024604569,
+      "grad_norm": 0.0007381247123703361,
+      "learning_rate": 3.854289071680376e-06,
+      "loss": 0.0001,
+      "step": 198
+    },
+    {
+      "epoch": 0.23432923257176333,
+      "grad_norm": 0.0023091183975338936,
+      "learning_rate": 3.842538190364277e-06,
+      "loss": 0.0,
+      "step": 200
+    },
+    {
+      "epoch": 0.23667252489748097,
+      "grad_norm": 0.0005714365397579968,
+      "learning_rate": 3.830787309048179e-06,
+      "loss": 0.0,
+      "step": 202
+    },
+    {
+      "epoch": 0.2390158172231986,
+      "grad_norm": 0.00351692084223032,
+      "learning_rate": 3.81903642773208e-06,
+      "loss": 0.0,
+      "step": 204
+    },
+    {
+      "epoch": 0.24135910954891623,
+      "grad_norm": 5.926425728830509e-05,
+      "learning_rate": 3.8072855464159815e-06,
+      "loss": 0.0,
+      "step": 206
+    },
+    {
+      "epoch": 0.24370240187463385,
+      "grad_norm": 0.0016421001637354493,
+      "learning_rate": 3.7955346650998827e-06,
+      "loss": 0.0,
+      "step": 208
+    },
+    {
+      "epoch": 0.2460456942003515,
+      "grad_norm": 0.012118808925151825,
+      "learning_rate": 3.7837837837837844e-06,
+      "loss": 0.0001,
+      "step": 210
+    },
+    {
+      "epoch": 0.24838898652606914,
+      "grad_norm": 0.00024874648079276085,
+      "learning_rate": 3.7720329024676856e-06,
+      "loss": 0.0002,
+      "step": 212
+    },
+    {
+      "epoch": 0.2507322788517868,
+      "grad_norm": 0.0017625248292461038,
+      "learning_rate": 3.760282021151587e-06,
+      "loss": 0.0,
+      "step": 214
+    },
+    {
+      "epoch": 0.2530755711775044,
+      "grad_norm": 0.0007431196281686425,
+      "learning_rate": 3.748531139835488e-06,
+      "loss": 0.0,
+      "step": 216
+    },
+    {
+      "epoch": 0.255418863503222,
+      "grad_norm": 0.0007026457460597157,
+      "learning_rate": 3.7367802585193893e-06,
+      "loss": 0.0,
+      "step": 218
+    },
+    {
+      "epoch": 0.2577621558289397,
+      "grad_norm": 0.002397920237854123,
+      "learning_rate": 3.72502937720329e-06,
+      "loss": 0.0,
+      "step": 220
+    },
+    {
+      "epoch": 0.2601054481546573,
+      "grad_norm": 0.003177257487550378,
+      "learning_rate": 3.713278495887192e-06,
+      "loss": 0.0,
+      "step": 222
+    },
+    {
+      "epoch": 0.2624487404803749,
+      "grad_norm": 0.003142025787383318,
+      "learning_rate": 3.7015276145710934e-06,
+      "loss": 0.0001,
+      "step": 224
+    },
+    {
+      "epoch": 0.26479203280609254,
+      "grad_norm": 0.03788410872220993,
+      "learning_rate": 3.6897767332549946e-06,
+      "loss": 0.0002,
+      "step": 226
+    },
+    {
+      "epoch": 0.2671353251318102,
+      "grad_norm": 0.005685464479029179,
+      "learning_rate": 3.6780258519388954e-06,
+      "loss": 0.0003,
+      "step": 228
+    },
+    {
+      "epoch": 0.2694786174575278,
+      "grad_norm": 0.0010328789940103889,
+      "learning_rate": 3.6662749706227966e-06,
+      "loss": 0.0003,
+      "step": 230
+    },
+    {
+      "epoch": 0.27182190978324544,
+      "grad_norm": 0.0052024442702531815,
+      "learning_rate": 3.6545240893066987e-06,
+      "loss": 0.0,
+      "step": 232
+    },
+    {
+      "epoch": 0.2741652021089631,
+      "grad_norm": 0.006033598445355892,
+      "learning_rate": 3.6427732079906e-06,
+      "loss": 0.0,
+      "step": 234
+    },
+    {
+      "epoch": 0.27650849443468073,
+      "grad_norm": 0.00023948443413246423,
+      "learning_rate": 3.6310223266745007e-06,
+      "loss": 0.0001,
+      "step": 236
+    },
+    {
+      "epoch": 0.27885178676039835,
+      "grad_norm": 0.00016467843670397997,
+      "learning_rate": 3.619271445358402e-06,
+      "loss": 0.0,
+      "step": 238
+    },
+    {
+      "epoch": 0.281195079086116,
+      "grad_norm": 0.003566320287063718,
+      "learning_rate": 3.607520564042303e-06,
+      "loss": 0.0,
+      "step": 240
+    },
+    {
+      "epoch": 0.28353837141183363,
+      "grad_norm": 0.00033969045034609735,
+      "learning_rate": 3.5957696827262044e-06,
+      "loss": 0.0,
+      "step": 242
+    },
+    {
+      "epoch": 0.28588166373755125,
+      "grad_norm": 0.0033994223922491074,
+      "learning_rate": 3.5840188014101065e-06,
+      "loss": 0.0,
+      "step": 244
+    },
+    {
+      "epoch": 0.28822495606326887,
+      "grad_norm": 0.14746786653995514,
+      "learning_rate": 3.5722679200940073e-06,
+      "loss": 0.0008,
+      "step": 246
+    },
+    {
+      "epoch": 0.29056824838898654,
+      "grad_norm": 0.012470235116779804,
+      "learning_rate": 3.5605170387779085e-06,
+      "loss": 0.0,
+      "step": 248
+    },
+    {
+      "epoch": 0.29291154071470415,
+      "grad_norm": 0.08307931572198868,
+      "learning_rate": 3.5487661574618097e-06,
+      "loss": 0.0003,
+      "step": 250
+    },
+    {
+      "epoch": 0.29525483304042177,
+      "grad_norm": 0.00033245363738387823,
+      "learning_rate": 3.537015276145711e-06,
+      "loss": 0.0,
+      "step": 252
+    },
+    {
+      "epoch": 0.29759812536613944,
+      "grad_norm": 0.0018247144762426615,
+      "learning_rate": 3.525264394829612e-06,
+      "loss": 0.0,
+      "step": 254
+    },
+    {
+      "epoch": 0.29994141769185706,
+      "grad_norm": 0.0011103990254923701,
+      "learning_rate": 3.513513513513514e-06,
+      "loss": 0.0001,
+      "step": 256
+    },
+    {
+      "epoch": 0.3022847100175747,
+      "grad_norm": 0.0010811882093548775,
+      "learning_rate": 3.501762632197415e-06,
+      "loss": 0.0,
+      "step": 258
+    },
+    {
+      "epoch": 0.30462800234329235,
+      "grad_norm": 0.011172047816216946,
+      "learning_rate": 3.4900117508813163e-06,
+      "loss": 0.0001,
+      "step": 260
+    },
+    {
+      "epoch": 0.30697129466900996,
+      "grad_norm": 0.0013676233356818557,
+      "learning_rate": 3.4782608695652175e-06,
+      "loss": 0.0,
+      "step": 262
+    },
+    {
+      "epoch": 0.3093145869947276,
+      "grad_norm": 0.002147970488294959,
+      "learning_rate": 3.4665099882491187e-06,
+      "loss": 0.0,
+      "step": 264
+    },
+    {
+      "epoch": 0.31165787932044525,
+      "grad_norm": 0.0009826518362388015,
+      "learning_rate": 3.4547591069330204e-06,
+      "loss": 0.0,
+      "step": 266
+    },
+    {
+      "epoch": 0.31400117164616287,
+      "grad_norm": 0.001499099307693541,
+      "learning_rate": 3.4430082256169216e-06,
+      "loss": 0.0,
+      "step": 268
+    },
+    {
+      "epoch": 0.3163444639718805,
+      "grad_norm": 0.001323301112279296,
+      "learning_rate": 3.431257344300823e-06,
+      "loss": 0.0,
+      "step": 270
+    },
+    {
+      "epoch": 0.3186877562975981,
+      "grad_norm": 0.018010340631008148,
+      "learning_rate": 3.419506462984724e-06,
+      "loss": 0.0005,
+      "step": 272
+    },
+    {
+      "epoch": 0.3210310486233158,
+      "grad_norm": 0.0024064648896455765,
+      "learning_rate": 3.4077555816686253e-06,
+      "loss": 0.0,
+      "step": 274
+    },
+    {
+      "epoch": 0.3233743409490334,
+      "grad_norm": 0.02396260015666485,
+      "learning_rate": 3.3960047003525265e-06,
+      "loss": 0.0001,
+      "step": 276
+    },
+    {
+      "epoch": 0.325717633274751,
+      "grad_norm": 0.002070352202281356,
+      "learning_rate": 3.384253819036428e-06,
+      "loss": 0.0,
+      "step": 278
+    },
+    {
+      "epoch": 0.3280609256004687,
+      "grad_norm": 0.0003108434902969748,
+      "learning_rate": 3.3725029377203294e-06,
+      "loss": 0.0001,
+      "step": 280
+    },
+    {
+      "epoch": 0.3304042179261863,
+      "grad_norm": 0.006573045626282692,
+      "learning_rate": 3.3607520564042306e-06,
+      "loss": 0.0001,
+      "step": 282
+    },
+    {
+      "epoch": 0.3327475102519039,
+      "grad_norm": 0.0004413512069731951,
+      "learning_rate": 3.349001175088132e-06,
+      "loss": 0.0001,
+      "step": 284
+    },
+    {
+      "epoch": 0.3350908025776216,
+      "grad_norm": 0.0005645502242259681,
+      "learning_rate": 3.337250293772033e-06,
+      "loss": 0.0,
+      "step": 286
+    },
+    {
+      "epoch": 0.3374340949033392,
+      "grad_norm": 0.00034579774364829063,
+      "learning_rate": 3.3254994124559343e-06,
+      "loss": 0.0,
+      "step": 288
+    },
+    {
+      "epoch": 0.3397773872290568,
+      "grad_norm": 0.003136229468509555,
+      "learning_rate": 3.313748531139836e-06,
+      "loss": 0.0,
+      "step": 290
+    },
+    {
+      "epoch": 0.34212067955477443,
+      "grad_norm": 0.0031148705165833235,
+      "learning_rate": 3.301997649823737e-06,
+      "loss": 0.0,
+      "step": 292
+    },
+    {
+      "epoch": 0.3444639718804921,
+      "grad_norm": 0.0012612566351890564,
+      "learning_rate": 3.2902467685076384e-06,
+      "loss": 0.0,
+      "step": 294
+    },
+    {
+      "epoch": 0.3468072642062097,
+      "grad_norm": 0.0007469533011317253,
+      "learning_rate": 3.2784958871915396e-06,
+      "loss": 0.0,
+      "step": 296
+    },
+    {
+      "epoch": 0.34915055653192734,
+      "grad_norm": 0.04412250965833664,
+      "learning_rate": 3.266745005875441e-06,
+      "loss": 0.0003,
+      "step": 298
+    },
+    {
+      "epoch": 0.351493848857645,
+      "grad_norm": 0.004462533164769411,
+      "learning_rate": 3.2549941245593425e-06,
+      "loss": 0.0088,
+      "step": 300
+    },
+    {
+      "epoch": 0.3538371411833626,
+      "grad_norm": 0.002911294111981988,
+      "learning_rate": 3.2432432432432437e-06,
+      "loss": 0.0006,
+      "step": 302
+    },
+    {
+      "epoch": 0.35618043350908024,
+      "grad_norm": 0.0015191801358014345,
+      "learning_rate": 3.231492361927145e-06,
+      "loss": 0.0,
+      "step": 304
+    },
+    {
+      "epoch": 0.3585237258347979,
+      "grad_norm": 0.017380721867084503,
+      "learning_rate": 3.219741480611046e-06,
+      "loss": 0.0094,
+      "step": 306
+    },
+    {
+      "epoch": 0.36086701816051553,
+      "grad_norm": 0.002749436302110553,
+      "learning_rate": 3.2079905992949474e-06,
+      "loss": 0.0001,
+      "step": 308
+    },
+    {
+      "epoch": 0.36321031048623315,
+      "grad_norm": 0.0008673086995258927,
+      "learning_rate": 3.1962397179788486e-06,
+      "loss": 0.0,
+      "step": 310
+    },
+    {
+      "epoch": 0.3655536028119508,
+      "grad_norm": 0.00361701101064682,
+      "learning_rate": 3.1844888366627503e-06,
+      "loss": 0.0,
+      "step": 312
+    },
+    {
+      "epoch": 0.36789689513766843,
+      "grad_norm": 0.006906528025865555,
+      "learning_rate": 3.1727379553466515e-06,
+      "loss": 0.0,
+      "step": 314
+    },
+    {
+      "epoch": 0.37024018746338605,
+      "grad_norm": 2.259305238723755,
+      "learning_rate": 3.1609870740305527e-06,
+      "loss": 0.0157,
+      "step": 316
+    },
+    {
+      "epoch": 0.37258347978910367,
+      "grad_norm": 0.00017454673070460558,
+      "learning_rate": 3.149236192714454e-06,
+      "loss": 0.0,
+      "step": 318
+    },
+    {
+      "epoch": 0.37492677211482134,
+      "grad_norm": 0.16197967529296875,
+      "learning_rate": 3.137485311398355e-06,
+      "loss": 0.0009,
+      "step": 320
+    },
+    {
+      "epoch": 0.37727006444053895,
+      "grad_norm": 0.002247605938464403,
+      "learning_rate": 3.1257344300822564e-06,
+      "loss": 0.0,
+      "step": 322
+    },
+    {
+      "epoch": 0.37961335676625657,
+      "grad_norm": 0.023727795109152794,
+      "learning_rate": 3.113983548766158e-06,
+      "loss": 0.0001,
+      "step": 324
+    },
+    {
+      "epoch": 0.38195664909197424,
+      "grad_norm": 0.008455273695290089,
+      "learning_rate": 3.1022326674500592e-06,
+      "loss": 0.0001,
+      "step": 326
+    },
+    {
+      "epoch": 0.38429994141769186,
+      "grad_norm": 0.00022873218404129148,
+      "learning_rate": 3.0904817861339605e-06,
+      "loss": 0.0,
+      "step": 328
+    },
+    {
+      "epoch": 0.3866432337434095,
+      "grad_norm": 3.000872850418091,
+      "learning_rate": 3.0787309048178617e-06,
+      "loss": 0.055,
+      "step": 330
+    },
+    {
+      "epoch": 0.38898652606912715,
+      "grad_norm": 0.002177221467718482,
+      "learning_rate": 3.066980023501763e-06,
+      "loss": 0.0,
+      "step": 332
+    },
+    {
+      "epoch": 0.39132981839484476,
+      "grad_norm": 0.002786975121125579,
+      "learning_rate": 3.0552291421856637e-06,
+      "loss": 0.0,
+      "step": 334
+    },
+    {
+      "epoch": 0.3936731107205624,
+      "grad_norm": 0.004335256293416023,
+      "learning_rate": 3.043478260869566e-06,
+      "loss": 0.0,
+      "step": 336
+    },
+    {
+      "epoch": 0.39601640304628,
+      "grad_norm": 0.007627409417182207,
+      "learning_rate": 3.031727379553467e-06,
+      "loss": 0.0001,
+      "step": 338
+    },
+    {
+      "epoch": 0.39835969537199767,
+      "grad_norm": 0.002631911775097251,
+      "learning_rate": 3.0199764982373682e-06,
+      "loss": 0.0,
+      "step": 340
+    },
+    {
+      "epoch": 0.4007029876977153,
+      "grad_norm": 0.009561799466609955,
+      "learning_rate": 3.008225616921269e-06,
+      "loss": 0.0001,
+      "step": 342
+    },
+    {
+      "epoch": 0.4030462800234329,
+      "grad_norm": 0.0026635443791747093,
+      "learning_rate": 2.9964747356051703e-06,
+      "loss": 0.0001,
+      "step": 344
+    },
+    {
+      "epoch": 0.4053895723491506,
+      "grad_norm": 0.0001533351169200614,
+      "learning_rate": 2.9847238542890723e-06,
+      "loss": 0.0,
+      "step": 346
+    },
+    {
+      "epoch": 0.4077328646748682,
+      "grad_norm": 0.0835270956158638,
+      "learning_rate": 2.9729729729729736e-06,
+      "loss": 0.0005,
+      "step": 348
+    },
+    {
+      "epoch": 0.4100761570005858,
+      "grad_norm": 0.003761101048439741,
+      "learning_rate": 2.9612220916568744e-06,
+      "loss": 0.0,
+      "step": 350
+    },
+    {
+      "epoch": 0.4124194493263035,
+      "grad_norm": 0.01136633288115263,
+      "learning_rate": 2.9494712103407756e-06,
+      "loss": 0.0002,
+      "step": 352
+    },
+    {
+      "epoch": 0.4147627416520211,
+      "grad_norm": 0.007711971178650856,
+      "learning_rate": 2.937720329024677e-06,
+      "loss": 0.0001,
+      "step": 354
+    },
+    {
+      "epoch": 0.4171060339777387,
+      "grad_norm": 0.0003854953683912754,
+      "learning_rate": 2.925969447708578e-06,
+      "loss": 0.0,
+      "step": 356
+    },
+    {
+      "epoch": 0.4194493263034564,
+      "grad_norm": 0.019140860065817833,
+      "learning_rate": 2.91421856639248e-06,
+      "loss": 0.0001,
+      "step": 358
+    },
+    {
+      "epoch": 0.421792618629174,
+      "grad_norm": 0.0013410028768703341,
+      "learning_rate": 2.902467685076381e-06,
+      "loss": 0.0003,
+      "step": 360
+    },
+    {
+      "epoch": 0.4241359109548916,
+      "grad_norm": 0.0011243935441598296,
+      "learning_rate": 2.890716803760282e-06,
+      "loss": 0.0001,
+      "step": 362
+    },
+    {
+      "epoch": 0.42647920328060923,
+      "grad_norm": 0.012134709395468235,
+      "learning_rate": 2.8789659224441834e-06,
+      "loss": 0.0001,
+      "step": 364
+    },
+    {
+      "epoch": 0.4288224956063269,
+      "grad_norm": 0.0028234529308974743,
+      "learning_rate": 2.8672150411280846e-06,
+      "loss": 0.0,
+      "step": 366
+    },
+    {
+      "epoch": 0.4311657879320445,
+      "grad_norm": 0.004319467581808567,
+      "learning_rate": 2.855464159811986e-06,
+      "loss": 0.0,
+      "step": 368
+    },
+    {
+      "epoch": 0.43350908025776214,
+      "grad_norm": 0.0068093533627688885,
+      "learning_rate": 2.8437132784958875e-06,
+      "loss": 0.0001,
+      "step": 370
+    },
+    {
+      "epoch": 0.4358523725834798,
+      "grad_norm": 0.016774361953139305,
+      "learning_rate": 2.8319623971797887e-06,
+      "loss": 0.0001,
+      "step": 372
+    },
+    {
+      "epoch": 0.4381956649091974,
+      "grad_norm": 0.014978869818150997,
+      "learning_rate": 2.82021151586369e-06,
+      "loss": 0.0001,
+      "step": 374
+    },
+    {
+      "epoch": 0.44053895723491504,
+      "grad_norm": 0.0010881100315600634,
+      "learning_rate": 2.808460634547591e-06,
+      "loss": 0.0004,
+      "step": 376
+    },
+    {
+      "epoch": 0.4428822495606327,
+      "grad_norm": 0.05522293969988823,
+      "learning_rate": 2.7967097532314924e-06,
+      "loss": 0.0002,
+      "step": 378
+    },
+    {
+      "epoch": 0.44522554188635033,
+      "grad_norm": 0.0027575818821787834,
+      "learning_rate": 2.784958871915394e-06,
+      "loss": 0.0,
+      "step": 380
+    },
+    {
+      "epoch": 0.44756883421206795,
+      "grad_norm": 0.0006020054570399225,
+      "learning_rate": 2.7732079905992952e-06,
+      "loss": 0.0005,
+      "step": 382
+    },
+    {
+      "epoch": 0.44991212653778556,
+      "grad_norm": 0.0025616425555199385,
+      "learning_rate": 2.7614571092831965e-06,
+      "loss": 0.0,
+      "step": 384
+    },
+    {
+      "epoch": 0.45225541886350323,
+      "grad_norm": 0.0018823420396074653,
+      "learning_rate": 2.7497062279670977e-06,
+      "loss": 0.0,
+      "step": 386
+    },
+    {
+      "epoch": 0.45459871118922085,
+      "grad_norm": 0.003241207217797637,
+      "learning_rate": 2.737955346650999e-06,
+      "loss": 0.0,
+      "step": 388
+    },
+    {
+      "epoch": 0.45694200351493847,
+      "grad_norm": 0.0010485474485903978,
+      "learning_rate": 2.7262044653349e-06,
+      "loss": 0.0002,
+      "step": 390
+    },
+    {
+      "epoch": 0.45928529584065614,
+      "grad_norm": 0.013366922736167908,
+      "learning_rate": 2.714453584018802e-06,
+      "loss": 0.0001,
+      "step": 392
+    },
+    {
+      "epoch": 0.46162858816637375,
+      "grad_norm": 0.0005886501166969538,
+      "learning_rate": 2.702702702702703e-06,
+      "loss": 0.0,
+      "step": 394
+    },
+    {
+      "epoch": 0.46397188049209137,
+      "grad_norm": 7.603697304148227e-05,
+      "learning_rate": 2.6909518213866042e-06,
+      "loss": 0.0,
+      "step": 396
+    },
+    {
+      "epoch": 0.46631517281780904,
+      "grad_norm": 0.000614571908954531,
+      "learning_rate": 2.6792009400705055e-06,
+      "loss": 0.0023,
+      "step": 398
+    },
+    {
+      "epoch": 0.46865846514352666,
+      "grad_norm": 0.046423882246017456,
+      "learning_rate": 2.6674500587544067e-06,
+      "loss": 0.0002,
+      "step": 400
+    },
+    {
+      "epoch": 0.4710017574692443,
+      "grad_norm": 0.0005994020029902458,
+      "learning_rate": 2.655699177438308e-06,
+      "loss": 0.0,
+      "step": 402
+    },
+    {
+      "epoch": 0.47334504979496195,
+      "grad_norm": 0.011609828099608421,
+      "learning_rate": 2.6439482961222096e-06,
+      "loss": 0.0001,
+      "step": 404
+    },
+    {
+      "epoch": 0.47568834212067956,
+      "grad_norm": 0.007135775871574879,
+      "learning_rate": 2.632197414806111e-06,
+      "loss": 0.0002,
+      "step": 406
+    },
+    {
+      "epoch": 0.4780316344463972,
+      "grad_norm": 0.0028773818630725145,
+      "learning_rate": 2.620446533490012e-06,
+      "loss": 0.0,
+      "step": 408
+    },
+    {
+      "epoch": 0.4803749267721148,
+      "grad_norm": 0.13341404497623444,
+      "learning_rate": 2.6086956521739132e-06,
+      "loss": 0.0008,
+      "step": 410
+    },
+    {
+      "epoch": 0.48271821909783247,
+      "grad_norm": 0.03130058944225311,
+      "learning_rate": 2.5969447708578145e-06,
+      "loss": 0.0001,
+      "step": 412
+    },
+    {
+      "epoch": 0.4850615114235501,
+      "grad_norm": 0.006637818645685911,
+      "learning_rate": 2.5851938895417157e-06,
+      "loss": 0.0001,
+      "step": 414
+    },
+    {
+      "epoch": 0.4874048037492677,
+      "grad_norm": 0.0006390800117515028,
+      "learning_rate": 2.5734430082256173e-06,
+      "loss": 0.0001,
+      "step": 416
+    },
+    {
+      "epoch": 0.4897480960749854,
+      "grad_norm": 0.02106345072388649,
+      "learning_rate": 2.5616921269095186e-06,
+      "loss": 0.0002,
+      "step": 418
+    },
+    {
+      "epoch": 0.492091388400703,
+      "grad_norm": 0.0009213433368131518,
+      "learning_rate": 2.5499412455934198e-06,
+      "loss": 0.0001,
+      "step": 420
+    },
+    {
+      "epoch": 0.4944346807264206,
+      "grad_norm": 2.5962471961975098,
+      "learning_rate": 2.538190364277321e-06,
+      "loss": 0.1436,
+      "step": 422
+    },
+    {
+      "epoch": 0.4967779730521383,
+      "grad_norm": 0.009386847727000713,
+      "learning_rate": 2.5264394829612222e-06,
+      "loss": 0.0001,
+      "step": 424
+    },
+    {
+      "epoch": 0.4991212653778559,
+      "grad_norm": 0.01308267842978239,
+      "learning_rate": 2.514688601645124e-06,
+      "loss": 0.0001,
+      "step": 426
+    },
+    {
+      "epoch": 0.5014645577035736,
+      "grad_norm": 0.006409250665456057,
+      "learning_rate": 2.502937720329025e-06,
+      "loss": 0.0,
+      "step": 428
+    },
+    {
+      "epoch": 0.5038078500292912,
+      "grad_norm": 0.0018047624034807086,
+      "learning_rate": 2.4911868390129263e-06,
+      "loss": 0.0001,
+      "step": 430
+    },
+    {
+      "epoch": 0.5061511423550088,
+      "grad_norm": 0.007056268397718668,
+      "learning_rate": 2.4794359576968276e-06,
+      "loss": 0.0,
+      "step": 432
+    },
+    {
+      "epoch": 0.5084944346807264,
+      "grad_norm": 2.4651243686676025,
+      "learning_rate": 2.4676850763807288e-06,
+      "loss": 0.0245,
+      "step": 434
+    },
+    {
+      "epoch": 0.510837727006444,
+      "grad_norm": 0.0025760605931282043,
+      "learning_rate": 2.45593419506463e-06,
+      "loss": 0.0,
+      "step": 436
+    },
+    {
+      "epoch": 0.5131810193321616,
+      "grad_norm": 0.059660654515028,
+      "learning_rate": 2.4441833137485312e-06,
+      "loss": 0.0003,
+      "step": 438
+    },
+    {
+      "epoch": 0.5155243116578794,
+      "grad_norm": 0.032668206840753555,
+      "learning_rate": 2.432432432432433e-06,
+      "loss": 0.0002,
+      "step": 440
+    },
+    {
+      "epoch": 0.517867603983597,
+      "grad_norm": 0.002476097084581852,
+      "learning_rate": 2.420681551116334e-06,
+      "loss": 0.0,
+      "step": 442
+    },
+    {
+      "epoch": 0.5202108963093146,
+      "grad_norm": 0.0005356927285902202,
+      "learning_rate": 2.4089306698002353e-06,
+      "loss": 0.0,
+      "step": 444
+    },
+    {
+      "epoch": 0.5225541886350322,
+      "grad_norm": 0.01949264481663704,
+      "learning_rate": 2.3971797884841366e-06,
+      "loss": 0.0001,
+      "step": 446
+    },
+    {
+      "epoch": 0.5248974809607498,
+      "grad_norm": 0.4609091281890869,
+      "learning_rate": 2.3854289071680378e-06,
+      "loss": 0.0013,
+      "step": 448
+    },
+    {
+      "epoch": 0.5272407732864675,
+      "grad_norm": 0.002268969314172864,
+      "learning_rate": 2.373678025851939e-06,
+      "loss": 0.027,
+      "step": 450
+    },
+    {
+      "epoch": 0.5295840656121851,
+      "grad_norm": 0.42679542303085327,
+      "learning_rate": 2.3619271445358407e-06,
+      "loss": 0.002,
+      "step": 452
+    },
+    {
+      "epoch": 0.5319273579379028,
+      "grad_norm": 0.030775954946875572,
+      "learning_rate": 2.350176263219742e-06,
+      "loss": 0.0001,
+      "step": 454
+    },
+    {
+      "epoch": 0.5342706502636204,
+      "grad_norm": 0.006208465900272131,
+      "learning_rate": 2.3384253819036427e-06,
+      "loss": 0.0001,
+      "step": 456
+    },
+    {
+      "epoch": 0.536613942589338,
+      "grad_norm": 0.001203950378112495,
+      "learning_rate": 2.3266745005875443e-06,
+      "loss": 0.0,
+      "step": 458
+    },
+    {
+      "epoch": 0.5389572349150556,
+      "grad_norm": 0.0013062539510428905,
+      "learning_rate": 2.3149236192714456e-06,
+      "loss": 0.0001,
+      "step": 460
+    },
+    {
+      "epoch": 0.5413005272407733,
+      "grad_norm": 0.014242034405469894,
+      "learning_rate": 2.3031727379553468e-06,
+      "loss": 0.0001,
+      "step": 462
+    },
+    {
+      "epoch": 0.5436438195664909,
+      "grad_norm": 0.0024558689910918474,
+      "learning_rate": 2.291421856639248e-06,
+      "loss": 0.0,
+      "step": 464
+    },
+    {
+      "epoch": 0.5459871118922085,
+      "grad_norm": 0.006871205288916826,
+      "learning_rate": 2.2796709753231492e-06,
+      "loss": 0.0,
+      "step": 466
+    },
+    {
+      "epoch": 0.5483304042179262,
+      "grad_norm": 0.016744021326303482,
+      "learning_rate": 2.2679200940070505e-06,
+      "loss": 0.0001,
+      "step": 468
+    },
+    {
+      "epoch": 0.5506736965436438,
+      "grad_norm": 0.0025478950701653957,
+      "learning_rate": 2.256169212690952e-06,
+      "loss": 0.0,
+      "step": 470
+    },
+    {
+      "epoch": 0.5530169888693615,
+      "grad_norm": 0.002553507685661316,
+      "learning_rate": 2.2444183313748533e-06,
+      "loss": 0.0,
+      "step": 472
+    },
+    {
+      "epoch": 0.5553602811950791,
+      "grad_norm": 0.0018396044615656137,
+      "learning_rate": 2.2326674500587546e-06,
+      "loss": 0.0002,
+      "step": 474
+    },
+    {
+      "epoch": 0.5577035735207967,
+      "grad_norm": 0.002036860678344965,
+      "learning_rate": 2.2209165687426558e-06,
+      "loss": 0.0,
+      "step": 476
+    },
+    {
+      "epoch": 0.5600468658465143,
+      "grad_norm": 0.0024688418488949537,
+      "learning_rate": 2.209165687426557e-06,
+      "loss": 0.0,
+      "step": 478
+    },
+    {
+      "epoch": 0.562390158172232,
+      "grad_norm": 0.0028820293955504894,
+      "learning_rate": 2.1974148061104587e-06,
+      "loss": 0.0001,
+      "step": 480
+    },
+    {
+      "epoch": 0.5647334504979497,
+      "grad_norm": 0.00978305283933878,
+      "learning_rate": 2.18566392479436e-06,
+      "loss": 0.0001,
+      "step": 482
+    },
+    {
+      "epoch": 0.5670767428236673,
+      "grad_norm": 0.147267147898674,
+      "learning_rate": 2.173913043478261e-06,
+      "loss": 0.0014,
+      "step": 484
+    },
+    {
+      "epoch": 0.5694200351493849,
+      "grad_norm": 0.005025573540478945,
+      "learning_rate": 2.1621621621621623e-06,
+      "loss": 0.0006,
+      "step": 486
+    },
+    {
+      "epoch": 0.5717633274751025,
+      "grad_norm": 0.0010051846038550138,
+      "learning_rate": 2.1504112808460636e-06,
+      "loss": 0.0003,
+      "step": 488
+    },
+    {
+      "epoch": 0.5741066198008201,
+      "grad_norm": 0.009055075235664845,
+      "learning_rate": 2.1386603995299648e-06,
+      "loss": 0.0001,
+      "step": 490
+    },
+    {
+      "epoch": 0.5764499121265377,
+      "grad_norm": 0.0077414545230567455,
+      "learning_rate": 2.1269095182138664e-06,
+      "loss": 0.0001,
+      "step": 492
+    },
+    {
+      "epoch": 0.5787932044522555,
+      "grad_norm": 0.0059761228039860725,
+      "learning_rate": 2.1151586368977677e-06,
+      "loss": 0.0001,
+      "step": 494
+    },
+    {
+      "epoch": 0.5811364967779731,
+      "grad_norm": 0.0014180493308231235,
+      "learning_rate": 2.103407755581669e-06,
+      "loss": 0.0,
+      "step": 496
+    },
+    {
+      "epoch": 0.5834797891036907,
+      "grad_norm": 0.0022345769684761763,
+      "learning_rate": 2.09165687426557e-06,
+      "loss": 0.0,
+      "step": 498
+    },
+    {
+      "epoch": 0.5858230814294083,
+      "grad_norm": 0.005645833443850279,
+      "learning_rate": 2.0799059929494713e-06,
+      "loss": 0.0001,
+      "step": 500
+    },
+    {
+      "epoch": 0.5881663737551259,
+      "grad_norm": 0.011956258676946163,
+      "learning_rate": 2.0681551116333726e-06,
+      "loss": 0.0001,
+      "step": 502
+    },
+    {
+      "epoch": 0.5905096660808435,
+      "grad_norm": 0.01774289458990097,
+      "learning_rate": 2.056404230317274e-06,
+      "loss": 0.0002,
+      "step": 504
+    },
+    {
+      "epoch": 0.5928529584065613,
+      "grad_norm": 0.21751126646995544,
+      "learning_rate": 2.0446533490011754e-06,
+      "loss": 0.0012,
+      "step": 506
+    },
+    {
+      "epoch": 0.5951962507322789,
+      "grad_norm": 0.00307491235435009,
+      "learning_rate": 2.0329024676850762e-06,
+      "loss": 0.0,
+      "step": 508
+    },
+    {
+      "epoch": 0.5975395430579965,
+      "grad_norm": 0.021330738440155983,
+      "learning_rate": 2.021151586368978e-06,
+      "loss": 0.0002,
+      "step": 510
+    },
+    {
+      "epoch": 0.5998828353837141,
+      "grad_norm": 0.020080704241991043,
+      "learning_rate": 2.009400705052879e-06,
+      "loss": 0.0001,
+      "step": 512
+    },
+    {
+      "epoch": 0.6022261277094317,
+      "grad_norm": 0.020522406324744225,
+      "learning_rate": 1.9976498237367803e-06,
+      "loss": 0.0002,
+      "step": 514
+    },
+    {
+      "epoch": 0.6045694200351494,
+      "grad_norm": 0.0004171329492237419,
+      "learning_rate": 1.985898942420682e-06,
+      "loss": 0.0,
+      "step": 516
+    },
+    {
+      "epoch": 0.606912712360867,
+      "grad_norm": 0.0027696220204234123,
+      "learning_rate": 1.9741480611045828e-06,
+      "loss": 0.0,
+      "step": 518
+    },
+    {
+      "epoch": 0.6092560046865847,
+      "grad_norm": 0.021467505022883415,
+      "learning_rate": 1.9623971797884844e-06,
+      "loss": 0.0002,
+      "step": 520
+    },
+    {
+      "epoch": 0.6115992970123023,
+      "grad_norm": 0.011968536302447319,
+      "learning_rate": 1.9506462984723856e-06,
+      "loss": 0.0001,
+      "step": 522
+    },
+    {
+      "epoch": 0.6139425893380199,
+      "grad_norm": 0.0011503971181809902,
+      "learning_rate": 1.938895417156287e-06,
+      "loss": 0.0004,
+      "step": 524
+    },
+    {
+      "epoch": 0.6162858816637375,
+      "grad_norm": 0.02280554361641407,
+      "learning_rate": 1.927144535840188e-06,
+      "loss": 0.0002,
+      "step": 526
+    },
+    {
+      "epoch": 0.6186291739894552,
+      "grad_norm": 0.008415359072387218,
+      "learning_rate": 1.9153936545240893e-06,
+      "loss": 0.0001,
+      "step": 528
+    },
+    {
+      "epoch": 0.6209724663151728,
+      "grad_norm": 0.0024012764915823936,
+      "learning_rate": 1.9036427732079908e-06,
+      "loss": 0.0001,
+      "step": 530
+    },
+    {
+      "epoch": 0.6233157586408905,
+      "grad_norm": 0.010776808485388756,
+      "learning_rate": 1.8918918918918922e-06,
+      "loss": 0.0001,
+      "step": 532
+    },
+    {
+      "epoch": 0.6256590509666081,
+      "grad_norm": 0.017337538301944733,
+      "learning_rate": 1.8801410105757934e-06,
+      "loss": 0.0001,
+      "step": 534
+    },
+    {
+      "epoch": 0.6280023432923257,
+      "grad_norm": 0.0019926901441067457,
+      "learning_rate": 1.8683901292596946e-06,
+      "loss": 0.0001,
+      "step": 536
+    },
+    {
+      "epoch": 0.6303456356180434,
+      "grad_norm": 0.013480707071721554,
+      "learning_rate": 1.856639247943596e-06,
+      "loss": 0.0002,
+      "step": 538
+    },
+    {
+      "epoch": 0.632688927943761,
+      "grad_norm": 0.005608106963336468,
+      "learning_rate": 1.8448883666274973e-06,
+      "loss": 0.0002,
+      "step": 540
+    },
+    {
+      "epoch": 0.6350322202694786,
+      "grad_norm": 0.002639380283653736,
+      "learning_rate": 1.8331374853113983e-06,
+      "loss": 0.0001,
+      "step": 542
+    },
+    {
+      "epoch": 0.6373755125951962,
+      "grad_norm": 0.0022652854677289724,
+      "learning_rate": 1.8213866039953e-06,
+      "loss": 0.0002,
+      "step": 544
+    },
+    {
+      "epoch": 0.6397188049209139,
+      "grad_norm": 0.003624632954597473,
+      "learning_rate": 1.809635722679201e-06,
+      "loss": 0.0001,
+      "step": 546
+    },
+    {
+      "epoch": 0.6420620972466315,
+      "grad_norm": 0.007647163700312376,
+      "learning_rate": 1.7978848413631022e-06,
+      "loss": 0.0004,
+      "step": 548
+    },
+    {
+      "epoch": 0.6444053895723492,
+      "grad_norm": 0.012163680978119373,
+      "learning_rate": 1.7861339600470036e-06,
+      "loss": 0.0002,
+      "step": 550
+    },
+    {
+      "epoch": 0.6467486818980668,
+      "grad_norm": 0.09023822844028473,
+      "learning_rate": 1.7743830787309049e-06,
+      "loss": 0.0009,
+      "step": 552
+    },
+    {
+      "epoch": 0.6490919742237844,
+      "grad_norm": 0.006924999412149191,
+      "learning_rate": 1.762632197414806e-06,
+      "loss": 0.0001,
+      "step": 554
+    },
+    {
+      "epoch": 0.651435266549502,
+      "grad_norm": 0.0006185275269672275,
+      "learning_rate": 1.7508813160987075e-06,
+      "loss": 0.0001,
+      "step": 556
+    },
+    {
+      "epoch": 0.6537785588752196,
+      "grad_norm": 0.011605402454733849,
+      "learning_rate": 1.7391304347826088e-06,
+      "loss": 0.0006,
+      "step": 558
+    },
+    {
+      "epoch": 0.6561218512009374,
+      "grad_norm": 0.024394473060965538,
+      "learning_rate": 1.7273795534665102e-06,
+      "loss": 0.0001,
+      "step": 560
+    },
+    {
+      "epoch": 0.658465143526655,
+      "grad_norm": 0.023466341197490692,
+      "learning_rate": 1.7156286721504114e-06,
+      "loss": 0.0002,
+      "step": 562
+    },
+    {
+      "epoch": 0.6608084358523726,
+      "grad_norm": 0.010153519921004772,
+      "learning_rate": 1.7038777908343126e-06,
+      "loss": 0.0004,
+      "step": 564
+    },
+    {
+      "epoch": 0.6631517281780902,
+      "grad_norm": 0.43800845742225647,
+      "learning_rate": 1.692126909518214e-06,
+      "loss": 0.0012,
+      "step": 566
+    },
+    {
+      "epoch": 0.6654950205038078,
+      "grad_norm": 0.008404972031712532,
+      "learning_rate": 1.6803760282021153e-06,
+      "loss": 0.0001,
+      "step": 568
+    },
+    {
+      "epoch": 0.6678383128295254,
+      "grad_norm": 0.10615257918834686,
+      "learning_rate": 1.6686251468860165e-06,
+      "loss": 0.0005,
+      "step": 570
+    },
+    {
+      "epoch": 0.6701816051552432,
+      "grad_norm": 0.019307592883706093,
+      "learning_rate": 1.656874265569918e-06,
+      "loss": 0.0003,
+      "step": 572
+    },
+    {
+      "epoch": 0.6725248974809608,
+      "grad_norm": 0.012227280996739864,
+      "learning_rate": 1.6451233842538192e-06,
+      "loss": 0.0002,
+      "step": 574
+    },
+    {
+      "epoch": 0.6748681898066784,
+      "grad_norm": 0.002821948379278183,
+      "learning_rate": 1.6333725029377204e-06,
+      "loss": 0.0,
+      "step": 576
+    },
+    {
+      "epoch": 0.677211482132396,
+      "grad_norm": 0.010473825968801975,
+      "learning_rate": 1.6216216216216219e-06,
+      "loss": 0.0003,
+      "step": 578
+    },
+    {
+      "epoch": 0.6795547744581136,
+      "grad_norm": 0.014046385884284973,
+      "learning_rate": 1.609870740305523e-06,
+      "loss": 0.0236,
+      "step": 580
+    },
+    {
+      "epoch": 0.6818980667838312,
+      "grad_norm": 0.0017795696621760726,
+      "learning_rate": 1.5981198589894243e-06,
+      "loss": 0.0001,
+      "step": 582
+    },
+    {
+      "epoch": 0.6842413591095489,
+      "grad_norm": 0.0006959863239899278,
+      "learning_rate": 1.5863689776733257e-06,
+      "loss": 0.0002,
+      "step": 584
+    },
+    {
+      "epoch": 0.6865846514352666,
+      "grad_norm": 0.019652947783470154,
+      "learning_rate": 1.574618096357227e-06,
+      "loss": 0.0003,
+      "step": 586
+    },
+    {
+      "epoch": 0.6889279437609842,
+      "grad_norm": 0.002340570092201233,
+      "learning_rate": 1.5628672150411282e-06,
+      "loss": 0.0,
+      "step": 588
+    },
+    {
+      "epoch": 0.6912712360867018,
+      "grad_norm": 0.011190817691385746,
+      "learning_rate": 1.5511163337250296e-06,
+      "loss": 0.0002,
+      "step": 590
+    },
+    {
+      "epoch": 0.6936145284124194,
+      "grad_norm": 0.001152676297351718,
+      "learning_rate": 1.5393654524089308e-06,
+      "loss": 0.0001,
+      "step": 592
+    },
+    {
+      "epoch": 0.6959578207381371,
+      "grad_norm": 0.003393592080101371,
+      "learning_rate": 1.5276145710928319e-06,
+      "loss": 0.0001,
+      "step": 594
+    },
+    {
+      "epoch": 0.6983011130638547,
+      "grad_norm": 0.007921353913843632,
+      "learning_rate": 1.5158636897767335e-06,
+      "loss": 0.0001,
+      "step": 596
+    },
+    {
+      "epoch": 0.7006444053895724,
+      "grad_norm": 0.1039208471775055,
+      "learning_rate": 1.5041128084606345e-06,
+      "loss": 0.0002,
+      "step": 598
+    },
+    {
+      "epoch": 0.70298769771529,
+      "grad_norm": 0.0011576958931982517,
+      "learning_rate": 1.4923619271445362e-06,
+      "loss": 0.0001,
+      "step": 600
+    },
+    {
+      "epoch": 0.7053309900410076,
+      "grad_norm": 0.06407307088375092,
+      "learning_rate": 1.4806110458284372e-06,
+      "loss": 0.0003,
+      "step": 602
+    },
+    {
+      "epoch": 0.7076742823667252,
+      "grad_norm": 0.012639104388654232,
+      "learning_rate": 1.4688601645123384e-06,
+      "loss": 0.0002,
+      "step": 604
+    },
+    {
+      "epoch": 0.7100175746924429,
+      "grad_norm": 0.0019591290038079023,
+      "learning_rate": 1.45710928319624e-06,
+      "loss": 0.0068,
+      "step": 606
+    },
+    {
+      "epoch": 0.7123608670181605,
+      "grad_norm": 0.0008327167597599328,
+      "learning_rate": 1.445358401880141e-06,
+      "loss": 0.0001,
+      "step": 608
+    },
+    {
+      "epoch": 0.7147041593438781,
+      "grad_norm": 0.0013139324728399515,
+      "learning_rate": 1.4336075205640423e-06,
+      "loss": 0.0,
+      "step": 610
+    },
+    {
+      "epoch": 0.7170474516695958,
+      "grad_norm": 0.00803992711007595,
+      "learning_rate": 1.4218566392479437e-06,
+      "loss": 0.0002,
+      "step": 612
+    },
+    {
+      "epoch": 0.7193907439953134,
+      "grad_norm": 0.011399227194488049,
+      "learning_rate": 1.410105757931845e-06,
+      "loss": 0.0002,
+      "step": 614
+    },
+    {
+      "epoch": 0.7217340363210311,
+      "grad_norm": 0.007171169854700565,
+      "learning_rate": 1.3983548766157462e-06,
+      "loss": 0.0002,
+      "step": 616
+    },
+    {
+      "epoch": 0.7240773286467487,
+      "grad_norm": 0.7272996306419373,
+      "learning_rate": 1.3866039952996476e-06,
+      "loss": 0.0028,
+      "step": 618
+    },
+    {
+      "epoch": 0.7264206209724663,
+      "grad_norm": 0.0037387118209153414,
+      "learning_rate": 1.3748531139835488e-06,
+      "loss": 0.0001,
+      "step": 620
+    },
+    {
+      "epoch": 0.7287639132981839,
+      "grad_norm": 0.015048849396407604,
+      "learning_rate": 1.36310223266745e-06,
+      "loss": 0.0002,
+      "step": 622
+    },
+    {
+      "epoch": 0.7311072056239016,
+      "grad_norm": 0.0023705060593783855,
+      "learning_rate": 1.3513513513513515e-06,
+      "loss": 0.0001,
+      "step": 624
+    },
+    {
+      "epoch": 0.7334504979496193,
+      "grad_norm": 0.03966263309121132,
+      "learning_rate": 1.3396004700352527e-06,
+      "loss": 0.0003,
+      "step": 626
+    },
+    {
+      "epoch": 0.7357937902753369,
+      "grad_norm": 0.0033043306320905685,
+      "learning_rate": 1.327849588719154e-06,
+      "loss": 0.0004,
+      "step": 628
+    },
+    {
+      "epoch": 0.7381370826010545,
+      "grad_norm": 0.35459718108177185,
+      "learning_rate": 1.3160987074030554e-06,
+      "loss": 0.0034,
+      "step": 630
+    },
+    {
+      "epoch": 0.7404803749267721,
+      "grad_norm": 0.016441915184259415,
+      "learning_rate": 1.3043478260869566e-06,
+      "loss": 0.0002,
+      "step": 632
+    },
+    {
+      "epoch": 0.7428236672524897,
+      "grad_norm": 0.0045352657325565815,
+      "learning_rate": 1.2925969447708578e-06,
+      "loss": 0.0002,
+      "step": 634
+    },
+    {
+      "epoch": 0.7451669595782073,
+      "grad_norm": 0.06311573088169098,
+      "learning_rate": 1.2808460634547593e-06,
+      "loss": 0.0005,
+      "step": 636
+    },
+    {
+      "epoch": 0.7475102519039251,
+      "grad_norm": 0.11154340207576752,
+      "learning_rate": 1.2690951821386605e-06,
+      "loss": 0.0009,
+      "step": 638
+    },
+    {
+      "epoch": 0.7498535442296427,
+      "grad_norm": 0.01816423609852791,
+      "learning_rate": 1.257344300822562e-06,
+      "loss": 0.0006,
+      "step": 640
+    },
+    {
+      "epoch": 0.7521968365553603,
+      "grad_norm": 0.027273530140519142,
+      "learning_rate": 1.2455934195064632e-06,
+      "loss": 0.0005,
+      "step": 642
+    },
+    {
+      "epoch": 0.7545401288810779,
+      "grad_norm": 0.006555743515491486,
+      "learning_rate": 1.2338425381903644e-06,
+      "loss": 0.0003,
+      "step": 644
+    },
+    {
+      "epoch": 0.7568834212067955,
+      "grad_norm": 0.0030812753830105066,
+      "learning_rate": 1.2220916568742656e-06,
+      "loss": 0.0279,
+      "step": 646
+    },
+    {
+      "epoch": 0.7592267135325131,
+      "grad_norm": 0.01702543906867504,
+      "learning_rate": 1.210340775558167e-06,
+      "loss": 0.0001,
+      "step": 648
+    },
+    {
+      "epoch": 0.7615700058582309,
+      "grad_norm": 0.02607725001871586,
+      "learning_rate": 1.1985898942420683e-06,
+      "loss": 0.0001,
+      "step": 650
+    },
+    {
+      "epoch": 0.7639132981839485,
+      "grad_norm": 0.006388965994119644,
+      "learning_rate": 1.1868390129259695e-06,
+      "loss": 0.0001,
+      "step": 652
+    },
+    {
+      "epoch": 0.7662565905096661,
+      "grad_norm": 0.008253968320786953,
+      "learning_rate": 1.175088131609871e-06,
+      "loss": 0.0001,
+      "step": 654
+    },
+    {
+      "epoch": 0.7685998828353837,
+      "grad_norm": 0.004699599463492632,
+      "learning_rate": 1.1633372502937722e-06,
+      "loss": 0.0002,
+      "step": 656
+    },
+    {
+      "epoch": 0.7709431751611013,
+      "grad_norm": 0.0012458263663575053,
+      "learning_rate": 1.1515863689776734e-06,
+      "loss": 0.0122,
+      "step": 658
+    },
+    {
+      "epoch": 0.773286467486819,
+      "grad_norm": 0.02383268252015114,
+      "learning_rate": 1.1398354876615746e-06,
+      "loss": 0.0003,
+      "step": 660
+    },
+    {
+      "epoch": 0.7756297598125366,
+      "grad_norm": 0.015058089047670364,
+      "learning_rate": 1.128084606345476e-06,
+      "loss": 0.0001,
+      "step": 662
+    },
+    {
+      "epoch": 0.7779730521382543,
+      "grad_norm": 0.01569475792348385,
+      "learning_rate": 1.1163337250293773e-06,
+      "loss": 0.0003,
+      "step": 664
+    },
+    {
+      "epoch": 0.7803163444639719,
+      "grad_norm": 0.04253750294446945,
+      "learning_rate": 1.1045828437132785e-06,
+      "loss": 0.0002,
+      "step": 666
+    },
+    {
+      "epoch": 0.7826596367896895,
+      "grad_norm": 0.015156907960772514,
+      "learning_rate": 1.09283196239718e-06,
+      "loss": 0.0002,
+      "step": 668
+    },
+    {
+      "epoch": 0.7850029291154071,
+      "grad_norm": 0.03742622211575508,
+      "learning_rate": 1.0810810810810812e-06,
+      "loss": 0.0005,
+      "step": 670
+    },
+    {
+      "epoch": 0.7873462214411248,
+      "grad_norm": 0.027262985706329346,
+      "learning_rate": 1.0693301997649824e-06,
+      "loss": 0.0002,
+      "step": 672
+    },
+    {
+      "epoch": 0.7896895137668424,
+      "grad_norm": 0.007641313597559929,
+      "learning_rate": 1.0575793184488838e-06,
+      "loss": 0.0002,
+      "step": 674
+    },
+    {
+      "epoch": 0.79203280609256,
+      "grad_norm": 0.04441560059785843,
+      "learning_rate": 1.045828437132785e-06,
+      "loss": 0.0005,
+      "step": 676
+    },
+    {
+      "epoch": 0.7943760984182777,
+      "grad_norm": 0.020478103309869766,
+      "learning_rate": 1.0340775558166863e-06,
+      "loss": 0.0002,
+      "step": 678
+    },
+    {
+      "epoch": 0.7967193907439953,
+      "grad_norm": 0.10936477035284042,
+      "learning_rate": 1.0223266745005877e-06,
+      "loss": 0.001,
+      "step": 680
+    },
+    {
+      "epoch": 0.799062683069713,
+      "grad_norm": 0.01284460723400116,
+      "learning_rate": 1.010575793184489e-06,
+      "loss": 0.0015,
+      "step": 682
+    },
+    {
+      "epoch": 0.8014059753954306,
+      "grad_norm": 0.003440434578806162,
+      "learning_rate": 9.988249118683902e-07,
+      "loss": 0.0,
+      "step": 684
+    },
+    {
+      "epoch": 0.8037492677211482,
+      "grad_norm": 0.013081365264952183,
+      "learning_rate": 9.870740305522914e-07,
+      "loss": 0.0009,
+      "step": 686
+    },
+    {
+      "epoch": 0.8060925600468658,
+      "grad_norm": 0.013380183838307858,
+      "learning_rate": 9.753231492361928e-07,
+      "loss": 0.0002,
+      "step": 688
+    },
+    {
+      "epoch": 0.8084358523725835,
+      "grad_norm": 0.03771582618355751,
+      "learning_rate": 9.63572267920094e-07,
+      "loss": 0.0003,
+      "step": 690
+    },
+    {
+      "epoch": 0.8107791446983011,
+      "grad_norm": 0.0009556732256896794,
+      "learning_rate": 9.518213866039954e-07,
+      "loss": 0.0005,
+      "step": 692
+    },
+    {
+      "epoch": 0.8131224370240188,
+      "grad_norm": 0.0019481348572298884,
+      "learning_rate": 9.400705052878967e-07,
+      "loss": 0.0001,
+      "step": 694
+    },
+    {
+      "epoch": 0.8154657293497364,
+      "grad_norm": 0.0021866948809474707,
+      "learning_rate": 9.28319623971798e-07,
+      "loss": 0.0002,
+      "step": 696
+    },
+    {
+      "epoch": 0.817809021675454,
+      "grad_norm": 0.007546517997980118,
+      "learning_rate": 9.165687426556992e-07,
+      "loss": 0.0007,
+      "step": 698
+    },
+    {
+      "epoch": 0.8201523140011716,
+      "grad_norm": 2.074432611465454,
+      "learning_rate": 9.048178613396005e-07,
+      "loss": 0.0251,
+      "step": 700
+    },
+    {
+      "epoch": 0.8224956063268892,
+      "grad_norm": 0.003374068532139063,
+      "learning_rate": 8.930669800235018e-07,
+      "loss": 0.0001,
+      "step": 702
+    },
+    {
+      "epoch": 0.824838898652607,
+      "grad_norm": 0.010109562426805496,
+      "learning_rate": 8.81316098707403e-07,
+      "loss": 0.0006,
+      "step": 704
+    },
+    {
+      "epoch": 0.8271821909783246,
+      "grad_norm": 0.017352379858493805,
+      "learning_rate": 8.695652173913044e-07,
+      "loss": 0.0001,
+      "step": 706
+    },
+    {
+      "epoch": 0.8295254833040422,
+      "grad_norm": 0.016872087493538857,
+      "learning_rate": 8.578143360752057e-07,
+      "loss": 0.0002,
+      "step": 708
+    },
+    {
+      "epoch": 0.8318687756297598,
+      "grad_norm": 0.041937246918678284,
+      "learning_rate": 8.46063454759107e-07,
+      "loss": 0.0228,
+      "step": 710
+    },
+    {
+      "epoch": 0.8342120679554774,
+      "grad_norm": 0.02908233553171158,
+      "learning_rate": 8.343125734430083e-07,
+      "loss": 0.0002,
+      "step": 712
+    },
+    {
+      "epoch": 0.836555360281195,
+      "grad_norm": 0.0012463816674426198,
+      "learning_rate": 8.225616921269096e-07,
+      "loss": 0.0004,
+      "step": 714
+    },
+    {
+      "epoch": 0.8388986526069128,
+      "grad_norm": 0.04300675913691521,
+      "learning_rate": 8.108108108108109e-07,
+      "loss": 0.0006,
+      "step": 716
+    },
+    {
+      "epoch": 0.8412419449326304,
+      "grad_norm": 2.7622828483581543,
+      "learning_rate": 7.990599294947122e-07,
+      "loss": 0.149,
+      "step": 718
+    },
+    {
+      "epoch": 0.843585237258348,
+      "grad_norm": 0.010049765929579735,
+      "learning_rate": 7.873090481786135e-07,
+      "loss": 0.0002,
+      "step": 720
+    },
+    {
+      "epoch": 0.8459285295840656,
+      "grad_norm": 0.011876920238137245,
+      "learning_rate": 7.755581668625148e-07,
+      "loss": 0.0001,
+      "step": 722
+    },
+    {
+      "epoch": 0.8482718219097832,
+      "grad_norm": 0.014826681464910507,
+      "learning_rate": 7.638072855464159e-07,
+      "loss": 0.0003,
+      "step": 724
+    },
+    {
+      "epoch": 0.8506151142355008,
+      "grad_norm": 0.16368882358074188,
+      "learning_rate": 7.520564042303173e-07,
+      "loss": 0.0013,
+      "step": 726
+    },
+    {
+      "epoch": 0.8529584065612185,
+      "grad_norm": 0.02603282406926155,
+      "learning_rate": 7.403055229142186e-07,
+      "loss": 0.0004,
+      "step": 728
+    },
+    {
+      "epoch": 0.8553016988869362,
+      "grad_norm": 0.7740702629089355,
+      "learning_rate": 7.2855464159812e-07,
+      "loss": 0.0043,
+      "step": 730
+    },
+    {
+      "epoch": 0.8576449912126538,
+      "grad_norm": 0.010226438753306866,
+      "learning_rate": 7.168037602820211e-07,
+      "loss": 0.0002,
+      "step": 732
+    },
+    {
+      "epoch": 0.8599882835383714,
+      "grad_norm": 0.02008165791630745,
+      "learning_rate": 7.050528789659225e-07,
+      "loss": 0.0002,
+      "step": 734
+    },
+    {
+      "epoch": 0.862331575864089,
+      "grad_norm": 0.09208586066961288,
+      "learning_rate": 6.933019976498238e-07,
+      "loss": 0.0008,
+      "step": 736
+    },
+    {
+      "epoch": 0.8646748681898067,
+      "grad_norm": 0.01933148130774498,
+      "learning_rate": 6.81551116333725e-07,
+      "loss": 0.0011,
+      "step": 738
+    },
+    {
+      "epoch": 0.8670181605155243,
+      "grad_norm": 0.04433580860495567,
+      "learning_rate": 6.698002350176264e-07,
+      "loss": 0.0003,
+      "step": 740
+    },
+    {
+      "epoch": 0.869361452841242,
+      "grad_norm": 0.01631711982190609,
+      "learning_rate": 6.580493537015277e-07,
+      "loss": 0.0003,
+      "step": 742
+    },
+    {
+      "epoch": 0.8717047451669596,
+      "grad_norm": 0.042307399213314056,
+      "learning_rate": 6.462984723854289e-07,
+      "loss": 0.0004,
+      "step": 744
+    },
+    {
+      "epoch": 0.8740480374926772,
+      "grad_norm": 0.22414757311344147,
+      "learning_rate": 6.345475910693303e-07,
+      "loss": 0.0018,
+      "step": 746
+    },
+    {
+      "epoch": 0.8763913298183948,
+      "grad_norm": 0.17513447999954224,
+      "learning_rate": 6.227967097532316e-07,
+      "loss": 0.0015,
+      "step": 748
+    },
+    {
+      "epoch": 0.8787346221441125,
+      "grad_norm": 0.3218580186367035,
+      "learning_rate": 6.110458284371328e-07,
+      "loss": 0.0029,
+      "step": 750
+    },
+    {
+      "epoch": 0.8810779144698301,
+      "grad_norm": 0.026706017553806305,
+      "learning_rate": 5.992949471210341e-07,
+      "loss": 0.0004,
+      "step": 752
+    },
+    {
+      "epoch": 0.8834212067955477,
+      "grad_norm": 0.4114263951778412,
+      "learning_rate": 5.875440658049355e-07,
+      "loss": 0.0035,
+      "step": 754
+    },
+    {
+      "epoch": 0.8857644991212654,
+      "grad_norm": 0.25009235739707947,
+      "learning_rate": 5.757931844888367e-07,
+      "loss": 0.0016,
+      "step": 756
+    },
+    {
+      "epoch": 0.888107791446983,
+      "grad_norm": 1.2960833311080933,
+      "learning_rate": 5.64042303172738e-07,
+      "loss": 0.0059,
+      "step": 758
+    },
+    {
+      "epoch": 0.8904510837727007,
+      "grad_norm": 0.28417083621025085,
+      "learning_rate": 5.522914218566393e-07,
+      "loss": 0.0059,
+      "step": 760
+    },
+    {
+      "epoch": 0.8927943760984183,
+      "grad_norm": 0.2292051613330841,
+      "learning_rate": 5.405405405405406e-07,
+      "loss": 0.0015,
+      "step": 762
+    },
+    {
+      "epoch": 0.8951376684241359,
+      "grad_norm": 0.012189504690468311,
+      "learning_rate": 5.287896592244419e-07,
+      "loss": 0.0007,
+      "step": 764
+    },
+    {
+      "epoch": 0.8974809607498535,
+      "grad_norm": 0.09458251297473907,
+      "learning_rate": 5.170387779083431e-07,
+      "loss": 0.0004,
+      "step": 766
+    },
+    {
+      "epoch": 0.8998242530755711,
+      "grad_norm": 0.027070222422480583,
+      "learning_rate": 5.052878965922445e-07,
+      "loss": 0.0012,
+      "step": 768
+    },
+    {
+      "epoch": 0.9021675454012889,
+      "grad_norm": 0.047401878982782364,
+      "learning_rate": 4.935370152761457e-07,
+      "loss": 0.0003,
+      "step": 770
+    },
+    {
+      "epoch": 0.9045108377270065,
+      "grad_norm": 0.06239737570285797,
+      "learning_rate": 4.81786133960047e-07,
+      "loss": 0.0012,
+      "step": 772
+    },
+    {
+      "epoch": 0.9068541300527241,
+      "grad_norm": 2.6842846870422363,
+      "learning_rate": 4.7003525264394836e-07,
+      "loss": 0.1103,
+      "step": 774
+    },
+    {
+      "epoch": 0.9091974223784417,
+      "grad_norm": 0.057395774871110916,
+      "learning_rate": 4.582843713278496e-07,
+      "loss": 0.0004,
+      "step": 776
+    },
+    {
+      "epoch": 0.9115407147041593,
+      "grad_norm": 0.16248440742492676,
+      "learning_rate": 4.465334900117509e-07,
+      "loss": 0.0018,
+      "step": 778
+    },
+    {
+      "epoch": 0.9138840070298769,
+      "grad_norm": 0.11067284643650055,
+      "learning_rate": 4.347826086956522e-07,
+      "loss": 0.0011,
+      "step": 780
+    },
+    {
+      "epoch": 0.9162272993555947,
+      "grad_norm": 0.07208680361509323,
+      "learning_rate": 4.230317273795535e-07,
+      "loss": 0.0011,
+      "step": 782
+    },
+    {
+      "epoch": 0.9185705916813123,
+      "grad_norm": 0.4830150604248047,
+      "learning_rate": 4.112808460634548e-07,
+      "loss": 0.0022,
+      "step": 784
+    },
+    {
+      "epoch": 0.9209138840070299,
+      "grad_norm": 0.01794450171291828,
+      "learning_rate": 3.995299647473561e-07,
+      "loss": 0.0011,
+      "step": 786
+    },
+    {
+      "epoch": 0.9232571763327475,
+      "grad_norm": 3.0485081672668457,
+      "learning_rate": 3.877790834312574e-07,
+      "loss": 0.0508,
+      "step": 788
+    },
+    {
+      "epoch": 0.9256004686584651,
+      "grad_norm": 3.130112648010254,
+      "learning_rate": 3.7602820211515863e-07,
+      "loss": 0.0194,
+      "step": 790
+    },
+    {
+      "epoch": 0.9279437609841827,
+      "grad_norm": 3.5992815494537354,
+      "learning_rate": 3.6427732079906e-07,
+      "loss": 0.1036,
+      "step": 792
+    },
+    {
+      "epoch": 0.9302870533099004,
+      "grad_norm": 0.0751647800207138,
+      "learning_rate": 3.5252643948296124e-07,
+      "loss": 0.0003,
+      "step": 794
+    },
+    {
+      "epoch": 0.9326303456356181,
+      "grad_norm": 0.03622612729668617,
+      "learning_rate": 3.407755581668625e-07,
+      "loss": 0.0011,
+      "step": 796
+    },
+    {
+      "epoch": 0.9349736379613357,
+      "grad_norm": 0.22365981340408325,
+      "learning_rate": 3.2902467685076385e-07,
+      "loss": 0.0028,
+      "step": 798
+    },
+    {
+      "epoch": 0.9373169302870533,
+      "grad_norm": 0.04666091129183769,
+      "learning_rate": 3.172737955346651e-07,
+      "loss": 0.0041,
+      "step": 800
+    },
+    {
+      "epoch": 0.9396602226127709,
+      "grad_norm": 5.363467693328857,
+      "learning_rate": 3.055229142185664e-07,
+      "loss": 0.2217,
+      "step": 802
+    },
+    {
+      "epoch": 0.9420035149384886,
+      "grad_norm": 0.06753694266080856,
+      "learning_rate": 2.9377203290246774e-07,
+      "loss": 0.0026,
+      "step": 804
+    },
+    {
+      "epoch": 0.9443468072642062,
+      "grad_norm": 2.554419994354248,
+      "learning_rate": 2.82021151586369e-07,
+      "loss": 0.0791,
+      "step": 806
+    },
+    {
+      "epoch": 0.9466900995899239,
+      "grad_norm": 0.14563411474227905,
+      "learning_rate": 2.702702702702703e-07,
+      "loss": 0.0208,
+      "step": 808
+    },
+    {
+      "epoch": 0.9490333919156415,
+      "grad_norm": 2.30971360206604,
+      "learning_rate": 2.5851938895417157e-07,
+      "loss": 0.1119,
+      "step": 810
+    },
+    {
+      "epoch": 0.9513766842413591,
+      "grad_norm": 4.073694229125977,
+      "learning_rate": 2.4676850763807285e-07,
+      "loss": 0.1057,
+      "step": 812
+    },
+    {
+      "epoch": 0.9537199765670767,
+      "grad_norm": 2.3215789794921875,
+      "learning_rate": 2.3501762632197418e-07,
+      "loss": 0.0286,
+      "step": 814
+    },
+    {
+      "epoch": 0.9560632688927944,
+      "grad_norm": 0.46727773547172546,
+      "learning_rate": 2.2326674500587546e-07,
+      "loss": 0.0714,
+      "step": 816
+    },
+    {
+      "epoch": 0.958406561218512,
+      "grad_norm": 2.0026137828826904,
+      "learning_rate": 2.1151586368977676e-07,
+      "loss": 0.0455,
+      "step": 818
+    },
+    {
+      "epoch": 0.9607498535442296,
+      "grad_norm": 3.2537143230438232,
+      "learning_rate": 1.9976498237367804e-07,
+      "loss": 0.0765,
+      "step": 820
+    },
+    {
+      "epoch": 0.9630931458699473,
+      "grad_norm": 3.485633134841919,
+      "learning_rate": 1.8801410105757932e-07,
+      "loss": 0.0493,
+      "step": 822
+    },
+    {
+      "epoch": 0.9654364381956649,
+      "grad_norm": 2.769423246383667,
+      "learning_rate": 1.7626321974148062e-07,
+      "loss": 0.0602,
+      "step": 824
+    },
+    {
+      "epoch": 0.9677797305213826,
+      "grad_norm": 2.236210823059082,
+      "learning_rate": 1.6451233842538192e-07,
+      "loss": 0.1404,
+      "step": 826
+    },
+    {
+      "epoch": 0.9701230228471002,
+      "grad_norm": 0.06197360157966614,
+      "learning_rate": 1.527614571092832e-07,
+      "loss": 0.0472,
+      "step": 828
+    },
+    {
+      "epoch": 0.9724663151728178,
+      "grad_norm": 0.8206185698509216,
+      "learning_rate": 1.410105757931845e-07,
+      "loss": 0.0686,
+      "step": 830
+    },
+    {
+      "epoch": 0.9748096074985354,
+      "grad_norm": 2.434030771255493,
+      "learning_rate": 1.2925969447708578e-07,
+      "loss": 0.1322,
+      "step": 832
+    },
+    {
+      "epoch": 0.9771528998242531,
+      "grad_norm": 0.03143630549311638,
+      "learning_rate": 1.1750881316098709e-07,
+      "loss": 0.1134,
+      "step": 834
+    },
+    {
+      "epoch": 0.9794961921499707,
+      "grad_norm": 0.1770186424255371,
+      "learning_rate": 1.0575793184488838e-07,
+      "loss": 0.0011,
+      "step": 836
+    },
+    {
+      "epoch": 0.9818394844756884,
+      "grad_norm": 6.03350830078125,
+      "learning_rate": 9.400705052878966e-08,
+      "loss": 0.4193,
+      "step": 838
+    },
+    {
+      "epoch": 0.984182776801406,
+      "grad_norm": 4.842612266540527,
+      "learning_rate": 8.225616921269096e-08,
+      "loss": 0.0951,
+      "step": 840
+    },
+    {
+      "epoch": 0.9865260691271236,
+      "grad_norm": 3.111945629119873,
+      "learning_rate": 7.050528789659225e-08,
+      "loss": 0.1375,
+      "step": 842
+    },
+    {
+      "epoch": 0.9888693614528412,
+      "grad_norm": 3.4468753337860107,
+      "learning_rate": 5.8754406580493544e-08,
+      "loss": 0.157,
+      "step": 844
+    },
+    {
+      "epoch": 0.9912126537785588,
+      "grad_norm": 5.563467502593994,
+      "learning_rate": 4.700352526439483e-08,
+      "loss": 0.1989,
+      "step": 846
+    },
+    {
+      "epoch": 0.9935559461042766,
+      "grad_norm": 0.20900146663188934,
+      "learning_rate": 3.5252643948296127e-08,
+      "loss": 0.169,
+      "step": 848
+    },
+    {
+      "epoch": 0.9958992384299942,
+      "grad_norm": 2.651283025741577,
+      "learning_rate": 2.3501762632197414e-08,
+      "loss": 0.0203,
+      "step": 850
+    },
+    {
+      "epoch": 0.9982425307557118,
+      "grad_norm": 3.192451000213623,
+      "learning_rate": 1.1750881316098707e-08,
+      "loss": 0.0786,
+      "step": 852
+    }
+  ],
+  "logging_steps": 2,
+  "max_steps": 853,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 20000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-853/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:347d3fe43a026d8e0d3dc886116e49e992d313a7046135a66cde752c9308dfd5
+size 6776

checkpoint-853/zero_to_fp32.py ADDED Viewed

	@@ -0,0 +1,604 @@

+#!/usr/bin/env python
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+@dataclass
+class zero_model_state:
+    """Pieces of one ``*_model_states.pt`` file that ``parse_model_states`` keeps for reconstruction."""
+    # buffers taken from the saved ``module`` entry, converted with ``.float()``
+    buffers: dict
+    # value stored under PARAM_SHAPES; the merge helpers iterate it as a sequence of name -> shape dicts
+    param_shapes: list
+    # list of ``[key, value]`` pairs built from the checkpoint's ``shared_params`` mapping
+    shared_params: list
+    # value stored under DS_VERSION; None when the checkpoint has no such entry
+    ds_version: int
+    # name -> shape of frozen parameters; None when the checkpoint has no such entry
+    frozen_param_shapes: dict
+    # name -> saved tensor (fragment) of frozen parameters; None when the checkpoint has no such entry
+    frozen_param_fragments: dict
+# debug verbosity flag read by the helpers below; the __main__ block overwrites it with args.debug
+debug = 0
+# load to cpu: this device is passed as map_location to the torch.load calls in this script
+device = torch.device('cpu')
+def atoi(text):
+    """Return ``text`` converted to ``int`` when it consists only of digits, otherwise return it unchanged."""
+    if text.isdigit():
+        return int(text)
+    return text
+def natural_keys(text):
+    '''
+    Sort key for human ("natural") ordering: ``alist.sort(key=natural_keys)``.
+    The text is split on runs of digits and each piece is passed through ``atoi``,
+    so digit runs compare as integers.
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    pieces = re.split(r'(\d+)', text)
+    return list(map(atoi, pieces))
+def get_model_state_file(checkpoint_dir, zero_stage):
+    """
+    Return the path of the model states file inside ``checkpoint_dir`` for the given ZeRO stage.
+    Args:
+        - ``checkpoint_dir``: path to the deepspeed checkpoint folder
+        - ``zero_stage``: ZeRO stage number; stages <= 2 and stage 3 use differently named files
+    Returns:
+        - path of the model states file, which is checked to exist
+    Raises:
+        - ``FileNotFoundError``: ``checkpoint_dir`` is not a directory or the expected file is missing
+        - ``ValueError``: ``zero_stage`` is neither <= 2 nor 3
+    """
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    else:
+        # no file name is known for other stages; fail clearly instead of leaving `file` unbound
+        raise ValueError(f"unknown zero stage {zero_stage}")
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+    return file
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    """
+    Return the files in ``checkpoint_dir`` matching ``glob_pattern``, sorted with ``natural_keys``.
+    Raises:
+        - ``FileNotFoundError``: nothing matches the pattern
+    """
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    matches = glob.glob(os.path.join(checkpoint_dir, glob_pattern))
+    matches.sort(key=natural_keys)
+    if not matches:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+    return matches
+def get_optim_files(checkpoint_dir):
+    """Return the naturally sorted ``*_optim_states.pt`` files found in ``checkpoint_dir``."""
+    optim_pattern = "*_optim_states.pt"
+    return get_checkpoint_files(checkpoint_dir, optim_pattern)
+def get_model_state_files(checkpoint_dir):
+    """Return the naturally sorted ``*_model_states.pt`` files found in ``checkpoint_dir``."""
+    model_pattern = "*_model_states.pt"
+    return get_checkpoint_files(checkpoint_dir, model_pattern)
+def parse_model_states(files):
+    """
+    Load every model states file and keep the pieces needed to rebuild the fp32 state dict.
+    Args:
+        - ``files``: paths of ``*_model_states.pt`` files
+    Returns:
+        - list with one ``zero_model_state`` per file, in the order of ``files``
+    Raises:
+        - ``ValueError``: a file has no ``BUFFER_NAMES`` entry, i.e. it is not a model state checkpoint
+    """
+    zero_model_states = []
+    for file in files:
+        # NOTE(review): torch.load unpickles the file - only use checkpoints from a trusted source
+        state_dict = torch.load(file, map_location=device)
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+        # frozen parameters are optional: both entries are None when the checkpoint has none
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None and debug:
+            print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+        ds_version = state_dict.get(DS_VERSION, None)
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+    return zero_model_states
+def parse_optim_states(files, ds_checkpoint_dir):
+    """
+    Load the optimizer state files and extract the ZeRO stage, world size and fp32 flat groups.
+    Args:
+        - ``files``: paths of ``*_optim_states.pt`` files
+        - ``ds_checkpoint_dir``: checkpoint folder, only used in the error message
+    Returns:
+        - tuple ``(zero_stage, world_size, fp32_flat_groups)`` with one ``fp32_flat_groups`` entry per file
+    Raises:
+        - ``ValueError``: the first file is not a zero checkpoint, the number of files differs from the
+          recorded partition count, or the zero stage is unknown
+    """
+    total_files = len(files)
+    state_dicts = []
+    for path in files:
+        loaded = torch.load(path, map_location=device)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        loaded["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(loaded)
+    first_optim_state = state_dicts[0][OPTIMIZER_STATE_DICT]
+    if ZERO_STAGE not in first_optim_state:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = first_optim_state[ZERO_STAGE]
+    world_size = first_optim_state[PARTITION_COUNT]
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+    if type(world_size) is list:
+        world_size = max(world_size)
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_flat_groups = [sd[OPTIMIZER_STATE_DICT][SINGLE_PARTITION_OF_FP32_GROUPS] for sd in state_dicts]
+    elif zero_stage == 3:
+        # if there is more than one param group, there will be multiple flattened tensors - one
+        # flattened tensor per group - for simplicity merge them into a single tensor
+        #
+        # XXX: could make the script more memory efficient for when there are multiple groups - it
+        # will require matching the sub-lists of param_shapes for each param group flattened tensor
+        fp32_flat_groups = [torch.cat(sd[OPTIMIZER_STATE_DICT][FP32_FLAT_GROUPS], 0) for sd in state_dicts]
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+    return zero_stage, world_size, fp32_flat_groups
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Rebuild the fp32 state_dict from the files of one deepspeed checkpoint folder.
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+        - ``exclude_frozen_parameters``: forwarded to the stage-specific merge function
+    Returns:
+        - the state_dict built by the zero2 (stage <= 2) or zero3 (stage 3) merge function
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(get_optim_files(ds_checkpoint_dir),
+                                                                  ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+    zero_model_states = parse_model_states(get_model_state_files(ds_checkpoint_dir))
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+    # pick the merge routine that matches the stage
+    if zero_stage <= 2:
+        merge_fn = _get_fp32_state_dict_from_zero2_checkpoint
+    elif zero_stage == 3:
+        merge_fn = _get_fp32_state_dict_from_zero3_checkpoint
+    else:
+        return None
+    return merge_fn(world_size, fp32_flat_groups, zero_model_states, exclude_frozen_parameters)
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    """
+    Copy the frozen parameters recorded in the first model state into ``state_dict`` (in place).
+    Does nothing when the first model state has no frozen parameter shapes.
+    """
+    rank0_state = zero_model_states[0]
+    frozen_param_shapes = rank0_state.frozen_param_shapes
+    if frozen_param_shapes is None or len(frozen_param_shapes) == 0:
+        return
+    frozen_param_fragments = rank0_state.frozen_param_fragments
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        unpartitioned_numel = shape.numel()
+        # the fragment saved in the first model state is stored under the parameter name as is
+        state_dict[name] = frozen_param_fragments[name]
+        total_numel += unpartitioned_numel
+        total_params += 1
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _has_callable(obj, fn):
+    """Return True when ``obj`` has an attribute named ``fn`` and that attribute is callable."""
+    return callable(getattr(obj, fn, None))
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable parameters of a ZeRO stage <= 2 checkpoint into ``state_dict`` (in place).
+    For each param group the per-rank partitions are concatenated into one flat vector, which is then
+    cut into the parameters listed in the first model state's ``param_shapes``.
+    Raises:
+        - ``ValueError``: after alignment, the consumed element count differs from the group's size
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+    if debug:
+        for rank in range(world_size):
+            for group in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{rank}][{group}].shape={fp32_flat_groups[rank][group].shape}")
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = [
+        torch.cat([rank_groups[group] for rank_groups in fp32_flat_groups], 0) for group in range(num_param_groups)
+    ]
+    avail_numel = sum(flat_vector.numel() for flat_vector in merged_single_partition_of_fp32_groups)
+    if debug:
+        wanted_params = sum(len(shapes) for shapes in param_shapes)
+        wanted_numel = sum(sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes)
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+    # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+    # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+    # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+    # live optimizer object, so we are checking that the numbers are within the right range
+    align_to = 2 * world_size
+    def zero2_align(x):
+        return align_to * math.ceil(x / align_to)
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, flat_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        avail_numel = flat_vector.numel()
+        for name, shape in shapes.items():
+            # a shape may or may not offer numel(); otherwise multiply its dimensions
+            if _has_callable(shape, 'numel'):
+                unpartitioned_numel = shape.numel()
+            else:
+                unpartitioned_numel = math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = flat_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """
+    Assemble the state_dict of a ZeRO stage <= 2 checkpoint: buffers first, then frozen parameters
+    (unless ``exclude_frozen_parameters``), then trainable parameters, then shared parameters.
+    Returns:
+        - ``OrderedDict`` with the assembled entries
+    """
+    rank0_state = zero_model_states[0]
+    # buffers
+    buffers = rank0_state.buffers
+    state_dict = OrderedDict()
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        alias, target = pair[0], pair[1]
+        # only entries whose target was reconstructed can be aliased
+        if target in state_dict:
+            state_dict[alias] = state_dict[target]
+    return state_dict
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    """
+    Return ``(partitioned_numel, padding_numel)`` for splitting ``unpartitioned_numel`` elements across
+    ``world_size`` partitions: the per-partition size rounded up, and the number of elements needed to
+    pad the total up to a multiple of ``world_size``.
+    """
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    # distance to the next multiple of world_size (0 when already a multiple)
+    padding_numel = -unpartitioned_numel % world_size
+    return partitioned_numel, padding_numel
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    """
+    Rebuild the frozen parameters of a ZeRO stage 3 checkpoint into ``state_dict`` (in place).
+    For each name the fragments of all model states are concatenated, trimmed to the parameter's
+    element count and viewed with its shape. Does nothing when the first model state has no frozen
+    parameter shapes.
+    """
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    if frozen_param_shapes is None or len(frozen_param_shapes) == 0:
+        return
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        # one fragment per model state, in model state order
+        param_frags = tuple(rank_state.frozen_param_fragments[name] for rank_state in zero_model_states)
+        joined = torch.cat(param_frags, 0)
+        # narrow() drops whatever lies beyond the parameter's own element count
+        state_dict[name] = joined.narrow(0, 0, unpartitioned_numel).view(shape)
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable parameters of a ZeRO stage 3 checkpoint into ``state_dict`` (in place).
+    Raises:
+        - ``ValueError``: the consumed element count differs from ``fp32_flat_groups[0].numel() * world_size``
+    """
+    avail_numel = fp32_flat_groups[0].numel() * world_size
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+    # merge list of dicts, preserving order
+    param_shapes = {}
+    for group_shapes in zero_model_states[0].param_shapes:
+        for key, value in group_shapes.items():
+            param_shapes[key] = value
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    for name, shape in param_shapes.items():
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+        # XXX: memory usage doubles here
+        # take this param's slice from every rank, join the slices, then cut off what exceeds its size
+        rank_slices = tuple(fp32_flat_groups[rank].narrow(0, offset, partitioned_numel) for rank in range(world_size))
+        state_dict[name] = torch.cat(rank_slices, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+        offset += partitioned_numel
+    offset *= world_size
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """
+    Assemble the state_dict of a ZeRO stage 3 checkpoint: buffers first, then frozen parameters
+    (unless ``exclude_frozen_parameters``), then trainable parameters, then shared parameters.
+    Returns:
+        - ``OrderedDict`` with the assembled entries
+    """
+    rank0_state = zero_model_states[0]
+    # buffers
+    buffers = rank0_state.buffers
+    state_dict = OrderedDict()
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        alias, target = pair[0], pair[1]
+        # only entries whose target was reconstructed can be aliased
+        if target in state_dict:
+            state_dict[alias] = state_dict[target]
+    return state_dict
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+    """
+    Build a single consolidated fp32 state_dict from a ZeRO 2 or 3 checkpoint. The result can be loaded
+    with ``load_state_dict()`` and used for training without DeepSpeed, or shared with others, for
+    example via a model hub.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint, e.g., ``global_step14``. If not provided, the tag is read from the 'latest' file in ``checkpoint_dir``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    Returns:
+        - pytorch ``state_dict``
+    Raises:
+        - ``ValueError``: no ``tag`` was given and there is no 'latest' file
+        - ``FileNotFoundError``: the ``checkpoint_dir/tag`` folder doesn't exist
+    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+    """
+    if tag is None:
+        # fall back to the tag recorded in the 'latest' file
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if not os.path.isfile(latest_path):
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+        with open(latest_path, 'r') as fd:
+            tag = fd.read().strip()
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+    if os.path.isdir(ds_checkpoint_dir):
+        return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
+    """
+    Consolidate a ZeRO 2 or 3 checkpoint into one fp32 ``state_dict`` and write it to ``output_file``
+    with ``torch.save``. The file can be loaded with ``torch.load(file)`` + ``load_state_dict()`` and
+    used for training without DeepSpeed.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    """
+    fp32_state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                               tag=tag,
+                                                               exclude_frozen_parameters=exclude_frozen_parameters)
+    print(f"Saving fp32 state dict to {output_file}")
+    torch.save(fp32_state_dict, output_file)
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    Returns:
+        - ``model``: modified model
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    """
+    # plain string literals: these messages have no placeholders, so no f-string is needed
+    logger.info("Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    logger.info("Overwriting model with fp32 weights")
+    model = model.cpu()
+    # strict=False: key mismatches between the model and the state_dict do not raise here
+    model.load_state_dict(state_dict, strict=False)
+    return model
+if __name__ == "__main__":
+    # command line: checkpoint_dir output_file [-t TAG] [--exclude_frozen_parameters] [-d]
+    arg_parser = argparse.ArgumentParser()
+    arg_parser.add_argument("checkpoint_dir",
+                            type=str,
+                            help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    arg_parser.add_argument(
+        "output_file",
+        type=str,
+        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+    arg_parser.add_argument("-t",
+                            "--tag",
+                            type=str,
+                            default=None,
+                            help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    arg_parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    arg_parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = arg_parser.parse_args()
+    # module-level flag read by the helper functions
+    debug = args.debug
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_file,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)

config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "_name_or_path": "/root/autodl-tmp/bge-m3_r4",
+  "architectures": [
+    "XLMRobertaModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 8194,
+  "model_type": "xlm-roberta",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "output_past": true,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.42.1",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 250002
+}

config_sentence_transformers.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "__version__": {
+    "sentence_transformers": "3.0.1",
+    "transformers": "4.42.1",
+    "pytorch": "2.3.0+cu121"
+  },
+  "prompts": {},
+  "default_prompt_name": null,
+  "similarity_fn_name": null
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5ff2b3be09c7552fc58248f097a32771e376f56eb50737f93e0f41cef389d71d
+size 2271064456

modules.json ADDED Viewed

	@@ -0,0 +1,20 @@

+[
+  {
+    "idx": 0,
+    "name": "0",
+    "path": "",
+    "type": "sentence_transformers.models.Transformer"
+  },
+  {
+    "idx": 1,
+    "name": "1",
+    "path": "1_Pooling",
+    "type": "sentence_transformers.models.Pooling"
+  },
+  {
+    "idx": 2,
+    "name": "2",
+    "path": "2_Normalize",
+    "type": "sentence_transformers.models.Normalize"
+  }
+]

runs/Aug22_17-17-24_autodl-container-c024408f5d-9bcd732d/events.out.tfevents.1724318254.autodl-container-c024408f5d-9bcd732d.5345.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:04011eacde85360baeb929a4b75d2544f7b0d570ca1f0fed03e9b6168747a9cb
+size 5560

runs/Aug22_17-18-40_autodl-container-c024408f5d-9bcd732d/events.out.tfevents.1724318333.autodl-container-c024408f5d-9bcd732d.6318.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:14fbf8c125536ee0ff2735de382a9ec6685c0ddf465533d2cb0127204a6e3f67
+size 95134

sentence_bert_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "max_seq_length": 8192,
+  "do_lower_case": false
+}

sentencepiece.bpe.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
+size 5069051

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,51 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b74659c780d49afad7a7b9799868f75cbd3014fb6c34956e85a793028d38094a
+size 17098251

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,55 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "250001": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": "<mask>",
+  "model_max_length": 8192,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "XLMRobertaTokenizer",
+  "unk_token": "<unk>"
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:347d3fe43a026d8e0d3dc886116e49e992d313a7046135a66cde752c9308dfd5
+size 6776