diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..0838e19c978772997ea62c604482bce6c1a8c237
--- /dev/null
+++ b/README.md
@@ -0,0 +1,202 @@
+---
+base_model: liuhaotian/llava-v1.6-vicuna-13b
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/adapter_config.json b/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0d9e6d1f521fdefcdd69001e3f15155910cbbf70
--- /dev/null
+++ b/adapter_config.json
@@ -0,0 +1,34 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "liuhaotian/llava-v1.6-vicuna-13b",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 8,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "o_proj",
+ "q_proj",
+ "k_proj",
+ "gate_proj",
+ "up_proj",
+ "down_proj",
+ "v_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/adapter_model.safetensors b/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..7f52c6fccec0de4ab49229bc2d89cb6e3bc42548
--- /dev/null
+++ b/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:582f98ea28b87a902c530b0ebdd7bdc472bdb58830ca975afdb7caab0b5d85fb
+size 65046168
diff --git a/checkpoint-256/README.md b/checkpoint-256/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..0838e19c978772997ea62c604482bce6c1a8c237
--- /dev/null
+++ b/checkpoint-256/README.md
@@ -0,0 +1,202 @@
+---
+base_model: liuhaotian/llava-v1.6-vicuna-13b
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/checkpoint-256/adapter_config.json b/checkpoint-256/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..42947e9d5eddd8b5dfb4d8381d68cd4c25aabb56
--- /dev/null
+++ b/checkpoint-256/adapter_config.json
@@ -0,0 +1,34 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "liuhaotian/llava-v1.6-vicuna-13b",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 8,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "up_proj",
+ "down_proj",
+ "k_proj",
+ "q_proj",
+ "v_proj",
+ "o_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-256/adapter_model.safetensors b/checkpoint-256/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..7f52c6fccec0de4ab49229bc2d89cb6e3bc42548
--- /dev/null
+++ b/checkpoint-256/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:582f98ea28b87a902c530b0ebdd7bdc472bdb58830ca975afdb7caab0b5d85fb
+size 65046168
diff --git a/checkpoint-256/global_step256/zero_pp_rank_0_mp_rank_00_model_states.pt b/checkpoint-256/global_step256/zero_pp_rank_0_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c3a4b6ebd97ff1f0833b4a89c7520ca2eef3dd86
--- /dev/null
+++ b/checkpoint-256/global_step256/zero_pp_rank_0_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:465bf9bcd385ef29a3aded4895139c4327721f5e279f0c5928a4350b26901d89
+size 775138
diff --git a/checkpoint-256/global_step256/zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-256/global_step256/zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a5d7befc2e8c633a242b7382f047242f50e018cd
--- /dev/null
+++ b/checkpoint-256/global_step256/zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:10c68ac2bffff7c6cd1da2bd544a0b27610ccc8747cf54ddfb246cdf24ae5f3b
+size 191825901
diff --git a/checkpoint-256/global_step256/zero_pp_rank_1_mp_rank_00_model_states.pt b/checkpoint-256/global_step256/zero_pp_rank_1_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a588b63eadd1126c02cf1b9faca2ef84a2e6a2d3
--- /dev/null
+++ b/checkpoint-256/global_step256/zero_pp_rank_1_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3144132b88f2cfa0c1803ff5ffa79ab3b81f0b89322c27f987f899c8ff3914e
+size 775138
diff --git a/checkpoint-256/global_step256/zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-256/global_step256/zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..0f69c29ad364039e46e0de66daf775c20de9df1e
--- /dev/null
+++ b/checkpoint-256/global_step256/zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3c6d3e5ddc3618ab6c11874ff9c3aaa56b9014dabc1ff40a2dae3695964202c5
+size 191825901
diff --git a/checkpoint-256/global_step256/zero_pp_rank_2_mp_rank_00_model_states.pt b/checkpoint-256/global_step256/zero_pp_rank_2_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..7a5ff1efbf48f68803ee715c14d85ba5c3570a4f
--- /dev/null
+++ b/checkpoint-256/global_step256/zero_pp_rank_2_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:20274aa80392ac7f568845aa97ec6813a7365b6d537f0f55ac2a56b0cc84bbe6
+size 775138
diff --git a/checkpoint-256/global_step256/zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-256/global_step256/zero_pp_rank_2_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..545d2a89b3975d1e5f41a49772bea0aad92030d9
--- /dev/null
+++ b/checkpoint-256/global_step256/zero_pp_rank_2_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b7c5723c3fa9c44b9378020f471d5b5ab1102856a7c288397a1809227947d98
+size 191825901
diff --git a/checkpoint-256/global_step256/zero_pp_rank_3_mp_rank_00_model_states.pt b/checkpoint-256/global_step256/zero_pp_rank_3_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..8e45710325a9e9575cdb0d5fb8413b1d7f2b7507
--- /dev/null
+++ b/checkpoint-256/global_step256/zero_pp_rank_3_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2d07a7fb704a5010ca2b049a5b38e040049ce1bb62dd2ad63debef296372b740
+size 775138
diff --git a/checkpoint-256/global_step256/zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-256/global_step256/zero_pp_rank_3_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..6de9a83b4ccd83289ae9855f79ccc37e5daf25af
--- /dev/null
+++ b/checkpoint-256/global_step256/zero_pp_rank_3_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:16b92dff9100643c9a0e96243df4bc82354374bb04eaa1541637da45bc3a7bf4
+size 191825901
diff --git a/checkpoint-256/latest b/checkpoint-256/latest
new file mode 100644
index 0000000000000000000000000000000000000000..b747f9725067064e241a7a3bed90583971af8ad1
--- /dev/null
+++ b/checkpoint-256/latest
@@ -0,0 +1 @@
+global_step256
\ No newline at end of file
diff --git a/checkpoint-256/rng_state_0.pth b/checkpoint-256/rng_state_0.pth
new file mode 100644
index 0000000000000000000000000000000000000000..196981c40334fb7dd951e7d84111eb3493959006
--- /dev/null
+++ b/checkpoint-256/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a82dfa4cdf25f3321dc609ee6cef399ec5eb5a776c270e602eed6aa9dbcbae97
+size 14960
diff --git a/checkpoint-256/rng_state_1.pth b/checkpoint-256/rng_state_1.pth
new file mode 100644
index 0000000000000000000000000000000000000000..5c668a886b54bae30d8d32655a29cd71d37f128e
--- /dev/null
+++ b/checkpoint-256/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9a29c2706ec243f7c4af15e34459ed1c098adc1768fbab00b1ca5adede23f126
+size 14960
diff --git a/checkpoint-256/rng_state_2.pth b/checkpoint-256/rng_state_2.pth
new file mode 100644
index 0000000000000000000000000000000000000000..d8d9d67b555eeeddca57f7c8422458b27ea4aad1
--- /dev/null
+++ b/checkpoint-256/rng_state_2.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a9cdb67e15cdb7445043e4cafda66c8a090655a1f350cb975bf3231ca39c36c
+size 14960
diff --git a/checkpoint-256/rng_state_3.pth b/checkpoint-256/rng_state_3.pth
new file mode 100644
index 0000000000000000000000000000000000000000..dd171c0d77e1cc5f9e1023225de4bf2e3858ea12
--- /dev/null
+++ b/checkpoint-256/rng_state_3.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:962c98fdd1ccee0e253d9a2f1d5f6ee58bdbf1d8f2b333298226b9554d259ac0
+size 14960
diff --git a/checkpoint-256/special_tokens_map.json b/checkpoint-256/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..14761dcf1466dc232bd41de9c21d4c617b15755e
--- /dev/null
+++ b/checkpoint-256/special_tokens_map.json
@@ -0,0 +1,24 @@
+{
+ "bos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "",
+ "unk_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-256/tokenizer.model b/checkpoint-256/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899
--- /dev/null
+++ b/checkpoint-256/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
diff --git a/checkpoint-256/tokenizer_config.json b/checkpoint-256/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..26c65df1bf794f101c1dd54c908180dc0d880fe3
--- /dev/null
+++ b/checkpoint-256/tokenizer_config.json
@@ -0,0 +1,43 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "",
+ "legacy": false,
+ "model_max_length": 2048,
+ "pad_token": "",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "",
+ "use_default_system_prompt": false
+}
diff --git a/checkpoint-256/trainer_state.json b/checkpoint-256/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..ae0074f37d17375ef64e91f32c2e178805ce2899
--- /dev/null
+++ b/checkpoint-256/trainer_state.json
@@ -0,0 +1,3873 @@
+{
+ "best_metric": 0.6575854420661926,
+ "best_model_checkpoint": "./checkpoints/llava-v1.6-vicuna-13b_anyres/checkpoint-256",
+ "epoch": 8.0,
+ "eval_steps": 1.0,
+ "global_step": 256,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.03125,
+ "grad_norm": 0.5230235555406132,
+ "learning_rate": 0.0,
+ "loss": 1.5809,
+ "step": 1
+ },
+ {
+ "epoch": 0.03125,
+ "eval_loss": 1.6275018453598022,
+ "eval_runtime": 82.059,
+ "eval_samples_per_second": 2.437,
+ "eval_steps_per_second": 0.305,
+ "step": 1
+ },
+ {
+ "epoch": 0.0625,
+ "grad_norm": 0.5095402010892089,
+ "learning_rate": 2e-05,
+ "loss": 1.4958,
+ "step": 2
+ },
+ {
+ "epoch": 0.0625,
+ "eval_loss": 1.6275018453598022,
+ "eval_runtime": 76.5747,
+ "eval_samples_per_second": 2.612,
+ "eval_steps_per_second": 0.326,
+ "step": 2
+ },
+ {
+ "epoch": 0.09375,
+ "grad_norm": 0.4998514282504938,
+ "learning_rate": 2e-05,
+ "loss": 1.5552,
+ "step": 3
+ },
+ {
+ "epoch": 0.09375,
+ "eval_loss": 1.5956931114196777,
+ "eval_runtime": 76.1563,
+ "eval_samples_per_second": 2.626,
+ "eval_steps_per_second": 0.328,
+ "step": 3
+ },
+ {
+ "epoch": 0.125,
+ "grad_norm": 0.4280580315108126,
+ "learning_rate": 2e-05,
+ "loss": 1.4846,
+ "step": 4
+ },
+ {
+ "epoch": 0.125,
+ "eval_loss": 1.5584176778793335,
+ "eval_runtime": 76.1235,
+ "eval_samples_per_second": 2.627,
+ "eval_steps_per_second": 0.328,
+ "step": 4
+ },
+ {
+ "epoch": 0.15625,
+ "grad_norm": 0.5678499435986384,
+ "learning_rate": 2e-05,
+ "loss": 1.5036,
+ "step": 5
+ },
+ {
+ "epoch": 0.15625,
+ "eval_loss": 1.5207562446594238,
+ "eval_runtime": 76.1514,
+ "eval_samples_per_second": 2.626,
+ "eval_steps_per_second": 0.328,
+ "step": 5
+ },
+ {
+ "epoch": 0.1875,
+ "grad_norm": 0.5368461657542534,
+ "learning_rate": 2e-05,
+ "loss": 1.476,
+ "step": 6
+ },
+ {
+ "epoch": 0.1875,
+ "eval_loss": 1.4807783365249634,
+ "eval_runtime": 77.3444,
+ "eval_samples_per_second": 2.586,
+ "eval_steps_per_second": 0.323,
+ "step": 6
+ },
+ {
+ "epoch": 0.21875,
+ "grad_norm": 0.5549950083087136,
+ "learning_rate": 2e-05,
+ "loss": 1.4358,
+ "step": 7
+ },
+ {
+ "epoch": 0.21875,
+ "eval_loss": 1.4411544799804688,
+ "eval_runtime": 77.066,
+ "eval_samples_per_second": 2.595,
+ "eval_steps_per_second": 0.324,
+ "step": 7
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 0.5549950083087136,
+ "learning_rate": 2e-05,
+ "loss": 1.4369,
+ "step": 8
+ },
+ {
+ "epoch": 0.25,
+ "eval_loss": 1.4411544799804688,
+ "eval_runtime": 77.2807,
+ "eval_samples_per_second": 2.588,
+ "eval_steps_per_second": 0.323,
+ "step": 8
+ },
+ {
+ "epoch": 0.28125,
+ "grad_norm": 0.5292240951443854,
+ "learning_rate": 2e-05,
+ "loss": 1.4471,
+ "step": 9
+ },
+ {
+ "epoch": 0.28125,
+ "eval_loss": 1.4036556482315063,
+ "eval_runtime": 78.1562,
+ "eval_samples_per_second": 2.559,
+ "eval_steps_per_second": 0.32,
+ "step": 9
+ },
+ {
+ "epoch": 0.3125,
+ "grad_norm": 0.5292240951443854,
+ "learning_rate": 2e-05,
+ "loss": 1.3666,
+ "step": 10
+ },
+ {
+ "epoch": 0.3125,
+ "eval_loss": 1.4036556482315063,
+ "eval_runtime": 77.1645,
+ "eval_samples_per_second": 2.592,
+ "eval_steps_per_second": 0.324,
+ "step": 10
+ },
+ {
+ "epoch": 0.34375,
+ "grad_norm": 0.5292240951443854,
+ "learning_rate": 2e-05,
+ "loss": 1.4149,
+ "step": 11
+ },
+ {
+ "epoch": 0.34375,
+ "eval_loss": 1.4036556482315063,
+ "eval_runtime": 78.7627,
+ "eval_samples_per_second": 2.539,
+ "eval_steps_per_second": 0.317,
+ "step": 11
+ },
+ {
+ "epoch": 0.375,
+ "grad_norm": 0.684588966714067,
+ "learning_rate": 2e-05,
+ "loss": 1.3883,
+ "step": 12
+ },
+ {
+ "epoch": 0.375,
+ "eval_loss": 1.3679308891296387,
+ "eval_runtime": 78.4315,
+ "eval_samples_per_second": 2.55,
+ "eval_steps_per_second": 0.319,
+ "step": 12
+ },
+ {
+ "epoch": 0.40625,
+ "grad_norm": 0.6261826769491422,
+ "learning_rate": 2e-05,
+ "loss": 1.4271,
+ "step": 13
+ },
+ {
+ "epoch": 0.40625,
+ "eval_loss": 1.3369851112365723,
+ "eval_runtime": 78.685,
+ "eval_samples_per_second": 2.542,
+ "eval_steps_per_second": 0.318,
+ "step": 13
+ },
+ {
+ "epoch": 0.4375,
+ "grad_norm": 0.6261826769491422,
+ "learning_rate": 2e-05,
+ "loss": 1.2495,
+ "step": 14
+ },
+ {
+ "epoch": 0.4375,
+ "eval_loss": 1.3369851112365723,
+ "eval_runtime": 78.0511,
+ "eval_samples_per_second": 2.562,
+ "eval_steps_per_second": 0.32,
+ "step": 14
+ },
+ {
+ "epoch": 0.46875,
+ "grad_norm": 0.6028103951693778,
+ "learning_rate": 2e-05,
+ "loss": 1.3513,
+ "step": 15
+ },
+ {
+ "epoch": 0.46875,
+ "eval_loss": 1.3032653331756592,
+ "eval_runtime": 78.0271,
+ "eval_samples_per_second": 2.563,
+ "eval_steps_per_second": 0.32,
+ "step": 15
+ },
+ {
+ "epoch": 0.5,
+ "grad_norm": 0.769290402283396,
+ "learning_rate": 2e-05,
+ "loss": 1.3117,
+ "step": 16
+ },
+ {
+ "epoch": 0.5,
+ "eval_loss": 1.2661188840866089,
+ "eval_runtime": 78.1857,
+ "eval_samples_per_second": 2.558,
+ "eval_steps_per_second": 0.32,
+ "step": 16
+ },
+ {
+ "epoch": 0.53125,
+ "grad_norm": 1.3279338025863765,
+ "learning_rate": 2e-05,
+ "loss": 1.2768,
+ "step": 17
+ },
+ {
+ "epoch": 0.53125,
+ "eval_loss": 1.2299447059631348,
+ "eval_runtime": 78.2064,
+ "eval_samples_per_second": 2.557,
+ "eval_steps_per_second": 0.32,
+ "step": 17
+ },
+ {
+ "epoch": 0.5625,
+ "grad_norm": 0.7410327159336384,
+ "learning_rate": 2e-05,
+ "loss": 1.256,
+ "step": 18
+ },
+ {
+ "epoch": 0.5625,
+ "eval_loss": 1.2044258117675781,
+ "eval_runtime": 78.072,
+ "eval_samples_per_second": 2.562,
+ "eval_steps_per_second": 0.32,
+ "step": 18
+ },
+ {
+ "epoch": 0.59375,
+ "grad_norm": 0.44078820770408506,
+ "learning_rate": 2e-05,
+ "loss": 1.1252,
+ "step": 19
+ },
+ {
+ "epoch": 0.59375,
+ "eval_loss": 1.1826122999191284,
+ "eval_runtime": 78.7312,
+ "eval_samples_per_second": 2.54,
+ "eval_steps_per_second": 0.318,
+ "step": 19
+ },
+ {
+ "epoch": 0.625,
+ "grad_norm": 0.49020841613371097,
+ "learning_rate": 2e-05,
+ "loss": 1.2249,
+ "step": 20
+ },
+ {
+ "epoch": 0.625,
+ "eval_loss": 1.1616511344909668,
+ "eval_runtime": 78.2736,
+ "eval_samples_per_second": 2.555,
+ "eval_steps_per_second": 0.319,
+ "step": 20
+ },
+ {
+ "epoch": 0.65625,
+ "grad_norm": 0.43031322695269714,
+ "learning_rate": 2e-05,
+ "loss": 1.1466,
+ "step": 21
+ },
+ {
+ "epoch": 0.65625,
+ "eval_loss": 1.1410629749298096,
+ "eval_runtime": 79.6432,
+ "eval_samples_per_second": 2.511,
+ "eval_steps_per_second": 0.314,
+ "step": 21
+ },
+ {
+ "epoch": 0.6875,
+ "grad_norm": 0.45632085445955545,
+ "learning_rate": 2e-05,
+ "loss": 1.1951,
+ "step": 22
+ },
+ {
+ "epoch": 0.6875,
+ "eval_loss": 1.1204684972763062,
+ "eval_runtime": 79.0609,
+ "eval_samples_per_second": 2.53,
+ "eval_steps_per_second": 0.316,
+ "step": 22
+ },
+ {
+ "epoch": 0.71875,
+ "grad_norm": 0.40048586945364495,
+ "learning_rate": 2e-05,
+ "loss": 1.1826,
+ "step": 23
+ },
+ {
+ "epoch": 0.71875,
+ "eval_loss": 1.1002545356750488,
+ "eval_runtime": 82.8578,
+ "eval_samples_per_second": 2.414,
+ "eval_steps_per_second": 0.302,
+ "step": 23
+ },
+ {
+ "epoch": 0.75,
+ "grad_norm": 0.3703033261027938,
+ "learning_rate": 2e-05,
+ "loss": 1.1543,
+ "step": 24
+ },
+ {
+ "epoch": 0.75,
+ "eval_loss": 1.0805977582931519,
+ "eval_runtime": 76.1407,
+ "eval_samples_per_second": 2.627,
+ "eval_steps_per_second": 0.328,
+ "step": 24
+ },
+ {
+ "epoch": 0.78125,
+ "grad_norm": 0.3986313105418924,
+ "learning_rate": 2e-05,
+ "loss": 1.1046,
+ "step": 25
+ },
+ {
+ "epoch": 0.78125,
+ "eval_loss": 1.0610157251358032,
+ "eval_runtime": 76.3083,
+ "eval_samples_per_second": 2.621,
+ "eval_steps_per_second": 0.328,
+ "step": 25
+ },
+ {
+ "epoch": 0.8125,
+ "grad_norm": 0.36265027203577943,
+ "learning_rate": 2e-05,
+ "loss": 1.1048,
+ "step": 26
+ },
+ {
+ "epoch": 0.8125,
+ "eval_loss": 1.0421289205551147,
+ "eval_runtime": 77.2186,
+ "eval_samples_per_second": 2.59,
+ "eval_steps_per_second": 0.324,
+ "step": 26
+ },
+ {
+ "epoch": 0.84375,
+ "grad_norm": 0.3881748990218768,
+ "learning_rate": 2e-05,
+ "loss": 1.0425,
+ "step": 27
+ },
+ {
+ "epoch": 0.84375,
+ "eval_loss": 1.0240073204040527,
+ "eval_runtime": 77.8662,
+ "eval_samples_per_second": 2.569,
+ "eval_steps_per_second": 0.321,
+ "step": 27
+ },
+ {
+ "epoch": 0.875,
+ "grad_norm": 0.3734031294324286,
+ "learning_rate": 2e-05,
+ "loss": 1.0484,
+ "step": 28
+ },
+ {
+ "epoch": 0.875,
+ "eval_loss": 1.0066957473754883,
+ "eval_runtime": 77.269,
+ "eval_samples_per_second": 2.588,
+ "eval_steps_per_second": 0.324,
+ "step": 28
+ },
+ {
+ "epoch": 0.90625,
+ "grad_norm": 0.29695383079342563,
+ "learning_rate": 2e-05,
+ "loss": 1.0387,
+ "step": 29
+ },
+ {
+ "epoch": 0.90625,
+ "eval_loss": 0.9906074404716492,
+ "eval_runtime": 77.2245,
+ "eval_samples_per_second": 2.59,
+ "eval_steps_per_second": 0.324,
+ "step": 29
+ },
+ {
+ "epoch": 0.9375,
+ "grad_norm": 0.29273146875026623,
+ "learning_rate": 2e-05,
+ "loss": 1.0568,
+ "step": 30
+ },
+ {
+ "epoch": 0.9375,
+ "eval_loss": 0.975755512714386,
+ "eval_runtime": 78.0056,
+ "eval_samples_per_second": 2.564,
+ "eval_steps_per_second": 0.32,
+ "step": 30
+ },
+ {
+ "epoch": 0.96875,
+ "grad_norm": 0.35070440686850546,
+ "learning_rate": 2e-05,
+ "loss": 0.9114,
+ "step": 31
+ },
+ {
+ "epoch": 0.96875,
+ "eval_loss": 0.9615123271942139,
+ "eval_runtime": 77.9051,
+ "eval_samples_per_second": 2.567,
+ "eval_steps_per_second": 0.321,
+ "step": 31
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 0.30846157140439384,
+ "learning_rate": 2e-05,
+ "loss": 0.9941,
+ "step": 32
+ },
+ {
+ "epoch": 1.0,
+ "eval_loss": 0.9480571150779724,
+ "eval_runtime": 77.2322,
+ "eval_samples_per_second": 2.59,
+ "eval_steps_per_second": 0.324,
+ "step": 32
+ },
+ {
+ "epoch": 1.03125,
+ "grad_norm": 0.2950381371932973,
+ "learning_rate": 2e-05,
+ "loss": 1.0297,
+ "step": 33
+ },
+ {
+ "epoch": 1.03125,
+ "eval_loss": 0.9356330037117004,
+ "eval_runtime": 81.8443,
+ "eval_samples_per_second": 2.444,
+ "eval_steps_per_second": 0.305,
+ "step": 33
+ },
+ {
+ "epoch": 1.0625,
+ "grad_norm": 0.27080038065834283,
+ "learning_rate": 2e-05,
+ "loss": 1.021,
+ "step": 34
+ },
+ {
+ "epoch": 1.0625,
+ "eval_loss": 0.9245791435241699,
+ "eval_runtime": 76.2071,
+ "eval_samples_per_second": 2.624,
+ "eval_steps_per_second": 0.328,
+ "step": 34
+ },
+ {
+ "epoch": 1.09375,
+ "grad_norm": 0.23165081252649894,
+ "learning_rate": 2e-05,
+ "loss": 1.0366,
+ "step": 35
+ },
+ {
+ "epoch": 1.09375,
+ "eval_loss": 0.9151126146316528,
+ "eval_runtime": 77.0412,
+ "eval_samples_per_second": 2.596,
+ "eval_steps_per_second": 0.325,
+ "step": 35
+ },
+ {
+ "epoch": 1.125,
+ "grad_norm": 0.4033780922500775,
+ "learning_rate": 2e-05,
+ "loss": 1.0127,
+ "step": 36
+ },
+ {
+ "epoch": 1.125,
+ "eval_loss": 0.9063960313796997,
+ "eval_runtime": 76.9327,
+ "eval_samples_per_second": 2.6,
+ "eval_steps_per_second": 0.325,
+ "step": 36
+ },
+ {
+ "epoch": 1.15625,
+ "grad_norm": 0.2398039831439168,
+ "learning_rate": 2e-05,
+ "loss": 0.9418,
+ "step": 37
+ },
+ {
+ "epoch": 1.15625,
+ "eval_loss": 0.8982363939285278,
+ "eval_runtime": 76.1234,
+ "eval_samples_per_second": 2.627,
+ "eval_steps_per_second": 0.328,
+ "step": 37
+ },
+ {
+ "epoch": 1.1875,
+ "grad_norm": 0.28793451241246804,
+ "learning_rate": 2e-05,
+ "loss": 0.9643,
+ "step": 38
+ },
+ {
+ "epoch": 1.1875,
+ "eval_loss": 0.8908895254135132,
+ "eval_runtime": 76.2877,
+ "eval_samples_per_second": 2.622,
+ "eval_steps_per_second": 0.328,
+ "step": 38
+ },
+ {
+ "epoch": 1.21875,
+ "grad_norm": 0.2927691606307197,
+ "learning_rate": 2e-05,
+ "loss": 1.0087,
+ "step": 39
+ },
+ {
+ "epoch": 1.21875,
+ "eval_loss": 0.8845618367195129,
+ "eval_runtime": 76.2282,
+ "eval_samples_per_second": 2.624,
+ "eval_steps_per_second": 0.328,
+ "step": 39
+ },
+ {
+ "epoch": 1.25,
+ "grad_norm": 0.26410982001408806,
+ "learning_rate": 2e-05,
+ "loss": 0.986,
+ "step": 40
+ },
+ {
+ "epoch": 1.25,
+ "eval_loss": 0.8784474730491638,
+ "eval_runtime": 76.2512,
+ "eval_samples_per_second": 2.623,
+ "eval_steps_per_second": 0.328,
+ "step": 40
+ },
+ {
+ "epoch": 1.28125,
+ "grad_norm": 0.29182630949665306,
+ "learning_rate": 2e-05,
+ "loss": 0.9711,
+ "step": 41
+ },
+ {
+ "epoch": 1.28125,
+ "eval_loss": 0.8725223541259766,
+ "eval_runtime": 77.1229,
+ "eval_samples_per_second": 2.593,
+ "eval_steps_per_second": 0.324,
+ "step": 41
+ },
+ {
+ "epoch": 1.3125,
+ "grad_norm": 0.36402838796832665,
+ "learning_rate": 2e-05,
+ "loss": 0.9263,
+ "step": 42
+ },
+ {
+ "epoch": 1.3125,
+ "eval_loss": 0.8662790060043335,
+ "eval_runtime": 77.2362,
+ "eval_samples_per_second": 2.589,
+ "eval_steps_per_second": 0.324,
+ "step": 42
+ },
+ {
+ "epoch": 1.34375,
+ "grad_norm": 0.29338184478895163,
+ "learning_rate": 2e-05,
+ "loss": 0.8947,
+ "step": 43
+ },
+ {
+ "epoch": 1.34375,
+ "eval_loss": 0.8600431680679321,
+ "eval_runtime": 77.1213,
+ "eval_samples_per_second": 2.593,
+ "eval_steps_per_second": 0.324,
+ "step": 43
+ },
+ {
+ "epoch": 1.375,
+ "grad_norm": 0.2201714229702277,
+ "learning_rate": 2e-05,
+ "loss": 0.9059,
+ "step": 44
+ },
+ {
+ "epoch": 1.375,
+ "eval_loss": 0.8545799255371094,
+ "eval_runtime": 77.991,
+ "eval_samples_per_second": 2.564,
+ "eval_steps_per_second": 0.321,
+ "step": 44
+ },
+ {
+ "epoch": 1.40625,
+ "grad_norm": 0.2254966625243654,
+ "learning_rate": 2e-05,
+ "loss": 0.8942,
+ "step": 45
+ },
+ {
+ "epoch": 1.40625,
+ "eval_loss": 0.8497399687767029,
+ "eval_runtime": 77.2698,
+ "eval_samples_per_second": 2.588,
+ "eval_steps_per_second": 0.324,
+ "step": 45
+ },
+ {
+ "epoch": 1.4375,
+ "grad_norm": 0.21753318432075458,
+ "learning_rate": 2e-05,
+ "loss": 0.9376,
+ "step": 46
+ },
+ {
+ "epoch": 1.4375,
+ "eval_loss": 0.8452473282814026,
+ "eval_runtime": 77.0568,
+ "eval_samples_per_second": 2.595,
+ "eval_steps_per_second": 0.324,
+ "step": 46
+ },
+ {
+ "epoch": 1.46875,
+ "grad_norm": 0.21449718265972945,
+ "learning_rate": 2e-05,
+ "loss": 0.9369,
+ "step": 47
+ },
+ {
+ "epoch": 1.46875,
+ "eval_loss": 0.841134786605835,
+ "eval_runtime": 77.225,
+ "eval_samples_per_second": 2.59,
+ "eval_steps_per_second": 0.324,
+ "step": 47
+ },
+ {
+ "epoch": 1.5,
+ "grad_norm": 0.2109063266748924,
+ "learning_rate": 2e-05,
+ "loss": 0.8511,
+ "step": 48
+ },
+ {
+ "epoch": 1.5,
+ "eval_loss": 0.8373770117759705,
+ "eval_runtime": 76.2309,
+ "eval_samples_per_second": 2.624,
+ "eval_steps_per_second": 0.328,
+ "step": 48
+ },
+ {
+ "epoch": 1.53125,
+ "grad_norm": 0.232838633689838,
+ "learning_rate": 2e-05,
+ "loss": 0.8694,
+ "step": 49
+ },
+ {
+ "epoch": 1.53125,
+ "eval_loss": 0.8338289856910706,
+ "eval_runtime": 76.277,
+ "eval_samples_per_second": 2.622,
+ "eval_steps_per_second": 0.328,
+ "step": 49
+ },
+ {
+ "epoch": 1.5625,
+ "grad_norm": 0.4189704940803984,
+ "learning_rate": 2e-05,
+ "loss": 0.8464,
+ "step": 50
+ },
+ {
+ "epoch": 1.5625,
+ "eval_loss": 0.8297132849693298,
+ "eval_runtime": 76.2872,
+ "eval_samples_per_second": 2.622,
+ "eval_steps_per_second": 0.328,
+ "step": 50
+ },
+ {
+ "epoch": 1.59375,
+ "grad_norm": 0.2171618165123276,
+ "learning_rate": 2e-05,
+ "loss": 0.8785,
+ "step": 51
+ },
+ {
+ "epoch": 1.59375,
+ "eval_loss": 0.8257431983947754,
+ "eval_runtime": 76.2639,
+ "eval_samples_per_second": 2.622,
+ "eval_steps_per_second": 0.328,
+ "step": 51
+ },
+ {
+ "epoch": 1.625,
+ "grad_norm": 0.21934651037670305,
+ "learning_rate": 2e-05,
+ "loss": 0.7645,
+ "step": 52
+ },
+ {
+ "epoch": 1.625,
+ "eval_loss": 0.8223557472229004,
+ "eval_runtime": 76.2383,
+ "eval_samples_per_second": 2.623,
+ "eval_steps_per_second": 0.328,
+ "step": 52
+ },
+ {
+ "epoch": 1.65625,
+ "grad_norm": 0.24183530733164746,
+ "learning_rate": 2e-05,
+ "loss": 0.9218,
+ "step": 53
+ },
+ {
+ "epoch": 1.65625,
+ "eval_loss": 0.8189653158187866,
+ "eval_runtime": 76.9819,
+ "eval_samples_per_second": 2.598,
+ "eval_steps_per_second": 0.325,
+ "step": 53
+ },
+ {
+ "epoch": 1.6875,
+ "grad_norm": 0.23450930244279267,
+ "learning_rate": 2e-05,
+ "loss": 0.8896,
+ "step": 54
+ },
+ {
+ "epoch": 1.6875,
+ "eval_loss": 0.8152530193328857,
+ "eval_runtime": 76.2378,
+ "eval_samples_per_second": 2.623,
+ "eval_steps_per_second": 0.328,
+ "step": 54
+ },
+ {
+ "epoch": 1.71875,
+ "grad_norm": 0.22081665899796085,
+ "learning_rate": 2e-05,
+ "loss": 0.8798,
+ "step": 55
+ },
+ {
+ "epoch": 1.71875,
+ "eval_loss": 0.8122122287750244,
+ "eval_runtime": 76.289,
+ "eval_samples_per_second": 2.622,
+ "eval_steps_per_second": 0.328,
+ "step": 55
+ },
+ {
+ "epoch": 1.75,
+ "grad_norm": 0.21311746114111046,
+ "learning_rate": 2e-05,
+ "loss": 0.9482,
+ "step": 56
+ },
+ {
+ "epoch": 1.75,
+ "eval_loss": 0.8092318773269653,
+ "eval_runtime": 77.8321,
+ "eval_samples_per_second": 2.57,
+ "eval_steps_per_second": 0.321,
+ "step": 56
+ },
+ {
+ "epoch": 1.78125,
+ "grad_norm": 0.2496565307107556,
+ "learning_rate": 2e-05,
+ "loss": 0.8917,
+ "step": 57
+ },
+ {
+ "epoch": 1.78125,
+ "eval_loss": 0.8070546984672546,
+ "eval_runtime": 77.2651,
+ "eval_samples_per_second": 2.588,
+ "eval_steps_per_second": 0.324,
+ "step": 57
+ },
+ {
+ "epoch": 1.8125,
+ "grad_norm": 0.2137866456424736,
+ "learning_rate": 2e-05,
+ "loss": 0.909,
+ "step": 58
+ },
+ {
+ "epoch": 1.8125,
+ "eval_loss": 0.8049566745758057,
+ "eval_runtime": 78.0925,
+ "eval_samples_per_second": 2.561,
+ "eval_steps_per_second": 0.32,
+ "step": 58
+ },
+ {
+ "epoch": 1.84375,
+ "grad_norm": 0.22567502859345095,
+ "learning_rate": 2e-05,
+ "loss": 0.8611,
+ "step": 59
+ },
+ {
+ "epoch": 1.84375,
+ "eval_loss": 0.8028810024261475,
+ "eval_runtime": 78.0553,
+ "eval_samples_per_second": 2.562,
+ "eval_steps_per_second": 0.32,
+ "step": 59
+ },
+ {
+ "epoch": 1.875,
+ "grad_norm": 0.23303796552302508,
+ "learning_rate": 2e-05,
+ "loss": 0.9209,
+ "step": 60
+ },
+ {
+ "epoch": 1.875,
+ "eval_loss": 0.800568699836731,
+ "eval_runtime": 78.052,
+ "eval_samples_per_second": 2.562,
+ "eval_steps_per_second": 0.32,
+ "step": 60
+ },
+ {
+ "epoch": 1.90625,
+ "grad_norm": 0.24566727726974544,
+ "learning_rate": 2e-05,
+ "loss": 0.8239,
+ "step": 61
+ },
+ {
+ "epoch": 1.90625,
+ "eval_loss": 0.7976545691490173,
+ "eval_runtime": 77.3056,
+ "eval_samples_per_second": 2.587,
+ "eval_steps_per_second": 0.323,
+ "step": 61
+ },
+ {
+ "epoch": 1.9375,
+ "grad_norm": 0.23014192522354907,
+ "learning_rate": 2e-05,
+ "loss": 0.8814,
+ "step": 62
+ },
+ {
+ "epoch": 1.9375,
+ "eval_loss": 0.7945474982261658,
+ "eval_runtime": 77.3398,
+ "eval_samples_per_second": 2.586,
+ "eval_steps_per_second": 0.323,
+ "step": 62
+ },
+ {
+ "epoch": 1.96875,
+ "grad_norm": 0.23042819102671622,
+ "learning_rate": 2e-05,
+ "loss": 0.9064,
+ "step": 63
+ },
+ {
+ "epoch": 1.96875,
+ "eval_loss": 0.7918359637260437,
+ "eval_runtime": 77.4272,
+ "eval_samples_per_second": 2.583,
+ "eval_steps_per_second": 0.323,
+ "step": 63
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 0.23940667173206315,
+ "learning_rate": 2e-05,
+ "loss": 0.8658,
+ "step": 64
+ },
+ {
+ "epoch": 2.0,
+ "eval_loss": 0.7891160845756531,
+ "eval_runtime": 77.3236,
+ "eval_samples_per_second": 2.587,
+ "eval_steps_per_second": 0.323,
+ "step": 64
+ },
+ {
+ "epoch": 2.03125,
+ "grad_norm": 0.22630342930143643,
+ "learning_rate": 2e-05,
+ "loss": 0.8403,
+ "step": 65
+ },
+ {
+ "epoch": 2.03125,
+ "eval_loss": 0.7859742641448975,
+ "eval_runtime": 77.2001,
+ "eval_samples_per_second": 2.591,
+ "eval_steps_per_second": 0.324,
+ "step": 65
+ },
+ {
+ "epoch": 2.0625,
+ "grad_norm": 0.20949240460260976,
+ "learning_rate": 2e-05,
+ "loss": 0.8472,
+ "step": 66
+ },
+ {
+ "epoch": 2.0625,
+ "eval_loss": 0.7834083437919617,
+ "eval_runtime": 78.9646,
+ "eval_samples_per_second": 2.533,
+ "eval_steps_per_second": 0.317,
+ "step": 66
+ },
+ {
+ "epoch": 2.09375,
+ "grad_norm": 0.22714400479820654,
+ "learning_rate": 2e-05,
+ "loss": 0.841,
+ "step": 67
+ },
+ {
+ "epoch": 2.09375,
+ "eval_loss": 0.7805308699607849,
+ "eval_runtime": 78.7552,
+ "eval_samples_per_second": 2.54,
+ "eval_steps_per_second": 0.317,
+ "step": 67
+ },
+ {
+ "epoch": 2.125,
+ "grad_norm": 0.23345123077006047,
+ "learning_rate": 2e-05,
+ "loss": 0.9028,
+ "step": 68
+ },
+ {
+ "epoch": 2.125,
+ "eval_loss": 0.7779514789581299,
+ "eval_runtime": 78.3387,
+ "eval_samples_per_second": 2.553,
+ "eval_steps_per_second": 0.319,
+ "step": 68
+ },
+ {
+ "epoch": 2.15625,
+ "grad_norm": 0.251841542575211,
+ "learning_rate": 2e-05,
+ "loss": 0.8381,
+ "step": 69
+ },
+ {
+ "epoch": 2.15625,
+ "eval_loss": 0.7756664752960205,
+ "eval_runtime": 78.3109,
+ "eval_samples_per_second": 2.554,
+ "eval_steps_per_second": 0.319,
+ "step": 69
+ },
+ {
+ "epoch": 2.1875,
+ "grad_norm": 0.23548386839773608,
+ "learning_rate": 2e-05,
+ "loss": 0.7914,
+ "step": 70
+ },
+ {
+ "epoch": 2.1875,
+ "eval_loss": 0.7733604907989502,
+ "eval_runtime": 78.9712,
+ "eval_samples_per_second": 2.533,
+ "eval_steps_per_second": 0.317,
+ "step": 70
+ },
+ {
+ "epoch": 2.21875,
+ "grad_norm": 0.23262740912668387,
+ "learning_rate": 2e-05,
+ "loss": 0.8778,
+ "step": 71
+ },
+ {
+ "epoch": 2.21875,
+ "eval_loss": 0.771755576133728,
+ "eval_runtime": 78.2633,
+ "eval_samples_per_second": 2.555,
+ "eval_steps_per_second": 0.319,
+ "step": 71
+ },
+ {
+ "epoch": 2.25,
+ "grad_norm": 0.22075289612357513,
+ "learning_rate": 2e-05,
+ "loss": 0.7945,
+ "step": 72
+ },
+ {
+ "epoch": 2.25,
+ "eval_loss": 0.7705450654029846,
+ "eval_runtime": 78.3151,
+ "eval_samples_per_second": 2.554,
+ "eval_steps_per_second": 0.319,
+ "step": 72
+ },
+ {
+ "epoch": 2.28125,
+ "grad_norm": 0.25520381955936466,
+ "learning_rate": 2e-05,
+ "loss": 0.8387,
+ "step": 73
+ },
+ {
+ "epoch": 2.28125,
+ "eval_loss": 0.7695029973983765,
+ "eval_runtime": 78.2901,
+ "eval_samples_per_second": 2.555,
+ "eval_steps_per_second": 0.319,
+ "step": 73
+ },
+ {
+ "epoch": 2.3125,
+ "grad_norm": 0.2047305385827267,
+ "learning_rate": 2e-05,
+ "loss": 0.8404,
+ "step": 74
+ },
+ {
+ "epoch": 2.3125,
+ "eval_loss": 0.7684457302093506,
+ "eval_runtime": 78.3875,
+ "eval_samples_per_second": 2.551,
+ "eval_steps_per_second": 0.319,
+ "step": 74
+ },
+ {
+ "epoch": 2.34375,
+ "grad_norm": 0.2262323045133288,
+ "learning_rate": 2e-05,
+ "loss": 0.8811,
+ "step": 75
+ },
+ {
+ "epoch": 2.34375,
+ "eval_loss": 0.7671162486076355,
+ "eval_runtime": 78.202,
+ "eval_samples_per_second": 2.557,
+ "eval_steps_per_second": 0.32,
+ "step": 75
+ },
+ {
+ "epoch": 2.375,
+ "grad_norm": 0.21885464923925876,
+ "learning_rate": 2e-05,
+ "loss": 0.7942,
+ "step": 76
+ },
+ {
+ "epoch": 2.375,
+ "eval_loss": 0.7658494710922241,
+ "eval_runtime": 78.1746,
+ "eval_samples_per_second": 2.558,
+ "eval_steps_per_second": 0.32,
+ "step": 76
+ },
+ {
+ "epoch": 2.40625,
+ "grad_norm": 0.21717306953626966,
+ "learning_rate": 2e-05,
+ "loss": 0.8497,
+ "step": 77
+ },
+ {
+ "epoch": 2.40625,
+ "eval_loss": 0.7642120122909546,
+ "eval_runtime": 78.2026,
+ "eval_samples_per_second": 2.557,
+ "eval_steps_per_second": 0.32,
+ "step": 77
+ },
+ {
+ "epoch": 2.4375,
+ "grad_norm": 0.2530725583748258,
+ "learning_rate": 2e-05,
+ "loss": 0.8584,
+ "step": 78
+ },
+ {
+ "epoch": 2.4375,
+ "eval_loss": 0.7625510692596436,
+ "eval_runtime": 78.1991,
+ "eval_samples_per_second": 2.558,
+ "eval_steps_per_second": 0.32,
+ "step": 78
+ },
+ {
+ "epoch": 2.46875,
+ "grad_norm": 0.25354787036627263,
+ "learning_rate": 2e-05,
+ "loss": 0.8569,
+ "step": 79
+ },
+ {
+ "epoch": 2.46875,
+ "eval_loss": 0.7616268396377563,
+ "eval_runtime": 78.2915,
+ "eval_samples_per_second": 2.555,
+ "eval_steps_per_second": 0.319,
+ "step": 79
+ },
+ {
+ "epoch": 2.5,
+ "grad_norm": 0.2800865746664007,
+ "learning_rate": 2e-05,
+ "loss": 0.9116,
+ "step": 80
+ },
+ {
+ "epoch": 2.5,
+ "eval_loss": 0.7603214979171753,
+ "eval_runtime": 78.2749,
+ "eval_samples_per_second": 2.555,
+ "eval_steps_per_second": 0.319,
+ "step": 80
+ },
+ {
+ "epoch": 2.53125,
+ "grad_norm": 0.268139688449618,
+ "learning_rate": 2e-05,
+ "loss": 0.8397,
+ "step": 81
+ },
+ {
+ "epoch": 2.53125,
+ "eval_loss": 0.7584869265556335,
+ "eval_runtime": 79.1445,
+ "eval_samples_per_second": 2.527,
+ "eval_steps_per_second": 0.316,
+ "step": 81
+ },
+ {
+ "epoch": 2.5625,
+ "grad_norm": 0.3128648654463789,
+ "learning_rate": 2e-05,
+ "loss": 0.8888,
+ "step": 82
+ },
+ {
+ "epoch": 2.5625,
+ "eval_loss": 0.7566561102867126,
+ "eval_runtime": 79.2089,
+ "eval_samples_per_second": 2.525,
+ "eval_steps_per_second": 0.316,
+ "step": 82
+ },
+ {
+ "epoch": 2.59375,
+ "grad_norm": 0.2502355211215609,
+ "learning_rate": 2e-05,
+ "loss": 0.8346,
+ "step": 83
+ },
+ {
+ "epoch": 2.59375,
+ "eval_loss": 0.7547345161437988,
+ "eval_runtime": 79.2691,
+ "eval_samples_per_second": 2.523,
+ "eval_steps_per_second": 0.315,
+ "step": 83
+ },
+ {
+ "epoch": 2.625,
+ "grad_norm": 0.25281184629018644,
+ "learning_rate": 2e-05,
+ "loss": 0.795,
+ "step": 84
+ },
+ {
+ "epoch": 2.625,
+ "eval_loss": 0.7527951598167419,
+ "eval_runtime": 79.4068,
+ "eval_samples_per_second": 2.519,
+ "eval_steps_per_second": 0.315,
+ "step": 84
+ },
+ {
+ "epoch": 2.65625,
+ "grad_norm": 0.24246729562645003,
+ "learning_rate": 2e-05,
+ "loss": 0.7649,
+ "step": 85
+ },
+ {
+ "epoch": 2.65625,
+ "eval_loss": 0.7509815096855164,
+ "eval_runtime": 79.1612,
+ "eval_samples_per_second": 2.526,
+ "eval_steps_per_second": 0.316,
+ "step": 85
+ },
+ {
+ "epoch": 2.6875,
+ "grad_norm": 0.27005475109453947,
+ "learning_rate": 2e-05,
+ "loss": 0.7964,
+ "step": 86
+ },
+ {
+ "epoch": 2.6875,
+ "eval_loss": 0.7485950589179993,
+ "eval_runtime": 80.0714,
+ "eval_samples_per_second": 2.498,
+ "eval_steps_per_second": 0.312,
+ "step": 86
+ },
+ {
+ "epoch": 2.71875,
+ "grad_norm": 0.2723492355800971,
+ "learning_rate": 2e-05,
+ "loss": 0.8117,
+ "step": 87
+ },
+ {
+ "epoch": 2.71875,
+ "eval_loss": 0.7459420561790466,
+ "eval_runtime": 79.4075,
+ "eval_samples_per_second": 2.519,
+ "eval_steps_per_second": 0.315,
+ "step": 87
+ },
+ {
+ "epoch": 2.75,
+ "grad_norm": 0.2946493898427159,
+ "learning_rate": 2e-05,
+ "loss": 0.8986,
+ "step": 88
+ },
+ {
+ "epoch": 2.75,
+ "eval_loss": 0.7436455488204956,
+ "eval_runtime": 79.3721,
+ "eval_samples_per_second": 2.52,
+ "eval_steps_per_second": 0.315,
+ "step": 88
+ },
+ {
+ "epoch": 2.78125,
+ "grad_norm": 0.26411214734213284,
+ "learning_rate": 2e-05,
+ "loss": 0.8145,
+ "step": 89
+ },
+ {
+ "epoch": 2.78125,
+ "eval_loss": 0.7424752712249756,
+ "eval_runtime": 79.2988,
+ "eval_samples_per_second": 2.522,
+ "eval_steps_per_second": 0.315,
+ "step": 89
+ },
+ {
+ "epoch": 2.8125,
+ "grad_norm": 0.27115747269014817,
+ "learning_rate": 2e-05,
+ "loss": 0.8457,
+ "step": 90
+ },
+ {
+ "epoch": 2.8125,
+ "eval_loss": 0.7416408658027649,
+ "eval_runtime": 79.4004,
+ "eval_samples_per_second": 2.519,
+ "eval_steps_per_second": 0.315,
+ "step": 90
+ },
+ {
+ "epoch": 2.84375,
+ "grad_norm": 0.25831877964821937,
+ "learning_rate": 2e-05,
+ "loss": 0.7568,
+ "step": 91
+ },
+ {
+ "epoch": 2.84375,
+ "eval_loss": 0.7404463291168213,
+ "eval_runtime": 81.7767,
+ "eval_samples_per_second": 2.446,
+ "eval_steps_per_second": 0.306,
+ "step": 91
+ },
+ {
+ "epoch": 2.875,
+ "grad_norm": 0.31273388454942935,
+ "learning_rate": 2e-05,
+ "loss": 0.8562,
+ "step": 92
+ },
+ {
+ "epoch": 2.875,
+ "eval_loss": 0.7384185791015625,
+ "eval_runtime": 82.3443,
+ "eval_samples_per_second": 2.429,
+ "eval_steps_per_second": 0.304,
+ "step": 92
+ },
+ {
+ "epoch": 2.90625,
+ "grad_norm": 0.2838267071008901,
+ "learning_rate": 2e-05,
+ "loss": 0.7869,
+ "step": 93
+ },
+ {
+ "epoch": 2.90625,
+ "eval_loss": 0.7366807460784912,
+ "eval_runtime": 82.2622,
+ "eval_samples_per_second": 2.431,
+ "eval_steps_per_second": 0.304,
+ "step": 93
+ },
+ {
+ "epoch": 2.9375,
+ "grad_norm": 0.28625827941831467,
+ "learning_rate": 2e-05,
+ "loss": 0.8618,
+ "step": 94
+ },
+ {
+ "epoch": 2.9375,
+ "eval_loss": 0.7357398867607117,
+ "eval_runtime": 81.9471,
+ "eval_samples_per_second": 2.441,
+ "eval_steps_per_second": 0.305,
+ "step": 94
+ },
+ {
+ "epoch": 2.96875,
+ "grad_norm": 0.25548002643954326,
+ "learning_rate": 2e-05,
+ "loss": 0.8085,
+ "step": 95
+ },
+ {
+ "epoch": 2.96875,
+ "eval_loss": 0.7356534004211426,
+ "eval_runtime": 82.1186,
+ "eval_samples_per_second": 2.436,
+ "eval_steps_per_second": 0.304,
+ "step": 95
+ },
+ {
+ "epoch": 3.0,
+ "grad_norm": 0.27081450830961107,
+ "learning_rate": 2e-05,
+ "loss": 0.7684,
+ "step": 96
+ },
+ {
+ "epoch": 3.0,
+ "eval_loss": 0.7346957921981812,
+ "eval_runtime": 81.5463,
+ "eval_samples_per_second": 2.453,
+ "eval_steps_per_second": 0.307,
+ "step": 96
+ },
+ {
+ "epoch": 3.03125,
+ "grad_norm": 0.2985486737236676,
+ "learning_rate": 2e-05,
+ "loss": 0.7274,
+ "step": 97
+ },
+ {
+ "epoch": 3.03125,
+ "eval_loss": 0.7325752377510071,
+ "eval_runtime": 81.7804,
+ "eval_samples_per_second": 2.446,
+ "eval_steps_per_second": 0.306,
+ "step": 97
+ },
+ {
+ "epoch": 3.0625,
+ "grad_norm": 0.29149719690624026,
+ "learning_rate": 2e-05,
+ "loss": 0.8119,
+ "step": 98
+ },
+ {
+ "epoch": 3.0625,
+ "eval_loss": 0.7298976182937622,
+ "eval_runtime": 76.2764,
+ "eval_samples_per_second": 2.622,
+ "eval_steps_per_second": 0.328,
+ "step": 98
+ },
+ {
+ "epoch": 3.09375,
+ "grad_norm": 0.25227859825215865,
+ "learning_rate": 2e-05,
+ "loss": 0.7888,
+ "step": 99
+ },
+ {
+ "epoch": 3.09375,
+ "eval_loss": 0.727373480796814,
+ "eval_runtime": 76.2418,
+ "eval_samples_per_second": 2.623,
+ "eval_steps_per_second": 0.328,
+ "step": 99
+ },
+ {
+ "epoch": 3.125,
+ "grad_norm": 0.27316954971752555,
+ "learning_rate": 2e-05,
+ "loss": 0.8224,
+ "step": 100
+ },
+ {
+ "epoch": 3.125,
+ "eval_loss": 0.7254325747489929,
+ "eval_runtime": 76.1474,
+ "eval_samples_per_second": 2.626,
+ "eval_steps_per_second": 0.328,
+ "step": 100
+ },
+ {
+ "epoch": 3.15625,
+ "grad_norm": 0.24239788607957785,
+ "learning_rate": 2e-05,
+ "loss": 0.7535,
+ "step": 101
+ },
+ {
+ "epoch": 3.15625,
+ "eval_loss": 0.724058985710144,
+ "eval_runtime": 76.2391,
+ "eval_samples_per_second": 2.623,
+ "eval_steps_per_second": 0.328,
+ "step": 101
+ },
+ {
+ "epoch": 3.1875,
+ "grad_norm": 0.25648385925427025,
+ "learning_rate": 2e-05,
+ "loss": 0.8195,
+ "step": 102
+ },
+ {
+ "epoch": 3.1875,
+ "eval_loss": 0.7235870957374573,
+ "eval_runtime": 76.9134,
+ "eval_samples_per_second": 2.6,
+ "eval_steps_per_second": 0.325,
+ "step": 102
+ },
+ {
+ "epoch": 3.21875,
+ "grad_norm": 0.29620170789161204,
+ "learning_rate": 2e-05,
+ "loss": 0.8224,
+ "step": 103
+ },
+ {
+ "epoch": 3.21875,
+ "eval_loss": 0.7228152751922607,
+ "eval_runtime": 76.095,
+ "eval_samples_per_second": 2.628,
+ "eval_steps_per_second": 0.329,
+ "step": 103
+ },
+ {
+ "epoch": 3.25,
+ "grad_norm": 0.3484116181139593,
+ "learning_rate": 2e-05,
+ "loss": 0.7478,
+ "step": 104
+ },
+ {
+ "epoch": 3.25,
+ "eval_loss": 0.7209363579750061,
+ "eval_runtime": 76.9377,
+ "eval_samples_per_second": 2.6,
+ "eval_steps_per_second": 0.325,
+ "step": 104
+ },
+ {
+ "epoch": 3.28125,
+ "grad_norm": 0.25212350156184643,
+ "learning_rate": 2e-05,
+ "loss": 0.7885,
+ "step": 105
+ },
+ {
+ "epoch": 3.28125,
+ "eval_loss": 0.7197096347808838,
+ "eval_runtime": 76.2008,
+ "eval_samples_per_second": 2.625,
+ "eval_steps_per_second": 0.328,
+ "step": 105
+ },
+ {
+ "epoch": 3.3125,
+ "grad_norm": 0.264200147608962,
+ "learning_rate": 2e-05,
+ "loss": 0.8371,
+ "step": 106
+ },
+ {
+ "epoch": 3.3125,
+ "eval_loss": 0.7197055220603943,
+ "eval_runtime": 78.1542,
+ "eval_samples_per_second": 2.559,
+ "eval_steps_per_second": 0.32,
+ "step": 106
+ },
+ {
+ "epoch": 3.34375,
+ "grad_norm": 0.3309431084940201,
+ "learning_rate": 2e-05,
+ "loss": 0.6999,
+ "step": 107
+ },
+ {
+ "epoch": 3.34375,
+ "eval_loss": 0.7187016010284424,
+ "eval_runtime": 78.4259,
+ "eval_samples_per_second": 2.55,
+ "eval_steps_per_second": 0.319,
+ "step": 107
+ },
+ {
+ "epoch": 3.375,
+ "grad_norm": 0.3131644456919823,
+ "learning_rate": 2e-05,
+ "loss": 0.7587,
+ "step": 108
+ },
+ {
+ "epoch": 3.375,
+ "eval_loss": 0.717018187046051,
+ "eval_runtime": 78.4558,
+ "eval_samples_per_second": 2.549,
+ "eval_steps_per_second": 0.319,
+ "step": 108
+ },
+ {
+ "epoch": 3.40625,
+ "grad_norm": 0.33527684120780293,
+ "learning_rate": 2e-05,
+ "loss": 0.7468,
+ "step": 109
+ },
+ {
+ "epoch": 3.40625,
+ "eval_loss": 0.7147062420845032,
+ "eval_runtime": 78.2334,
+ "eval_samples_per_second": 2.556,
+ "eval_steps_per_second": 0.32,
+ "step": 109
+ },
+ {
+ "epoch": 3.4375,
+ "grad_norm": 0.29542683956231724,
+ "learning_rate": 2e-05,
+ "loss": 0.7477,
+ "step": 110
+ },
+ {
+ "epoch": 3.4375,
+ "eval_loss": 0.7130224704742432,
+ "eval_runtime": 79.1179,
+ "eval_samples_per_second": 2.528,
+ "eval_steps_per_second": 0.316,
+ "step": 110
+ },
+ {
+ "epoch": 3.46875,
+ "grad_norm": 0.31128698002926114,
+ "learning_rate": 2e-05,
+ "loss": 0.8153,
+ "step": 111
+ },
+ {
+ "epoch": 3.46875,
+ "eval_loss": 0.7120551466941833,
+ "eval_runtime": 80.292,
+ "eval_samples_per_second": 2.491,
+ "eval_steps_per_second": 0.311,
+ "step": 111
+ },
+ {
+ "epoch": 3.5,
+ "grad_norm": 0.32502558864214215,
+ "learning_rate": 2e-05,
+ "loss": 0.8043,
+ "step": 112
+ },
+ {
+ "epoch": 3.5,
+ "eval_loss": 0.7117202877998352,
+ "eval_runtime": 79.7539,
+ "eval_samples_per_second": 2.508,
+ "eval_steps_per_second": 0.313,
+ "step": 112
+ },
+ {
+ "epoch": 3.53125,
+ "grad_norm": 0.34335720855758517,
+ "learning_rate": 2e-05,
+ "loss": 0.871,
+ "step": 113
+ },
+ {
+ "epoch": 3.53125,
+ "eval_loss": 0.7117029428482056,
+ "eval_runtime": 80.0281,
+ "eval_samples_per_second": 2.499,
+ "eval_steps_per_second": 0.312,
+ "step": 113
+ },
+ {
+ "epoch": 3.5625,
+ "grad_norm": 0.31951931695644,
+ "learning_rate": 2e-05,
+ "loss": 0.7453,
+ "step": 114
+ },
+ {
+ "epoch": 3.5625,
+ "eval_loss": 0.7116554379463196,
+ "eval_runtime": 79.7209,
+ "eval_samples_per_second": 2.509,
+ "eval_steps_per_second": 0.314,
+ "step": 114
+ },
+ {
+ "epoch": 3.59375,
+ "grad_norm": 0.28067192963874266,
+ "learning_rate": 2e-05,
+ "loss": 0.8045,
+ "step": 115
+ },
+ {
+ "epoch": 3.59375,
+ "eval_loss": 0.7118353843688965,
+ "eval_runtime": 80.0195,
+ "eval_samples_per_second": 2.499,
+ "eval_steps_per_second": 0.312,
+ "step": 115
+ },
+ {
+ "epoch": 3.625,
+ "grad_norm": 0.2739718257400276,
+ "learning_rate": 2e-05,
+ "loss": 0.775,
+ "step": 116
+ },
+ {
+ "epoch": 3.625,
+ "eval_loss": 0.7122579216957092,
+ "eval_runtime": 76.2052,
+ "eval_samples_per_second": 2.624,
+ "eval_steps_per_second": 0.328,
+ "step": 116
+ },
+ {
+ "epoch": 3.65625,
+ "grad_norm": 0.31401723658881836,
+ "learning_rate": 2e-05,
+ "loss": 0.7826,
+ "step": 117
+ },
+ {
+ "epoch": 3.65625,
+ "eval_loss": 0.7118574380874634,
+ "eval_runtime": 76.1509,
+ "eval_samples_per_second": 2.626,
+ "eval_steps_per_second": 0.328,
+ "step": 117
+ },
+ {
+ "epoch": 3.6875,
+ "grad_norm": 0.36925964858634625,
+ "learning_rate": 2e-05,
+ "loss": 0.7884,
+ "step": 118
+ },
+ {
+ "epoch": 3.6875,
+ "eval_loss": 0.710691511631012,
+ "eval_runtime": 76.2305,
+ "eval_samples_per_second": 2.624,
+ "eval_steps_per_second": 0.328,
+ "step": 118
+ },
+ {
+ "epoch": 3.71875,
+ "grad_norm": 0.3050583880654791,
+ "learning_rate": 2e-05,
+ "loss": 0.8402,
+ "step": 119
+ },
+ {
+ "epoch": 3.71875,
+ "eval_loss": 0.7096763849258423,
+ "eval_runtime": 77.0581,
+ "eval_samples_per_second": 2.595,
+ "eval_steps_per_second": 0.324,
+ "step": 119
+ },
+ {
+ "epoch": 3.75,
+ "grad_norm": 0.2648625651290031,
+ "learning_rate": 2e-05,
+ "loss": 0.7889,
+ "step": 120
+ },
+ {
+ "epoch": 3.75,
+ "eval_loss": 0.7094223499298096,
+ "eval_runtime": 76.1379,
+ "eval_samples_per_second": 2.627,
+ "eval_steps_per_second": 0.328,
+ "step": 120
+ },
+ {
+ "epoch": 3.78125,
+ "grad_norm": 0.3107221696449271,
+ "learning_rate": 2e-05,
+ "loss": 0.7615,
+ "step": 121
+ },
+ {
+ "epoch": 3.78125,
+ "eval_loss": 0.7081363201141357,
+ "eval_runtime": 76.626,
+ "eval_samples_per_second": 2.61,
+ "eval_steps_per_second": 0.326,
+ "step": 121
+ },
+ {
+ "epoch": 3.8125,
+ "grad_norm": 0.3455151299995048,
+ "learning_rate": 2e-05,
+ "loss": 0.8342,
+ "step": 122
+ },
+ {
+ "epoch": 3.8125,
+ "eval_loss": 0.7063001990318298,
+ "eval_runtime": 77.0293,
+ "eval_samples_per_second": 2.596,
+ "eval_steps_per_second": 0.325,
+ "step": 122
+ },
+ {
+ "epoch": 3.84375,
+ "grad_norm": 0.28847071926472523,
+ "learning_rate": 2e-05,
+ "loss": 0.7477,
+ "step": 123
+ },
+ {
+ "epoch": 3.84375,
+ "eval_loss": 0.7044610381126404,
+ "eval_runtime": 76.2385,
+ "eval_samples_per_second": 2.623,
+ "eval_steps_per_second": 0.328,
+ "step": 123
+ },
+ {
+ "epoch": 3.875,
+ "grad_norm": 0.26753816515069856,
+ "learning_rate": 2e-05,
+ "loss": 0.7653,
+ "step": 124
+ },
+ {
+ "epoch": 3.875,
+ "eval_loss": 0.7033799886703491,
+ "eval_runtime": 76.1985,
+ "eval_samples_per_second": 2.625,
+ "eval_steps_per_second": 0.328,
+ "step": 124
+ },
+ {
+ "epoch": 3.90625,
+ "grad_norm": 0.3465046292893005,
+ "learning_rate": 2e-05,
+ "loss": 0.8144,
+ "step": 125
+ },
+ {
+ "epoch": 3.90625,
+ "eval_loss": 0.7021930813789368,
+ "eval_runtime": 76.2234,
+ "eval_samples_per_second": 2.624,
+ "eval_steps_per_second": 0.328,
+ "step": 125
+ },
+ {
+ "epoch": 3.9375,
+ "grad_norm": 0.3451690427620698,
+ "learning_rate": 2e-05,
+ "loss": 0.7871,
+ "step": 126
+ },
+ {
+ "epoch": 3.9375,
+ "eval_loss": 0.7013542652130127,
+ "eval_runtime": 78.0752,
+ "eval_samples_per_second": 2.562,
+ "eval_steps_per_second": 0.32,
+ "step": 126
+ },
+ {
+ "epoch": 3.96875,
+ "grad_norm": 0.31571858642673567,
+ "learning_rate": 2e-05,
+ "loss": 0.7568,
+ "step": 127
+ },
+ {
+ "epoch": 3.96875,
+ "eval_loss": 0.7007560729980469,
+ "eval_runtime": 78.3558,
+ "eval_samples_per_second": 2.552,
+ "eval_steps_per_second": 0.319,
+ "step": 127
+ },
+ {
+ "epoch": 4.0,
+ "grad_norm": 0.3247003540270338,
+ "learning_rate": 2e-05,
+ "loss": 0.6714,
+ "step": 128
+ },
+ {
+ "epoch": 4.0,
+ "eval_loss": 0.6999780535697937,
+ "eval_runtime": 78.9788,
+ "eval_samples_per_second": 2.532,
+ "eval_steps_per_second": 0.317,
+ "step": 128
+ },
+ {
+ "epoch": 4.03125,
+ "grad_norm": 0.2814983490019739,
+ "learning_rate": 2e-05,
+ "loss": 0.7797,
+ "step": 129
+ },
+ {
+ "epoch": 4.03125,
+ "eval_loss": 0.6998200416564941,
+ "eval_runtime": 78.3093,
+ "eval_samples_per_second": 2.554,
+ "eval_steps_per_second": 0.319,
+ "step": 129
+ },
+ {
+ "epoch": 4.0625,
+ "grad_norm": 0.31961631715145106,
+ "learning_rate": 2e-05,
+ "loss": 0.7993,
+ "step": 130
+ },
+ {
+ "epoch": 4.0625,
+ "eval_loss": 0.6995271444320679,
+ "eval_runtime": 78.2172,
+ "eval_samples_per_second": 2.557,
+ "eval_steps_per_second": 0.32,
+ "step": 130
+ },
+ {
+ "epoch": 4.09375,
+ "grad_norm": 0.32333364662215863,
+ "learning_rate": 2e-05,
+ "loss": 0.7896,
+ "step": 131
+ },
+ {
+ "epoch": 4.09375,
+ "eval_loss": 0.6992727518081665,
+ "eval_runtime": 79.0125,
+ "eval_samples_per_second": 2.531,
+ "eval_steps_per_second": 0.316,
+ "step": 131
+ },
+ {
+ "epoch": 4.125,
+ "grad_norm": 0.3255859640449829,
+ "learning_rate": 2e-05,
+ "loss": 0.7542,
+ "step": 132
+ },
+ {
+ "epoch": 4.125,
+ "eval_loss": 0.6988572478294373,
+ "eval_runtime": 79.0,
+ "eval_samples_per_second": 2.532,
+ "eval_steps_per_second": 0.316,
+ "step": 132
+ },
+ {
+ "epoch": 4.15625,
+ "grad_norm": 0.3307068947429175,
+ "learning_rate": 2e-05,
+ "loss": 0.8416,
+ "step": 133
+ },
+ {
+ "epoch": 4.15625,
+ "eval_loss": 0.6981343030929565,
+ "eval_runtime": 78.3309,
+ "eval_samples_per_second": 2.553,
+ "eval_steps_per_second": 0.319,
+ "step": 133
+ },
+ {
+ "epoch": 4.1875,
+ "grad_norm": 0.3842303818116732,
+ "learning_rate": 2e-05,
+ "loss": 0.7605,
+ "step": 134
+ },
+ {
+ "epoch": 4.1875,
+ "eval_loss": 0.6968980431556702,
+ "eval_runtime": 78.5608,
+ "eval_samples_per_second": 2.546,
+ "eval_steps_per_second": 0.318,
+ "step": 134
+ },
+ {
+ "epoch": 4.21875,
+ "grad_norm": 0.331839472419003,
+ "learning_rate": 2e-05,
+ "loss": 0.7643,
+ "step": 135
+ },
+ {
+ "epoch": 4.21875,
+ "eval_loss": 0.6955949664115906,
+ "eval_runtime": 78.3566,
+ "eval_samples_per_second": 2.552,
+ "eval_steps_per_second": 0.319,
+ "step": 135
+ },
+ {
+ "epoch": 4.25,
+ "grad_norm": 0.31864813130499836,
+ "learning_rate": 2e-05,
+ "loss": 0.7369,
+ "step": 136
+ },
+ {
+ "epoch": 4.25,
+ "eval_loss": 0.6951528787612915,
+ "eval_runtime": 79.7802,
+ "eval_samples_per_second": 2.507,
+ "eval_steps_per_second": 0.313,
+ "step": 136
+ },
+ {
+ "epoch": 4.28125,
+ "grad_norm": 0.352549164434451,
+ "learning_rate": 2e-05,
+ "loss": 0.7332,
+ "step": 137
+ },
+ {
+ "epoch": 4.28125,
+ "eval_loss": 0.6947290897369385,
+ "eval_runtime": 79.8171,
+ "eval_samples_per_second": 2.506,
+ "eval_steps_per_second": 0.313,
+ "step": 137
+ },
+ {
+ "epoch": 4.3125,
+ "grad_norm": 0.37128812818896284,
+ "learning_rate": 2e-05,
+ "loss": 0.7542,
+ "step": 138
+ },
+ {
+ "epoch": 4.3125,
+ "eval_loss": 0.6937370300292969,
+ "eval_runtime": 79.7782,
+ "eval_samples_per_second": 2.507,
+ "eval_steps_per_second": 0.313,
+ "step": 138
+ },
+ {
+ "epoch": 4.34375,
+ "grad_norm": 0.3348014941412048,
+ "learning_rate": 2e-05,
+ "loss": 0.7079,
+ "step": 139
+ },
+ {
+ "epoch": 4.34375,
+ "eval_loss": 0.692456066608429,
+ "eval_runtime": 79.9308,
+ "eval_samples_per_second": 2.502,
+ "eval_steps_per_second": 0.313,
+ "step": 139
+ },
+ {
+ "epoch": 4.375,
+ "grad_norm": 0.34411051658527964,
+ "learning_rate": 2e-05,
+ "loss": 0.7465,
+ "step": 140
+ },
+ {
+ "epoch": 4.375,
+ "eval_loss": 0.6915809512138367,
+ "eval_runtime": 79.943,
+ "eval_samples_per_second": 2.502,
+ "eval_steps_per_second": 0.313,
+ "step": 140
+ },
+ {
+ "epoch": 4.40625,
+ "grad_norm": 0.3373909601921749,
+ "learning_rate": 2e-05,
+ "loss": 0.7648,
+ "step": 141
+ },
+ {
+ "epoch": 4.40625,
+ "eval_loss": 0.6912103295326233,
+ "eval_runtime": 79.8515,
+ "eval_samples_per_second": 2.505,
+ "eval_steps_per_second": 0.313,
+ "step": 141
+ },
+ {
+ "epoch": 4.4375,
+ "grad_norm": 0.33253827371305456,
+ "learning_rate": 2e-05,
+ "loss": 0.7224,
+ "step": 142
+ },
+ {
+ "epoch": 4.4375,
+ "eval_loss": 0.6912806630134583,
+ "eval_runtime": 80.6475,
+ "eval_samples_per_second": 2.48,
+ "eval_steps_per_second": 0.31,
+ "step": 142
+ },
+ {
+ "epoch": 4.46875,
+ "grad_norm": 0.38458075172588313,
+ "learning_rate": 2e-05,
+ "loss": 0.7261,
+ "step": 143
+ },
+ {
+ "epoch": 4.46875,
+ "eval_loss": 0.6905419230461121,
+ "eval_runtime": 80.2606,
+ "eval_samples_per_second": 2.492,
+ "eval_steps_per_second": 0.311,
+ "step": 143
+ },
+ {
+ "epoch": 4.5,
+ "grad_norm": 0.31351962640463144,
+ "learning_rate": 2e-05,
+ "loss": 0.6909,
+ "step": 144
+ },
+ {
+ "epoch": 4.5,
+ "eval_loss": 0.6898491382598877,
+ "eval_runtime": 79.9965,
+ "eval_samples_per_second": 2.5,
+ "eval_steps_per_second": 0.313,
+ "step": 144
+ },
+ {
+ "epoch": 4.53125,
+ "grad_norm": 0.35474372115704583,
+ "learning_rate": 2e-05,
+ "loss": 0.7605,
+ "step": 145
+ },
+ {
+ "epoch": 4.53125,
+ "eval_loss": 0.6893147230148315,
+ "eval_runtime": 1475.5758,
+ "eval_samples_per_second": 0.136,
+ "eval_steps_per_second": 0.017,
+ "step": 145
+ },
+ {
+ "epoch": 4.5625,
+ "grad_norm": 0.3479568917421202,
+ "learning_rate": 2e-05,
+ "loss": 0.6638,
+ "step": 146
+ },
+ {
+ "epoch": 4.5625,
+ "eval_loss": 0.6884538531303406,
+ "eval_runtime": 84.6835,
+ "eval_samples_per_second": 2.362,
+ "eval_steps_per_second": 0.295,
+ "step": 146
+ },
+ {
+ "epoch": 4.59375,
+ "grad_norm": 0.3421823344428645,
+ "learning_rate": 2e-05,
+ "loss": 0.7339,
+ "step": 147
+ },
+ {
+ "epoch": 4.59375,
+ "eval_loss": 0.6873475909233093,
+ "eval_runtime": 83.3138,
+ "eval_samples_per_second": 2.401,
+ "eval_steps_per_second": 0.3,
+ "step": 147
+ },
+ {
+ "epoch": 4.625,
+ "grad_norm": 0.3642187020830788,
+ "learning_rate": 2e-05,
+ "loss": 0.6825,
+ "step": 148
+ },
+ {
+ "epoch": 4.625,
+ "eval_loss": 0.6858401298522949,
+ "eval_runtime": 82.1066,
+ "eval_samples_per_second": 2.436,
+ "eval_steps_per_second": 0.304,
+ "step": 148
+ },
+ {
+ "epoch": 4.65625,
+ "grad_norm": 0.35097547901391785,
+ "learning_rate": 2e-05,
+ "loss": 0.7986,
+ "step": 149
+ },
+ {
+ "epoch": 4.65625,
+ "eval_loss": 0.6848779320716858,
+ "eval_runtime": 84.4076,
+ "eval_samples_per_second": 2.369,
+ "eval_steps_per_second": 0.296,
+ "step": 149
+ },
+ {
+ "epoch": 4.6875,
+ "grad_norm": 0.3568694843794629,
+ "learning_rate": 2e-05,
+ "loss": 0.7176,
+ "step": 150
+ },
+ {
+ "epoch": 4.6875,
+ "eval_loss": 0.6842290759086609,
+ "eval_runtime": 82.5945,
+ "eval_samples_per_second": 2.421,
+ "eval_steps_per_second": 0.303,
+ "step": 150
+ },
+ {
+ "epoch": 4.71875,
+ "grad_norm": 0.34258633585260334,
+ "learning_rate": 2e-05,
+ "loss": 0.7363,
+ "step": 151
+ },
+ {
+ "epoch": 4.71875,
+ "eval_loss": 0.6838659048080444,
+ "eval_runtime": 85.9626,
+ "eval_samples_per_second": 2.327,
+ "eval_steps_per_second": 0.291,
+ "step": 151
+ },
+ {
+ "epoch": 4.75,
+ "grad_norm": 0.42319523894659655,
+ "learning_rate": 2e-05,
+ "loss": 0.7675,
+ "step": 152
+ },
+ {
+ "epoch": 4.75,
+ "eval_loss": 0.6830299496650696,
+ "eval_runtime": 85.7189,
+ "eval_samples_per_second": 2.333,
+ "eval_steps_per_second": 0.292,
+ "step": 152
+ },
+ {
+ "epoch": 4.78125,
+ "grad_norm": 0.3632195533127194,
+ "learning_rate": 2e-05,
+ "loss": 0.715,
+ "step": 153
+ },
+ {
+ "epoch": 4.78125,
+ "eval_loss": 0.6826379895210266,
+ "eval_runtime": 87.8244,
+ "eval_samples_per_second": 2.277,
+ "eval_steps_per_second": 0.285,
+ "step": 153
+ },
+ {
+ "epoch": 4.8125,
+ "grad_norm": 0.3738308004604413,
+ "learning_rate": 2e-05,
+ "loss": 0.7344,
+ "step": 154
+ },
+ {
+ "epoch": 4.8125,
+ "eval_loss": 0.6826817393302917,
+ "eval_runtime": 86.5822,
+ "eval_samples_per_second": 2.31,
+ "eval_steps_per_second": 0.289,
+ "step": 154
+ },
+ {
+ "epoch": 4.84375,
+ "grad_norm": 0.3618696330632776,
+ "learning_rate": 2e-05,
+ "loss": 0.6632,
+ "step": 155
+ },
+ {
+ "epoch": 4.84375,
+ "eval_loss": 0.6827967166900635,
+ "eval_runtime": 82.1829,
+ "eval_samples_per_second": 2.434,
+ "eval_steps_per_second": 0.304,
+ "step": 155
+ },
+ {
+ "epoch": 4.875,
+ "grad_norm": 0.38901912569992203,
+ "learning_rate": 2e-05,
+ "loss": 0.7788,
+ "step": 156
+ },
+ {
+ "epoch": 4.875,
+ "eval_loss": 0.6821711659431458,
+ "eval_runtime": 84.4511,
+ "eval_samples_per_second": 2.368,
+ "eval_steps_per_second": 0.296,
+ "step": 156
+ },
+ {
+ "epoch": 4.90625,
+ "grad_norm": 0.3516096507348829,
+ "learning_rate": 2e-05,
+ "loss": 0.7794,
+ "step": 157
+ },
+ {
+ "epoch": 4.90625,
+ "eval_loss": 0.6819837689399719,
+ "eval_runtime": 84.1594,
+ "eval_samples_per_second": 2.376,
+ "eval_steps_per_second": 0.297,
+ "step": 157
+ },
+ {
+ "epoch": 4.9375,
+ "grad_norm": 0.36066902463794986,
+ "learning_rate": 2e-05,
+ "loss": 0.7674,
+ "step": 158
+ },
+ {
+ "epoch": 4.9375,
+ "eval_loss": 0.6817716956138611,
+ "eval_runtime": 83.8929,
+ "eval_samples_per_second": 2.384,
+ "eval_steps_per_second": 0.298,
+ "step": 158
+ },
+ {
+ "epoch": 4.96875,
+ "grad_norm": 0.36641784926154175,
+ "learning_rate": 2e-05,
+ "loss": 0.7116,
+ "step": 159
+ },
+ {
+ "epoch": 4.96875,
+ "eval_loss": 0.6816902160644531,
+ "eval_runtime": 84.4431,
+ "eval_samples_per_second": 2.368,
+ "eval_steps_per_second": 0.296,
+ "step": 159
+ },
+ {
+ "epoch": 5.0,
+ "grad_norm": 0.4020716293225933,
+ "learning_rate": 2e-05,
+ "loss": 0.7142,
+ "step": 160
+ },
+ {
+ "epoch": 5.0,
+ "eval_loss": 0.6811469793319702,
+ "eval_runtime": 86.0681,
+ "eval_samples_per_second": 2.324,
+ "eval_steps_per_second": 0.29,
+ "step": 160
+ },
+ {
+ "epoch": 5.03125,
+ "grad_norm": 0.38360882669254054,
+ "learning_rate": 2e-05,
+ "loss": 0.6756,
+ "step": 161
+ },
+ {
+ "epoch": 5.03125,
+ "eval_loss": 0.6798409223556519,
+ "eval_runtime": 81.9903,
+ "eval_samples_per_second": 2.439,
+ "eval_steps_per_second": 0.305,
+ "step": 161
+ },
+ {
+ "epoch": 5.0625,
+ "grad_norm": 0.34966156213066135,
+ "learning_rate": 2e-05,
+ "loss": 0.827,
+ "step": 162
+ },
+ {
+ "epoch": 5.0625,
+ "eval_loss": 0.6788859367370605,
+ "eval_runtime": 76.1753,
+ "eval_samples_per_second": 2.626,
+ "eval_steps_per_second": 0.328,
+ "step": 162
+ },
+ {
+ "epoch": 5.09375,
+ "grad_norm": 0.41140842939901384,
+ "learning_rate": 2e-05,
+ "loss": 0.6409,
+ "step": 163
+ },
+ {
+ "epoch": 5.09375,
+ "eval_loss": 0.6787077188491821,
+ "eval_runtime": 76.2239,
+ "eval_samples_per_second": 2.624,
+ "eval_steps_per_second": 0.328,
+ "step": 163
+ },
+ {
+ "epoch": 5.125,
+ "grad_norm": 0.4222084070163774,
+ "learning_rate": 2e-05,
+ "loss": 0.7774,
+ "step": 164
+ },
+ {
+ "epoch": 5.125,
+ "eval_loss": 0.6796822547912598,
+ "eval_runtime": 76.2141,
+ "eval_samples_per_second": 2.624,
+ "eval_steps_per_second": 0.328,
+ "step": 164
+ },
+ {
+ "epoch": 5.15625,
+ "grad_norm": 0.4644454724424921,
+ "learning_rate": 2e-05,
+ "loss": 0.6057,
+ "step": 165
+ },
+ {
+ "epoch": 5.15625,
+ "eval_loss": 0.6794346570968628,
+ "eval_runtime": 76.3216,
+ "eval_samples_per_second": 2.62,
+ "eval_steps_per_second": 0.328,
+ "step": 165
+ },
+ {
+ "epoch": 5.1875,
+ "grad_norm": 0.46128725263272996,
+ "learning_rate": 2e-05,
+ "loss": 0.7158,
+ "step": 166
+ },
+ {
+ "epoch": 5.1875,
+ "eval_loss": 0.6791612505912781,
+ "eval_runtime": 78.4909,
+ "eval_samples_per_second": 2.548,
+ "eval_steps_per_second": 0.319,
+ "step": 166
+ },
+ {
+ "epoch": 5.21875,
+ "grad_norm": 0.37300666872025545,
+ "learning_rate": 2e-05,
+ "loss": 0.7363,
+ "step": 167
+ },
+ {
+ "epoch": 5.21875,
+ "eval_loss": 0.6788016557693481,
+ "eval_runtime": 78.5697,
+ "eval_samples_per_second": 2.546,
+ "eval_steps_per_second": 0.318,
+ "step": 167
+ },
+ {
+ "epoch": 5.25,
+ "grad_norm": 0.41454648576180214,
+ "learning_rate": 2e-05,
+ "loss": 0.7759,
+ "step": 168
+ },
+ {
+ "epoch": 5.25,
+ "eval_loss": 0.6787048578262329,
+ "eval_runtime": 78.5317,
+ "eval_samples_per_second": 2.547,
+ "eval_steps_per_second": 0.318,
+ "step": 168
+ },
+ {
+ "epoch": 5.28125,
+ "grad_norm": 0.40724665091386236,
+ "learning_rate": 2e-05,
+ "loss": 0.6944,
+ "step": 169
+ },
+ {
+ "epoch": 5.28125,
+ "eval_loss": 0.679679811000824,
+ "eval_runtime": 78.6899,
+ "eval_samples_per_second": 2.542,
+ "eval_steps_per_second": 0.318,
+ "step": 169
+ },
+ {
+ "epoch": 5.3125,
+ "grad_norm": 0.3875110486208986,
+ "learning_rate": 2e-05,
+ "loss": 0.6634,
+ "step": 170
+ },
+ {
+ "epoch": 5.3125,
+ "eval_loss": 0.6819935441017151,
+ "eval_runtime": 78.3617,
+ "eval_samples_per_second": 2.552,
+ "eval_steps_per_second": 0.319,
+ "step": 170
+ },
+ {
+ "epoch": 5.34375,
+ "grad_norm": 0.47956532155617193,
+ "learning_rate": 2e-05,
+ "loss": 0.687,
+ "step": 171
+ },
+ {
+ "epoch": 5.34375,
+ "eval_loss": 0.6825206875801086,
+ "eval_runtime": 78.4435,
+ "eval_samples_per_second": 2.55,
+ "eval_steps_per_second": 0.319,
+ "step": 171
+ },
+ {
+ "epoch": 5.375,
+ "grad_norm": 0.4599359590587781,
+ "learning_rate": 2e-05,
+ "loss": 0.7718,
+ "step": 172
+ },
+ {
+ "epoch": 5.375,
+ "eval_loss": 0.6816768050193787,
+ "eval_runtime": 78.3005,
+ "eval_samples_per_second": 2.554,
+ "eval_steps_per_second": 0.319,
+ "step": 172
+ },
+ {
+ "epoch": 5.40625,
+ "grad_norm": 0.4057490487995386,
+ "learning_rate": 2e-05,
+ "loss": 0.7292,
+ "step": 173
+ },
+ {
+ "epoch": 5.40625,
+ "eval_loss": 0.6806090474128723,
+ "eval_runtime": 78.3313,
+ "eval_samples_per_second": 2.553,
+ "eval_steps_per_second": 0.319,
+ "step": 173
+ },
+ {
+ "epoch": 5.4375,
+ "grad_norm": 0.4143979315360467,
+ "learning_rate": 2e-05,
+ "loss": 0.7697,
+ "step": 174
+ },
+ {
+ "epoch": 5.4375,
+ "eval_loss": 0.6795693039894104,
+ "eval_runtime": 78.4526,
+ "eval_samples_per_second": 2.549,
+ "eval_steps_per_second": 0.319,
+ "step": 174
+ },
+ {
+ "epoch": 5.46875,
+ "grad_norm": 0.4219663662343445,
+ "learning_rate": 2e-05,
+ "loss": 0.7534,
+ "step": 175
+ },
+ {
+ "epoch": 5.46875,
+ "eval_loss": 0.6793847680091858,
+ "eval_runtime": 78.8009,
+ "eval_samples_per_second": 2.538,
+ "eval_steps_per_second": 0.317,
+ "step": 175
+ },
+ {
+ "epoch": 5.5,
+ "grad_norm": 0.4491811321927657,
+ "learning_rate": 2e-05,
+ "loss": 0.7004,
+ "step": 176
+ },
+ {
+ "epoch": 5.5,
+ "eval_loss": 0.6775352358818054,
+ "eval_runtime": 80.0685,
+ "eval_samples_per_second": 2.498,
+ "eval_steps_per_second": 0.312,
+ "step": 176
+ },
+ {
+ "epoch": 5.53125,
+ "grad_norm": 0.46366516532638885,
+ "learning_rate": 2e-05,
+ "loss": 0.7357,
+ "step": 177
+ },
+ {
+ "epoch": 5.53125,
+ "eval_loss": 0.6748698949813843,
+ "eval_runtime": 80.0487,
+ "eval_samples_per_second": 2.498,
+ "eval_steps_per_second": 0.312,
+ "step": 177
+ },
+ {
+ "epoch": 5.5625,
+ "grad_norm": 0.3815188640227797,
+ "learning_rate": 2e-05,
+ "loss": 0.7592,
+ "step": 178
+ },
+ {
+ "epoch": 5.5625,
+ "eval_loss": 0.6728273034095764,
+ "eval_runtime": 80.0318,
+ "eval_samples_per_second": 2.499,
+ "eval_steps_per_second": 0.312,
+ "step": 178
+ },
+ {
+ "epoch": 5.59375,
+ "grad_norm": 0.41025429416666304,
+ "learning_rate": 2e-05,
+ "loss": 0.6585,
+ "step": 179
+ },
+ {
+ "epoch": 5.59375,
+ "eval_loss": 0.6718859672546387,
+ "eval_runtime": 79.8801,
+ "eval_samples_per_second": 2.504,
+ "eval_steps_per_second": 0.313,
+ "step": 179
+ },
+ {
+ "epoch": 5.625,
+ "grad_norm": 0.40652817592240054,
+ "learning_rate": 2e-05,
+ "loss": 0.6611,
+ "step": 180
+ },
+ {
+ "epoch": 5.625,
+ "eval_loss": 0.6715708374977112,
+ "eval_runtime": 76.7261,
+ "eval_samples_per_second": 2.607,
+ "eval_steps_per_second": 0.326,
+ "step": 180
+ },
+ {
+ "epoch": 5.65625,
+ "grad_norm": 0.40753961326688415,
+ "learning_rate": 2e-05,
+ "loss": 0.6779,
+ "step": 181
+ },
+ {
+ "epoch": 5.65625,
+ "eval_loss": 0.6719761490821838,
+ "eval_runtime": 77.0136,
+ "eval_samples_per_second": 2.597,
+ "eval_steps_per_second": 0.325,
+ "step": 181
+ },
+ {
+ "epoch": 5.6875,
+ "grad_norm": 0.4232811980671673,
+ "learning_rate": 2e-05,
+ "loss": 0.6475,
+ "step": 182
+ },
+ {
+ "epoch": 5.6875,
+ "eval_loss": 0.6724664568901062,
+ "eval_runtime": 76.9731,
+ "eval_samples_per_second": 2.598,
+ "eval_steps_per_second": 0.325,
+ "step": 182
+ },
+ {
+ "epoch": 5.71875,
+ "grad_norm": 0.5132756318549849,
+ "learning_rate": 2e-05,
+ "loss": 0.6801,
+ "step": 183
+ },
+ {
+ "epoch": 5.71875,
+ "eval_loss": 0.6723365783691406,
+ "eval_runtime": 76.4132,
+ "eval_samples_per_second": 2.617,
+ "eval_steps_per_second": 0.327,
+ "step": 183
+ },
+ {
+ "epoch": 5.75,
+ "grad_norm": 0.43526879230161264,
+ "learning_rate": 2e-05,
+ "loss": 0.6673,
+ "step": 184
+ },
+ {
+ "epoch": 5.75,
+ "eval_loss": 0.672926664352417,
+ "eval_runtime": 76.1936,
+ "eval_samples_per_second": 2.625,
+ "eval_steps_per_second": 0.328,
+ "step": 184
+ },
+ {
+ "epoch": 5.78125,
+ "grad_norm": 0.46965560853038507,
+ "learning_rate": 2e-05,
+ "loss": 0.7074,
+ "step": 185
+ },
+ {
+ "epoch": 5.78125,
+ "eval_loss": 0.6731134057044983,
+ "eval_runtime": 76.2345,
+ "eval_samples_per_second": 2.623,
+ "eval_steps_per_second": 0.328,
+ "step": 185
+ },
+ {
+ "epoch": 5.8125,
+ "grad_norm": 0.4733296318676217,
+ "learning_rate": 2e-05,
+ "loss": 0.6791,
+ "step": 186
+ },
+ {
+ "epoch": 5.8125,
+ "eval_loss": 0.6726363301277161,
+ "eval_runtime": 78.3939,
+ "eval_samples_per_second": 2.551,
+ "eval_steps_per_second": 0.319,
+ "step": 186
+ },
+ {
+ "epoch": 5.84375,
+ "grad_norm": 0.4662943253655961,
+ "learning_rate": 2e-05,
+ "loss": 0.7371,
+ "step": 187
+ },
+ {
+ "epoch": 5.84375,
+ "eval_loss": 0.6726526021957397,
+ "eval_runtime": 79.1834,
+ "eval_samples_per_second": 2.526,
+ "eval_steps_per_second": 0.316,
+ "step": 187
+ },
+ {
+ "epoch": 5.875,
+ "grad_norm": 0.4420962889993382,
+ "learning_rate": 2e-05,
+ "loss": 0.675,
+ "step": 188
+ },
+ {
+ "epoch": 5.875,
+ "eval_loss": 0.6727125644683838,
+ "eval_runtime": 78.252,
+ "eval_samples_per_second": 2.556,
+ "eval_steps_per_second": 0.319,
+ "step": 188
+ },
+ {
+ "epoch": 5.90625,
+ "grad_norm": 0.4345166976944551,
+ "learning_rate": 2e-05,
+ "loss": 0.6748,
+ "step": 189
+ },
+ {
+ "epoch": 5.90625,
+ "eval_loss": 0.6725904941558838,
+ "eval_runtime": 78.3914,
+ "eval_samples_per_second": 2.551,
+ "eval_steps_per_second": 0.319,
+ "step": 189
+ },
+ {
+ "epoch": 5.9375,
+ "grad_norm": 0.45109463315374526,
+ "learning_rate": 2e-05,
+ "loss": 0.7024,
+ "step": 190
+ },
+ {
+ "epoch": 5.9375,
+ "eval_loss": 0.6718384027481079,
+ "eval_runtime": 78.4361,
+ "eval_samples_per_second": 2.55,
+ "eval_steps_per_second": 0.319,
+ "step": 190
+ },
+ {
+ "epoch": 5.96875,
+ "grad_norm": 0.42953871838795626,
+ "learning_rate": 2e-05,
+ "loss": 0.6904,
+ "step": 191
+ },
+ {
+ "epoch": 5.96875,
+ "eval_loss": 0.6703083515167236,
+ "eval_runtime": 78.3863,
+ "eval_samples_per_second": 2.551,
+ "eval_steps_per_second": 0.319,
+ "step": 191
+ },
+ {
+ "epoch": 6.0,
+ "grad_norm": 0.4248607379284984,
+ "learning_rate": 2e-05,
+ "loss": 0.6659,
+ "step": 192
+ },
+ {
+ "epoch": 6.0,
+ "eval_loss": 0.6693080067634583,
+ "eval_runtime": 78.4373,
+ "eval_samples_per_second": 2.55,
+ "eval_steps_per_second": 0.319,
+ "step": 192
+ },
+ {
+ "epoch": 6.03125,
+ "grad_norm": 0.42839417453459494,
+ "learning_rate": 2e-05,
+ "loss": 0.7457,
+ "step": 193
+ },
+ {
+ "epoch": 6.03125,
+ "eval_loss": 0.6689594984054565,
+ "eval_runtime": 78.4169,
+ "eval_samples_per_second": 2.55,
+ "eval_steps_per_second": 0.319,
+ "step": 193
+ },
+ {
+ "epoch": 6.0625,
+ "grad_norm": 0.4216922788166874,
+ "learning_rate": 2e-05,
+ "loss": 0.7189,
+ "step": 194
+ },
+ {
+ "epoch": 6.0625,
+ "eval_loss": 0.6689300537109375,
+ "eval_runtime": 78.9793,
+ "eval_samples_per_second": 2.532,
+ "eval_steps_per_second": 0.317,
+ "step": 194
+ },
+ {
+ "epoch": 6.09375,
+ "grad_norm": 0.45199575791858004,
+ "learning_rate": 2e-05,
+ "loss": 0.6438,
+ "step": 195
+ },
+ {
+ "epoch": 6.09375,
+ "eval_loss": 0.6690151691436768,
+ "eval_runtime": 78.5002,
+ "eval_samples_per_second": 2.548,
+ "eval_steps_per_second": 0.318,
+ "step": 195
+ },
+ {
+ "epoch": 6.125,
+ "grad_norm": 0.4166923177293841,
+ "learning_rate": 2e-05,
+ "loss": 0.6885,
+ "step": 196
+ },
+ {
+ "epoch": 6.125,
+ "eval_loss": 0.6688613891601562,
+ "eval_runtime": 80.5497,
+ "eval_samples_per_second": 2.483,
+ "eval_steps_per_second": 0.31,
+ "step": 196
+ },
+ {
+ "epoch": 6.15625,
+ "grad_norm": 0.45164281863366285,
+ "learning_rate": 2e-05,
+ "loss": 0.7197,
+ "step": 197
+ },
+ {
+ "epoch": 6.15625,
+ "eval_loss": 0.6687932014465332,
+ "eval_runtime": 80.1482,
+ "eval_samples_per_second": 2.495,
+ "eval_steps_per_second": 0.312,
+ "step": 197
+ },
+ {
+ "epoch": 6.1875,
+ "grad_norm": 0.45653924787504446,
+ "learning_rate": 2e-05,
+ "loss": 0.776,
+ "step": 198
+ },
+ {
+ "epoch": 6.1875,
+ "eval_loss": 0.6690963506698608,
+ "eval_runtime": 80.4464,
+ "eval_samples_per_second": 2.486,
+ "eval_steps_per_second": 0.311,
+ "step": 198
+ },
+ {
+ "epoch": 6.21875,
+ "grad_norm": 0.4966562341334706,
+ "learning_rate": 2e-05,
+ "loss": 0.6532,
+ "step": 199
+ },
+ {
+ "epoch": 6.21875,
+ "eval_loss": 0.669116735458374,
+ "eval_runtime": 79.8294,
+ "eval_samples_per_second": 2.505,
+ "eval_steps_per_second": 0.313,
+ "step": 199
+ },
+ {
+ "epoch": 6.25,
+ "grad_norm": 0.4838469303220975,
+ "learning_rate": 2e-05,
+ "loss": 0.6883,
+ "step": 200
+ },
+ {
+ "epoch": 6.25,
+ "eval_loss": 0.6693156957626343,
+ "eval_runtime": 80.25,
+ "eval_samples_per_second": 2.492,
+ "eval_steps_per_second": 0.312,
+ "step": 200
+ },
+ {
+ "epoch": 6.28125,
+ "grad_norm": 0.4836820906895964,
+ "learning_rate": 2e-05,
+ "loss": 0.7106,
+ "step": 201
+ },
+ {
+ "epoch": 6.28125,
+ "eval_loss": 0.6704170107841492,
+ "eval_runtime": 79.9636,
+ "eval_samples_per_second": 2.501,
+ "eval_steps_per_second": 0.313,
+ "step": 201
+ },
+ {
+ "epoch": 6.3125,
+ "grad_norm": 0.4945855983140219,
+ "learning_rate": 2e-05,
+ "loss": 0.6336,
+ "step": 202
+ },
+ {
+ "epoch": 6.3125,
+ "eval_loss": 0.6708824038505554,
+ "eval_runtime": 80.8044,
+ "eval_samples_per_second": 2.475,
+ "eval_steps_per_second": 0.309,
+ "step": 202
+ },
+ {
+ "epoch": 6.34375,
+ "grad_norm": 0.44587847230103017,
+ "learning_rate": 2e-05,
+ "loss": 0.7811,
+ "step": 203
+ },
+ {
+ "epoch": 6.34375,
+ "eval_loss": 0.6723968982696533,
+ "eval_runtime": 80.1715,
+ "eval_samples_per_second": 2.495,
+ "eval_steps_per_second": 0.312,
+ "step": 203
+ },
+ {
+ "epoch": 6.375,
+ "grad_norm": 0.5351063503195825,
+ "learning_rate": 2e-05,
+ "loss": 0.6222,
+ "step": 204
+ },
+ {
+ "epoch": 6.375,
+ "eval_loss": 0.672196626663208,
+ "eval_runtime": 79.927,
+ "eval_samples_per_second": 2.502,
+ "eval_steps_per_second": 0.313,
+ "step": 204
+ },
+ {
+ "epoch": 6.40625,
+ "grad_norm": 0.4742985088010474,
+ "learning_rate": 2e-05,
+ "loss": 0.6157,
+ "step": 205
+ },
+ {
+ "epoch": 6.40625,
+ "eval_loss": 0.671062171459198,
+ "eval_runtime": 80.1997,
+ "eval_samples_per_second": 2.494,
+ "eval_steps_per_second": 0.312,
+ "step": 205
+ },
+ {
+ "epoch": 6.4375,
+ "grad_norm": 0.5188882333349506,
+ "learning_rate": 2e-05,
+ "loss": 0.6462,
+ "step": 206
+ },
+ {
+ "epoch": 6.4375,
+ "eval_loss": 0.6701972484588623,
+ "eval_runtime": 81.6643,
+ "eval_samples_per_second": 2.449,
+ "eval_steps_per_second": 0.306,
+ "step": 206
+ },
+ {
+ "epoch": 6.46875,
+ "grad_norm": 0.45328063593983603,
+ "learning_rate": 2e-05,
+ "loss": 0.7058,
+ "step": 207
+ },
+ {
+ "epoch": 6.46875,
+ "eval_loss": 0.6699164509773254,
+ "eval_runtime": 81.2228,
+ "eval_samples_per_second": 2.462,
+ "eval_steps_per_second": 0.308,
+ "step": 207
+ },
+ {
+ "epoch": 6.5,
+ "grad_norm": 0.5197645538332801,
+ "learning_rate": 2e-05,
+ "loss": 0.6462,
+ "step": 208
+ },
+ {
+ "epoch": 6.5,
+ "eval_loss": 0.6702597141265869,
+ "eval_runtime": 81.1451,
+ "eval_samples_per_second": 2.465,
+ "eval_steps_per_second": 0.308,
+ "step": 208
+ },
+ {
+ "epoch": 6.53125,
+ "grad_norm": 0.5762528184834232,
+ "learning_rate": 2e-05,
+ "loss": 0.6259,
+ "step": 209
+ },
+ {
+ "epoch": 6.53125,
+ "eval_loss": 0.6696366667747498,
+ "eval_runtime": 81.1643,
+ "eval_samples_per_second": 2.464,
+ "eval_steps_per_second": 0.308,
+ "step": 209
+ },
+ {
+ "epoch": 6.5625,
+ "grad_norm": 0.5249503180293145,
+ "learning_rate": 2e-05,
+ "loss": 0.6045,
+ "step": 210
+ },
+ {
+ "epoch": 6.5625,
+ "eval_loss": 0.6688054800033569,
+ "eval_runtime": 80.9492,
+ "eval_samples_per_second": 2.471,
+ "eval_steps_per_second": 0.309,
+ "step": 210
+ },
+ {
+ "epoch": 6.59375,
+ "grad_norm": 0.543503888655844,
+ "learning_rate": 2e-05,
+ "loss": 0.6496,
+ "step": 211
+ },
+ {
+ "epoch": 6.59375,
+ "eval_loss": 0.6689916849136353,
+ "eval_runtime": 81.6473,
+ "eval_samples_per_second": 2.45,
+ "eval_steps_per_second": 0.306,
+ "step": 211
+ },
+ {
+ "epoch": 6.625,
+ "grad_norm": 0.48119553592193554,
+ "learning_rate": 2e-05,
+ "loss": 0.6211,
+ "step": 212
+ },
+ {
+ "epoch": 6.625,
+ "eval_loss": 0.6703050136566162,
+ "eval_runtime": 81.9207,
+ "eval_samples_per_second": 2.441,
+ "eval_steps_per_second": 0.305,
+ "step": 212
+ },
+ {
+ "epoch": 6.65625,
+ "grad_norm": 0.5153356086819314,
+ "learning_rate": 2e-05,
+ "loss": 0.7135,
+ "step": 213
+ },
+ {
+ "epoch": 6.65625,
+ "eval_loss": 0.6702842116355896,
+ "eval_runtime": 81.1503,
+ "eval_samples_per_second": 2.465,
+ "eval_steps_per_second": 0.308,
+ "step": 213
+ },
+ {
+ "epoch": 6.6875,
+ "grad_norm": 0.5249915042825578,
+ "learning_rate": 2e-05,
+ "loss": 0.6635,
+ "step": 214
+ },
+ {
+ "epoch": 6.6875,
+ "eval_loss": 0.6687333583831787,
+ "eval_runtime": 81.6743,
+ "eval_samples_per_second": 2.449,
+ "eval_steps_per_second": 0.306,
+ "step": 214
+ },
+ {
+ "epoch": 6.71875,
+ "grad_norm": 0.5204840219868723,
+ "learning_rate": 2e-05,
+ "loss": 0.6701,
+ "step": 215
+ },
+ {
+ "epoch": 6.71875,
+ "eval_loss": 0.6657728552818298,
+ "eval_runtime": 81.106,
+ "eval_samples_per_second": 2.466,
+ "eval_steps_per_second": 0.308,
+ "step": 215
+ },
+ {
+ "epoch": 6.75,
+ "grad_norm": 0.5266935225120133,
+ "learning_rate": 2e-05,
+ "loss": 0.6637,
+ "step": 216
+ },
+ {
+ "epoch": 6.75,
+ "eval_loss": 0.6641908884048462,
+ "eval_runtime": 82.2613,
+ "eval_samples_per_second": 2.431,
+ "eval_steps_per_second": 0.304,
+ "step": 216
+ },
+ {
+ "epoch": 6.78125,
+ "grad_norm": 0.5438859451742696,
+ "learning_rate": 2e-05,
+ "loss": 0.6168,
+ "step": 217
+ },
+ {
+ "epoch": 6.78125,
+ "eval_loss": 0.6652233600616455,
+ "eval_runtime": 82.042,
+ "eval_samples_per_second": 2.438,
+ "eval_steps_per_second": 0.305,
+ "step": 217
+ },
+ {
+ "epoch": 6.8125,
+ "grad_norm": 0.5716385253433929,
+ "learning_rate": 2e-05,
+ "loss": 0.6062,
+ "step": 218
+ },
+ {
+ "epoch": 6.8125,
+ "eval_loss": 0.6656240820884705,
+ "eval_runtime": 81.233,
+ "eval_samples_per_second": 2.462,
+ "eval_steps_per_second": 0.308,
+ "step": 218
+ },
+ {
+ "epoch": 6.84375,
+ "grad_norm": 1.0572787630142522,
+ "learning_rate": 2e-05,
+ "loss": 0.7037,
+ "step": 219
+ },
+ {
+ "epoch": 6.84375,
+ "eval_loss": 0.6645559072494507,
+ "eval_runtime": 81.2099,
+ "eval_samples_per_second": 2.463,
+ "eval_steps_per_second": 0.308,
+ "step": 219
+ },
+ {
+ "epoch": 6.875,
+ "grad_norm": 0.5924889323251107,
+ "learning_rate": 2e-05,
+ "loss": 0.712,
+ "step": 220
+ },
+ {
+ "epoch": 6.875,
+ "eval_loss": 0.6619111895561218,
+ "eval_runtime": 81.7826,
+ "eval_samples_per_second": 2.446,
+ "eval_steps_per_second": 0.306,
+ "step": 220
+ },
+ {
+ "epoch": 6.90625,
+ "grad_norm": 0.5290576915218269,
+ "learning_rate": 2e-05,
+ "loss": 0.6659,
+ "step": 221
+ },
+ {
+ "epoch": 6.90625,
+ "eval_loss": 0.6609540581703186,
+ "eval_runtime": 82.9922,
+ "eval_samples_per_second": 2.41,
+ "eval_steps_per_second": 0.301,
+ "step": 221
+ },
+ {
+ "epoch": 6.9375,
+ "grad_norm": 0.5831209517049147,
+ "learning_rate": 2e-05,
+ "loss": 0.6547,
+ "step": 222
+ },
+ {
+ "epoch": 6.9375,
+ "eval_loss": 0.660676896572113,
+ "eval_runtime": 83.6541,
+ "eval_samples_per_second": 2.391,
+ "eval_steps_per_second": 0.299,
+ "step": 222
+ },
+ {
+ "epoch": 6.96875,
+ "grad_norm": 0.5320966369511158,
+ "learning_rate": 2e-05,
+ "loss": 0.6968,
+ "step": 223
+ },
+ {
+ "epoch": 6.96875,
+ "eval_loss": 0.6618594527244568,
+ "eval_runtime": 83.1148,
+ "eval_samples_per_second": 2.406,
+ "eval_steps_per_second": 0.301,
+ "step": 223
+ },
+ {
+ "epoch": 7.0,
+ "grad_norm": 0.5829636446837394,
+ "learning_rate": 2e-05,
+ "loss": 0.7407,
+ "step": 224
+ },
+ {
+ "epoch": 7.0,
+ "eval_loss": 0.6635661125183105,
+ "eval_runtime": 82.8183,
+ "eval_samples_per_second": 2.415,
+ "eval_steps_per_second": 0.302,
+ "step": 224
+ },
+ {
+ "epoch": 7.03125,
+ "grad_norm": 0.4975095056459566,
+ "learning_rate": 2e-05,
+ "loss": 0.6535,
+ "step": 225
+ },
+ {
+ "epoch": 7.03125,
+ "eval_loss": 0.6641671657562256,
+ "eval_runtime": 83.0267,
+ "eval_samples_per_second": 2.409,
+ "eval_steps_per_second": 0.301,
+ "step": 225
+ },
+ {
+ "epoch": 7.0625,
+ "grad_norm": 0.5625698523064815,
+ "learning_rate": 2e-05,
+ "loss": 0.6012,
+ "step": 226
+ },
+ {
+ "epoch": 7.0625,
+ "eval_loss": 0.6639044880867004,
+ "eval_runtime": 83.3881,
+ "eval_samples_per_second": 2.398,
+ "eval_steps_per_second": 0.3,
+ "step": 226
+ },
+ {
+ "epoch": 7.09375,
+ "grad_norm": 0.5436196850683295,
+ "learning_rate": 2e-05,
+ "loss": 0.6485,
+ "step": 227
+ },
+ {
+ "epoch": 7.09375,
+ "eval_loss": 0.6651788353919983,
+ "eval_runtime": 82.7096,
+ "eval_samples_per_second": 2.418,
+ "eval_steps_per_second": 0.302,
+ "step": 227
+ },
+ {
+ "epoch": 7.125,
+ "grad_norm": 0.5598906287609361,
+ "learning_rate": 2e-05,
+ "loss": 0.6142,
+ "step": 228
+ },
+ {
+ "epoch": 7.125,
+ "eval_loss": 0.6688636541366577,
+ "eval_runtime": 82.601,
+ "eval_samples_per_second": 2.421,
+ "eval_steps_per_second": 0.303,
+ "step": 228
+ },
+ {
+ "epoch": 7.15625,
+ "grad_norm": 0.7572979310697923,
+ "learning_rate": 2e-05,
+ "loss": 0.6221,
+ "step": 229
+ },
+ {
+ "epoch": 7.15625,
+ "eval_loss": 0.6699694991111755,
+ "eval_runtime": 82.6032,
+ "eval_samples_per_second": 2.421,
+ "eval_steps_per_second": 0.303,
+ "step": 229
+ },
+ {
+ "epoch": 7.1875,
+ "grad_norm": 0.6173309690580897,
+ "learning_rate": 2e-05,
+ "loss": 0.5919,
+ "step": 230
+ },
+ {
+ "epoch": 7.1875,
+ "eval_loss": 0.6706527471542358,
+ "eval_runtime": 82.9732,
+ "eval_samples_per_second": 2.41,
+ "eval_steps_per_second": 0.301,
+ "step": 230
+ },
+ {
+ "epoch": 7.21875,
+ "grad_norm": 0.643241771517866,
+ "learning_rate": 2e-05,
+ "loss": 0.7081,
+ "step": 231
+ },
+ {
+ "epoch": 7.21875,
+ "eval_loss": 0.6700320243835449,
+ "eval_runtime": 84.5621,
+ "eval_samples_per_second": 2.365,
+ "eval_steps_per_second": 0.296,
+ "step": 231
+ },
+ {
+ "epoch": 7.25,
+ "grad_norm": 0.577638137570571,
+ "learning_rate": 2e-05,
+ "loss": 0.6873,
+ "step": 232
+ },
+ {
+ "epoch": 7.25,
+ "eval_loss": 0.669111430644989,
+ "eval_runtime": 84.5124,
+ "eval_samples_per_second": 2.367,
+ "eval_steps_per_second": 0.296,
+ "step": 232
+ },
+ {
+ "epoch": 7.28125,
+ "grad_norm": 0.7229488296023369,
+ "learning_rate": 2e-05,
+ "loss": 0.6301,
+ "step": 233
+ },
+ {
+ "epoch": 7.28125,
+ "eval_loss": 0.6664154529571533,
+ "eval_runtime": 84.6437,
+ "eval_samples_per_second": 2.363,
+ "eval_steps_per_second": 0.295,
+ "step": 233
+ },
+ {
+ "epoch": 7.3125,
+ "grad_norm": 0.5827815449039045,
+ "learning_rate": 2e-05,
+ "loss": 0.669,
+ "step": 234
+ },
+ {
+ "epoch": 7.3125,
+ "eval_loss": 0.6641202569007874,
+ "eval_runtime": 84.489,
+ "eval_samples_per_second": 2.367,
+ "eval_steps_per_second": 0.296,
+ "step": 234
+ },
+ {
+ "epoch": 7.34375,
+ "grad_norm": 0.57507354017269,
+ "learning_rate": 2e-05,
+ "loss": 0.6474,
+ "step": 235
+ },
+ {
+ "epoch": 7.34375,
+ "eval_loss": 0.6623325347900391,
+ "eval_runtime": 84.5536,
+ "eval_samples_per_second": 2.365,
+ "eval_steps_per_second": 0.296,
+ "step": 235
+ },
+ {
+ "epoch": 7.375,
+ "grad_norm": 0.5810844862533651,
+ "learning_rate": 2e-05,
+ "loss": 0.6048,
+ "step": 236
+ },
+ {
+ "epoch": 7.375,
+ "eval_loss": 0.6619194746017456,
+ "eval_runtime": 84.2296,
+ "eval_samples_per_second": 2.374,
+ "eval_steps_per_second": 0.297,
+ "step": 236
+ },
+ {
+ "epoch": 7.40625,
+ "grad_norm": 0.6075032415813726,
+ "learning_rate": 2e-05,
+ "loss": 0.6529,
+ "step": 237
+ },
+ {
+ "epoch": 7.40625,
+ "eval_loss": 0.6626202464103699,
+ "eval_runtime": 84.9703,
+ "eval_samples_per_second": 2.354,
+ "eval_steps_per_second": 0.294,
+ "step": 237
+ },
+ {
+ "epoch": 7.4375,
+ "grad_norm": 0.6402642234375245,
+ "learning_rate": 2e-05,
+ "loss": 0.6433,
+ "step": 238
+ },
+ {
+ "epoch": 7.4375,
+ "eval_loss": 0.663289487361908,
+ "eval_runtime": 84.8924,
+ "eval_samples_per_second": 2.356,
+ "eval_steps_per_second": 0.294,
+ "step": 238
+ },
+ {
+ "epoch": 7.46875,
+ "grad_norm": 0.6335996982657431,
+ "learning_rate": 2e-05,
+ "loss": 0.6815,
+ "step": 239
+ },
+ {
+ "epoch": 7.46875,
+ "eval_loss": 0.6636109948158264,
+ "eval_runtime": 85.0551,
+ "eval_samples_per_second": 2.351,
+ "eval_steps_per_second": 0.294,
+ "step": 239
+ },
+ {
+ "epoch": 7.5,
+ "grad_norm": 0.5796846795848909,
+ "learning_rate": 2e-05,
+ "loss": 0.6236,
+ "step": 240
+ },
+ {
+ "epoch": 7.5,
+ "eval_loss": 0.6652829051017761,
+ "eval_runtime": 84.7574,
+ "eval_samples_per_second": 2.36,
+ "eval_steps_per_second": 0.295,
+ "step": 240
+ },
+ {
+ "epoch": 7.53125,
+ "grad_norm": 0.5380402145760035,
+ "learning_rate": 2e-05,
+ "loss": 0.6564,
+ "step": 241
+ },
+ {
+ "epoch": 7.53125,
+ "eval_loss": 0.6676375865936279,
+ "eval_runtime": 86.2058,
+ "eval_samples_per_second": 2.32,
+ "eval_steps_per_second": 0.29,
+ "step": 241
+ },
+ {
+ "epoch": 7.5625,
+ "grad_norm": 0.5964298255824012,
+ "learning_rate": 2e-05,
+ "loss": 0.6475,
+ "step": 242
+ },
+ {
+ "epoch": 7.5625,
+ "eval_loss": 0.6698520183563232,
+ "eval_runtime": 85.8955,
+ "eval_samples_per_second": 2.328,
+ "eval_steps_per_second": 0.291,
+ "step": 242
+ },
+ {
+ "epoch": 7.59375,
+ "grad_norm": 0.561279296875,
+ "learning_rate": 2e-05,
+ "loss": 0.6395,
+ "step": 243
+ },
+ {
+ "epoch": 7.59375,
+ "eval_loss": 0.6705803871154785,
+ "eval_runtime": 86.0036,
+ "eval_samples_per_second": 2.325,
+ "eval_steps_per_second": 0.291,
+ "step": 243
+ },
+ {
+ "epoch": 7.625,
+ "grad_norm": 0.6757292755073548,
+ "learning_rate": 2e-05,
+ "loss": 0.7074,
+ "step": 244
+ },
+ {
+ "epoch": 7.625,
+ "eval_loss": 0.6679538488388062,
+ "eval_runtime": 85.5379,
+ "eval_samples_per_second": 2.338,
+ "eval_steps_per_second": 0.292,
+ "step": 244
+ },
+ {
+ "epoch": 7.65625,
+ "grad_norm": 0.659077163070129,
+ "learning_rate": 2e-05,
+ "loss": 0.6078,
+ "step": 245
+ },
+ {
+ "epoch": 7.65625,
+ "eval_loss": 0.6667564511299133,
+ "eval_runtime": 85.752,
+ "eval_samples_per_second": 2.332,
+ "eval_steps_per_second": 0.292,
+ "step": 245
+ },
+ {
+ "epoch": 7.6875,
+ "grad_norm": 0.6215405566454576,
+ "learning_rate": 2e-05,
+ "loss": 0.6603,
+ "step": 246
+ },
+ {
+ "epoch": 7.6875,
+ "eval_loss": 0.665945291519165,
+ "eval_runtime": 92.3086,
+ "eval_samples_per_second": 2.167,
+ "eval_steps_per_second": 0.271,
+ "step": 246
+ },
+ {
+ "epoch": 7.71875,
+ "grad_norm": 0.6130534921490498,
+ "learning_rate": 2e-05,
+ "loss": 0.6435,
+ "step": 247
+ },
+ {
+ "epoch": 7.71875,
+ "eval_loss": 0.6661685109138489,
+ "eval_runtime": 87.1917,
+ "eval_samples_per_second": 2.294,
+ "eval_steps_per_second": 0.287,
+ "step": 247
+ },
+ {
+ "epoch": 7.75,
+ "grad_norm": 0.6025415602868736,
+ "learning_rate": 2e-05,
+ "loss": 0.6308,
+ "step": 248
+ },
+ {
+ "epoch": 7.75,
+ "eval_loss": 0.6658704280853271,
+ "eval_runtime": 86.8233,
+ "eval_samples_per_second": 2.304,
+ "eval_steps_per_second": 0.288,
+ "step": 248
+ },
+ {
+ "epoch": 7.78125,
+ "grad_norm": 0.6901593792019413,
+ "learning_rate": 2e-05,
+ "loss": 0.6777,
+ "step": 249
+ },
+ {
+ "epoch": 7.78125,
+ "eval_loss": 0.6652414202690125,
+ "eval_runtime": 86.7625,
+ "eval_samples_per_second": 2.305,
+ "eval_steps_per_second": 0.288,
+ "step": 249
+ },
+ {
+ "epoch": 7.8125,
+ "grad_norm": 0.6436454697341579,
+ "learning_rate": 2e-05,
+ "loss": 0.6912,
+ "step": 250
+ },
+ {
+ "epoch": 7.8125,
+ "eval_loss": 0.6654212474822998,
+ "eval_runtime": 86.871,
+ "eval_samples_per_second": 2.302,
+ "eval_steps_per_second": 0.288,
+ "step": 250
+ },
+ {
+ "epoch": 7.84375,
+ "grad_norm": 0.649040103024529,
+ "learning_rate": 2e-05,
+ "loss": 0.6025,
+ "step": 251
+ },
+ {
+ "epoch": 7.84375,
+ "eval_loss": 0.6654068231582642,
+ "eval_runtime": 86.7458,
+ "eval_samples_per_second": 2.306,
+ "eval_steps_per_second": 0.288,
+ "step": 251
+ },
+ {
+ "epoch": 7.875,
+ "grad_norm": 0.6595522131680224,
+ "learning_rate": 2e-05,
+ "loss": 0.5973,
+ "step": 252
+ },
+ {
+ "epoch": 7.875,
+ "eval_loss": 0.6644830107688904,
+ "eval_runtime": 86.8739,
+ "eval_samples_per_second": 2.302,
+ "eval_steps_per_second": 0.288,
+ "step": 252
+ },
+ {
+ "epoch": 7.90625,
+ "grad_norm": 0.6689891717273936,
+ "learning_rate": 2e-05,
+ "loss": 0.687,
+ "step": 253
+ },
+ {
+ "epoch": 7.90625,
+ "eval_loss": 0.6616199612617493,
+ "eval_runtime": 86.8222,
+ "eval_samples_per_second": 2.304,
+ "eval_steps_per_second": 0.288,
+ "step": 253
+ },
+ {
+ "epoch": 7.9375,
+ "grad_norm": 0.6306846778314292,
+ "learning_rate": 2e-05,
+ "loss": 0.6599,
+ "step": 254
+ },
+ {
+ "epoch": 7.9375,
+ "eval_loss": 0.6592965126037598,
+ "eval_runtime": 86.8577,
+ "eval_samples_per_second": 2.303,
+ "eval_steps_per_second": 0.288,
+ "step": 254
+ },
+ {
+ "epoch": 7.96875,
+ "grad_norm": 0.6021327993890785,
+ "learning_rate": 2e-05,
+ "loss": 0.575,
+ "step": 255
+ },
+ {
+ "epoch": 7.96875,
+ "eval_loss": 0.6580593585968018,
+ "eval_runtime": 86.7582,
+ "eval_samples_per_second": 2.305,
+ "eval_steps_per_second": 0.288,
+ "step": 255
+ },
+ {
+ "epoch": 8.0,
+ "grad_norm": 0.6174712675568311,
+ "learning_rate": 2e-05,
+ "loss": 0.6341,
+ "step": 256
+ },
+ {
+ "epoch": 8.0,
+ "eval_loss": 0.6575854420661926,
+ "eval_runtime": 76.7634,
+ "eval_samples_per_second": 2.605,
+ "eval_steps_per_second": 0.326,
+ "step": 256
+ }
+ ],
+ "logging_steps": 1.0,
+ "max_steps": 256,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 8,
+ "save_steps": 5,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 489287119011840.0,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-256/training_args.bin b/checkpoint-256/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..80ada675dd1830db80d38411cbdff13cd138ca48
--- /dev/null
+++ b/checkpoint-256/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f188470aed89e37f0d7f17497d5475eb84bc639c6ba047e7db9629674c365735
+size 8312
diff --git a/checkpoint-256/zero_to_fp32.py b/checkpoint-256/zero_to_fp32.py
new file mode 100755
index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8
--- /dev/null
+++ b/checkpoint-256/zero_to_fp32.py
@@ -0,0 +1,604 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
@dataclass
class zero_model_state:
    """Contents of one ``*_model_states.pt`` file, as collected by ``parse_model_states``."""
    # buffer name -> tensor (``parse_model_states`` converts each one with ``.float()``)
    buffers: dict
    # sequence of ``{param name: shape}`` mappings; the merge helpers iterate / zip over it
    param_shapes: list
    # list of ``[key, value]`` pairs taken from the checkpoint's "shared_params" mapping
    shared_params: list
    # value stored under DS_VERSION in the checkpoint, or None when absent
    # NOTE(review): annotated ``int`` in the original; the actual stored type is not visible here
    ds_version: int
    # ``{param name: shape}`` for frozen parameters, or None when the checkpoint has none
    frozen_param_shapes: dict
    # ``{param name: tensor fragment}`` for frozen parameters, or None when absent
    frozen_param_fragments: dict
+
+
# Verbosity switch read by the helpers below; the command-line entry point
# overwrites it with the value of ``--debug``.
debug = 0

# Every checkpoint file is loaded with ``map_location=device``, i.e. onto the CPU.
device = torch.device("cpu")
+
+
def atoi(text):
    """
    Return ``int(text)`` when ``text`` consists solely of decimal digits, otherwise ``text`` unchanged.

    ``str.isdecimal`` is used rather than ``str.isdigit``: the latter is also true for characters
    such as superscript digits ('²'), which ``int()`` rejects with ``ValueError``.
    """
    return int(text) if text.isdecimal() else text


def natural_keys(text):
    '''
    alist.sort(key=natural_keys) sorts in human order
    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)

    The string is split on runs of digits (the runs are kept), and each run is turned into an
    ``int`` so that e.g. "rank_2" sorts before "rank_10".
    '''
    return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
def get_model_state_file(checkpoint_dir, zero_stage):
    """
    Return the path of the rank-0 model states file inside ``checkpoint_dir``.

    Args:
        - ``checkpoint_dir``: directory that holds the ``*_model_states.pt`` files
        - ``zero_stage``: ZeRO stage of the checkpoint; it selects the expected file name

    Returns:
        - path of the model states file

    Raises:
        - ``FileNotFoundError``: ``checkpoint_dir`` is not a directory, or the expected file is missing
        - ``ValueError``: ``zero_stage`` is greater than 3 (previously this surfaced as an
          ``UnboundLocalError`` because no file name was ever chosen)
    """
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")

    # there should be only one file
    if zero_stage <= 2:
        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
    elif zero_stage == 3:
        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    if not os.path.exists(file):
        raise FileNotFoundError(f"can't find model states file at '{file}'")

    return file
+
+
def get_checkpoint_files(checkpoint_dir, glob_pattern):
    """
    Collect the files in ``checkpoint_dir`` that match ``glob_pattern``, in natural (human) order.

    Raises:
        - ``FileNotFoundError``: nothing matched the pattern
    """
    # XXX: need to test that this simple glob rule works for multi-node setup too
    matches = glob.glob(os.path.join(checkpoint_dir, glob_pattern))
    matches.sort(key=natural_keys)

    if not matches:
        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")

    return matches
+
+
def get_optim_files(checkpoint_dir):
    """Return the naturally sorted ``*_optim_states.pt`` files found in ``checkpoint_dir``."""
    optim_pattern = "*_optim_states.pt"
    return get_checkpoint_files(checkpoint_dir, optim_pattern)
+
+
def get_model_state_files(checkpoint_dir):
    """Return the naturally sorted ``*_model_states.pt`` files found in ``checkpoint_dir``."""
    model_pattern = "*_model_states.pt"
    return get_checkpoint_files(checkpoint_dir, model_pattern)
+
+
def parse_model_states(files):
    """
    Load every model states file and extract the pieces needed for weight reconstruction.

    Args:
        - ``files``: paths of ``*_model_states.pt`` files

    Returns:
        - list with one ``zero_model_state`` per entry of ``files``, in the same order

    Raises:
        - ``ValueError``: a file has no ``BUFFER_NAMES`` entry, i.e. it is not a model state checkpoint
    """
    zero_model_states = []
    for file in files:
        # NOTE(review): torch.load unpickles the file, so only point this at checkpoints you trust.
        state_dict = torch.load(file, map_location=device)

        if BUFFER_NAMES not in state_dict:
            raise ValueError(f"{file} is not a model state checkpoint")
        buffer_names = state_dict[BUFFER_NAMES]
        if debug:
            print("Found buffers:", buffer_names)

        # recover just the buffers while restoring them to fp32 if they were saved in fp16
        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
        param_shapes = state_dict[PARAM_SHAPES]

        # frozen parameters are optional; both entries stay None when the checkpoint lacks them
        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
        if debug and frozen_param_shapes is not None:
            print(f"Found frozen_param_shapes: {frozen_param_shapes}")

        # handle shared params
        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]

        ds_version = state_dict.get(DS_VERSION, None)

        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)

        z_model_state = zero_model_state(buffers=buffers,
                                         param_shapes=param_shapes,
                                         shared_params=shared_params,
                                         ds_version=ds_version,
                                         frozen_param_shapes=frozen_param_shapes,
                                         frozen_param_fragments=frozen_param_fragments)
        zero_model_states.append(z_model_state)

    return zero_model_states
+
+
def parse_optim_states(files, ds_checkpoint_dir):
    """
    Load the optimizer state files and pull out the flat fp32 weight groups.

    Args:
        - ``files``: paths of the ``*_optim_states.pt`` files, one per rank
        - ``ds_checkpoint_dir``: checkpoint folder, used only in the error message

    Returns:
        - tuple ``(zero_stage, world_size, fp32_flat_groups)`` where ``fp32_flat_groups`` has one
          entry per file: the stored list of groups for stage <= 2, a single concatenated tensor
          for stage 3

    Raises:
        - ``ValueError``: the first file has no ZeRO stage entry, the number of files differs from
          the recorded partition count, or the stage is not one of the supported values
    """
    total_files = len(files)
    state_dicts = []
    for f in files:
        # NOTE(review): torch.load unpickles the file, so only point this at checkpoints you trust.
        state_dict = torch.load(f, map_location=device)
        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
        # and also handle the case where it was already removed by another helper script
        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
        state_dicts.append(state_dict)

    first_optim_state = state_dicts[0][OPTIMIZER_STATE_DICT]
    if ZERO_STAGE not in first_optim_state:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = first_optim_state[ZERO_STAGE]
    world_size = first_optim_state[PARTITION_COUNT]

    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.
    if isinstance(world_size, list):
        world_size = max(world_size)

    if world_size != total_files:
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )

    # the groups are named differently in each stage
    if zero_stage <= 2:
        fp32_flat_groups = [sd[OPTIMIZER_STATE_DICT][SINGLE_PARTITION_OF_FP32_GROUPS] for sd in state_dicts]
    elif zero_stage == 3:
        # if there is more than one param group, there will be multiple flattened tensors - one
        # flattened tensor per group - for simplicity merge them into a single tensor
        #
        # XXX: could make the script more memory efficient for when there are multiple groups - it
        # will require matching the sub-lists of param_shapes for each param group flattened tensor
        fp32_flat_groups = [torch.cat(sd[OPTIMIZER_STATE_DICT][FP32_FLAT_GROUPS], 0) for sd in state_dicts]
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    return zero_stage, world_size, fp32_flat_groups
+
+
def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
    """
    Returns fp32 state_dict reconstructed from ds checkpoint

    Args:
        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
        - ``exclude_frozen_parameters``: when true, frozen parameters are left out of the result

    Raises:
        - ``ValueError``: the detected ZeRO stage has no reconstruction routine (previously the
          function would have silently returned ``None`` in that case)
    """
    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")

    optim_files = get_optim_files(ds_checkpoint_dir)
    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")

    model_files = get_model_state_files(ds_checkpoint_dir)

    zero_model_states = parse_model_states(model_files)
    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')

    if zero_stage <= 2:
        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                                          exclude_frozen_parameters)
    if zero_stage == 3:
        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                                          exclude_frozen_parameters)

    raise ValueError(f"unknown zero stage {zero_stage}")
+
+
def _zero2_merge_frozen_params(state_dict, zero_model_states):
    """
    Copy the frozen parameters recorded by rank 0 into ``state_dict`` (ZeRO stage <= 2).

    Each fragment stored under a frozen parameter's name is used as-is. Nothing happens when
    rank 0 recorded no frozen parameter shapes.
    """
    rank0 = zero_model_states[0]
    shapes = rank0.frozen_param_shapes
    if shapes is None or len(shapes) == 0:
        return

    fragments = rank0.frozen_param_fragments

    if debug:
        rank0_numel = sum(shape.numel() for shape in shapes.values())
        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {rank0_numel}')

    need_params = len(shapes)
    need_numel = sum(shape.numel() for shape in shapes.values())
    have_numel = sum([fragment.numel() for fragment in fragments.values()])
    print(f'Frozen params: Have {have_numel} numels to process.')
    print(f'Frozen params: Need {need_numel} numels in {need_params} params')

    param_count = 0
    element_count = 0
    for name, shape in shapes.items():
        numel = shape.numel()
        param_count += 1
        element_count += numel

        state_dict[name] = fragments[name]

        if debug:
            print(f"{name} full shape: {shape} unpartitioned numel {numel} ")

    print(f"Reconstructed Frozen fp32 state dict with {param_count} params {element_count} elements")
+
+
+def _has_callable(obj, fn):
+ attr = getattr(obj, fn, None)
+ return callable(attr)
+
+
def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """
    Rebuild the trainable parameters of a ZeRO stage <= 2 checkpoint and store them in ``state_dict``.

    For every param group the per-rank partitions are concatenated into one flat tensor, which
    is then sliced parameter by parameter following rank 0's ``param_shapes``.

    Args:
        - ``state_dict``: mapping that receives the ``name -> tensor`` entries (updated in place)
        - ``world_size``: number of ranks; the padding alignment that is checked is ``2 * world_size``
        - ``fp32_flat_groups``: one entry per rank, each indexable by param-group position
        - ``zero_model_states``: parsed model states; only element 0's ``param_shapes`` is read

    Raises:
        - ``ValueError``: for some group, the aligned number of consumed elements differs from the
          aligned size of the merged flat tensor
    """
    group_shapes = zero_model_states[0].param_shapes
    group_count = len(fp32_flat_groups[0])

    # Reconstruction protocol:
    #
    # XXX: document this

    if debug:
        for rank in range(world_size):
            for group in range(group_count):
                print(f"{FP32_FLAT_GROUPS}[{rank}][{group}].shape={fp32_flat_groups[rank][group].shape}")

    # XXX: memory usage doubles here (zero2)
    merged_groups = [
        torch.cat([rank_groups[group] for rank_groups in fp32_flat_groups], 0) for group in range(group_count)
    ]

    if debug:
        have_numel = sum(flat.numel() for flat in merged_groups)
        need_params = sum(len(shapes) for shapes in group_shapes)
        need_numel = sum(sum(shape.numel() for shape in shapes.values()) for shapes in group_shapes)
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {have_numel} numels to process.")
        print(f"Need {need_numel} numels in {need_params} params.")

    # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
    # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
    # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
    # live optimizer object, so we are checking that the numbers are within the right range
    pad_unit = 2 * world_size

    def round_up_to_pad_unit(x):
        return pad_unit * math.ceil(x / pad_unit)

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    param_count = 0
    element_count = 0
    for shapes, flat in zip(group_shapes, merged_groups):
        cursor = 0
        for name, shape in shapes.items():
            # a shape is either an object exposing numel() or a plain sequence of dimensions
            numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
            element_count += numel
            param_count += 1

            if debug:
                print(f"{name} full shape: {shape} unpartitioned numel {numel} ")
            state_dict[name] = flat.narrow(0, cursor, numel).view(shape)
            cursor += numel

        group_numel = flat.numel()
        if debug:
            print(f"original offset={cursor}, avail_numel={group_numel}")

        consumed = round_up_to_pad_unit(cursor)
        available = round_up_to_pad_unit(group_numel)

        if debug:
            print(f"aligned offset={consumed}, avail_numel={available}")

        # Sanity check
        if consumed != available:
            raise ValueError(f"consumed {consumed} numels out of {available} - something is wrong")

    print(f"Reconstructed fp32 state dict with {param_count} params {element_count} elements")
+
+
def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """
    Assemble the full fp32 ``state_dict`` of a ZeRO stage <= 2 checkpoint.

    Entries are added in this order: rank 0's buffers, frozen parameters (unless
    ``exclude_frozen_parameters`` is true), trainable parameters, and finally aliases for shared
    parameters whose source entry is present.
    """
    rank0 = zero_model_states[0]
    result = OrderedDict()

    # buffers
    result.update(rank0.buffers)
    if debug:
        print(f"added {len(rank0.buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero2_merge_frozen_params(result, zero_model_states)

    _zero2_merge_trainable_params(result, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters
    for pair in rank0.shared_params:
        alias, source = pair[0], pair[1]
        if source in result:
            result[alias] = result[source]

    return result
+
+
def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """
    Compute how a parameter of ``unpartitioned_numel`` elements is split across ``world_size`` ranks.

    Returns:
        - tuple ``(partitioned_numel, padding_numel)``: the per-rank element count (rounded up) and
          the number of padding elements needed to make the total divisible by ``world_size``
    """
    remainder = unpartitioned_numel % world_size
    padding_numel = (world_size - remainder) if remainder else 0
    # exact integer ceil-division; math.ceil(a / b) goes through a float and is inexact above 2**53
    partitioned_numel = -(-unpartitioned_numel // world_size)
    return partitioned_numel, padding_numel
+
+
def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
    """
    Rebuild the frozen parameters of a ZeRO stage 3 checkpoint and store them in ``state_dict``.

    For each frozen parameter the fragments of all ranks are concatenated, trimmed to the
    parameter's real element count (dropping trailing padding) and viewed with its full shape.
    Nothing happens when rank 0 recorded no frozen parameter shapes.
    """
    rank0 = zero_model_states[0]
    if rank0.frozen_param_shapes is None or len(rank0.frozen_param_shapes) == 0:
        return

    if debug:
        for rank in range(world_size):
            rank_numel = sum(frag.numel() for frag in zero_model_states[rank].frozen_param_fragments.values())
            print(f'rank {rank}: {FROZEN_PARAM_SHAPES}.numel = {rank_numel}')

    shapes = rank0.frozen_param_shapes
    need_params = len(shapes)
    need_numel = sum(shape.numel() for shape in shapes.values())
    have_numel = sum([frag.numel() for frag in rank0.frozen_param_fragments.values()]) * world_size
    print(f'Frozen params: Have {have_numel} numels to process.')
    print(f'Frozen params: Need {need_numel} numels in {need_params} params')

    param_count = 0
    element_count = 0
    for name, shape in shapes.items():
        numel = shape.numel()
        param_count += 1
        element_count += numel

        per_rank_fragments = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
        joined = torch.cat(per_rank_fragments, 0)
        state_dict[name] = joined.narrow(0, 0, numel).view(shape)

        per_rank_numel, pad_numel = zero3_partitioned_param_info(numel, world_size)

        if debug:
            print(
                f"Frozen params: {param_count} {name} full shape: {shape} partition0 numel={per_rank_numel} partitioned_padding_numel={pad_numel}"
            )

    print(f"Reconstructed Frozen fp32 state dict with {param_count} params {element_count} elements")
+
+
def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """
    Rebuild the trainable parameters of a ZeRO stage 3 checkpoint and store them in ``state_dict``.

    Reconstruction protocol: for zero3 the partitions are zipped together at the boundary of each
    param, re-consolidating each param, while dealing with padding if any.

    Args:
        - ``state_dict``: mapping that receives the ``name -> tensor`` entries (updated in place)
        - ``world_size``: number of ranks the parameters were partitioned over
        - ``fp32_flat_groups``: one flat tensor per rank
        - ``zero_model_states``: parsed model states; only element 0's ``param_shapes`` is read

    Raises:
        - ``ValueError``: the number of consumed elements does not match the number available
    """
    have_numel = fp32_flat_groups[0].numel() * world_size

    # merge list of dicts, preserving order
    shapes_by_name = {}
    for group_shapes in zero_model_states[0].param_shapes:
        for key, value in group_shapes.items():
            shapes_by_name[key] = value

    if debug:
        for rank in range(world_size):
            print(f"{FP32_FLAT_GROUPS}[{rank}].shape={fp32_flat_groups[rank].shape}")

    need_params = len(shapes_by_name)
    need_numel = sum(shape.numel() for shape in shapes_by_name.values())
    # not asserting if there is a mismatch due to possible padding
    print(f"Trainable params: Have {have_numel} numels to process.")
    print(f"Trainable params: Need {need_numel} numels in {need_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    cursor = 0
    element_count = 0
    param_count = 0
    for name, shape in shapes_by_name.items():
        numel = shape.numel()
        element_count += numel
        param_count += 1

        per_rank_numel, pad_numel = zero3_partitioned_param_info(numel, world_size)

        if debug:
            print(
                f"Trainable params: {param_count} {name} full shape: {shape} partition0 numel={per_rank_numel} partitioned_padding_numel={pad_numel}"
            )

        # XXX: memory usage doubles here
        rank_slices = tuple(fp32_flat_groups[rank].narrow(0, cursor, per_rank_numel) for rank in range(world_size))
        state_dict[name] = torch.cat(rank_slices, 0).narrow(0, 0, numel).view(shape)
        cursor += per_rank_numel

    # the cursor walked a single rank's tensor; scale it up to compare against the total
    consumed = cursor * world_size

    # Sanity check
    if consumed != have_numel:
        raise ValueError(f"consumed {consumed} numels out of {have_numel} - something is wrong")

    print(f"Reconstructed Trainable fp32 state dict with {param_count} params {element_count} elements")
+
+
def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """
    Assemble the full fp32 ``state_dict`` of a ZeRO stage 3 checkpoint.

    Entries are added in this order: rank 0's buffers, frozen parameters (unless
    ``exclude_frozen_parameters`` is true), trainable parameters, and finally aliases for shared
    parameters whose source entry is present.
    """
    rank0 = zero_model_states[0]
    result = OrderedDict()

    # buffers
    result.update(rank0.buffers)
    if debug:
        print(f"added {len(rank0.buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero3_merge_frozen_params(result, world_size, zero_model_states)

    _zero3_merge_trainable_params(result, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters
    for pair in rank0.shared_params:
        alias, source = pair[0], pair[1]
        if source in result:
            result[alias] = result[source]

    return result
+
+
def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
    """
    Build a single consolidated fp32 ``state_dict`` from a ZeRO 2 or 3 checkpoint. The result can
    be passed to ``load_state_dict()`` and used for training without DeepSpeed, or shared with
    others, for example via a model hub.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint, e.g. ``global_step14``.
          When omitted, the tag is read from the file named ``latest`` inside ``checkpoint_dir``
        - ``exclude_frozen_parameters``: exclude frozen parameters

    Returns:
        - pytorch ``state_dict``

    Raises:
        - ``ValueError``: no ``tag`` was given and ``checkpoint_dir`` has no ``latest`` file
        - ``FileNotFoundError``: the ``checkpoint_dir/tag`` folder doesn't exist

    Note: this approach may not work if your application doesn't have sufficient free CPU memory;
    in that case use the offline approach via the ``zero_to_fp32.py`` script that is saved with
    the checkpoint.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        # do the training and checkpoint saving
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
        model = model.cpu() # move to cpu
        model.load_state_dict(state_dict)
        # submit to model hub or save the model to share with others

    In this example the ``model`` will no longer be usable in the deepspeed context of the same
    application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.

    """
    if tag is None:
        # fall back to the tag recorded in the 'latest' marker file
        latest_path = os.path.join(checkpoint_dir, 'latest')
        if not os.path.isfile(latest_path):
            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
        with open(latest_path, 'r') as fd:
            tag = fd.read().strip()

    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
    if os.path.isdir(ds_checkpoint_dir):
        return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)

    raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+
def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
    """
    Consolidate a ZeRO 2 or 3 checkpoint into one fp32 ``state_dict`` and write it to ``output_file``.
    The saved file can be loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for
    training without DeepSpeed.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
        - ``exclude_frozen_parameters``: exclude frozen parameters
    """
    fp32_weights = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
                                                            tag=tag,
                                                            exclude_frozen_parameters=exclude_frozen_parameters)
    print(f"Saving fp32 state dict to {output_file}")
    torch.save(fp32_weights, output_file)
+
+
def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
    """
    1. Put the provided model to cpu
    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
    3. Load it into the provided model

    Args:
        - ``model``: the model object to update
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``

    Returns:
        - ``model``: modified model

    Make sure you have plenty of CPU memory available before you call this function. If you don't
    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
    conveniently placed for you in the checkpoint folder.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
        # submit to model hub or save the model to share with others

    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    """
    logger.info("Extracting fp32 weights")
    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    logger.info("Overwriting model with fp32 weights")
    model = model.cpu()
    # strict=False: keys that are missing from or unexpected in the state_dict do not raise
    model.load_state_dict(state_dict, strict=False)

    return model
+
+
if __name__ == "__main__":
    # Command-line entry point: checkpoint_dir and output_file are positional, the rest optional.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "checkpoint_dir",
        type=str,
        help="path to the desired checkpoint folder, e.g., path/checkpoint-12",
    )
    parser.add_argument(
        "output_file",
        type=str,
        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)",
    )
    parser.add_argument(
        "-t",
        "--tag",
        type=str,
        default=None,
        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1",
    )
    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
    args = parser.parse_args()

    # rebinds the module-level switch that the helper functions consult
    debug = args.debug

    convert_zero_checkpoint_to_fp32_state_dict(
        args.checkpoint_dir,
        args.output_file,
        tag=args.tag,
        exclude_frozen_parameters=args.exclude_frozen_parameters,
    )
diff --git a/checkpoint-320/README.md b/checkpoint-320/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..0838e19c978772997ea62c604482bce6c1a8c237
--- /dev/null
+++ b/checkpoint-320/README.md
@@ -0,0 +1,202 @@
+---
+base_model: liuhaotian/llava-v1.6-vicuna-13b
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/checkpoint-320/adapter_config.json b/checkpoint-320/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0d9e6d1f521fdefcdd69001e3f15155910cbbf70
--- /dev/null
+++ b/checkpoint-320/adapter_config.json
@@ -0,0 +1,34 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "liuhaotian/llava-v1.6-vicuna-13b",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 8,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "o_proj",
+ "q_proj",
+ "k_proj",
+ "gate_proj",
+ "up_proj",
+ "down_proj",
+ "v_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-320/adapter_model.safetensors b/checkpoint-320/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..484a556d418162e49a42bf701fbc5f757a28bb98
--- /dev/null
+++ b/checkpoint-320/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6431e6a5485bf9971a8a505d2ce8ac8f1350546005403146f8f3bab2c6c30f02
+size 65046168
diff --git a/checkpoint-320/global_step320/zero_pp_rank_0_mp_rank_00_model_states.pt b/checkpoint-320/global_step320/zero_pp_rank_0_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..75c956d3e74c7830dd9731bc3b88e0154a1db0c4
--- /dev/null
+++ b/checkpoint-320/global_step320/zero_pp_rank_0_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:800218503888cfc6c6dacef7580b008ce5c28bc77e6d725edc53eb163eb213f4
+size 775138
diff --git a/checkpoint-320/global_step320/zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-320/global_step320/zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..e82e70645c9a178744a8dc23e8cc5fbce7ba8ccc
--- /dev/null
+++ b/checkpoint-320/global_step320/zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02fd758d84cad0221cd366926fce830c888c4a25795ffa592310e55569f8be57
+size 191825901
diff --git a/checkpoint-320/global_step320/zero_pp_rank_1_mp_rank_00_model_states.pt b/checkpoint-320/global_step320/zero_pp_rank_1_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..0b4567d0cbd31a38caf5de7073a062d6abcf6d32
--- /dev/null
+++ b/checkpoint-320/global_step320/zero_pp_rank_1_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5b02e6ef69e3a0ba9ce109c85ba6b611ee8a5dcaa7acfb914f2610b48ae72d5b
+size 775138
diff --git a/checkpoint-320/global_step320/zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-320/global_step320/zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..d3730bd9d75aefc097b1e5a3abeb2413da1dc9cb
--- /dev/null
+++ b/checkpoint-320/global_step320/zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c6b3cb3c0b9269d0f01562ac97cf7ed7cfd9c803885af1dde56b9a6eda5fd47d
+size 191825901
diff --git a/checkpoint-320/global_step320/zero_pp_rank_2_mp_rank_00_model_states.pt b/checkpoint-320/global_step320/zero_pp_rank_2_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..81c78494a8837ced1c0b0002e749676e251cbdec
--- /dev/null
+++ b/checkpoint-320/global_step320/zero_pp_rank_2_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:86079448784de87bb5a32ee61f4e61ec37c91808e8572ee637f70dab7710618a
+size 775138
diff --git a/checkpoint-320/global_step320/zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-320/global_step320/zero_pp_rank_2_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..783b56308c1f4124e732268a794a4da8a08d2018
--- /dev/null
+++ b/checkpoint-320/global_step320/zero_pp_rank_2_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6cde72030703e6207c5148a37bfc458cf75f4464644ba5922dd636d1ed703699
+size 191825901
diff --git a/checkpoint-320/global_step320/zero_pp_rank_3_mp_rank_00_model_states.pt b/checkpoint-320/global_step320/zero_pp_rank_3_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..de420fe7ee111721efd9fc1753514c891c64b756
--- /dev/null
+++ b/checkpoint-320/global_step320/zero_pp_rank_3_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:975b4b22694bd9e2244d3d9338e3df16410609bec48870a280c085c9d0ea6385
+size 775138
diff --git a/checkpoint-320/global_step320/zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-320/global_step320/zero_pp_rank_3_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..85726d1d970e58bdf098e8b232c650a427dbaae0
--- /dev/null
+++ b/checkpoint-320/global_step320/zero_pp_rank_3_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2e7e632b974b0cd0cffd82ac220632d1d22d44d1da09616b219d5f96b07e535c
+size 191825901
diff --git a/checkpoint-320/latest b/checkpoint-320/latest
new file mode 100644
index 0000000000000000000000000000000000000000..9d535587efdab3121736d8095481e4143f000213
--- /dev/null
+++ b/checkpoint-320/latest
@@ -0,0 +1 @@
+global_step320
\ No newline at end of file
diff --git a/checkpoint-320/rng_state_0.pth b/checkpoint-320/rng_state_0.pth
new file mode 100644
index 0000000000000000000000000000000000000000..a0b9a527d58e9f544523cda61b77080b3d03d824
--- /dev/null
+++ b/checkpoint-320/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:03af8976c83af29b26ac3bdd42a804bb9e4d7c51eff643b3ad188c88c846c088
+size 14960
diff --git a/checkpoint-320/rng_state_1.pth b/checkpoint-320/rng_state_1.pth
new file mode 100644
index 0000000000000000000000000000000000000000..3182bd59e6fba7b8b28fe95246df8e1b8a1c9ee2
--- /dev/null
+++ b/checkpoint-320/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:65a0552a908836fab6a8e6f840ac7d8de6dafa58227414f46353830c2cac6eae
+size 14960
diff --git a/checkpoint-320/rng_state_2.pth b/checkpoint-320/rng_state_2.pth
new file mode 100644
index 0000000000000000000000000000000000000000..45b0262f434b63f9d834d4e880b61b53f2dadb0f
--- /dev/null
+++ b/checkpoint-320/rng_state_2.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:05eeeac05a2df77ec2d80d022a5d4c0d3a738fa5f3f0c7f6560893b766f6a722
+size 14960
diff --git a/checkpoint-320/rng_state_3.pth b/checkpoint-320/rng_state_3.pth
new file mode 100644
index 0000000000000000000000000000000000000000..f3ec2550fa5f351a1500bfa21341cdc5fdb3005a
--- /dev/null
+++ b/checkpoint-320/rng_state_3.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:70a680c028586c6979517de8d634e39a4908c3611cac7b62d70f1bcb83f6c594
+size 14960
diff --git a/checkpoint-320/special_tokens_map.json b/checkpoint-320/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..14761dcf1466dc232bd41de9c21d4c617b15755e
--- /dev/null
+++ b/checkpoint-320/special_tokens_map.json
@@ -0,0 +1,24 @@
+{
+ "bos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "",
+ "unk_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-320/tokenizer.model b/checkpoint-320/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899
--- /dev/null
+++ b/checkpoint-320/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
diff --git a/checkpoint-320/tokenizer_config.json b/checkpoint-320/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..26c65df1bf794f101c1dd54c908180dc0d880fe3
--- /dev/null
+++ b/checkpoint-320/tokenizer_config.json
@@ -0,0 +1,43 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "",
+ "legacy": false,
+ "model_max_length": 2048,
+ "pad_token": "",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "",
+ "use_default_system_prompt": false
+}
diff --git a/checkpoint-320/trainer_state.json b/checkpoint-320/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..b98de20abbfc7e1ce6ed2b7e2c979d4d87692b59
--- /dev/null
+++ b/checkpoint-320/trainer_state.json
@@ -0,0 +1,4833 @@
+{
+ "best_metric": 0.6575854420661926,
+ "best_model_checkpoint": "./checkpoints/llava-v1.6-vicuna-13b_anyres/checkpoint-256",
+ "epoch": 10.0,
+ "eval_steps": 1.0,
+ "global_step": 320,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.03125,
+ "grad_norm": 0.5230235555406132,
+ "learning_rate": 0.0,
+ "loss": 1.5809,
+ "step": 1
+ },
+ {
+ "epoch": 0.03125,
+ "eval_loss": 1.6275018453598022,
+ "eval_runtime": 82.059,
+ "eval_samples_per_second": 2.437,
+ "eval_steps_per_second": 0.305,
+ "step": 1
+ },
+ {
+ "epoch": 0.0625,
+ "grad_norm": 0.5095402010892089,
+ "learning_rate": 2e-05,
+ "loss": 1.4958,
+ "step": 2
+ },
+ {
+ "epoch": 0.0625,
+ "eval_loss": 1.6275018453598022,
+ "eval_runtime": 76.5747,
+ "eval_samples_per_second": 2.612,
+ "eval_steps_per_second": 0.326,
+ "step": 2
+ },
+ {
+ "epoch": 0.09375,
+ "grad_norm": 0.4998514282504938,
+ "learning_rate": 2e-05,
+ "loss": 1.5552,
+ "step": 3
+ },
+ {
+ "epoch": 0.09375,
+ "eval_loss": 1.5956931114196777,
+ "eval_runtime": 76.1563,
+ "eval_samples_per_second": 2.626,
+ "eval_steps_per_second": 0.328,
+ "step": 3
+ },
+ {
+ "epoch": 0.125,
+ "grad_norm": 0.4280580315108126,
+ "learning_rate": 2e-05,
+ "loss": 1.4846,
+ "step": 4
+ },
+ {
+ "epoch": 0.125,
+ "eval_loss": 1.5584176778793335,
+ "eval_runtime": 76.1235,
+ "eval_samples_per_second": 2.627,
+ "eval_steps_per_second": 0.328,
+ "step": 4
+ },
+ {
+ "epoch": 0.15625,
+ "grad_norm": 0.5678499435986384,
+ "learning_rate": 2e-05,
+ "loss": 1.5036,
+ "step": 5
+ },
+ {
+ "epoch": 0.15625,
+ "eval_loss": 1.5207562446594238,
+ "eval_runtime": 76.1514,
+ "eval_samples_per_second": 2.626,
+ "eval_steps_per_second": 0.328,
+ "step": 5
+ },
+ {
+ "epoch": 0.1875,
+ "grad_norm": 0.5368461657542534,
+ "learning_rate": 2e-05,
+ "loss": 1.476,
+ "step": 6
+ },
+ {
+ "epoch": 0.1875,
+ "eval_loss": 1.4807783365249634,
+ "eval_runtime": 77.3444,
+ "eval_samples_per_second": 2.586,
+ "eval_steps_per_second": 0.323,
+ "step": 6
+ },
+ {
+ "epoch": 0.21875,
+ "grad_norm": 0.5549950083087136,
+ "learning_rate": 2e-05,
+ "loss": 1.4358,
+ "step": 7
+ },
+ {
+ "epoch": 0.21875,
+ "eval_loss": 1.4411544799804688,
+ "eval_runtime": 77.066,
+ "eval_samples_per_second": 2.595,
+ "eval_steps_per_second": 0.324,
+ "step": 7
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 0.5549950083087136,
+ "learning_rate": 2e-05,
+ "loss": 1.4369,
+ "step": 8
+ },
+ {
+ "epoch": 0.25,
+ "eval_loss": 1.4411544799804688,
+ "eval_runtime": 77.2807,
+ "eval_samples_per_second": 2.588,
+ "eval_steps_per_second": 0.323,
+ "step": 8
+ },
+ {
+ "epoch": 0.28125,
+ "grad_norm": 0.5292240951443854,
+ "learning_rate": 2e-05,
+ "loss": 1.4471,
+ "step": 9
+ },
+ {
+ "epoch": 0.28125,
+ "eval_loss": 1.4036556482315063,
+ "eval_runtime": 78.1562,
+ "eval_samples_per_second": 2.559,
+ "eval_steps_per_second": 0.32,
+ "step": 9
+ },
+ {
+ "epoch": 0.3125,
+ "grad_norm": 0.5292240951443854,
+ "learning_rate": 2e-05,
+ "loss": 1.3666,
+ "step": 10
+ },
+ {
+ "epoch": 0.3125,
+ "eval_loss": 1.4036556482315063,
+ "eval_runtime": 77.1645,
+ "eval_samples_per_second": 2.592,
+ "eval_steps_per_second": 0.324,
+ "step": 10
+ },
+ {
+ "epoch": 0.34375,
+ "grad_norm": 0.5292240951443854,
+ "learning_rate": 2e-05,
+ "loss": 1.4149,
+ "step": 11
+ },
+ {
+ "epoch": 0.34375,
+ "eval_loss": 1.4036556482315063,
+ "eval_runtime": 78.7627,
+ "eval_samples_per_second": 2.539,
+ "eval_steps_per_second": 0.317,
+ "step": 11
+ },
+ {
+ "epoch": 0.375,
+ "grad_norm": 0.684588966714067,
+ "learning_rate": 2e-05,
+ "loss": 1.3883,
+ "step": 12
+ },
+ {
+ "epoch": 0.375,
+ "eval_loss": 1.3679308891296387,
+ "eval_runtime": 78.4315,
+ "eval_samples_per_second": 2.55,
+ "eval_steps_per_second": 0.319,
+ "step": 12
+ },
+ {
+ "epoch": 0.40625,
+ "grad_norm": 0.6261826769491422,
+ "learning_rate": 2e-05,
+ "loss": 1.4271,
+ "step": 13
+ },
+ {
+ "epoch": 0.40625,
+ "eval_loss": 1.3369851112365723,
+ "eval_runtime": 78.685,
+ "eval_samples_per_second": 2.542,
+ "eval_steps_per_second": 0.318,
+ "step": 13
+ },
+ {
+ "epoch": 0.4375,
+ "grad_norm": 0.6261826769491422,
+ "learning_rate": 2e-05,
+ "loss": 1.2495,
+ "step": 14
+ },
+ {
+ "epoch": 0.4375,
+ "eval_loss": 1.3369851112365723,
+ "eval_runtime": 78.0511,
+ "eval_samples_per_second": 2.562,
+ "eval_steps_per_second": 0.32,
+ "step": 14
+ },
+ {
+ "epoch": 0.46875,
+ "grad_norm": 0.6028103951693778,
+ "learning_rate": 2e-05,
+ "loss": 1.3513,
+ "step": 15
+ },
+ {
+ "epoch": 0.46875,
+ "eval_loss": 1.3032653331756592,
+ "eval_runtime": 78.0271,
+ "eval_samples_per_second": 2.563,
+ "eval_steps_per_second": 0.32,
+ "step": 15
+ },
+ {
+ "epoch": 0.5,
+ "grad_norm": 0.769290402283396,
+ "learning_rate": 2e-05,
+ "loss": 1.3117,
+ "step": 16
+ },
+ {
+ "epoch": 0.5,
+ "eval_loss": 1.2661188840866089,
+ "eval_runtime": 78.1857,
+ "eval_samples_per_second": 2.558,
+ "eval_steps_per_second": 0.32,
+ "step": 16
+ },
+ {
+ "epoch": 0.53125,
+ "grad_norm": 1.3279338025863765,
+ "learning_rate": 2e-05,
+ "loss": 1.2768,
+ "step": 17
+ },
+ {
+ "epoch": 0.53125,
+ "eval_loss": 1.2299447059631348,
+ "eval_runtime": 78.2064,
+ "eval_samples_per_second": 2.557,
+ "eval_steps_per_second": 0.32,
+ "step": 17
+ },
+ {
+ "epoch": 0.5625,
+ "grad_norm": 0.7410327159336384,
+ "learning_rate": 2e-05,
+ "loss": 1.256,
+ "step": 18
+ },
+ {
+ "epoch": 0.5625,
+ "eval_loss": 1.2044258117675781,
+ "eval_runtime": 78.072,
+ "eval_samples_per_second": 2.562,
+ "eval_steps_per_second": 0.32,
+ "step": 18
+ },
+ {
+ "epoch": 0.59375,
+ "grad_norm": 0.44078820770408506,
+ "learning_rate": 2e-05,
+ "loss": 1.1252,
+ "step": 19
+ },
+ {
+ "epoch": 0.59375,
+ "eval_loss": 1.1826122999191284,
+ "eval_runtime": 78.7312,
+ "eval_samples_per_second": 2.54,
+ "eval_steps_per_second": 0.318,
+ "step": 19
+ },
+ {
+ "epoch": 0.625,
+ "grad_norm": 0.49020841613371097,
+ "learning_rate": 2e-05,
+ "loss": 1.2249,
+ "step": 20
+ },
+ {
+ "epoch": 0.625,
+ "eval_loss": 1.1616511344909668,
+ "eval_runtime": 78.2736,
+ "eval_samples_per_second": 2.555,
+ "eval_steps_per_second": 0.319,
+ "step": 20
+ },
+ {
+ "epoch": 0.65625,
+ "grad_norm": 0.43031322695269714,
+ "learning_rate": 2e-05,
+ "loss": 1.1466,
+ "step": 21
+ },
+ {
+ "epoch": 0.65625,
+ "eval_loss": 1.1410629749298096,
+ "eval_runtime": 79.6432,
+ "eval_samples_per_second": 2.511,
+ "eval_steps_per_second": 0.314,
+ "step": 21
+ },
+ {
+ "epoch": 0.6875,
+ "grad_norm": 0.45632085445955545,
+ "learning_rate": 2e-05,
+ "loss": 1.1951,
+ "step": 22
+ },
+ {
+ "epoch": 0.6875,
+ "eval_loss": 1.1204684972763062,
+ "eval_runtime": 79.0609,
+ "eval_samples_per_second": 2.53,
+ "eval_steps_per_second": 0.316,
+ "step": 22
+ },
+ {
+ "epoch": 0.71875,
+ "grad_norm": 0.40048586945364495,
+ "learning_rate": 2e-05,
+ "loss": 1.1826,
+ "step": 23
+ },
+ {
+ "epoch": 0.71875,
+ "eval_loss": 1.1002545356750488,
+ "eval_runtime": 82.8578,
+ "eval_samples_per_second": 2.414,
+ "eval_steps_per_second": 0.302,
+ "step": 23
+ },
+ {
+ "epoch": 0.75,
+ "grad_norm": 0.3703033261027938,
+ "learning_rate": 2e-05,
+ "loss": 1.1543,
+ "step": 24
+ },
+ {
+ "epoch": 0.75,
+ "eval_loss": 1.0805977582931519,
+ "eval_runtime": 76.1407,
+ "eval_samples_per_second": 2.627,
+ "eval_steps_per_second": 0.328,
+ "step": 24
+ },
+ {
+ "epoch": 0.78125,
+ "grad_norm": 0.3986313105418924,
+ "learning_rate": 2e-05,
+ "loss": 1.1046,
+ "step": 25
+ },
+ {
+ "epoch": 0.78125,
+ "eval_loss": 1.0610157251358032,
+ "eval_runtime": 76.3083,
+ "eval_samples_per_second": 2.621,
+ "eval_steps_per_second": 0.328,
+ "step": 25
+ },
+ {
+ "epoch": 0.8125,
+ "grad_norm": 0.36265027203577943,
+ "learning_rate": 2e-05,
+ "loss": 1.1048,
+ "step": 26
+ },
+ {
+ "epoch": 0.8125,
+ "eval_loss": 1.0421289205551147,
+ "eval_runtime": 77.2186,
+ "eval_samples_per_second": 2.59,
+ "eval_steps_per_second": 0.324,
+ "step": 26
+ },
+ {
+ "epoch": 0.84375,
+ "grad_norm": 0.3881748990218768,
+ "learning_rate": 2e-05,
+ "loss": 1.0425,
+ "step": 27
+ },
+ {
+ "epoch": 0.84375,
+ "eval_loss": 1.0240073204040527,
+ "eval_runtime": 77.8662,
+ "eval_samples_per_second": 2.569,
+ "eval_steps_per_second": 0.321,
+ "step": 27
+ },
+ {
+ "epoch": 0.875,
+ "grad_norm": 0.3734031294324286,
+ "learning_rate": 2e-05,
+ "loss": 1.0484,
+ "step": 28
+ },
+ {
+ "epoch": 0.875,
+ "eval_loss": 1.0066957473754883,
+ "eval_runtime": 77.269,
+ "eval_samples_per_second": 2.588,
+ "eval_steps_per_second": 0.324,
+ "step": 28
+ },
+ {
+ "epoch": 0.90625,
+ "grad_norm": 0.29695383079342563,
+ "learning_rate": 2e-05,
+ "loss": 1.0387,
+ "step": 29
+ },
+ {
+ "epoch": 0.90625,
+ "eval_loss": 0.9906074404716492,
+ "eval_runtime": 77.2245,
+ "eval_samples_per_second": 2.59,
+ "eval_steps_per_second": 0.324,
+ "step": 29
+ },
+ {
+ "epoch": 0.9375,
+ "grad_norm": 0.29273146875026623,
+ "learning_rate": 2e-05,
+ "loss": 1.0568,
+ "step": 30
+ },
+ {
+ "epoch": 0.9375,
+ "eval_loss": 0.975755512714386,
+ "eval_runtime": 78.0056,
+ "eval_samples_per_second": 2.564,
+ "eval_steps_per_second": 0.32,
+ "step": 30
+ },
+ {
+ "epoch": 0.96875,
+ "grad_norm": 0.35070440686850546,
+ "learning_rate": 2e-05,
+ "loss": 0.9114,
+ "step": 31
+ },
+ {
+ "epoch": 0.96875,
+ "eval_loss": 0.9615123271942139,
+ "eval_runtime": 77.9051,
+ "eval_samples_per_second": 2.567,
+ "eval_steps_per_second": 0.321,
+ "step": 31
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 0.30846157140439384,
+ "learning_rate": 2e-05,
+ "loss": 0.9941,
+ "step": 32
+ },
+ {
+ "epoch": 1.0,
+ "eval_loss": 0.9480571150779724,
+ "eval_runtime": 77.2322,
+ "eval_samples_per_second": 2.59,
+ "eval_steps_per_second": 0.324,
+ "step": 32
+ },
+ {
+ "epoch": 1.03125,
+ "grad_norm": 0.2950381371932973,
+ "learning_rate": 2e-05,
+ "loss": 1.0297,
+ "step": 33
+ },
+ {
+ "epoch": 1.03125,
+ "eval_loss": 0.9356330037117004,
+ "eval_runtime": 81.8443,
+ "eval_samples_per_second": 2.444,
+ "eval_steps_per_second": 0.305,
+ "step": 33
+ },
+ {
+ "epoch": 1.0625,
+ "grad_norm": 0.27080038065834283,
+ "learning_rate": 2e-05,
+ "loss": 1.021,
+ "step": 34
+ },
+ {
+ "epoch": 1.0625,
+ "eval_loss": 0.9245791435241699,
+ "eval_runtime": 76.2071,
+ "eval_samples_per_second": 2.624,
+ "eval_steps_per_second": 0.328,
+ "step": 34
+ },
+ {
+ "epoch": 1.09375,
+ "grad_norm": 0.23165081252649894,
+ "learning_rate": 2e-05,
+ "loss": 1.0366,
+ "step": 35
+ },
+ {
+ "epoch": 1.09375,
+ "eval_loss": 0.9151126146316528,
+ "eval_runtime": 77.0412,
+ "eval_samples_per_second": 2.596,
+ "eval_steps_per_second": 0.325,
+ "step": 35
+ },
+ {
+ "epoch": 1.125,
+ "grad_norm": 0.4033780922500775,
+ "learning_rate": 2e-05,
+ "loss": 1.0127,
+ "step": 36
+ },
+ {
+ "epoch": 1.125,
+ "eval_loss": 0.9063960313796997,
+ "eval_runtime": 76.9327,
+ "eval_samples_per_second": 2.6,
+ "eval_steps_per_second": 0.325,
+ "step": 36
+ },
+ {
+ "epoch": 1.15625,
+ "grad_norm": 0.2398039831439168,
+ "learning_rate": 2e-05,
+ "loss": 0.9418,
+ "step": 37
+ },
+ {
+ "epoch": 1.15625,
+ "eval_loss": 0.8982363939285278,
+ "eval_runtime": 76.1234,
+ "eval_samples_per_second": 2.627,
+ "eval_steps_per_second": 0.328,
+ "step": 37
+ },
+ {
+ "epoch": 1.1875,
+ "grad_norm": 0.28793451241246804,
+ "learning_rate": 2e-05,
+ "loss": 0.9643,
+ "step": 38
+ },
+ {
+ "epoch": 1.1875,
+ "eval_loss": 0.8908895254135132,
+ "eval_runtime": 76.2877,
+ "eval_samples_per_second": 2.622,
+ "eval_steps_per_second": 0.328,
+ "step": 38
+ },
+ {
+ "epoch": 1.21875,
+ "grad_norm": 0.2927691606307197,
+ "learning_rate": 2e-05,
+ "loss": 1.0087,
+ "step": 39
+ },
+ {
+ "epoch": 1.21875,
+ "eval_loss": 0.8845618367195129,
+ "eval_runtime": 76.2282,
+ "eval_samples_per_second": 2.624,
+ "eval_steps_per_second": 0.328,
+ "step": 39
+ },
+ {
+ "epoch": 1.25,
+ "grad_norm": 0.26410982001408806,
+ "learning_rate": 2e-05,
+ "loss": 0.986,
+ "step": 40
+ },
+ {
+ "epoch": 1.25,
+ "eval_loss": 0.8784474730491638,
+ "eval_runtime": 76.2512,
+ "eval_samples_per_second": 2.623,
+ "eval_steps_per_second": 0.328,
+ "step": 40
+ },
+ {
+ "epoch": 1.28125,
+ "grad_norm": 0.29182630949665306,
+ "learning_rate": 2e-05,
+ "loss": 0.9711,
+ "step": 41
+ },
+ {
+ "epoch": 1.28125,
+ "eval_loss": 0.8725223541259766,
+ "eval_runtime": 77.1229,
+ "eval_samples_per_second": 2.593,
+ "eval_steps_per_second": 0.324,
+ "step": 41
+ },
+ {
+ "epoch": 1.3125,
+ "grad_norm": 0.36402838796832665,
+ "learning_rate": 2e-05,
+ "loss": 0.9263,
+ "step": 42
+ },
+ {
+ "epoch": 1.3125,
+ "eval_loss": 0.8662790060043335,
+ "eval_runtime": 77.2362,
+ "eval_samples_per_second": 2.589,
+ "eval_steps_per_second": 0.324,
+ "step": 42
+ },
+ {
+ "epoch": 1.34375,
+ "grad_norm": 0.29338184478895163,
+ "learning_rate": 2e-05,
+ "loss": 0.8947,
+ "step": 43
+ },
+ {
+ "epoch": 1.34375,
+ "eval_loss": 0.8600431680679321,
+ "eval_runtime": 77.1213,
+ "eval_samples_per_second": 2.593,
+ "eval_steps_per_second": 0.324,
+ "step": 43
+ },
+ {
+ "epoch": 1.375,
+ "grad_norm": 0.2201714229702277,
+ "learning_rate": 2e-05,
+ "loss": 0.9059,
+ "step": 44
+ },
+ {
+ "epoch": 1.375,
+ "eval_loss": 0.8545799255371094,
+ "eval_runtime": 77.991,
+ "eval_samples_per_second": 2.564,
+ "eval_steps_per_second": 0.321,
+ "step": 44
+ },
+ {
+ "epoch": 1.40625,
+ "grad_norm": 0.2254966625243654,
+ "learning_rate": 2e-05,
+ "loss": 0.8942,
+ "step": 45
+ },
+ {
+ "epoch": 1.40625,
+ "eval_loss": 0.8497399687767029,
+ "eval_runtime": 77.2698,
+ "eval_samples_per_second": 2.588,
+ "eval_steps_per_second": 0.324,
+ "step": 45
+ },
+ {
+ "epoch": 1.4375,
+ "grad_norm": 0.21753318432075458,
+ "learning_rate": 2e-05,
+ "loss": 0.9376,
+ "step": 46
+ },
+ {
+ "epoch": 1.4375,
+ "eval_loss": 0.8452473282814026,
+ "eval_runtime": 77.0568,
+ "eval_samples_per_second": 2.595,
+ "eval_steps_per_second": 0.324,
+ "step": 46
+ },
+ {
+ "epoch": 1.46875,
+ "grad_norm": 0.21449718265972945,
+ "learning_rate": 2e-05,
+ "loss": 0.9369,
+ "step": 47
+ },
+ {
+ "epoch": 1.46875,
+ "eval_loss": 0.841134786605835,
+ "eval_runtime": 77.225,
+ "eval_samples_per_second": 2.59,
+ "eval_steps_per_second": 0.324,
+ "step": 47
+ },
+ {
+ "epoch": 1.5,
+ "grad_norm": 0.2109063266748924,
+ "learning_rate": 2e-05,
+ "loss": 0.8511,
+ "step": 48
+ },
+ {
+ "epoch": 1.5,
+ "eval_loss": 0.8373770117759705,
+ "eval_runtime": 76.2309,
+ "eval_samples_per_second": 2.624,
+ "eval_steps_per_second": 0.328,
+ "step": 48
+ },
+ {
+ "epoch": 1.53125,
+ "grad_norm": 0.232838633689838,
+ "learning_rate": 2e-05,
+ "loss": 0.8694,
+ "step": 49
+ },
+ {
+ "epoch": 1.53125,
+ "eval_loss": 0.8338289856910706,
+ "eval_runtime": 76.277,
+ "eval_samples_per_second": 2.622,
+ "eval_steps_per_second": 0.328,
+ "step": 49
+ },
+ {
+ "epoch": 1.5625,
+ "grad_norm": 0.4189704940803984,
+ "learning_rate": 2e-05,
+ "loss": 0.8464,
+ "step": 50
+ },
+ {
+ "epoch": 1.5625,
+ "eval_loss": 0.8297132849693298,
+ "eval_runtime": 76.2872,
+ "eval_samples_per_second": 2.622,
+ "eval_steps_per_second": 0.328,
+ "step": 50
+ },
+ {
+ "epoch": 1.59375,
+ "grad_norm": 0.2171618165123276,
+ "learning_rate": 2e-05,
+ "loss": 0.8785,
+ "step": 51
+ },
+ {
+ "epoch": 1.59375,
+ "eval_loss": 0.8257431983947754,
+ "eval_runtime": 76.2639,
+ "eval_samples_per_second": 2.622,
+ "eval_steps_per_second": 0.328,
+ "step": 51
+ },
+ {
+ "epoch": 1.625,
+ "grad_norm": 0.21934651037670305,
+ "learning_rate": 2e-05,
+ "loss": 0.7645,
+ "step": 52
+ },
+ {
+ "epoch": 1.625,
+ "eval_loss": 0.8223557472229004,
+ "eval_runtime": 76.2383,
+ "eval_samples_per_second": 2.623,
+ "eval_steps_per_second": 0.328,
+ "step": 52
+ },
+ {
+ "epoch": 1.65625,
+ "grad_norm": 0.24183530733164746,
+ "learning_rate": 2e-05,
+ "loss": 0.9218,
+ "step": 53
+ },
+ {
+ "epoch": 1.65625,
+ "eval_loss": 0.8189653158187866,
+ "eval_runtime": 76.9819,
+ "eval_samples_per_second": 2.598,
+ "eval_steps_per_second": 0.325,
+ "step": 53
+ },
+ {
+ "epoch": 1.6875,
+ "grad_norm": 0.23450930244279267,
+ "learning_rate": 2e-05,
+ "loss": 0.8896,
+ "step": 54
+ },
+ {
+ "epoch": 1.6875,
+ "eval_loss": 0.8152530193328857,
+ "eval_runtime": 76.2378,
+ "eval_samples_per_second": 2.623,
+ "eval_steps_per_second": 0.328,
+ "step": 54
+ },
+ {
+ "epoch": 1.71875,
+ "grad_norm": 0.22081665899796085,
+ "learning_rate": 2e-05,
+ "loss": 0.8798,
+ "step": 55
+ },
+ {
+ "epoch": 1.71875,
+ "eval_loss": 0.8122122287750244,
+ "eval_runtime": 76.289,
+ "eval_samples_per_second": 2.622,
+ "eval_steps_per_second": 0.328,
+ "step": 55
+ },
+ {
+ "epoch": 1.75,
+ "grad_norm": 0.21311746114111046,
+ "learning_rate": 2e-05,
+ "loss": 0.9482,
+ "step": 56
+ },
+ {
+ "epoch": 1.75,
+ "eval_loss": 0.8092318773269653,
+ "eval_runtime": 77.8321,
+ "eval_samples_per_second": 2.57,
+ "eval_steps_per_second": 0.321,
+ "step": 56
+ },
+ {
+ "epoch": 1.78125,
+ "grad_norm": 0.2496565307107556,
+ "learning_rate": 2e-05,
+ "loss": 0.8917,
+ "step": 57
+ },
+ {
+ "epoch": 1.78125,
+ "eval_loss": 0.8070546984672546,
+ "eval_runtime": 77.2651,
+ "eval_samples_per_second": 2.588,
+ "eval_steps_per_second": 0.324,
+ "step": 57
+ },
+ {
+ "epoch": 1.8125,
+ "grad_norm": 0.2137866456424736,
+ "learning_rate": 2e-05,
+ "loss": 0.909,
+ "step": 58
+ },
+ {
+ "epoch": 1.8125,
+ "eval_loss": 0.8049566745758057,
+ "eval_runtime": 78.0925,
+ "eval_samples_per_second": 2.561,
+ "eval_steps_per_second": 0.32,
+ "step": 58
+ },
+ {
+ "epoch": 1.84375,
+ "grad_norm": 0.22567502859345095,
+ "learning_rate": 2e-05,
+ "loss": 0.8611,
+ "step": 59
+ },
+ {
+ "epoch": 1.84375,
+ "eval_loss": 0.8028810024261475,
+ "eval_runtime": 78.0553,
+ "eval_samples_per_second": 2.562,
+ "eval_steps_per_second": 0.32,
+ "step": 59
+ },
+ {
+ "epoch": 1.875,
+ "grad_norm": 0.23303796552302508,
+ "learning_rate": 2e-05,
+ "loss": 0.9209,
+ "step": 60
+ },
+ {
+ "epoch": 1.875,
+ "eval_loss": 0.800568699836731,
+ "eval_runtime": 78.052,
+ "eval_samples_per_second": 2.562,
+ "eval_steps_per_second": 0.32,
+ "step": 60
+ },
+ {
+ "epoch": 1.90625,
+ "grad_norm": 0.24566727726974544,
+ "learning_rate": 2e-05,
+ "loss": 0.8239,
+ "step": 61
+ },
+ {
+ "epoch": 1.90625,
+ "eval_loss": 0.7976545691490173,
+ "eval_runtime": 77.3056,
+ "eval_samples_per_second": 2.587,
+ "eval_steps_per_second": 0.323,
+ "step": 61
+ },
+ {
+ "epoch": 1.9375,
+ "grad_norm": 0.23014192522354907,
+ "learning_rate": 2e-05,
+ "loss": 0.8814,
+ "step": 62
+ },
+ {
+ "epoch": 1.9375,
+ "eval_loss": 0.7945474982261658,
+ "eval_runtime": 77.3398,
+ "eval_samples_per_second": 2.586,
+ "eval_steps_per_second": 0.323,
+ "step": 62
+ },
+ {
+ "epoch": 1.96875,
+ "grad_norm": 0.23042819102671622,
+ "learning_rate": 2e-05,
+ "loss": 0.9064,
+ "step": 63
+ },
+ {
+ "epoch": 1.96875,
+ "eval_loss": 0.7918359637260437,
+ "eval_runtime": 77.4272,
+ "eval_samples_per_second": 2.583,
+ "eval_steps_per_second": 0.323,
+ "step": 63
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 0.23940667173206315,
+ "learning_rate": 2e-05,
+ "loss": 0.8658,
+ "step": 64
+ },
+ {
+ "epoch": 2.0,
+ "eval_loss": 0.7891160845756531,
+ "eval_runtime": 77.3236,
+ "eval_samples_per_second": 2.587,
+ "eval_steps_per_second": 0.323,
+ "step": 64
+ },
+ {
+ "epoch": 2.03125,
+ "grad_norm": 0.22630342930143643,
+ "learning_rate": 2e-05,
+ "loss": 0.8403,
+ "step": 65
+ },
+ {
+ "epoch": 2.03125,
+ "eval_loss": 0.7859742641448975,
+ "eval_runtime": 77.2001,
+ "eval_samples_per_second": 2.591,
+ "eval_steps_per_second": 0.324,
+ "step": 65
+ },
+ {
+ "epoch": 2.0625,
+ "grad_norm": 0.20949240460260976,
+ "learning_rate": 2e-05,
+ "loss": 0.8472,
+ "step": 66
+ },
+ {
+ "epoch": 2.0625,
+ "eval_loss": 0.7834083437919617,
+ "eval_runtime": 78.9646,
+ "eval_samples_per_second": 2.533,
+ "eval_steps_per_second": 0.317,
+ "step": 66
+ },
+ {
+ "epoch": 2.09375,
+ "grad_norm": 0.22714400479820654,
+ "learning_rate": 2e-05,
+ "loss": 0.841,
+ "step": 67
+ },
+ {
+ "epoch": 2.09375,
+ "eval_loss": 0.7805308699607849,
+ "eval_runtime": 78.7552,
+ "eval_samples_per_second": 2.54,
+ "eval_steps_per_second": 0.317,
+ "step": 67
+ },
+ {
+ "epoch": 2.125,
+ "grad_norm": 0.23345123077006047,
+ "learning_rate": 2e-05,
+ "loss": 0.9028,
+ "step": 68
+ },
+ {
+ "epoch": 2.125,
+ "eval_loss": 0.7779514789581299,
+ "eval_runtime": 78.3387,
+ "eval_samples_per_second": 2.553,
+ "eval_steps_per_second": 0.319,
+ "step": 68
+ },
+ {
+ "epoch": 2.15625,
+ "grad_norm": 0.251841542575211,
+ "learning_rate": 2e-05,
+ "loss": 0.8381,
+ "step": 69
+ },
+ {
+ "epoch": 2.15625,
+ "eval_loss": 0.7756664752960205,
+ "eval_runtime": 78.3109,
+ "eval_samples_per_second": 2.554,
+ "eval_steps_per_second": 0.319,
+ "step": 69
+ },
+ {
+ "epoch": 2.1875,
+ "grad_norm": 0.23548386839773608,
+ "learning_rate": 2e-05,
+ "loss": 0.7914,
+ "step": 70
+ },
+ {
+ "epoch": 2.1875,
+ "eval_loss": 0.7733604907989502,
+ "eval_runtime": 78.9712,
+ "eval_samples_per_second": 2.533,
+ "eval_steps_per_second": 0.317,
+ "step": 70
+ },
+ {
+ "epoch": 2.21875,
+ "grad_norm": 0.23262740912668387,
+ "learning_rate": 2e-05,
+ "loss": 0.8778,
+ "step": 71
+ },
+ {
+ "epoch": 2.21875,
+ "eval_loss": 0.771755576133728,
+ "eval_runtime": 78.2633,
+ "eval_samples_per_second": 2.555,
+ "eval_steps_per_second": 0.319,
+ "step": 71
+ },
+ {
+ "epoch": 2.25,
+ "grad_norm": 0.22075289612357513,
+ "learning_rate": 2e-05,
+ "loss": 0.7945,
+ "step": 72
+ },
+ {
+ "epoch": 2.25,
+ "eval_loss": 0.7705450654029846,
+ "eval_runtime": 78.3151,
+ "eval_samples_per_second": 2.554,
+ "eval_steps_per_second": 0.319,
+ "step": 72
+ },
+ {
+ "epoch": 2.28125,
+ "grad_norm": 0.25520381955936466,
+ "learning_rate": 2e-05,
+ "loss": 0.8387,
+ "step": 73
+ },
+ {
+ "epoch": 2.28125,
+ "eval_loss": 0.7695029973983765,
+ "eval_runtime": 78.2901,
+ "eval_samples_per_second": 2.555,
+ "eval_steps_per_second": 0.319,
+ "step": 73
+ },
+ {
+ "epoch": 2.3125,
+ "grad_norm": 0.2047305385827267,
+ "learning_rate": 2e-05,
+ "loss": 0.8404,
+ "step": 74
+ },
+ {
+ "epoch": 2.3125,
+ "eval_loss": 0.7684457302093506,
+ "eval_runtime": 78.3875,
+ "eval_samples_per_second": 2.551,
+ "eval_steps_per_second": 0.319,
+ "step": 74
+ },
+ {
+ "epoch": 2.34375,
+ "grad_norm": 0.2262323045133288,
+ "learning_rate": 2e-05,
+ "loss": 0.8811,
+ "step": 75
+ },
+ {
+ "epoch": 2.34375,
+ "eval_loss": 0.7671162486076355,
+ "eval_runtime": 78.202,
+ "eval_samples_per_second": 2.557,
+ "eval_steps_per_second": 0.32,
+ "step": 75
+ },
+ {
+ "epoch": 2.375,
+ "grad_norm": 0.21885464923925876,
+ "learning_rate": 2e-05,
+ "loss": 0.7942,
+ "step": 76
+ },
+ {
+ "epoch": 2.375,
+ "eval_loss": 0.7658494710922241,
+ "eval_runtime": 78.1746,
+ "eval_samples_per_second": 2.558,
+ "eval_steps_per_second": 0.32,
+ "step": 76
+ },
+ {
+ "epoch": 2.40625,
+ "grad_norm": 0.21717306953626966,
+ "learning_rate": 2e-05,
+ "loss": 0.8497,
+ "step": 77
+ },
+ {
+ "epoch": 2.40625,
+ "eval_loss": 0.7642120122909546,
+ "eval_runtime": 78.2026,
+ "eval_samples_per_second": 2.557,
+ "eval_steps_per_second": 0.32,
+ "step": 77
+ },
+ {
+ "epoch": 2.4375,
+ "grad_norm": 0.2530725583748258,
+ "learning_rate": 2e-05,
+ "loss": 0.8584,
+ "step": 78
+ },
+ {
+ "epoch": 2.4375,
+ "eval_loss": 0.7625510692596436,
+ "eval_runtime": 78.1991,
+ "eval_samples_per_second": 2.558,
+ "eval_steps_per_second": 0.32,
+ "step": 78
+ },
+ {
+ "epoch": 2.46875,
+ "grad_norm": 0.25354787036627263,
+ "learning_rate": 2e-05,
+ "loss": 0.8569,
+ "step": 79
+ },
+ {
+ "epoch": 2.46875,
+ "eval_loss": 0.7616268396377563,
+ "eval_runtime": 78.2915,
+ "eval_samples_per_second": 2.555,
+ "eval_steps_per_second": 0.319,
+ "step": 79
+ },
+ {
+ "epoch": 2.5,
+ "grad_norm": 0.2800865746664007,
+ "learning_rate": 2e-05,
+ "loss": 0.9116,
+ "step": 80
+ },
+ {
+ "epoch": 2.5,
+ "eval_loss": 0.7603214979171753,
+ "eval_runtime": 78.2749,
+ "eval_samples_per_second": 2.555,
+ "eval_steps_per_second": 0.319,
+ "step": 80
+ },
+ {
+ "epoch": 2.53125,
+ "grad_norm": 0.268139688449618,
+ "learning_rate": 2e-05,
+ "loss": 0.8397,
+ "step": 81
+ },
+ {
+ "epoch": 2.53125,
+ "eval_loss": 0.7584869265556335,
+ "eval_runtime": 79.1445,
+ "eval_samples_per_second": 2.527,
+ "eval_steps_per_second": 0.316,
+ "step": 81
+ },
+ {
+ "epoch": 2.5625,
+ "grad_norm": 0.3128648654463789,
+ "learning_rate": 2e-05,
+ "loss": 0.8888,
+ "step": 82
+ },
+ {
+ "epoch": 2.5625,
+ "eval_loss": 0.7566561102867126,
+ "eval_runtime": 79.2089,
+ "eval_samples_per_second": 2.525,
+ "eval_steps_per_second": 0.316,
+ "step": 82
+ },
+ {
+ "epoch": 2.59375,
+ "grad_norm": 0.2502355211215609,
+ "learning_rate": 2e-05,
+ "loss": 0.8346,
+ "step": 83
+ },
+ {
+ "epoch": 2.59375,
+ "eval_loss": 0.7547345161437988,
+ "eval_runtime": 79.2691,
+ "eval_samples_per_second": 2.523,
+ "eval_steps_per_second": 0.315,
+ "step": 83
+ },
+ {
+ "epoch": 2.625,
+ "grad_norm": 0.25281184629018644,
+ "learning_rate": 2e-05,
+ "loss": 0.795,
+ "step": 84
+ },
+ {
+ "epoch": 2.625,
+ "eval_loss": 0.7527951598167419,
+ "eval_runtime": 79.4068,
+ "eval_samples_per_second": 2.519,
+ "eval_steps_per_second": 0.315,
+ "step": 84
+ },
+ {
+ "epoch": 2.65625,
+ "grad_norm": 0.24246729562645003,
+ "learning_rate": 2e-05,
+ "loss": 0.7649,
+ "step": 85
+ },
+ {
+ "epoch": 2.65625,
+ "eval_loss": 0.7509815096855164,
+ "eval_runtime": 79.1612,
+ "eval_samples_per_second": 2.526,
+ "eval_steps_per_second": 0.316,
+ "step": 85
+ },
+ {
+ "epoch": 2.6875,
+ "grad_norm": 0.27005475109453947,
+ "learning_rate": 2e-05,
+ "loss": 0.7964,
+ "step": 86
+ },
+ {
+ "epoch": 2.6875,
+ "eval_loss": 0.7485950589179993,
+ "eval_runtime": 80.0714,
+ "eval_samples_per_second": 2.498,
+ "eval_steps_per_second": 0.312,
+ "step": 86
+ },
+ {
+ "epoch": 2.71875,
+ "grad_norm": 0.2723492355800971,
+ "learning_rate": 2e-05,
+ "loss": 0.8117,
+ "step": 87
+ },
+ {
+ "epoch": 2.71875,
+ "eval_loss": 0.7459420561790466,
+ "eval_runtime": 79.4075,
+ "eval_samples_per_second": 2.519,
+ "eval_steps_per_second": 0.315,
+ "step": 87
+ },
+ {
+ "epoch": 2.75,
+ "grad_norm": 0.2946493898427159,
+ "learning_rate": 2e-05,
+ "loss": 0.8986,
+ "step": 88
+ },
+ {
+ "epoch": 2.75,
+ "eval_loss": 0.7436455488204956,
+ "eval_runtime": 79.3721,
+ "eval_samples_per_second": 2.52,
+ "eval_steps_per_second": 0.315,
+ "step": 88
+ },
+ {
+ "epoch": 2.78125,
+ "grad_norm": 0.26411214734213284,
+ "learning_rate": 2e-05,
+ "loss": 0.8145,
+ "step": 89
+ },
+ {
+ "epoch": 2.78125,
+ "eval_loss": 0.7424752712249756,
+ "eval_runtime": 79.2988,
+ "eval_samples_per_second": 2.522,
+ "eval_steps_per_second": 0.315,
+ "step": 89
+ },
+ {
+ "epoch": 2.8125,
+ "grad_norm": 0.27115747269014817,
+ "learning_rate": 2e-05,
+ "loss": 0.8457,
+ "step": 90
+ },
+ {
+ "epoch": 2.8125,
+ "eval_loss": 0.7416408658027649,
+ "eval_runtime": 79.4004,
+ "eval_samples_per_second": 2.519,
+ "eval_steps_per_second": 0.315,
+ "step": 90
+ },
+ {
+ "epoch": 2.84375,
+ "grad_norm": 0.25831877964821937,
+ "learning_rate": 2e-05,
+ "loss": 0.7568,
+ "step": 91
+ },
+ {
+ "epoch": 2.84375,
+ "eval_loss": 0.7404463291168213,
+ "eval_runtime": 81.7767,
+ "eval_samples_per_second": 2.446,
+ "eval_steps_per_second": 0.306,
+ "step": 91
+ },
+ {
+ "epoch": 2.875,
+ "grad_norm": 0.31273388454942935,
+ "learning_rate": 2e-05,
+ "loss": 0.8562,
+ "step": 92
+ },
+ {
+ "epoch": 2.875,
+ "eval_loss": 0.7384185791015625,
+ "eval_runtime": 82.3443,
+ "eval_samples_per_second": 2.429,
+ "eval_steps_per_second": 0.304,
+ "step": 92
+ },
+ {
+ "epoch": 2.90625,
+ "grad_norm": 0.2838267071008901,
+ "learning_rate": 2e-05,
+ "loss": 0.7869,
+ "step": 93
+ },
+ {
+ "epoch": 2.90625,
+ "eval_loss": 0.7366807460784912,
+ "eval_runtime": 82.2622,
+ "eval_samples_per_second": 2.431,
+ "eval_steps_per_second": 0.304,
+ "step": 93
+ },
+ {
+ "epoch": 2.9375,
+ "grad_norm": 0.28625827941831467,
+ "learning_rate": 2e-05,
+ "loss": 0.8618,
+ "step": 94
+ },
+ {
+ "epoch": 2.9375,
+ "eval_loss": 0.7357398867607117,
+ "eval_runtime": 81.9471,
+ "eval_samples_per_second": 2.441,
+ "eval_steps_per_second": 0.305,
+ "step": 94
+ },
+ {
+ "epoch": 2.96875,
+ "grad_norm": 0.25548002643954326,
+ "learning_rate": 2e-05,
+ "loss": 0.8085,
+ "step": 95
+ },
+ {
+ "epoch": 2.96875,
+ "eval_loss": 0.7356534004211426,
+ "eval_runtime": 82.1186,
+ "eval_samples_per_second": 2.436,
+ "eval_steps_per_second": 0.304,
+ "step": 95
+ },
+ {
+ "epoch": 3.0,
+ "grad_norm": 0.27081450830961107,
+ "learning_rate": 2e-05,
+ "loss": 0.7684,
+ "step": 96
+ },
+ {
+ "epoch": 3.0,
+ "eval_loss": 0.7346957921981812,
+ "eval_runtime": 81.5463,
+ "eval_samples_per_second": 2.453,
+ "eval_steps_per_second": 0.307,
+ "step": 96
+ },
+ {
+ "epoch": 3.03125,
+ "grad_norm": 0.2985486737236676,
+ "learning_rate": 2e-05,
+ "loss": 0.7274,
+ "step": 97
+ },
+ {
+ "epoch": 3.03125,
+ "eval_loss": 0.7325752377510071,
+ "eval_runtime": 81.7804,
+ "eval_samples_per_second": 2.446,
+ "eval_steps_per_second": 0.306,
+ "step": 97
+ },
+ {
+ "epoch": 3.0625,
+ "grad_norm": 0.29149719690624026,
+ "learning_rate": 2e-05,
+ "loss": 0.8119,
+ "step": 98
+ },
+ {
+ "epoch": 3.0625,
+ "eval_loss": 0.7298976182937622,
+ "eval_runtime": 76.2764,
+ "eval_samples_per_second": 2.622,
+ "eval_steps_per_second": 0.328,
+ "step": 98
+ },
+ {
+ "epoch": 3.09375,
+ "grad_norm": 0.25227859825215865,
+ "learning_rate": 2e-05,
+ "loss": 0.7888,
+ "step": 99
+ },
+ {
+ "epoch": 3.09375,
+ "eval_loss": 0.727373480796814,
+ "eval_runtime": 76.2418,
+ "eval_samples_per_second": 2.623,
+ "eval_steps_per_second": 0.328,
+ "step": 99
+ },
+ {
+ "epoch": 3.125,
+ "grad_norm": 0.27316954971752555,
+ "learning_rate": 2e-05,
+ "loss": 0.8224,
+ "step": 100
+ },
+ {
+ "epoch": 3.125,
+ "eval_loss": 0.7254325747489929,
+ "eval_runtime": 76.1474,
+ "eval_samples_per_second": 2.626,
+ "eval_steps_per_second": 0.328,
+ "step": 100
+ },
+ {
+ "epoch": 3.15625,
+ "grad_norm": 0.24239788607957785,
+ "learning_rate": 2e-05,
+ "loss": 0.7535,
+ "step": 101
+ },
+ {
+ "epoch": 3.15625,
+ "eval_loss": 0.724058985710144,
+ "eval_runtime": 76.2391,
+ "eval_samples_per_second": 2.623,
+ "eval_steps_per_second": 0.328,
+ "step": 101
+ },
+ {
+ "epoch": 3.1875,
+ "grad_norm": 0.25648385925427025,
+ "learning_rate": 2e-05,
+ "loss": 0.8195,
+ "step": 102
+ },
+ {
+ "epoch": 3.1875,
+ "eval_loss": 0.7235870957374573,
+ "eval_runtime": 76.9134,
+ "eval_samples_per_second": 2.6,
+ "eval_steps_per_second": 0.325,
+ "step": 102
+ },
+ {
+ "epoch": 3.21875,
+ "grad_norm": 0.29620170789161204,
+ "learning_rate": 2e-05,
+ "loss": 0.8224,
+ "step": 103
+ },
+ {
+ "epoch": 3.21875,
+ "eval_loss": 0.7228152751922607,
+ "eval_runtime": 76.095,
+ "eval_samples_per_second": 2.628,
+ "eval_steps_per_second": 0.329,
+ "step": 103
+ },
+ {
+ "epoch": 3.25,
+ "grad_norm": 0.3484116181139593,
+ "learning_rate": 2e-05,
+ "loss": 0.7478,
+ "step": 104
+ },
+ {
+ "epoch": 3.25,
+ "eval_loss": 0.7209363579750061,
+ "eval_runtime": 76.9377,
+ "eval_samples_per_second": 2.6,
+ "eval_steps_per_second": 0.325,
+ "step": 104
+ },
+ {
+ "epoch": 3.28125,
+ "grad_norm": 0.25212350156184643,
+ "learning_rate": 2e-05,
+ "loss": 0.7885,
+ "step": 105
+ },
+ {
+ "epoch": 3.28125,
+ "eval_loss": 0.7197096347808838,
+ "eval_runtime": 76.2008,
+ "eval_samples_per_second": 2.625,
+ "eval_steps_per_second": 0.328,
+ "step": 105
+ },
+ {
+ "epoch": 3.3125,
+ "grad_norm": 0.264200147608962,
+ "learning_rate": 2e-05,
+ "loss": 0.8371,
+ "step": 106
+ },
+ {
+ "epoch": 3.3125,
+ "eval_loss": 0.7197055220603943,
+ "eval_runtime": 78.1542,
+ "eval_samples_per_second": 2.559,
+ "eval_steps_per_second": 0.32,
+ "step": 106
+ },
+ {
+ "epoch": 3.34375,
+ "grad_norm": 0.3309431084940201,
+ "learning_rate": 2e-05,
+ "loss": 0.6999,
+ "step": 107
+ },
+ {
+ "epoch": 3.34375,
+ "eval_loss": 0.7187016010284424,
+ "eval_runtime": 78.4259,
+ "eval_samples_per_second": 2.55,
+ "eval_steps_per_second": 0.319,
+ "step": 107
+ },
+ {
+ "epoch": 3.375,
+ "grad_norm": 0.3131644456919823,
+ "learning_rate": 2e-05,
+ "loss": 0.7587,
+ "step": 108
+ },
+ {
+ "epoch": 3.375,
+ "eval_loss": 0.717018187046051,
+ "eval_runtime": 78.4558,
+ "eval_samples_per_second": 2.549,
+ "eval_steps_per_second": 0.319,
+ "step": 108
+ },
+ {
+ "epoch": 3.40625,
+ "grad_norm": 0.33527684120780293,
+ "learning_rate": 2e-05,
+ "loss": 0.7468,
+ "step": 109
+ },
+ {
+ "epoch": 3.40625,
+ "eval_loss": 0.7147062420845032,
+ "eval_runtime": 78.2334,
+ "eval_samples_per_second": 2.556,
+ "eval_steps_per_second": 0.32,
+ "step": 109
+ },
+ {
+ "epoch": 3.4375,
+ "grad_norm": 0.29542683956231724,
+ "learning_rate": 2e-05,
+ "loss": 0.7477,
+ "step": 110
+ },
+ {
+ "epoch": 3.4375,
+ "eval_loss": 0.7130224704742432,
+ "eval_runtime": 79.1179,
+ "eval_samples_per_second": 2.528,
+ "eval_steps_per_second": 0.316,
+ "step": 110
+ },
+ {
+ "epoch": 3.46875,
+ "grad_norm": 0.31128698002926114,
+ "learning_rate": 2e-05,
+ "loss": 0.8153,
+ "step": 111
+ },
+ {
+ "epoch": 3.46875,
+ "eval_loss": 0.7120551466941833,
+ "eval_runtime": 80.292,
+ "eval_samples_per_second": 2.491,
+ "eval_steps_per_second": 0.311,
+ "step": 111
+ },
+ {
+ "epoch": 3.5,
+ "grad_norm": 0.32502558864214215,
+ "learning_rate": 2e-05,
+ "loss": 0.8043,
+ "step": 112
+ },
+ {
+ "epoch": 3.5,
+ "eval_loss": 0.7117202877998352,
+ "eval_runtime": 79.7539,
+ "eval_samples_per_second": 2.508,
+ "eval_steps_per_second": 0.313,
+ "step": 112
+ },
+ {
+ "epoch": 3.53125,
+ "grad_norm": 0.34335720855758517,
+ "learning_rate": 2e-05,
+ "loss": 0.871,
+ "step": 113
+ },
+ {
+ "epoch": 3.53125,
+ "eval_loss": 0.7117029428482056,
+ "eval_runtime": 80.0281,
+ "eval_samples_per_second": 2.499,
+ "eval_steps_per_second": 0.312,
+ "step": 113
+ },
+ {
+ "epoch": 3.5625,
+ "grad_norm": 0.31951931695644,
+ "learning_rate": 2e-05,
+ "loss": 0.7453,
+ "step": 114
+ },
+ {
+ "epoch": 3.5625,
+ "eval_loss": 0.7116554379463196,
+ "eval_runtime": 79.7209,
+ "eval_samples_per_second": 2.509,
+ "eval_steps_per_second": 0.314,
+ "step": 114
+ },
+ {
+ "epoch": 3.59375,
+ "grad_norm": 0.28067192963874266,
+ "learning_rate": 2e-05,
+ "loss": 0.8045,
+ "step": 115
+ },
+ {
+ "epoch": 3.59375,
+ "eval_loss": 0.7118353843688965,
+ "eval_runtime": 80.0195,
+ "eval_samples_per_second": 2.499,
+ "eval_steps_per_second": 0.312,
+ "step": 115
+ },
+ {
+ "epoch": 3.625,
+ "grad_norm": 0.2739718257400276,
+ "learning_rate": 2e-05,
+ "loss": 0.775,
+ "step": 116
+ },
+ {
+ "epoch": 3.625,
+ "eval_loss": 0.7122579216957092,
+ "eval_runtime": 76.2052,
+ "eval_samples_per_second": 2.624,
+ "eval_steps_per_second": 0.328,
+ "step": 116
+ },
+ {
+ "epoch": 3.65625,
+ "grad_norm": 0.31401723658881836,
+ "learning_rate": 2e-05,
+ "loss": 0.7826,
+ "step": 117
+ },
+ {
+ "epoch": 3.65625,
+ "eval_loss": 0.7118574380874634,
+ "eval_runtime": 76.1509,
+ "eval_samples_per_second": 2.626,
+ "eval_steps_per_second": 0.328,
+ "step": 117
+ },
+ {
+ "epoch": 3.6875,
+ "grad_norm": 0.36925964858634625,
+ "learning_rate": 2e-05,
+ "loss": 0.7884,
+ "step": 118
+ },
+ {
+ "epoch": 3.6875,
+ "eval_loss": 0.710691511631012,
+ "eval_runtime": 76.2305,
+ "eval_samples_per_second": 2.624,
+ "eval_steps_per_second": 0.328,
+ "step": 118
+ },
+ {
+ "epoch": 3.71875,
+ "grad_norm": 0.3050583880654791,
+ "learning_rate": 2e-05,
+ "loss": 0.8402,
+ "step": 119
+ },
+ {
+ "epoch": 3.71875,
+ "eval_loss": 0.7096763849258423,
+ "eval_runtime": 77.0581,
+ "eval_samples_per_second": 2.595,
+ "eval_steps_per_second": 0.324,
+ "step": 119
+ },
+ {
+ "epoch": 3.75,
+ "grad_norm": 0.2648625651290031,
+ "learning_rate": 2e-05,
+ "loss": 0.7889,
+ "step": 120
+ },
+ {
+ "epoch": 3.75,
+ "eval_loss": 0.7094223499298096,
+ "eval_runtime": 76.1379,
+ "eval_samples_per_second": 2.627,
+ "eval_steps_per_second": 0.328,
+ "step": 120
+ },
+ {
+ "epoch": 3.78125,
+ "grad_norm": 0.3107221696449271,
+ "learning_rate": 2e-05,
+ "loss": 0.7615,
+ "step": 121
+ },
+ {
+ "epoch": 3.78125,
+ "eval_loss": 0.7081363201141357,
+ "eval_runtime": 76.626,
+ "eval_samples_per_second": 2.61,
+ "eval_steps_per_second": 0.326,
+ "step": 121
+ },
+ {
+ "epoch": 3.8125,
+ "grad_norm": 0.3455151299995048,
+ "learning_rate": 2e-05,
+ "loss": 0.8342,
+ "step": 122
+ },
+ {
+ "epoch": 3.8125,
+ "eval_loss": 0.7063001990318298,
+ "eval_runtime": 77.0293,
+ "eval_samples_per_second": 2.596,
+ "eval_steps_per_second": 0.325,
+ "step": 122
+ },
+ {
+ "epoch": 3.84375,
+ "grad_norm": 0.28847071926472523,
+ "learning_rate": 2e-05,
+ "loss": 0.7477,
+ "step": 123
+ },
+ {
+ "epoch": 3.84375,
+ "eval_loss": 0.7044610381126404,
+ "eval_runtime": 76.2385,
+ "eval_samples_per_second": 2.623,
+ "eval_steps_per_second": 0.328,
+ "step": 123
+ },
+ {
+ "epoch": 3.875,
+ "grad_norm": 0.26753816515069856,
+ "learning_rate": 2e-05,
+ "loss": 0.7653,
+ "step": 124
+ },
+ {
+ "epoch": 3.875,
+ "eval_loss": 0.7033799886703491,
+ "eval_runtime": 76.1985,
+ "eval_samples_per_second": 2.625,
+ "eval_steps_per_second": 0.328,
+ "step": 124
+ },
+ {
+ "epoch": 3.90625,
+ "grad_norm": 0.3465046292893005,
+ "learning_rate": 2e-05,
+ "loss": 0.8144,
+ "step": 125
+ },
+ {
+ "epoch": 3.90625,
+ "eval_loss": 0.7021930813789368,
+ "eval_runtime": 76.2234,
+ "eval_samples_per_second": 2.624,
+ "eval_steps_per_second": 0.328,
+ "step": 125
+ },
+ {
+ "epoch": 3.9375,
+ "grad_norm": 0.3451690427620698,
+ "learning_rate": 2e-05,
+ "loss": 0.7871,
+ "step": 126
+ },
+ {
+ "epoch": 3.9375,
+ "eval_loss": 0.7013542652130127,
+ "eval_runtime": 78.0752,
+ "eval_samples_per_second": 2.562,
+ "eval_steps_per_second": 0.32,
+ "step": 126
+ },
+ {
+ "epoch": 3.96875,
+ "grad_norm": 0.31571858642673567,
+ "learning_rate": 2e-05,
+ "loss": 0.7568,
+ "step": 127
+ },
+ {
+ "epoch": 3.96875,
+ "eval_loss": 0.7007560729980469,
+ "eval_runtime": 78.3558,
+ "eval_samples_per_second": 2.552,
+ "eval_steps_per_second": 0.319,
+ "step": 127
+ },
+ {
+ "epoch": 4.0,
+ "grad_norm": 0.3247003540270338,
+ "learning_rate": 2e-05,
+ "loss": 0.6714,
+ "step": 128
+ },
+ {
+ "epoch": 4.0,
+ "eval_loss": 0.6999780535697937,
+ "eval_runtime": 78.9788,
+ "eval_samples_per_second": 2.532,
+ "eval_steps_per_second": 0.317,
+ "step": 128
+ },
+ {
+ "epoch": 4.03125,
+ "grad_norm": 0.2814983490019739,
+ "learning_rate": 2e-05,
+ "loss": 0.7797,
+ "step": 129
+ },
+ {
+ "epoch": 4.03125,
+ "eval_loss": 0.6998200416564941,
+ "eval_runtime": 78.3093,
+ "eval_samples_per_second": 2.554,
+ "eval_steps_per_second": 0.319,
+ "step": 129
+ },
+ {
+ "epoch": 4.0625,
+ "grad_norm": 0.31961631715145106,
+ "learning_rate": 2e-05,
+ "loss": 0.7993,
+ "step": 130
+ },
+ {
+ "epoch": 4.0625,
+ "eval_loss": 0.6995271444320679,
+ "eval_runtime": 78.2172,
+ "eval_samples_per_second": 2.557,
+ "eval_steps_per_second": 0.32,
+ "step": 130
+ },
+ {
+ "epoch": 4.09375,
+ "grad_norm": 0.32333364662215863,
+ "learning_rate": 2e-05,
+ "loss": 0.7896,
+ "step": 131
+ },
+ {
+ "epoch": 4.09375,
+ "eval_loss": 0.6992727518081665,
+ "eval_runtime": 79.0125,
+ "eval_samples_per_second": 2.531,
+ "eval_steps_per_second": 0.316,
+ "step": 131
+ },
+ {
+ "epoch": 4.125,
+ "grad_norm": 0.3255859640449829,
+ "learning_rate": 2e-05,
+ "loss": 0.7542,
+ "step": 132
+ },
+ {
+ "epoch": 4.125,
+ "eval_loss": 0.6988572478294373,
+ "eval_runtime": 79.0,
+ "eval_samples_per_second": 2.532,
+ "eval_steps_per_second": 0.316,
+ "step": 132
+ },
+ {
+ "epoch": 4.15625,
+ "grad_norm": 0.3307068947429175,
+ "learning_rate": 2e-05,
+ "loss": 0.8416,
+ "step": 133
+ },
+ {
+ "epoch": 4.15625,
+ "eval_loss": 0.6981343030929565,
+ "eval_runtime": 78.3309,
+ "eval_samples_per_second": 2.553,
+ "eval_steps_per_second": 0.319,
+ "step": 133
+ },
+ {
+ "epoch": 4.1875,
+ "grad_norm": 0.3842303818116732,
+ "learning_rate": 2e-05,
+ "loss": 0.7605,
+ "step": 134
+ },
+ {
+ "epoch": 4.1875,
+ "eval_loss": 0.6968980431556702,
+ "eval_runtime": 78.5608,
+ "eval_samples_per_second": 2.546,
+ "eval_steps_per_second": 0.318,
+ "step": 134
+ },
+ {
+ "epoch": 4.21875,
+ "grad_norm": 0.331839472419003,
+ "learning_rate": 2e-05,
+ "loss": 0.7643,
+ "step": 135
+ },
+ {
+ "epoch": 4.21875,
+ "eval_loss": 0.6955949664115906,
+ "eval_runtime": 78.3566,
+ "eval_samples_per_second": 2.552,
+ "eval_steps_per_second": 0.319,
+ "step": 135
+ },
+ {
+ "epoch": 4.25,
+ "grad_norm": 0.31864813130499836,
+ "learning_rate": 2e-05,
+ "loss": 0.7369,
+ "step": 136
+ },
+ {
+ "epoch": 4.25,
+ "eval_loss": 0.6951528787612915,
+ "eval_runtime": 79.7802,
+ "eval_samples_per_second": 2.507,
+ "eval_steps_per_second": 0.313,
+ "step": 136
+ },
+ {
+ "epoch": 4.28125,
+ "grad_norm": 0.352549164434451,
+ "learning_rate": 2e-05,
+ "loss": 0.7332,
+ "step": 137
+ },
+ {
+ "epoch": 4.28125,
+ "eval_loss": 0.6947290897369385,
+ "eval_runtime": 79.8171,
+ "eval_samples_per_second": 2.506,
+ "eval_steps_per_second": 0.313,
+ "step": 137
+ },
+ {
+ "epoch": 4.3125,
+ "grad_norm": 0.37128812818896284,
+ "learning_rate": 2e-05,
+ "loss": 0.7542,
+ "step": 138
+ },
+ {
+ "epoch": 4.3125,
+ "eval_loss": 0.6937370300292969,
+ "eval_runtime": 79.7782,
+ "eval_samples_per_second": 2.507,
+ "eval_steps_per_second": 0.313,
+ "step": 138
+ },
+ {
+ "epoch": 4.34375,
+ "grad_norm": 0.3348014941412048,
+ "learning_rate": 2e-05,
+ "loss": 0.7079,
+ "step": 139
+ },
+ {
+ "epoch": 4.34375,
+ "eval_loss": 0.692456066608429,
+ "eval_runtime": 79.9308,
+ "eval_samples_per_second": 2.502,
+ "eval_steps_per_second": 0.313,
+ "step": 139
+ },
+ {
+ "epoch": 4.375,
+ "grad_norm": 0.34411051658527964,
+ "learning_rate": 2e-05,
+ "loss": 0.7465,
+ "step": 140
+ },
+ {
+ "epoch": 4.375,
+ "eval_loss": 0.6915809512138367,
+ "eval_runtime": 79.943,
+ "eval_samples_per_second": 2.502,
+ "eval_steps_per_second": 0.313,
+ "step": 140
+ },
+ {
+ "epoch": 4.40625,
+ "grad_norm": 0.3373909601921749,
+ "learning_rate": 2e-05,
+ "loss": 0.7648,
+ "step": 141
+ },
+ {
+ "epoch": 4.40625,
+ "eval_loss": 0.6912103295326233,
+ "eval_runtime": 79.8515,
+ "eval_samples_per_second": 2.505,
+ "eval_steps_per_second": 0.313,
+ "step": 141
+ },
+ {
+ "epoch": 4.4375,
+ "grad_norm": 0.33253827371305456,
+ "learning_rate": 2e-05,
+ "loss": 0.7224,
+ "step": 142
+ },
+ {
+ "epoch": 4.4375,
+ "eval_loss": 0.6912806630134583,
+ "eval_runtime": 80.6475,
+ "eval_samples_per_second": 2.48,
+ "eval_steps_per_second": 0.31,
+ "step": 142
+ },
+ {
+ "epoch": 4.46875,
+ "grad_norm": 0.38458075172588313,
+ "learning_rate": 2e-05,
+ "loss": 0.7261,
+ "step": 143
+ },
+ {
+ "epoch": 4.46875,
+ "eval_loss": 0.6905419230461121,
+ "eval_runtime": 80.2606,
+ "eval_samples_per_second": 2.492,
+ "eval_steps_per_second": 0.311,
+ "step": 143
+ },
+ {
+ "epoch": 4.5,
+ "grad_norm": 0.31351962640463144,
+ "learning_rate": 2e-05,
+ "loss": 0.6909,
+ "step": 144
+ },
+ {
+ "epoch": 4.5,
+ "eval_loss": 0.6898491382598877,
+ "eval_runtime": 79.9965,
+ "eval_samples_per_second": 2.5,
+ "eval_steps_per_second": 0.313,
+ "step": 144
+ },
+ {
+ "epoch": 4.53125,
+ "grad_norm": 0.35474372115704583,
+ "learning_rate": 2e-05,
+ "loss": 0.7605,
+ "step": 145
+ },
+ {
+ "epoch": 4.53125,
+ "eval_loss": 0.6893147230148315,
+ "eval_runtime": 1475.5758,
+ "eval_samples_per_second": 0.136,
+ "eval_steps_per_second": 0.017,
+ "step": 145
+ },
+ {
+ "epoch": 4.5625,
+ "grad_norm": 0.3479568917421202,
+ "learning_rate": 2e-05,
+ "loss": 0.6638,
+ "step": 146
+ },
+ {
+ "epoch": 4.5625,
+ "eval_loss": 0.6884538531303406,
+ "eval_runtime": 84.6835,
+ "eval_samples_per_second": 2.362,
+ "eval_steps_per_second": 0.295,
+ "step": 146
+ },
+ {
+ "epoch": 4.59375,
+ "grad_norm": 0.3421823344428645,
+ "learning_rate": 2e-05,
+ "loss": 0.7339,
+ "step": 147
+ },
+ {
+ "epoch": 4.59375,
+ "eval_loss": 0.6873475909233093,
+ "eval_runtime": 83.3138,
+ "eval_samples_per_second": 2.401,
+ "eval_steps_per_second": 0.3,
+ "step": 147
+ },
+ {
+ "epoch": 4.625,
+ "grad_norm": 0.3642187020830788,
+ "learning_rate": 2e-05,
+ "loss": 0.6825,
+ "step": 148
+ },
+ {
+ "epoch": 4.625,
+ "eval_loss": 0.6858401298522949,
+ "eval_runtime": 82.1066,
+ "eval_samples_per_second": 2.436,
+ "eval_steps_per_second": 0.304,
+ "step": 148
+ },
+ {
+ "epoch": 4.65625,
+ "grad_norm": 0.35097547901391785,
+ "learning_rate": 2e-05,
+ "loss": 0.7986,
+ "step": 149
+ },
+ {
+ "epoch": 4.65625,
+ "eval_loss": 0.6848779320716858,
+ "eval_runtime": 84.4076,
+ "eval_samples_per_second": 2.369,
+ "eval_steps_per_second": 0.296,
+ "step": 149
+ },
+ {
+ "epoch": 4.6875,
+ "grad_norm": 0.3568694843794629,
+ "learning_rate": 2e-05,
+ "loss": 0.7176,
+ "step": 150
+ },
+ {
+ "epoch": 4.6875,
+ "eval_loss": 0.6842290759086609,
+ "eval_runtime": 82.5945,
+ "eval_samples_per_second": 2.421,
+ "eval_steps_per_second": 0.303,
+ "step": 150
+ },
+ {
+ "epoch": 4.71875,
+ "grad_norm": 0.34258633585260334,
+ "learning_rate": 2e-05,
+ "loss": 0.7363,
+ "step": 151
+ },
+ {
+ "epoch": 4.71875,
+ "eval_loss": 0.6838659048080444,
+ "eval_runtime": 85.9626,
+ "eval_samples_per_second": 2.327,
+ "eval_steps_per_second": 0.291,
+ "step": 151
+ },
+ {
+ "epoch": 4.75,
+ "grad_norm": 0.42319523894659655,
+ "learning_rate": 2e-05,
+ "loss": 0.7675,
+ "step": 152
+ },
+ {
+ "epoch": 4.75,
+ "eval_loss": 0.6830299496650696,
+ "eval_runtime": 85.7189,
+ "eval_samples_per_second": 2.333,
+ "eval_steps_per_second": 0.292,
+ "step": 152
+ },
+ {
+ "epoch": 4.78125,
+ "grad_norm": 0.3632195533127194,
+ "learning_rate": 2e-05,
+ "loss": 0.715,
+ "step": 153
+ },
+ {
+ "epoch": 4.78125,
+ "eval_loss": 0.6826379895210266,
+ "eval_runtime": 87.8244,
+ "eval_samples_per_second": 2.277,
+ "eval_steps_per_second": 0.285,
+ "step": 153
+ },
+ {
+ "epoch": 4.8125,
+ "grad_norm": 0.3738308004604413,
+ "learning_rate": 2e-05,
+ "loss": 0.7344,
+ "step": 154
+ },
+ {
+ "epoch": 4.8125,
+ "eval_loss": 0.6826817393302917,
+ "eval_runtime": 86.5822,
+ "eval_samples_per_second": 2.31,
+ "eval_steps_per_second": 0.289,
+ "step": 154
+ },
+ {
+ "epoch": 4.84375,
+ "grad_norm": 0.3618696330632776,
+ "learning_rate": 2e-05,
+ "loss": 0.6632,
+ "step": 155
+ },
+ {
+ "epoch": 4.84375,
+ "eval_loss": 0.6827967166900635,
+ "eval_runtime": 82.1829,
+ "eval_samples_per_second": 2.434,
+ "eval_steps_per_second": 0.304,
+ "step": 155
+ },
+ {
+ "epoch": 4.875,
+ "grad_norm": 0.38901912569992203,
+ "learning_rate": 2e-05,
+ "loss": 0.7788,
+ "step": 156
+ },
+ {
+ "epoch": 4.875,
+ "eval_loss": 0.6821711659431458,
+ "eval_runtime": 84.4511,
+ "eval_samples_per_second": 2.368,
+ "eval_steps_per_second": 0.296,
+ "step": 156
+ },
+ {
+ "epoch": 4.90625,
+ "grad_norm": 0.3516096507348829,
+ "learning_rate": 2e-05,
+ "loss": 0.7794,
+ "step": 157
+ },
+ {
+ "epoch": 4.90625,
+ "eval_loss": 0.6819837689399719,
+ "eval_runtime": 84.1594,
+ "eval_samples_per_second": 2.376,
+ "eval_steps_per_second": 0.297,
+ "step": 157
+ },
+ {
+ "epoch": 4.9375,
+ "grad_norm": 0.36066902463794986,
+ "learning_rate": 2e-05,
+ "loss": 0.7674,
+ "step": 158
+ },
+ {
+ "epoch": 4.9375,
+ "eval_loss": 0.6817716956138611,
+ "eval_runtime": 83.8929,
+ "eval_samples_per_second": 2.384,
+ "eval_steps_per_second": 0.298,
+ "step": 158
+ },
+ {
+ "epoch": 4.96875,
+ "grad_norm": 0.36641784926154175,
+ "learning_rate": 2e-05,
+ "loss": 0.7116,
+ "step": 159
+ },
+ {
+ "epoch": 4.96875,
+ "eval_loss": 0.6816902160644531,
+ "eval_runtime": 84.4431,
+ "eval_samples_per_second": 2.368,
+ "eval_steps_per_second": 0.296,
+ "step": 159
+ },
+ {
+ "epoch": 5.0,
+ "grad_norm": 0.4020716293225933,
+ "learning_rate": 2e-05,
+ "loss": 0.7142,
+ "step": 160
+ },
+ {
+ "epoch": 5.0,
+ "eval_loss": 0.6811469793319702,
+ "eval_runtime": 86.0681,
+ "eval_samples_per_second": 2.324,
+ "eval_steps_per_second": 0.29,
+ "step": 160
+ },
+ {
+ "epoch": 5.03125,
+ "grad_norm": 0.38360882669254054,
+ "learning_rate": 2e-05,
+ "loss": 0.6756,
+ "step": 161
+ },
+ {
+ "epoch": 5.03125,
+ "eval_loss": 0.6798409223556519,
+ "eval_runtime": 81.9903,
+ "eval_samples_per_second": 2.439,
+ "eval_steps_per_second": 0.305,
+ "step": 161
+ },
+ {
+ "epoch": 5.0625,
+ "grad_norm": 0.34966156213066135,
+ "learning_rate": 2e-05,
+ "loss": 0.827,
+ "step": 162
+ },
+ {
+ "epoch": 5.0625,
+ "eval_loss": 0.6788859367370605,
+ "eval_runtime": 76.1753,
+ "eval_samples_per_second": 2.626,
+ "eval_steps_per_second": 0.328,
+ "step": 162
+ },
+ {
+ "epoch": 5.09375,
+ "grad_norm": 0.41140842939901384,
+ "learning_rate": 2e-05,
+ "loss": 0.6409,
+ "step": 163
+ },
+ {
+ "epoch": 5.09375,
+ "eval_loss": 0.6787077188491821,
+ "eval_runtime": 76.2239,
+ "eval_samples_per_second": 2.624,
+ "eval_steps_per_second": 0.328,
+ "step": 163
+ },
+ {
+ "epoch": 5.125,
+ "grad_norm": 0.4222084070163774,
+ "learning_rate": 2e-05,
+ "loss": 0.7774,
+ "step": 164
+ },
+ {
+ "epoch": 5.125,
+ "eval_loss": 0.6796822547912598,
+ "eval_runtime": 76.2141,
+ "eval_samples_per_second": 2.624,
+ "eval_steps_per_second": 0.328,
+ "step": 164
+ },
+ {
+ "epoch": 5.15625,
+ "grad_norm": 0.4644454724424921,
+ "learning_rate": 2e-05,
+ "loss": 0.6057,
+ "step": 165
+ },
+ {
+ "epoch": 5.15625,
+ "eval_loss": 0.6794346570968628,
+ "eval_runtime": 76.3216,
+ "eval_samples_per_second": 2.62,
+ "eval_steps_per_second": 0.328,
+ "step": 165
+ },
+ {
+ "epoch": 5.1875,
+ "grad_norm": 0.46128725263272996,
+ "learning_rate": 2e-05,
+ "loss": 0.7158,
+ "step": 166
+ },
+ {
+ "epoch": 5.1875,
+ "eval_loss": 0.6791612505912781,
+ "eval_runtime": 78.4909,
+ "eval_samples_per_second": 2.548,
+ "eval_steps_per_second": 0.319,
+ "step": 166
+ },
+ {
+ "epoch": 5.21875,
+ "grad_norm": 0.37300666872025545,
+ "learning_rate": 2e-05,
+ "loss": 0.7363,
+ "step": 167
+ },
+ {
+ "epoch": 5.21875,
+ "eval_loss": 0.6788016557693481,
+ "eval_runtime": 78.5697,
+ "eval_samples_per_second": 2.546,
+ "eval_steps_per_second": 0.318,
+ "step": 167
+ },
+ {
+ "epoch": 5.25,
+ "grad_norm": 0.41454648576180214,
+ "learning_rate": 2e-05,
+ "loss": 0.7759,
+ "step": 168
+ },
+ {
+ "epoch": 5.25,
+ "eval_loss": 0.6787048578262329,
+ "eval_runtime": 78.5317,
+ "eval_samples_per_second": 2.547,
+ "eval_steps_per_second": 0.318,
+ "step": 168
+ },
+ {
+ "epoch": 5.28125,
+ "grad_norm": 0.40724665091386236,
+ "learning_rate": 2e-05,
+ "loss": 0.6944,
+ "step": 169
+ },
+ {
+ "epoch": 5.28125,
+ "eval_loss": 0.679679811000824,
+ "eval_runtime": 78.6899,
+ "eval_samples_per_second": 2.542,
+ "eval_steps_per_second": 0.318,
+ "step": 169
+ },
+ {
+ "epoch": 5.3125,
+ "grad_norm": 0.3875110486208986,
+ "learning_rate": 2e-05,
+ "loss": 0.6634,
+ "step": 170
+ },
+ {
+ "epoch": 5.3125,
+ "eval_loss": 0.6819935441017151,
+ "eval_runtime": 78.3617,
+ "eval_samples_per_second": 2.552,
+ "eval_steps_per_second": 0.319,
+ "step": 170
+ },
+ {
+ "epoch": 5.34375,
+ "grad_norm": 0.47956532155617193,
+ "learning_rate": 2e-05,
+ "loss": 0.687,
+ "step": 171
+ },
+ {
+ "epoch": 5.34375,
+ "eval_loss": 0.6825206875801086,
+ "eval_runtime": 78.4435,
+ "eval_samples_per_second": 2.55,
+ "eval_steps_per_second": 0.319,
+ "step": 171
+ },
+ {
+ "epoch": 5.375,
+ "grad_norm": 0.4599359590587781,
+ "learning_rate": 2e-05,
+ "loss": 0.7718,
+ "step": 172
+ },
+ {
+ "epoch": 5.375,
+ "eval_loss": 0.6816768050193787,
+ "eval_runtime": 78.3005,
+ "eval_samples_per_second": 2.554,
+ "eval_steps_per_second": 0.319,
+ "step": 172
+ },
+ {
+ "epoch": 5.40625,
+ "grad_norm": 0.4057490487995386,
+ "learning_rate": 2e-05,
+ "loss": 0.7292,
+ "step": 173
+ },
+ {
+ "epoch": 5.40625,
+ "eval_loss": 0.6806090474128723,
+ "eval_runtime": 78.3313,
+ "eval_samples_per_second": 2.553,
+ "eval_steps_per_second": 0.319,
+ "step": 173
+ },
+ {
+ "epoch": 5.4375,
+ "grad_norm": 0.4143979315360467,
+ "learning_rate": 2e-05,
+ "loss": 0.7697,
+ "step": 174
+ },
+ {
+ "epoch": 5.4375,
+ "eval_loss": 0.6795693039894104,
+ "eval_runtime": 78.4526,
+ "eval_samples_per_second": 2.549,
+ "eval_steps_per_second": 0.319,
+ "step": 174
+ },
+ {
+ "epoch": 5.46875,
+ "grad_norm": 0.4219663662343445,
+ "learning_rate": 2e-05,
+ "loss": 0.7534,
+ "step": 175
+ },
+ {
+ "epoch": 5.46875,
+ "eval_loss": 0.6793847680091858,
+ "eval_runtime": 78.8009,
+ "eval_samples_per_second": 2.538,
+ "eval_steps_per_second": 0.317,
+ "step": 175
+ },
+ {
+ "epoch": 5.5,
+ "grad_norm": 0.4491811321927657,
+ "learning_rate": 2e-05,
+ "loss": 0.7004,
+ "step": 176
+ },
+ {
+ "epoch": 5.5,
+ "eval_loss": 0.6775352358818054,
+ "eval_runtime": 80.0685,
+ "eval_samples_per_second": 2.498,
+ "eval_steps_per_second": 0.312,
+ "step": 176
+ },
+ {
+ "epoch": 5.53125,
+ "grad_norm": 0.46366516532638885,
+ "learning_rate": 2e-05,
+ "loss": 0.7357,
+ "step": 177
+ },
+ {
+ "epoch": 5.53125,
+ "eval_loss": 0.6748698949813843,
+ "eval_runtime": 80.0487,
+ "eval_samples_per_second": 2.498,
+ "eval_steps_per_second": 0.312,
+ "step": 177
+ },
+ {
+ "epoch": 5.5625,
+ "grad_norm": 0.3815188640227797,
+ "learning_rate": 2e-05,
+ "loss": 0.7592,
+ "step": 178
+ },
+ {
+ "epoch": 5.5625,
+ "eval_loss": 0.6728273034095764,
+ "eval_runtime": 80.0318,
+ "eval_samples_per_second": 2.499,
+ "eval_steps_per_second": 0.312,
+ "step": 178
+ },
+ {
+ "epoch": 5.59375,
+ "grad_norm": 0.41025429416666304,
+ "learning_rate": 2e-05,
+ "loss": 0.6585,
+ "step": 179
+ },
+ {
+ "epoch": 5.59375,
+ "eval_loss": 0.6718859672546387,
+ "eval_runtime": 79.8801,
+ "eval_samples_per_second": 2.504,
+ "eval_steps_per_second": 0.313,
+ "step": 179
+ },
+ {
+ "epoch": 5.625,
+ "grad_norm": 0.40652817592240054,
+ "learning_rate": 2e-05,
+ "loss": 0.6611,
+ "step": 180
+ },
+ {
+ "epoch": 5.625,
+ "eval_loss": 0.6715708374977112,
+ "eval_runtime": 76.7261,
+ "eval_samples_per_second": 2.607,
+ "eval_steps_per_second": 0.326,
+ "step": 180
+ },
+ {
+ "epoch": 5.65625,
+ "grad_norm": 0.40753961326688415,
+ "learning_rate": 2e-05,
+ "loss": 0.6779,
+ "step": 181
+ },
+ {
+ "epoch": 5.65625,
+ "eval_loss": 0.6719761490821838,
+ "eval_runtime": 77.0136,
+ "eval_samples_per_second": 2.597,
+ "eval_steps_per_second": 0.325,
+ "step": 181
+ },
+ {
+ "epoch": 5.6875,
+ "grad_norm": 0.4232811980671673,
+ "learning_rate": 2e-05,
+ "loss": 0.6475,
+ "step": 182
+ },
+ {
+ "epoch": 5.6875,
+ "eval_loss": 0.6724664568901062,
+ "eval_runtime": 76.9731,
+ "eval_samples_per_second": 2.598,
+ "eval_steps_per_second": 0.325,
+ "step": 182
+ },
+ {
+ "epoch": 5.71875,
+ "grad_norm": 0.5132756318549849,
+ "learning_rate": 2e-05,
+ "loss": 0.6801,
+ "step": 183
+ },
+ {
+ "epoch": 5.71875,
+ "eval_loss": 0.6723365783691406,
+ "eval_runtime": 76.4132,
+ "eval_samples_per_second": 2.617,
+ "eval_steps_per_second": 0.327,
+ "step": 183
+ },
+ {
+ "epoch": 5.75,
+ "grad_norm": 0.43526879230161264,
+ "learning_rate": 2e-05,
+ "loss": 0.6673,
+ "step": 184
+ },
+ {
+ "epoch": 5.75,
+ "eval_loss": 0.672926664352417,
+ "eval_runtime": 76.1936,
+ "eval_samples_per_second": 2.625,
+ "eval_steps_per_second": 0.328,
+ "step": 184
+ },
+ {
+ "epoch": 5.78125,
+ "grad_norm": 0.46965560853038507,
+ "learning_rate": 2e-05,
+ "loss": 0.7074,
+ "step": 185
+ },
+ {
+ "epoch": 5.78125,
+ "eval_loss": 0.6731134057044983,
+ "eval_runtime": 76.2345,
+ "eval_samples_per_second": 2.623,
+ "eval_steps_per_second": 0.328,
+ "step": 185
+ },
+ {
+ "epoch": 5.8125,
+ "grad_norm": 0.4733296318676217,
+ "learning_rate": 2e-05,
+ "loss": 0.6791,
+ "step": 186
+ },
+ {
+ "epoch": 5.8125,
+ "eval_loss": 0.6726363301277161,
+ "eval_runtime": 78.3939,
+ "eval_samples_per_second": 2.551,
+ "eval_steps_per_second": 0.319,
+ "step": 186
+ },
+ {
+ "epoch": 5.84375,
+ "grad_norm": 0.4662943253655961,
+ "learning_rate": 2e-05,
+ "loss": 0.7371,
+ "step": 187
+ },
+ {
+ "epoch": 5.84375,
+ "eval_loss": 0.6726526021957397,
+ "eval_runtime": 79.1834,
+ "eval_samples_per_second": 2.526,
+ "eval_steps_per_second": 0.316,
+ "step": 187
+ },
+ {
+ "epoch": 5.875,
+ "grad_norm": 0.4420962889993382,
+ "learning_rate": 2e-05,
+ "loss": 0.675,
+ "step": 188
+ },
+ {
+ "epoch": 5.875,
+ "eval_loss": 0.6727125644683838,
+ "eval_runtime": 78.252,
+ "eval_samples_per_second": 2.556,
+ "eval_steps_per_second": 0.319,
+ "step": 188
+ },
+ {
+ "epoch": 5.90625,
+ "grad_norm": 0.4345166976944551,
+ "learning_rate": 2e-05,
+ "loss": 0.6748,
+ "step": 189
+ },
+ {
+ "epoch": 5.90625,
+ "eval_loss": 0.6725904941558838,
+ "eval_runtime": 78.3914,
+ "eval_samples_per_second": 2.551,
+ "eval_steps_per_second": 0.319,
+ "step": 189
+ },
+ {
+ "epoch": 5.9375,
+ "grad_norm": 0.45109463315374526,
+ "learning_rate": 2e-05,
+ "loss": 0.7024,
+ "step": 190
+ },
+ {
+ "epoch": 5.9375,
+ "eval_loss": 0.6718384027481079,
+ "eval_runtime": 78.4361,
+ "eval_samples_per_second": 2.55,
+ "eval_steps_per_second": 0.319,
+ "step": 190
+ },
+ {
+ "epoch": 5.96875,
+ "grad_norm": 0.42953871838795626,
+ "learning_rate": 2e-05,
+ "loss": 0.6904,
+ "step": 191
+ },
+ {
+ "epoch": 5.96875,
+ "eval_loss": 0.6703083515167236,
+ "eval_runtime": 78.3863,
+ "eval_samples_per_second": 2.551,
+ "eval_steps_per_second": 0.319,
+ "step": 191
+ },
+ {
+ "epoch": 6.0,
+ "grad_norm": 0.4248607379284984,
+ "learning_rate": 2e-05,
+ "loss": 0.6659,
+ "step": 192
+ },
+ {
+ "epoch": 6.0,
+ "eval_loss": 0.6693080067634583,
+ "eval_runtime": 78.4373,
+ "eval_samples_per_second": 2.55,
+ "eval_steps_per_second": 0.319,
+ "step": 192
+ },
+ {
+ "epoch": 6.03125,
+ "grad_norm": 0.42839417453459494,
+ "learning_rate": 2e-05,
+ "loss": 0.7457,
+ "step": 193
+ },
+ {
+ "epoch": 6.03125,
+ "eval_loss": 0.6689594984054565,
+ "eval_runtime": 78.4169,
+ "eval_samples_per_second": 2.55,
+ "eval_steps_per_second": 0.319,
+ "step": 193
+ },
+ {
+ "epoch": 6.0625,
+ "grad_norm": 0.4216922788166874,
+ "learning_rate": 2e-05,
+ "loss": 0.7189,
+ "step": 194
+ },
+ {
+ "epoch": 6.0625,
+ "eval_loss": 0.6689300537109375,
+ "eval_runtime": 78.9793,
+ "eval_samples_per_second": 2.532,
+ "eval_steps_per_second": 0.317,
+ "step": 194
+ },
+ {
+ "epoch": 6.09375,
+ "grad_norm": 0.45199575791858004,
+ "learning_rate": 2e-05,
+ "loss": 0.6438,
+ "step": 195
+ },
+ {
+ "epoch": 6.09375,
+ "eval_loss": 0.6690151691436768,
+ "eval_runtime": 78.5002,
+ "eval_samples_per_second": 2.548,
+ "eval_steps_per_second": 0.318,
+ "step": 195
+ },
+ {
+ "epoch": 6.125,
+ "grad_norm": 0.4166923177293841,
+ "learning_rate": 2e-05,
+ "loss": 0.6885,
+ "step": 196
+ },
+ {
+ "epoch": 6.125,
+ "eval_loss": 0.6688613891601562,
+ "eval_runtime": 80.5497,
+ "eval_samples_per_second": 2.483,
+ "eval_steps_per_second": 0.31,
+ "step": 196
+ },
+ {
+ "epoch": 6.15625,
+ "grad_norm": 0.45164281863366285,
+ "learning_rate": 2e-05,
+ "loss": 0.7197,
+ "step": 197
+ },
+ {
+ "epoch": 6.15625,
+ "eval_loss": 0.6687932014465332,
+ "eval_runtime": 80.1482,
+ "eval_samples_per_second": 2.495,
+ "eval_steps_per_second": 0.312,
+ "step": 197
+ },
+ {
+ "epoch": 6.1875,
+ "grad_norm": 0.45653924787504446,
+ "learning_rate": 2e-05,
+ "loss": 0.776,
+ "step": 198
+ },
+ {
+ "epoch": 6.1875,
+ "eval_loss": 0.6690963506698608,
+ "eval_runtime": 80.4464,
+ "eval_samples_per_second": 2.486,
+ "eval_steps_per_second": 0.311,
+ "step": 198
+ },
+ {
+ "epoch": 6.21875,
+ "grad_norm": 0.4966562341334706,
+ "learning_rate": 2e-05,
+ "loss": 0.6532,
+ "step": 199
+ },
+ {
+ "epoch": 6.21875,
+ "eval_loss": 0.669116735458374,
+ "eval_runtime": 79.8294,
+ "eval_samples_per_second": 2.505,
+ "eval_steps_per_second": 0.313,
+ "step": 199
+ },
+ {
+ "epoch": 6.25,
+ "grad_norm": 0.4838469303220975,
+ "learning_rate": 2e-05,
+ "loss": 0.6883,
+ "step": 200
+ },
+ {
+ "epoch": 6.25,
+ "eval_loss": 0.6693156957626343,
+ "eval_runtime": 80.25,
+ "eval_samples_per_second": 2.492,
+ "eval_steps_per_second": 0.312,
+ "step": 200
+ },
+ {
+ "epoch": 6.28125,
+ "grad_norm": 0.4836820906895964,
+ "learning_rate": 2e-05,
+ "loss": 0.7106,
+ "step": 201
+ },
+ {
+ "epoch": 6.28125,
+ "eval_loss": 0.6704170107841492,
+ "eval_runtime": 79.9636,
+ "eval_samples_per_second": 2.501,
+ "eval_steps_per_second": 0.313,
+ "step": 201
+ },
+ {
+ "epoch": 6.3125,
+ "grad_norm": 0.4945855983140219,
+ "learning_rate": 2e-05,
+ "loss": 0.6336,
+ "step": 202
+ },
+ {
+ "epoch": 6.3125,
+ "eval_loss": 0.6708824038505554,
+ "eval_runtime": 80.8044,
+ "eval_samples_per_second": 2.475,
+ "eval_steps_per_second": 0.309,
+ "step": 202
+ },
+ {
+ "epoch": 6.34375,
+ "grad_norm": 0.44587847230103017,
+ "learning_rate": 2e-05,
+ "loss": 0.7811,
+ "step": 203
+ },
+ {
+ "epoch": 6.34375,
+ "eval_loss": 0.6723968982696533,
+ "eval_runtime": 80.1715,
+ "eval_samples_per_second": 2.495,
+ "eval_steps_per_second": 0.312,
+ "step": 203
+ },
+ {
+ "epoch": 6.375,
+ "grad_norm": 0.5351063503195825,
+ "learning_rate": 2e-05,
+ "loss": 0.6222,
+ "step": 204
+ },
+ {
+ "epoch": 6.375,
+ "eval_loss": 0.672196626663208,
+ "eval_runtime": 79.927,
+ "eval_samples_per_second": 2.502,
+ "eval_steps_per_second": 0.313,
+ "step": 204
+ },
+ {
+ "epoch": 6.40625,
+ "grad_norm": 0.4742985088010474,
+ "learning_rate": 2e-05,
+ "loss": 0.6157,
+ "step": 205
+ },
+ {
+ "epoch": 6.40625,
+ "eval_loss": 0.671062171459198,
+ "eval_runtime": 80.1997,
+ "eval_samples_per_second": 2.494,
+ "eval_steps_per_second": 0.312,
+ "step": 205
+ },
+ {
+ "epoch": 6.4375,
+ "grad_norm": 0.5188882333349506,
+ "learning_rate": 2e-05,
+ "loss": 0.6462,
+ "step": 206
+ },
+ {
+ "epoch": 6.4375,
+ "eval_loss": 0.6701972484588623,
+ "eval_runtime": 81.6643,
+ "eval_samples_per_second": 2.449,
+ "eval_steps_per_second": 0.306,
+ "step": 206
+ },
+ {
+ "epoch": 6.46875,
+ "grad_norm": 0.45328063593983603,
+ "learning_rate": 2e-05,
+ "loss": 0.7058,
+ "step": 207
+ },
+ {
+ "epoch": 6.46875,
+ "eval_loss": 0.6699164509773254,
+ "eval_runtime": 81.2228,
+ "eval_samples_per_second": 2.462,
+ "eval_steps_per_second": 0.308,
+ "step": 207
+ },
+ {
+ "epoch": 6.5,
+ "grad_norm": 0.5197645538332801,
+ "learning_rate": 2e-05,
+ "loss": 0.6462,
+ "step": 208
+ },
+ {
+ "epoch": 6.5,
+ "eval_loss": 0.6702597141265869,
+ "eval_runtime": 81.1451,
+ "eval_samples_per_second": 2.465,
+ "eval_steps_per_second": 0.308,
+ "step": 208
+ },
+ {
+ "epoch": 6.53125,
+ "grad_norm": 0.5762528184834232,
+ "learning_rate": 2e-05,
+ "loss": 0.6259,
+ "step": 209
+ },
+ {
+ "epoch": 6.53125,
+ "eval_loss": 0.6696366667747498,
+ "eval_runtime": 81.1643,
+ "eval_samples_per_second": 2.464,
+ "eval_steps_per_second": 0.308,
+ "step": 209
+ },
+ {
+ "epoch": 6.5625,
+ "grad_norm": 0.5249503180293145,
+ "learning_rate": 2e-05,
+ "loss": 0.6045,
+ "step": 210
+ },
+ {
+ "epoch": 6.5625,
+ "eval_loss": 0.6688054800033569,
+ "eval_runtime": 80.9492,
+ "eval_samples_per_second": 2.471,
+ "eval_steps_per_second": 0.309,
+ "step": 210
+ },
+ {
+ "epoch": 6.59375,
+ "grad_norm": 0.543503888655844,
+ "learning_rate": 2e-05,
+ "loss": 0.6496,
+ "step": 211
+ },
+ {
+ "epoch": 6.59375,
+ "eval_loss": 0.6689916849136353,
+ "eval_runtime": 81.6473,
+ "eval_samples_per_second": 2.45,
+ "eval_steps_per_second": 0.306,
+ "step": 211
+ },
+ {
+ "epoch": 6.625,
+ "grad_norm": 0.48119553592193554,
+ "learning_rate": 2e-05,
+ "loss": 0.6211,
+ "step": 212
+ },
+ {
+ "epoch": 6.625,
+ "eval_loss": 0.6703050136566162,
+ "eval_runtime": 81.9207,
+ "eval_samples_per_second": 2.441,
+ "eval_steps_per_second": 0.305,
+ "step": 212
+ },
+ {
+ "epoch": 6.65625,
+ "grad_norm": 0.5153356086819314,
+ "learning_rate": 2e-05,
+ "loss": 0.7135,
+ "step": 213
+ },
+ {
+ "epoch": 6.65625,
+ "eval_loss": 0.6702842116355896,
+ "eval_runtime": 81.1503,
+ "eval_samples_per_second": 2.465,
+ "eval_steps_per_second": 0.308,
+ "step": 213
+ },
+ {
+ "epoch": 6.6875,
+ "grad_norm": 0.5249915042825578,
+ "learning_rate": 2e-05,
+ "loss": 0.6635,
+ "step": 214
+ },
+ {
+ "epoch": 6.6875,
+ "eval_loss": 0.6687333583831787,
+ "eval_runtime": 81.6743,
+ "eval_samples_per_second": 2.449,
+ "eval_steps_per_second": 0.306,
+ "step": 214
+ },
+ {
+ "epoch": 6.71875,
+ "grad_norm": 0.5204840219868723,
+ "learning_rate": 2e-05,
+ "loss": 0.6701,
+ "step": 215
+ },
+ {
+ "epoch": 6.71875,
+ "eval_loss": 0.6657728552818298,
+ "eval_runtime": 81.106,
+ "eval_samples_per_second": 2.466,
+ "eval_steps_per_second": 0.308,
+ "step": 215
+ },
+ {
+ "epoch": 6.75,
+ "grad_norm": 0.5266935225120133,
+ "learning_rate": 2e-05,
+ "loss": 0.6637,
+ "step": 216
+ },
+ {
+ "epoch": 6.75,
+ "eval_loss": 0.6641908884048462,
+ "eval_runtime": 82.2613,
+ "eval_samples_per_second": 2.431,
+ "eval_steps_per_second": 0.304,
+ "step": 216
+ },
+ {
+ "epoch": 6.78125,
+ "grad_norm": 0.5438859451742696,
+ "learning_rate": 2e-05,
+ "loss": 0.6168,
+ "step": 217
+ },
+ {
+ "epoch": 6.78125,
+ "eval_loss": 0.6652233600616455,
+ "eval_runtime": 82.042,
+ "eval_samples_per_second": 2.438,
+ "eval_steps_per_second": 0.305,
+ "step": 217
+ },
+ {
+ "epoch": 6.8125,
+ "grad_norm": 0.5716385253433929,
+ "learning_rate": 2e-05,
+ "loss": 0.6062,
+ "step": 218
+ },
+ {
+ "epoch": 6.8125,
+ "eval_loss": 0.6656240820884705,
+ "eval_runtime": 81.233,
+ "eval_samples_per_second": 2.462,
+ "eval_steps_per_second": 0.308,
+ "step": 218
+ },
+ {
+ "epoch": 6.84375,
+ "grad_norm": 1.0572787630142522,
+ "learning_rate": 2e-05,
+ "loss": 0.7037,
+ "step": 219
+ },
+ {
+ "epoch": 6.84375,
+ "eval_loss": 0.6645559072494507,
+ "eval_runtime": 81.2099,
+ "eval_samples_per_second": 2.463,
+ "eval_steps_per_second": 0.308,
+ "step": 219
+ },
+ {
+ "epoch": 6.875,
+ "grad_norm": 0.5924889323251107,
+ "learning_rate": 2e-05,
+ "loss": 0.712,
+ "step": 220
+ },
+ {
+ "epoch": 6.875,
+ "eval_loss": 0.6619111895561218,
+ "eval_runtime": 81.7826,
+ "eval_samples_per_second": 2.446,
+ "eval_steps_per_second": 0.306,
+ "step": 220
+ },
+ {
+ "epoch": 6.90625,
+ "grad_norm": 0.5290576915218269,
+ "learning_rate": 2e-05,
+ "loss": 0.6659,
+ "step": 221
+ },
+ {
+ "epoch": 6.90625,
+ "eval_loss": 0.6609540581703186,
+ "eval_runtime": 82.9922,
+ "eval_samples_per_second": 2.41,
+ "eval_steps_per_second": 0.301,
+ "step": 221
+ },
+ {
+ "epoch": 6.9375,
+ "grad_norm": 0.5831209517049147,
+ "learning_rate": 2e-05,
+ "loss": 0.6547,
+ "step": 222
+ },
+ {
+ "epoch": 6.9375,
+ "eval_loss": 0.660676896572113,
+ "eval_runtime": 83.6541,
+ "eval_samples_per_second": 2.391,
+ "eval_steps_per_second": 0.299,
+ "step": 222
+ },
+ {
+ "epoch": 6.96875,
+ "grad_norm": 0.5320966369511158,
+ "learning_rate": 2e-05,
+ "loss": 0.6968,
+ "step": 223
+ },
+ {
+ "epoch": 6.96875,
+ "eval_loss": 0.6618594527244568,
+ "eval_runtime": 83.1148,
+ "eval_samples_per_second": 2.406,
+ "eval_steps_per_second": 0.301,
+ "step": 223
+ },
+ {
+ "epoch": 7.0,
+ "grad_norm": 0.5829636446837394,
+ "learning_rate": 2e-05,
+ "loss": 0.7407,
+ "step": 224
+ },
+ {
+ "epoch": 7.0,
+ "eval_loss": 0.6635661125183105,
+ "eval_runtime": 82.8183,
+ "eval_samples_per_second": 2.415,
+ "eval_steps_per_second": 0.302,
+ "step": 224
+ },
+ {
+ "epoch": 7.03125,
+ "grad_norm": 0.4975095056459566,
+ "learning_rate": 2e-05,
+ "loss": 0.6535,
+ "step": 225
+ },
+ {
+ "epoch": 7.03125,
+ "eval_loss": 0.6641671657562256,
+ "eval_runtime": 83.0267,
+ "eval_samples_per_second": 2.409,
+ "eval_steps_per_second": 0.301,
+ "step": 225
+ },
+ {
+ "epoch": 7.0625,
+ "grad_norm": 0.5625698523064815,
+ "learning_rate": 2e-05,
+ "loss": 0.6012,
+ "step": 226
+ },
+ {
+ "epoch": 7.0625,
+ "eval_loss": 0.6639044880867004,
+ "eval_runtime": 83.3881,
+ "eval_samples_per_second": 2.398,
+ "eval_steps_per_second": 0.3,
+ "step": 226
+ },
+ {
+ "epoch": 7.09375,
+ "grad_norm": 0.5436196850683295,
+ "learning_rate": 2e-05,
+ "loss": 0.6485,
+ "step": 227
+ },
+ {
+ "epoch": 7.09375,
+ "eval_loss": 0.6651788353919983,
+ "eval_runtime": 82.7096,
+ "eval_samples_per_second": 2.418,
+ "eval_steps_per_second": 0.302,
+ "step": 227
+ },
+ {
+ "epoch": 7.125,
+ "grad_norm": 0.5598906287609361,
+ "learning_rate": 2e-05,
+ "loss": 0.6142,
+ "step": 228
+ },
+ {
+ "epoch": 7.125,
+ "eval_loss": 0.6688636541366577,
+ "eval_runtime": 82.601,
+ "eval_samples_per_second": 2.421,
+ "eval_steps_per_second": 0.303,
+ "step": 228
+ },
+ {
+ "epoch": 7.15625,
+ "grad_norm": 0.7572979310697923,
+ "learning_rate": 2e-05,
+ "loss": 0.6221,
+ "step": 229
+ },
+ {
+ "epoch": 7.15625,
+ "eval_loss": 0.6699694991111755,
+ "eval_runtime": 82.6032,
+ "eval_samples_per_second": 2.421,
+ "eval_steps_per_second": 0.303,
+ "step": 229
+ },
+ {
+ "epoch": 7.1875,
+ "grad_norm": 0.6173309690580897,
+ "learning_rate": 2e-05,
+ "loss": 0.5919,
+ "step": 230
+ },
+ {
+ "epoch": 7.1875,
+ "eval_loss": 0.6706527471542358,
+ "eval_runtime": 82.9732,
+ "eval_samples_per_second": 2.41,
+ "eval_steps_per_second": 0.301,
+ "step": 230
+ },
+ {
+ "epoch": 7.21875,
+ "grad_norm": 0.643241771517866,
+ "learning_rate": 2e-05,
+ "loss": 0.7081,
+ "step": 231
+ },
+ {
+ "epoch": 7.21875,
+ "eval_loss": 0.6700320243835449,
+ "eval_runtime": 84.5621,
+ "eval_samples_per_second": 2.365,
+ "eval_steps_per_second": 0.296,
+ "step": 231
+ },
+ {
+ "epoch": 7.25,
+ "grad_norm": 0.577638137570571,
+ "learning_rate": 2e-05,
+ "loss": 0.6873,
+ "step": 232
+ },
+ {
+ "epoch": 7.25,
+ "eval_loss": 0.669111430644989,
+ "eval_runtime": 84.5124,
+ "eval_samples_per_second": 2.367,
+ "eval_steps_per_second": 0.296,
+ "step": 232
+ },
+ {
+ "epoch": 7.28125,
+ "grad_norm": 0.7229488296023369,
+ "learning_rate": 2e-05,
+ "loss": 0.6301,
+ "step": 233
+ },
+ {
+ "epoch": 7.28125,
+ "eval_loss": 0.6664154529571533,
+ "eval_runtime": 84.6437,
+ "eval_samples_per_second": 2.363,
+ "eval_steps_per_second": 0.295,
+ "step": 233
+ },
+ {
+ "epoch": 7.3125,
+ "grad_norm": 0.5827815449039045,
+ "learning_rate": 2e-05,
+ "loss": 0.669,
+ "step": 234
+ },
+ {
+ "epoch": 7.3125,
+ "eval_loss": 0.6641202569007874,
+ "eval_runtime": 84.489,
+ "eval_samples_per_second": 2.367,
+ "eval_steps_per_second": 0.296,
+ "step": 234
+ },
+ {
+ "epoch": 7.34375,
+ "grad_norm": 0.57507354017269,
+ "learning_rate": 2e-05,
+ "loss": 0.6474,
+ "step": 235
+ },
+ {
+ "epoch": 7.34375,
+ "eval_loss": 0.6623325347900391,
+ "eval_runtime": 84.5536,
+ "eval_samples_per_second": 2.365,
+ "eval_steps_per_second": 0.296,
+ "step": 235
+ },
+ {
+ "epoch": 7.375,
+ "grad_norm": 0.5810844862533651,
+ "learning_rate": 2e-05,
+ "loss": 0.6048,
+ "step": 236
+ },
+ {
+ "epoch": 7.375,
+ "eval_loss": 0.6619194746017456,
+ "eval_runtime": 84.2296,
+ "eval_samples_per_second": 2.374,
+ "eval_steps_per_second": 0.297,
+ "step": 236
+ },
+ {
+ "epoch": 7.40625,
+ "grad_norm": 0.6075032415813726,
+ "learning_rate": 2e-05,
+ "loss": 0.6529,
+ "step": 237
+ },
+ {
+ "epoch": 7.40625,
+ "eval_loss": 0.6626202464103699,
+ "eval_runtime": 84.9703,
+ "eval_samples_per_second": 2.354,
+ "eval_steps_per_second": 0.294,
+ "step": 237
+ },
+ {
+ "epoch": 7.4375,
+ "grad_norm": 0.6402642234375245,
+ "learning_rate": 2e-05,
+ "loss": 0.6433,
+ "step": 238
+ },
+ {
+ "epoch": 7.4375,
+ "eval_loss": 0.663289487361908,
+ "eval_runtime": 84.8924,
+ "eval_samples_per_second": 2.356,
+ "eval_steps_per_second": 0.294,
+ "step": 238
+ },
+ {
+ "epoch": 7.46875,
+ "grad_norm": 0.6335996982657431,
+ "learning_rate": 2e-05,
+ "loss": 0.6815,
+ "step": 239
+ },
+ {
+ "epoch": 7.46875,
+ "eval_loss": 0.6636109948158264,
+ "eval_runtime": 85.0551,
+ "eval_samples_per_second": 2.351,
+ "eval_steps_per_second": 0.294,
+ "step": 239
+ },
+ {
+ "epoch": 7.5,
+ "grad_norm": 0.5796846795848909,
+ "learning_rate": 2e-05,
+ "loss": 0.6236,
+ "step": 240
+ },
+ {
+ "epoch": 7.5,
+ "eval_loss": 0.6652829051017761,
+ "eval_runtime": 84.7574,
+ "eval_samples_per_second": 2.36,
+ "eval_steps_per_second": 0.295,
+ "step": 240
+ },
+ {
+ "epoch": 7.53125,
+ "grad_norm": 0.5380402145760035,
+ "learning_rate": 2e-05,
+ "loss": 0.6564,
+ "step": 241
+ },
+ {
+ "epoch": 7.53125,
+ "eval_loss": 0.6676375865936279,
+ "eval_runtime": 86.2058,
+ "eval_samples_per_second": 2.32,
+ "eval_steps_per_second": 0.29,
+ "step": 241
+ },
+ {
+ "epoch": 7.5625,
+ "grad_norm": 0.5964298255824012,
+ "learning_rate": 2e-05,
+ "loss": 0.6475,
+ "step": 242
+ },
+ {
+ "epoch": 7.5625,
+ "eval_loss": 0.6698520183563232,
+ "eval_runtime": 85.8955,
+ "eval_samples_per_second": 2.328,
+ "eval_steps_per_second": 0.291,
+ "step": 242
+ },
+ {
+ "epoch": 7.59375,
+ "grad_norm": 0.561279296875,
+ "learning_rate": 2e-05,
+ "loss": 0.6395,
+ "step": 243
+ },
+ {
+ "epoch": 7.59375,
+ "eval_loss": 0.6705803871154785,
+ "eval_runtime": 86.0036,
+ "eval_samples_per_second": 2.325,
+ "eval_steps_per_second": 0.291,
+ "step": 243
+ },
+ {
+ "epoch": 7.625,
+ "grad_norm": 0.6757292755073548,
+ "learning_rate": 2e-05,
+ "loss": 0.7074,
+ "step": 244
+ },
+ {
+ "epoch": 7.625,
+ "eval_loss": 0.6679538488388062,
+ "eval_runtime": 85.5379,
+ "eval_samples_per_second": 2.338,
+ "eval_steps_per_second": 0.292,
+ "step": 244
+ },
+ {
+ "epoch": 7.65625,
+ "grad_norm": 0.659077163070129,
+ "learning_rate": 2e-05,
+ "loss": 0.6078,
+ "step": 245
+ },
+ {
+ "epoch": 7.65625,
+ "eval_loss": 0.6667564511299133,
+ "eval_runtime": 85.752,
+ "eval_samples_per_second": 2.332,
+ "eval_steps_per_second": 0.292,
+ "step": 245
+ },
+ {
+ "epoch": 7.6875,
+ "grad_norm": 0.6215405566454576,
+ "learning_rate": 2e-05,
+ "loss": 0.6603,
+ "step": 246
+ },
+ {
+ "epoch": 7.6875,
+ "eval_loss": 0.665945291519165,
+ "eval_runtime": 92.3086,
+ "eval_samples_per_second": 2.167,
+ "eval_steps_per_second": 0.271,
+ "step": 246
+ },
+ {
+ "epoch": 7.71875,
+ "grad_norm": 0.6130534921490498,
+ "learning_rate": 2e-05,
+ "loss": 0.6435,
+ "step": 247
+ },
+ {
+ "epoch": 7.71875,
+ "eval_loss": 0.6661685109138489,
+ "eval_runtime": 87.1917,
+ "eval_samples_per_second": 2.294,
+ "eval_steps_per_second": 0.287,
+ "step": 247
+ },
+ {
+ "epoch": 7.75,
+ "grad_norm": 0.6025415602868736,
+ "learning_rate": 2e-05,
+ "loss": 0.6308,
+ "step": 248
+ },
+ {
+ "epoch": 7.75,
+ "eval_loss": 0.6658704280853271,
+ "eval_runtime": 86.8233,
+ "eval_samples_per_second": 2.304,
+ "eval_steps_per_second": 0.288,
+ "step": 248
+ },
+ {
+ "epoch": 7.78125,
+ "grad_norm": 0.6901593792019413,
+ "learning_rate": 2e-05,
+ "loss": 0.6777,
+ "step": 249
+ },
+ {
+ "epoch": 7.78125,
+ "eval_loss": 0.6652414202690125,
+ "eval_runtime": 86.7625,
+ "eval_samples_per_second": 2.305,
+ "eval_steps_per_second": 0.288,
+ "step": 249
+ },
+ {
+ "epoch": 7.8125,
+ "grad_norm": 0.6436454697341579,
+ "learning_rate": 2e-05,
+ "loss": 0.6912,
+ "step": 250
+ },
+ {
+ "epoch": 7.8125,
+ "eval_loss": 0.6654212474822998,
+ "eval_runtime": 86.871,
+ "eval_samples_per_second": 2.302,
+ "eval_steps_per_second": 0.288,
+ "step": 250
+ },
+ {
+ "epoch": 7.84375,
+ "grad_norm": 0.649040103024529,
+ "learning_rate": 2e-05,
+ "loss": 0.6025,
+ "step": 251
+ },
+ {
+ "epoch": 7.84375,
+ "eval_loss": 0.6654068231582642,
+ "eval_runtime": 86.7458,
+ "eval_samples_per_second": 2.306,
+ "eval_steps_per_second": 0.288,
+ "step": 251
+ },
+ {
+ "epoch": 7.875,
+ "grad_norm": 0.6595522131680224,
+ "learning_rate": 2e-05,
+ "loss": 0.5973,
+ "step": 252
+ },
+ {
+ "epoch": 7.875,
+ "eval_loss": 0.6644830107688904,
+ "eval_runtime": 86.8739,
+ "eval_samples_per_second": 2.302,
+ "eval_steps_per_second": 0.288,
+ "step": 252
+ },
+ {
+ "epoch": 7.90625,
+ "grad_norm": 0.6689891717273936,
+ "learning_rate": 2e-05,
+ "loss": 0.687,
+ "step": 253
+ },
+ {
+ "epoch": 7.90625,
+ "eval_loss": 0.6616199612617493,
+ "eval_runtime": 86.8222,
+ "eval_samples_per_second": 2.304,
+ "eval_steps_per_second": 0.288,
+ "step": 253
+ },
+ {
+ "epoch": 7.9375,
+ "grad_norm": 0.6306846778314292,
+ "learning_rate": 2e-05,
+ "loss": 0.6599,
+ "step": 254
+ },
+ {
+ "epoch": 7.9375,
+ "eval_loss": 0.6592965126037598,
+ "eval_runtime": 86.8577,
+ "eval_samples_per_second": 2.303,
+ "eval_steps_per_second": 0.288,
+ "step": 254
+ },
+ {
+ "epoch": 7.96875,
+ "grad_norm": 0.6021327993890785,
+ "learning_rate": 2e-05,
+ "loss": 0.575,
+ "step": 255
+ },
+ {
+ "epoch": 7.96875,
+ "eval_loss": 0.6580593585968018,
+ "eval_runtime": 86.7582,
+ "eval_samples_per_second": 2.305,
+ "eval_steps_per_second": 0.288,
+ "step": 255
+ },
+ {
+ "epoch": 8.0,
+ "grad_norm": 0.6174712675568311,
+ "learning_rate": 2e-05,
+ "loss": 0.6341,
+ "step": 256
+ },
+ {
+ "epoch": 8.0,
+ "eval_loss": 0.6575854420661926,
+ "eval_runtime": 76.7634,
+ "eval_samples_per_second": 2.605,
+ "eval_steps_per_second": 0.326,
+ "step": 256
+ },
+ {
+ "epoch": 8.03125,
+ "grad_norm": 0.6551281786490154,
+ "learning_rate": 2e-05,
+ "loss": 0.6032,
+ "step": 257
+ },
+ {
+ "epoch": 8.03125,
+ "eval_loss": 0.6583926677703857,
+ "eval_runtime": 83.4222,
+ "eval_samples_per_second": 2.397,
+ "eval_steps_per_second": 0.3,
+ "step": 257
+ },
+ {
+ "epoch": 8.0625,
+ "grad_norm": 0.6033798361300539,
+ "learning_rate": 2e-05,
+ "loss": 0.6352,
+ "step": 258
+ },
+ {
+ "epoch": 8.0625,
+ "eval_loss": 0.6615632772445679,
+ "eval_runtime": 76.7227,
+ "eval_samples_per_second": 2.607,
+ "eval_steps_per_second": 0.326,
+ "step": 258
+ },
+ {
+ "epoch": 8.09375,
+ "grad_norm": 0.557538857110867,
+ "learning_rate": 2e-05,
+ "loss": 0.6472,
+ "step": 259
+ },
+ {
+ "epoch": 8.09375,
+ "eval_loss": 0.6674608588218689,
+ "eval_runtime": 76.6215,
+ "eval_samples_per_second": 2.61,
+ "eval_steps_per_second": 0.326,
+ "step": 259
+ },
+ {
+ "epoch": 8.125,
+ "grad_norm": 0.7828450894757938,
+ "learning_rate": 2e-05,
+ "loss": 0.6576,
+ "step": 260
+ },
+ {
+ "epoch": 8.125,
+ "eval_loss": 0.670245349407196,
+ "eval_runtime": 76.685,
+ "eval_samples_per_second": 2.608,
+ "eval_steps_per_second": 0.326,
+ "step": 260
+ },
+ {
+ "epoch": 8.15625,
+ "grad_norm": 0.7969830757603331,
+ "learning_rate": 2e-05,
+ "loss": 0.5809,
+ "step": 261
+ },
+ {
+ "epoch": 8.15625,
+ "eval_loss": 0.6711975336074829,
+ "eval_runtime": 78.0022,
+ "eval_samples_per_second": 2.564,
+ "eval_steps_per_second": 0.321,
+ "step": 261
+ },
+ {
+ "epoch": 8.1875,
+ "grad_norm": 0.6431174985709492,
+ "learning_rate": 2e-05,
+ "loss": 0.6971,
+ "step": 262
+ },
+ {
+ "epoch": 8.1875,
+ "eval_loss": 0.6719404458999634,
+ "eval_runtime": 78.7599,
+ "eval_samples_per_second": 2.539,
+ "eval_steps_per_second": 0.317,
+ "step": 262
+ },
+ {
+ "epoch": 8.21875,
+ "grad_norm": 0.7025583314944188,
+ "learning_rate": 2e-05,
+ "loss": 0.5751,
+ "step": 263
+ },
+ {
+ "epoch": 8.21875,
+ "eval_loss": 0.6719526648521423,
+ "eval_runtime": 78.0188,
+ "eval_samples_per_second": 2.563,
+ "eval_steps_per_second": 0.32,
+ "step": 263
+ },
+ {
+ "epoch": 8.25,
+ "grad_norm": 0.7114355417811269,
+ "learning_rate": 2e-05,
+ "loss": 0.623,
+ "step": 264
+ },
+ {
+ "epoch": 8.25,
+ "eval_loss": 0.6717848181724548,
+ "eval_runtime": 78.6366,
+ "eval_samples_per_second": 2.543,
+ "eval_steps_per_second": 0.318,
+ "step": 264
+ },
+ {
+ "epoch": 8.28125,
+ "grad_norm": 0.8272269435769467,
+ "learning_rate": 2e-05,
+ "loss": 0.6509,
+ "step": 265
+ },
+ {
+ "epoch": 8.28125,
+ "eval_loss": 0.6701865196228027,
+ "eval_runtime": 78.7279,
+ "eval_samples_per_second": 2.54,
+ "eval_steps_per_second": 0.318,
+ "step": 265
+ },
+ {
+ "epoch": 8.3125,
+ "grad_norm": 0.7215994453471393,
+ "learning_rate": 2e-05,
+ "loss": 0.6263,
+ "step": 266
+ },
+ {
+ "epoch": 8.3125,
+ "eval_loss": 0.6682087182998657,
+ "eval_runtime": 78.1433,
+ "eval_samples_per_second": 2.559,
+ "eval_steps_per_second": 0.32,
+ "step": 266
+ },
+ {
+ "epoch": 8.34375,
+ "grad_norm": 0.6425448006102333,
+ "learning_rate": 2e-05,
+ "loss": 0.5613,
+ "step": 267
+ },
+ {
+ "epoch": 8.34375,
+ "eval_loss": 0.6686681509017944,
+ "eval_runtime": 78.0964,
+ "eval_samples_per_second": 2.561,
+ "eval_steps_per_second": 0.32,
+ "step": 267
+ },
+ {
+ "epoch": 8.375,
+ "grad_norm": 0.7207053166384572,
+ "learning_rate": 2e-05,
+ "loss": 0.6239,
+ "step": 268
+ },
+ {
+ "epoch": 8.375,
+ "eval_loss": 0.6676305532455444,
+ "eval_runtime": 77.9986,
+ "eval_samples_per_second": 2.564,
+ "eval_steps_per_second": 0.321,
+ "step": 268
+ },
+ {
+ "epoch": 8.40625,
+ "grad_norm": 0.7459344743811905,
+ "learning_rate": 2e-05,
+ "loss": 0.6159,
+ "step": 269
+ },
+ {
+ "epoch": 8.40625,
+ "eval_loss": 0.6660167574882507,
+ "eval_runtime": 78.4159,
+ "eval_samples_per_second": 2.551,
+ "eval_steps_per_second": 0.319,
+ "step": 269
+ },
+ {
+ "epoch": 8.4375,
+ "grad_norm": 0.7179805119560739,
+ "learning_rate": 2e-05,
+ "loss": 0.6192,
+ "step": 270
+ },
+ {
+ "epoch": 8.4375,
+ "eval_loss": 0.6636325716972351,
+ "eval_runtime": 78.2224,
+ "eval_samples_per_second": 2.557,
+ "eval_steps_per_second": 0.32,
+ "step": 270
+ },
+ {
+ "epoch": 8.46875,
+ "grad_norm": 0.724792498458059,
+ "learning_rate": 2e-05,
+ "loss": 0.5234,
+ "step": 271
+ },
+ {
+ "epoch": 8.46875,
+ "eval_loss": 0.6647288799285889,
+ "eval_runtime": 79.0573,
+ "eval_samples_per_second": 2.53,
+ "eval_steps_per_second": 0.316,
+ "step": 271
+ },
+ {
+ "epoch": 8.5,
+ "grad_norm": 0.6544107138826364,
+ "learning_rate": 2e-05,
+ "loss": 0.6067,
+ "step": 272
+ },
+ {
+ "epoch": 8.5,
+ "eval_loss": 0.6689667701721191,
+ "eval_runtime": 79.2898,
+ "eval_samples_per_second": 2.522,
+ "eval_steps_per_second": 0.315,
+ "step": 272
+ },
+ {
+ "epoch": 8.53125,
+ "grad_norm": 0.71580236810568,
+ "learning_rate": 2e-05,
+ "loss": 0.6215,
+ "step": 273
+ },
+ {
+ "epoch": 8.53125,
+ "eval_loss": 0.6723271012306213,
+ "eval_runtime": 79.0759,
+ "eval_samples_per_second": 2.529,
+ "eval_steps_per_second": 0.316,
+ "step": 273
+ },
+ {
+ "epoch": 8.5625,
+ "grad_norm": 0.7741383931390255,
+ "learning_rate": 2e-05,
+ "loss": 0.6012,
+ "step": 274
+ },
+ {
+ "epoch": 8.5625,
+ "eval_loss": 0.6743794083595276,
+ "eval_runtime": 79.0509,
+ "eval_samples_per_second": 2.53,
+ "eval_steps_per_second": 0.316,
+ "step": 274
+ },
+ {
+ "epoch": 8.59375,
+ "grad_norm": 0.7927343087738151,
+ "learning_rate": 2e-05,
+ "loss": 0.6241,
+ "step": 275
+ },
+ {
+ "epoch": 8.59375,
+ "eval_loss": 0.6728585958480835,
+ "eval_runtime": 79.2296,
+ "eval_samples_per_second": 2.524,
+ "eval_steps_per_second": 0.316,
+ "step": 275
+ },
+ {
+ "epoch": 8.625,
+ "grad_norm": 0.759468785526614,
+ "learning_rate": 2e-05,
+ "loss": 0.6209,
+ "step": 276
+ },
+ {
+ "epoch": 8.625,
+ "eval_loss": 0.6686221957206726,
+ "eval_runtime": 76.7494,
+ "eval_samples_per_second": 2.606,
+ "eval_steps_per_second": 0.326,
+ "step": 276
+ },
+ {
+ "epoch": 8.65625,
+ "grad_norm": 0.7345386079388437,
+ "learning_rate": 2e-05,
+ "loss": 0.5618,
+ "step": 277
+ },
+ {
+ "epoch": 8.65625,
+ "eval_loss": 0.6659188270568848,
+ "eval_runtime": 77.4511,
+ "eval_samples_per_second": 2.582,
+ "eval_steps_per_second": 0.323,
+ "step": 277
+ },
+ {
+ "epoch": 8.6875,
+ "grad_norm": 0.6822491965046279,
+ "learning_rate": 2e-05,
+ "loss": 0.6064,
+ "step": 278
+ },
+ {
+ "epoch": 8.6875,
+ "eval_loss": 0.664726734161377,
+ "eval_runtime": 76.7108,
+ "eval_samples_per_second": 2.607,
+ "eval_steps_per_second": 0.326,
+ "step": 278
+ },
+ {
+ "epoch": 8.71875,
+ "grad_norm": 0.7329120674082968,
+ "learning_rate": 2e-05,
+ "loss": 0.5843,
+ "step": 279
+ },
+ {
+ "epoch": 8.71875,
+ "eval_loss": 0.6635715961456299,
+ "eval_runtime": 76.7921,
+ "eval_samples_per_second": 2.604,
+ "eval_steps_per_second": 0.326,
+ "step": 279
+ },
+ {
+ "epoch": 8.75,
+ "grad_norm": 0.7950781591249908,
+ "learning_rate": 2e-05,
+ "loss": 0.6383,
+ "step": 280
+ },
+ {
+ "epoch": 8.75,
+ "eval_loss": 0.664521336555481,
+ "eval_runtime": 76.6952,
+ "eval_samples_per_second": 2.608,
+ "eval_steps_per_second": 0.326,
+ "step": 280
+ },
+ {
+ "epoch": 8.78125,
+ "grad_norm": 0.6791182798182671,
+ "learning_rate": 2e-05,
+ "loss": 0.5932,
+ "step": 281
+ },
+ {
+ "epoch": 8.78125,
+ "eval_loss": 0.6673008799552917,
+ "eval_runtime": 76.794,
+ "eval_samples_per_second": 2.604,
+ "eval_steps_per_second": 0.326,
+ "step": 281
+ },
+ {
+ "epoch": 8.8125,
+ "grad_norm": 0.7633434086832942,
+ "learning_rate": 2e-05,
+ "loss": 0.5754,
+ "step": 282
+ },
+ {
+ "epoch": 8.8125,
+ "eval_loss": 0.6692779064178467,
+ "eval_runtime": 76.7749,
+ "eval_samples_per_second": 2.605,
+ "eval_steps_per_second": 0.326,
+ "step": 282
+ },
+ {
+ "epoch": 8.84375,
+ "grad_norm": 0.6857090076317197,
+ "learning_rate": 2e-05,
+ "loss": 0.5585,
+ "step": 283
+ },
+ {
+ "epoch": 8.84375,
+ "eval_loss": 0.6702080368995667,
+ "eval_runtime": 76.6913,
+ "eval_samples_per_second": 2.608,
+ "eval_steps_per_second": 0.326,
+ "step": 283
+ },
+ {
+ "epoch": 8.875,
+ "grad_norm": 0.6961298007385132,
+ "learning_rate": 2e-05,
+ "loss": 0.5093,
+ "step": 284
+ },
+ {
+ "epoch": 8.875,
+ "eval_loss": 0.6708166599273682,
+ "eval_runtime": 76.7725,
+ "eval_samples_per_second": 2.605,
+ "eval_steps_per_second": 0.326,
+ "step": 284
+ },
+ {
+ "epoch": 8.90625,
+ "grad_norm": 0.7783752192295856,
+ "learning_rate": 2e-05,
+ "loss": 0.5656,
+ "step": 285
+ },
+ {
+ "epoch": 8.90625,
+ "eval_loss": 0.6697121262550354,
+ "eval_runtime": 76.7888,
+ "eval_samples_per_second": 2.605,
+ "eval_steps_per_second": 0.326,
+ "step": 285
+ },
+ {
+ "epoch": 8.9375,
+ "grad_norm": 0.7327581828795048,
+ "learning_rate": 2e-05,
+ "loss": 0.6984,
+ "step": 286
+ },
+ {
+ "epoch": 8.9375,
+ "eval_loss": 0.6684187054634094,
+ "eval_runtime": 78.6657,
+ "eval_samples_per_second": 2.542,
+ "eval_steps_per_second": 0.318,
+ "step": 286
+ },
+ {
+ "epoch": 8.96875,
+ "grad_norm": 0.689919829790507,
+ "learning_rate": 2e-05,
+ "loss": 0.6173,
+ "step": 287
+ },
+ {
+ "epoch": 8.96875,
+ "eval_loss": 0.6675245761871338,
+ "eval_runtime": 78.1275,
+ "eval_samples_per_second": 2.56,
+ "eval_steps_per_second": 0.32,
+ "step": 287
+ },
+ {
+ "epoch": 9.0,
+ "grad_norm": 0.6812947879732435,
+ "learning_rate": 2e-05,
+ "loss": 0.5499,
+ "step": 288
+ },
+ {
+ "epoch": 9.0,
+ "eval_loss": 0.6678825616836548,
+ "eval_runtime": 78.8588,
+ "eval_samples_per_second": 2.536,
+ "eval_steps_per_second": 0.317,
+ "step": 288
+ },
+ {
+ "epoch": 9.03125,
+ "grad_norm": 0.715716761740314,
+ "learning_rate": 2e-05,
+ "loss": 0.5699,
+ "step": 289
+ },
+ {
+ "epoch": 9.03125,
+ "eval_loss": 0.6692755222320557,
+ "eval_runtime": 83.098,
+ "eval_samples_per_second": 2.407,
+ "eval_steps_per_second": 0.301,
+ "step": 289
+ },
+ {
+ "epoch": 9.0625,
+ "grad_norm": 0.7438930389955494,
+ "learning_rate": 2e-05,
+ "loss": 0.5974,
+ "step": 290
+ },
+ {
+ "epoch": 9.0625,
+ "eval_loss": 0.6735746264457703,
+ "eval_runtime": 77.384,
+ "eval_samples_per_second": 2.585,
+ "eval_steps_per_second": 0.323,
+ "step": 290
+ },
+ {
+ "epoch": 9.09375,
+ "grad_norm": 0.7271043131369198,
+ "learning_rate": 2e-05,
+ "loss": 0.601,
+ "step": 291
+ },
+ {
+ "epoch": 9.09375,
+ "eval_loss": 0.6790977716445923,
+ "eval_runtime": 78.0312,
+ "eval_samples_per_second": 2.563,
+ "eval_steps_per_second": 0.32,
+ "step": 291
+ },
+ {
+ "epoch": 9.125,
+ "grad_norm": 0.851687675865168,
+ "learning_rate": 2e-05,
+ "loss": 0.5681,
+ "step": 292
+ },
+ {
+ "epoch": 9.125,
+ "eval_loss": 0.6834170818328857,
+ "eval_runtime": 77.8688,
+ "eval_samples_per_second": 2.568,
+ "eval_steps_per_second": 0.321,
+ "step": 292
+ },
+ {
+ "epoch": 9.15625,
+ "grad_norm": 0.7905287763218567,
+ "learning_rate": 2e-05,
+ "loss": 0.6222,
+ "step": 293
+ },
+ {
+ "epoch": 9.15625,
+ "eval_loss": 0.6843841671943665,
+ "eval_runtime": 77.985,
+ "eval_samples_per_second": 2.565,
+ "eval_steps_per_second": 0.321,
+ "step": 293
+ },
+ {
+ "epoch": 9.1875,
+ "grad_norm": 0.7301520002532459,
+ "learning_rate": 2e-05,
+ "loss": 0.5549,
+ "step": 294
+ },
+ {
+ "epoch": 9.1875,
+ "eval_loss": 0.6860540509223938,
+ "eval_runtime": 78.0163,
+ "eval_samples_per_second": 2.564,
+ "eval_steps_per_second": 0.32,
+ "step": 294
+ },
+ {
+ "epoch": 9.21875,
+ "grad_norm": 0.899999206595601,
+ "learning_rate": 2e-05,
+ "loss": 0.5128,
+ "step": 295
+ },
+ {
+ "epoch": 9.21875,
+ "eval_loss": 0.685759425163269,
+ "eval_runtime": 78.4339,
+ "eval_samples_per_second": 2.55,
+ "eval_steps_per_second": 0.319,
+ "step": 295
+ },
+ {
+ "epoch": 9.25,
+ "grad_norm": 0.8064287475451557,
+ "learning_rate": 2e-05,
+ "loss": 0.5261,
+ "step": 296
+ },
+ {
+ "epoch": 9.25,
+ "eval_loss": 0.6864770650863647,
+ "eval_runtime": 79.6129,
+ "eval_samples_per_second": 2.512,
+ "eval_steps_per_second": 0.314,
+ "step": 296
+ },
+ {
+ "epoch": 9.28125,
+ "grad_norm": 0.8837240795882767,
+ "learning_rate": 2e-05,
+ "loss": 0.621,
+ "step": 297
+ },
+ {
+ "epoch": 9.28125,
+ "eval_loss": 0.6871599555015564,
+ "eval_runtime": 78.9778,
+ "eval_samples_per_second": 2.532,
+ "eval_steps_per_second": 0.317,
+ "step": 297
+ },
+ {
+ "epoch": 9.3125,
+ "grad_norm": 0.9676184044078363,
+ "learning_rate": 2e-05,
+ "loss": 0.5655,
+ "step": 298
+ },
+ {
+ "epoch": 9.3125,
+ "eval_loss": 0.6881282329559326,
+ "eval_runtime": 78.9944,
+ "eval_samples_per_second": 2.532,
+ "eval_steps_per_second": 0.316,
+ "step": 298
+ },
+ {
+ "epoch": 9.34375,
+ "grad_norm": 0.8723474213941232,
+ "learning_rate": 2e-05,
+ "loss": 0.5449,
+ "step": 299
+ },
+ {
+ "epoch": 9.34375,
+ "eval_loss": 0.6879245638847351,
+ "eval_runtime": 79.0056,
+ "eval_samples_per_second": 2.531,
+ "eval_steps_per_second": 0.316,
+ "step": 299
+ },
+ {
+ "epoch": 9.375,
+ "grad_norm": 0.848833488380702,
+ "learning_rate": 2e-05,
+ "loss": 0.5683,
+ "step": 300
+ },
+ {
+ "epoch": 9.375,
+ "eval_loss": 0.6846978664398193,
+ "eval_runtime": 78.9003,
+ "eval_samples_per_second": 2.535,
+ "eval_steps_per_second": 0.317,
+ "step": 300
+ },
+ {
+ "epoch": 9.40625,
+ "grad_norm": 0.8586391766708288,
+ "learning_rate": 2e-05,
+ "loss": 0.5358,
+ "step": 301
+ },
+ {
+ "epoch": 9.40625,
+ "eval_loss": 0.6798649430274963,
+ "eval_runtime": 80.0404,
+ "eval_samples_per_second": 2.499,
+ "eval_steps_per_second": 0.312,
+ "step": 301
+ },
+ {
+ "epoch": 9.4375,
+ "grad_norm": 0.8007832596916474,
+ "learning_rate": 2e-05,
+ "loss": 0.5792,
+ "step": 302
+ },
+ {
+ "epoch": 9.4375,
+ "eval_loss": 0.6757382750511169,
+ "eval_runtime": 79.962,
+ "eval_samples_per_second": 2.501,
+ "eval_steps_per_second": 0.313,
+ "step": 302
+ },
+ {
+ "epoch": 9.46875,
+ "grad_norm": 0.7839805948862919,
+ "learning_rate": 2e-05,
+ "loss": 0.5917,
+ "step": 303
+ },
+ {
+ "epoch": 9.46875,
+ "eval_loss": 0.6754000782966614,
+ "eval_runtime": 80.738,
+ "eval_samples_per_second": 2.477,
+ "eval_steps_per_second": 0.31,
+ "step": 303
+ },
+ {
+ "epoch": 9.5,
+ "grad_norm": 0.7397772754102683,
+ "learning_rate": 2e-05,
+ "loss": 0.6249,
+ "step": 304
+ },
+ {
+ "epoch": 9.5,
+ "eval_loss": 0.6777495741844177,
+ "eval_runtime": 80.5144,
+ "eval_samples_per_second": 2.484,
+ "eval_steps_per_second": 0.311,
+ "step": 304
+ },
+ {
+ "epoch": 9.53125,
+ "grad_norm": 0.857390001265035,
+ "learning_rate": 2e-05,
+ "loss": 0.5932,
+ "step": 305
+ },
+ {
+ "epoch": 9.53125,
+ "eval_loss": 0.6778848171234131,
+ "eval_runtime": 80.1508,
+ "eval_samples_per_second": 2.495,
+ "eval_steps_per_second": 0.312,
+ "step": 305
+ },
+ {
+ "epoch": 9.5625,
+ "grad_norm": 0.9430180281536945,
+ "learning_rate": 2e-05,
+ "loss": 0.5793,
+ "step": 306
+ },
+ {
+ "epoch": 9.5625,
+ "eval_loss": 0.6771917939186096,
+ "eval_runtime": 76.7109,
+ "eval_samples_per_second": 2.607,
+ "eval_steps_per_second": 0.326,
+ "step": 306
+ },
+ {
+ "epoch": 9.59375,
+ "grad_norm": 0.8705050270903875,
+ "learning_rate": 2e-05,
+ "loss": 0.5601,
+ "step": 307
+ },
+ {
+ "epoch": 9.59375,
+ "eval_loss": 0.6808632016181946,
+ "eval_runtime": 76.6965,
+ "eval_samples_per_second": 2.608,
+ "eval_steps_per_second": 0.326,
+ "step": 307
+ },
+ {
+ "epoch": 9.625,
+ "grad_norm": 0.8611871513168323,
+ "learning_rate": 2e-05,
+ "loss": 0.5953,
+ "step": 308
+ },
+ {
+ "epoch": 9.625,
+ "eval_loss": 0.6875945329666138,
+ "eval_runtime": 76.6592,
+ "eval_samples_per_second": 2.609,
+ "eval_steps_per_second": 0.326,
+ "step": 308
+ },
+ {
+ "epoch": 9.65625,
+ "grad_norm": 0.9066952565245906,
+ "learning_rate": 2e-05,
+ "loss": 0.5815,
+ "step": 309
+ },
+ {
+ "epoch": 9.65625,
+ "eval_loss": 0.6910049319267273,
+ "eval_runtime": 76.7021,
+ "eval_samples_per_second": 2.607,
+ "eval_steps_per_second": 0.326,
+ "step": 309
+ },
+ {
+ "epoch": 9.6875,
+ "grad_norm": 1.0666864048105145,
+ "learning_rate": 2e-05,
+ "loss": 0.5663,
+ "step": 310
+ },
+ {
+ "epoch": 9.6875,
+ "eval_loss": 0.6869986057281494,
+ "eval_runtime": 76.6344,
+ "eval_samples_per_second": 2.61,
+ "eval_steps_per_second": 0.326,
+ "step": 310
+ },
+ {
+ "epoch": 9.71875,
+ "grad_norm": 0.9413311560347162,
+ "learning_rate": 2e-05,
+ "loss": 0.5106,
+ "step": 311
+ },
+ {
+ "epoch": 9.71875,
+ "eval_loss": 0.6825075745582581,
+ "eval_runtime": 78.7857,
+ "eval_samples_per_second": 2.539,
+ "eval_steps_per_second": 0.317,
+ "step": 311
+ },
+ {
+ "epoch": 9.75,
+ "grad_norm": 0.9175579044457436,
+ "learning_rate": 2e-05,
+ "loss": 0.5821,
+ "step": 312
+ },
+ {
+ "epoch": 9.75,
+ "eval_loss": 0.6794223189353943,
+ "eval_runtime": 78.0368,
+ "eval_samples_per_second": 2.563,
+ "eval_steps_per_second": 0.32,
+ "step": 312
+ },
+ {
+ "epoch": 9.78125,
+ "grad_norm": 0.7982785075945665,
+ "learning_rate": 2e-05,
+ "loss": 0.5781,
+ "step": 313
+ },
+ {
+ "epoch": 9.78125,
+ "eval_loss": 0.679649829864502,
+ "eval_runtime": 78.0513,
+ "eval_samples_per_second": 2.562,
+ "eval_steps_per_second": 0.32,
+ "step": 313
+ },
+ {
+ "epoch": 9.8125,
+ "grad_norm": 0.9284642289974022,
+ "learning_rate": 2e-05,
+ "loss": 0.5394,
+ "step": 314
+ },
+ {
+ "epoch": 9.8125,
+ "eval_loss": 0.6805163025856018,
+ "eval_runtime": 78.2229,
+ "eval_samples_per_second": 2.557,
+ "eval_steps_per_second": 0.32,
+ "step": 314
+ },
+ {
+ "epoch": 9.84375,
+ "grad_norm": 0.8816568355396782,
+ "learning_rate": 2e-05,
+ "loss": 0.5722,
+ "step": 315
+ },
+ {
+ "epoch": 9.84375,
+ "eval_loss": 0.6801097393035889,
+ "eval_runtime": 78.9282,
+ "eval_samples_per_second": 2.534,
+ "eval_steps_per_second": 0.317,
+ "step": 315
+ },
+ {
+ "epoch": 9.875,
+ "grad_norm": 0.8137119863863306,
+ "learning_rate": 2e-05,
+ "loss": 0.5831,
+ "step": 316
+ },
+ {
+ "epoch": 9.875,
+ "eval_loss": 0.6792600750923157,
+ "eval_runtime": 78.8166,
+ "eval_samples_per_second": 2.538,
+ "eval_steps_per_second": 0.317,
+ "step": 316
+ },
+ {
+ "epoch": 9.90625,
+ "grad_norm": 0.9595174764400289,
+ "learning_rate": 2e-05,
+ "loss": 0.5489,
+ "step": 317
+ },
+ {
+ "epoch": 9.90625,
+ "eval_loss": 0.6755692958831787,
+ "eval_runtime": 78.1426,
+ "eval_samples_per_second": 2.559,
+ "eval_steps_per_second": 0.32,
+ "step": 317
+ },
+ {
+ "epoch": 9.9375,
+ "grad_norm": 0.8612490247878711,
+ "learning_rate": 2e-05,
+ "loss": 0.5508,
+ "step": 318
+ },
+ {
+ "epoch": 9.9375,
+ "eval_loss": 0.673053503036499,
+ "eval_runtime": 78.0565,
+ "eval_samples_per_second": 2.562,
+ "eval_steps_per_second": 0.32,
+ "step": 318
+ },
+ {
+ "epoch": 9.96875,
+ "grad_norm": 0.9474068762478358,
+ "learning_rate": 2e-05,
+ "loss": 0.5859,
+ "step": 319
+ },
+ {
+ "epoch": 9.96875,
+ "eval_loss": 0.6695602536201477,
+ "eval_runtime": 78.051,
+ "eval_samples_per_second": 2.562,
+ "eval_steps_per_second": 0.32,
+ "step": 319
+ },
+ {
+ "epoch": 10.0,
+ "grad_norm": 0.8401643717683449,
+ "learning_rate": 2e-05,
+ "loss": 0.5277,
+ "step": 320
+ },
+ {
+ "epoch": 10.0,
+ "eval_loss": 0.6707890033721924,
+ "eval_runtime": 78.9959,
+ "eval_samples_per_second": 2.532,
+ "eval_steps_per_second": 0.316,
+ "step": 320
+ }
+ ],
+ "logging_steps": 1.0,
+ "max_steps": 320,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 5,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 613933061373952.0,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-320/training_args.bin b/checkpoint-320/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..95e42fb482680392bc7a586cb1e05294ea9095fd
--- /dev/null
+++ b/checkpoint-320/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:df3df8f53bf051656f1ae89d4681a26c113103914ee95e8a97646c6c5c824188
+size 8312
diff --git a/checkpoint-320/zero_to_fp32.py b/checkpoint-320/zero_to_fp32.py
new file mode 100755
index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8
--- /dev/null
+++ b/checkpoint-320/zero_to_fp32.py
@@ -0,0 +1,604 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
@dataclass
class zero_model_state:
    """Model-side contents of one ``*_model_states.pt`` file, as collected by ``parse_model_states``.

    The annotations name types (``dict``), not instances (``dict()``), so that
    ``__annotations__`` and dataclass introspection report real types.
    """
    # buffer name -> tensor, converted to fp32 when parsed
    buffers: dict
    # one mapping of parameter name -> shape per param group
    param_shapes: list
    # [target_name, source_name] pairs; target is filled from source after reconstruction
    shared_params: list
    # version recorded in the checkpoint, None when the key is missing
    # NOTE(review): annotated as int, but the stored value's actual type is not visible here
    ds_version: int
    # frozen parameter name -> shape; None when the checkpoint has no such entry
    frozen_param_shapes: dict
    # frozen parameter name -> saved tensor fragment; None when the checkpoint has no such entry
    frozen_param_fragments: dict
+
+
+debug = 0
+
+# load to cpu
+device = torch.device('cpu')
+
+
def atoi(text):
    """
    Return ``int(text)`` when ``text`` consists only of decimal digits, otherwise return ``text`` unchanged.

    ``str.isdecimal`` is used instead of ``str.isdigit``: the latter is also true for characters
    such as superscript digits, which ``int()`` cannot parse and would raise ``ValueError`` on.
    """
    return int(text) if text.isdecimal() else text
+
+
def natural_keys(text):
    '''
    Sort key producing human ("natural") ordering, e.g. ``alist.sort(key=natural_keys)``.

    The text is split around runs of digits and every piece is passed through ``atoi``,
    so digit runs are compared as numbers rather than character by character.
    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)
    '''
    pieces = re.split(r'(\d+)', text)
    return list(map(atoi, pieces))
+
+
def get_model_state_file(checkpoint_dir, zero_stage):
    """
    Return the path of the rank-0 model states file inside ``checkpoint_dir``.

    Args:
        - ``checkpoint_dir``: directory expected to contain the model states file
        - ``zero_stage``: ZeRO stage of the checkpoint; stages ``<= 2`` and stage ``3`` use different file names

    Returns:
        - path to the model states file

    Raises:
        - ``FileNotFoundError``: the directory or the expected file does not exist
        - ``ValueError``: ``zero_stage`` is neither ``<= 2`` nor ``3``
    """
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")

    # there should be only one file
    if zero_stage <= 2:
        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
    elif zero_stage == 3:
        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
    else:
        # without this branch ``file`` stayed unbound and the lookup below
        # failed with an unrelated UnboundLocalError
        raise ValueError(f"unknown zero stage {zero_stage}")

    if not os.path.exists(file):
        raise FileNotFoundError(f"can't find model states file at '{file}'")

    return file
+
+
def get_checkpoint_files(checkpoint_dir, glob_pattern):
    """
    Return the files in ``checkpoint_dir`` matching ``glob_pattern``, in natural (human) order.

    Raises:
        - ``FileNotFoundError``: nothing matches the pattern
    """
    # XXX: need to test that this simple glob rule works for multi-node setup too
    search_pattern = os.path.join(checkpoint_dir, glob_pattern)
    matches = glob.glob(search_pattern)
    matches = sorted(matches, key=natural_keys)

    if not matches:
        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")

    return matches
+
+
def get_optim_files(checkpoint_dir):
    """Return the ``*_optim_states.pt`` files found in ``checkpoint_dir``, naturally sorted."""
    optim_files = get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
    return optim_files
+
+
def get_model_state_files(checkpoint_dir):
    """Return the ``*_model_states.pt`` files found in ``checkpoint_dir``, naturally sorted."""
    model_files = get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
    return model_files
+
+
def parse_model_states(files):
    """
    Load every model states file and keep only what is needed to rebuild the fp32 weights.

    Args:
        - ``files``: paths of the ``*_model_states.pt`` files to read

    Returns:
        - list with one ``zero_model_state`` per input file, in input order

    Raises:
        - ``ValueError``: a file does not contain the buffer-names entry and so is not a model state checkpoint
    """
    zero_model_states = []
    for file in files:
        # NOTE(review): torch.load unpickles the file - only use this on checkpoints from a trusted source
        state_dict = torch.load(file, map_location=device)

        if BUFFER_NAMES not in state_dict:
            raise ValueError(f"{file} is not a model state checkpoint")
        buffer_names = state_dict[BUFFER_NAMES]
        if debug:
            print("Found buffers:", buffer_names)

        # recover just the buffers while restoring them to fp32 if they were saved in fp16
        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
        param_shapes = state_dict[PARAM_SHAPES]

        # frozen parameters are optional; the entry is simply missing in checkpoints without them
        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
        if frozen_param_shapes is not None and debug:
            print(f"Found frozen_param_shapes: {frozen_param_shapes}")

        # handle shared params
        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]

        ds_version = state_dict.get(DS_VERSION, None)

        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)

        z_model_state = zero_model_state(buffers=buffers,
                                         param_shapes=param_shapes,
                                         shared_params=shared_params,
                                         ds_version=ds_version,
                                         frozen_param_shapes=frozen_param_shapes,
                                         frozen_param_fragments=frozen_param_fragments)
        zero_model_states.append(z_model_state)

    return zero_model_states
+
+
def parse_optim_states(files, ds_checkpoint_dir):
    """
    Load the optimizer state files and extract the flat fp32 weight partitions.

    Args:
        - ``files``: paths of the ``*_optim_states.pt`` files, one per rank
        - ``ds_checkpoint_dir``: checkpoint folder, used only in the error message

    Returns:
        - tuple ``(zero_stage, world_size, fp32_flat_groups)`` where ``fp32_flat_groups`` has one entry per file

    Raises:
        - ``ValueError``: the first file is not a zero checkpoint, the number of files does not
          match the recorded partition count, or the zero stage is unknown
    """
    num_files = len(files)
    loaded = []
    for path in files:
        optim_sd = torch.load(path, map_location=device)
        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
        # and also handle the case where it was already removed by another helper script
        optim_sd["optimizer_state_dict"].pop("optimizer_state_dict", None)
        loaded.append(optim_sd)

    if ZERO_STAGE not in loaded[0][OPTIMIZER_STATE_DICT]:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = loaded[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
    world_size = loaded[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]

    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.
    if type(world_size) is list:
        world_size = max(world_size)

    if world_size != num_files:
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {num_files} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )

    # the groups are named differently in each stage
    if zero_stage <= 2:
        fp32_flat_groups = [sd[OPTIMIZER_STATE_DICT][SINGLE_PARTITION_OF_FP32_GROUPS] for sd in loaded]
    elif zero_stage == 3:
        # if there is more than one param group, there will be multiple flattened tensors - one
        # flattened tensor per group - for simplicity merge them into a single tensor
        #
        # XXX: could make the script more memory efficient for when there are multiple groups - it
        # will require matching the sub-lists of param_shapes for each param group flattened tensor
        fp32_flat_groups = [torch.cat(sd[OPTIMIZER_STATE_DICT][FP32_FLAT_GROUPS], 0) for sd in loaded]
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    return zero_stage, world_size, fp32_flat_groups
+
+
def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
    """
    Returns fp32 state_dict reconstructed from ds checkpoint

    Args:
        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
        - ``exclude_frozen_parameters``: passed through to the stage-specific reconstruction

    """
    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")

    zero_stage, world_size, fp32_flat_groups = parse_optim_states(get_optim_files(ds_checkpoint_dir),
                                                                  ds_checkpoint_dir)
    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")

    zero_model_states = parse_model_states(get_model_state_files(ds_checkpoint_dir))
    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')

    # pick the stage-specific reconstruction routine
    if zero_stage <= 2:
        rebuild = _get_fp32_state_dict_from_zero2_checkpoint
    elif zero_stage == 3:
        rebuild = _get_fp32_state_dict_from_zero3_checkpoint
    else:
        return None

    return rebuild(world_size, fp32_flat_groups, zero_model_states, exclude_frozen_parameters)
+
+
def _zero2_merge_frozen_params(state_dict, zero_model_states):
    """
    Copy the frozen parameters stored in the first model state into ``state_dict``.

    Does nothing when the first model state has no frozen parameter shapes.
    Each frozen fragment is stored as is under its parameter name.
    """
    rank0 = zero_model_states[0]
    if rank0.frozen_param_shapes is None or len(rank0.frozen_param_shapes) == 0:
        return

    shapes_by_name = rank0.frozen_param_shapes
    fragments_by_name = rank0.frozen_param_fragments

    if debug:
        num_elem = sum(s.numel() for s in shapes_by_name.values())
        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

    wanted_params = len(shapes_by_name)
    wanted_numel = sum(s.numel() for s in shapes_by_name.values())
    avail_numel = sum(p.numel() for p in fragments_by_name.values())
    print(f'Frozen params: Have {avail_numel} numels to process.')
    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for total_params, (name, shape) in enumerate(shapes_by_name.items(), start=1):
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel

        state_dict[name] = fragments_by_name[name]

        if debug:
            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+ attr = getattr(obj, fn, None)
+ return callable(attr)
+
+
def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """
    Rebuild the trainable parameters of a ZeRO stage <= 2 checkpoint into ``state_dict``.

    For every param group the per-rank flat fp32 partitions are concatenated in rank order;
    consecutive slices of that vector are then viewed with the shapes recorded in ``param_shapes``.

    Args:
        - ``state_dict``: mapping updated in place with ``name -> tensor``
        - ``world_size``: number of ranks, used for the debug dump and the alignment check
        - ``fp32_flat_groups``: per rank, the list of flat fp32 tensors (one per param group)
        - ``zero_model_states``: parsed model states; only the first one's ``param_shapes`` is used

    Raises:
        - ``ValueError``: after aligning both numbers to ``2 * world_size``, the elements consumed
          from a group differ from the elements available in it
    """
    param_shapes = zero_model_states[0].param_shapes

    def shape_numel(shape):
        # shapes either expose ``numel()`` or are plain sequences of ints; used by the debug
        # summary as well, so that debug mode no longer fails on plain-sequence shapes
        return shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)

    # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
    # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
    # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
    # live optimizer object, so we are checking that the numbers are within the right range
    align_to = 2 * world_size

    def zero2_align(x):
        return align_to * math.ceil(x / align_to)

    # Reconstruction protocol:
    #
    # XXX: document this

    if debug:
        for i in range(world_size):
            for j in range(len(fp32_flat_groups[0])):
                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")

    # XXX: memory usage doubles here (zero2)
    num_param_groups = len(fp32_flat_groups[0])
    merged_single_partition_of_fp32_groups = []
    for i in range(num_param_groups):
        merged_partitions = [sd[i] for sd in fp32_flat_groups]
        full_single_fp32_vector = torch.cat(merged_partitions, 0)
        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
    avail_numel = sum(vector.numel() for vector in merged_single_partition_of_fp32_groups)

    if debug:
        wanted_params = sum(len(shapes) for shapes in param_shapes)
        wanted_numel = sum(sum(shape_numel(shape) for shape in shapes.values()) for shapes in param_shapes)
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    total_numel = 0
    total_params = 0
    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
        offset = 0
        avail_numel = full_single_fp32_vector.numel()
        for name, shape in shapes.items():

            unpartitioned_numel = shape_numel(shape)
            total_numel += unpartitioned_numel
            total_params += 1

            if debug:
                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
            offset += unpartitioned_numel

        if debug:
            print(f"original offset={offset}, avail_numel={avail_numel}")

        offset = zero2_align(offset)
        avail_numel = zero2_align(avail_numel)

        if debug:
            print(f"aligned offset={offset}, avail_numel={avail_numel}")

        # Sanity check
        if offset != avail_numel:
            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """
    Assemble the full state dict for a ZeRO stage <= 2 checkpoint.

    Entries are added in this order: buffers, frozen parameters (unless excluded),
    trainable parameters, and finally the shared-parameter entries.
    """
    rank0_state = zero_model_states[0]
    state_dict = OrderedDict()

    # buffers
    buffers = rank0_state.buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero2_merge_frozen_params(state_dict, zero_model_states)

    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters
    for pair in rank0_state.shared_params:
        source_name = pair[1]
        if source_name in state_dict:
            state_dict[pair[0]] = state_dict[source_name]

    return state_dict
+
+
def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """
    Compute how a parameter of ``unpartitioned_numel`` elements is split across ``world_size`` ranks.

    Returns:
        - ``(partitioned_numel, padding_numel)``: the per-rank element count (rounded up) and the
          number of padding elements needed to make the total a multiple of ``world_size``

    Integer arithmetic is used throughout: ``math.ceil(a / b)`` goes through a float division
    and gives a wrong result once the operands exceed what a float represents exactly.
    """
    partitioned_numel, remainder = divmod(unpartitioned_numel, world_size)
    if remainder:
        # the last partition is only partially filled: round up and pad the rest
        partitioned_numel += 1
        padding_numel = world_size - remainder
    else:
        padding_numel = 0
    return partitioned_numel, padding_numel
+
+
def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
    """
    Rebuild the frozen parameters of a ZeRO stage 3 checkpoint into ``state_dict``.

    Does nothing when the first model state has no frozen parameter shapes. Otherwise, for
    each parameter, the fragments of all model states are concatenated in order, truncated to
    the parameter's element count and viewed with its recorded shape.
    """
    rank0 = zero_model_states[0]
    if rank0.frozen_param_shapes is None or len(rank0.frozen_param_shapes) == 0:
        return

    if debug:
        for i in range(world_size):
            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

    frozen_param_shapes = rank0.frozen_param_shapes
    wanted_params = len(frozen_param_shapes)
    wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
    avail_numel = sum(p.numel() for p in rank0.frozen_param_fragments.values()) * world_size
    print(f'Frozen params: Have {avail_numel} numels to process.')
    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in frozen_param_shapes.items():
        total_params += 1
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel

        # one fragment per model state, in model-state order
        fragments = tuple(ms.frozen_param_fragments[name] for ms in zero_model_states)
        merged = torch.cat(fragments, 0)
        state_dict[name] = merged.narrow(0, 0, unpartitioned_numel).view(shape)

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """
    Rebuild the trainable parameters of a ZeRO stage 3 checkpoint into ``state_dict``.

    Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
    param, re-consolidating each param, while dealing with padding if any.

    Raises:
        - ``ValueError``: the elements consumed across all ranks differ from the elements available
    """
    # not asserting if there is a mismatch due to possible padding
    avail_numel = fp32_flat_groups[0].numel() * world_size

    # merge list of dicts, preserving order
    param_shapes = {}
    for group_shapes in zero_model_states[0].param_shapes:
        for param_name, param_shape in group_shapes.items():
            param_shapes[param_name] = param_shape

    if debug:
        for i in range(world_size):
            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")

    wanted_params = len(param_shapes)
    wanted_numel = sum(shape.numel() for shape in param_shapes.values())
    print(f"Trainable params: Have {avail_numel} numels to process.")
    print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    offset = 0
    total_numel = 0
    total_params = 0
    for name, shape in param_shapes.items():

        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel
        total_params += 1

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

        # XXX: memory usage doubles here
        # take this param's slice from every rank, join them, then drop the trailing padding
        rank_slices = tuple(fp32_flat_groups[rank].narrow(0, offset, partitioned_numel) for rank in range(world_size))
        joined = torch.cat(rank_slices, 0)
        state_dict[name] = joined.narrow(0, 0, unpartitioned_numel).view(shape)
        offset += partitioned_numel

    # offset was tracked per rank; scale it to compare against the total over all ranks
    offset *= world_size

    # Sanity check
    if offset != avail_numel:
        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """
    Assemble the full state dict for a ZeRO stage 3 checkpoint.

    Entries are added in this order: buffers, frozen parameters (unless excluded),
    trainable parameters, and finally the shared-parameter entries.
    """
    rank0_state = zero_model_states[0]
    state_dict = OrderedDict()

    # buffers
    buffers = rank0_state.buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)

    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters
    for pair in rank0_state.shared_params:
        source_name = pair[1]
        if source_name in state_dict:
            state_dict[pair[0]] = state_dict[source_name]

    return state_dict
+
+
def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
    """
    Build a single consolidated fp32 ``state_dict`` from a ZeRO 2 or 3 checkpoint. The result can
    be passed to ``load_state_dict()`` and used for training without DeepSpeed, or shared with
    others, for example via a model hub.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
        - ``exclude_frozen_parameters``: exclude frozen parameters

    Returns:
        - pytorch ``state_dict``

    Raises:
        - ``ValueError``: no ``tag`` was given and there is no ``latest`` file in ``checkpoint_dir``
        - ``FileNotFoundError``: the ``checkpoint_dir/tag`` folder does not exist

    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
    the checkpoint.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        # do the training and checkpoint saving
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
        model = model.cpu() # move to cpu
        model.load_state_dict(state_dict)
        # submit to model hub or save the model to share with others

    In this example the ``model`` will no longer be usable in the deepspeed context of the same
    application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.

    """
    if tag is None:
        # fall back to the tag recorded in the 'latest' file
        latest_path = os.path.join(checkpoint_dir, 'latest')
        if not os.path.isfile(latest_path):
            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
        with open(latest_path, 'r') as fd:
            tag = fd.read().strip()

    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)

    if os.path.isdir(ds_checkpoint_dir):
        return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)

    raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+
def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
    """
    Reconstruct the consolidated fp32 ``state_dict`` of a ZeRO 2 or 3 checkpoint and write it to
    ``output_file`` with ``torch.save``. The file can later be read with ``torch.load(file)`` +
    ``load_state_dict()`` and used for training without DeepSpeed.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
        - ``exclude_frozen_parameters``: exclude frozen parameters
    """
    consolidated = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)

    print(f"Saving fp32 state dict to {output_file}")
    torch.save(consolidated, output_file)
+
+
def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
    """
    1. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
    2. Put the provided model to cpu
    3. Load the ``state_dict`` into the provided model (non-strict, see ``strict=False`` below)

    Args:
        - ``model``: the model object to update
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``

    Returns:
        - ``model``: modified model

    Make sure you have plenty of CPU memory available before you call this function. If you don't
    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
    conveniently placed for you in the checkpoint folder.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
        # submit to model hub or save the model to share with others

    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    """
    logger.info("Extracting fp32 weights")
    fp32_state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    logger.info("Overwriting model with fp32 weights")
    cpu_model = model.cpu()
    cpu_model.load_state_dict(fp32_state_dict, strict=False)

    return cpu_model
+
+
if __name__ == "__main__":
    # Command-line entry point: parse the arguments, set the debug switch, run the conversion.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "checkpoint_dir",
        type=str,
        help="path to the desired checkpoint folder, e.g., path/checkpoint-12",
    )
    parser.add_argument(
        "output_file",
        type=str,
        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)",
    )
    parser.add_argument(
        "-t",
        "--tag",
        type=str,
        default=None,
        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1",
    )
    parser.add_argument(
        "--exclude_frozen_parameters",
        action='store_true',
        help="exclude frozen parameters",
    )
    parser.add_argument(
        "-d",
        "--debug",
        action='store_true',
        help="enable debug",
    )
    args = parser.parse_args()

    # module-level switch checked by the helper functions before their diagnostic prints
    debug = args.debug

    convert_zero_checkpoint_to_fp32_state_dict(
        args.checkpoint_dir,
        args.output_file,
        tag=args.tag,
        exclude_frozen_parameters=args.exclude_frozen_parameters,
    )
diff --git a/config.json b/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..e6608a4564d18a51ddf3fd3b64a27fa6ac7ace2f
--- /dev/null
+++ b/config.json
@@ -0,0 +1,77 @@
+{
+ "_attn_implementation_autoset": true,
+ "_name_or_path": "liuhaotian/llava-v1.6-vicuna-13b",
+ "architectures": [
+ "LlavaLlamaForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "freeze_mm_mlp_adapter": false,
+ "freeze_mm_vision_resampler": false,
+ "head_dim": 128,
+ "hidden_act": "silu",
+ "hidden_size": 5120,
+ "image_aspect_ratio": "anyres",
+ "image_crop_resolution": 224,
+ "image_grid_pinpoints": [
+ [
+ 336,
+ 672
+ ],
+ [
+ 672,
+ 336
+ ],
+ [
+ 672,
+ 672
+ ],
+ [
+ 1008,
+ 336
+ ],
+ [
+ 336,
+ 1008
+ ]
+ ],
+ "image_split_resolution": 224,
+ "initializer_range": 0.02,
+ "intermediate_size": 13824,
+ "max_length": 4096,
+ "max_position_embeddings": 4096,
+ "mlp_bias": false,
+ "mm_hidden_size": 1024,
+ "mm_patch_merge_type": "flat",
+ "mm_projector_lr": 2e-05,
+ "mm_projector_type": "mlp2x_gelu",
+ "mm_resampler_type": null,
+ "mm_use_im_patch_token": false,
+ "mm_use_im_start_end": false,
+ "mm_vision_select_feature": "patch",
+ "mm_vision_select_layer": -2,
+ "mm_vision_tower": "openai/clip-vit-large-patch14-336",
+ "mm_vision_tower_lr": 2e-06,
+ "model_type": "llava_llama",
+ "num_attention_heads": 40,
+ "num_hidden_layers": 40,
+ "num_key_value_heads": 40,
+ "pad_token_id": 0,
+ "pretraining_tp": 1,
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": null,
+ "rope_theta": 10000.0,
+ "tie_word_embeddings": false,
+ "tokenizer_model_max_length": 2048,
+ "tokenizer_padding_side": "right",
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.46.3",
+ "tune_mm_mlp_adapter": false,
+ "tune_mm_vision_resampler": false,
+ "unfreeze_mm_vision_tower": true,
+ "use_cache": true,
+ "use_mm_proj": true,
+ "vocab_size": 32000
+}
diff --git a/non_lora_trainables.bin b/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..20fee334e636d45956ccdd990b55fff04b5b66a6
--- /dev/null
+++ b/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:05b685cfc78d68bd5ca3e549eb69e061a0e258fd61a17b196e3ef3876ec7cda3
+size 62937264
diff --git a/optimizer.pt b/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..dbac1432997de6b769e0f7a7e030d794b26f7871
--- /dev/null
+++ b/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7758883238817def488bea1dc14bb3c1a10225fa8d5b92dc0ada5d92c0cdf52b
+size 191824418
diff --git a/trainer_state.json b/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..4780db2d21891d9dbb836f8a5366026a3a1d5901
--- /dev/null
+++ b/trainer_state.json
@@ -0,0 +1,4842 @@
+{
+ "best_metric": 0.6575854420661926,
+ "best_model_checkpoint": "./checkpoints/llava-v1.6-vicuna-13b_anyres/checkpoint-256",
+ "epoch": 10.0,
+ "eval_steps": 1.0,
+ "global_step": 320,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.03125,
+ "grad_norm": 0.5230235555406132,
+ "learning_rate": 0.0,
+ "loss": 1.5809,
+ "step": 1
+ },
+ {
+ "epoch": 0.03125,
+ "eval_loss": 1.6275018453598022,
+ "eval_runtime": 82.059,
+ "eval_samples_per_second": 2.437,
+ "eval_steps_per_second": 0.305,
+ "step": 1
+ },
+ {
+ "epoch": 0.0625,
+ "grad_norm": 0.5095402010892089,
+ "learning_rate": 2e-05,
+ "loss": 1.4958,
+ "step": 2
+ },
+ {
+ "epoch": 0.0625,
+ "eval_loss": 1.6275018453598022,
+ "eval_runtime": 76.5747,
+ "eval_samples_per_second": 2.612,
+ "eval_steps_per_second": 0.326,
+ "step": 2
+ },
+ {
+ "epoch": 0.09375,
+ "grad_norm": 0.4998514282504938,
+ "learning_rate": 2e-05,
+ "loss": 1.5552,
+ "step": 3
+ },
+ {
+ "epoch": 0.09375,
+ "eval_loss": 1.5956931114196777,
+ "eval_runtime": 76.1563,
+ "eval_samples_per_second": 2.626,
+ "eval_steps_per_second": 0.328,
+ "step": 3
+ },
+ {
+ "epoch": 0.125,
+ "grad_norm": 0.4280580315108126,
+ "learning_rate": 2e-05,
+ "loss": 1.4846,
+ "step": 4
+ },
+ {
+ "epoch": 0.125,
+ "eval_loss": 1.5584176778793335,
+ "eval_runtime": 76.1235,
+ "eval_samples_per_second": 2.627,
+ "eval_steps_per_second": 0.328,
+ "step": 4
+ },
+ {
+ "epoch": 0.15625,
+ "grad_norm": 0.5678499435986384,
+ "learning_rate": 2e-05,
+ "loss": 1.5036,
+ "step": 5
+ },
+ {
+ "epoch": 0.15625,
+ "eval_loss": 1.5207562446594238,
+ "eval_runtime": 76.1514,
+ "eval_samples_per_second": 2.626,
+ "eval_steps_per_second": 0.328,
+ "step": 5
+ },
+ {
+ "epoch": 0.1875,
+ "grad_norm": 0.5368461657542534,
+ "learning_rate": 2e-05,
+ "loss": 1.476,
+ "step": 6
+ },
+ {
+ "epoch": 0.1875,
+ "eval_loss": 1.4807783365249634,
+ "eval_runtime": 77.3444,
+ "eval_samples_per_second": 2.586,
+ "eval_steps_per_second": 0.323,
+ "step": 6
+ },
+ {
+ "epoch": 0.21875,
+ "grad_norm": 0.5549950083087136,
+ "learning_rate": 2e-05,
+ "loss": 1.4358,
+ "step": 7
+ },
+ {
+ "epoch": 0.21875,
+ "eval_loss": 1.4411544799804688,
+ "eval_runtime": 77.066,
+ "eval_samples_per_second": 2.595,
+ "eval_steps_per_second": 0.324,
+ "step": 7
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 0.5549950083087136,
+ "learning_rate": 2e-05,
+ "loss": 1.4369,
+ "step": 8
+ },
+ {
+ "epoch": 0.25,
+ "eval_loss": 1.4411544799804688,
+ "eval_runtime": 77.2807,
+ "eval_samples_per_second": 2.588,
+ "eval_steps_per_second": 0.323,
+ "step": 8
+ },
+ {
+ "epoch": 0.28125,
+ "grad_norm": 0.5292240951443854,
+ "learning_rate": 2e-05,
+ "loss": 1.4471,
+ "step": 9
+ },
+ {
+ "epoch": 0.28125,
+ "eval_loss": 1.4036556482315063,
+ "eval_runtime": 78.1562,
+ "eval_samples_per_second": 2.559,
+ "eval_steps_per_second": 0.32,
+ "step": 9
+ },
+ {
+ "epoch": 0.3125,
+ "grad_norm": 0.5292240951443854,
+ "learning_rate": 2e-05,
+ "loss": 1.3666,
+ "step": 10
+ },
+ {
+ "epoch": 0.3125,
+ "eval_loss": 1.4036556482315063,
+ "eval_runtime": 77.1645,
+ "eval_samples_per_second": 2.592,
+ "eval_steps_per_second": 0.324,
+ "step": 10
+ },
+ {
+ "epoch": 0.34375,
+ "grad_norm": 0.5292240951443854,
+ "learning_rate": 2e-05,
+ "loss": 1.4149,
+ "step": 11
+ },
+ {
+ "epoch": 0.34375,
+ "eval_loss": 1.4036556482315063,
+ "eval_runtime": 78.7627,
+ "eval_samples_per_second": 2.539,
+ "eval_steps_per_second": 0.317,
+ "step": 11
+ },
+ {
+ "epoch": 0.375,
+ "grad_norm": 0.684588966714067,
+ "learning_rate": 2e-05,
+ "loss": 1.3883,
+ "step": 12
+ },
+ {
+ "epoch": 0.375,
+ "eval_loss": 1.3679308891296387,
+ "eval_runtime": 78.4315,
+ "eval_samples_per_second": 2.55,
+ "eval_steps_per_second": 0.319,
+ "step": 12
+ },
+ {
+ "epoch": 0.40625,
+ "grad_norm": 0.6261826769491422,
+ "learning_rate": 2e-05,
+ "loss": 1.4271,
+ "step": 13
+ },
+ {
+ "epoch": 0.40625,
+ "eval_loss": 1.3369851112365723,
+ "eval_runtime": 78.685,
+ "eval_samples_per_second": 2.542,
+ "eval_steps_per_second": 0.318,
+ "step": 13
+ },
+ {
+ "epoch": 0.4375,
+ "grad_norm": 0.6261826769491422,
+ "learning_rate": 2e-05,
+ "loss": 1.2495,
+ "step": 14
+ },
+ {
+ "epoch": 0.4375,
+ "eval_loss": 1.3369851112365723,
+ "eval_runtime": 78.0511,
+ "eval_samples_per_second": 2.562,
+ "eval_steps_per_second": 0.32,
+ "step": 14
+ },
+ {
+ "epoch": 0.46875,
+ "grad_norm": 0.6028103951693778,
+ "learning_rate": 2e-05,
+ "loss": 1.3513,
+ "step": 15
+ },
+ {
+ "epoch": 0.46875,
+ "eval_loss": 1.3032653331756592,
+ "eval_runtime": 78.0271,
+ "eval_samples_per_second": 2.563,
+ "eval_steps_per_second": 0.32,
+ "step": 15
+ },
+ {
+ "epoch": 0.5,
+ "grad_norm": 0.769290402283396,
+ "learning_rate": 2e-05,
+ "loss": 1.3117,
+ "step": 16
+ },
+ {
+ "epoch": 0.5,
+ "eval_loss": 1.2661188840866089,
+ "eval_runtime": 78.1857,
+ "eval_samples_per_second": 2.558,
+ "eval_steps_per_second": 0.32,
+ "step": 16
+ },
+ {
+ "epoch": 0.53125,
+ "grad_norm": 1.3279338025863765,
+ "learning_rate": 2e-05,
+ "loss": 1.2768,
+ "step": 17
+ },
+ {
+ "epoch": 0.53125,
+ "eval_loss": 1.2299447059631348,
+ "eval_runtime": 78.2064,
+ "eval_samples_per_second": 2.557,
+ "eval_steps_per_second": 0.32,
+ "step": 17
+ },
+ {
+ "epoch": 0.5625,
+ "grad_norm": 0.7410327159336384,
+ "learning_rate": 2e-05,
+ "loss": 1.256,
+ "step": 18
+ },
+ {
+ "epoch": 0.5625,
+ "eval_loss": 1.2044258117675781,
+ "eval_runtime": 78.072,
+ "eval_samples_per_second": 2.562,
+ "eval_steps_per_second": 0.32,
+ "step": 18
+ },
+ {
+ "epoch": 0.59375,
+ "grad_norm": 0.44078820770408506,
+ "learning_rate": 2e-05,
+ "loss": 1.1252,
+ "step": 19
+ },
+ {
+ "epoch": 0.59375,
+ "eval_loss": 1.1826122999191284,
+ "eval_runtime": 78.7312,
+ "eval_samples_per_second": 2.54,
+ "eval_steps_per_second": 0.318,
+ "step": 19
+ },
+ {
+ "epoch": 0.625,
+ "grad_norm": 0.49020841613371097,
+ "learning_rate": 2e-05,
+ "loss": 1.2249,
+ "step": 20
+ },
+ {
+ "epoch": 0.625,
+ "eval_loss": 1.1616511344909668,
+ "eval_runtime": 78.2736,
+ "eval_samples_per_second": 2.555,
+ "eval_steps_per_second": 0.319,
+ "step": 20
+ },
+ {
+ "epoch": 0.65625,
+ "grad_norm": 0.43031322695269714,
+ "learning_rate": 2e-05,
+ "loss": 1.1466,
+ "step": 21
+ },
+ {
+ "epoch": 0.65625,
+ "eval_loss": 1.1410629749298096,
+ "eval_runtime": 79.6432,
+ "eval_samples_per_second": 2.511,
+ "eval_steps_per_second": 0.314,
+ "step": 21
+ },
+ {
+ "epoch": 0.6875,
+ "grad_norm": 0.45632085445955545,
+ "learning_rate": 2e-05,
+ "loss": 1.1951,
+ "step": 22
+ },
+ {
+ "epoch": 0.6875,
+ "eval_loss": 1.1204684972763062,
+ "eval_runtime": 79.0609,
+ "eval_samples_per_second": 2.53,
+ "eval_steps_per_second": 0.316,
+ "step": 22
+ },
+ {
+ "epoch": 0.71875,
+ "grad_norm": 0.40048586945364495,
+ "learning_rate": 2e-05,
+ "loss": 1.1826,
+ "step": 23
+ },
+ {
+ "epoch": 0.71875,
+ "eval_loss": 1.1002545356750488,
+ "eval_runtime": 82.8578,
+ "eval_samples_per_second": 2.414,
+ "eval_steps_per_second": 0.302,
+ "step": 23
+ },
+ {
+ "epoch": 0.75,
+ "grad_norm": 0.3703033261027938,
+ "learning_rate": 2e-05,
+ "loss": 1.1543,
+ "step": 24
+ },
+ {
+ "epoch": 0.75,
+ "eval_loss": 1.0805977582931519,
+ "eval_runtime": 76.1407,
+ "eval_samples_per_second": 2.627,
+ "eval_steps_per_second": 0.328,
+ "step": 24
+ },
+ {
+ "epoch": 0.78125,
+ "grad_norm": 0.3986313105418924,
+ "learning_rate": 2e-05,
+ "loss": 1.1046,
+ "step": 25
+ },
+ {
+ "epoch": 0.78125,
+ "eval_loss": 1.0610157251358032,
+ "eval_runtime": 76.3083,
+ "eval_samples_per_second": 2.621,
+ "eval_steps_per_second": 0.328,
+ "step": 25
+ },
+ {
+ "epoch": 0.8125,
+ "grad_norm": 0.36265027203577943,
+ "learning_rate": 2e-05,
+ "loss": 1.1048,
+ "step": 26
+ },
+ {
+ "epoch": 0.8125,
+ "eval_loss": 1.0421289205551147,
+ "eval_runtime": 77.2186,
+ "eval_samples_per_second": 2.59,
+ "eval_steps_per_second": 0.324,
+ "step": 26
+ },
+ {
+ "epoch": 0.84375,
+ "grad_norm": 0.3881748990218768,
+ "learning_rate": 2e-05,
+ "loss": 1.0425,
+ "step": 27
+ },
+ {
+ "epoch": 0.84375,
+ "eval_loss": 1.0240073204040527,
+ "eval_runtime": 77.8662,
+ "eval_samples_per_second": 2.569,
+ "eval_steps_per_second": 0.321,
+ "step": 27
+ },
+ {
+ "epoch": 0.875,
+ "grad_norm": 0.3734031294324286,
+ "learning_rate": 2e-05,
+ "loss": 1.0484,
+ "step": 28
+ },
+ {
+ "epoch": 0.875,
+ "eval_loss": 1.0066957473754883,
+ "eval_runtime": 77.269,
+ "eval_samples_per_second": 2.588,
+ "eval_steps_per_second": 0.324,
+ "step": 28
+ },
+ {
+ "epoch": 0.90625,
+ "grad_norm": 0.29695383079342563,
+ "learning_rate": 2e-05,
+ "loss": 1.0387,
+ "step": 29
+ },
+ {
+ "epoch": 0.90625,
+ "eval_loss": 0.9906074404716492,
+ "eval_runtime": 77.2245,
+ "eval_samples_per_second": 2.59,
+ "eval_steps_per_second": 0.324,
+ "step": 29
+ },
+ {
+ "epoch": 0.9375,
+ "grad_norm": 0.29273146875026623,
+ "learning_rate": 2e-05,
+ "loss": 1.0568,
+ "step": 30
+ },
+ {
+ "epoch": 0.9375,
+ "eval_loss": 0.975755512714386,
+ "eval_runtime": 78.0056,
+ "eval_samples_per_second": 2.564,
+ "eval_steps_per_second": 0.32,
+ "step": 30
+ },
+ {
+ "epoch": 0.96875,
+ "grad_norm": 0.35070440686850546,
+ "learning_rate": 2e-05,
+ "loss": 0.9114,
+ "step": 31
+ },
+ {
+ "epoch": 0.96875,
+ "eval_loss": 0.9615123271942139,
+ "eval_runtime": 77.9051,
+ "eval_samples_per_second": 2.567,
+ "eval_steps_per_second": 0.321,
+ "step": 31
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 0.30846157140439384,
+ "learning_rate": 2e-05,
+ "loss": 0.9941,
+ "step": 32
+ },
+ {
+ "epoch": 1.0,
+ "eval_loss": 0.9480571150779724,
+ "eval_runtime": 77.2322,
+ "eval_samples_per_second": 2.59,
+ "eval_steps_per_second": 0.324,
+ "step": 32
+ },
+ {
+ "epoch": 1.03125,
+ "grad_norm": 0.2950381371932973,
+ "learning_rate": 2e-05,
+ "loss": 1.0297,
+ "step": 33
+ },
+ {
+ "epoch": 1.03125,
+ "eval_loss": 0.9356330037117004,
+ "eval_runtime": 81.8443,
+ "eval_samples_per_second": 2.444,
+ "eval_steps_per_second": 0.305,
+ "step": 33
+ },
+ {
+ "epoch": 1.0625,
+ "grad_norm": 0.27080038065834283,
+ "learning_rate": 2e-05,
+ "loss": 1.021,
+ "step": 34
+ },
+ {
+ "epoch": 1.0625,
+ "eval_loss": 0.9245791435241699,
+ "eval_runtime": 76.2071,
+ "eval_samples_per_second": 2.624,
+ "eval_steps_per_second": 0.328,
+ "step": 34
+ },
+ {
+ "epoch": 1.09375,
+ "grad_norm": 0.23165081252649894,
+ "learning_rate": 2e-05,
+ "loss": 1.0366,
+ "step": 35
+ },
+ {
+ "epoch": 1.09375,
+ "eval_loss": 0.9151126146316528,
+ "eval_runtime": 77.0412,
+ "eval_samples_per_second": 2.596,
+ "eval_steps_per_second": 0.325,
+ "step": 35
+ },
+ {
+ "epoch": 1.125,
+ "grad_norm": 0.4033780922500775,
+ "learning_rate": 2e-05,
+ "loss": 1.0127,
+ "step": 36
+ },
+ {
+ "epoch": 1.125,
+ "eval_loss": 0.9063960313796997,
+ "eval_runtime": 76.9327,
+ "eval_samples_per_second": 2.6,
+ "eval_steps_per_second": 0.325,
+ "step": 36
+ },
+ {
+ "epoch": 1.15625,
+ "grad_norm": 0.2398039831439168,
+ "learning_rate": 2e-05,
+ "loss": 0.9418,
+ "step": 37
+ },
+ {
+ "epoch": 1.15625,
+ "eval_loss": 0.8982363939285278,
+ "eval_runtime": 76.1234,
+ "eval_samples_per_second": 2.627,
+ "eval_steps_per_second": 0.328,
+ "step": 37
+ },
+ {
+ "epoch": 1.1875,
+ "grad_norm": 0.28793451241246804,
+ "learning_rate": 2e-05,
+ "loss": 0.9643,
+ "step": 38
+ },
+ {
+ "epoch": 1.1875,
+ "eval_loss": 0.8908895254135132,
+ "eval_runtime": 76.2877,
+ "eval_samples_per_second": 2.622,
+ "eval_steps_per_second": 0.328,
+ "step": 38
+ },
+ {
+ "epoch": 1.21875,
+ "grad_norm": 0.2927691606307197,
+ "learning_rate": 2e-05,
+ "loss": 1.0087,
+ "step": 39
+ },
+ {
+ "epoch": 1.21875,
+ "eval_loss": 0.8845618367195129,
+ "eval_runtime": 76.2282,
+ "eval_samples_per_second": 2.624,
+ "eval_steps_per_second": 0.328,
+ "step": 39
+ },
+ {
+ "epoch": 1.25,
+ "grad_norm": 0.26410982001408806,
+ "learning_rate": 2e-05,
+ "loss": 0.986,
+ "step": 40
+ },
+ {
+ "epoch": 1.25,
+ "eval_loss": 0.8784474730491638,
+ "eval_runtime": 76.2512,
+ "eval_samples_per_second": 2.623,
+ "eval_steps_per_second": 0.328,
+ "step": 40
+ },
+ {
+ "epoch": 1.28125,
+ "grad_norm": 0.29182630949665306,
+ "learning_rate": 2e-05,
+ "loss": 0.9711,
+ "step": 41
+ },
+ {
+ "epoch": 1.28125,
+ "eval_loss": 0.8725223541259766,
+ "eval_runtime": 77.1229,
+ "eval_samples_per_second": 2.593,
+ "eval_steps_per_second": 0.324,
+ "step": 41
+ },
+ {
+ "epoch": 1.3125,
+ "grad_norm": 0.36402838796832665,
+ "learning_rate": 2e-05,
+ "loss": 0.9263,
+ "step": 42
+ },
+ {
+ "epoch": 1.3125,
+ "eval_loss": 0.8662790060043335,
+ "eval_runtime": 77.2362,
+ "eval_samples_per_second": 2.589,
+ "eval_steps_per_second": 0.324,
+ "step": 42
+ },
+ {
+ "epoch": 1.34375,
+ "grad_norm": 0.29338184478895163,
+ "learning_rate": 2e-05,
+ "loss": 0.8947,
+ "step": 43
+ },
+ {
+ "epoch": 1.34375,
+ "eval_loss": 0.8600431680679321,
+ "eval_runtime": 77.1213,
+ "eval_samples_per_second": 2.593,
+ "eval_steps_per_second": 0.324,
+ "step": 43
+ },
+ {
+ "epoch": 1.375,
+ "grad_norm": 0.2201714229702277,
+ "learning_rate": 2e-05,
+ "loss": 0.9059,
+ "step": 44
+ },
+ {
+ "epoch": 1.375,
+ "eval_loss": 0.8545799255371094,
+ "eval_runtime": 77.991,
+ "eval_samples_per_second": 2.564,
+ "eval_steps_per_second": 0.321,
+ "step": 44
+ },
+ {
+ "epoch": 1.40625,
+ "grad_norm": 0.2254966625243654,
+ "learning_rate": 2e-05,
+ "loss": 0.8942,
+ "step": 45
+ },
+ {
+ "epoch": 1.40625,
+ "eval_loss": 0.8497399687767029,
+ "eval_runtime": 77.2698,
+ "eval_samples_per_second": 2.588,
+ "eval_steps_per_second": 0.324,
+ "step": 45
+ },
+ {
+ "epoch": 1.4375,
+ "grad_norm": 0.21753318432075458,
+ "learning_rate": 2e-05,
+ "loss": 0.9376,
+ "step": 46
+ },
+ {
+ "epoch": 1.4375,
+ "eval_loss": 0.8452473282814026,
+ "eval_runtime": 77.0568,
+ "eval_samples_per_second": 2.595,
+ "eval_steps_per_second": 0.324,
+ "step": 46
+ },
+ {
+ "epoch": 1.46875,
+ "grad_norm": 0.21449718265972945,
+ "learning_rate": 2e-05,
+ "loss": 0.9369,
+ "step": 47
+ },
+ {
+ "epoch": 1.46875,
+ "eval_loss": 0.841134786605835,
+ "eval_runtime": 77.225,
+ "eval_samples_per_second": 2.59,
+ "eval_steps_per_second": 0.324,
+ "step": 47
+ },
+ {
+ "epoch": 1.5,
+ "grad_norm": 0.2109063266748924,
+ "learning_rate": 2e-05,
+ "loss": 0.8511,
+ "step": 48
+ },
+ {
+ "epoch": 1.5,
+ "eval_loss": 0.8373770117759705,
+ "eval_runtime": 76.2309,
+ "eval_samples_per_second": 2.624,
+ "eval_steps_per_second": 0.328,
+ "step": 48
+ },
+ {
+ "epoch": 1.53125,
+ "grad_norm": 0.232838633689838,
+ "learning_rate": 2e-05,
+ "loss": 0.8694,
+ "step": 49
+ },
+ {
+ "epoch": 1.53125,
+ "eval_loss": 0.8338289856910706,
+ "eval_runtime": 76.277,
+ "eval_samples_per_second": 2.622,
+ "eval_steps_per_second": 0.328,
+ "step": 49
+ },
+ {
+ "epoch": 1.5625,
+ "grad_norm": 0.4189704940803984,
+ "learning_rate": 2e-05,
+ "loss": 0.8464,
+ "step": 50
+ },
+ {
+ "epoch": 1.5625,
+ "eval_loss": 0.8297132849693298,
+ "eval_runtime": 76.2872,
+ "eval_samples_per_second": 2.622,
+ "eval_steps_per_second": 0.328,
+ "step": 50
+ },
+ {
+ "epoch": 1.59375,
+ "grad_norm": 0.2171618165123276,
+ "learning_rate": 2e-05,
+ "loss": 0.8785,
+ "step": 51
+ },
+ {
+ "epoch": 1.59375,
+ "eval_loss": 0.8257431983947754,
+ "eval_runtime": 76.2639,
+ "eval_samples_per_second": 2.622,
+ "eval_steps_per_second": 0.328,
+ "step": 51
+ },
+ {
+ "epoch": 1.625,
+ "grad_norm": 0.21934651037670305,
+ "learning_rate": 2e-05,
+ "loss": 0.7645,
+ "step": 52
+ },
+ {
+ "epoch": 1.625,
+ "eval_loss": 0.8223557472229004,
+ "eval_runtime": 76.2383,
+ "eval_samples_per_second": 2.623,
+ "eval_steps_per_second": 0.328,
+ "step": 52
+ },
+ {
+ "epoch": 1.65625,
+ "grad_norm": 0.24183530733164746,
+ "learning_rate": 2e-05,
+ "loss": 0.9218,
+ "step": 53
+ },
+ {
+ "epoch": 1.65625,
+ "eval_loss": 0.8189653158187866,
+ "eval_runtime": 76.9819,
+ "eval_samples_per_second": 2.598,
+ "eval_steps_per_second": 0.325,
+ "step": 53
+ },
+ {
+ "epoch": 1.6875,
+ "grad_norm": 0.23450930244279267,
+ "learning_rate": 2e-05,
+ "loss": 0.8896,
+ "step": 54
+ },
+ {
+ "epoch": 1.6875,
+ "eval_loss": 0.8152530193328857,
+ "eval_runtime": 76.2378,
+ "eval_samples_per_second": 2.623,
+ "eval_steps_per_second": 0.328,
+ "step": 54
+ },
+ {
+ "epoch": 1.71875,
+ "grad_norm": 0.22081665899796085,
+ "learning_rate": 2e-05,
+ "loss": 0.8798,
+ "step": 55
+ },
+ {
+ "epoch": 1.71875,
+ "eval_loss": 0.8122122287750244,
+ "eval_runtime": 76.289,
+ "eval_samples_per_second": 2.622,
+ "eval_steps_per_second": 0.328,
+ "step": 55
+ },
+ {
+ "epoch": 1.75,
+ "grad_norm": 0.21311746114111046,
+ "learning_rate": 2e-05,
+ "loss": 0.9482,
+ "step": 56
+ },
+ {
+ "epoch": 1.75,
+ "eval_loss": 0.8092318773269653,
+ "eval_runtime": 77.8321,
+ "eval_samples_per_second": 2.57,
+ "eval_steps_per_second": 0.321,
+ "step": 56
+ },
+ {
+ "epoch": 1.78125,
+ "grad_norm": 0.2496565307107556,
+ "learning_rate": 2e-05,
+ "loss": 0.8917,
+ "step": 57
+ },
+ {
+ "epoch": 1.78125,
+ "eval_loss": 0.8070546984672546,
+ "eval_runtime": 77.2651,
+ "eval_samples_per_second": 2.588,
+ "eval_steps_per_second": 0.324,
+ "step": 57
+ },
+ {
+ "epoch": 1.8125,
+ "grad_norm": 0.2137866456424736,
+ "learning_rate": 2e-05,
+ "loss": 0.909,
+ "step": 58
+ },
+ {
+ "epoch": 1.8125,
+ "eval_loss": 0.8049566745758057,
+ "eval_runtime": 78.0925,
+ "eval_samples_per_second": 2.561,
+ "eval_steps_per_second": 0.32,
+ "step": 58
+ },
+ {
+ "epoch": 1.84375,
+ "grad_norm": 0.22567502859345095,
+ "learning_rate": 2e-05,
+ "loss": 0.8611,
+ "step": 59
+ },
+ {
+ "epoch": 1.84375,
+ "eval_loss": 0.8028810024261475,
+ "eval_runtime": 78.0553,
+ "eval_samples_per_second": 2.562,
+ "eval_steps_per_second": 0.32,
+ "step": 59
+ },
+ {
+ "epoch": 1.875,
+ "grad_norm": 0.23303796552302508,
+ "learning_rate": 2e-05,
+ "loss": 0.9209,
+ "step": 60
+ },
+ {
+ "epoch": 1.875,
+ "eval_loss": 0.800568699836731,
+ "eval_runtime": 78.052,
+ "eval_samples_per_second": 2.562,
+ "eval_steps_per_second": 0.32,
+ "step": 60
+ },
+ {
+ "epoch": 1.90625,
+ "grad_norm": 0.24566727726974544,
+ "learning_rate": 2e-05,
+ "loss": 0.8239,
+ "step": 61
+ },
+ {
+ "epoch": 1.90625,
+ "eval_loss": 0.7976545691490173,
+ "eval_runtime": 77.3056,
+ "eval_samples_per_second": 2.587,
+ "eval_steps_per_second": 0.323,
+ "step": 61
+ },
+ {
+ "epoch": 1.9375,
+ "grad_norm": 0.23014192522354907,
+ "learning_rate": 2e-05,
+ "loss": 0.8814,
+ "step": 62
+ },
+ {
+ "epoch": 1.9375,
+ "eval_loss": 0.7945474982261658,
+ "eval_runtime": 77.3398,
+ "eval_samples_per_second": 2.586,
+ "eval_steps_per_second": 0.323,
+ "step": 62
+ },
+ {
+ "epoch": 1.96875,
+ "grad_norm": 0.23042819102671622,
+ "learning_rate": 2e-05,
+ "loss": 0.9064,
+ "step": 63
+ },
+ {
+ "epoch": 1.96875,
+ "eval_loss": 0.7918359637260437,
+ "eval_runtime": 77.4272,
+ "eval_samples_per_second": 2.583,
+ "eval_steps_per_second": 0.323,
+ "step": 63
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 0.23940667173206315,
+ "learning_rate": 2e-05,
+ "loss": 0.8658,
+ "step": 64
+ },
+ {
+ "epoch": 2.0,
+ "eval_loss": 0.7891160845756531,
+ "eval_runtime": 77.3236,
+ "eval_samples_per_second": 2.587,
+ "eval_steps_per_second": 0.323,
+ "step": 64
+ },
+ {
+ "epoch": 2.03125,
+ "grad_norm": 0.22630342930143643,
+ "learning_rate": 2e-05,
+ "loss": 0.8403,
+ "step": 65
+ },
+ {
+ "epoch": 2.03125,
+ "eval_loss": 0.7859742641448975,
+ "eval_runtime": 77.2001,
+ "eval_samples_per_second": 2.591,
+ "eval_steps_per_second": 0.324,
+ "step": 65
+ },
+ {
+ "epoch": 2.0625,
+ "grad_norm": 0.20949240460260976,
+ "learning_rate": 2e-05,
+ "loss": 0.8472,
+ "step": 66
+ },
+ {
+ "epoch": 2.0625,
+ "eval_loss": 0.7834083437919617,
+ "eval_runtime": 78.9646,
+ "eval_samples_per_second": 2.533,
+ "eval_steps_per_second": 0.317,
+ "step": 66
+ },
+ {
+ "epoch": 2.09375,
+ "grad_norm": 0.22714400479820654,
+ "learning_rate": 2e-05,
+ "loss": 0.841,
+ "step": 67
+ },
+ {
+ "epoch": 2.09375,
+ "eval_loss": 0.7805308699607849,
+ "eval_runtime": 78.7552,
+ "eval_samples_per_second": 2.54,
+ "eval_steps_per_second": 0.317,
+ "step": 67
+ },
+ {
+ "epoch": 2.125,
+ "grad_norm": 0.23345123077006047,
+ "learning_rate": 2e-05,
+ "loss": 0.9028,
+ "step": 68
+ },
+ {
+ "epoch": 2.125,
+ "eval_loss": 0.7779514789581299,
+ "eval_runtime": 78.3387,
+ "eval_samples_per_second": 2.553,
+ "eval_steps_per_second": 0.319,
+ "step": 68
+ },
+ {
+ "epoch": 2.15625,
+ "grad_norm": 0.251841542575211,
+ "learning_rate": 2e-05,
+ "loss": 0.8381,
+ "step": 69
+ },
+ {
+ "epoch": 2.15625,
+ "eval_loss": 0.7756664752960205,
+ "eval_runtime": 78.3109,
+ "eval_samples_per_second": 2.554,
+ "eval_steps_per_second": 0.319,
+ "step": 69
+ },
+ {
+ "epoch": 2.1875,
+ "grad_norm": 0.23548386839773608,
+ "learning_rate": 2e-05,
+ "loss": 0.7914,
+ "step": 70
+ },
+ {
+ "epoch": 2.1875,
+ "eval_loss": 0.7733604907989502,
+ "eval_runtime": 78.9712,
+ "eval_samples_per_second": 2.533,
+ "eval_steps_per_second": 0.317,
+ "step": 70
+ },
+ {
+ "epoch": 2.21875,
+ "grad_norm": 0.23262740912668387,
+ "learning_rate": 2e-05,
+ "loss": 0.8778,
+ "step": 71
+ },
+ {
+ "epoch": 2.21875,
+ "eval_loss": 0.771755576133728,
+ "eval_runtime": 78.2633,
+ "eval_samples_per_second": 2.555,
+ "eval_steps_per_second": 0.319,
+ "step": 71
+ },
+ {
+ "epoch": 2.25,
+ "grad_norm": 0.22075289612357513,
+ "learning_rate": 2e-05,
+ "loss": 0.7945,
+ "step": 72
+ },
+ {
+ "epoch": 2.25,
+ "eval_loss": 0.7705450654029846,
+ "eval_runtime": 78.3151,
+ "eval_samples_per_second": 2.554,
+ "eval_steps_per_second": 0.319,
+ "step": 72
+ },
+ {
+ "epoch": 2.28125,
+ "grad_norm": 0.25520381955936466,
+ "learning_rate": 2e-05,
+ "loss": 0.8387,
+ "step": 73
+ },
+ {
+ "epoch": 2.28125,
+ "eval_loss": 0.7695029973983765,
+ "eval_runtime": 78.2901,
+ "eval_samples_per_second": 2.555,
+ "eval_steps_per_second": 0.319,
+ "step": 73
+ },
+ {
+ "epoch": 2.3125,
+ "grad_norm": 0.2047305385827267,
+ "learning_rate": 2e-05,
+ "loss": 0.8404,
+ "step": 74
+ },
+ {
+ "epoch": 2.3125,
+ "eval_loss": 0.7684457302093506,
+ "eval_runtime": 78.3875,
+ "eval_samples_per_second": 2.551,
+ "eval_steps_per_second": 0.319,
+ "step": 74
+ },
+ {
+ "epoch": 2.34375,
+ "grad_norm": 0.2262323045133288,
+ "learning_rate": 2e-05,
+ "loss": 0.8811,
+ "step": 75
+ },
+ {
+ "epoch": 2.34375,
+ "eval_loss": 0.7671162486076355,
+ "eval_runtime": 78.202,
+ "eval_samples_per_second": 2.557,
+ "eval_steps_per_second": 0.32,
+ "step": 75
+ },
+ {
+ "epoch": 2.375,
+ "grad_norm": 0.21885464923925876,
+ "learning_rate": 2e-05,
+ "loss": 0.7942,
+ "step": 76
+ },
+ {
+ "epoch": 2.375,
+ "eval_loss": 0.7658494710922241,
+ "eval_runtime": 78.1746,
+ "eval_samples_per_second": 2.558,
+ "eval_steps_per_second": 0.32,
+ "step": 76
+ },
+ {
+ "epoch": 2.40625,
+ "grad_norm": 0.21717306953626966,
+ "learning_rate": 2e-05,
+ "loss": 0.8497,
+ "step": 77
+ },
+ {
+ "epoch": 2.40625,
+ "eval_loss": 0.7642120122909546,
+ "eval_runtime": 78.2026,
+ "eval_samples_per_second": 2.557,
+ "eval_steps_per_second": 0.32,
+ "step": 77
+ },
+ {
+ "epoch": 2.4375,
+ "grad_norm": 0.2530725583748258,
+ "learning_rate": 2e-05,
+ "loss": 0.8584,
+ "step": 78
+ },
+ {
+ "epoch": 2.4375,
+ "eval_loss": 0.7625510692596436,
+ "eval_runtime": 78.1991,
+ "eval_samples_per_second": 2.558,
+ "eval_steps_per_second": 0.32,
+ "step": 78
+ },
+ {
+ "epoch": 2.46875,
+ "grad_norm": 0.25354787036627263,
+ "learning_rate": 2e-05,
+ "loss": 0.8569,
+ "step": 79
+ },
+ {
+ "epoch": 2.46875,
+ "eval_loss": 0.7616268396377563,
+ "eval_runtime": 78.2915,
+ "eval_samples_per_second": 2.555,
+ "eval_steps_per_second": 0.319,
+ "step": 79
+ },
+ {
+ "epoch": 2.5,
+ "grad_norm": 0.2800865746664007,
+ "learning_rate": 2e-05,
+ "loss": 0.9116,
+ "step": 80
+ },
+ {
+ "epoch": 2.5,
+ "eval_loss": 0.7603214979171753,
+ "eval_runtime": 78.2749,
+ "eval_samples_per_second": 2.555,
+ "eval_steps_per_second": 0.319,
+ "step": 80
+ },
+ {
+ "epoch": 2.53125,
+ "grad_norm": 0.268139688449618,
+ "learning_rate": 2e-05,
+ "loss": 0.8397,
+ "step": 81
+ },
+ {
+ "epoch": 2.53125,
+ "eval_loss": 0.7584869265556335,
+ "eval_runtime": 79.1445,
+ "eval_samples_per_second": 2.527,
+ "eval_steps_per_second": 0.316,
+ "step": 81
+ },
+ {
+ "epoch": 2.5625,
+ "grad_norm": 0.3128648654463789,
+ "learning_rate": 2e-05,
+ "loss": 0.8888,
+ "step": 82
+ },
+ {
+ "epoch": 2.5625,
+ "eval_loss": 0.7566561102867126,
+ "eval_runtime": 79.2089,
+ "eval_samples_per_second": 2.525,
+ "eval_steps_per_second": 0.316,
+ "step": 82
+ },
+ {
+ "epoch": 2.59375,
+ "grad_norm": 0.2502355211215609,
+ "learning_rate": 2e-05,
+ "loss": 0.8346,
+ "step": 83
+ },
+ {
+ "epoch": 2.59375,
+ "eval_loss": 0.7547345161437988,
+ "eval_runtime": 79.2691,
+ "eval_samples_per_second": 2.523,
+ "eval_steps_per_second": 0.315,
+ "step": 83
+ },
+ {
+ "epoch": 2.625,
+ "grad_norm": 0.25281184629018644,
+ "learning_rate": 2e-05,
+ "loss": 0.795,
+ "step": 84
+ },
+ {
+ "epoch": 2.625,
+ "eval_loss": 0.7527951598167419,
+ "eval_runtime": 79.4068,
+ "eval_samples_per_second": 2.519,
+ "eval_steps_per_second": 0.315,
+ "step": 84
+ },
+ {
+ "epoch": 2.65625,
+ "grad_norm": 0.24246729562645003,
+ "learning_rate": 2e-05,
+ "loss": 0.7649,
+ "step": 85
+ },
+ {
+ "epoch": 2.65625,
+ "eval_loss": 0.7509815096855164,
+ "eval_runtime": 79.1612,
+ "eval_samples_per_second": 2.526,
+ "eval_steps_per_second": 0.316,
+ "step": 85
+ },
+ {
+ "epoch": 2.6875,
+ "grad_norm": 0.27005475109453947,
+ "learning_rate": 2e-05,
+ "loss": 0.7964,
+ "step": 86
+ },
+ {
+ "epoch": 2.6875,
+ "eval_loss": 0.7485950589179993,
+ "eval_runtime": 80.0714,
+ "eval_samples_per_second": 2.498,
+ "eval_steps_per_second": 0.312,
+ "step": 86
+ },
+ {
+ "epoch": 2.71875,
+ "grad_norm": 0.2723492355800971,
+ "learning_rate": 2e-05,
+ "loss": 0.8117,
+ "step": 87
+ },
+ {
+ "epoch": 2.71875,
+ "eval_loss": 0.7459420561790466,
+ "eval_runtime": 79.4075,
+ "eval_samples_per_second": 2.519,
+ "eval_steps_per_second": 0.315,
+ "step": 87
+ },
+ {
+ "epoch": 2.75,
+ "grad_norm": 0.2946493898427159,
+ "learning_rate": 2e-05,
+ "loss": 0.8986,
+ "step": 88
+ },
+ {
+ "epoch": 2.75,
+ "eval_loss": 0.7436455488204956,
+ "eval_runtime": 79.3721,
+ "eval_samples_per_second": 2.52,
+ "eval_steps_per_second": 0.315,
+ "step": 88
+ },
+ {
+ "epoch": 2.78125,
+ "grad_norm": 0.26411214734213284,
+ "learning_rate": 2e-05,
+ "loss": 0.8145,
+ "step": 89
+ },
+ {
+ "epoch": 2.78125,
+ "eval_loss": 0.7424752712249756,
+ "eval_runtime": 79.2988,
+ "eval_samples_per_second": 2.522,
+ "eval_steps_per_second": 0.315,
+ "step": 89
+ },
+ {
+ "epoch": 2.8125,
+ "grad_norm": 0.27115747269014817,
+ "learning_rate": 2e-05,
+ "loss": 0.8457,
+ "step": 90
+ },
+ {
+ "epoch": 2.8125,
+ "eval_loss": 0.7416408658027649,
+ "eval_runtime": 79.4004,
+ "eval_samples_per_second": 2.519,
+ "eval_steps_per_second": 0.315,
+ "step": 90
+ },
+ {
+ "epoch": 2.84375,
+ "grad_norm": 0.25831877964821937,
+ "learning_rate": 2e-05,
+ "loss": 0.7568,
+ "step": 91
+ },
+ {
+ "epoch": 2.84375,
+ "eval_loss": 0.7404463291168213,
+ "eval_runtime": 81.7767,
+ "eval_samples_per_second": 2.446,
+ "eval_steps_per_second": 0.306,
+ "step": 91
+ },
+ {
+ "epoch": 2.875,
+ "grad_norm": 0.31273388454942935,
+ "learning_rate": 2e-05,
+ "loss": 0.8562,
+ "step": 92
+ },
+ {
+ "epoch": 2.875,
+ "eval_loss": 0.7384185791015625,
+ "eval_runtime": 82.3443,
+ "eval_samples_per_second": 2.429,
+ "eval_steps_per_second": 0.304,
+ "step": 92
+ },
+ {
+ "epoch": 2.90625,
+ "grad_norm": 0.2838267071008901,
+ "learning_rate": 2e-05,
+ "loss": 0.7869,
+ "step": 93
+ },
+ {
+ "epoch": 2.90625,
+ "eval_loss": 0.7366807460784912,
+ "eval_runtime": 82.2622,
+ "eval_samples_per_second": 2.431,
+ "eval_steps_per_second": 0.304,
+ "step": 93
+ },
+ {
+ "epoch": 2.9375,
+ "grad_norm": 0.28625827941831467,
+ "learning_rate": 2e-05,
+ "loss": 0.8618,
+ "step": 94
+ },
+ {
+ "epoch": 2.9375,
+ "eval_loss": 0.7357398867607117,
+ "eval_runtime": 81.9471,
+ "eval_samples_per_second": 2.441,
+ "eval_steps_per_second": 0.305,
+ "step": 94
+ },
+ {
+ "epoch": 2.96875,
+ "grad_norm": 0.25548002643954326,
+ "learning_rate": 2e-05,
+ "loss": 0.8085,
+ "step": 95
+ },
+ {
+ "epoch": 2.96875,
+ "eval_loss": 0.7356534004211426,
+ "eval_runtime": 82.1186,
+ "eval_samples_per_second": 2.436,
+ "eval_steps_per_second": 0.304,
+ "step": 95
+ },
+ {
+ "epoch": 3.0,
+ "grad_norm": 0.27081450830961107,
+ "learning_rate": 2e-05,
+ "loss": 0.7684,
+ "step": 96
+ },
+ {
+ "epoch": 3.0,
+ "eval_loss": 0.7346957921981812,
+ "eval_runtime": 81.5463,
+ "eval_samples_per_second": 2.453,
+ "eval_steps_per_second": 0.307,
+ "step": 96
+ },
+ {
+ "epoch": 3.03125,
+ "grad_norm": 0.2985486737236676,
+ "learning_rate": 2e-05,
+ "loss": 0.7274,
+ "step": 97
+ },
+ {
+ "epoch": 3.03125,
+ "eval_loss": 0.7325752377510071,
+ "eval_runtime": 81.7804,
+ "eval_samples_per_second": 2.446,
+ "eval_steps_per_second": 0.306,
+ "step": 97
+ },
+ {
+ "epoch": 3.0625,
+ "grad_norm": 0.29149719690624026,
+ "learning_rate": 2e-05,
+ "loss": 0.8119,
+ "step": 98
+ },
+ {
+ "epoch": 3.0625,
+ "eval_loss": 0.7298976182937622,
+ "eval_runtime": 76.2764,
+ "eval_samples_per_second": 2.622,
+ "eval_steps_per_second": 0.328,
+ "step": 98
+ },
+ {
+ "epoch": 3.09375,
+ "grad_norm": 0.25227859825215865,
+ "learning_rate": 2e-05,
+ "loss": 0.7888,
+ "step": 99
+ },
+ {
+ "epoch": 3.09375,
+ "eval_loss": 0.727373480796814,
+ "eval_runtime": 76.2418,
+ "eval_samples_per_second": 2.623,
+ "eval_steps_per_second": 0.328,
+ "step": 99
+ },
+ {
+ "epoch": 3.125,
+ "grad_norm": 0.27316954971752555,
+ "learning_rate": 2e-05,
+ "loss": 0.8224,
+ "step": 100
+ },
+ {
+ "epoch": 3.125,
+ "eval_loss": 0.7254325747489929,
+ "eval_runtime": 76.1474,
+ "eval_samples_per_second": 2.626,
+ "eval_steps_per_second": 0.328,
+ "step": 100
+ },
+ {
+ "epoch": 3.15625,
+ "grad_norm": 0.24239788607957785,
+ "learning_rate": 2e-05,
+ "loss": 0.7535,
+ "step": 101
+ },
+ {
+ "epoch": 3.15625,
+ "eval_loss": 0.724058985710144,
+ "eval_runtime": 76.2391,
+ "eval_samples_per_second": 2.623,
+ "eval_steps_per_second": 0.328,
+ "step": 101
+ },
+ {
+ "epoch": 3.1875,
+ "grad_norm": 0.25648385925427025,
+ "learning_rate": 2e-05,
+ "loss": 0.8195,
+ "step": 102
+ },
+ {
+ "epoch": 3.1875,
+ "eval_loss": 0.7235870957374573,
+ "eval_runtime": 76.9134,
+ "eval_samples_per_second": 2.6,
+ "eval_steps_per_second": 0.325,
+ "step": 102
+ },
+ {
+ "epoch": 3.21875,
+ "grad_norm": 0.29620170789161204,
+ "learning_rate": 2e-05,
+ "loss": 0.8224,
+ "step": 103
+ },
+ {
+ "epoch": 3.21875,
+ "eval_loss": 0.7228152751922607,
+ "eval_runtime": 76.095,
+ "eval_samples_per_second": 2.628,
+ "eval_steps_per_second": 0.329,
+ "step": 103
+ },
+ {
+ "epoch": 3.25,
+ "grad_norm": 0.3484116181139593,
+ "learning_rate": 2e-05,
+ "loss": 0.7478,
+ "step": 104
+ },
+ {
+ "epoch": 3.25,
+ "eval_loss": 0.7209363579750061,
+ "eval_runtime": 76.9377,
+ "eval_samples_per_second": 2.6,
+ "eval_steps_per_second": 0.325,
+ "step": 104
+ },
+ {
+ "epoch": 3.28125,
+ "grad_norm": 0.25212350156184643,
+ "learning_rate": 2e-05,
+ "loss": 0.7885,
+ "step": 105
+ },
+ {
+ "epoch": 3.28125,
+ "eval_loss": 0.7197096347808838,
+ "eval_runtime": 76.2008,
+ "eval_samples_per_second": 2.625,
+ "eval_steps_per_second": 0.328,
+ "step": 105
+ },
+ {
+ "epoch": 3.3125,
+ "grad_norm": 0.264200147608962,
+ "learning_rate": 2e-05,
+ "loss": 0.8371,
+ "step": 106
+ },
+ {
+ "epoch": 3.3125,
+ "eval_loss": 0.7197055220603943,
+ "eval_runtime": 78.1542,
+ "eval_samples_per_second": 2.559,
+ "eval_steps_per_second": 0.32,
+ "step": 106
+ },
+ {
+ "epoch": 3.34375,
+ "grad_norm": 0.3309431084940201,
+ "learning_rate": 2e-05,
+ "loss": 0.6999,
+ "step": 107
+ },
+ {
+ "epoch": 3.34375,
+ "eval_loss": 0.7187016010284424,
+ "eval_runtime": 78.4259,
+ "eval_samples_per_second": 2.55,
+ "eval_steps_per_second": 0.319,
+ "step": 107
+ },
+ {
+ "epoch": 3.375,
+ "grad_norm": 0.3131644456919823,
+ "learning_rate": 2e-05,
+ "loss": 0.7587,
+ "step": 108
+ },
+ {
+ "epoch": 3.375,
+ "eval_loss": 0.717018187046051,
+ "eval_runtime": 78.4558,
+ "eval_samples_per_second": 2.549,
+ "eval_steps_per_second": 0.319,
+ "step": 108
+ },
+ {
+ "epoch": 3.40625,
+ "grad_norm": 0.33527684120780293,
+ "learning_rate": 2e-05,
+ "loss": 0.7468,
+ "step": 109
+ },
+ {
+ "epoch": 3.40625,
+ "eval_loss": 0.7147062420845032,
+ "eval_runtime": 78.2334,
+ "eval_samples_per_second": 2.556,
+ "eval_steps_per_second": 0.32,
+ "step": 109
+ },
+ {
+ "epoch": 3.4375,
+ "grad_norm": 0.29542683956231724,
+ "learning_rate": 2e-05,
+ "loss": 0.7477,
+ "step": 110
+ },
+ {
+ "epoch": 3.4375,
+ "eval_loss": 0.7130224704742432,
+ "eval_runtime": 79.1179,
+ "eval_samples_per_second": 2.528,
+ "eval_steps_per_second": 0.316,
+ "step": 110
+ },
+ {
+ "epoch": 3.46875,
+ "grad_norm": 0.31128698002926114,
+ "learning_rate": 2e-05,
+ "loss": 0.8153,
+ "step": 111
+ },
+ {
+ "epoch": 3.46875,
+ "eval_loss": 0.7120551466941833,
+ "eval_runtime": 80.292,
+ "eval_samples_per_second": 2.491,
+ "eval_steps_per_second": 0.311,
+ "step": 111
+ },
+ {
+ "epoch": 3.5,
+ "grad_norm": 0.32502558864214215,
+ "learning_rate": 2e-05,
+ "loss": 0.8043,
+ "step": 112
+ },
+ {
+ "epoch": 3.5,
+ "eval_loss": 0.7117202877998352,
+ "eval_runtime": 79.7539,
+ "eval_samples_per_second": 2.508,
+ "eval_steps_per_second": 0.313,
+ "step": 112
+ },
+ {
+ "epoch": 3.53125,
+ "grad_norm": 0.34335720855758517,
+ "learning_rate": 2e-05,
+ "loss": 0.871,
+ "step": 113
+ },
+ {
+ "epoch": 3.53125,
+ "eval_loss": 0.7117029428482056,
+ "eval_runtime": 80.0281,
+ "eval_samples_per_second": 2.499,
+ "eval_steps_per_second": 0.312,
+ "step": 113
+ },
+ {
+ "epoch": 3.5625,
+ "grad_norm": 0.31951931695644,
+ "learning_rate": 2e-05,
+ "loss": 0.7453,
+ "step": 114
+ },
+ {
+ "epoch": 3.5625,
+ "eval_loss": 0.7116554379463196,
+ "eval_runtime": 79.7209,
+ "eval_samples_per_second": 2.509,
+ "eval_steps_per_second": 0.314,
+ "step": 114
+ },
+ {
+ "epoch": 3.59375,
+ "grad_norm": 0.28067192963874266,
+ "learning_rate": 2e-05,
+ "loss": 0.8045,
+ "step": 115
+ },
+ {
+ "epoch": 3.59375,
+ "eval_loss": 0.7118353843688965,
+ "eval_runtime": 80.0195,
+ "eval_samples_per_second": 2.499,
+ "eval_steps_per_second": 0.312,
+ "step": 115
+ },
+ {
+ "epoch": 3.625,
+ "grad_norm": 0.2739718257400276,
+ "learning_rate": 2e-05,
+ "loss": 0.775,
+ "step": 116
+ },
+ {
+ "epoch": 3.625,
+ "eval_loss": 0.7122579216957092,
+ "eval_runtime": 76.2052,
+ "eval_samples_per_second": 2.624,
+ "eval_steps_per_second": 0.328,
+ "step": 116
+ },
+ {
+ "epoch": 3.65625,
+ "grad_norm": 0.31401723658881836,
+ "learning_rate": 2e-05,
+ "loss": 0.7826,
+ "step": 117
+ },
+ {
+ "epoch": 3.65625,
+ "eval_loss": 0.7118574380874634,
+ "eval_runtime": 76.1509,
+ "eval_samples_per_second": 2.626,
+ "eval_steps_per_second": 0.328,
+ "step": 117
+ },
+ {
+ "epoch": 3.6875,
+ "grad_norm": 0.36925964858634625,
+ "learning_rate": 2e-05,
+ "loss": 0.7884,
+ "step": 118
+ },
+ {
+ "epoch": 3.6875,
+ "eval_loss": 0.710691511631012,
+ "eval_runtime": 76.2305,
+ "eval_samples_per_second": 2.624,
+ "eval_steps_per_second": 0.328,
+ "step": 118
+ },
+ {
+ "epoch": 3.71875,
+ "grad_norm": 0.3050583880654791,
+ "learning_rate": 2e-05,
+ "loss": 0.8402,
+ "step": 119
+ },
+ {
+ "epoch": 3.71875,
+ "eval_loss": 0.7096763849258423,
+ "eval_runtime": 77.0581,
+ "eval_samples_per_second": 2.595,
+ "eval_steps_per_second": 0.324,
+ "step": 119
+ },
+ {
+ "epoch": 3.75,
+ "grad_norm": 0.2648625651290031,
+ "learning_rate": 2e-05,
+ "loss": 0.7889,
+ "step": 120
+ },
+ {
+ "epoch": 3.75,
+ "eval_loss": 0.7094223499298096,
+ "eval_runtime": 76.1379,
+ "eval_samples_per_second": 2.627,
+ "eval_steps_per_second": 0.328,
+ "step": 120
+ },
+ {
+ "epoch": 3.78125,
+ "grad_norm": 0.3107221696449271,
+ "learning_rate": 2e-05,
+ "loss": 0.7615,
+ "step": 121
+ },
+ {
+ "epoch": 3.78125,
+ "eval_loss": 0.7081363201141357,
+ "eval_runtime": 76.626,
+ "eval_samples_per_second": 2.61,
+ "eval_steps_per_second": 0.326,
+ "step": 121
+ },
+ {
+ "epoch": 3.8125,
+ "grad_norm": 0.3455151299995048,
+ "learning_rate": 2e-05,
+ "loss": 0.8342,
+ "step": 122
+ },
+ {
+ "epoch": 3.8125,
+ "eval_loss": 0.7063001990318298,
+ "eval_runtime": 77.0293,
+ "eval_samples_per_second": 2.596,
+ "eval_steps_per_second": 0.325,
+ "step": 122
+ },
+ {
+ "epoch": 3.84375,
+ "grad_norm": 0.28847071926472523,
+ "learning_rate": 2e-05,
+ "loss": 0.7477,
+ "step": 123
+ },
+ {
+ "epoch": 3.84375,
+ "eval_loss": 0.7044610381126404,
+ "eval_runtime": 76.2385,
+ "eval_samples_per_second": 2.623,
+ "eval_steps_per_second": 0.328,
+ "step": 123
+ },
+ {
+ "epoch": 3.875,
+ "grad_norm": 0.26753816515069856,
+ "learning_rate": 2e-05,
+ "loss": 0.7653,
+ "step": 124
+ },
+ {
+ "epoch": 3.875,
+ "eval_loss": 0.7033799886703491,
+ "eval_runtime": 76.1985,
+ "eval_samples_per_second": 2.625,
+ "eval_steps_per_second": 0.328,
+ "step": 124
+ },
+ {
+ "epoch": 3.90625,
+ "grad_norm": 0.3465046292893005,
+ "learning_rate": 2e-05,
+ "loss": 0.8144,
+ "step": 125
+ },
+ {
+ "epoch": 3.90625,
+ "eval_loss": 0.7021930813789368,
+ "eval_runtime": 76.2234,
+ "eval_samples_per_second": 2.624,
+ "eval_steps_per_second": 0.328,
+ "step": 125
+ },
+ {
+ "epoch": 3.9375,
+ "grad_norm": 0.3451690427620698,
+ "learning_rate": 2e-05,
+ "loss": 0.7871,
+ "step": 126
+ },
+ {
+ "epoch": 3.9375,
+ "eval_loss": 0.7013542652130127,
+ "eval_runtime": 78.0752,
+ "eval_samples_per_second": 2.562,
+ "eval_steps_per_second": 0.32,
+ "step": 126
+ },
+ {
+ "epoch": 3.96875,
+ "grad_norm": 0.31571858642673567,
+ "learning_rate": 2e-05,
+ "loss": 0.7568,
+ "step": 127
+ },
+ {
+ "epoch": 3.96875,
+ "eval_loss": 0.7007560729980469,
+ "eval_runtime": 78.3558,
+ "eval_samples_per_second": 2.552,
+ "eval_steps_per_second": 0.319,
+ "step": 127
+ },
+ {
+ "epoch": 4.0,
+ "grad_norm": 0.3247003540270338,
+ "learning_rate": 2e-05,
+ "loss": 0.6714,
+ "step": 128
+ },
+ {
+ "epoch": 4.0,
+ "eval_loss": 0.6999780535697937,
+ "eval_runtime": 78.9788,
+ "eval_samples_per_second": 2.532,
+ "eval_steps_per_second": 0.317,
+ "step": 128
+ },
+ {
+ "epoch": 4.03125,
+ "grad_norm": 0.2814983490019739,
+ "learning_rate": 2e-05,
+ "loss": 0.7797,
+ "step": 129
+ },
+ {
+ "epoch": 4.03125,
+ "eval_loss": 0.6998200416564941,
+ "eval_runtime": 78.3093,
+ "eval_samples_per_second": 2.554,
+ "eval_steps_per_second": 0.319,
+ "step": 129
+ },
+ {
+ "epoch": 4.0625,
+ "grad_norm": 0.31961631715145106,
+ "learning_rate": 2e-05,
+ "loss": 0.7993,
+ "step": 130
+ },
+ {
+ "epoch": 4.0625,
+ "eval_loss": 0.6995271444320679,
+ "eval_runtime": 78.2172,
+ "eval_samples_per_second": 2.557,
+ "eval_steps_per_second": 0.32,
+ "step": 130
+ },
+ {
+ "epoch": 4.09375,
+ "grad_norm": 0.32333364662215863,
+ "learning_rate": 2e-05,
+ "loss": 0.7896,
+ "step": 131
+ },
+ {
+ "epoch": 4.09375,
+ "eval_loss": 0.6992727518081665,
+ "eval_runtime": 79.0125,
+ "eval_samples_per_second": 2.531,
+ "eval_steps_per_second": 0.316,
+ "step": 131
+ },
+ {
+ "epoch": 4.125,
+ "grad_norm": 0.3255859640449829,
+ "learning_rate": 2e-05,
+ "loss": 0.7542,
+ "step": 132
+ },
+ {
+ "epoch": 4.125,
+ "eval_loss": 0.6988572478294373,
+ "eval_runtime": 79.0,
+ "eval_samples_per_second": 2.532,
+ "eval_steps_per_second": 0.316,
+ "step": 132
+ },
+ {
+ "epoch": 4.15625,
+ "grad_norm": 0.3307068947429175,
+ "learning_rate": 2e-05,
+ "loss": 0.8416,
+ "step": 133
+ },
+ {
+ "epoch": 4.15625,
+ "eval_loss": 0.6981343030929565,
+ "eval_runtime": 78.3309,
+ "eval_samples_per_second": 2.553,
+ "eval_steps_per_second": 0.319,
+ "step": 133
+ },
+ {
+ "epoch": 4.1875,
+ "grad_norm": 0.3842303818116732,
+ "learning_rate": 2e-05,
+ "loss": 0.7605,
+ "step": 134
+ },
+ {
+ "epoch": 4.1875,
+ "eval_loss": 0.6968980431556702,
+ "eval_runtime": 78.5608,
+ "eval_samples_per_second": 2.546,
+ "eval_steps_per_second": 0.318,
+ "step": 134
+ },
+ {
+ "epoch": 4.21875,
+ "grad_norm": 0.331839472419003,
+ "learning_rate": 2e-05,
+ "loss": 0.7643,
+ "step": 135
+ },
+ {
+ "epoch": 4.21875,
+ "eval_loss": 0.6955949664115906,
+ "eval_runtime": 78.3566,
+ "eval_samples_per_second": 2.552,
+ "eval_steps_per_second": 0.319,
+ "step": 135
+ },
+ {
+ "epoch": 4.25,
+ "grad_norm": 0.31864813130499836,
+ "learning_rate": 2e-05,
+ "loss": 0.7369,
+ "step": 136
+ },
+ {
+ "epoch": 4.25,
+ "eval_loss": 0.6951528787612915,
+ "eval_runtime": 79.7802,
+ "eval_samples_per_second": 2.507,
+ "eval_steps_per_second": 0.313,
+ "step": 136
+ },
+ {
+ "epoch": 4.28125,
+ "grad_norm": 0.352549164434451,
+ "learning_rate": 2e-05,
+ "loss": 0.7332,
+ "step": 137
+ },
+ {
+ "epoch": 4.28125,
+ "eval_loss": 0.6947290897369385,
+ "eval_runtime": 79.8171,
+ "eval_samples_per_second": 2.506,
+ "eval_steps_per_second": 0.313,
+ "step": 137
+ },
+ {
+ "epoch": 4.3125,
+ "grad_norm": 0.37128812818896284,
+ "learning_rate": 2e-05,
+ "loss": 0.7542,
+ "step": 138
+ },
+ {
+ "epoch": 4.3125,
+ "eval_loss": 0.6937370300292969,
+ "eval_runtime": 79.7782,
+ "eval_samples_per_second": 2.507,
+ "eval_steps_per_second": 0.313,
+ "step": 138
+ },
+ {
+ "epoch": 4.34375,
+ "grad_norm": 0.3348014941412048,
+ "learning_rate": 2e-05,
+ "loss": 0.7079,
+ "step": 139
+ },
+ {
+ "epoch": 4.34375,
+ "eval_loss": 0.692456066608429,
+ "eval_runtime": 79.9308,
+ "eval_samples_per_second": 2.502,
+ "eval_steps_per_second": 0.313,
+ "step": 139
+ },
+ {
+ "epoch": 4.375,
+ "grad_norm": 0.34411051658527964,
+ "learning_rate": 2e-05,
+ "loss": 0.7465,
+ "step": 140
+ },
+ {
+ "epoch": 4.375,
+ "eval_loss": 0.6915809512138367,
+ "eval_runtime": 79.943,
+ "eval_samples_per_second": 2.502,
+ "eval_steps_per_second": 0.313,
+ "step": 140
+ },
+ {
+ "epoch": 4.40625,
+ "grad_norm": 0.3373909601921749,
+ "learning_rate": 2e-05,
+ "loss": 0.7648,
+ "step": 141
+ },
+ {
+ "epoch": 4.40625,
+ "eval_loss": 0.6912103295326233,
+ "eval_runtime": 79.8515,
+ "eval_samples_per_second": 2.505,
+ "eval_steps_per_second": 0.313,
+ "step": 141
+ },
+ {
+ "epoch": 4.4375,
+ "grad_norm": 0.33253827371305456,
+ "learning_rate": 2e-05,
+ "loss": 0.7224,
+ "step": 142
+ },
+ {
+ "epoch": 4.4375,
+ "eval_loss": 0.6912806630134583,
+ "eval_runtime": 80.6475,
+ "eval_samples_per_second": 2.48,
+ "eval_steps_per_second": 0.31,
+ "step": 142
+ },
+ {
+ "epoch": 4.46875,
+ "grad_norm": 0.38458075172588313,
+ "learning_rate": 2e-05,
+ "loss": 0.7261,
+ "step": 143
+ },
+ {
+ "epoch": 4.46875,
+ "eval_loss": 0.6905419230461121,
+ "eval_runtime": 80.2606,
+ "eval_samples_per_second": 2.492,
+ "eval_steps_per_second": 0.311,
+ "step": 143
+ },
+ {
+ "epoch": 4.5,
+ "grad_norm": 0.31351962640463144,
+ "learning_rate": 2e-05,
+ "loss": 0.6909,
+ "step": 144
+ },
+ {
+ "epoch": 4.5,
+ "eval_loss": 0.6898491382598877,
+ "eval_runtime": 79.9965,
+ "eval_samples_per_second": 2.5,
+ "eval_steps_per_second": 0.313,
+ "step": 144
+ },
+ {
+ "epoch": 4.53125,
+ "grad_norm": 0.35474372115704583,
+ "learning_rate": 2e-05,
+ "loss": 0.7605,
+ "step": 145
+ },
+ {
+ "epoch": 4.53125,
+ "eval_loss": 0.6893147230148315,
+ "eval_runtime": 1475.5758,
+ "eval_samples_per_second": 0.136,
+ "eval_steps_per_second": 0.017,
+ "step": 145
+ },
+ {
+ "epoch": 4.5625,
+ "grad_norm": 0.3479568917421202,
+ "learning_rate": 2e-05,
+ "loss": 0.6638,
+ "step": 146
+ },
+ {
+ "epoch": 4.5625,
+ "eval_loss": 0.6884538531303406,
+ "eval_runtime": 84.6835,
+ "eval_samples_per_second": 2.362,
+ "eval_steps_per_second": 0.295,
+ "step": 146
+ },
+ {
+ "epoch": 4.59375,
+ "grad_norm": 0.3421823344428645,
+ "learning_rate": 2e-05,
+ "loss": 0.7339,
+ "step": 147
+ },
+ {
+ "epoch": 4.59375,
+ "eval_loss": 0.6873475909233093,
+ "eval_runtime": 83.3138,
+ "eval_samples_per_second": 2.401,
+ "eval_steps_per_second": 0.3,
+ "step": 147
+ },
+ {
+ "epoch": 4.625,
+ "grad_norm": 0.3642187020830788,
+ "learning_rate": 2e-05,
+ "loss": 0.6825,
+ "step": 148
+ },
+ {
+ "epoch": 4.625,
+ "eval_loss": 0.6858401298522949,
+ "eval_runtime": 82.1066,
+ "eval_samples_per_second": 2.436,
+ "eval_steps_per_second": 0.304,
+ "step": 148
+ },
+ {
+ "epoch": 4.65625,
+ "grad_norm": 0.35097547901391785,
+ "learning_rate": 2e-05,
+ "loss": 0.7986,
+ "step": 149
+ },
+ {
+ "epoch": 4.65625,
+ "eval_loss": 0.6848779320716858,
+ "eval_runtime": 84.4076,
+ "eval_samples_per_second": 2.369,
+ "eval_steps_per_second": 0.296,
+ "step": 149
+ },
+ {
+ "epoch": 4.6875,
+ "grad_norm": 0.3568694843794629,
+ "learning_rate": 2e-05,
+ "loss": 0.7176,
+ "step": 150
+ },
+ {
+ "epoch": 4.6875,
+ "eval_loss": 0.6842290759086609,
+ "eval_runtime": 82.5945,
+ "eval_samples_per_second": 2.421,
+ "eval_steps_per_second": 0.303,
+ "step": 150
+ },
+ {
+ "epoch": 4.71875,
+ "grad_norm": 0.34258633585260334,
+ "learning_rate": 2e-05,
+ "loss": 0.7363,
+ "step": 151
+ },
+ {
+ "epoch": 4.71875,
+ "eval_loss": 0.6838659048080444,
+ "eval_runtime": 85.9626,
+ "eval_samples_per_second": 2.327,
+ "eval_steps_per_second": 0.291,
+ "step": 151
+ },
+ {
+ "epoch": 4.75,
+ "grad_norm": 0.42319523894659655,
+ "learning_rate": 2e-05,
+ "loss": 0.7675,
+ "step": 152
+ },
+ {
+ "epoch": 4.75,
+ "eval_loss": 0.6830299496650696,
+ "eval_runtime": 85.7189,
+ "eval_samples_per_second": 2.333,
+ "eval_steps_per_second": 0.292,
+ "step": 152
+ },
+ {
+ "epoch": 4.78125,
+ "grad_norm": 0.3632195533127194,
+ "learning_rate": 2e-05,
+ "loss": 0.715,
+ "step": 153
+ },
+ {
+ "epoch": 4.78125,
+ "eval_loss": 0.6826379895210266,
+ "eval_runtime": 87.8244,
+ "eval_samples_per_second": 2.277,
+ "eval_steps_per_second": 0.285,
+ "step": 153
+ },
+ {
+ "epoch": 4.8125,
+ "grad_norm": 0.3738308004604413,
+ "learning_rate": 2e-05,
+ "loss": 0.7344,
+ "step": 154
+ },
+ {
+ "epoch": 4.8125,
+ "eval_loss": 0.6826817393302917,
+ "eval_runtime": 86.5822,
+ "eval_samples_per_second": 2.31,
+ "eval_steps_per_second": 0.289,
+ "step": 154
+ },
+ {
+ "epoch": 4.84375,
+ "grad_norm": 0.3618696330632776,
+ "learning_rate": 2e-05,
+ "loss": 0.6632,
+ "step": 155
+ },
+ {
+ "epoch": 4.84375,
+ "eval_loss": 0.6827967166900635,
+ "eval_runtime": 82.1829,
+ "eval_samples_per_second": 2.434,
+ "eval_steps_per_second": 0.304,
+ "step": 155
+ },
+ {
+ "epoch": 4.875,
+ "grad_norm": 0.38901912569992203,
+ "learning_rate": 2e-05,
+ "loss": 0.7788,
+ "step": 156
+ },
+ {
+ "epoch": 4.875,
+ "eval_loss": 0.6821711659431458,
+ "eval_runtime": 84.4511,
+ "eval_samples_per_second": 2.368,
+ "eval_steps_per_second": 0.296,
+ "step": 156
+ },
+ {
+ "epoch": 4.90625,
+ "grad_norm": 0.3516096507348829,
+ "learning_rate": 2e-05,
+ "loss": 0.7794,
+ "step": 157
+ },
+ {
+ "epoch": 4.90625,
+ "eval_loss": 0.6819837689399719,
+ "eval_runtime": 84.1594,
+ "eval_samples_per_second": 2.376,
+ "eval_steps_per_second": 0.297,
+ "step": 157
+ },
+ {
+ "epoch": 4.9375,
+ "grad_norm": 0.36066902463794986,
+ "learning_rate": 2e-05,
+ "loss": 0.7674,
+ "step": 158
+ },
+ {
+ "epoch": 4.9375,
+ "eval_loss": 0.6817716956138611,
+ "eval_runtime": 83.8929,
+ "eval_samples_per_second": 2.384,
+ "eval_steps_per_second": 0.298,
+ "step": 158
+ },
+ {
+ "epoch": 4.96875,
+ "grad_norm": 0.36641784926154175,
+ "learning_rate": 2e-05,
+ "loss": 0.7116,
+ "step": 159
+ },
+ {
+ "epoch": 4.96875,
+ "eval_loss": 0.6816902160644531,
+ "eval_runtime": 84.4431,
+ "eval_samples_per_second": 2.368,
+ "eval_steps_per_second": 0.296,
+ "step": 159
+ },
+ {
+ "epoch": 5.0,
+ "grad_norm": 0.4020716293225933,
+ "learning_rate": 2e-05,
+ "loss": 0.7142,
+ "step": 160
+ },
+ {
+ "epoch": 5.0,
+ "eval_loss": 0.6811469793319702,
+ "eval_runtime": 86.0681,
+ "eval_samples_per_second": 2.324,
+ "eval_steps_per_second": 0.29,
+ "step": 160
+ },
+ {
+ "epoch": 5.03125,
+ "grad_norm": 0.38360882669254054,
+ "learning_rate": 2e-05,
+ "loss": 0.6756,
+ "step": 161
+ },
+ {
+ "epoch": 5.03125,
+ "eval_loss": 0.6798409223556519,
+ "eval_runtime": 81.9903,
+ "eval_samples_per_second": 2.439,
+ "eval_steps_per_second": 0.305,
+ "step": 161
+ },
+ {
+ "epoch": 5.0625,
+ "grad_norm": 0.34966156213066135,
+ "learning_rate": 2e-05,
+ "loss": 0.827,
+ "step": 162
+ },
+ {
+ "epoch": 5.0625,
+ "eval_loss": 0.6788859367370605,
+ "eval_runtime": 76.1753,
+ "eval_samples_per_second": 2.626,
+ "eval_steps_per_second": 0.328,
+ "step": 162
+ },
+ {
+ "epoch": 5.09375,
+ "grad_norm": 0.41140842939901384,
+ "learning_rate": 2e-05,
+ "loss": 0.6409,
+ "step": 163
+ },
+ {
+ "epoch": 5.09375,
+ "eval_loss": 0.6787077188491821,
+ "eval_runtime": 76.2239,
+ "eval_samples_per_second": 2.624,
+ "eval_steps_per_second": 0.328,
+ "step": 163
+ },
+ {
+ "epoch": 5.125,
+ "grad_norm": 0.4222084070163774,
+ "learning_rate": 2e-05,
+ "loss": 0.7774,
+ "step": 164
+ },
+ {
+ "epoch": 5.125,
+ "eval_loss": 0.6796822547912598,
+ "eval_runtime": 76.2141,
+ "eval_samples_per_second": 2.624,
+ "eval_steps_per_second": 0.328,
+ "step": 164
+ },
+ {
+ "epoch": 5.15625,
+ "grad_norm": 0.4644454724424921,
+ "learning_rate": 2e-05,
+ "loss": 0.6057,
+ "step": 165
+ },
+ {
+ "epoch": 5.15625,
+ "eval_loss": 0.6794346570968628,
+ "eval_runtime": 76.3216,
+ "eval_samples_per_second": 2.62,
+ "eval_steps_per_second": 0.328,
+ "step": 165
+ },
+ {
+ "epoch": 5.1875,
+ "grad_norm": 0.46128725263272996,
+ "learning_rate": 2e-05,
+ "loss": 0.7158,
+ "step": 166
+ },
+ {
+ "epoch": 5.1875,
+ "eval_loss": 0.6791612505912781,
+ "eval_runtime": 78.4909,
+ "eval_samples_per_second": 2.548,
+ "eval_steps_per_second": 0.319,
+ "step": 166
+ },
+ {
+ "epoch": 5.21875,
+ "grad_norm": 0.37300666872025545,
+ "learning_rate": 2e-05,
+ "loss": 0.7363,
+ "step": 167
+ },
+ {
+ "epoch": 5.21875,
+ "eval_loss": 0.6788016557693481,
+ "eval_runtime": 78.5697,
+ "eval_samples_per_second": 2.546,
+ "eval_steps_per_second": 0.318,
+ "step": 167
+ },
+ {
+ "epoch": 5.25,
+ "grad_norm": 0.41454648576180214,
+ "learning_rate": 2e-05,
+ "loss": 0.7759,
+ "step": 168
+ },
+ {
+ "epoch": 5.25,
+ "eval_loss": 0.6787048578262329,
+ "eval_runtime": 78.5317,
+ "eval_samples_per_second": 2.547,
+ "eval_steps_per_second": 0.318,
+ "step": 168
+ },
+ {
+ "epoch": 5.28125,
+ "grad_norm": 0.40724665091386236,
+ "learning_rate": 2e-05,
+ "loss": 0.6944,
+ "step": 169
+ },
+ {
+ "epoch": 5.28125,
+ "eval_loss": 0.679679811000824,
+ "eval_runtime": 78.6899,
+ "eval_samples_per_second": 2.542,
+ "eval_steps_per_second": 0.318,
+ "step": 169
+ },
+ {
+ "epoch": 5.3125,
+ "grad_norm": 0.3875110486208986,
+ "learning_rate": 2e-05,
+ "loss": 0.6634,
+ "step": 170
+ },
+ {
+ "epoch": 5.3125,
+ "eval_loss": 0.6819935441017151,
+ "eval_runtime": 78.3617,
+ "eval_samples_per_second": 2.552,
+ "eval_steps_per_second": 0.319,
+ "step": 170
+ },
+ {
+ "epoch": 5.34375,
+ "grad_norm": 0.47956532155617193,
+ "learning_rate": 2e-05,
+ "loss": 0.687,
+ "step": 171
+ },
+ {
+ "epoch": 5.34375,
+ "eval_loss": 0.6825206875801086,
+ "eval_runtime": 78.4435,
+ "eval_samples_per_second": 2.55,
+ "eval_steps_per_second": 0.319,
+ "step": 171
+ },
+ {
+ "epoch": 5.375,
+ "grad_norm": 0.4599359590587781,
+ "learning_rate": 2e-05,
+ "loss": 0.7718,
+ "step": 172
+ },
+ {
+ "epoch": 5.375,
+ "eval_loss": 0.6816768050193787,
+ "eval_runtime": 78.3005,
+ "eval_samples_per_second": 2.554,
+ "eval_steps_per_second": 0.319,
+ "step": 172
+ },
+ {
+ "epoch": 5.40625,
+ "grad_norm": 0.4057490487995386,
+ "learning_rate": 2e-05,
+ "loss": 0.7292,
+ "step": 173
+ },
+ {
+ "epoch": 5.40625,
+ "eval_loss": 0.6806090474128723,
+ "eval_runtime": 78.3313,
+ "eval_samples_per_second": 2.553,
+ "eval_steps_per_second": 0.319,
+ "step": 173
+ },
+ {
+ "epoch": 5.4375,
+ "grad_norm": 0.4143979315360467,
+ "learning_rate": 2e-05,
+ "loss": 0.7697,
+ "step": 174
+ },
+ {
+ "epoch": 5.4375,
+ "eval_loss": 0.6795693039894104,
+ "eval_runtime": 78.4526,
+ "eval_samples_per_second": 2.549,
+ "eval_steps_per_second": 0.319,
+ "step": 174
+ },
+ {
+ "epoch": 5.46875,
+ "grad_norm": 0.4219663662343445,
+ "learning_rate": 2e-05,
+ "loss": 0.7534,
+ "step": 175
+ },
+ {
+ "epoch": 5.46875,
+ "eval_loss": 0.6793847680091858,
+ "eval_runtime": 78.8009,
+ "eval_samples_per_second": 2.538,
+ "eval_steps_per_second": 0.317,
+ "step": 175
+ },
+ {
+ "epoch": 5.5,
+ "grad_norm": 0.4491811321927657,
+ "learning_rate": 2e-05,
+ "loss": 0.7004,
+ "step": 176
+ },
+ {
+ "epoch": 5.5,
+ "eval_loss": 0.6775352358818054,
+ "eval_runtime": 80.0685,
+ "eval_samples_per_second": 2.498,
+ "eval_steps_per_second": 0.312,
+ "step": 176
+ },
+ {
+ "epoch": 5.53125,
+ "grad_norm": 0.46366516532638885,
+ "learning_rate": 2e-05,
+ "loss": 0.7357,
+ "step": 177
+ },
+ {
+ "epoch": 5.53125,
+ "eval_loss": 0.6748698949813843,
+ "eval_runtime": 80.0487,
+ "eval_samples_per_second": 2.498,
+ "eval_steps_per_second": 0.312,
+ "step": 177
+ },
+ {
+ "epoch": 5.5625,
+ "grad_norm": 0.3815188640227797,
+ "learning_rate": 2e-05,
+ "loss": 0.7592,
+ "step": 178
+ },
+ {
+ "epoch": 5.5625,
+ "eval_loss": 0.6728273034095764,
+ "eval_runtime": 80.0318,
+ "eval_samples_per_second": 2.499,
+ "eval_steps_per_second": 0.312,
+ "step": 178
+ },
+ {
+ "epoch": 5.59375,
+ "grad_norm": 0.41025429416666304,
+ "learning_rate": 2e-05,
+ "loss": 0.6585,
+ "step": 179
+ },
+ {
+ "epoch": 5.59375,
+ "eval_loss": 0.6718859672546387,
+ "eval_runtime": 79.8801,
+ "eval_samples_per_second": 2.504,
+ "eval_steps_per_second": 0.313,
+ "step": 179
+ },
+ {
+ "epoch": 5.625,
+ "grad_norm": 0.40652817592240054,
+ "learning_rate": 2e-05,
+ "loss": 0.6611,
+ "step": 180
+ },
+ {
+ "epoch": 5.625,
+ "eval_loss": 0.6715708374977112,
+ "eval_runtime": 76.7261,
+ "eval_samples_per_second": 2.607,
+ "eval_steps_per_second": 0.326,
+ "step": 180
+ },
+ {
+ "epoch": 5.65625,
+ "grad_norm": 0.40753961326688415,
+ "learning_rate": 2e-05,
+ "loss": 0.6779,
+ "step": 181
+ },
+ {
+ "epoch": 5.65625,
+ "eval_loss": 0.6719761490821838,
+ "eval_runtime": 77.0136,
+ "eval_samples_per_second": 2.597,
+ "eval_steps_per_second": 0.325,
+ "step": 181
+ },
+ {
+ "epoch": 5.6875,
+ "grad_norm": 0.4232811980671673,
+ "learning_rate": 2e-05,
+ "loss": 0.6475,
+ "step": 182
+ },
+ {
+ "epoch": 5.6875,
+ "eval_loss": 0.6724664568901062,
+ "eval_runtime": 76.9731,
+ "eval_samples_per_second": 2.598,
+ "eval_steps_per_second": 0.325,
+ "step": 182
+ },
+ {
+ "epoch": 5.71875,
+ "grad_norm": 0.5132756318549849,
+ "learning_rate": 2e-05,
+ "loss": 0.6801,
+ "step": 183
+ },
+ {
+ "epoch": 5.71875,
+ "eval_loss": 0.6723365783691406,
+ "eval_runtime": 76.4132,
+ "eval_samples_per_second": 2.617,
+ "eval_steps_per_second": 0.327,
+ "step": 183
+ },
+ {
+ "epoch": 5.75,
+ "grad_norm": 0.43526879230161264,
+ "learning_rate": 2e-05,
+ "loss": 0.6673,
+ "step": 184
+ },
+ {
+ "epoch": 5.75,
+ "eval_loss": 0.672926664352417,
+ "eval_runtime": 76.1936,
+ "eval_samples_per_second": 2.625,
+ "eval_steps_per_second": 0.328,
+ "step": 184
+ },
+ {
+ "epoch": 5.78125,
+ "grad_norm": 0.46965560853038507,
+ "learning_rate": 2e-05,
+ "loss": 0.7074,
+ "step": 185
+ },
+ {
+ "epoch": 5.78125,
+ "eval_loss": 0.6731134057044983,
+ "eval_runtime": 76.2345,
+ "eval_samples_per_second": 2.623,
+ "eval_steps_per_second": 0.328,
+ "step": 185
+ },
+ {
+ "epoch": 5.8125,
+ "grad_norm": 0.4733296318676217,
+ "learning_rate": 2e-05,
+ "loss": 0.6791,
+ "step": 186
+ },
+ {
+ "epoch": 5.8125,
+ "eval_loss": 0.6726363301277161,
+ "eval_runtime": 78.3939,
+ "eval_samples_per_second": 2.551,
+ "eval_steps_per_second": 0.319,
+ "step": 186
+ },
+ {
+ "epoch": 5.84375,
+ "grad_norm": 0.4662943253655961,
+ "learning_rate": 2e-05,
+ "loss": 0.7371,
+ "step": 187
+ },
+ {
+ "epoch": 5.84375,
+ "eval_loss": 0.6726526021957397,
+ "eval_runtime": 79.1834,
+ "eval_samples_per_second": 2.526,
+ "eval_steps_per_second": 0.316,
+ "step": 187
+ },
+ {
+ "epoch": 5.875,
+ "grad_norm": 0.4420962889993382,
+ "learning_rate": 2e-05,
+ "loss": 0.675,
+ "step": 188
+ },
+ {
+ "epoch": 5.875,
+ "eval_loss": 0.6727125644683838,
+ "eval_runtime": 78.252,
+ "eval_samples_per_second": 2.556,
+ "eval_steps_per_second": 0.319,
+ "step": 188
+ },
+ {
+ "epoch": 5.90625,
+ "grad_norm": 0.4345166976944551,
+ "learning_rate": 2e-05,
+ "loss": 0.6748,
+ "step": 189
+ },
+ {
+ "epoch": 5.90625,
+ "eval_loss": 0.6725904941558838,
+ "eval_runtime": 78.3914,
+ "eval_samples_per_second": 2.551,
+ "eval_steps_per_second": 0.319,
+ "step": 189
+ },
+ {
+ "epoch": 5.9375,
+ "grad_norm": 0.45109463315374526,
+ "learning_rate": 2e-05,
+ "loss": 0.7024,
+ "step": 190
+ },
+ {
+ "epoch": 5.9375,
+ "eval_loss": 0.6718384027481079,
+ "eval_runtime": 78.4361,
+ "eval_samples_per_second": 2.55,
+ "eval_steps_per_second": 0.319,
+ "step": 190
+ },
+ {
+ "epoch": 5.96875,
+ "grad_norm": 0.42953871838795626,
+ "learning_rate": 2e-05,
+ "loss": 0.6904,
+ "step": 191
+ },
+ {
+ "epoch": 5.96875,
+ "eval_loss": 0.6703083515167236,
+ "eval_runtime": 78.3863,
+ "eval_samples_per_second": 2.551,
+ "eval_steps_per_second": 0.319,
+ "step": 191
+ },
+ {
+ "epoch": 6.0,
+ "grad_norm": 0.4248607379284984,
+ "learning_rate": 2e-05,
+ "loss": 0.6659,
+ "step": 192
+ },
+ {
+ "epoch": 6.0,
+ "eval_loss": 0.6693080067634583,
+ "eval_runtime": 78.4373,
+ "eval_samples_per_second": 2.55,
+ "eval_steps_per_second": 0.319,
+ "step": 192
+ },
+ {
+ "epoch": 6.03125,
+ "grad_norm": 0.42839417453459494,
+ "learning_rate": 2e-05,
+ "loss": 0.7457,
+ "step": 193
+ },
+ {
+ "epoch": 6.03125,
+ "eval_loss": 0.6689594984054565,
+ "eval_runtime": 78.4169,
+ "eval_samples_per_second": 2.55,
+ "eval_steps_per_second": 0.319,
+ "step": 193
+ },
+ {
+ "epoch": 6.0625,
+ "grad_norm": 0.4216922788166874,
+ "learning_rate": 2e-05,
+ "loss": 0.7189,
+ "step": 194
+ },
+ {
+ "epoch": 6.0625,
+ "eval_loss": 0.6689300537109375,
+ "eval_runtime": 78.9793,
+ "eval_samples_per_second": 2.532,
+ "eval_steps_per_second": 0.317,
+ "step": 194
+ },
+ {
+ "epoch": 6.09375,
+ "grad_norm": 0.45199575791858004,
+ "learning_rate": 2e-05,
+ "loss": 0.6438,
+ "step": 195
+ },
+ {
+ "epoch": 6.09375,
+ "eval_loss": 0.6690151691436768,
+ "eval_runtime": 78.5002,
+ "eval_samples_per_second": 2.548,
+ "eval_steps_per_second": 0.318,
+ "step": 195
+ },
+ {
+ "epoch": 6.125,
+ "grad_norm": 0.4166923177293841,
+ "learning_rate": 2e-05,
+ "loss": 0.6885,
+ "step": 196
+ },
+ {
+ "epoch": 6.125,
+ "eval_loss": 0.6688613891601562,
+ "eval_runtime": 80.5497,
+ "eval_samples_per_second": 2.483,
+ "eval_steps_per_second": 0.31,
+ "step": 196
+ },
+ {
+ "epoch": 6.15625,
+ "grad_norm": 0.45164281863366285,
+ "learning_rate": 2e-05,
+ "loss": 0.7197,
+ "step": 197
+ },
+ {
+ "epoch": 6.15625,
+ "eval_loss": 0.6687932014465332,
+ "eval_runtime": 80.1482,
+ "eval_samples_per_second": 2.495,
+ "eval_steps_per_second": 0.312,
+ "step": 197
+ },
+ {
+ "epoch": 6.1875,
+ "grad_norm": 0.45653924787504446,
+ "learning_rate": 2e-05,
+ "loss": 0.776,
+ "step": 198
+ },
+ {
+ "epoch": 6.1875,
+ "eval_loss": 0.6690963506698608,
+ "eval_runtime": 80.4464,
+ "eval_samples_per_second": 2.486,
+ "eval_steps_per_second": 0.311,
+ "step": 198
+ },
+ {
+ "epoch": 6.21875,
+ "grad_norm": 0.4966562341334706,
+ "learning_rate": 2e-05,
+ "loss": 0.6532,
+ "step": 199
+ },
+ {
+ "epoch": 6.21875,
+ "eval_loss": 0.669116735458374,
+ "eval_runtime": 79.8294,
+ "eval_samples_per_second": 2.505,
+ "eval_steps_per_second": 0.313,
+ "step": 199
+ },
+ {
+ "epoch": 6.25,
+ "grad_norm": 0.4838469303220975,
+ "learning_rate": 2e-05,
+ "loss": 0.6883,
+ "step": 200
+ },
+ {
+ "epoch": 6.25,
+ "eval_loss": 0.6693156957626343,
+ "eval_runtime": 80.25,
+ "eval_samples_per_second": 2.492,
+ "eval_steps_per_second": 0.312,
+ "step": 200
+ },
+ {
+ "epoch": 6.28125,
+ "grad_norm": 0.4836820906895964,
+ "learning_rate": 2e-05,
+ "loss": 0.7106,
+ "step": 201
+ },
+ {
+ "epoch": 6.28125,
+ "eval_loss": 0.6704170107841492,
+ "eval_runtime": 79.9636,
+ "eval_samples_per_second": 2.501,
+ "eval_steps_per_second": 0.313,
+ "step": 201
+ },
+ {
+ "epoch": 6.3125,
+ "grad_norm": 0.4945855983140219,
+ "learning_rate": 2e-05,
+ "loss": 0.6336,
+ "step": 202
+ },
+ {
+ "epoch": 6.3125,
+ "eval_loss": 0.6708824038505554,
+ "eval_runtime": 80.8044,
+ "eval_samples_per_second": 2.475,
+ "eval_steps_per_second": 0.309,
+ "step": 202
+ },
+ {
+ "epoch": 6.34375,
+ "grad_norm": 0.44587847230103017,
+ "learning_rate": 2e-05,
+ "loss": 0.7811,
+ "step": 203
+ },
+ {
+ "epoch": 6.34375,
+ "eval_loss": 0.6723968982696533,
+ "eval_runtime": 80.1715,
+ "eval_samples_per_second": 2.495,
+ "eval_steps_per_second": 0.312,
+ "step": 203
+ },
+ {
+ "epoch": 6.375,
+ "grad_norm": 0.5351063503195825,
+ "learning_rate": 2e-05,
+ "loss": 0.6222,
+ "step": 204
+ },
+ {
+ "epoch": 6.375,
+ "eval_loss": 0.672196626663208,
+ "eval_runtime": 79.927,
+ "eval_samples_per_second": 2.502,
+ "eval_steps_per_second": 0.313,
+ "step": 204
+ },
+ {
+ "epoch": 6.40625,
+ "grad_norm": 0.4742985088010474,
+ "learning_rate": 2e-05,
+ "loss": 0.6157,
+ "step": 205
+ },
+ {
+ "epoch": 6.40625,
+ "eval_loss": 0.671062171459198,
+ "eval_runtime": 80.1997,
+ "eval_samples_per_second": 2.494,
+ "eval_steps_per_second": 0.312,
+ "step": 205
+ },
+ {
+ "epoch": 6.4375,
+ "grad_norm": 0.5188882333349506,
+ "learning_rate": 2e-05,
+ "loss": 0.6462,
+ "step": 206
+ },
+ {
+ "epoch": 6.4375,
+ "eval_loss": 0.6701972484588623,
+ "eval_runtime": 81.6643,
+ "eval_samples_per_second": 2.449,
+ "eval_steps_per_second": 0.306,
+ "step": 206
+ },
+ {
+ "epoch": 6.46875,
+ "grad_norm": 0.45328063593983603,
+ "learning_rate": 2e-05,
+ "loss": 0.7058,
+ "step": 207
+ },
+ {
+ "epoch": 6.46875,
+ "eval_loss": 0.6699164509773254,
+ "eval_runtime": 81.2228,
+ "eval_samples_per_second": 2.462,
+ "eval_steps_per_second": 0.308,
+ "step": 207
+ },
+ {
+ "epoch": 6.5,
+ "grad_norm": 0.5197645538332801,
+ "learning_rate": 2e-05,
+ "loss": 0.6462,
+ "step": 208
+ },
+ {
+ "epoch": 6.5,
+ "eval_loss": 0.6702597141265869,
+ "eval_runtime": 81.1451,
+ "eval_samples_per_second": 2.465,
+ "eval_steps_per_second": 0.308,
+ "step": 208
+ },
+ {
+ "epoch": 6.53125,
+ "grad_norm": 0.5762528184834232,
+ "learning_rate": 2e-05,
+ "loss": 0.6259,
+ "step": 209
+ },
+ {
+ "epoch": 6.53125,
+ "eval_loss": 0.6696366667747498,
+ "eval_runtime": 81.1643,
+ "eval_samples_per_second": 2.464,
+ "eval_steps_per_second": 0.308,
+ "step": 209
+ },
+ {
+ "epoch": 6.5625,
+ "grad_norm": 0.5249503180293145,
+ "learning_rate": 2e-05,
+ "loss": 0.6045,
+ "step": 210
+ },
+ {
+ "epoch": 6.5625,
+ "eval_loss": 0.6688054800033569,
+ "eval_runtime": 80.9492,
+ "eval_samples_per_second": 2.471,
+ "eval_steps_per_second": 0.309,
+ "step": 210
+ },
+ {
+ "epoch": 6.59375,
+ "grad_norm": 0.543503888655844,
+ "learning_rate": 2e-05,
+ "loss": 0.6496,
+ "step": 211
+ },
+ {
+ "epoch": 6.59375,
+ "eval_loss": 0.6689916849136353,
+ "eval_runtime": 81.6473,
+ "eval_samples_per_second": 2.45,
+ "eval_steps_per_second": 0.306,
+ "step": 211
+ },
+ {
+ "epoch": 6.625,
+ "grad_norm": 0.48119553592193554,
+ "learning_rate": 2e-05,
+ "loss": 0.6211,
+ "step": 212
+ },
+ {
+ "epoch": 6.625,
+ "eval_loss": 0.6703050136566162,
+ "eval_runtime": 81.9207,
+ "eval_samples_per_second": 2.441,
+ "eval_steps_per_second": 0.305,
+ "step": 212
+ },
+ {
+ "epoch": 6.65625,
+ "grad_norm": 0.5153356086819314,
+ "learning_rate": 2e-05,
+ "loss": 0.7135,
+ "step": 213
+ },
+ {
+ "epoch": 6.65625,
+ "eval_loss": 0.6702842116355896,
+ "eval_runtime": 81.1503,
+ "eval_samples_per_second": 2.465,
+ "eval_steps_per_second": 0.308,
+ "step": 213
+ },
+ {
+ "epoch": 6.6875,
+ "grad_norm": 0.5249915042825578,
+ "learning_rate": 2e-05,
+ "loss": 0.6635,
+ "step": 214
+ },
+ {
+ "epoch": 6.6875,
+ "eval_loss": 0.6687333583831787,
+ "eval_runtime": 81.6743,
+ "eval_samples_per_second": 2.449,
+ "eval_steps_per_second": 0.306,
+ "step": 214
+ },
+ {
+ "epoch": 6.71875,
+ "grad_norm": 0.5204840219868723,
+ "learning_rate": 2e-05,
+ "loss": 0.6701,
+ "step": 215
+ },
+ {
+ "epoch": 6.71875,
+ "eval_loss": 0.6657728552818298,
+ "eval_runtime": 81.106,
+ "eval_samples_per_second": 2.466,
+ "eval_steps_per_second": 0.308,
+ "step": 215
+ },
+ {
+ "epoch": 6.75,
+ "grad_norm": 0.5266935225120133,
+ "learning_rate": 2e-05,
+ "loss": 0.6637,
+ "step": 216
+ },
+ {
+ "epoch": 6.75,
+ "eval_loss": 0.6641908884048462,
+ "eval_runtime": 82.2613,
+ "eval_samples_per_second": 2.431,
+ "eval_steps_per_second": 0.304,
+ "step": 216
+ },
+ {
+ "epoch": 6.78125,
+ "grad_norm": 0.5438859451742696,
+ "learning_rate": 2e-05,
+ "loss": 0.6168,
+ "step": 217
+ },
+ {
+ "epoch": 6.78125,
+ "eval_loss": 0.6652233600616455,
+ "eval_runtime": 82.042,
+ "eval_samples_per_second": 2.438,
+ "eval_steps_per_second": 0.305,
+ "step": 217
+ },
+ {
+ "epoch": 6.8125,
+ "grad_norm": 0.5716385253433929,
+ "learning_rate": 2e-05,
+ "loss": 0.6062,
+ "step": 218
+ },
+ {
+ "epoch": 6.8125,
+ "eval_loss": 0.6656240820884705,
+ "eval_runtime": 81.233,
+ "eval_samples_per_second": 2.462,
+ "eval_steps_per_second": 0.308,
+ "step": 218
+ },
+ {
+ "epoch": 6.84375,
+ "grad_norm": 1.0572787630142522,
+ "learning_rate": 2e-05,
+ "loss": 0.7037,
+ "step": 219
+ },
+ {
+ "epoch": 6.84375,
+ "eval_loss": 0.6645559072494507,
+ "eval_runtime": 81.2099,
+ "eval_samples_per_second": 2.463,
+ "eval_steps_per_second": 0.308,
+ "step": 219
+ },
+ {
+ "epoch": 6.875,
+ "grad_norm": 0.5924889323251107,
+ "learning_rate": 2e-05,
+ "loss": 0.712,
+ "step": 220
+ },
+ {
+ "epoch": 6.875,
+ "eval_loss": 0.6619111895561218,
+ "eval_runtime": 81.7826,
+ "eval_samples_per_second": 2.446,
+ "eval_steps_per_second": 0.306,
+ "step": 220
+ },
+ {
+ "epoch": 6.90625,
+ "grad_norm": 0.5290576915218269,
+ "learning_rate": 2e-05,
+ "loss": 0.6659,
+ "step": 221
+ },
+ {
+ "epoch": 6.90625,
+ "eval_loss": 0.6609540581703186,
+ "eval_runtime": 82.9922,
+ "eval_samples_per_second": 2.41,
+ "eval_steps_per_second": 0.301,
+ "step": 221
+ },
+ {
+ "epoch": 6.9375,
+ "grad_norm": 0.5831209517049147,
+ "learning_rate": 2e-05,
+ "loss": 0.6547,
+ "step": 222
+ },
+ {
+ "epoch": 6.9375,
+ "eval_loss": 0.660676896572113,
+ "eval_runtime": 83.6541,
+ "eval_samples_per_second": 2.391,
+ "eval_steps_per_second": 0.299,
+ "step": 222
+ },
+ {
+ "epoch": 6.96875,
+ "grad_norm": 0.5320966369511158,
+ "learning_rate": 2e-05,
+ "loss": 0.6968,
+ "step": 223
+ },
+ {
+ "epoch": 6.96875,
+ "eval_loss": 0.6618594527244568,
+ "eval_runtime": 83.1148,
+ "eval_samples_per_second": 2.406,
+ "eval_steps_per_second": 0.301,
+ "step": 223
+ },
+ {
+ "epoch": 7.0,
+ "grad_norm": 0.5829636446837394,
+ "learning_rate": 2e-05,
+ "loss": 0.7407,
+ "step": 224
+ },
+ {
+ "epoch": 7.0,
+ "eval_loss": 0.6635661125183105,
+ "eval_runtime": 82.8183,
+ "eval_samples_per_second": 2.415,
+ "eval_steps_per_second": 0.302,
+ "step": 224
+ },
+ {
+ "epoch": 7.03125,
+ "grad_norm": 0.4975095056459566,
+ "learning_rate": 2e-05,
+ "loss": 0.6535,
+ "step": 225
+ },
+ {
+ "epoch": 7.03125,
+ "eval_loss": 0.6641671657562256,
+ "eval_runtime": 83.0267,
+ "eval_samples_per_second": 2.409,
+ "eval_steps_per_second": 0.301,
+ "step": 225
+ },
+ {
+ "epoch": 7.0625,
+ "grad_norm": 0.5625698523064815,
+ "learning_rate": 2e-05,
+ "loss": 0.6012,
+ "step": 226
+ },
+ {
+ "epoch": 7.0625,
+ "eval_loss": 0.6639044880867004,
+ "eval_runtime": 83.3881,
+ "eval_samples_per_second": 2.398,
+ "eval_steps_per_second": 0.3,
+ "step": 226
+ },
+ {
+ "epoch": 7.09375,
+ "grad_norm": 0.5436196850683295,
+ "learning_rate": 2e-05,
+ "loss": 0.6485,
+ "step": 227
+ },
+ {
+ "epoch": 7.09375,
+ "eval_loss": 0.6651788353919983,
+ "eval_runtime": 82.7096,
+ "eval_samples_per_second": 2.418,
+ "eval_steps_per_second": 0.302,
+ "step": 227
+ },
+ {
+ "epoch": 7.125,
+ "grad_norm": 0.5598906287609361,
+ "learning_rate": 2e-05,
+ "loss": 0.6142,
+ "step": 228
+ },
+ {
+ "epoch": 7.125,
+ "eval_loss": 0.6688636541366577,
+ "eval_runtime": 82.601,
+ "eval_samples_per_second": 2.421,
+ "eval_steps_per_second": 0.303,
+ "step": 228
+ },
+ {
+ "epoch": 7.15625,
+ "grad_norm": 0.7572979310697923,
+ "learning_rate": 2e-05,
+ "loss": 0.6221,
+ "step": 229
+ },
+ {
+ "epoch": 7.15625,
+ "eval_loss": 0.6699694991111755,
+ "eval_runtime": 82.6032,
+ "eval_samples_per_second": 2.421,
+ "eval_steps_per_second": 0.303,
+ "step": 229
+ },
+ {
+ "epoch": 7.1875,
+ "grad_norm": 0.6173309690580897,
+ "learning_rate": 2e-05,
+ "loss": 0.5919,
+ "step": 230
+ },
+ {
+ "epoch": 7.1875,
+ "eval_loss": 0.6706527471542358,
+ "eval_runtime": 82.9732,
+ "eval_samples_per_second": 2.41,
+ "eval_steps_per_second": 0.301,
+ "step": 230
+ },
+ {
+ "epoch": 7.21875,
+ "grad_norm": 0.643241771517866,
+ "learning_rate": 2e-05,
+ "loss": 0.7081,
+ "step": 231
+ },
+ {
+ "epoch": 7.21875,
+ "eval_loss": 0.6700320243835449,
+ "eval_runtime": 84.5621,
+ "eval_samples_per_second": 2.365,
+ "eval_steps_per_second": 0.296,
+ "step": 231
+ },
+ {
+ "epoch": 7.25,
+ "grad_norm": 0.577638137570571,
+ "learning_rate": 2e-05,
+ "loss": 0.6873,
+ "step": 232
+ },
+ {
+ "epoch": 7.25,
+ "eval_loss": 0.669111430644989,
+ "eval_runtime": 84.5124,
+ "eval_samples_per_second": 2.367,
+ "eval_steps_per_second": 0.296,
+ "step": 232
+ },
+ {
+ "epoch": 7.28125,
+ "grad_norm": 0.7229488296023369,
+ "learning_rate": 2e-05,
+ "loss": 0.6301,
+ "step": 233
+ },
+ {
+ "epoch": 7.28125,
+ "eval_loss": 0.6664154529571533,
+ "eval_runtime": 84.6437,
+ "eval_samples_per_second": 2.363,
+ "eval_steps_per_second": 0.295,
+ "step": 233
+ },
+ {
+ "epoch": 7.3125,
+ "grad_norm": 0.5827815449039045,
+ "learning_rate": 2e-05,
+ "loss": 0.669,
+ "step": 234
+ },
+ {
+ "epoch": 7.3125,
+ "eval_loss": 0.6641202569007874,
+ "eval_runtime": 84.489,
+ "eval_samples_per_second": 2.367,
+ "eval_steps_per_second": 0.296,
+ "step": 234
+ },
+ {
+ "epoch": 7.34375,
+ "grad_norm": 0.57507354017269,
+ "learning_rate": 2e-05,
+ "loss": 0.6474,
+ "step": 235
+ },
+ {
+ "epoch": 7.34375,
+ "eval_loss": 0.6623325347900391,
+ "eval_runtime": 84.5536,
+ "eval_samples_per_second": 2.365,
+ "eval_steps_per_second": 0.296,
+ "step": 235
+ },
+ {
+ "epoch": 7.375,
+ "grad_norm": 0.5810844862533651,
+ "learning_rate": 2e-05,
+ "loss": 0.6048,
+ "step": 236
+ },
+ {
+ "epoch": 7.375,
+ "eval_loss": 0.6619194746017456,
+ "eval_runtime": 84.2296,
+ "eval_samples_per_second": 2.374,
+ "eval_steps_per_second": 0.297,
+ "step": 236
+ },
+ {
+ "epoch": 7.40625,
+ "grad_norm": 0.6075032415813726,
+ "learning_rate": 2e-05,
+ "loss": 0.6529,
+ "step": 237
+ },
+ {
+ "epoch": 7.40625,
+ "eval_loss": 0.6626202464103699,
+ "eval_runtime": 84.9703,
+ "eval_samples_per_second": 2.354,
+ "eval_steps_per_second": 0.294,
+ "step": 237
+ },
+ {
+ "epoch": 7.4375,
+ "grad_norm": 0.6402642234375245,
+ "learning_rate": 2e-05,
+ "loss": 0.6433,
+ "step": 238
+ },
+ {
+ "epoch": 7.4375,
+ "eval_loss": 0.663289487361908,
+ "eval_runtime": 84.8924,
+ "eval_samples_per_second": 2.356,
+ "eval_steps_per_second": 0.294,
+ "step": 238
+ },
+ {
+ "epoch": 7.46875,
+ "grad_norm": 0.6335996982657431,
+ "learning_rate": 2e-05,
+ "loss": 0.6815,
+ "step": 239
+ },
+ {
+ "epoch": 7.46875,
+ "eval_loss": 0.6636109948158264,
+ "eval_runtime": 85.0551,
+ "eval_samples_per_second": 2.351,
+ "eval_steps_per_second": 0.294,
+ "step": 239
+ },
+ {
+ "epoch": 7.5,
+ "grad_norm": 0.5796846795848909,
+ "learning_rate": 2e-05,
+ "loss": 0.6236,
+ "step": 240
+ },
+ {
+ "epoch": 7.5,
+ "eval_loss": 0.6652829051017761,
+ "eval_runtime": 84.7574,
+ "eval_samples_per_second": 2.36,
+ "eval_steps_per_second": 0.295,
+ "step": 240
+ },
+ {
+ "epoch": 7.53125,
+ "grad_norm": 0.5380402145760035,
+ "learning_rate": 2e-05,
+ "loss": 0.6564,
+ "step": 241
+ },
+ {
+ "epoch": 7.53125,
+ "eval_loss": 0.6676375865936279,
+ "eval_runtime": 86.2058,
+ "eval_samples_per_second": 2.32,
+ "eval_steps_per_second": 0.29,
+ "step": 241
+ },
+ {
+ "epoch": 7.5625,
+ "grad_norm": 0.5964298255824012,
+ "learning_rate": 2e-05,
+ "loss": 0.6475,
+ "step": 242
+ },
+ {
+ "epoch": 7.5625,
+ "eval_loss": 0.6698520183563232,
+ "eval_runtime": 85.8955,
+ "eval_samples_per_second": 2.328,
+ "eval_steps_per_second": 0.291,
+ "step": 242
+ },
+ {
+ "epoch": 7.59375,
+ "grad_norm": 0.561279296875,
+ "learning_rate": 2e-05,
+ "loss": 0.6395,
+ "step": 243
+ },
+ {
+ "epoch": 7.59375,
+ "eval_loss": 0.6705803871154785,
+ "eval_runtime": 86.0036,
+ "eval_samples_per_second": 2.325,
+ "eval_steps_per_second": 0.291,
+ "step": 243
+ },
+ {
+ "epoch": 7.625,
+ "grad_norm": 0.6757292755073548,
+ "learning_rate": 2e-05,
+ "loss": 0.7074,
+ "step": 244
+ },
+ {
+ "epoch": 7.625,
+ "eval_loss": 0.6679538488388062,
+ "eval_runtime": 85.5379,
+ "eval_samples_per_second": 2.338,
+ "eval_steps_per_second": 0.292,
+ "step": 244
+ },
+ {
+ "epoch": 7.65625,
+ "grad_norm": 0.659077163070129,
+ "learning_rate": 2e-05,
+ "loss": 0.6078,
+ "step": 245
+ },
+ {
+ "epoch": 7.65625,
+ "eval_loss": 0.6667564511299133,
+ "eval_runtime": 85.752,
+ "eval_samples_per_second": 2.332,
+ "eval_steps_per_second": 0.292,
+ "step": 245
+ },
+ {
+ "epoch": 7.6875,
+ "grad_norm": 0.6215405566454576,
+ "learning_rate": 2e-05,
+ "loss": 0.6603,
+ "step": 246
+ },
+ {
+ "epoch": 7.6875,
+ "eval_loss": 0.665945291519165,
+ "eval_runtime": 92.3086,
+ "eval_samples_per_second": 2.167,
+ "eval_steps_per_second": 0.271,
+ "step": 246
+ },
+ {
+ "epoch": 7.71875,
+ "grad_norm": 0.6130534921490498,
+ "learning_rate": 2e-05,
+ "loss": 0.6435,
+ "step": 247
+ },
+ {
+ "epoch": 7.71875,
+ "eval_loss": 0.6661685109138489,
+ "eval_runtime": 87.1917,
+ "eval_samples_per_second": 2.294,
+ "eval_steps_per_second": 0.287,
+ "step": 247
+ },
+ {
+ "epoch": 7.75,
+ "grad_norm": 0.6025415602868736,
+ "learning_rate": 2e-05,
+ "loss": 0.6308,
+ "step": 248
+ },
+ {
+ "epoch": 7.75,
+ "eval_loss": 0.6658704280853271,
+ "eval_runtime": 86.8233,
+ "eval_samples_per_second": 2.304,
+ "eval_steps_per_second": 0.288,
+ "step": 248
+ },
+ {
+ "epoch": 7.78125,
+ "grad_norm": 0.6901593792019413,
+ "learning_rate": 2e-05,
+ "loss": 0.6777,
+ "step": 249
+ },
+ {
+ "epoch": 7.78125,
+ "eval_loss": 0.6652414202690125,
+ "eval_runtime": 86.7625,
+ "eval_samples_per_second": 2.305,
+ "eval_steps_per_second": 0.288,
+ "step": 249
+ },
+ {
+ "epoch": 7.8125,
+ "grad_norm": 0.6436454697341579,
+ "learning_rate": 2e-05,
+ "loss": 0.6912,
+ "step": 250
+ },
+ {
+ "epoch": 7.8125,
+ "eval_loss": 0.6654212474822998,
+ "eval_runtime": 86.871,
+ "eval_samples_per_second": 2.302,
+ "eval_steps_per_second": 0.288,
+ "step": 250
+ },
+ {
+ "epoch": 7.84375,
+ "grad_norm": 0.649040103024529,
+ "learning_rate": 2e-05,
+ "loss": 0.6025,
+ "step": 251
+ },
+ {
+ "epoch": 7.84375,
+ "eval_loss": 0.6654068231582642,
+ "eval_runtime": 86.7458,
+ "eval_samples_per_second": 2.306,
+ "eval_steps_per_second": 0.288,
+ "step": 251
+ },
+ {
+ "epoch": 7.875,
+ "grad_norm": 0.6595522131680224,
+ "learning_rate": 2e-05,
+ "loss": 0.5973,
+ "step": 252
+ },
+ {
+ "epoch": 7.875,
+ "eval_loss": 0.6644830107688904,
+ "eval_runtime": 86.8739,
+ "eval_samples_per_second": 2.302,
+ "eval_steps_per_second": 0.288,
+ "step": 252
+ },
+ {
+ "epoch": 7.90625,
+ "grad_norm": 0.6689891717273936,
+ "learning_rate": 2e-05,
+ "loss": 0.687,
+ "step": 253
+ },
+ {
+ "epoch": 7.90625,
+ "eval_loss": 0.6616199612617493,
+ "eval_runtime": 86.8222,
+ "eval_samples_per_second": 2.304,
+ "eval_steps_per_second": 0.288,
+ "step": 253
+ },
+ {
+ "epoch": 7.9375,
+ "grad_norm": 0.6306846778314292,
+ "learning_rate": 2e-05,
+ "loss": 0.6599,
+ "step": 254
+ },
+ {
+ "epoch": 7.9375,
+ "eval_loss": 0.6592965126037598,
+ "eval_runtime": 86.8577,
+ "eval_samples_per_second": 2.303,
+ "eval_steps_per_second": 0.288,
+ "step": 254
+ },
+ {
+ "epoch": 7.96875,
+ "grad_norm": 0.6021327993890785,
+ "learning_rate": 2e-05,
+ "loss": 0.575,
+ "step": 255
+ },
+ {
+ "epoch": 7.96875,
+ "eval_loss": 0.6580593585968018,
+ "eval_runtime": 86.7582,
+ "eval_samples_per_second": 2.305,
+ "eval_steps_per_second": 0.288,
+ "step": 255
+ },
+ {
+ "epoch": 8.0,
+ "grad_norm": 0.6174712675568311,
+ "learning_rate": 2e-05,
+ "loss": 0.6341,
+ "step": 256
+ },
+ {
+ "epoch": 8.0,
+ "eval_loss": 0.6575854420661926,
+ "eval_runtime": 76.7634,
+ "eval_samples_per_second": 2.605,
+ "eval_steps_per_second": 0.326,
+ "step": 256
+ },
+ {
+ "epoch": 8.03125,
+ "grad_norm": 0.6551281786490154,
+ "learning_rate": 2e-05,
+ "loss": 0.6032,
+ "step": 257
+ },
+ {
+ "epoch": 8.03125,
+ "eval_loss": 0.6583926677703857,
+ "eval_runtime": 83.4222,
+ "eval_samples_per_second": 2.397,
+ "eval_steps_per_second": 0.3,
+ "step": 257
+ },
+ {
+ "epoch": 8.0625,
+ "grad_norm": 0.6033798361300539,
+ "learning_rate": 2e-05,
+ "loss": 0.6352,
+ "step": 258
+ },
+ {
+ "epoch": 8.0625,
+ "eval_loss": 0.6615632772445679,
+ "eval_runtime": 76.7227,
+ "eval_samples_per_second": 2.607,
+ "eval_steps_per_second": 0.326,
+ "step": 258
+ },
+ {
+ "epoch": 8.09375,
+ "grad_norm": 0.557538857110867,
+ "learning_rate": 2e-05,
+ "loss": 0.6472,
+ "step": 259
+ },
+ {
+ "epoch": 8.09375,
+ "eval_loss": 0.6674608588218689,
+ "eval_runtime": 76.6215,
+ "eval_samples_per_second": 2.61,
+ "eval_steps_per_second": 0.326,
+ "step": 259
+ },
+ {
+ "epoch": 8.125,
+ "grad_norm": 0.7828450894757938,
+ "learning_rate": 2e-05,
+ "loss": 0.6576,
+ "step": 260
+ },
+ {
+ "epoch": 8.125,
+ "eval_loss": 0.670245349407196,
+ "eval_runtime": 76.685,
+ "eval_samples_per_second": 2.608,
+ "eval_steps_per_second": 0.326,
+ "step": 260
+ },
+ {
+ "epoch": 8.15625,
+ "grad_norm": 0.7969830757603331,
+ "learning_rate": 2e-05,
+ "loss": 0.5809,
+ "step": 261
+ },
+ {
+ "epoch": 8.15625,
+ "eval_loss": 0.6711975336074829,
+ "eval_runtime": 78.0022,
+ "eval_samples_per_second": 2.564,
+ "eval_steps_per_second": 0.321,
+ "step": 261
+ },
+ {
+ "epoch": 8.1875,
+ "grad_norm": 0.6431174985709492,
+ "learning_rate": 2e-05,
+ "loss": 0.6971,
+ "step": 262
+ },
+ {
+ "epoch": 8.1875,
+ "eval_loss": 0.6719404458999634,
+ "eval_runtime": 78.7599,
+ "eval_samples_per_second": 2.539,
+ "eval_steps_per_second": 0.317,
+ "step": 262
+ },
+ {
+ "epoch": 8.21875,
+ "grad_norm": 0.7025583314944188,
+ "learning_rate": 2e-05,
+ "loss": 0.5751,
+ "step": 263
+ },
+ {
+ "epoch": 8.21875,
+ "eval_loss": 0.6719526648521423,
+ "eval_runtime": 78.0188,
+ "eval_samples_per_second": 2.563,
+ "eval_steps_per_second": 0.32,
+ "step": 263
+ },
+ {
+ "epoch": 8.25,
+ "grad_norm": 0.7114355417811269,
+ "learning_rate": 2e-05,
+ "loss": 0.623,
+ "step": 264
+ },
+ {
+ "epoch": 8.25,
+ "eval_loss": 0.6717848181724548,
+ "eval_runtime": 78.6366,
+ "eval_samples_per_second": 2.543,
+ "eval_steps_per_second": 0.318,
+ "step": 264
+ },
+ {
+ "epoch": 8.28125,
+ "grad_norm": 0.8272269435769467,
+ "learning_rate": 2e-05,
+ "loss": 0.6509,
+ "step": 265
+ },
+ {
+ "epoch": 8.28125,
+ "eval_loss": 0.6701865196228027,
+ "eval_runtime": 78.7279,
+ "eval_samples_per_second": 2.54,
+ "eval_steps_per_second": 0.318,
+ "step": 265
+ },
+ {
+ "epoch": 8.3125,
+ "grad_norm": 0.7215994453471393,
+ "learning_rate": 2e-05,
+ "loss": 0.6263,
+ "step": 266
+ },
+ {
+ "epoch": 8.3125,
+ "eval_loss": 0.6682087182998657,
+ "eval_runtime": 78.1433,
+ "eval_samples_per_second": 2.559,
+ "eval_steps_per_second": 0.32,
+ "step": 266
+ },
+ {
+ "epoch": 8.34375,
+ "grad_norm": 0.6425448006102333,
+ "learning_rate": 2e-05,
+ "loss": 0.5613,
+ "step": 267
+ },
+ {
+ "epoch": 8.34375,
+ "eval_loss": 0.6686681509017944,
+ "eval_runtime": 78.0964,
+ "eval_samples_per_second": 2.561,
+ "eval_steps_per_second": 0.32,
+ "step": 267
+ },
+ {
+ "epoch": 8.375,
+ "grad_norm": 0.7207053166384572,
+ "learning_rate": 2e-05,
+ "loss": 0.6239,
+ "step": 268
+ },
+ {
+ "epoch": 8.375,
+ "eval_loss": 0.6676305532455444,
+ "eval_runtime": 77.9986,
+ "eval_samples_per_second": 2.564,
+ "eval_steps_per_second": 0.321,
+ "step": 268
+ },
+ {
+ "epoch": 8.40625,
+ "grad_norm": 0.7459344743811905,
+ "learning_rate": 2e-05,
+ "loss": 0.6159,
+ "step": 269
+ },
+ {
+ "epoch": 8.40625,
+ "eval_loss": 0.6660167574882507,
+ "eval_runtime": 78.4159,
+ "eval_samples_per_second": 2.551,
+ "eval_steps_per_second": 0.319,
+ "step": 269
+ },
+ {
+ "epoch": 8.4375,
+ "grad_norm": 0.7179805119560739,
+ "learning_rate": 2e-05,
+ "loss": 0.6192,
+ "step": 270
+ },
+ {
+ "epoch": 8.4375,
+ "eval_loss": 0.6636325716972351,
+ "eval_runtime": 78.2224,
+ "eval_samples_per_second": 2.557,
+ "eval_steps_per_second": 0.32,
+ "step": 270
+ },
+ {
+ "epoch": 8.46875,
+ "grad_norm": 0.724792498458059,
+ "learning_rate": 2e-05,
+ "loss": 0.5234,
+ "step": 271
+ },
+ {
+ "epoch": 8.46875,
+ "eval_loss": 0.6647288799285889,
+ "eval_runtime": 79.0573,
+ "eval_samples_per_second": 2.53,
+ "eval_steps_per_second": 0.316,
+ "step": 271
+ },
+ {
+ "epoch": 8.5,
+ "grad_norm": 0.6544107138826364,
+ "learning_rate": 2e-05,
+ "loss": 0.6067,
+ "step": 272
+ },
+ {
+ "epoch": 8.5,
+ "eval_loss": 0.6689667701721191,
+ "eval_runtime": 79.2898,
+ "eval_samples_per_second": 2.522,
+ "eval_steps_per_second": 0.315,
+ "step": 272
+ },
+ {
+ "epoch": 8.53125,
+ "grad_norm": 0.71580236810568,
+ "learning_rate": 2e-05,
+ "loss": 0.6215,
+ "step": 273
+ },
+ {
+ "epoch": 8.53125,
+ "eval_loss": 0.6723271012306213,
+ "eval_runtime": 79.0759,
+ "eval_samples_per_second": 2.529,
+ "eval_steps_per_second": 0.316,
+ "step": 273
+ },
+ {
+ "epoch": 8.5625,
+ "grad_norm": 0.7741383931390255,
+ "learning_rate": 2e-05,
+ "loss": 0.6012,
+ "step": 274
+ },
+ {
+ "epoch": 8.5625,
+ "eval_loss": 0.6743794083595276,
+ "eval_runtime": 79.0509,
+ "eval_samples_per_second": 2.53,
+ "eval_steps_per_second": 0.316,
+ "step": 274
+ },
+ {
+ "epoch": 8.59375,
+ "grad_norm": 0.7927343087738151,
+ "learning_rate": 2e-05,
+ "loss": 0.6241,
+ "step": 275
+ },
+ {
+ "epoch": 8.59375,
+ "eval_loss": 0.6728585958480835,
+ "eval_runtime": 79.2296,
+ "eval_samples_per_second": 2.524,
+ "eval_steps_per_second": 0.316,
+ "step": 275
+ },
+ {
+ "epoch": 8.625,
+ "grad_norm": 0.759468785526614,
+ "learning_rate": 2e-05,
+ "loss": 0.6209,
+ "step": 276
+ },
+ {
+ "epoch": 8.625,
+ "eval_loss": 0.6686221957206726,
+ "eval_runtime": 76.7494,
+ "eval_samples_per_second": 2.606,
+ "eval_steps_per_second": 0.326,
+ "step": 276
+ },
+ {
+ "epoch": 8.65625,
+ "grad_norm": 0.7345386079388437,
+ "learning_rate": 2e-05,
+ "loss": 0.5618,
+ "step": 277
+ },
+ {
+ "epoch": 8.65625,
+ "eval_loss": 0.6659188270568848,
+ "eval_runtime": 77.4511,
+ "eval_samples_per_second": 2.582,
+ "eval_steps_per_second": 0.323,
+ "step": 277
+ },
+ {
+ "epoch": 8.6875,
+ "grad_norm": 0.6822491965046279,
+ "learning_rate": 2e-05,
+ "loss": 0.6064,
+ "step": 278
+ },
+ {
+ "epoch": 8.6875,
+ "eval_loss": 0.664726734161377,
+ "eval_runtime": 76.7108,
+ "eval_samples_per_second": 2.607,
+ "eval_steps_per_second": 0.326,
+ "step": 278
+ },
+ {
+ "epoch": 8.71875,
+ "grad_norm": 0.7329120674082968,
+ "learning_rate": 2e-05,
+ "loss": 0.5843,
+ "step": 279
+ },
+ {
+ "epoch": 8.71875,
+ "eval_loss": 0.6635715961456299,
+ "eval_runtime": 76.7921,
+ "eval_samples_per_second": 2.604,
+ "eval_steps_per_second": 0.326,
+ "step": 279
+ },
+ {
+ "epoch": 8.75,
+ "grad_norm": 0.7950781591249908,
+ "learning_rate": 2e-05,
+ "loss": 0.6383,
+ "step": 280
+ },
+ {
+ "epoch": 8.75,
+ "eval_loss": 0.664521336555481,
+ "eval_runtime": 76.6952,
+ "eval_samples_per_second": 2.608,
+ "eval_steps_per_second": 0.326,
+ "step": 280
+ },
+ {
+ "epoch": 8.78125,
+ "grad_norm": 0.6791182798182671,
+ "learning_rate": 2e-05,
+ "loss": 0.5932,
+ "step": 281
+ },
+ {
+ "epoch": 8.78125,
+ "eval_loss": 0.6673008799552917,
+ "eval_runtime": 76.794,
+ "eval_samples_per_second": 2.604,
+ "eval_steps_per_second": 0.326,
+ "step": 281
+ },
+ {
+ "epoch": 8.8125,
+ "grad_norm": 0.7633434086832942,
+ "learning_rate": 2e-05,
+ "loss": 0.5754,
+ "step": 282
+ },
+ {
+ "epoch": 8.8125,
+ "eval_loss": 0.6692779064178467,
+ "eval_runtime": 76.7749,
+ "eval_samples_per_second": 2.605,
+ "eval_steps_per_second": 0.326,
+ "step": 282
+ },
+ {
+ "epoch": 8.84375,
+ "grad_norm": 0.6857090076317197,
+ "learning_rate": 2e-05,
+ "loss": 0.5585,
+ "step": 283
+ },
+ {
+ "epoch": 8.84375,
+ "eval_loss": 0.6702080368995667,
+ "eval_runtime": 76.6913,
+ "eval_samples_per_second": 2.608,
+ "eval_steps_per_second": 0.326,
+ "step": 283
+ },
+ {
+ "epoch": 8.875,
+ "grad_norm": 0.6961298007385132,
+ "learning_rate": 2e-05,
+ "loss": 0.5093,
+ "step": 284
+ },
+ {
+ "epoch": 8.875,
+ "eval_loss": 0.6708166599273682,
+ "eval_runtime": 76.7725,
+ "eval_samples_per_second": 2.605,
+ "eval_steps_per_second": 0.326,
+ "step": 284
+ },
+ {
+ "epoch": 8.90625,
+ "grad_norm": 0.7783752192295856,
+ "learning_rate": 2e-05,
+ "loss": 0.5656,
+ "step": 285
+ },
+ {
+ "epoch": 8.90625,
+ "eval_loss": 0.6697121262550354,
+ "eval_runtime": 76.7888,
+ "eval_samples_per_second": 2.605,
+ "eval_steps_per_second": 0.326,
+ "step": 285
+ },
+ {
+ "epoch": 8.9375,
+ "grad_norm": 0.7327581828795048,
+ "learning_rate": 2e-05,
+ "loss": 0.6984,
+ "step": 286
+ },
+ {
+ "epoch": 8.9375,
+ "eval_loss": 0.6684187054634094,
+ "eval_runtime": 78.6657,
+ "eval_samples_per_second": 2.542,
+ "eval_steps_per_second": 0.318,
+ "step": 286
+ },
+ {
+ "epoch": 8.96875,
+ "grad_norm": 0.689919829790507,
+ "learning_rate": 2e-05,
+ "loss": 0.6173,
+ "step": 287
+ },
+ {
+ "epoch": 8.96875,
+ "eval_loss": 0.6675245761871338,
+ "eval_runtime": 78.1275,
+ "eval_samples_per_second": 2.56,
+ "eval_steps_per_second": 0.32,
+ "step": 287
+ },
+ {
+ "epoch": 9.0,
+ "grad_norm": 0.6812947879732435,
+ "learning_rate": 2e-05,
+ "loss": 0.5499,
+ "step": 288
+ },
+ {
+ "epoch": 9.0,
+ "eval_loss": 0.6678825616836548,
+ "eval_runtime": 78.8588,
+ "eval_samples_per_second": 2.536,
+ "eval_steps_per_second": 0.317,
+ "step": 288
+ },
+ {
+ "epoch": 9.03125,
+ "grad_norm": 0.715716761740314,
+ "learning_rate": 2e-05,
+ "loss": 0.5699,
+ "step": 289
+ },
+ {
+ "epoch": 9.03125,
+ "eval_loss": 0.6692755222320557,
+ "eval_runtime": 83.098,
+ "eval_samples_per_second": 2.407,
+ "eval_steps_per_second": 0.301,
+ "step": 289
+ },
+ {
+ "epoch": 9.0625,
+ "grad_norm": 0.7438930389955494,
+ "learning_rate": 2e-05,
+ "loss": 0.5974,
+ "step": 290
+ },
+ {
+ "epoch": 9.0625,
+ "eval_loss": 0.6735746264457703,
+ "eval_runtime": 77.384,
+ "eval_samples_per_second": 2.585,
+ "eval_steps_per_second": 0.323,
+ "step": 290
+ },
+ {
+ "epoch": 9.09375,
+ "grad_norm": 0.7271043131369198,
+ "learning_rate": 2e-05,
+ "loss": 0.601,
+ "step": 291
+ },
+ {
+ "epoch": 9.09375,
+ "eval_loss": 0.6790977716445923,
+ "eval_runtime": 78.0312,
+ "eval_samples_per_second": 2.563,
+ "eval_steps_per_second": 0.32,
+ "step": 291
+ },
+ {
+ "epoch": 9.125,
+ "grad_norm": 0.851687675865168,
+ "learning_rate": 2e-05,
+ "loss": 0.5681,
+ "step": 292
+ },
+ {
+ "epoch": 9.125,
+ "eval_loss": 0.6834170818328857,
+ "eval_runtime": 77.8688,
+ "eval_samples_per_second": 2.568,
+ "eval_steps_per_second": 0.321,
+ "step": 292
+ },
+ {
+ "epoch": 9.15625,
+ "grad_norm": 0.7905287763218567,
+ "learning_rate": 2e-05,
+ "loss": 0.6222,
+ "step": 293
+ },
+ {
+ "epoch": 9.15625,
+ "eval_loss": 0.6843841671943665,
+ "eval_runtime": 77.985,
+ "eval_samples_per_second": 2.565,
+ "eval_steps_per_second": 0.321,
+ "step": 293
+ },
+ {
+ "epoch": 9.1875,
+ "grad_norm": 0.7301520002532459,
+ "learning_rate": 2e-05,
+ "loss": 0.5549,
+ "step": 294
+ },
+ {
+ "epoch": 9.1875,
+ "eval_loss": 0.6860540509223938,
+ "eval_runtime": 78.0163,
+ "eval_samples_per_second": 2.564,
+ "eval_steps_per_second": 0.32,
+ "step": 294
+ },
+ {
+ "epoch": 9.21875,
+ "grad_norm": 0.899999206595601,
+ "learning_rate": 2e-05,
+ "loss": 0.5128,
+ "step": 295
+ },
+ {
+ "epoch": 9.21875,
+ "eval_loss": 0.685759425163269,
+ "eval_runtime": 78.4339,
+ "eval_samples_per_second": 2.55,
+ "eval_steps_per_second": 0.319,
+ "step": 295
+ },
+ {
+ "epoch": 9.25,
+ "grad_norm": 0.8064287475451557,
+ "learning_rate": 2e-05,
+ "loss": 0.5261,
+ "step": 296
+ },
+ {
+ "epoch": 9.25,
+ "eval_loss": 0.6864770650863647,
+ "eval_runtime": 79.6129,
+ "eval_samples_per_second": 2.512,
+ "eval_steps_per_second": 0.314,
+ "step": 296
+ },
+ {
+ "epoch": 9.28125,
+ "grad_norm": 0.8837240795882767,
+ "learning_rate": 2e-05,
+ "loss": 0.621,
+ "step": 297
+ },
+ {
+ "epoch": 9.28125,
+ "eval_loss": 0.6871599555015564,
+ "eval_runtime": 78.9778,
+ "eval_samples_per_second": 2.532,
+ "eval_steps_per_second": 0.317,
+ "step": 297
+ },
+ {
+ "epoch": 9.3125,
+ "grad_norm": 0.9676184044078363,
+ "learning_rate": 2e-05,
+ "loss": 0.5655,
+ "step": 298
+ },
+ {
+ "epoch": 9.3125,
+ "eval_loss": 0.6881282329559326,
+ "eval_runtime": 78.9944,
+ "eval_samples_per_second": 2.532,
+ "eval_steps_per_second": 0.316,
+ "step": 298
+ },
+ {
+ "epoch": 9.34375,
+ "grad_norm": 0.8723474213941232,
+ "learning_rate": 2e-05,
+ "loss": 0.5449,
+ "step": 299
+ },
+ {
+ "epoch": 9.34375,
+ "eval_loss": 0.6879245638847351,
+ "eval_runtime": 79.0056,
+ "eval_samples_per_second": 2.531,
+ "eval_steps_per_second": 0.316,
+ "step": 299
+ },
+ {
+ "epoch": 9.375,
+ "grad_norm": 0.848833488380702,
+ "learning_rate": 2e-05,
+ "loss": 0.5683,
+ "step": 300
+ },
+ {
+ "epoch": 9.375,
+ "eval_loss": 0.6846978664398193,
+ "eval_runtime": 78.9003,
+ "eval_samples_per_second": 2.535,
+ "eval_steps_per_second": 0.317,
+ "step": 300
+ },
+ {
+ "epoch": 9.40625,
+ "grad_norm": 0.8586391766708288,
+ "learning_rate": 2e-05,
+ "loss": 0.5358,
+ "step": 301
+ },
+ {
+ "epoch": 9.40625,
+ "eval_loss": 0.6798649430274963,
+ "eval_runtime": 80.0404,
+ "eval_samples_per_second": 2.499,
+ "eval_steps_per_second": 0.312,
+ "step": 301
+ },
+ {
+ "epoch": 9.4375,
+ "grad_norm": 0.8007832596916474,
+ "learning_rate": 2e-05,
+ "loss": 0.5792,
+ "step": 302
+ },
+ {
+ "epoch": 9.4375,
+ "eval_loss": 0.6757382750511169,
+ "eval_runtime": 79.962,
+ "eval_samples_per_second": 2.501,
+ "eval_steps_per_second": 0.313,
+ "step": 302
+ },
+ {
+ "epoch": 9.46875,
+ "grad_norm": 0.7839805948862919,
+ "learning_rate": 2e-05,
+ "loss": 0.5917,
+ "step": 303
+ },
+ {
+ "epoch": 9.46875,
+ "eval_loss": 0.6754000782966614,
+ "eval_runtime": 80.738,
+ "eval_samples_per_second": 2.477,
+ "eval_steps_per_second": 0.31,
+ "step": 303
+ },
+ {
+ "epoch": 9.5,
+ "grad_norm": 0.7397772754102683,
+ "learning_rate": 2e-05,
+ "loss": 0.6249,
+ "step": 304
+ },
+ {
+ "epoch": 9.5,
+ "eval_loss": 0.6777495741844177,
+ "eval_runtime": 80.5144,
+ "eval_samples_per_second": 2.484,
+ "eval_steps_per_second": 0.311,
+ "step": 304
+ },
+ {
+ "epoch": 9.53125,
+ "grad_norm": 0.857390001265035,
+ "learning_rate": 2e-05,
+ "loss": 0.5932,
+ "step": 305
+ },
+ {
+ "epoch": 9.53125,
+ "eval_loss": 0.6778848171234131,
+ "eval_runtime": 80.1508,
+ "eval_samples_per_second": 2.495,
+ "eval_steps_per_second": 0.312,
+ "step": 305
+ },
+ {
+ "epoch": 9.5625,
+ "grad_norm": 0.9430180281536945,
+ "learning_rate": 2e-05,
+ "loss": 0.5793,
+ "step": 306
+ },
+ {
+ "epoch": 9.5625,
+ "eval_loss": 0.6771917939186096,
+ "eval_runtime": 76.7109,
+ "eval_samples_per_second": 2.607,
+ "eval_steps_per_second": 0.326,
+ "step": 306
+ },
+ {
+ "epoch": 9.59375,
+ "grad_norm": 0.8705050270903875,
+ "learning_rate": 2e-05,
+ "loss": 0.5601,
+ "step": 307
+ },
+ {
+ "epoch": 9.59375,
+ "eval_loss": 0.6808632016181946,
+ "eval_runtime": 76.6965,
+ "eval_samples_per_second": 2.608,
+ "eval_steps_per_second": 0.326,
+ "step": 307
+ },
+ {
+ "epoch": 9.625,
+ "grad_norm": 0.8611871513168323,
+ "learning_rate": 2e-05,
+ "loss": 0.5953,
+ "step": 308
+ },
+ {
+ "epoch": 9.625,
+ "eval_loss": 0.6875945329666138,
+ "eval_runtime": 76.6592,
+ "eval_samples_per_second": 2.609,
+ "eval_steps_per_second": 0.326,
+ "step": 308
+ },
+ {
+ "epoch": 9.65625,
+ "grad_norm": 0.9066952565245906,
+ "learning_rate": 2e-05,
+ "loss": 0.5815,
+ "step": 309
+ },
+ {
+ "epoch": 9.65625,
+ "eval_loss": 0.6910049319267273,
+ "eval_runtime": 76.7021,
+ "eval_samples_per_second": 2.607,
+ "eval_steps_per_second": 0.326,
+ "step": 309
+ },
+ {
+ "epoch": 9.6875,
+ "grad_norm": 1.0666864048105145,
+ "learning_rate": 2e-05,
+ "loss": 0.5663,
+ "step": 310
+ },
+ {
+ "epoch": 9.6875,
+ "eval_loss": 0.6869986057281494,
+ "eval_runtime": 76.6344,
+ "eval_samples_per_second": 2.61,
+ "eval_steps_per_second": 0.326,
+ "step": 310
+ },
+ {
+ "epoch": 9.71875,
+ "grad_norm": 0.9413311560347162,
+ "learning_rate": 2e-05,
+ "loss": 0.5106,
+ "step": 311
+ },
+ {
+ "epoch": 9.71875,
+ "eval_loss": 0.6825075745582581,
+ "eval_runtime": 78.7857,
+ "eval_samples_per_second": 2.539,
+ "eval_steps_per_second": 0.317,
+ "step": 311
+ },
+ {
+ "epoch": 9.75,
+ "grad_norm": 0.9175579044457436,
+ "learning_rate": 2e-05,
+ "loss": 0.5821,
+ "step": 312
+ },
+ {
+ "epoch": 9.75,
+ "eval_loss": 0.6794223189353943,
+ "eval_runtime": 78.0368,
+ "eval_samples_per_second": 2.563,
+ "eval_steps_per_second": 0.32,
+ "step": 312
+ },
+ {
+ "epoch": 9.78125,
+ "grad_norm": 0.7982785075945665,
+ "learning_rate": 2e-05,
+ "loss": 0.5781,
+ "step": 313
+ },
+ {
+ "epoch": 9.78125,
+ "eval_loss": 0.679649829864502,
+ "eval_runtime": 78.0513,
+ "eval_samples_per_second": 2.562,
+ "eval_steps_per_second": 0.32,
+ "step": 313
+ },
+ {
+ "epoch": 9.8125,
+ "grad_norm": 0.9284642289974022,
+ "learning_rate": 2e-05,
+ "loss": 0.5394,
+ "step": 314
+ },
+ {
+ "epoch": 9.8125,
+ "eval_loss": 0.6805163025856018,
+ "eval_runtime": 78.2229,
+ "eval_samples_per_second": 2.557,
+ "eval_steps_per_second": 0.32,
+ "step": 314
+ },
+ {
+ "epoch": 9.84375,
+ "grad_norm": 0.8816568355396782,
+ "learning_rate": 2e-05,
+ "loss": 0.5722,
+ "step": 315
+ },
+ {
+ "epoch": 9.84375,
+ "eval_loss": 0.6801097393035889,
+ "eval_runtime": 78.9282,
+ "eval_samples_per_second": 2.534,
+ "eval_steps_per_second": 0.317,
+ "step": 315
+ },
+ {
+ "epoch": 9.875,
+ "grad_norm": 0.8137119863863306,
+ "learning_rate": 2e-05,
+ "loss": 0.5831,
+ "step": 316
+ },
+ {
+ "epoch": 9.875,
+ "eval_loss": 0.6792600750923157,
+ "eval_runtime": 78.8166,
+ "eval_samples_per_second": 2.538,
+ "eval_steps_per_second": 0.317,
+ "step": 316
+ },
+ {
+ "epoch": 9.90625,
+ "grad_norm": 0.9595174764400289,
+ "learning_rate": 2e-05,
+ "loss": 0.5489,
+ "step": 317
+ },
+ {
+ "epoch": 9.90625,
+ "eval_loss": 0.6755692958831787,
+ "eval_runtime": 78.1426,
+ "eval_samples_per_second": 2.559,
+ "eval_steps_per_second": 0.32,
+ "step": 317
+ },
+ {
+ "epoch": 9.9375,
+ "grad_norm": 0.8612490247878711,
+ "learning_rate": 2e-05,
+ "loss": 0.5508,
+ "step": 318
+ },
+ {
+ "epoch": 9.9375,
+ "eval_loss": 0.673053503036499,
+ "eval_runtime": 78.0565,
+ "eval_samples_per_second": 2.562,
+ "eval_steps_per_second": 0.32,
+ "step": 318
+ },
+ {
+ "epoch": 9.96875,
+ "grad_norm": 0.9474068762478358,
+ "learning_rate": 2e-05,
+ "loss": 0.5859,
+ "step": 319
+ },
+ {
+ "epoch": 9.96875,
+ "eval_loss": 0.6695602536201477,
+ "eval_runtime": 78.051,
+ "eval_samples_per_second": 2.562,
+ "eval_steps_per_second": 0.32,
+ "step": 319
+ },
+ {
+ "epoch": 10.0,
+ "grad_norm": 0.8401643717683449,
+ "learning_rate": 2e-05,
+ "loss": 0.5277,
+ "step": 320
+ },
+ {
+ "epoch": 10.0,
+ "eval_loss": 0.6707890033721924,
+ "eval_runtime": 78.9959,
+ "eval_samples_per_second": 2.532,
+ "eval_steps_per_second": 0.316,
+ "step": 320
+ },
+ {
+ "epoch": 10.0,
+ "step": 320,
+ "total_flos": 613933061373952.0,
+ "train_loss": 0.056994458101689814,
+ "train_runtime": 3241.7031,
+ "train_samples_per_second": 3.085,
+ "train_steps_per_second": 0.099
+ }
+ ],
+ "logging_steps": 1.0,
+ "max_steps": 320,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 5,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 613933061373952.0,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}