diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0838e19c978772997ea62c604482bce6c1a8c237 --- /dev/null +++ b/README.md @@ -0,0 +1,202 @@ +--- +base_model: liuhaotian/llava-v1.6-vicuna-13b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/adapter_config.json b/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..0d9e6d1f521fdefcdd69001e3f15155910cbbf70 --- /dev/null +++ b/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "liuhaotian/llava-v1.6-vicuna-13b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "q_proj", + "k_proj", + "gate_proj", + "up_proj", + "down_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/adapter_model.safetensors b/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7f52c6fccec0de4ab49229bc2d89cb6e3bc42548 --- /dev/null +++ b/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:582f98ea28b87a902c530b0ebdd7bdc472bdb58830ca975afdb7caab0b5d85fb +size 65046168 diff --git a/checkpoint-256/README.md b/checkpoint-256/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0838e19c978772997ea62c604482bce6c1a8c237 --- /dev/null +++ b/checkpoint-256/README.md @@ -0,0 +1,202 @@ +--- +base_model: liuhaotian/llava-v1.6-vicuna-13b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model 
[optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/checkpoint-256/adapter_config.json b/checkpoint-256/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..42947e9d5eddd8b5dfb4d8381d68cd4c25aabb56 --- /dev/null +++ b/checkpoint-256/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "liuhaotian/llava-v1.6-vicuna-13b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "up_proj", + "down_proj", + "k_proj", + "q_proj", + "v_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git 
a/checkpoint-256/adapter_model.safetensors b/checkpoint-256/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7f52c6fccec0de4ab49229bc2d89cb6e3bc42548 --- /dev/null +++ b/checkpoint-256/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:582f98ea28b87a902c530b0ebdd7bdc472bdb58830ca975afdb7caab0b5d85fb +size 65046168 diff --git a/checkpoint-256/global_step256/zero_pp_rank_0_mp_rank_00_model_states.pt b/checkpoint-256/global_step256/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c3a4b6ebd97ff1f0833b4a89c7520ca2eef3dd86 --- /dev/null +++ b/checkpoint-256/global_step256/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:465bf9bcd385ef29a3aded4895139c4327721f5e279f0c5928a4350b26901d89 +size 775138 diff --git a/checkpoint-256/global_step256/zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-256/global_step256/zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a5d7befc2e8c633a242b7382f047242f50e018cd --- /dev/null +++ b/checkpoint-256/global_step256/zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10c68ac2bffff7c6cd1da2bd544a0b27610ccc8747cf54ddfb246cdf24ae5f3b +size 191825901 diff --git a/checkpoint-256/global_step256/zero_pp_rank_1_mp_rank_00_model_states.pt b/checkpoint-256/global_step256/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a588b63eadd1126c02cf1b9faca2ef84a2e6a2d3 --- /dev/null +++ b/checkpoint-256/global_step256/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3144132b88f2cfa0c1803ff5ffa79ab3b81f0b89322c27f987f899c8ff3914e +size 775138 diff --git 
a/checkpoint-256/global_step256/zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-256/global_step256/zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f69c29ad364039e46e0de66daf775c20de9df1e --- /dev/null +++ b/checkpoint-256/global_step256/zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c6d3e5ddc3618ab6c11874ff9c3aaa56b9014dabc1ff40a2dae3695964202c5 +size 191825901 diff --git a/checkpoint-256/global_step256/zero_pp_rank_2_mp_rank_00_model_states.pt b/checkpoint-256/global_step256/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7a5ff1efbf48f68803ee715c14d85ba5c3570a4f --- /dev/null +++ b/checkpoint-256/global_step256/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20274aa80392ac7f568845aa97ec6813a7365b6d537f0f55ac2a56b0cc84bbe6 +size 775138 diff --git a/checkpoint-256/global_step256/zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-256/global_step256/zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..545d2a89b3975d1e5f41a49772bea0aad92030d9 --- /dev/null +++ b/checkpoint-256/global_step256/zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b7c5723c3fa9c44b9378020f471d5b5ab1102856a7c288397a1809227947d98 +size 191825901 diff --git a/checkpoint-256/global_step256/zero_pp_rank_3_mp_rank_00_model_states.pt b/checkpoint-256/global_step256/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8e45710325a9e9575cdb0d5fb8413b1d7f2b7507 --- /dev/null +++ b/checkpoint-256/global_step256/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:2d07a7fb704a5010ca2b049a5b38e040049ce1bb62dd2ad63debef296372b740 +size 775138 diff --git a/checkpoint-256/global_step256/zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-256/global_step256/zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6de9a83b4ccd83289ae9855f79ccc37e5daf25af --- /dev/null +++ b/checkpoint-256/global_step256/zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16b92dff9100643c9a0e96243df4bc82354374bb04eaa1541637da45bc3a7bf4 +size 191825901 diff --git a/checkpoint-256/latest b/checkpoint-256/latest new file mode 100644 index 0000000000000000000000000000000000000000..b747f9725067064e241a7a3bed90583971af8ad1 --- /dev/null +++ b/checkpoint-256/latest @@ -0,0 +1 @@ +global_step256 \ No newline at end of file diff --git a/checkpoint-256/rng_state_0.pth b/checkpoint-256/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..196981c40334fb7dd951e7d84111eb3493959006 --- /dev/null +++ b/checkpoint-256/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a82dfa4cdf25f3321dc609ee6cef399ec5eb5a776c270e602eed6aa9dbcbae97 +size 14960 diff --git a/checkpoint-256/rng_state_1.pth b/checkpoint-256/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..5c668a886b54bae30d8d32655a29cd71d37f128e --- /dev/null +++ b/checkpoint-256/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a29c2706ec243f7c4af15e34459ed1c098adc1768fbab00b1ca5adede23f126 +size 14960 diff --git a/checkpoint-256/rng_state_2.pth b/checkpoint-256/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..d8d9d67b555eeeddca57f7c8422458b27ea4aad1 --- /dev/null +++ b/checkpoint-256/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:2a9cdb67e15cdb7445043e4cafda66c8a090655a1f350cb975bf3231ca39c36c +size 14960 diff --git a/checkpoint-256/rng_state_3.pth b/checkpoint-256/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..dd171c0d77e1cc5f9e1023225de4bf2e3858ea12 --- /dev/null +++ b/checkpoint-256/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:962c98fdd1ccee0e253d9a2f1d5f6ee58bdbf1d8f2b333298226b9554d259ac0 +size 14960 diff --git a/checkpoint-256/special_tokens_map.json b/checkpoint-256/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..14761dcf1466dc232bd41de9c21d4c617b15755e --- /dev/null +++ b/checkpoint-256/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-256/tokenizer.model b/checkpoint-256/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/checkpoint-256/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/checkpoint-256/tokenizer_config.json b/checkpoint-256/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..26c65df1bf794f101c1dd54c908180dc0d880fe3 --- /dev/null +++ b/checkpoint-256/tokenizer_config.json @@ -0,0 +1,43 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": 
false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/checkpoint-256/trainer_state.json b/checkpoint-256/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ae0074f37d17375ef64e91f32c2e178805ce2899 --- /dev/null +++ b/checkpoint-256/trainer_state.json @@ -0,0 +1,3873 @@ +{ + "best_metric": 0.6575854420661926, + "best_model_checkpoint": "./checkpoints/llava-v1.6-vicuna-13b_anyres/checkpoint-256", + "epoch": 8.0, + "eval_steps": 1.0, + "global_step": 256, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.03125, + "grad_norm": 0.5230235555406132, + "learning_rate": 0.0, + "loss": 1.5809, + "step": 1 + }, + { + "epoch": 0.03125, + "eval_loss": 1.6275018453598022, + "eval_runtime": 82.059, + "eval_samples_per_second": 2.437, + "eval_steps_per_second": 0.305, + "step": 1 + }, + { + "epoch": 0.0625, + "grad_norm": 0.5095402010892089, + "learning_rate": 2e-05, + "loss": 1.4958, + "step": 2 + }, + { + "epoch": 0.0625, + "eval_loss": 1.6275018453598022, + "eval_runtime": 76.5747, + "eval_samples_per_second": 2.612, + "eval_steps_per_second": 0.326, + "step": 2 + }, + { + "epoch": 0.09375, + "grad_norm": 0.4998514282504938, + "learning_rate": 2e-05, + "loss": 1.5552, + "step": 3 + }, + { + "epoch": 0.09375, + "eval_loss": 1.5956931114196777, + 
"eval_runtime": 76.1563, + "eval_samples_per_second": 2.626, + "eval_steps_per_second": 0.328, + "step": 3 + }, + { + "epoch": 0.125, + "grad_norm": 0.4280580315108126, + "learning_rate": 2e-05, + "loss": 1.4846, + "step": 4 + }, + { + "epoch": 0.125, + "eval_loss": 1.5584176778793335, + "eval_runtime": 76.1235, + "eval_samples_per_second": 2.627, + "eval_steps_per_second": 0.328, + "step": 4 + }, + { + "epoch": 0.15625, + "grad_norm": 0.5678499435986384, + "learning_rate": 2e-05, + "loss": 1.5036, + "step": 5 + }, + { + "epoch": 0.15625, + "eval_loss": 1.5207562446594238, + "eval_runtime": 76.1514, + "eval_samples_per_second": 2.626, + "eval_steps_per_second": 0.328, + "step": 5 + }, + { + "epoch": 0.1875, + "grad_norm": 0.5368461657542534, + "learning_rate": 2e-05, + "loss": 1.476, + "step": 6 + }, + { + "epoch": 0.1875, + "eval_loss": 1.4807783365249634, + "eval_runtime": 77.3444, + "eval_samples_per_second": 2.586, + "eval_steps_per_second": 0.323, + "step": 6 + }, + { + "epoch": 0.21875, + "grad_norm": 0.5549950083087136, + "learning_rate": 2e-05, + "loss": 1.4358, + "step": 7 + }, + { + "epoch": 0.21875, + "eval_loss": 1.4411544799804688, + "eval_runtime": 77.066, + "eval_samples_per_second": 2.595, + "eval_steps_per_second": 0.324, + "step": 7 + }, + { + "epoch": 0.25, + "grad_norm": 0.5549950083087136, + "learning_rate": 2e-05, + "loss": 1.4369, + "step": 8 + }, + { + "epoch": 0.25, + "eval_loss": 1.4411544799804688, + "eval_runtime": 77.2807, + "eval_samples_per_second": 2.588, + "eval_steps_per_second": 0.323, + "step": 8 + }, + { + "epoch": 0.28125, + "grad_norm": 0.5292240951443854, + "learning_rate": 2e-05, + "loss": 1.4471, + "step": 9 + }, + { + "epoch": 0.28125, + "eval_loss": 1.4036556482315063, + "eval_runtime": 78.1562, + "eval_samples_per_second": 2.559, + "eval_steps_per_second": 0.32, + "step": 9 + }, + { + "epoch": 0.3125, + "grad_norm": 0.5292240951443854, + "learning_rate": 2e-05, + "loss": 1.3666, + "step": 10 + }, + { + "epoch": 0.3125, + 
"eval_loss": 1.4036556482315063, + "eval_runtime": 77.1645, + "eval_samples_per_second": 2.592, + "eval_steps_per_second": 0.324, + "step": 10 + }, + { + "epoch": 0.34375, + "grad_norm": 0.5292240951443854, + "learning_rate": 2e-05, + "loss": 1.4149, + "step": 11 + }, + { + "epoch": 0.34375, + "eval_loss": 1.4036556482315063, + "eval_runtime": 78.7627, + "eval_samples_per_second": 2.539, + "eval_steps_per_second": 0.317, + "step": 11 + }, + { + "epoch": 0.375, + "grad_norm": 0.684588966714067, + "learning_rate": 2e-05, + "loss": 1.3883, + "step": 12 + }, + { + "epoch": 0.375, + "eval_loss": 1.3679308891296387, + "eval_runtime": 78.4315, + "eval_samples_per_second": 2.55, + "eval_steps_per_second": 0.319, + "step": 12 + }, + { + "epoch": 0.40625, + "grad_norm": 0.6261826769491422, + "learning_rate": 2e-05, + "loss": 1.4271, + "step": 13 + }, + { + "epoch": 0.40625, + "eval_loss": 1.3369851112365723, + "eval_runtime": 78.685, + "eval_samples_per_second": 2.542, + "eval_steps_per_second": 0.318, + "step": 13 + }, + { + "epoch": 0.4375, + "grad_norm": 0.6261826769491422, + "learning_rate": 2e-05, + "loss": 1.2495, + "step": 14 + }, + { + "epoch": 0.4375, + "eval_loss": 1.3369851112365723, + "eval_runtime": 78.0511, + "eval_samples_per_second": 2.562, + "eval_steps_per_second": 0.32, + "step": 14 + }, + { + "epoch": 0.46875, + "grad_norm": 0.6028103951693778, + "learning_rate": 2e-05, + "loss": 1.3513, + "step": 15 + }, + { + "epoch": 0.46875, + "eval_loss": 1.3032653331756592, + "eval_runtime": 78.0271, + "eval_samples_per_second": 2.563, + "eval_steps_per_second": 0.32, + "step": 15 + }, + { + "epoch": 0.5, + "grad_norm": 0.769290402283396, + "learning_rate": 2e-05, + "loss": 1.3117, + "step": 16 + }, + { + "epoch": 0.5, + "eval_loss": 1.2661188840866089, + "eval_runtime": 78.1857, + "eval_samples_per_second": 2.558, + "eval_steps_per_second": 0.32, + "step": 16 + }, + { + "epoch": 0.53125, + "grad_norm": 1.3279338025863765, + "learning_rate": 2e-05, + "loss": 1.2768, 
+ "step": 17 + }, + { + "epoch": 0.53125, + "eval_loss": 1.2299447059631348, + "eval_runtime": 78.2064, + "eval_samples_per_second": 2.557, + "eval_steps_per_second": 0.32, + "step": 17 + }, + { + "epoch": 0.5625, + "grad_norm": 0.7410327159336384, + "learning_rate": 2e-05, + "loss": 1.256, + "step": 18 + }, + { + "epoch": 0.5625, + "eval_loss": 1.2044258117675781, + "eval_runtime": 78.072, + "eval_samples_per_second": 2.562, + "eval_steps_per_second": 0.32, + "step": 18 + }, + { + "epoch": 0.59375, + "grad_norm": 0.44078820770408506, + "learning_rate": 2e-05, + "loss": 1.1252, + "step": 19 + }, + { + "epoch": 0.59375, + "eval_loss": 1.1826122999191284, + "eval_runtime": 78.7312, + "eval_samples_per_second": 2.54, + "eval_steps_per_second": 0.318, + "step": 19 + }, + { + "epoch": 0.625, + "grad_norm": 0.49020841613371097, + "learning_rate": 2e-05, + "loss": 1.2249, + "step": 20 + }, + { + "epoch": 0.625, + "eval_loss": 1.1616511344909668, + "eval_runtime": 78.2736, + "eval_samples_per_second": 2.555, + "eval_steps_per_second": 0.319, + "step": 20 + }, + { + "epoch": 0.65625, + "grad_norm": 0.43031322695269714, + "learning_rate": 2e-05, + "loss": 1.1466, + "step": 21 + }, + { + "epoch": 0.65625, + "eval_loss": 1.1410629749298096, + "eval_runtime": 79.6432, + "eval_samples_per_second": 2.511, + "eval_steps_per_second": 0.314, + "step": 21 + }, + { + "epoch": 0.6875, + "grad_norm": 0.45632085445955545, + "learning_rate": 2e-05, + "loss": 1.1951, + "step": 22 + }, + { + "epoch": 0.6875, + "eval_loss": 1.1204684972763062, + "eval_runtime": 79.0609, + "eval_samples_per_second": 2.53, + "eval_steps_per_second": 0.316, + "step": 22 + }, + { + "epoch": 0.71875, + "grad_norm": 0.40048586945364495, + "learning_rate": 2e-05, + "loss": 1.1826, + "step": 23 + }, + { + "epoch": 0.71875, + "eval_loss": 1.1002545356750488, + "eval_runtime": 82.8578, + "eval_samples_per_second": 2.414, + "eval_steps_per_second": 0.302, + "step": 23 + }, + { + "epoch": 0.75, + "grad_norm": 
0.3703033261027938, + "learning_rate": 2e-05, + "loss": 1.1543, + "step": 24 + }, + { + "epoch": 0.75, + "eval_loss": 1.0805977582931519, + "eval_runtime": 76.1407, + "eval_samples_per_second": 2.627, + "eval_steps_per_second": 0.328, + "step": 24 + }, + { + "epoch": 0.78125, + "grad_norm": 0.3986313105418924, + "learning_rate": 2e-05, + "loss": 1.1046, + "step": 25 + }, + { + "epoch": 0.78125, + "eval_loss": 1.0610157251358032, + "eval_runtime": 76.3083, + "eval_samples_per_second": 2.621, + "eval_steps_per_second": 0.328, + "step": 25 + }, + { + "epoch": 0.8125, + "grad_norm": 0.36265027203577943, + "learning_rate": 2e-05, + "loss": 1.1048, + "step": 26 + }, + { + "epoch": 0.8125, + "eval_loss": 1.0421289205551147, + "eval_runtime": 77.2186, + "eval_samples_per_second": 2.59, + "eval_steps_per_second": 0.324, + "step": 26 + }, + { + "epoch": 0.84375, + "grad_norm": 0.3881748990218768, + "learning_rate": 2e-05, + "loss": 1.0425, + "step": 27 + }, + { + "epoch": 0.84375, + "eval_loss": 1.0240073204040527, + "eval_runtime": 77.8662, + "eval_samples_per_second": 2.569, + "eval_steps_per_second": 0.321, + "step": 27 + }, + { + "epoch": 0.875, + "grad_norm": 0.3734031294324286, + "learning_rate": 2e-05, + "loss": 1.0484, + "step": 28 + }, + { + "epoch": 0.875, + "eval_loss": 1.0066957473754883, + "eval_runtime": 77.269, + "eval_samples_per_second": 2.588, + "eval_steps_per_second": 0.324, + "step": 28 + }, + { + "epoch": 0.90625, + "grad_norm": 0.29695383079342563, + "learning_rate": 2e-05, + "loss": 1.0387, + "step": 29 + }, + { + "epoch": 0.90625, + "eval_loss": 0.9906074404716492, + "eval_runtime": 77.2245, + "eval_samples_per_second": 2.59, + "eval_steps_per_second": 0.324, + "step": 29 + }, + { + "epoch": 0.9375, + "grad_norm": 0.29273146875026623, + "learning_rate": 2e-05, + "loss": 1.0568, + "step": 30 + }, + { + "epoch": 0.9375, + "eval_loss": 0.975755512714386, + "eval_runtime": 78.0056, + "eval_samples_per_second": 2.564, + "eval_steps_per_second": 0.32, + 
"step": 30 + }, + { + "epoch": 0.96875, + "grad_norm": 0.35070440686850546, + "learning_rate": 2e-05, + "loss": 0.9114, + "step": 31 + }, + { + "epoch": 0.96875, + "eval_loss": 0.9615123271942139, + "eval_runtime": 77.9051, + "eval_samples_per_second": 2.567, + "eval_steps_per_second": 0.321, + "step": 31 + }, + { + "epoch": 1.0, + "grad_norm": 0.30846157140439384, + "learning_rate": 2e-05, + "loss": 0.9941, + "step": 32 + }, + { + "epoch": 1.0, + "eval_loss": 0.9480571150779724, + "eval_runtime": 77.2322, + "eval_samples_per_second": 2.59, + "eval_steps_per_second": 0.324, + "step": 32 + }, + { + "epoch": 1.03125, + "grad_norm": 0.2950381371932973, + "learning_rate": 2e-05, + "loss": 1.0297, + "step": 33 + }, + { + "epoch": 1.03125, + "eval_loss": 0.9356330037117004, + "eval_runtime": 81.8443, + "eval_samples_per_second": 2.444, + "eval_steps_per_second": 0.305, + "step": 33 + }, + { + "epoch": 1.0625, + "grad_norm": 0.27080038065834283, + "learning_rate": 2e-05, + "loss": 1.021, + "step": 34 + }, + { + "epoch": 1.0625, + "eval_loss": 0.9245791435241699, + "eval_runtime": 76.2071, + "eval_samples_per_second": 2.624, + "eval_steps_per_second": 0.328, + "step": 34 + }, + { + "epoch": 1.09375, + "grad_norm": 0.23165081252649894, + "learning_rate": 2e-05, + "loss": 1.0366, + "step": 35 + }, + { + "epoch": 1.09375, + "eval_loss": 0.9151126146316528, + "eval_runtime": 77.0412, + "eval_samples_per_second": 2.596, + "eval_steps_per_second": 0.325, + "step": 35 + }, + { + "epoch": 1.125, + "grad_norm": 0.4033780922500775, + "learning_rate": 2e-05, + "loss": 1.0127, + "step": 36 + }, + { + "epoch": 1.125, + "eval_loss": 0.9063960313796997, + "eval_runtime": 76.9327, + "eval_samples_per_second": 2.6, + "eval_steps_per_second": 0.325, + "step": 36 + }, + { + "epoch": 1.15625, + "grad_norm": 0.2398039831439168, + "learning_rate": 2e-05, + "loss": 0.9418, + "step": 37 + }, + { + "epoch": 1.15625, + "eval_loss": 0.8982363939285278, + "eval_runtime": 76.1234, + 
"eval_samples_per_second": 2.627, + "eval_steps_per_second": 0.328, + "step": 37 + }, + { + "epoch": 1.1875, + "grad_norm": 0.28793451241246804, + "learning_rate": 2e-05, + "loss": 0.9643, + "step": 38 + }, + { + "epoch": 1.1875, + "eval_loss": 0.8908895254135132, + "eval_runtime": 76.2877, + "eval_samples_per_second": 2.622, + "eval_steps_per_second": 0.328, + "step": 38 + }, + { + "epoch": 1.21875, + "grad_norm": 0.2927691606307197, + "learning_rate": 2e-05, + "loss": 1.0087, + "step": 39 + }, + { + "epoch": 1.21875, + "eval_loss": 0.8845618367195129, + "eval_runtime": 76.2282, + "eval_samples_per_second": 2.624, + "eval_steps_per_second": 0.328, + "step": 39 + }, + { + "epoch": 1.25, + "grad_norm": 0.26410982001408806, + "learning_rate": 2e-05, + "loss": 0.986, + "step": 40 + }, + { + "epoch": 1.25, + "eval_loss": 0.8784474730491638, + "eval_runtime": 76.2512, + "eval_samples_per_second": 2.623, + "eval_steps_per_second": 0.328, + "step": 40 + }, + { + "epoch": 1.28125, + "grad_norm": 0.29182630949665306, + "learning_rate": 2e-05, + "loss": 0.9711, + "step": 41 + }, + { + "epoch": 1.28125, + "eval_loss": 0.8725223541259766, + "eval_runtime": 77.1229, + "eval_samples_per_second": 2.593, + "eval_steps_per_second": 0.324, + "step": 41 + }, + { + "epoch": 1.3125, + "grad_norm": 0.36402838796832665, + "learning_rate": 2e-05, + "loss": 0.9263, + "step": 42 + }, + { + "epoch": 1.3125, + "eval_loss": 0.8662790060043335, + "eval_runtime": 77.2362, + "eval_samples_per_second": 2.589, + "eval_steps_per_second": 0.324, + "step": 42 + }, + { + "epoch": 1.34375, + "grad_norm": 0.29338184478895163, + "learning_rate": 2e-05, + "loss": 0.8947, + "step": 43 + }, + { + "epoch": 1.34375, + "eval_loss": 0.8600431680679321, + "eval_runtime": 77.1213, + "eval_samples_per_second": 2.593, + "eval_steps_per_second": 0.324, + "step": 43 + }, + { + "epoch": 1.375, + "grad_norm": 0.2201714229702277, + "learning_rate": 2e-05, + "loss": 0.9059, + "step": 44 + }, + { + "epoch": 1.375, + 
"eval_loss": 0.8545799255371094, + "eval_runtime": 77.991, + "eval_samples_per_second": 2.564, + "eval_steps_per_second": 0.321, + "step": 44 + }, + { + "epoch": 1.40625, + "grad_norm": 0.2254966625243654, + "learning_rate": 2e-05, + "loss": 0.8942, + "step": 45 + }, + { + "epoch": 1.40625, + "eval_loss": 0.8497399687767029, + "eval_runtime": 77.2698, + "eval_samples_per_second": 2.588, + "eval_steps_per_second": 0.324, + "step": 45 + }, + { + "epoch": 1.4375, + "grad_norm": 0.21753318432075458, + "learning_rate": 2e-05, + "loss": 0.9376, + "step": 46 + }, + { + "epoch": 1.4375, + "eval_loss": 0.8452473282814026, + "eval_runtime": 77.0568, + "eval_samples_per_second": 2.595, + "eval_steps_per_second": 0.324, + "step": 46 + }, + { + "epoch": 1.46875, + "grad_norm": 0.21449718265972945, + "learning_rate": 2e-05, + "loss": 0.9369, + "step": 47 + }, + { + "epoch": 1.46875, + "eval_loss": 0.841134786605835, + "eval_runtime": 77.225, + "eval_samples_per_second": 2.59, + "eval_steps_per_second": 0.324, + "step": 47 + }, + { + "epoch": 1.5, + "grad_norm": 0.2109063266748924, + "learning_rate": 2e-05, + "loss": 0.8511, + "step": 48 + }, + { + "epoch": 1.5, + "eval_loss": 0.8373770117759705, + "eval_runtime": 76.2309, + "eval_samples_per_second": 2.624, + "eval_steps_per_second": 0.328, + "step": 48 + }, + { + "epoch": 1.53125, + "grad_norm": 0.232838633689838, + "learning_rate": 2e-05, + "loss": 0.8694, + "step": 49 + }, + { + "epoch": 1.53125, + "eval_loss": 0.8338289856910706, + "eval_runtime": 76.277, + "eval_samples_per_second": 2.622, + "eval_steps_per_second": 0.328, + "step": 49 + }, + { + "epoch": 1.5625, + "grad_norm": 0.4189704940803984, + "learning_rate": 2e-05, + "loss": 0.8464, + "step": 50 + }, + { + "epoch": 1.5625, + "eval_loss": 0.8297132849693298, + "eval_runtime": 76.2872, + "eval_samples_per_second": 2.622, + "eval_steps_per_second": 0.328, + "step": 50 + }, + { + "epoch": 1.59375, + "grad_norm": 0.2171618165123276, + "learning_rate": 2e-05, + "loss": 
0.8785, + "step": 51 + }, + { + "epoch": 1.59375, + "eval_loss": 0.8257431983947754, + "eval_runtime": 76.2639, + "eval_samples_per_second": 2.622, + "eval_steps_per_second": 0.328, + "step": 51 + }, + { + "epoch": 1.625, + "grad_norm": 0.21934651037670305, + "learning_rate": 2e-05, + "loss": 0.7645, + "step": 52 + }, + { + "epoch": 1.625, + "eval_loss": 0.8223557472229004, + "eval_runtime": 76.2383, + "eval_samples_per_second": 2.623, + "eval_steps_per_second": 0.328, + "step": 52 + }, + { + "epoch": 1.65625, + "grad_norm": 0.24183530733164746, + "learning_rate": 2e-05, + "loss": 0.9218, + "step": 53 + }, + { + "epoch": 1.65625, + "eval_loss": 0.8189653158187866, + "eval_runtime": 76.9819, + "eval_samples_per_second": 2.598, + "eval_steps_per_second": 0.325, + "step": 53 + }, + { + "epoch": 1.6875, + "grad_norm": 0.23450930244279267, + "learning_rate": 2e-05, + "loss": 0.8896, + "step": 54 + }, + { + "epoch": 1.6875, + "eval_loss": 0.8152530193328857, + "eval_runtime": 76.2378, + "eval_samples_per_second": 2.623, + "eval_steps_per_second": 0.328, + "step": 54 + }, + { + "epoch": 1.71875, + "grad_norm": 0.22081665899796085, + "learning_rate": 2e-05, + "loss": 0.8798, + "step": 55 + }, + { + "epoch": 1.71875, + "eval_loss": 0.8122122287750244, + "eval_runtime": 76.289, + "eval_samples_per_second": 2.622, + "eval_steps_per_second": 0.328, + "step": 55 + }, + { + "epoch": 1.75, + "grad_norm": 0.21311746114111046, + "learning_rate": 2e-05, + "loss": 0.9482, + "step": 56 + }, + { + "epoch": 1.75, + "eval_loss": 0.8092318773269653, + "eval_runtime": 77.8321, + "eval_samples_per_second": 2.57, + "eval_steps_per_second": 0.321, + "step": 56 + }, + { + "epoch": 1.78125, + "grad_norm": 0.2496565307107556, + "learning_rate": 2e-05, + "loss": 0.8917, + "step": 57 + }, + { + "epoch": 1.78125, + "eval_loss": 0.8070546984672546, + "eval_runtime": 77.2651, + "eval_samples_per_second": 2.588, + "eval_steps_per_second": 0.324, + "step": 57 + }, + { + "epoch": 1.8125, + "grad_norm": 
0.2137866456424736, + "learning_rate": 2e-05, + "loss": 0.909, + "step": 58 + }, + { + "epoch": 1.8125, + "eval_loss": 0.8049566745758057, + "eval_runtime": 78.0925, + "eval_samples_per_second": 2.561, + "eval_steps_per_second": 0.32, + "step": 58 + }, + { + "epoch": 1.84375, + "grad_norm": 0.22567502859345095, + "learning_rate": 2e-05, + "loss": 0.8611, + "step": 59 + }, + { + "epoch": 1.84375, + "eval_loss": 0.8028810024261475, + "eval_runtime": 78.0553, + "eval_samples_per_second": 2.562, + "eval_steps_per_second": 0.32, + "step": 59 + }, + { + "epoch": 1.875, + "grad_norm": 0.23303796552302508, + "learning_rate": 2e-05, + "loss": 0.9209, + "step": 60 + }, + { + "epoch": 1.875, + "eval_loss": 0.800568699836731, + "eval_runtime": 78.052, + "eval_samples_per_second": 2.562, + "eval_steps_per_second": 0.32, + "step": 60 + }, + { + "epoch": 1.90625, + "grad_norm": 0.24566727726974544, + "learning_rate": 2e-05, + "loss": 0.8239, + "step": 61 + }, + { + "epoch": 1.90625, + "eval_loss": 0.7976545691490173, + "eval_runtime": 77.3056, + "eval_samples_per_second": 2.587, + "eval_steps_per_second": 0.323, + "step": 61 + }, + { + "epoch": 1.9375, + "grad_norm": 0.23014192522354907, + "learning_rate": 2e-05, + "loss": 0.8814, + "step": 62 + }, + { + "epoch": 1.9375, + "eval_loss": 0.7945474982261658, + "eval_runtime": 77.3398, + "eval_samples_per_second": 2.586, + "eval_steps_per_second": 0.323, + "step": 62 + }, + { + "epoch": 1.96875, + "grad_norm": 0.23042819102671622, + "learning_rate": 2e-05, + "loss": 0.9064, + "step": 63 + }, + { + "epoch": 1.96875, + "eval_loss": 0.7918359637260437, + "eval_runtime": 77.4272, + "eval_samples_per_second": 2.583, + "eval_steps_per_second": 0.323, + "step": 63 + }, + { + "epoch": 2.0, + "grad_norm": 0.23940667173206315, + "learning_rate": 2e-05, + "loss": 0.8658, + "step": 64 + }, + { + "epoch": 2.0, + "eval_loss": 0.7891160845756531, + "eval_runtime": 77.3236, + "eval_samples_per_second": 2.587, + "eval_steps_per_second": 0.323, + 
"step": 64 + }, + { + "epoch": 2.03125, + "grad_norm": 0.22630342930143643, + "learning_rate": 2e-05, + "loss": 0.8403, + "step": 65 + }, + { + "epoch": 2.03125, + "eval_loss": 0.7859742641448975, + "eval_runtime": 77.2001, + "eval_samples_per_second": 2.591, + "eval_steps_per_second": 0.324, + "step": 65 + }, + { + "epoch": 2.0625, + "grad_norm": 0.20949240460260976, + "learning_rate": 2e-05, + "loss": 0.8472, + "step": 66 + }, + { + "epoch": 2.0625, + "eval_loss": 0.7834083437919617, + "eval_runtime": 78.9646, + "eval_samples_per_second": 2.533, + "eval_steps_per_second": 0.317, + "step": 66 + }, + { + "epoch": 2.09375, + "grad_norm": 0.22714400479820654, + "learning_rate": 2e-05, + "loss": 0.841, + "step": 67 + }, + { + "epoch": 2.09375, + "eval_loss": 0.7805308699607849, + "eval_runtime": 78.7552, + "eval_samples_per_second": 2.54, + "eval_steps_per_second": 0.317, + "step": 67 + }, + { + "epoch": 2.125, + "grad_norm": 0.23345123077006047, + "learning_rate": 2e-05, + "loss": 0.9028, + "step": 68 + }, + { + "epoch": 2.125, + "eval_loss": 0.7779514789581299, + "eval_runtime": 78.3387, + "eval_samples_per_second": 2.553, + "eval_steps_per_second": 0.319, + "step": 68 + }, + { + "epoch": 2.15625, + "grad_norm": 0.251841542575211, + "learning_rate": 2e-05, + "loss": 0.8381, + "step": 69 + }, + { + "epoch": 2.15625, + "eval_loss": 0.7756664752960205, + "eval_runtime": 78.3109, + "eval_samples_per_second": 2.554, + "eval_steps_per_second": 0.319, + "step": 69 + }, + { + "epoch": 2.1875, + "grad_norm": 0.23548386839773608, + "learning_rate": 2e-05, + "loss": 0.7914, + "step": 70 + }, + { + "epoch": 2.1875, + "eval_loss": 0.7733604907989502, + "eval_runtime": 78.9712, + "eval_samples_per_second": 2.533, + "eval_steps_per_second": 0.317, + "step": 70 + }, + { + "epoch": 2.21875, + "grad_norm": 0.23262740912668387, + "learning_rate": 2e-05, + "loss": 0.8778, + "step": 71 + }, + { + "epoch": 2.21875, + "eval_loss": 0.771755576133728, + "eval_runtime": 78.2633, + 
"eval_samples_per_second": 2.555, + "eval_steps_per_second": 0.319, + "step": 71 + }, + { + "epoch": 2.25, + "grad_norm": 0.22075289612357513, + "learning_rate": 2e-05, + "loss": 0.7945, + "step": 72 + }, + { + "epoch": 2.25, + "eval_loss": 0.7705450654029846, + "eval_runtime": 78.3151, + "eval_samples_per_second": 2.554, + "eval_steps_per_second": 0.319, + "step": 72 + }, + { + "epoch": 2.28125, + "grad_norm": 0.25520381955936466, + "learning_rate": 2e-05, + "loss": 0.8387, + "step": 73 + }, + { + "epoch": 2.28125, + "eval_loss": 0.7695029973983765, + "eval_runtime": 78.2901, + "eval_samples_per_second": 2.555, + "eval_steps_per_second": 0.319, + "step": 73 + }, + { + "epoch": 2.3125, + "grad_norm": 0.2047305385827267, + "learning_rate": 2e-05, + "loss": 0.8404, + "step": 74 + }, + { + "epoch": 2.3125, + "eval_loss": 0.7684457302093506, + "eval_runtime": 78.3875, + "eval_samples_per_second": 2.551, + "eval_steps_per_second": 0.319, + "step": 74 + }, + { + "epoch": 2.34375, + "grad_norm": 0.2262323045133288, + "learning_rate": 2e-05, + "loss": 0.8811, + "step": 75 + }, + { + "epoch": 2.34375, + "eval_loss": 0.7671162486076355, + "eval_runtime": 78.202, + "eval_samples_per_second": 2.557, + "eval_steps_per_second": 0.32, + "step": 75 + }, + { + "epoch": 2.375, + "grad_norm": 0.21885464923925876, + "learning_rate": 2e-05, + "loss": 0.7942, + "step": 76 + }, + { + "epoch": 2.375, + "eval_loss": 0.7658494710922241, + "eval_runtime": 78.1746, + "eval_samples_per_second": 2.558, + "eval_steps_per_second": 0.32, + "step": 76 + }, + { + "epoch": 2.40625, + "grad_norm": 0.21717306953626966, + "learning_rate": 2e-05, + "loss": 0.8497, + "step": 77 + }, + { + "epoch": 2.40625, + "eval_loss": 0.7642120122909546, + "eval_runtime": 78.2026, + "eval_samples_per_second": 2.557, + "eval_steps_per_second": 0.32, + "step": 77 + }, + { + "epoch": 2.4375, + "grad_norm": 0.2530725583748258, + "learning_rate": 2e-05, + "loss": 0.8584, + "step": 78 + }, + { + "epoch": 2.4375, + 
"eval_loss": 0.7625510692596436, + "eval_runtime": 78.1991, + "eval_samples_per_second": 2.558, + "eval_steps_per_second": 0.32, + "step": 78 + }, + { + "epoch": 2.46875, + "grad_norm": 0.25354787036627263, + "learning_rate": 2e-05, + "loss": 0.8569, + "step": 79 + }, + { + "epoch": 2.46875, + "eval_loss": 0.7616268396377563, + "eval_runtime": 78.2915, + "eval_samples_per_second": 2.555, + "eval_steps_per_second": 0.319, + "step": 79 + }, + { + "epoch": 2.5, + "grad_norm": 0.2800865746664007, + "learning_rate": 2e-05, + "loss": 0.9116, + "step": 80 + }, + { + "epoch": 2.5, + "eval_loss": 0.7603214979171753, + "eval_runtime": 78.2749, + "eval_samples_per_second": 2.555, + "eval_steps_per_second": 0.319, + "step": 80 + }, + { + "epoch": 2.53125, + "grad_norm": 0.268139688449618, + "learning_rate": 2e-05, + "loss": 0.8397, + "step": 81 + }, + { + "epoch": 2.53125, + "eval_loss": 0.7584869265556335, + "eval_runtime": 79.1445, + "eval_samples_per_second": 2.527, + "eval_steps_per_second": 0.316, + "step": 81 + }, + { + "epoch": 2.5625, + "grad_norm": 0.3128648654463789, + "learning_rate": 2e-05, + "loss": 0.8888, + "step": 82 + }, + { + "epoch": 2.5625, + "eval_loss": 0.7566561102867126, + "eval_runtime": 79.2089, + "eval_samples_per_second": 2.525, + "eval_steps_per_second": 0.316, + "step": 82 + }, + { + "epoch": 2.59375, + "grad_norm": 0.2502355211215609, + "learning_rate": 2e-05, + "loss": 0.8346, + "step": 83 + }, + { + "epoch": 2.59375, + "eval_loss": 0.7547345161437988, + "eval_runtime": 79.2691, + "eval_samples_per_second": 2.523, + "eval_steps_per_second": 0.315, + "step": 83 + }, + { + "epoch": 2.625, + "grad_norm": 0.25281184629018644, + "learning_rate": 2e-05, + "loss": 0.795, + "step": 84 + }, + { + "epoch": 2.625, + "eval_loss": 0.7527951598167419, + "eval_runtime": 79.4068, + "eval_samples_per_second": 2.519, + "eval_steps_per_second": 0.315, + "step": 84 + }, + { + "epoch": 2.65625, + "grad_norm": 0.24246729562645003, + "learning_rate": 2e-05, + "loss": 
0.7649, + "step": 85 + }, + { + "epoch": 2.65625, + "eval_loss": 0.7509815096855164, + "eval_runtime": 79.1612, + "eval_samples_per_second": 2.526, + "eval_steps_per_second": 0.316, + "step": 85 + }, + { + "epoch": 2.6875, + "grad_norm": 0.27005475109453947, + "learning_rate": 2e-05, + "loss": 0.7964, + "step": 86 + }, + { + "epoch": 2.6875, + "eval_loss": 0.7485950589179993, + "eval_runtime": 80.0714, + "eval_samples_per_second": 2.498, + "eval_steps_per_second": 0.312, + "step": 86 + }, + { + "epoch": 2.71875, + "grad_norm": 0.2723492355800971, + "learning_rate": 2e-05, + "loss": 0.8117, + "step": 87 + }, + { + "epoch": 2.71875, + "eval_loss": 0.7459420561790466, + "eval_runtime": 79.4075, + "eval_samples_per_second": 2.519, + "eval_steps_per_second": 0.315, + "step": 87 + }, + { + "epoch": 2.75, + "grad_norm": 0.2946493898427159, + "learning_rate": 2e-05, + "loss": 0.8986, + "step": 88 + }, + { + "epoch": 2.75, + "eval_loss": 0.7436455488204956, + "eval_runtime": 79.3721, + "eval_samples_per_second": 2.52, + "eval_steps_per_second": 0.315, + "step": 88 + }, + { + "epoch": 2.78125, + "grad_norm": 0.26411214734213284, + "learning_rate": 2e-05, + "loss": 0.8145, + "step": 89 + }, + { + "epoch": 2.78125, + "eval_loss": 0.7424752712249756, + "eval_runtime": 79.2988, + "eval_samples_per_second": 2.522, + "eval_steps_per_second": 0.315, + "step": 89 + }, + { + "epoch": 2.8125, + "grad_norm": 0.27115747269014817, + "learning_rate": 2e-05, + "loss": 0.8457, + "step": 90 + }, + { + "epoch": 2.8125, + "eval_loss": 0.7416408658027649, + "eval_runtime": 79.4004, + "eval_samples_per_second": 2.519, + "eval_steps_per_second": 0.315, + "step": 90 + }, + { + "epoch": 2.84375, + "grad_norm": 0.25831877964821937, + "learning_rate": 2e-05, + "loss": 0.7568, + "step": 91 + }, + { + "epoch": 2.84375, + "eval_loss": 0.7404463291168213, + "eval_runtime": 81.7767, + "eval_samples_per_second": 2.446, + "eval_steps_per_second": 0.306, + "step": 91 + }, + { + "epoch": 2.875, + "grad_norm": 
0.31273388454942935, + "learning_rate": 2e-05, + "loss": 0.8562, + "step": 92 + }, + { + "epoch": 2.875, + "eval_loss": 0.7384185791015625, + "eval_runtime": 82.3443, + "eval_samples_per_second": 2.429, + "eval_steps_per_second": 0.304, + "step": 92 + }, + { + "epoch": 2.90625, + "grad_norm": 0.2838267071008901, + "learning_rate": 2e-05, + "loss": 0.7869, + "step": 93 + }, + { + "epoch": 2.90625, + "eval_loss": 0.7366807460784912, + "eval_runtime": 82.2622, + "eval_samples_per_second": 2.431, + "eval_steps_per_second": 0.304, + "step": 93 + }, + { + "epoch": 2.9375, + "grad_norm": 0.28625827941831467, + "learning_rate": 2e-05, + "loss": 0.8618, + "step": 94 + }, + { + "epoch": 2.9375, + "eval_loss": 0.7357398867607117, + "eval_runtime": 81.9471, + "eval_samples_per_second": 2.441, + "eval_steps_per_second": 0.305, + "step": 94 + }, + { + "epoch": 2.96875, + "grad_norm": 0.25548002643954326, + "learning_rate": 2e-05, + "loss": 0.8085, + "step": 95 + }, + { + "epoch": 2.96875, + "eval_loss": 0.7356534004211426, + "eval_runtime": 82.1186, + "eval_samples_per_second": 2.436, + "eval_steps_per_second": 0.304, + "step": 95 + }, + { + "epoch": 3.0, + "grad_norm": 0.27081450830961107, + "learning_rate": 2e-05, + "loss": 0.7684, + "step": 96 + }, + { + "epoch": 3.0, + "eval_loss": 0.7346957921981812, + "eval_runtime": 81.5463, + "eval_samples_per_second": 2.453, + "eval_steps_per_second": 0.307, + "step": 96 + }, + { + "epoch": 3.03125, + "grad_norm": 0.2985486737236676, + "learning_rate": 2e-05, + "loss": 0.7274, + "step": 97 + }, + { + "epoch": 3.03125, + "eval_loss": 0.7325752377510071, + "eval_runtime": 81.7804, + "eval_samples_per_second": 2.446, + "eval_steps_per_second": 0.306, + "step": 97 + }, + { + "epoch": 3.0625, + "grad_norm": 0.29149719690624026, + "learning_rate": 2e-05, + "loss": 0.8119, + "step": 98 + }, + { + "epoch": 3.0625, + "eval_loss": 0.7298976182937622, + "eval_runtime": 76.2764, + "eval_samples_per_second": 2.622, + "eval_steps_per_second": 0.328, 
+ "step": 98 + }, + { + "epoch": 3.09375, + "grad_norm": 0.25227859825215865, + "learning_rate": 2e-05, + "loss": 0.7888, + "step": 99 + }, + { + "epoch": 3.09375, + "eval_loss": 0.727373480796814, + "eval_runtime": 76.2418, + "eval_samples_per_second": 2.623, + "eval_steps_per_second": 0.328, + "step": 99 + }, + { + "epoch": 3.125, + "grad_norm": 0.27316954971752555, + "learning_rate": 2e-05, + "loss": 0.8224, + "step": 100 + }, + { + "epoch": 3.125, + "eval_loss": 0.7254325747489929, + "eval_runtime": 76.1474, + "eval_samples_per_second": 2.626, + "eval_steps_per_second": 0.328, + "step": 100 + }, + { + "epoch": 3.15625, + "grad_norm": 0.24239788607957785, + "learning_rate": 2e-05, + "loss": 0.7535, + "step": 101 + }, + { + "epoch": 3.15625, + "eval_loss": 0.724058985710144, + "eval_runtime": 76.2391, + "eval_samples_per_second": 2.623, + "eval_steps_per_second": 0.328, + "step": 101 + }, + { + "epoch": 3.1875, + "grad_norm": 0.25648385925427025, + "learning_rate": 2e-05, + "loss": 0.8195, + "step": 102 + }, + { + "epoch": 3.1875, + "eval_loss": 0.7235870957374573, + "eval_runtime": 76.9134, + "eval_samples_per_second": 2.6, + "eval_steps_per_second": 0.325, + "step": 102 + }, + { + "epoch": 3.21875, + "grad_norm": 0.29620170789161204, + "learning_rate": 2e-05, + "loss": 0.8224, + "step": 103 + }, + { + "epoch": 3.21875, + "eval_loss": 0.7228152751922607, + "eval_runtime": 76.095, + "eval_samples_per_second": 2.628, + "eval_steps_per_second": 0.329, + "step": 103 + }, + { + "epoch": 3.25, + "grad_norm": 0.3484116181139593, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 104 + }, + { + "epoch": 3.25, + "eval_loss": 0.7209363579750061, + "eval_runtime": 76.9377, + "eval_samples_per_second": 2.6, + "eval_steps_per_second": 0.325, + "step": 104 + }, + { + "epoch": 3.28125, + "grad_norm": 0.25212350156184643, + "learning_rate": 2e-05, + "loss": 0.7885, + "step": 105 + }, + { + "epoch": 3.28125, + "eval_loss": 0.7197096347808838, + "eval_runtime": 76.2008, + 
"eval_samples_per_second": 2.625, + "eval_steps_per_second": 0.328, + "step": 105 + }, + { + "epoch": 3.3125, + "grad_norm": 0.264200147608962, + "learning_rate": 2e-05, + "loss": 0.8371, + "step": 106 + }, + { + "epoch": 3.3125, + "eval_loss": 0.7197055220603943, + "eval_runtime": 78.1542, + "eval_samples_per_second": 2.559, + "eval_steps_per_second": 0.32, + "step": 106 + }, + { + "epoch": 3.34375, + "grad_norm": 0.3309431084940201, + "learning_rate": 2e-05, + "loss": 0.6999, + "step": 107 + }, + { + "epoch": 3.34375, + "eval_loss": 0.7187016010284424, + "eval_runtime": 78.4259, + "eval_samples_per_second": 2.55, + "eval_steps_per_second": 0.319, + "step": 107 + }, + { + "epoch": 3.375, + "grad_norm": 0.3131644456919823, + "learning_rate": 2e-05, + "loss": 0.7587, + "step": 108 + }, + { + "epoch": 3.375, + "eval_loss": 0.717018187046051, + "eval_runtime": 78.4558, + "eval_samples_per_second": 2.549, + "eval_steps_per_second": 0.319, + "step": 108 + }, + { + "epoch": 3.40625, + "grad_norm": 0.33527684120780293, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 109 + }, + { + "epoch": 3.40625, + "eval_loss": 0.7147062420845032, + "eval_runtime": 78.2334, + "eval_samples_per_second": 2.556, + "eval_steps_per_second": 0.32, + "step": 109 + }, + { + "epoch": 3.4375, + "grad_norm": 0.29542683956231724, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 110 + }, + { + "epoch": 3.4375, + "eval_loss": 0.7130224704742432, + "eval_runtime": 79.1179, + "eval_samples_per_second": 2.528, + "eval_steps_per_second": 0.316, + "step": 110 + }, + { + "epoch": 3.46875, + "grad_norm": 0.31128698002926114, + "learning_rate": 2e-05, + "loss": 0.8153, + "step": 111 + }, + { + "epoch": 3.46875, + "eval_loss": 0.7120551466941833, + "eval_runtime": 80.292, + "eval_samples_per_second": 2.491, + "eval_steps_per_second": 0.311, + "step": 111 + }, + { + "epoch": 3.5, + "grad_norm": 0.32502558864214215, + "learning_rate": 2e-05, + "loss": 0.8043, + "step": 112 + }, + { + "epoch": 3.5, + 
"eval_loss": 0.7117202877998352, + "eval_runtime": 79.7539, + "eval_samples_per_second": 2.508, + "eval_steps_per_second": 0.313, + "step": 112 + }, + { + "epoch": 3.53125, + "grad_norm": 0.34335720855758517, + "learning_rate": 2e-05, + "loss": 0.871, + "step": 113 + }, + { + "epoch": 3.53125, + "eval_loss": 0.7117029428482056, + "eval_runtime": 80.0281, + "eval_samples_per_second": 2.499, + "eval_steps_per_second": 0.312, + "step": 113 + }, + { + "epoch": 3.5625, + "grad_norm": 0.31951931695644, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 114 + }, + { + "epoch": 3.5625, + "eval_loss": 0.7116554379463196, + "eval_runtime": 79.7209, + "eval_samples_per_second": 2.509, + "eval_steps_per_second": 0.314, + "step": 114 + }, + { + "epoch": 3.59375, + "grad_norm": 0.28067192963874266, + "learning_rate": 2e-05, + "loss": 0.8045, + "step": 115 + }, + { + "epoch": 3.59375, + "eval_loss": 0.7118353843688965, + "eval_runtime": 80.0195, + "eval_samples_per_second": 2.499, + "eval_steps_per_second": 0.312, + "step": 115 + }, + { + "epoch": 3.625, + "grad_norm": 0.2739718257400276, + "learning_rate": 2e-05, + "loss": 0.775, + "step": 116 + }, + { + "epoch": 3.625, + "eval_loss": 0.7122579216957092, + "eval_runtime": 76.2052, + "eval_samples_per_second": 2.624, + "eval_steps_per_second": 0.328, + "step": 116 + }, + { + "epoch": 3.65625, + "grad_norm": 0.31401723658881836, + "learning_rate": 2e-05, + "loss": 0.7826, + "step": 117 + }, + { + "epoch": 3.65625, + "eval_loss": 0.7118574380874634, + "eval_runtime": 76.1509, + "eval_samples_per_second": 2.626, + "eval_steps_per_second": 0.328, + "step": 117 + }, + { + "epoch": 3.6875, + "grad_norm": 0.36925964858634625, + "learning_rate": 2e-05, + "loss": 0.7884, + "step": 118 + }, + { + "epoch": 3.6875, + "eval_loss": 0.710691511631012, + "eval_runtime": 76.2305, + "eval_samples_per_second": 2.624, + "eval_steps_per_second": 0.328, + "step": 118 + }, + { + "epoch": 3.71875, + "grad_norm": 0.3050583880654791, + "learning_rate": 
2e-05, + "loss": 0.8402, + "step": 119 + }, + { + "epoch": 3.71875, + "eval_loss": 0.7096763849258423, + "eval_runtime": 77.0581, + "eval_samples_per_second": 2.595, + "eval_steps_per_second": 0.324, + "step": 119 + }, + { + "epoch": 3.75, + "grad_norm": 0.2648625651290031, + "learning_rate": 2e-05, + "loss": 0.7889, + "step": 120 + }, + { + "epoch": 3.75, + "eval_loss": 0.7094223499298096, + "eval_runtime": 76.1379, + "eval_samples_per_second": 2.627, + "eval_steps_per_second": 0.328, + "step": 120 + }, + { + "epoch": 3.78125, + "grad_norm": 0.3107221696449271, + "learning_rate": 2e-05, + "loss": 0.7615, + "step": 121 + }, + { + "epoch": 3.78125, + "eval_loss": 0.7081363201141357, + "eval_runtime": 76.626, + "eval_samples_per_second": 2.61, + "eval_steps_per_second": 0.326, + "step": 121 + }, + { + "epoch": 3.8125, + "grad_norm": 0.3455151299995048, + "learning_rate": 2e-05, + "loss": 0.8342, + "step": 122 + }, + { + "epoch": 3.8125, + "eval_loss": 0.7063001990318298, + "eval_runtime": 77.0293, + "eval_samples_per_second": 2.596, + "eval_steps_per_second": 0.325, + "step": 122 + }, + { + "epoch": 3.84375, + "grad_norm": 0.28847071926472523, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 123 + }, + { + "epoch": 3.84375, + "eval_loss": 0.7044610381126404, + "eval_runtime": 76.2385, + "eval_samples_per_second": 2.623, + "eval_steps_per_second": 0.328, + "step": 123 + }, + { + "epoch": 3.875, + "grad_norm": 0.26753816515069856, + "learning_rate": 2e-05, + "loss": 0.7653, + "step": 124 + }, + { + "epoch": 3.875, + "eval_loss": 0.7033799886703491, + "eval_runtime": 76.1985, + "eval_samples_per_second": 2.625, + "eval_steps_per_second": 0.328, + "step": 124 + }, + { + "epoch": 3.90625, + "grad_norm": 0.3465046292893005, + "learning_rate": 2e-05, + "loss": 0.8144, + "step": 125 + }, + { + "epoch": 3.90625, + "eval_loss": 0.7021930813789368, + "eval_runtime": 76.2234, + "eval_samples_per_second": 2.624, + "eval_steps_per_second": 0.328, + "step": 125 + }, + { + 
"epoch": 3.9375, + "grad_norm": 0.3451690427620698, + "learning_rate": 2e-05, + "loss": 0.7871, + "step": 126 + }, + { + "epoch": 3.9375, + "eval_loss": 0.7013542652130127, + "eval_runtime": 78.0752, + "eval_samples_per_second": 2.562, + "eval_steps_per_second": 0.32, + "step": 126 + }, + { + "epoch": 3.96875, + "grad_norm": 0.31571858642673567, + "learning_rate": 2e-05, + "loss": 0.7568, + "step": 127 + }, + { + "epoch": 3.96875, + "eval_loss": 0.7007560729980469, + "eval_runtime": 78.3558, + "eval_samples_per_second": 2.552, + "eval_steps_per_second": 0.319, + "step": 127 + }, + { + "epoch": 4.0, + "grad_norm": 0.3247003540270338, + "learning_rate": 2e-05, + "loss": 0.6714, + "step": 128 + }, + { + "epoch": 4.0, + "eval_loss": 0.6999780535697937, + "eval_runtime": 78.9788, + "eval_samples_per_second": 2.532, + "eval_steps_per_second": 0.317, + "step": 128 + }, + { + "epoch": 4.03125, + "grad_norm": 0.2814983490019739, + "learning_rate": 2e-05, + "loss": 0.7797, + "step": 129 + }, + { + "epoch": 4.03125, + "eval_loss": 0.6998200416564941, + "eval_runtime": 78.3093, + "eval_samples_per_second": 2.554, + "eval_steps_per_second": 0.319, + "step": 129 + }, + { + "epoch": 4.0625, + "grad_norm": 0.31961631715145106, + "learning_rate": 2e-05, + "loss": 0.7993, + "step": 130 + }, + { + "epoch": 4.0625, + "eval_loss": 0.6995271444320679, + "eval_runtime": 78.2172, + "eval_samples_per_second": 2.557, + "eval_steps_per_second": 0.32, + "step": 130 + }, + { + "epoch": 4.09375, + "grad_norm": 0.32333364662215863, + "learning_rate": 2e-05, + "loss": 0.7896, + "step": 131 + }, + { + "epoch": 4.09375, + "eval_loss": 0.6992727518081665, + "eval_runtime": 79.0125, + "eval_samples_per_second": 2.531, + "eval_steps_per_second": 0.316, + "step": 131 + }, + { + "epoch": 4.125, + "grad_norm": 0.3255859640449829, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 132 + }, + { + "epoch": 4.125, + "eval_loss": 0.6988572478294373, + "eval_runtime": 79.0, + "eval_samples_per_second": 
2.532, + "eval_steps_per_second": 0.316, + "step": 132 + }, + { + "epoch": 4.15625, + "grad_norm": 0.3307068947429175, + "learning_rate": 2e-05, + "loss": 0.8416, + "step": 133 + }, + { + "epoch": 4.15625, + "eval_loss": 0.6981343030929565, + "eval_runtime": 78.3309, + "eval_samples_per_second": 2.553, + "eval_steps_per_second": 0.319, + "step": 133 + }, + { + "epoch": 4.1875, + "grad_norm": 0.3842303818116732, + "learning_rate": 2e-05, + "loss": 0.7605, + "step": 134 + }, + { + "epoch": 4.1875, + "eval_loss": 0.6968980431556702, + "eval_runtime": 78.5608, + "eval_samples_per_second": 2.546, + "eval_steps_per_second": 0.318, + "step": 134 + }, + { + "epoch": 4.21875, + "grad_norm": 0.331839472419003, + "learning_rate": 2e-05, + "loss": 0.7643, + "step": 135 + }, + { + "epoch": 4.21875, + "eval_loss": 0.6955949664115906, + "eval_runtime": 78.3566, + "eval_samples_per_second": 2.552, + "eval_steps_per_second": 0.319, + "step": 135 + }, + { + "epoch": 4.25, + "grad_norm": 0.31864813130499836, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 136 + }, + { + "epoch": 4.25, + "eval_loss": 0.6951528787612915, + "eval_runtime": 79.7802, + "eval_samples_per_second": 2.507, + "eval_steps_per_second": 0.313, + "step": 136 + }, + { + "epoch": 4.28125, + "grad_norm": 0.352549164434451, + "learning_rate": 2e-05, + "loss": 0.7332, + "step": 137 + }, + { + "epoch": 4.28125, + "eval_loss": 0.6947290897369385, + "eval_runtime": 79.8171, + "eval_samples_per_second": 2.506, + "eval_steps_per_second": 0.313, + "step": 137 + }, + { + "epoch": 4.3125, + "grad_norm": 0.37128812818896284, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 138 + }, + { + "epoch": 4.3125, + "eval_loss": 0.6937370300292969, + "eval_runtime": 79.7782, + "eval_samples_per_second": 2.507, + "eval_steps_per_second": 0.313, + "step": 138 + }, + { + "epoch": 4.34375, + "grad_norm": 0.3348014941412048, + "learning_rate": 2e-05, + "loss": 0.7079, + "step": 139 + }, + { + "epoch": 4.34375, + "eval_loss": 
0.692456066608429, + "eval_runtime": 79.9308, + "eval_samples_per_second": 2.502, + "eval_steps_per_second": 0.313, + "step": 139 + }, + { + "epoch": 4.375, + "grad_norm": 0.34411051658527964, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 140 + }, + { + "epoch": 4.375, + "eval_loss": 0.6915809512138367, + "eval_runtime": 79.943, + "eval_samples_per_second": 2.502, + "eval_steps_per_second": 0.313, + "step": 140 + }, + { + "epoch": 4.40625, + "grad_norm": 0.3373909601921749, + "learning_rate": 2e-05, + "loss": 0.7648, + "step": 141 + }, + { + "epoch": 4.40625, + "eval_loss": 0.6912103295326233, + "eval_runtime": 79.8515, + "eval_samples_per_second": 2.505, + "eval_steps_per_second": 0.313, + "step": 141 + }, + { + "epoch": 4.4375, + "grad_norm": 0.33253827371305456, + "learning_rate": 2e-05, + "loss": 0.7224, + "step": 142 + }, + { + "epoch": 4.4375, + "eval_loss": 0.6912806630134583, + "eval_runtime": 80.6475, + "eval_samples_per_second": 2.48, + "eval_steps_per_second": 0.31, + "step": 142 + }, + { + "epoch": 4.46875, + "grad_norm": 0.38458075172588313, + "learning_rate": 2e-05, + "loss": 0.7261, + "step": 143 + }, + { + "epoch": 4.46875, + "eval_loss": 0.6905419230461121, + "eval_runtime": 80.2606, + "eval_samples_per_second": 2.492, + "eval_steps_per_second": 0.311, + "step": 143 + }, + { + "epoch": 4.5, + "grad_norm": 0.31351962640463144, + "learning_rate": 2e-05, + "loss": 0.6909, + "step": 144 + }, + { + "epoch": 4.5, + "eval_loss": 0.6898491382598877, + "eval_runtime": 79.9965, + "eval_samples_per_second": 2.5, + "eval_steps_per_second": 0.313, + "step": 144 + }, + { + "epoch": 4.53125, + "grad_norm": 0.35474372115704583, + "learning_rate": 2e-05, + "loss": 0.7605, + "step": 145 + }, + { + "epoch": 4.53125, + "eval_loss": 0.6893147230148315, + "eval_runtime": 1475.5758, + "eval_samples_per_second": 0.136, + "eval_steps_per_second": 0.017, + "step": 145 + }, + { + "epoch": 4.5625, + "grad_norm": 0.3479568917421202, + "learning_rate": 2e-05, + "loss": 
0.6638, + "step": 146 + }, + { + "epoch": 4.5625, + "eval_loss": 0.6884538531303406, + "eval_runtime": 84.6835, + "eval_samples_per_second": 2.362, + "eval_steps_per_second": 0.295, + "step": 146 + }, + { + "epoch": 4.59375, + "grad_norm": 0.3421823344428645, + "learning_rate": 2e-05, + "loss": 0.7339, + "step": 147 + }, + { + "epoch": 4.59375, + "eval_loss": 0.6873475909233093, + "eval_runtime": 83.3138, + "eval_samples_per_second": 2.401, + "eval_steps_per_second": 0.3, + "step": 147 + }, + { + "epoch": 4.625, + "grad_norm": 0.3642187020830788, + "learning_rate": 2e-05, + "loss": 0.6825, + "step": 148 + }, + { + "epoch": 4.625, + "eval_loss": 0.6858401298522949, + "eval_runtime": 82.1066, + "eval_samples_per_second": 2.436, + "eval_steps_per_second": 0.304, + "step": 148 + }, + { + "epoch": 4.65625, + "grad_norm": 0.35097547901391785, + "learning_rate": 2e-05, + "loss": 0.7986, + "step": 149 + }, + { + "epoch": 4.65625, + "eval_loss": 0.6848779320716858, + "eval_runtime": 84.4076, + "eval_samples_per_second": 2.369, + "eval_steps_per_second": 0.296, + "step": 149 + }, + { + "epoch": 4.6875, + "grad_norm": 0.3568694843794629, + "learning_rate": 2e-05, + "loss": 0.7176, + "step": 150 + }, + { + "epoch": 4.6875, + "eval_loss": 0.6842290759086609, + "eval_runtime": 82.5945, + "eval_samples_per_second": 2.421, + "eval_steps_per_second": 0.303, + "step": 150 + }, + { + "epoch": 4.71875, + "grad_norm": 0.34258633585260334, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 151 + }, + { + "epoch": 4.71875, + "eval_loss": 0.6838659048080444, + "eval_runtime": 85.9626, + "eval_samples_per_second": 2.327, + "eval_steps_per_second": 0.291, + "step": 151 + }, + { + "epoch": 4.75, + "grad_norm": 0.42319523894659655, + "learning_rate": 2e-05, + "loss": 0.7675, + "step": 152 + }, + { + "epoch": 4.75, + "eval_loss": 0.6830299496650696, + "eval_runtime": 85.7189, + "eval_samples_per_second": 2.333, + "eval_steps_per_second": 0.292, + "step": 152 + }, + { + "epoch": 4.78125, + 
"grad_norm": 0.3632195533127194, + "learning_rate": 2e-05, + "loss": 0.715, + "step": 153 + }, + { + "epoch": 4.78125, + "eval_loss": 0.6826379895210266, + "eval_runtime": 87.8244, + "eval_samples_per_second": 2.277, + "eval_steps_per_second": 0.285, + "step": 153 + }, + { + "epoch": 4.8125, + "grad_norm": 0.3738308004604413, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 154 + }, + { + "epoch": 4.8125, + "eval_loss": 0.6826817393302917, + "eval_runtime": 86.5822, + "eval_samples_per_second": 2.31, + "eval_steps_per_second": 0.289, + "step": 154 + }, + { + "epoch": 4.84375, + "grad_norm": 0.3618696330632776, + "learning_rate": 2e-05, + "loss": 0.6632, + "step": 155 + }, + { + "epoch": 4.84375, + "eval_loss": 0.6827967166900635, + "eval_runtime": 82.1829, + "eval_samples_per_second": 2.434, + "eval_steps_per_second": 0.304, + "step": 155 + }, + { + "epoch": 4.875, + "grad_norm": 0.38901912569992203, + "learning_rate": 2e-05, + "loss": 0.7788, + "step": 156 + }, + { + "epoch": 4.875, + "eval_loss": 0.6821711659431458, + "eval_runtime": 84.4511, + "eval_samples_per_second": 2.368, + "eval_steps_per_second": 0.296, + "step": 156 + }, + { + "epoch": 4.90625, + "grad_norm": 0.3516096507348829, + "learning_rate": 2e-05, + "loss": 0.7794, + "step": 157 + }, + { + "epoch": 4.90625, + "eval_loss": 0.6819837689399719, + "eval_runtime": 84.1594, + "eval_samples_per_second": 2.376, + "eval_steps_per_second": 0.297, + "step": 157 + }, + { + "epoch": 4.9375, + "grad_norm": 0.36066902463794986, + "learning_rate": 2e-05, + "loss": 0.7674, + "step": 158 + }, + { + "epoch": 4.9375, + "eval_loss": 0.6817716956138611, + "eval_runtime": 83.8929, + "eval_samples_per_second": 2.384, + "eval_steps_per_second": 0.298, + "step": 158 + }, + { + "epoch": 4.96875, + "grad_norm": 0.36641784926154175, + "learning_rate": 2e-05, + "loss": 0.7116, + "step": 159 + }, + { + "epoch": 4.96875, + "eval_loss": 0.6816902160644531, + "eval_runtime": 84.4431, + "eval_samples_per_second": 2.368, + 
"eval_steps_per_second": 0.296, + "step": 159 + }, + { + "epoch": 5.0, + "grad_norm": 0.4020716293225933, + "learning_rate": 2e-05, + "loss": 0.7142, + "step": 160 + }, + { + "epoch": 5.0, + "eval_loss": 0.6811469793319702, + "eval_runtime": 86.0681, + "eval_samples_per_second": 2.324, + "eval_steps_per_second": 0.29, + "step": 160 + }, + { + "epoch": 5.03125, + "grad_norm": 0.38360882669254054, + "learning_rate": 2e-05, + "loss": 0.6756, + "step": 161 + }, + { + "epoch": 5.03125, + "eval_loss": 0.6798409223556519, + "eval_runtime": 81.9903, + "eval_samples_per_second": 2.439, + "eval_steps_per_second": 0.305, + "step": 161 + }, + { + "epoch": 5.0625, + "grad_norm": 0.34966156213066135, + "learning_rate": 2e-05, + "loss": 0.827, + "step": 162 + }, + { + "epoch": 5.0625, + "eval_loss": 0.6788859367370605, + "eval_runtime": 76.1753, + "eval_samples_per_second": 2.626, + "eval_steps_per_second": 0.328, + "step": 162 + }, + { + "epoch": 5.09375, + "grad_norm": 0.41140842939901384, + "learning_rate": 2e-05, + "loss": 0.6409, + "step": 163 + }, + { + "epoch": 5.09375, + "eval_loss": 0.6787077188491821, + "eval_runtime": 76.2239, + "eval_samples_per_second": 2.624, + "eval_steps_per_second": 0.328, + "step": 163 + }, + { + "epoch": 5.125, + "grad_norm": 0.4222084070163774, + "learning_rate": 2e-05, + "loss": 0.7774, + "step": 164 + }, + { + "epoch": 5.125, + "eval_loss": 0.6796822547912598, + "eval_runtime": 76.2141, + "eval_samples_per_second": 2.624, + "eval_steps_per_second": 0.328, + "step": 164 + }, + { + "epoch": 5.15625, + "grad_norm": 0.4644454724424921, + "learning_rate": 2e-05, + "loss": 0.6057, + "step": 165 + }, + { + "epoch": 5.15625, + "eval_loss": 0.6794346570968628, + "eval_runtime": 76.3216, + "eval_samples_per_second": 2.62, + "eval_steps_per_second": 0.328, + "step": 165 + }, + { + "epoch": 5.1875, + "grad_norm": 0.46128725263272996, + "learning_rate": 2e-05, + "loss": 0.7158, + "step": 166 + }, + { + "epoch": 5.1875, + "eval_loss": 0.6791612505912781, 
+ "eval_runtime": 78.4909, + "eval_samples_per_second": 2.548, + "eval_steps_per_second": 0.319, + "step": 166 + }, + { + "epoch": 5.21875, + "grad_norm": 0.37300666872025545, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 167 + }, + { + "epoch": 5.21875, + "eval_loss": 0.6788016557693481, + "eval_runtime": 78.5697, + "eval_samples_per_second": 2.546, + "eval_steps_per_second": 0.318, + "step": 167 + }, + { + "epoch": 5.25, + "grad_norm": 0.41454648576180214, + "learning_rate": 2e-05, + "loss": 0.7759, + "step": 168 + }, + { + "epoch": 5.25, + "eval_loss": 0.6787048578262329, + "eval_runtime": 78.5317, + "eval_samples_per_second": 2.547, + "eval_steps_per_second": 0.318, + "step": 168 + }, + { + "epoch": 5.28125, + "grad_norm": 0.40724665091386236, + "learning_rate": 2e-05, + "loss": 0.6944, + "step": 169 + }, + { + "epoch": 5.28125, + "eval_loss": 0.679679811000824, + "eval_runtime": 78.6899, + "eval_samples_per_second": 2.542, + "eval_steps_per_second": 0.318, + "step": 169 + }, + { + "epoch": 5.3125, + "grad_norm": 0.3875110486208986, + "learning_rate": 2e-05, + "loss": 0.6634, + "step": 170 + }, + { + "epoch": 5.3125, + "eval_loss": 0.6819935441017151, + "eval_runtime": 78.3617, + "eval_samples_per_second": 2.552, + "eval_steps_per_second": 0.319, + "step": 170 + }, + { + "epoch": 5.34375, + "grad_norm": 0.47956532155617193, + "learning_rate": 2e-05, + "loss": 0.687, + "step": 171 + }, + { + "epoch": 5.34375, + "eval_loss": 0.6825206875801086, + "eval_runtime": 78.4435, + "eval_samples_per_second": 2.55, + "eval_steps_per_second": 0.319, + "step": 171 + }, + { + "epoch": 5.375, + "grad_norm": 0.4599359590587781, + "learning_rate": 2e-05, + "loss": 0.7718, + "step": 172 + }, + { + "epoch": 5.375, + "eval_loss": 0.6816768050193787, + "eval_runtime": 78.3005, + "eval_samples_per_second": 2.554, + "eval_steps_per_second": 0.319, + "step": 172 + }, + { + "epoch": 5.40625, + "grad_norm": 0.4057490487995386, + "learning_rate": 2e-05, + "loss": 0.7292, + "step": 
173 + }, + { + "epoch": 5.40625, + "eval_loss": 0.6806090474128723, + "eval_runtime": 78.3313, + "eval_samples_per_second": 2.553, + "eval_steps_per_second": 0.319, + "step": 173 + }, + { + "epoch": 5.4375, + "grad_norm": 0.4143979315360467, + "learning_rate": 2e-05, + "loss": 0.7697, + "step": 174 + }, + { + "epoch": 5.4375, + "eval_loss": 0.6795693039894104, + "eval_runtime": 78.4526, + "eval_samples_per_second": 2.549, + "eval_steps_per_second": 0.319, + "step": 174 + }, + { + "epoch": 5.46875, + "grad_norm": 0.4219663662343445, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 175 + }, + { + "epoch": 5.46875, + "eval_loss": 0.6793847680091858, + "eval_runtime": 78.8009, + "eval_samples_per_second": 2.538, + "eval_steps_per_second": 0.317, + "step": 175 + }, + { + "epoch": 5.5, + "grad_norm": 0.4491811321927657, + "learning_rate": 2e-05, + "loss": 0.7004, + "step": 176 + }, + { + "epoch": 5.5, + "eval_loss": 0.6775352358818054, + "eval_runtime": 80.0685, + "eval_samples_per_second": 2.498, + "eval_steps_per_second": 0.312, + "step": 176 + }, + { + "epoch": 5.53125, + "grad_norm": 0.46366516532638885, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 177 + }, + { + "epoch": 5.53125, + "eval_loss": 0.6748698949813843, + "eval_runtime": 80.0487, + "eval_samples_per_second": 2.498, + "eval_steps_per_second": 0.312, + "step": 177 + }, + { + "epoch": 5.5625, + "grad_norm": 0.3815188640227797, + "learning_rate": 2e-05, + "loss": 0.7592, + "step": 178 + }, + { + "epoch": 5.5625, + "eval_loss": 0.6728273034095764, + "eval_runtime": 80.0318, + "eval_samples_per_second": 2.499, + "eval_steps_per_second": 0.312, + "step": 178 + }, + { + "epoch": 5.59375, + "grad_norm": 0.41025429416666304, + "learning_rate": 2e-05, + "loss": 0.6585, + "step": 179 + }, + { + "epoch": 5.59375, + "eval_loss": 0.6718859672546387, + "eval_runtime": 79.8801, + "eval_samples_per_second": 2.504, + "eval_steps_per_second": 0.313, + "step": 179 + }, + { + "epoch": 5.625, + "grad_norm": 
0.40652817592240054, + "learning_rate": 2e-05, + "loss": 0.6611, + "step": 180 + }, + { + "epoch": 5.625, + "eval_loss": 0.6715708374977112, + "eval_runtime": 76.7261, + "eval_samples_per_second": 2.607, + "eval_steps_per_second": 0.326, + "step": 180 + }, + { + "epoch": 5.65625, + "grad_norm": 0.40753961326688415, + "learning_rate": 2e-05, + "loss": 0.6779, + "step": 181 + }, + { + "epoch": 5.65625, + "eval_loss": 0.6719761490821838, + "eval_runtime": 77.0136, + "eval_samples_per_second": 2.597, + "eval_steps_per_second": 0.325, + "step": 181 + }, + { + "epoch": 5.6875, + "grad_norm": 0.4232811980671673, + "learning_rate": 2e-05, + "loss": 0.6475, + "step": 182 + }, + { + "epoch": 5.6875, + "eval_loss": 0.6724664568901062, + "eval_runtime": 76.9731, + "eval_samples_per_second": 2.598, + "eval_steps_per_second": 0.325, + "step": 182 + }, + { + "epoch": 5.71875, + "grad_norm": 0.5132756318549849, + "learning_rate": 2e-05, + "loss": 0.6801, + "step": 183 + }, + { + "epoch": 5.71875, + "eval_loss": 0.6723365783691406, + "eval_runtime": 76.4132, + "eval_samples_per_second": 2.617, + "eval_steps_per_second": 0.327, + "step": 183 + }, + { + "epoch": 5.75, + "grad_norm": 0.43526879230161264, + "learning_rate": 2e-05, + "loss": 0.6673, + "step": 184 + }, + { + "epoch": 5.75, + "eval_loss": 0.672926664352417, + "eval_runtime": 76.1936, + "eval_samples_per_second": 2.625, + "eval_steps_per_second": 0.328, + "step": 184 + }, + { + "epoch": 5.78125, + "grad_norm": 0.46965560853038507, + "learning_rate": 2e-05, + "loss": 0.7074, + "step": 185 + }, + { + "epoch": 5.78125, + "eval_loss": 0.6731134057044983, + "eval_runtime": 76.2345, + "eval_samples_per_second": 2.623, + "eval_steps_per_second": 0.328, + "step": 185 + }, + { + "epoch": 5.8125, + "grad_norm": 0.4733296318676217, + "learning_rate": 2e-05, + "loss": 0.6791, + "step": 186 + }, + { + "epoch": 5.8125, + "eval_loss": 0.6726363301277161, + "eval_runtime": 78.3939, + "eval_samples_per_second": 2.551, + 
"eval_steps_per_second": 0.319, + "step": 186 + }, + { + "epoch": 5.84375, + "grad_norm": 0.4662943253655961, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 187 + }, + { + "epoch": 5.84375, + "eval_loss": 0.6726526021957397, + "eval_runtime": 79.1834, + "eval_samples_per_second": 2.526, + "eval_steps_per_second": 0.316, + "step": 187 + }, + { + "epoch": 5.875, + "grad_norm": 0.4420962889993382, + "learning_rate": 2e-05, + "loss": 0.675, + "step": 188 + }, + { + "epoch": 5.875, + "eval_loss": 0.6727125644683838, + "eval_runtime": 78.252, + "eval_samples_per_second": 2.556, + "eval_steps_per_second": 0.319, + "step": 188 + }, + { + "epoch": 5.90625, + "grad_norm": 0.4345166976944551, + "learning_rate": 2e-05, + "loss": 0.6748, + "step": 189 + }, + { + "epoch": 5.90625, + "eval_loss": 0.6725904941558838, + "eval_runtime": 78.3914, + "eval_samples_per_second": 2.551, + "eval_steps_per_second": 0.319, + "step": 189 + }, + { + "epoch": 5.9375, + "grad_norm": 0.45109463315374526, + "learning_rate": 2e-05, + "loss": 0.7024, + "step": 190 + }, + { + "epoch": 5.9375, + "eval_loss": 0.6718384027481079, + "eval_runtime": 78.4361, + "eval_samples_per_second": 2.55, + "eval_steps_per_second": 0.319, + "step": 190 + }, + { + "epoch": 5.96875, + "grad_norm": 0.42953871838795626, + "learning_rate": 2e-05, + "loss": 0.6904, + "step": 191 + }, + { + "epoch": 5.96875, + "eval_loss": 0.6703083515167236, + "eval_runtime": 78.3863, + "eval_samples_per_second": 2.551, + "eval_steps_per_second": 0.319, + "step": 191 + }, + { + "epoch": 6.0, + "grad_norm": 0.4248607379284984, + "learning_rate": 2e-05, + "loss": 0.6659, + "step": 192 + }, + { + "epoch": 6.0, + "eval_loss": 0.6693080067634583, + "eval_runtime": 78.4373, + "eval_samples_per_second": 2.55, + "eval_steps_per_second": 0.319, + "step": 192 + }, + { + "epoch": 6.03125, + "grad_norm": 0.42839417453459494, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 193 + }, + { + "epoch": 6.03125, + "eval_loss": 0.6689594984054565, 
+ "eval_runtime": 78.4169, + "eval_samples_per_second": 2.55, + "eval_steps_per_second": 0.319, + "step": 193 + }, + { + "epoch": 6.0625, + "grad_norm": 0.4216922788166874, + "learning_rate": 2e-05, + "loss": 0.7189, + "step": 194 + }, + { + "epoch": 6.0625, + "eval_loss": 0.6689300537109375, + "eval_runtime": 78.9793, + "eval_samples_per_second": 2.532, + "eval_steps_per_second": 0.317, + "step": 194 + }, + { + "epoch": 6.09375, + "grad_norm": 0.45199575791858004, + "learning_rate": 2e-05, + "loss": 0.6438, + "step": 195 + }, + { + "epoch": 6.09375, + "eval_loss": 0.6690151691436768, + "eval_runtime": 78.5002, + "eval_samples_per_second": 2.548, + "eval_steps_per_second": 0.318, + "step": 195 + }, + { + "epoch": 6.125, + "grad_norm": 0.4166923177293841, + "learning_rate": 2e-05, + "loss": 0.6885, + "step": 196 + }, + { + "epoch": 6.125, + "eval_loss": 0.6688613891601562, + "eval_runtime": 80.5497, + "eval_samples_per_second": 2.483, + "eval_steps_per_second": 0.31, + "step": 196 + }, + { + "epoch": 6.15625, + "grad_norm": 0.45164281863366285, + "learning_rate": 2e-05, + "loss": 0.7197, + "step": 197 + }, + { + "epoch": 6.15625, + "eval_loss": 0.6687932014465332, + "eval_runtime": 80.1482, + "eval_samples_per_second": 2.495, + "eval_steps_per_second": 0.312, + "step": 197 + }, + { + "epoch": 6.1875, + "grad_norm": 0.45653924787504446, + "learning_rate": 2e-05, + "loss": 0.776, + "step": 198 + }, + { + "epoch": 6.1875, + "eval_loss": 0.6690963506698608, + "eval_runtime": 80.4464, + "eval_samples_per_second": 2.486, + "eval_steps_per_second": 0.311, + "step": 198 + }, + { + "epoch": 6.21875, + "grad_norm": 0.4966562341334706, + "learning_rate": 2e-05, + "loss": 0.6532, + "step": 199 + }, + { + "epoch": 6.21875, + "eval_loss": 0.669116735458374, + "eval_runtime": 79.8294, + "eval_samples_per_second": 2.505, + "eval_steps_per_second": 0.313, + "step": 199 + }, + { + "epoch": 6.25, + "grad_norm": 0.4838469303220975, + "learning_rate": 2e-05, + "loss": 0.6883, + "step": 
200 + }, + { + "epoch": 6.25, + "eval_loss": 0.6693156957626343, + "eval_runtime": 80.25, + "eval_samples_per_second": 2.492, + "eval_steps_per_second": 0.312, + "step": 200 + }, + { + "epoch": 6.28125, + "grad_norm": 0.4836820906895964, + "learning_rate": 2e-05, + "loss": 0.7106, + "step": 201 + }, + { + "epoch": 6.28125, + "eval_loss": 0.6704170107841492, + "eval_runtime": 79.9636, + "eval_samples_per_second": 2.501, + "eval_steps_per_second": 0.313, + "step": 201 + }, + { + "epoch": 6.3125, + "grad_norm": 0.4945855983140219, + "learning_rate": 2e-05, + "loss": 0.6336, + "step": 202 + }, + { + "epoch": 6.3125, + "eval_loss": 0.6708824038505554, + "eval_runtime": 80.8044, + "eval_samples_per_second": 2.475, + "eval_steps_per_second": 0.309, + "step": 202 + }, + { + "epoch": 6.34375, + "grad_norm": 0.44587847230103017, + "learning_rate": 2e-05, + "loss": 0.7811, + "step": 203 + }, + { + "epoch": 6.34375, + "eval_loss": 0.6723968982696533, + "eval_runtime": 80.1715, + "eval_samples_per_second": 2.495, + "eval_steps_per_second": 0.312, + "step": 203 + }, + { + "epoch": 6.375, + "grad_norm": 0.5351063503195825, + "learning_rate": 2e-05, + "loss": 0.6222, + "step": 204 + }, + { + "epoch": 6.375, + "eval_loss": 0.672196626663208, + "eval_runtime": 79.927, + "eval_samples_per_second": 2.502, + "eval_steps_per_second": 0.313, + "step": 204 + }, + { + "epoch": 6.40625, + "grad_norm": 0.4742985088010474, + "learning_rate": 2e-05, + "loss": 0.6157, + "step": 205 + }, + { + "epoch": 6.40625, + "eval_loss": 0.671062171459198, + "eval_runtime": 80.1997, + "eval_samples_per_second": 2.494, + "eval_steps_per_second": 0.312, + "step": 205 + }, + { + "epoch": 6.4375, + "grad_norm": 0.5188882333349506, + "learning_rate": 2e-05, + "loss": 0.6462, + "step": 206 + }, + { + "epoch": 6.4375, + "eval_loss": 0.6701972484588623, + "eval_runtime": 81.6643, + "eval_samples_per_second": 2.449, + "eval_steps_per_second": 0.306, + "step": 206 + }, + { + "epoch": 6.46875, + "grad_norm": 
0.45328063593983603, + "learning_rate": 2e-05, + "loss": 0.7058, + "step": 207 + }, + { + "epoch": 6.46875, + "eval_loss": 0.6699164509773254, + "eval_runtime": 81.2228, + "eval_samples_per_second": 2.462, + "eval_steps_per_second": 0.308, + "step": 207 + }, + { + "epoch": 6.5, + "grad_norm": 0.5197645538332801, + "learning_rate": 2e-05, + "loss": 0.6462, + "step": 208 + }, + { + "epoch": 6.5, + "eval_loss": 0.6702597141265869, + "eval_runtime": 81.1451, + "eval_samples_per_second": 2.465, + "eval_steps_per_second": 0.308, + "step": 208 + }, + { + "epoch": 6.53125, + "grad_norm": 0.5762528184834232, + "learning_rate": 2e-05, + "loss": 0.6259, + "step": 209 + }, + { + "epoch": 6.53125, + "eval_loss": 0.6696366667747498, + "eval_runtime": 81.1643, + "eval_samples_per_second": 2.464, + "eval_steps_per_second": 0.308, + "step": 209 + }, + { + "epoch": 6.5625, + "grad_norm": 0.5249503180293145, + "learning_rate": 2e-05, + "loss": 0.6045, + "step": 210 + }, + { + "epoch": 6.5625, + "eval_loss": 0.6688054800033569, + "eval_runtime": 80.9492, + "eval_samples_per_second": 2.471, + "eval_steps_per_second": 0.309, + "step": 210 + }, + { + "epoch": 6.59375, + "grad_norm": 0.543503888655844, + "learning_rate": 2e-05, + "loss": 0.6496, + "step": 211 + }, + { + "epoch": 6.59375, + "eval_loss": 0.6689916849136353, + "eval_runtime": 81.6473, + "eval_samples_per_second": 2.45, + "eval_steps_per_second": 0.306, + "step": 211 + }, + { + "epoch": 6.625, + "grad_norm": 0.48119553592193554, + "learning_rate": 2e-05, + "loss": 0.6211, + "step": 212 + }, + { + "epoch": 6.625, + "eval_loss": 0.6703050136566162, + "eval_runtime": 81.9207, + "eval_samples_per_second": 2.441, + "eval_steps_per_second": 0.305, + "step": 212 + }, + { + "epoch": 6.65625, + "grad_norm": 0.5153356086819314, + "learning_rate": 2e-05, + "loss": 0.7135, + "step": 213 + }, + { + "epoch": 6.65625, + "eval_loss": 0.6702842116355896, + "eval_runtime": 81.1503, + "eval_samples_per_second": 2.465, + "eval_steps_per_second": 
0.308, + "step": 213 + }, + { + "epoch": 6.6875, + "grad_norm": 0.5249915042825578, + "learning_rate": 2e-05, + "loss": 0.6635, + "step": 214 + }, + { + "epoch": 6.6875, + "eval_loss": 0.6687333583831787, + "eval_runtime": 81.6743, + "eval_samples_per_second": 2.449, + "eval_steps_per_second": 0.306, + "step": 214 + }, + { + "epoch": 6.71875, + "grad_norm": 0.5204840219868723, + "learning_rate": 2e-05, + "loss": 0.6701, + "step": 215 + }, + { + "epoch": 6.71875, + "eval_loss": 0.6657728552818298, + "eval_runtime": 81.106, + "eval_samples_per_second": 2.466, + "eval_steps_per_second": 0.308, + "step": 215 + }, + { + "epoch": 6.75, + "grad_norm": 0.5266935225120133, + "learning_rate": 2e-05, + "loss": 0.6637, + "step": 216 + }, + { + "epoch": 6.75, + "eval_loss": 0.6641908884048462, + "eval_runtime": 82.2613, + "eval_samples_per_second": 2.431, + "eval_steps_per_second": 0.304, + "step": 216 + }, + { + "epoch": 6.78125, + "grad_norm": 0.5438859451742696, + "learning_rate": 2e-05, + "loss": 0.6168, + "step": 217 + }, + { + "epoch": 6.78125, + "eval_loss": 0.6652233600616455, + "eval_runtime": 82.042, + "eval_samples_per_second": 2.438, + "eval_steps_per_second": 0.305, + "step": 217 + }, + { + "epoch": 6.8125, + "grad_norm": 0.5716385253433929, + "learning_rate": 2e-05, + "loss": 0.6062, + "step": 218 + }, + { + "epoch": 6.8125, + "eval_loss": 0.6656240820884705, + "eval_runtime": 81.233, + "eval_samples_per_second": 2.462, + "eval_steps_per_second": 0.308, + "step": 218 + }, + { + "epoch": 6.84375, + "grad_norm": 1.0572787630142522, + "learning_rate": 2e-05, + "loss": 0.7037, + "step": 219 + }, + { + "epoch": 6.84375, + "eval_loss": 0.6645559072494507, + "eval_runtime": 81.2099, + "eval_samples_per_second": 2.463, + "eval_steps_per_second": 0.308, + "step": 219 + }, + { + "epoch": 6.875, + "grad_norm": 0.5924889323251107, + "learning_rate": 2e-05, + "loss": 0.712, + "step": 220 + }, + { + "epoch": 6.875, + "eval_loss": 0.6619111895561218, + "eval_runtime": 81.7826, + 
"eval_samples_per_second": 2.446, + "eval_steps_per_second": 0.306, + "step": 220 + }, + { + "epoch": 6.90625, + "grad_norm": 0.5290576915218269, + "learning_rate": 2e-05, + "loss": 0.6659, + "step": 221 + }, + { + "epoch": 6.90625, + "eval_loss": 0.6609540581703186, + "eval_runtime": 82.9922, + "eval_samples_per_second": 2.41, + "eval_steps_per_second": 0.301, + "step": 221 + }, + { + "epoch": 6.9375, + "grad_norm": 0.5831209517049147, + "learning_rate": 2e-05, + "loss": 0.6547, + "step": 222 + }, + { + "epoch": 6.9375, + "eval_loss": 0.660676896572113, + "eval_runtime": 83.6541, + "eval_samples_per_second": 2.391, + "eval_steps_per_second": 0.299, + "step": 222 + }, + { + "epoch": 6.96875, + "grad_norm": 0.5320966369511158, + "learning_rate": 2e-05, + "loss": 0.6968, + "step": 223 + }, + { + "epoch": 6.96875, + "eval_loss": 0.6618594527244568, + "eval_runtime": 83.1148, + "eval_samples_per_second": 2.406, + "eval_steps_per_second": 0.301, + "step": 223 + }, + { + "epoch": 7.0, + "grad_norm": 0.5829636446837394, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 224 + }, + { + "epoch": 7.0, + "eval_loss": 0.6635661125183105, + "eval_runtime": 82.8183, + "eval_samples_per_second": 2.415, + "eval_steps_per_second": 0.302, + "step": 224 + }, + { + "epoch": 7.03125, + "grad_norm": 0.4975095056459566, + "learning_rate": 2e-05, + "loss": 0.6535, + "step": 225 + }, + { + "epoch": 7.03125, + "eval_loss": 0.6641671657562256, + "eval_runtime": 83.0267, + "eval_samples_per_second": 2.409, + "eval_steps_per_second": 0.301, + "step": 225 + }, + { + "epoch": 7.0625, + "grad_norm": 0.5625698523064815, + "learning_rate": 2e-05, + "loss": 0.6012, + "step": 226 + }, + { + "epoch": 7.0625, + "eval_loss": 0.6639044880867004, + "eval_runtime": 83.3881, + "eval_samples_per_second": 2.398, + "eval_steps_per_second": 0.3, + "step": 226 + }, + { + "epoch": 7.09375, + "grad_norm": 0.5436196850683295, + "learning_rate": 2e-05, + "loss": 0.6485, + "step": 227 + }, + { + "epoch": 7.09375, 
+ "eval_loss": 0.6651788353919983, + "eval_runtime": 82.7096, + "eval_samples_per_second": 2.418, + "eval_steps_per_second": 0.302, + "step": 227 + }, + { + "epoch": 7.125, + "grad_norm": 0.5598906287609361, + "learning_rate": 2e-05, + "loss": 0.6142, + "step": 228 + }, + { + "epoch": 7.125, + "eval_loss": 0.6688636541366577, + "eval_runtime": 82.601, + "eval_samples_per_second": 2.421, + "eval_steps_per_second": 0.303, + "step": 228 + }, + { + "epoch": 7.15625, + "grad_norm": 0.7572979310697923, + "learning_rate": 2e-05, + "loss": 0.6221, + "step": 229 + }, + { + "epoch": 7.15625, + "eval_loss": 0.6699694991111755, + "eval_runtime": 82.6032, + "eval_samples_per_second": 2.421, + "eval_steps_per_second": 0.303, + "step": 229 + }, + { + "epoch": 7.1875, + "grad_norm": 0.6173309690580897, + "learning_rate": 2e-05, + "loss": 0.5919, + "step": 230 + }, + { + "epoch": 7.1875, + "eval_loss": 0.6706527471542358, + "eval_runtime": 82.9732, + "eval_samples_per_second": 2.41, + "eval_steps_per_second": 0.301, + "step": 230 + }, + { + "epoch": 7.21875, + "grad_norm": 0.643241771517866, + "learning_rate": 2e-05, + "loss": 0.7081, + "step": 231 + }, + { + "epoch": 7.21875, + "eval_loss": 0.6700320243835449, + "eval_runtime": 84.5621, + "eval_samples_per_second": 2.365, + "eval_steps_per_second": 0.296, + "step": 231 + }, + { + "epoch": 7.25, + "grad_norm": 0.577638137570571, + "learning_rate": 2e-05, + "loss": 0.6873, + "step": 232 + }, + { + "epoch": 7.25, + "eval_loss": 0.669111430644989, + "eval_runtime": 84.5124, + "eval_samples_per_second": 2.367, + "eval_steps_per_second": 0.296, + "step": 232 + }, + { + "epoch": 7.28125, + "grad_norm": 0.7229488296023369, + "learning_rate": 2e-05, + "loss": 0.6301, + "step": 233 + }, + { + "epoch": 7.28125, + "eval_loss": 0.6664154529571533, + "eval_runtime": 84.6437, + "eval_samples_per_second": 2.363, + "eval_steps_per_second": 0.295, + "step": 233 + }, + { + "epoch": 7.3125, + "grad_norm": 0.5827815449039045, + "learning_rate": 2e-05, 
+ "loss": 0.669, + "step": 234 + }, + { + "epoch": 7.3125, + "eval_loss": 0.6641202569007874, + "eval_runtime": 84.489, + "eval_samples_per_second": 2.367, + "eval_steps_per_second": 0.296, + "step": 234 + }, + { + "epoch": 7.34375, + "grad_norm": 0.57507354017269, + "learning_rate": 2e-05, + "loss": 0.6474, + "step": 235 + }, + { + "epoch": 7.34375, + "eval_loss": 0.6623325347900391, + "eval_runtime": 84.5536, + "eval_samples_per_second": 2.365, + "eval_steps_per_second": 0.296, + "step": 235 + }, + { + "epoch": 7.375, + "grad_norm": 0.5810844862533651, + "learning_rate": 2e-05, + "loss": 0.6048, + "step": 236 + }, + { + "epoch": 7.375, + "eval_loss": 0.6619194746017456, + "eval_runtime": 84.2296, + "eval_samples_per_second": 2.374, + "eval_steps_per_second": 0.297, + "step": 236 + }, + { + "epoch": 7.40625, + "grad_norm": 0.6075032415813726, + "learning_rate": 2e-05, + "loss": 0.6529, + "step": 237 + }, + { + "epoch": 7.40625, + "eval_loss": 0.6626202464103699, + "eval_runtime": 84.9703, + "eval_samples_per_second": 2.354, + "eval_steps_per_second": 0.294, + "step": 237 + }, + { + "epoch": 7.4375, + "grad_norm": 0.6402642234375245, + "learning_rate": 2e-05, + "loss": 0.6433, + "step": 238 + }, + { + "epoch": 7.4375, + "eval_loss": 0.663289487361908, + "eval_runtime": 84.8924, + "eval_samples_per_second": 2.356, + "eval_steps_per_second": 0.294, + "step": 238 + }, + { + "epoch": 7.46875, + "grad_norm": 0.6335996982657431, + "learning_rate": 2e-05, + "loss": 0.6815, + "step": 239 + }, + { + "epoch": 7.46875, + "eval_loss": 0.6636109948158264, + "eval_runtime": 85.0551, + "eval_samples_per_second": 2.351, + "eval_steps_per_second": 0.294, + "step": 239 + }, + { + "epoch": 7.5, + "grad_norm": 0.5796846795848909, + "learning_rate": 2e-05, + "loss": 0.6236, + "step": 240 + }, + { + "epoch": 7.5, + "eval_loss": 0.6652829051017761, + "eval_runtime": 84.7574, + "eval_samples_per_second": 2.36, + "eval_steps_per_second": 0.295, + "step": 240 + }, + { + "epoch": 7.53125, + 
"grad_norm": 0.5380402145760035, + "learning_rate": 2e-05, + "loss": 0.6564, + "step": 241 + }, + { + "epoch": 7.53125, + "eval_loss": 0.6676375865936279, + "eval_runtime": 86.2058, + "eval_samples_per_second": 2.32, + "eval_steps_per_second": 0.29, + "step": 241 + }, + { + "epoch": 7.5625, + "grad_norm": 0.5964298255824012, + "learning_rate": 2e-05, + "loss": 0.6475, + "step": 242 + }, + { + "epoch": 7.5625, + "eval_loss": 0.6698520183563232, + "eval_runtime": 85.8955, + "eval_samples_per_second": 2.328, + "eval_steps_per_second": 0.291, + "step": 242 + }, + { + "epoch": 7.59375, + "grad_norm": 0.561279296875, + "learning_rate": 2e-05, + "loss": 0.6395, + "step": 243 + }, + { + "epoch": 7.59375, + "eval_loss": 0.6705803871154785, + "eval_runtime": 86.0036, + "eval_samples_per_second": 2.325, + "eval_steps_per_second": 0.291, + "step": 243 + }, + { + "epoch": 7.625, + "grad_norm": 0.6757292755073548, + "learning_rate": 2e-05, + "loss": 0.7074, + "step": 244 + }, + { + "epoch": 7.625, + "eval_loss": 0.6679538488388062, + "eval_runtime": 85.5379, + "eval_samples_per_second": 2.338, + "eval_steps_per_second": 0.292, + "step": 244 + }, + { + "epoch": 7.65625, + "grad_norm": 0.659077163070129, + "learning_rate": 2e-05, + "loss": 0.6078, + "step": 245 + }, + { + "epoch": 7.65625, + "eval_loss": 0.6667564511299133, + "eval_runtime": 85.752, + "eval_samples_per_second": 2.332, + "eval_steps_per_second": 0.292, + "step": 245 + }, + { + "epoch": 7.6875, + "grad_norm": 0.6215405566454576, + "learning_rate": 2e-05, + "loss": 0.6603, + "step": 246 + }, + { + "epoch": 7.6875, + "eval_loss": 0.665945291519165, + "eval_runtime": 92.3086, + "eval_samples_per_second": 2.167, + "eval_steps_per_second": 0.271, + "step": 246 + }, + { + "epoch": 7.71875, + "grad_norm": 0.6130534921490498, + "learning_rate": 2e-05, + "loss": 0.6435, + "step": 247 + }, + { + "epoch": 7.71875, + "eval_loss": 0.6661685109138489, + "eval_runtime": 87.1917, + "eval_samples_per_second": 2.294, + 
"eval_steps_per_second": 0.287, + "step": 247 + }, + { + "epoch": 7.75, + "grad_norm": 0.6025415602868736, + "learning_rate": 2e-05, + "loss": 0.6308, + "step": 248 + }, + { + "epoch": 7.75, + "eval_loss": 0.6658704280853271, + "eval_runtime": 86.8233, + "eval_samples_per_second": 2.304, + "eval_steps_per_second": 0.288, + "step": 248 + }, + { + "epoch": 7.78125, + "grad_norm": 0.6901593792019413, + "learning_rate": 2e-05, + "loss": 0.6777, + "step": 249 + }, + { + "epoch": 7.78125, + "eval_loss": 0.6652414202690125, + "eval_runtime": 86.7625, + "eval_samples_per_second": 2.305, + "eval_steps_per_second": 0.288, + "step": 249 + }, + { + "epoch": 7.8125, + "grad_norm": 0.6436454697341579, + "learning_rate": 2e-05, + "loss": 0.6912, + "step": 250 + }, + { + "epoch": 7.8125, + "eval_loss": 0.6654212474822998, + "eval_runtime": 86.871, + "eval_samples_per_second": 2.302, + "eval_steps_per_second": 0.288, + "step": 250 + }, + { + "epoch": 7.84375, + "grad_norm": 0.649040103024529, + "learning_rate": 2e-05, + "loss": 0.6025, + "step": 251 + }, + { + "epoch": 7.84375, + "eval_loss": 0.6654068231582642, + "eval_runtime": 86.7458, + "eval_samples_per_second": 2.306, + "eval_steps_per_second": 0.288, + "step": 251 + }, + { + "epoch": 7.875, + "grad_norm": 0.6595522131680224, + "learning_rate": 2e-05, + "loss": 0.5973, + "step": 252 + }, + { + "epoch": 7.875, + "eval_loss": 0.6644830107688904, + "eval_runtime": 86.8739, + "eval_samples_per_second": 2.302, + "eval_steps_per_second": 0.288, + "step": 252 + }, + { + "epoch": 7.90625, + "grad_norm": 0.6689891717273936, + "learning_rate": 2e-05, + "loss": 0.687, + "step": 253 + }, + { + "epoch": 7.90625, + "eval_loss": 0.6616199612617493, + "eval_runtime": 86.8222, + "eval_samples_per_second": 2.304, + "eval_steps_per_second": 0.288, + "step": 253 + }, + { + "epoch": 7.9375, + "grad_norm": 0.6306846778314292, + "learning_rate": 2e-05, + "loss": 0.6599, + "step": 254 + }, + { + "epoch": 7.9375, + "eval_loss": 0.6592965126037598, + 
"eval_runtime": 86.8577, + "eval_samples_per_second": 2.303, + "eval_steps_per_second": 0.288, + "step": 254 + }, + { + "epoch": 7.96875, + "grad_norm": 0.6021327993890785, + "learning_rate": 2e-05, + "loss": 0.575, + "step": 255 + }, + { + "epoch": 7.96875, + "eval_loss": 0.6580593585968018, + "eval_runtime": 86.7582, + "eval_samples_per_second": 2.305, + "eval_steps_per_second": 0.288, + "step": 255 + }, + { + "epoch": 8.0, + "grad_norm": 0.6174712675568311, + "learning_rate": 2e-05, + "loss": 0.6341, + "step": 256 + }, + { + "epoch": 8.0, + "eval_loss": 0.6575854420661926, + "eval_runtime": 76.7634, + "eval_samples_per_second": 2.605, + "eval_steps_per_second": 0.326, + "step": 256 + } + ], + "logging_steps": 1.0, + "max_steps": 256, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 5, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 489287119011840.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-256/training_args.bin b/checkpoint-256/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..80ada675dd1830db80d38411cbdff13cd138ca48 --- /dev/null +++ b/checkpoint-256/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f188470aed89e37f0d7f17497d5475eb84bc639c6ba047e7db9629674c365735 +size 8312 diff --git a/checkpoint-256/zero_to_fp32.py b/checkpoint-256/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-256/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, 
"zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = 
zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, 
zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + 
full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + 
wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # 
recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. 
+ + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. 
(one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-320/README.md b/checkpoint-320/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0838e19c978772997ea62c604482bce6c1a8c237 --- /dev/null +++ b/checkpoint-320/README.md @@ -0,0 +1,202 @@ +--- +base_model: liuhaotian/llava-v1.6-vicuna-13b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/checkpoint-320/adapter_config.json b/checkpoint-320/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..0d9e6d1f521fdefcdd69001e3f15155910cbbf70 --- /dev/null +++ b/checkpoint-320/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "liuhaotian/llava-v1.6-vicuna-13b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "q_proj", + "k_proj", + "gate_proj", + "up_proj", + "down_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-320/adapter_model.safetensors b/checkpoint-320/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..484a556d418162e49a42bf701fbc5f757a28bb98 --- /dev/null +++ b/checkpoint-320/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6431e6a5485bf9971a8a505d2ce8ac8f1350546005403146f8f3bab2c6c30f02 +size 65046168 diff --git a/checkpoint-320/global_step320/zero_pp_rank_0_mp_rank_00_model_states.pt b/checkpoint-320/global_step320/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..75c956d3e74c7830dd9731bc3b88e0154a1db0c4 --- /dev/null +++ b/checkpoint-320/global_step320/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:800218503888cfc6c6dacef7580b008ce5c28bc77e6d725edc53eb163eb213f4 +size 775138 diff --git a/checkpoint-320/global_step320/zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-320/global_step320/zero_pp_rank_0_mp_rank_00_optim_states.pt new 
file mode 100644 index 0000000000000000000000000000000000000000..e82e70645c9a178744a8dc23e8cc5fbce7ba8ccc --- /dev/null +++ b/checkpoint-320/global_step320/zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02fd758d84cad0221cd366926fce830c888c4a25795ffa592310e55569f8be57 +size 191825901 diff --git a/checkpoint-320/global_step320/zero_pp_rank_1_mp_rank_00_model_states.pt b/checkpoint-320/global_step320/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0b4567d0cbd31a38caf5de7073a062d6abcf6d32 --- /dev/null +++ b/checkpoint-320/global_step320/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b02e6ef69e3a0ba9ce109c85ba6b611ee8a5dcaa7acfb914f2610b48ae72d5b +size 775138 diff --git a/checkpoint-320/global_step320/zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-320/global_step320/zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d3730bd9d75aefc097b1e5a3abeb2413da1dc9cb --- /dev/null +++ b/checkpoint-320/global_step320/zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6b3cb3c0b9269d0f01562ac97cf7ed7cfd9c803885af1dde56b9a6eda5fd47d +size 191825901 diff --git a/checkpoint-320/global_step320/zero_pp_rank_2_mp_rank_00_model_states.pt b/checkpoint-320/global_step320/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..81c78494a8837ced1c0b0002e749676e251cbdec --- /dev/null +++ b/checkpoint-320/global_step320/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86079448784de87bb5a32ee61f4e61ec37c91808e8572ee637f70dab7710618a +size 775138 diff --git a/checkpoint-320/global_step320/zero_pp_rank_2_mp_rank_00_optim_states.pt 
b/checkpoint-320/global_step320/zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..783b56308c1f4124e732268a794a4da8a08d2018 --- /dev/null +++ b/checkpoint-320/global_step320/zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cde72030703e6207c5148a37bfc458cf75f4464644ba5922dd636d1ed703699 +size 191825901 diff --git a/checkpoint-320/global_step320/zero_pp_rank_3_mp_rank_00_model_states.pt b/checkpoint-320/global_step320/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..de420fe7ee111721efd9fc1753514c891c64b756 --- /dev/null +++ b/checkpoint-320/global_step320/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:975b4b22694bd9e2244d3d9338e3df16410609bec48870a280c085c9d0ea6385 +size 775138 diff --git a/checkpoint-320/global_step320/zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-320/global_step320/zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..85726d1d970e58bdf098e8b232c650a427dbaae0 --- /dev/null +++ b/checkpoint-320/global_step320/zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e7e632b974b0cd0cffd82ac220632d1d22d44d1da09616b219d5f96b07e535c +size 191825901 diff --git a/checkpoint-320/latest b/checkpoint-320/latest new file mode 100644 index 0000000000000000000000000000000000000000..9d535587efdab3121736d8095481e4143f000213 --- /dev/null +++ b/checkpoint-320/latest @@ -0,0 +1 @@ +global_step320 \ No newline at end of file diff --git a/checkpoint-320/rng_state_0.pth b/checkpoint-320/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..a0b9a527d58e9f544523cda61b77080b3d03d824 --- /dev/null +++ b/checkpoint-320/rng_state_0.pth @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:03af8976c83af29b26ac3bdd42a804bb9e4d7c51eff643b3ad188c88c846c088 +size 14960 diff --git a/checkpoint-320/rng_state_1.pth b/checkpoint-320/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3182bd59e6fba7b8b28fe95246df8e1b8a1c9ee2 --- /dev/null +++ b/checkpoint-320/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65a0552a908836fab6a8e6f840ac7d8de6dafa58227414f46353830c2cac6eae +size 14960 diff --git a/checkpoint-320/rng_state_2.pth b/checkpoint-320/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..45b0262f434b63f9d834d4e880b61b53f2dadb0f --- /dev/null +++ b/checkpoint-320/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05eeeac05a2df77ec2d80d022a5d4c0d3a738fa5f3f0c7f6560893b766f6a722 +size 14960 diff --git a/checkpoint-320/rng_state_3.pth b/checkpoint-320/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f3ec2550fa5f351a1500bfa21341cdc5fdb3005a --- /dev/null +++ b/checkpoint-320/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70a680c028586c6979517de8d634e39a4908c3611cac7b62d70f1bcb83f6c594 +size 14960 diff --git a/checkpoint-320/special_tokens_map.json b/checkpoint-320/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..14761dcf1466dc232bd41de9c21d4c617b15755e --- /dev/null +++ b/checkpoint-320/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git 
a/checkpoint-320/tokenizer.model b/checkpoint-320/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/checkpoint-320/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/checkpoint-320/tokenizer_config.json b/checkpoint-320/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..26c65df1bf794f101c1dd54c908180dc0d880fe3 --- /dev/null +++ b/checkpoint-320/tokenizer_config.json @@ -0,0 +1,43 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/checkpoint-320/trainer_state.json b/checkpoint-320/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b98de20abbfc7e1ce6ed2b7e2c979d4d87692b59 --- /dev/null +++ b/checkpoint-320/trainer_state.json @@ -0,0 +1,4833 @@ +{ + "best_metric": 0.6575854420661926, + "best_model_checkpoint": "./checkpoints/llava-v1.6-vicuna-13b_anyres/checkpoint-256", + "epoch": 10.0, + "eval_steps": 1.0, + "global_step": 320, + "is_hyper_param_search": false, + 
"is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.03125, + "grad_norm": 0.5230235555406132, + "learning_rate": 0.0, + "loss": 1.5809, + "step": 1 + }, + { + "epoch": 0.03125, + "eval_loss": 1.6275018453598022, + "eval_runtime": 82.059, + "eval_samples_per_second": 2.437, + "eval_steps_per_second": 0.305, + "step": 1 + }, + { + "epoch": 0.0625, + "grad_norm": 0.5095402010892089, + "learning_rate": 2e-05, + "loss": 1.4958, + "step": 2 + }, + { + "epoch": 0.0625, + "eval_loss": 1.6275018453598022, + "eval_runtime": 76.5747, + "eval_samples_per_second": 2.612, + "eval_steps_per_second": 0.326, + "step": 2 + }, + { + "epoch": 0.09375, + "grad_norm": 0.4998514282504938, + "learning_rate": 2e-05, + "loss": 1.5552, + "step": 3 + }, + { + "epoch": 0.09375, + "eval_loss": 1.5956931114196777, + "eval_runtime": 76.1563, + "eval_samples_per_second": 2.626, + "eval_steps_per_second": 0.328, + "step": 3 + }, + { + "epoch": 0.125, + "grad_norm": 0.4280580315108126, + "learning_rate": 2e-05, + "loss": 1.4846, + "step": 4 + }, + { + "epoch": 0.125, + "eval_loss": 1.5584176778793335, + "eval_runtime": 76.1235, + "eval_samples_per_second": 2.627, + "eval_steps_per_second": 0.328, + "step": 4 + }, + { + "epoch": 0.15625, + "grad_norm": 0.5678499435986384, + "learning_rate": 2e-05, + "loss": 1.5036, + "step": 5 + }, + { + "epoch": 0.15625, + "eval_loss": 1.5207562446594238, + "eval_runtime": 76.1514, + "eval_samples_per_second": 2.626, + "eval_steps_per_second": 0.328, + "step": 5 + }, + { + "epoch": 0.1875, + "grad_norm": 0.5368461657542534, + "learning_rate": 2e-05, + "loss": 1.476, + "step": 6 + }, + { + "epoch": 0.1875, + "eval_loss": 1.4807783365249634, + "eval_runtime": 77.3444, + "eval_samples_per_second": 2.586, + "eval_steps_per_second": 0.323, + "step": 6 + }, + { + "epoch": 0.21875, + "grad_norm": 0.5549950083087136, + "learning_rate": 2e-05, + "loss": 1.4358, + "step": 7 + }, + { + "epoch": 0.21875, + "eval_loss": 
1.4411544799804688, + "eval_runtime": 77.066, + "eval_samples_per_second": 2.595, + "eval_steps_per_second": 0.324, + "step": 7 + }, + { + "epoch": 0.25, + "grad_norm": 0.5549950083087136, + "learning_rate": 2e-05, + "loss": 1.4369, + "step": 8 + }, + { + "epoch": 0.25, + "eval_loss": 1.4411544799804688, + "eval_runtime": 77.2807, + "eval_samples_per_second": 2.588, + "eval_steps_per_second": 0.323, + "step": 8 + }, + { + "epoch": 0.28125, + "grad_norm": 0.5292240951443854, + "learning_rate": 2e-05, + "loss": 1.4471, + "step": 9 + }, + { + "epoch": 0.28125, + "eval_loss": 1.4036556482315063, + "eval_runtime": 78.1562, + "eval_samples_per_second": 2.559, + "eval_steps_per_second": 0.32, + "step": 9 + }, + { + "epoch": 0.3125, + "grad_norm": 0.5292240951443854, + "learning_rate": 2e-05, + "loss": 1.3666, + "step": 10 + }, + { + "epoch": 0.3125, + "eval_loss": 1.4036556482315063, + "eval_runtime": 77.1645, + "eval_samples_per_second": 2.592, + "eval_steps_per_second": 0.324, + "step": 10 + }, + { + "epoch": 0.34375, + "grad_norm": 0.5292240951443854, + "learning_rate": 2e-05, + "loss": 1.4149, + "step": 11 + }, + { + "epoch": 0.34375, + "eval_loss": 1.4036556482315063, + "eval_runtime": 78.7627, + "eval_samples_per_second": 2.539, + "eval_steps_per_second": 0.317, + "step": 11 + }, + { + "epoch": 0.375, + "grad_norm": 0.684588966714067, + "learning_rate": 2e-05, + "loss": 1.3883, + "step": 12 + }, + { + "epoch": 0.375, + "eval_loss": 1.3679308891296387, + "eval_runtime": 78.4315, + "eval_samples_per_second": 2.55, + "eval_steps_per_second": 0.319, + "step": 12 + }, + { + "epoch": 0.40625, + "grad_norm": 0.6261826769491422, + "learning_rate": 2e-05, + "loss": 1.4271, + "step": 13 + }, + { + "epoch": 0.40625, + "eval_loss": 1.3369851112365723, + "eval_runtime": 78.685, + "eval_samples_per_second": 2.542, + "eval_steps_per_second": 0.318, + "step": 13 + }, + { + "epoch": 0.4375, + "grad_norm": 0.6261826769491422, + "learning_rate": 2e-05, + "loss": 1.2495, + "step": 14 + 
}, + { + "epoch": 0.4375, + "eval_loss": 1.3369851112365723, + "eval_runtime": 78.0511, + "eval_samples_per_second": 2.562, + "eval_steps_per_second": 0.32, + "step": 14 + }, + { + "epoch": 0.46875, + "grad_norm": 0.6028103951693778, + "learning_rate": 2e-05, + "loss": 1.3513, + "step": 15 + }, + { + "epoch": 0.46875, + "eval_loss": 1.3032653331756592, + "eval_runtime": 78.0271, + "eval_samples_per_second": 2.563, + "eval_steps_per_second": 0.32, + "step": 15 + }, + { + "epoch": 0.5, + "grad_norm": 0.769290402283396, + "learning_rate": 2e-05, + "loss": 1.3117, + "step": 16 + }, + { + "epoch": 0.5, + "eval_loss": 1.2661188840866089, + "eval_runtime": 78.1857, + "eval_samples_per_second": 2.558, + "eval_steps_per_second": 0.32, + "step": 16 + }, + { + "epoch": 0.53125, + "grad_norm": 1.3279338025863765, + "learning_rate": 2e-05, + "loss": 1.2768, + "step": 17 + }, + { + "epoch": 0.53125, + "eval_loss": 1.2299447059631348, + "eval_runtime": 78.2064, + "eval_samples_per_second": 2.557, + "eval_steps_per_second": 0.32, + "step": 17 + }, + { + "epoch": 0.5625, + "grad_norm": 0.7410327159336384, + "learning_rate": 2e-05, + "loss": 1.256, + "step": 18 + }, + { + "epoch": 0.5625, + "eval_loss": 1.2044258117675781, + "eval_runtime": 78.072, + "eval_samples_per_second": 2.562, + "eval_steps_per_second": 0.32, + "step": 18 + }, + { + "epoch": 0.59375, + "grad_norm": 0.44078820770408506, + "learning_rate": 2e-05, + "loss": 1.1252, + "step": 19 + }, + { + "epoch": 0.59375, + "eval_loss": 1.1826122999191284, + "eval_runtime": 78.7312, + "eval_samples_per_second": 2.54, + "eval_steps_per_second": 0.318, + "step": 19 + }, + { + "epoch": 0.625, + "grad_norm": 0.49020841613371097, + "learning_rate": 2e-05, + "loss": 1.2249, + "step": 20 + }, + { + "epoch": 0.625, + "eval_loss": 1.1616511344909668, + "eval_runtime": 78.2736, + "eval_samples_per_second": 2.555, + "eval_steps_per_second": 0.319, + "step": 20 + }, + { + "epoch": 0.65625, + "grad_norm": 0.43031322695269714, + 
"learning_rate": 2e-05, + "loss": 1.1466, + "step": 21 + }, + { + "epoch": 0.65625, + "eval_loss": 1.1410629749298096, + "eval_runtime": 79.6432, + "eval_samples_per_second": 2.511, + "eval_steps_per_second": 0.314, + "step": 21 + }, + { + "epoch": 0.6875, + "grad_norm": 0.45632085445955545, + "learning_rate": 2e-05, + "loss": 1.1951, + "step": 22 + }, + { + "epoch": 0.6875, + "eval_loss": 1.1204684972763062, + "eval_runtime": 79.0609, + "eval_samples_per_second": 2.53, + "eval_steps_per_second": 0.316, + "step": 22 + }, + { + "epoch": 0.71875, + "grad_norm": 0.40048586945364495, + "learning_rate": 2e-05, + "loss": 1.1826, + "step": 23 + }, + { + "epoch": 0.71875, + "eval_loss": 1.1002545356750488, + "eval_runtime": 82.8578, + "eval_samples_per_second": 2.414, + "eval_steps_per_second": 0.302, + "step": 23 + }, + { + "epoch": 0.75, + "grad_norm": 0.3703033261027938, + "learning_rate": 2e-05, + "loss": 1.1543, + "step": 24 + }, + { + "epoch": 0.75, + "eval_loss": 1.0805977582931519, + "eval_runtime": 76.1407, + "eval_samples_per_second": 2.627, + "eval_steps_per_second": 0.328, + "step": 24 + }, + { + "epoch": 0.78125, + "grad_norm": 0.3986313105418924, + "learning_rate": 2e-05, + "loss": 1.1046, + "step": 25 + }, + { + "epoch": 0.78125, + "eval_loss": 1.0610157251358032, + "eval_runtime": 76.3083, + "eval_samples_per_second": 2.621, + "eval_steps_per_second": 0.328, + "step": 25 + }, + { + "epoch": 0.8125, + "grad_norm": 0.36265027203577943, + "learning_rate": 2e-05, + "loss": 1.1048, + "step": 26 + }, + { + "epoch": 0.8125, + "eval_loss": 1.0421289205551147, + "eval_runtime": 77.2186, + "eval_samples_per_second": 2.59, + "eval_steps_per_second": 0.324, + "step": 26 + }, + { + "epoch": 0.84375, + "grad_norm": 0.3881748990218768, + "learning_rate": 2e-05, + "loss": 1.0425, + "step": 27 + }, + { + "epoch": 0.84375, + "eval_loss": 1.0240073204040527, + "eval_runtime": 77.8662, + "eval_samples_per_second": 2.569, + "eval_steps_per_second": 0.321, + "step": 27 + }, + { 
+ "epoch": 0.875, + "grad_norm": 0.3734031294324286, + "learning_rate": 2e-05, + "loss": 1.0484, + "step": 28 + }, + { + "epoch": 0.875, + "eval_loss": 1.0066957473754883, + "eval_runtime": 77.269, + "eval_samples_per_second": 2.588, + "eval_steps_per_second": 0.324, + "step": 28 + }, + { + "epoch": 0.90625, + "grad_norm": 0.29695383079342563, + "learning_rate": 2e-05, + "loss": 1.0387, + "step": 29 + }, + { + "epoch": 0.90625, + "eval_loss": 0.9906074404716492, + "eval_runtime": 77.2245, + "eval_samples_per_second": 2.59, + "eval_steps_per_second": 0.324, + "step": 29 + }, + { + "epoch": 0.9375, + "grad_norm": 0.29273146875026623, + "learning_rate": 2e-05, + "loss": 1.0568, + "step": 30 + }, + { + "epoch": 0.9375, + "eval_loss": 0.975755512714386, + "eval_runtime": 78.0056, + "eval_samples_per_second": 2.564, + "eval_steps_per_second": 0.32, + "step": 30 + }, + { + "epoch": 0.96875, + "grad_norm": 0.35070440686850546, + "learning_rate": 2e-05, + "loss": 0.9114, + "step": 31 + }, + { + "epoch": 0.96875, + "eval_loss": 0.9615123271942139, + "eval_runtime": 77.9051, + "eval_samples_per_second": 2.567, + "eval_steps_per_second": 0.321, + "step": 31 + }, + { + "epoch": 1.0, + "grad_norm": 0.30846157140439384, + "learning_rate": 2e-05, + "loss": 0.9941, + "step": 32 + }, + { + "epoch": 1.0, + "eval_loss": 0.9480571150779724, + "eval_runtime": 77.2322, + "eval_samples_per_second": 2.59, + "eval_steps_per_second": 0.324, + "step": 32 + }, + { + "epoch": 1.03125, + "grad_norm": 0.2950381371932973, + "learning_rate": 2e-05, + "loss": 1.0297, + "step": 33 + }, + { + "epoch": 1.03125, + "eval_loss": 0.9356330037117004, + "eval_runtime": 81.8443, + "eval_samples_per_second": 2.444, + "eval_steps_per_second": 0.305, + "step": 33 + }, + { + "epoch": 1.0625, + "grad_norm": 0.27080038065834283, + "learning_rate": 2e-05, + "loss": 1.021, + "step": 34 + }, + { + "epoch": 1.0625, + "eval_loss": 0.9245791435241699, + "eval_runtime": 76.2071, + "eval_samples_per_second": 2.624, + 
"eval_steps_per_second": 0.328, + "step": 34 + }, + { + "epoch": 1.09375, + "grad_norm": 0.23165081252649894, + "learning_rate": 2e-05, + "loss": 1.0366, + "step": 35 + }, + { + "epoch": 1.09375, + "eval_loss": 0.9151126146316528, + "eval_runtime": 77.0412, + "eval_samples_per_second": 2.596, + "eval_steps_per_second": 0.325, + "step": 35 + }, + { + "epoch": 1.125, + "grad_norm": 0.4033780922500775, + "learning_rate": 2e-05, + "loss": 1.0127, + "step": 36 + }, + { + "epoch": 1.125, + "eval_loss": 0.9063960313796997, + "eval_runtime": 76.9327, + "eval_samples_per_second": 2.6, + "eval_steps_per_second": 0.325, + "step": 36 + }, + { + "epoch": 1.15625, + "grad_norm": 0.2398039831439168, + "learning_rate": 2e-05, + "loss": 0.9418, + "step": 37 + }, + { + "epoch": 1.15625, + "eval_loss": 0.8982363939285278, + "eval_runtime": 76.1234, + "eval_samples_per_second": 2.627, + "eval_steps_per_second": 0.328, + "step": 37 + }, + { + "epoch": 1.1875, + "grad_norm": 0.28793451241246804, + "learning_rate": 2e-05, + "loss": 0.9643, + "step": 38 + }, + { + "epoch": 1.1875, + "eval_loss": 0.8908895254135132, + "eval_runtime": 76.2877, + "eval_samples_per_second": 2.622, + "eval_steps_per_second": 0.328, + "step": 38 + }, + { + "epoch": 1.21875, + "grad_norm": 0.2927691606307197, + "learning_rate": 2e-05, + "loss": 1.0087, + "step": 39 + }, + { + "epoch": 1.21875, + "eval_loss": 0.8845618367195129, + "eval_runtime": 76.2282, + "eval_samples_per_second": 2.624, + "eval_steps_per_second": 0.328, + "step": 39 + }, + { + "epoch": 1.25, + "grad_norm": 0.26410982001408806, + "learning_rate": 2e-05, + "loss": 0.986, + "step": 40 + }, + { + "epoch": 1.25, + "eval_loss": 0.8784474730491638, + "eval_runtime": 76.2512, + "eval_samples_per_second": 2.623, + "eval_steps_per_second": 0.328, + "step": 40 + }, + { + "epoch": 1.28125, + "grad_norm": 0.29182630949665306, + "learning_rate": 2e-05, + "loss": 0.9711, + "step": 41 + }, + { + "epoch": 1.28125, + "eval_loss": 0.8725223541259766, + 
"eval_runtime": 77.1229, + "eval_samples_per_second": 2.593, + "eval_steps_per_second": 0.324, + "step": 41 + }, + { + "epoch": 1.3125, + "grad_norm": 0.36402838796832665, + "learning_rate": 2e-05, + "loss": 0.9263, + "step": 42 + }, + { + "epoch": 1.3125, + "eval_loss": 0.8662790060043335, + "eval_runtime": 77.2362, + "eval_samples_per_second": 2.589, + "eval_steps_per_second": 0.324, + "step": 42 + }, + { + "epoch": 1.34375, + "grad_norm": 0.29338184478895163, + "learning_rate": 2e-05, + "loss": 0.8947, + "step": 43 + }, + { + "epoch": 1.34375, + "eval_loss": 0.8600431680679321, + "eval_runtime": 77.1213, + "eval_samples_per_second": 2.593, + "eval_steps_per_second": 0.324, + "step": 43 + }, + { + "epoch": 1.375, + "grad_norm": 0.2201714229702277, + "learning_rate": 2e-05, + "loss": 0.9059, + "step": 44 + }, + { + "epoch": 1.375, + "eval_loss": 0.8545799255371094, + "eval_runtime": 77.991, + "eval_samples_per_second": 2.564, + "eval_steps_per_second": 0.321, + "step": 44 + }, + { + "epoch": 1.40625, + "grad_norm": 0.2254966625243654, + "learning_rate": 2e-05, + "loss": 0.8942, + "step": 45 + }, + { + "epoch": 1.40625, + "eval_loss": 0.8497399687767029, + "eval_runtime": 77.2698, + "eval_samples_per_second": 2.588, + "eval_steps_per_second": 0.324, + "step": 45 + }, + { + "epoch": 1.4375, + "grad_norm": 0.21753318432075458, + "learning_rate": 2e-05, + "loss": 0.9376, + "step": 46 + }, + { + "epoch": 1.4375, + "eval_loss": 0.8452473282814026, + "eval_runtime": 77.0568, + "eval_samples_per_second": 2.595, + "eval_steps_per_second": 0.324, + "step": 46 + }, + { + "epoch": 1.46875, + "grad_norm": 0.21449718265972945, + "learning_rate": 2e-05, + "loss": 0.9369, + "step": 47 + }, + { + "epoch": 1.46875, + "eval_loss": 0.841134786605835, + "eval_runtime": 77.225, + "eval_samples_per_second": 2.59, + "eval_steps_per_second": 0.324, + "step": 47 + }, + { + "epoch": 1.5, + "grad_norm": 0.2109063266748924, + "learning_rate": 2e-05, + "loss": 0.8511, + "step": 48 + }, + { + 
"epoch": 1.5, + "eval_loss": 0.8373770117759705, + "eval_runtime": 76.2309, + "eval_samples_per_second": 2.624, + "eval_steps_per_second": 0.328, + "step": 48 + }, + { + "epoch": 1.53125, + "grad_norm": 0.232838633689838, + "learning_rate": 2e-05, + "loss": 0.8694, + "step": 49 + }, + { + "epoch": 1.53125, + "eval_loss": 0.8338289856910706, + "eval_runtime": 76.277, + "eval_samples_per_second": 2.622, + "eval_steps_per_second": 0.328, + "step": 49 + }, + { + "epoch": 1.5625, + "grad_norm": 0.4189704940803984, + "learning_rate": 2e-05, + "loss": 0.8464, + "step": 50 + }, + { + "epoch": 1.5625, + "eval_loss": 0.8297132849693298, + "eval_runtime": 76.2872, + "eval_samples_per_second": 2.622, + "eval_steps_per_second": 0.328, + "step": 50 + }, + { + "epoch": 1.59375, + "grad_norm": 0.2171618165123276, + "learning_rate": 2e-05, + "loss": 0.8785, + "step": 51 + }, + { + "epoch": 1.59375, + "eval_loss": 0.8257431983947754, + "eval_runtime": 76.2639, + "eval_samples_per_second": 2.622, + "eval_steps_per_second": 0.328, + "step": 51 + }, + { + "epoch": 1.625, + "grad_norm": 0.21934651037670305, + "learning_rate": 2e-05, + "loss": 0.7645, + "step": 52 + }, + { + "epoch": 1.625, + "eval_loss": 0.8223557472229004, + "eval_runtime": 76.2383, + "eval_samples_per_second": 2.623, + "eval_steps_per_second": 0.328, + "step": 52 + }, + { + "epoch": 1.65625, + "grad_norm": 0.24183530733164746, + "learning_rate": 2e-05, + "loss": 0.9218, + "step": 53 + }, + { + "epoch": 1.65625, + "eval_loss": 0.8189653158187866, + "eval_runtime": 76.9819, + "eval_samples_per_second": 2.598, + "eval_steps_per_second": 0.325, + "step": 53 + }, + { + "epoch": 1.6875, + "grad_norm": 0.23450930244279267, + "learning_rate": 2e-05, + "loss": 0.8896, + "step": 54 + }, + { + "epoch": 1.6875, + "eval_loss": 0.8152530193328857, + "eval_runtime": 76.2378, + "eval_samples_per_second": 2.623, + "eval_steps_per_second": 0.328, + "step": 54 + }, + { + "epoch": 1.71875, + "grad_norm": 0.22081665899796085, + 
"learning_rate": 2e-05, + "loss": 0.8798, + "step": 55 + }, + { + "epoch": 1.71875, + "eval_loss": 0.8122122287750244, + "eval_runtime": 76.289, + "eval_samples_per_second": 2.622, + "eval_steps_per_second": 0.328, + "step": 55 + }, + { + "epoch": 1.75, + "grad_norm": 0.21311746114111046, + "learning_rate": 2e-05, + "loss": 0.9482, + "step": 56 + }, + { + "epoch": 1.75, + "eval_loss": 0.8092318773269653, + "eval_runtime": 77.8321, + "eval_samples_per_second": 2.57, + "eval_steps_per_second": 0.321, + "step": 56 + }, + { + "epoch": 1.78125, + "grad_norm": 0.2496565307107556, + "learning_rate": 2e-05, + "loss": 0.8917, + "step": 57 + }, + { + "epoch": 1.78125, + "eval_loss": 0.8070546984672546, + "eval_runtime": 77.2651, + "eval_samples_per_second": 2.588, + "eval_steps_per_second": 0.324, + "step": 57 + }, + { + "epoch": 1.8125, + "grad_norm": 0.2137866456424736, + "learning_rate": 2e-05, + "loss": 0.909, + "step": 58 + }, + { + "epoch": 1.8125, + "eval_loss": 0.8049566745758057, + "eval_runtime": 78.0925, + "eval_samples_per_second": 2.561, + "eval_steps_per_second": 0.32, + "step": 58 + }, + { + "epoch": 1.84375, + "grad_norm": 0.22567502859345095, + "learning_rate": 2e-05, + "loss": 0.8611, + "step": 59 + }, + { + "epoch": 1.84375, + "eval_loss": 0.8028810024261475, + "eval_runtime": 78.0553, + "eval_samples_per_second": 2.562, + "eval_steps_per_second": 0.32, + "step": 59 + }, + { + "epoch": 1.875, + "grad_norm": 0.23303796552302508, + "learning_rate": 2e-05, + "loss": 0.9209, + "step": 60 + }, + { + "epoch": 1.875, + "eval_loss": 0.800568699836731, + "eval_runtime": 78.052, + "eval_samples_per_second": 2.562, + "eval_steps_per_second": 0.32, + "step": 60 + }, + { + "epoch": 1.90625, + "grad_norm": 0.24566727726974544, + "learning_rate": 2e-05, + "loss": 0.8239, + "step": 61 + }, + { + "epoch": 1.90625, + "eval_loss": 0.7976545691490173, + "eval_runtime": 77.3056, + "eval_samples_per_second": 2.587, + "eval_steps_per_second": 0.323, + "step": 61 + }, + { + 
"epoch": 1.9375, + "grad_norm": 0.23014192522354907, + "learning_rate": 2e-05, + "loss": 0.8814, + "step": 62 + }, + { + "epoch": 1.9375, + "eval_loss": 0.7945474982261658, + "eval_runtime": 77.3398, + "eval_samples_per_second": 2.586, + "eval_steps_per_second": 0.323, + "step": 62 + }, + { + "epoch": 1.96875, + "grad_norm": 0.23042819102671622, + "learning_rate": 2e-05, + "loss": 0.9064, + "step": 63 + }, + { + "epoch": 1.96875, + "eval_loss": 0.7918359637260437, + "eval_runtime": 77.4272, + "eval_samples_per_second": 2.583, + "eval_steps_per_second": 0.323, + "step": 63 + }, + { + "epoch": 2.0, + "grad_norm": 0.23940667173206315, + "learning_rate": 2e-05, + "loss": 0.8658, + "step": 64 + }, + { + "epoch": 2.0, + "eval_loss": 0.7891160845756531, + "eval_runtime": 77.3236, + "eval_samples_per_second": 2.587, + "eval_steps_per_second": 0.323, + "step": 64 + }, + { + "epoch": 2.03125, + "grad_norm": 0.22630342930143643, + "learning_rate": 2e-05, + "loss": 0.8403, + "step": 65 + }, + { + "epoch": 2.03125, + "eval_loss": 0.7859742641448975, + "eval_runtime": 77.2001, + "eval_samples_per_second": 2.591, + "eval_steps_per_second": 0.324, + "step": 65 + }, + { + "epoch": 2.0625, + "grad_norm": 0.20949240460260976, + "learning_rate": 2e-05, + "loss": 0.8472, + "step": 66 + }, + { + "epoch": 2.0625, + "eval_loss": 0.7834083437919617, + "eval_runtime": 78.9646, + "eval_samples_per_second": 2.533, + "eval_steps_per_second": 0.317, + "step": 66 + }, + { + "epoch": 2.09375, + "grad_norm": 0.22714400479820654, + "learning_rate": 2e-05, + "loss": 0.841, + "step": 67 + }, + { + "epoch": 2.09375, + "eval_loss": 0.7805308699607849, + "eval_runtime": 78.7552, + "eval_samples_per_second": 2.54, + "eval_steps_per_second": 0.317, + "step": 67 + }, + { + "epoch": 2.125, + "grad_norm": 0.23345123077006047, + "learning_rate": 2e-05, + "loss": 0.9028, + "step": 68 + }, + { + "epoch": 2.125, + "eval_loss": 0.7779514789581299, + "eval_runtime": 78.3387, + "eval_samples_per_second": 2.553, + 
"eval_steps_per_second": 0.319, + "step": 68 + }, + { + "epoch": 2.15625, + "grad_norm": 0.251841542575211, + "learning_rate": 2e-05, + "loss": 0.8381, + "step": 69 + }, + { + "epoch": 2.15625, + "eval_loss": 0.7756664752960205, + "eval_runtime": 78.3109, + "eval_samples_per_second": 2.554, + "eval_steps_per_second": 0.319, + "step": 69 + }, + { + "epoch": 2.1875, + "grad_norm": 0.23548386839773608, + "learning_rate": 2e-05, + "loss": 0.7914, + "step": 70 + }, + { + "epoch": 2.1875, + "eval_loss": 0.7733604907989502, + "eval_runtime": 78.9712, + "eval_samples_per_second": 2.533, + "eval_steps_per_second": 0.317, + "step": 70 + }, + { + "epoch": 2.21875, + "grad_norm": 0.23262740912668387, + "learning_rate": 2e-05, + "loss": 0.8778, + "step": 71 + }, + { + "epoch": 2.21875, + "eval_loss": 0.771755576133728, + "eval_runtime": 78.2633, + "eval_samples_per_second": 2.555, + "eval_steps_per_second": 0.319, + "step": 71 + }, + { + "epoch": 2.25, + "grad_norm": 0.22075289612357513, + "learning_rate": 2e-05, + "loss": 0.7945, + "step": 72 + }, + { + "epoch": 2.25, + "eval_loss": 0.7705450654029846, + "eval_runtime": 78.3151, + "eval_samples_per_second": 2.554, + "eval_steps_per_second": 0.319, + "step": 72 + }, + { + "epoch": 2.28125, + "grad_norm": 0.25520381955936466, + "learning_rate": 2e-05, + "loss": 0.8387, + "step": 73 + }, + { + "epoch": 2.28125, + "eval_loss": 0.7695029973983765, + "eval_runtime": 78.2901, + "eval_samples_per_second": 2.555, + "eval_steps_per_second": 0.319, + "step": 73 + }, + { + "epoch": 2.3125, + "grad_norm": 0.2047305385827267, + "learning_rate": 2e-05, + "loss": 0.8404, + "step": 74 + }, + { + "epoch": 2.3125, + "eval_loss": 0.7684457302093506, + "eval_runtime": 78.3875, + "eval_samples_per_second": 2.551, + "eval_steps_per_second": 0.319, + "step": 74 + }, + { + "epoch": 2.34375, + "grad_norm": 0.2262323045133288, + "learning_rate": 2e-05, + "loss": 0.8811, + "step": 75 + }, + { + "epoch": 2.34375, + "eval_loss": 0.7671162486076355, + 
"eval_runtime": 78.202, + "eval_samples_per_second": 2.557, + "eval_steps_per_second": 0.32, + "step": 75 + }, + { + "epoch": 2.375, + "grad_norm": 0.21885464923925876, + "learning_rate": 2e-05, + "loss": 0.7942, + "step": 76 + }, + { + "epoch": 2.375, + "eval_loss": 0.7658494710922241, + "eval_runtime": 78.1746, + "eval_samples_per_second": 2.558, + "eval_steps_per_second": 0.32, + "step": 76 + }, + { + "epoch": 2.40625, + "grad_norm": 0.21717306953626966, + "learning_rate": 2e-05, + "loss": 0.8497, + "step": 77 + }, + { + "epoch": 2.40625, + "eval_loss": 0.7642120122909546, + "eval_runtime": 78.2026, + "eval_samples_per_second": 2.557, + "eval_steps_per_second": 0.32, + "step": 77 + }, + { + "epoch": 2.4375, + "grad_norm": 0.2530725583748258, + "learning_rate": 2e-05, + "loss": 0.8584, + "step": 78 + }, + { + "epoch": 2.4375, + "eval_loss": 0.7625510692596436, + "eval_runtime": 78.1991, + "eval_samples_per_second": 2.558, + "eval_steps_per_second": 0.32, + "step": 78 + }, + { + "epoch": 2.46875, + "grad_norm": 0.25354787036627263, + "learning_rate": 2e-05, + "loss": 0.8569, + "step": 79 + }, + { + "epoch": 2.46875, + "eval_loss": 0.7616268396377563, + "eval_runtime": 78.2915, + "eval_samples_per_second": 2.555, + "eval_steps_per_second": 0.319, + "step": 79 + }, + { + "epoch": 2.5, + "grad_norm": 0.2800865746664007, + "learning_rate": 2e-05, + "loss": 0.9116, + "step": 80 + }, + { + "epoch": 2.5, + "eval_loss": 0.7603214979171753, + "eval_runtime": 78.2749, + "eval_samples_per_second": 2.555, + "eval_steps_per_second": 0.319, + "step": 80 + }, + { + "epoch": 2.53125, + "grad_norm": 0.268139688449618, + "learning_rate": 2e-05, + "loss": 0.8397, + "step": 81 + }, + { + "epoch": 2.53125, + "eval_loss": 0.7584869265556335, + "eval_runtime": 79.1445, + "eval_samples_per_second": 2.527, + "eval_steps_per_second": 0.316, + "step": 81 + }, + { + "epoch": 2.5625, + "grad_norm": 0.3128648654463789, + "learning_rate": 2e-05, + "loss": 0.8888, + "step": 82 + }, + { + 
"epoch": 2.5625, + "eval_loss": 0.7566561102867126, + "eval_runtime": 79.2089, + "eval_samples_per_second": 2.525, + "eval_steps_per_second": 0.316, + "step": 82 + }, + { + "epoch": 2.59375, + "grad_norm": 0.2502355211215609, + "learning_rate": 2e-05, + "loss": 0.8346, + "step": 83 + }, + { + "epoch": 2.59375, + "eval_loss": 0.7547345161437988, + "eval_runtime": 79.2691, + "eval_samples_per_second": 2.523, + "eval_steps_per_second": 0.315, + "step": 83 + }, + { + "epoch": 2.625, + "grad_norm": 0.25281184629018644, + "learning_rate": 2e-05, + "loss": 0.795, + "step": 84 + }, + { + "epoch": 2.625, + "eval_loss": 0.7527951598167419, + "eval_runtime": 79.4068, + "eval_samples_per_second": 2.519, + "eval_steps_per_second": 0.315, + "step": 84 + }, + { + "epoch": 2.65625, + "grad_norm": 0.24246729562645003, + "learning_rate": 2e-05, + "loss": 0.7649, + "step": 85 + }, + { + "epoch": 2.65625, + "eval_loss": 0.7509815096855164, + "eval_runtime": 79.1612, + "eval_samples_per_second": 2.526, + "eval_steps_per_second": 0.316, + "step": 85 + }, + { + "epoch": 2.6875, + "grad_norm": 0.27005475109453947, + "learning_rate": 2e-05, + "loss": 0.7964, + "step": 86 + }, + { + "epoch": 2.6875, + "eval_loss": 0.7485950589179993, + "eval_runtime": 80.0714, + "eval_samples_per_second": 2.498, + "eval_steps_per_second": 0.312, + "step": 86 + }, + { + "epoch": 2.71875, + "grad_norm": 0.2723492355800971, + "learning_rate": 2e-05, + "loss": 0.8117, + "step": 87 + }, + { + "epoch": 2.71875, + "eval_loss": 0.7459420561790466, + "eval_runtime": 79.4075, + "eval_samples_per_second": 2.519, + "eval_steps_per_second": 0.315, + "step": 87 + }, + { + "epoch": 2.75, + "grad_norm": 0.2946493898427159, + "learning_rate": 2e-05, + "loss": 0.8986, + "step": 88 + }, + { + "epoch": 2.75, + "eval_loss": 0.7436455488204956, + "eval_runtime": 79.3721, + "eval_samples_per_second": 2.52, + "eval_steps_per_second": 0.315, + "step": 88 + }, + { + "epoch": 2.78125, + "grad_norm": 0.26411214734213284, + 
"learning_rate": 2e-05, + "loss": 0.8145, + "step": 89 + }, + { + "epoch": 2.78125, + "eval_loss": 0.7424752712249756, + "eval_runtime": 79.2988, + "eval_samples_per_second": 2.522, + "eval_steps_per_second": 0.315, + "step": 89 + }, + { + "epoch": 2.8125, + "grad_norm": 0.27115747269014817, + "learning_rate": 2e-05, + "loss": 0.8457, + "step": 90 + }, + { + "epoch": 2.8125, + "eval_loss": 0.7416408658027649, + "eval_runtime": 79.4004, + "eval_samples_per_second": 2.519, + "eval_steps_per_second": 0.315, + "step": 90 + }, + { + "epoch": 2.84375, + "grad_norm": 0.25831877964821937, + "learning_rate": 2e-05, + "loss": 0.7568, + "step": 91 + }, + { + "epoch": 2.84375, + "eval_loss": 0.7404463291168213, + "eval_runtime": 81.7767, + "eval_samples_per_second": 2.446, + "eval_steps_per_second": 0.306, + "step": 91 + }, + { + "epoch": 2.875, + "grad_norm": 0.31273388454942935, + "learning_rate": 2e-05, + "loss": 0.8562, + "step": 92 + }, + { + "epoch": 2.875, + "eval_loss": 0.7384185791015625, + "eval_runtime": 82.3443, + "eval_samples_per_second": 2.429, + "eval_steps_per_second": 0.304, + "step": 92 + }, + { + "epoch": 2.90625, + "grad_norm": 0.2838267071008901, + "learning_rate": 2e-05, + "loss": 0.7869, + "step": 93 + }, + { + "epoch": 2.90625, + "eval_loss": 0.7366807460784912, + "eval_runtime": 82.2622, + "eval_samples_per_second": 2.431, + "eval_steps_per_second": 0.304, + "step": 93 + }, + { + "epoch": 2.9375, + "grad_norm": 0.28625827941831467, + "learning_rate": 2e-05, + "loss": 0.8618, + "step": 94 + }, + { + "epoch": 2.9375, + "eval_loss": 0.7357398867607117, + "eval_runtime": 81.9471, + "eval_samples_per_second": 2.441, + "eval_steps_per_second": 0.305, + "step": 94 + }, + { + "epoch": 2.96875, + "grad_norm": 0.25548002643954326, + "learning_rate": 2e-05, + "loss": 0.8085, + "step": 95 + }, + { + "epoch": 2.96875, + "eval_loss": 0.7356534004211426, + "eval_runtime": 82.1186, + "eval_samples_per_second": 2.436, + "eval_steps_per_second": 0.304, + "step": 95 + 
}, + { + "epoch": 3.0, + "grad_norm": 0.27081450830961107, + "learning_rate": 2e-05, + "loss": 0.7684, + "step": 96 + }, + { + "epoch": 3.0, + "eval_loss": 0.7346957921981812, + "eval_runtime": 81.5463, + "eval_samples_per_second": 2.453, + "eval_steps_per_second": 0.307, + "step": 96 + }, + { + "epoch": 3.03125, + "grad_norm": 0.2985486737236676, + "learning_rate": 2e-05, + "loss": 0.7274, + "step": 97 + }, + { + "epoch": 3.03125, + "eval_loss": 0.7325752377510071, + "eval_runtime": 81.7804, + "eval_samples_per_second": 2.446, + "eval_steps_per_second": 0.306, + "step": 97 + }, + { + "epoch": 3.0625, + "grad_norm": 0.29149719690624026, + "learning_rate": 2e-05, + "loss": 0.8119, + "step": 98 + }, + { + "epoch": 3.0625, + "eval_loss": 0.7298976182937622, + "eval_runtime": 76.2764, + "eval_samples_per_second": 2.622, + "eval_steps_per_second": 0.328, + "step": 98 + }, + { + "epoch": 3.09375, + "grad_norm": 0.25227859825215865, + "learning_rate": 2e-05, + "loss": 0.7888, + "step": 99 + }, + { + "epoch": 3.09375, + "eval_loss": 0.727373480796814, + "eval_runtime": 76.2418, + "eval_samples_per_second": 2.623, + "eval_steps_per_second": 0.328, + "step": 99 + }, + { + "epoch": 3.125, + "grad_norm": 0.27316954971752555, + "learning_rate": 2e-05, + "loss": 0.8224, + "step": 100 + }, + { + "epoch": 3.125, + "eval_loss": 0.7254325747489929, + "eval_runtime": 76.1474, + "eval_samples_per_second": 2.626, + "eval_steps_per_second": 0.328, + "step": 100 + }, + { + "epoch": 3.15625, + "grad_norm": 0.24239788607957785, + "learning_rate": 2e-05, + "loss": 0.7535, + "step": 101 + }, + { + "epoch": 3.15625, + "eval_loss": 0.724058985710144, + "eval_runtime": 76.2391, + "eval_samples_per_second": 2.623, + "eval_steps_per_second": 0.328, + "step": 101 + }, + { + "epoch": 3.1875, + "grad_norm": 0.25648385925427025, + "learning_rate": 2e-05, + "loss": 0.8195, + "step": 102 + }, + { + "epoch": 3.1875, + "eval_loss": 0.7235870957374573, + "eval_runtime": 76.9134, + 
"eval_samples_per_second": 2.6, + "eval_steps_per_second": 0.325, + "step": 102 + }, + { + "epoch": 3.21875, + "grad_norm": 0.29620170789161204, + "learning_rate": 2e-05, + "loss": 0.8224, + "step": 103 + }, + { + "epoch": 3.21875, + "eval_loss": 0.7228152751922607, + "eval_runtime": 76.095, + "eval_samples_per_second": 2.628, + "eval_steps_per_second": 0.329, + "step": 103 + }, + { + "epoch": 3.25, + "grad_norm": 0.3484116181139593, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 104 + }, + { + "epoch": 3.25, + "eval_loss": 0.7209363579750061, + "eval_runtime": 76.9377, + "eval_samples_per_second": 2.6, + "eval_steps_per_second": 0.325, + "step": 104 + }, + { + "epoch": 3.28125, + "grad_norm": 0.25212350156184643, + "learning_rate": 2e-05, + "loss": 0.7885, + "step": 105 + }, + { + "epoch": 3.28125, + "eval_loss": 0.7197096347808838, + "eval_runtime": 76.2008, + "eval_samples_per_second": 2.625, + "eval_steps_per_second": 0.328, + "step": 105 + }, + { + "epoch": 3.3125, + "grad_norm": 0.264200147608962, + "learning_rate": 2e-05, + "loss": 0.8371, + "step": 106 + }, + { + "epoch": 3.3125, + "eval_loss": 0.7197055220603943, + "eval_runtime": 78.1542, + "eval_samples_per_second": 2.559, + "eval_steps_per_second": 0.32, + "step": 106 + }, + { + "epoch": 3.34375, + "grad_norm": 0.3309431084940201, + "learning_rate": 2e-05, + "loss": 0.6999, + "step": 107 + }, + { + "epoch": 3.34375, + "eval_loss": 0.7187016010284424, + "eval_runtime": 78.4259, + "eval_samples_per_second": 2.55, + "eval_steps_per_second": 0.319, + "step": 107 + }, + { + "epoch": 3.375, + "grad_norm": 0.3131644456919823, + "learning_rate": 2e-05, + "loss": 0.7587, + "step": 108 + }, + { + "epoch": 3.375, + "eval_loss": 0.717018187046051, + "eval_runtime": 78.4558, + "eval_samples_per_second": 2.549, + "eval_steps_per_second": 0.319, + "step": 108 + }, + { + "epoch": 3.40625, + "grad_norm": 0.33527684120780293, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 109 + }, + { + "epoch": 3.40625, + 
"eval_loss": 0.7147062420845032, + "eval_runtime": 78.2334, + "eval_samples_per_second": 2.556, + "eval_steps_per_second": 0.32, + "step": 109 + }, + { + "epoch": 3.4375, + "grad_norm": 0.29542683956231724, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 110 + }, + { + "epoch": 3.4375, + "eval_loss": 0.7130224704742432, + "eval_runtime": 79.1179, + "eval_samples_per_second": 2.528, + "eval_steps_per_second": 0.316, + "step": 110 + }, + { + "epoch": 3.46875, + "grad_norm": 0.31128698002926114, + "learning_rate": 2e-05, + "loss": 0.8153, + "step": 111 + }, + { + "epoch": 3.46875, + "eval_loss": 0.7120551466941833, + "eval_runtime": 80.292, + "eval_samples_per_second": 2.491, + "eval_steps_per_second": 0.311, + "step": 111 + }, + { + "epoch": 3.5, + "grad_norm": 0.32502558864214215, + "learning_rate": 2e-05, + "loss": 0.8043, + "step": 112 + }, + { + "epoch": 3.5, + "eval_loss": 0.7117202877998352, + "eval_runtime": 79.7539, + "eval_samples_per_second": 2.508, + "eval_steps_per_second": 0.313, + "step": 112 + }, + { + "epoch": 3.53125, + "grad_norm": 0.34335720855758517, + "learning_rate": 2e-05, + "loss": 0.871, + "step": 113 + }, + { + "epoch": 3.53125, + "eval_loss": 0.7117029428482056, + "eval_runtime": 80.0281, + "eval_samples_per_second": 2.499, + "eval_steps_per_second": 0.312, + "step": 113 + }, + { + "epoch": 3.5625, + "grad_norm": 0.31951931695644, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 114 + }, + { + "epoch": 3.5625, + "eval_loss": 0.7116554379463196, + "eval_runtime": 79.7209, + "eval_samples_per_second": 2.509, + "eval_steps_per_second": 0.314, + "step": 114 + }, + { + "epoch": 3.59375, + "grad_norm": 0.28067192963874266, + "learning_rate": 2e-05, + "loss": 0.8045, + "step": 115 + }, + { + "epoch": 3.59375, + "eval_loss": 0.7118353843688965, + "eval_runtime": 80.0195, + "eval_samples_per_second": 2.499, + "eval_steps_per_second": 0.312, + "step": 115 + }, + { + "epoch": 3.625, + "grad_norm": 0.2739718257400276, + "learning_rate": 
2e-05, + "loss": 0.775, + "step": 116 + }, + { + "epoch": 3.625, + "eval_loss": 0.7122579216957092, + "eval_runtime": 76.2052, + "eval_samples_per_second": 2.624, + "eval_steps_per_second": 0.328, + "step": 116 + }, + { + "epoch": 3.65625, + "grad_norm": 0.31401723658881836, + "learning_rate": 2e-05, + "loss": 0.7826, + "step": 117 + }, + { + "epoch": 3.65625, + "eval_loss": 0.7118574380874634, + "eval_runtime": 76.1509, + "eval_samples_per_second": 2.626, + "eval_steps_per_second": 0.328, + "step": 117 + }, + { + "epoch": 3.6875, + "grad_norm": 0.36925964858634625, + "learning_rate": 2e-05, + "loss": 0.7884, + "step": 118 + }, + { + "epoch": 3.6875, + "eval_loss": 0.710691511631012, + "eval_runtime": 76.2305, + "eval_samples_per_second": 2.624, + "eval_steps_per_second": 0.328, + "step": 118 + }, + { + "epoch": 3.71875, + "grad_norm": 0.3050583880654791, + "learning_rate": 2e-05, + "loss": 0.8402, + "step": 119 + }, + { + "epoch": 3.71875, + "eval_loss": 0.7096763849258423, + "eval_runtime": 77.0581, + "eval_samples_per_second": 2.595, + "eval_steps_per_second": 0.324, + "step": 119 + }, + { + "epoch": 3.75, + "grad_norm": 0.2648625651290031, + "learning_rate": 2e-05, + "loss": 0.7889, + "step": 120 + }, + { + "epoch": 3.75, + "eval_loss": 0.7094223499298096, + "eval_runtime": 76.1379, + "eval_samples_per_second": 2.627, + "eval_steps_per_second": 0.328, + "step": 120 + }, + { + "epoch": 3.78125, + "grad_norm": 0.3107221696449271, + "learning_rate": 2e-05, + "loss": 0.7615, + "step": 121 + }, + { + "epoch": 3.78125, + "eval_loss": 0.7081363201141357, + "eval_runtime": 76.626, + "eval_samples_per_second": 2.61, + "eval_steps_per_second": 0.326, + "step": 121 + }, + { + "epoch": 3.8125, + "grad_norm": 0.3455151299995048, + "learning_rate": 2e-05, + "loss": 0.8342, + "step": 122 + }, + { + "epoch": 3.8125, + "eval_loss": 0.7063001990318298, + "eval_runtime": 77.0293, + "eval_samples_per_second": 2.596, + "eval_steps_per_second": 0.325, + "step": 122 + }, + { + 
"epoch": 3.84375, + "grad_norm": 0.28847071926472523, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 123 + }, + { + "epoch": 3.84375, + "eval_loss": 0.7044610381126404, + "eval_runtime": 76.2385, + "eval_samples_per_second": 2.623, + "eval_steps_per_second": 0.328, + "step": 123 + }, + { + "epoch": 3.875, + "grad_norm": 0.26753816515069856, + "learning_rate": 2e-05, + "loss": 0.7653, + "step": 124 + }, + { + "epoch": 3.875, + "eval_loss": 0.7033799886703491, + "eval_runtime": 76.1985, + "eval_samples_per_second": 2.625, + "eval_steps_per_second": 0.328, + "step": 124 + }, + { + "epoch": 3.90625, + "grad_norm": 0.3465046292893005, + "learning_rate": 2e-05, + "loss": 0.8144, + "step": 125 + }, + { + "epoch": 3.90625, + "eval_loss": 0.7021930813789368, + "eval_runtime": 76.2234, + "eval_samples_per_second": 2.624, + "eval_steps_per_second": 0.328, + "step": 125 + }, + { + "epoch": 3.9375, + "grad_norm": 0.3451690427620698, + "learning_rate": 2e-05, + "loss": 0.7871, + "step": 126 + }, + { + "epoch": 3.9375, + "eval_loss": 0.7013542652130127, + "eval_runtime": 78.0752, + "eval_samples_per_second": 2.562, + "eval_steps_per_second": 0.32, + "step": 126 + }, + { + "epoch": 3.96875, + "grad_norm": 0.31571858642673567, + "learning_rate": 2e-05, + "loss": 0.7568, + "step": 127 + }, + { + "epoch": 3.96875, + "eval_loss": 0.7007560729980469, + "eval_runtime": 78.3558, + "eval_samples_per_second": 2.552, + "eval_steps_per_second": 0.319, + "step": 127 + }, + { + "epoch": 4.0, + "grad_norm": 0.3247003540270338, + "learning_rate": 2e-05, + "loss": 0.6714, + "step": 128 + }, + { + "epoch": 4.0, + "eval_loss": 0.6999780535697937, + "eval_runtime": 78.9788, + "eval_samples_per_second": 2.532, + "eval_steps_per_second": 0.317, + "step": 128 + }, + { + "epoch": 4.03125, + "grad_norm": 0.2814983490019739, + "learning_rate": 2e-05, + "loss": 0.7797, + "step": 129 + }, + { + "epoch": 4.03125, + "eval_loss": 0.6998200416564941, + "eval_runtime": 78.3093, + 
"eval_samples_per_second": 2.554, + "eval_steps_per_second": 0.319, + "step": 129 + }, + { + "epoch": 4.0625, + "grad_norm": 0.31961631715145106, + "learning_rate": 2e-05, + "loss": 0.7993, + "step": 130 + }, + { + "epoch": 4.0625, + "eval_loss": 0.6995271444320679, + "eval_runtime": 78.2172, + "eval_samples_per_second": 2.557, + "eval_steps_per_second": 0.32, + "step": 130 + }, + { + "epoch": 4.09375, + "grad_norm": 0.32333364662215863, + "learning_rate": 2e-05, + "loss": 0.7896, + "step": 131 + }, + { + "epoch": 4.09375, + "eval_loss": 0.6992727518081665, + "eval_runtime": 79.0125, + "eval_samples_per_second": 2.531, + "eval_steps_per_second": 0.316, + "step": 131 + }, + { + "epoch": 4.125, + "grad_norm": 0.3255859640449829, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 132 + }, + { + "epoch": 4.125, + "eval_loss": 0.6988572478294373, + "eval_runtime": 79.0, + "eval_samples_per_second": 2.532, + "eval_steps_per_second": 0.316, + "step": 132 + }, + { + "epoch": 4.15625, + "grad_norm": 0.3307068947429175, + "learning_rate": 2e-05, + "loss": 0.8416, + "step": 133 + }, + { + "epoch": 4.15625, + "eval_loss": 0.6981343030929565, + "eval_runtime": 78.3309, + "eval_samples_per_second": 2.553, + "eval_steps_per_second": 0.319, + "step": 133 + }, + { + "epoch": 4.1875, + "grad_norm": 0.3842303818116732, + "learning_rate": 2e-05, + "loss": 0.7605, + "step": 134 + }, + { + "epoch": 4.1875, + "eval_loss": 0.6968980431556702, + "eval_runtime": 78.5608, + "eval_samples_per_second": 2.546, + "eval_steps_per_second": 0.318, + "step": 134 + }, + { + "epoch": 4.21875, + "grad_norm": 0.331839472419003, + "learning_rate": 2e-05, + "loss": 0.7643, + "step": 135 + }, + { + "epoch": 4.21875, + "eval_loss": 0.6955949664115906, + "eval_runtime": 78.3566, + "eval_samples_per_second": 2.552, + "eval_steps_per_second": 0.319, + "step": 135 + }, + { + "epoch": 4.25, + "grad_norm": 0.31864813130499836, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 136 + }, + { + "epoch": 4.25, 
+ "eval_loss": 0.6951528787612915, + "eval_runtime": 79.7802, + "eval_samples_per_second": 2.507, + "eval_steps_per_second": 0.313, + "step": 136 + }, + { + "epoch": 4.28125, + "grad_norm": 0.352549164434451, + "learning_rate": 2e-05, + "loss": 0.7332, + "step": 137 + }, + { + "epoch": 4.28125, + "eval_loss": 0.6947290897369385, + "eval_runtime": 79.8171, + "eval_samples_per_second": 2.506, + "eval_steps_per_second": 0.313, + "step": 137 + }, + { + "epoch": 4.3125, + "grad_norm": 0.37128812818896284, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 138 + }, + { + "epoch": 4.3125, + "eval_loss": 0.6937370300292969, + "eval_runtime": 79.7782, + "eval_samples_per_second": 2.507, + "eval_steps_per_second": 0.313, + "step": 138 + }, + { + "epoch": 4.34375, + "grad_norm": 0.3348014941412048, + "learning_rate": 2e-05, + "loss": 0.7079, + "step": 139 + }, + { + "epoch": 4.34375, + "eval_loss": 0.692456066608429, + "eval_runtime": 79.9308, + "eval_samples_per_second": 2.502, + "eval_steps_per_second": 0.313, + "step": 139 + }, + { + "epoch": 4.375, + "grad_norm": 0.34411051658527964, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 140 + }, + { + "epoch": 4.375, + "eval_loss": 0.6915809512138367, + "eval_runtime": 79.943, + "eval_samples_per_second": 2.502, + "eval_steps_per_second": 0.313, + "step": 140 + }, + { + "epoch": 4.40625, + "grad_norm": 0.3373909601921749, + "learning_rate": 2e-05, + "loss": 0.7648, + "step": 141 + }, + { + "epoch": 4.40625, + "eval_loss": 0.6912103295326233, + "eval_runtime": 79.8515, + "eval_samples_per_second": 2.505, + "eval_steps_per_second": 0.313, + "step": 141 + }, + { + "epoch": 4.4375, + "grad_norm": 0.33253827371305456, + "learning_rate": 2e-05, + "loss": 0.7224, + "step": 142 + }, + { + "epoch": 4.4375, + "eval_loss": 0.6912806630134583, + "eval_runtime": 80.6475, + "eval_samples_per_second": 2.48, + "eval_steps_per_second": 0.31, + "step": 142 + }, + { + "epoch": 4.46875, + "grad_norm": 0.38458075172588313, + 
"learning_rate": 2e-05, + "loss": 0.7261, + "step": 143 + }, + { + "epoch": 4.46875, + "eval_loss": 0.6905419230461121, + "eval_runtime": 80.2606, + "eval_samples_per_second": 2.492, + "eval_steps_per_second": 0.311, + "step": 143 + }, + { + "epoch": 4.5, + "grad_norm": 0.31351962640463144, + "learning_rate": 2e-05, + "loss": 0.6909, + "step": 144 + }, + { + "epoch": 4.5, + "eval_loss": 0.6898491382598877, + "eval_runtime": 79.9965, + "eval_samples_per_second": 2.5, + "eval_steps_per_second": 0.313, + "step": 144 + }, + { + "epoch": 4.53125, + "grad_norm": 0.35474372115704583, + "learning_rate": 2e-05, + "loss": 0.7605, + "step": 145 + }, + { + "epoch": 4.53125, + "eval_loss": 0.6893147230148315, + "eval_runtime": 1475.5758, + "eval_samples_per_second": 0.136, + "eval_steps_per_second": 0.017, + "step": 145 + }, + { + "epoch": 4.5625, + "grad_norm": 0.3479568917421202, + "learning_rate": 2e-05, + "loss": 0.6638, + "step": 146 + }, + { + "epoch": 4.5625, + "eval_loss": 0.6884538531303406, + "eval_runtime": 84.6835, + "eval_samples_per_second": 2.362, + "eval_steps_per_second": 0.295, + "step": 146 + }, + { + "epoch": 4.59375, + "grad_norm": 0.3421823344428645, + "learning_rate": 2e-05, + "loss": 0.7339, + "step": 147 + }, + { + "epoch": 4.59375, + "eval_loss": 0.6873475909233093, + "eval_runtime": 83.3138, + "eval_samples_per_second": 2.401, + "eval_steps_per_second": 0.3, + "step": 147 + }, + { + "epoch": 4.625, + "grad_norm": 0.3642187020830788, + "learning_rate": 2e-05, + "loss": 0.6825, + "step": 148 + }, + { + "epoch": 4.625, + "eval_loss": 0.6858401298522949, + "eval_runtime": 82.1066, + "eval_samples_per_second": 2.436, + "eval_steps_per_second": 0.304, + "step": 148 + }, + { + "epoch": 4.65625, + "grad_norm": 0.35097547901391785, + "learning_rate": 2e-05, + "loss": 0.7986, + "step": 149 + }, + { + "epoch": 4.65625, + "eval_loss": 0.6848779320716858, + "eval_runtime": 84.4076, + "eval_samples_per_second": 2.369, + "eval_steps_per_second": 0.296, + "step": 149 
+ }, + { + "epoch": 4.6875, + "grad_norm": 0.3568694843794629, + "learning_rate": 2e-05, + "loss": 0.7176, + "step": 150 + }, + { + "epoch": 4.6875, + "eval_loss": 0.6842290759086609, + "eval_runtime": 82.5945, + "eval_samples_per_second": 2.421, + "eval_steps_per_second": 0.303, + "step": 150 + }, + { + "epoch": 4.71875, + "grad_norm": 0.34258633585260334, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 151 + }, + { + "epoch": 4.71875, + "eval_loss": 0.6838659048080444, + "eval_runtime": 85.9626, + "eval_samples_per_second": 2.327, + "eval_steps_per_second": 0.291, + "step": 151 + }, + { + "epoch": 4.75, + "grad_norm": 0.42319523894659655, + "learning_rate": 2e-05, + "loss": 0.7675, + "step": 152 + }, + { + "epoch": 4.75, + "eval_loss": 0.6830299496650696, + "eval_runtime": 85.7189, + "eval_samples_per_second": 2.333, + "eval_steps_per_second": 0.292, + "step": 152 + }, + { + "epoch": 4.78125, + "grad_norm": 0.3632195533127194, + "learning_rate": 2e-05, + "loss": 0.715, + "step": 153 + }, + { + "epoch": 4.78125, + "eval_loss": 0.6826379895210266, + "eval_runtime": 87.8244, + "eval_samples_per_second": 2.277, + "eval_steps_per_second": 0.285, + "step": 153 + }, + { + "epoch": 4.8125, + "grad_norm": 0.3738308004604413, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 154 + }, + { + "epoch": 4.8125, + "eval_loss": 0.6826817393302917, + "eval_runtime": 86.5822, + "eval_samples_per_second": 2.31, + "eval_steps_per_second": 0.289, + "step": 154 + }, + { + "epoch": 4.84375, + "grad_norm": 0.3618696330632776, + "learning_rate": 2e-05, + "loss": 0.6632, + "step": 155 + }, + { + "epoch": 4.84375, + "eval_loss": 0.6827967166900635, + "eval_runtime": 82.1829, + "eval_samples_per_second": 2.434, + "eval_steps_per_second": 0.304, + "step": 155 + }, + { + "epoch": 4.875, + "grad_norm": 0.38901912569992203, + "learning_rate": 2e-05, + "loss": 0.7788, + "step": 156 + }, + { + "epoch": 4.875, + "eval_loss": 0.6821711659431458, + "eval_runtime": 84.4511, + 
"eval_samples_per_second": 2.368, + "eval_steps_per_second": 0.296, + "step": 156 + }, + { + "epoch": 4.90625, + "grad_norm": 0.3516096507348829, + "learning_rate": 2e-05, + "loss": 0.7794, + "step": 157 + }, + { + "epoch": 4.90625, + "eval_loss": 0.6819837689399719, + "eval_runtime": 84.1594, + "eval_samples_per_second": 2.376, + "eval_steps_per_second": 0.297, + "step": 157 + }, + { + "epoch": 4.9375, + "grad_norm": 0.36066902463794986, + "learning_rate": 2e-05, + "loss": 0.7674, + "step": 158 + }, + { + "epoch": 4.9375, + "eval_loss": 0.6817716956138611, + "eval_runtime": 83.8929, + "eval_samples_per_second": 2.384, + "eval_steps_per_second": 0.298, + "step": 158 + }, + { + "epoch": 4.96875, + "grad_norm": 0.36641784926154175, + "learning_rate": 2e-05, + "loss": 0.7116, + "step": 159 + }, + { + "epoch": 4.96875, + "eval_loss": 0.6816902160644531, + "eval_runtime": 84.4431, + "eval_samples_per_second": 2.368, + "eval_steps_per_second": 0.296, + "step": 159 + }, + { + "epoch": 5.0, + "grad_norm": 0.4020716293225933, + "learning_rate": 2e-05, + "loss": 0.7142, + "step": 160 + }, + { + "epoch": 5.0, + "eval_loss": 0.6811469793319702, + "eval_runtime": 86.0681, + "eval_samples_per_second": 2.324, + "eval_steps_per_second": 0.29, + "step": 160 + }, + { + "epoch": 5.03125, + "grad_norm": 0.38360882669254054, + "learning_rate": 2e-05, + "loss": 0.6756, + "step": 161 + }, + { + "epoch": 5.03125, + "eval_loss": 0.6798409223556519, + "eval_runtime": 81.9903, + "eval_samples_per_second": 2.439, + "eval_steps_per_second": 0.305, + "step": 161 + }, + { + "epoch": 5.0625, + "grad_norm": 0.34966156213066135, + "learning_rate": 2e-05, + "loss": 0.827, + "step": 162 + }, + { + "epoch": 5.0625, + "eval_loss": 0.6788859367370605, + "eval_runtime": 76.1753, + "eval_samples_per_second": 2.626, + "eval_steps_per_second": 0.328, + "step": 162 + }, + { + "epoch": 5.09375, + "grad_norm": 0.41140842939901384, + "learning_rate": 2e-05, + "loss": 0.6409, + "step": 163 + }, + { + "epoch": 
5.09375, + "eval_loss": 0.6787077188491821, + "eval_runtime": 76.2239, + "eval_samples_per_second": 2.624, + "eval_steps_per_second": 0.328, + "step": 163 + }, + { + "epoch": 5.125, + "grad_norm": 0.4222084070163774, + "learning_rate": 2e-05, + "loss": 0.7774, + "step": 164 + }, + { + "epoch": 5.125, + "eval_loss": 0.6796822547912598, + "eval_runtime": 76.2141, + "eval_samples_per_second": 2.624, + "eval_steps_per_second": 0.328, + "step": 164 + }, + { + "epoch": 5.15625, + "grad_norm": 0.4644454724424921, + "learning_rate": 2e-05, + "loss": 0.6057, + "step": 165 + }, + { + "epoch": 5.15625, + "eval_loss": 0.6794346570968628, + "eval_runtime": 76.3216, + "eval_samples_per_second": 2.62, + "eval_steps_per_second": 0.328, + "step": 165 + }, + { + "epoch": 5.1875, + "grad_norm": 0.46128725263272996, + "learning_rate": 2e-05, + "loss": 0.7158, + "step": 166 + }, + { + "epoch": 5.1875, + "eval_loss": 0.6791612505912781, + "eval_runtime": 78.4909, + "eval_samples_per_second": 2.548, + "eval_steps_per_second": 0.319, + "step": 166 + }, + { + "epoch": 5.21875, + "grad_norm": 0.37300666872025545, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 167 + }, + { + "epoch": 5.21875, + "eval_loss": 0.6788016557693481, + "eval_runtime": 78.5697, + "eval_samples_per_second": 2.546, + "eval_steps_per_second": 0.318, + "step": 167 + }, + { + "epoch": 5.25, + "grad_norm": 0.41454648576180214, + "learning_rate": 2e-05, + "loss": 0.7759, + "step": 168 + }, + { + "epoch": 5.25, + "eval_loss": 0.6787048578262329, + "eval_runtime": 78.5317, + "eval_samples_per_second": 2.547, + "eval_steps_per_second": 0.318, + "step": 168 + }, + { + "epoch": 5.28125, + "grad_norm": 0.40724665091386236, + "learning_rate": 2e-05, + "loss": 0.6944, + "step": 169 + }, + { + "epoch": 5.28125, + "eval_loss": 0.679679811000824, + "eval_runtime": 78.6899, + "eval_samples_per_second": 2.542, + "eval_steps_per_second": 0.318, + "step": 169 + }, + { + "epoch": 5.3125, + "grad_norm": 0.3875110486208986, + 
"learning_rate": 2e-05, + "loss": 0.6634, + "step": 170 + }, + { + "epoch": 5.3125, + "eval_loss": 0.6819935441017151, + "eval_runtime": 78.3617, + "eval_samples_per_second": 2.552, + "eval_steps_per_second": 0.319, + "step": 170 + }, + { + "epoch": 5.34375, + "grad_norm": 0.47956532155617193, + "learning_rate": 2e-05, + "loss": 0.687, + "step": 171 + }, + { + "epoch": 5.34375, + "eval_loss": 0.6825206875801086, + "eval_runtime": 78.4435, + "eval_samples_per_second": 2.55, + "eval_steps_per_second": 0.319, + "step": 171 + }, + { + "epoch": 5.375, + "grad_norm": 0.4599359590587781, + "learning_rate": 2e-05, + "loss": 0.7718, + "step": 172 + }, + { + "epoch": 5.375, + "eval_loss": 0.6816768050193787, + "eval_runtime": 78.3005, + "eval_samples_per_second": 2.554, + "eval_steps_per_second": 0.319, + "step": 172 + }, + { + "epoch": 5.40625, + "grad_norm": 0.4057490487995386, + "learning_rate": 2e-05, + "loss": 0.7292, + "step": 173 + }, + { + "epoch": 5.40625, + "eval_loss": 0.6806090474128723, + "eval_runtime": 78.3313, + "eval_samples_per_second": 2.553, + "eval_steps_per_second": 0.319, + "step": 173 + }, + { + "epoch": 5.4375, + "grad_norm": 0.4143979315360467, + "learning_rate": 2e-05, + "loss": 0.7697, + "step": 174 + }, + { + "epoch": 5.4375, + "eval_loss": 0.6795693039894104, + "eval_runtime": 78.4526, + "eval_samples_per_second": 2.549, + "eval_steps_per_second": 0.319, + "step": 174 + }, + { + "epoch": 5.46875, + "grad_norm": 0.4219663662343445, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 175 + }, + { + "epoch": 5.46875, + "eval_loss": 0.6793847680091858, + "eval_runtime": 78.8009, + "eval_samples_per_second": 2.538, + "eval_steps_per_second": 0.317, + "step": 175 + }, + { + "epoch": 5.5, + "grad_norm": 0.4491811321927657, + "learning_rate": 2e-05, + "loss": 0.7004, + "step": 176 + }, + { + "epoch": 5.5, + "eval_loss": 0.6775352358818054, + "eval_runtime": 80.0685, + "eval_samples_per_second": 2.498, + "eval_steps_per_second": 0.312, + "step": 176 + 
}, + { + "epoch": 5.53125, + "grad_norm": 0.46366516532638885, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 177 + }, + { + "epoch": 5.53125, + "eval_loss": 0.6748698949813843, + "eval_runtime": 80.0487, + "eval_samples_per_second": 2.498, + "eval_steps_per_second": 0.312, + "step": 177 + }, + { + "epoch": 5.5625, + "grad_norm": 0.3815188640227797, + "learning_rate": 2e-05, + "loss": 0.7592, + "step": 178 + }, + { + "epoch": 5.5625, + "eval_loss": 0.6728273034095764, + "eval_runtime": 80.0318, + "eval_samples_per_second": 2.499, + "eval_steps_per_second": 0.312, + "step": 178 + }, + { + "epoch": 5.59375, + "grad_norm": 0.41025429416666304, + "learning_rate": 2e-05, + "loss": 0.6585, + "step": 179 + }, + { + "epoch": 5.59375, + "eval_loss": 0.6718859672546387, + "eval_runtime": 79.8801, + "eval_samples_per_second": 2.504, + "eval_steps_per_second": 0.313, + "step": 179 + }, + { + "epoch": 5.625, + "grad_norm": 0.40652817592240054, + "learning_rate": 2e-05, + "loss": 0.6611, + "step": 180 + }, + { + "epoch": 5.625, + "eval_loss": 0.6715708374977112, + "eval_runtime": 76.7261, + "eval_samples_per_second": 2.607, + "eval_steps_per_second": 0.326, + "step": 180 + }, + { + "epoch": 5.65625, + "grad_norm": 0.40753961326688415, + "learning_rate": 2e-05, + "loss": 0.6779, + "step": 181 + }, + { + "epoch": 5.65625, + "eval_loss": 0.6719761490821838, + "eval_runtime": 77.0136, + "eval_samples_per_second": 2.597, + "eval_steps_per_second": 0.325, + "step": 181 + }, + { + "epoch": 5.6875, + "grad_norm": 0.4232811980671673, + "learning_rate": 2e-05, + "loss": 0.6475, + "step": 182 + }, + { + "epoch": 5.6875, + "eval_loss": 0.6724664568901062, + "eval_runtime": 76.9731, + "eval_samples_per_second": 2.598, + "eval_steps_per_second": 0.325, + "step": 182 + }, + { + "epoch": 5.71875, + "grad_norm": 0.5132756318549849, + "learning_rate": 2e-05, + "loss": 0.6801, + "step": 183 + }, + { + "epoch": 5.71875, + "eval_loss": 0.6723365783691406, + "eval_runtime": 76.4132, + 
"eval_samples_per_second": 2.617, + "eval_steps_per_second": 0.327, + "step": 183 + }, + { + "epoch": 5.75, + "grad_norm": 0.43526879230161264, + "learning_rate": 2e-05, + "loss": 0.6673, + "step": 184 + }, + { + "epoch": 5.75, + "eval_loss": 0.672926664352417, + "eval_runtime": 76.1936, + "eval_samples_per_second": 2.625, + "eval_steps_per_second": 0.328, + "step": 184 + }, + { + "epoch": 5.78125, + "grad_norm": 0.46965560853038507, + "learning_rate": 2e-05, + "loss": 0.7074, + "step": 185 + }, + { + "epoch": 5.78125, + "eval_loss": 0.6731134057044983, + "eval_runtime": 76.2345, + "eval_samples_per_second": 2.623, + "eval_steps_per_second": 0.328, + "step": 185 + }, + { + "epoch": 5.8125, + "grad_norm": 0.4733296318676217, + "learning_rate": 2e-05, + "loss": 0.6791, + "step": 186 + }, + { + "epoch": 5.8125, + "eval_loss": 0.6726363301277161, + "eval_runtime": 78.3939, + "eval_samples_per_second": 2.551, + "eval_steps_per_second": 0.319, + "step": 186 + }, + { + "epoch": 5.84375, + "grad_norm": 0.4662943253655961, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 187 + }, + { + "epoch": 5.84375, + "eval_loss": 0.6726526021957397, + "eval_runtime": 79.1834, + "eval_samples_per_second": 2.526, + "eval_steps_per_second": 0.316, + "step": 187 + }, + { + "epoch": 5.875, + "grad_norm": 0.4420962889993382, + "learning_rate": 2e-05, + "loss": 0.675, + "step": 188 + }, + { + "epoch": 5.875, + "eval_loss": 0.6727125644683838, + "eval_runtime": 78.252, + "eval_samples_per_second": 2.556, + "eval_steps_per_second": 0.319, + "step": 188 + }, + { + "epoch": 5.90625, + "grad_norm": 0.4345166976944551, + "learning_rate": 2e-05, + "loss": 0.6748, + "step": 189 + }, + { + "epoch": 5.90625, + "eval_loss": 0.6725904941558838, + "eval_runtime": 78.3914, + "eval_samples_per_second": 2.551, + "eval_steps_per_second": 0.319, + "step": 189 + }, + { + "epoch": 5.9375, + "grad_norm": 0.45109463315374526, + "learning_rate": 2e-05, + "loss": 0.7024, + "step": 190 + }, + { + "epoch": 
5.9375, + "eval_loss": 0.6718384027481079, + "eval_runtime": 78.4361, + "eval_samples_per_second": 2.55, + "eval_steps_per_second": 0.319, + "step": 190 + }, + { + "epoch": 5.96875, + "grad_norm": 0.42953871838795626, + "learning_rate": 2e-05, + "loss": 0.6904, + "step": 191 + }, + { + "epoch": 5.96875, + "eval_loss": 0.6703083515167236, + "eval_runtime": 78.3863, + "eval_samples_per_second": 2.551, + "eval_steps_per_second": 0.319, + "step": 191 + }, + { + "epoch": 6.0, + "grad_norm": 0.4248607379284984, + "learning_rate": 2e-05, + "loss": 0.6659, + "step": 192 + }, + { + "epoch": 6.0, + "eval_loss": 0.6693080067634583, + "eval_runtime": 78.4373, + "eval_samples_per_second": 2.55, + "eval_steps_per_second": 0.319, + "step": 192 + }, + { + "epoch": 6.03125, + "grad_norm": 0.42839417453459494, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 193 + }, + { + "epoch": 6.03125, + "eval_loss": 0.6689594984054565, + "eval_runtime": 78.4169, + "eval_samples_per_second": 2.55, + "eval_steps_per_second": 0.319, + "step": 193 + }, + { + "epoch": 6.0625, + "grad_norm": 0.4216922788166874, + "learning_rate": 2e-05, + "loss": 0.7189, + "step": 194 + }, + { + "epoch": 6.0625, + "eval_loss": 0.6689300537109375, + "eval_runtime": 78.9793, + "eval_samples_per_second": 2.532, + "eval_steps_per_second": 0.317, + "step": 194 + }, + { + "epoch": 6.09375, + "grad_norm": 0.45199575791858004, + "learning_rate": 2e-05, + "loss": 0.6438, + "step": 195 + }, + { + "epoch": 6.09375, + "eval_loss": 0.6690151691436768, + "eval_runtime": 78.5002, + "eval_samples_per_second": 2.548, + "eval_steps_per_second": 0.318, + "step": 195 + }, + { + "epoch": 6.125, + "grad_norm": 0.4166923177293841, + "learning_rate": 2e-05, + "loss": 0.6885, + "step": 196 + }, + { + "epoch": 6.125, + "eval_loss": 0.6688613891601562, + "eval_runtime": 80.5497, + "eval_samples_per_second": 2.483, + "eval_steps_per_second": 0.31, + "step": 196 + }, + { + "epoch": 6.15625, + "grad_norm": 0.45164281863366285, + 
"learning_rate": 2e-05, + "loss": 0.7197, + "step": 197 + }, + { + "epoch": 6.15625, + "eval_loss": 0.6687932014465332, + "eval_runtime": 80.1482, + "eval_samples_per_second": 2.495, + "eval_steps_per_second": 0.312, + "step": 197 + }, + { + "epoch": 6.1875, + "grad_norm": 0.45653924787504446, + "learning_rate": 2e-05, + "loss": 0.776, + "step": 198 + }, + { + "epoch": 6.1875, + "eval_loss": 0.6690963506698608, + "eval_runtime": 80.4464, + "eval_samples_per_second": 2.486, + "eval_steps_per_second": 0.311, + "step": 198 + }, + { + "epoch": 6.21875, + "grad_norm": 0.4966562341334706, + "learning_rate": 2e-05, + "loss": 0.6532, + "step": 199 + }, + { + "epoch": 6.21875, + "eval_loss": 0.669116735458374, + "eval_runtime": 79.8294, + "eval_samples_per_second": 2.505, + "eval_steps_per_second": 0.313, + "step": 199 + }, + { + "epoch": 6.25, + "grad_norm": 0.4838469303220975, + "learning_rate": 2e-05, + "loss": 0.6883, + "step": 200 + }, + { + "epoch": 6.25, + "eval_loss": 0.6693156957626343, + "eval_runtime": 80.25, + "eval_samples_per_second": 2.492, + "eval_steps_per_second": 0.312, + "step": 200 + }, + { + "epoch": 6.28125, + "grad_norm": 0.4836820906895964, + "learning_rate": 2e-05, + "loss": 0.7106, + "step": 201 + }, + { + "epoch": 6.28125, + "eval_loss": 0.6704170107841492, + "eval_runtime": 79.9636, + "eval_samples_per_second": 2.501, + "eval_steps_per_second": 0.313, + "step": 201 + }, + { + "epoch": 6.3125, + "grad_norm": 0.4945855983140219, + "learning_rate": 2e-05, + "loss": 0.6336, + "step": 202 + }, + { + "epoch": 6.3125, + "eval_loss": 0.6708824038505554, + "eval_runtime": 80.8044, + "eval_samples_per_second": 2.475, + "eval_steps_per_second": 0.309, + "step": 202 + }, + { + "epoch": 6.34375, + "grad_norm": 0.44587847230103017, + "learning_rate": 2e-05, + "loss": 0.7811, + "step": 203 + }, + { + "epoch": 6.34375, + "eval_loss": 0.6723968982696533, + "eval_runtime": 80.1715, + "eval_samples_per_second": 2.495, + "eval_steps_per_second": 0.312, + "step": 
203 + }, + { + "epoch": 6.375, + "grad_norm": 0.5351063503195825, + "learning_rate": 2e-05, + "loss": 0.6222, + "step": 204 + }, + { + "epoch": 6.375, + "eval_loss": 0.672196626663208, + "eval_runtime": 79.927, + "eval_samples_per_second": 2.502, + "eval_steps_per_second": 0.313, + "step": 204 + }, + { + "epoch": 6.40625, + "grad_norm": 0.4742985088010474, + "learning_rate": 2e-05, + "loss": 0.6157, + "step": 205 + }, + { + "epoch": 6.40625, + "eval_loss": 0.671062171459198, + "eval_runtime": 80.1997, + "eval_samples_per_second": 2.494, + "eval_steps_per_second": 0.312, + "step": 205 + }, + { + "epoch": 6.4375, + "grad_norm": 0.5188882333349506, + "learning_rate": 2e-05, + "loss": 0.6462, + "step": 206 + }, + { + "epoch": 6.4375, + "eval_loss": 0.6701972484588623, + "eval_runtime": 81.6643, + "eval_samples_per_second": 2.449, + "eval_steps_per_second": 0.306, + "step": 206 + }, + { + "epoch": 6.46875, + "grad_norm": 0.45328063593983603, + "learning_rate": 2e-05, + "loss": 0.7058, + "step": 207 + }, + { + "epoch": 6.46875, + "eval_loss": 0.6699164509773254, + "eval_runtime": 81.2228, + "eval_samples_per_second": 2.462, + "eval_steps_per_second": 0.308, + "step": 207 + }, + { + "epoch": 6.5, + "grad_norm": 0.5197645538332801, + "learning_rate": 2e-05, + "loss": 0.6462, + "step": 208 + }, + { + "epoch": 6.5, + "eval_loss": 0.6702597141265869, + "eval_runtime": 81.1451, + "eval_samples_per_second": 2.465, + "eval_steps_per_second": 0.308, + "step": 208 + }, + { + "epoch": 6.53125, + "grad_norm": 0.5762528184834232, + "learning_rate": 2e-05, + "loss": 0.6259, + "step": 209 + }, + { + "epoch": 6.53125, + "eval_loss": 0.6696366667747498, + "eval_runtime": 81.1643, + "eval_samples_per_second": 2.464, + "eval_steps_per_second": 0.308, + "step": 209 + }, + { + "epoch": 6.5625, + "grad_norm": 0.5249503180293145, + "learning_rate": 2e-05, + "loss": 0.6045, + "step": 210 + }, + { + "epoch": 6.5625, + "eval_loss": 0.6688054800033569, + "eval_runtime": 80.9492, + 
"eval_samples_per_second": 2.471, + "eval_steps_per_second": 0.309, + "step": 210 + }, + { + "epoch": 6.59375, + "grad_norm": 0.543503888655844, + "learning_rate": 2e-05, + "loss": 0.6496, + "step": 211 + }, + { + "epoch": 6.59375, + "eval_loss": 0.6689916849136353, + "eval_runtime": 81.6473, + "eval_samples_per_second": 2.45, + "eval_steps_per_second": 0.306, + "step": 211 + }, + { + "epoch": 6.625, + "grad_norm": 0.48119553592193554, + "learning_rate": 2e-05, + "loss": 0.6211, + "step": 212 + }, + { + "epoch": 6.625, + "eval_loss": 0.6703050136566162, + "eval_runtime": 81.9207, + "eval_samples_per_second": 2.441, + "eval_steps_per_second": 0.305, + "step": 212 + }, + { + "epoch": 6.65625, + "grad_norm": 0.5153356086819314, + "learning_rate": 2e-05, + "loss": 0.7135, + "step": 213 + }, + { + "epoch": 6.65625, + "eval_loss": 0.6702842116355896, + "eval_runtime": 81.1503, + "eval_samples_per_second": 2.465, + "eval_steps_per_second": 0.308, + "step": 213 + }, + { + "epoch": 6.6875, + "grad_norm": 0.5249915042825578, + "learning_rate": 2e-05, + "loss": 0.6635, + "step": 214 + }, + { + "epoch": 6.6875, + "eval_loss": 0.6687333583831787, + "eval_runtime": 81.6743, + "eval_samples_per_second": 2.449, + "eval_steps_per_second": 0.306, + "step": 214 + }, + { + "epoch": 6.71875, + "grad_norm": 0.5204840219868723, + "learning_rate": 2e-05, + "loss": 0.6701, + "step": 215 + }, + { + "epoch": 6.71875, + "eval_loss": 0.6657728552818298, + "eval_runtime": 81.106, + "eval_samples_per_second": 2.466, + "eval_steps_per_second": 0.308, + "step": 215 + }, + { + "epoch": 6.75, + "grad_norm": 0.5266935225120133, + "learning_rate": 2e-05, + "loss": 0.6637, + "step": 216 + }, + { + "epoch": 6.75, + "eval_loss": 0.6641908884048462, + "eval_runtime": 82.2613, + "eval_samples_per_second": 2.431, + "eval_steps_per_second": 0.304, + "step": 216 + }, + { + "epoch": 6.78125, + "grad_norm": 0.5438859451742696, + "learning_rate": 2e-05, + "loss": 0.6168, + "step": 217 + }, + { + "epoch": 
6.78125, + "eval_loss": 0.6652233600616455, + "eval_runtime": 82.042, + "eval_samples_per_second": 2.438, + "eval_steps_per_second": 0.305, + "step": 217 + }, + { + "epoch": 6.8125, + "grad_norm": 0.5716385253433929, + "learning_rate": 2e-05, + "loss": 0.6062, + "step": 218 + }, + { + "epoch": 6.8125, + "eval_loss": 0.6656240820884705, + "eval_runtime": 81.233, + "eval_samples_per_second": 2.462, + "eval_steps_per_second": 0.308, + "step": 218 + }, + { + "epoch": 6.84375, + "grad_norm": 1.0572787630142522, + "learning_rate": 2e-05, + "loss": 0.7037, + "step": 219 + }, + { + "epoch": 6.84375, + "eval_loss": 0.6645559072494507, + "eval_runtime": 81.2099, + "eval_samples_per_second": 2.463, + "eval_steps_per_second": 0.308, + "step": 219 + }, + { + "epoch": 6.875, + "grad_norm": 0.5924889323251107, + "learning_rate": 2e-05, + "loss": 0.712, + "step": 220 + }, + { + "epoch": 6.875, + "eval_loss": 0.6619111895561218, + "eval_runtime": 81.7826, + "eval_samples_per_second": 2.446, + "eval_steps_per_second": 0.306, + "step": 220 + }, + { + "epoch": 6.90625, + "grad_norm": 0.5290576915218269, + "learning_rate": 2e-05, + "loss": 0.6659, + "step": 221 + }, + { + "epoch": 6.90625, + "eval_loss": 0.6609540581703186, + "eval_runtime": 82.9922, + "eval_samples_per_second": 2.41, + "eval_steps_per_second": 0.301, + "step": 221 + }, + { + "epoch": 6.9375, + "grad_norm": 0.5831209517049147, + "learning_rate": 2e-05, + "loss": 0.6547, + "step": 222 + }, + { + "epoch": 6.9375, + "eval_loss": 0.660676896572113, + "eval_runtime": 83.6541, + "eval_samples_per_second": 2.391, + "eval_steps_per_second": 0.299, + "step": 222 + }, + { + "epoch": 6.96875, + "grad_norm": 0.5320966369511158, + "learning_rate": 2e-05, + "loss": 0.6968, + "step": 223 + }, + { + "epoch": 6.96875, + "eval_loss": 0.6618594527244568, + "eval_runtime": 83.1148, + "eval_samples_per_second": 2.406, + "eval_steps_per_second": 0.301, + "step": 223 + }, + { + "epoch": 7.0, + "grad_norm": 0.5829636446837394, + 
"learning_rate": 2e-05, + "loss": 0.7407, + "step": 224 + }, + { + "epoch": 7.0, + "eval_loss": 0.6635661125183105, + "eval_runtime": 82.8183, + "eval_samples_per_second": 2.415, + "eval_steps_per_second": 0.302, + "step": 224 + }, + { + "epoch": 7.03125, + "grad_norm": 0.4975095056459566, + "learning_rate": 2e-05, + "loss": 0.6535, + "step": 225 + }, + { + "epoch": 7.03125, + "eval_loss": 0.6641671657562256, + "eval_runtime": 83.0267, + "eval_samples_per_second": 2.409, + "eval_steps_per_second": 0.301, + "step": 225 + }, + { + "epoch": 7.0625, + "grad_norm": 0.5625698523064815, + "learning_rate": 2e-05, + "loss": 0.6012, + "step": 226 + }, + { + "epoch": 7.0625, + "eval_loss": 0.6639044880867004, + "eval_runtime": 83.3881, + "eval_samples_per_second": 2.398, + "eval_steps_per_second": 0.3, + "step": 226 + }, + { + "epoch": 7.09375, + "grad_norm": 0.5436196850683295, + "learning_rate": 2e-05, + "loss": 0.6485, + "step": 227 + }, + { + "epoch": 7.09375, + "eval_loss": 0.6651788353919983, + "eval_runtime": 82.7096, + "eval_samples_per_second": 2.418, + "eval_steps_per_second": 0.302, + "step": 227 + }, + { + "epoch": 7.125, + "grad_norm": 0.5598906287609361, + "learning_rate": 2e-05, + "loss": 0.6142, + "step": 228 + }, + { + "epoch": 7.125, + "eval_loss": 0.6688636541366577, + "eval_runtime": 82.601, + "eval_samples_per_second": 2.421, + "eval_steps_per_second": 0.303, + "step": 228 + }, + { + "epoch": 7.15625, + "grad_norm": 0.7572979310697923, + "learning_rate": 2e-05, + "loss": 0.6221, + "step": 229 + }, + { + "epoch": 7.15625, + "eval_loss": 0.6699694991111755, + "eval_runtime": 82.6032, + "eval_samples_per_second": 2.421, + "eval_steps_per_second": 0.303, + "step": 229 + }, + { + "epoch": 7.1875, + "grad_norm": 0.6173309690580897, + "learning_rate": 2e-05, + "loss": 0.5919, + "step": 230 + }, + { + "epoch": 7.1875, + "eval_loss": 0.6706527471542358, + "eval_runtime": 82.9732, + "eval_samples_per_second": 2.41, + "eval_steps_per_second": 0.301, + "step": 230 + 
}, + { + "epoch": 7.21875, + "grad_norm": 0.643241771517866, + "learning_rate": 2e-05, + "loss": 0.7081, + "step": 231 + }, + { + "epoch": 7.21875, + "eval_loss": 0.6700320243835449, + "eval_runtime": 84.5621, + "eval_samples_per_second": 2.365, + "eval_steps_per_second": 0.296, + "step": 231 + }, + { + "epoch": 7.25, + "grad_norm": 0.577638137570571, + "learning_rate": 2e-05, + "loss": 0.6873, + "step": 232 + }, + { + "epoch": 7.25, + "eval_loss": 0.669111430644989, + "eval_runtime": 84.5124, + "eval_samples_per_second": 2.367, + "eval_steps_per_second": 0.296, + "step": 232 + }, + { + "epoch": 7.28125, + "grad_norm": 0.7229488296023369, + "learning_rate": 2e-05, + "loss": 0.6301, + "step": 233 + }, + { + "epoch": 7.28125, + "eval_loss": 0.6664154529571533, + "eval_runtime": 84.6437, + "eval_samples_per_second": 2.363, + "eval_steps_per_second": 0.295, + "step": 233 + }, + { + "epoch": 7.3125, + "grad_norm": 0.5827815449039045, + "learning_rate": 2e-05, + "loss": 0.669, + "step": 234 + }, + { + "epoch": 7.3125, + "eval_loss": 0.6641202569007874, + "eval_runtime": 84.489, + "eval_samples_per_second": 2.367, + "eval_steps_per_second": 0.296, + "step": 234 + }, + { + "epoch": 7.34375, + "grad_norm": 0.57507354017269, + "learning_rate": 2e-05, + "loss": 0.6474, + "step": 235 + }, + { + "epoch": 7.34375, + "eval_loss": 0.6623325347900391, + "eval_runtime": 84.5536, + "eval_samples_per_second": 2.365, + "eval_steps_per_second": 0.296, + "step": 235 + }, + { + "epoch": 7.375, + "grad_norm": 0.5810844862533651, + "learning_rate": 2e-05, + "loss": 0.6048, + "step": 236 + }, + { + "epoch": 7.375, + "eval_loss": 0.6619194746017456, + "eval_runtime": 84.2296, + "eval_samples_per_second": 2.374, + "eval_steps_per_second": 0.297, + "step": 236 + }, + { + "epoch": 7.40625, + "grad_norm": 0.6075032415813726, + "learning_rate": 2e-05, + "loss": 0.6529, + "step": 237 + }, + { + "epoch": 7.40625, + "eval_loss": 0.6626202464103699, + "eval_runtime": 84.9703, + 
"eval_samples_per_second": 2.354, + "eval_steps_per_second": 0.294, + "step": 237 + }, + { + "epoch": 7.4375, + "grad_norm": 0.6402642234375245, + "learning_rate": 2e-05, + "loss": 0.6433, + "step": 238 + }, + { + "epoch": 7.4375, + "eval_loss": 0.663289487361908, + "eval_runtime": 84.8924, + "eval_samples_per_second": 2.356, + "eval_steps_per_second": 0.294, + "step": 238 + }, + { + "epoch": 7.46875, + "grad_norm": 0.6335996982657431, + "learning_rate": 2e-05, + "loss": 0.6815, + "step": 239 + }, + { + "epoch": 7.46875, + "eval_loss": 0.6636109948158264, + "eval_runtime": 85.0551, + "eval_samples_per_second": 2.351, + "eval_steps_per_second": 0.294, + "step": 239 + }, + { + "epoch": 7.5, + "grad_norm": 0.5796846795848909, + "learning_rate": 2e-05, + "loss": 0.6236, + "step": 240 + }, + { + "epoch": 7.5, + "eval_loss": 0.6652829051017761, + "eval_runtime": 84.7574, + "eval_samples_per_second": 2.36, + "eval_steps_per_second": 0.295, + "step": 240 + }, + { + "epoch": 7.53125, + "grad_norm": 0.5380402145760035, + "learning_rate": 2e-05, + "loss": 0.6564, + "step": 241 + }, + { + "epoch": 7.53125, + "eval_loss": 0.6676375865936279, + "eval_runtime": 86.2058, + "eval_samples_per_second": 2.32, + "eval_steps_per_second": 0.29, + "step": 241 + }, + { + "epoch": 7.5625, + "grad_norm": 0.5964298255824012, + "learning_rate": 2e-05, + "loss": 0.6475, + "step": 242 + }, + { + "epoch": 7.5625, + "eval_loss": 0.6698520183563232, + "eval_runtime": 85.8955, + "eval_samples_per_second": 2.328, + "eval_steps_per_second": 0.291, + "step": 242 + }, + { + "epoch": 7.59375, + "grad_norm": 0.561279296875, + "learning_rate": 2e-05, + "loss": 0.6395, + "step": 243 + }, + { + "epoch": 7.59375, + "eval_loss": 0.6705803871154785, + "eval_runtime": 86.0036, + "eval_samples_per_second": 2.325, + "eval_steps_per_second": 0.291, + "step": 243 + }, + { + "epoch": 7.625, + "grad_norm": 0.6757292755073548, + "learning_rate": 2e-05, + "loss": 0.7074, + "step": 244 + }, + { + "epoch": 7.625, + 
"eval_loss": 0.6679538488388062, + "eval_runtime": 85.5379, + "eval_samples_per_second": 2.338, + "eval_steps_per_second": 0.292, + "step": 244 + }, + { + "epoch": 7.65625, + "grad_norm": 0.659077163070129, + "learning_rate": 2e-05, + "loss": 0.6078, + "step": 245 + }, + { + "epoch": 7.65625, + "eval_loss": 0.6667564511299133, + "eval_runtime": 85.752, + "eval_samples_per_second": 2.332, + "eval_steps_per_second": 0.292, + "step": 245 + }, + { + "epoch": 7.6875, + "grad_norm": 0.6215405566454576, + "learning_rate": 2e-05, + "loss": 0.6603, + "step": 246 + }, + { + "epoch": 7.6875, + "eval_loss": 0.665945291519165, + "eval_runtime": 92.3086, + "eval_samples_per_second": 2.167, + "eval_steps_per_second": 0.271, + "step": 246 + }, + { + "epoch": 7.71875, + "grad_norm": 0.6130534921490498, + "learning_rate": 2e-05, + "loss": 0.6435, + "step": 247 + }, + { + "epoch": 7.71875, + "eval_loss": 0.6661685109138489, + "eval_runtime": 87.1917, + "eval_samples_per_second": 2.294, + "eval_steps_per_second": 0.287, + "step": 247 + }, + { + "epoch": 7.75, + "grad_norm": 0.6025415602868736, + "learning_rate": 2e-05, + "loss": 0.6308, + "step": 248 + }, + { + "epoch": 7.75, + "eval_loss": 0.6658704280853271, + "eval_runtime": 86.8233, + "eval_samples_per_second": 2.304, + "eval_steps_per_second": 0.288, + "step": 248 + }, + { + "epoch": 7.78125, + "grad_norm": 0.6901593792019413, + "learning_rate": 2e-05, + "loss": 0.6777, + "step": 249 + }, + { + "epoch": 7.78125, + "eval_loss": 0.6652414202690125, + "eval_runtime": 86.7625, + "eval_samples_per_second": 2.305, + "eval_steps_per_second": 0.288, + "step": 249 + }, + { + "epoch": 7.8125, + "grad_norm": 0.6436454697341579, + "learning_rate": 2e-05, + "loss": 0.6912, + "step": 250 + }, + { + "epoch": 7.8125, + "eval_loss": 0.6654212474822998, + "eval_runtime": 86.871, + "eval_samples_per_second": 2.302, + "eval_steps_per_second": 0.288, + "step": 250 + }, + { + "epoch": 7.84375, + "grad_norm": 0.649040103024529, + "learning_rate": 
2e-05, + "loss": 0.6025, + "step": 251 + }, + { + "epoch": 7.84375, + "eval_loss": 0.6654068231582642, + "eval_runtime": 86.7458, + "eval_samples_per_second": 2.306, + "eval_steps_per_second": 0.288, + "step": 251 + }, + { + "epoch": 7.875, + "grad_norm": 0.6595522131680224, + "learning_rate": 2e-05, + "loss": 0.5973, + "step": 252 + }, + { + "epoch": 7.875, + "eval_loss": 0.6644830107688904, + "eval_runtime": 86.8739, + "eval_samples_per_second": 2.302, + "eval_steps_per_second": 0.288, + "step": 252 + }, + { + "epoch": 7.90625, + "grad_norm": 0.6689891717273936, + "learning_rate": 2e-05, + "loss": 0.687, + "step": 253 + }, + { + "epoch": 7.90625, + "eval_loss": 0.6616199612617493, + "eval_runtime": 86.8222, + "eval_samples_per_second": 2.304, + "eval_steps_per_second": 0.288, + "step": 253 + }, + { + "epoch": 7.9375, + "grad_norm": 0.6306846778314292, + "learning_rate": 2e-05, + "loss": 0.6599, + "step": 254 + }, + { + "epoch": 7.9375, + "eval_loss": 0.6592965126037598, + "eval_runtime": 86.8577, + "eval_samples_per_second": 2.303, + "eval_steps_per_second": 0.288, + "step": 254 + }, + { + "epoch": 7.96875, + "grad_norm": 0.6021327993890785, + "learning_rate": 2e-05, + "loss": 0.575, + "step": 255 + }, + { + "epoch": 7.96875, + "eval_loss": 0.6580593585968018, + "eval_runtime": 86.7582, + "eval_samples_per_second": 2.305, + "eval_steps_per_second": 0.288, + "step": 255 + }, + { + "epoch": 8.0, + "grad_norm": 0.6174712675568311, + "learning_rate": 2e-05, + "loss": 0.6341, + "step": 256 + }, + { + "epoch": 8.0, + "eval_loss": 0.6575854420661926, + "eval_runtime": 76.7634, + "eval_samples_per_second": 2.605, + "eval_steps_per_second": 0.326, + "step": 256 + }, + { + "epoch": 8.03125, + "grad_norm": 0.6551281786490154, + "learning_rate": 2e-05, + "loss": 0.6032, + "step": 257 + }, + { + "epoch": 8.03125, + "eval_loss": 0.6583926677703857, + "eval_runtime": 83.4222, + "eval_samples_per_second": 2.397, + "eval_steps_per_second": 0.3, + "step": 257 + }, + { + "epoch": 
8.0625, + "grad_norm": 0.6033798361300539, + "learning_rate": 2e-05, + "loss": 0.6352, + "step": 258 + }, + { + "epoch": 8.0625, + "eval_loss": 0.6615632772445679, + "eval_runtime": 76.7227, + "eval_samples_per_second": 2.607, + "eval_steps_per_second": 0.326, + "step": 258 + }, + { + "epoch": 8.09375, + "grad_norm": 0.557538857110867, + "learning_rate": 2e-05, + "loss": 0.6472, + "step": 259 + }, + { + "epoch": 8.09375, + "eval_loss": 0.6674608588218689, + "eval_runtime": 76.6215, + "eval_samples_per_second": 2.61, + "eval_steps_per_second": 0.326, + "step": 259 + }, + { + "epoch": 8.125, + "grad_norm": 0.7828450894757938, + "learning_rate": 2e-05, + "loss": 0.6576, + "step": 260 + }, + { + "epoch": 8.125, + "eval_loss": 0.670245349407196, + "eval_runtime": 76.685, + "eval_samples_per_second": 2.608, + "eval_steps_per_second": 0.326, + "step": 260 + }, + { + "epoch": 8.15625, + "grad_norm": 0.7969830757603331, + "learning_rate": 2e-05, + "loss": 0.5809, + "step": 261 + }, + { + "epoch": 8.15625, + "eval_loss": 0.6711975336074829, + "eval_runtime": 78.0022, + "eval_samples_per_second": 2.564, + "eval_steps_per_second": 0.321, + "step": 261 + }, + { + "epoch": 8.1875, + "grad_norm": 0.6431174985709492, + "learning_rate": 2e-05, + "loss": 0.6971, + "step": 262 + }, + { + "epoch": 8.1875, + "eval_loss": 0.6719404458999634, + "eval_runtime": 78.7599, + "eval_samples_per_second": 2.539, + "eval_steps_per_second": 0.317, + "step": 262 + }, + { + "epoch": 8.21875, + "grad_norm": 0.7025583314944188, + "learning_rate": 2e-05, + "loss": 0.5751, + "step": 263 + }, + { + "epoch": 8.21875, + "eval_loss": 0.6719526648521423, + "eval_runtime": 78.0188, + "eval_samples_per_second": 2.563, + "eval_steps_per_second": 0.32, + "step": 263 + }, + { + "epoch": 8.25, + "grad_norm": 0.7114355417811269, + "learning_rate": 2e-05, + "loss": 0.623, + "step": 264 + }, + { + "epoch": 8.25, + "eval_loss": 0.6717848181724548, + "eval_runtime": 78.6366, + "eval_samples_per_second": 2.543, + 
"eval_steps_per_second": 0.318, + "step": 264 + }, + { + "epoch": 8.28125, + "grad_norm": 0.8272269435769467, + "learning_rate": 2e-05, + "loss": 0.6509, + "step": 265 + }, + { + "epoch": 8.28125, + "eval_loss": 0.6701865196228027, + "eval_runtime": 78.7279, + "eval_samples_per_second": 2.54, + "eval_steps_per_second": 0.318, + "step": 265 + }, + { + "epoch": 8.3125, + "grad_norm": 0.7215994453471393, + "learning_rate": 2e-05, + "loss": 0.6263, + "step": 266 + }, + { + "epoch": 8.3125, + "eval_loss": 0.6682087182998657, + "eval_runtime": 78.1433, + "eval_samples_per_second": 2.559, + "eval_steps_per_second": 0.32, + "step": 266 + }, + { + "epoch": 8.34375, + "grad_norm": 0.6425448006102333, + "learning_rate": 2e-05, + "loss": 0.5613, + "step": 267 + }, + { + "epoch": 8.34375, + "eval_loss": 0.6686681509017944, + "eval_runtime": 78.0964, + "eval_samples_per_second": 2.561, + "eval_steps_per_second": 0.32, + "step": 267 + }, + { + "epoch": 8.375, + "grad_norm": 0.7207053166384572, + "learning_rate": 2e-05, + "loss": 0.6239, + "step": 268 + }, + { + "epoch": 8.375, + "eval_loss": 0.6676305532455444, + "eval_runtime": 77.9986, + "eval_samples_per_second": 2.564, + "eval_steps_per_second": 0.321, + "step": 268 + }, + { + "epoch": 8.40625, + "grad_norm": 0.7459344743811905, + "learning_rate": 2e-05, + "loss": 0.6159, + "step": 269 + }, + { + "epoch": 8.40625, + "eval_loss": 0.6660167574882507, + "eval_runtime": 78.4159, + "eval_samples_per_second": 2.551, + "eval_steps_per_second": 0.319, + "step": 269 + }, + { + "epoch": 8.4375, + "grad_norm": 0.7179805119560739, + "learning_rate": 2e-05, + "loss": 0.6192, + "step": 270 + }, + { + "epoch": 8.4375, + "eval_loss": 0.6636325716972351, + "eval_runtime": 78.2224, + "eval_samples_per_second": 2.557, + "eval_steps_per_second": 0.32, + "step": 270 + }, + { + "epoch": 8.46875, + "grad_norm": 0.724792498458059, + "learning_rate": 2e-05, + "loss": 0.5234, + "step": 271 + }, + { + "epoch": 8.46875, + "eval_loss": 
0.6647288799285889, + "eval_runtime": 79.0573, + "eval_samples_per_second": 2.53, + "eval_steps_per_second": 0.316, + "step": 271 + }, + { + "epoch": 8.5, + "grad_norm": 0.6544107138826364, + "learning_rate": 2e-05, + "loss": 0.6067, + "step": 272 + }, + { + "epoch": 8.5, + "eval_loss": 0.6689667701721191, + "eval_runtime": 79.2898, + "eval_samples_per_second": 2.522, + "eval_steps_per_second": 0.315, + "step": 272 + }, + { + "epoch": 8.53125, + "grad_norm": 0.71580236810568, + "learning_rate": 2e-05, + "loss": 0.6215, + "step": 273 + }, + { + "epoch": 8.53125, + "eval_loss": 0.6723271012306213, + "eval_runtime": 79.0759, + "eval_samples_per_second": 2.529, + "eval_steps_per_second": 0.316, + "step": 273 + }, + { + "epoch": 8.5625, + "grad_norm": 0.7741383931390255, + "learning_rate": 2e-05, + "loss": 0.6012, + "step": 274 + }, + { + "epoch": 8.5625, + "eval_loss": 0.6743794083595276, + "eval_runtime": 79.0509, + "eval_samples_per_second": 2.53, + "eval_steps_per_second": 0.316, + "step": 274 + }, + { + "epoch": 8.59375, + "grad_norm": 0.7927343087738151, + "learning_rate": 2e-05, + "loss": 0.6241, + "step": 275 + }, + { + "epoch": 8.59375, + "eval_loss": 0.6728585958480835, + "eval_runtime": 79.2296, + "eval_samples_per_second": 2.524, + "eval_steps_per_second": 0.316, + "step": 275 + }, + { + "epoch": 8.625, + "grad_norm": 0.759468785526614, + "learning_rate": 2e-05, + "loss": 0.6209, + "step": 276 + }, + { + "epoch": 8.625, + "eval_loss": 0.6686221957206726, + "eval_runtime": 76.7494, + "eval_samples_per_second": 2.606, + "eval_steps_per_second": 0.326, + "step": 276 + }, + { + "epoch": 8.65625, + "grad_norm": 0.7345386079388437, + "learning_rate": 2e-05, + "loss": 0.5618, + "step": 277 + }, + { + "epoch": 8.65625, + "eval_loss": 0.6659188270568848, + "eval_runtime": 77.4511, + "eval_samples_per_second": 2.582, + "eval_steps_per_second": 0.323, + "step": 277 + }, + { + "epoch": 8.6875, + "grad_norm": 0.6822491965046279, + "learning_rate": 2e-05, + "loss": 
0.6064, + "step": 278 + }, + { + "epoch": 8.6875, + "eval_loss": 0.664726734161377, + "eval_runtime": 76.7108, + "eval_samples_per_second": 2.607, + "eval_steps_per_second": 0.326, + "step": 278 + }, + { + "epoch": 8.71875, + "grad_norm": 0.7329120674082968, + "learning_rate": 2e-05, + "loss": 0.5843, + "step": 279 + }, + { + "epoch": 8.71875, + "eval_loss": 0.6635715961456299, + "eval_runtime": 76.7921, + "eval_samples_per_second": 2.604, + "eval_steps_per_second": 0.326, + "step": 279 + }, + { + "epoch": 8.75, + "grad_norm": 0.7950781591249908, + "learning_rate": 2e-05, + "loss": 0.6383, + "step": 280 + }, + { + "epoch": 8.75, + "eval_loss": 0.664521336555481, + "eval_runtime": 76.6952, + "eval_samples_per_second": 2.608, + "eval_steps_per_second": 0.326, + "step": 280 + }, + { + "epoch": 8.78125, + "grad_norm": 0.6791182798182671, + "learning_rate": 2e-05, + "loss": 0.5932, + "step": 281 + }, + { + "epoch": 8.78125, + "eval_loss": 0.6673008799552917, + "eval_runtime": 76.794, + "eval_samples_per_second": 2.604, + "eval_steps_per_second": 0.326, + "step": 281 + }, + { + "epoch": 8.8125, + "grad_norm": 0.7633434086832942, + "learning_rate": 2e-05, + "loss": 0.5754, + "step": 282 + }, + { + "epoch": 8.8125, + "eval_loss": 0.6692779064178467, + "eval_runtime": 76.7749, + "eval_samples_per_second": 2.605, + "eval_steps_per_second": 0.326, + "step": 282 + }, + { + "epoch": 8.84375, + "grad_norm": 0.6857090076317197, + "learning_rate": 2e-05, + "loss": 0.5585, + "step": 283 + }, + { + "epoch": 8.84375, + "eval_loss": 0.6702080368995667, + "eval_runtime": 76.6913, + "eval_samples_per_second": 2.608, + "eval_steps_per_second": 0.326, + "step": 283 + }, + { + "epoch": 8.875, + "grad_norm": 0.6961298007385132, + "learning_rate": 2e-05, + "loss": 0.5093, + "step": 284 + }, + { + "epoch": 8.875, + "eval_loss": 0.6708166599273682, + "eval_runtime": 76.7725, + "eval_samples_per_second": 2.605, + "eval_steps_per_second": 0.326, + "step": 284 + }, + { + "epoch": 8.90625, + 
"grad_norm": 0.7783752192295856, + "learning_rate": 2e-05, + "loss": 0.5656, + "step": 285 + }, + { + "epoch": 8.90625, + "eval_loss": 0.6697121262550354, + "eval_runtime": 76.7888, + "eval_samples_per_second": 2.605, + "eval_steps_per_second": 0.326, + "step": 285 + }, + { + "epoch": 8.9375, + "grad_norm": 0.7327581828795048, + "learning_rate": 2e-05, + "loss": 0.6984, + "step": 286 + }, + { + "epoch": 8.9375, + "eval_loss": 0.6684187054634094, + "eval_runtime": 78.6657, + "eval_samples_per_second": 2.542, + "eval_steps_per_second": 0.318, + "step": 286 + }, + { + "epoch": 8.96875, + "grad_norm": 0.689919829790507, + "learning_rate": 2e-05, + "loss": 0.6173, + "step": 287 + }, + { + "epoch": 8.96875, + "eval_loss": 0.6675245761871338, + "eval_runtime": 78.1275, + "eval_samples_per_second": 2.56, + "eval_steps_per_second": 0.32, + "step": 287 + }, + { + "epoch": 9.0, + "grad_norm": 0.6812947879732435, + "learning_rate": 2e-05, + "loss": 0.5499, + "step": 288 + }, + { + "epoch": 9.0, + "eval_loss": 0.6678825616836548, + "eval_runtime": 78.8588, + "eval_samples_per_second": 2.536, + "eval_steps_per_second": 0.317, + "step": 288 + }, + { + "epoch": 9.03125, + "grad_norm": 0.715716761740314, + "learning_rate": 2e-05, + "loss": 0.5699, + "step": 289 + }, + { + "epoch": 9.03125, + "eval_loss": 0.6692755222320557, + "eval_runtime": 83.098, + "eval_samples_per_second": 2.407, + "eval_steps_per_second": 0.301, + "step": 289 + }, + { + "epoch": 9.0625, + "grad_norm": 0.7438930389955494, + "learning_rate": 2e-05, + "loss": 0.5974, + "step": 290 + }, + { + "epoch": 9.0625, + "eval_loss": 0.6735746264457703, + "eval_runtime": 77.384, + "eval_samples_per_second": 2.585, + "eval_steps_per_second": 0.323, + "step": 290 + }, + { + "epoch": 9.09375, + "grad_norm": 0.7271043131369198, + "learning_rate": 2e-05, + "loss": 0.601, + "step": 291 + }, + { + "epoch": 9.09375, + "eval_loss": 0.6790977716445923, + "eval_runtime": 78.0312, + "eval_samples_per_second": 2.563, + 
"eval_steps_per_second": 0.32, + "step": 291 + }, + { + "epoch": 9.125, + "grad_norm": 0.851687675865168, + "learning_rate": 2e-05, + "loss": 0.5681, + "step": 292 + }, + { + "epoch": 9.125, + "eval_loss": 0.6834170818328857, + "eval_runtime": 77.8688, + "eval_samples_per_second": 2.568, + "eval_steps_per_second": 0.321, + "step": 292 + }, + { + "epoch": 9.15625, + "grad_norm": 0.7905287763218567, + "learning_rate": 2e-05, + "loss": 0.6222, + "step": 293 + }, + { + "epoch": 9.15625, + "eval_loss": 0.6843841671943665, + "eval_runtime": 77.985, + "eval_samples_per_second": 2.565, + "eval_steps_per_second": 0.321, + "step": 293 + }, + { + "epoch": 9.1875, + "grad_norm": 0.7301520002532459, + "learning_rate": 2e-05, + "loss": 0.5549, + "step": 294 + }, + { + "epoch": 9.1875, + "eval_loss": 0.6860540509223938, + "eval_runtime": 78.0163, + "eval_samples_per_second": 2.564, + "eval_steps_per_second": 0.32, + "step": 294 + }, + { + "epoch": 9.21875, + "grad_norm": 0.899999206595601, + "learning_rate": 2e-05, + "loss": 0.5128, + "step": 295 + }, + { + "epoch": 9.21875, + "eval_loss": 0.685759425163269, + "eval_runtime": 78.4339, + "eval_samples_per_second": 2.55, + "eval_steps_per_second": 0.319, + "step": 295 + }, + { + "epoch": 9.25, + "grad_norm": 0.8064287475451557, + "learning_rate": 2e-05, + "loss": 0.5261, + "step": 296 + }, + { + "epoch": 9.25, + "eval_loss": 0.6864770650863647, + "eval_runtime": 79.6129, + "eval_samples_per_second": 2.512, + "eval_steps_per_second": 0.314, + "step": 296 + }, + { + "epoch": 9.28125, + "grad_norm": 0.8837240795882767, + "learning_rate": 2e-05, + "loss": 0.621, + "step": 297 + }, + { + "epoch": 9.28125, + "eval_loss": 0.6871599555015564, + "eval_runtime": 78.9778, + "eval_samples_per_second": 2.532, + "eval_steps_per_second": 0.317, + "step": 297 + }, + { + "epoch": 9.3125, + "grad_norm": 0.9676184044078363, + "learning_rate": 2e-05, + "loss": 0.5655, + "step": 298 + }, + { + "epoch": 9.3125, + "eval_loss": 0.6881282329559326, + 
"eval_runtime": 78.9944, + "eval_samples_per_second": 2.532, + "eval_steps_per_second": 0.316, + "step": 298 + }, + { + "epoch": 9.34375, + "grad_norm": 0.8723474213941232, + "learning_rate": 2e-05, + "loss": 0.5449, + "step": 299 + }, + { + "epoch": 9.34375, + "eval_loss": 0.6879245638847351, + "eval_runtime": 79.0056, + "eval_samples_per_second": 2.531, + "eval_steps_per_second": 0.316, + "step": 299 + }, + { + "epoch": 9.375, + "grad_norm": 0.848833488380702, + "learning_rate": 2e-05, + "loss": 0.5683, + "step": 300 + }, + { + "epoch": 9.375, + "eval_loss": 0.6846978664398193, + "eval_runtime": 78.9003, + "eval_samples_per_second": 2.535, + "eval_steps_per_second": 0.317, + "step": 300 + }, + { + "epoch": 9.40625, + "grad_norm": 0.8586391766708288, + "learning_rate": 2e-05, + "loss": 0.5358, + "step": 301 + }, + { + "epoch": 9.40625, + "eval_loss": 0.6798649430274963, + "eval_runtime": 80.0404, + "eval_samples_per_second": 2.499, + "eval_steps_per_second": 0.312, + "step": 301 + }, + { + "epoch": 9.4375, + "grad_norm": 0.8007832596916474, + "learning_rate": 2e-05, + "loss": 0.5792, + "step": 302 + }, + { + "epoch": 9.4375, + "eval_loss": 0.6757382750511169, + "eval_runtime": 79.962, + "eval_samples_per_second": 2.501, + "eval_steps_per_second": 0.313, + "step": 302 + }, + { + "epoch": 9.46875, + "grad_norm": 0.7839805948862919, + "learning_rate": 2e-05, + "loss": 0.5917, + "step": 303 + }, + { + "epoch": 9.46875, + "eval_loss": 0.6754000782966614, + "eval_runtime": 80.738, + "eval_samples_per_second": 2.477, + "eval_steps_per_second": 0.31, + "step": 303 + }, + { + "epoch": 9.5, + "grad_norm": 0.7397772754102683, + "learning_rate": 2e-05, + "loss": 0.6249, + "step": 304 + }, + { + "epoch": 9.5, + "eval_loss": 0.6777495741844177, + "eval_runtime": 80.5144, + "eval_samples_per_second": 2.484, + "eval_steps_per_second": 0.311, + "step": 304 + }, + { + "epoch": 9.53125, + "grad_norm": 0.857390001265035, + "learning_rate": 2e-05, + "loss": 0.5932, + "step": 305 + }, 
+ { + "epoch": 9.53125, + "eval_loss": 0.6778848171234131, + "eval_runtime": 80.1508, + "eval_samples_per_second": 2.495, + "eval_steps_per_second": 0.312, + "step": 305 + }, + { + "epoch": 9.5625, + "grad_norm": 0.9430180281536945, + "learning_rate": 2e-05, + "loss": 0.5793, + "step": 306 + }, + { + "epoch": 9.5625, + "eval_loss": 0.6771917939186096, + "eval_runtime": 76.7109, + "eval_samples_per_second": 2.607, + "eval_steps_per_second": 0.326, + "step": 306 + }, + { + "epoch": 9.59375, + "grad_norm": 0.8705050270903875, + "learning_rate": 2e-05, + "loss": 0.5601, + "step": 307 + }, + { + "epoch": 9.59375, + "eval_loss": 0.6808632016181946, + "eval_runtime": 76.6965, + "eval_samples_per_second": 2.608, + "eval_steps_per_second": 0.326, + "step": 307 + }, + { + "epoch": 9.625, + "grad_norm": 0.8611871513168323, + "learning_rate": 2e-05, + "loss": 0.5953, + "step": 308 + }, + { + "epoch": 9.625, + "eval_loss": 0.6875945329666138, + "eval_runtime": 76.6592, + "eval_samples_per_second": 2.609, + "eval_steps_per_second": 0.326, + "step": 308 + }, + { + "epoch": 9.65625, + "grad_norm": 0.9066952565245906, + "learning_rate": 2e-05, + "loss": 0.5815, + "step": 309 + }, + { + "epoch": 9.65625, + "eval_loss": 0.6910049319267273, + "eval_runtime": 76.7021, + "eval_samples_per_second": 2.607, + "eval_steps_per_second": 0.326, + "step": 309 + }, + { + "epoch": 9.6875, + "grad_norm": 1.0666864048105145, + "learning_rate": 2e-05, + "loss": 0.5663, + "step": 310 + }, + { + "epoch": 9.6875, + "eval_loss": 0.6869986057281494, + "eval_runtime": 76.6344, + "eval_samples_per_second": 2.61, + "eval_steps_per_second": 0.326, + "step": 310 + }, + { + "epoch": 9.71875, + "grad_norm": 0.9413311560347162, + "learning_rate": 2e-05, + "loss": 0.5106, + "step": 311 + }, + { + "epoch": 9.71875, + "eval_loss": 0.6825075745582581, + "eval_runtime": 78.7857, + "eval_samples_per_second": 2.539, + "eval_steps_per_second": 0.317, + "step": 311 + }, + { + "epoch": 9.75, + "grad_norm": 
0.9175579044457436, + "learning_rate": 2e-05, + "loss": 0.5821, + "step": 312 + }, + { + "epoch": 9.75, + "eval_loss": 0.6794223189353943, + "eval_runtime": 78.0368, + "eval_samples_per_second": 2.563, + "eval_steps_per_second": 0.32, + "step": 312 + }, + { + "epoch": 9.78125, + "grad_norm": 0.7982785075945665, + "learning_rate": 2e-05, + "loss": 0.5781, + "step": 313 + }, + { + "epoch": 9.78125, + "eval_loss": 0.679649829864502, + "eval_runtime": 78.0513, + "eval_samples_per_second": 2.562, + "eval_steps_per_second": 0.32, + "step": 313 + }, + { + "epoch": 9.8125, + "grad_norm": 0.9284642289974022, + "learning_rate": 2e-05, + "loss": 0.5394, + "step": 314 + }, + { + "epoch": 9.8125, + "eval_loss": 0.6805163025856018, + "eval_runtime": 78.2229, + "eval_samples_per_second": 2.557, + "eval_steps_per_second": 0.32, + "step": 314 + }, + { + "epoch": 9.84375, + "grad_norm": 0.8816568355396782, + "learning_rate": 2e-05, + "loss": 0.5722, + "step": 315 + }, + { + "epoch": 9.84375, + "eval_loss": 0.6801097393035889, + "eval_runtime": 78.9282, + "eval_samples_per_second": 2.534, + "eval_steps_per_second": 0.317, + "step": 315 + }, + { + "epoch": 9.875, + "grad_norm": 0.8137119863863306, + "learning_rate": 2e-05, + "loss": 0.5831, + "step": 316 + }, + { + "epoch": 9.875, + "eval_loss": 0.6792600750923157, + "eval_runtime": 78.8166, + "eval_samples_per_second": 2.538, + "eval_steps_per_second": 0.317, + "step": 316 + }, + { + "epoch": 9.90625, + "grad_norm": 0.9595174764400289, + "learning_rate": 2e-05, + "loss": 0.5489, + "step": 317 + }, + { + "epoch": 9.90625, + "eval_loss": 0.6755692958831787, + "eval_runtime": 78.1426, + "eval_samples_per_second": 2.559, + "eval_steps_per_second": 0.32, + "step": 317 + }, + { + "epoch": 9.9375, + "grad_norm": 0.8612490247878711, + "learning_rate": 2e-05, + "loss": 0.5508, + "step": 318 + }, + { + "epoch": 9.9375, + "eval_loss": 0.673053503036499, + "eval_runtime": 78.0565, + "eval_samples_per_second": 2.562, + "eval_steps_per_second": 
0.32, + "step": 318 + }, + { + "epoch": 9.96875, + "grad_norm": 0.9474068762478358, + "learning_rate": 2e-05, + "loss": 0.5859, + "step": 319 + }, + { + "epoch": 9.96875, + "eval_loss": 0.6695602536201477, + "eval_runtime": 78.051, + "eval_samples_per_second": 2.562, + "eval_steps_per_second": 0.32, + "step": 319 + }, + { + "epoch": 10.0, + "grad_norm": 0.8401643717683449, + "learning_rate": 2e-05, + "loss": 0.5277, + "step": 320 + }, + { + "epoch": 10.0, + "eval_loss": 0.6707890033721924, + "eval_runtime": 78.9959, + "eval_samples_per_second": 2.532, + "eval_steps_per_second": 0.316, + "step": 320 + } + ], + "logging_steps": 1.0, + "max_steps": 320, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 5, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 613933061373952.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-320/training_args.bin b/checkpoint-320/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..95e42fb482680392bc7a586cb1e05294ea9095fd --- /dev/null +++ b/checkpoint-320/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df3df8f53bf051656f1ae89d4681a26c113103914ee95e8a97646c6c5c824188 +size 8312 diff --git a/checkpoint-320/zero_to_fp32.py b/checkpoint-320/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-320/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. 
def atoi(text):
    """Return ``int(text)`` when ``text`` consists only of digits, otherwise ``text`` unchanged."""
    return int(text) if text.isdigit() else text


def natural_keys(text):
    '''
    alist.sort(key=natural_keys) sorts in human order
    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)

    ``text`` is split on runs of digits and every run is converted to ``int``,
    e.g. ``"rank_10_x"`` -> ``["rank_", 10, "_x"]``, so that "rank_2" sorts before "rank_10".
    '''
    return [atoi(c) for c in re.split(r'(\d+)', text)]


def get_model_state_file(checkpoint_dir, zero_stage):
    """Return the path of the rank-0 model-states file inside ``checkpoint_dir``.

    Args:
        checkpoint_dir: directory that holds the checkpoint files.
        zero_stage: ZeRO stage of the checkpoint; stages ``<= 2`` and stage ``3`` use
            different file names.

    Returns:
        The path of the model-states file (guaranteed to exist).

    Raises:
        FileNotFoundError: ``checkpoint_dir`` is not a directory, or the expected file is missing.
        ValueError: ``zero_stage`` is neither ``<= 2`` nor ``3``.
    """
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")

    # there should be only one file
    if zero_stage <= 2:
        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
    elif zero_stage == 3:
        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
    else:
        # Without this branch `file` stayed unbound and the check below failed with an
        # UnboundLocalError; report the unsupported stage explicitly instead.
        raise ValueError(f"unknown zero stage {zero_stage}")

    if not os.path.exists(file):
        raise FileNotFoundError(f"can't find model states file at '{file}'")

    return file


def get_checkpoint_files(checkpoint_dir, glob_pattern):
    """Return the files in ``checkpoint_dir`` matching ``glob_pattern``, in natural (human) order.

    Args:
        checkpoint_dir: directory to search; taken literally, never as a pattern.
        glob_pattern: glob pattern for the file names, e.g. ``"*_optim_states.pt"``.

    Raises:
        FileNotFoundError: no file matches.
    """
    # XXX: need to test that this simple glob rule works for multi-node setup too
    # Only `glob_pattern` is a pattern. Escape the directory so that glob metacharacters
    # in its path ("[", "*", "?") are matched literally instead of silently matching nothing.
    search = os.path.join(glob.escape(checkpoint_dir), glob_pattern)
    ckpt_files = sorted(glob.glob(search), key=natural_keys)

    if len(ckpt_files) == 0:
        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")

    return ckpt_files


def get_optim_files(checkpoint_dir):
    """Return all ``*_optim_states.pt`` files of ``checkpoint_dir`` in natural order."""
    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")


def get_model_state_files(checkpoint_dir):
    """Return all ``*_model_states.pt`` files of ``checkpoint_dir`` in natural order."""
    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
def parse_optim_states(files, ds_checkpoint_dir):
    """Load the optimizer-state files and extract the fp32 groups they carry.

    Args:
        files: paths of the ``*_optim_states.pt`` files, one per partition.
        ds_checkpoint_dir: checkpoint directory; only used in the error message.

    Returns:
        ``(zero_stage, world_size, fp32_flat_groups)`` where ``fp32_flat_groups`` has one
        entry per file: the stored list of groups for stage <= 2, or a single tensor
        made by concatenating the groups for stage 3.

    Raises:
        ValueError: the first file has no zero-stage entry, the number of files differs
            from the recorded partition count, or the stage is not supported.
    """
    num_files = len(files)

    shards = []
    for path in files:
        # NOTE: torch.load unpickles the file, so only run this on checkpoints you trust.
        shard = torch.load(path, map_location=device)
        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
        # and also handle the case where it was already removed by another helper script
        shard["optimizer_state_dict"].pop("optimizer_state_dict", None)
        shards.append(shard)

    first_optim = shards[0][OPTIMIZER_STATE_DICT]
    if ZERO_STAGE not in first_optim:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = first_optim[ZERO_STAGE]
    world_size = first_optim[PARTITION_COUNT]

    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.
    if type(world_size) is list:
        world_size = max(world_size)

    if world_size != num_files:
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {num_files} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )

    # the groups are named differently in each stage
    if zero_stage <= 2:
        fp32_flat_groups = [shard[OPTIMIZER_STATE_DICT][SINGLE_PARTITION_OF_FP32_GROUPS] for shard in shards]
    elif zero_stage == 3:
        # if there is more than one param group, there will be multiple flattened tensors - one
        # flattened tensor per group - for simplicity merge them into a single tensor
        #
        # XXX: could make the script more memory efficient for when there are multiple groups - it
        # will require matching the sub-lists of param_shapes for each param group flattened tensor
        fp32_flat_groups = [torch.cat(shard[OPTIMIZER_STATE_DICT][FP32_FLAT_GROUPS], 0) for shard in shards]
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    return zero_stage, world_size, fp32_flat_groups
zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + 
full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + 
wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # 
recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. 
+ + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. 
(one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e6608a4564d18a51ddf3fd3b64a27fa6ac7ace2f --- /dev/null +++ b/config.json @@ -0,0 +1,77 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "liuhaotian/llava-v1.6-vicuna-13b", + "architectures": [ + "LlavaLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "freeze_mm_mlp_adapter": false, + "freeze_mm_vision_resampler": false, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 5120, + "image_aspect_ratio": "anyres", + "image_crop_resolution": 224, + "image_grid_pinpoints": [ + [ + 336, + 672 + ], + [ + 672, + 336 + ], + [ + 672, + 672 + ], + [ + 1008, + 336 + ], + [ + 336, + 1008 + ] + ], + "image_split_resolution": 224, + "initializer_range": 0.02, + "intermediate_size": 13824, + "max_length": 4096, + "max_position_embeddings": 4096, + "mlp_bias": false, + "mm_hidden_size": 1024, + "mm_patch_merge_type": "flat", + "mm_projector_lr": 2e-05, + "mm_projector_type": "mlp2x_gelu", + "mm_resampler_type": null, + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "openai/clip-vit-large-patch14-336", + "mm_vision_tower_lr": 2e-06, + "model_type": "llava_llama", + "num_attention_heads": 40, + "num_hidden_layers": 40, + "num_key_value_heads": 40, + "pad_token_id": 0, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 
10000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "bfloat16", + "transformers_version": "4.46.3", + "tune_mm_mlp_adapter": false, + "tune_mm_vision_resampler": false, + "unfreeze_mm_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "vocab_size": 32000 +} diff --git a/non_lora_trainables.bin b/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..20fee334e636d45956ccdd990b55fff04b5b66a6 --- /dev/null +++ b/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05b685cfc78d68bd5ca3e549eb69e061a0e258fd61a17b196e3ef3876ec7cda3 +size 62937264 diff --git a/optimizer.pt b/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..dbac1432997de6b769e0f7a7e030d794b26f7871 --- /dev/null +++ b/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7758883238817def488bea1dc14bb3c1a10225fa8d5b92dc0ada5d92c0cdf52b +size 191824418 diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4780db2d21891d9dbb836f8a5366026a3a1d5901 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,4842 @@ +{ + "best_metric": 0.6575854420661926, + "best_model_checkpoint": "./checkpoints/llava-v1.6-vicuna-13b_anyres/checkpoint-256", + "epoch": 10.0, + "eval_steps": 1.0, + "global_step": 320, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.03125, + "grad_norm": 0.5230235555406132, + "learning_rate": 0.0, + "loss": 1.5809, + "step": 1 + }, + { + "epoch": 0.03125, + "eval_loss": 1.6275018453598022, + "eval_runtime": 82.059, + "eval_samples_per_second": 2.437, + "eval_steps_per_second": 0.305, + "step": 1 + }, + { + "epoch": 0.0625, + "grad_norm": 0.5095402010892089, + "learning_rate": 2e-05, + "loss": 1.4958, + "step": 2 + }, + 
{ + "epoch": 0.0625, + "eval_loss": 1.6275018453598022, + "eval_runtime": 76.5747, + "eval_samples_per_second": 2.612, + "eval_steps_per_second": 0.326, + "step": 2 + }, + { + "epoch": 0.09375, + "grad_norm": 0.4998514282504938, + "learning_rate": 2e-05, + "loss": 1.5552, + "step": 3 + }, + { + "epoch": 0.09375, + "eval_loss": 1.5956931114196777, + "eval_runtime": 76.1563, + "eval_samples_per_second": 2.626, + "eval_steps_per_second": 0.328, + "step": 3 + }, + { + "epoch": 0.125, + "grad_norm": 0.4280580315108126, + "learning_rate": 2e-05, + "loss": 1.4846, + "step": 4 + }, + { + "epoch": 0.125, + "eval_loss": 1.5584176778793335, + "eval_runtime": 76.1235, + "eval_samples_per_second": 2.627, + "eval_steps_per_second": 0.328, + "step": 4 + }, + { + "epoch": 0.15625, + "grad_norm": 0.5678499435986384, + "learning_rate": 2e-05, + "loss": 1.5036, + "step": 5 + }, + { + "epoch": 0.15625, + "eval_loss": 1.5207562446594238, + "eval_runtime": 76.1514, + "eval_samples_per_second": 2.626, + "eval_steps_per_second": 0.328, + "step": 5 + }, + { + "epoch": 0.1875, + "grad_norm": 0.5368461657542534, + "learning_rate": 2e-05, + "loss": 1.476, + "step": 6 + }, + { + "epoch": 0.1875, + "eval_loss": 1.4807783365249634, + "eval_runtime": 77.3444, + "eval_samples_per_second": 2.586, + "eval_steps_per_second": 0.323, + "step": 6 + }, + { + "epoch": 0.21875, + "grad_norm": 0.5549950083087136, + "learning_rate": 2e-05, + "loss": 1.4358, + "step": 7 + }, + { + "epoch": 0.21875, + "eval_loss": 1.4411544799804688, + "eval_runtime": 77.066, + "eval_samples_per_second": 2.595, + "eval_steps_per_second": 0.324, + "step": 7 + }, + { + "epoch": 0.25, + "grad_norm": 0.5549950083087136, + "learning_rate": 2e-05, + "loss": 1.4369, + "step": 8 + }, + { + "epoch": 0.25, + "eval_loss": 1.4411544799804688, + "eval_runtime": 77.2807, + "eval_samples_per_second": 2.588, + "eval_steps_per_second": 0.323, + "step": 8 + }, + { + "epoch": 0.28125, + "grad_norm": 0.5292240951443854, + "learning_rate": 2e-05, 
+ "loss": 1.4471, + "step": 9 + }, + { + "epoch": 0.28125, + "eval_loss": 1.4036556482315063, + "eval_runtime": 78.1562, + "eval_samples_per_second": 2.559, + "eval_steps_per_second": 0.32, + "step": 9 + }, + { + "epoch": 0.3125, + "grad_norm": 0.5292240951443854, + "learning_rate": 2e-05, + "loss": 1.3666, + "step": 10 + }, + { + "epoch": 0.3125, + "eval_loss": 1.4036556482315063, + "eval_runtime": 77.1645, + "eval_samples_per_second": 2.592, + "eval_steps_per_second": 0.324, + "step": 10 + }, + { + "epoch": 0.34375, + "grad_norm": 0.5292240951443854, + "learning_rate": 2e-05, + "loss": 1.4149, + "step": 11 + }, + { + "epoch": 0.34375, + "eval_loss": 1.4036556482315063, + "eval_runtime": 78.7627, + "eval_samples_per_second": 2.539, + "eval_steps_per_second": 0.317, + "step": 11 + }, + { + "epoch": 0.375, + "grad_norm": 0.684588966714067, + "learning_rate": 2e-05, + "loss": 1.3883, + "step": 12 + }, + { + "epoch": 0.375, + "eval_loss": 1.3679308891296387, + "eval_runtime": 78.4315, + "eval_samples_per_second": 2.55, + "eval_steps_per_second": 0.319, + "step": 12 + }, + { + "epoch": 0.40625, + "grad_norm": 0.6261826769491422, + "learning_rate": 2e-05, + "loss": 1.4271, + "step": 13 + }, + { + "epoch": 0.40625, + "eval_loss": 1.3369851112365723, + "eval_runtime": 78.685, + "eval_samples_per_second": 2.542, + "eval_steps_per_second": 0.318, + "step": 13 + }, + { + "epoch": 0.4375, + "grad_norm": 0.6261826769491422, + "learning_rate": 2e-05, + "loss": 1.2495, + "step": 14 + }, + { + "epoch": 0.4375, + "eval_loss": 1.3369851112365723, + "eval_runtime": 78.0511, + "eval_samples_per_second": 2.562, + "eval_steps_per_second": 0.32, + "step": 14 + }, + { + "epoch": 0.46875, + "grad_norm": 0.6028103951693778, + "learning_rate": 2e-05, + "loss": 1.3513, + "step": 15 + }, + { + "epoch": 0.46875, + "eval_loss": 1.3032653331756592, + "eval_runtime": 78.0271, + "eval_samples_per_second": 2.563, + "eval_steps_per_second": 0.32, + "step": 15 + }, + { + "epoch": 0.5, + "grad_norm": 
0.769290402283396, + "learning_rate": 2e-05, + "loss": 1.3117, + "step": 16 + }, + { + "epoch": 0.5, + "eval_loss": 1.2661188840866089, + "eval_runtime": 78.1857, + "eval_samples_per_second": 2.558, + "eval_steps_per_second": 0.32, + "step": 16 + }, + { + "epoch": 0.53125, + "grad_norm": 1.3279338025863765, + "learning_rate": 2e-05, + "loss": 1.2768, + "step": 17 + }, + { + "epoch": 0.53125, + "eval_loss": 1.2299447059631348, + "eval_runtime": 78.2064, + "eval_samples_per_second": 2.557, + "eval_steps_per_second": 0.32, + "step": 17 + }, + { + "epoch": 0.5625, + "grad_norm": 0.7410327159336384, + "learning_rate": 2e-05, + "loss": 1.256, + "step": 18 + }, + { + "epoch": 0.5625, + "eval_loss": 1.2044258117675781, + "eval_runtime": 78.072, + "eval_samples_per_second": 2.562, + "eval_steps_per_second": 0.32, + "step": 18 + }, + { + "epoch": 0.59375, + "grad_norm": 0.44078820770408506, + "learning_rate": 2e-05, + "loss": 1.1252, + "step": 19 + }, + { + "epoch": 0.59375, + "eval_loss": 1.1826122999191284, + "eval_runtime": 78.7312, + "eval_samples_per_second": 2.54, + "eval_steps_per_second": 0.318, + "step": 19 + }, + { + "epoch": 0.625, + "grad_norm": 0.49020841613371097, + "learning_rate": 2e-05, + "loss": 1.2249, + "step": 20 + }, + { + "epoch": 0.625, + "eval_loss": 1.1616511344909668, + "eval_runtime": 78.2736, + "eval_samples_per_second": 2.555, + "eval_steps_per_second": 0.319, + "step": 20 + }, + { + "epoch": 0.65625, + "grad_norm": 0.43031322695269714, + "learning_rate": 2e-05, + "loss": 1.1466, + "step": 21 + }, + { + "epoch": 0.65625, + "eval_loss": 1.1410629749298096, + "eval_runtime": 79.6432, + "eval_samples_per_second": 2.511, + "eval_steps_per_second": 0.314, + "step": 21 + }, + { + "epoch": 0.6875, + "grad_norm": 0.45632085445955545, + "learning_rate": 2e-05, + "loss": 1.1951, + "step": 22 + }, + { + "epoch": 0.6875, + "eval_loss": 1.1204684972763062, + "eval_runtime": 79.0609, + "eval_samples_per_second": 2.53, + "eval_steps_per_second": 0.316, + 
"step": 22 + }, + { + "epoch": 0.71875, + "grad_norm": 0.40048586945364495, + "learning_rate": 2e-05, + "loss": 1.1826, + "step": 23 + }, + { + "epoch": 0.71875, + "eval_loss": 1.1002545356750488, + "eval_runtime": 82.8578, + "eval_samples_per_second": 2.414, + "eval_steps_per_second": 0.302, + "step": 23 + }, + { + "epoch": 0.75, + "grad_norm": 0.3703033261027938, + "learning_rate": 2e-05, + "loss": 1.1543, + "step": 24 + }, + { + "epoch": 0.75, + "eval_loss": 1.0805977582931519, + "eval_runtime": 76.1407, + "eval_samples_per_second": 2.627, + "eval_steps_per_second": 0.328, + "step": 24 + }, + { + "epoch": 0.78125, + "grad_norm": 0.3986313105418924, + "learning_rate": 2e-05, + "loss": 1.1046, + "step": 25 + }, + { + "epoch": 0.78125, + "eval_loss": 1.0610157251358032, + "eval_runtime": 76.3083, + "eval_samples_per_second": 2.621, + "eval_steps_per_second": 0.328, + "step": 25 + }, + { + "epoch": 0.8125, + "grad_norm": 0.36265027203577943, + "learning_rate": 2e-05, + "loss": 1.1048, + "step": 26 + }, + { + "epoch": 0.8125, + "eval_loss": 1.0421289205551147, + "eval_runtime": 77.2186, + "eval_samples_per_second": 2.59, + "eval_steps_per_second": 0.324, + "step": 26 + }, + { + "epoch": 0.84375, + "grad_norm": 0.3881748990218768, + "learning_rate": 2e-05, + "loss": 1.0425, + "step": 27 + }, + { + "epoch": 0.84375, + "eval_loss": 1.0240073204040527, + "eval_runtime": 77.8662, + "eval_samples_per_second": 2.569, + "eval_steps_per_second": 0.321, + "step": 27 + }, + { + "epoch": 0.875, + "grad_norm": 0.3734031294324286, + "learning_rate": 2e-05, + "loss": 1.0484, + "step": 28 + }, + { + "epoch": 0.875, + "eval_loss": 1.0066957473754883, + "eval_runtime": 77.269, + "eval_samples_per_second": 2.588, + "eval_steps_per_second": 0.324, + "step": 28 + }, + { + "epoch": 0.90625, + "grad_norm": 0.29695383079342563, + "learning_rate": 2e-05, + "loss": 1.0387, + "step": 29 + }, + { + "epoch": 0.90625, + "eval_loss": 0.9906074404716492, + "eval_runtime": 77.2245, + 
"eval_samples_per_second": 2.59, + "eval_steps_per_second": 0.324, + "step": 29 + }, + { + "epoch": 0.9375, + "grad_norm": 0.29273146875026623, + "learning_rate": 2e-05, + "loss": 1.0568, + "step": 30 + }, + { + "epoch": 0.9375, + "eval_loss": 0.975755512714386, + "eval_runtime": 78.0056, + "eval_samples_per_second": 2.564, + "eval_steps_per_second": 0.32, + "step": 30 + }, + { + "epoch": 0.96875, + "grad_norm": 0.35070440686850546, + "learning_rate": 2e-05, + "loss": 0.9114, + "step": 31 + }, + { + "epoch": 0.96875, + "eval_loss": 0.9615123271942139, + "eval_runtime": 77.9051, + "eval_samples_per_second": 2.567, + "eval_steps_per_second": 0.321, + "step": 31 + }, + { + "epoch": 1.0, + "grad_norm": 0.30846157140439384, + "learning_rate": 2e-05, + "loss": 0.9941, + "step": 32 + }, + { + "epoch": 1.0, + "eval_loss": 0.9480571150779724, + "eval_runtime": 77.2322, + "eval_samples_per_second": 2.59, + "eval_steps_per_second": 0.324, + "step": 32 + }, + { + "epoch": 1.03125, + "grad_norm": 0.2950381371932973, + "learning_rate": 2e-05, + "loss": 1.0297, + "step": 33 + }, + { + "epoch": 1.03125, + "eval_loss": 0.9356330037117004, + "eval_runtime": 81.8443, + "eval_samples_per_second": 2.444, + "eval_steps_per_second": 0.305, + "step": 33 + }, + { + "epoch": 1.0625, + "grad_norm": 0.27080038065834283, + "learning_rate": 2e-05, + "loss": 1.021, + "step": 34 + }, + { + "epoch": 1.0625, + "eval_loss": 0.9245791435241699, + "eval_runtime": 76.2071, + "eval_samples_per_second": 2.624, + "eval_steps_per_second": 0.328, + "step": 34 + }, + { + "epoch": 1.09375, + "grad_norm": 0.23165081252649894, + "learning_rate": 2e-05, + "loss": 1.0366, + "step": 35 + }, + { + "epoch": 1.09375, + "eval_loss": 0.9151126146316528, + "eval_runtime": 77.0412, + "eval_samples_per_second": 2.596, + "eval_steps_per_second": 0.325, + "step": 35 + }, + { + "epoch": 1.125, + "grad_norm": 0.4033780922500775, + "learning_rate": 2e-05, + "loss": 1.0127, + "step": 36 + }, + { + "epoch": 1.125, + "eval_loss": 
0.9063960313796997, + "eval_runtime": 76.9327, + "eval_samples_per_second": 2.6, + "eval_steps_per_second": 0.325, + "step": 36 + }, + { + "epoch": 1.15625, + "grad_norm": 0.2398039831439168, + "learning_rate": 2e-05, + "loss": 0.9418, + "step": 37 + }, + { + "epoch": 1.15625, + "eval_loss": 0.8982363939285278, + "eval_runtime": 76.1234, + "eval_samples_per_second": 2.627, + "eval_steps_per_second": 0.328, + "step": 37 + }, + { + "epoch": 1.1875, + "grad_norm": 0.28793451241246804, + "learning_rate": 2e-05, + "loss": 0.9643, + "step": 38 + }, + { + "epoch": 1.1875, + "eval_loss": 0.8908895254135132, + "eval_runtime": 76.2877, + "eval_samples_per_second": 2.622, + "eval_steps_per_second": 0.328, + "step": 38 + }, + { + "epoch": 1.21875, + "grad_norm": 0.2927691606307197, + "learning_rate": 2e-05, + "loss": 1.0087, + "step": 39 + }, + { + "epoch": 1.21875, + "eval_loss": 0.8845618367195129, + "eval_runtime": 76.2282, + "eval_samples_per_second": 2.624, + "eval_steps_per_second": 0.328, + "step": 39 + }, + { + "epoch": 1.25, + "grad_norm": 0.26410982001408806, + "learning_rate": 2e-05, + "loss": 0.986, + "step": 40 + }, + { + "epoch": 1.25, + "eval_loss": 0.8784474730491638, + "eval_runtime": 76.2512, + "eval_samples_per_second": 2.623, + "eval_steps_per_second": 0.328, + "step": 40 + }, + { + "epoch": 1.28125, + "grad_norm": 0.29182630949665306, + "learning_rate": 2e-05, + "loss": 0.9711, + "step": 41 + }, + { + "epoch": 1.28125, + "eval_loss": 0.8725223541259766, + "eval_runtime": 77.1229, + "eval_samples_per_second": 2.593, + "eval_steps_per_second": 0.324, + "step": 41 + }, + { + "epoch": 1.3125, + "grad_norm": 0.36402838796832665, + "learning_rate": 2e-05, + "loss": 0.9263, + "step": 42 + }, + { + "epoch": 1.3125, + "eval_loss": 0.8662790060043335, + "eval_runtime": 77.2362, + "eval_samples_per_second": 2.589, + "eval_steps_per_second": 0.324, + "step": 42 + }, + { + "epoch": 1.34375, + "grad_norm": 0.29338184478895163, + "learning_rate": 2e-05, + "loss": 0.8947, 
+ "step": 43 + }, + { + "epoch": 1.34375, + "eval_loss": 0.8600431680679321, + "eval_runtime": 77.1213, + "eval_samples_per_second": 2.593, + "eval_steps_per_second": 0.324, + "step": 43 + }, + { + "epoch": 1.375, + "grad_norm": 0.2201714229702277, + "learning_rate": 2e-05, + "loss": 0.9059, + "step": 44 + }, + { + "epoch": 1.375, + "eval_loss": 0.8545799255371094, + "eval_runtime": 77.991, + "eval_samples_per_second": 2.564, + "eval_steps_per_second": 0.321, + "step": 44 + }, + { + "epoch": 1.40625, + "grad_norm": 0.2254966625243654, + "learning_rate": 2e-05, + "loss": 0.8942, + "step": 45 + }, + { + "epoch": 1.40625, + "eval_loss": 0.8497399687767029, + "eval_runtime": 77.2698, + "eval_samples_per_second": 2.588, + "eval_steps_per_second": 0.324, + "step": 45 + }, + { + "epoch": 1.4375, + "grad_norm": 0.21753318432075458, + "learning_rate": 2e-05, + "loss": 0.9376, + "step": 46 + }, + { + "epoch": 1.4375, + "eval_loss": 0.8452473282814026, + "eval_runtime": 77.0568, + "eval_samples_per_second": 2.595, + "eval_steps_per_second": 0.324, + "step": 46 + }, + { + "epoch": 1.46875, + "grad_norm": 0.21449718265972945, + "learning_rate": 2e-05, + "loss": 0.9369, + "step": 47 + }, + { + "epoch": 1.46875, + "eval_loss": 0.841134786605835, + "eval_runtime": 77.225, + "eval_samples_per_second": 2.59, + "eval_steps_per_second": 0.324, + "step": 47 + }, + { + "epoch": 1.5, + "grad_norm": 0.2109063266748924, + "learning_rate": 2e-05, + "loss": 0.8511, + "step": 48 + }, + { + "epoch": 1.5, + "eval_loss": 0.8373770117759705, + "eval_runtime": 76.2309, + "eval_samples_per_second": 2.624, + "eval_steps_per_second": 0.328, + "step": 48 + }, + { + "epoch": 1.53125, + "grad_norm": 0.232838633689838, + "learning_rate": 2e-05, + "loss": 0.8694, + "step": 49 + }, + { + "epoch": 1.53125, + "eval_loss": 0.8338289856910706, + "eval_runtime": 76.277, + "eval_samples_per_second": 2.622, + "eval_steps_per_second": 0.328, + "step": 49 + }, + { + "epoch": 1.5625, + "grad_norm": 
0.4189704940803984, + "learning_rate": 2e-05, + "loss": 0.8464, + "step": 50 + }, + { + "epoch": 1.5625, + "eval_loss": 0.8297132849693298, + "eval_runtime": 76.2872, + "eval_samples_per_second": 2.622, + "eval_steps_per_second": 0.328, + "step": 50 + }, + { + "epoch": 1.59375, + "grad_norm": 0.2171618165123276, + "learning_rate": 2e-05, + "loss": 0.8785, + "step": 51 + }, + { + "epoch": 1.59375, + "eval_loss": 0.8257431983947754, + "eval_runtime": 76.2639, + "eval_samples_per_second": 2.622, + "eval_steps_per_second": 0.328, + "step": 51 + }, + { + "epoch": 1.625, + "grad_norm": 0.21934651037670305, + "learning_rate": 2e-05, + "loss": 0.7645, + "step": 52 + }, + { + "epoch": 1.625, + "eval_loss": 0.8223557472229004, + "eval_runtime": 76.2383, + "eval_samples_per_second": 2.623, + "eval_steps_per_second": 0.328, + "step": 52 + }, + { + "epoch": 1.65625, + "grad_norm": 0.24183530733164746, + "learning_rate": 2e-05, + "loss": 0.9218, + "step": 53 + }, + { + "epoch": 1.65625, + "eval_loss": 0.8189653158187866, + "eval_runtime": 76.9819, + "eval_samples_per_second": 2.598, + "eval_steps_per_second": 0.325, + "step": 53 + }, + { + "epoch": 1.6875, + "grad_norm": 0.23450930244279267, + "learning_rate": 2e-05, + "loss": 0.8896, + "step": 54 + }, + { + "epoch": 1.6875, + "eval_loss": 0.8152530193328857, + "eval_runtime": 76.2378, + "eval_samples_per_second": 2.623, + "eval_steps_per_second": 0.328, + "step": 54 + }, + { + "epoch": 1.71875, + "grad_norm": 0.22081665899796085, + "learning_rate": 2e-05, + "loss": 0.8798, + "step": 55 + }, + { + "epoch": 1.71875, + "eval_loss": 0.8122122287750244, + "eval_runtime": 76.289, + "eval_samples_per_second": 2.622, + "eval_steps_per_second": 0.328, + "step": 55 + }, + { + "epoch": 1.75, + "grad_norm": 0.21311746114111046, + "learning_rate": 2e-05, + "loss": 0.9482, + "step": 56 + }, + { + "epoch": 1.75, + "eval_loss": 0.8092318773269653, + "eval_runtime": 77.8321, + "eval_samples_per_second": 2.57, + "eval_steps_per_second": 0.321, + 
"step": 56 + }, + { + "epoch": 1.78125, + "grad_norm": 0.2496565307107556, + "learning_rate": 2e-05, + "loss": 0.8917, + "step": 57 + }, + { + "epoch": 1.78125, + "eval_loss": 0.8070546984672546, + "eval_runtime": 77.2651, + "eval_samples_per_second": 2.588, + "eval_steps_per_second": 0.324, + "step": 57 + }, + { + "epoch": 1.8125, + "grad_norm": 0.2137866456424736, + "learning_rate": 2e-05, + "loss": 0.909, + "step": 58 + }, + { + "epoch": 1.8125, + "eval_loss": 0.8049566745758057, + "eval_runtime": 78.0925, + "eval_samples_per_second": 2.561, + "eval_steps_per_second": 0.32, + "step": 58 + }, + { + "epoch": 1.84375, + "grad_norm": 0.22567502859345095, + "learning_rate": 2e-05, + "loss": 0.8611, + "step": 59 + }, + { + "epoch": 1.84375, + "eval_loss": 0.8028810024261475, + "eval_runtime": 78.0553, + "eval_samples_per_second": 2.562, + "eval_steps_per_second": 0.32, + "step": 59 + }, + { + "epoch": 1.875, + "grad_norm": 0.23303796552302508, + "learning_rate": 2e-05, + "loss": 0.9209, + "step": 60 + }, + { + "epoch": 1.875, + "eval_loss": 0.800568699836731, + "eval_runtime": 78.052, + "eval_samples_per_second": 2.562, + "eval_steps_per_second": 0.32, + "step": 60 + }, + { + "epoch": 1.90625, + "grad_norm": 0.24566727726974544, + "learning_rate": 2e-05, + "loss": 0.8239, + "step": 61 + }, + { + "epoch": 1.90625, + "eval_loss": 0.7976545691490173, + "eval_runtime": 77.3056, + "eval_samples_per_second": 2.587, + "eval_steps_per_second": 0.323, + "step": 61 + }, + { + "epoch": 1.9375, + "grad_norm": 0.23014192522354907, + "learning_rate": 2e-05, + "loss": 0.8814, + "step": 62 + }, + { + "epoch": 1.9375, + "eval_loss": 0.7945474982261658, + "eval_runtime": 77.3398, + "eval_samples_per_second": 2.586, + "eval_steps_per_second": 0.323, + "step": 62 + }, + { + "epoch": 1.96875, + "grad_norm": 0.23042819102671622, + "learning_rate": 2e-05, + "loss": 0.9064, + "step": 63 + }, + { + "epoch": 1.96875, + "eval_loss": 0.7918359637260437, + "eval_runtime": 77.4272, + 
"eval_samples_per_second": 2.583, + "eval_steps_per_second": 0.323, + "step": 63 + }, + { + "epoch": 2.0, + "grad_norm": 0.23940667173206315, + "learning_rate": 2e-05, + "loss": 0.8658, + "step": 64 + }, + { + "epoch": 2.0, + "eval_loss": 0.7891160845756531, + "eval_runtime": 77.3236, + "eval_samples_per_second": 2.587, + "eval_steps_per_second": 0.323, + "step": 64 + }, + { + "epoch": 2.03125, + "grad_norm": 0.22630342930143643, + "learning_rate": 2e-05, + "loss": 0.8403, + "step": 65 + }, + { + "epoch": 2.03125, + "eval_loss": 0.7859742641448975, + "eval_runtime": 77.2001, + "eval_samples_per_second": 2.591, + "eval_steps_per_second": 0.324, + "step": 65 + }, + { + "epoch": 2.0625, + "grad_norm": 0.20949240460260976, + "learning_rate": 2e-05, + "loss": 0.8472, + "step": 66 + }, + { + "epoch": 2.0625, + "eval_loss": 0.7834083437919617, + "eval_runtime": 78.9646, + "eval_samples_per_second": 2.533, + "eval_steps_per_second": 0.317, + "step": 66 + }, + { + "epoch": 2.09375, + "grad_norm": 0.22714400479820654, + "learning_rate": 2e-05, + "loss": 0.841, + "step": 67 + }, + { + "epoch": 2.09375, + "eval_loss": 0.7805308699607849, + "eval_runtime": 78.7552, + "eval_samples_per_second": 2.54, + "eval_steps_per_second": 0.317, + "step": 67 + }, + { + "epoch": 2.125, + "grad_norm": 0.23345123077006047, + "learning_rate": 2e-05, + "loss": 0.9028, + "step": 68 + }, + { + "epoch": 2.125, + "eval_loss": 0.7779514789581299, + "eval_runtime": 78.3387, + "eval_samples_per_second": 2.553, + "eval_steps_per_second": 0.319, + "step": 68 + }, + { + "epoch": 2.15625, + "grad_norm": 0.251841542575211, + "learning_rate": 2e-05, + "loss": 0.8381, + "step": 69 + }, + { + "epoch": 2.15625, + "eval_loss": 0.7756664752960205, + "eval_runtime": 78.3109, + "eval_samples_per_second": 2.554, + "eval_steps_per_second": 0.319, + "step": 69 + }, + { + "epoch": 2.1875, + "grad_norm": 0.23548386839773608, + "learning_rate": 2e-05, + "loss": 0.7914, + "step": 70 + }, + { + "epoch": 2.1875, + 
"eval_loss": 0.7733604907989502, + "eval_runtime": 78.9712, + "eval_samples_per_second": 2.533, + "eval_steps_per_second": 0.317, + "step": 70 + }, + { + "epoch": 2.21875, + "grad_norm": 0.23262740912668387, + "learning_rate": 2e-05, + "loss": 0.8778, + "step": 71 + }, + { + "epoch": 2.21875, + "eval_loss": 0.771755576133728, + "eval_runtime": 78.2633, + "eval_samples_per_second": 2.555, + "eval_steps_per_second": 0.319, + "step": 71 + }, + { + "epoch": 2.25, + "grad_norm": 0.22075289612357513, + "learning_rate": 2e-05, + "loss": 0.7945, + "step": 72 + }, + { + "epoch": 2.25, + "eval_loss": 0.7705450654029846, + "eval_runtime": 78.3151, + "eval_samples_per_second": 2.554, + "eval_steps_per_second": 0.319, + "step": 72 + }, + { + "epoch": 2.28125, + "grad_norm": 0.25520381955936466, + "learning_rate": 2e-05, + "loss": 0.8387, + "step": 73 + }, + { + "epoch": 2.28125, + "eval_loss": 0.7695029973983765, + "eval_runtime": 78.2901, + "eval_samples_per_second": 2.555, + "eval_steps_per_second": 0.319, + "step": 73 + }, + { + "epoch": 2.3125, + "grad_norm": 0.2047305385827267, + "learning_rate": 2e-05, + "loss": 0.8404, + "step": 74 + }, + { + "epoch": 2.3125, + "eval_loss": 0.7684457302093506, + "eval_runtime": 78.3875, + "eval_samples_per_second": 2.551, + "eval_steps_per_second": 0.319, + "step": 74 + }, + { + "epoch": 2.34375, + "grad_norm": 0.2262323045133288, + "learning_rate": 2e-05, + "loss": 0.8811, + "step": 75 + }, + { + "epoch": 2.34375, + "eval_loss": 0.7671162486076355, + "eval_runtime": 78.202, + "eval_samples_per_second": 2.557, + "eval_steps_per_second": 0.32, + "step": 75 + }, + { + "epoch": 2.375, + "grad_norm": 0.21885464923925876, + "learning_rate": 2e-05, + "loss": 0.7942, + "step": 76 + }, + { + "epoch": 2.375, + "eval_loss": 0.7658494710922241, + "eval_runtime": 78.1746, + "eval_samples_per_second": 2.558, + "eval_steps_per_second": 0.32, + "step": 76 + }, + { + "epoch": 2.40625, + "grad_norm": 0.21717306953626966, + "learning_rate": 2e-05, + 
"loss": 0.8497, + "step": 77 + }, + { + "epoch": 2.40625, + "eval_loss": 0.7642120122909546, + "eval_runtime": 78.2026, + "eval_samples_per_second": 2.557, + "eval_steps_per_second": 0.32, + "step": 77 + }, + { + "epoch": 2.4375, + "grad_norm": 0.2530725583748258, + "learning_rate": 2e-05, + "loss": 0.8584, + "step": 78 + }, + { + "epoch": 2.4375, + "eval_loss": 0.7625510692596436, + "eval_runtime": 78.1991, + "eval_samples_per_second": 2.558, + "eval_steps_per_second": 0.32, + "step": 78 + }, + { + "epoch": 2.46875, + "grad_norm": 0.25354787036627263, + "learning_rate": 2e-05, + "loss": 0.8569, + "step": 79 + }, + { + "epoch": 2.46875, + "eval_loss": 0.7616268396377563, + "eval_runtime": 78.2915, + "eval_samples_per_second": 2.555, + "eval_steps_per_second": 0.319, + "step": 79 + }, + { + "epoch": 2.5, + "grad_norm": 0.2800865746664007, + "learning_rate": 2e-05, + "loss": 0.9116, + "step": 80 + }, + { + "epoch": 2.5, + "eval_loss": 0.7603214979171753, + "eval_runtime": 78.2749, + "eval_samples_per_second": 2.555, + "eval_steps_per_second": 0.319, + "step": 80 + }, + { + "epoch": 2.53125, + "grad_norm": 0.268139688449618, + "learning_rate": 2e-05, + "loss": 0.8397, + "step": 81 + }, + { + "epoch": 2.53125, + "eval_loss": 0.7584869265556335, + "eval_runtime": 79.1445, + "eval_samples_per_second": 2.527, + "eval_steps_per_second": 0.316, + "step": 81 + }, + { + "epoch": 2.5625, + "grad_norm": 0.3128648654463789, + "learning_rate": 2e-05, + "loss": 0.8888, + "step": 82 + }, + { + "epoch": 2.5625, + "eval_loss": 0.7566561102867126, + "eval_runtime": 79.2089, + "eval_samples_per_second": 2.525, + "eval_steps_per_second": 0.316, + "step": 82 + }, + { + "epoch": 2.59375, + "grad_norm": 0.2502355211215609, + "learning_rate": 2e-05, + "loss": 0.8346, + "step": 83 + }, + { + "epoch": 2.59375, + "eval_loss": 0.7547345161437988, + "eval_runtime": 79.2691, + "eval_samples_per_second": 2.523, + "eval_steps_per_second": 0.315, + "step": 83 + }, + { + "epoch": 2.625, + 
"grad_norm": 0.25281184629018644, + "learning_rate": 2e-05, + "loss": 0.795, + "step": 84 + }, + { + "epoch": 2.625, + "eval_loss": 0.7527951598167419, + "eval_runtime": 79.4068, + "eval_samples_per_second": 2.519, + "eval_steps_per_second": 0.315, + "step": 84 + }, + { + "epoch": 2.65625, + "grad_norm": 0.24246729562645003, + "learning_rate": 2e-05, + "loss": 0.7649, + "step": 85 + }, + { + "epoch": 2.65625, + "eval_loss": 0.7509815096855164, + "eval_runtime": 79.1612, + "eval_samples_per_second": 2.526, + "eval_steps_per_second": 0.316, + "step": 85 + }, + { + "epoch": 2.6875, + "grad_norm": 0.27005475109453947, + "learning_rate": 2e-05, + "loss": 0.7964, + "step": 86 + }, + { + "epoch": 2.6875, + "eval_loss": 0.7485950589179993, + "eval_runtime": 80.0714, + "eval_samples_per_second": 2.498, + "eval_steps_per_second": 0.312, + "step": 86 + }, + { + "epoch": 2.71875, + "grad_norm": 0.2723492355800971, + "learning_rate": 2e-05, + "loss": 0.8117, + "step": 87 + }, + { + "epoch": 2.71875, + "eval_loss": 0.7459420561790466, + "eval_runtime": 79.4075, + "eval_samples_per_second": 2.519, + "eval_steps_per_second": 0.315, + "step": 87 + }, + { + "epoch": 2.75, + "grad_norm": 0.2946493898427159, + "learning_rate": 2e-05, + "loss": 0.8986, + "step": 88 + }, + { + "epoch": 2.75, + "eval_loss": 0.7436455488204956, + "eval_runtime": 79.3721, + "eval_samples_per_second": 2.52, + "eval_steps_per_second": 0.315, + "step": 88 + }, + { + "epoch": 2.78125, + "grad_norm": 0.26411214734213284, + "learning_rate": 2e-05, + "loss": 0.8145, + "step": 89 + }, + { + "epoch": 2.78125, + "eval_loss": 0.7424752712249756, + "eval_runtime": 79.2988, + "eval_samples_per_second": 2.522, + "eval_steps_per_second": 0.315, + "step": 89 + }, + { + "epoch": 2.8125, + "grad_norm": 0.27115747269014817, + "learning_rate": 2e-05, + "loss": 0.8457, + "step": 90 + }, + { + "epoch": 2.8125, + "eval_loss": 0.7416408658027649, + "eval_runtime": 79.4004, + "eval_samples_per_second": 2.519, + 
"eval_steps_per_second": 0.315, + "step": 90 + }, + { + "epoch": 2.84375, + "grad_norm": 0.25831877964821937, + "learning_rate": 2e-05, + "loss": 0.7568, + "step": 91 + }, + { + "epoch": 2.84375, + "eval_loss": 0.7404463291168213, + "eval_runtime": 81.7767, + "eval_samples_per_second": 2.446, + "eval_steps_per_second": 0.306, + "step": 91 + }, + { + "epoch": 2.875, + "grad_norm": 0.31273388454942935, + "learning_rate": 2e-05, + "loss": 0.8562, + "step": 92 + }, + { + "epoch": 2.875, + "eval_loss": 0.7384185791015625, + "eval_runtime": 82.3443, + "eval_samples_per_second": 2.429, + "eval_steps_per_second": 0.304, + "step": 92 + }, + { + "epoch": 2.90625, + "grad_norm": 0.2838267071008901, + "learning_rate": 2e-05, + "loss": 0.7869, + "step": 93 + }, + { + "epoch": 2.90625, + "eval_loss": 0.7366807460784912, + "eval_runtime": 82.2622, + "eval_samples_per_second": 2.431, + "eval_steps_per_second": 0.304, + "step": 93 + }, + { + "epoch": 2.9375, + "grad_norm": 0.28625827941831467, + "learning_rate": 2e-05, + "loss": 0.8618, + "step": 94 + }, + { + "epoch": 2.9375, + "eval_loss": 0.7357398867607117, + "eval_runtime": 81.9471, + "eval_samples_per_second": 2.441, + "eval_steps_per_second": 0.305, + "step": 94 + }, + { + "epoch": 2.96875, + "grad_norm": 0.25548002643954326, + "learning_rate": 2e-05, + "loss": 0.8085, + "step": 95 + }, + { + "epoch": 2.96875, + "eval_loss": 0.7356534004211426, + "eval_runtime": 82.1186, + "eval_samples_per_second": 2.436, + "eval_steps_per_second": 0.304, + "step": 95 + }, + { + "epoch": 3.0, + "grad_norm": 0.27081450830961107, + "learning_rate": 2e-05, + "loss": 0.7684, + "step": 96 + }, + { + "epoch": 3.0, + "eval_loss": 0.7346957921981812, + "eval_runtime": 81.5463, + "eval_samples_per_second": 2.453, + "eval_steps_per_second": 0.307, + "step": 96 + }, + { + "epoch": 3.03125, + "grad_norm": 0.2985486737236676, + "learning_rate": 2e-05, + "loss": 0.7274, + "step": 97 + }, + { + "epoch": 3.03125, + "eval_loss": 0.7325752377510071, + 
"eval_runtime": 81.7804, + "eval_samples_per_second": 2.446, + "eval_steps_per_second": 0.306, + "step": 97 + }, + { + "epoch": 3.0625, + "grad_norm": 0.29149719690624026, + "learning_rate": 2e-05, + "loss": 0.8119, + "step": 98 + }, + { + "epoch": 3.0625, + "eval_loss": 0.7298976182937622, + "eval_runtime": 76.2764, + "eval_samples_per_second": 2.622, + "eval_steps_per_second": 0.328, + "step": 98 + }, + { + "epoch": 3.09375, + "grad_norm": 0.25227859825215865, + "learning_rate": 2e-05, + "loss": 0.7888, + "step": 99 + }, + { + "epoch": 3.09375, + "eval_loss": 0.727373480796814, + "eval_runtime": 76.2418, + "eval_samples_per_second": 2.623, + "eval_steps_per_second": 0.328, + "step": 99 + }, + { + "epoch": 3.125, + "grad_norm": 0.27316954971752555, + "learning_rate": 2e-05, + "loss": 0.8224, + "step": 100 + }, + { + "epoch": 3.125, + "eval_loss": 0.7254325747489929, + "eval_runtime": 76.1474, + "eval_samples_per_second": 2.626, + "eval_steps_per_second": 0.328, + "step": 100 + }, + { + "epoch": 3.15625, + "grad_norm": 0.24239788607957785, + "learning_rate": 2e-05, + "loss": 0.7535, + "step": 101 + }, + { + "epoch": 3.15625, + "eval_loss": 0.724058985710144, + "eval_runtime": 76.2391, + "eval_samples_per_second": 2.623, + "eval_steps_per_second": 0.328, + "step": 101 + }, + { + "epoch": 3.1875, + "grad_norm": 0.25648385925427025, + "learning_rate": 2e-05, + "loss": 0.8195, + "step": 102 + }, + { + "epoch": 3.1875, + "eval_loss": 0.7235870957374573, + "eval_runtime": 76.9134, + "eval_samples_per_second": 2.6, + "eval_steps_per_second": 0.325, + "step": 102 + }, + { + "epoch": 3.21875, + "grad_norm": 0.29620170789161204, + "learning_rate": 2e-05, + "loss": 0.8224, + "step": 103 + }, + { + "epoch": 3.21875, + "eval_loss": 0.7228152751922607, + "eval_runtime": 76.095, + "eval_samples_per_second": 2.628, + "eval_steps_per_second": 0.329, + "step": 103 + }, + { + "epoch": 3.25, + "grad_norm": 0.3484116181139593, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 104 + 
}, + { + "epoch": 3.25, + "eval_loss": 0.7209363579750061, + "eval_runtime": 76.9377, + "eval_samples_per_second": 2.6, + "eval_steps_per_second": 0.325, + "step": 104 + }, + { + "epoch": 3.28125, + "grad_norm": 0.25212350156184643, + "learning_rate": 2e-05, + "loss": 0.7885, + "step": 105 + }, + { + "epoch": 3.28125, + "eval_loss": 0.7197096347808838, + "eval_runtime": 76.2008, + "eval_samples_per_second": 2.625, + "eval_steps_per_second": 0.328, + "step": 105 + }, + { + "epoch": 3.3125, + "grad_norm": 0.264200147608962, + "learning_rate": 2e-05, + "loss": 0.8371, + "step": 106 + }, + { + "epoch": 3.3125, + "eval_loss": 0.7197055220603943, + "eval_runtime": 78.1542, + "eval_samples_per_second": 2.559, + "eval_steps_per_second": 0.32, + "step": 106 + }, + { + "epoch": 3.34375, + "grad_norm": 0.3309431084940201, + "learning_rate": 2e-05, + "loss": 0.6999, + "step": 107 + }, + { + "epoch": 3.34375, + "eval_loss": 0.7187016010284424, + "eval_runtime": 78.4259, + "eval_samples_per_second": 2.55, + "eval_steps_per_second": 0.319, + "step": 107 + }, + { + "epoch": 3.375, + "grad_norm": 0.3131644456919823, + "learning_rate": 2e-05, + "loss": 0.7587, + "step": 108 + }, + { + "epoch": 3.375, + "eval_loss": 0.717018187046051, + "eval_runtime": 78.4558, + "eval_samples_per_second": 2.549, + "eval_steps_per_second": 0.319, + "step": 108 + }, + { + "epoch": 3.40625, + "grad_norm": 0.33527684120780293, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 109 + }, + { + "epoch": 3.40625, + "eval_loss": 0.7147062420845032, + "eval_runtime": 78.2334, + "eval_samples_per_second": 2.556, + "eval_steps_per_second": 0.32, + "step": 109 + }, + { + "epoch": 3.4375, + "grad_norm": 0.29542683956231724, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 110 + }, + { + "epoch": 3.4375, + "eval_loss": 0.7130224704742432, + "eval_runtime": 79.1179, + "eval_samples_per_second": 2.528, + "eval_steps_per_second": 0.316, + "step": 110 + }, + { + "epoch": 3.46875, + "grad_norm": 
0.31128698002926114, + "learning_rate": 2e-05, + "loss": 0.8153, + "step": 111 + }, + { + "epoch": 3.46875, + "eval_loss": 0.7120551466941833, + "eval_runtime": 80.292, + "eval_samples_per_second": 2.491, + "eval_steps_per_second": 0.311, + "step": 111 + }, + { + "epoch": 3.5, + "grad_norm": 0.32502558864214215, + "learning_rate": 2e-05, + "loss": 0.8043, + "step": 112 + }, + { + "epoch": 3.5, + "eval_loss": 0.7117202877998352, + "eval_runtime": 79.7539, + "eval_samples_per_second": 2.508, + "eval_steps_per_second": 0.313, + "step": 112 + }, + { + "epoch": 3.53125, + "grad_norm": 0.34335720855758517, + "learning_rate": 2e-05, + "loss": 0.871, + "step": 113 + }, + { + "epoch": 3.53125, + "eval_loss": 0.7117029428482056, + "eval_runtime": 80.0281, + "eval_samples_per_second": 2.499, + "eval_steps_per_second": 0.312, + "step": 113 + }, + { + "epoch": 3.5625, + "grad_norm": 0.31951931695644, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 114 + }, + { + "epoch": 3.5625, + "eval_loss": 0.7116554379463196, + "eval_runtime": 79.7209, + "eval_samples_per_second": 2.509, + "eval_steps_per_second": 0.314, + "step": 114 + }, + { + "epoch": 3.59375, + "grad_norm": 0.28067192963874266, + "learning_rate": 2e-05, + "loss": 0.8045, + "step": 115 + }, + { + "epoch": 3.59375, + "eval_loss": 0.7118353843688965, + "eval_runtime": 80.0195, + "eval_samples_per_second": 2.499, + "eval_steps_per_second": 0.312, + "step": 115 + }, + { + "epoch": 3.625, + "grad_norm": 0.2739718257400276, + "learning_rate": 2e-05, + "loss": 0.775, + "step": 116 + }, + { + "epoch": 3.625, + "eval_loss": 0.7122579216957092, + "eval_runtime": 76.2052, + "eval_samples_per_second": 2.624, + "eval_steps_per_second": 0.328, + "step": 116 + }, + { + "epoch": 3.65625, + "grad_norm": 0.31401723658881836, + "learning_rate": 2e-05, + "loss": 0.7826, + "step": 117 + }, + { + "epoch": 3.65625, + "eval_loss": 0.7118574380874634, + "eval_runtime": 76.1509, + "eval_samples_per_second": 2.626, + "eval_steps_per_second": 
0.328, + "step": 117 + }, + { + "epoch": 3.6875, + "grad_norm": 0.36925964858634625, + "learning_rate": 2e-05, + "loss": 0.7884, + "step": 118 + }, + { + "epoch": 3.6875, + "eval_loss": 0.710691511631012, + "eval_runtime": 76.2305, + "eval_samples_per_second": 2.624, + "eval_steps_per_second": 0.328, + "step": 118 + }, + { + "epoch": 3.71875, + "grad_norm": 0.3050583880654791, + "learning_rate": 2e-05, + "loss": 0.8402, + "step": 119 + }, + { + "epoch": 3.71875, + "eval_loss": 0.7096763849258423, + "eval_runtime": 77.0581, + "eval_samples_per_second": 2.595, + "eval_steps_per_second": 0.324, + "step": 119 + }, + { + "epoch": 3.75, + "grad_norm": 0.2648625651290031, + "learning_rate": 2e-05, + "loss": 0.7889, + "step": 120 + }, + { + "epoch": 3.75, + "eval_loss": 0.7094223499298096, + "eval_runtime": 76.1379, + "eval_samples_per_second": 2.627, + "eval_steps_per_second": 0.328, + "step": 120 + }, + { + "epoch": 3.78125, + "grad_norm": 0.3107221696449271, + "learning_rate": 2e-05, + "loss": 0.7615, + "step": 121 + }, + { + "epoch": 3.78125, + "eval_loss": 0.7081363201141357, + "eval_runtime": 76.626, + "eval_samples_per_second": 2.61, + "eval_steps_per_second": 0.326, + "step": 121 + }, + { + "epoch": 3.8125, + "grad_norm": 0.3455151299995048, + "learning_rate": 2e-05, + "loss": 0.8342, + "step": 122 + }, + { + "epoch": 3.8125, + "eval_loss": 0.7063001990318298, + "eval_runtime": 77.0293, + "eval_samples_per_second": 2.596, + "eval_steps_per_second": 0.325, + "step": 122 + }, + { + "epoch": 3.84375, + "grad_norm": 0.28847071926472523, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 123 + }, + { + "epoch": 3.84375, + "eval_loss": 0.7044610381126404, + "eval_runtime": 76.2385, + "eval_samples_per_second": 2.623, + "eval_steps_per_second": 0.328, + "step": 123 + }, + { + "epoch": 3.875, + "grad_norm": 0.26753816515069856, + "learning_rate": 2e-05, + "loss": 0.7653, + "step": 124 + }, + { + "epoch": 3.875, + "eval_loss": 0.7033799886703491, + "eval_runtime": 
76.1985, + "eval_samples_per_second": 2.625, + "eval_steps_per_second": 0.328, + "step": 124 + }, + { + "epoch": 3.90625, + "grad_norm": 0.3465046292893005, + "learning_rate": 2e-05, + "loss": 0.8144, + "step": 125 + }, + { + "epoch": 3.90625, + "eval_loss": 0.7021930813789368, + "eval_runtime": 76.2234, + "eval_samples_per_second": 2.624, + "eval_steps_per_second": 0.328, + "step": 125 + }, + { + "epoch": 3.9375, + "grad_norm": 0.3451690427620698, + "learning_rate": 2e-05, + "loss": 0.7871, + "step": 126 + }, + { + "epoch": 3.9375, + "eval_loss": 0.7013542652130127, + "eval_runtime": 78.0752, + "eval_samples_per_second": 2.562, + "eval_steps_per_second": 0.32, + "step": 126 + }, + { + "epoch": 3.96875, + "grad_norm": 0.31571858642673567, + "learning_rate": 2e-05, + "loss": 0.7568, + "step": 127 + }, + { + "epoch": 3.96875, + "eval_loss": 0.7007560729980469, + "eval_runtime": 78.3558, + "eval_samples_per_second": 2.552, + "eval_steps_per_second": 0.319, + "step": 127 + }, + { + "epoch": 4.0, + "grad_norm": 0.3247003540270338, + "learning_rate": 2e-05, + "loss": 0.6714, + "step": 128 + }, + { + "epoch": 4.0, + "eval_loss": 0.6999780535697937, + "eval_runtime": 78.9788, + "eval_samples_per_second": 2.532, + "eval_steps_per_second": 0.317, + "step": 128 + }, + { + "epoch": 4.03125, + "grad_norm": 0.2814983490019739, + "learning_rate": 2e-05, + "loss": 0.7797, + "step": 129 + }, + { + "epoch": 4.03125, + "eval_loss": 0.6998200416564941, + "eval_runtime": 78.3093, + "eval_samples_per_second": 2.554, + "eval_steps_per_second": 0.319, + "step": 129 + }, + { + "epoch": 4.0625, + "grad_norm": 0.31961631715145106, + "learning_rate": 2e-05, + "loss": 0.7993, + "step": 130 + }, + { + "epoch": 4.0625, + "eval_loss": 0.6995271444320679, + "eval_runtime": 78.2172, + "eval_samples_per_second": 2.557, + "eval_steps_per_second": 0.32, + "step": 130 + }, + { + "epoch": 4.09375, + "grad_norm": 0.32333364662215863, + "learning_rate": 2e-05, + "loss": 0.7896, + "step": 131 + }, + { + 
"epoch": 4.09375, + "eval_loss": 0.6992727518081665, + "eval_runtime": 79.0125, + "eval_samples_per_second": 2.531, + "eval_steps_per_second": 0.316, + "step": 131 + }, + { + "epoch": 4.125, + "grad_norm": 0.3255859640449829, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 132 + }, + { + "epoch": 4.125, + "eval_loss": 0.6988572478294373, + "eval_runtime": 79.0, + "eval_samples_per_second": 2.532, + "eval_steps_per_second": 0.316, + "step": 132 + }, + { + "epoch": 4.15625, + "grad_norm": 0.3307068947429175, + "learning_rate": 2e-05, + "loss": 0.8416, + "step": 133 + }, + { + "epoch": 4.15625, + "eval_loss": 0.6981343030929565, + "eval_runtime": 78.3309, + "eval_samples_per_second": 2.553, + "eval_steps_per_second": 0.319, + "step": 133 + }, + { + "epoch": 4.1875, + "grad_norm": 0.3842303818116732, + "learning_rate": 2e-05, + "loss": 0.7605, + "step": 134 + }, + { + "epoch": 4.1875, + "eval_loss": 0.6968980431556702, + "eval_runtime": 78.5608, + "eval_samples_per_second": 2.546, + "eval_steps_per_second": 0.318, + "step": 134 + }, + { + "epoch": 4.21875, + "grad_norm": 0.331839472419003, + "learning_rate": 2e-05, + "loss": 0.7643, + "step": 135 + }, + { + "epoch": 4.21875, + "eval_loss": 0.6955949664115906, + "eval_runtime": 78.3566, + "eval_samples_per_second": 2.552, + "eval_steps_per_second": 0.319, + "step": 135 + }, + { + "epoch": 4.25, + "grad_norm": 0.31864813130499836, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 136 + }, + { + "epoch": 4.25, + "eval_loss": 0.6951528787612915, + "eval_runtime": 79.7802, + "eval_samples_per_second": 2.507, + "eval_steps_per_second": 0.313, + "step": 136 + }, + { + "epoch": 4.28125, + "grad_norm": 0.352549164434451, + "learning_rate": 2e-05, + "loss": 0.7332, + "step": 137 + }, + { + "epoch": 4.28125, + "eval_loss": 0.6947290897369385, + "eval_runtime": 79.8171, + "eval_samples_per_second": 2.506, + "eval_steps_per_second": 0.313, + "step": 137 + }, + { + "epoch": 4.3125, + "grad_norm": 0.37128812818896284, + 
"learning_rate": 2e-05, + "loss": 0.7542, + "step": 138 + }, + { + "epoch": 4.3125, + "eval_loss": 0.6937370300292969, + "eval_runtime": 79.7782, + "eval_samples_per_second": 2.507, + "eval_steps_per_second": 0.313, + "step": 138 + }, + { + "epoch": 4.34375, + "grad_norm": 0.3348014941412048, + "learning_rate": 2e-05, + "loss": 0.7079, + "step": 139 + }, + { + "epoch": 4.34375, + "eval_loss": 0.692456066608429, + "eval_runtime": 79.9308, + "eval_samples_per_second": 2.502, + "eval_steps_per_second": 0.313, + "step": 139 + }, + { + "epoch": 4.375, + "grad_norm": 0.34411051658527964, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 140 + }, + { + "epoch": 4.375, + "eval_loss": 0.6915809512138367, + "eval_runtime": 79.943, + "eval_samples_per_second": 2.502, + "eval_steps_per_second": 0.313, + "step": 140 + }, + { + "epoch": 4.40625, + "grad_norm": 0.3373909601921749, + "learning_rate": 2e-05, + "loss": 0.7648, + "step": 141 + }, + { + "epoch": 4.40625, + "eval_loss": 0.6912103295326233, + "eval_runtime": 79.8515, + "eval_samples_per_second": 2.505, + "eval_steps_per_second": 0.313, + "step": 141 + }, + { + "epoch": 4.4375, + "grad_norm": 0.33253827371305456, + "learning_rate": 2e-05, + "loss": 0.7224, + "step": 142 + }, + { + "epoch": 4.4375, + "eval_loss": 0.6912806630134583, + "eval_runtime": 80.6475, + "eval_samples_per_second": 2.48, + "eval_steps_per_second": 0.31, + "step": 142 + }, + { + "epoch": 4.46875, + "grad_norm": 0.38458075172588313, + "learning_rate": 2e-05, + "loss": 0.7261, + "step": 143 + }, + { + "epoch": 4.46875, + "eval_loss": 0.6905419230461121, + "eval_runtime": 80.2606, + "eval_samples_per_second": 2.492, + "eval_steps_per_second": 0.311, + "step": 143 + }, + { + "epoch": 4.5, + "grad_norm": 0.31351962640463144, + "learning_rate": 2e-05, + "loss": 0.6909, + "step": 144 + }, + { + "epoch": 4.5, + "eval_loss": 0.6898491382598877, + "eval_runtime": 79.9965, + "eval_samples_per_second": 2.5, + "eval_steps_per_second": 0.313, + "step": 144 + 
}, + { + "epoch": 4.53125, + "grad_norm": 0.35474372115704583, + "learning_rate": 2e-05, + "loss": 0.7605, + "step": 145 + }, + { + "epoch": 4.53125, + "eval_loss": 0.6893147230148315, + "eval_runtime": 1475.5758, + "eval_samples_per_second": 0.136, + "eval_steps_per_second": 0.017, + "step": 145 + }, + { + "epoch": 4.5625, + "grad_norm": 0.3479568917421202, + "learning_rate": 2e-05, + "loss": 0.6638, + "step": 146 + }, + { + "epoch": 4.5625, + "eval_loss": 0.6884538531303406, + "eval_runtime": 84.6835, + "eval_samples_per_second": 2.362, + "eval_steps_per_second": 0.295, + "step": 146 + }, + { + "epoch": 4.59375, + "grad_norm": 0.3421823344428645, + "learning_rate": 2e-05, + "loss": 0.7339, + "step": 147 + }, + { + "epoch": 4.59375, + "eval_loss": 0.6873475909233093, + "eval_runtime": 83.3138, + "eval_samples_per_second": 2.401, + "eval_steps_per_second": 0.3, + "step": 147 + }, + { + "epoch": 4.625, + "grad_norm": 0.3642187020830788, + "learning_rate": 2e-05, + "loss": 0.6825, + "step": 148 + }, + { + "epoch": 4.625, + "eval_loss": 0.6858401298522949, + "eval_runtime": 82.1066, + "eval_samples_per_second": 2.436, + "eval_steps_per_second": 0.304, + "step": 148 + }, + { + "epoch": 4.65625, + "grad_norm": 0.35097547901391785, + "learning_rate": 2e-05, + "loss": 0.7986, + "step": 149 + }, + { + "epoch": 4.65625, + "eval_loss": 0.6848779320716858, + "eval_runtime": 84.4076, + "eval_samples_per_second": 2.369, + "eval_steps_per_second": 0.296, + "step": 149 + }, + { + "epoch": 4.6875, + "grad_norm": 0.3568694843794629, + "learning_rate": 2e-05, + "loss": 0.7176, + "step": 150 + }, + { + "epoch": 4.6875, + "eval_loss": 0.6842290759086609, + "eval_runtime": 82.5945, + "eval_samples_per_second": 2.421, + "eval_steps_per_second": 0.303, + "step": 150 + }, + { + "epoch": 4.71875, + "grad_norm": 0.34258633585260334, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 151 + }, + { + "epoch": 4.71875, + "eval_loss": 0.6838659048080444, + "eval_runtime": 85.9626, + 
"eval_samples_per_second": 2.327, + "eval_steps_per_second": 0.291, + "step": 151 + }, + { + "epoch": 4.75, + "grad_norm": 0.42319523894659655, + "learning_rate": 2e-05, + "loss": 0.7675, + "step": 152 + }, + { + "epoch": 4.75, + "eval_loss": 0.6830299496650696, + "eval_runtime": 85.7189, + "eval_samples_per_second": 2.333, + "eval_steps_per_second": 0.292, + "step": 152 + }, + { + "epoch": 4.78125, + "grad_norm": 0.3632195533127194, + "learning_rate": 2e-05, + "loss": 0.715, + "step": 153 + }, + { + "epoch": 4.78125, + "eval_loss": 0.6826379895210266, + "eval_runtime": 87.8244, + "eval_samples_per_second": 2.277, + "eval_steps_per_second": 0.285, + "step": 153 + }, + { + "epoch": 4.8125, + "grad_norm": 0.3738308004604413, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 154 + }, + { + "epoch": 4.8125, + "eval_loss": 0.6826817393302917, + "eval_runtime": 86.5822, + "eval_samples_per_second": 2.31, + "eval_steps_per_second": 0.289, + "step": 154 + }, + { + "epoch": 4.84375, + "grad_norm": 0.3618696330632776, + "learning_rate": 2e-05, + "loss": 0.6632, + "step": 155 + }, + { + "epoch": 4.84375, + "eval_loss": 0.6827967166900635, + "eval_runtime": 82.1829, + "eval_samples_per_second": 2.434, + "eval_steps_per_second": 0.304, + "step": 155 + }, + { + "epoch": 4.875, + "grad_norm": 0.38901912569992203, + "learning_rate": 2e-05, + "loss": 0.7788, + "step": 156 + }, + { + "epoch": 4.875, + "eval_loss": 0.6821711659431458, + "eval_runtime": 84.4511, + "eval_samples_per_second": 2.368, + "eval_steps_per_second": 0.296, + "step": 156 + }, + { + "epoch": 4.90625, + "grad_norm": 0.3516096507348829, + "learning_rate": 2e-05, + "loss": 0.7794, + "step": 157 + }, + { + "epoch": 4.90625, + "eval_loss": 0.6819837689399719, + "eval_runtime": 84.1594, + "eval_samples_per_second": 2.376, + "eval_steps_per_second": 0.297, + "step": 157 + }, + { + "epoch": 4.9375, + "grad_norm": 0.36066902463794986, + "learning_rate": 2e-05, + "loss": 0.7674, + "step": 158 + }, + { + "epoch": 
4.9375, + "eval_loss": 0.6817716956138611, + "eval_runtime": 83.8929, + "eval_samples_per_second": 2.384, + "eval_steps_per_second": 0.298, + "step": 158 + }, + { + "epoch": 4.96875, + "grad_norm": 0.36641784926154175, + "learning_rate": 2e-05, + "loss": 0.7116, + "step": 159 + }, + { + "epoch": 4.96875, + "eval_loss": 0.6816902160644531, + "eval_runtime": 84.4431, + "eval_samples_per_second": 2.368, + "eval_steps_per_second": 0.296, + "step": 159 + }, + { + "epoch": 5.0, + "grad_norm": 0.4020716293225933, + "learning_rate": 2e-05, + "loss": 0.7142, + "step": 160 + }, + { + "epoch": 5.0, + "eval_loss": 0.6811469793319702, + "eval_runtime": 86.0681, + "eval_samples_per_second": 2.324, + "eval_steps_per_second": 0.29, + "step": 160 + }, + { + "epoch": 5.03125, + "grad_norm": 0.38360882669254054, + "learning_rate": 2e-05, + "loss": 0.6756, + "step": 161 + }, + { + "epoch": 5.03125, + "eval_loss": 0.6798409223556519, + "eval_runtime": 81.9903, + "eval_samples_per_second": 2.439, + "eval_steps_per_second": 0.305, + "step": 161 + }, + { + "epoch": 5.0625, + "grad_norm": 0.34966156213066135, + "learning_rate": 2e-05, + "loss": 0.827, + "step": 162 + }, + { + "epoch": 5.0625, + "eval_loss": 0.6788859367370605, + "eval_runtime": 76.1753, + "eval_samples_per_second": 2.626, + "eval_steps_per_second": 0.328, + "step": 162 + }, + { + "epoch": 5.09375, + "grad_norm": 0.41140842939901384, + "learning_rate": 2e-05, + "loss": 0.6409, + "step": 163 + }, + { + "epoch": 5.09375, + "eval_loss": 0.6787077188491821, + "eval_runtime": 76.2239, + "eval_samples_per_second": 2.624, + "eval_steps_per_second": 0.328, + "step": 163 + }, + { + "epoch": 5.125, + "grad_norm": 0.4222084070163774, + "learning_rate": 2e-05, + "loss": 0.7774, + "step": 164 + }, + { + "epoch": 5.125, + "eval_loss": 0.6796822547912598, + "eval_runtime": 76.2141, + "eval_samples_per_second": 2.624, + "eval_steps_per_second": 0.328, + "step": 164 + }, + { + "epoch": 5.15625, + "grad_norm": 0.4644454724424921, + 
"learning_rate": 2e-05, + "loss": 0.6057, + "step": 165 + }, + { + "epoch": 5.15625, + "eval_loss": 0.6794346570968628, + "eval_runtime": 76.3216, + "eval_samples_per_second": 2.62, + "eval_steps_per_second": 0.328, + "step": 165 + }, + { + "epoch": 5.1875, + "grad_norm": 0.46128725263272996, + "learning_rate": 2e-05, + "loss": 0.7158, + "step": 166 + }, + { + "epoch": 5.1875, + "eval_loss": 0.6791612505912781, + "eval_runtime": 78.4909, + "eval_samples_per_second": 2.548, + "eval_steps_per_second": 0.319, + "step": 166 + }, + { + "epoch": 5.21875, + "grad_norm": 0.37300666872025545, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 167 + }, + { + "epoch": 5.21875, + "eval_loss": 0.6788016557693481, + "eval_runtime": 78.5697, + "eval_samples_per_second": 2.546, + "eval_steps_per_second": 0.318, + "step": 167 + }, + { + "epoch": 5.25, + "grad_norm": 0.41454648576180214, + "learning_rate": 2e-05, + "loss": 0.7759, + "step": 168 + }, + { + "epoch": 5.25, + "eval_loss": 0.6787048578262329, + "eval_runtime": 78.5317, + "eval_samples_per_second": 2.547, + "eval_steps_per_second": 0.318, + "step": 168 + }, + { + "epoch": 5.28125, + "grad_norm": 0.40724665091386236, + "learning_rate": 2e-05, + "loss": 0.6944, + "step": 169 + }, + { + "epoch": 5.28125, + "eval_loss": 0.679679811000824, + "eval_runtime": 78.6899, + "eval_samples_per_second": 2.542, + "eval_steps_per_second": 0.318, + "step": 169 + }, + { + "epoch": 5.3125, + "grad_norm": 0.3875110486208986, + "learning_rate": 2e-05, + "loss": 0.6634, + "step": 170 + }, + { + "epoch": 5.3125, + "eval_loss": 0.6819935441017151, + "eval_runtime": 78.3617, + "eval_samples_per_second": 2.552, + "eval_steps_per_second": 0.319, + "step": 170 + }, + { + "epoch": 5.34375, + "grad_norm": 0.47956532155617193, + "learning_rate": 2e-05, + "loss": 0.687, + "step": 171 + }, + { + "epoch": 5.34375, + "eval_loss": 0.6825206875801086, + "eval_runtime": 78.4435, + "eval_samples_per_second": 2.55, + "eval_steps_per_second": 0.319, + "step": 
171 + }, + { + "epoch": 5.375, + "grad_norm": 0.4599359590587781, + "learning_rate": 2e-05, + "loss": 0.7718, + "step": 172 + }, + { + "epoch": 5.375, + "eval_loss": 0.6816768050193787, + "eval_runtime": 78.3005, + "eval_samples_per_second": 2.554, + "eval_steps_per_second": 0.319, + "step": 172 + }, + { + "epoch": 5.40625, + "grad_norm": 0.4057490487995386, + "learning_rate": 2e-05, + "loss": 0.7292, + "step": 173 + }, + { + "epoch": 5.40625, + "eval_loss": 0.6806090474128723, + "eval_runtime": 78.3313, + "eval_samples_per_second": 2.553, + "eval_steps_per_second": 0.319, + "step": 173 + }, + { + "epoch": 5.4375, + "grad_norm": 0.4143979315360467, + "learning_rate": 2e-05, + "loss": 0.7697, + "step": 174 + }, + { + "epoch": 5.4375, + "eval_loss": 0.6795693039894104, + "eval_runtime": 78.4526, + "eval_samples_per_second": 2.549, + "eval_steps_per_second": 0.319, + "step": 174 + }, + { + "epoch": 5.46875, + "grad_norm": 0.4219663662343445, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 175 + }, + { + "epoch": 5.46875, + "eval_loss": 0.6793847680091858, + "eval_runtime": 78.8009, + "eval_samples_per_second": 2.538, + "eval_steps_per_second": 0.317, + "step": 175 + }, + { + "epoch": 5.5, + "grad_norm": 0.4491811321927657, + "learning_rate": 2e-05, + "loss": 0.7004, + "step": 176 + }, + { + "epoch": 5.5, + "eval_loss": 0.6775352358818054, + "eval_runtime": 80.0685, + "eval_samples_per_second": 2.498, + "eval_steps_per_second": 0.312, + "step": 176 + }, + { + "epoch": 5.53125, + "grad_norm": 0.46366516532638885, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 177 + }, + { + "epoch": 5.53125, + "eval_loss": 0.6748698949813843, + "eval_runtime": 80.0487, + "eval_samples_per_second": 2.498, + "eval_steps_per_second": 0.312, + "step": 177 + }, + { + "epoch": 5.5625, + "grad_norm": 0.3815188640227797, + "learning_rate": 2e-05, + "loss": 0.7592, + "step": 178 + }, + { + "epoch": 5.5625, + "eval_loss": 0.6728273034095764, + "eval_runtime": 80.0318, + 
"eval_samples_per_second": 2.499, + "eval_steps_per_second": 0.312, + "step": 178 + }, + { + "epoch": 5.59375, + "grad_norm": 0.41025429416666304, + "learning_rate": 2e-05, + "loss": 0.6585, + "step": 179 + }, + { + "epoch": 5.59375, + "eval_loss": 0.6718859672546387, + "eval_runtime": 79.8801, + "eval_samples_per_second": 2.504, + "eval_steps_per_second": 0.313, + "step": 179 + }, + { + "epoch": 5.625, + "grad_norm": 0.40652817592240054, + "learning_rate": 2e-05, + "loss": 0.6611, + "step": 180 + }, + { + "epoch": 5.625, + "eval_loss": 0.6715708374977112, + "eval_runtime": 76.7261, + "eval_samples_per_second": 2.607, + "eval_steps_per_second": 0.326, + "step": 180 + }, + { + "epoch": 5.65625, + "grad_norm": 0.40753961326688415, + "learning_rate": 2e-05, + "loss": 0.6779, + "step": 181 + }, + { + "epoch": 5.65625, + "eval_loss": 0.6719761490821838, + "eval_runtime": 77.0136, + "eval_samples_per_second": 2.597, + "eval_steps_per_second": 0.325, + "step": 181 + }, + { + "epoch": 5.6875, + "grad_norm": 0.4232811980671673, + "learning_rate": 2e-05, + "loss": 0.6475, + "step": 182 + }, + { + "epoch": 5.6875, + "eval_loss": 0.6724664568901062, + "eval_runtime": 76.9731, + "eval_samples_per_second": 2.598, + "eval_steps_per_second": 0.325, + "step": 182 + }, + { + "epoch": 5.71875, + "grad_norm": 0.5132756318549849, + "learning_rate": 2e-05, + "loss": 0.6801, + "step": 183 + }, + { + "epoch": 5.71875, + "eval_loss": 0.6723365783691406, + "eval_runtime": 76.4132, + "eval_samples_per_second": 2.617, + "eval_steps_per_second": 0.327, + "step": 183 + }, + { + "epoch": 5.75, + "grad_norm": 0.43526879230161264, + "learning_rate": 2e-05, + "loss": 0.6673, + "step": 184 + }, + { + "epoch": 5.75, + "eval_loss": 0.672926664352417, + "eval_runtime": 76.1936, + "eval_samples_per_second": 2.625, + "eval_steps_per_second": 0.328, + "step": 184 + }, + { + "epoch": 5.78125, + "grad_norm": 0.46965560853038507, + "learning_rate": 2e-05, + "loss": 0.7074, + "step": 185 + }, + { + "epoch": 
5.78125, + "eval_loss": 0.6731134057044983, + "eval_runtime": 76.2345, + "eval_samples_per_second": 2.623, + "eval_steps_per_second": 0.328, + "step": 185 + }, + { + "epoch": 5.8125, + "grad_norm": 0.4733296318676217, + "learning_rate": 2e-05, + "loss": 0.6791, + "step": 186 + }, + { + "epoch": 5.8125, + "eval_loss": 0.6726363301277161, + "eval_runtime": 78.3939, + "eval_samples_per_second": 2.551, + "eval_steps_per_second": 0.319, + "step": 186 + }, + { + "epoch": 5.84375, + "grad_norm": 0.4662943253655961, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 187 + }, + { + "epoch": 5.84375, + "eval_loss": 0.6726526021957397, + "eval_runtime": 79.1834, + "eval_samples_per_second": 2.526, + "eval_steps_per_second": 0.316, + "step": 187 + }, + { + "epoch": 5.875, + "grad_norm": 0.4420962889993382, + "learning_rate": 2e-05, + "loss": 0.675, + "step": 188 + }, + { + "epoch": 5.875, + "eval_loss": 0.6727125644683838, + "eval_runtime": 78.252, + "eval_samples_per_second": 2.556, + "eval_steps_per_second": 0.319, + "step": 188 + }, + { + "epoch": 5.90625, + "grad_norm": 0.4345166976944551, + "learning_rate": 2e-05, + "loss": 0.6748, + "step": 189 + }, + { + "epoch": 5.90625, + "eval_loss": 0.6725904941558838, + "eval_runtime": 78.3914, + "eval_samples_per_second": 2.551, + "eval_steps_per_second": 0.319, + "step": 189 + }, + { + "epoch": 5.9375, + "grad_norm": 0.45109463315374526, + "learning_rate": 2e-05, + "loss": 0.7024, + "step": 190 + }, + { + "epoch": 5.9375, + "eval_loss": 0.6718384027481079, + "eval_runtime": 78.4361, + "eval_samples_per_second": 2.55, + "eval_steps_per_second": 0.319, + "step": 190 + }, + { + "epoch": 5.96875, + "grad_norm": 0.42953871838795626, + "learning_rate": 2e-05, + "loss": 0.6904, + "step": 191 + }, + { + "epoch": 5.96875, + "eval_loss": 0.6703083515167236, + "eval_runtime": 78.3863, + "eval_samples_per_second": 2.551, + "eval_steps_per_second": 0.319, + "step": 191 + }, + { + "epoch": 6.0, + "grad_norm": 0.4248607379284984, + 
"learning_rate": 2e-05, + "loss": 0.6659, + "step": 192 + }, + { + "epoch": 6.0, + "eval_loss": 0.6693080067634583, + "eval_runtime": 78.4373, + "eval_samples_per_second": 2.55, + "eval_steps_per_second": 0.319, + "step": 192 + }, + { + "epoch": 6.03125, + "grad_norm": 0.42839417453459494, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 193 + }, + { + "epoch": 6.03125, + "eval_loss": 0.6689594984054565, + "eval_runtime": 78.4169, + "eval_samples_per_second": 2.55, + "eval_steps_per_second": 0.319, + "step": 193 + }, + { + "epoch": 6.0625, + "grad_norm": 0.4216922788166874, + "learning_rate": 2e-05, + "loss": 0.7189, + "step": 194 + }, + { + "epoch": 6.0625, + "eval_loss": 0.6689300537109375, + "eval_runtime": 78.9793, + "eval_samples_per_second": 2.532, + "eval_steps_per_second": 0.317, + "step": 194 + }, + { + "epoch": 6.09375, + "grad_norm": 0.45199575791858004, + "learning_rate": 2e-05, + "loss": 0.6438, + "step": 195 + }, + { + "epoch": 6.09375, + "eval_loss": 0.6690151691436768, + "eval_runtime": 78.5002, + "eval_samples_per_second": 2.548, + "eval_steps_per_second": 0.318, + "step": 195 + }, + { + "epoch": 6.125, + "grad_norm": 0.4166923177293841, + "learning_rate": 2e-05, + "loss": 0.6885, + "step": 196 + }, + { + "epoch": 6.125, + "eval_loss": 0.6688613891601562, + "eval_runtime": 80.5497, + "eval_samples_per_second": 2.483, + "eval_steps_per_second": 0.31, + "step": 196 + }, + { + "epoch": 6.15625, + "grad_norm": 0.45164281863366285, + "learning_rate": 2e-05, + "loss": 0.7197, + "step": 197 + }, + { + "epoch": 6.15625, + "eval_loss": 0.6687932014465332, + "eval_runtime": 80.1482, + "eval_samples_per_second": 2.495, + "eval_steps_per_second": 0.312, + "step": 197 + }, + { + "epoch": 6.1875, + "grad_norm": 0.45653924787504446, + "learning_rate": 2e-05, + "loss": 0.776, + "step": 198 + }, + { + "epoch": 6.1875, + "eval_loss": 0.6690963506698608, + "eval_runtime": 80.4464, + "eval_samples_per_second": 2.486, + "eval_steps_per_second": 0.311, + "step": 
198 + }, + { + "epoch": 6.21875, + "grad_norm": 0.4966562341334706, + "learning_rate": 2e-05, + "loss": 0.6532, + "step": 199 + }, + { + "epoch": 6.21875, + "eval_loss": 0.669116735458374, + "eval_runtime": 79.8294, + "eval_samples_per_second": 2.505, + "eval_steps_per_second": 0.313, + "step": 199 + }, + { + "epoch": 6.25, + "grad_norm": 0.4838469303220975, + "learning_rate": 2e-05, + "loss": 0.6883, + "step": 200 + }, + { + "epoch": 6.25, + "eval_loss": 0.6693156957626343, + "eval_runtime": 80.25, + "eval_samples_per_second": 2.492, + "eval_steps_per_second": 0.312, + "step": 200 + }, + { + "epoch": 6.28125, + "grad_norm": 0.4836820906895964, + "learning_rate": 2e-05, + "loss": 0.7106, + "step": 201 + }, + { + "epoch": 6.28125, + "eval_loss": 0.6704170107841492, + "eval_runtime": 79.9636, + "eval_samples_per_second": 2.501, + "eval_steps_per_second": 0.313, + "step": 201 + }, + { + "epoch": 6.3125, + "grad_norm": 0.4945855983140219, + "learning_rate": 2e-05, + "loss": 0.6336, + "step": 202 + }, + { + "epoch": 6.3125, + "eval_loss": 0.6708824038505554, + "eval_runtime": 80.8044, + "eval_samples_per_second": 2.475, + "eval_steps_per_second": 0.309, + "step": 202 + }, + { + "epoch": 6.34375, + "grad_norm": 0.44587847230103017, + "learning_rate": 2e-05, + "loss": 0.7811, + "step": 203 + }, + { + "epoch": 6.34375, + "eval_loss": 0.6723968982696533, + "eval_runtime": 80.1715, + "eval_samples_per_second": 2.495, + "eval_steps_per_second": 0.312, + "step": 203 + }, + { + "epoch": 6.375, + "grad_norm": 0.5351063503195825, + "learning_rate": 2e-05, + "loss": 0.6222, + "step": 204 + }, + { + "epoch": 6.375, + "eval_loss": 0.672196626663208, + "eval_runtime": 79.927, + "eval_samples_per_second": 2.502, + "eval_steps_per_second": 0.313, + "step": 204 + }, + { + "epoch": 6.40625, + "grad_norm": 0.4742985088010474, + "learning_rate": 2e-05, + "loss": 0.6157, + "step": 205 + }, + { + "epoch": 6.40625, + "eval_loss": 0.671062171459198, + "eval_runtime": 80.1997, + 
"eval_samples_per_second": 2.494, + "eval_steps_per_second": 0.312, + "step": 205 + }, + { + "epoch": 6.4375, + "grad_norm": 0.5188882333349506, + "learning_rate": 2e-05, + "loss": 0.6462, + "step": 206 + }, + { + "epoch": 6.4375, + "eval_loss": 0.6701972484588623, + "eval_runtime": 81.6643, + "eval_samples_per_second": 2.449, + "eval_steps_per_second": 0.306, + "step": 206 + }, + { + "epoch": 6.46875, + "grad_norm": 0.45328063593983603, + "learning_rate": 2e-05, + "loss": 0.7058, + "step": 207 + }, + { + "epoch": 6.46875, + "eval_loss": 0.6699164509773254, + "eval_runtime": 81.2228, + "eval_samples_per_second": 2.462, + "eval_steps_per_second": 0.308, + "step": 207 + }, + { + "epoch": 6.5, + "grad_norm": 0.5197645538332801, + "learning_rate": 2e-05, + "loss": 0.6462, + "step": 208 + }, + { + "epoch": 6.5, + "eval_loss": 0.6702597141265869, + "eval_runtime": 81.1451, + "eval_samples_per_second": 2.465, + "eval_steps_per_second": 0.308, + "step": 208 + }, + { + "epoch": 6.53125, + "grad_norm": 0.5762528184834232, + "learning_rate": 2e-05, + "loss": 0.6259, + "step": 209 + }, + { + "epoch": 6.53125, + "eval_loss": 0.6696366667747498, + "eval_runtime": 81.1643, + "eval_samples_per_second": 2.464, + "eval_steps_per_second": 0.308, + "step": 209 + }, + { + "epoch": 6.5625, + "grad_norm": 0.5249503180293145, + "learning_rate": 2e-05, + "loss": 0.6045, + "step": 210 + }, + { + "epoch": 6.5625, + "eval_loss": 0.6688054800033569, + "eval_runtime": 80.9492, + "eval_samples_per_second": 2.471, + "eval_steps_per_second": 0.309, + "step": 210 + }, + { + "epoch": 6.59375, + "grad_norm": 0.543503888655844, + "learning_rate": 2e-05, + "loss": 0.6496, + "step": 211 + }, + { + "epoch": 6.59375, + "eval_loss": 0.6689916849136353, + "eval_runtime": 81.6473, + "eval_samples_per_second": 2.45, + "eval_steps_per_second": 0.306, + "step": 211 + }, + { + "epoch": 6.625, + "grad_norm": 0.48119553592193554, + "learning_rate": 2e-05, + "loss": 0.6211, + "step": 212 + }, + { + "epoch": 6.625, 
+ "eval_loss": 0.6703050136566162, + "eval_runtime": 81.9207, + "eval_samples_per_second": 2.441, + "eval_steps_per_second": 0.305, + "step": 212 + }, + { + "epoch": 6.65625, + "grad_norm": 0.5153356086819314, + "learning_rate": 2e-05, + "loss": 0.7135, + "step": 213 + }, + { + "epoch": 6.65625, + "eval_loss": 0.6702842116355896, + "eval_runtime": 81.1503, + "eval_samples_per_second": 2.465, + "eval_steps_per_second": 0.308, + "step": 213 + }, + { + "epoch": 6.6875, + "grad_norm": 0.5249915042825578, + "learning_rate": 2e-05, + "loss": 0.6635, + "step": 214 + }, + { + "epoch": 6.6875, + "eval_loss": 0.6687333583831787, + "eval_runtime": 81.6743, + "eval_samples_per_second": 2.449, + "eval_steps_per_second": 0.306, + "step": 214 + }, + { + "epoch": 6.71875, + "grad_norm": 0.5204840219868723, + "learning_rate": 2e-05, + "loss": 0.6701, + "step": 215 + }, + { + "epoch": 6.71875, + "eval_loss": 0.6657728552818298, + "eval_runtime": 81.106, + "eval_samples_per_second": 2.466, + "eval_steps_per_second": 0.308, + "step": 215 + }, + { + "epoch": 6.75, + "grad_norm": 0.5266935225120133, + "learning_rate": 2e-05, + "loss": 0.6637, + "step": 216 + }, + { + "epoch": 6.75, + "eval_loss": 0.6641908884048462, + "eval_runtime": 82.2613, + "eval_samples_per_second": 2.431, + "eval_steps_per_second": 0.304, + "step": 216 + }, + { + "epoch": 6.78125, + "grad_norm": 0.5438859451742696, + "learning_rate": 2e-05, + "loss": 0.6168, + "step": 217 + }, + { + "epoch": 6.78125, + "eval_loss": 0.6652233600616455, + "eval_runtime": 82.042, + "eval_samples_per_second": 2.438, + "eval_steps_per_second": 0.305, + "step": 217 + }, + { + "epoch": 6.8125, + "grad_norm": 0.5716385253433929, + "learning_rate": 2e-05, + "loss": 0.6062, + "step": 218 + }, + { + "epoch": 6.8125, + "eval_loss": 0.6656240820884705, + "eval_runtime": 81.233, + "eval_samples_per_second": 2.462, + "eval_steps_per_second": 0.308, + "step": 218 + }, + { + "epoch": 6.84375, + "grad_norm": 1.0572787630142522, + "learning_rate": 
2e-05, + "loss": 0.7037, + "step": 219 + }, + { + "epoch": 6.84375, + "eval_loss": 0.6645559072494507, + "eval_runtime": 81.2099, + "eval_samples_per_second": 2.463, + "eval_steps_per_second": 0.308, + "step": 219 + }, + { + "epoch": 6.875, + "grad_norm": 0.5924889323251107, + "learning_rate": 2e-05, + "loss": 0.712, + "step": 220 + }, + { + "epoch": 6.875, + "eval_loss": 0.6619111895561218, + "eval_runtime": 81.7826, + "eval_samples_per_second": 2.446, + "eval_steps_per_second": 0.306, + "step": 220 + }, + { + "epoch": 6.90625, + "grad_norm": 0.5290576915218269, + "learning_rate": 2e-05, + "loss": 0.6659, + "step": 221 + }, + { + "epoch": 6.90625, + "eval_loss": 0.6609540581703186, + "eval_runtime": 82.9922, + "eval_samples_per_second": 2.41, + "eval_steps_per_second": 0.301, + "step": 221 + }, + { + "epoch": 6.9375, + "grad_norm": 0.5831209517049147, + "learning_rate": 2e-05, + "loss": 0.6547, + "step": 222 + }, + { + "epoch": 6.9375, + "eval_loss": 0.660676896572113, + "eval_runtime": 83.6541, + "eval_samples_per_second": 2.391, + "eval_steps_per_second": 0.299, + "step": 222 + }, + { + "epoch": 6.96875, + "grad_norm": 0.5320966369511158, + "learning_rate": 2e-05, + "loss": 0.6968, + "step": 223 + }, + { + "epoch": 6.96875, + "eval_loss": 0.6618594527244568, + "eval_runtime": 83.1148, + "eval_samples_per_second": 2.406, + "eval_steps_per_second": 0.301, + "step": 223 + }, + { + "epoch": 7.0, + "grad_norm": 0.5829636446837394, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 224 + }, + { + "epoch": 7.0, + "eval_loss": 0.6635661125183105, + "eval_runtime": 82.8183, + "eval_samples_per_second": 2.415, + "eval_steps_per_second": 0.302, + "step": 224 + }, + { + "epoch": 7.03125, + "grad_norm": 0.4975095056459566, + "learning_rate": 2e-05, + "loss": 0.6535, + "step": 225 + }, + { + "epoch": 7.03125, + "eval_loss": 0.6641671657562256, + "eval_runtime": 83.0267, + "eval_samples_per_second": 2.409, + "eval_steps_per_second": 0.301, + "step": 225 + }, + { + "epoch": 
7.0625, + "grad_norm": 0.5625698523064815, + "learning_rate": 2e-05, + "loss": 0.6012, + "step": 226 + }, + { + "epoch": 7.0625, + "eval_loss": 0.6639044880867004, + "eval_runtime": 83.3881, + "eval_samples_per_second": 2.398, + "eval_steps_per_second": 0.3, + "step": 226 + }, + { + "epoch": 7.09375, + "grad_norm": 0.5436196850683295, + "learning_rate": 2e-05, + "loss": 0.6485, + "step": 227 + }, + { + "epoch": 7.09375, + "eval_loss": 0.6651788353919983, + "eval_runtime": 82.7096, + "eval_samples_per_second": 2.418, + "eval_steps_per_second": 0.302, + "step": 227 + }, + { + "epoch": 7.125, + "grad_norm": 0.5598906287609361, + "learning_rate": 2e-05, + "loss": 0.6142, + "step": 228 + }, + { + "epoch": 7.125, + "eval_loss": 0.6688636541366577, + "eval_runtime": 82.601, + "eval_samples_per_second": 2.421, + "eval_steps_per_second": 0.303, + "step": 228 + }, + { + "epoch": 7.15625, + "grad_norm": 0.7572979310697923, + "learning_rate": 2e-05, + "loss": 0.6221, + "step": 229 + }, + { + "epoch": 7.15625, + "eval_loss": 0.6699694991111755, + "eval_runtime": 82.6032, + "eval_samples_per_second": 2.421, + "eval_steps_per_second": 0.303, + "step": 229 + }, + { + "epoch": 7.1875, + "grad_norm": 0.6173309690580897, + "learning_rate": 2e-05, + "loss": 0.5919, + "step": 230 + }, + { + "epoch": 7.1875, + "eval_loss": 0.6706527471542358, + "eval_runtime": 82.9732, + "eval_samples_per_second": 2.41, + "eval_steps_per_second": 0.301, + "step": 230 + }, + { + "epoch": 7.21875, + "grad_norm": 0.643241771517866, + "learning_rate": 2e-05, + "loss": 0.7081, + "step": 231 + }, + { + "epoch": 7.21875, + "eval_loss": 0.6700320243835449, + "eval_runtime": 84.5621, + "eval_samples_per_second": 2.365, + "eval_steps_per_second": 0.296, + "step": 231 + }, + { + "epoch": 7.25, + "grad_norm": 0.577638137570571, + "learning_rate": 2e-05, + "loss": 0.6873, + "step": 232 + }, + { + "epoch": 7.25, + "eval_loss": 0.669111430644989, + "eval_runtime": 84.5124, + "eval_samples_per_second": 2.367, + 
"eval_steps_per_second": 0.296, + "step": 232 + }, + { + "epoch": 7.28125, + "grad_norm": 0.7229488296023369, + "learning_rate": 2e-05, + "loss": 0.6301, + "step": 233 + }, + { + "epoch": 7.28125, + "eval_loss": 0.6664154529571533, + "eval_runtime": 84.6437, + "eval_samples_per_second": 2.363, + "eval_steps_per_second": 0.295, + "step": 233 + }, + { + "epoch": 7.3125, + "grad_norm": 0.5827815449039045, + "learning_rate": 2e-05, + "loss": 0.669, + "step": 234 + }, + { + "epoch": 7.3125, + "eval_loss": 0.6641202569007874, + "eval_runtime": 84.489, + "eval_samples_per_second": 2.367, + "eval_steps_per_second": 0.296, + "step": 234 + }, + { + "epoch": 7.34375, + "grad_norm": 0.57507354017269, + "learning_rate": 2e-05, + "loss": 0.6474, + "step": 235 + }, + { + "epoch": 7.34375, + "eval_loss": 0.6623325347900391, + "eval_runtime": 84.5536, + "eval_samples_per_second": 2.365, + "eval_steps_per_second": 0.296, + "step": 235 + }, + { + "epoch": 7.375, + "grad_norm": 0.5810844862533651, + "learning_rate": 2e-05, + "loss": 0.6048, + "step": 236 + }, + { + "epoch": 7.375, + "eval_loss": 0.6619194746017456, + "eval_runtime": 84.2296, + "eval_samples_per_second": 2.374, + "eval_steps_per_second": 0.297, + "step": 236 + }, + { + "epoch": 7.40625, + "grad_norm": 0.6075032415813726, + "learning_rate": 2e-05, + "loss": 0.6529, + "step": 237 + }, + { + "epoch": 7.40625, + "eval_loss": 0.6626202464103699, + "eval_runtime": 84.9703, + "eval_samples_per_second": 2.354, + "eval_steps_per_second": 0.294, + "step": 237 + }, + { + "epoch": 7.4375, + "grad_norm": 0.6402642234375245, + "learning_rate": 2e-05, + "loss": 0.6433, + "step": 238 + }, + { + "epoch": 7.4375, + "eval_loss": 0.663289487361908, + "eval_runtime": 84.8924, + "eval_samples_per_second": 2.356, + "eval_steps_per_second": 0.294, + "step": 238 + }, + { + "epoch": 7.46875, + "grad_norm": 0.6335996982657431, + "learning_rate": 2e-05, + "loss": 0.6815, + "step": 239 + }, + { + "epoch": 7.46875, + "eval_loss": 
0.6636109948158264, + "eval_runtime": 85.0551, + "eval_samples_per_second": 2.351, + "eval_steps_per_second": 0.294, + "step": 239 + }, + { + "epoch": 7.5, + "grad_norm": 0.5796846795848909, + "learning_rate": 2e-05, + "loss": 0.6236, + "step": 240 + }, + { + "epoch": 7.5, + "eval_loss": 0.6652829051017761, + "eval_runtime": 84.7574, + "eval_samples_per_second": 2.36, + "eval_steps_per_second": 0.295, + "step": 240 + }, + { + "epoch": 7.53125, + "grad_norm": 0.5380402145760035, + "learning_rate": 2e-05, + "loss": 0.6564, + "step": 241 + }, + { + "epoch": 7.53125, + "eval_loss": 0.6676375865936279, + "eval_runtime": 86.2058, + "eval_samples_per_second": 2.32, + "eval_steps_per_second": 0.29, + "step": 241 + }, + { + "epoch": 7.5625, + "grad_norm": 0.5964298255824012, + "learning_rate": 2e-05, + "loss": 0.6475, + "step": 242 + }, + { + "epoch": 7.5625, + "eval_loss": 0.6698520183563232, + "eval_runtime": 85.8955, + "eval_samples_per_second": 2.328, + "eval_steps_per_second": 0.291, + "step": 242 + }, + { + "epoch": 7.59375, + "grad_norm": 0.561279296875, + "learning_rate": 2e-05, + "loss": 0.6395, + "step": 243 + }, + { + "epoch": 7.59375, + "eval_loss": 0.6705803871154785, + "eval_runtime": 86.0036, + "eval_samples_per_second": 2.325, + "eval_steps_per_second": 0.291, + "step": 243 + }, + { + "epoch": 7.625, + "grad_norm": 0.6757292755073548, + "learning_rate": 2e-05, + "loss": 0.7074, + "step": 244 + }, + { + "epoch": 7.625, + "eval_loss": 0.6679538488388062, + "eval_runtime": 85.5379, + "eval_samples_per_second": 2.338, + "eval_steps_per_second": 0.292, + "step": 244 + }, + { + "epoch": 7.65625, + "grad_norm": 0.659077163070129, + "learning_rate": 2e-05, + "loss": 0.6078, + "step": 245 + }, + { + "epoch": 7.65625, + "eval_loss": 0.6667564511299133, + "eval_runtime": 85.752, + "eval_samples_per_second": 2.332, + "eval_steps_per_second": 0.292, + "step": 245 + }, + { + "epoch": 7.6875, + "grad_norm": 0.6215405566454576, + "learning_rate": 2e-05, + "loss": 0.6603, + 
"step": 246 + }, + { + "epoch": 7.6875, + "eval_loss": 0.665945291519165, + "eval_runtime": 92.3086, + "eval_samples_per_second": 2.167, + "eval_steps_per_second": 0.271, + "step": 246 + }, + { + "epoch": 7.71875, + "grad_norm": 0.6130534921490498, + "learning_rate": 2e-05, + "loss": 0.6435, + "step": 247 + }, + { + "epoch": 7.71875, + "eval_loss": 0.6661685109138489, + "eval_runtime": 87.1917, + "eval_samples_per_second": 2.294, + "eval_steps_per_second": 0.287, + "step": 247 + }, + { + "epoch": 7.75, + "grad_norm": 0.6025415602868736, + "learning_rate": 2e-05, + "loss": 0.6308, + "step": 248 + }, + { + "epoch": 7.75, + "eval_loss": 0.6658704280853271, + "eval_runtime": 86.8233, + "eval_samples_per_second": 2.304, + "eval_steps_per_second": 0.288, + "step": 248 + }, + { + "epoch": 7.78125, + "grad_norm": 0.6901593792019413, + "learning_rate": 2e-05, + "loss": 0.6777, + "step": 249 + }, + { + "epoch": 7.78125, + "eval_loss": 0.6652414202690125, + "eval_runtime": 86.7625, + "eval_samples_per_second": 2.305, + "eval_steps_per_second": 0.288, + "step": 249 + }, + { + "epoch": 7.8125, + "grad_norm": 0.6436454697341579, + "learning_rate": 2e-05, + "loss": 0.6912, + "step": 250 + }, + { + "epoch": 7.8125, + "eval_loss": 0.6654212474822998, + "eval_runtime": 86.871, + "eval_samples_per_second": 2.302, + "eval_steps_per_second": 0.288, + "step": 250 + }, + { + "epoch": 7.84375, + "grad_norm": 0.649040103024529, + "learning_rate": 2e-05, + "loss": 0.6025, + "step": 251 + }, + { + "epoch": 7.84375, + "eval_loss": 0.6654068231582642, + "eval_runtime": 86.7458, + "eval_samples_per_second": 2.306, + "eval_steps_per_second": 0.288, + "step": 251 + }, + { + "epoch": 7.875, + "grad_norm": 0.6595522131680224, + "learning_rate": 2e-05, + "loss": 0.5973, + "step": 252 + }, + { + "epoch": 7.875, + "eval_loss": 0.6644830107688904, + "eval_runtime": 86.8739, + "eval_samples_per_second": 2.302, + "eval_steps_per_second": 0.288, + "step": 252 + }, + { + "epoch": 7.90625, + "grad_norm": 
0.6689891717273936, + "learning_rate": 2e-05, + "loss": 0.687, + "step": 253 + }, + { + "epoch": 7.90625, + "eval_loss": 0.6616199612617493, + "eval_runtime": 86.8222, + "eval_samples_per_second": 2.304, + "eval_steps_per_second": 0.288, + "step": 253 + }, + { + "epoch": 7.9375, + "grad_norm": 0.6306846778314292, + "learning_rate": 2e-05, + "loss": 0.6599, + "step": 254 + }, + { + "epoch": 7.9375, + "eval_loss": 0.6592965126037598, + "eval_runtime": 86.8577, + "eval_samples_per_second": 2.303, + "eval_steps_per_second": 0.288, + "step": 254 + }, + { + "epoch": 7.96875, + "grad_norm": 0.6021327993890785, + "learning_rate": 2e-05, + "loss": 0.575, + "step": 255 + }, + { + "epoch": 7.96875, + "eval_loss": 0.6580593585968018, + "eval_runtime": 86.7582, + "eval_samples_per_second": 2.305, + "eval_steps_per_second": 0.288, + "step": 255 + }, + { + "epoch": 8.0, + "grad_norm": 0.6174712675568311, + "learning_rate": 2e-05, + "loss": 0.6341, + "step": 256 + }, + { + "epoch": 8.0, + "eval_loss": 0.6575854420661926, + "eval_runtime": 76.7634, + "eval_samples_per_second": 2.605, + "eval_steps_per_second": 0.326, + "step": 256 + }, + { + "epoch": 8.03125, + "grad_norm": 0.6551281786490154, + "learning_rate": 2e-05, + "loss": 0.6032, + "step": 257 + }, + { + "epoch": 8.03125, + "eval_loss": 0.6583926677703857, + "eval_runtime": 83.4222, + "eval_samples_per_second": 2.397, + "eval_steps_per_second": 0.3, + "step": 257 + }, + { + "epoch": 8.0625, + "grad_norm": 0.6033798361300539, + "learning_rate": 2e-05, + "loss": 0.6352, + "step": 258 + }, + { + "epoch": 8.0625, + "eval_loss": 0.6615632772445679, + "eval_runtime": 76.7227, + "eval_samples_per_second": 2.607, + "eval_steps_per_second": 0.326, + "step": 258 + }, + { + "epoch": 8.09375, + "grad_norm": 0.557538857110867, + "learning_rate": 2e-05, + "loss": 0.6472, + "step": 259 + }, + { + "epoch": 8.09375, + "eval_loss": 0.6674608588218689, + "eval_runtime": 76.6215, + "eval_samples_per_second": 2.61, + "eval_steps_per_second": 
0.326, + "step": 259 + }, + { + "epoch": 8.125, + "grad_norm": 0.7828450894757938, + "learning_rate": 2e-05, + "loss": 0.6576, + "step": 260 + }, + { + "epoch": 8.125, + "eval_loss": 0.670245349407196, + "eval_runtime": 76.685, + "eval_samples_per_second": 2.608, + "eval_steps_per_second": 0.326, + "step": 260 + }, + { + "epoch": 8.15625, + "grad_norm": 0.7969830757603331, + "learning_rate": 2e-05, + "loss": 0.5809, + "step": 261 + }, + { + "epoch": 8.15625, + "eval_loss": 0.6711975336074829, + "eval_runtime": 78.0022, + "eval_samples_per_second": 2.564, + "eval_steps_per_second": 0.321, + "step": 261 + }, + { + "epoch": 8.1875, + "grad_norm": 0.6431174985709492, + "learning_rate": 2e-05, + "loss": 0.6971, + "step": 262 + }, + { + "epoch": 8.1875, + "eval_loss": 0.6719404458999634, + "eval_runtime": 78.7599, + "eval_samples_per_second": 2.539, + "eval_steps_per_second": 0.317, + "step": 262 + }, + { + "epoch": 8.21875, + "grad_norm": 0.7025583314944188, + "learning_rate": 2e-05, + "loss": 0.5751, + "step": 263 + }, + { + "epoch": 8.21875, + "eval_loss": 0.6719526648521423, + "eval_runtime": 78.0188, + "eval_samples_per_second": 2.563, + "eval_steps_per_second": 0.32, + "step": 263 + }, + { + "epoch": 8.25, + "grad_norm": 0.7114355417811269, + "learning_rate": 2e-05, + "loss": 0.623, + "step": 264 + }, + { + "epoch": 8.25, + "eval_loss": 0.6717848181724548, + "eval_runtime": 78.6366, + "eval_samples_per_second": 2.543, + "eval_steps_per_second": 0.318, + "step": 264 + }, + { + "epoch": 8.28125, + "grad_norm": 0.8272269435769467, + "learning_rate": 2e-05, + "loss": 0.6509, + "step": 265 + }, + { + "epoch": 8.28125, + "eval_loss": 0.6701865196228027, + "eval_runtime": 78.7279, + "eval_samples_per_second": 2.54, + "eval_steps_per_second": 0.318, + "step": 265 + }, + { + "epoch": 8.3125, + "grad_norm": 0.7215994453471393, + "learning_rate": 2e-05, + "loss": 0.6263, + "step": 266 + }, + { + "epoch": 8.3125, + "eval_loss": 0.6682087182998657, + "eval_runtime": 78.1433, + 
"eval_samples_per_second": 2.559, + "eval_steps_per_second": 0.32, + "step": 266 + }, + { + "epoch": 8.34375, + "grad_norm": 0.6425448006102333, + "learning_rate": 2e-05, + "loss": 0.5613, + "step": 267 + }, + { + "epoch": 8.34375, + "eval_loss": 0.6686681509017944, + "eval_runtime": 78.0964, + "eval_samples_per_second": 2.561, + "eval_steps_per_second": 0.32, + "step": 267 + }, + { + "epoch": 8.375, + "grad_norm": 0.7207053166384572, + "learning_rate": 2e-05, + "loss": 0.6239, + "step": 268 + }, + { + "epoch": 8.375, + "eval_loss": 0.6676305532455444, + "eval_runtime": 77.9986, + "eval_samples_per_second": 2.564, + "eval_steps_per_second": 0.321, + "step": 268 + }, + { + "epoch": 8.40625, + "grad_norm": 0.7459344743811905, + "learning_rate": 2e-05, + "loss": 0.6159, + "step": 269 + }, + { + "epoch": 8.40625, + "eval_loss": 0.6660167574882507, + "eval_runtime": 78.4159, + "eval_samples_per_second": 2.551, + "eval_steps_per_second": 0.319, + "step": 269 + }, + { + "epoch": 8.4375, + "grad_norm": 0.7179805119560739, + "learning_rate": 2e-05, + "loss": 0.6192, + "step": 270 + }, + { + "epoch": 8.4375, + "eval_loss": 0.6636325716972351, + "eval_runtime": 78.2224, + "eval_samples_per_second": 2.557, + "eval_steps_per_second": 0.32, + "step": 270 + }, + { + "epoch": 8.46875, + "grad_norm": 0.724792498458059, + "learning_rate": 2e-05, + "loss": 0.5234, + "step": 271 + }, + { + "epoch": 8.46875, + "eval_loss": 0.6647288799285889, + "eval_runtime": 79.0573, + "eval_samples_per_second": 2.53, + "eval_steps_per_second": 0.316, + "step": 271 + }, + { + "epoch": 8.5, + "grad_norm": 0.6544107138826364, + "learning_rate": 2e-05, + "loss": 0.6067, + "step": 272 + }, + { + "epoch": 8.5, + "eval_loss": 0.6689667701721191, + "eval_runtime": 79.2898, + "eval_samples_per_second": 2.522, + "eval_steps_per_second": 0.315, + "step": 272 + }, + { + "epoch": 8.53125, + "grad_norm": 0.71580236810568, + "learning_rate": 2e-05, + "loss": 0.6215, + "step": 273 + }, + { + "epoch": 8.53125, + 
"eval_loss": 0.6723271012306213, + "eval_runtime": 79.0759, + "eval_samples_per_second": 2.529, + "eval_steps_per_second": 0.316, + "step": 273 + }, + { + "epoch": 8.5625, + "grad_norm": 0.7741383931390255, + "learning_rate": 2e-05, + "loss": 0.6012, + "step": 274 + }, + { + "epoch": 8.5625, + "eval_loss": 0.6743794083595276, + "eval_runtime": 79.0509, + "eval_samples_per_second": 2.53, + "eval_steps_per_second": 0.316, + "step": 274 + }, + { + "epoch": 8.59375, + "grad_norm": 0.7927343087738151, + "learning_rate": 2e-05, + "loss": 0.6241, + "step": 275 + }, + { + "epoch": 8.59375, + "eval_loss": 0.6728585958480835, + "eval_runtime": 79.2296, + "eval_samples_per_second": 2.524, + "eval_steps_per_second": 0.316, + "step": 275 + }, + { + "epoch": 8.625, + "grad_norm": 0.759468785526614, + "learning_rate": 2e-05, + "loss": 0.6209, + "step": 276 + }, + { + "epoch": 8.625, + "eval_loss": 0.6686221957206726, + "eval_runtime": 76.7494, + "eval_samples_per_second": 2.606, + "eval_steps_per_second": 0.326, + "step": 276 + }, + { + "epoch": 8.65625, + "grad_norm": 0.7345386079388437, + "learning_rate": 2e-05, + "loss": 0.5618, + "step": 277 + }, + { + "epoch": 8.65625, + "eval_loss": 0.6659188270568848, + "eval_runtime": 77.4511, + "eval_samples_per_second": 2.582, + "eval_steps_per_second": 0.323, + "step": 277 + }, + { + "epoch": 8.6875, + "grad_norm": 0.6822491965046279, + "learning_rate": 2e-05, + "loss": 0.6064, + "step": 278 + }, + { + "epoch": 8.6875, + "eval_loss": 0.664726734161377, + "eval_runtime": 76.7108, + "eval_samples_per_second": 2.607, + "eval_steps_per_second": 0.326, + "step": 278 + }, + { + "epoch": 8.71875, + "grad_norm": 0.7329120674082968, + "learning_rate": 2e-05, + "loss": 0.5843, + "step": 279 + }, + { + "epoch": 8.71875, + "eval_loss": 0.6635715961456299, + "eval_runtime": 76.7921, + "eval_samples_per_second": 2.604, + "eval_steps_per_second": 0.326, + "step": 279 + }, + { + "epoch": 8.75, + "grad_norm": 0.7950781591249908, + "learning_rate": 
2e-05, + "loss": 0.6383, + "step": 280 + }, + { + "epoch": 8.75, + "eval_loss": 0.664521336555481, + "eval_runtime": 76.6952, + "eval_samples_per_second": 2.608, + "eval_steps_per_second": 0.326, + "step": 280 + }, + { + "epoch": 8.78125, + "grad_norm": 0.6791182798182671, + "learning_rate": 2e-05, + "loss": 0.5932, + "step": 281 + }, + { + "epoch": 8.78125, + "eval_loss": 0.6673008799552917, + "eval_runtime": 76.794, + "eval_samples_per_second": 2.604, + "eval_steps_per_second": 0.326, + "step": 281 + }, + { + "epoch": 8.8125, + "grad_norm": 0.7633434086832942, + "learning_rate": 2e-05, + "loss": 0.5754, + "step": 282 + }, + { + "epoch": 8.8125, + "eval_loss": 0.6692779064178467, + "eval_runtime": 76.7749, + "eval_samples_per_second": 2.605, + "eval_steps_per_second": 0.326, + "step": 282 + }, + { + "epoch": 8.84375, + "grad_norm": 0.6857090076317197, + "learning_rate": 2e-05, + "loss": 0.5585, + "step": 283 + }, + { + "epoch": 8.84375, + "eval_loss": 0.6702080368995667, + "eval_runtime": 76.6913, + "eval_samples_per_second": 2.608, + "eval_steps_per_second": 0.326, + "step": 283 + }, + { + "epoch": 8.875, + "grad_norm": 0.6961298007385132, + "learning_rate": 2e-05, + "loss": 0.5093, + "step": 284 + }, + { + "epoch": 8.875, + "eval_loss": 0.6708166599273682, + "eval_runtime": 76.7725, + "eval_samples_per_second": 2.605, + "eval_steps_per_second": 0.326, + "step": 284 + }, + { + "epoch": 8.90625, + "grad_norm": 0.7783752192295856, + "learning_rate": 2e-05, + "loss": 0.5656, + "step": 285 + }, + { + "epoch": 8.90625, + "eval_loss": 0.6697121262550354, + "eval_runtime": 76.7888, + "eval_samples_per_second": 2.605, + "eval_steps_per_second": 0.326, + "step": 285 + }, + { + "epoch": 8.9375, + "grad_norm": 0.7327581828795048, + "learning_rate": 2e-05, + "loss": 0.6984, + "step": 286 + }, + { + "epoch": 8.9375, + "eval_loss": 0.6684187054634094, + "eval_runtime": 78.6657, + "eval_samples_per_second": 2.542, + "eval_steps_per_second": 0.318, + "step": 286 + }, + { + 
"epoch": 8.96875, + "grad_norm": 0.689919829790507, + "learning_rate": 2e-05, + "loss": 0.6173, + "step": 287 + }, + { + "epoch": 8.96875, + "eval_loss": 0.6675245761871338, + "eval_runtime": 78.1275, + "eval_samples_per_second": 2.56, + "eval_steps_per_second": 0.32, + "step": 287 + }, + { + "epoch": 9.0, + "grad_norm": 0.6812947879732435, + "learning_rate": 2e-05, + "loss": 0.5499, + "step": 288 + }, + { + "epoch": 9.0, + "eval_loss": 0.6678825616836548, + "eval_runtime": 78.8588, + "eval_samples_per_second": 2.536, + "eval_steps_per_second": 0.317, + "step": 288 + }, + { + "epoch": 9.03125, + "grad_norm": 0.715716761740314, + "learning_rate": 2e-05, + "loss": 0.5699, + "step": 289 + }, + { + "epoch": 9.03125, + "eval_loss": 0.6692755222320557, + "eval_runtime": 83.098, + "eval_samples_per_second": 2.407, + "eval_steps_per_second": 0.301, + "step": 289 + }, + { + "epoch": 9.0625, + "grad_norm": 0.7438930389955494, + "learning_rate": 2e-05, + "loss": 0.5974, + "step": 290 + }, + { + "epoch": 9.0625, + "eval_loss": 0.6735746264457703, + "eval_runtime": 77.384, + "eval_samples_per_second": 2.585, + "eval_steps_per_second": 0.323, + "step": 290 + }, + { + "epoch": 9.09375, + "grad_norm": 0.7271043131369198, + "learning_rate": 2e-05, + "loss": 0.601, + "step": 291 + }, + { + "epoch": 9.09375, + "eval_loss": 0.6790977716445923, + "eval_runtime": 78.0312, + "eval_samples_per_second": 2.563, + "eval_steps_per_second": 0.32, + "step": 291 + }, + { + "epoch": 9.125, + "grad_norm": 0.851687675865168, + "learning_rate": 2e-05, + "loss": 0.5681, + "step": 292 + }, + { + "epoch": 9.125, + "eval_loss": 0.6834170818328857, + "eval_runtime": 77.8688, + "eval_samples_per_second": 2.568, + "eval_steps_per_second": 0.321, + "step": 292 + }, + { + "epoch": 9.15625, + "grad_norm": 0.7905287763218567, + "learning_rate": 2e-05, + "loss": 0.6222, + "step": 293 + }, + { + "epoch": 9.15625, + "eval_loss": 0.6843841671943665, + "eval_runtime": 77.985, + "eval_samples_per_second": 2.565, + 
"eval_steps_per_second": 0.321, + "step": 293 + }, + { + "epoch": 9.1875, + "grad_norm": 0.7301520002532459, + "learning_rate": 2e-05, + "loss": 0.5549, + "step": 294 + }, + { + "epoch": 9.1875, + "eval_loss": 0.6860540509223938, + "eval_runtime": 78.0163, + "eval_samples_per_second": 2.564, + "eval_steps_per_second": 0.32, + "step": 294 + }, + { + "epoch": 9.21875, + "grad_norm": 0.899999206595601, + "learning_rate": 2e-05, + "loss": 0.5128, + "step": 295 + }, + { + "epoch": 9.21875, + "eval_loss": 0.685759425163269, + "eval_runtime": 78.4339, + "eval_samples_per_second": 2.55, + "eval_steps_per_second": 0.319, + "step": 295 + }, + { + "epoch": 9.25, + "grad_norm": 0.8064287475451557, + "learning_rate": 2e-05, + "loss": 0.5261, + "step": 296 + }, + { + "epoch": 9.25, + "eval_loss": 0.6864770650863647, + "eval_runtime": 79.6129, + "eval_samples_per_second": 2.512, + "eval_steps_per_second": 0.314, + "step": 296 + }, + { + "epoch": 9.28125, + "grad_norm": 0.8837240795882767, + "learning_rate": 2e-05, + "loss": 0.621, + "step": 297 + }, + { + "epoch": 9.28125, + "eval_loss": 0.6871599555015564, + "eval_runtime": 78.9778, + "eval_samples_per_second": 2.532, + "eval_steps_per_second": 0.317, + "step": 297 + }, + { + "epoch": 9.3125, + "grad_norm": 0.9676184044078363, + "learning_rate": 2e-05, + "loss": 0.5655, + "step": 298 + }, + { + "epoch": 9.3125, + "eval_loss": 0.6881282329559326, + "eval_runtime": 78.9944, + "eval_samples_per_second": 2.532, + "eval_steps_per_second": 0.316, + "step": 298 + }, + { + "epoch": 9.34375, + "grad_norm": 0.8723474213941232, + "learning_rate": 2e-05, + "loss": 0.5449, + "step": 299 + }, + { + "epoch": 9.34375, + "eval_loss": 0.6879245638847351, + "eval_runtime": 79.0056, + "eval_samples_per_second": 2.531, + "eval_steps_per_second": 0.316, + "step": 299 + }, + { + "epoch": 9.375, + "grad_norm": 0.848833488380702, + "learning_rate": 2e-05, + "loss": 0.5683, + "step": 300 + }, + { + "epoch": 9.375, + "eval_loss": 0.6846978664398193, + 
"eval_runtime": 78.9003, + "eval_samples_per_second": 2.535, + "eval_steps_per_second": 0.317, + "step": 300 + }, + { + "epoch": 9.40625, + "grad_norm": 0.8586391766708288, + "learning_rate": 2e-05, + "loss": 0.5358, + "step": 301 + }, + { + "epoch": 9.40625, + "eval_loss": 0.6798649430274963, + "eval_runtime": 80.0404, + "eval_samples_per_second": 2.499, + "eval_steps_per_second": 0.312, + "step": 301 + }, + { + "epoch": 9.4375, + "grad_norm": 0.8007832596916474, + "learning_rate": 2e-05, + "loss": 0.5792, + "step": 302 + }, + { + "epoch": 9.4375, + "eval_loss": 0.6757382750511169, + "eval_runtime": 79.962, + "eval_samples_per_second": 2.501, + "eval_steps_per_second": 0.313, + "step": 302 + }, + { + "epoch": 9.46875, + "grad_norm": 0.7839805948862919, + "learning_rate": 2e-05, + "loss": 0.5917, + "step": 303 + }, + { + "epoch": 9.46875, + "eval_loss": 0.6754000782966614, + "eval_runtime": 80.738, + "eval_samples_per_second": 2.477, + "eval_steps_per_second": 0.31, + "step": 303 + }, + { + "epoch": 9.5, + "grad_norm": 0.7397772754102683, + "learning_rate": 2e-05, + "loss": 0.6249, + "step": 304 + }, + { + "epoch": 9.5, + "eval_loss": 0.6777495741844177, + "eval_runtime": 80.5144, + "eval_samples_per_second": 2.484, + "eval_steps_per_second": 0.311, + "step": 304 + }, + { + "epoch": 9.53125, + "grad_norm": 0.857390001265035, + "learning_rate": 2e-05, + "loss": 0.5932, + "step": 305 + }, + { + "epoch": 9.53125, + "eval_loss": 0.6778848171234131, + "eval_runtime": 80.1508, + "eval_samples_per_second": 2.495, + "eval_steps_per_second": 0.312, + "step": 305 + }, + { + "epoch": 9.5625, + "grad_norm": 0.9430180281536945, + "learning_rate": 2e-05, + "loss": 0.5793, + "step": 306 + }, + { + "epoch": 9.5625, + "eval_loss": 0.6771917939186096, + "eval_runtime": 76.7109, + "eval_samples_per_second": 2.607, + "eval_steps_per_second": 0.326, + "step": 306 + }, + { + "epoch": 9.59375, + "grad_norm": 0.8705050270903875, + "learning_rate": 2e-05, + "loss": 0.5601, + "step": 307 + 
}, + { + "epoch": 9.59375, + "eval_loss": 0.6808632016181946, + "eval_runtime": 76.6965, + "eval_samples_per_second": 2.608, + "eval_steps_per_second": 0.326, + "step": 307 + }, + { + "epoch": 9.625, + "grad_norm": 0.8611871513168323, + "learning_rate": 2e-05, + "loss": 0.5953, + "step": 308 + }, + { + "epoch": 9.625, + "eval_loss": 0.6875945329666138, + "eval_runtime": 76.6592, + "eval_samples_per_second": 2.609, + "eval_steps_per_second": 0.326, + "step": 308 + }, + { + "epoch": 9.65625, + "grad_norm": 0.9066952565245906, + "learning_rate": 2e-05, + "loss": 0.5815, + "step": 309 + }, + { + "epoch": 9.65625, + "eval_loss": 0.6910049319267273, + "eval_runtime": 76.7021, + "eval_samples_per_second": 2.607, + "eval_steps_per_second": 0.326, + "step": 309 + }, + { + "epoch": 9.6875, + "grad_norm": 1.0666864048105145, + "learning_rate": 2e-05, + "loss": 0.5663, + "step": 310 + }, + { + "epoch": 9.6875, + "eval_loss": 0.6869986057281494, + "eval_runtime": 76.6344, + "eval_samples_per_second": 2.61, + "eval_steps_per_second": 0.326, + "step": 310 + }, + { + "epoch": 9.71875, + "grad_norm": 0.9413311560347162, + "learning_rate": 2e-05, + "loss": 0.5106, + "step": 311 + }, + { + "epoch": 9.71875, + "eval_loss": 0.6825075745582581, + "eval_runtime": 78.7857, + "eval_samples_per_second": 2.539, + "eval_steps_per_second": 0.317, + "step": 311 + }, + { + "epoch": 9.75, + "grad_norm": 0.9175579044457436, + "learning_rate": 2e-05, + "loss": 0.5821, + "step": 312 + }, + { + "epoch": 9.75, + "eval_loss": 0.6794223189353943, + "eval_runtime": 78.0368, + "eval_samples_per_second": 2.563, + "eval_steps_per_second": 0.32, + "step": 312 + }, + { + "epoch": 9.78125, + "grad_norm": 0.7982785075945665, + "learning_rate": 2e-05, + "loss": 0.5781, + "step": 313 + }, + { + "epoch": 9.78125, + "eval_loss": 0.679649829864502, + "eval_runtime": 78.0513, + "eval_samples_per_second": 2.562, + "eval_steps_per_second": 0.32, + "step": 313 + }, + { + "epoch": 9.8125, + "grad_norm": 
0.9284642289974022, + "learning_rate": 2e-05, + "loss": 0.5394, + "step": 314 + }, + { + "epoch": 9.8125, + "eval_loss": 0.6805163025856018, + "eval_runtime": 78.2229, + "eval_samples_per_second": 2.557, + "eval_steps_per_second": 0.32, + "step": 314 + }, + { + "epoch": 9.84375, + "grad_norm": 0.8816568355396782, + "learning_rate": 2e-05, + "loss": 0.5722, + "step": 315 + }, + { + "epoch": 9.84375, + "eval_loss": 0.6801097393035889, + "eval_runtime": 78.9282, + "eval_samples_per_second": 2.534, + "eval_steps_per_second": 0.317, + "step": 315 + }, + { + "epoch": 9.875, + "grad_norm": 0.8137119863863306, + "learning_rate": 2e-05, + "loss": 0.5831, + "step": 316 + }, + { + "epoch": 9.875, + "eval_loss": 0.6792600750923157, + "eval_runtime": 78.8166, + "eval_samples_per_second": 2.538, + "eval_steps_per_second": 0.317, + "step": 316 + }, + { + "epoch": 9.90625, + "grad_norm": 0.9595174764400289, + "learning_rate": 2e-05, + "loss": 0.5489, + "step": 317 + }, + { + "epoch": 9.90625, + "eval_loss": 0.6755692958831787, + "eval_runtime": 78.1426, + "eval_samples_per_second": 2.559, + "eval_steps_per_second": 0.32, + "step": 317 + }, + { + "epoch": 9.9375, + "grad_norm": 0.8612490247878711, + "learning_rate": 2e-05, + "loss": 0.5508, + "step": 318 + }, + { + "epoch": 9.9375, + "eval_loss": 0.673053503036499, + "eval_runtime": 78.0565, + "eval_samples_per_second": 2.562, + "eval_steps_per_second": 0.32, + "step": 318 + }, + { + "epoch": 9.96875, + "grad_norm": 0.9474068762478358, + "learning_rate": 2e-05, + "loss": 0.5859, + "step": 319 + }, + { + "epoch": 9.96875, + "eval_loss": 0.6695602536201477, + "eval_runtime": 78.051, + "eval_samples_per_second": 2.562, + "eval_steps_per_second": 0.32, + "step": 319 + }, + { + "epoch": 10.0, + "grad_norm": 0.8401643717683449, + "learning_rate": 2e-05, + "loss": 0.5277, + "step": 320 + }, + { + "epoch": 10.0, + "eval_loss": 0.6707890033721924, + "eval_runtime": 78.9959, + "eval_samples_per_second": 2.532, + "eval_steps_per_second": 
0.316, + "step": 320 + }, + { + "epoch": 10.0, + "step": 320, + "total_flos": 613933061373952.0, + "train_loss": 0.056994458101689814, + "train_runtime": 3241.7031, + "train_samples_per_second": 3.085, + "train_steps_per_second": 0.099 + } + ], + "logging_steps": 1.0, + "max_steps": 320, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 5, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 613933061373952.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}