diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..155d1f73d5ebc14dd486e783fbc5b512a529ff5c 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +checkpoint-1855/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-3710/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-5565/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-7420/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/checkpoint-1855/config.json b/checkpoint-1855/config.json new file mode 100644 index 0000000000000000000000000000000000000000..7f34bbd5159c9a132258ecf79562e79459cb64d9 --- /dev/null +++ b/checkpoint-1855/config.json @@ -0,0 +1,36 @@ +{ + "_name_or_path": "./meta-llama_Llama-3.1-8B-Instruct/", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128001, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.46.1", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/checkpoint-1855/generation_config.json b/checkpoint-1855/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0484b997a9ea9b5b6d711db644716bfd32d5470e --- 
/dev/null +++ b/checkpoint-1855/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.46.1" +} diff --git a/checkpoint-1855/global_step1855/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-1855/global_step1855/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1ae70340d06f3877a18a3e95c0c792a0f442c744 --- /dev/null +++ b/checkpoint-1855/global_step1855/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:715c7957381bbc579912ecbcb589c18818529a2a1b4cfcd2aba27f391a3fcb5e +size 12045398464 diff --git a/checkpoint-1855/global_step1855/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-1855/global_step1855/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a2b3d12fb537153c1bfb21ce3b135b6974130cab --- /dev/null +++ b/checkpoint-1855/global_step1855/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cb825bad27a4cac399ca693db928b13617b9c86e4262fd5723ab19b705a5bfc +size 12045399232 diff --git a/checkpoint-1855/global_step1855/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-1855/global_step1855/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ae2b4ab51712cc3a51dc70be78352a4af70a5aa9 --- /dev/null +++ b/checkpoint-1855/global_step1855/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77053ad981a39f72f842661fc49c4bb9547556bd4ec7484029c76228bb13e270 +size 12045399488 diff --git a/checkpoint-1855/global_step1855/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt 
b/checkpoint-1855/global_step1855/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2e109bbd7e536a80c156aa1e79a8587fd4e41797 --- /dev/null +++ b/checkpoint-1855/global_step1855/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c721d859abaa4ee1e2319684009698d2e7e91c7a433a33ca396786e66ba7cac8 +size 12045399232 diff --git a/checkpoint-1855/global_step1855/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-1855/global_step1855/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..05acdcdc3edf389d066edb068cf33440504cc70e --- /dev/null +++ b/checkpoint-1855/global_step1855/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34c960691fa9183b3d091d5a30bb374bc3bcf9159e09586f05837474a0004fdd +size 12045399488 diff --git a/checkpoint-1855/global_step1855/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-1855/global_step1855/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..456ecc877c19e8f553d00031cb0addb928a53050 --- /dev/null +++ b/checkpoint-1855/global_step1855/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a36fcaadedfd51a96cf35399d3c34010098ba621d727e1fafe71157ab8fd570d +size 12045399552 diff --git a/checkpoint-1855/global_step1855/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-1855/global_step1855/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3a36c094b764f234ef58327e9e2bb944b835e232 --- /dev/null +++ b/checkpoint-1855/global_step1855/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8677e7f565e0ecf0a625024aaba5ddfefcc8043667d8ea5a7bb17ca3495698c4 +size 12045399232 diff --git a/checkpoint-1855/global_step1855/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-1855/global_step1855/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6a8e6898de3b3aa35f9bb74ec73cf26ba610ebba --- /dev/null +++ b/checkpoint-1855/global_step1855/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f6042b2a66c6c6ac522afdea89c1fb9baa1f788126ecb16b28dcf22e6f4b0a5 +size 12045398144 diff --git a/checkpoint-1855/global_step1855/mp_rank_00_model_states.pt b/checkpoint-1855/global_step1855/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a5a0b5bcb7cdac3366a80c5f2e6be96586e23837 --- /dev/null +++ b/checkpoint-1855/global_step1855/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0846405f141f24add54ce505a8cdba4c128f90058118e575c2af82b443d72771 +size 16060610552 diff --git a/checkpoint-1855/latest b/checkpoint-1855/latest new file mode 100644 index 0000000000000000000000000000000000000000..8107fc8e32044966df7570fe13e804d16c70d482 --- /dev/null +++ b/checkpoint-1855/latest @@ -0,0 +1 @@ +global_step1855 \ No newline at end of file diff --git a/checkpoint-1855/model-00001-of-00004.safetensors b/checkpoint-1855/model-00001-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3a26c97fb21190ed1e54ffa3e5264ec910b7be8a --- /dev/null +++ b/checkpoint-1855/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f3b97db9e5e69d861495ef4e307508ef9fcf958f251f993d6c157ff00c66515 +size 4976698672 diff --git a/checkpoint-1855/model-00002-of-00004.safetensors b/checkpoint-1855/model-00002-of-00004.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..9a5c7e2c049bc80bb26c34aca174b928047323e5 --- /dev/null +++ b/checkpoint-1855/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7922c3ee873c839a490c876477a64aac36adc30bce9e4464aeee4255eb66768 +size 4999802720 diff --git a/checkpoint-1855/model-00003-of-00004.safetensors b/checkpoint-1855/model-00003-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f0260cd7656d9fcb961a2db8e61d9d2b7461ddd9 --- /dev/null +++ b/checkpoint-1855/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96b449c68ae8ba700aeefedae17cb282076312ba4aedd50e5da5641c50b95164 +size 4915916176 diff --git a/checkpoint-1855/model-00004-of-00004.safetensors b/checkpoint-1855/model-00004-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..19328908666811c4e9deb228b46b7f91ca73b05b --- /dev/null +++ b/checkpoint-1855/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6409ac45f94909fd961e26875d0f7b851938c6c74bdcff85be003bbc5dec11aa +size 1168138808 diff --git a/checkpoint-1855/model.safetensors.index.json b/checkpoint-1855/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0fd8120f1c6acddc268ebc2583058efaf699a771 --- /dev/null +++ b/checkpoint-1855/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 16060522496 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": 
"model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": 
"model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": 
"model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00004-of-00004.safetensors" + } +} diff --git a/checkpoint-1855/rng_state_0.pth b/checkpoint-1855/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..b6473612e41c5cfd6973c2e71fa5f3ad2b2bcad1 --- /dev/null +++ b/checkpoint-1855/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:575119a228f98110923ffa2dedcb50e3317251b26054355d015e0b2240d566f2 +size 15984 diff --git a/checkpoint-1855/rng_state_1.pth b/checkpoint-1855/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..8506e00431b6ac7067699c0ea4f59adb6fa0ba20 --- /dev/null +++ b/checkpoint-1855/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0728b56dab7abb5ef8a0d4bae3519c5767c97467bdd886d26bf19cc8599d0312 +size 15984 diff --git a/checkpoint-1855/rng_state_2.pth b/checkpoint-1855/rng_state_2.pth new 
file mode 100644 index 0000000000000000000000000000000000000000..ea499e285c97cca07fedd34662c3d4ab44ff6f47 --- /dev/null +++ b/checkpoint-1855/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4e481d4ef1546694da7337f6bb6c658b866dcb79b85deeb477da0d27ebe851e +size 15984 diff --git a/checkpoint-1855/rng_state_3.pth b/checkpoint-1855/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..aeb38f92f106ac3f08bae4f82179a8a12243bccb --- /dev/null +++ b/checkpoint-1855/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:353c60be37ea56fc992fca446598ceca5d1fd002aa3bd6dbb9ad740e6f47ebb3 +size 15984 diff --git a/checkpoint-1855/rng_state_4.pth b/checkpoint-1855/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..9d5856cb7a3f15092fa5593507022316916f648e --- /dev/null +++ b/checkpoint-1855/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9107fe964ba7205e354084b85210e5a5ea1c98cfd4d38adb9cd3926945dcae4 +size 15984 diff --git a/checkpoint-1855/rng_state_5.pth b/checkpoint-1855/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b824ee24d256695aad4a69a62d8e7125f51a17f2 --- /dev/null +++ b/checkpoint-1855/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69d1bb1abee38b92e53f3f23549b642ce0f1edcdccf7b6129847ac61636e96d5 +size 15984 diff --git a/checkpoint-1855/rng_state_6.pth b/checkpoint-1855/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..a9fd0364bb8f1a8e91eca45be5e1b6672b4d9afd --- /dev/null +++ b/checkpoint-1855/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afd5516048e20f36959601574e29e40106085a7d3cdc7bf425ce5e84633490e6 +size 15984 diff --git a/checkpoint-1855/rng_state_7.pth b/checkpoint-1855/rng_state_7.pth new file mode 100644 index 
0000000000000000000000000000000000000000..4e80125fd18efcb1097384319888b699f4dce7e7 --- /dev/null +++ b/checkpoint-1855/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e2c46927fc06939b4c976a01e4b95dec1f8b98ceaea86d31a5d756fc30ff006 +size 15984 diff --git a/checkpoint-1855/scheduler.pt b/checkpoint-1855/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e32e254819d7e8d0af20f51bf11caec2e8abbdad --- /dev/null +++ b/checkpoint-1855/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:521b137220e619250a4a0c0f7e53b6194c62074e5ef79f389bbab24cfe6f80f8 +size 1064 diff --git a/checkpoint-1855/special_tokens_map.json b/checkpoint-1855/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..e5b39b6305d89284b04934011c68dbb26bf588ca --- /dev/null +++ b/checkpoint-1855/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-1855/tokenizer.json b/checkpoint-1855/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1a5a81eb733cae803b39ffc7644de0048c3a26c3 --- /dev/null +++ b/checkpoint-1855/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07d7990a7c3f12081b24b3d098ab366211161e43494d2368211815c164b5f2b7 +size 17209828 diff --git a/checkpoint-1855/tokenizer_config.json b/checkpoint-1855/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5cd68a680b8f949dba64516158c30db7ea52c3cd --- /dev/null +++ 
b/checkpoint-1855/tokenizer_config.json @@ -0,0 +1,2062 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|im_pseudo|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|end_pseudo|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + 
"content": "<|im_date|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|end_date|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|begin_of_post|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|end_of_post|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", 
+ "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": 
"<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": 
"<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": 
"<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": 
"<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-1855/trainer_state.json b/checkpoint-1855/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..de1ea9ba84ec44c072db0afb039ac6068ac06587 --- /dev/null +++ b/checkpoint-1855/trainer_state.json @@ -0,0 +1,13018 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.1000107828337287, + "eval_steps": 500, + "global_step": 1855, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": 
[ + { + "epoch": 5.391416864351952e-05, + "grad_norm": 53.75010299682617, + "learning_rate": 1.0000000000000001e-07, + "loss": 2.5864, + "step": 1 + }, + { + "epoch": 0.00010782833728703904, + "grad_norm": 45.00067138671875, + "learning_rate": 2.0000000000000002e-07, + "loss": 2.3757, + "step": 2 + }, + { + "epoch": 0.00016174250593055855, + "grad_norm": 51.22366714477539, + "learning_rate": 3.0000000000000004e-07, + "loss": 2.4653, + "step": 3 + }, + { + "epoch": 0.00021565667457407807, + "grad_norm": 62.225242614746094, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.5819, + "step": 4 + }, + { + "epoch": 0.0002695708432175976, + "grad_norm": 54.67008590698242, + "learning_rate": 5.000000000000001e-07, + "loss": 2.6368, + "step": 5 + }, + { + "epoch": 0.0003234850118611171, + "grad_norm": 51.261009216308594, + "learning_rate": 6.000000000000001e-07, + "loss": 2.3245, + "step": 6 + }, + { + "epoch": 0.0003773991805046366, + "grad_norm": 53.58714294433594, + "learning_rate": 7.000000000000001e-07, + "loss": 2.7622, + "step": 7 + }, + { + "epoch": 0.00043131334914815614, + "grad_norm": 41.32997131347656, + "learning_rate": 8.000000000000001e-07, + "loss": 2.6444, + "step": 8 + }, + { + "epoch": 0.00048522751779167566, + "grad_norm": 33.232242584228516, + "learning_rate": 9.000000000000001e-07, + "loss": 2.1475, + "step": 9 + }, + { + "epoch": 0.0005391416864351952, + "grad_norm": 34.1890983581543, + "learning_rate": 1.0000000000000002e-06, + "loss": 2.7256, + "step": 10 + }, + { + "epoch": 0.0005930558550787146, + "grad_norm": 19.263437271118164, + "learning_rate": 1.1e-06, + "loss": 2.4132, + "step": 11 + }, + { + "epoch": 0.0006469700237222342, + "grad_norm": 15.612638473510742, + "learning_rate": 1.2000000000000002e-06, + "loss": 2.0422, + "step": 12 + }, + { + "epoch": 0.0007008841923657537, + "grad_norm": 13.81751537322998, + "learning_rate": 1.3e-06, + "loss": 1.9663, + "step": 13 + }, + { + "epoch": 0.0007547983610092732, + "grad_norm": 
16.390897750854492, + "learning_rate": 1.4000000000000001e-06, + "loss": 2.1135, + "step": 14 + }, + { + "epoch": 0.0008087125296527927, + "grad_norm": 21.830646514892578, + "learning_rate": 1.5e-06, + "loss": 2.217, + "step": 15 + }, + { + "epoch": 0.0008626266982963123, + "grad_norm": 18.630046844482422, + "learning_rate": 1.6000000000000001e-06, + "loss": 2.1612, + "step": 16 + }, + { + "epoch": 0.0009165408669398317, + "grad_norm": 12.403571128845215, + "learning_rate": 1.7000000000000002e-06, + "loss": 1.9358, + "step": 17 + }, + { + "epoch": 0.0009704550355833513, + "grad_norm": 7.713366508483887, + "learning_rate": 1.8000000000000001e-06, + "loss": 1.8522, + "step": 18 + }, + { + "epoch": 0.001024369204226871, + "grad_norm": 7.731616973876953, + "learning_rate": 1.9000000000000002e-06, + "loss": 1.7984, + "step": 19 + }, + { + "epoch": 0.0010782833728703904, + "grad_norm": 7.5799174308776855, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.701, + "step": 20 + }, + { + "epoch": 0.0011321975415139098, + "grad_norm": 5.5428080558776855, + "learning_rate": 2.1000000000000002e-06, + "loss": 1.624, + "step": 21 + }, + { + "epoch": 0.0011861117101574293, + "grad_norm": 5.851474285125732, + "learning_rate": 2.2e-06, + "loss": 1.8064, + "step": 22 + }, + { + "epoch": 0.001240025878800949, + "grad_norm": 5.243111610412598, + "learning_rate": 2.3000000000000004e-06, + "loss": 1.7246, + "step": 23 + }, + { + "epoch": 0.0012939400474444684, + "grad_norm": 4.835971832275391, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.763, + "step": 24 + }, + { + "epoch": 0.0013478542160879879, + "grad_norm": 4.127845287322998, + "learning_rate": 2.5e-06, + "loss": 1.5869, + "step": 25 + }, + { + "epoch": 0.0014017683847315074, + "grad_norm": 3.7648322582244873, + "learning_rate": 2.6e-06, + "loss": 1.5599, + "step": 26 + }, + { + "epoch": 0.001455682553375027, + "grad_norm": 3.5424962043762207, + "learning_rate": 2.7000000000000004e-06, + "loss": 1.4703, + "step": 27 + 
}, + { + "epoch": 0.0015095967220185465, + "grad_norm": 3.3707985877990723, + "learning_rate": 2.8000000000000003e-06, + "loss": 1.354, + "step": 28 + }, + { + "epoch": 0.001563510890662066, + "grad_norm": 4.71254825592041, + "learning_rate": 2.9e-06, + "loss": 1.8162, + "step": 29 + }, + { + "epoch": 0.0016174250593055854, + "grad_norm": 3.7660300731658936, + "learning_rate": 3e-06, + "loss": 1.5951, + "step": 30 + }, + { + "epoch": 0.001671339227949105, + "grad_norm": 3.4810571670532227, + "learning_rate": 3.1000000000000004e-06, + "loss": 1.5183, + "step": 31 + }, + { + "epoch": 0.0017252533965926246, + "grad_norm": 3.672693967819214, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.6374, + "step": 32 + }, + { + "epoch": 0.001779167565236144, + "grad_norm": 3.3589682579040527, + "learning_rate": 3.3000000000000006e-06, + "loss": 1.4371, + "step": 33 + }, + { + "epoch": 0.0018330817338796635, + "grad_norm": 3.6365807056427, + "learning_rate": 3.4000000000000005e-06, + "loss": 1.595, + "step": 34 + }, + { + "epoch": 0.0018869959025231832, + "grad_norm": 3.6467039585113525, + "learning_rate": 3.5e-06, + "loss": 1.5714, + "step": 35 + }, + { + "epoch": 0.0019409100711667026, + "grad_norm": 3.4684648513793945, + "learning_rate": 3.6000000000000003e-06, + "loss": 1.4897, + "step": 36 + }, + { + "epoch": 0.001994824239810222, + "grad_norm": 3.70845627784729, + "learning_rate": 3.7e-06, + "loss": 1.5954, + "step": 37 + }, + { + "epoch": 0.002048738408453742, + "grad_norm": 3.1803395748138428, + "learning_rate": 3.8000000000000005e-06, + "loss": 1.3976, + "step": 38 + }, + { + "epoch": 0.002102652577097261, + "grad_norm": 2.851703405380249, + "learning_rate": 3.900000000000001e-06, + "loss": 1.1894, + "step": 39 + }, + { + "epoch": 0.0021565667457407807, + "grad_norm": 2.832003593444824, + "learning_rate": 4.000000000000001e-06, + "loss": 1.353, + "step": 40 + }, + { + "epoch": 0.0022104809143843004, + "grad_norm": 3.397498607635498, + "learning_rate": 4.1e-06, + 
"loss": 1.4541, + "step": 41 + }, + { + "epoch": 0.0022643950830278196, + "grad_norm": 3.4537954330444336, + "learning_rate": 4.2000000000000004e-06, + "loss": 1.4475, + "step": 42 + }, + { + "epoch": 0.0023183092516713393, + "grad_norm": 3.1131632328033447, + "learning_rate": 4.3e-06, + "loss": 1.2707, + "step": 43 + }, + { + "epoch": 0.0023722234203148586, + "grad_norm": 3.0421881675720215, + "learning_rate": 4.4e-06, + "loss": 1.3418, + "step": 44 + }, + { + "epoch": 0.0024261375889583782, + "grad_norm": 3.528514862060547, + "learning_rate": 4.5e-06, + "loss": 1.4432, + "step": 45 + }, + { + "epoch": 0.002480051757601898, + "grad_norm": 3.6783225536346436, + "learning_rate": 4.600000000000001e-06, + "loss": 1.4863, + "step": 46 + }, + { + "epoch": 0.002533965926245417, + "grad_norm": 2.9829189777374268, + "learning_rate": 4.7e-06, + "loss": 1.2856, + "step": 47 + }, + { + "epoch": 0.002587880094888937, + "grad_norm": 3.4480350017547607, + "learning_rate": 4.800000000000001e-06, + "loss": 1.4129, + "step": 48 + }, + { + "epoch": 0.0026417942635324565, + "grad_norm": 3.4247214794158936, + "learning_rate": 4.9000000000000005e-06, + "loss": 1.3467, + "step": 49 + }, + { + "epoch": 0.0026957084321759758, + "grad_norm": 3.5268948078155518, + "learning_rate": 5e-06, + "loss": 1.4795, + "step": 50 + }, + { + "epoch": 0.0027496226008194955, + "grad_norm": 3.3228304386138916, + "learning_rate": 5.1e-06, + "loss": 1.461, + "step": 51 + }, + { + "epoch": 0.0028035367694630147, + "grad_norm": 3.365630865097046, + "learning_rate": 5.2e-06, + "loss": 1.2947, + "step": 52 + }, + { + "epoch": 0.0028574509381065344, + "grad_norm": 3.4889328479766846, + "learning_rate": 5.300000000000001e-06, + "loss": 1.432, + "step": 53 + }, + { + "epoch": 0.002911365106750054, + "grad_norm": 3.5767273902893066, + "learning_rate": 5.400000000000001e-06, + "loss": 1.3773, + "step": 54 + }, + { + "epoch": 0.0029652792753935733, + "grad_norm": 3.499298095703125, + "learning_rate": 
5.500000000000001e-06, + "loss": 1.4132, + "step": 55 + }, + { + "epoch": 0.003019193444037093, + "grad_norm": 3.6990244388580322, + "learning_rate": 5.600000000000001e-06, + "loss": 1.4595, + "step": 56 + }, + { + "epoch": 0.0030731076126806127, + "grad_norm": 3.0908327102661133, + "learning_rate": 5.7e-06, + "loss": 1.1873, + "step": 57 + }, + { + "epoch": 0.003127021781324132, + "grad_norm": 3.149425745010376, + "learning_rate": 5.8e-06, + "loss": 1.3306, + "step": 58 + }, + { + "epoch": 0.0031809359499676516, + "grad_norm": 3.193023204803467, + "learning_rate": 5.9e-06, + "loss": 1.3326, + "step": 59 + }, + { + "epoch": 0.003234850118611171, + "grad_norm": 3.610344409942627, + "learning_rate": 6e-06, + "loss": 1.4527, + "step": 60 + }, + { + "epoch": 0.0032887642872546905, + "grad_norm": 2.9877095222473145, + "learning_rate": 6.1e-06, + "loss": 1.2029, + "step": 61 + }, + { + "epoch": 0.00334267845589821, + "grad_norm": 3.0241923332214355, + "learning_rate": 6.200000000000001e-06, + "loss": 1.3413, + "step": 62 + }, + { + "epoch": 0.0033965926245417295, + "grad_norm": 3.212700366973877, + "learning_rate": 6.300000000000001e-06, + "loss": 1.3471, + "step": 63 + }, + { + "epoch": 0.003450506793185249, + "grad_norm": 2.7138960361480713, + "learning_rate": 6.4000000000000006e-06, + "loss": 1.0885, + "step": 64 + }, + { + "epoch": 0.0035044209618287684, + "grad_norm": 2.5690340995788574, + "learning_rate": 6.5000000000000004e-06, + "loss": 1.1168, + "step": 65 + }, + { + "epoch": 0.003558335130472288, + "grad_norm": 3.0344784259796143, + "learning_rate": 6.600000000000001e-06, + "loss": 1.2828, + "step": 66 + }, + { + "epoch": 0.0036122492991158077, + "grad_norm": 3.0589816570281982, + "learning_rate": 6.700000000000001e-06, + "loss": 1.2604, + "step": 67 + }, + { + "epoch": 0.003666163467759327, + "grad_norm": 2.676417112350464, + "learning_rate": 6.800000000000001e-06, + "loss": 1.1679, + "step": 68 + }, + { + "epoch": 0.0037200776364028467, + "grad_norm": 
2.6590960025787354, + "learning_rate": 6.9e-06, + "loss": 1.2283, + "step": 69 + }, + { + "epoch": 0.0037739918050463664, + "grad_norm": 2.6973354816436768, + "learning_rate": 7e-06, + "loss": 1.2028, + "step": 70 + }, + { + "epoch": 0.0038279059736898856, + "grad_norm": 2.7046608924865723, + "learning_rate": 7.100000000000001e-06, + "loss": 1.2629, + "step": 71 + }, + { + "epoch": 0.0038818201423334053, + "grad_norm": 2.2172696590423584, + "learning_rate": 7.2000000000000005e-06, + "loss": 1.1367, + "step": 72 + }, + { + "epoch": 0.0039357343109769245, + "grad_norm": 2.6138789653778076, + "learning_rate": 7.3e-06, + "loss": 1.3167, + "step": 73 + }, + { + "epoch": 0.003989648479620444, + "grad_norm": 2.2926838397979736, + "learning_rate": 7.4e-06, + "loss": 1.2909, + "step": 74 + }, + { + "epoch": 0.004043562648263964, + "grad_norm": 2.0647220611572266, + "learning_rate": 7.500000000000001e-06, + "loss": 1.2054, + "step": 75 + }, + { + "epoch": 0.004097476816907484, + "grad_norm": 2.1190452575683594, + "learning_rate": 7.600000000000001e-06, + "loss": 1.1497, + "step": 76 + }, + { + "epoch": 0.004151390985551002, + "grad_norm": 1.9973243474960327, + "learning_rate": 7.7e-06, + "loss": 1.1997, + "step": 77 + }, + { + "epoch": 0.004205305154194522, + "grad_norm": 2.11751651763916, + "learning_rate": 7.800000000000002e-06, + "loss": 1.2181, + "step": 78 + }, + { + "epoch": 0.004259219322838042, + "grad_norm": 1.8975950479507446, + "learning_rate": 7.9e-06, + "loss": 1.1582, + "step": 79 + }, + { + "epoch": 0.004313133491481561, + "grad_norm": 1.8368147611618042, + "learning_rate": 8.000000000000001e-06, + "loss": 1.1389, + "step": 80 + }, + { + "epoch": 0.004367047660125081, + "grad_norm": 1.7472988367080688, + "learning_rate": 8.1e-06, + "loss": 1.0959, + "step": 81 + }, + { + "epoch": 0.004420961828768601, + "grad_norm": 1.7325443029403687, + "learning_rate": 8.2e-06, + "loss": 1.1847, + "step": 82 + }, + { + "epoch": 0.00447487599741212, + "grad_norm": 
1.6171561479568481, + "learning_rate": 8.3e-06, + "loss": 0.9834, + "step": 83 + }, + { + "epoch": 0.004528790166055639, + "grad_norm": 1.6583327054977417, + "learning_rate": 8.400000000000001e-06, + "loss": 1.0413, + "step": 84 + }, + { + "epoch": 0.004582704334699159, + "grad_norm": 1.8914967775344849, + "learning_rate": 8.5e-06, + "loss": 1.2413, + "step": 85 + }, + { + "epoch": 0.004636618503342679, + "grad_norm": 1.6018317937850952, + "learning_rate": 8.6e-06, + "loss": 1.0577, + "step": 86 + }, + { + "epoch": 0.004690532671986198, + "grad_norm": 1.9170053005218506, + "learning_rate": 8.700000000000001e-06, + "loss": 1.2463, + "step": 87 + }, + { + "epoch": 0.004744446840629717, + "grad_norm": 1.666536569595337, + "learning_rate": 8.8e-06, + "loss": 1.0532, + "step": 88 + }, + { + "epoch": 0.004798361009273237, + "grad_norm": 1.660115361213684, + "learning_rate": 8.900000000000001e-06, + "loss": 1.0514, + "step": 89 + }, + { + "epoch": 0.0048522751779167565, + "grad_norm": 1.8667477369308472, + "learning_rate": 9e-06, + "loss": 1.2039, + "step": 90 + }, + { + "epoch": 0.004906189346560276, + "grad_norm": 1.9490039348602295, + "learning_rate": 9.100000000000001e-06, + "loss": 1.1804, + "step": 91 + }, + { + "epoch": 0.004960103515203796, + "grad_norm": 1.8415377140045166, + "learning_rate": 9.200000000000002e-06, + "loss": 1.1435, + "step": 92 + }, + { + "epoch": 0.005014017683847315, + "grad_norm": 1.8571438789367676, + "learning_rate": 9.3e-06, + "loss": 1.0974, + "step": 93 + }, + { + "epoch": 0.005067931852490834, + "grad_norm": 1.8480113744735718, + "learning_rate": 9.4e-06, + "loss": 1.149, + "step": 94 + }, + { + "epoch": 0.005121846021134354, + "grad_norm": 2.003490447998047, + "learning_rate": 9.5e-06, + "loss": 1.1954, + "step": 95 + }, + { + "epoch": 0.005175760189777874, + "grad_norm": 1.8002668619155884, + "learning_rate": 9.600000000000001e-06, + "loss": 0.9953, + "step": 96 + }, + { + "epoch": 0.005229674358421393, + "grad_norm": 
1.9040817022323608, + "learning_rate": 9.7e-06, + "loss": 1.1195, + "step": 97 + }, + { + "epoch": 0.005283588527064913, + "grad_norm": 1.8311433792114258, + "learning_rate": 9.800000000000001e-06, + "loss": 1.083, + "step": 98 + }, + { + "epoch": 0.005337502695708432, + "grad_norm": 1.9509624242782593, + "learning_rate": 9.9e-06, + "loss": 1.176, + "step": 99 + }, + { + "epoch": 0.0053914168643519516, + "grad_norm": 2.0624589920043945, + "learning_rate": 1e-05, + "loss": 1.119, + "step": 100 + }, + { + "epoch": 0.005445331032995471, + "grad_norm": 1.9618796110153198, + "learning_rate": 9.999999995505339e-06, + "loss": 1.1371, + "step": 101 + }, + { + "epoch": 0.005499245201638991, + "grad_norm": 1.946245551109314, + "learning_rate": 9.999999982021349e-06, + "loss": 0.9736, + "step": 102 + }, + { + "epoch": 0.005553159370282511, + "grad_norm": 1.9871301651000977, + "learning_rate": 9.999999959548035e-06, + "loss": 1.1077, + "step": 103 + }, + { + "epoch": 0.005607073538926029, + "grad_norm": 1.86216402053833, + "learning_rate": 9.999999928085396e-06, + "loss": 1.0882, + "step": 104 + }, + { + "epoch": 0.005660987707569549, + "grad_norm": 1.8447723388671875, + "learning_rate": 9.999999887633432e-06, + "loss": 1.0344, + "step": 105 + }, + { + "epoch": 0.005714901876213069, + "grad_norm": 1.8345638513565063, + "learning_rate": 9.99999983819214e-06, + "loss": 1.1077, + "step": 106 + }, + { + "epoch": 0.0057688160448565885, + "grad_norm": 1.8410178422927856, + "learning_rate": 9.999999779761524e-06, + "loss": 1.0824, + "step": 107 + }, + { + "epoch": 0.005822730213500108, + "grad_norm": 1.5881969928741455, + "learning_rate": 9.999999712341583e-06, + "loss": 0.9439, + "step": 108 + }, + { + "epoch": 0.005876644382143627, + "grad_norm": 1.6704047918319702, + "learning_rate": 9.999999635932316e-06, + "loss": 1.033, + "step": 109 + }, + { + "epoch": 0.005930558550787147, + "grad_norm": 1.792449712753296, + "learning_rate": 9.999999550533726e-06, + "loss": 1.0279, + "step": 
110 + }, + { + "epoch": 0.005984472719430666, + "grad_norm": 1.6515668630599976, + "learning_rate": 9.999999456145809e-06, + "loss": 1.0301, + "step": 111 + }, + { + "epoch": 0.006038386888074186, + "grad_norm": 1.8541395664215088, + "learning_rate": 9.999999352768568e-06, + "loss": 1.1057, + "step": 112 + }, + { + "epoch": 0.006092301056717706, + "grad_norm": 1.6490236520767212, + "learning_rate": 9.999999240402002e-06, + "loss": 1.0523, + "step": 113 + }, + { + "epoch": 0.006146215225361225, + "grad_norm": 1.655333161354065, + "learning_rate": 9.999999119046113e-06, + "loss": 1.0448, + "step": 114 + }, + { + "epoch": 0.006200129394004744, + "grad_norm": 1.5721609592437744, + "learning_rate": 9.999998988700899e-06, + "loss": 0.9883, + "step": 115 + }, + { + "epoch": 0.006254043562648264, + "grad_norm": 1.6411349773406982, + "learning_rate": 9.99999884936636e-06, + "loss": 1.0255, + "step": 116 + }, + { + "epoch": 0.0063079577312917835, + "grad_norm": 1.6399502754211426, + "learning_rate": 9.999998701042501e-06, + "loss": 1.0146, + "step": 117 + }, + { + "epoch": 0.006361871899935303, + "grad_norm": 1.615026831626892, + "learning_rate": 9.999998543729316e-06, + "loss": 1.0022, + "step": 118 + }, + { + "epoch": 0.006415786068578823, + "grad_norm": 1.4867664575576782, + "learning_rate": 9.99999837742681e-06, + "loss": 1.0164, + "step": 119 + }, + { + "epoch": 0.006469700237222342, + "grad_norm": 1.540153980255127, + "learning_rate": 9.999998202134979e-06, + "loss": 0.989, + "step": 120 + }, + { + "epoch": 0.006523614405865861, + "grad_norm": 1.5535691976547241, + "learning_rate": 9.999998017853825e-06, + "loss": 0.9942, + "step": 121 + }, + { + "epoch": 0.006577528574509381, + "grad_norm": 1.4892929792404175, + "learning_rate": 9.999997824583351e-06, + "loss": 1.0537, + "step": 122 + }, + { + "epoch": 0.006631442743152901, + "grad_norm": 1.4674094915390015, + "learning_rate": 9.999997622323554e-06, + "loss": 1.0239, + "step": 123 + }, + { + "epoch": 
0.00668535691179642, + "grad_norm": 1.394027590751648, + "learning_rate": 9.999997411074436e-06, + "loss": 0.9781, + "step": 124 + }, + { + "epoch": 0.006739271080439939, + "grad_norm": 1.372728705406189, + "learning_rate": 9.999997190835999e-06, + "loss": 1.0433, + "step": 125 + }, + { + "epoch": 0.006793185249083459, + "grad_norm": 1.2535908222198486, + "learning_rate": 9.999996961608238e-06, + "loss": 0.958, + "step": 126 + }, + { + "epoch": 0.006847099417726979, + "grad_norm": 1.337633490562439, + "learning_rate": 9.999996723391158e-06, + "loss": 1.0213, + "step": 127 + }, + { + "epoch": 0.006901013586370498, + "grad_norm": 1.3640319108963013, + "learning_rate": 9.999996476184759e-06, + "loss": 1.0432, + "step": 128 + }, + { + "epoch": 0.006954927755014018, + "grad_norm": 1.2663391828536987, + "learning_rate": 9.99999621998904e-06, + "loss": 1.0154, + "step": 129 + }, + { + "epoch": 0.007008841923657537, + "grad_norm": 1.450737476348877, + "learning_rate": 9.999995954804004e-06, + "loss": 1.0074, + "step": 130 + }, + { + "epoch": 0.0070627560923010565, + "grad_norm": 1.2757987976074219, + "learning_rate": 9.999995680629649e-06, + "loss": 0.9996, + "step": 131 + }, + { + "epoch": 0.007116670260944576, + "grad_norm": 1.3978132009506226, + "learning_rate": 9.999995397465974e-06, + "loss": 1.04, + "step": 132 + }, + { + "epoch": 0.007170584429588096, + "grad_norm": 1.3167297840118408, + "learning_rate": 9.999995105312982e-06, + "loss": 1.0069, + "step": 133 + }, + { + "epoch": 0.0072244985982316155, + "grad_norm": 1.1626744270324707, + "learning_rate": 9.999994804170674e-06, + "loss": 0.9722, + "step": 134 + }, + { + "epoch": 0.007278412766875135, + "grad_norm": 1.354797601699829, + "learning_rate": 9.99999449403905e-06, + "loss": 0.9019, + "step": 135 + }, + { + "epoch": 0.007332326935518654, + "grad_norm": 1.2605732679367065, + "learning_rate": 9.99999417491811e-06, + "loss": 1.0038, + "step": 136 + }, + { + "epoch": 0.007386241104162174, + "grad_norm": 
1.3804657459259033, + "learning_rate": 9.999993846807855e-06, + "loss": 1.0139, + "step": 137 + }, + { + "epoch": 0.007440155272805693, + "grad_norm": 1.3001742362976074, + "learning_rate": 9.999993509708286e-06, + "loss": 1.1436, + "step": 138 + }, + { + "epoch": 0.007494069441449213, + "grad_norm": 1.2776422500610352, + "learning_rate": 9.999993163619401e-06, + "loss": 0.9792, + "step": 139 + }, + { + "epoch": 0.007547983610092733, + "grad_norm": 1.2149187326431274, + "learning_rate": 9.999992808541204e-06, + "loss": 0.963, + "step": 140 + }, + { + "epoch": 0.0076018977787362515, + "grad_norm": 1.341806173324585, + "learning_rate": 9.999992444473694e-06, + "loss": 0.9639, + "step": 141 + }, + { + "epoch": 0.007655811947379771, + "grad_norm": 1.2565757036209106, + "learning_rate": 9.999992071416874e-06, + "loss": 0.9193, + "step": 142 + }, + { + "epoch": 0.007709726116023291, + "grad_norm": 1.3059918880462646, + "learning_rate": 9.99999168937074e-06, + "loss": 0.9632, + "step": 143 + }, + { + "epoch": 0.0077636402846668106, + "grad_norm": 1.1719332933425903, + "learning_rate": 9.999991298335295e-06, + "loss": 0.9687, + "step": 144 + }, + { + "epoch": 0.00781755445331033, + "grad_norm": 1.125950813293457, + "learning_rate": 9.999990898310542e-06, + "loss": 0.968, + "step": 145 + }, + { + "epoch": 0.007871468621953849, + "grad_norm": 1.2400416135787964, + "learning_rate": 9.999990489296478e-06, + "loss": 0.972, + "step": 146 + }, + { + "epoch": 0.007925382790597369, + "grad_norm": 1.172117829322815, + "learning_rate": 9.999990071293106e-06, + "loss": 0.9243, + "step": 147 + }, + { + "epoch": 0.007979296959240888, + "grad_norm": 1.240317463874817, + "learning_rate": 9.999989644300427e-06, + "loss": 1.0655, + "step": 148 + }, + { + "epoch": 0.008033211127884408, + "grad_norm": 1.1535708904266357, + "learning_rate": 9.999989208318438e-06, + "loss": 0.9871, + "step": 149 + }, + { + "epoch": 0.008087125296527928, + "grad_norm": 1.2711198329925537, + "learning_rate": 
9.999988763347145e-06, + "loss": 1.0307, + "step": 150 + }, + { + "epoch": 0.008141039465171447, + "grad_norm": 1.2345954179763794, + "learning_rate": 9.999988309386548e-06, + "loss": 1.1343, + "step": 151 + }, + { + "epoch": 0.008194953633814967, + "grad_norm": 1.2489601373672485, + "learning_rate": 9.999987846436645e-06, + "loss": 1.0303, + "step": 152 + }, + { + "epoch": 0.008248867802458487, + "grad_norm": 1.264240026473999, + "learning_rate": 9.999987374497439e-06, + "loss": 0.9562, + "step": 153 + }, + { + "epoch": 0.008302781971102005, + "grad_norm": 1.2613575458526611, + "learning_rate": 9.99998689356893e-06, + "loss": 0.954, + "step": 154 + }, + { + "epoch": 0.008356696139745524, + "grad_norm": 1.2091072797775269, + "learning_rate": 9.999986403651116e-06, + "loss": 1.0734, + "step": 155 + }, + { + "epoch": 0.008410610308389044, + "grad_norm": 1.18421471118927, + "learning_rate": 9.999985904744002e-06, + "loss": 0.9167, + "step": 156 + }, + { + "epoch": 0.008464524477032564, + "grad_norm": 1.0399659872055054, + "learning_rate": 9.99998539684759e-06, + "loss": 0.9068, + "step": 157 + }, + { + "epoch": 0.008518438645676083, + "grad_norm": 1.1292288303375244, + "learning_rate": 9.999984879961877e-06, + "loss": 1.0027, + "step": 158 + }, + { + "epoch": 0.008572352814319603, + "grad_norm": 1.2592105865478516, + "learning_rate": 9.999984354086867e-06, + "loss": 1.0794, + "step": 159 + }, + { + "epoch": 0.008626266982963123, + "grad_norm": 1.1646504402160645, + "learning_rate": 9.999983819222558e-06, + "loss": 1.0468, + "step": 160 + }, + { + "epoch": 0.008680181151606643, + "grad_norm": 1.156711220741272, + "learning_rate": 9.999983275368952e-06, + "loss": 0.9053, + "step": 161 + }, + { + "epoch": 0.008734095320250162, + "grad_norm": 1.1169341802597046, + "learning_rate": 9.999982722526051e-06, + "loss": 0.97, + "step": 162 + }, + { + "epoch": 0.008788009488893682, + "grad_norm": 1.3474149703979492, + "learning_rate": 9.999982160693856e-06, + "loss": 1.0221, + 
"step": 163 + }, + { + "epoch": 0.008841923657537202, + "grad_norm": 1.2021468877792358, + "learning_rate": 9.999981589872368e-06, + "loss": 0.9303, + "step": 164 + }, + { + "epoch": 0.00889583782618072, + "grad_norm": 1.0625534057617188, + "learning_rate": 9.999981010061586e-06, + "loss": 0.8765, + "step": 165 + }, + { + "epoch": 0.00894975199482424, + "grad_norm": 1.2688498497009277, + "learning_rate": 9.999980421261512e-06, + "loss": 1.0163, + "step": 166 + }, + { + "epoch": 0.009003666163467759, + "grad_norm": 1.122948408126831, + "learning_rate": 9.999979823472148e-06, + "loss": 0.9953, + "step": 167 + }, + { + "epoch": 0.009057580332111279, + "grad_norm": 1.1817872524261475, + "learning_rate": 9.999979216693495e-06, + "loss": 1.0774, + "step": 168 + }, + { + "epoch": 0.009111494500754798, + "grad_norm": 1.1483280658721924, + "learning_rate": 9.999978600925553e-06, + "loss": 1.0105, + "step": 169 + }, + { + "epoch": 0.009165408669398318, + "grad_norm": 1.4039335250854492, + "learning_rate": 9.999977976168325e-06, + "loss": 0.944, + "step": 170 + }, + { + "epoch": 0.009219322838041838, + "grad_norm": 1.1459723711013794, + "learning_rate": 9.999977342421812e-06, + "loss": 0.9208, + "step": 171 + }, + { + "epoch": 0.009273237006685357, + "grad_norm": 1.0897774696350098, + "learning_rate": 9.999976699686011e-06, + "loss": 0.8719, + "step": 172 + }, + { + "epoch": 0.009327151175328877, + "grad_norm": 1.206467866897583, + "learning_rate": 9.999976047960928e-06, + "loss": 1.0645, + "step": 173 + }, + { + "epoch": 0.009381065343972397, + "grad_norm": 1.004550814628601, + "learning_rate": 9.999975387246563e-06, + "loss": 0.9317, + "step": 174 + }, + { + "epoch": 0.009434979512615916, + "grad_norm": 1.2359992265701294, + "learning_rate": 9.999974717542916e-06, + "loss": 1.1136, + "step": 175 + }, + { + "epoch": 0.009488893681259434, + "grad_norm": 1.1922352313995361, + "learning_rate": 9.999974038849989e-06, + "loss": 1.0307, + "step": 176 + }, + { + "epoch": 
0.009542807849902954, + "grad_norm": 1.1597613096237183, + "learning_rate": 9.999973351167782e-06, + "loss": 1.0275, + "step": 177 + }, + { + "epoch": 0.009596722018546474, + "grad_norm": 1.172133445739746, + "learning_rate": 9.999972654496298e-06, + "loss": 0.9269, + "step": 178 + }, + { + "epoch": 0.009650636187189993, + "grad_norm": 1.1879733800888062, + "learning_rate": 9.999971948835538e-06, + "loss": 0.9547, + "step": 179 + }, + { + "epoch": 0.009704550355833513, + "grad_norm": 1.0029833316802979, + "learning_rate": 9.999971234185502e-06, + "loss": 0.8994, + "step": 180 + }, + { + "epoch": 0.009758464524477033, + "grad_norm": 1.0769891738891602, + "learning_rate": 9.999970510546194e-06, + "loss": 0.9107, + "step": 181 + }, + { + "epoch": 0.009812378693120552, + "grad_norm": 1.3288064002990723, + "learning_rate": 9.99996977791761e-06, + "loss": 1.0116, + "step": 182 + }, + { + "epoch": 0.009866292861764072, + "grad_norm": 1.142452597618103, + "learning_rate": 9.999969036299757e-06, + "loss": 0.9367, + "step": 183 + }, + { + "epoch": 0.009920207030407592, + "grad_norm": 1.2458518743515015, + "learning_rate": 9.999968285692632e-06, + "loss": 1.1398, + "step": 184 + }, + { + "epoch": 0.009974121199051111, + "grad_norm": 1.3373422622680664, + "learning_rate": 9.99996752609624e-06, + "loss": 0.959, + "step": 185 + }, + { + "epoch": 0.01002803536769463, + "grad_norm": 1.2288920879364014, + "learning_rate": 9.99996675751058e-06, + "loss": 0.9908, + "step": 186 + }, + { + "epoch": 0.010081949536338149, + "grad_norm": 1.1954001188278198, + "learning_rate": 9.999965979935656e-06, + "loss": 0.9332, + "step": 187 + }, + { + "epoch": 0.010135863704981669, + "grad_norm": 1.171021819114685, + "learning_rate": 9.999965193371466e-06, + "loss": 0.9119, + "step": 188 + }, + { + "epoch": 0.010189777873625188, + "grad_norm": 1.025169014930725, + "learning_rate": 9.999964397818013e-06, + "loss": 0.784, + "step": 189 + }, + { + "epoch": 0.010243692042268708, + "grad_norm": 
1.1340326070785522, + "learning_rate": 9.999963593275298e-06, + "loss": 1.0036, + "step": 190 + }, + { + "epoch": 0.010297606210912228, + "grad_norm": 1.0302847623825073, + "learning_rate": 9.999962779743324e-06, + "loss": 0.8293, + "step": 191 + }, + { + "epoch": 0.010351520379555747, + "grad_norm": 1.2410109043121338, + "learning_rate": 9.99996195722209e-06, + "loss": 0.9507, + "step": 192 + }, + { + "epoch": 0.010405434548199267, + "grad_norm": 1.2054308652877808, + "learning_rate": 9.9999611257116e-06, + "loss": 0.9356, + "step": 193 + }, + { + "epoch": 0.010459348716842787, + "grad_norm": 1.2046679258346558, + "learning_rate": 9.999960285211853e-06, + "loss": 1.0638, + "step": 194 + }, + { + "epoch": 0.010513262885486306, + "grad_norm": 1.4594306945800781, + "learning_rate": 9.999959435722852e-06, + "loss": 0.9624, + "step": 195 + }, + { + "epoch": 0.010567177054129826, + "grad_norm": 1.0909247398376465, + "learning_rate": 9.999958577244598e-06, + "loss": 0.9503, + "step": 196 + }, + { + "epoch": 0.010621091222773344, + "grad_norm": 1.1524754762649536, + "learning_rate": 9.999957709777094e-06, + "loss": 0.8954, + "step": 197 + }, + { + "epoch": 0.010675005391416864, + "grad_norm": 1.4128906726837158, + "learning_rate": 9.99995683332034e-06, + "loss": 0.8903, + "step": 198 + }, + { + "epoch": 0.010728919560060383, + "grad_norm": 1.1304652690887451, + "learning_rate": 9.999955947874338e-06, + "loss": 0.9247, + "step": 199 + }, + { + "epoch": 0.010782833728703903, + "grad_norm": 1.2978957891464233, + "learning_rate": 9.99995505343909e-06, + "loss": 0.9473, + "step": 200 + }, + { + "epoch": 0.010836747897347423, + "grad_norm": 1.0742554664611816, + "learning_rate": 9.999954150014595e-06, + "loss": 0.9626, + "step": 201 + }, + { + "epoch": 0.010890662065990942, + "grad_norm": 1.0707745552062988, + "learning_rate": 9.999953237600859e-06, + "loss": 0.8721, + "step": 202 + }, + { + "epoch": 0.010944576234634462, + "grad_norm": 1.17974853515625, + "learning_rate": 
9.99995231619788e-06, + "loss": 1.0059, + "step": 203 + }, + { + "epoch": 0.010998490403277982, + "grad_norm": 1.0108370780944824, + "learning_rate": 9.999951385805662e-06, + "loss": 0.9527, + "step": 204 + }, + { + "epoch": 0.011052404571921502, + "grad_norm": 0.9983445405960083, + "learning_rate": 9.999950446424204e-06, + "loss": 0.7626, + "step": 205 + }, + { + "epoch": 0.011106318740565021, + "grad_norm": 1.0860002040863037, + "learning_rate": 9.99994949805351e-06, + "loss": 0.9591, + "step": 206 + }, + { + "epoch": 0.01116023290920854, + "grad_norm": 1.0447322130203247, + "learning_rate": 9.999948540693584e-06, + "loss": 0.9861, + "step": 207 + }, + { + "epoch": 0.011214147077852059, + "grad_norm": 1.2582998275756836, + "learning_rate": 9.999947574344423e-06, + "loss": 0.8949, + "step": 208 + }, + { + "epoch": 0.011268061246495579, + "grad_norm": 1.1507002115249634, + "learning_rate": 9.99994659900603e-06, + "loss": 0.918, + "step": 209 + }, + { + "epoch": 0.011321975415139098, + "grad_norm": 1.135169267654419, + "learning_rate": 9.999945614678408e-06, + "loss": 0.9891, + "step": 210 + }, + { + "epoch": 0.011375889583782618, + "grad_norm": 1.1746275424957275, + "learning_rate": 9.999944621361558e-06, + "loss": 1.0186, + "step": 211 + }, + { + "epoch": 0.011429803752426138, + "grad_norm": 1.1137248277664185, + "learning_rate": 9.999943619055483e-06, + "loss": 0.9584, + "step": 212 + }, + { + "epoch": 0.011483717921069657, + "grad_norm": 1.336651086807251, + "learning_rate": 9.999942607760182e-06, + "loss": 1.091, + "step": 213 + }, + { + "epoch": 0.011537632089713177, + "grad_norm": 1.1966856718063354, + "learning_rate": 9.999941587475658e-06, + "loss": 0.9761, + "step": 214 + }, + { + "epoch": 0.011591546258356697, + "grad_norm": 1.0843144655227661, + "learning_rate": 9.999940558201915e-06, + "loss": 0.8917, + "step": 215 + }, + { + "epoch": 0.011645460427000216, + "grad_norm": 1.2089293003082275, + "learning_rate": 9.999939519938953e-06, + "loss": 0.9704, + 
"step": 216 + }, + { + "epoch": 0.011699374595643736, + "grad_norm": 1.2409982681274414, + "learning_rate": 9.999938472686775e-06, + "loss": 0.9949, + "step": 217 + }, + { + "epoch": 0.011753288764287254, + "grad_norm": 1.1310094594955444, + "learning_rate": 9.99993741644538e-06, + "loss": 0.9666, + "step": 218 + }, + { + "epoch": 0.011807202932930774, + "grad_norm": 1.120510220527649, + "learning_rate": 9.999936351214772e-06, + "loss": 0.8844, + "step": 219 + }, + { + "epoch": 0.011861117101574293, + "grad_norm": 1.0931518077850342, + "learning_rate": 9.999935276994954e-06, + "loss": 0.9647, + "step": 220 + }, + { + "epoch": 0.011915031270217813, + "grad_norm": 1.2821122407913208, + "learning_rate": 9.999934193785926e-06, + "loss": 1.0533, + "step": 221 + }, + { + "epoch": 0.011968945438861333, + "grad_norm": 1.183580756187439, + "learning_rate": 9.999933101587691e-06, + "loss": 0.9196, + "step": 222 + }, + { + "epoch": 0.012022859607504852, + "grad_norm": 1.045825719833374, + "learning_rate": 9.99993200040025e-06, + "loss": 0.8953, + "step": 223 + }, + { + "epoch": 0.012076773776148372, + "grad_norm": 1.0963969230651855, + "learning_rate": 9.999930890223605e-06, + "loss": 0.9723, + "step": 224 + }, + { + "epoch": 0.012130687944791892, + "grad_norm": 1.0356731414794922, + "learning_rate": 9.999929771057761e-06, + "loss": 1.0215, + "step": 225 + }, + { + "epoch": 0.012184602113435411, + "grad_norm": 1.112277626991272, + "learning_rate": 9.999928642902717e-06, + "loss": 0.9886, + "step": 226 + }, + { + "epoch": 0.012238516282078931, + "grad_norm": 0.9969072937965393, + "learning_rate": 9.999927505758475e-06, + "loss": 0.8601, + "step": 227 + }, + { + "epoch": 0.01229243045072245, + "grad_norm": 1.123781442642212, + "learning_rate": 9.999926359625036e-06, + "loss": 0.9894, + "step": 228 + }, + { + "epoch": 0.012346344619365969, + "grad_norm": 1.2122100591659546, + "learning_rate": 9.999925204502406e-06, + "loss": 1.0783, + "step": 229 + }, + { + "epoch": 
0.012400258788009488, + "grad_norm": 1.1256672143936157, + "learning_rate": 9.999924040390584e-06, + "loss": 0.9116, + "step": 230 + }, + { + "epoch": 0.012454172956653008, + "grad_norm": 1.0646952390670776, + "learning_rate": 9.999922867289573e-06, + "loss": 0.8993, + "step": 231 + }, + { + "epoch": 0.012508087125296528, + "grad_norm": 1.194676399230957, + "learning_rate": 9.999921685199376e-06, + "loss": 1.0377, + "step": 232 + }, + { + "epoch": 0.012562001293940047, + "grad_norm": 1.0519152879714966, + "learning_rate": 9.999920494119992e-06, + "loss": 0.8283, + "step": 233 + }, + { + "epoch": 0.012615915462583567, + "grad_norm": 1.243249773979187, + "learning_rate": 9.999919294051427e-06, + "loss": 0.9741, + "step": 234 + }, + { + "epoch": 0.012669829631227087, + "grad_norm": 1.1071687936782837, + "learning_rate": 9.999918084993681e-06, + "loss": 1.0402, + "step": 235 + }, + { + "epoch": 0.012723743799870606, + "grad_norm": 1.1224809885025024, + "learning_rate": 9.999916866946757e-06, + "loss": 0.8793, + "step": 236 + }, + { + "epoch": 0.012777657968514126, + "grad_norm": 1.0458532571792603, + "learning_rate": 9.999915639910656e-06, + "loss": 0.9855, + "step": 237 + }, + { + "epoch": 0.012831572137157646, + "grad_norm": 1.0610811710357666, + "learning_rate": 9.999914403885383e-06, + "loss": 0.8092, + "step": 238 + }, + { + "epoch": 0.012885486305801164, + "grad_norm": 1.2818992137908936, + "learning_rate": 9.999913158870936e-06, + "loss": 1.0101, + "step": 239 + }, + { + "epoch": 0.012939400474444683, + "grad_norm": 1.110400915145874, + "learning_rate": 9.999911904867319e-06, + "loss": 0.9782, + "step": 240 + }, + { + "epoch": 0.012993314643088203, + "grad_norm": 1.3290835618972778, + "learning_rate": 9.999910641874537e-06, + "loss": 1.0683, + "step": 241 + }, + { + "epoch": 0.013047228811731723, + "grad_norm": 1.1448980569839478, + "learning_rate": 9.999909369892588e-06, + "loss": 0.9223, + "step": 242 + }, + { + "epoch": 0.013101142980375242, + "grad_norm": 
1.1710877418518066, + "learning_rate": 9.999908088921477e-06, + "loss": 0.8022, + "step": 243 + }, + { + "epoch": 0.013155057149018762, + "grad_norm": 1.1242793798446655, + "learning_rate": 9.999906798961207e-06, + "loss": 0.9238, + "step": 244 + }, + { + "epoch": 0.013208971317662282, + "grad_norm": 1.0338802337646484, + "learning_rate": 9.999905500011778e-06, + "loss": 0.8386, + "step": 245 + }, + { + "epoch": 0.013262885486305801, + "grad_norm": 1.0910224914550781, + "learning_rate": 9.999904192073193e-06, + "loss": 0.937, + "step": 246 + }, + { + "epoch": 0.013316799654949321, + "grad_norm": 1.297788143157959, + "learning_rate": 9.999902875145453e-06, + "loss": 0.9054, + "step": 247 + }, + { + "epoch": 0.01337071382359284, + "grad_norm": 1.1317543983459473, + "learning_rate": 9.999901549228564e-06, + "loss": 0.9418, + "step": 248 + }, + { + "epoch": 0.01342462799223636, + "grad_norm": 1.0944132804870605, + "learning_rate": 9.999900214322526e-06, + "loss": 0.9445, + "step": 249 + }, + { + "epoch": 0.013478542160879878, + "grad_norm": 1.4942843914031982, + "learning_rate": 9.999898870427342e-06, + "loss": 0.8956, + "step": 250 + }, + { + "epoch": 0.013532456329523398, + "grad_norm": 1.0630019903182983, + "learning_rate": 9.999897517543013e-06, + "loss": 0.8381, + "step": 251 + }, + { + "epoch": 0.013586370498166918, + "grad_norm": 1.65073561668396, + "learning_rate": 9.999896155669544e-06, + "loss": 1.0148, + "step": 252 + }, + { + "epoch": 0.013640284666810438, + "grad_norm": 1.035731315612793, + "learning_rate": 9.999894784806936e-06, + "loss": 0.8092, + "step": 253 + }, + { + "epoch": 0.013694198835453957, + "grad_norm": 1.308863639831543, + "learning_rate": 9.99989340495519e-06, + "loss": 0.9742, + "step": 254 + }, + { + "epoch": 0.013748113004097477, + "grad_norm": 1.1512938737869263, + "learning_rate": 9.999892016114313e-06, + "loss": 0.8747, + "step": 255 + }, + { + "epoch": 0.013802027172740997, + "grad_norm": 0.9977009296417236, + "learning_rate": 
9.9998906182843e-06, + "loss": 0.8183, + "step": 256 + }, + { + "epoch": 0.013855941341384516, + "grad_norm": 1.2228175401687622, + "learning_rate": 9.99988921146516e-06, + "loss": 0.9917, + "step": 257 + }, + { + "epoch": 0.013909855510028036, + "grad_norm": 1.0753847360610962, + "learning_rate": 9.999887795656896e-06, + "loss": 1.0063, + "step": 258 + }, + { + "epoch": 0.013963769678671556, + "grad_norm": 1.0010429620742798, + "learning_rate": 9.999886370859506e-06, + "loss": 0.9315, + "step": 259 + }, + { + "epoch": 0.014017683847315074, + "grad_norm": 1.2038911581039429, + "learning_rate": 9.999884937072995e-06, + "loss": 0.8764, + "step": 260 + }, + { + "epoch": 0.014071598015958593, + "grad_norm": 1.1268917322158813, + "learning_rate": 9.999883494297365e-06, + "loss": 1.0059, + "step": 261 + }, + { + "epoch": 0.014125512184602113, + "grad_norm": 1.1053709983825684, + "learning_rate": 9.999882042532619e-06, + "loss": 0.8866, + "step": 262 + }, + { + "epoch": 0.014179426353245633, + "grad_norm": 1.091145396232605, + "learning_rate": 9.999880581778758e-06, + "loss": 1.0415, + "step": 263 + }, + { + "epoch": 0.014233340521889152, + "grad_norm": 1.0019958019256592, + "learning_rate": 9.999879112035786e-06, + "loss": 0.8177, + "step": 264 + }, + { + "epoch": 0.014287254690532672, + "grad_norm": 1.1044156551361084, + "learning_rate": 9.999877633303708e-06, + "loss": 0.9508, + "step": 265 + }, + { + "epoch": 0.014341168859176192, + "grad_norm": 0.9750218391418457, + "learning_rate": 9.999876145582524e-06, + "loss": 0.8501, + "step": 266 + }, + { + "epoch": 0.014395083027819711, + "grad_norm": 1.4015804529190063, + "learning_rate": 9.999874648872235e-06, + "loss": 0.9491, + "step": 267 + }, + { + "epoch": 0.014448997196463231, + "grad_norm": 1.066422939300537, + "learning_rate": 9.999873143172848e-06, + "loss": 1.0104, + "step": 268 + }, + { + "epoch": 0.01450291136510675, + "grad_norm": 1.1133167743682861, + "learning_rate": 9.99987162848436e-06, + "loss": 1.0142, + 
"step": 269 + }, + { + "epoch": 0.01455682553375027, + "grad_norm": 1.1259140968322754, + "learning_rate": 9.999870104806782e-06, + "loss": 0.9803, + "step": 270 + }, + { + "epoch": 0.014610739702393788, + "grad_norm": 1.0813393592834473, + "learning_rate": 9.999868572140108e-06, + "loss": 0.8728, + "step": 271 + }, + { + "epoch": 0.014664653871037308, + "grad_norm": 0.9939939379692078, + "learning_rate": 9.999867030484347e-06, + "loss": 0.8826, + "step": 272 + }, + { + "epoch": 0.014718568039680828, + "grad_norm": 1.0081939697265625, + "learning_rate": 9.999865479839499e-06, + "loss": 0.8682, + "step": 273 + }, + { + "epoch": 0.014772482208324347, + "grad_norm": 1.0190658569335938, + "learning_rate": 9.999863920205567e-06, + "loss": 0.9094, + "step": 274 + }, + { + "epoch": 0.014826396376967867, + "grad_norm": 1.0702111721038818, + "learning_rate": 9.999862351582553e-06, + "loss": 0.9244, + "step": 275 + }, + { + "epoch": 0.014880310545611387, + "grad_norm": 1.0891972780227661, + "learning_rate": 9.999860773970461e-06, + "loss": 1.0318, + "step": 276 + }, + { + "epoch": 0.014934224714254906, + "grad_norm": 0.9788139462471008, + "learning_rate": 9.999859187369294e-06, + "loss": 0.8779, + "step": 277 + }, + { + "epoch": 0.014988138882898426, + "grad_norm": 1.0678125619888306, + "learning_rate": 9.999857591779055e-06, + "loss": 0.8962, + "step": 278 + }, + { + "epoch": 0.015042053051541946, + "grad_norm": 0.9882293343544006, + "learning_rate": 9.999855987199747e-06, + "loss": 0.9082, + "step": 279 + }, + { + "epoch": 0.015095967220185465, + "grad_norm": 0.9987571835517883, + "learning_rate": 9.999854373631371e-06, + "loss": 0.9708, + "step": 280 + }, + { + "epoch": 0.015149881388828985, + "grad_norm": 1.0238722562789917, + "learning_rate": 9.99985275107393e-06, + "loss": 0.9461, + "step": 281 + }, + { + "epoch": 0.015203795557472503, + "grad_norm": 0.9628013372421265, + "learning_rate": 9.999851119527431e-06, + "loss": 0.9412, + "step": 282 + }, + { + "epoch": 
0.015257709726116023, + "grad_norm": 1.0021862983703613, + "learning_rate": 9.999849478991873e-06, + "loss": 0.8461, + "step": 283 + }, + { + "epoch": 0.015311623894759542, + "grad_norm": 0.9776142239570618, + "learning_rate": 9.99984782946726e-06, + "loss": 0.962, + "step": 284 + }, + { + "epoch": 0.015365538063403062, + "grad_norm": 1.0114799737930298, + "learning_rate": 9.999846170953593e-06, + "loss": 0.8732, + "step": 285 + }, + { + "epoch": 0.015419452232046582, + "grad_norm": 0.9860401749610901, + "learning_rate": 9.999844503450879e-06, + "loss": 0.8204, + "step": 286 + }, + { + "epoch": 0.015473366400690101, + "grad_norm": 1.0743263959884644, + "learning_rate": 9.999842826959119e-06, + "loss": 0.9445, + "step": 287 + }, + { + "epoch": 0.015527280569333621, + "grad_norm": 1.0456606149673462, + "learning_rate": 9.999841141478315e-06, + "loss": 0.8869, + "step": 288 + }, + { + "epoch": 0.01558119473797714, + "grad_norm": 1.0299748182296753, + "learning_rate": 9.99983944700847e-06, + "loss": 0.9543, + "step": 289 + }, + { + "epoch": 0.01563510890662066, + "grad_norm": 1.0176036357879639, + "learning_rate": 9.99983774354959e-06, + "loss": 0.9672, + "step": 290 + }, + { + "epoch": 0.01568902307526418, + "grad_norm": 1.0023303031921387, + "learning_rate": 9.999836031101675e-06, + "loss": 0.9417, + "step": 291 + }, + { + "epoch": 0.015742937243907698, + "grad_norm": 0.9801005721092224, + "learning_rate": 9.99983430966473e-06, + "loss": 0.9376, + "step": 292 + }, + { + "epoch": 0.01579685141255122, + "grad_norm": 1.002906322479248, + "learning_rate": 9.999832579238756e-06, + "loss": 0.8973, + "step": 293 + }, + { + "epoch": 0.015850765581194737, + "grad_norm": 1.0014845132827759, + "learning_rate": 9.999830839823759e-06, + "loss": 0.9583, + "step": 294 + }, + { + "epoch": 0.01590467974983826, + "grad_norm": 1.0173449516296387, + "learning_rate": 9.999829091419739e-06, + "loss": 0.9006, + "step": 295 + }, + { + "epoch": 0.015958593918481777, + "grad_norm": 
0.9779545664787292, + "learning_rate": 9.999827334026702e-06, + "loss": 0.9342, + "step": 296 + }, + { + "epoch": 0.016012508087125298, + "grad_norm": 0.9800315499305725, + "learning_rate": 9.999825567644648e-06, + "loss": 0.7948, + "step": 297 + }, + { + "epoch": 0.016066422255768816, + "grad_norm": 0.9628249406814575, + "learning_rate": 9.999823792273583e-06, + "loss": 0.8415, + "step": 298 + }, + { + "epoch": 0.016120336424412334, + "grad_norm": 1.1227449178695679, + "learning_rate": 9.99982200791351e-06, + "loss": 0.9646, + "step": 299 + }, + { + "epoch": 0.016174250593055856, + "grad_norm": 1.1018567085266113, + "learning_rate": 9.99982021456443e-06, + "loss": 0.8647, + "step": 300 + }, + { + "epoch": 0.016228164761699373, + "grad_norm": 1.1017298698425293, + "learning_rate": 9.999818412226347e-06, + "loss": 0.8708, + "step": 301 + }, + { + "epoch": 0.016282078930342895, + "grad_norm": 1.084594488143921, + "learning_rate": 9.999816600899267e-06, + "loss": 0.9765, + "step": 302 + }, + { + "epoch": 0.016335993098986413, + "grad_norm": 1.3735941648483276, + "learning_rate": 9.99981478058319e-06, + "loss": 1.0253, + "step": 303 + }, + { + "epoch": 0.016389907267629934, + "grad_norm": 1.1644489765167236, + "learning_rate": 9.999812951278119e-06, + "loss": 0.8519, + "step": 304 + }, + { + "epoch": 0.016443821436273452, + "grad_norm": 1.0079474449157715, + "learning_rate": 9.99981111298406e-06, + "loss": 0.9422, + "step": 305 + }, + { + "epoch": 0.016497735604916974, + "grad_norm": 1.0046736001968384, + "learning_rate": 9.999809265701015e-06, + "loss": 0.7766, + "step": 306 + }, + { + "epoch": 0.01655164977356049, + "grad_norm": 1.0312374830245972, + "learning_rate": 9.999807409428987e-06, + "loss": 0.8844, + "step": 307 + }, + { + "epoch": 0.01660556394220401, + "grad_norm": 1.0419421195983887, + "learning_rate": 9.99980554416798e-06, + "loss": 0.8902, + "step": 308 + }, + { + "epoch": 0.01665947811084753, + "grad_norm": 1.2056832313537598, + "learning_rate": 
9.999803669917996e-06, + "loss": 0.9842, + "step": 309 + }, + { + "epoch": 0.01671339227949105, + "grad_norm": 0.9645346403121948, + "learning_rate": 9.999801786679039e-06, + "loss": 0.7837, + "step": 310 + }, + { + "epoch": 0.01676730644813457, + "grad_norm": 1.0259841680526733, + "learning_rate": 9.999799894451115e-06, + "loss": 0.8927, + "step": 311 + }, + { + "epoch": 0.016821220616778088, + "grad_norm": 0.9932212233543396, + "learning_rate": 9.999797993234224e-06, + "loss": 0.815, + "step": 312 + }, + { + "epoch": 0.01687513478542161, + "grad_norm": 1.0666078329086304, + "learning_rate": 9.99979608302837e-06, + "loss": 0.8245, + "step": 313 + }, + { + "epoch": 0.016929048954065128, + "grad_norm": 0.9566568732261658, + "learning_rate": 9.999794163833557e-06, + "loss": 0.851, + "step": 314 + }, + { + "epoch": 0.01698296312270865, + "grad_norm": 1.0056332349777222, + "learning_rate": 9.999792235649789e-06, + "loss": 0.8704, + "step": 315 + }, + { + "epoch": 0.017036877291352167, + "grad_norm": 1.036537528038025, + "learning_rate": 9.999790298477068e-06, + "loss": 0.9512, + "step": 316 + }, + { + "epoch": 0.01709079145999569, + "grad_norm": 1.1026023626327515, + "learning_rate": 9.9997883523154e-06, + "loss": 1.0007, + "step": 317 + }, + { + "epoch": 0.017144705628639206, + "grad_norm": 1.006659984588623, + "learning_rate": 9.999786397164786e-06, + "loss": 0.8992, + "step": 318 + }, + { + "epoch": 0.017198619797282724, + "grad_norm": 1.0100573301315308, + "learning_rate": 9.99978443302523e-06, + "loss": 0.9545, + "step": 319 + }, + { + "epoch": 0.017252533965926246, + "grad_norm": 1.000086784362793, + "learning_rate": 9.999782459896735e-06, + "loss": 0.8732, + "step": 320 + }, + { + "epoch": 0.017306448134569764, + "grad_norm": 1.2039650678634644, + "learning_rate": 9.999780477779306e-06, + "loss": 0.9881, + "step": 321 + }, + { + "epoch": 0.017360362303213285, + "grad_norm": 1.0316474437713623, + "learning_rate": 9.999778486672948e-06, + "loss": 0.8686, + "step": 
322 + }, + { + "epoch": 0.017414276471856803, + "grad_norm": 1.1697666645050049, + "learning_rate": 9.999776486577661e-06, + "loss": 0.9185, + "step": 323 + }, + { + "epoch": 0.017468190640500324, + "grad_norm": 0.9523053169250488, + "learning_rate": 9.999774477493451e-06, + "loss": 0.858, + "step": 324 + }, + { + "epoch": 0.017522104809143842, + "grad_norm": 0.9660015106201172, + "learning_rate": 9.999772459420319e-06, + "loss": 0.9964, + "step": 325 + }, + { + "epoch": 0.017576018977787364, + "grad_norm": 0.971128523349762, + "learning_rate": 9.999770432358271e-06, + "loss": 0.8999, + "step": 326 + }, + { + "epoch": 0.01762993314643088, + "grad_norm": 1.221969485282898, + "learning_rate": 9.999768396307312e-06, + "loss": 0.8628, + "step": 327 + }, + { + "epoch": 0.017683847315074403, + "grad_norm": 1.0868507623672485, + "learning_rate": 9.999766351267442e-06, + "loss": 1.0732, + "step": 328 + }, + { + "epoch": 0.01773776148371792, + "grad_norm": 0.9527992606163025, + "learning_rate": 9.999764297238666e-06, + "loss": 0.8221, + "step": 329 + }, + { + "epoch": 0.01779167565236144, + "grad_norm": 0.9969122409820557, + "learning_rate": 9.99976223422099e-06, + "loss": 0.9234, + "step": 330 + }, + { + "epoch": 0.01784558982100496, + "grad_norm": 0.9291784763336182, + "learning_rate": 9.999760162214415e-06, + "loss": 0.7839, + "step": 331 + }, + { + "epoch": 0.01789950398964848, + "grad_norm": 0.9766960144042969, + "learning_rate": 9.999758081218944e-06, + "loss": 0.7929, + "step": 332 + }, + { + "epoch": 0.017953418158292, + "grad_norm": 0.9536904692649841, + "learning_rate": 9.999755991234585e-06, + "loss": 0.9136, + "step": 333 + }, + { + "epoch": 0.018007332326935518, + "grad_norm": 1.0325372219085693, + "learning_rate": 9.999753892261337e-06, + "loss": 0.8367, + "step": 334 + }, + { + "epoch": 0.01806124649557904, + "grad_norm": 0.9486141800880432, + "learning_rate": 9.999751784299207e-06, + "loss": 0.8802, + "step": 335 + }, + { + "epoch": 0.018115160664222557, + 
"grad_norm": 0.9880577921867371, + "learning_rate": 9.999749667348198e-06, + "loss": 0.8597, + "step": 336 + }, + { + "epoch": 0.01816907483286608, + "grad_norm": 1.043199896812439, + "learning_rate": 9.999747541408312e-06, + "loss": 0.9142, + "step": 337 + }, + { + "epoch": 0.018222989001509596, + "grad_norm": 1.0606465339660645, + "learning_rate": 9.999745406479554e-06, + "loss": 0.9876, + "step": 338 + }, + { + "epoch": 0.018276903170153118, + "grad_norm": 1.139449954032898, + "learning_rate": 9.999743262561929e-06, + "loss": 0.7773, + "step": 339 + }, + { + "epoch": 0.018330817338796636, + "grad_norm": 1.1416115760803223, + "learning_rate": 9.99974110965544e-06, + "loss": 0.9566, + "step": 340 + }, + { + "epoch": 0.018384731507440154, + "grad_norm": 1.0145153999328613, + "learning_rate": 9.99973894776009e-06, + "loss": 0.9543, + "step": 341 + }, + { + "epoch": 0.018438645676083675, + "grad_norm": 0.950528621673584, + "learning_rate": 9.999736776875885e-06, + "loss": 0.8007, + "step": 342 + }, + { + "epoch": 0.018492559844727193, + "grad_norm": 0.9080097079277039, + "learning_rate": 9.999734597002826e-06, + "loss": 0.8273, + "step": 343 + }, + { + "epoch": 0.018546474013370715, + "grad_norm": 1.0038888454437256, + "learning_rate": 9.99973240814092e-06, + "loss": 0.9394, + "step": 344 + }, + { + "epoch": 0.018600388182014232, + "grad_norm": 1.05253267288208, + "learning_rate": 9.999730210290168e-06, + "loss": 0.9485, + "step": 345 + }, + { + "epoch": 0.018654302350657754, + "grad_norm": 0.9396592974662781, + "learning_rate": 9.999728003450577e-06, + "loss": 0.8943, + "step": 346 + }, + { + "epoch": 0.018708216519301272, + "grad_norm": 1.149387240409851, + "learning_rate": 9.999725787622148e-06, + "loss": 0.8566, + "step": 347 + }, + { + "epoch": 0.018762130687944793, + "grad_norm": 1.1573290824890137, + "learning_rate": 9.999723562804887e-06, + "loss": 0.9641, + "step": 348 + }, + { + "epoch": 0.01881604485658831, + "grad_norm": 1.0217385292053223, + 
"learning_rate": 9.999721328998797e-06, + "loss": 0.9555, + "step": 349 + }, + { + "epoch": 0.018869959025231833, + "grad_norm": 1.034690499305725, + "learning_rate": 9.999719086203884e-06, + "loss": 0.9407, + "step": 350 + }, + { + "epoch": 0.01892387319387535, + "grad_norm": 0.9819002151489258, + "learning_rate": 9.999716834420148e-06, + "loss": 0.9104, + "step": 351 + }, + { + "epoch": 0.01897778736251887, + "grad_norm": 1.0459688901901245, + "learning_rate": 9.999714573647597e-06, + "loss": 0.9296, + "step": 352 + }, + { + "epoch": 0.01903170153116239, + "grad_norm": 0.9575183391571045, + "learning_rate": 9.999712303886232e-06, + "loss": 0.8517, + "step": 353 + }, + { + "epoch": 0.019085615699805908, + "grad_norm": 1.0018881559371948, + "learning_rate": 9.99971002513606e-06, + "loss": 0.9208, + "step": 354 + }, + { + "epoch": 0.01913952986844943, + "grad_norm": 1.0291972160339355, + "learning_rate": 9.999707737397085e-06, + "loss": 0.8765, + "step": 355 + }, + { + "epoch": 0.019193444037092947, + "grad_norm": 1.0081498622894287, + "learning_rate": 9.999705440669306e-06, + "loss": 0.9204, + "step": 356 + }, + { + "epoch": 0.01924735820573647, + "grad_norm": 0.956950843334198, + "learning_rate": 9.999703134952733e-06, + "loss": 0.8058, + "step": 357 + }, + { + "epoch": 0.019301272374379987, + "grad_norm": 1.1130229234695435, + "learning_rate": 9.999700820247369e-06, + "loss": 0.8202, + "step": 358 + }, + { + "epoch": 0.019355186543023508, + "grad_norm": 1.047211766242981, + "learning_rate": 9.999698496553216e-06, + "loss": 0.9357, + "step": 359 + }, + { + "epoch": 0.019409100711667026, + "grad_norm": 1.0225415229797363, + "learning_rate": 9.99969616387028e-06, + "loss": 0.8306, + "step": 360 + }, + { + "epoch": 0.019463014880310544, + "grad_norm": 1.060727596282959, + "learning_rate": 9.999693822198564e-06, + "loss": 0.9178, + "step": 361 + }, + { + "epoch": 0.019516929048954065, + "grad_norm": 1.0743412971496582, + "learning_rate": 9.999691471538074e-06, + 
"loss": 0.8761, + "step": 362 + }, + { + "epoch": 0.019570843217597583, + "grad_norm": 1.2229491472244263, + "learning_rate": 9.99968911188881e-06, + "loss": 1.0738, + "step": 363 + }, + { + "epoch": 0.019624757386241105, + "grad_norm": 0.9889073967933655, + "learning_rate": 9.999686743250783e-06, + "loss": 0.9458, + "step": 364 + }, + { + "epoch": 0.019678671554884623, + "grad_norm": 1.0398520231246948, + "learning_rate": 9.999684365623992e-06, + "loss": 0.9096, + "step": 365 + }, + { + "epoch": 0.019732585723528144, + "grad_norm": 1.0613081455230713, + "learning_rate": 9.999681979008442e-06, + "loss": 0.9312, + "step": 366 + }, + { + "epoch": 0.019786499892171662, + "grad_norm": 0.946211040019989, + "learning_rate": 9.99967958340414e-06, + "loss": 0.9208, + "step": 367 + }, + { + "epoch": 0.019840414060815183, + "grad_norm": 1.1298933029174805, + "learning_rate": 9.999677178811087e-06, + "loss": 0.9378, + "step": 368 + }, + { + "epoch": 0.0198943282294587, + "grad_norm": 1.1042351722717285, + "learning_rate": 9.999674765229288e-06, + "loss": 0.9487, + "step": 369 + }, + { + "epoch": 0.019948242398102223, + "grad_norm": 1.0717188119888306, + "learning_rate": 9.999672342658751e-06, + "loss": 0.939, + "step": 370 + }, + { + "epoch": 0.02000215656674574, + "grad_norm": 1.0936871767044067, + "learning_rate": 9.999669911099474e-06, + "loss": 1.1361, + "step": 371 + }, + { + "epoch": 0.02005607073538926, + "grad_norm": 1.0650005340576172, + "learning_rate": 9.999667470551466e-06, + "loss": 0.9709, + "step": 372 + }, + { + "epoch": 0.02010998490403278, + "grad_norm": 1.0154083967208862, + "learning_rate": 9.999665021014731e-06, + "loss": 0.9422, + "step": 373 + }, + { + "epoch": 0.020163899072676298, + "grad_norm": 1.1382607221603394, + "learning_rate": 9.999662562489272e-06, + "loss": 0.984, + "step": 374 + }, + { + "epoch": 0.02021781324131982, + "grad_norm": 0.9372896552085876, + "learning_rate": 9.999660094975095e-06, + "loss": 0.9857, + "step": 375 + }, + { + 
"epoch": 0.020271727409963337, + "grad_norm": 1.1777011156082153, + "learning_rate": 9.999657618472203e-06, + "loss": 0.9731, + "step": 376 + }, + { + "epoch": 0.02032564157860686, + "grad_norm": 0.9054237604141235, + "learning_rate": 9.9996551329806e-06, + "loss": 0.9104, + "step": 377 + }, + { + "epoch": 0.020379555747250377, + "grad_norm": 0.9255661964416504, + "learning_rate": 9.999652638500292e-06, + "loss": 0.8632, + "step": 378 + }, + { + "epoch": 0.020433469915893898, + "grad_norm": 0.9440998435020447, + "learning_rate": 9.999650135031282e-06, + "loss": 0.8945, + "step": 379 + }, + { + "epoch": 0.020487384084537416, + "grad_norm": 0.9822732210159302, + "learning_rate": 9.999647622573577e-06, + "loss": 0.8874, + "step": 380 + }, + { + "epoch": 0.020541298253180938, + "grad_norm": 1.1294387578964233, + "learning_rate": 9.999645101127179e-06, + "loss": 0.9892, + "step": 381 + }, + { + "epoch": 0.020595212421824455, + "grad_norm": 1.0458290576934814, + "learning_rate": 9.999642570692094e-06, + "loss": 0.9163, + "step": 382 + }, + { + "epoch": 0.020649126590467973, + "grad_norm": 0.8124557733535767, + "learning_rate": 9.999640031268326e-06, + "loss": 0.6927, + "step": 383 + }, + { + "epoch": 0.020703040759111495, + "grad_norm": 1.1053259372711182, + "learning_rate": 9.999637482855878e-06, + "loss": 0.8651, + "step": 384 + }, + { + "epoch": 0.020756954927755013, + "grad_norm": 1.1280632019042969, + "learning_rate": 9.999634925454757e-06, + "loss": 0.9708, + "step": 385 + }, + { + "epoch": 0.020810869096398534, + "grad_norm": 0.9916180372238159, + "learning_rate": 9.999632359064965e-06, + "loss": 0.9081, + "step": 386 + }, + { + "epoch": 0.020864783265042052, + "grad_norm": 1.0430771112442017, + "learning_rate": 9.99962978368651e-06, + "loss": 0.9837, + "step": 387 + }, + { + "epoch": 0.020918697433685574, + "grad_norm": 1.031343698501587, + "learning_rate": 9.999627199319398e-06, + "loss": 0.9156, + "step": 388 + }, + { + "epoch": 0.02097261160232909, + 
"grad_norm": 1.0157191753387451, + "learning_rate": 9.999624605963627e-06, + "loss": 0.9379, + "step": 389 + }, + { + "epoch": 0.021026525770972613, + "grad_norm": 0.9524544477462769, + "learning_rate": 9.999622003619204e-06, + "loss": 0.8448, + "step": 390 + }, + { + "epoch": 0.02108043993961613, + "grad_norm": 1.091670036315918, + "learning_rate": 9.999619392286137e-06, + "loss": 0.9794, + "step": 391 + }, + { + "epoch": 0.021134354108259652, + "grad_norm": 1.0502233505249023, + "learning_rate": 9.999616771964429e-06, + "loss": 1.0047, + "step": 392 + }, + { + "epoch": 0.02118826827690317, + "grad_norm": 1.2087476253509521, + "learning_rate": 9.999614142654084e-06, + "loss": 0.8964, + "step": 393 + }, + { + "epoch": 0.021242182445546688, + "grad_norm": 1.0264590978622437, + "learning_rate": 9.999611504355106e-06, + "loss": 0.8608, + "step": 394 + }, + { + "epoch": 0.02129609661419021, + "grad_norm": 0.9883281588554382, + "learning_rate": 9.999608857067503e-06, + "loss": 0.9109, + "step": 395 + }, + { + "epoch": 0.021350010782833728, + "grad_norm": 0.9913623332977295, + "learning_rate": 9.999606200791276e-06, + "loss": 0.8993, + "step": 396 + }, + { + "epoch": 0.02140392495147725, + "grad_norm": 1.019178867340088, + "learning_rate": 9.999603535526432e-06, + "loss": 0.9115, + "step": 397 + }, + { + "epoch": 0.021457839120120767, + "grad_norm": 0.9756026864051819, + "learning_rate": 9.999600861272974e-06, + "loss": 0.834, + "step": 398 + }, + { + "epoch": 0.02151175328876429, + "grad_norm": 0.9956341981887817, + "learning_rate": 9.999598178030909e-06, + "loss": 0.8756, + "step": 399 + }, + { + "epoch": 0.021565667457407806, + "grad_norm": 1.0267717838287354, + "learning_rate": 9.999595485800239e-06, + "loss": 0.9427, + "step": 400 + }, + { + "epoch": 0.021619581626051328, + "grad_norm": 1.061139464378357, + "learning_rate": 9.999592784580974e-06, + "loss": 0.9835, + "step": 401 + }, + { + "epoch": 0.021673495794694846, + "grad_norm": 0.9970353245735168, + 
"learning_rate": 9.999590074373114e-06, + "loss": 0.8946, + "step": 402 + }, + { + "epoch": 0.021727409963338367, + "grad_norm": 1.056242823600769, + "learning_rate": 9.999587355176664e-06, + "loss": 0.9076, + "step": 403 + }, + { + "epoch": 0.021781324131981885, + "grad_norm": 1.0285427570343018, + "learning_rate": 9.999584626991632e-06, + "loss": 0.8506, + "step": 404 + }, + { + "epoch": 0.021835238300625403, + "grad_norm": 1.0026901960372925, + "learning_rate": 9.99958188981802e-06, + "loss": 0.8457, + "step": 405 + }, + { + "epoch": 0.021889152469268924, + "grad_norm": 0.8921003341674805, + "learning_rate": 9.999579143655833e-06, + "loss": 0.8215, + "step": 406 + }, + { + "epoch": 0.021943066637912442, + "grad_norm": 1.2816855907440186, + "learning_rate": 9.99957638850508e-06, + "loss": 0.8779, + "step": 407 + }, + { + "epoch": 0.021996980806555964, + "grad_norm": 1.4713681936264038, + "learning_rate": 9.99957362436576e-06, + "loss": 0.8581, + "step": 408 + }, + { + "epoch": 0.02205089497519948, + "grad_norm": 1.0117568969726562, + "learning_rate": 9.999570851237883e-06, + "loss": 0.8865, + "step": 409 + }, + { + "epoch": 0.022104809143843003, + "grad_norm": 0.9530962705612183, + "learning_rate": 9.99956806912145e-06, + "loss": 0.8888, + "step": 410 + }, + { + "epoch": 0.02215872331248652, + "grad_norm": 0.865692675113678, + "learning_rate": 9.99956527801647e-06, + "loss": 0.8075, + "step": 411 + }, + { + "epoch": 0.022212637481130042, + "grad_norm": 0.9613220691680908, + "learning_rate": 9.999562477922944e-06, + "loss": 0.9289, + "step": 412 + }, + { + "epoch": 0.02226655164977356, + "grad_norm": 0.9419745802879333, + "learning_rate": 9.99955966884088e-06, + "loss": 0.8758, + "step": 413 + }, + { + "epoch": 0.02232046581841708, + "grad_norm": 1.0120573043823242, + "learning_rate": 9.999556850770282e-06, + "loss": 0.9014, + "step": 414 + }, + { + "epoch": 0.0223743799870606, + "grad_norm": 0.9833963513374329, + "learning_rate": 9.999554023711155e-06, + "loss": 
0.9354, + "step": 415 + }, + { + "epoch": 0.022428294155704118, + "grad_norm": 0.9058681130409241, + "learning_rate": 9.999551187663505e-06, + "loss": 0.9201, + "step": 416 + }, + { + "epoch": 0.02248220832434764, + "grad_norm": 1.0103633403778076, + "learning_rate": 9.999548342627334e-06, + "loss": 0.9023, + "step": 417 + }, + { + "epoch": 0.022536122492991157, + "grad_norm": 0.8671039342880249, + "learning_rate": 9.99954548860265e-06, + "loss": 0.7263, + "step": 418 + }, + { + "epoch": 0.02259003666163468, + "grad_norm": 1.0967090129852295, + "learning_rate": 9.999542625589461e-06, + "loss": 1.0616, + "step": 419 + }, + { + "epoch": 0.022643950830278196, + "grad_norm": 0.9032139778137207, + "learning_rate": 9.999539753587764e-06, + "loss": 0.782, + "step": 420 + }, + { + "epoch": 0.022697864998921718, + "grad_norm": 0.9532387256622314, + "learning_rate": 9.99953687259757e-06, + "loss": 0.9628, + "step": 421 + }, + { + "epoch": 0.022751779167565236, + "grad_norm": 0.9732246994972229, + "learning_rate": 9.999533982618885e-06, + "loss": 0.8682, + "step": 422 + }, + { + "epoch": 0.022805693336208757, + "grad_norm": 0.9160019159317017, + "learning_rate": 9.99953108365171e-06, + "loss": 0.9051, + "step": 423 + }, + { + "epoch": 0.022859607504852275, + "grad_norm": 1.0100488662719727, + "learning_rate": 9.999528175696054e-06, + "loss": 0.9836, + "step": 424 + }, + { + "epoch": 0.022913521673495793, + "grad_norm": 1.0130014419555664, + "learning_rate": 9.99952525875192e-06, + "loss": 0.8653, + "step": 425 + }, + { + "epoch": 0.022967435842139314, + "grad_norm": 0.9726247787475586, + "learning_rate": 9.999522332819313e-06, + "loss": 0.8761, + "step": 426 + }, + { + "epoch": 0.023021350010782832, + "grad_norm": 0.9457972049713135, + "learning_rate": 9.99951939789824e-06, + "loss": 0.8792, + "step": 427 + }, + { + "epoch": 0.023075264179426354, + "grad_norm": 1.083130121231079, + "learning_rate": 9.999516453988706e-06, + "loss": 0.9035, + "step": 428 + }, + { + "epoch": 
0.023129178348069872, + "grad_norm": 0.9195771217346191, + "learning_rate": 9.999513501090714e-06, + "loss": 0.8586, + "step": 429 + }, + { + "epoch": 0.023183092516713393, + "grad_norm": 0.983346700668335, + "learning_rate": 9.999510539204273e-06, + "loss": 0.8335, + "step": 430 + }, + { + "epoch": 0.02323700668535691, + "grad_norm": 1.0524029731750488, + "learning_rate": 9.999507568329386e-06, + "loss": 0.838, + "step": 431 + }, + { + "epoch": 0.023290920854000433, + "grad_norm": 1.0267860889434814, + "learning_rate": 9.999504588466058e-06, + "loss": 0.9345, + "step": 432 + }, + { + "epoch": 0.02334483502264395, + "grad_norm": 1.025707483291626, + "learning_rate": 9.999501599614294e-06, + "loss": 0.9042, + "step": 433 + }, + { + "epoch": 0.023398749191287472, + "grad_norm": 0.9739174842834473, + "learning_rate": 9.999498601774101e-06, + "loss": 0.7433, + "step": 434 + }, + { + "epoch": 0.02345266335993099, + "grad_norm": 0.9468310475349426, + "learning_rate": 9.999495594945486e-06, + "loss": 0.8447, + "step": 435 + }, + { + "epoch": 0.023506577528574508, + "grad_norm": 0.9820529818534851, + "learning_rate": 9.99949257912845e-06, + "loss": 0.8842, + "step": 436 + }, + { + "epoch": 0.02356049169721803, + "grad_norm": 0.998515784740448, + "learning_rate": 9.999489554323e-06, + "loss": 0.9226, + "step": 437 + }, + { + "epoch": 0.023614405865861547, + "grad_norm": 0.9819791316986084, + "learning_rate": 9.999486520529144e-06, + "loss": 0.8559, + "step": 438 + }, + { + "epoch": 0.02366832003450507, + "grad_norm": 0.9468326568603516, + "learning_rate": 9.999483477746884e-06, + "loss": 0.8064, + "step": 439 + }, + { + "epoch": 0.023722234203148587, + "grad_norm": 1.0087614059448242, + "learning_rate": 9.999480425976229e-06, + "loss": 0.9232, + "step": 440 + }, + { + "epoch": 0.023776148371792108, + "grad_norm": 0.9446098208427429, + "learning_rate": 9.99947736521718e-06, + "loss": 0.8511, + "step": 441 + }, + { + "epoch": 0.023830062540435626, + "grad_norm": 
1.0966850519180298, + "learning_rate": 9.999474295469746e-06, + "loss": 0.9929, + "step": 442 + }, + { + "epoch": 0.023883976709079147, + "grad_norm": 0.8858770728111267, + "learning_rate": 9.99947121673393e-06, + "loss": 0.8492, + "step": 443 + }, + { + "epoch": 0.023937890877722665, + "grad_norm": 1.083717703819275, + "learning_rate": 9.999468129009742e-06, + "loss": 0.9948, + "step": 444 + }, + { + "epoch": 0.023991805046366187, + "grad_norm": 1.0251178741455078, + "learning_rate": 9.999465032297184e-06, + "loss": 0.8769, + "step": 445 + }, + { + "epoch": 0.024045719215009705, + "grad_norm": 0.9331875443458557, + "learning_rate": 9.999461926596261e-06, + "loss": 0.8663, + "step": 446 + }, + { + "epoch": 0.024099633383653223, + "grad_norm": 0.8941493034362793, + "learning_rate": 9.999458811906979e-06, + "loss": 0.8172, + "step": 447 + }, + { + "epoch": 0.024153547552296744, + "grad_norm": 0.9978699684143066, + "learning_rate": 9.999455688229347e-06, + "loss": 0.9303, + "step": 448 + }, + { + "epoch": 0.024207461720940262, + "grad_norm": 0.8835211992263794, + "learning_rate": 9.999452555563366e-06, + "loss": 0.8921, + "step": 449 + }, + { + "epoch": 0.024261375889583783, + "grad_norm": 0.9061810970306396, + "learning_rate": 9.999449413909043e-06, + "loss": 0.8201, + "step": 450 + }, + { + "epoch": 0.0243152900582273, + "grad_norm": 1.0061571598052979, + "learning_rate": 9.999446263266385e-06, + "loss": 0.8506, + "step": 451 + }, + { + "epoch": 0.024369204226870823, + "grad_norm": 0.9286402463912964, + "learning_rate": 9.999443103635398e-06, + "loss": 0.8532, + "step": 452 + }, + { + "epoch": 0.02442311839551434, + "grad_norm": 1.0919772386550903, + "learning_rate": 9.999439935016087e-06, + "loss": 0.9466, + "step": 453 + }, + { + "epoch": 0.024477032564157862, + "grad_norm": 1.0552513599395752, + "learning_rate": 9.999436757408453e-06, + "loss": 0.8406, + "step": 454 + }, + { + "epoch": 0.02453094673280138, + "grad_norm": 0.9604331851005554, + "learning_rate": 
9.999433570812511e-06, + "loss": 0.8928, + "step": 455 + }, + { + "epoch": 0.0245848609014449, + "grad_norm": 1.0126323699951172, + "learning_rate": 9.999430375228259e-06, + "loss": 0.924, + "step": 456 + }, + { + "epoch": 0.02463877507008842, + "grad_norm": 1.0540791749954224, + "learning_rate": 9.999427170655707e-06, + "loss": 0.9656, + "step": 457 + }, + { + "epoch": 0.024692689238731937, + "grad_norm": 0.8622417449951172, + "learning_rate": 9.999423957094857e-06, + "loss": 0.7428, + "step": 458 + }, + { + "epoch": 0.02474660340737546, + "grad_norm": 1.106581211090088, + "learning_rate": 9.999420734545719e-06, + "loss": 0.9258, + "step": 459 + }, + { + "epoch": 0.024800517576018977, + "grad_norm": 0.990807294845581, + "learning_rate": 9.999417503008296e-06, + "loss": 0.9083, + "step": 460 + }, + { + "epoch": 0.024854431744662498, + "grad_norm": 0.9302589893341064, + "learning_rate": 9.999414262482594e-06, + "loss": 0.8654, + "step": 461 + }, + { + "epoch": 0.024908345913306016, + "grad_norm": 1.0218255519866943, + "learning_rate": 9.999411012968621e-06, + "loss": 0.8996, + "step": 462 + }, + { + "epoch": 0.024962260081949537, + "grad_norm": 0.976108193397522, + "learning_rate": 9.99940775446638e-06, + "loss": 0.9423, + "step": 463 + }, + { + "epoch": 0.025016174250593055, + "grad_norm": 1.1027617454528809, + "learning_rate": 9.99940448697588e-06, + "loss": 1.0407, + "step": 464 + }, + { + "epoch": 0.025070088419236577, + "grad_norm": 1.0148764848709106, + "learning_rate": 9.999401210497122e-06, + "loss": 0.9418, + "step": 465 + }, + { + "epoch": 0.025124002587880095, + "grad_norm": 1.0120681524276733, + "learning_rate": 9.999397925030116e-06, + "loss": 0.92, + "step": 466 + }, + { + "epoch": 0.025177916756523613, + "grad_norm": 1.1855127811431885, + "learning_rate": 9.999394630574868e-06, + "loss": 0.9285, + "step": 467 + }, + { + "epoch": 0.025231830925167134, + "grad_norm": 1.8014320135116577, + "learning_rate": 9.999391327131383e-06, + "loss": 0.979, + 
"step": 468 + }, + { + "epoch": 0.025285745093810652, + "grad_norm": 1.1568403244018555, + "learning_rate": 9.999388014699664e-06, + "loss": 0.9574, + "step": 469 + }, + { + "epoch": 0.025339659262454173, + "grad_norm": 1.2544865608215332, + "learning_rate": 9.99938469327972e-06, + "loss": 0.8356, + "step": 470 + }, + { + "epoch": 0.02539357343109769, + "grad_norm": 1.8647997379302979, + "learning_rate": 9.99938136287156e-06, + "loss": 0.9181, + "step": 471 + }, + { + "epoch": 0.025447487599741213, + "grad_norm": 0.9942222237586975, + "learning_rate": 9.999378023475184e-06, + "loss": 0.9297, + "step": 472 + }, + { + "epoch": 0.02550140176838473, + "grad_norm": 0.9839766621589661, + "learning_rate": 9.9993746750906e-06, + "loss": 0.9181, + "step": 473 + }, + { + "epoch": 0.025555315937028252, + "grad_norm": 0.9353258609771729, + "learning_rate": 9.999371317717817e-06, + "loss": 0.8789, + "step": 474 + }, + { + "epoch": 0.02560923010567177, + "grad_norm": 0.9256170988082886, + "learning_rate": 9.999367951356838e-06, + "loss": 0.8725, + "step": 475 + }, + { + "epoch": 0.02566314427431529, + "grad_norm": 1.1102124452590942, + "learning_rate": 9.999364576007669e-06, + "loss": 0.9818, + "step": 476 + }, + { + "epoch": 0.02571705844295881, + "grad_norm": 1.04171884059906, + "learning_rate": 9.999361191670316e-06, + "loss": 0.9275, + "step": 477 + }, + { + "epoch": 0.025770972611602327, + "grad_norm": 0.9670290350914001, + "learning_rate": 9.999357798344787e-06, + "loss": 0.8919, + "step": 478 + }, + { + "epoch": 0.02582488678024585, + "grad_norm": 1.0543723106384277, + "learning_rate": 9.999354396031085e-06, + "loss": 0.9356, + "step": 479 + }, + { + "epoch": 0.025878800948889367, + "grad_norm": 1.1368457078933716, + "learning_rate": 9.99935098472922e-06, + "loss": 0.9387, + "step": 480 + }, + { + "epoch": 0.025932715117532888, + "grad_norm": 1.0627872943878174, + "learning_rate": 9.999347564439196e-06, + "loss": 1.0047, + "step": 481 + }, + { + "epoch": 
0.025986629286176406, + "grad_norm": 0.9553730487823486, + "learning_rate": 9.999344135161018e-06, + "loss": 0.8845, + "step": 482 + }, + { + "epoch": 0.026040543454819928, + "grad_norm": 0.9605830907821655, + "learning_rate": 9.999340696894694e-06, + "loss": 0.8816, + "step": 483 + }, + { + "epoch": 0.026094457623463446, + "grad_norm": 1.0464140176773071, + "learning_rate": 9.999337249640232e-06, + "loss": 0.9344, + "step": 484 + }, + { + "epoch": 0.026148371792106967, + "grad_norm": 1.0667988061904907, + "learning_rate": 9.999333793397635e-06, + "loss": 0.8834, + "step": 485 + }, + { + "epoch": 0.026202285960750485, + "grad_norm": 0.8996486663818359, + "learning_rate": 9.999330328166908e-06, + "loss": 0.8247, + "step": 486 + }, + { + "epoch": 0.026256200129394006, + "grad_norm": 1.0483838319778442, + "learning_rate": 9.99932685394806e-06, + "loss": 0.9414, + "step": 487 + }, + { + "epoch": 0.026310114298037524, + "grad_norm": 1.2089953422546387, + "learning_rate": 9.999323370741097e-06, + "loss": 1.0913, + "step": 488 + }, + { + "epoch": 0.026364028466681042, + "grad_norm": 1.074291467666626, + "learning_rate": 9.999319878546025e-06, + "loss": 0.8882, + "step": 489 + }, + { + "epoch": 0.026417942635324564, + "grad_norm": 1.0076494216918945, + "learning_rate": 9.99931637736285e-06, + "loss": 0.8393, + "step": 490 + }, + { + "epoch": 0.02647185680396808, + "grad_norm": 1.2263407707214355, + "learning_rate": 9.99931286719158e-06, + "loss": 0.955, + "step": 491 + }, + { + "epoch": 0.026525770972611603, + "grad_norm": 0.9093664884567261, + "learning_rate": 9.999309348032218e-06, + "loss": 0.8366, + "step": 492 + }, + { + "epoch": 0.02657968514125512, + "grad_norm": 1.0704407691955566, + "learning_rate": 9.999305819884772e-06, + "loss": 0.981, + "step": 493 + }, + { + "epoch": 0.026633599309898642, + "grad_norm": 1.2105270624160767, + "learning_rate": 9.999302282749249e-06, + "loss": 0.8896, + "step": 494 + }, + { + "epoch": 0.02668751347854216, + "grad_norm": 
1.0142449140548706, + "learning_rate": 9.999298736625654e-06, + "loss": 0.8627, + "step": 495 + }, + { + "epoch": 0.02674142764718568, + "grad_norm": 1.0887057781219482, + "learning_rate": 9.999295181513994e-06, + "loss": 0.8884, + "step": 496 + }, + { + "epoch": 0.0267953418158292, + "grad_norm": 0.9958952069282532, + "learning_rate": 9.999291617414277e-06, + "loss": 0.7768, + "step": 497 + }, + { + "epoch": 0.02684925598447272, + "grad_norm": 0.8576722741127014, + "learning_rate": 9.999288044326508e-06, + "loss": 0.715, + "step": 498 + }, + { + "epoch": 0.02690317015311624, + "grad_norm": 1.058148741722107, + "learning_rate": 9.999284462250691e-06, + "loss": 0.8693, + "step": 499 + }, + { + "epoch": 0.026957084321759757, + "grad_norm": 0.9429569244384766, + "learning_rate": 9.999280871186837e-06, + "loss": 0.8883, + "step": 500 + }, + { + "epoch": 0.02701099849040328, + "grad_norm": 0.9450993537902832, + "learning_rate": 9.999277271134948e-06, + "loss": 0.9376, + "step": 501 + }, + { + "epoch": 0.027064912659046796, + "grad_norm": 1.0307891368865967, + "learning_rate": 9.999273662095035e-06, + "loss": 0.9098, + "step": 502 + }, + { + "epoch": 0.027118826827690318, + "grad_norm": 0.9515891671180725, + "learning_rate": 9.999270044067101e-06, + "loss": 0.8854, + "step": 503 + }, + { + "epoch": 0.027172740996333836, + "grad_norm": 1.1173255443572998, + "learning_rate": 9.999266417051154e-06, + "loss": 0.7977, + "step": 504 + }, + { + "epoch": 0.027226655164977357, + "grad_norm": 1.028194785118103, + "learning_rate": 9.9992627810472e-06, + "loss": 0.9585, + "step": 505 + }, + { + "epoch": 0.027280569333620875, + "grad_norm": 1.0855528116226196, + "learning_rate": 9.999259136055245e-06, + "loss": 0.9807, + "step": 506 + }, + { + "epoch": 0.027334483502264396, + "grad_norm": 1.1148236989974976, + "learning_rate": 9.999255482075298e-06, + "loss": 0.9672, + "step": 507 + }, + { + "epoch": 0.027388397670907914, + "grad_norm": 0.9697713255882263, + "learning_rate": 
9.999251819107364e-06, + "loss": 0.9073, + "step": 508 + }, + { + "epoch": 0.027442311839551436, + "grad_norm": 0.9802384972572327, + "learning_rate": 9.999248147151448e-06, + "loss": 0.8704, + "step": 509 + }, + { + "epoch": 0.027496226008194954, + "grad_norm": 0.963330090045929, + "learning_rate": 9.999244466207559e-06, + "loss": 0.9312, + "step": 510 + }, + { + "epoch": 0.02755014017683847, + "grad_norm": 0.8776309490203857, + "learning_rate": 9.999240776275703e-06, + "loss": 0.8068, + "step": 511 + }, + { + "epoch": 0.027604054345481993, + "grad_norm": 1.1159353256225586, + "learning_rate": 9.999237077355886e-06, + "loss": 0.8164, + "step": 512 + }, + { + "epoch": 0.02765796851412551, + "grad_norm": 1.004232406616211, + "learning_rate": 9.999233369448115e-06, + "loss": 0.8666, + "step": 513 + }, + { + "epoch": 0.027711882682769032, + "grad_norm": 1.0300110578536987, + "learning_rate": 9.999229652552395e-06, + "loss": 0.8774, + "step": 514 + }, + { + "epoch": 0.02776579685141255, + "grad_norm": 0.8823155164718628, + "learning_rate": 9.999225926668736e-06, + "loss": 0.7579, + "step": 515 + }, + { + "epoch": 0.027819711020056072, + "grad_norm": 0.938956618309021, + "learning_rate": 9.999222191797144e-06, + "loss": 0.8749, + "step": 516 + }, + { + "epoch": 0.02787362518869959, + "grad_norm": 0.9111800789833069, + "learning_rate": 9.999218447937624e-06, + "loss": 0.8915, + "step": 517 + }, + { + "epoch": 0.02792753935734311, + "grad_norm": 0.971813440322876, + "learning_rate": 9.999214695090182e-06, + "loss": 0.9038, + "step": 518 + }, + { + "epoch": 0.02798145352598663, + "grad_norm": 0.9159868359565735, + "learning_rate": 9.999210933254828e-06, + "loss": 0.8726, + "step": 519 + }, + { + "epoch": 0.028035367694630147, + "grad_norm": 1.0223439931869507, + "learning_rate": 9.999207162431566e-06, + "loss": 0.8738, + "step": 520 + }, + { + "epoch": 0.02808928186327367, + "grad_norm": 0.9844004511833191, + "learning_rate": 9.999203382620404e-06, + "loss": 0.8815, + 
"step": 521 + }, + { + "epoch": 0.028143196031917186, + "grad_norm": 1.1636719703674316, + "learning_rate": 9.99919959382135e-06, + "loss": 0.8781, + "step": 522 + }, + { + "epoch": 0.028197110200560708, + "grad_norm": 0.9637702703475952, + "learning_rate": 9.999195796034407e-06, + "loss": 0.8491, + "step": 523 + }, + { + "epoch": 0.028251024369204226, + "grad_norm": 0.975931704044342, + "learning_rate": 9.999191989259584e-06, + "loss": 0.9983, + "step": 524 + }, + { + "epoch": 0.028304938537847747, + "grad_norm": 0.9855527877807617, + "learning_rate": 9.999188173496889e-06, + "loss": 0.9587, + "step": 525 + }, + { + "epoch": 0.028358852706491265, + "grad_norm": 0.9925652742385864, + "learning_rate": 9.99918434874633e-06, + "loss": 0.8408, + "step": 526 + }, + { + "epoch": 0.028412766875134787, + "grad_norm": 0.9272180795669556, + "learning_rate": 9.999180515007908e-06, + "loss": 0.8267, + "step": 527 + }, + { + "epoch": 0.028466681043778305, + "grad_norm": 1.161076307296753, + "learning_rate": 9.999176672281636e-06, + "loss": 0.9282, + "step": 528 + }, + { + "epoch": 0.028520595212421826, + "grad_norm": 0.8953909277915955, + "learning_rate": 9.99917282056752e-06, + "loss": 0.8078, + "step": 529 + }, + { + "epoch": 0.028574509381065344, + "grad_norm": 0.9194382429122925, + "learning_rate": 9.999168959865562e-06, + "loss": 0.8385, + "step": 530 + }, + { + "epoch": 0.028628423549708862, + "grad_norm": 1.0351816415786743, + "learning_rate": 9.999165090175775e-06, + "loss": 0.8155, + "step": 531 + }, + { + "epoch": 0.028682337718352383, + "grad_norm": 0.9233224391937256, + "learning_rate": 9.999161211498163e-06, + "loss": 0.8825, + "step": 532 + }, + { + "epoch": 0.0287362518869959, + "grad_norm": 1.0415356159210205, + "learning_rate": 9.999157323832732e-06, + "loss": 0.7844, + "step": 533 + }, + { + "epoch": 0.028790166055639423, + "grad_norm": 1.0329923629760742, + "learning_rate": 9.999153427179492e-06, + "loss": 0.893, + "step": 534 + }, + { + "epoch": 
0.02884408022428294, + "grad_norm": 1.237291932106018, + "learning_rate": 9.999149521538448e-06, + "loss": 0.9786, + "step": 535 + }, + { + "epoch": 0.028897994392926462, + "grad_norm": 0.9952654242515564, + "learning_rate": 9.999145606909607e-06, + "loss": 0.9262, + "step": 536 + }, + { + "epoch": 0.02895190856156998, + "grad_norm": 1.016533374786377, + "learning_rate": 9.999141683292977e-06, + "loss": 0.9854, + "step": 537 + }, + { + "epoch": 0.0290058227302135, + "grad_norm": 1.0334454774856567, + "learning_rate": 9.999137750688564e-06, + "loss": 0.8928, + "step": 538 + }, + { + "epoch": 0.02905973689885702, + "grad_norm": 0.941662609577179, + "learning_rate": 9.999133809096374e-06, + "loss": 0.8698, + "step": 539 + }, + { + "epoch": 0.02911365106750054, + "grad_norm": 0.9454428553581238, + "learning_rate": 9.999129858516418e-06, + "loss": 0.9261, + "step": 540 + }, + { + "epoch": 0.02916756523614406, + "grad_norm": 1.0921217203140259, + "learning_rate": 9.9991258989487e-06, + "loss": 0.9163, + "step": 541 + }, + { + "epoch": 0.029221479404787577, + "grad_norm": 0.8999170064926147, + "learning_rate": 9.999121930393227e-06, + "loss": 0.883, + "step": 542 + }, + { + "epoch": 0.029275393573431098, + "grad_norm": 0.9732702970504761, + "learning_rate": 9.999117952850009e-06, + "loss": 0.9168, + "step": 543 + }, + { + "epoch": 0.029329307742074616, + "grad_norm": 1.00196373462677, + "learning_rate": 9.99911396631905e-06, + "loss": 0.826, + "step": 544 + }, + { + "epoch": 0.029383221910718137, + "grad_norm": 0.9776156544685364, + "learning_rate": 9.999109970800358e-06, + "loss": 0.8176, + "step": 545 + }, + { + "epoch": 0.029437136079361655, + "grad_norm": 1.0503387451171875, + "learning_rate": 9.99910596629394e-06, + "loss": 0.8617, + "step": 546 + }, + { + "epoch": 0.029491050248005177, + "grad_norm": 0.9195687174797058, + "learning_rate": 9.999101952799805e-06, + "loss": 0.8224, + "step": 547 + }, + { + "epoch": 0.029544964416648695, + "grad_norm": 
0.8746809959411621, + "learning_rate": 9.999097930317959e-06, + "loss": 0.8407, + "step": 548 + }, + { + "epoch": 0.029598878585292216, + "grad_norm": 0.9035898447036743, + "learning_rate": 9.999093898848407e-06, + "loss": 0.8344, + "step": 549 + }, + { + "epoch": 0.029652792753935734, + "grad_norm": 0.8764795064926147, + "learning_rate": 9.99908985839116e-06, + "loss": 0.8323, + "step": 550 + }, + { + "epoch": 0.029706706922579255, + "grad_norm": 0.9654614329338074, + "learning_rate": 9.999085808946224e-06, + "loss": 0.8696, + "step": 551 + }, + { + "epoch": 0.029760621091222773, + "grad_norm": 1.1295796632766724, + "learning_rate": 9.999081750513606e-06, + "loss": 0.9608, + "step": 552 + }, + { + "epoch": 0.02981453525986629, + "grad_norm": 0.9591107368469238, + "learning_rate": 9.999077683093313e-06, + "loss": 0.8762, + "step": 553 + }, + { + "epoch": 0.029868449428509813, + "grad_norm": 0.8287899494171143, + "learning_rate": 9.999073606685353e-06, + "loss": 0.7265, + "step": 554 + }, + { + "epoch": 0.02992236359715333, + "grad_norm": 0.9429282546043396, + "learning_rate": 9.99906952128973e-06, + "loss": 0.8835, + "step": 555 + }, + { + "epoch": 0.029976277765796852, + "grad_norm": 0.9617370963096619, + "learning_rate": 9.999065426906459e-06, + "loss": 0.9138, + "step": 556 + }, + { + "epoch": 0.03003019193444037, + "grad_norm": 1.2346372604370117, + "learning_rate": 9.999061323535538e-06, + "loss": 0.831, + "step": 557 + }, + { + "epoch": 0.03008410610308389, + "grad_norm": 1.2413623332977295, + "learning_rate": 9.999057211176982e-06, + "loss": 1.0211, + "step": 558 + }, + { + "epoch": 0.03013802027172741, + "grad_norm": 0.98906010389328, + "learning_rate": 9.999053089830794e-06, + "loss": 0.7821, + "step": 559 + }, + { + "epoch": 0.03019193444037093, + "grad_norm": 0.96706622838974, + "learning_rate": 9.999048959496983e-06, + "loss": 0.8593, + "step": 560 + }, + { + "epoch": 0.03024584860901445, + "grad_norm": 0.9400071501731873, + "learning_rate": 
9.999044820175556e-06, + "loss": 0.8731, + "step": 561 + }, + { + "epoch": 0.03029976277765797, + "grad_norm": 1.1276499032974243, + "learning_rate": 9.999040671866522e-06, + "loss": 0.86, + "step": 562 + }, + { + "epoch": 0.030353676946301488, + "grad_norm": 0.8859087228775024, + "learning_rate": 9.999036514569885e-06, + "loss": 0.8274, + "step": 563 + }, + { + "epoch": 0.030407591114945006, + "grad_norm": 1.1617575883865356, + "learning_rate": 9.999032348285656e-06, + "loss": 1.0519, + "step": 564 + }, + { + "epoch": 0.030461505283588527, + "grad_norm": 0.9717594385147095, + "learning_rate": 9.99902817301384e-06, + "loss": 0.9276, + "step": 565 + }, + { + "epoch": 0.030515419452232045, + "grad_norm": 1.000722050666809, + "learning_rate": 9.999023988754446e-06, + "loss": 0.8714, + "step": 566 + }, + { + "epoch": 0.030569333620875567, + "grad_norm": 1.1744625568389893, + "learning_rate": 9.999019795507481e-06, + "loss": 1.0087, + "step": 567 + }, + { + "epoch": 0.030623247789519085, + "grad_norm": 1.0199978351593018, + "learning_rate": 9.999015593272953e-06, + "loss": 0.8537, + "step": 568 + }, + { + "epoch": 0.030677161958162606, + "grad_norm": 0.9232216477394104, + "learning_rate": 9.999011382050869e-06, + "loss": 0.8488, + "step": 569 + }, + { + "epoch": 0.030731076126806124, + "grad_norm": 0.9905959367752075, + "learning_rate": 9.99900716184124e-06, + "loss": 0.9048, + "step": 570 + }, + { + "epoch": 0.030784990295449646, + "grad_norm": 0.9921644330024719, + "learning_rate": 9.999002932644066e-06, + "loss": 0.9294, + "step": 571 + }, + { + "epoch": 0.030838904464093164, + "grad_norm": 1.1583740711212158, + "learning_rate": 9.99899869445936e-06, + "loss": 0.727, + "step": 572 + }, + { + "epoch": 0.03089281863273668, + "grad_norm": 0.906736433506012, + "learning_rate": 9.998994447287127e-06, + "loss": 0.7889, + "step": 573 + }, + { + "epoch": 0.030946732801380203, + "grad_norm": 0.9060770869255066, + "learning_rate": 9.998990191127379e-06, + "loss": 0.8493, + 
"step": 574 + }, + { + "epoch": 0.03100064697002372, + "grad_norm": 0.9094041585922241, + "learning_rate": 9.99898592598012e-06, + "loss": 0.8604, + "step": 575 + }, + { + "epoch": 0.031054561138667242, + "grad_norm": 1.0964977741241455, + "learning_rate": 9.998981651845358e-06, + "loss": 0.8481, + "step": 576 + }, + { + "epoch": 0.03110847530731076, + "grad_norm": 0.9509627223014832, + "learning_rate": 9.998977368723102e-06, + "loss": 0.8601, + "step": 577 + }, + { + "epoch": 0.03116238947595428, + "grad_norm": 1.0108642578125, + "learning_rate": 9.998973076613359e-06, + "loss": 0.9076, + "step": 578 + }, + { + "epoch": 0.0312163036445978, + "grad_norm": 1.0268129110336304, + "learning_rate": 9.998968775516136e-06, + "loss": 0.8273, + "step": 579 + }, + { + "epoch": 0.03127021781324132, + "grad_norm": 0.968941867351532, + "learning_rate": 9.99896446543144e-06, + "loss": 0.8859, + "step": 580 + }, + { + "epoch": 0.03132413198188484, + "grad_norm": 0.936779260635376, + "learning_rate": 9.998960146359283e-06, + "loss": 0.8589, + "step": 581 + }, + { + "epoch": 0.03137804615052836, + "grad_norm": 0.9675167202949524, + "learning_rate": 9.998955818299667e-06, + "loss": 0.973, + "step": 582 + }, + { + "epoch": 0.03143196031917188, + "grad_norm": 0.9475553035736084, + "learning_rate": 9.998951481252604e-06, + "loss": 0.8936, + "step": 583 + }, + { + "epoch": 0.031485874487815396, + "grad_norm": 0.9130968451499939, + "learning_rate": 9.9989471352181e-06, + "loss": 0.7668, + "step": 584 + }, + { + "epoch": 0.031539788656458914, + "grad_norm": 0.8890071511268616, + "learning_rate": 9.998942780196164e-06, + "loss": 0.8971, + "step": 585 + }, + { + "epoch": 0.03159370282510244, + "grad_norm": 0.9298738837242126, + "learning_rate": 9.998938416186803e-06, + "loss": 0.9313, + "step": 586 + }, + { + "epoch": 0.03164761699374596, + "grad_norm": 1.0683361291885376, + "learning_rate": 9.998934043190025e-06, + "loss": 0.9018, + "step": 587 + }, + { + "epoch": 0.031701531162389475, + 
"grad_norm": 0.939253568649292, + "learning_rate": 9.99892966120584e-06, + "loss": 0.9119, + "step": 588 + }, + { + "epoch": 0.03175544533103299, + "grad_norm": 0.9245349764823914, + "learning_rate": 9.99892527023425e-06, + "loss": 0.9258, + "step": 589 + }, + { + "epoch": 0.03180935949967652, + "grad_norm": 0.9318797588348389, + "learning_rate": 9.998920870275267e-06, + "loss": 0.9557, + "step": 590 + }, + { + "epoch": 0.031863273668320036, + "grad_norm": 0.8909592628479004, + "learning_rate": 9.998916461328899e-06, + "loss": 0.8122, + "step": 591 + }, + { + "epoch": 0.031917187836963554, + "grad_norm": 1.0637080669403076, + "learning_rate": 9.998912043395154e-06, + "loss": 0.9517, + "step": 592 + }, + { + "epoch": 0.03197110200560707, + "grad_norm": 0.881934642791748, + "learning_rate": 9.99890761647404e-06, + "loss": 0.8729, + "step": 593 + }, + { + "epoch": 0.032025016174250596, + "grad_norm": 0.8882094025611877, + "learning_rate": 9.998903180565562e-06, + "loss": 0.7943, + "step": 594 + }, + { + "epoch": 0.032078930342894114, + "grad_norm": 0.965085506439209, + "learning_rate": 9.99889873566973e-06, + "loss": 0.8894, + "step": 595 + }, + { + "epoch": 0.03213284451153763, + "grad_norm": 0.9679432511329651, + "learning_rate": 9.998894281786556e-06, + "loss": 0.854, + "step": 596 + }, + { + "epoch": 0.03218675868018115, + "grad_norm": 1.4454354047775269, + "learning_rate": 9.998889818916043e-06, + "loss": 0.9944, + "step": 597 + }, + { + "epoch": 0.03224067284882467, + "grad_norm": 0.9369311928749084, + "learning_rate": 9.998885347058198e-06, + "loss": 0.8699, + "step": 598 + }, + { + "epoch": 0.03229458701746819, + "grad_norm": 0.9014303088188171, + "learning_rate": 9.998880866213033e-06, + "loss": 0.8735, + "step": 599 + }, + { + "epoch": 0.03234850118611171, + "grad_norm": 0.989251971244812, + "learning_rate": 9.998876376380555e-06, + "loss": 0.8872, + "step": 600 + }, + { + "epoch": 0.03240241535475523, + "grad_norm": 1.0256885290145874, + "learning_rate": 
9.99887187756077e-06, + "loss": 0.8787, + "step": 601 + }, + { + "epoch": 0.03245632952339875, + "grad_norm": 0.9560148119926453, + "learning_rate": 9.998867369753688e-06, + "loss": 0.8301, + "step": 602 + }, + { + "epoch": 0.03251024369204227, + "grad_norm": 1.044754147529602, + "learning_rate": 9.998862852959316e-06, + "loss": 0.9286, + "step": 603 + }, + { + "epoch": 0.03256415786068579, + "grad_norm": 0.8769629597663879, + "learning_rate": 9.998858327177665e-06, + "loss": 0.7927, + "step": 604 + }, + { + "epoch": 0.03261807202932931, + "grad_norm": 0.9217430949211121, + "learning_rate": 9.99885379240874e-06, + "loss": 0.8327, + "step": 605 + }, + { + "epoch": 0.032671986197972826, + "grad_norm": 0.8202590942382812, + "learning_rate": 9.99884924865255e-06, + "loss": 0.7269, + "step": 606 + }, + { + "epoch": 0.032725900366616344, + "grad_norm": 0.9598796367645264, + "learning_rate": 9.998844695909102e-06, + "loss": 0.9329, + "step": 607 + }, + { + "epoch": 0.03277981453525987, + "grad_norm": 1.1016643047332764, + "learning_rate": 9.998840134178407e-06, + "loss": 0.9836, + "step": 608 + }, + { + "epoch": 0.032833728703903386, + "grad_norm": 0.9639281630516052, + "learning_rate": 9.998835563460471e-06, + "loss": 0.8475, + "step": 609 + }, + { + "epoch": 0.032887642872546904, + "grad_norm": 0.9266204833984375, + "learning_rate": 9.998830983755304e-06, + "loss": 0.7307, + "step": 610 + }, + { + "epoch": 0.03294155704119042, + "grad_norm": 0.9282877445220947, + "learning_rate": 9.99882639506291e-06, + "loss": 0.8163, + "step": 611 + }, + { + "epoch": 0.03299547120983395, + "grad_norm": 0.8939738869667053, + "learning_rate": 9.998821797383302e-06, + "loss": 0.6902, + "step": 612 + }, + { + "epoch": 0.033049385378477465, + "grad_norm": 0.9041041731834412, + "learning_rate": 9.998817190716488e-06, + "loss": 0.8735, + "step": 613 + }, + { + "epoch": 0.03310329954712098, + "grad_norm": 0.9973318576812744, + "learning_rate": 9.998812575062473e-06, + "loss": 0.9017, + 
"step": 614 + }, + { + "epoch": 0.0331572137157645, + "grad_norm": 1.0416412353515625, + "learning_rate": 9.998807950421268e-06, + "loss": 0.9293, + "step": 615 + }, + { + "epoch": 0.03321112788440802, + "grad_norm": 0.8686584234237671, + "learning_rate": 9.998803316792882e-06, + "loss": 0.8585, + "step": 616 + }, + { + "epoch": 0.033265042053051544, + "grad_norm": 0.9907833337783813, + "learning_rate": 9.998798674177319e-06, + "loss": 0.9264, + "step": 617 + }, + { + "epoch": 0.03331895622169506, + "grad_norm": 0.9927001595497131, + "learning_rate": 9.998794022574592e-06, + "loss": 0.895, + "step": 618 + }, + { + "epoch": 0.03337287039033858, + "grad_norm": 0.9314623475074768, + "learning_rate": 9.998789361984707e-06, + "loss": 0.8353, + "step": 619 + }, + { + "epoch": 0.0334267845589821, + "grad_norm": 0.9768248796463013, + "learning_rate": 9.998784692407673e-06, + "loss": 0.8917, + "step": 620 + }, + { + "epoch": 0.03348069872762562, + "grad_norm": 0.9487942457199097, + "learning_rate": 9.998780013843498e-06, + "loss": 0.9022, + "step": 621 + }, + { + "epoch": 0.03353461289626914, + "grad_norm": 1.0376895666122437, + "learning_rate": 9.99877532629219e-06, + "loss": 0.7692, + "step": 622 + }, + { + "epoch": 0.03358852706491266, + "grad_norm": 1.021345853805542, + "learning_rate": 9.99877062975376e-06, + "loss": 1.0386, + "step": 623 + }, + { + "epoch": 0.033642441233556176, + "grad_norm": 0.9979421496391296, + "learning_rate": 9.998765924228214e-06, + "loss": 0.9209, + "step": 624 + }, + { + "epoch": 0.0336963554021997, + "grad_norm": 0.8552166819572449, + "learning_rate": 9.998761209715559e-06, + "loss": 0.8765, + "step": 625 + }, + { + "epoch": 0.03375026957084322, + "grad_norm": 0.9737898707389832, + "learning_rate": 9.998756486215809e-06, + "loss": 0.7459, + "step": 626 + }, + { + "epoch": 0.03380418373948674, + "grad_norm": 1.1067259311676025, + "learning_rate": 9.998751753728967e-06, + "loss": 0.8582, + "step": 627 + }, + { + "epoch": 0.033858097908130255, 
+ "grad_norm": 1.0689613819122314, + "learning_rate": 9.998747012255044e-06, + "loss": 0.8523, + "step": 628 + }, + { + "epoch": 0.03391201207677377, + "grad_norm": 1.1880419254302979, + "learning_rate": 9.998742261794048e-06, + "loss": 0.9085, + "step": 629 + }, + { + "epoch": 0.0339659262454173, + "grad_norm": 0.9569217562675476, + "learning_rate": 9.998737502345987e-06, + "loss": 0.9112, + "step": 630 + }, + { + "epoch": 0.034019840414060816, + "grad_norm": 0.9955928921699524, + "learning_rate": 9.99873273391087e-06, + "loss": 0.9166, + "step": 631 + }, + { + "epoch": 0.034073754582704334, + "grad_norm": 0.8906963467597961, + "learning_rate": 9.998727956488708e-06, + "loss": 0.882, + "step": 632 + }, + { + "epoch": 0.03412766875134785, + "grad_norm": 0.9241589307785034, + "learning_rate": 9.998723170079506e-06, + "loss": 0.8488, + "step": 633 + }, + { + "epoch": 0.03418158291999138, + "grad_norm": 0.9666005969047546, + "learning_rate": 9.998718374683271e-06, + "loss": 0.8432, + "step": 634 + }, + { + "epoch": 0.034235497088634895, + "grad_norm": 0.9036918878555298, + "learning_rate": 9.998713570300018e-06, + "loss": 0.7979, + "step": 635 + }, + { + "epoch": 0.03428941125727841, + "grad_norm": 0.8946508765220642, + "learning_rate": 9.998708756929751e-06, + "loss": 0.8854, + "step": 636 + }, + { + "epoch": 0.03434332542592193, + "grad_norm": 1.0300164222717285, + "learning_rate": 9.99870393457248e-06, + "loss": 0.9116, + "step": 637 + }, + { + "epoch": 0.03439723959456545, + "grad_norm": 1.0635035037994385, + "learning_rate": 9.998699103228214e-06, + "loss": 0.9138, + "step": 638 + }, + { + "epoch": 0.03445115376320897, + "grad_norm": 1.0362621545791626, + "learning_rate": 9.998694262896962e-06, + "loss": 1.0177, + "step": 639 + }, + { + "epoch": 0.03450506793185249, + "grad_norm": 0.9081454873085022, + "learning_rate": 9.99868941357873e-06, + "loss": 0.7802, + "step": 640 + }, + { + "epoch": 0.03455898210049601, + "grad_norm": 0.9943915605545044, + 
"learning_rate": 9.998684555273529e-06, + "loss": 0.9356, + "step": 641 + }, + { + "epoch": 0.03461289626913953, + "grad_norm": 0.9647786021232605, + "learning_rate": 9.998679687981367e-06, + "loss": 0.741, + "step": 642 + }, + { + "epoch": 0.03466681043778305, + "grad_norm": 0.9655315279960632, + "learning_rate": 9.998674811702255e-06, + "loss": 0.8644, + "step": 643 + }, + { + "epoch": 0.03472072460642657, + "grad_norm": 0.9162091612815857, + "learning_rate": 9.998669926436197e-06, + "loss": 0.8383, + "step": 644 + }, + { + "epoch": 0.03477463877507009, + "grad_norm": 0.9509754776954651, + "learning_rate": 9.998665032183207e-06, + "loss": 0.8066, + "step": 645 + }, + { + "epoch": 0.034828552943713606, + "grad_norm": 1.0545740127563477, + "learning_rate": 9.998660128943292e-06, + "loss": 0.8455, + "step": 646 + }, + { + "epoch": 0.03488246711235713, + "grad_norm": 1.0928760766983032, + "learning_rate": 9.998655216716458e-06, + "loss": 0.8708, + "step": 647 + }, + { + "epoch": 0.03493638128100065, + "grad_norm": 0.9743762016296387, + "learning_rate": 9.998650295502717e-06, + "loss": 0.878, + "step": 648 + }, + { + "epoch": 0.03499029544964417, + "grad_norm": 1.016741156578064, + "learning_rate": 9.998645365302077e-06, + "loss": 0.867, + "step": 649 + }, + { + "epoch": 0.035044209618287685, + "grad_norm": 1.125252366065979, + "learning_rate": 9.998640426114548e-06, + "loss": 0.9443, + "step": 650 + }, + { + "epoch": 0.0350981237869312, + "grad_norm": 0.9555762410163879, + "learning_rate": 9.998635477940135e-06, + "loss": 0.8353, + "step": 651 + }, + { + "epoch": 0.03515203795557473, + "grad_norm": 0.930173397064209, + "learning_rate": 9.998630520778851e-06, + "loss": 0.8383, + "step": 652 + }, + { + "epoch": 0.035205952124218245, + "grad_norm": 1.1592127084732056, + "learning_rate": 9.998625554630704e-06, + "loss": 0.9708, + "step": 653 + }, + { + "epoch": 0.03525986629286176, + "grad_norm": 0.9333894848823547, + "learning_rate": 9.998620579495701e-06, + "loss": 
0.9055, + "step": 654 + }, + { + "epoch": 0.03531378046150528, + "grad_norm": 0.9495646357536316, + "learning_rate": 9.998615595373853e-06, + "loss": 0.7993, + "step": 655 + }, + { + "epoch": 0.035367694630148806, + "grad_norm": 1.0919233560562134, + "learning_rate": 9.99861060226517e-06, + "loss": 0.8852, + "step": 656 + }, + { + "epoch": 0.035421608798792324, + "grad_norm": 0.907940685749054, + "learning_rate": 9.998605600169657e-06, + "loss": 0.8294, + "step": 657 + }, + { + "epoch": 0.03547552296743584, + "grad_norm": 1.0423756837844849, + "learning_rate": 9.998600589087328e-06, + "loss": 0.8758, + "step": 658 + }, + { + "epoch": 0.03552943713607936, + "grad_norm": 1.0387269258499146, + "learning_rate": 9.998595569018186e-06, + "loss": 0.9099, + "step": 659 + }, + { + "epoch": 0.03558335130472288, + "grad_norm": 0.9186104536056519, + "learning_rate": 9.998590539962245e-06, + "loss": 0.9025, + "step": 660 + }, + { + "epoch": 0.0356372654733664, + "grad_norm": 1.0173289775848389, + "learning_rate": 9.998585501919514e-06, + "loss": 0.8468, + "step": 661 + }, + { + "epoch": 0.03569117964200992, + "grad_norm": 0.9579570889472961, + "learning_rate": 9.998580454889996e-06, + "loss": 0.8542, + "step": 662 + }, + { + "epoch": 0.03574509381065344, + "grad_norm": 1.093515396118164, + "learning_rate": 9.99857539887371e-06, + "loss": 0.8932, + "step": 663 + }, + { + "epoch": 0.03579900797929696, + "grad_norm": 1.0651243925094604, + "learning_rate": 9.998570333870656e-06, + "loss": 0.8822, + "step": 664 + }, + { + "epoch": 0.03585292214794048, + "grad_norm": 0.973278284072876, + "learning_rate": 9.998565259880845e-06, + "loss": 0.8724, + "step": 665 + }, + { + "epoch": 0.035906836316584, + "grad_norm": 0.961321234703064, + "learning_rate": 9.998560176904291e-06, + "loss": 0.947, + "step": 666 + }, + { + "epoch": 0.03596075048522752, + "grad_norm": 1.0216654539108276, + "learning_rate": 9.998555084940999e-06, + "loss": 0.8528, + "step": 667 + }, + { + "epoch": 
0.036014664653871035, + "grad_norm": 0.9917817711830139, + "learning_rate": 9.99854998399098e-06, + "loss": 0.8608, + "step": 668 + }, + { + "epoch": 0.03606857882251455, + "grad_norm": 1.0164326429367065, + "learning_rate": 9.998544874054243e-06, + "loss": 0.8752, + "step": 669 + }, + { + "epoch": 0.03612249299115808, + "grad_norm": 0.9181317687034607, + "learning_rate": 9.998539755130793e-06, + "loss": 0.8032, + "step": 670 + }, + { + "epoch": 0.036176407159801596, + "grad_norm": 1.0100011825561523, + "learning_rate": 9.998534627220646e-06, + "loss": 0.9205, + "step": 671 + }, + { + "epoch": 0.036230321328445114, + "grad_norm": 0.9306463599205017, + "learning_rate": 9.998529490323807e-06, + "loss": 0.8209, + "step": 672 + }, + { + "epoch": 0.03628423549708863, + "grad_norm": 1.8988754749298096, + "learning_rate": 9.998524344440286e-06, + "loss": 0.8455, + "step": 673 + }, + { + "epoch": 0.03633814966573216, + "grad_norm": 0.9742317795753479, + "learning_rate": 9.998519189570091e-06, + "loss": 0.8733, + "step": 674 + }, + { + "epoch": 0.036392063834375675, + "grad_norm": 0.9334224462509155, + "learning_rate": 9.998514025713234e-06, + "loss": 0.8761, + "step": 675 + }, + { + "epoch": 0.03644597800301919, + "grad_norm": 0.9729838371276855, + "learning_rate": 9.998508852869724e-06, + "loss": 0.8916, + "step": 676 + }, + { + "epoch": 0.03649989217166271, + "grad_norm": 0.9721505641937256, + "learning_rate": 9.998503671039568e-06, + "loss": 0.8735, + "step": 677 + }, + { + "epoch": 0.036553806340306236, + "grad_norm": 0.9600850939750671, + "learning_rate": 9.998498480222775e-06, + "loss": 0.9157, + "step": 678 + }, + { + "epoch": 0.036607720508949754, + "grad_norm": 0.9010732173919678, + "learning_rate": 9.998493280419358e-06, + "loss": 0.9215, + "step": 679 + }, + { + "epoch": 0.03666163467759327, + "grad_norm": 0.8708087801933289, + "learning_rate": 9.998488071629324e-06, + "loss": 0.7218, + "step": 680 + }, + { + "epoch": 0.03671554884623679, + "grad_norm": 
0.9739180207252502, + "learning_rate": 9.998482853852682e-06, + "loss": 0.8845, + "step": 681 + }, + { + "epoch": 0.03676946301488031, + "grad_norm": 0.9823595881462097, + "learning_rate": 9.998477627089443e-06, + "loss": 0.896, + "step": 682 + }, + { + "epoch": 0.03682337718352383, + "grad_norm": 0.9629859328269958, + "learning_rate": 9.998472391339612e-06, + "loss": 0.8636, + "step": 683 + }, + { + "epoch": 0.03687729135216735, + "grad_norm": 0.8644251823425293, + "learning_rate": 9.998467146603206e-06, + "loss": 0.9124, + "step": 684 + }, + { + "epoch": 0.03693120552081087, + "grad_norm": 0.8987632989883423, + "learning_rate": 9.99846189288023e-06, + "loss": 0.801, + "step": 685 + }, + { + "epoch": 0.036985119689454386, + "grad_norm": 0.9017630219459534, + "learning_rate": 9.99845663017069e-06, + "loss": 0.8675, + "step": 686 + }, + { + "epoch": 0.03703903385809791, + "grad_norm": 0.8905850648880005, + "learning_rate": 9.998451358474603e-06, + "loss": 0.8512, + "step": 687 + }, + { + "epoch": 0.03709294802674143, + "grad_norm": 0.9807800650596619, + "learning_rate": 9.998446077791972e-06, + "loss": 0.9258, + "step": 688 + }, + { + "epoch": 0.03714686219538495, + "grad_norm": 0.8916336894035339, + "learning_rate": 9.99844078812281e-06, + "loss": 0.8236, + "step": 689 + }, + { + "epoch": 0.037200776364028465, + "grad_norm": 0.9330187439918518, + "learning_rate": 9.998435489467126e-06, + "loss": 0.7812, + "step": 690 + }, + { + "epoch": 0.03725469053267198, + "grad_norm": 0.9859142899513245, + "learning_rate": 9.99843018182493e-06, + "loss": 0.8699, + "step": 691 + }, + { + "epoch": 0.03730860470131551, + "grad_norm": 0.9277002215385437, + "learning_rate": 9.998424865196228e-06, + "loss": 0.9276, + "step": 692 + }, + { + "epoch": 0.037362518869959026, + "grad_norm": 0.9764281511306763, + "learning_rate": 9.998419539581034e-06, + "loss": 0.9482, + "step": 693 + }, + { + "epoch": 0.037416433038602544, + "grad_norm": 1.0108616352081299, + "learning_rate": 
9.998414204979357e-06, + "loss": 0.8582, + "step": 694 + }, + { + "epoch": 0.03747034720724606, + "grad_norm": 1.2767362594604492, + "learning_rate": 9.998408861391202e-06, + "loss": 0.7833, + "step": 695 + }, + { + "epoch": 0.03752426137588959, + "grad_norm": 0.8874560594558716, + "learning_rate": 9.998403508816585e-06, + "loss": 0.8935, + "step": 696 + }, + { + "epoch": 0.037578175544533104, + "grad_norm": 0.8549458980560303, + "learning_rate": 9.998398147255511e-06, + "loss": 0.7747, + "step": 697 + }, + { + "epoch": 0.03763208971317662, + "grad_norm": 0.9971988201141357, + "learning_rate": 9.998392776707993e-06, + "loss": 0.753, + "step": 698 + }, + { + "epoch": 0.03768600388182014, + "grad_norm": 0.9822113513946533, + "learning_rate": 9.998387397174037e-06, + "loss": 0.9121, + "step": 699 + }, + { + "epoch": 0.037739918050463665, + "grad_norm": 0.996151864528656, + "learning_rate": 9.998382008653656e-06, + "loss": 0.9356, + "step": 700 + }, + { + "epoch": 0.03779383221910718, + "grad_norm": 1.7505156993865967, + "learning_rate": 9.998376611146857e-06, + "loss": 0.8351, + "step": 701 + }, + { + "epoch": 0.0378477463877507, + "grad_norm": 1.070356011390686, + "learning_rate": 9.998371204653651e-06, + "loss": 0.9153, + "step": 702 + }, + { + "epoch": 0.03790166055639422, + "grad_norm": 0.9383741617202759, + "learning_rate": 9.998365789174048e-06, + "loss": 0.8904, + "step": 703 + }, + { + "epoch": 0.03795557472503774, + "grad_norm": 0.8444882035255432, + "learning_rate": 9.998360364708058e-06, + "loss": 0.8243, + "step": 704 + }, + { + "epoch": 0.03800948889368126, + "grad_norm": 1.0012257099151611, + "learning_rate": 9.99835493125569e-06, + "loss": 0.9439, + "step": 705 + }, + { + "epoch": 0.03806340306232478, + "grad_norm": 0.9745193719863892, + "learning_rate": 9.998349488816954e-06, + "loss": 0.8667, + "step": 706 + }, + { + "epoch": 0.0381173172309683, + "grad_norm": 0.8363852500915527, + "learning_rate": 9.998344037391859e-06, + "loss": 0.8082, + "step": 
707 + }, + { + "epoch": 0.038171231399611816, + "grad_norm": 0.9389918446540833, + "learning_rate": 9.998338576980417e-06, + "loss": 0.8113, + "step": 708 + }, + { + "epoch": 0.03822514556825534, + "grad_norm": 0.9216110110282898, + "learning_rate": 9.998333107582635e-06, + "loss": 0.8179, + "step": 709 + }, + { + "epoch": 0.03827905973689886, + "grad_norm": 1.0292471647262573, + "learning_rate": 9.998327629198526e-06, + "loss": 0.8605, + "step": 710 + }, + { + "epoch": 0.03833297390554238, + "grad_norm": 0.9812708497047424, + "learning_rate": 9.998322141828097e-06, + "loss": 0.9279, + "step": 711 + }, + { + "epoch": 0.038386888074185894, + "grad_norm": 0.8186620473861694, + "learning_rate": 9.998316645471358e-06, + "loss": 0.7877, + "step": 712 + }, + { + "epoch": 0.03844080224282941, + "grad_norm": 1.034134864807129, + "learning_rate": 9.99831114012832e-06, + "loss": 0.9867, + "step": 713 + }, + { + "epoch": 0.03849471641147294, + "grad_norm": 1.1604938507080078, + "learning_rate": 9.998305625798993e-06, + "loss": 0.9134, + "step": 714 + }, + { + "epoch": 0.038548630580116455, + "grad_norm": 0.8452483415603638, + "learning_rate": 9.998300102483388e-06, + "loss": 0.8732, + "step": 715 + }, + { + "epoch": 0.03860254474875997, + "grad_norm": 0.8881269693374634, + "learning_rate": 9.998294570181512e-06, + "loss": 0.847, + "step": 716 + }, + { + "epoch": 0.03865645891740349, + "grad_norm": 0.8822013735771179, + "learning_rate": 9.998289028893375e-06, + "loss": 0.8404, + "step": 717 + }, + { + "epoch": 0.038710373086047016, + "grad_norm": 1.0011916160583496, + "learning_rate": 9.998283478618991e-06, + "loss": 0.8133, + "step": 718 + }, + { + "epoch": 0.038764287254690534, + "grad_norm": 1.0004018545150757, + "learning_rate": 9.998277919358367e-06, + "loss": 0.9556, + "step": 719 + }, + { + "epoch": 0.03881820142333405, + "grad_norm": 0.8176954984664917, + "learning_rate": 9.998272351111513e-06, + "loss": 0.7977, + "step": 720 + }, + { + "epoch": 0.03887211559197757, + 
"grad_norm": 0.9160690307617188, + "learning_rate": 9.99826677387844e-06, + "loss": 0.9239, + "step": 721 + }, + { + "epoch": 0.03892602976062109, + "grad_norm": 1.2158405780792236, + "learning_rate": 9.998261187659157e-06, + "loss": 0.9023, + "step": 722 + }, + { + "epoch": 0.03897994392926461, + "grad_norm": 0.9564448595046997, + "learning_rate": 9.998255592453674e-06, + "loss": 0.8585, + "step": 723 + }, + { + "epoch": 0.03903385809790813, + "grad_norm": 0.8902252316474915, + "learning_rate": 9.998249988262002e-06, + "loss": 0.8388, + "step": 724 + }, + { + "epoch": 0.03908777226655165, + "grad_norm": 0.8738620281219482, + "learning_rate": 9.998244375084152e-06, + "loss": 0.9545, + "step": 725 + }, + { + "epoch": 0.03914168643519517, + "grad_norm": 0.9670735001564026, + "learning_rate": 9.99823875292013e-06, + "loss": 0.8335, + "step": 726 + }, + { + "epoch": 0.03919560060383869, + "grad_norm": 0.8719429969787598, + "learning_rate": 9.998233121769952e-06, + "loss": 0.8546, + "step": 727 + }, + { + "epoch": 0.03924951477248221, + "grad_norm": 1.318429708480835, + "learning_rate": 9.998227481633622e-06, + "loss": 1.0658, + "step": 728 + }, + { + "epoch": 0.03930342894112573, + "grad_norm": 0.962630569934845, + "learning_rate": 9.998221832511155e-06, + "loss": 0.9049, + "step": 729 + }, + { + "epoch": 0.039357343109769245, + "grad_norm": 0.9639857411384583, + "learning_rate": 9.998216174402558e-06, + "loss": 0.9114, + "step": 730 + }, + { + "epoch": 0.03941125727841277, + "grad_norm": 1.1621571779251099, + "learning_rate": 9.998210507307843e-06, + "loss": 0.8776, + "step": 731 + }, + { + "epoch": 0.03946517144705629, + "grad_norm": 1.170089840888977, + "learning_rate": 9.998204831227019e-06, + "loss": 0.9928, + "step": 732 + }, + { + "epoch": 0.039519085615699806, + "grad_norm": 0.8257297873497009, + "learning_rate": 9.998199146160098e-06, + "loss": 0.7885, + "step": 733 + }, + { + "epoch": 0.039572999784343324, + "grad_norm": 0.8887513279914856, + "learning_rate": 
9.998193452107088e-06, + "loss": 0.8389, + "step": 734 + }, + { + "epoch": 0.03962691395298684, + "grad_norm": 0.9321185350418091, + "learning_rate": 9.998187749068001e-06, + "loss": 0.9083, + "step": 735 + }, + { + "epoch": 0.03968082812163037, + "grad_norm": 0.9926772713661194, + "learning_rate": 9.998182037042847e-06, + "loss": 0.9102, + "step": 736 + }, + { + "epoch": 0.039734742290273885, + "grad_norm": 1.0760009288787842, + "learning_rate": 9.998176316031634e-06, + "loss": 0.7781, + "step": 737 + }, + { + "epoch": 0.0397886564589174, + "grad_norm": 1.0998133420944214, + "learning_rate": 9.998170586034376e-06, + "loss": 0.9725, + "step": 738 + }, + { + "epoch": 0.03984257062756092, + "grad_norm": 0.9367475509643555, + "learning_rate": 9.99816484705108e-06, + "loss": 0.8277, + "step": 739 + }, + { + "epoch": 0.039896484796204446, + "grad_norm": 0.942954957485199, + "learning_rate": 9.998159099081758e-06, + "loss": 0.8542, + "step": 740 + }, + { + "epoch": 0.039950398964847963, + "grad_norm": 0.9841166138648987, + "learning_rate": 9.998153342126421e-06, + "loss": 0.9179, + "step": 741 + }, + { + "epoch": 0.04000431313349148, + "grad_norm": 0.9215245246887207, + "learning_rate": 9.998147576185077e-06, + "loss": 0.8899, + "step": 742 + }, + { + "epoch": 0.040058227302135, + "grad_norm": 1.0368192195892334, + "learning_rate": 9.998141801257739e-06, + "loss": 0.9828, + "step": 743 + }, + { + "epoch": 0.04011214147077852, + "grad_norm": 0.9696660041809082, + "learning_rate": 9.998136017344416e-06, + "loss": 0.9431, + "step": 744 + }, + { + "epoch": 0.04016605563942204, + "grad_norm": 1.111257791519165, + "learning_rate": 9.998130224445117e-06, + "loss": 0.9666, + "step": 745 + }, + { + "epoch": 0.04021996980806556, + "grad_norm": 0.9260644316673279, + "learning_rate": 9.998124422559856e-06, + "loss": 0.8941, + "step": 746 + }, + { + "epoch": 0.04027388397670908, + "grad_norm": 0.8622020483016968, + "learning_rate": 9.99811861168864e-06, + "loss": 0.8148, + "step": 
747 + }, + { + "epoch": 0.040327798145352596, + "grad_norm": 0.8767471313476562, + "learning_rate": 9.998112791831483e-06, + "loss": 0.7093, + "step": 748 + }, + { + "epoch": 0.04038171231399612, + "grad_norm": 0.902917206287384, + "learning_rate": 9.998106962988391e-06, + "loss": 0.7677, + "step": 749 + }, + { + "epoch": 0.04043562648263964, + "grad_norm": 1.351694941520691, + "learning_rate": 9.998101125159377e-06, + "loss": 1.0382, + "step": 750 + }, + { + "epoch": 0.04048954065128316, + "grad_norm": 0.8547930121421814, + "learning_rate": 9.998095278344452e-06, + "loss": 0.7974, + "step": 751 + }, + { + "epoch": 0.040543454819926675, + "grad_norm": 0.941149115562439, + "learning_rate": 9.998089422543626e-06, + "loss": 0.8518, + "step": 752 + }, + { + "epoch": 0.0405973689885702, + "grad_norm": 0.8671521544456482, + "learning_rate": 9.998083557756908e-06, + "loss": 0.8049, + "step": 753 + }, + { + "epoch": 0.04065128315721372, + "grad_norm": 0.9877942800521851, + "learning_rate": 9.998077683984311e-06, + "loss": 0.8874, + "step": 754 + }, + { + "epoch": 0.040705197325857236, + "grad_norm": 1.2130393981933594, + "learning_rate": 9.998071801225843e-06, + "loss": 0.9794, + "step": 755 + }, + { + "epoch": 0.040759111494500753, + "grad_norm": 0.9422823786735535, + "learning_rate": 9.998065909481518e-06, + "loss": 0.899, + "step": 756 + }, + { + "epoch": 0.04081302566314427, + "grad_norm": 0.9770492911338806, + "learning_rate": 9.998060008751343e-06, + "loss": 0.8434, + "step": 757 + }, + { + "epoch": 0.040866939831787796, + "grad_norm": 0.9227531552314758, + "learning_rate": 9.998054099035332e-06, + "loss": 0.8797, + "step": 758 + }, + { + "epoch": 0.040920854000431314, + "grad_norm": 1.0452102422714233, + "learning_rate": 9.998048180333492e-06, + "loss": 0.8702, + "step": 759 + }, + { + "epoch": 0.04097476816907483, + "grad_norm": 1.034125566482544, + "learning_rate": 9.998042252645837e-06, + "loss": 0.9041, + "step": 760 + }, + { + "epoch": 0.04102868233771835, + 
"grad_norm": 0.886029064655304, + "learning_rate": 9.998036315972375e-06, + "loss": 0.7805, + "step": 761 + }, + { + "epoch": 0.041082596506361875, + "grad_norm": 0.9845888614654541, + "learning_rate": 9.998030370313116e-06, + "loss": 0.9836, + "step": 762 + }, + { + "epoch": 0.04113651067500539, + "grad_norm": 0.9223973155021667, + "learning_rate": 9.998024415668075e-06, + "loss": 0.768, + "step": 763 + }, + { + "epoch": 0.04119042484364891, + "grad_norm": 1.0607362985610962, + "learning_rate": 9.99801845203726e-06, + "loss": 0.865, + "step": 764 + }, + { + "epoch": 0.04124433901229243, + "grad_norm": 0.9620907306671143, + "learning_rate": 9.998012479420683e-06, + "loss": 0.7645, + "step": 765 + }, + { + "epoch": 0.04129825318093595, + "grad_norm": 0.9490310549736023, + "learning_rate": 9.99800649781835e-06, + "loss": 0.9124, + "step": 766 + }, + { + "epoch": 0.04135216734957947, + "grad_norm": 0.9684557914733887, + "learning_rate": 9.99800050723028e-06, + "loss": 0.876, + "step": 767 + }, + { + "epoch": 0.04140608151822299, + "grad_norm": 0.9633080959320068, + "learning_rate": 9.997994507656476e-06, + "loss": 0.8976, + "step": 768 + }, + { + "epoch": 0.04145999568686651, + "grad_norm": 0.9495208263397217, + "learning_rate": 9.997988499096953e-06, + "loss": 0.9049, + "step": 769 + }, + { + "epoch": 0.041513909855510026, + "grad_norm": 1.0614326000213623, + "learning_rate": 9.997982481551721e-06, + "loss": 0.905, + "step": 770 + }, + { + "epoch": 0.04156782402415355, + "grad_norm": 0.820672869682312, + "learning_rate": 9.99797645502079e-06, + "loss": 0.8306, + "step": 771 + }, + { + "epoch": 0.04162173819279707, + "grad_norm": 0.9719771146774292, + "learning_rate": 9.997970419504171e-06, + "loss": 0.828, + "step": 772 + }, + { + "epoch": 0.041675652361440586, + "grad_norm": 0.893326997756958, + "learning_rate": 9.997964375001875e-06, + "loss": 0.8416, + "step": 773 + }, + { + "epoch": 0.041729566530084104, + "grad_norm": 0.858121395111084, + "learning_rate": 
9.997958321513915e-06, + "loss": 0.8779, + "step": 774 + }, + { + "epoch": 0.04178348069872762, + "grad_norm": 0.9703636765480042, + "learning_rate": 9.997952259040297e-06, + "loss": 0.8623, + "step": 775 + }, + { + "epoch": 0.04183739486737115, + "grad_norm": 0.9626398086547852, + "learning_rate": 9.997946187581039e-06, + "loss": 0.8309, + "step": 776 + }, + { + "epoch": 0.041891309036014665, + "grad_norm": 0.9132344722747803, + "learning_rate": 9.997940107136143e-06, + "loss": 0.8798, + "step": 777 + }, + { + "epoch": 0.04194522320465818, + "grad_norm": 0.9608821272850037, + "learning_rate": 9.997934017705629e-06, + "loss": 0.8764, + "step": 778 + }, + { + "epoch": 0.0419991373733017, + "grad_norm": 1.0852513313293457, + "learning_rate": 9.997927919289501e-06, + "loss": 0.8908, + "step": 779 + }, + { + "epoch": 0.042053051541945226, + "grad_norm": 0.9690573215484619, + "learning_rate": 9.997921811887774e-06, + "loss": 0.8556, + "step": 780 + }, + { + "epoch": 0.042106965710588744, + "grad_norm": 0.9107050895690918, + "learning_rate": 9.997915695500458e-06, + "loss": 0.9249, + "step": 781 + }, + { + "epoch": 0.04216087987923226, + "grad_norm": 1.029974102973938, + "learning_rate": 9.997909570127564e-06, + "loss": 0.8369, + "step": 782 + }, + { + "epoch": 0.04221479404787578, + "grad_norm": 0.8179258704185486, + "learning_rate": 9.997903435769101e-06, + "loss": 0.7729, + "step": 783 + }, + { + "epoch": 0.042268708216519305, + "grad_norm": 1.0664961338043213, + "learning_rate": 9.997897292425082e-06, + "loss": 0.8815, + "step": 784 + }, + { + "epoch": 0.04232262238516282, + "grad_norm": 0.9794465899467468, + "learning_rate": 9.997891140095519e-06, + "loss": 0.9244, + "step": 785 + }, + { + "epoch": 0.04237653655380634, + "grad_norm": 0.875953197479248, + "learning_rate": 9.99788497878042e-06, + "loss": 0.9191, + "step": 786 + }, + { + "epoch": 0.04243045072244986, + "grad_norm": 0.9880902767181396, + "learning_rate": 9.9978788084798e-06, + "loss": 0.8639, + "step": 
787 + }, + { + "epoch": 0.042484364891093376, + "grad_norm": 1.0391566753387451, + "learning_rate": 9.997872629193666e-06, + "loss": 0.9943, + "step": 788 + }, + { + "epoch": 0.0425382790597369, + "grad_norm": 0.9321290850639343, + "learning_rate": 9.997866440922033e-06, + "loss": 0.7809, + "step": 789 + }, + { + "epoch": 0.04259219322838042, + "grad_norm": 0.8898556232452393, + "learning_rate": 9.99786024366491e-06, + "loss": 0.9353, + "step": 790 + }, + { + "epoch": 0.04264610739702394, + "grad_norm": 1.1177983283996582, + "learning_rate": 9.997854037422306e-06, + "loss": 0.8157, + "step": 791 + }, + { + "epoch": 0.042700021565667455, + "grad_norm": 0.8821296691894531, + "learning_rate": 9.997847822194236e-06, + "loss": 0.8729, + "step": 792 + }, + { + "epoch": 0.04275393573431098, + "grad_norm": 0.8545325398445129, + "learning_rate": 9.997841597980709e-06, + "loss": 0.8415, + "step": 793 + }, + { + "epoch": 0.0428078499029545, + "grad_norm": 0.9313606023788452, + "learning_rate": 9.997835364781739e-06, + "loss": 0.8411, + "step": 794 + }, + { + "epoch": 0.042861764071598016, + "grad_norm": 0.9587781429290771, + "learning_rate": 9.997829122597332e-06, + "loss": 0.8086, + "step": 795 + }, + { + "epoch": 0.042915678240241534, + "grad_norm": 0.9708360433578491, + "learning_rate": 9.997822871427504e-06, + "loss": 0.8715, + "step": 796 + }, + { + "epoch": 0.04296959240888505, + "grad_norm": 0.8868080973625183, + "learning_rate": 9.997816611272265e-06, + "loss": 0.8549, + "step": 797 + }, + { + "epoch": 0.04302350657752858, + "grad_norm": 0.9147778153419495, + "learning_rate": 9.997810342131624e-06, + "loss": 0.7854, + "step": 798 + }, + { + "epoch": 0.043077420746172095, + "grad_norm": 0.9853960275650024, + "learning_rate": 9.997804064005596e-06, + "loss": 0.8243, + "step": 799 + }, + { + "epoch": 0.04313133491481561, + "grad_norm": 1.0076130628585815, + "learning_rate": 9.997797776894189e-06, + "loss": 0.9077, + "step": 800 + }, + { + "epoch": 0.04318524908345913, + 
"grad_norm": 0.9694076776504517, + "learning_rate": 9.997791480797417e-06, + "loss": 0.8767, + "step": 801 + }, + { + "epoch": 0.043239163252102655, + "grad_norm": 1.114001750946045, + "learning_rate": 9.99778517571529e-06, + "loss": 0.8211, + "step": 802 + }, + { + "epoch": 0.04329307742074617, + "grad_norm": 0.9701128005981445, + "learning_rate": 9.997778861647817e-06, + "loss": 0.9084, + "step": 803 + }, + { + "epoch": 0.04334699158938969, + "grad_norm": 0.868299126625061, + "learning_rate": 9.997772538595015e-06, + "loss": 0.7556, + "step": 804 + }, + { + "epoch": 0.04340090575803321, + "grad_norm": 0.9160446524620056, + "learning_rate": 9.997766206556888e-06, + "loss": 0.821, + "step": 805 + }, + { + "epoch": 0.043454819926676734, + "grad_norm": 0.934198260307312, + "learning_rate": 9.997759865533454e-06, + "loss": 0.9113, + "step": 806 + }, + { + "epoch": 0.04350873409532025, + "grad_norm": 0.8949079513549805, + "learning_rate": 9.997753515524722e-06, + "loss": 0.7821, + "step": 807 + }, + { + "epoch": 0.04356264826396377, + "grad_norm": 0.9035944938659668, + "learning_rate": 9.997747156530702e-06, + "loss": 0.8233, + "step": 808 + }, + { + "epoch": 0.04361656243260729, + "grad_norm": 0.9681552052497864, + "learning_rate": 9.99774078855141e-06, + "loss": 0.9241, + "step": 809 + }, + { + "epoch": 0.043670476601250806, + "grad_norm": 0.906092643737793, + "learning_rate": 9.99773441158685e-06, + "loss": 0.8948, + "step": 810 + }, + { + "epoch": 0.04372439076989433, + "grad_norm": 0.9229143261909485, + "learning_rate": 9.997728025637039e-06, + "loss": 0.8897, + "step": 811 + }, + { + "epoch": 0.04377830493853785, + "grad_norm": 0.9263061881065369, + "learning_rate": 9.997721630701986e-06, + "loss": 0.7923, + "step": 812 + }, + { + "epoch": 0.04383221910718137, + "grad_norm": 0.8474372029304504, + "learning_rate": 9.997715226781706e-06, + "loss": 0.796, + "step": 813 + }, + { + "epoch": 0.043886133275824885, + "grad_norm": 0.9960548877716064, + "learning_rate": 
9.997708813876206e-06, + "loss": 0.9166, + "step": 814 + }, + { + "epoch": 0.04394004744446841, + "grad_norm": 0.9843032956123352, + "learning_rate": 9.997702391985499e-06, + "loss": 0.9354, + "step": 815 + }, + { + "epoch": 0.04399396161311193, + "grad_norm": 0.9313154220581055, + "learning_rate": 9.997695961109599e-06, + "loss": 0.8972, + "step": 816 + }, + { + "epoch": 0.044047875781755445, + "grad_norm": 0.8846973180770874, + "learning_rate": 9.997689521248515e-06, + "loss": 0.8599, + "step": 817 + }, + { + "epoch": 0.04410178995039896, + "grad_norm": 0.8113641738891602, + "learning_rate": 9.99768307240226e-06, + "loss": 0.8509, + "step": 818 + }, + { + "epoch": 0.04415570411904248, + "grad_norm": 1.0659984350204468, + "learning_rate": 9.997676614570844e-06, + "loss": 0.938, + "step": 819 + }, + { + "epoch": 0.044209618287686006, + "grad_norm": 0.9183745384216309, + "learning_rate": 9.99767014775428e-06, + "loss": 0.8761, + "step": 820 + }, + { + "epoch": 0.044263532456329524, + "grad_norm": 0.87090003490448, + "learning_rate": 9.997663671952578e-06, + "loss": 0.8535, + "step": 821 + }, + { + "epoch": 0.04431744662497304, + "grad_norm": 0.9857214093208313, + "learning_rate": 9.997657187165753e-06, + "loss": 0.9434, + "step": 822 + }, + { + "epoch": 0.04437136079361656, + "grad_norm": 1.0443209409713745, + "learning_rate": 9.997650693393812e-06, + "loss": 0.8994, + "step": 823 + }, + { + "epoch": 0.044425274962260085, + "grad_norm": 0.8348391652107239, + "learning_rate": 9.99764419063677e-06, + "loss": 0.8383, + "step": 824 + }, + { + "epoch": 0.0444791891309036, + "grad_norm": 1.2708821296691895, + "learning_rate": 9.997637678894639e-06, + "loss": 0.8733, + "step": 825 + }, + { + "epoch": 0.04453310329954712, + "grad_norm": 0.9863126277923584, + "learning_rate": 9.997631158167428e-06, + "loss": 0.9364, + "step": 826 + }, + { + "epoch": 0.04458701746819064, + "grad_norm": 1.0223352909088135, + "learning_rate": 9.99762462845515e-06, + "loss": 0.9139, + "step": 
827 + }, + { + "epoch": 0.04464093163683416, + "grad_norm": 0.8559738397598267, + "learning_rate": 9.997618089757818e-06, + "loss": 0.7461, + "step": 828 + }, + { + "epoch": 0.04469484580547768, + "grad_norm": 0.9347368478775024, + "learning_rate": 9.997611542075442e-06, + "loss": 0.9275, + "step": 829 + }, + { + "epoch": 0.0447487599741212, + "grad_norm": 1.0208019018173218, + "learning_rate": 9.997604985408036e-06, + "loss": 0.8338, + "step": 830 + }, + { + "epoch": 0.04480267414276472, + "grad_norm": 0.9792174100875854, + "learning_rate": 9.997598419755607e-06, + "loss": 0.9437, + "step": 831 + }, + { + "epoch": 0.044856588311408235, + "grad_norm": 0.851665198802948, + "learning_rate": 9.997591845118173e-06, + "loss": 0.8008, + "step": 832 + }, + { + "epoch": 0.04491050248005176, + "grad_norm": 0.9315025806427002, + "learning_rate": 9.997585261495742e-06, + "loss": 0.8389, + "step": 833 + }, + { + "epoch": 0.04496441664869528, + "grad_norm": 0.9658921360969543, + "learning_rate": 9.997578668888326e-06, + "loss": 0.9252, + "step": 834 + }, + { + "epoch": 0.045018330817338796, + "grad_norm": 0.8989397287368774, + "learning_rate": 9.997572067295938e-06, + "loss": 0.8648, + "step": 835 + }, + { + "epoch": 0.045072244985982314, + "grad_norm": 0.8874988555908203, + "learning_rate": 9.99756545671859e-06, + "loss": 0.7801, + "step": 836 + }, + { + "epoch": 0.04512615915462584, + "grad_norm": 0.9186223745346069, + "learning_rate": 9.997558837156293e-06, + "loss": 0.767, + "step": 837 + }, + { + "epoch": 0.04518007332326936, + "grad_norm": 1.163044810295105, + "learning_rate": 9.997552208609059e-06, + "loss": 0.8938, + "step": 838 + }, + { + "epoch": 0.045233987491912875, + "grad_norm": 0.8315468430519104, + "learning_rate": 9.997545571076901e-06, + "loss": 0.725, + "step": 839 + }, + { + "epoch": 0.04528790166055639, + "grad_norm": 1.0088660717010498, + "learning_rate": 9.99753892455983e-06, + "loss": 0.8533, + "step": 840 + }, + { + "epoch": 0.04534181582919991, + 
"grad_norm": 0.9268692135810852, + "learning_rate": 9.997532269057857e-06, + "loss": 0.8739, + "step": 841 + }, + { + "epoch": 0.045395729997843436, + "grad_norm": 1.0793242454528809, + "learning_rate": 9.997525604570995e-06, + "loss": 0.9605, + "step": 842 + }, + { + "epoch": 0.045449644166486954, + "grad_norm": 1.101798176765442, + "learning_rate": 9.997518931099258e-06, + "loss": 0.9525, + "step": 843 + }, + { + "epoch": 0.04550355833513047, + "grad_norm": 0.9046466946601868, + "learning_rate": 9.997512248642654e-06, + "loss": 0.8853, + "step": 844 + }, + { + "epoch": 0.04555747250377399, + "grad_norm": 0.9629097580909729, + "learning_rate": 9.997505557201198e-06, + "loss": 0.8882, + "step": 845 + }, + { + "epoch": 0.045611386672417514, + "grad_norm": 1.1880977153778076, + "learning_rate": 9.997498856774898e-06, + "loss": 0.8812, + "step": 846 + }, + { + "epoch": 0.04566530084106103, + "grad_norm": 0.8678451180458069, + "learning_rate": 9.997492147363772e-06, + "loss": 0.887, + "step": 847 + }, + { + "epoch": 0.04571921500970455, + "grad_norm": 1.3359739780426025, + "learning_rate": 9.99748542896783e-06, + "loss": 0.8141, + "step": 848 + }, + { + "epoch": 0.04577312917834807, + "grad_norm": 0.9263296127319336, + "learning_rate": 9.99747870158708e-06, + "loss": 0.9357, + "step": 849 + }, + { + "epoch": 0.045827043346991586, + "grad_norm": 0.9199776649475098, + "learning_rate": 9.997471965221541e-06, + "loss": 0.8352, + "step": 850 + }, + { + "epoch": 0.04588095751563511, + "grad_norm": 0.8880730867385864, + "learning_rate": 9.997465219871218e-06, + "loss": 0.7802, + "step": 851 + }, + { + "epoch": 0.04593487168427863, + "grad_norm": 0.8561250567436218, + "learning_rate": 9.99745846553613e-06, + "loss": 0.7987, + "step": 852 + }, + { + "epoch": 0.04598878585292215, + "grad_norm": 0.8975661396980286, + "learning_rate": 9.997451702216283e-06, + "loss": 0.8325, + "step": 853 + }, + { + "epoch": 0.046042700021565665, + "grad_norm": 0.9350215196609497, + 
"learning_rate": 9.997444929911693e-06, + "loss": 0.7708, + "step": 854 + }, + { + "epoch": 0.04609661419020919, + "grad_norm": 1.0229014158248901, + "learning_rate": 9.99743814862237e-06, + "loss": 0.9643, + "step": 855 + }, + { + "epoch": 0.04615052835885271, + "grad_norm": 0.9249217510223389, + "learning_rate": 9.997431358348329e-06, + "loss": 0.8411, + "step": 856 + }, + { + "epoch": 0.046204442527496226, + "grad_norm": 0.9823042154312134, + "learning_rate": 9.99742455908958e-06, + "loss": 0.9406, + "step": 857 + }, + { + "epoch": 0.046258356696139744, + "grad_norm": 1.2525794506072998, + "learning_rate": 9.997417750846134e-06, + "loss": 0.8507, + "step": 858 + }, + { + "epoch": 0.04631227086478327, + "grad_norm": 0.9583309888839722, + "learning_rate": 9.997410933618006e-06, + "loss": 0.8504, + "step": 859 + }, + { + "epoch": 0.046366185033426786, + "grad_norm": 0.9264401793479919, + "learning_rate": 9.997404107405207e-06, + "loss": 0.8595, + "step": 860 + }, + { + "epoch": 0.046420099202070304, + "grad_norm": 0.9833316206932068, + "learning_rate": 9.99739727220775e-06, + "loss": 0.9025, + "step": 861 + }, + { + "epoch": 0.04647401337071382, + "grad_norm": 1.0220664739608765, + "learning_rate": 9.997390428025645e-06, + "loss": 0.8671, + "step": 862 + }, + { + "epoch": 0.04652792753935734, + "grad_norm": 1.0774664878845215, + "learning_rate": 9.997383574858908e-06, + "loss": 0.8463, + "step": 863 + }, + { + "epoch": 0.046581841708000865, + "grad_norm": 0.8821879029273987, + "learning_rate": 9.997376712707547e-06, + "loss": 0.7565, + "step": 864 + }, + { + "epoch": 0.04663575587664438, + "grad_norm": 0.9233925938606262, + "learning_rate": 9.997369841571577e-06, + "loss": 0.9151, + "step": 865 + }, + { + "epoch": 0.0466896700452879, + "grad_norm": 1.0006109476089478, + "learning_rate": 9.997362961451015e-06, + "loss": 0.8339, + "step": 866 + }, + { + "epoch": 0.04674358421393142, + "grad_norm": 0.865035891532898, + "learning_rate": 9.997356072345863e-06, + "loss": 
0.8997, + "step": 867 + }, + { + "epoch": 0.046797498382574944, + "grad_norm": 1.0450654029846191, + "learning_rate": 9.99734917425614e-06, + "loss": 0.7966, + "step": 868 + }, + { + "epoch": 0.04685141255121846, + "grad_norm": 0.8878824710845947, + "learning_rate": 9.997342267181857e-06, + "loss": 0.831, + "step": 869 + }, + { + "epoch": 0.04690532671986198, + "grad_norm": 1.0056546926498413, + "learning_rate": 9.997335351123028e-06, + "loss": 0.8178, + "step": 870 + }, + { + "epoch": 0.0469592408885055, + "grad_norm": 1.0531659126281738, + "learning_rate": 9.997328426079661e-06, + "loss": 0.7773, + "step": 871 + }, + { + "epoch": 0.047013155057149016, + "grad_norm": 0.911021888256073, + "learning_rate": 9.997321492051775e-06, + "loss": 0.9001, + "step": 872 + }, + { + "epoch": 0.04706706922579254, + "grad_norm": 0.920103132724762, + "learning_rate": 9.997314549039379e-06, + "loss": 0.7222, + "step": 873 + }, + { + "epoch": 0.04712098339443606, + "grad_norm": 0.9449265599250793, + "learning_rate": 9.997307597042483e-06, + "loss": 0.9197, + "step": 874 + }, + { + "epoch": 0.047174897563079576, + "grad_norm": 1.013066291809082, + "learning_rate": 9.997300636061103e-06, + "loss": 0.8854, + "step": 875 + }, + { + "epoch": 0.047228811731723094, + "grad_norm": 0.8990256786346436, + "learning_rate": 9.99729366609525e-06, + "loss": 0.81, + "step": 876 + }, + { + "epoch": 0.04728272590036662, + "grad_norm": 1.0211769342422485, + "learning_rate": 9.997286687144938e-06, + "loss": 0.8335, + "step": 877 + }, + { + "epoch": 0.04733664006901014, + "grad_norm": 1.14606773853302, + "learning_rate": 9.997279699210178e-06, + "loss": 1.0956, + "step": 878 + }, + { + "epoch": 0.047390554237653655, + "grad_norm": 0.982725977897644, + "learning_rate": 9.997272702290981e-06, + "loss": 0.8289, + "step": 879 + }, + { + "epoch": 0.04744446840629717, + "grad_norm": 0.8667361736297607, + "learning_rate": 9.997265696387364e-06, + "loss": 0.8056, + "step": 880 + }, + { + "epoch": 
0.04749838257494069, + "grad_norm": 0.9029837250709534, + "learning_rate": 9.997258681499338e-06, + "loss": 0.8461, + "step": 881 + }, + { + "epoch": 0.047552296743584216, + "grad_norm": 0.8767060041427612, + "learning_rate": 9.997251657626915e-06, + "loss": 0.8162, + "step": 882 + }, + { + "epoch": 0.047606210912227734, + "grad_norm": 1.4750713109970093, + "learning_rate": 9.997244624770104e-06, + "loss": 0.8677, + "step": 883 + }, + { + "epoch": 0.04766012508087125, + "grad_norm": 1.001286506652832, + "learning_rate": 9.997237582928924e-06, + "loss": 0.7673, + "step": 884 + }, + { + "epoch": 0.04771403924951477, + "grad_norm": 0.9560269713401794, + "learning_rate": 9.997230532103384e-06, + "loss": 0.8597, + "step": 885 + }, + { + "epoch": 0.047767953418158295, + "grad_norm": 0.834237277507782, + "learning_rate": 9.997223472293499e-06, + "loss": 0.7629, + "step": 886 + }, + { + "epoch": 0.04782186758680181, + "grad_norm": 0.9642406702041626, + "learning_rate": 9.997216403499278e-06, + "loss": 0.83, + "step": 887 + }, + { + "epoch": 0.04787578175544533, + "grad_norm": 1.2931480407714844, + "learning_rate": 9.997209325720736e-06, + "loss": 1.0333, + "step": 888 + }, + { + "epoch": 0.04792969592408885, + "grad_norm": 0.8024531602859497, + "learning_rate": 9.997202238957886e-06, + "loss": 0.7166, + "step": 889 + }, + { + "epoch": 0.04798361009273237, + "grad_norm": 0.9585899710655212, + "learning_rate": 9.997195143210741e-06, + "loss": 0.8099, + "step": 890 + }, + { + "epoch": 0.04803752426137589, + "grad_norm": 0.9917063117027283, + "learning_rate": 9.997188038479313e-06, + "loss": 0.8486, + "step": 891 + }, + { + "epoch": 0.04809143843001941, + "grad_norm": 1.6290080547332764, + "learning_rate": 9.997180924763616e-06, + "loss": 0.863, + "step": 892 + }, + { + "epoch": 0.04814535259866293, + "grad_norm": 0.9488585591316223, + "learning_rate": 9.99717380206366e-06, + "loss": 0.8277, + "step": 893 + }, + { + "epoch": 0.048199266767306445, + "grad_norm": 
1.0710817575454712, + "learning_rate": 9.997166670379459e-06, + "loss": 0.8898, + "step": 894 + }, + { + "epoch": 0.04825318093594997, + "grad_norm": 0.9916248917579651, + "learning_rate": 9.997159529711026e-06, + "loss": 0.9144, + "step": 895 + }, + { + "epoch": 0.04830709510459349, + "grad_norm": 1.0074565410614014, + "learning_rate": 9.997152380058378e-06, + "loss": 0.8391, + "step": 896 + }, + { + "epoch": 0.048361009273237006, + "grad_norm": 1.0258312225341797, + "learning_rate": 9.99714522142152e-06, + "loss": 0.973, + "step": 897 + }, + { + "epoch": 0.048414923441880524, + "grad_norm": 0.9497826099395752, + "learning_rate": 9.99713805380047e-06, + "loss": 0.9221, + "step": 898 + }, + { + "epoch": 0.04846883761052405, + "grad_norm": 0.9103115200996399, + "learning_rate": 9.99713087719524e-06, + "loss": 0.7942, + "step": 899 + }, + { + "epoch": 0.04852275177916757, + "grad_norm": 0.9810470938682556, + "learning_rate": 9.997123691605843e-06, + "loss": 0.8673, + "step": 900 + }, + { + "epoch": 0.048576665947811085, + "grad_norm": 1.0422937870025635, + "learning_rate": 9.997116497032291e-06, + "loss": 0.9263, + "step": 901 + }, + { + "epoch": 0.0486305801164546, + "grad_norm": 0.8522017002105713, + "learning_rate": 9.997109293474596e-06, + "loss": 0.8296, + "step": 902 + }, + { + "epoch": 0.04868449428509812, + "grad_norm": 0.818270742893219, + "learning_rate": 9.997102080932775e-06, + "loss": 0.7898, + "step": 903 + }, + { + "epoch": 0.048738408453741645, + "grad_norm": 0.9286766648292542, + "learning_rate": 9.997094859406838e-06, + "loss": 0.8751, + "step": 904 + }, + { + "epoch": 0.04879232262238516, + "grad_norm": 1.0779087543487549, + "learning_rate": 9.997087628896797e-06, + "loss": 0.8377, + "step": 905 + }, + { + "epoch": 0.04884623679102868, + "grad_norm": 0.8711867928504944, + "learning_rate": 9.997080389402667e-06, + "loss": 0.8547, + "step": 906 + }, + { + "epoch": 0.0489001509596722, + "grad_norm": 0.8919721245765686, + "learning_rate": 
9.99707314092446e-06, + "loss": 0.8178, + "step": 907 + }, + { + "epoch": 0.048954065128315724, + "grad_norm": 0.9084917306900024, + "learning_rate": 9.997065883462192e-06, + "loss": 0.8618, + "step": 908 + }, + { + "epoch": 0.04900797929695924, + "grad_norm": 0.869216799736023, + "learning_rate": 9.997058617015871e-06, + "loss": 0.8636, + "step": 909 + }, + { + "epoch": 0.04906189346560276, + "grad_norm": 0.9376553893089294, + "learning_rate": 9.997051341585513e-06, + "loss": 0.8986, + "step": 910 + }, + { + "epoch": 0.04911580763424628, + "grad_norm": 0.9041107892990112, + "learning_rate": 9.99704405717113e-06, + "loss": 0.817, + "step": 911 + }, + { + "epoch": 0.0491697218028898, + "grad_norm": 0.9530431628227234, + "learning_rate": 9.997036763772737e-06, + "loss": 0.9464, + "step": 912 + }, + { + "epoch": 0.04922363597153332, + "grad_norm": 0.9601117968559265, + "learning_rate": 9.997029461390344e-06, + "loss": 0.9014, + "step": 913 + }, + { + "epoch": 0.04927755014017684, + "grad_norm": 0.9162781834602356, + "learning_rate": 9.997022150023968e-06, + "loss": 0.8851, + "step": 914 + }, + { + "epoch": 0.04933146430882036, + "grad_norm": 0.9514605402946472, + "learning_rate": 9.99701482967362e-06, + "loss": 0.8975, + "step": 915 + }, + { + "epoch": 0.049385378477463875, + "grad_norm": 0.897203803062439, + "learning_rate": 9.997007500339313e-06, + "loss": 0.8371, + "step": 916 + }, + { + "epoch": 0.0494392926461074, + "grad_norm": 0.9372673630714417, + "learning_rate": 9.99700016202106e-06, + "loss": 0.9432, + "step": 917 + }, + { + "epoch": 0.04949320681475092, + "grad_norm": 0.8993443846702576, + "learning_rate": 9.996992814718875e-06, + "loss": 0.8528, + "step": 918 + }, + { + "epoch": 0.049547120983394435, + "grad_norm": 0.9300720691680908, + "learning_rate": 9.996985458432771e-06, + "loss": 0.873, + "step": 919 + }, + { + "epoch": 0.04960103515203795, + "grad_norm": 0.9311426281929016, + "learning_rate": 9.996978093162761e-06, + "loss": 0.9092, + "step": 920 + 
}, + { + "epoch": 0.04965494932068148, + "grad_norm": 0.9244507551193237, + "learning_rate": 9.996970718908859e-06, + "loss": 0.764, + "step": 921 + }, + { + "epoch": 0.049708863489324996, + "grad_norm": 0.915512204170227, + "learning_rate": 9.996963335671074e-06, + "loss": 0.8328, + "step": 922 + }, + { + "epoch": 0.049762777657968514, + "grad_norm": 0.889994740486145, + "learning_rate": 9.996955943449426e-06, + "loss": 0.8491, + "step": 923 + }, + { + "epoch": 0.04981669182661203, + "grad_norm": 0.8676478266716003, + "learning_rate": 9.996948542243925e-06, + "loss": 0.7677, + "step": 924 + }, + { + "epoch": 0.04987060599525555, + "grad_norm": 0.9795013070106506, + "learning_rate": 9.996941132054586e-06, + "loss": 0.9279, + "step": 925 + }, + { + "epoch": 0.049924520163899075, + "grad_norm": 0.940078854560852, + "learning_rate": 9.996933712881419e-06, + "loss": 0.8685, + "step": 926 + }, + { + "epoch": 0.04997843433254259, + "grad_norm": 0.9440926313400269, + "learning_rate": 9.996926284724437e-06, + "loss": 0.9634, + "step": 927 + }, + { + "epoch": 0.05003234850118611, + "grad_norm": 0.9120537638664246, + "learning_rate": 9.99691884758366e-06, + "loss": 0.7656, + "step": 928 + }, + { + "epoch": 0.05008626266982963, + "grad_norm": 1.1514596939086914, + "learning_rate": 9.996911401459093e-06, + "loss": 0.864, + "step": 929 + }, + { + "epoch": 0.050140176838473154, + "grad_norm": 0.8924434185028076, + "learning_rate": 9.996903946350756e-06, + "loss": 0.877, + "step": 930 + }, + { + "epoch": 0.05019409100711667, + "grad_norm": 0.9884456992149353, + "learning_rate": 9.996896482258657e-06, + "loss": 0.94, + "step": 931 + }, + { + "epoch": 0.05024800517576019, + "grad_norm": 0.9282665252685547, + "learning_rate": 9.996889009182814e-06, + "loss": 0.8443, + "step": 932 + }, + { + "epoch": 0.05030191934440371, + "grad_norm": 1.1029064655303955, + "learning_rate": 9.996881527123237e-06, + "loss": 0.9168, + "step": 933 + }, + { + "epoch": 0.050355833513047225, + "grad_norm": 
0.839625358581543, + "learning_rate": 9.996874036079942e-06, + "loss": 0.8261, + "step": 934 + }, + { + "epoch": 0.05040974768169075, + "grad_norm": 0.8612869381904602, + "learning_rate": 9.996866536052942e-06, + "loss": 0.8197, + "step": 935 + }, + { + "epoch": 0.05046366185033427, + "grad_norm": 0.9483891129493713, + "learning_rate": 9.996859027042249e-06, + "loss": 0.8374, + "step": 936 + }, + { + "epoch": 0.050517576018977786, + "grad_norm": 0.9374566674232483, + "learning_rate": 9.996851509047877e-06, + "loss": 0.8884, + "step": 937 + }, + { + "epoch": 0.050571490187621304, + "grad_norm": 0.9164647459983826, + "learning_rate": 9.99684398206984e-06, + "loss": 0.8419, + "step": 938 + }, + { + "epoch": 0.05062540435626483, + "grad_norm": 1.0109184980392456, + "learning_rate": 9.996836446108153e-06, + "loss": 0.8912, + "step": 939 + }, + { + "epoch": 0.05067931852490835, + "grad_norm": 0.8549674153327942, + "learning_rate": 9.996828901162825e-06, + "loss": 0.8043, + "step": 940 + }, + { + "epoch": 0.050733232693551865, + "grad_norm": 0.9618684649467468, + "learning_rate": 9.996821347233875e-06, + "loss": 0.8246, + "step": 941 + }, + { + "epoch": 0.05078714686219538, + "grad_norm": 0.9777100682258606, + "learning_rate": 9.996813784321314e-06, + "loss": 0.887, + "step": 942 + }, + { + "epoch": 0.05084106103083891, + "grad_norm": 0.8675182461738586, + "learning_rate": 9.996806212425157e-06, + "loss": 0.7584, + "step": 943 + }, + { + "epoch": 0.050894975199482426, + "grad_norm": 0.9174523949623108, + "learning_rate": 9.996798631545414e-06, + "loss": 0.8911, + "step": 944 + }, + { + "epoch": 0.050948889368125944, + "grad_norm": 0.9269078373908997, + "learning_rate": 9.996791041682101e-06, + "loss": 0.8049, + "step": 945 + }, + { + "epoch": 0.05100280353676946, + "grad_norm": 0.8447721600532532, + "learning_rate": 9.996783442835233e-06, + "loss": 0.7781, + "step": 946 + }, + { + "epoch": 0.05105671770541298, + "grad_norm": 0.9178231954574585, + "learning_rate": 
9.99677583500482e-06, + "loss": 0.8107, + "step": 947 + }, + { + "epoch": 0.051110631874056504, + "grad_norm": 0.8741039633750916, + "learning_rate": 9.996768218190879e-06, + "loss": 0.9278, + "step": 948 + }, + { + "epoch": 0.05116454604270002, + "grad_norm": 0.7997228503227234, + "learning_rate": 9.996760592393425e-06, + "loss": 0.7706, + "step": 949 + }, + { + "epoch": 0.05121846021134354, + "grad_norm": 1.003300428390503, + "learning_rate": 9.996752957612468e-06, + "loss": 0.8464, + "step": 950 + }, + { + "epoch": 0.05127237437998706, + "grad_norm": 0.9237748980522156, + "learning_rate": 9.996745313848021e-06, + "loss": 0.9088, + "step": 951 + }, + { + "epoch": 0.05132628854863058, + "grad_norm": 0.8565654754638672, + "learning_rate": 9.996737661100103e-06, + "loss": 0.8208, + "step": 952 + }, + { + "epoch": 0.0513802027172741, + "grad_norm": 1.0590770244598389, + "learning_rate": 9.996729999368722e-06, + "loss": 0.9272, + "step": 953 + }, + { + "epoch": 0.05143411688591762, + "grad_norm": 0.8888198733329773, + "learning_rate": 9.996722328653897e-06, + "loss": 0.8264, + "step": 954 + }, + { + "epoch": 0.05148803105456114, + "grad_norm": 0.9211130142211914, + "learning_rate": 9.996714648955636e-06, + "loss": 0.8807, + "step": 955 + }, + { + "epoch": 0.051541945223204655, + "grad_norm": 1.0241321325302124, + "learning_rate": 9.996706960273958e-06, + "loss": 0.7638, + "step": 956 + }, + { + "epoch": 0.05159585939184818, + "grad_norm": 0.903762698173523, + "learning_rate": 9.996699262608875e-06, + "loss": 0.8583, + "step": 957 + }, + { + "epoch": 0.0516497735604917, + "grad_norm": 0.9271189570426941, + "learning_rate": 9.9966915559604e-06, + "loss": 0.8341, + "step": 958 + }, + { + "epoch": 0.051703687729135216, + "grad_norm": 0.865260899066925, + "learning_rate": 9.996683840328546e-06, + "loss": 0.9136, + "step": 959 + }, + { + "epoch": 0.051757601897778734, + "grad_norm": 0.8903625011444092, + "learning_rate": 9.996676115713332e-06, + "loss": 0.8706, + "step": 
960 + }, + { + "epoch": 0.05181151606642226, + "grad_norm": 0.9228227138519287, + "learning_rate": 9.996668382114765e-06, + "loss": 0.8825, + "step": 961 + }, + { + "epoch": 0.051865430235065776, + "grad_norm": 0.9146421551704407, + "learning_rate": 9.996660639532863e-06, + "loss": 0.8347, + "step": 962 + }, + { + "epoch": 0.051919344403709294, + "grad_norm": 0.9010991454124451, + "learning_rate": 9.99665288796764e-06, + "loss": 0.8016, + "step": 963 + }, + { + "epoch": 0.05197325857235281, + "grad_norm": 0.8763105869293213, + "learning_rate": 9.996645127419107e-06, + "loss": 0.8651, + "step": 964 + }, + { + "epoch": 0.05202717274099634, + "grad_norm": 0.9506256580352783, + "learning_rate": 9.996637357887281e-06, + "loss": 0.9429, + "step": 965 + }, + { + "epoch": 0.052081086909639855, + "grad_norm": 0.9484269022941589, + "learning_rate": 9.996629579372175e-06, + "loss": 0.855, + "step": 966 + }, + { + "epoch": 0.05213500107828337, + "grad_norm": 0.8970646262168884, + "learning_rate": 9.996621791873804e-06, + "loss": 0.8611, + "step": 967 + }, + { + "epoch": 0.05218891524692689, + "grad_norm": 0.8925203680992126, + "learning_rate": 9.99661399539218e-06, + "loss": 0.8206, + "step": 968 + }, + { + "epoch": 0.05224282941557041, + "grad_norm": 1.069669246673584, + "learning_rate": 9.996606189927318e-06, + "loss": 0.876, + "step": 969 + }, + { + "epoch": 0.052296743584213934, + "grad_norm": 0.8456307649612427, + "learning_rate": 9.996598375479232e-06, + "loss": 0.7514, + "step": 970 + }, + { + "epoch": 0.05235065775285745, + "grad_norm": 0.9182801246643066, + "learning_rate": 9.996590552047936e-06, + "loss": 0.8915, + "step": 971 + }, + { + "epoch": 0.05240457192150097, + "grad_norm": 0.7616676688194275, + "learning_rate": 9.996582719633445e-06, + "loss": 0.7106, + "step": 972 + }, + { + "epoch": 0.05245848609014449, + "grad_norm": 0.8873127102851868, + "learning_rate": 9.99657487823577e-06, + "loss": 0.9171, + "step": 973 + }, + { + "epoch": 0.05251240025878801, + 
"grad_norm": 0.9724618792533875, + "learning_rate": 9.996567027854929e-06, + "loss": 0.9765, + "step": 974 + }, + { + "epoch": 0.05256631442743153, + "grad_norm": 0.9106513857841492, + "learning_rate": 9.996559168490933e-06, + "loss": 0.8332, + "step": 975 + }, + { + "epoch": 0.05262022859607505, + "grad_norm": 0.8551159501075745, + "learning_rate": 9.996551300143798e-06, + "loss": 0.8128, + "step": 976 + }, + { + "epoch": 0.052674142764718566, + "grad_norm": 0.9829822182655334, + "learning_rate": 9.996543422813539e-06, + "loss": 0.9088, + "step": 977 + }, + { + "epoch": 0.052728056933362084, + "grad_norm": 0.8281888961791992, + "learning_rate": 9.996535536500166e-06, + "loss": 0.8338, + "step": 978 + }, + { + "epoch": 0.05278197110200561, + "grad_norm": 0.951319694519043, + "learning_rate": 9.9965276412037e-06, + "loss": 0.9359, + "step": 979 + }, + { + "epoch": 0.05283588527064913, + "grad_norm": 0.841390073299408, + "learning_rate": 9.996519736924148e-06, + "loss": 0.7952, + "step": 980 + }, + { + "epoch": 0.052889799439292645, + "grad_norm": 0.8847686648368835, + "learning_rate": 9.996511823661528e-06, + "loss": 0.8435, + "step": 981 + }, + { + "epoch": 0.05294371360793616, + "grad_norm": 0.9261316061019897, + "learning_rate": 9.996503901415855e-06, + "loss": 0.8646, + "step": 982 + }, + { + "epoch": 0.05299762777657969, + "grad_norm": 0.9366586804389954, + "learning_rate": 9.99649597018714e-06, + "loss": 0.8586, + "step": 983 + }, + { + "epoch": 0.053051541945223206, + "grad_norm": 0.8916764259338379, + "learning_rate": 9.9964880299754e-06, + "loss": 0.8215, + "step": 984 + }, + { + "epoch": 0.053105456113866724, + "grad_norm": 0.9496534466743469, + "learning_rate": 9.996480080780648e-06, + "loss": 0.7984, + "step": 985 + }, + { + "epoch": 0.05315937028251024, + "grad_norm": 0.9736526608467102, + "learning_rate": 9.9964721226029e-06, + "loss": 0.7881, + "step": 986 + }, + { + "epoch": 0.05321328445115376, + "grad_norm": 0.9533856511116028, + "learning_rate": 
9.996464155442167e-06, + "loss": 0.9855, + "step": 987 + }, + { + "epoch": 0.053267198619797285, + "grad_norm": 0.9656437039375305, + "learning_rate": 9.996456179298467e-06, + "loss": 0.9571, + "step": 988 + }, + { + "epoch": 0.0533211127884408, + "grad_norm": 0.8887313008308411, + "learning_rate": 9.996448194171813e-06, + "loss": 0.9381, + "step": 989 + }, + { + "epoch": 0.05337502695708432, + "grad_norm": 1.0181535482406616, + "learning_rate": 9.996440200062217e-06, + "loss": 0.8834, + "step": 990 + }, + { + "epoch": 0.05342894112572784, + "grad_norm": 0.9083503484725952, + "learning_rate": 9.996432196969696e-06, + "loss": 0.9733, + "step": 991 + }, + { + "epoch": 0.05348285529437136, + "grad_norm": 0.9051093459129333, + "learning_rate": 9.996424184894264e-06, + "loss": 0.8531, + "step": 992 + }, + { + "epoch": 0.05353676946301488, + "grad_norm": 1.0264357328414917, + "learning_rate": 9.996416163835935e-06, + "loss": 0.9212, + "step": 993 + }, + { + "epoch": 0.0535906836316584, + "grad_norm": 1.0350812673568726, + "learning_rate": 9.996408133794726e-06, + "loss": 0.7843, + "step": 994 + }, + { + "epoch": 0.05364459780030192, + "grad_norm": 0.9610341787338257, + "learning_rate": 9.996400094770647e-06, + "loss": 0.8561, + "step": 995 + }, + { + "epoch": 0.05369851196894544, + "grad_norm": 0.8123961687088013, + "learning_rate": 9.996392046763714e-06, + "loss": 0.8296, + "step": 996 + }, + { + "epoch": 0.05375242613758896, + "grad_norm": 0.9337920546531677, + "learning_rate": 9.996383989773942e-06, + "loss": 0.8525, + "step": 997 + }, + { + "epoch": 0.05380634030623248, + "grad_norm": 1.1319444179534912, + "learning_rate": 9.996375923801347e-06, + "loss": 0.9127, + "step": 998 + }, + { + "epoch": 0.053860254474875996, + "grad_norm": 0.8506798148155212, + "learning_rate": 9.996367848845941e-06, + "loss": 0.884, + "step": 999 + }, + { + "epoch": 0.053914168643519514, + "grad_norm": 0.8248615860939026, + "learning_rate": 9.996359764907739e-06, + "loss": 0.7579, + 
"step": 1000 + }, + { + "epoch": 0.05396808281216304, + "grad_norm": 0.9258946180343628, + "learning_rate": 9.996351671986756e-06, + "loss": 0.8632, + "step": 1001 + }, + { + "epoch": 0.05402199698080656, + "grad_norm": 0.8891279101371765, + "learning_rate": 9.996343570083006e-06, + "loss": 0.8758, + "step": 1002 + }, + { + "epoch": 0.054075911149450075, + "grad_norm": 0.9592086672782898, + "learning_rate": 9.996335459196505e-06, + "loss": 0.8962, + "step": 1003 + }, + { + "epoch": 0.05412982531809359, + "grad_norm": 0.8937798738479614, + "learning_rate": 9.996327339327267e-06, + "loss": 0.8434, + "step": 1004 + }, + { + "epoch": 0.05418373948673712, + "grad_norm": 0.9602083563804626, + "learning_rate": 9.996319210475307e-06, + "loss": 0.9692, + "step": 1005 + }, + { + "epoch": 0.054237653655380635, + "grad_norm": 0.870637834072113, + "learning_rate": 9.996311072640637e-06, + "loss": 0.9146, + "step": 1006 + }, + { + "epoch": 0.05429156782402415, + "grad_norm": 0.9330273866653442, + "learning_rate": 9.996302925823276e-06, + "loss": 0.8584, + "step": 1007 + }, + { + "epoch": 0.05434548199266767, + "grad_norm": 0.8185963034629822, + "learning_rate": 9.996294770023234e-06, + "loss": 0.7854, + "step": 1008 + }, + { + "epoch": 0.05439939616131119, + "grad_norm": 0.8727489113807678, + "learning_rate": 9.996286605240528e-06, + "loss": 0.7388, + "step": 1009 + }, + { + "epoch": 0.054453310329954714, + "grad_norm": 1.0858477354049683, + "learning_rate": 9.996278431475172e-06, + "loss": 0.9201, + "step": 1010 + }, + { + "epoch": 0.05450722449859823, + "grad_norm": 0.9749255776405334, + "learning_rate": 9.996270248727184e-06, + "loss": 0.9041, + "step": 1011 + }, + { + "epoch": 0.05456113866724175, + "grad_norm": 0.9460576176643372, + "learning_rate": 9.996262056996575e-06, + "loss": 0.8553, + "step": 1012 + }, + { + "epoch": 0.05461505283588527, + "grad_norm": 0.9379808306694031, + "learning_rate": 9.99625385628336e-06, + "loss": 0.9253, + "step": 1013 + }, + { + "epoch": 
0.05466896700452879, + "grad_norm": 0.8154170513153076, + "learning_rate": 9.996245646587553e-06, + "loss": 0.8703, + "step": 1014 + }, + { + "epoch": 0.05472288117317231, + "grad_norm": 0.9122161269187927, + "learning_rate": 9.996237427909172e-06, + "loss": 0.7734, + "step": 1015 + }, + { + "epoch": 0.05477679534181583, + "grad_norm": 0.9049486517906189, + "learning_rate": 9.996229200248228e-06, + "loss": 0.8991, + "step": 1016 + }, + { + "epoch": 0.05483070951045935, + "grad_norm": 0.9244295358657837, + "learning_rate": 9.996220963604741e-06, + "loss": 0.8514, + "step": 1017 + }, + { + "epoch": 0.05488462367910287, + "grad_norm": 0.9817934036254883, + "learning_rate": 9.99621271797872e-06, + "loss": 0.8641, + "step": 1018 + }, + { + "epoch": 0.05493853784774639, + "grad_norm": 0.9253972768783569, + "learning_rate": 9.996204463370182e-06, + "loss": 0.9199, + "step": 1019 + }, + { + "epoch": 0.05499245201638991, + "grad_norm": 0.9114319682121277, + "learning_rate": 9.996196199779145e-06, + "loss": 0.8063, + "step": 1020 + }, + { + "epoch": 0.055046366185033425, + "grad_norm": 0.9643195867538452, + "learning_rate": 9.996187927205619e-06, + "loss": 0.9668, + "step": 1021 + }, + { + "epoch": 0.05510028035367694, + "grad_norm": 0.8127598166465759, + "learning_rate": 9.996179645649622e-06, + "loss": 0.764, + "step": 1022 + }, + { + "epoch": 0.05515419452232047, + "grad_norm": 0.8728108406066895, + "learning_rate": 9.996171355111167e-06, + "loss": 0.7703, + "step": 1023 + }, + { + "epoch": 0.055208108690963986, + "grad_norm": 0.8554317355155945, + "learning_rate": 9.996163055590269e-06, + "loss": 0.8266, + "step": 1024 + }, + { + "epoch": 0.055262022859607504, + "grad_norm": 0.7951076030731201, + "learning_rate": 9.996154747086946e-06, + "loss": 0.7601, + "step": 1025 + }, + { + "epoch": 0.05531593702825102, + "grad_norm": 0.8916927576065063, + "learning_rate": 9.996146429601208e-06, + "loss": 0.8936, + "step": 1026 + }, + { + "epoch": 0.05536985119689455, + "grad_norm": 
1.0242576599121094, + "learning_rate": 9.996138103133075e-06, + "loss": 0.8868, + "step": 1027 + }, + { + "epoch": 0.055423765365538065, + "grad_norm": 0.9273019433021545, + "learning_rate": 9.996129767682557e-06, + "loss": 0.8622, + "step": 1028 + }, + { + "epoch": 0.05547767953418158, + "grad_norm": 0.9547039866447449, + "learning_rate": 9.996121423249673e-06, + "loss": 0.7814, + "step": 1029 + }, + { + "epoch": 0.0555315937028251, + "grad_norm": 0.8750621676445007, + "learning_rate": 9.996113069834437e-06, + "loss": 0.7717, + "step": 1030 + }, + { + "epoch": 0.05558550787146862, + "grad_norm": 0.9547988176345825, + "learning_rate": 9.996104707436862e-06, + "loss": 0.8877, + "step": 1031 + }, + { + "epoch": 0.055639422040112144, + "grad_norm": 0.8856480717658997, + "learning_rate": 9.996096336056966e-06, + "loss": 0.7927, + "step": 1032 + }, + { + "epoch": 0.05569333620875566, + "grad_norm": 0.8311342000961304, + "learning_rate": 9.99608795569476e-06, + "loss": 0.7847, + "step": 1033 + }, + { + "epoch": 0.05574725037739918, + "grad_norm": 1.0720731019973755, + "learning_rate": 9.996079566350266e-06, + "loss": 0.9243, + "step": 1034 + }, + { + "epoch": 0.0558011645460427, + "grad_norm": 0.9498684406280518, + "learning_rate": 9.996071168023491e-06, + "loss": 0.8605, + "step": 1035 + }, + { + "epoch": 0.05585507871468622, + "grad_norm": 0.9043952822685242, + "learning_rate": 9.996062760714456e-06, + "loss": 0.8488, + "step": 1036 + }, + { + "epoch": 0.05590899288332974, + "grad_norm": 0.8051116466522217, + "learning_rate": 9.996054344423173e-06, + "loss": 0.8275, + "step": 1037 + }, + { + "epoch": 0.05596290705197326, + "grad_norm": 0.857120156288147, + "learning_rate": 9.996045919149658e-06, + "loss": 0.8837, + "step": 1038 + }, + { + "epoch": 0.056016821220616776, + "grad_norm": 0.8810911774635315, + "learning_rate": 9.996037484893926e-06, + "loss": 0.8179, + "step": 1039 + }, + { + "epoch": 0.056070735389260294, + "grad_norm": 0.8783093690872192, + 
"learning_rate": 9.996029041655994e-06, + "loss": 0.7734, + "step": 1040 + }, + { + "epoch": 0.05612464955790382, + "grad_norm": 0.9281952977180481, + "learning_rate": 9.996020589435874e-06, + "loss": 0.8747, + "step": 1041 + }, + { + "epoch": 0.05617856372654734, + "grad_norm": 0.8307299613952637, + "learning_rate": 9.996012128233583e-06, + "loss": 0.8055, + "step": 1042 + }, + { + "epoch": 0.056232477895190855, + "grad_norm": 0.9520873427391052, + "learning_rate": 9.996003658049136e-06, + "loss": 0.8181, + "step": 1043 + }, + { + "epoch": 0.05628639206383437, + "grad_norm": 0.8753806948661804, + "learning_rate": 9.995995178882549e-06, + "loss": 0.808, + "step": 1044 + }, + { + "epoch": 0.0563403062324779, + "grad_norm": 1.067691683769226, + "learning_rate": 9.995986690733836e-06, + "loss": 0.8048, + "step": 1045 + }, + { + "epoch": 0.056394220401121416, + "grad_norm": 0.8575261235237122, + "learning_rate": 9.995978193603013e-06, + "loss": 0.9231, + "step": 1046 + }, + { + "epoch": 0.056448134569764934, + "grad_norm": 0.9857104420661926, + "learning_rate": 9.995969687490096e-06, + "loss": 0.8883, + "step": 1047 + }, + { + "epoch": 0.05650204873840845, + "grad_norm": 0.9203484654426575, + "learning_rate": 9.995961172395098e-06, + "loss": 0.7634, + "step": 1048 + }, + { + "epoch": 0.056555962907051976, + "grad_norm": 0.8741904497146606, + "learning_rate": 9.995952648318036e-06, + "loss": 0.8061, + "step": 1049 + }, + { + "epoch": 0.056609877075695494, + "grad_norm": 0.9495588541030884, + "learning_rate": 9.995944115258925e-06, + "loss": 0.8922, + "step": 1050 + }, + { + "epoch": 0.05666379124433901, + "grad_norm": 0.9306020140647888, + "learning_rate": 9.99593557321778e-06, + "loss": 0.8454, + "step": 1051 + }, + { + "epoch": 0.05671770541298253, + "grad_norm": 0.9457784295082092, + "learning_rate": 9.995927022194615e-06, + "loss": 0.8701, + "step": 1052 + }, + { + "epoch": 0.05677161958162605, + "grad_norm": 0.88719242811203, + "learning_rate": 
9.99591846218945e-06, + "loss": 0.8416, + "step": 1053 + }, + { + "epoch": 0.05682553375026957, + "grad_norm": 0.8740848302841187, + "learning_rate": 9.995909893202296e-06, + "loss": 0.7962, + "step": 1054 + }, + { + "epoch": 0.05687944791891309, + "grad_norm": 1.0149377584457397, + "learning_rate": 9.99590131523317e-06, + "loss": 0.8352, + "step": 1055 + }, + { + "epoch": 0.05693336208755661, + "grad_norm": 0.9014917016029358, + "learning_rate": 9.995892728282088e-06, + "loss": 0.9244, + "step": 1056 + }, + { + "epoch": 0.05698727625620013, + "grad_norm": 0.9351898431777954, + "learning_rate": 9.995884132349062e-06, + "loss": 0.865, + "step": 1057 + }, + { + "epoch": 0.05704119042484365, + "grad_norm": 0.8656749129295349, + "learning_rate": 9.995875527434113e-06, + "loss": 0.8836, + "step": 1058 + }, + { + "epoch": 0.05709510459348717, + "grad_norm": 0.9120789170265198, + "learning_rate": 9.995866913537254e-06, + "loss": 0.8772, + "step": 1059 + }, + { + "epoch": 0.05714901876213069, + "grad_norm": 1.0019149780273438, + "learning_rate": 9.995858290658497e-06, + "loss": 0.9338, + "step": 1060 + }, + { + "epoch": 0.057202932930774206, + "grad_norm": 0.8492977023124695, + "learning_rate": 9.995849658797863e-06, + "loss": 0.742, + "step": 1061 + }, + { + "epoch": 0.057256847099417724, + "grad_norm": 1.000607967376709, + "learning_rate": 9.995841017955363e-06, + "loss": 0.8498, + "step": 1062 + }, + { + "epoch": 0.05731076126806125, + "grad_norm": 1.0268487930297852, + "learning_rate": 9.995832368131016e-06, + "loss": 0.8937, + "step": 1063 + }, + { + "epoch": 0.057364675436704766, + "grad_norm": 0.9388830661773682, + "learning_rate": 9.995823709324836e-06, + "loss": 0.877, + "step": 1064 + }, + { + "epoch": 0.057418589605348284, + "grad_norm": 0.9747199416160583, + "learning_rate": 9.99581504153684e-06, + "loss": 0.8436, + "step": 1065 + }, + { + "epoch": 0.0574725037739918, + "grad_norm": 0.9125073552131653, + "learning_rate": 9.99580636476704e-06, + "loss": 0.8853, 
+ "step": 1066 + }, + { + "epoch": 0.05752641794263533, + "grad_norm": 0.8910282254219055, + "learning_rate": 9.995797679015455e-06, + "loss": 0.8566, + "step": 1067 + }, + { + "epoch": 0.057580332111278845, + "grad_norm": 0.8546010255813599, + "learning_rate": 9.995788984282101e-06, + "loss": 0.8209, + "step": 1068 + }, + { + "epoch": 0.05763424627992236, + "grad_norm": 0.9205883145332336, + "learning_rate": 9.99578028056699e-06, + "loss": 0.7814, + "step": 1069 + }, + { + "epoch": 0.05768816044856588, + "grad_norm": 0.9627780914306641, + "learning_rate": 9.995771567870142e-06, + "loss": 0.8686, + "step": 1070 + }, + { + "epoch": 0.057742074617209406, + "grad_norm": 0.9917465448379517, + "learning_rate": 9.995762846191569e-06, + "loss": 0.9672, + "step": 1071 + }, + { + "epoch": 0.057795988785852924, + "grad_norm": 0.9396706223487854, + "learning_rate": 9.995754115531288e-06, + "loss": 0.8631, + "step": 1072 + }, + { + "epoch": 0.05784990295449644, + "grad_norm": 0.8310922980308533, + "learning_rate": 9.995745375889317e-06, + "loss": 0.8637, + "step": 1073 + }, + { + "epoch": 0.05790381712313996, + "grad_norm": 0.9085954427719116, + "learning_rate": 9.995736627265667e-06, + "loss": 0.8821, + "step": 1074 + }, + { + "epoch": 0.05795773129178348, + "grad_norm": 0.8529816269874573, + "learning_rate": 9.995727869660357e-06, + "loss": 0.8426, + "step": 1075 + }, + { + "epoch": 0.058011645460427, + "grad_norm": 0.8288499116897583, + "learning_rate": 9.995719103073403e-06, + "loss": 0.8415, + "step": 1076 + }, + { + "epoch": 0.05806555962907052, + "grad_norm": 0.9105609059333801, + "learning_rate": 9.995710327504819e-06, + "loss": 0.7683, + "step": 1077 + }, + { + "epoch": 0.05811947379771404, + "grad_norm": 0.9578274488449097, + "learning_rate": 9.995701542954622e-06, + "loss": 0.8796, + "step": 1078 + }, + { + "epoch": 0.058173387966357556, + "grad_norm": 0.8542460799217224, + "learning_rate": 9.995692749422827e-06, + "loss": 0.8363, + "step": 1079 + }, + { + "epoch": 
0.05822730213500108, + "grad_norm": 0.8723183274269104, + "learning_rate": 9.99568394690945e-06, + "loss": 0.8434, + "step": 1080 + }, + { + "epoch": 0.0582812163036446, + "grad_norm": 0.9157887697219849, + "learning_rate": 9.995675135414507e-06, + "loss": 0.6532, + "step": 1081 + }, + { + "epoch": 0.05833513047228812, + "grad_norm": 0.9055691361427307, + "learning_rate": 9.995666314938014e-06, + "loss": 0.8762, + "step": 1082 + }, + { + "epoch": 0.058389044640931635, + "grad_norm": 0.8224693536758423, + "learning_rate": 9.995657485479987e-06, + "loss": 0.7976, + "step": 1083 + }, + { + "epoch": 0.05844295880957515, + "grad_norm": 0.925414502620697, + "learning_rate": 9.995648647040441e-06, + "loss": 0.8673, + "step": 1084 + }, + { + "epoch": 0.05849687297821868, + "grad_norm": 0.9194141626358032, + "learning_rate": 9.995639799619395e-06, + "loss": 0.7916, + "step": 1085 + }, + { + "epoch": 0.058550787146862196, + "grad_norm": 1.08795166015625, + "learning_rate": 9.995630943216859e-06, + "loss": 0.9135, + "step": 1086 + }, + { + "epoch": 0.058604701315505714, + "grad_norm": 0.9648925065994263, + "learning_rate": 9.995622077832854e-06, + "loss": 0.8442, + "step": 1087 + }, + { + "epoch": 0.05865861548414923, + "grad_norm": 1.0012339353561401, + "learning_rate": 9.995613203467394e-06, + "loss": 0.9543, + "step": 1088 + }, + { + "epoch": 0.05871252965279276, + "grad_norm": 0.9333881735801697, + "learning_rate": 9.995604320120496e-06, + "loss": 0.9267, + "step": 1089 + }, + { + "epoch": 0.058766443821436275, + "grad_norm": 0.8566498160362244, + "learning_rate": 9.995595427792173e-06, + "loss": 0.8539, + "step": 1090 + }, + { + "epoch": 0.05882035799007979, + "grad_norm": 0.8766364455223083, + "learning_rate": 9.995586526482446e-06, + "loss": 0.9293, + "step": 1091 + }, + { + "epoch": 0.05887427215872331, + "grad_norm": 0.9181047677993774, + "learning_rate": 9.995577616191326e-06, + "loss": 0.8333, + "step": 1092 + }, + { + "epoch": 0.05892818632736683, + "grad_norm": 
0.8831031918525696, + "learning_rate": 9.995568696918833e-06, + "loss": 0.8016, + "step": 1093 + }, + { + "epoch": 0.05898210049601035, + "grad_norm": 0.8618754148483276, + "learning_rate": 9.99555976866498e-06, + "loss": 0.8988, + "step": 1094 + }, + { + "epoch": 0.05903601466465387, + "grad_norm": 0.9083183407783508, + "learning_rate": 9.995550831429785e-06, + "loss": 0.8626, + "step": 1095 + }, + { + "epoch": 0.05908992883329739, + "grad_norm": 0.8423884510993958, + "learning_rate": 9.995541885213262e-06, + "loss": 0.9121, + "step": 1096 + }, + { + "epoch": 0.05914384300194091, + "grad_norm": 0.7747607827186584, + "learning_rate": 9.99553293001543e-06, + "loss": 0.8087, + "step": 1097 + }, + { + "epoch": 0.05919775717058443, + "grad_norm": 0.8828368186950684, + "learning_rate": 9.995523965836302e-06, + "loss": 0.8284, + "step": 1098 + }, + { + "epoch": 0.05925167133922795, + "grad_norm": 0.9448524713516235, + "learning_rate": 9.995514992675896e-06, + "loss": 0.9565, + "step": 1099 + }, + { + "epoch": 0.05930558550787147, + "grad_norm": 0.8967006206512451, + "learning_rate": 9.99550601053423e-06, + "loss": 0.8412, + "step": 1100 + }, + { + "epoch": 0.059359499676514986, + "grad_norm": 0.9394551515579224, + "learning_rate": 9.995497019411315e-06, + "loss": 0.929, + "step": 1101 + }, + { + "epoch": 0.05941341384515851, + "grad_norm": 0.9002842903137207, + "learning_rate": 9.995488019307172e-06, + "loss": 0.734, + "step": 1102 + }, + { + "epoch": 0.05946732801380203, + "grad_norm": 1.3590562343597412, + "learning_rate": 9.995479010221816e-06, + "loss": 0.8843, + "step": 1103 + }, + { + "epoch": 0.05952124218244555, + "grad_norm": 1.041528582572937, + "learning_rate": 9.99546999215526e-06, + "loss": 0.9001, + "step": 1104 + }, + { + "epoch": 0.059575156351089065, + "grad_norm": 0.9846720099449158, + "learning_rate": 9.995460965107524e-06, + "loss": 0.8174, + "step": 1105 + }, + { + "epoch": 0.05962907051973258, + "grad_norm": 0.9171685576438904, + "learning_rate": 
9.995451929078624e-06, + "loss": 0.8756, + "step": 1106 + }, + { + "epoch": 0.05968298468837611, + "grad_norm": 0.9155516028404236, + "learning_rate": 9.995442884068574e-06, + "loss": 0.7327, + "step": 1107 + }, + { + "epoch": 0.059736898857019625, + "grad_norm": 0.8734007477760315, + "learning_rate": 9.99543383007739e-06, + "loss": 0.8385, + "step": 1108 + }, + { + "epoch": 0.05979081302566314, + "grad_norm": 0.8580977320671082, + "learning_rate": 9.99542476710509e-06, + "loss": 0.885, + "step": 1109 + }, + { + "epoch": 0.05984472719430666, + "grad_norm": 0.8499299883842468, + "learning_rate": 9.995415695151692e-06, + "loss": 0.8323, + "step": 1110 + }, + { + "epoch": 0.059898641362950186, + "grad_norm": 0.8348694443702698, + "learning_rate": 9.99540661421721e-06, + "loss": 0.7947, + "step": 1111 + }, + { + "epoch": 0.059952555531593704, + "grad_norm": 0.8865199685096741, + "learning_rate": 9.99539752430166e-06, + "loss": 0.9363, + "step": 1112 + }, + { + "epoch": 0.06000646970023722, + "grad_norm": 0.9492315649986267, + "learning_rate": 9.995388425405059e-06, + "loss": 0.913, + "step": 1113 + }, + { + "epoch": 0.06006038386888074, + "grad_norm": 0.938252329826355, + "learning_rate": 9.995379317527422e-06, + "loss": 0.861, + "step": 1114 + }, + { + "epoch": 0.06011429803752426, + "grad_norm": 1.2601032257080078, + "learning_rate": 9.995370200668768e-06, + "loss": 0.9435, + "step": 1115 + }, + { + "epoch": 0.06016821220616778, + "grad_norm": 0.915830671787262, + "learning_rate": 9.995361074829112e-06, + "loss": 0.9372, + "step": 1116 + }, + { + "epoch": 0.0602221263748113, + "grad_norm": 1.4548465013504028, + "learning_rate": 9.995351940008473e-06, + "loss": 0.9055, + "step": 1117 + }, + { + "epoch": 0.06027604054345482, + "grad_norm": 0.9090906381607056, + "learning_rate": 9.995342796206861e-06, + "loss": 0.8849, + "step": 1118 + }, + { + "epoch": 0.06032995471209834, + "grad_norm": 0.9860616326332092, + "learning_rate": 9.995333643424298e-06, + "loss": 0.8304, + 
"step": 1119 + }, + { + "epoch": 0.06038386888074186, + "grad_norm": 0.8320879340171814, + "learning_rate": 9.9953244816608e-06, + "loss": 0.8432, + "step": 1120 + }, + { + "epoch": 0.06043778304938538, + "grad_norm": 0.8633564114570618, + "learning_rate": 9.995315310916381e-06, + "loss": 0.7461, + "step": 1121 + }, + { + "epoch": 0.0604916972180289, + "grad_norm": 0.881287693977356, + "learning_rate": 9.995306131191059e-06, + "loss": 0.8512, + "step": 1122 + }, + { + "epoch": 0.060545611386672415, + "grad_norm": 0.8888201713562012, + "learning_rate": 9.99529694248485e-06, + "loss": 0.8416, + "step": 1123 + }, + { + "epoch": 0.06059952555531594, + "grad_norm": 0.8073605895042419, + "learning_rate": 9.99528774479777e-06, + "loss": 0.8369, + "step": 1124 + }, + { + "epoch": 0.06065343972395946, + "grad_norm": 0.9260549545288086, + "learning_rate": 9.995278538129837e-06, + "loss": 0.8548, + "step": 1125 + }, + { + "epoch": 0.060707353892602976, + "grad_norm": 0.9169156551361084, + "learning_rate": 9.99526932248107e-06, + "loss": 0.9149, + "step": 1126 + }, + { + "epoch": 0.060761268061246494, + "grad_norm": 0.8481706380844116, + "learning_rate": 9.995260097851478e-06, + "loss": 0.8591, + "step": 1127 + }, + { + "epoch": 0.06081518222989001, + "grad_norm": 0.8934486508369446, + "learning_rate": 9.995250864241085e-06, + "loss": 0.9322, + "step": 1128 + }, + { + "epoch": 0.06086909639853354, + "grad_norm": 0.947390615940094, + "learning_rate": 9.995241621649902e-06, + "loss": 1.0015, + "step": 1129 + }, + { + "epoch": 0.060923010567177055, + "grad_norm": 0.9185096025466919, + "learning_rate": 9.995232370077949e-06, + "loss": 0.9293, + "step": 1130 + }, + { + "epoch": 0.06097692473582057, + "grad_norm": 0.9517882466316223, + "learning_rate": 9.995223109525245e-06, + "loss": 0.8673, + "step": 1131 + }, + { + "epoch": 0.06103083890446409, + "grad_norm": 1.065699815750122, + "learning_rate": 9.9952138399918e-06, + "loss": 0.9144, + "step": 1132 + }, + { + "epoch": 
0.061084753073107616, + "grad_norm": 0.9048404693603516, + "learning_rate": 9.995204561477635e-06, + "loss": 0.7773, + "step": 1133 + }, + { + "epoch": 0.061138667241751134, + "grad_norm": 1.104457139968872, + "learning_rate": 9.995195273982768e-06, + "loss": 0.8847, + "step": 1134 + }, + { + "epoch": 0.06119258141039465, + "grad_norm": 0.9009587168693542, + "learning_rate": 9.995185977507212e-06, + "loss": 0.8118, + "step": 1135 + }, + { + "epoch": 0.06124649557903817, + "grad_norm": 1.0740209817886353, + "learning_rate": 9.995176672050983e-06, + "loss": 0.9173, + "step": 1136 + }, + { + "epoch": 0.06130040974768169, + "grad_norm": 0.9820743203163147, + "learning_rate": 9.995167357614104e-06, + "loss": 0.8555, + "step": 1137 + }, + { + "epoch": 0.06135432391632521, + "grad_norm": 0.9250825047492981, + "learning_rate": 9.995158034196586e-06, + "loss": 0.8771, + "step": 1138 + }, + { + "epoch": 0.06140823808496873, + "grad_norm": 0.8952597379684448, + "learning_rate": 9.995148701798447e-06, + "loss": 0.8598, + "step": 1139 + }, + { + "epoch": 0.06146215225361225, + "grad_norm": 0.8485212922096252, + "learning_rate": 9.995139360419706e-06, + "loss": 0.8557, + "step": 1140 + }, + { + "epoch": 0.061516066422255766, + "grad_norm": 0.9676715731620789, + "learning_rate": 9.995130010060377e-06, + "loss": 0.7748, + "step": 1141 + }, + { + "epoch": 0.06156998059089929, + "grad_norm": 0.7896347045898438, + "learning_rate": 9.995120650720478e-06, + "loss": 0.6183, + "step": 1142 + }, + { + "epoch": 0.06162389475954281, + "grad_norm": 0.8746615052223206, + "learning_rate": 9.995111282400024e-06, + "loss": 0.8321, + "step": 1143 + }, + { + "epoch": 0.06167780892818633, + "grad_norm": 0.9029875993728638, + "learning_rate": 9.995101905099036e-06, + "loss": 0.8686, + "step": 1144 + }, + { + "epoch": 0.061731723096829845, + "grad_norm": 0.9529547095298767, + "learning_rate": 9.995092518817528e-06, + "loss": 0.8878, + "step": 1145 + }, + { + "epoch": 0.06178563726547336, + 
"grad_norm": 0.8280455470085144, + "learning_rate": 9.995083123555517e-06, + "loss": 0.8232, + "step": 1146 + }, + { + "epoch": 0.06183955143411689, + "grad_norm": 0.908881664276123, + "learning_rate": 9.995073719313021e-06, + "loss": 0.8387, + "step": 1147 + }, + { + "epoch": 0.061893465602760406, + "grad_norm": 0.9137653708457947, + "learning_rate": 9.995064306090055e-06, + "loss": 0.8943, + "step": 1148 + }, + { + "epoch": 0.061947379771403924, + "grad_norm": 0.863861620426178, + "learning_rate": 9.995054883886639e-06, + "loss": 0.7435, + "step": 1149 + }, + { + "epoch": 0.06200129394004744, + "grad_norm": 0.8534915447235107, + "learning_rate": 9.995045452702786e-06, + "loss": 0.941, + "step": 1150 + }, + { + "epoch": 0.06205520810869097, + "grad_norm": 0.9469791650772095, + "learning_rate": 9.995036012538515e-06, + "loss": 0.9137, + "step": 1151 + }, + { + "epoch": 0.062109122277334484, + "grad_norm": 0.9044890999794006, + "learning_rate": 9.995026563393844e-06, + "loss": 0.9117, + "step": 1152 + }, + { + "epoch": 0.062163036445978, + "grad_norm": 0.989772379398346, + "learning_rate": 9.995017105268789e-06, + "loss": 0.8306, + "step": 1153 + }, + { + "epoch": 0.06221695061462152, + "grad_norm": 0.8586496114730835, + "learning_rate": 9.995007638163365e-06, + "loss": 0.8012, + "step": 1154 + }, + { + "epoch": 0.062270864783265045, + "grad_norm": 0.9221116304397583, + "learning_rate": 9.994998162077594e-06, + "loss": 0.7935, + "step": 1155 + }, + { + "epoch": 0.06232477895190856, + "grad_norm": 0.9453061819076538, + "learning_rate": 9.994988677011489e-06, + "loss": 0.8257, + "step": 1156 + }, + { + "epoch": 0.06237869312055208, + "grad_norm": 0.8065335154533386, + "learning_rate": 9.994979182965065e-06, + "loss": 0.86, + "step": 1157 + }, + { + "epoch": 0.0624326072891956, + "grad_norm": 0.9597793817520142, + "learning_rate": 9.994969679938346e-06, + "loss": 0.862, + "step": 1158 + }, + { + "epoch": 0.06248652145783912, + "grad_norm": 0.9118353128433228, + 
"learning_rate": 9.994960167931342e-06, + "loss": 0.8925, + "step": 1159 + }, + { + "epoch": 0.06254043562648263, + "grad_norm": 1.0216273069381714, + "learning_rate": 9.994950646944077e-06, + "loss": 0.7078, + "step": 1160 + }, + { + "epoch": 0.06259434979512615, + "grad_norm": 0.960182785987854, + "learning_rate": 9.994941116976562e-06, + "loss": 0.8936, + "step": 1161 + }, + { + "epoch": 0.06264826396376968, + "grad_norm": 0.9551856517791748, + "learning_rate": 9.994931578028817e-06, + "loss": 0.8053, + "step": 1162 + }, + { + "epoch": 0.0627021781324132, + "grad_norm": 0.9419867992401123, + "learning_rate": 9.994922030100857e-06, + "loss": 0.8333, + "step": 1163 + }, + { + "epoch": 0.06275609230105672, + "grad_norm": 0.9780306816101074, + "learning_rate": 9.994912473192702e-06, + "loss": 0.88, + "step": 1164 + }, + { + "epoch": 0.06281000646970024, + "grad_norm": 0.9320577383041382, + "learning_rate": 9.99490290730437e-06, + "loss": 0.8859, + "step": 1165 + }, + { + "epoch": 0.06286392063834376, + "grad_norm": 0.7692422270774841, + "learning_rate": 9.994893332435874e-06, + "loss": 0.8093, + "step": 1166 + }, + { + "epoch": 0.06291783480698727, + "grad_norm": 1.0622048377990723, + "learning_rate": 9.994883748587234e-06, + "loss": 0.8959, + "step": 1167 + }, + { + "epoch": 0.06297174897563079, + "grad_norm": 0.9598555564880371, + "learning_rate": 9.994874155758467e-06, + "loss": 0.8153, + "step": 1168 + }, + { + "epoch": 0.06302566314427431, + "grad_norm": 0.9207014441490173, + "learning_rate": 9.994864553949591e-06, + "loss": 0.9383, + "step": 1169 + }, + { + "epoch": 0.06307957731291783, + "grad_norm": 1.0074093341827393, + "learning_rate": 9.99485494316062e-06, + "loss": 0.9999, + "step": 1170 + }, + { + "epoch": 0.06313349148156136, + "grad_norm": 0.8454248905181885, + "learning_rate": 9.994845323391575e-06, + "loss": 0.7946, + "step": 1171 + }, + { + "epoch": 0.06318740565020488, + "grad_norm": 0.847578763961792, + "learning_rate": 9.99483569464247e-06, + 
"loss": 0.7144, + "step": 1172 + }, + { + "epoch": 0.0632413198188484, + "grad_norm": 0.9083126187324524, + "learning_rate": 9.994826056913325e-06, + "loss": 0.774, + "step": 1173 + }, + { + "epoch": 0.06329523398749191, + "grad_norm": 0.8995345830917358, + "learning_rate": 9.994816410204158e-06, + "loss": 0.8995, + "step": 1174 + }, + { + "epoch": 0.06334914815613543, + "grad_norm": 1.0547746419906616, + "learning_rate": 9.994806754514983e-06, + "loss": 0.8142, + "step": 1175 + }, + { + "epoch": 0.06340306232477895, + "grad_norm": 0.946854829788208, + "learning_rate": 9.99479708984582e-06, + "loss": 0.8639, + "step": 1176 + }, + { + "epoch": 0.06345697649342247, + "grad_norm": 0.8746247291564941, + "learning_rate": 9.994787416196683e-06, + "loss": 0.8601, + "step": 1177 + }, + { + "epoch": 0.06351089066206599, + "grad_norm": 0.9075024127960205, + "learning_rate": 9.994777733567595e-06, + "loss": 0.7969, + "step": 1178 + }, + { + "epoch": 0.0635648048307095, + "grad_norm": 0.9435486197471619, + "learning_rate": 9.994768041958569e-06, + "loss": 0.8199, + "step": 1179 + }, + { + "epoch": 0.06361871899935304, + "grad_norm": 0.8597564697265625, + "learning_rate": 9.994758341369624e-06, + "loss": 0.8791, + "step": 1180 + }, + { + "epoch": 0.06367263316799655, + "grad_norm": 0.7960480451583862, + "learning_rate": 9.994748631800777e-06, + "loss": 0.8035, + "step": 1181 + }, + { + "epoch": 0.06372654733664007, + "grad_norm": 1.1984984874725342, + "learning_rate": 9.994738913252045e-06, + "loss": 0.7372, + "step": 1182 + }, + { + "epoch": 0.06378046150528359, + "grad_norm": 0.8532997369766235, + "learning_rate": 9.994729185723446e-06, + "loss": 0.9094, + "step": 1183 + }, + { + "epoch": 0.06383437567392711, + "grad_norm": 0.8327267169952393, + "learning_rate": 9.994719449214999e-06, + "loss": 0.809, + "step": 1184 + }, + { + "epoch": 0.06388828984257063, + "grad_norm": 0.9086306691169739, + "learning_rate": 9.99470970372672e-06, + "loss": 0.8278, + "step": 1185 + }, + { + 
"epoch": 0.06394220401121414, + "grad_norm": 0.8422104716300964, + "learning_rate": 9.994699949258626e-06, + "loss": 0.7754, + "step": 1186 + }, + { + "epoch": 0.06399611817985766, + "grad_norm": 1.0434929132461548, + "learning_rate": 9.994690185810733e-06, + "loss": 0.908, + "step": 1187 + }, + { + "epoch": 0.06405003234850119, + "grad_norm": 1.1625720262527466, + "learning_rate": 9.994680413383064e-06, + "loss": 0.8814, + "step": 1188 + }, + { + "epoch": 0.06410394651714471, + "grad_norm": 0.9940767288208008, + "learning_rate": 9.994670631975631e-06, + "loss": 0.7846, + "step": 1189 + }, + { + "epoch": 0.06415786068578823, + "grad_norm": 0.8356907963752747, + "learning_rate": 9.994660841588457e-06, + "loss": 0.798, + "step": 1190 + }, + { + "epoch": 0.06421177485443175, + "grad_norm": 0.830348014831543, + "learning_rate": 9.994651042221552e-06, + "loss": 0.7875, + "step": 1191 + }, + { + "epoch": 0.06426568902307526, + "grad_norm": 1.1060880422592163, + "learning_rate": 9.994641233874943e-06, + "loss": 0.8893, + "step": 1192 + }, + { + "epoch": 0.06431960319171878, + "grad_norm": 0.9319590926170349, + "learning_rate": 9.994631416548637e-06, + "loss": 0.791, + "step": 1193 + }, + { + "epoch": 0.0643735173603623, + "grad_norm": 0.8345780968666077, + "learning_rate": 9.994621590242661e-06, + "loss": 0.8213, + "step": 1194 + }, + { + "epoch": 0.06442743152900582, + "grad_norm": 0.9848359227180481, + "learning_rate": 9.99461175495703e-06, + "loss": 0.735, + "step": 1195 + }, + { + "epoch": 0.06448134569764934, + "grad_norm": 0.9134055972099304, + "learning_rate": 9.994601910691758e-06, + "loss": 0.8415, + "step": 1196 + }, + { + "epoch": 0.06453525986629287, + "grad_norm": 0.8084586262702942, + "learning_rate": 9.994592057446866e-06, + "loss": 0.8702, + "step": 1197 + }, + { + "epoch": 0.06458917403493639, + "grad_norm": 0.9168767333030701, + "learning_rate": 9.994582195222371e-06, + "loss": 0.8921, + "step": 1198 + }, + { + "epoch": 0.0646430882035799, + "grad_norm": 
0.8380446434020996, + "learning_rate": 9.994572324018292e-06, + "loss": 0.7705, + "step": 1199 + }, + { + "epoch": 0.06469700237222342, + "grad_norm": 0.8120049238204956, + "learning_rate": 9.994562443834646e-06, + "loss": 0.7576, + "step": 1200 + }, + { + "epoch": 0.06475091654086694, + "grad_norm": 0.9559764266014099, + "learning_rate": 9.994552554671448e-06, + "loss": 0.8427, + "step": 1201 + }, + { + "epoch": 0.06480483070951046, + "grad_norm": 0.9473673105239868, + "learning_rate": 9.99454265652872e-06, + "loss": 0.9988, + "step": 1202 + }, + { + "epoch": 0.06485874487815398, + "grad_norm": 1.0704870223999023, + "learning_rate": 9.994532749406477e-06, + "loss": 0.9499, + "step": 1203 + }, + { + "epoch": 0.0649126590467975, + "grad_norm": 0.9905646443367004, + "learning_rate": 9.994522833304738e-06, + "loss": 0.8801, + "step": 1204 + }, + { + "epoch": 0.06496657321544101, + "grad_norm": 1.194190502166748, + "learning_rate": 9.99451290822352e-06, + "loss": 0.9051, + "step": 1205 + }, + { + "epoch": 0.06502048738408454, + "grad_norm": 0.8571314811706543, + "learning_rate": 9.994502974162843e-06, + "loss": 0.8131, + "step": 1206 + }, + { + "epoch": 0.06507440155272806, + "grad_norm": 0.9769417643547058, + "learning_rate": 9.994493031122721e-06, + "loss": 0.8524, + "step": 1207 + }, + { + "epoch": 0.06512831572137158, + "grad_norm": 0.8106759786605835, + "learning_rate": 9.994483079103176e-06, + "loss": 0.8142, + "step": 1208 + }, + { + "epoch": 0.0651822298900151, + "grad_norm": 0.8817846775054932, + "learning_rate": 9.994473118104223e-06, + "loss": 0.9076, + "step": 1209 + }, + { + "epoch": 0.06523614405865862, + "grad_norm": 0.8271930813789368, + "learning_rate": 9.994463148125882e-06, + "loss": 0.7914, + "step": 1210 + }, + { + "epoch": 0.06529005822730213, + "grad_norm": 0.9060614705085754, + "learning_rate": 9.994453169168169e-06, + "loss": 0.8375, + "step": 1211 + }, + { + "epoch": 0.06534397239594565, + "grad_norm": 0.880614697933197, + "learning_rate": 
9.994443181231103e-06, + "loss": 0.7751, + "step": 1212 + }, + { + "epoch": 0.06539788656458917, + "grad_norm": 0.9420819282531738, + "learning_rate": 9.994433184314702e-06, + "loss": 0.8532, + "step": 1213 + }, + { + "epoch": 0.06545180073323269, + "grad_norm": 0.8587054014205933, + "learning_rate": 9.994423178418984e-06, + "loss": 0.8804, + "step": 1214 + }, + { + "epoch": 0.06550571490187622, + "grad_norm": 0.9624550938606262, + "learning_rate": 9.994413163543965e-06, + "loss": 0.9782, + "step": 1215 + }, + { + "epoch": 0.06555962907051974, + "grad_norm": 0.9458224773406982, + "learning_rate": 9.994403139689665e-06, + "loss": 0.8274, + "step": 1216 + }, + { + "epoch": 0.06561354323916326, + "grad_norm": 1.0417940616607666, + "learning_rate": 9.994393106856104e-06, + "loss": 0.9065, + "step": 1217 + }, + { + "epoch": 0.06566745740780677, + "grad_norm": 1.0225417613983154, + "learning_rate": 9.994383065043296e-06, + "loss": 0.8642, + "step": 1218 + }, + { + "epoch": 0.06572137157645029, + "grad_norm": 0.9015594720840454, + "learning_rate": 9.994373014251261e-06, + "loss": 0.8775, + "step": 1219 + }, + { + "epoch": 0.06577528574509381, + "grad_norm": 0.8473883271217346, + "learning_rate": 9.994362954480018e-06, + "loss": 0.8566, + "step": 1220 + }, + { + "epoch": 0.06582919991373733, + "grad_norm": 0.8571242690086365, + "learning_rate": 9.994352885729584e-06, + "loss": 0.8502, + "step": 1221 + }, + { + "epoch": 0.06588311408238084, + "grad_norm": 0.8793268799781799, + "learning_rate": 9.994342807999977e-06, + "loss": 0.9062, + "step": 1222 + }, + { + "epoch": 0.06593702825102436, + "grad_norm": 0.8866230249404907, + "learning_rate": 9.994332721291214e-06, + "loss": 0.9026, + "step": 1223 + }, + { + "epoch": 0.0659909424196679, + "grad_norm": 0.9135996103286743, + "learning_rate": 9.994322625603314e-06, + "loss": 0.8558, + "step": 1224 + }, + { + "epoch": 0.06604485658831141, + "grad_norm": 0.9904530048370361, + "learning_rate": 9.994312520936297e-06, + "loss": 
0.8823, + "step": 1225 + }, + { + "epoch": 0.06609877075695493, + "grad_norm": 0.8590260148048401, + "learning_rate": 9.99430240729018e-06, + "loss": 0.8344, + "step": 1226 + }, + { + "epoch": 0.06615268492559845, + "grad_norm": 1.1669397354125977, + "learning_rate": 9.99429228466498e-06, + "loss": 0.9459, + "step": 1227 + }, + { + "epoch": 0.06620659909424197, + "grad_norm": 0.9290857315063477, + "learning_rate": 9.994282153060715e-06, + "loss": 0.8723, + "step": 1228 + }, + { + "epoch": 0.06626051326288548, + "grad_norm": 0.9619696140289307, + "learning_rate": 9.994272012477405e-06, + "loss": 0.8986, + "step": 1229 + }, + { + "epoch": 0.066314427431529, + "grad_norm": 0.8312071561813354, + "learning_rate": 9.994261862915068e-06, + "loss": 0.7291, + "step": 1230 + }, + { + "epoch": 0.06636834160017252, + "grad_norm": 1.0099300146102905, + "learning_rate": 9.994251704373721e-06, + "loss": 0.8725, + "step": 1231 + }, + { + "epoch": 0.06642225576881604, + "grad_norm": 0.8522336483001709, + "learning_rate": 9.994241536853384e-06, + "loss": 0.8656, + "step": 1232 + }, + { + "epoch": 0.06647616993745957, + "grad_norm": 0.919360339641571, + "learning_rate": 9.994231360354074e-06, + "loss": 0.8854, + "step": 1233 + }, + { + "epoch": 0.06653008410610309, + "grad_norm": 0.8002495169639587, + "learning_rate": 9.994221174875809e-06, + "loss": 0.7879, + "step": 1234 + }, + { + "epoch": 0.0665839982747466, + "grad_norm": 0.9539757370948792, + "learning_rate": 9.994210980418607e-06, + "loss": 0.9027, + "step": 1235 + }, + { + "epoch": 0.06663791244339012, + "grad_norm": 0.9222649335861206, + "learning_rate": 9.99420077698249e-06, + "loss": 0.7611, + "step": 1236 + }, + { + "epoch": 0.06669182661203364, + "grad_norm": 0.8629900813102722, + "learning_rate": 9.994190564567472e-06, + "loss": 0.8122, + "step": 1237 + }, + { + "epoch": 0.06674574078067716, + "grad_norm": 0.8339203000068665, + "learning_rate": 9.994180343173574e-06, + "loss": 0.7873, + "step": 1238 + }, + { + "epoch": 
0.06679965494932068, + "grad_norm": 0.8844656348228455, + "learning_rate": 9.994170112800812e-06, + "loss": 0.8176, + "step": 1239 + }, + { + "epoch": 0.0668535691179642, + "grad_norm": 1.0024579763412476, + "learning_rate": 9.994159873449206e-06, + "loss": 0.844, + "step": 1240 + }, + { + "epoch": 0.06690748328660773, + "grad_norm": 0.8317261338233948, + "learning_rate": 9.994149625118774e-06, + "loss": 0.9103, + "step": 1241 + }, + { + "epoch": 0.06696139745525125, + "grad_norm": 0.8915300965309143, + "learning_rate": 9.994139367809534e-06, + "loss": 0.9084, + "step": 1242 + }, + { + "epoch": 0.06701531162389476, + "grad_norm": 0.9270803332328796, + "learning_rate": 9.994129101521506e-06, + "loss": 0.7634, + "step": 1243 + }, + { + "epoch": 0.06706922579253828, + "grad_norm": 0.9891652464866638, + "learning_rate": 9.994118826254708e-06, + "loss": 0.9776, + "step": 1244 + }, + { + "epoch": 0.0671231399611818, + "grad_norm": 0.7778229713439941, + "learning_rate": 9.994108542009156e-06, + "loss": 0.7481, + "step": 1245 + }, + { + "epoch": 0.06717705412982532, + "grad_norm": 0.8451201319694519, + "learning_rate": 9.994098248784872e-06, + "loss": 0.8012, + "step": 1246 + }, + { + "epoch": 0.06723096829846884, + "grad_norm": 0.8115825057029724, + "learning_rate": 9.994087946581873e-06, + "loss": 0.874, + "step": 1247 + }, + { + "epoch": 0.06728488246711235, + "grad_norm": 0.815934419631958, + "learning_rate": 9.994077635400175e-06, + "loss": 0.8114, + "step": 1248 + }, + { + "epoch": 0.06733879663575587, + "grad_norm": 1.1179388761520386, + "learning_rate": 9.9940673152398e-06, + "loss": 0.9078, + "step": 1249 + }, + { + "epoch": 0.0673927108043994, + "grad_norm": 0.9235454201698303, + "learning_rate": 9.994056986100767e-06, + "loss": 0.7511, + "step": 1250 + }, + { + "epoch": 0.06744662497304292, + "grad_norm": 0.8568270206451416, + "learning_rate": 9.994046647983093e-06, + "loss": 0.7805, + "step": 1251 + }, + { + "epoch": 0.06750053914168644, + "grad_norm": 
1.1337388753890991, + "learning_rate": 9.994036300886796e-06, + "loss": 0.8835, + "step": 1252 + }, + { + "epoch": 0.06755445331032996, + "grad_norm": 0.9154239892959595, + "learning_rate": 9.994025944811896e-06, + "loss": 0.8804, + "step": 1253 + }, + { + "epoch": 0.06760836747897347, + "grad_norm": 0.8301606774330139, + "learning_rate": 9.99401557975841e-06, + "loss": 0.7905, + "step": 1254 + }, + { + "epoch": 0.06766228164761699, + "grad_norm": 0.9907017350196838, + "learning_rate": 9.994005205726358e-06, + "loss": 0.9091, + "step": 1255 + }, + { + "epoch": 0.06771619581626051, + "grad_norm": 0.8883876204490662, + "learning_rate": 9.993994822715758e-06, + "loss": 0.8815, + "step": 1256 + }, + { + "epoch": 0.06777010998490403, + "grad_norm": 0.9746614098548889, + "learning_rate": 9.993984430726627e-06, + "loss": 0.7897, + "step": 1257 + }, + { + "epoch": 0.06782402415354755, + "grad_norm": 0.9773344993591309, + "learning_rate": 9.993974029758988e-06, + "loss": 0.8499, + "step": 1258 + }, + { + "epoch": 0.06787793832219108, + "grad_norm": 0.9552164077758789, + "learning_rate": 9.993963619812856e-06, + "loss": 0.711, + "step": 1259 + }, + { + "epoch": 0.0679318524908346, + "grad_norm": 0.9146968126296997, + "learning_rate": 9.993953200888252e-06, + "loss": 0.9016, + "step": 1260 + }, + { + "epoch": 0.06798576665947811, + "grad_norm": 0.924244225025177, + "learning_rate": 9.993942772985192e-06, + "loss": 0.7534, + "step": 1261 + }, + { + "epoch": 0.06803968082812163, + "grad_norm": 1.2963265180587769, + "learning_rate": 9.993932336103699e-06, + "loss": 0.9409, + "step": 1262 + }, + { + "epoch": 0.06809359499676515, + "grad_norm": 0.7954462766647339, + "learning_rate": 9.993921890243788e-06, + "loss": 0.7669, + "step": 1263 + }, + { + "epoch": 0.06814750916540867, + "grad_norm": 0.9115849137306213, + "learning_rate": 9.993911435405478e-06, + "loss": 0.7567, + "step": 1264 + }, + { + "epoch": 0.06820142333405219, + "grad_norm": 1.0030237436294556, + "learning_rate": 
9.99390097158879e-06, + "loss": 0.8952, + "step": 1265 + }, + { + "epoch": 0.0682553375026957, + "grad_norm": 0.8897690773010254, + "learning_rate": 9.993890498793742e-06, + "loss": 0.7993, + "step": 1266 + }, + { + "epoch": 0.06830925167133922, + "grad_norm": 0.9283807277679443, + "learning_rate": 9.993880017020349e-06, + "loss": 0.8808, + "step": 1267 + }, + { + "epoch": 0.06836316583998275, + "grad_norm": 0.848922848701477, + "learning_rate": 9.993869526268637e-06, + "loss": 0.7979, + "step": 1268 + }, + { + "epoch": 0.06841708000862627, + "grad_norm": 0.8896105289459229, + "learning_rate": 9.993859026538618e-06, + "loss": 0.8886, + "step": 1269 + }, + { + "epoch": 0.06847099417726979, + "grad_norm": 0.8602685928344727, + "learning_rate": 9.993848517830318e-06, + "loss": 0.8209, + "step": 1270 + }, + { + "epoch": 0.06852490834591331, + "grad_norm": 0.9300077557563782, + "learning_rate": 9.99383800014375e-06, + "loss": 0.9261, + "step": 1271 + }, + { + "epoch": 0.06857882251455683, + "grad_norm": 0.8691270351409912, + "learning_rate": 9.993827473478934e-06, + "loss": 0.9217, + "step": 1272 + }, + { + "epoch": 0.06863273668320034, + "grad_norm": 0.7943814992904663, + "learning_rate": 9.99381693783589e-06, + "loss": 0.8557, + "step": 1273 + }, + { + "epoch": 0.06868665085184386, + "grad_norm": 0.9060125946998596, + "learning_rate": 9.993806393214638e-06, + "loss": 0.8314, + "step": 1274 + }, + { + "epoch": 0.06874056502048738, + "grad_norm": 0.8014434576034546, + "learning_rate": 9.993795839615194e-06, + "loss": 0.8047, + "step": 1275 + }, + { + "epoch": 0.0687944791891309, + "grad_norm": 1.0498815774917603, + "learning_rate": 9.993785277037578e-06, + "loss": 0.7125, + "step": 1276 + }, + { + "epoch": 0.06884839335777443, + "grad_norm": 0.8868438005447388, + "learning_rate": 9.993774705481812e-06, + "loss": 0.8594, + "step": 1277 + }, + { + "epoch": 0.06890230752641795, + "grad_norm": 0.8213896155357361, + "learning_rate": 9.993764124947911e-06, + "loss": 0.7995, + 
"step": 1278 + }, + { + "epoch": 0.06895622169506146, + "grad_norm": 0.9007741212844849, + "learning_rate": 9.993753535435895e-06, + "loss": 0.8982, + "step": 1279 + }, + { + "epoch": 0.06901013586370498, + "grad_norm": 0.8377478122711182, + "learning_rate": 9.993742936945785e-06, + "loss": 0.7387, + "step": 1280 + }, + { + "epoch": 0.0690640500323485, + "grad_norm": 0.8009492754936218, + "learning_rate": 9.993732329477598e-06, + "loss": 0.8079, + "step": 1281 + }, + { + "epoch": 0.06911796420099202, + "grad_norm": 0.8478789925575256, + "learning_rate": 9.993721713031354e-06, + "loss": 0.8682, + "step": 1282 + }, + { + "epoch": 0.06917187836963554, + "grad_norm": 0.7498561143875122, + "learning_rate": 9.993711087607072e-06, + "loss": 0.8107, + "step": 1283 + }, + { + "epoch": 0.06922579253827905, + "grad_norm": 0.8972634077072144, + "learning_rate": 9.99370045320477e-06, + "loss": 0.8494, + "step": 1284 + }, + { + "epoch": 0.06927970670692257, + "grad_norm": 0.942449152469635, + "learning_rate": 9.99368980982447e-06, + "loss": 0.8487, + "step": 1285 + }, + { + "epoch": 0.0693336208755661, + "grad_norm": 0.8752795457839966, + "learning_rate": 9.993679157466188e-06, + "loss": 0.8859, + "step": 1286 + }, + { + "epoch": 0.06938753504420962, + "grad_norm": 0.8289507031440735, + "learning_rate": 9.993668496129945e-06, + "loss": 0.8726, + "step": 1287 + }, + { + "epoch": 0.06944144921285314, + "grad_norm": 0.9452151656150818, + "learning_rate": 9.993657825815759e-06, + "loss": 0.9266, + "step": 1288 + }, + { + "epoch": 0.06949536338149666, + "grad_norm": 0.8697348237037659, + "learning_rate": 9.993647146523651e-06, + "loss": 0.8946, + "step": 1289 + }, + { + "epoch": 0.06954927755014018, + "grad_norm": 0.8712061643600464, + "learning_rate": 9.993636458253637e-06, + "loss": 0.8551, + "step": 1290 + }, + { + "epoch": 0.0696031917187837, + "grad_norm": 0.9295617938041687, + "learning_rate": 9.993625761005739e-06, + "loss": 0.8963, + "step": 1291 + }, + { + "epoch": 
0.06965710588742721, + "grad_norm": 0.9441055059432983, + "learning_rate": 9.993615054779975e-06, + "loss": 0.9567, + "step": 1292 + }, + { + "epoch": 0.06971102005607073, + "grad_norm": 0.8742032051086426, + "learning_rate": 9.993604339576365e-06, + "loss": 0.8341, + "step": 1293 + }, + { + "epoch": 0.06976493422471426, + "grad_norm": 0.8596220016479492, + "learning_rate": 9.993593615394928e-06, + "loss": 0.8576, + "step": 1294 + }, + { + "epoch": 0.06981884839335778, + "grad_norm": 0.8011770844459534, + "learning_rate": 9.993582882235682e-06, + "loss": 0.7317, + "step": 1295 + }, + { + "epoch": 0.0698727625620013, + "grad_norm": 0.8578245043754578, + "learning_rate": 9.993572140098648e-06, + "loss": 0.8853, + "step": 1296 + }, + { + "epoch": 0.06992667673064482, + "grad_norm": 1.1155178546905518, + "learning_rate": 9.993561388983845e-06, + "loss": 0.8199, + "step": 1297 + }, + { + "epoch": 0.06998059089928833, + "grad_norm": 1.035699486732483, + "learning_rate": 9.993550628891293e-06, + "loss": 0.9498, + "step": 1298 + }, + { + "epoch": 0.07003450506793185, + "grad_norm": 0.8635748028755188, + "learning_rate": 9.99353985982101e-06, + "loss": 0.8741, + "step": 1299 + }, + { + "epoch": 0.07008841923657537, + "grad_norm": 0.8650850653648376, + "learning_rate": 9.993529081773016e-06, + "loss": 0.7337, + "step": 1300 + }, + { + "epoch": 0.07014233340521889, + "grad_norm": 0.8334539532661438, + "learning_rate": 9.99351829474733e-06, + "loss": 0.8927, + "step": 1301 + }, + { + "epoch": 0.0701962475738624, + "grad_norm": 0.9150926470756531, + "learning_rate": 9.993507498743971e-06, + "loss": 0.8464, + "step": 1302 + }, + { + "epoch": 0.07025016174250594, + "grad_norm": 0.8916522860527039, + "learning_rate": 9.993496693762958e-06, + "loss": 0.7899, + "step": 1303 + }, + { + "epoch": 0.07030407591114946, + "grad_norm": 1.0224976539611816, + "learning_rate": 9.993485879804314e-06, + "loss": 0.8256, + "step": 1304 + }, + { + "epoch": 0.07035799007979297, + "grad_norm": 
0.921816885471344, + "learning_rate": 9.993475056868054e-06, + "loss": 0.7944, + "step": 1305 + }, + { + "epoch": 0.07041190424843649, + "grad_norm": 0.8775705099105835, + "learning_rate": 9.9934642249542e-06, + "loss": 0.9098, + "step": 1306 + }, + { + "epoch": 0.07046581841708001, + "grad_norm": 0.9802567362785339, + "learning_rate": 9.99345338406277e-06, + "loss": 0.9756, + "step": 1307 + }, + { + "epoch": 0.07051973258572353, + "grad_norm": 0.9785491228103638, + "learning_rate": 9.993442534193786e-06, + "loss": 1.0017, + "step": 1308 + }, + { + "epoch": 0.07057364675436704, + "grad_norm": 0.8796840906143188, + "learning_rate": 9.993431675347265e-06, + "loss": 0.7202, + "step": 1309 + }, + { + "epoch": 0.07062756092301056, + "grad_norm": 0.878099799156189, + "learning_rate": 9.993420807523227e-06, + "loss": 0.8655, + "step": 1310 + }, + { + "epoch": 0.07068147509165408, + "grad_norm": 0.8361509442329407, + "learning_rate": 9.99340993072169e-06, + "loss": 0.8522, + "step": 1311 + }, + { + "epoch": 0.07073538926029761, + "grad_norm": 0.8556873798370361, + "learning_rate": 9.99339904494268e-06, + "loss": 0.8603, + "step": 1312 + }, + { + "epoch": 0.07078930342894113, + "grad_norm": 0.8434461355209351, + "learning_rate": 9.993388150186208e-06, + "loss": 0.8571, + "step": 1313 + }, + { + "epoch": 0.07084321759758465, + "grad_norm": 0.8545907139778137, + "learning_rate": 9.9933772464523e-06, + "loss": 0.8145, + "step": 1314 + }, + { + "epoch": 0.07089713176622817, + "grad_norm": 0.9502561092376709, + "learning_rate": 9.993366333740971e-06, + "loss": 0.8068, + "step": 1315 + }, + { + "epoch": 0.07095104593487168, + "grad_norm": 0.848628580570221, + "learning_rate": 9.993355412052244e-06, + "loss": 0.8793, + "step": 1316 + }, + { + "epoch": 0.0710049601035152, + "grad_norm": 0.9699797630310059, + "learning_rate": 9.993344481386137e-06, + "loss": 0.9904, + "step": 1317 + }, + { + "epoch": 0.07105887427215872, + "grad_norm": 0.8888396620750427, + "learning_rate": 
9.993333541742671e-06, + "loss": 0.8363, + "step": 1318 + }, + { + "epoch": 0.07111278844080224, + "grad_norm": 0.8805423974990845, + "learning_rate": 9.993322593121863e-06, + "loss": 0.8905, + "step": 1319 + }, + { + "epoch": 0.07116670260944576, + "grad_norm": 0.8875272274017334, + "learning_rate": 9.993311635523736e-06, + "loss": 0.7717, + "step": 1320 + }, + { + "epoch": 0.07122061677808929, + "grad_norm": 0.8853299617767334, + "learning_rate": 9.993300668948308e-06, + "loss": 0.9077, + "step": 1321 + }, + { + "epoch": 0.0712745309467328, + "grad_norm": 0.8847644329071045, + "learning_rate": 9.993289693395599e-06, + "loss": 0.8362, + "step": 1322 + }, + { + "epoch": 0.07132844511537632, + "grad_norm": 0.9531683325767517, + "learning_rate": 9.993278708865629e-06, + "loss": 0.8848, + "step": 1323 + }, + { + "epoch": 0.07138235928401984, + "grad_norm": 0.8573325276374817, + "learning_rate": 9.993267715358414e-06, + "loss": 0.8367, + "step": 1324 + }, + { + "epoch": 0.07143627345266336, + "grad_norm": 0.8920298218727112, + "learning_rate": 9.99325671287398e-06, + "loss": 0.8838, + "step": 1325 + }, + { + "epoch": 0.07149018762130688, + "grad_norm": 0.8472782969474792, + "learning_rate": 9.993245701412343e-06, + "loss": 0.8313, + "step": 1326 + }, + { + "epoch": 0.0715441017899504, + "grad_norm": 1.047664761543274, + "learning_rate": 9.993234680973525e-06, + "loss": 0.8663, + "step": 1327 + }, + { + "epoch": 0.07159801595859391, + "grad_norm": 0.9395570158958435, + "learning_rate": 9.993223651557542e-06, + "loss": 0.7703, + "step": 1328 + }, + { + "epoch": 0.07165193012723743, + "grad_norm": 0.9125472903251648, + "learning_rate": 9.993212613164419e-06, + "loss": 0.9335, + "step": 1329 + }, + { + "epoch": 0.07170584429588096, + "grad_norm": 0.9043323397636414, + "learning_rate": 9.993201565794172e-06, + "loss": 0.9185, + "step": 1330 + }, + { + "epoch": 0.07175975846452448, + "grad_norm": 0.8764339089393616, + "learning_rate": 9.993190509446821e-06, + "loss": 0.8807, 
+ "step": 1331 + }, + { + "epoch": 0.071813672633168, + "grad_norm": 0.9123268723487854, + "learning_rate": 9.99317944412239e-06, + "loss": 0.8134, + "step": 1332 + }, + { + "epoch": 0.07186758680181152, + "grad_norm": 0.9625567197799683, + "learning_rate": 9.993168369820892e-06, + "loss": 0.8132, + "step": 1333 + }, + { + "epoch": 0.07192150097045504, + "grad_norm": 0.880536675453186, + "learning_rate": 9.993157286542352e-06, + "loss": 0.8107, + "step": 1334 + }, + { + "epoch": 0.07197541513909855, + "grad_norm": 0.9165224432945251, + "learning_rate": 9.99314619428679e-06, + "loss": 0.8376, + "step": 1335 + }, + { + "epoch": 0.07202932930774207, + "grad_norm": 0.8278066515922546, + "learning_rate": 9.993135093054223e-06, + "loss": 0.8075, + "step": 1336 + }, + { + "epoch": 0.07208324347638559, + "grad_norm": 0.9237795472145081, + "learning_rate": 9.993123982844674e-06, + "loss": 0.7838, + "step": 1337 + }, + { + "epoch": 0.0721371576450291, + "grad_norm": 0.8200939297676086, + "learning_rate": 9.993112863658161e-06, + "loss": 0.8475, + "step": 1338 + }, + { + "epoch": 0.07219107181367264, + "grad_norm": 0.8505958318710327, + "learning_rate": 9.993101735494704e-06, + "loss": 0.7891, + "step": 1339 + }, + { + "epoch": 0.07224498598231616, + "grad_norm": 0.8407264351844788, + "learning_rate": 9.993090598354323e-06, + "loss": 0.8128, + "step": 1340 + }, + { + "epoch": 0.07229890015095967, + "grad_norm": 0.8039887547492981, + "learning_rate": 9.993079452237038e-06, + "loss": 0.8504, + "step": 1341 + }, + { + "epoch": 0.07235281431960319, + "grad_norm": 0.7590643167495728, + "learning_rate": 9.993068297142871e-06, + "loss": 0.7402, + "step": 1342 + }, + { + "epoch": 0.07240672848824671, + "grad_norm": 0.7866249680519104, + "learning_rate": 9.993057133071842e-06, + "loss": 0.7076, + "step": 1343 + }, + { + "epoch": 0.07246064265689023, + "grad_norm": 0.9846029281616211, + "learning_rate": 9.993045960023967e-06, + "loss": 0.9179, + "step": 1344 + }, + { + "epoch": 
0.07251455682553375, + "grad_norm": 0.8918319940567017, + "learning_rate": 9.99303477799927e-06, + "loss": 0.8087, + "step": 1345 + }, + { + "epoch": 0.07256847099417726, + "grad_norm": 0.8407700061798096, + "learning_rate": 9.99302358699777e-06, + "loss": 0.7272, + "step": 1346 + }, + { + "epoch": 0.0726223851628208, + "grad_norm": 0.9637326598167419, + "learning_rate": 9.993012387019486e-06, + "loss": 0.8613, + "step": 1347 + }, + { + "epoch": 0.07267629933146431, + "grad_norm": 0.8362317681312561, + "learning_rate": 9.99300117806444e-06, + "loss": 0.917, + "step": 1348 + }, + { + "epoch": 0.07273021350010783, + "grad_norm": 0.8584982752799988, + "learning_rate": 9.992989960132651e-06, + "loss": 0.8857, + "step": 1349 + }, + { + "epoch": 0.07278412766875135, + "grad_norm": 0.8341198563575745, + "learning_rate": 9.992978733224139e-06, + "loss": 0.802, + "step": 1350 + }, + { + "epoch": 0.07283804183739487, + "grad_norm": 1.6860167980194092, + "learning_rate": 9.992967497338926e-06, + "loss": 0.8789, + "step": 1351 + }, + { + "epoch": 0.07289195600603839, + "grad_norm": 0.8399189114570618, + "learning_rate": 9.99295625247703e-06, + "loss": 0.6338, + "step": 1352 + }, + { + "epoch": 0.0729458701746819, + "grad_norm": 0.9616976976394653, + "learning_rate": 9.992944998638473e-06, + "loss": 0.9735, + "step": 1353 + }, + { + "epoch": 0.07299978434332542, + "grad_norm": 0.8592861890792847, + "learning_rate": 9.992933735823272e-06, + "loss": 0.8159, + "step": 1354 + }, + { + "epoch": 0.07305369851196894, + "grad_norm": 0.8448725342750549, + "learning_rate": 9.992922464031451e-06, + "loss": 0.7942, + "step": 1355 + }, + { + "epoch": 0.07310761268061247, + "grad_norm": 0.8015927672386169, + "learning_rate": 9.99291118326303e-06, + "loss": 0.7429, + "step": 1356 + }, + { + "epoch": 0.07316152684925599, + "grad_norm": 0.8255912065505981, + "learning_rate": 9.992899893518025e-06, + "loss": 0.8532, + "step": 1357 + }, + { + "epoch": 0.07321544101789951, + "grad_norm": 
0.8764085173606873, + "learning_rate": 9.992888594796462e-06, + "loss": 0.7989, + "step": 1358 + }, + { + "epoch": 0.07326935518654303, + "grad_norm": 0.8405522704124451, + "learning_rate": 9.992877287098357e-06, + "loss": 0.8709, + "step": 1359 + }, + { + "epoch": 0.07332326935518654, + "grad_norm": 0.8657836318016052, + "learning_rate": 9.992865970423733e-06, + "loss": 0.8236, + "step": 1360 + }, + { + "epoch": 0.07337718352383006, + "grad_norm": 0.8817959427833557, + "learning_rate": 9.992854644772609e-06, + "loss": 0.902, + "step": 1361 + }, + { + "epoch": 0.07343109769247358, + "grad_norm": 0.8290701508522034, + "learning_rate": 9.992843310145006e-06, + "loss": 0.8454, + "step": 1362 + }, + { + "epoch": 0.0734850118611171, + "grad_norm": 0.9637642502784729, + "learning_rate": 9.992831966540946e-06, + "loss": 0.9414, + "step": 1363 + }, + { + "epoch": 0.07353892602976062, + "grad_norm": 0.9220197200775146, + "learning_rate": 9.992820613960446e-06, + "loss": 0.9827, + "step": 1364 + }, + { + "epoch": 0.07359284019840415, + "grad_norm": 0.9008362889289856, + "learning_rate": 9.992809252403526e-06, + "loss": 0.8388, + "step": 1365 + }, + { + "epoch": 0.07364675436704766, + "grad_norm": 0.9517331123352051, + "learning_rate": 9.992797881870212e-06, + "loss": 0.8758, + "step": 1366 + }, + { + "epoch": 0.07370066853569118, + "grad_norm": 0.7811571359634399, + "learning_rate": 9.992786502360517e-06, + "loss": 0.6984, + "step": 1367 + }, + { + "epoch": 0.0737545827043347, + "grad_norm": 0.9887184500694275, + "learning_rate": 9.992775113874466e-06, + "loss": 0.7832, + "step": 1368 + }, + { + "epoch": 0.07380849687297822, + "grad_norm": 1.025869607925415, + "learning_rate": 9.99276371641208e-06, + "loss": 0.8417, + "step": 1369 + }, + { + "epoch": 0.07386241104162174, + "grad_norm": 0.8479165434837341, + "learning_rate": 9.99275230997338e-06, + "loss": 0.7862, + "step": 1370 + }, + { + "epoch": 0.07391632521026525, + "grad_norm": 0.9213555455207825, + "learning_rate": 
9.992740894558381e-06, + "loss": 0.915, + "step": 1371 + }, + { + "epoch": 0.07397023937890877, + "grad_norm": 0.832306444644928, + "learning_rate": 9.992729470167109e-06, + "loss": 0.7566, + "step": 1372 + }, + { + "epoch": 0.07402415354755229, + "grad_norm": 1.0360348224639893, + "learning_rate": 9.992718036799583e-06, + "loss": 0.9096, + "step": 1373 + }, + { + "epoch": 0.07407806771619582, + "grad_norm": 0.8898483514785767, + "learning_rate": 9.992706594455823e-06, + "loss": 0.8738, + "step": 1374 + }, + { + "epoch": 0.07413198188483934, + "grad_norm": 0.8813758492469788, + "learning_rate": 9.992695143135849e-06, + "loss": 0.8736, + "step": 1375 + }, + { + "epoch": 0.07418589605348286, + "grad_norm": 1.1480571031570435, + "learning_rate": 9.992683682839683e-06, + "loss": 0.915, + "step": 1376 + }, + { + "epoch": 0.07423981022212638, + "grad_norm": 0.8588376641273499, + "learning_rate": 9.992672213567345e-06, + "loss": 0.8295, + "step": 1377 + }, + { + "epoch": 0.0742937243907699, + "grad_norm": 0.8729918599128723, + "learning_rate": 9.992660735318858e-06, + "loss": 0.9058, + "step": 1378 + }, + { + "epoch": 0.07434763855941341, + "grad_norm": 0.7953224778175354, + "learning_rate": 9.992649248094236e-06, + "loss": 0.7857, + "step": 1379 + }, + { + "epoch": 0.07440155272805693, + "grad_norm": 0.8485717177391052, + "learning_rate": 9.992637751893508e-06, + "loss": 0.7641, + "step": 1380 + }, + { + "epoch": 0.07445546689670045, + "grad_norm": 0.8630878329277039, + "learning_rate": 9.99262624671669e-06, + "loss": 0.8624, + "step": 1381 + }, + { + "epoch": 0.07450938106534397, + "grad_norm": 0.8655185103416443, + "learning_rate": 9.992614732563802e-06, + "loss": 0.8428, + "step": 1382 + }, + { + "epoch": 0.0745632952339875, + "grad_norm": 0.7875732779502869, + "learning_rate": 9.992603209434868e-06, + "loss": 0.7272, + "step": 1383 + }, + { + "epoch": 0.07461720940263102, + "grad_norm": 0.875879168510437, + "learning_rate": 9.992591677329905e-06, + "loss": 0.8539, + 
"step": 1384 + }, + { + "epoch": 0.07467112357127453, + "grad_norm": 0.8618319034576416, + "learning_rate": 9.992580136248934e-06, + "loss": 0.879, + "step": 1385 + }, + { + "epoch": 0.07472503773991805, + "grad_norm": 0.8695591688156128, + "learning_rate": 9.992568586191981e-06, + "loss": 0.8477, + "step": 1386 + }, + { + "epoch": 0.07477895190856157, + "grad_norm": 0.8539825677871704, + "learning_rate": 9.992557027159062e-06, + "loss": 0.7347, + "step": 1387 + }, + { + "epoch": 0.07483286607720509, + "grad_norm": 0.9625217914581299, + "learning_rate": 9.992545459150197e-06, + "loss": 0.8561, + "step": 1388 + }, + { + "epoch": 0.0748867802458486, + "grad_norm": 0.9862298369407654, + "learning_rate": 9.992533882165409e-06, + "loss": 0.9583, + "step": 1389 + }, + { + "epoch": 0.07494069441449212, + "grad_norm": 0.8217719793319702, + "learning_rate": 9.99252229620472e-06, + "loss": 0.7995, + "step": 1390 + }, + { + "epoch": 0.07499460858313564, + "grad_norm": 0.8668621182441711, + "learning_rate": 9.992510701268147e-06, + "loss": 0.8484, + "step": 1391 + }, + { + "epoch": 0.07504852275177917, + "grad_norm": 0.8549453616142273, + "learning_rate": 9.992499097355716e-06, + "loss": 0.8552, + "step": 1392 + }, + { + "epoch": 0.07510243692042269, + "grad_norm": 0.8262618184089661, + "learning_rate": 9.992487484467444e-06, + "loss": 0.7054, + "step": 1393 + }, + { + "epoch": 0.07515635108906621, + "grad_norm": 0.8524961471557617, + "learning_rate": 9.992475862603352e-06, + "loss": 0.8231, + "step": 1394 + }, + { + "epoch": 0.07521026525770973, + "grad_norm": 0.7805570363998413, + "learning_rate": 9.99246423176346e-06, + "loss": 0.7778, + "step": 1395 + }, + { + "epoch": 0.07526417942635324, + "grad_norm": 0.950484037399292, + "learning_rate": 9.992452591947794e-06, + "loss": 0.8662, + "step": 1396 + }, + { + "epoch": 0.07531809359499676, + "grad_norm": 0.8746458888053894, + "learning_rate": 9.99244094315637e-06, + "loss": 0.7854, + "step": 1397 + }, + { + "epoch": 
0.07537200776364028, + "grad_norm": 0.9450538754463196, + "learning_rate": 9.992429285389212e-06, + "loss": 0.954, + "step": 1398 + }, + { + "epoch": 0.0754259219322838, + "grad_norm": 0.9048300385475159, + "learning_rate": 9.992417618646337e-06, + "loss": 0.8915, + "step": 1399 + }, + { + "epoch": 0.07547983610092733, + "grad_norm": 0.8735381364822388, + "learning_rate": 9.99240594292777e-06, + "loss": 0.8391, + "step": 1400 + }, + { + "epoch": 0.07553375026957085, + "grad_norm": 1.0980675220489502, + "learning_rate": 9.99239425823353e-06, + "loss": 0.8892, + "step": 1401 + }, + { + "epoch": 0.07558766443821437, + "grad_norm": 0.9016425013542175, + "learning_rate": 9.992382564563638e-06, + "loss": 0.8192, + "step": 1402 + }, + { + "epoch": 0.07564157860685788, + "grad_norm": 0.801419198513031, + "learning_rate": 9.992370861918117e-06, + "loss": 0.7914, + "step": 1403 + }, + { + "epoch": 0.0756954927755014, + "grad_norm": 0.9043407440185547, + "learning_rate": 9.992359150296985e-06, + "loss": 0.8767, + "step": 1404 + }, + { + "epoch": 0.07574940694414492, + "grad_norm": 0.9703086018562317, + "learning_rate": 9.992347429700266e-06, + "loss": 0.9173, + "step": 1405 + }, + { + "epoch": 0.07580332111278844, + "grad_norm": 0.8154104351997375, + "learning_rate": 9.992335700127978e-06, + "loss": 0.8453, + "step": 1406 + }, + { + "epoch": 0.07585723528143196, + "grad_norm": 0.8551482558250427, + "learning_rate": 9.992323961580146e-06, + "loss": 0.9132, + "step": 1407 + }, + { + "epoch": 0.07591114945007547, + "grad_norm": 0.9425063729286194, + "learning_rate": 9.992312214056785e-06, + "loss": 0.8171, + "step": 1408 + }, + { + "epoch": 0.075965063618719, + "grad_norm": 0.8958794474601746, + "learning_rate": 9.992300457557922e-06, + "loss": 0.7983, + "step": 1409 + }, + { + "epoch": 0.07601897778736252, + "grad_norm": 0.873874843120575, + "learning_rate": 9.992288692083579e-06, + "loss": 0.798, + "step": 1410 + }, + { + "epoch": 0.07607289195600604, + "grad_norm": 
0.7951189279556274, + "learning_rate": 9.99227691763377e-06, + "loss": 0.8671, + "step": 1411 + }, + { + "epoch": 0.07612680612464956, + "grad_norm": 0.8073802590370178, + "learning_rate": 9.992265134208522e-06, + "loss": 0.8214, + "step": 1412 + }, + { + "epoch": 0.07618072029329308, + "grad_norm": 0.918222188949585, + "learning_rate": 9.992253341807854e-06, + "loss": 0.807, + "step": 1413 + }, + { + "epoch": 0.0762346344619366, + "grad_norm": 0.834381103515625, + "learning_rate": 9.992241540431789e-06, + "loss": 0.8737, + "step": 1414 + }, + { + "epoch": 0.07628854863058011, + "grad_norm": 0.808437168598175, + "learning_rate": 9.992229730080347e-06, + "loss": 0.7982, + "step": 1415 + }, + { + "epoch": 0.07634246279922363, + "grad_norm": 0.7868708968162537, + "learning_rate": 9.992217910753547e-06, + "loss": 0.7071, + "step": 1416 + }, + { + "epoch": 0.07639637696786715, + "grad_norm": 0.8445919156074524, + "learning_rate": 9.992206082451416e-06, + "loss": 0.8353, + "step": 1417 + }, + { + "epoch": 0.07645029113651068, + "grad_norm": 0.8283419609069824, + "learning_rate": 9.992194245173969e-06, + "loss": 0.867, + "step": 1418 + }, + { + "epoch": 0.0765042053051542, + "grad_norm": 0.8390635251998901, + "learning_rate": 9.99218239892123e-06, + "loss": 0.822, + "step": 1419 + }, + { + "epoch": 0.07655811947379772, + "grad_norm": 0.9037001132965088, + "learning_rate": 9.992170543693222e-06, + "loss": 0.8759, + "step": 1420 + }, + { + "epoch": 0.07661203364244124, + "grad_norm": 0.9708169102668762, + "learning_rate": 9.992158679489965e-06, + "loss": 0.875, + "step": 1421 + }, + { + "epoch": 0.07666594781108475, + "grad_norm": 0.8712205290794373, + "learning_rate": 9.992146806311479e-06, + "loss": 0.8711, + "step": 1422 + }, + { + "epoch": 0.07671986197972827, + "grad_norm": 0.953936755657196, + "learning_rate": 9.992134924157786e-06, + "loss": 0.8117, + "step": 1423 + }, + { + "epoch": 0.07677377614837179, + "grad_norm": 1.3178669214248657, + "learning_rate": 
9.992123033028908e-06, + "loss": 0.8932, + "step": 1424 + }, + { + "epoch": 0.0768276903170153, + "grad_norm": 0.8657799959182739, + "learning_rate": 9.992111132924867e-06, + "loss": 0.8429, + "step": 1425 + }, + { + "epoch": 0.07688160448565882, + "grad_norm": 0.8979378938674927, + "learning_rate": 9.992099223845681e-06, + "loss": 0.9165, + "step": 1426 + }, + { + "epoch": 0.07693551865430236, + "grad_norm": 0.797493040561676, + "learning_rate": 9.992087305791376e-06, + "loss": 0.8139, + "step": 1427 + }, + { + "epoch": 0.07698943282294587, + "grad_norm": 0.9762497544288635, + "learning_rate": 9.99207537876197e-06, + "loss": 0.8006, + "step": 1428 + }, + { + "epoch": 0.07704334699158939, + "grad_norm": 0.9322238564491272, + "learning_rate": 9.992063442757487e-06, + "loss": 0.8708, + "step": 1429 + }, + { + "epoch": 0.07709726116023291, + "grad_norm": 0.9208402037620544, + "learning_rate": 9.992051497777947e-06, + "loss": 0.9137, + "step": 1430 + }, + { + "epoch": 0.07715117532887643, + "grad_norm": 0.9262849688529968, + "learning_rate": 9.99203954382337e-06, + "loss": 0.8043, + "step": 1431 + }, + { + "epoch": 0.07720508949751995, + "grad_norm": 1.0556507110595703, + "learning_rate": 9.992027580893781e-06, + "loss": 0.8321, + "step": 1432 + }, + { + "epoch": 0.07725900366616346, + "grad_norm": 1.0503417253494263, + "learning_rate": 9.9920156089892e-06, + "loss": 0.8875, + "step": 1433 + }, + { + "epoch": 0.07731291783480698, + "grad_norm": 0.8772387504577637, + "learning_rate": 9.992003628109647e-06, + "loss": 0.7407, + "step": 1434 + }, + { + "epoch": 0.0773668320034505, + "grad_norm": 0.942286491394043, + "learning_rate": 9.991991638255146e-06, + "loss": 0.8493, + "step": 1435 + }, + { + "epoch": 0.07742074617209403, + "grad_norm": 0.8584794998168945, + "learning_rate": 9.991979639425717e-06, + "loss": 0.8003, + "step": 1436 + }, + { + "epoch": 0.07747466034073755, + "grad_norm": 0.8247780203819275, + "learning_rate": 9.99196763162138e-06, + "loss": 0.9156, + 
"step": 1437 + }, + { + "epoch": 0.07752857450938107, + "grad_norm": 0.859018862247467, + "learning_rate": 9.99195561484216e-06, + "loss": 0.8255, + "step": 1438 + }, + { + "epoch": 0.07758248867802459, + "grad_norm": 0.9073282480239868, + "learning_rate": 9.991943589088078e-06, + "loss": 0.903, + "step": 1439 + }, + { + "epoch": 0.0776364028466681, + "grad_norm": 0.9324385523796082, + "learning_rate": 9.991931554359154e-06, + "loss": 0.8618, + "step": 1440 + }, + { + "epoch": 0.07769031701531162, + "grad_norm": 0.8038938045501709, + "learning_rate": 9.991919510655409e-06, + "loss": 0.7545, + "step": 1441 + }, + { + "epoch": 0.07774423118395514, + "grad_norm": 0.7999526858329773, + "learning_rate": 9.991907457976866e-06, + "loss": 0.6804, + "step": 1442 + }, + { + "epoch": 0.07779814535259866, + "grad_norm": 1.0165048837661743, + "learning_rate": 9.991895396323548e-06, + "loss": 0.7664, + "step": 1443 + }, + { + "epoch": 0.07785205952124218, + "grad_norm": 0.9513073563575745, + "learning_rate": 9.991883325695475e-06, + "loss": 0.8115, + "step": 1444 + }, + { + "epoch": 0.07790597368988571, + "grad_norm": 1.0391769409179688, + "learning_rate": 9.991871246092669e-06, + "loss": 0.9197, + "step": 1445 + }, + { + "epoch": 0.07795988785852923, + "grad_norm": 0.8990768194198608, + "learning_rate": 9.991859157515151e-06, + "loss": 0.9507, + "step": 1446 + }, + { + "epoch": 0.07801380202717274, + "grad_norm": 0.9990912079811096, + "learning_rate": 9.991847059962945e-06, + "loss": 0.7951, + "step": 1447 + }, + { + "epoch": 0.07806771619581626, + "grad_norm": 1.0030032396316528, + "learning_rate": 9.99183495343607e-06, + "loss": 0.7237, + "step": 1448 + }, + { + "epoch": 0.07812163036445978, + "grad_norm": 0.889561116695404, + "learning_rate": 9.991822837934551e-06, + "loss": 0.9061, + "step": 1449 + }, + { + "epoch": 0.0781755445331033, + "grad_norm": 0.8766982555389404, + "learning_rate": 9.991810713458405e-06, + "loss": 0.7952, + "step": 1450 + }, + { + "epoch": 
0.07822945870174682, + "grad_norm": 0.9144406914710999, + "learning_rate": 9.991798580007658e-06, + "loss": 0.9235, + "step": 1451 + }, + { + "epoch": 0.07828337287039033, + "grad_norm": 0.895516037940979, + "learning_rate": 9.99178643758233e-06, + "loss": 0.9469, + "step": 1452 + }, + { + "epoch": 0.07833728703903386, + "grad_norm": 0.8802943229675293, + "learning_rate": 9.991774286182443e-06, + "loss": 0.8548, + "step": 1453 + }, + { + "epoch": 0.07839120120767738, + "grad_norm": 1.2773913145065308, + "learning_rate": 9.99176212580802e-06, + "loss": 0.794, + "step": 1454 + }, + { + "epoch": 0.0784451153763209, + "grad_norm": 0.9501168131828308, + "learning_rate": 9.99174995645908e-06, + "loss": 0.8711, + "step": 1455 + }, + { + "epoch": 0.07849902954496442, + "grad_norm": 0.9047390222549438, + "learning_rate": 9.991737778135649e-06, + "loss": 0.8419, + "step": 1456 + }, + { + "epoch": 0.07855294371360794, + "grad_norm": 0.9492837190628052, + "learning_rate": 9.991725590837747e-06, + "loss": 0.9832, + "step": 1457 + }, + { + "epoch": 0.07860685788225145, + "grad_norm": 0.9585106372833252, + "learning_rate": 9.991713394565394e-06, + "loss": 0.8393, + "step": 1458 + }, + { + "epoch": 0.07866077205089497, + "grad_norm": 0.9568297266960144, + "learning_rate": 9.991701189318615e-06, + "loss": 0.8711, + "step": 1459 + }, + { + "epoch": 0.07871468621953849, + "grad_norm": 0.9201347231864929, + "learning_rate": 9.991688975097429e-06, + "loss": 0.7947, + "step": 1460 + }, + { + "epoch": 0.07876860038818201, + "grad_norm": 0.8375768661499023, + "learning_rate": 9.99167675190186e-06, + "loss": 0.8051, + "step": 1461 + }, + { + "epoch": 0.07882251455682554, + "grad_norm": 0.8397765755653381, + "learning_rate": 9.99166451973193e-06, + "loss": 0.7727, + "step": 1462 + }, + { + "epoch": 0.07887642872546906, + "grad_norm": 0.8697947859764099, + "learning_rate": 9.99165227858766e-06, + "loss": 0.8171, + "step": 1463 + }, + { + "epoch": 0.07893034289411258, + "grad_norm": 
0.8894750475883484, + "learning_rate": 9.991640028469073e-06, + "loss": 0.8773, + "step": 1464 + }, + { + "epoch": 0.0789842570627561, + "grad_norm": 0.8817871809005737, + "learning_rate": 9.991627769376189e-06, + "loss": 0.8983, + "step": 1465 + }, + { + "epoch": 0.07903817123139961, + "grad_norm": 0.9241123795509338, + "learning_rate": 9.99161550130903e-06, + "loss": 0.8967, + "step": 1466 + }, + { + "epoch": 0.07909208540004313, + "grad_norm": 0.852982223033905, + "learning_rate": 9.991603224267623e-06, + "loss": 0.9054, + "step": 1467 + }, + { + "epoch": 0.07914599956868665, + "grad_norm": 0.7719098925590515, + "learning_rate": 9.991590938251986e-06, + "loss": 0.7845, + "step": 1468 + }, + { + "epoch": 0.07919991373733017, + "grad_norm": 0.8700329661369324, + "learning_rate": 9.99157864326214e-06, + "loss": 0.9664, + "step": 1469 + }, + { + "epoch": 0.07925382790597368, + "grad_norm": 0.880553126335144, + "learning_rate": 9.991566339298112e-06, + "loss": 0.8803, + "step": 1470 + }, + { + "epoch": 0.07930774207461722, + "grad_norm": 0.9425762295722961, + "learning_rate": 9.991554026359918e-06, + "loss": 0.8259, + "step": 1471 + }, + { + "epoch": 0.07936165624326073, + "grad_norm": 0.8611294031143188, + "learning_rate": 9.991541704447585e-06, + "loss": 0.8693, + "step": 1472 + }, + { + "epoch": 0.07941557041190425, + "grad_norm": 0.856023907661438, + "learning_rate": 9.99152937356113e-06, + "loss": 0.7073, + "step": 1473 + }, + { + "epoch": 0.07946948458054777, + "grad_norm": 0.7763693332672119, + "learning_rate": 9.991517033700582e-06, + "loss": 0.6815, + "step": 1474 + }, + { + "epoch": 0.07952339874919129, + "grad_norm": 0.8417321443557739, + "learning_rate": 9.991504684865959e-06, + "loss": 0.8239, + "step": 1475 + }, + { + "epoch": 0.0795773129178348, + "grad_norm": 0.9151323437690735, + "learning_rate": 9.991492327057282e-06, + "loss": 0.8327, + "step": 1476 + }, + { + "epoch": 0.07963122708647832, + "grad_norm": 0.8285405039787292, + "learning_rate": 
9.991479960274576e-06, + "loss": 0.8623, + "step": 1477 + }, + { + "epoch": 0.07968514125512184, + "grad_norm": 0.8204792141914368, + "learning_rate": 9.991467584517863e-06, + "loss": 0.8494, + "step": 1478 + }, + { + "epoch": 0.07973905542376536, + "grad_norm": 0.8516230583190918, + "learning_rate": 9.991455199787164e-06, + "loss": 0.8219, + "step": 1479 + }, + { + "epoch": 0.07979296959240889, + "grad_norm": 0.9418333172798157, + "learning_rate": 9.991442806082501e-06, + "loss": 0.9293, + "step": 1480 + }, + { + "epoch": 0.07984688376105241, + "grad_norm": 0.8852763175964355, + "learning_rate": 9.991430403403898e-06, + "loss": 0.8124, + "step": 1481 + }, + { + "epoch": 0.07990079792969593, + "grad_norm": 0.8435791730880737, + "learning_rate": 9.991417991751376e-06, + "loss": 0.8634, + "step": 1482 + }, + { + "epoch": 0.07995471209833944, + "grad_norm": 0.7795083522796631, + "learning_rate": 9.991405571124957e-06, + "loss": 0.802, + "step": 1483 + }, + { + "epoch": 0.08000862626698296, + "grad_norm": 0.8102303743362427, + "learning_rate": 9.991393141524663e-06, + "loss": 0.7492, + "step": 1484 + }, + { + "epoch": 0.08006254043562648, + "grad_norm": 0.8433593511581421, + "learning_rate": 9.99138070295052e-06, + "loss": 0.7926, + "step": 1485 + }, + { + "epoch": 0.08011645460427, + "grad_norm": 0.8992267847061157, + "learning_rate": 9.991368255402546e-06, + "loss": 0.7859, + "step": 1486 + }, + { + "epoch": 0.08017036877291352, + "grad_norm": 0.8748059868812561, + "learning_rate": 9.991355798880765e-06, + "loss": 0.8245, + "step": 1487 + }, + { + "epoch": 0.08022428294155703, + "grad_norm": 0.8456832766532898, + "learning_rate": 9.9913433333852e-06, + "loss": 0.9009, + "step": 1488 + }, + { + "epoch": 0.08027819711020057, + "grad_norm": 0.8582474589347839, + "learning_rate": 9.991330858915873e-06, + "loss": 0.7607, + "step": 1489 + }, + { + "epoch": 0.08033211127884408, + "grad_norm": 0.8157060146331787, + "learning_rate": 9.991318375472807e-06, + "loss": 0.8426, + 
"step": 1490 + }, + { + "epoch": 0.0803860254474876, + "grad_norm": 0.7474784851074219, + "learning_rate": 9.991305883056021e-06, + "loss": 0.8014, + "step": 1491 + }, + { + "epoch": 0.08043993961613112, + "grad_norm": 0.8432475924491882, + "learning_rate": 9.991293381665543e-06, + "loss": 0.8254, + "step": 1492 + }, + { + "epoch": 0.08049385378477464, + "grad_norm": 0.8733057379722595, + "learning_rate": 9.991280871301392e-06, + "loss": 0.8694, + "step": 1493 + }, + { + "epoch": 0.08054776795341816, + "grad_norm": 0.8694074153900146, + "learning_rate": 9.991268351963592e-06, + "loss": 0.7306, + "step": 1494 + }, + { + "epoch": 0.08060168212206167, + "grad_norm": 0.8981258869171143, + "learning_rate": 9.991255823652162e-06, + "loss": 0.7821, + "step": 1495 + }, + { + "epoch": 0.08065559629070519, + "grad_norm": 0.9740719795227051, + "learning_rate": 9.99124328636713e-06, + "loss": 0.7678, + "step": 1496 + }, + { + "epoch": 0.08070951045934871, + "grad_norm": 0.8847763538360596, + "learning_rate": 9.991230740108515e-06, + "loss": 0.73, + "step": 1497 + }, + { + "epoch": 0.08076342462799224, + "grad_norm": 0.8909339308738708, + "learning_rate": 9.99121818487634e-06, + "loss": 0.7713, + "step": 1498 + }, + { + "epoch": 0.08081733879663576, + "grad_norm": 0.8183975219726562, + "learning_rate": 9.991205620670626e-06, + "loss": 0.8234, + "step": 1499 + }, + { + "epoch": 0.08087125296527928, + "grad_norm": 1.241355299949646, + "learning_rate": 9.991193047491399e-06, + "loss": 0.8135, + "step": 1500 + }, + { + "epoch": 0.0809251671339228, + "grad_norm": 0.9039500951766968, + "learning_rate": 9.991180465338682e-06, + "loss": 0.8642, + "step": 1501 + }, + { + "epoch": 0.08097908130256631, + "grad_norm": 1.1762068271636963, + "learning_rate": 9.991167874212493e-06, + "loss": 0.7892, + "step": 1502 + }, + { + "epoch": 0.08103299547120983, + "grad_norm": 0.8402833938598633, + "learning_rate": 9.991155274112857e-06, + "loss": 0.9054, + "step": 1503 + }, + { + "epoch": 
0.08108690963985335, + "grad_norm": 0.9271976351737976, + "learning_rate": 9.991142665039799e-06, + "loss": 0.8902, + "step": 1504 + }, + { + "epoch": 0.08114082380849687, + "grad_norm": 0.9105845093727112, + "learning_rate": 9.991130046993337e-06, + "loss": 0.8522, + "step": 1505 + }, + { + "epoch": 0.0811947379771404, + "grad_norm": 0.8248290419578552, + "learning_rate": 9.991117419973499e-06, + "loss": 0.882, + "step": 1506 + }, + { + "epoch": 0.08124865214578392, + "grad_norm": 1.0726820230484009, + "learning_rate": 9.991104783980305e-06, + "loss": 0.8001, + "step": 1507 + }, + { + "epoch": 0.08130256631442744, + "grad_norm": 1.296281337738037, + "learning_rate": 9.991092139013776e-06, + "loss": 1.0022, + "step": 1508 + }, + { + "epoch": 0.08135648048307095, + "grad_norm": 1.7287628650665283, + "learning_rate": 9.991079485073938e-06, + "loss": 0.914, + "step": 1509 + }, + { + "epoch": 0.08141039465171447, + "grad_norm": 0.8731694221496582, + "learning_rate": 9.991066822160813e-06, + "loss": 0.8672, + "step": 1510 + }, + { + "epoch": 0.08146430882035799, + "grad_norm": 0.875747799873352, + "learning_rate": 9.99105415027442e-06, + "loss": 0.8044, + "step": 1511 + }, + { + "epoch": 0.08151822298900151, + "grad_norm": 0.9055120348930359, + "learning_rate": 9.991041469414787e-06, + "loss": 0.8312, + "step": 1512 + }, + { + "epoch": 0.08157213715764502, + "grad_norm": 0.8849499821662903, + "learning_rate": 9.991028779581935e-06, + "loss": 0.889, + "step": 1513 + }, + { + "epoch": 0.08162605132628854, + "grad_norm": 0.9549855589866638, + "learning_rate": 9.991016080775884e-06, + "loss": 0.8929, + "step": 1514 + }, + { + "epoch": 0.08167996549493207, + "grad_norm": 0.8395527005195618, + "learning_rate": 9.991003372996662e-06, + "loss": 0.6774, + "step": 1515 + }, + { + "epoch": 0.08173387966357559, + "grad_norm": 0.7791672945022583, + "learning_rate": 9.990990656244287e-06, + "loss": 0.7178, + "step": 1516 + }, + { + "epoch": 0.08178779383221911, + "grad_norm": 
0.91841721534729, + "learning_rate": 9.990977930518785e-06, + "loss": 0.8372, + "step": 1517 + }, + { + "epoch": 0.08184170800086263, + "grad_norm": 0.923937976360321, + "learning_rate": 9.990965195820178e-06, + "loss": 0.8467, + "step": 1518 + }, + { + "epoch": 0.08189562216950615, + "grad_norm": 0.9804415106773376, + "learning_rate": 9.990952452148488e-06, + "loss": 0.9281, + "step": 1519 + }, + { + "epoch": 0.08194953633814966, + "grad_norm": 0.9396255016326904, + "learning_rate": 9.99093969950374e-06, + "loss": 0.8606, + "step": 1520 + }, + { + "epoch": 0.08200345050679318, + "grad_norm": 0.8492118120193481, + "learning_rate": 9.990926937885953e-06, + "loss": 0.8253, + "step": 1521 + }, + { + "epoch": 0.0820573646754367, + "grad_norm": 0.8482204079627991, + "learning_rate": 9.990914167295154e-06, + "loss": 0.7361, + "step": 1522 + }, + { + "epoch": 0.08211127884408022, + "grad_norm": 1.1302778720855713, + "learning_rate": 9.990901387731365e-06, + "loss": 0.7511, + "step": 1523 + }, + { + "epoch": 0.08216519301272375, + "grad_norm": 0.9285756945610046, + "learning_rate": 9.990888599194607e-06, + "loss": 0.8329, + "step": 1524 + }, + { + "epoch": 0.08221910718136727, + "grad_norm": 0.8932104110717773, + "learning_rate": 9.990875801684905e-06, + "loss": 0.8146, + "step": 1525 + }, + { + "epoch": 0.08227302135001079, + "grad_norm": 0.8232647180557251, + "learning_rate": 9.990862995202282e-06, + "loss": 0.763, + "step": 1526 + }, + { + "epoch": 0.0823269355186543, + "grad_norm": 0.8582163453102112, + "learning_rate": 9.990850179746759e-06, + "loss": 0.7675, + "step": 1527 + }, + { + "epoch": 0.08238084968729782, + "grad_norm": 0.9890977144241333, + "learning_rate": 9.990837355318362e-06, + "loss": 0.8438, + "step": 1528 + }, + { + "epoch": 0.08243476385594134, + "grad_norm": 0.9228235483169556, + "learning_rate": 9.990824521917113e-06, + "loss": 0.9324, + "step": 1529 + }, + { + "epoch": 0.08248867802458486, + "grad_norm": 0.8286252617835999, + "learning_rate": 
9.990811679543033e-06, + "loss": 0.872, + "step": 1530 + }, + { + "epoch": 0.08254259219322838, + "grad_norm": 0.8546530604362488, + "learning_rate": 9.990798828196146e-06, + "loss": 0.7256, + "step": 1531 + }, + { + "epoch": 0.0825965063618719, + "grad_norm": 0.8240640759468079, + "learning_rate": 9.990785967876478e-06, + "loss": 0.8083, + "step": 1532 + }, + { + "epoch": 0.08265042053051543, + "grad_norm": 0.8650565147399902, + "learning_rate": 9.99077309858405e-06, + "loss": 0.8274, + "step": 1533 + }, + { + "epoch": 0.08270433469915894, + "grad_norm": 0.7865849137306213, + "learning_rate": 9.990760220318884e-06, + "loss": 0.7978, + "step": 1534 + }, + { + "epoch": 0.08275824886780246, + "grad_norm": 0.8567995429039001, + "learning_rate": 9.990747333081005e-06, + "loss": 0.8172, + "step": 1535 + }, + { + "epoch": 0.08281216303644598, + "grad_norm": 0.8242521286010742, + "learning_rate": 9.990734436870435e-06, + "loss": 0.8045, + "step": 1536 + }, + { + "epoch": 0.0828660772050895, + "grad_norm": 0.801266074180603, + "learning_rate": 9.990721531687197e-06, + "loss": 0.8312, + "step": 1537 + }, + { + "epoch": 0.08291999137373302, + "grad_norm": 0.8027862906455994, + "learning_rate": 9.990708617531314e-06, + "loss": 0.7227, + "step": 1538 + }, + { + "epoch": 0.08297390554237653, + "grad_norm": 1.0332401990890503, + "learning_rate": 9.990695694402811e-06, + "loss": 0.9091, + "step": 1539 + }, + { + "epoch": 0.08302781971102005, + "grad_norm": 0.8537373542785645, + "learning_rate": 9.99068276230171e-06, + "loss": 0.7573, + "step": 1540 + }, + { + "epoch": 0.08308173387966357, + "grad_norm": 0.8734087944030762, + "learning_rate": 9.990669821228037e-06, + "loss": 0.901, + "step": 1541 + }, + { + "epoch": 0.0831356480483071, + "grad_norm": 0.8546577095985413, + "learning_rate": 9.99065687118181e-06, + "loss": 0.8294, + "step": 1542 + }, + { + "epoch": 0.08318956221695062, + "grad_norm": 0.9555438756942749, + "learning_rate": 9.990643912163055e-06, + "loss": 0.83, + 
"step": 1543 + }, + { + "epoch": 0.08324347638559414, + "grad_norm": 0.8778670430183411, + "learning_rate": 9.990630944171798e-06, + "loss": 0.8694, + "step": 1544 + }, + { + "epoch": 0.08329739055423765, + "grad_norm": 0.973791241645813, + "learning_rate": 9.990617967208058e-06, + "loss": 0.8348, + "step": 1545 + }, + { + "epoch": 0.08335130472288117, + "grad_norm": 0.7933714389801025, + "learning_rate": 9.990604981271858e-06, + "loss": 0.8208, + "step": 1546 + }, + { + "epoch": 0.08340521889152469, + "grad_norm": 0.9328469634056091, + "learning_rate": 9.990591986363226e-06, + "loss": 0.8188, + "step": 1547 + }, + { + "epoch": 0.08345913306016821, + "grad_norm": 0.8217103481292725, + "learning_rate": 9.990578982482183e-06, + "loss": 0.7948, + "step": 1548 + }, + { + "epoch": 0.08351304722881173, + "grad_norm": 0.8556894659996033, + "learning_rate": 9.990565969628749e-06, + "loss": 0.8129, + "step": 1549 + }, + { + "epoch": 0.08356696139745524, + "grad_norm": 0.901633083820343, + "learning_rate": 9.990552947802954e-06, + "loss": 0.9025, + "step": 1550 + }, + { + "epoch": 0.08362087556609878, + "grad_norm": 0.9021494388580322, + "learning_rate": 9.990539917004815e-06, + "loss": 0.8882, + "step": 1551 + }, + { + "epoch": 0.0836747897347423, + "grad_norm": 0.8187722563743591, + "learning_rate": 9.990526877234359e-06, + "loss": 0.7385, + "step": 1552 + }, + { + "epoch": 0.08372870390338581, + "grad_norm": 0.9237630367279053, + "learning_rate": 9.990513828491609e-06, + "loss": 0.851, + "step": 1553 + }, + { + "epoch": 0.08378261807202933, + "grad_norm": 1.1868582963943481, + "learning_rate": 9.990500770776589e-06, + "loss": 0.7701, + "step": 1554 + }, + { + "epoch": 0.08383653224067285, + "grad_norm": 0.9831421971321106, + "learning_rate": 9.990487704089322e-06, + "loss": 0.836, + "step": 1555 + }, + { + "epoch": 0.08389044640931637, + "grad_norm": 0.9255663752555847, + "learning_rate": 9.99047462842983e-06, + "loss": 0.7916, + "step": 1556 + }, + { + "epoch": 
0.08394436057795988, + "grad_norm": 1.0069084167480469, + "learning_rate": 9.990461543798137e-06, + "loss": 0.8652, + "step": 1557 + }, + { + "epoch": 0.0839982747466034, + "grad_norm": 0.943044900894165, + "learning_rate": 9.990448450194267e-06, + "loss": 0.9511, + "step": 1558 + }, + { + "epoch": 0.08405218891524693, + "grad_norm": 0.9996150135993958, + "learning_rate": 9.990435347618246e-06, + "loss": 0.8751, + "step": 1559 + }, + { + "epoch": 0.08410610308389045, + "grad_norm": 0.9531681537628174, + "learning_rate": 9.990422236070094e-06, + "loss": 0.8988, + "step": 1560 + }, + { + "epoch": 0.08416001725253397, + "grad_norm": 0.9504678249359131, + "learning_rate": 9.990409115549837e-06, + "loss": 0.808, + "step": 1561 + }, + { + "epoch": 0.08421393142117749, + "grad_norm": 0.9796282052993774, + "learning_rate": 9.990395986057496e-06, + "loss": 0.778, + "step": 1562 + }, + { + "epoch": 0.084267845589821, + "grad_norm": 0.8871618509292603, + "learning_rate": 9.990382847593096e-06, + "loss": 0.8945, + "step": 1563 + }, + { + "epoch": 0.08432175975846452, + "grad_norm": 0.8253110647201538, + "learning_rate": 9.990369700156662e-06, + "loss": 0.8206, + "step": 1564 + }, + { + "epoch": 0.08437567392710804, + "grad_norm": 0.8799824118614197, + "learning_rate": 9.990356543748216e-06, + "loss": 0.7665, + "step": 1565 + }, + { + "epoch": 0.08442958809575156, + "grad_norm": 0.8275637626647949, + "learning_rate": 9.990343378367782e-06, + "loss": 0.8468, + "step": 1566 + }, + { + "epoch": 0.08448350226439508, + "grad_norm": 1.0431691408157349, + "learning_rate": 9.990330204015382e-06, + "loss": 0.8539, + "step": 1567 + }, + { + "epoch": 0.08453741643303861, + "grad_norm": 1.298999547958374, + "learning_rate": 9.990317020691043e-06, + "loss": 0.8989, + "step": 1568 + }, + { + "epoch": 0.08459133060168213, + "grad_norm": 0.865868866443634, + "learning_rate": 9.990303828394787e-06, + "loss": 0.8296, + "step": 1569 + }, + { + "epoch": 0.08464524477032564, + "grad_norm": 
0.9162652492523193, + "learning_rate": 9.990290627126637e-06, + "loss": 0.8617, + "step": 1570 + }, + { + "epoch": 0.08469915893896916, + "grad_norm": 0.9753283858299255, + "learning_rate": 9.990277416886618e-06, + "loss": 0.8082, + "step": 1571 + }, + { + "epoch": 0.08475307310761268, + "grad_norm": 0.9561176300048828, + "learning_rate": 9.990264197674754e-06, + "loss": 0.8678, + "step": 1572 + }, + { + "epoch": 0.0848069872762562, + "grad_norm": 0.833341658115387, + "learning_rate": 9.990250969491067e-06, + "loss": 0.8164, + "step": 1573 + }, + { + "epoch": 0.08486090144489972, + "grad_norm": 0.9928603172302246, + "learning_rate": 9.990237732335581e-06, + "loss": 0.6889, + "step": 1574 + }, + { + "epoch": 0.08491481561354323, + "grad_norm": 1.0163367986679077, + "learning_rate": 9.990224486208322e-06, + "loss": 0.8278, + "step": 1575 + }, + { + "epoch": 0.08496872978218675, + "grad_norm": 0.9905970096588135, + "learning_rate": 9.990211231109312e-06, + "loss": 0.8094, + "step": 1576 + }, + { + "epoch": 0.08502264395083028, + "grad_norm": 0.9112648963928223, + "learning_rate": 9.990197967038574e-06, + "loss": 0.8782, + "step": 1577 + }, + { + "epoch": 0.0850765581194738, + "grad_norm": 1.1176974773406982, + "learning_rate": 9.990184693996136e-06, + "loss": 0.8826, + "step": 1578 + }, + { + "epoch": 0.08513047228811732, + "grad_norm": 0.7696222066879272, + "learning_rate": 9.990171411982016e-06, + "loss": 0.8025, + "step": 1579 + }, + { + "epoch": 0.08518438645676084, + "grad_norm": 0.9288634061813354, + "learning_rate": 9.990158120996242e-06, + "loss": 0.8777, + "step": 1580 + }, + { + "epoch": 0.08523830062540436, + "grad_norm": 0.9235022068023682, + "learning_rate": 9.990144821038839e-06, + "loss": 0.9339, + "step": 1581 + }, + { + "epoch": 0.08529221479404787, + "grad_norm": 0.9124205708503723, + "learning_rate": 9.990131512109826e-06, + "loss": 0.8368, + "step": 1582 + }, + { + "epoch": 0.08534612896269139, + "grad_norm": 0.8409048914909363, + "learning_rate": 
9.990118194209229e-06, + "loss": 0.7772, + "step": 1583 + }, + { + "epoch": 0.08540004313133491, + "grad_norm": 0.8279136419296265, + "learning_rate": 9.990104867337074e-06, + "loss": 0.738, + "step": 1584 + }, + { + "epoch": 0.08545395729997843, + "grad_norm": 0.8895745873451233, + "learning_rate": 9.990091531493382e-06, + "loss": 0.7669, + "step": 1585 + }, + { + "epoch": 0.08550787146862196, + "grad_norm": 0.9280734062194824, + "learning_rate": 9.99007818667818e-06, + "loss": 0.9052, + "step": 1586 + }, + { + "epoch": 0.08556178563726548, + "grad_norm": 0.7676610350608826, + "learning_rate": 9.990064832891491e-06, + "loss": 0.807, + "step": 1587 + }, + { + "epoch": 0.085615699805909, + "grad_norm": 0.9035676121711731, + "learning_rate": 9.990051470133337e-06, + "loss": 0.8848, + "step": 1588 + }, + { + "epoch": 0.08566961397455251, + "grad_norm": 1.0960334539413452, + "learning_rate": 9.990038098403742e-06, + "loss": 0.8279, + "step": 1589 + }, + { + "epoch": 0.08572352814319603, + "grad_norm": 0.87922203540802, + "learning_rate": 9.990024717702736e-06, + "loss": 0.8325, + "step": 1590 + }, + { + "epoch": 0.08577744231183955, + "grad_norm": 0.922815203666687, + "learning_rate": 9.990011328030335e-06, + "loss": 0.881, + "step": 1591 + }, + { + "epoch": 0.08583135648048307, + "grad_norm": 0.9880780577659607, + "learning_rate": 9.989997929386567e-06, + "loss": 0.7506, + "step": 1592 + }, + { + "epoch": 0.08588527064912659, + "grad_norm": 0.8827483057975769, + "learning_rate": 9.989984521771456e-06, + "loss": 0.8961, + "step": 1593 + }, + { + "epoch": 0.0859391848177701, + "grad_norm": 0.8395072817802429, + "learning_rate": 9.989971105185026e-06, + "loss": 0.8564, + "step": 1594 + }, + { + "epoch": 0.08599309898641364, + "grad_norm": 0.8731534481048584, + "learning_rate": 9.989957679627302e-06, + "loss": 0.8209, + "step": 1595 + }, + { + "epoch": 0.08604701315505715, + "grad_norm": 0.7969424724578857, + "learning_rate": 9.989944245098305e-06, + "loss": 0.8031, + 
"step": 1596 + }, + { + "epoch": 0.08610092732370067, + "grad_norm": 0.8420547246932983, + "learning_rate": 9.989930801598062e-06, + "loss": 0.8027, + "step": 1597 + }, + { + "epoch": 0.08615484149234419, + "grad_norm": 0.7900253534317017, + "learning_rate": 9.989917349126597e-06, + "loss": 0.8246, + "step": 1598 + }, + { + "epoch": 0.08620875566098771, + "grad_norm": 0.8860716819763184, + "learning_rate": 9.989903887683934e-06, + "loss": 0.7846, + "step": 1599 + }, + { + "epoch": 0.08626266982963122, + "grad_norm": 0.907744288444519, + "learning_rate": 9.989890417270097e-06, + "loss": 0.7813, + "step": 1600 + }, + { + "epoch": 0.08631658399827474, + "grad_norm": 0.764076828956604, + "learning_rate": 9.989876937885108e-06, + "loss": 0.7953, + "step": 1601 + }, + { + "epoch": 0.08637049816691826, + "grad_norm": 1.0143790245056152, + "learning_rate": 9.989863449528994e-06, + "loss": 0.8854, + "step": 1602 + }, + { + "epoch": 0.08642441233556178, + "grad_norm": 0.8605815172195435, + "learning_rate": 9.989849952201779e-06, + "loss": 0.9289, + "step": 1603 + }, + { + "epoch": 0.08647832650420531, + "grad_norm": 0.8897641897201538, + "learning_rate": 9.989836445903487e-06, + "loss": 0.8659, + "step": 1604 + }, + { + "epoch": 0.08653224067284883, + "grad_norm": 0.8893518447875977, + "learning_rate": 9.989822930634141e-06, + "loss": 0.8724, + "step": 1605 + }, + { + "epoch": 0.08658615484149235, + "grad_norm": 0.8152129054069519, + "learning_rate": 9.989809406393767e-06, + "loss": 0.8321, + "step": 1606 + }, + { + "epoch": 0.08664006901013586, + "grad_norm": 0.8394732475280762, + "learning_rate": 9.98979587318239e-06, + "loss": 0.8074, + "step": 1607 + }, + { + "epoch": 0.08669398317877938, + "grad_norm": 0.8038346767425537, + "learning_rate": 9.989782331000031e-06, + "loss": 0.8132, + "step": 1608 + }, + { + "epoch": 0.0867478973474229, + "grad_norm": 0.8574134111404419, + "learning_rate": 9.989768779846717e-06, + "loss": 0.8191, + "step": 1609 + }, + { + "epoch": 
0.08680181151606642, + "grad_norm": 1.0049889087677002, + "learning_rate": 9.989755219722472e-06, + "loss": 0.8771, + "step": 1610 + }, + { + "epoch": 0.08685572568470994, + "grad_norm": 0.9765112996101379, + "learning_rate": 9.989741650627319e-06, + "loss": 0.839, + "step": 1611 + }, + { + "epoch": 0.08690963985335347, + "grad_norm": 0.9430082440376282, + "learning_rate": 9.989728072561284e-06, + "loss": 1.0316, + "step": 1612 + }, + { + "epoch": 0.08696355402199699, + "grad_norm": 0.841590404510498, + "learning_rate": 9.989714485524391e-06, + "loss": 0.8727, + "step": 1613 + }, + { + "epoch": 0.0870174681906405, + "grad_norm": 0.9475975632667542, + "learning_rate": 9.989700889516664e-06, + "loss": 0.8131, + "step": 1614 + }, + { + "epoch": 0.08707138235928402, + "grad_norm": 0.8059530258178711, + "learning_rate": 9.98968728453813e-06, + "loss": 0.8297, + "step": 1615 + }, + { + "epoch": 0.08712529652792754, + "grad_norm": 0.8513601422309875, + "learning_rate": 9.989673670588808e-06, + "loss": 0.8016, + "step": 1616 + }, + { + "epoch": 0.08717921069657106, + "grad_norm": 0.8434658646583557, + "learning_rate": 9.989660047668728e-06, + "loss": 0.866, + "step": 1617 + }, + { + "epoch": 0.08723312486521458, + "grad_norm": 0.9081484079360962, + "learning_rate": 9.989646415777912e-06, + "loss": 0.816, + "step": 1618 + }, + { + "epoch": 0.0872870390338581, + "grad_norm": 0.7941877841949463, + "learning_rate": 9.989632774916385e-06, + "loss": 0.7191, + "step": 1619 + }, + { + "epoch": 0.08734095320250161, + "grad_norm": 0.8800172209739685, + "learning_rate": 9.98961912508417e-06, + "loss": 0.8135, + "step": 1620 + }, + { + "epoch": 0.08739486737114514, + "grad_norm": 0.7940575480461121, + "learning_rate": 9.989605466281292e-06, + "loss": 0.8124, + "step": 1621 + }, + { + "epoch": 0.08744878153978866, + "grad_norm": 0.9570618271827698, + "learning_rate": 9.989591798507779e-06, + "loss": 0.9043, + "step": 1622 + }, + { + "epoch": 0.08750269570843218, + "grad_norm": 
0.8635395169258118, + "learning_rate": 9.98957812176365e-06, + "loss": 0.835, + "step": 1623 + }, + { + "epoch": 0.0875566098770757, + "grad_norm": 0.8289955258369446, + "learning_rate": 9.989564436048932e-06, + "loss": 0.8265, + "step": 1624 + }, + { + "epoch": 0.08761052404571922, + "grad_norm": 0.9519028663635254, + "learning_rate": 9.989550741363654e-06, + "loss": 0.8127, + "step": 1625 + }, + { + "epoch": 0.08766443821436273, + "grad_norm": 0.9611422419548035, + "learning_rate": 9.989537037707834e-06, + "loss": 0.8422, + "step": 1626 + }, + { + "epoch": 0.08771835238300625, + "grad_norm": 0.8824746608734131, + "learning_rate": 9.9895233250815e-06, + "loss": 0.8669, + "step": 1627 + }, + { + "epoch": 0.08777226655164977, + "grad_norm": 0.8402838706970215, + "learning_rate": 9.989509603484676e-06, + "loss": 0.8072, + "step": 1628 + }, + { + "epoch": 0.08782618072029329, + "grad_norm": 0.7537099719047546, + "learning_rate": 9.989495872917386e-06, + "loss": 0.7127, + "step": 1629 + }, + { + "epoch": 0.08788009488893682, + "grad_norm": 0.78285151720047, + "learning_rate": 9.989482133379656e-06, + "loss": 0.819, + "step": 1630 + }, + { + "epoch": 0.08793400905758034, + "grad_norm": 0.9339445233345032, + "learning_rate": 9.98946838487151e-06, + "loss": 0.8694, + "step": 1631 + }, + { + "epoch": 0.08798792322622385, + "grad_norm": 0.8022040128707886, + "learning_rate": 9.989454627392973e-06, + "loss": 0.7601, + "step": 1632 + }, + { + "epoch": 0.08804183739486737, + "grad_norm": 0.8593827486038208, + "learning_rate": 9.98944086094407e-06, + "loss": 0.8536, + "step": 1633 + }, + { + "epoch": 0.08809575156351089, + "grad_norm": 0.8415039777755737, + "learning_rate": 9.989427085524824e-06, + "loss": 0.9027, + "step": 1634 + }, + { + "epoch": 0.08814966573215441, + "grad_norm": 0.9551103711128235, + "learning_rate": 9.989413301135263e-06, + "loss": 0.8063, + "step": 1635 + }, + { + "epoch": 0.08820357990079793, + "grad_norm": 0.8554351925849915, + "learning_rate": 
9.989399507775407e-06, + "loss": 0.7694, + "step": 1636 + }, + { + "epoch": 0.08825749406944144, + "grad_norm": 0.8688547015190125, + "learning_rate": 9.989385705445285e-06, + "loss": 0.8862, + "step": 1637 + }, + { + "epoch": 0.08831140823808496, + "grad_norm": 0.816558837890625, + "learning_rate": 9.98937189414492e-06, + "loss": 0.7302, + "step": 1638 + }, + { + "epoch": 0.0883653224067285, + "grad_norm": 0.8164445757865906, + "learning_rate": 9.989358073874337e-06, + "loss": 0.8724, + "step": 1639 + }, + { + "epoch": 0.08841923657537201, + "grad_norm": 0.8909460306167603, + "learning_rate": 9.989344244633564e-06, + "loss": 0.7618, + "step": 1640 + }, + { + "epoch": 0.08847315074401553, + "grad_norm": 1.0117470026016235, + "learning_rate": 9.98933040642262e-06, + "loss": 0.8191, + "step": 1641 + }, + { + "epoch": 0.08852706491265905, + "grad_norm": 0.8317937850952148, + "learning_rate": 9.989316559241533e-06, + "loss": 0.8339, + "step": 1642 + }, + { + "epoch": 0.08858097908130257, + "grad_norm": 0.7955135107040405, + "learning_rate": 9.98930270309033e-06, + "loss": 0.7799, + "step": 1643 + }, + { + "epoch": 0.08863489324994608, + "grad_norm": 0.996306300163269, + "learning_rate": 9.98928883796903e-06, + "loss": 0.8547, + "step": 1644 + }, + { + "epoch": 0.0886888074185896, + "grad_norm": 0.9679511189460754, + "learning_rate": 9.989274963877664e-06, + "loss": 1.0831, + "step": 1645 + }, + { + "epoch": 0.08874272158723312, + "grad_norm": 0.8471615314483643, + "learning_rate": 9.989261080816253e-06, + "loss": 0.7765, + "step": 1646 + }, + { + "epoch": 0.08879663575587664, + "grad_norm": 0.8662555813789368, + "learning_rate": 9.989247188784826e-06, + "loss": 0.8894, + "step": 1647 + }, + { + "epoch": 0.08885054992452017, + "grad_norm": 0.9549373388290405, + "learning_rate": 9.989233287783402e-06, + "loss": 0.8341, + "step": 1648 + }, + { + "epoch": 0.08890446409316369, + "grad_norm": 0.8179014325141907, + "learning_rate": 9.989219377812014e-06, + "loss": 0.8653, + 
"step": 1649 + }, + { + "epoch": 0.0889583782618072, + "grad_norm": 0.9237802624702454, + "learning_rate": 9.989205458870678e-06, + "loss": 0.8206, + "step": 1650 + }, + { + "epoch": 0.08901229243045072, + "grad_norm": 0.940217137336731, + "learning_rate": 9.989191530959426e-06, + "loss": 0.8695, + "step": 1651 + }, + { + "epoch": 0.08906620659909424, + "grad_norm": 0.9200409054756165, + "learning_rate": 9.98917759407828e-06, + "loss": 0.7984, + "step": 1652 + }, + { + "epoch": 0.08912012076773776, + "grad_norm": 0.9270562529563904, + "learning_rate": 9.989163648227265e-06, + "loss": 0.8265, + "step": 1653 + }, + { + "epoch": 0.08917403493638128, + "grad_norm": 0.9945223331451416, + "learning_rate": 9.989149693406408e-06, + "loss": 0.84, + "step": 1654 + }, + { + "epoch": 0.0892279491050248, + "grad_norm": 0.826195478439331, + "learning_rate": 9.98913572961573e-06, + "loss": 0.7862, + "step": 1655 + }, + { + "epoch": 0.08928186327366831, + "grad_norm": 0.9132022857666016, + "learning_rate": 9.989121756855263e-06, + "loss": 0.826, + "step": 1656 + }, + { + "epoch": 0.08933577744231185, + "grad_norm": 0.8559401631355286, + "learning_rate": 9.989107775125023e-06, + "loss": 0.8007, + "step": 1657 + }, + { + "epoch": 0.08938969161095536, + "grad_norm": 0.8000867366790771, + "learning_rate": 9.989093784425044e-06, + "loss": 0.7547, + "step": 1658 + }, + { + "epoch": 0.08944360577959888, + "grad_norm": 0.7761433720588684, + "learning_rate": 9.989079784755346e-06, + "loss": 0.8083, + "step": 1659 + }, + { + "epoch": 0.0894975199482424, + "grad_norm": 0.8072230815887451, + "learning_rate": 9.989065776115956e-06, + "loss": 0.892, + "step": 1660 + }, + { + "epoch": 0.08955143411688592, + "grad_norm": 0.9021360874176025, + "learning_rate": 9.989051758506898e-06, + "loss": 0.8715, + "step": 1661 + }, + { + "epoch": 0.08960534828552943, + "grad_norm": 0.7585147023200989, + "learning_rate": 9.989037731928197e-06, + "loss": 0.7115, + "step": 1662 + }, + { + "epoch": 
0.08965926245417295, + "grad_norm": 0.9388399124145508, + "learning_rate": 9.98902369637988e-06, + "loss": 0.8976, + "step": 1663 + }, + { + "epoch": 0.08971317662281647, + "grad_norm": 0.8454418778419495, + "learning_rate": 9.989009651861972e-06, + "loss": 0.8063, + "step": 1664 + }, + { + "epoch": 0.08976709079146, + "grad_norm": 0.82308030128479, + "learning_rate": 9.988995598374496e-06, + "loss": 0.8044, + "step": 1665 + }, + { + "epoch": 0.08982100496010352, + "grad_norm": 1.006800651550293, + "learning_rate": 9.98898153591748e-06, + "loss": 0.8609, + "step": 1666 + }, + { + "epoch": 0.08987491912874704, + "grad_norm": 0.8325724601745605, + "learning_rate": 9.988967464490947e-06, + "loss": 0.8295, + "step": 1667 + }, + { + "epoch": 0.08992883329739056, + "grad_norm": 0.7575547695159912, + "learning_rate": 9.988953384094923e-06, + "loss": 0.8252, + "step": 1668 + }, + { + "epoch": 0.08998274746603407, + "grad_norm": 0.869877278804779, + "learning_rate": 9.988939294729436e-06, + "loss": 0.8304, + "step": 1669 + }, + { + "epoch": 0.09003666163467759, + "grad_norm": 0.7840037941932678, + "learning_rate": 9.988925196394508e-06, + "loss": 0.7742, + "step": 1670 + }, + { + "epoch": 0.09009057580332111, + "grad_norm": 0.8044409155845642, + "learning_rate": 9.988911089090163e-06, + "loss": 0.8371, + "step": 1671 + }, + { + "epoch": 0.09014448997196463, + "grad_norm": 0.8635613322257996, + "learning_rate": 9.988896972816431e-06, + "loss": 0.7693, + "step": 1672 + }, + { + "epoch": 0.09019840414060815, + "grad_norm": 0.7780656814575195, + "learning_rate": 9.988882847573335e-06, + "loss": 0.841, + "step": 1673 + }, + { + "epoch": 0.09025231830925168, + "grad_norm": 0.8938048481941223, + "learning_rate": 9.9888687133609e-06, + "loss": 0.8149, + "step": 1674 + }, + { + "epoch": 0.0903062324778952, + "grad_norm": 0.8432002663612366, + "learning_rate": 9.988854570179152e-06, + "loss": 0.853, + "step": 1675 + }, + { + "epoch": 0.09036014664653871, + "grad_norm": 
0.8222450613975525, + "learning_rate": 9.988840418028118e-06, + "loss": 0.897, + "step": 1676 + }, + { + "epoch": 0.09041406081518223, + "grad_norm": 0.8370371460914612, + "learning_rate": 9.98882625690782e-06, + "loss": 0.8288, + "step": 1677 + }, + { + "epoch": 0.09046797498382575, + "grad_norm": 0.8510713577270508, + "learning_rate": 9.988812086818285e-06, + "loss": 0.7637, + "step": 1678 + }, + { + "epoch": 0.09052188915246927, + "grad_norm": 0.8271141648292542, + "learning_rate": 9.98879790775954e-06, + "loss": 0.853, + "step": 1679 + }, + { + "epoch": 0.09057580332111279, + "grad_norm": 1.0627025365829468, + "learning_rate": 9.988783719731607e-06, + "loss": 0.7569, + "step": 1680 + }, + { + "epoch": 0.0906297174897563, + "grad_norm": 0.880283534526825, + "learning_rate": 9.988769522734517e-06, + "loss": 0.8362, + "step": 1681 + }, + { + "epoch": 0.09068363165839982, + "grad_norm": 0.8721734881401062, + "learning_rate": 9.988755316768288e-06, + "loss": 0.8585, + "step": 1682 + }, + { + "epoch": 0.09073754582704335, + "grad_norm": 0.8830682039260864, + "learning_rate": 9.988741101832952e-06, + "loss": 0.8853, + "step": 1683 + }, + { + "epoch": 0.09079145999568687, + "grad_norm": 0.7676220536231995, + "learning_rate": 9.988726877928534e-06, + "loss": 0.7832, + "step": 1684 + }, + { + "epoch": 0.09084537416433039, + "grad_norm": 0.866149365901947, + "learning_rate": 9.988712645055055e-06, + "loss": 0.8534, + "step": 1685 + }, + { + "epoch": 0.09089928833297391, + "grad_norm": 0.8467028141021729, + "learning_rate": 9.988698403212546e-06, + "loss": 0.8637, + "step": 1686 + }, + { + "epoch": 0.09095320250161743, + "grad_norm": 0.913436770439148, + "learning_rate": 9.988684152401028e-06, + "loss": 0.855, + "step": 1687 + }, + { + "epoch": 0.09100711667026094, + "grad_norm": 0.8307977914810181, + "learning_rate": 9.98866989262053e-06, + "loss": 0.8538, + "step": 1688 + }, + { + "epoch": 0.09106103083890446, + "grad_norm": 1.13442862033844, + "learning_rate": 
9.988655623871075e-06, + "loss": 0.8129, + "step": 1689 + }, + { + "epoch": 0.09111494500754798, + "grad_norm": 0.8950080871582031, + "learning_rate": 9.988641346152692e-06, + "loss": 0.8674, + "step": 1690 + }, + { + "epoch": 0.0911688591761915, + "grad_norm": 0.9107043147087097, + "learning_rate": 9.988627059465403e-06, + "loss": 0.9507, + "step": 1691 + }, + { + "epoch": 0.09122277334483503, + "grad_norm": 0.8210874795913696, + "learning_rate": 9.988612763809237e-06, + "loss": 0.8913, + "step": 1692 + }, + { + "epoch": 0.09127668751347855, + "grad_norm": 1.0306476354599, + "learning_rate": 9.988598459184217e-06, + "loss": 0.8589, + "step": 1693 + }, + { + "epoch": 0.09133060168212206, + "grad_norm": 0.7582615613937378, + "learning_rate": 9.98858414559037e-06, + "loss": 0.7482, + "step": 1694 + }, + { + "epoch": 0.09138451585076558, + "grad_norm": 0.8572216629981995, + "learning_rate": 9.98856982302772e-06, + "loss": 0.822, + "step": 1695 + }, + { + "epoch": 0.0914384300194091, + "grad_norm": 0.9358139038085938, + "learning_rate": 9.988555491496297e-06, + "loss": 0.8298, + "step": 1696 + }, + { + "epoch": 0.09149234418805262, + "grad_norm": 0.8705672025680542, + "learning_rate": 9.988541150996123e-06, + "loss": 0.8818, + "step": 1697 + }, + { + "epoch": 0.09154625835669614, + "grad_norm": 0.9081273674964905, + "learning_rate": 9.988526801527224e-06, + "loss": 0.8994, + "step": 1698 + }, + { + "epoch": 0.09160017252533965, + "grad_norm": 0.7358905076980591, + "learning_rate": 9.988512443089627e-06, + "loss": 0.7752, + "step": 1699 + }, + { + "epoch": 0.09165408669398317, + "grad_norm": 0.8570963740348816, + "learning_rate": 9.988498075683357e-06, + "loss": 0.908, + "step": 1700 + }, + { + "epoch": 0.0917080008626267, + "grad_norm": 0.8998208045959473, + "learning_rate": 9.988483699308442e-06, + "loss": 0.8561, + "step": 1701 + }, + { + "epoch": 0.09176191503127022, + "grad_norm": 0.7481779456138611, + "learning_rate": 9.988469313964903e-06, + "loss": 0.7184, + 
"step": 1702 + }, + { + "epoch": 0.09181582919991374, + "grad_norm": 1.052809238433838, + "learning_rate": 9.988454919652772e-06, + "loss": 0.8579, + "step": 1703 + }, + { + "epoch": 0.09186974336855726, + "grad_norm": 0.8492130637168884, + "learning_rate": 9.988440516372071e-06, + "loss": 0.8796, + "step": 1704 + }, + { + "epoch": 0.09192365753720078, + "grad_norm": 0.884483277797699, + "learning_rate": 9.988426104122826e-06, + "loss": 0.8781, + "step": 1705 + }, + { + "epoch": 0.0919775717058443, + "grad_norm": 0.8844857811927795, + "learning_rate": 9.988411682905065e-06, + "loss": 0.8981, + "step": 1706 + }, + { + "epoch": 0.09203148587448781, + "grad_norm": 0.906216025352478, + "learning_rate": 9.988397252718811e-06, + "loss": 0.8741, + "step": 1707 + }, + { + "epoch": 0.09208540004313133, + "grad_norm": 0.8565787076950073, + "learning_rate": 9.988382813564092e-06, + "loss": 0.7358, + "step": 1708 + }, + { + "epoch": 0.09213931421177485, + "grad_norm": 0.8036391139030457, + "learning_rate": 9.988368365440935e-06, + "loss": 0.7966, + "step": 1709 + }, + { + "epoch": 0.09219322838041838, + "grad_norm": 1.1708556413650513, + "learning_rate": 9.988353908349361e-06, + "loss": 0.8385, + "step": 1710 + }, + { + "epoch": 0.0922471425490619, + "grad_norm": 0.8536746501922607, + "learning_rate": 9.988339442289403e-06, + "loss": 0.7387, + "step": 1711 + }, + { + "epoch": 0.09230105671770542, + "grad_norm": 0.8376518487930298, + "learning_rate": 9.988324967261083e-06, + "loss": 0.8537, + "step": 1712 + }, + { + "epoch": 0.09235497088634893, + "grad_norm": 0.8793227672576904, + "learning_rate": 9.988310483264426e-06, + "loss": 0.8028, + "step": 1713 + }, + { + "epoch": 0.09240888505499245, + "grad_norm": 0.8186830282211304, + "learning_rate": 9.98829599029946e-06, + "loss": 0.8478, + "step": 1714 + }, + { + "epoch": 0.09246279922363597, + "grad_norm": 0.8845428824424744, + "learning_rate": 9.98828148836621e-06, + "loss": 0.8524, + "step": 1715 + }, + { + "epoch": 
0.09251671339227949, + "grad_norm": 1.0494492053985596, + "learning_rate": 9.988266977464704e-06, + "loss": 0.8542, + "step": 1716 + }, + { + "epoch": 0.092570627560923, + "grad_norm": 0.8876493573188782, + "learning_rate": 9.988252457594966e-06, + "loss": 0.8989, + "step": 1717 + }, + { + "epoch": 0.09262454172956654, + "grad_norm": 0.8787088394165039, + "learning_rate": 9.988237928757024e-06, + "loss": 0.8214, + "step": 1718 + }, + { + "epoch": 0.09267845589821005, + "grad_norm": 1.069684624671936, + "learning_rate": 9.988223390950901e-06, + "loss": 0.9714, + "step": 1719 + }, + { + "epoch": 0.09273237006685357, + "grad_norm": 0.7957501411437988, + "learning_rate": 9.988208844176626e-06, + "loss": 0.7562, + "step": 1720 + }, + { + "epoch": 0.09278628423549709, + "grad_norm": 0.8354908227920532, + "learning_rate": 9.988194288434225e-06, + "loss": 0.7494, + "step": 1721 + }, + { + "epoch": 0.09284019840414061, + "grad_norm": 0.8205936551094055, + "learning_rate": 9.988179723723722e-06, + "loss": 0.7727, + "step": 1722 + }, + { + "epoch": 0.09289411257278413, + "grad_norm": 0.8364951014518738, + "learning_rate": 9.988165150045146e-06, + "loss": 0.861, + "step": 1723 + }, + { + "epoch": 0.09294802674142764, + "grad_norm": 0.8664119243621826, + "learning_rate": 9.98815056739852e-06, + "loss": 0.8512, + "step": 1724 + }, + { + "epoch": 0.09300194091007116, + "grad_norm": 0.9565482139587402, + "learning_rate": 9.988135975783874e-06, + "loss": 0.8606, + "step": 1725 + }, + { + "epoch": 0.09305585507871468, + "grad_norm": 0.8696085214614868, + "learning_rate": 9.988121375201232e-06, + "loss": 0.8614, + "step": 1726 + }, + { + "epoch": 0.09310976924735821, + "grad_norm": 0.8623467683792114, + "learning_rate": 9.98810676565062e-06, + "loss": 0.8547, + "step": 1727 + }, + { + "epoch": 0.09316368341600173, + "grad_norm": 0.8284831047058105, + "learning_rate": 9.988092147132064e-06, + "loss": 0.8376, + "step": 1728 + }, + { + "epoch": 0.09321759758464525, + "grad_norm": 
0.7768245339393616, + "learning_rate": 9.988077519645591e-06, + "loss": 0.7472, + "step": 1729 + }, + { + "epoch": 0.09327151175328877, + "grad_norm": 1.221225619316101, + "learning_rate": 9.988062883191228e-06, + "loss": 0.9052, + "step": 1730 + }, + { + "epoch": 0.09332542592193228, + "grad_norm": 1.0027954578399658, + "learning_rate": 9.988048237769002e-06, + "loss": 0.9411, + "step": 1731 + }, + { + "epoch": 0.0933793400905758, + "grad_norm": 0.8029824495315552, + "learning_rate": 9.988033583378937e-06, + "loss": 0.8141, + "step": 1732 + }, + { + "epoch": 0.09343325425921932, + "grad_norm": 0.8081389665603638, + "learning_rate": 9.98801892002106e-06, + "loss": 0.7977, + "step": 1733 + }, + { + "epoch": 0.09348716842786284, + "grad_norm": 0.887438952922821, + "learning_rate": 9.988004247695398e-06, + "loss": 0.8574, + "step": 1734 + }, + { + "epoch": 0.09354108259650636, + "grad_norm": 0.887238085269928, + "learning_rate": 9.987989566401977e-06, + "loss": 0.9041, + "step": 1735 + }, + { + "epoch": 0.09359499676514989, + "grad_norm": 0.9135997891426086, + "learning_rate": 9.987974876140822e-06, + "loss": 0.738, + "step": 1736 + }, + { + "epoch": 0.0936489109337934, + "grad_norm": 0.7749861478805542, + "learning_rate": 9.987960176911964e-06, + "loss": 0.773, + "step": 1737 + }, + { + "epoch": 0.09370282510243692, + "grad_norm": 0.7850096225738525, + "learning_rate": 9.987945468715425e-06, + "loss": 0.7924, + "step": 1738 + }, + { + "epoch": 0.09375673927108044, + "grad_norm": 0.8044145107269287, + "learning_rate": 9.987930751551231e-06, + "loss": 0.8196, + "step": 1739 + }, + { + "epoch": 0.09381065343972396, + "grad_norm": 0.8781464695930481, + "learning_rate": 9.987916025419413e-06, + "loss": 0.9337, + "step": 1740 + }, + { + "epoch": 0.09386456760836748, + "grad_norm": 1.0839952230453491, + "learning_rate": 9.987901290319993e-06, + "loss": 0.8092, + "step": 1741 + }, + { + "epoch": 0.093918481777011, + "grad_norm": 0.7910736203193665, + "learning_rate": 
9.987886546253e-06, + "loss": 0.8775, + "step": 1742 + }, + { + "epoch": 0.09397239594565451, + "grad_norm": 0.887287974357605, + "learning_rate": 9.98787179321846e-06, + "loss": 0.8271, + "step": 1743 + }, + { + "epoch": 0.09402631011429803, + "grad_norm": 1.1318427324295044, + "learning_rate": 9.987857031216397e-06, + "loss": 0.8328, + "step": 1744 + }, + { + "epoch": 0.09408022428294156, + "grad_norm": 0.8660401105880737, + "learning_rate": 9.987842260246842e-06, + "loss": 0.8647, + "step": 1745 + }, + { + "epoch": 0.09413413845158508, + "grad_norm": 0.9396790266036987, + "learning_rate": 9.98782748030982e-06, + "loss": 0.9373, + "step": 1746 + }, + { + "epoch": 0.0941880526202286, + "grad_norm": 0.8715323209762573, + "learning_rate": 9.987812691405353e-06, + "loss": 0.8621, + "step": 1747 + }, + { + "epoch": 0.09424196678887212, + "grad_norm": 0.7882347106933594, + "learning_rate": 9.987797893533475e-06, + "loss": 0.7283, + "step": 1748 + }, + { + "epoch": 0.09429588095751563, + "grad_norm": 0.9641733765602112, + "learning_rate": 9.987783086694208e-06, + "loss": 0.8038, + "step": 1749 + }, + { + "epoch": 0.09434979512615915, + "grad_norm": 0.8808518648147583, + "learning_rate": 9.98776827088758e-06, + "loss": 0.8072, + "step": 1750 + }, + { + "epoch": 0.09440370929480267, + "grad_norm": 0.7720713019371033, + "learning_rate": 9.987753446113618e-06, + "loss": 0.7786, + "step": 1751 + }, + { + "epoch": 0.09445762346344619, + "grad_norm": 1.0507936477661133, + "learning_rate": 9.987738612372346e-06, + "loss": 0.9302, + "step": 1752 + }, + { + "epoch": 0.0945115376320897, + "grad_norm": 0.7705017328262329, + "learning_rate": 9.987723769663795e-06, + "loss": 0.7366, + "step": 1753 + }, + { + "epoch": 0.09456545180073324, + "grad_norm": 0.82464200258255, + "learning_rate": 9.987708917987989e-06, + "loss": 0.8063, + "step": 1754 + }, + { + "epoch": 0.09461936596937676, + "grad_norm": 0.9387272000312805, + "learning_rate": 9.987694057344953e-06, + "loss": 0.8108, + 
"step": 1755 + }, + { + "epoch": 0.09467328013802027, + "grad_norm": 0.9161933064460754, + "learning_rate": 9.987679187734717e-06, + "loss": 0.8331, + "step": 1756 + }, + { + "epoch": 0.09472719430666379, + "grad_norm": 0.9379769563674927, + "learning_rate": 9.987664309157306e-06, + "loss": 0.9064, + "step": 1757 + }, + { + "epoch": 0.09478110847530731, + "grad_norm": 0.9597976803779602, + "learning_rate": 9.987649421612748e-06, + "loss": 0.7785, + "step": 1758 + }, + { + "epoch": 0.09483502264395083, + "grad_norm": 0.8689720630645752, + "learning_rate": 9.98763452510107e-06, + "loss": 0.7828, + "step": 1759 + }, + { + "epoch": 0.09488893681259435, + "grad_norm": 0.9207726716995239, + "learning_rate": 9.987619619622296e-06, + "loss": 0.7853, + "step": 1760 + }, + { + "epoch": 0.09494285098123786, + "grad_norm": 0.8130320310592651, + "learning_rate": 9.987604705176455e-06, + "loss": 0.858, + "step": 1761 + }, + { + "epoch": 0.09499676514988138, + "grad_norm": 0.9004638195037842, + "learning_rate": 9.987589781763574e-06, + "loss": 0.8148, + "step": 1762 + }, + { + "epoch": 0.09505067931852491, + "grad_norm": 0.8554181456565857, + "learning_rate": 9.987574849383678e-06, + "loss": 0.8103, + "step": 1763 + }, + { + "epoch": 0.09510459348716843, + "grad_norm": 0.9148527979850769, + "learning_rate": 9.987559908036797e-06, + "loss": 0.9467, + "step": 1764 + }, + { + "epoch": 0.09515850765581195, + "grad_norm": 0.890083909034729, + "learning_rate": 9.987544957722956e-06, + "loss": 0.8338, + "step": 1765 + }, + { + "epoch": 0.09521242182445547, + "grad_norm": 0.8118012547492981, + "learning_rate": 9.98752999844218e-06, + "loss": 0.8355, + "step": 1766 + }, + { + "epoch": 0.09526633599309899, + "grad_norm": 0.8115151524543762, + "learning_rate": 9.987515030194498e-06, + "loss": 0.9172, + "step": 1767 + }, + { + "epoch": 0.0953202501617425, + "grad_norm": 0.8750082850456238, + "learning_rate": 9.987500052979938e-06, + "loss": 0.8301, + "step": 1768 + }, + { + "epoch": 
0.09537416433038602, + "grad_norm": 0.9008756875991821, + "learning_rate": 9.987485066798525e-06, + "loss": 0.8642, + "step": 1769 + }, + { + "epoch": 0.09542807849902954, + "grad_norm": 0.8335922956466675, + "learning_rate": 9.987470071650287e-06, + "loss": 0.8466, + "step": 1770 + }, + { + "epoch": 0.09548199266767307, + "grad_norm": 0.8604272603988647, + "learning_rate": 9.987455067535249e-06, + "loss": 0.8801, + "step": 1771 + }, + { + "epoch": 0.09553590683631659, + "grad_norm": 0.889854371547699, + "learning_rate": 9.98744005445344e-06, + "loss": 0.8804, + "step": 1772 + }, + { + "epoch": 0.09558982100496011, + "grad_norm": 0.8756876587867737, + "learning_rate": 9.987425032404887e-06, + "loss": 0.8367, + "step": 1773 + }, + { + "epoch": 0.09564373517360363, + "grad_norm": 0.9071298837661743, + "learning_rate": 9.987410001389616e-06, + "loss": 0.8875, + "step": 1774 + }, + { + "epoch": 0.09569764934224714, + "grad_norm": 0.8214284777641296, + "learning_rate": 9.987394961407654e-06, + "loss": 0.7859, + "step": 1775 + }, + { + "epoch": 0.09575156351089066, + "grad_norm": 0.940034806728363, + "learning_rate": 9.98737991245903e-06, + "loss": 0.8272, + "step": 1776 + }, + { + "epoch": 0.09580547767953418, + "grad_norm": 0.8156501054763794, + "learning_rate": 9.987364854543768e-06, + "loss": 0.7831, + "step": 1777 + }, + { + "epoch": 0.0958593918481777, + "grad_norm": 0.8450450301170349, + "learning_rate": 9.987349787661898e-06, + "loss": 0.7888, + "step": 1778 + }, + { + "epoch": 0.09591330601682121, + "grad_norm": 0.8143148422241211, + "learning_rate": 9.987334711813446e-06, + "loss": 0.7593, + "step": 1779 + }, + { + "epoch": 0.09596722018546475, + "grad_norm": 1.0489457845687866, + "learning_rate": 9.987319626998437e-06, + "loss": 0.8248, + "step": 1780 + }, + { + "epoch": 0.09602113435410826, + "grad_norm": 0.9584689140319824, + "learning_rate": 9.987304533216901e-06, + "loss": 0.9025, + "step": 1781 + }, + { + "epoch": 0.09607504852275178, + "grad_norm": 
0.8366501331329346, + "learning_rate": 9.987289430468862e-06, + "loss": 0.7513, + "step": 1782 + }, + { + "epoch": 0.0961289626913953, + "grad_norm": 0.9896461963653564, + "learning_rate": 9.987274318754352e-06, + "loss": 0.8598, + "step": 1783 + }, + { + "epoch": 0.09618287686003882, + "grad_norm": 1.1904568672180176, + "learning_rate": 9.987259198073396e-06, + "loss": 0.9143, + "step": 1784 + }, + { + "epoch": 0.09623679102868234, + "grad_norm": 0.8100086450576782, + "learning_rate": 9.987244068426019e-06, + "loss": 0.7733, + "step": 1785 + }, + { + "epoch": 0.09629070519732585, + "grad_norm": 0.7814387083053589, + "learning_rate": 9.987228929812249e-06, + "loss": 0.7735, + "step": 1786 + }, + { + "epoch": 0.09634461936596937, + "grad_norm": 0.8880924582481384, + "learning_rate": 9.987213782232115e-06, + "loss": 0.8377, + "step": 1787 + }, + { + "epoch": 0.09639853353461289, + "grad_norm": 0.8739203810691833, + "learning_rate": 9.987198625685643e-06, + "loss": 0.8851, + "step": 1788 + }, + { + "epoch": 0.09645244770325642, + "grad_norm": 0.8984062671661377, + "learning_rate": 9.987183460172861e-06, + "loss": 0.8773, + "step": 1789 + }, + { + "epoch": 0.09650636187189994, + "grad_norm": 1.2485296726226807, + "learning_rate": 9.987168285693795e-06, + "loss": 0.787, + "step": 1790 + }, + { + "epoch": 0.09656027604054346, + "grad_norm": 0.8414161205291748, + "learning_rate": 9.987153102248474e-06, + "loss": 0.7895, + "step": 1791 + }, + { + "epoch": 0.09661419020918698, + "grad_norm": 0.7895180583000183, + "learning_rate": 9.987137909836924e-06, + "loss": 0.7592, + "step": 1792 + }, + { + "epoch": 0.0966681043778305, + "grad_norm": 1.0752787590026855, + "learning_rate": 9.987122708459173e-06, + "loss": 0.8472, + "step": 1793 + }, + { + "epoch": 0.09672201854647401, + "grad_norm": 0.9069424271583557, + "learning_rate": 9.987107498115247e-06, + "loss": 0.8746, + "step": 1794 + }, + { + "epoch": 0.09677593271511753, + "grad_norm": 0.8566716909408569, + "learning_rate": 
9.987092278805175e-06, + "loss": 0.7604, + "step": 1795 + }, + { + "epoch": 0.09682984688376105, + "grad_norm": 0.833852231502533, + "learning_rate": 9.987077050528983e-06, + "loss": 0.8645, + "step": 1796 + }, + { + "epoch": 0.09688376105240457, + "grad_norm": 0.8439596891403198, + "learning_rate": 9.9870618132867e-06, + "loss": 0.7673, + "step": 1797 + }, + { + "epoch": 0.0969376752210481, + "grad_norm": 0.9743669629096985, + "learning_rate": 9.987046567078352e-06, + "loss": 0.7754, + "step": 1798 + }, + { + "epoch": 0.09699158938969162, + "grad_norm": 0.9291634559631348, + "learning_rate": 9.987031311903968e-06, + "loss": 0.8431, + "step": 1799 + }, + { + "epoch": 0.09704550355833513, + "grad_norm": 1.169450283050537, + "learning_rate": 9.987016047763571e-06, + "loss": 0.9321, + "step": 1800 + }, + { + "epoch": 0.09709941772697865, + "grad_norm": 0.7758163809776306, + "learning_rate": 9.987000774657195e-06, + "loss": 0.7832, + "step": 1801 + }, + { + "epoch": 0.09715333189562217, + "grad_norm": 0.9673672914505005, + "learning_rate": 9.986985492584863e-06, + "loss": 0.9822, + "step": 1802 + }, + { + "epoch": 0.09720724606426569, + "grad_norm": 1.1516417264938354, + "learning_rate": 9.986970201546605e-06, + "loss": 0.9956, + "step": 1803 + }, + { + "epoch": 0.0972611602329092, + "grad_norm": 0.9660587906837463, + "learning_rate": 9.986954901542445e-06, + "loss": 0.8248, + "step": 1804 + }, + { + "epoch": 0.09731507440155272, + "grad_norm": 0.9452739953994751, + "learning_rate": 9.986939592572413e-06, + "loss": 0.8805, + "step": 1805 + }, + { + "epoch": 0.09736898857019624, + "grad_norm": 0.9339364171028137, + "learning_rate": 9.986924274636538e-06, + "loss": 0.8819, + "step": 1806 + }, + { + "epoch": 0.09742290273883977, + "grad_norm": 0.9344542026519775, + "learning_rate": 9.986908947734844e-06, + "loss": 0.8531, + "step": 1807 + }, + { + "epoch": 0.09747681690748329, + "grad_norm": 0.8910528421401978, + "learning_rate": 9.986893611867362e-06, + "loss": 0.8949, + 
"step": 1808 + }, + { + "epoch": 0.09753073107612681, + "grad_norm": 0.8484895825386047, + "learning_rate": 9.986878267034115e-06, + "loss": 0.8028, + "step": 1809 + }, + { + "epoch": 0.09758464524477033, + "grad_norm": 1.0784810781478882, + "learning_rate": 9.986862913235135e-06, + "loss": 0.9564, + "step": 1810 + }, + { + "epoch": 0.09763855941341384, + "grad_norm": 0.8350296020507812, + "learning_rate": 9.98684755047045e-06, + "loss": 0.8672, + "step": 1811 + }, + { + "epoch": 0.09769247358205736, + "grad_norm": 0.8558050990104675, + "learning_rate": 9.986832178740084e-06, + "loss": 0.8538, + "step": 1812 + }, + { + "epoch": 0.09774638775070088, + "grad_norm": 0.8633396029472351, + "learning_rate": 9.986816798044066e-06, + "loss": 0.8356, + "step": 1813 + }, + { + "epoch": 0.0978003019193444, + "grad_norm": 0.8256344199180603, + "learning_rate": 9.986801408382424e-06, + "loss": 0.7552, + "step": 1814 + }, + { + "epoch": 0.09785421608798792, + "grad_norm": 0.872844398021698, + "learning_rate": 9.986786009755186e-06, + "loss": 0.9153, + "step": 1815 + }, + { + "epoch": 0.09790813025663145, + "grad_norm": 0.842241108417511, + "learning_rate": 9.986770602162378e-06, + "loss": 0.7965, + "step": 1816 + }, + { + "epoch": 0.09796204442527497, + "grad_norm": 0.9673634171485901, + "learning_rate": 9.98675518560403e-06, + "loss": 0.8317, + "step": 1817 + }, + { + "epoch": 0.09801595859391848, + "grad_norm": 0.8744896650314331, + "learning_rate": 9.98673976008017e-06, + "loss": 0.7342, + "step": 1818 + }, + { + "epoch": 0.098069872762562, + "grad_norm": 0.7830422520637512, + "learning_rate": 9.986724325590825e-06, + "loss": 0.721, + "step": 1819 + }, + { + "epoch": 0.09812378693120552, + "grad_norm": 1.0335441827774048, + "learning_rate": 9.986708882136021e-06, + "loss": 0.8088, + "step": 1820 + }, + { + "epoch": 0.09817770109984904, + "grad_norm": 0.841342568397522, + "learning_rate": 9.986693429715785e-06, + "loss": 0.8847, + "step": 1821 + }, + { + "epoch": 
0.09823161526849256, + "grad_norm": 0.9405834674835205, + "learning_rate": 9.98667796833015e-06, + "loss": 0.8878, + "step": 1822 + }, + { + "epoch": 0.09828552943713607, + "grad_norm": 0.8358225226402283, + "learning_rate": 9.986662497979138e-06, + "loss": 0.7377, + "step": 1823 + }, + { + "epoch": 0.0983394436057796, + "grad_norm": 0.8844004273414612, + "learning_rate": 9.98664701866278e-06, + "loss": 0.7236, + "step": 1824 + }, + { + "epoch": 0.09839335777442312, + "grad_norm": 0.8165417313575745, + "learning_rate": 9.986631530381105e-06, + "loss": 0.819, + "step": 1825 + }, + { + "epoch": 0.09844727194306664, + "grad_norm": 0.9569553732872009, + "learning_rate": 9.986616033134137e-06, + "loss": 0.9337, + "step": 1826 + }, + { + "epoch": 0.09850118611171016, + "grad_norm": 0.8311771750450134, + "learning_rate": 9.986600526921907e-06, + "loss": 0.8516, + "step": 1827 + }, + { + "epoch": 0.09855510028035368, + "grad_norm": 0.9444357752799988, + "learning_rate": 9.986585011744441e-06, + "loss": 0.805, + "step": 1828 + }, + { + "epoch": 0.0986090144489972, + "grad_norm": 1.0128875970840454, + "learning_rate": 9.986569487601769e-06, + "loss": 0.8514, + "step": 1829 + }, + { + "epoch": 0.09866292861764071, + "grad_norm": 0.8973994255065918, + "learning_rate": 9.986553954493917e-06, + "loss": 0.7938, + "step": 1830 + }, + { + "epoch": 0.09871684278628423, + "grad_norm": 0.8571779131889343, + "learning_rate": 9.986538412420912e-06, + "loss": 0.7506, + "step": 1831 + }, + { + "epoch": 0.09877075695492775, + "grad_norm": 0.9053436517715454, + "learning_rate": 9.986522861382785e-06, + "loss": 0.8551, + "step": 1832 + }, + { + "epoch": 0.09882467112357128, + "grad_norm": 0.9941746592521667, + "learning_rate": 9.986507301379562e-06, + "loss": 0.8828, + "step": 1833 + }, + { + "epoch": 0.0988785852922148, + "grad_norm": 0.9620066285133362, + "learning_rate": 9.986491732411272e-06, + "loss": 0.8982, + "step": 1834 + }, + { + "epoch": 0.09893249946085832, + "grad_norm": 
0.9470074772834778, + "learning_rate": 9.986476154477941e-06, + "loss": 0.8295, + "step": 1835 + }, + { + "epoch": 0.09898641362950183, + "grad_norm": 0.9962137937545776, + "learning_rate": 9.986460567579599e-06, + "loss": 0.8714, + "step": 1836 + }, + { + "epoch": 0.09904032779814535, + "grad_norm": 0.8492829203605652, + "learning_rate": 9.986444971716273e-06, + "loss": 0.8234, + "step": 1837 + }, + { + "epoch": 0.09909424196678887, + "grad_norm": 0.9463719725608826, + "learning_rate": 9.986429366887994e-06, + "loss": 0.7769, + "step": 1838 + }, + { + "epoch": 0.09914815613543239, + "grad_norm": 0.8588153123855591, + "learning_rate": 9.986413753094786e-06, + "loss": 0.8883, + "step": 1839 + }, + { + "epoch": 0.0992020703040759, + "grad_norm": 0.7692183256149292, + "learning_rate": 9.986398130336677e-06, + "loss": 0.7691, + "step": 1840 + }, + { + "epoch": 0.09925598447271942, + "grad_norm": 0.8377199172973633, + "learning_rate": 9.986382498613699e-06, + "loss": 0.789, + "step": 1841 + }, + { + "epoch": 0.09930989864136296, + "grad_norm": 0.9783869385719299, + "learning_rate": 9.986366857925876e-06, + "loss": 0.8517, + "step": 1842 + }, + { + "epoch": 0.09936381281000647, + "grad_norm": 0.8233169913291931, + "learning_rate": 9.986351208273239e-06, + "loss": 0.8701, + "step": 1843 + }, + { + "epoch": 0.09941772697864999, + "grad_norm": 0.9393780827522278, + "learning_rate": 9.986335549655814e-06, + "loss": 0.8837, + "step": 1844 + }, + { + "epoch": 0.09947164114729351, + "grad_norm": 0.8517693877220154, + "learning_rate": 9.986319882073631e-06, + "loss": 0.9043, + "step": 1845 + }, + { + "epoch": 0.09952555531593703, + "grad_norm": 0.8296724557876587, + "learning_rate": 9.986304205526718e-06, + "loss": 0.7406, + "step": 1846 + }, + { + "epoch": 0.09957946948458055, + "grad_norm": 0.8372161388397217, + "learning_rate": 9.986288520015102e-06, + "loss": 0.7763, + "step": 1847 + }, + { + "epoch": 0.09963338365322406, + "grad_norm": 0.8086470365524292, + "learning_rate": 
9.986272825538812e-06, + "loss": 0.8786, + "step": 1848 + }, + { + "epoch": 0.09968729782186758, + "grad_norm": 0.8562842011451721, + "learning_rate": 9.986257122097875e-06, + "loss": 0.8391, + "step": 1849 + }, + { + "epoch": 0.0997412119905111, + "grad_norm": 0.9052720665931702, + "learning_rate": 9.986241409692321e-06, + "loss": 0.948, + "step": 1850 + }, + { + "epoch": 0.09979512615915463, + "grad_norm": 0.8220609426498413, + "learning_rate": 9.986225688322178e-06, + "loss": 0.8039, + "step": 1851 + }, + { + "epoch": 0.09984904032779815, + "grad_norm": 0.8018030524253845, + "learning_rate": 9.98620995798747e-06, + "loss": 0.7748, + "step": 1852 + }, + { + "epoch": 0.09990295449644167, + "grad_norm": 0.8150879144668579, + "learning_rate": 9.986194218688235e-06, + "loss": 0.7304, + "step": 1853 + }, + { + "epoch": 0.09995686866508519, + "grad_norm": 0.8677535653114319, + "learning_rate": 9.98617847042449e-06, + "loss": 0.8756, + "step": 1854 + }, + { + "epoch": 0.1000107828337287, + "grad_norm": 0.8889294862747192, + "learning_rate": 9.986162713196272e-06, + "loss": 0.8926, + "step": 1855 + } + ], + "logging_steps": 1, + "max_steps": 74192, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 1855, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.474209732975657e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1855/training_args.bin b/checkpoint-1855/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..736549377f516c4bc25a43293c6f37ec549a9a60 --- /dev/null +++ b/checkpoint-1855/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb97268504007caea6a1175a54f08b974d7fa47a1a5fb4547021d5b9d223b4a4 +size 7928 diff --git 
a/checkpoint-1855/zero_to_fp32.py b/checkpoint-1855/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-1855/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in 
state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to 
the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel 
{unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + 
print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + 
partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the 
partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + 
state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. 
you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. 
Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-3710/config.json b/checkpoint-3710/config.json new file mode 100644 index 0000000000000000000000000000000000000000..7f34bbd5159c9a132258ecf79562e79459cb64d9 --- /dev/null +++ b/checkpoint-3710/config.json @@ -0,0 +1,36 @@ +{ + "_name_or_path": "./meta-llama_Llama-3.1-8B-Instruct/", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128001, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.46.1", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/checkpoint-3710/generation_config.json b/checkpoint-3710/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0484b997a9ea9b5b6d711db644716bfd32d5470e --- /dev/null +++ b/checkpoint-3710/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + 
"transformers_version": "4.46.1" +} diff --git a/checkpoint-3710/global_step3710/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-3710/global_step3710/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..20eecad7490bd08efbc915570d68583f4d0139c0 --- /dev/null +++ b/checkpoint-3710/global_step3710/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10286ab639d30ded8ba5edd817a51c1d02df6790dca7a57508b3dd662cac4b73 +size 12045398464 diff --git a/checkpoint-3710/global_step3710/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-3710/global_step3710/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8bb85bdcd888c612771b34cc72c34b8b770a6d10 --- /dev/null +++ b/checkpoint-3710/global_step3710/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:991b413d3ffcfc7b78d63989076fbfe9f3635c00307f2cc677300478fd25c3ba +size 12045399232 diff --git a/checkpoint-3710/global_step3710/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-3710/global_step3710/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..99087f4a8a54a061f243020eddee1e8248b226fc --- /dev/null +++ b/checkpoint-3710/global_step3710/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19519676fda76b710ff25a88aba059b920f1662bf4d6ee4b639fee77d243099a +size 12045399488 diff --git a/checkpoint-3710/global_step3710/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-3710/global_step3710/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f9e29a2d98c8bd8ecd36878079766423edb7eec --- /dev/null +++ 
b/checkpoint-3710/global_step3710/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30bf413c97aaa2552a889a9ec3561fb73ecdbfe32a6f4e77be9150928b50e43d +size 12045399232 diff --git a/checkpoint-3710/global_step3710/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-3710/global_step3710/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ff4a14cdb190d6a24f27d1e2c53eda048c3752d8 --- /dev/null +++ b/checkpoint-3710/global_step3710/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a82980242675c001ba12a23ebe18875cbd94c1508b42a1dd889310d6245ffe7 +size 12045399488 diff --git a/checkpoint-3710/global_step3710/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-3710/global_step3710/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e3415ed77d235f2049728e1112876b7d73e35adc --- /dev/null +++ b/checkpoint-3710/global_step3710/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b462e9dca5fc68f7750c74dd105d1e1430d3324e1b36b0589c0348121a61ce42 +size 12045399552 diff --git a/checkpoint-3710/global_step3710/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-3710/global_step3710/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc2e236c911edacc8169e80dc836f60ca983277b --- /dev/null +++ b/checkpoint-3710/global_step3710/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec3a49d67ca39581e484e3f0a0bdbb00df822a7e1d1bee5e55d064bf3889ec40 +size 12045399232 diff --git a/checkpoint-3710/global_step3710/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt 
b/checkpoint-3710/global_step3710/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bd1ff3f3ed8f03a9564a1254dedf093df3c92e72 --- /dev/null +++ b/checkpoint-3710/global_step3710/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3803b8532b85c0edc10aebe8505cc4b0db91bd796d9e9518f56a418508d19350 +size 12045398144 diff --git a/checkpoint-3710/global_step3710/mp_rank_00_model_states.pt b/checkpoint-3710/global_step3710/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8df707a346ad6b0aeda5688f842982c512eff940 --- /dev/null +++ b/checkpoint-3710/global_step3710/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2693b8097c830403babca4838f23ef7aec16553e2dc7a7df0d35c602dd2e8ae +size 16060610552 diff --git a/checkpoint-3710/latest b/checkpoint-3710/latest new file mode 100644 index 0000000000000000000000000000000000000000..124ddfb0db5975ff2dfd07a7bcbdd6c2e5bacb0e --- /dev/null +++ b/checkpoint-3710/latest @@ -0,0 +1 @@ +global_step3710 \ No newline at end of file diff --git a/checkpoint-3710/model-00001-of-00004.safetensors b/checkpoint-3710/model-00001-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..013a577f156bd95ab53555e25803fff60611b8d1 --- /dev/null +++ b/checkpoint-3710/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ca4755e54ef51e959b66fe960cf91b0a5fd70d50060442b73ccfbe387b43ce +size 4976698672 diff --git a/checkpoint-3710/model-00002-of-00004.safetensors b/checkpoint-3710/model-00002-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8c8c0e5dec9d29bf3d7fcc2f8afd2ebe8d75d454 --- /dev/null +++ b/checkpoint-3710/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:28a41a0b91133a5d3872911c11f2954944cbc4662c73beeb1d30c36360110962 +size 4999802720 diff --git a/checkpoint-3710/model-00003-of-00004.safetensors b/checkpoint-3710/model-00003-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a35ed719fbb9759a0c5c5f4661eab5fc943ffd4c --- /dev/null +++ b/checkpoint-3710/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e3feb9885a98294b48c0dbaa4da22b1659f12665c3a9ce44f9d68f831ef4075 +size 4915916176 diff --git a/checkpoint-3710/model-00004-of-00004.safetensors b/checkpoint-3710/model-00004-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..70d33de83abf25f122dff10726927c86d198cb63 --- /dev/null +++ b/checkpoint-3710/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:649b0a56fb7599334a5e06281af2f433e1195110a8771dc7a34fca716b6355bb +size 1168138808 diff --git a/checkpoint-3710/model.safetensors.index.json b/checkpoint-3710/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0fd8120f1c6acddc268ebc2583058efaf699a771 --- /dev/null +++ b/checkpoint-3710/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 16060522496 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + 
"model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + 
"model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", 
+ "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", 
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00004-of-00004.safetensors" + } +} diff --git a/checkpoint-3710/rng_state_0.pth b/checkpoint-3710/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..b6473612e41c5cfd6973c2e71fa5f3ad2b2bcad1 --- /dev/null +++ b/checkpoint-3710/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:575119a228f98110923ffa2dedcb50e3317251b26054355d015e0b2240d566f2 +size 15984 diff --git a/checkpoint-3710/rng_state_1.pth b/checkpoint-3710/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..8506e00431b6ac7067699c0ea4f59adb6fa0ba20 --- /dev/null +++ b/checkpoint-3710/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0728b56dab7abb5ef8a0d4bae3519c5767c97467bdd886d26bf19cc8599d0312 +size 15984 diff --git a/checkpoint-3710/rng_state_2.pth b/checkpoint-3710/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..ea499e285c97cca07fedd34662c3d4ab44ff6f47 --- /dev/null +++ b/checkpoint-3710/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:f4e481d4ef1546694da7337f6bb6c658b866dcb79b85deeb477da0d27ebe851e +size 15984 diff --git a/checkpoint-3710/rng_state_3.pth b/checkpoint-3710/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..aeb38f92f106ac3f08bae4f82179a8a12243bccb --- /dev/null +++ b/checkpoint-3710/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:353c60be37ea56fc992fca446598ceca5d1fd002aa3bd6dbb9ad740e6f47ebb3 +size 15984 diff --git a/checkpoint-3710/rng_state_4.pth b/checkpoint-3710/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..9d5856cb7a3f15092fa5593507022316916f648e --- /dev/null +++ b/checkpoint-3710/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9107fe964ba7205e354084b85210e5a5ea1c98cfd4d38adb9cd3926945dcae4 +size 15984 diff --git a/checkpoint-3710/rng_state_5.pth b/checkpoint-3710/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b824ee24d256695aad4a69a62d8e7125f51a17f2 --- /dev/null +++ b/checkpoint-3710/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69d1bb1abee38b92e53f3f23549b642ce0f1edcdccf7b6129847ac61636e96d5 +size 15984 diff --git a/checkpoint-3710/rng_state_6.pth b/checkpoint-3710/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..a9fd0364bb8f1a8e91eca45be5e1b6672b4d9afd --- /dev/null +++ b/checkpoint-3710/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afd5516048e20f36959601574e29e40106085a7d3cdc7bf425ce5e84633490e6 +size 15984 diff --git a/checkpoint-3710/rng_state_7.pth b/checkpoint-3710/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..4e80125fd18efcb1097384319888b699f4dce7e7 --- /dev/null +++ b/checkpoint-3710/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8e2c46927fc06939b4c976a01e4b95dec1f8b98ceaea86d31a5d756fc30ff006 +size 15984 diff --git a/checkpoint-3710/scheduler.pt b/checkpoint-3710/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..fc5bf68155d8a04980d05bb385f508e55862bbb2 --- /dev/null +++ b/checkpoint-3710/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e8baf224c87ecd773c4790249a7c513e300b7a036aa93a88602ebdd86ed2f84 +size 1064 diff --git a/checkpoint-3710/special_tokens_map.json b/checkpoint-3710/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..e5b39b6305d89284b04934011c68dbb26bf588ca --- /dev/null +++ b/checkpoint-3710/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-3710/tokenizer.json b/checkpoint-3710/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1a5a81eb733cae803b39ffc7644de0048c3a26c3 --- /dev/null +++ b/checkpoint-3710/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07d7990a7c3f12081b24b3d098ab366211161e43494d2368211815c164b5f2b7 +size 17209828 diff --git a/checkpoint-3710/tokenizer_config.json b/checkpoint-3710/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5cd68a680b8f949dba64516158c30db7ea52c3cd --- /dev/null +++ b/checkpoint-3710/tokenizer_config.json @@ -0,0 +1,2062 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, 
+ "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", 
+ "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|im_pseudo|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|end_pseudo|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|im_date|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|end_date|>", + "lstrip": false, + "normalized": false, + "rstrip": 
false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|begin_of_post|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|end_of_post|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": 
"<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": 
false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": 
"<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": 
"<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-3710/trainer_state.json b/checkpoint-3710/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3b0d9e80775000ad0b6f38b427642606cc966b79 --- /dev/null +++ b/checkpoint-3710/trainer_state.json @@ -0,0 +1,26003 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.2000215656674574, + "eval_steps": 500, + "global_step": 3710, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 5.391416864351952e-05, + "grad_norm": 53.75010299682617, + "learning_rate": 1.0000000000000001e-07, + "loss": 2.5864, + "step": 1 + }, + { + "epoch": 0.00010782833728703904, + "grad_norm": 45.00067138671875, + 
"learning_rate": 2.0000000000000002e-07, + "loss": 2.3757, + "step": 2 + }, + { + "epoch": 0.00016174250593055855, + "grad_norm": 51.22366714477539, + "learning_rate": 3.0000000000000004e-07, + "loss": 2.4653, + "step": 3 + }, + { + "epoch": 0.00021565667457407807, + "grad_norm": 62.225242614746094, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.5819, + "step": 4 + }, + { + "epoch": 0.0002695708432175976, + "grad_norm": 54.67008590698242, + "learning_rate": 5.000000000000001e-07, + "loss": 2.6368, + "step": 5 + }, + { + "epoch": 0.0003234850118611171, + "grad_norm": 51.261009216308594, + "learning_rate": 6.000000000000001e-07, + "loss": 2.3245, + "step": 6 + }, + { + "epoch": 0.0003773991805046366, + "grad_norm": 53.58714294433594, + "learning_rate": 7.000000000000001e-07, + "loss": 2.7622, + "step": 7 + }, + { + "epoch": 0.00043131334914815614, + "grad_norm": 41.32997131347656, + "learning_rate": 8.000000000000001e-07, + "loss": 2.6444, + "step": 8 + }, + { + "epoch": 0.00048522751779167566, + "grad_norm": 33.232242584228516, + "learning_rate": 9.000000000000001e-07, + "loss": 2.1475, + "step": 9 + }, + { + "epoch": 0.0005391416864351952, + "grad_norm": 34.1890983581543, + "learning_rate": 1.0000000000000002e-06, + "loss": 2.7256, + "step": 10 + }, + { + "epoch": 0.0005930558550787146, + "grad_norm": 19.263437271118164, + "learning_rate": 1.1e-06, + "loss": 2.4132, + "step": 11 + }, + { + "epoch": 0.0006469700237222342, + "grad_norm": 15.612638473510742, + "learning_rate": 1.2000000000000002e-06, + "loss": 2.0422, + "step": 12 + }, + { + "epoch": 0.0007008841923657537, + "grad_norm": 13.81751537322998, + "learning_rate": 1.3e-06, + "loss": 1.9663, + "step": 13 + }, + { + "epoch": 0.0007547983610092732, + "grad_norm": 16.390897750854492, + "learning_rate": 1.4000000000000001e-06, + "loss": 2.1135, + "step": 14 + }, + { + "epoch": 0.0008087125296527927, + "grad_norm": 21.830646514892578, + "learning_rate": 1.5e-06, + "loss": 2.217, + "step": 15 + }, + { + 
"epoch": 0.0008626266982963123, + "grad_norm": 18.630046844482422, + "learning_rate": 1.6000000000000001e-06, + "loss": 2.1612, + "step": 16 + }, + { + "epoch": 0.0009165408669398317, + "grad_norm": 12.403571128845215, + "learning_rate": 1.7000000000000002e-06, + "loss": 1.9358, + "step": 17 + }, + { + "epoch": 0.0009704550355833513, + "grad_norm": 7.713366508483887, + "learning_rate": 1.8000000000000001e-06, + "loss": 1.8522, + "step": 18 + }, + { + "epoch": 0.001024369204226871, + "grad_norm": 7.731616973876953, + "learning_rate": 1.9000000000000002e-06, + "loss": 1.7984, + "step": 19 + }, + { + "epoch": 0.0010782833728703904, + "grad_norm": 7.5799174308776855, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.701, + "step": 20 + }, + { + "epoch": 0.0011321975415139098, + "grad_norm": 5.5428080558776855, + "learning_rate": 2.1000000000000002e-06, + "loss": 1.624, + "step": 21 + }, + { + "epoch": 0.0011861117101574293, + "grad_norm": 5.851474285125732, + "learning_rate": 2.2e-06, + "loss": 1.8064, + "step": 22 + }, + { + "epoch": 0.001240025878800949, + "grad_norm": 5.243111610412598, + "learning_rate": 2.3000000000000004e-06, + "loss": 1.7246, + "step": 23 + }, + { + "epoch": 0.0012939400474444684, + "grad_norm": 4.835971832275391, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.763, + "step": 24 + }, + { + "epoch": 0.0013478542160879879, + "grad_norm": 4.127845287322998, + "learning_rate": 2.5e-06, + "loss": 1.5869, + "step": 25 + }, + { + "epoch": 0.0014017683847315074, + "grad_norm": 3.7648322582244873, + "learning_rate": 2.6e-06, + "loss": 1.5599, + "step": 26 + }, + { + "epoch": 0.001455682553375027, + "grad_norm": 3.5424962043762207, + "learning_rate": 2.7000000000000004e-06, + "loss": 1.4703, + "step": 27 + }, + { + "epoch": 0.0015095967220185465, + "grad_norm": 3.3707985877990723, + "learning_rate": 2.8000000000000003e-06, + "loss": 1.354, + "step": 28 + }, + { + "epoch": 0.001563510890662066, + "grad_norm": 4.71254825592041, + 
"learning_rate": 2.9e-06, + "loss": 1.8162, + "step": 29 + }, + { + "epoch": 0.0016174250593055854, + "grad_norm": 3.7660300731658936, + "learning_rate": 3e-06, + "loss": 1.5951, + "step": 30 + }, + { + "epoch": 0.001671339227949105, + "grad_norm": 3.4810571670532227, + "learning_rate": 3.1000000000000004e-06, + "loss": 1.5183, + "step": 31 + }, + { + "epoch": 0.0017252533965926246, + "grad_norm": 3.672693967819214, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.6374, + "step": 32 + }, + { + "epoch": 0.001779167565236144, + "grad_norm": 3.3589682579040527, + "learning_rate": 3.3000000000000006e-06, + "loss": 1.4371, + "step": 33 + }, + { + "epoch": 0.0018330817338796635, + "grad_norm": 3.6365807056427, + "learning_rate": 3.4000000000000005e-06, + "loss": 1.595, + "step": 34 + }, + { + "epoch": 0.0018869959025231832, + "grad_norm": 3.6467039585113525, + "learning_rate": 3.5e-06, + "loss": 1.5714, + "step": 35 + }, + { + "epoch": 0.0019409100711667026, + "grad_norm": 3.4684648513793945, + "learning_rate": 3.6000000000000003e-06, + "loss": 1.4897, + "step": 36 + }, + { + "epoch": 0.001994824239810222, + "grad_norm": 3.70845627784729, + "learning_rate": 3.7e-06, + "loss": 1.5954, + "step": 37 + }, + { + "epoch": 0.002048738408453742, + "grad_norm": 3.1803395748138428, + "learning_rate": 3.8000000000000005e-06, + "loss": 1.3976, + "step": 38 + }, + { + "epoch": 0.002102652577097261, + "grad_norm": 2.851703405380249, + "learning_rate": 3.900000000000001e-06, + "loss": 1.1894, + "step": 39 + }, + { + "epoch": 0.0021565667457407807, + "grad_norm": 2.832003593444824, + "learning_rate": 4.000000000000001e-06, + "loss": 1.353, + "step": 40 + }, + { + "epoch": 0.0022104809143843004, + "grad_norm": 3.397498607635498, + "learning_rate": 4.1e-06, + "loss": 1.4541, + "step": 41 + }, + { + "epoch": 0.0022643950830278196, + "grad_norm": 3.4537954330444336, + "learning_rate": 4.2000000000000004e-06, + "loss": 1.4475, + "step": 42 + }, + { + "epoch": 0.0023183092516713393, + 
"grad_norm": 3.1131632328033447, + "learning_rate": 4.3e-06, + "loss": 1.2707, + "step": 43 + }, + { + "epoch": 0.0023722234203148586, + "grad_norm": 3.0421881675720215, + "learning_rate": 4.4e-06, + "loss": 1.3418, + "step": 44 + }, + { + "epoch": 0.0024261375889583782, + "grad_norm": 3.528514862060547, + "learning_rate": 4.5e-06, + "loss": 1.4432, + "step": 45 + }, + { + "epoch": 0.002480051757601898, + "grad_norm": 3.6783225536346436, + "learning_rate": 4.600000000000001e-06, + "loss": 1.4863, + "step": 46 + }, + { + "epoch": 0.002533965926245417, + "grad_norm": 2.9829189777374268, + "learning_rate": 4.7e-06, + "loss": 1.2856, + "step": 47 + }, + { + "epoch": 0.002587880094888937, + "grad_norm": 3.4480350017547607, + "learning_rate": 4.800000000000001e-06, + "loss": 1.4129, + "step": 48 + }, + { + "epoch": 0.0026417942635324565, + "grad_norm": 3.4247214794158936, + "learning_rate": 4.9000000000000005e-06, + "loss": 1.3467, + "step": 49 + }, + { + "epoch": 0.0026957084321759758, + "grad_norm": 3.5268948078155518, + "learning_rate": 5e-06, + "loss": 1.4795, + "step": 50 + }, + { + "epoch": 0.0027496226008194955, + "grad_norm": 3.3228304386138916, + "learning_rate": 5.1e-06, + "loss": 1.461, + "step": 51 + }, + { + "epoch": 0.0028035367694630147, + "grad_norm": 3.365630865097046, + "learning_rate": 5.2e-06, + "loss": 1.2947, + "step": 52 + }, + { + "epoch": 0.0028574509381065344, + "grad_norm": 3.4889328479766846, + "learning_rate": 5.300000000000001e-06, + "loss": 1.432, + "step": 53 + }, + { + "epoch": 0.002911365106750054, + "grad_norm": 3.5767273902893066, + "learning_rate": 5.400000000000001e-06, + "loss": 1.3773, + "step": 54 + }, + { + "epoch": 0.0029652792753935733, + "grad_norm": 3.499298095703125, + "learning_rate": 5.500000000000001e-06, + "loss": 1.4132, + "step": 55 + }, + { + "epoch": 0.003019193444037093, + "grad_norm": 3.6990244388580322, + "learning_rate": 5.600000000000001e-06, + "loss": 1.4595, + "step": 56 + }, + { + "epoch": 
0.0030731076126806127, + "grad_norm": 3.0908327102661133, + "learning_rate": 5.7e-06, + "loss": 1.1873, + "step": 57 + }, + { + "epoch": 0.003127021781324132, + "grad_norm": 3.149425745010376, + "learning_rate": 5.8e-06, + "loss": 1.3306, + "step": 58 + }, + { + "epoch": 0.0031809359499676516, + "grad_norm": 3.193023204803467, + "learning_rate": 5.9e-06, + "loss": 1.3326, + "step": 59 + }, + { + "epoch": 0.003234850118611171, + "grad_norm": 3.610344409942627, + "learning_rate": 6e-06, + "loss": 1.4527, + "step": 60 + }, + { + "epoch": 0.0032887642872546905, + "grad_norm": 2.9877095222473145, + "learning_rate": 6.1e-06, + "loss": 1.2029, + "step": 61 + }, + { + "epoch": 0.00334267845589821, + "grad_norm": 3.0241923332214355, + "learning_rate": 6.200000000000001e-06, + "loss": 1.3413, + "step": 62 + }, + { + "epoch": 0.0033965926245417295, + "grad_norm": 3.212700366973877, + "learning_rate": 6.300000000000001e-06, + "loss": 1.3471, + "step": 63 + }, + { + "epoch": 0.003450506793185249, + "grad_norm": 2.7138960361480713, + "learning_rate": 6.4000000000000006e-06, + "loss": 1.0885, + "step": 64 + }, + { + "epoch": 0.0035044209618287684, + "grad_norm": 2.5690340995788574, + "learning_rate": 6.5000000000000004e-06, + "loss": 1.1168, + "step": 65 + }, + { + "epoch": 0.003558335130472288, + "grad_norm": 3.0344784259796143, + "learning_rate": 6.600000000000001e-06, + "loss": 1.2828, + "step": 66 + }, + { + "epoch": 0.0036122492991158077, + "grad_norm": 3.0589816570281982, + "learning_rate": 6.700000000000001e-06, + "loss": 1.2604, + "step": 67 + }, + { + "epoch": 0.003666163467759327, + "grad_norm": 2.676417112350464, + "learning_rate": 6.800000000000001e-06, + "loss": 1.1679, + "step": 68 + }, + { + "epoch": 0.0037200776364028467, + "grad_norm": 2.6590960025787354, + "learning_rate": 6.9e-06, + "loss": 1.2283, + "step": 69 + }, + { + "epoch": 0.0037739918050463664, + "grad_norm": 2.6973354816436768, + "learning_rate": 7e-06, + "loss": 1.2028, + "step": 70 + }, + { + 
"epoch": 0.0038279059736898856, + "grad_norm": 2.7046608924865723, + "learning_rate": 7.100000000000001e-06, + "loss": 1.2629, + "step": 71 + }, + { + "epoch": 0.0038818201423334053, + "grad_norm": 2.2172696590423584, + "learning_rate": 7.2000000000000005e-06, + "loss": 1.1367, + "step": 72 + }, + { + "epoch": 0.0039357343109769245, + "grad_norm": 2.6138789653778076, + "learning_rate": 7.3e-06, + "loss": 1.3167, + "step": 73 + }, + { + "epoch": 0.003989648479620444, + "grad_norm": 2.2926838397979736, + "learning_rate": 7.4e-06, + "loss": 1.2909, + "step": 74 + }, + { + "epoch": 0.004043562648263964, + "grad_norm": 2.0647220611572266, + "learning_rate": 7.500000000000001e-06, + "loss": 1.2054, + "step": 75 + }, + { + "epoch": 0.004097476816907484, + "grad_norm": 2.1190452575683594, + "learning_rate": 7.600000000000001e-06, + "loss": 1.1497, + "step": 76 + }, + { + "epoch": 0.004151390985551002, + "grad_norm": 1.9973243474960327, + "learning_rate": 7.7e-06, + "loss": 1.1997, + "step": 77 + }, + { + "epoch": 0.004205305154194522, + "grad_norm": 2.11751651763916, + "learning_rate": 7.800000000000002e-06, + "loss": 1.2181, + "step": 78 + }, + { + "epoch": 0.004259219322838042, + "grad_norm": 1.8975950479507446, + "learning_rate": 7.9e-06, + "loss": 1.1582, + "step": 79 + }, + { + "epoch": 0.004313133491481561, + "grad_norm": 1.8368147611618042, + "learning_rate": 8.000000000000001e-06, + "loss": 1.1389, + "step": 80 + }, + { + "epoch": 0.004367047660125081, + "grad_norm": 1.7472988367080688, + "learning_rate": 8.1e-06, + "loss": 1.0959, + "step": 81 + }, + { + "epoch": 0.004420961828768601, + "grad_norm": 1.7325443029403687, + "learning_rate": 8.2e-06, + "loss": 1.1847, + "step": 82 + }, + { + "epoch": 0.00447487599741212, + "grad_norm": 1.6171561479568481, + "learning_rate": 8.3e-06, + "loss": 0.9834, + "step": 83 + }, + { + "epoch": 0.004528790166055639, + "grad_norm": 1.6583327054977417, + "learning_rate": 8.400000000000001e-06, + "loss": 1.0413, + "step": 84 + }, + 
{ + "epoch": 0.004582704334699159, + "grad_norm": 1.8914967775344849, + "learning_rate": 8.5e-06, + "loss": 1.2413, + "step": 85 + }, + { + "epoch": 0.004636618503342679, + "grad_norm": 1.6018317937850952, + "learning_rate": 8.6e-06, + "loss": 1.0577, + "step": 86 + }, + { + "epoch": 0.004690532671986198, + "grad_norm": 1.9170053005218506, + "learning_rate": 8.700000000000001e-06, + "loss": 1.2463, + "step": 87 + }, + { + "epoch": 0.004744446840629717, + "grad_norm": 1.666536569595337, + "learning_rate": 8.8e-06, + "loss": 1.0532, + "step": 88 + }, + { + "epoch": 0.004798361009273237, + "grad_norm": 1.660115361213684, + "learning_rate": 8.900000000000001e-06, + "loss": 1.0514, + "step": 89 + }, + { + "epoch": 0.0048522751779167565, + "grad_norm": 1.8667477369308472, + "learning_rate": 9e-06, + "loss": 1.2039, + "step": 90 + }, + { + "epoch": 0.004906189346560276, + "grad_norm": 1.9490039348602295, + "learning_rate": 9.100000000000001e-06, + "loss": 1.1804, + "step": 91 + }, + { + "epoch": 0.004960103515203796, + "grad_norm": 1.8415377140045166, + "learning_rate": 9.200000000000002e-06, + "loss": 1.1435, + "step": 92 + }, + { + "epoch": 0.005014017683847315, + "grad_norm": 1.8571438789367676, + "learning_rate": 9.3e-06, + "loss": 1.0974, + "step": 93 + }, + { + "epoch": 0.005067931852490834, + "grad_norm": 1.8480113744735718, + "learning_rate": 9.4e-06, + "loss": 1.149, + "step": 94 + }, + { + "epoch": 0.005121846021134354, + "grad_norm": 2.003490447998047, + "learning_rate": 9.5e-06, + "loss": 1.1954, + "step": 95 + }, + { + "epoch": 0.005175760189777874, + "grad_norm": 1.8002668619155884, + "learning_rate": 9.600000000000001e-06, + "loss": 0.9953, + "step": 96 + }, + { + "epoch": 0.005229674358421393, + "grad_norm": 1.9040817022323608, + "learning_rate": 9.7e-06, + "loss": 1.1195, + "step": 97 + }, + { + "epoch": 0.005283588527064913, + "grad_norm": 1.8311433792114258, + "learning_rate": 9.800000000000001e-06, + "loss": 1.083, + "step": 98 + }, + { + "epoch": 
0.005337502695708432, + "grad_norm": 1.9509624242782593, + "learning_rate": 9.9e-06, + "loss": 1.176, + "step": 99 + }, + { + "epoch": 0.0053914168643519516, + "grad_norm": 2.0624589920043945, + "learning_rate": 1e-05, + "loss": 1.119, + "step": 100 + }, + { + "epoch": 0.005445331032995471, + "grad_norm": 1.9618796110153198, + "learning_rate": 9.999999995505339e-06, + "loss": 1.1371, + "step": 101 + }, + { + "epoch": 0.005499245201638991, + "grad_norm": 1.946245551109314, + "learning_rate": 9.999999982021349e-06, + "loss": 0.9736, + "step": 102 + }, + { + "epoch": 0.005553159370282511, + "grad_norm": 1.9871301651000977, + "learning_rate": 9.999999959548035e-06, + "loss": 1.1077, + "step": 103 + }, + { + "epoch": 0.005607073538926029, + "grad_norm": 1.86216402053833, + "learning_rate": 9.999999928085396e-06, + "loss": 1.0882, + "step": 104 + }, + { + "epoch": 0.005660987707569549, + "grad_norm": 1.8447723388671875, + "learning_rate": 9.999999887633432e-06, + "loss": 1.0344, + "step": 105 + }, + { + "epoch": 0.005714901876213069, + "grad_norm": 1.8345638513565063, + "learning_rate": 9.99999983819214e-06, + "loss": 1.1077, + "step": 106 + }, + { + "epoch": 0.0057688160448565885, + "grad_norm": 1.8410178422927856, + "learning_rate": 9.999999779761524e-06, + "loss": 1.0824, + "step": 107 + }, + { + "epoch": 0.005822730213500108, + "grad_norm": 1.5881969928741455, + "learning_rate": 9.999999712341583e-06, + "loss": 0.9439, + "step": 108 + }, + { + "epoch": 0.005876644382143627, + "grad_norm": 1.6704047918319702, + "learning_rate": 9.999999635932316e-06, + "loss": 1.033, + "step": 109 + }, + { + "epoch": 0.005930558550787147, + "grad_norm": 1.792449712753296, + "learning_rate": 9.999999550533726e-06, + "loss": 1.0279, + "step": 110 + }, + { + "epoch": 0.005984472719430666, + "grad_norm": 1.6515668630599976, + "learning_rate": 9.999999456145809e-06, + "loss": 1.0301, + "step": 111 + }, + { + "epoch": 0.006038386888074186, + "grad_norm": 1.8541395664215088, + 
"learning_rate": 9.999999352768568e-06, + "loss": 1.1057, + "step": 112 + }, + { + "epoch": 0.006092301056717706, + "grad_norm": 1.6490236520767212, + "learning_rate": 9.999999240402002e-06, + "loss": 1.0523, + "step": 113 + }, + { + "epoch": 0.006146215225361225, + "grad_norm": 1.655333161354065, + "learning_rate": 9.999999119046113e-06, + "loss": 1.0448, + "step": 114 + }, + { + "epoch": 0.006200129394004744, + "grad_norm": 1.5721609592437744, + "learning_rate": 9.999998988700899e-06, + "loss": 0.9883, + "step": 115 + }, + { + "epoch": 0.006254043562648264, + "grad_norm": 1.6411349773406982, + "learning_rate": 9.99999884936636e-06, + "loss": 1.0255, + "step": 116 + }, + { + "epoch": 0.0063079577312917835, + "grad_norm": 1.6399502754211426, + "learning_rate": 9.999998701042501e-06, + "loss": 1.0146, + "step": 117 + }, + { + "epoch": 0.006361871899935303, + "grad_norm": 1.615026831626892, + "learning_rate": 9.999998543729316e-06, + "loss": 1.0022, + "step": 118 + }, + { + "epoch": 0.006415786068578823, + "grad_norm": 1.4867664575576782, + "learning_rate": 9.99999837742681e-06, + "loss": 1.0164, + "step": 119 + }, + { + "epoch": 0.006469700237222342, + "grad_norm": 1.540153980255127, + "learning_rate": 9.999998202134979e-06, + "loss": 0.989, + "step": 120 + }, + { + "epoch": 0.006523614405865861, + "grad_norm": 1.5535691976547241, + "learning_rate": 9.999998017853825e-06, + "loss": 0.9942, + "step": 121 + }, + { + "epoch": 0.006577528574509381, + "grad_norm": 1.4892929792404175, + "learning_rate": 9.999997824583351e-06, + "loss": 1.0537, + "step": 122 + }, + { + "epoch": 0.006631442743152901, + "grad_norm": 1.4674094915390015, + "learning_rate": 9.999997622323554e-06, + "loss": 1.0239, + "step": 123 + }, + { + "epoch": 0.00668535691179642, + "grad_norm": 1.394027590751648, + "learning_rate": 9.999997411074436e-06, + "loss": 0.9781, + "step": 124 + }, + { + "epoch": 0.006739271080439939, + "grad_norm": 1.372728705406189, + "learning_rate": 9.999997190835999e-06, + 
"loss": 1.0433, + "step": 125 + }, + { + "epoch": 0.006793185249083459, + "grad_norm": 1.2535908222198486, + "learning_rate": 9.999996961608238e-06, + "loss": 0.958, + "step": 126 + }, + { + "epoch": 0.006847099417726979, + "grad_norm": 1.337633490562439, + "learning_rate": 9.999996723391158e-06, + "loss": 1.0213, + "step": 127 + }, + { + "epoch": 0.006901013586370498, + "grad_norm": 1.3640319108963013, + "learning_rate": 9.999996476184759e-06, + "loss": 1.0432, + "step": 128 + }, + { + "epoch": 0.006954927755014018, + "grad_norm": 1.2663391828536987, + "learning_rate": 9.99999621998904e-06, + "loss": 1.0154, + "step": 129 + }, + { + "epoch": 0.007008841923657537, + "grad_norm": 1.450737476348877, + "learning_rate": 9.999995954804004e-06, + "loss": 1.0074, + "step": 130 + }, + { + "epoch": 0.0070627560923010565, + "grad_norm": 1.2757987976074219, + "learning_rate": 9.999995680629649e-06, + "loss": 0.9996, + "step": 131 + }, + { + "epoch": 0.007116670260944576, + "grad_norm": 1.3978132009506226, + "learning_rate": 9.999995397465974e-06, + "loss": 1.04, + "step": 132 + }, + { + "epoch": 0.007170584429588096, + "grad_norm": 1.3167297840118408, + "learning_rate": 9.999995105312982e-06, + "loss": 1.0069, + "step": 133 + }, + { + "epoch": 0.0072244985982316155, + "grad_norm": 1.1626744270324707, + "learning_rate": 9.999994804170674e-06, + "loss": 0.9722, + "step": 134 + }, + { + "epoch": 0.007278412766875135, + "grad_norm": 1.354797601699829, + "learning_rate": 9.99999449403905e-06, + "loss": 0.9019, + "step": 135 + }, + { + "epoch": 0.007332326935518654, + "grad_norm": 1.2605732679367065, + "learning_rate": 9.99999417491811e-06, + "loss": 1.0038, + "step": 136 + }, + { + "epoch": 0.007386241104162174, + "grad_norm": 1.3804657459259033, + "learning_rate": 9.999993846807855e-06, + "loss": 1.0139, + "step": 137 + }, + { + "epoch": 0.007440155272805693, + "grad_norm": 1.3001742362976074, + "learning_rate": 9.999993509708286e-06, + "loss": 1.1436, + "step": 138 + }, + { + 
"epoch": 0.007494069441449213, + "grad_norm": 1.2776422500610352, + "learning_rate": 9.999993163619401e-06, + "loss": 0.9792, + "step": 139 + }, + { + "epoch": 0.007547983610092733, + "grad_norm": 1.2149187326431274, + "learning_rate": 9.999992808541204e-06, + "loss": 0.963, + "step": 140 + }, + { + "epoch": 0.0076018977787362515, + "grad_norm": 1.341806173324585, + "learning_rate": 9.999992444473694e-06, + "loss": 0.9639, + "step": 141 + }, + { + "epoch": 0.007655811947379771, + "grad_norm": 1.2565757036209106, + "learning_rate": 9.999992071416874e-06, + "loss": 0.9193, + "step": 142 + }, + { + "epoch": 0.007709726116023291, + "grad_norm": 1.3059918880462646, + "learning_rate": 9.99999168937074e-06, + "loss": 0.9632, + "step": 143 + }, + { + "epoch": 0.0077636402846668106, + "grad_norm": 1.1719332933425903, + "learning_rate": 9.999991298335295e-06, + "loss": 0.9687, + "step": 144 + }, + { + "epoch": 0.00781755445331033, + "grad_norm": 1.125950813293457, + "learning_rate": 9.999990898310542e-06, + "loss": 0.968, + "step": 145 + }, + { + "epoch": 0.007871468621953849, + "grad_norm": 1.2400416135787964, + "learning_rate": 9.999990489296478e-06, + "loss": 0.972, + "step": 146 + }, + { + "epoch": 0.007925382790597369, + "grad_norm": 1.172117829322815, + "learning_rate": 9.999990071293106e-06, + "loss": 0.9243, + "step": 147 + }, + { + "epoch": 0.007979296959240888, + "grad_norm": 1.240317463874817, + "learning_rate": 9.999989644300427e-06, + "loss": 1.0655, + "step": 148 + }, + { + "epoch": 0.008033211127884408, + "grad_norm": 1.1535708904266357, + "learning_rate": 9.999989208318438e-06, + "loss": 0.9871, + "step": 149 + }, + { + "epoch": 0.008087125296527928, + "grad_norm": 1.2711198329925537, + "learning_rate": 9.999988763347145e-06, + "loss": 1.0307, + "step": 150 + }, + { + "epoch": 0.008141039465171447, + "grad_norm": 1.2345954179763794, + "learning_rate": 9.999988309386548e-06, + "loss": 1.1343, + "step": 151 + }, + { + "epoch": 0.008194953633814967, + 
"grad_norm": 1.2489601373672485, + "learning_rate": 9.999987846436645e-06, + "loss": 1.0303, + "step": 152 + }, + { + "epoch": 0.008248867802458487, + "grad_norm": 1.264240026473999, + "learning_rate": 9.999987374497439e-06, + "loss": 0.9562, + "step": 153 + }, + { + "epoch": 0.008302781971102005, + "grad_norm": 1.2613575458526611, + "learning_rate": 9.99998689356893e-06, + "loss": 0.954, + "step": 154 + }, + { + "epoch": 0.008356696139745524, + "grad_norm": 1.2091072797775269, + "learning_rate": 9.999986403651116e-06, + "loss": 1.0734, + "step": 155 + }, + { + "epoch": 0.008410610308389044, + "grad_norm": 1.18421471118927, + "learning_rate": 9.999985904744002e-06, + "loss": 0.9167, + "step": 156 + }, + { + "epoch": 0.008464524477032564, + "grad_norm": 1.0399659872055054, + "learning_rate": 9.99998539684759e-06, + "loss": 0.9068, + "step": 157 + }, + { + "epoch": 0.008518438645676083, + "grad_norm": 1.1292288303375244, + "learning_rate": 9.999984879961877e-06, + "loss": 1.0027, + "step": 158 + }, + { + "epoch": 0.008572352814319603, + "grad_norm": 1.2592105865478516, + "learning_rate": 9.999984354086867e-06, + "loss": 1.0794, + "step": 159 + }, + { + "epoch": 0.008626266982963123, + "grad_norm": 1.1646504402160645, + "learning_rate": 9.999983819222558e-06, + "loss": 1.0468, + "step": 160 + }, + { + "epoch": 0.008680181151606643, + "grad_norm": 1.156711220741272, + "learning_rate": 9.999983275368952e-06, + "loss": 0.9053, + "step": 161 + }, + { + "epoch": 0.008734095320250162, + "grad_norm": 1.1169341802597046, + "learning_rate": 9.999982722526051e-06, + "loss": 0.97, + "step": 162 + }, + { + "epoch": 0.008788009488893682, + "grad_norm": 1.3474149703979492, + "learning_rate": 9.999982160693856e-06, + "loss": 1.0221, + "step": 163 + }, + { + "epoch": 0.008841923657537202, + "grad_norm": 1.2021468877792358, + "learning_rate": 9.999981589872368e-06, + "loss": 0.9303, + "step": 164 + }, + { + "epoch": 0.00889583782618072, + "grad_norm": 1.0625534057617188, + 
"learning_rate": 9.999981010061586e-06, + "loss": 0.8765, + "step": 165 + }, + { + "epoch": 0.00894975199482424, + "grad_norm": 1.2688498497009277, + "learning_rate": 9.999980421261512e-06, + "loss": 1.0163, + "step": 166 + }, + { + "epoch": 0.009003666163467759, + "grad_norm": 1.122948408126831, + "learning_rate": 9.999979823472148e-06, + "loss": 0.9953, + "step": 167 + }, + { + "epoch": 0.009057580332111279, + "grad_norm": 1.1817872524261475, + "learning_rate": 9.999979216693495e-06, + "loss": 1.0774, + "step": 168 + }, + { + "epoch": 0.009111494500754798, + "grad_norm": 1.1483280658721924, + "learning_rate": 9.999978600925553e-06, + "loss": 1.0105, + "step": 169 + }, + { + "epoch": 0.009165408669398318, + "grad_norm": 1.4039335250854492, + "learning_rate": 9.999977976168325e-06, + "loss": 0.944, + "step": 170 + }, + { + "epoch": 0.009219322838041838, + "grad_norm": 1.1459723711013794, + "learning_rate": 9.999977342421812e-06, + "loss": 0.9208, + "step": 171 + }, + { + "epoch": 0.009273237006685357, + "grad_norm": 1.0897774696350098, + "learning_rate": 9.999976699686011e-06, + "loss": 0.8719, + "step": 172 + }, + { + "epoch": 0.009327151175328877, + "grad_norm": 1.206467866897583, + "learning_rate": 9.999976047960928e-06, + "loss": 1.0645, + "step": 173 + }, + { + "epoch": 0.009381065343972397, + "grad_norm": 1.004550814628601, + "learning_rate": 9.999975387246563e-06, + "loss": 0.9317, + "step": 174 + }, + { + "epoch": 0.009434979512615916, + "grad_norm": 1.2359992265701294, + "learning_rate": 9.999974717542916e-06, + "loss": 1.1136, + "step": 175 + }, + { + "epoch": 0.009488893681259434, + "grad_norm": 1.1922352313995361, + "learning_rate": 9.999974038849989e-06, + "loss": 1.0307, + "step": 176 + }, + { + "epoch": 0.009542807849902954, + "grad_norm": 1.1597613096237183, + "learning_rate": 9.999973351167782e-06, + "loss": 1.0275, + "step": 177 + }, + { + "epoch": 0.009596722018546474, + "grad_norm": 1.172133445739746, + "learning_rate": 9.999972654496298e-06, + 
"loss": 0.9269, + "step": 178 + }, + { + "epoch": 0.009650636187189993, + "grad_norm": 1.1879733800888062, + "learning_rate": 9.999971948835538e-06, + "loss": 0.9547, + "step": 179 + }, + { + "epoch": 0.009704550355833513, + "grad_norm": 1.0029833316802979, + "learning_rate": 9.999971234185502e-06, + "loss": 0.8994, + "step": 180 + }, + { + "epoch": 0.009758464524477033, + "grad_norm": 1.0769891738891602, + "learning_rate": 9.999970510546194e-06, + "loss": 0.9107, + "step": 181 + }, + { + "epoch": 0.009812378693120552, + "grad_norm": 1.3288064002990723, + "learning_rate": 9.99996977791761e-06, + "loss": 1.0116, + "step": 182 + }, + { + "epoch": 0.009866292861764072, + "grad_norm": 1.142452597618103, + "learning_rate": 9.999969036299757e-06, + "loss": 0.9367, + "step": 183 + }, + { + "epoch": 0.009920207030407592, + "grad_norm": 1.2458518743515015, + "learning_rate": 9.999968285692632e-06, + "loss": 1.1398, + "step": 184 + }, + { + "epoch": 0.009974121199051111, + "grad_norm": 1.3373422622680664, + "learning_rate": 9.99996752609624e-06, + "loss": 0.959, + "step": 185 + }, + { + "epoch": 0.01002803536769463, + "grad_norm": 1.2288920879364014, + "learning_rate": 9.99996675751058e-06, + "loss": 0.9908, + "step": 186 + }, + { + "epoch": 0.010081949536338149, + "grad_norm": 1.1954001188278198, + "learning_rate": 9.999965979935656e-06, + "loss": 0.9332, + "step": 187 + }, + { + "epoch": 0.010135863704981669, + "grad_norm": 1.171021819114685, + "learning_rate": 9.999965193371466e-06, + "loss": 0.9119, + "step": 188 + }, + { + "epoch": 0.010189777873625188, + "grad_norm": 1.025169014930725, + "learning_rate": 9.999964397818013e-06, + "loss": 0.784, + "step": 189 + }, + { + "epoch": 0.010243692042268708, + "grad_norm": 1.1340326070785522, + "learning_rate": 9.999963593275298e-06, + "loss": 1.0036, + "step": 190 + }, + { + "epoch": 0.010297606210912228, + "grad_norm": 1.0302847623825073, + "learning_rate": 9.999962779743324e-06, + "loss": 0.8293, + "step": 191 + }, + { + 
"epoch": 0.010351520379555747, + "grad_norm": 1.2410109043121338, + "learning_rate": 9.99996195722209e-06, + "loss": 0.9507, + "step": 192 + }, + { + "epoch": 0.010405434548199267, + "grad_norm": 1.2054308652877808, + "learning_rate": 9.9999611257116e-06, + "loss": 0.9356, + "step": 193 + }, + { + "epoch": 0.010459348716842787, + "grad_norm": 1.2046679258346558, + "learning_rate": 9.999960285211853e-06, + "loss": 1.0638, + "step": 194 + }, + { + "epoch": 0.010513262885486306, + "grad_norm": 1.4594306945800781, + "learning_rate": 9.999959435722852e-06, + "loss": 0.9624, + "step": 195 + }, + { + "epoch": 0.010567177054129826, + "grad_norm": 1.0909247398376465, + "learning_rate": 9.999958577244598e-06, + "loss": 0.9503, + "step": 196 + }, + { + "epoch": 0.010621091222773344, + "grad_norm": 1.1524754762649536, + "learning_rate": 9.999957709777094e-06, + "loss": 0.8954, + "step": 197 + }, + { + "epoch": 0.010675005391416864, + "grad_norm": 1.4128906726837158, + "learning_rate": 9.99995683332034e-06, + "loss": 0.8903, + "step": 198 + }, + { + "epoch": 0.010728919560060383, + "grad_norm": 1.1304652690887451, + "learning_rate": 9.999955947874338e-06, + "loss": 0.9247, + "step": 199 + }, + { + "epoch": 0.010782833728703903, + "grad_norm": 1.2978957891464233, + "learning_rate": 9.99995505343909e-06, + "loss": 0.9473, + "step": 200 + }, + { + "epoch": 0.010836747897347423, + "grad_norm": 1.0742554664611816, + "learning_rate": 9.999954150014595e-06, + "loss": 0.9626, + "step": 201 + }, + { + "epoch": 0.010890662065990942, + "grad_norm": 1.0707745552062988, + "learning_rate": 9.999953237600859e-06, + "loss": 0.8721, + "step": 202 + }, + { + "epoch": 0.010944576234634462, + "grad_norm": 1.17974853515625, + "learning_rate": 9.99995231619788e-06, + "loss": 1.0059, + "step": 203 + }, + { + "epoch": 0.010998490403277982, + "grad_norm": 1.0108370780944824, + "learning_rate": 9.999951385805662e-06, + "loss": 0.9527, + "step": 204 + }, + { + "epoch": 0.011052404571921502, + 
"grad_norm": 0.9983445405960083, + "learning_rate": 9.999950446424204e-06, + "loss": 0.7626, + "step": 205 + }, + { + "epoch": 0.011106318740565021, + "grad_norm": 1.0860002040863037, + "learning_rate": 9.99994949805351e-06, + "loss": 0.9591, + "step": 206 + }, + { + "epoch": 0.01116023290920854, + "grad_norm": 1.0447322130203247, + "learning_rate": 9.999948540693584e-06, + "loss": 0.9861, + "step": 207 + }, + { + "epoch": 0.011214147077852059, + "grad_norm": 1.2582998275756836, + "learning_rate": 9.999947574344423e-06, + "loss": 0.8949, + "step": 208 + }, + { + "epoch": 0.011268061246495579, + "grad_norm": 1.1507002115249634, + "learning_rate": 9.99994659900603e-06, + "loss": 0.918, + "step": 209 + }, + { + "epoch": 0.011321975415139098, + "grad_norm": 1.135169267654419, + "learning_rate": 9.999945614678408e-06, + "loss": 0.9891, + "step": 210 + }, + { + "epoch": 0.011375889583782618, + "grad_norm": 1.1746275424957275, + "learning_rate": 9.999944621361558e-06, + "loss": 1.0186, + "step": 211 + }, + { + "epoch": 0.011429803752426138, + "grad_norm": 1.1137248277664185, + "learning_rate": 9.999943619055483e-06, + "loss": 0.9584, + "step": 212 + }, + { + "epoch": 0.011483717921069657, + "grad_norm": 1.336651086807251, + "learning_rate": 9.999942607760182e-06, + "loss": 1.091, + "step": 213 + }, + { + "epoch": 0.011537632089713177, + "grad_norm": 1.1966856718063354, + "learning_rate": 9.999941587475658e-06, + "loss": 0.9761, + "step": 214 + }, + { + "epoch": 0.011591546258356697, + "grad_norm": 1.0843144655227661, + "learning_rate": 9.999940558201915e-06, + "loss": 0.8917, + "step": 215 + }, + { + "epoch": 0.011645460427000216, + "grad_norm": 1.2089293003082275, + "learning_rate": 9.999939519938953e-06, + "loss": 0.9704, + "step": 216 + }, + { + "epoch": 0.011699374595643736, + "grad_norm": 1.2409982681274414, + "learning_rate": 9.999938472686775e-06, + "loss": 0.9949, + "step": 217 + }, + { + "epoch": 0.011753288764287254, + "grad_norm": 1.1310094594955444, + 
"learning_rate": 9.99993741644538e-06, + "loss": 0.9666, + "step": 218 + }, + { + "epoch": 0.011807202932930774, + "grad_norm": 1.120510220527649, + "learning_rate": 9.999936351214772e-06, + "loss": 0.8844, + "step": 219 + }, + { + "epoch": 0.011861117101574293, + "grad_norm": 1.0931518077850342, + "learning_rate": 9.999935276994954e-06, + "loss": 0.9647, + "step": 220 + }, + { + "epoch": 0.011915031270217813, + "grad_norm": 1.2821122407913208, + "learning_rate": 9.999934193785926e-06, + "loss": 1.0533, + "step": 221 + }, + { + "epoch": 0.011968945438861333, + "grad_norm": 1.183580756187439, + "learning_rate": 9.999933101587691e-06, + "loss": 0.9196, + "step": 222 + }, + { + "epoch": 0.012022859607504852, + "grad_norm": 1.045825719833374, + "learning_rate": 9.99993200040025e-06, + "loss": 0.8953, + "step": 223 + }, + { + "epoch": 0.012076773776148372, + "grad_norm": 1.0963969230651855, + "learning_rate": 9.999930890223605e-06, + "loss": 0.9723, + "step": 224 + }, + { + "epoch": 0.012130687944791892, + "grad_norm": 1.0356731414794922, + "learning_rate": 9.999929771057761e-06, + "loss": 1.0215, + "step": 225 + }, + { + "epoch": 0.012184602113435411, + "grad_norm": 1.112277626991272, + "learning_rate": 9.999928642902717e-06, + "loss": 0.9886, + "step": 226 + }, + { + "epoch": 0.012238516282078931, + "grad_norm": 0.9969072937965393, + "learning_rate": 9.999927505758475e-06, + "loss": 0.8601, + "step": 227 + }, + { + "epoch": 0.01229243045072245, + "grad_norm": 1.123781442642212, + "learning_rate": 9.999926359625036e-06, + "loss": 0.9894, + "step": 228 + }, + { + "epoch": 0.012346344619365969, + "grad_norm": 1.2122100591659546, + "learning_rate": 9.999925204502406e-06, + "loss": 1.0783, + "step": 229 + }, + { + "epoch": 0.012400258788009488, + "grad_norm": 1.1256672143936157, + "learning_rate": 9.999924040390584e-06, + "loss": 0.9116, + "step": 230 + }, + { + "epoch": 0.012454172956653008, + "grad_norm": 1.0646952390670776, + "learning_rate": 9.999922867289573e-06, + 
"loss": 0.8993, + "step": 231 + }, + { + "epoch": 0.012508087125296528, + "grad_norm": 1.194676399230957, + "learning_rate": 9.999921685199376e-06, + "loss": 1.0377, + "step": 232 + }, + { + "epoch": 0.012562001293940047, + "grad_norm": 1.0519152879714966, + "learning_rate": 9.999920494119992e-06, + "loss": 0.8283, + "step": 233 + }, + { + "epoch": 0.012615915462583567, + "grad_norm": 1.243249773979187, + "learning_rate": 9.999919294051427e-06, + "loss": 0.9741, + "step": 234 + }, + { + "epoch": 0.012669829631227087, + "grad_norm": 1.1071687936782837, + "learning_rate": 9.999918084993681e-06, + "loss": 1.0402, + "step": 235 + }, + { + "epoch": 0.012723743799870606, + "grad_norm": 1.1224809885025024, + "learning_rate": 9.999916866946757e-06, + "loss": 0.8793, + "step": 236 + }, + { + "epoch": 0.012777657968514126, + "grad_norm": 1.0458532571792603, + "learning_rate": 9.999915639910656e-06, + "loss": 0.9855, + "step": 237 + }, + { + "epoch": 0.012831572137157646, + "grad_norm": 1.0610811710357666, + "learning_rate": 9.999914403885383e-06, + "loss": 0.8092, + "step": 238 + }, + { + "epoch": 0.012885486305801164, + "grad_norm": 1.2818992137908936, + "learning_rate": 9.999913158870936e-06, + "loss": 1.0101, + "step": 239 + }, + { + "epoch": 0.012939400474444683, + "grad_norm": 1.110400915145874, + "learning_rate": 9.999911904867319e-06, + "loss": 0.9782, + "step": 240 + }, + { + "epoch": 0.012993314643088203, + "grad_norm": 1.3290835618972778, + "learning_rate": 9.999910641874537e-06, + "loss": 1.0683, + "step": 241 + }, + { + "epoch": 0.013047228811731723, + "grad_norm": 1.1448980569839478, + "learning_rate": 9.999909369892588e-06, + "loss": 0.9223, + "step": 242 + }, + { + "epoch": 0.013101142980375242, + "grad_norm": 1.1710877418518066, + "learning_rate": 9.999908088921477e-06, + "loss": 0.8022, + "step": 243 + }, + { + "epoch": 0.013155057149018762, + "grad_norm": 1.1242793798446655, + "learning_rate": 9.999906798961207e-06, + "loss": 0.9238, + "step": 244 + }, + { 
+ "epoch": 0.013208971317662282, + "grad_norm": 1.0338802337646484, + "learning_rate": 9.999905500011778e-06, + "loss": 0.8386, + "step": 245 + }, + { + "epoch": 0.013262885486305801, + "grad_norm": 1.0910224914550781, + "learning_rate": 9.999904192073193e-06, + "loss": 0.937, + "step": 246 + }, + { + "epoch": 0.013316799654949321, + "grad_norm": 1.297788143157959, + "learning_rate": 9.999902875145453e-06, + "loss": 0.9054, + "step": 247 + }, + { + "epoch": 0.01337071382359284, + "grad_norm": 1.1317543983459473, + "learning_rate": 9.999901549228564e-06, + "loss": 0.9418, + "step": 248 + }, + { + "epoch": 0.01342462799223636, + "grad_norm": 1.0944132804870605, + "learning_rate": 9.999900214322526e-06, + "loss": 0.9445, + "step": 249 + }, + { + "epoch": 0.013478542160879878, + "grad_norm": 1.4942843914031982, + "learning_rate": 9.999898870427342e-06, + "loss": 0.8956, + "step": 250 + }, + { + "epoch": 0.013532456329523398, + "grad_norm": 1.0630019903182983, + "learning_rate": 9.999897517543013e-06, + "loss": 0.8381, + "step": 251 + }, + { + "epoch": 0.013586370498166918, + "grad_norm": 1.65073561668396, + "learning_rate": 9.999896155669544e-06, + "loss": 1.0148, + "step": 252 + }, + { + "epoch": 0.013640284666810438, + "grad_norm": 1.035731315612793, + "learning_rate": 9.999894784806936e-06, + "loss": 0.8092, + "step": 253 + }, + { + "epoch": 0.013694198835453957, + "grad_norm": 1.308863639831543, + "learning_rate": 9.99989340495519e-06, + "loss": 0.9742, + "step": 254 + }, + { + "epoch": 0.013748113004097477, + "grad_norm": 1.1512938737869263, + "learning_rate": 9.999892016114313e-06, + "loss": 0.8747, + "step": 255 + }, + { + "epoch": 0.013802027172740997, + "grad_norm": 0.9977009296417236, + "learning_rate": 9.9998906182843e-06, + "loss": 0.8183, + "step": 256 + }, + { + "epoch": 0.013855941341384516, + "grad_norm": 1.2228175401687622, + "learning_rate": 9.99988921146516e-06, + "loss": 0.9917, + "step": 257 + }, + { + "epoch": 0.013909855510028036, + "grad_norm": 
1.0753847360610962, + "learning_rate": 9.999887795656896e-06, + "loss": 1.0063, + "step": 258 + }, + { + "epoch": 0.013963769678671556, + "grad_norm": 1.0010429620742798, + "learning_rate": 9.999886370859506e-06, + "loss": 0.9315, + "step": 259 + }, + { + "epoch": 0.014017683847315074, + "grad_norm": 1.2038911581039429, + "learning_rate": 9.999884937072995e-06, + "loss": 0.8764, + "step": 260 + }, + { + "epoch": 0.014071598015958593, + "grad_norm": 1.1268917322158813, + "learning_rate": 9.999883494297365e-06, + "loss": 1.0059, + "step": 261 + }, + { + "epoch": 0.014125512184602113, + "grad_norm": 1.1053709983825684, + "learning_rate": 9.999882042532619e-06, + "loss": 0.8866, + "step": 262 + }, + { + "epoch": 0.014179426353245633, + "grad_norm": 1.091145396232605, + "learning_rate": 9.999880581778758e-06, + "loss": 1.0415, + "step": 263 + }, + { + "epoch": 0.014233340521889152, + "grad_norm": 1.0019958019256592, + "learning_rate": 9.999879112035786e-06, + "loss": 0.8177, + "step": 264 + }, + { + "epoch": 0.014287254690532672, + "grad_norm": 1.1044156551361084, + "learning_rate": 9.999877633303708e-06, + "loss": 0.9508, + "step": 265 + }, + { + "epoch": 0.014341168859176192, + "grad_norm": 0.9750218391418457, + "learning_rate": 9.999876145582524e-06, + "loss": 0.8501, + "step": 266 + }, + { + "epoch": 0.014395083027819711, + "grad_norm": 1.4015804529190063, + "learning_rate": 9.999874648872235e-06, + "loss": 0.9491, + "step": 267 + }, + { + "epoch": 0.014448997196463231, + "grad_norm": 1.066422939300537, + "learning_rate": 9.999873143172848e-06, + "loss": 1.0104, + "step": 268 + }, + { + "epoch": 0.01450291136510675, + "grad_norm": 1.1133167743682861, + "learning_rate": 9.99987162848436e-06, + "loss": 1.0142, + "step": 269 + }, + { + "epoch": 0.01455682553375027, + "grad_norm": 1.1259140968322754, + "learning_rate": 9.999870104806782e-06, + "loss": 0.9803, + "step": 270 + }, + { + "epoch": 0.014610739702393788, + "grad_norm": 1.0813393592834473, + "learning_rate": 
9.999868572140108e-06, + "loss": 0.8728, + "step": 271 + }, + { + "epoch": 0.014664653871037308, + "grad_norm": 0.9939939379692078, + "learning_rate": 9.999867030484347e-06, + "loss": 0.8826, + "step": 272 + }, + { + "epoch": 0.014718568039680828, + "grad_norm": 1.0081939697265625, + "learning_rate": 9.999865479839499e-06, + "loss": 0.8682, + "step": 273 + }, + { + "epoch": 0.014772482208324347, + "grad_norm": 1.0190658569335938, + "learning_rate": 9.999863920205567e-06, + "loss": 0.9094, + "step": 274 + }, + { + "epoch": 0.014826396376967867, + "grad_norm": 1.0702111721038818, + "learning_rate": 9.999862351582553e-06, + "loss": 0.9244, + "step": 275 + }, + { + "epoch": 0.014880310545611387, + "grad_norm": 1.0891972780227661, + "learning_rate": 9.999860773970461e-06, + "loss": 1.0318, + "step": 276 + }, + { + "epoch": 0.014934224714254906, + "grad_norm": 0.9788139462471008, + "learning_rate": 9.999859187369294e-06, + "loss": 0.8779, + "step": 277 + }, + { + "epoch": 0.014988138882898426, + "grad_norm": 1.0678125619888306, + "learning_rate": 9.999857591779055e-06, + "loss": 0.8962, + "step": 278 + }, + { + "epoch": 0.015042053051541946, + "grad_norm": 0.9882293343544006, + "learning_rate": 9.999855987199747e-06, + "loss": 0.9082, + "step": 279 + }, + { + "epoch": 0.015095967220185465, + "grad_norm": 0.9987571835517883, + "learning_rate": 9.999854373631371e-06, + "loss": 0.9708, + "step": 280 + }, + { + "epoch": 0.015149881388828985, + "grad_norm": 1.0238722562789917, + "learning_rate": 9.99985275107393e-06, + "loss": 0.9461, + "step": 281 + }, + { + "epoch": 0.015203795557472503, + "grad_norm": 0.9628013372421265, + "learning_rate": 9.999851119527431e-06, + "loss": 0.9412, + "step": 282 + }, + { + "epoch": 0.015257709726116023, + "grad_norm": 1.0021862983703613, + "learning_rate": 9.999849478991873e-06, + "loss": 0.8461, + "step": 283 + }, + { + "epoch": 0.015311623894759542, + "grad_norm": 0.9776142239570618, + "learning_rate": 9.99984782946726e-06, + "loss": 
0.962, + "step": 284 + }, + { + "epoch": 0.015365538063403062, + "grad_norm": 1.0114799737930298, + "learning_rate": 9.999846170953593e-06, + "loss": 0.8732, + "step": 285 + }, + { + "epoch": 0.015419452232046582, + "grad_norm": 0.9860401749610901, + "learning_rate": 9.999844503450879e-06, + "loss": 0.8204, + "step": 286 + }, + { + "epoch": 0.015473366400690101, + "grad_norm": 1.0743263959884644, + "learning_rate": 9.999842826959119e-06, + "loss": 0.9445, + "step": 287 + }, + { + "epoch": 0.015527280569333621, + "grad_norm": 1.0456606149673462, + "learning_rate": 9.999841141478315e-06, + "loss": 0.8869, + "step": 288 + }, + { + "epoch": 0.01558119473797714, + "grad_norm": 1.0299748182296753, + "learning_rate": 9.99983944700847e-06, + "loss": 0.9543, + "step": 289 + }, + { + "epoch": 0.01563510890662066, + "grad_norm": 1.0176036357879639, + "learning_rate": 9.99983774354959e-06, + "loss": 0.9672, + "step": 290 + }, + { + "epoch": 0.01568902307526418, + "grad_norm": 1.0023303031921387, + "learning_rate": 9.999836031101675e-06, + "loss": 0.9417, + "step": 291 + }, + { + "epoch": 0.015742937243907698, + "grad_norm": 0.9801005721092224, + "learning_rate": 9.99983430966473e-06, + "loss": 0.9376, + "step": 292 + }, + { + "epoch": 0.01579685141255122, + "grad_norm": 1.002906322479248, + "learning_rate": 9.999832579238756e-06, + "loss": 0.8973, + "step": 293 + }, + { + "epoch": 0.015850765581194737, + "grad_norm": 1.0014845132827759, + "learning_rate": 9.999830839823759e-06, + "loss": 0.9583, + "step": 294 + }, + { + "epoch": 0.01590467974983826, + "grad_norm": 1.0173449516296387, + "learning_rate": 9.999829091419739e-06, + "loss": 0.9006, + "step": 295 + }, + { + "epoch": 0.015958593918481777, + "grad_norm": 0.9779545664787292, + "learning_rate": 9.999827334026702e-06, + "loss": 0.9342, + "step": 296 + }, + { + "epoch": 0.016012508087125298, + "grad_norm": 0.9800315499305725, + "learning_rate": 9.999825567644648e-06, + "loss": 0.7948, + "step": 297 + }, + { + "epoch": 
0.016066422255768816, + "grad_norm": 0.9628249406814575, + "learning_rate": 9.999823792273583e-06, + "loss": 0.8415, + "step": 298 + }, + { + "epoch": 0.016120336424412334, + "grad_norm": 1.1227449178695679, + "learning_rate": 9.99982200791351e-06, + "loss": 0.9646, + "step": 299 + }, + { + "epoch": 0.016174250593055856, + "grad_norm": 1.1018567085266113, + "learning_rate": 9.99982021456443e-06, + "loss": 0.8647, + "step": 300 + }, + { + "epoch": 0.016228164761699373, + "grad_norm": 1.1017298698425293, + "learning_rate": 9.999818412226347e-06, + "loss": 0.8708, + "step": 301 + }, + { + "epoch": 0.016282078930342895, + "grad_norm": 1.084594488143921, + "learning_rate": 9.999816600899267e-06, + "loss": 0.9765, + "step": 302 + }, + { + "epoch": 0.016335993098986413, + "grad_norm": 1.3735941648483276, + "learning_rate": 9.99981478058319e-06, + "loss": 1.0253, + "step": 303 + }, + { + "epoch": 0.016389907267629934, + "grad_norm": 1.1644489765167236, + "learning_rate": 9.999812951278119e-06, + "loss": 0.8519, + "step": 304 + }, + { + "epoch": 0.016443821436273452, + "grad_norm": 1.0079474449157715, + "learning_rate": 9.99981111298406e-06, + "loss": 0.9422, + "step": 305 + }, + { + "epoch": 0.016497735604916974, + "grad_norm": 1.0046736001968384, + "learning_rate": 9.999809265701015e-06, + "loss": 0.7766, + "step": 306 + }, + { + "epoch": 0.01655164977356049, + "grad_norm": 1.0312374830245972, + "learning_rate": 9.999807409428987e-06, + "loss": 0.8844, + "step": 307 + }, + { + "epoch": 0.01660556394220401, + "grad_norm": 1.0419421195983887, + "learning_rate": 9.99980554416798e-06, + "loss": 0.8902, + "step": 308 + }, + { + "epoch": 0.01665947811084753, + "grad_norm": 1.2056832313537598, + "learning_rate": 9.999803669917996e-06, + "loss": 0.9842, + "step": 309 + }, + { + "epoch": 0.01671339227949105, + "grad_norm": 0.9645346403121948, + "learning_rate": 9.999801786679039e-06, + "loss": 0.7837, + "step": 310 + }, + { + "epoch": 0.01676730644813457, + "grad_norm": 
1.0259841680526733, + "learning_rate": 9.999799894451115e-06, + "loss": 0.8927, + "step": 311 + }, + { + "epoch": 0.016821220616778088, + "grad_norm": 0.9932212233543396, + "learning_rate": 9.999797993234224e-06, + "loss": 0.815, + "step": 312 + }, + { + "epoch": 0.01687513478542161, + "grad_norm": 1.0666078329086304, + "learning_rate": 9.99979608302837e-06, + "loss": 0.8245, + "step": 313 + }, + { + "epoch": 0.016929048954065128, + "grad_norm": 0.9566568732261658, + "learning_rate": 9.999794163833557e-06, + "loss": 0.851, + "step": 314 + }, + { + "epoch": 0.01698296312270865, + "grad_norm": 1.0056332349777222, + "learning_rate": 9.999792235649789e-06, + "loss": 0.8704, + "step": 315 + }, + { + "epoch": 0.017036877291352167, + "grad_norm": 1.036537528038025, + "learning_rate": 9.999790298477068e-06, + "loss": 0.9512, + "step": 316 + }, + { + "epoch": 0.01709079145999569, + "grad_norm": 1.1026023626327515, + "learning_rate": 9.9997883523154e-06, + "loss": 1.0007, + "step": 317 + }, + { + "epoch": 0.017144705628639206, + "grad_norm": 1.006659984588623, + "learning_rate": 9.999786397164786e-06, + "loss": 0.8992, + "step": 318 + }, + { + "epoch": 0.017198619797282724, + "grad_norm": 1.0100573301315308, + "learning_rate": 9.99978443302523e-06, + "loss": 0.9545, + "step": 319 + }, + { + "epoch": 0.017252533965926246, + "grad_norm": 1.000086784362793, + "learning_rate": 9.999782459896735e-06, + "loss": 0.8732, + "step": 320 + }, + { + "epoch": 0.017306448134569764, + "grad_norm": 1.2039650678634644, + "learning_rate": 9.999780477779306e-06, + "loss": 0.9881, + "step": 321 + }, + { + "epoch": 0.017360362303213285, + "grad_norm": 1.0316474437713623, + "learning_rate": 9.999778486672948e-06, + "loss": 0.8686, + "step": 322 + }, + { + "epoch": 0.017414276471856803, + "grad_norm": 1.1697666645050049, + "learning_rate": 9.999776486577661e-06, + "loss": 0.9185, + "step": 323 + }, + { + "epoch": 0.017468190640500324, + "grad_norm": 0.9523053169250488, + "learning_rate": 
9.999774477493451e-06, + "loss": 0.858, + "step": 324 + }, + { + "epoch": 0.017522104809143842, + "grad_norm": 0.9660015106201172, + "learning_rate": 9.999772459420319e-06, + "loss": 0.9964, + "step": 325 + }, + { + "epoch": 0.017576018977787364, + "grad_norm": 0.971128523349762, + "learning_rate": 9.999770432358271e-06, + "loss": 0.8999, + "step": 326 + }, + { + "epoch": 0.01762993314643088, + "grad_norm": 1.221969485282898, + "learning_rate": 9.999768396307312e-06, + "loss": 0.8628, + "step": 327 + }, + { + "epoch": 0.017683847315074403, + "grad_norm": 1.0868507623672485, + "learning_rate": 9.999766351267442e-06, + "loss": 1.0732, + "step": 328 + }, + { + "epoch": 0.01773776148371792, + "grad_norm": 0.9527992606163025, + "learning_rate": 9.999764297238666e-06, + "loss": 0.8221, + "step": 329 + }, + { + "epoch": 0.01779167565236144, + "grad_norm": 0.9969122409820557, + "learning_rate": 9.99976223422099e-06, + "loss": 0.9234, + "step": 330 + }, + { + "epoch": 0.01784558982100496, + "grad_norm": 0.9291784763336182, + "learning_rate": 9.999760162214415e-06, + "loss": 0.7839, + "step": 331 + }, + { + "epoch": 0.01789950398964848, + "grad_norm": 0.9766960144042969, + "learning_rate": 9.999758081218944e-06, + "loss": 0.7929, + "step": 332 + }, + { + "epoch": 0.017953418158292, + "grad_norm": 0.9536904692649841, + "learning_rate": 9.999755991234585e-06, + "loss": 0.9136, + "step": 333 + }, + { + "epoch": 0.018007332326935518, + "grad_norm": 1.0325372219085693, + "learning_rate": 9.999753892261337e-06, + "loss": 0.8367, + "step": 334 + }, + { + "epoch": 0.01806124649557904, + "grad_norm": 0.9486141800880432, + "learning_rate": 9.999751784299207e-06, + "loss": 0.8802, + "step": 335 + }, + { + "epoch": 0.018115160664222557, + "grad_norm": 0.9880577921867371, + "learning_rate": 9.999749667348198e-06, + "loss": 0.8597, + "step": 336 + }, + { + "epoch": 0.01816907483286608, + "grad_norm": 1.043199896812439, + "learning_rate": 9.999747541408312e-06, + "loss": 0.9142, + "step": 
337 + }, + { + "epoch": 0.018222989001509596, + "grad_norm": 1.0606465339660645, + "learning_rate": 9.999745406479554e-06, + "loss": 0.9876, + "step": 338 + }, + { + "epoch": 0.018276903170153118, + "grad_norm": 1.139449954032898, + "learning_rate": 9.999743262561929e-06, + "loss": 0.7773, + "step": 339 + }, + { + "epoch": 0.018330817338796636, + "grad_norm": 1.1416115760803223, + "learning_rate": 9.99974110965544e-06, + "loss": 0.9566, + "step": 340 + }, + { + "epoch": 0.018384731507440154, + "grad_norm": 1.0145153999328613, + "learning_rate": 9.99973894776009e-06, + "loss": 0.9543, + "step": 341 + }, + { + "epoch": 0.018438645676083675, + "grad_norm": 0.950528621673584, + "learning_rate": 9.999736776875885e-06, + "loss": 0.8007, + "step": 342 + }, + { + "epoch": 0.018492559844727193, + "grad_norm": 0.9080097079277039, + "learning_rate": 9.999734597002826e-06, + "loss": 0.8273, + "step": 343 + }, + { + "epoch": 0.018546474013370715, + "grad_norm": 1.0038888454437256, + "learning_rate": 9.99973240814092e-06, + "loss": 0.9394, + "step": 344 + }, + { + "epoch": 0.018600388182014232, + "grad_norm": 1.05253267288208, + "learning_rate": 9.999730210290168e-06, + "loss": 0.9485, + "step": 345 + }, + { + "epoch": 0.018654302350657754, + "grad_norm": 0.9396592974662781, + "learning_rate": 9.999728003450577e-06, + "loss": 0.8943, + "step": 346 + }, + { + "epoch": 0.018708216519301272, + "grad_norm": 1.149387240409851, + "learning_rate": 9.999725787622148e-06, + "loss": 0.8566, + "step": 347 + }, + { + "epoch": 0.018762130687944793, + "grad_norm": 1.1573290824890137, + "learning_rate": 9.999723562804887e-06, + "loss": 0.9641, + "step": 348 + }, + { + "epoch": 0.01881604485658831, + "grad_norm": 1.0217385292053223, + "learning_rate": 9.999721328998797e-06, + "loss": 0.9555, + "step": 349 + }, + { + "epoch": 0.018869959025231833, + "grad_norm": 1.034690499305725, + "learning_rate": 9.999719086203884e-06, + "loss": 0.9407, + "step": 350 + }, + { + "epoch": 0.01892387319387535, + 
"grad_norm": 0.9819002151489258, + "learning_rate": 9.999716834420148e-06, + "loss": 0.9104, + "step": 351 + }, + { + "epoch": 0.01897778736251887, + "grad_norm": 1.0459688901901245, + "learning_rate": 9.999714573647597e-06, + "loss": 0.9296, + "step": 352 + }, + { + "epoch": 0.01903170153116239, + "grad_norm": 0.9575183391571045, + "learning_rate": 9.999712303886232e-06, + "loss": 0.8517, + "step": 353 + }, + { + "epoch": 0.019085615699805908, + "grad_norm": 1.0018881559371948, + "learning_rate": 9.99971002513606e-06, + "loss": 0.9208, + "step": 354 + }, + { + "epoch": 0.01913952986844943, + "grad_norm": 1.0291972160339355, + "learning_rate": 9.999707737397085e-06, + "loss": 0.8765, + "step": 355 + }, + { + "epoch": 0.019193444037092947, + "grad_norm": 1.0081498622894287, + "learning_rate": 9.999705440669306e-06, + "loss": 0.9204, + "step": 356 + }, + { + "epoch": 0.01924735820573647, + "grad_norm": 0.956950843334198, + "learning_rate": 9.999703134952733e-06, + "loss": 0.8058, + "step": 357 + }, + { + "epoch": 0.019301272374379987, + "grad_norm": 1.1130229234695435, + "learning_rate": 9.999700820247369e-06, + "loss": 0.8202, + "step": 358 + }, + { + "epoch": 0.019355186543023508, + "grad_norm": 1.047211766242981, + "learning_rate": 9.999698496553216e-06, + "loss": 0.9357, + "step": 359 + }, + { + "epoch": 0.019409100711667026, + "grad_norm": 1.0225415229797363, + "learning_rate": 9.99969616387028e-06, + "loss": 0.8306, + "step": 360 + }, + { + "epoch": 0.019463014880310544, + "grad_norm": 1.060727596282959, + "learning_rate": 9.999693822198564e-06, + "loss": 0.9178, + "step": 361 + }, + { + "epoch": 0.019516929048954065, + "grad_norm": 1.0743412971496582, + "learning_rate": 9.999691471538074e-06, + "loss": 0.8761, + "step": 362 + }, + { + "epoch": 0.019570843217597583, + "grad_norm": 1.2229491472244263, + "learning_rate": 9.99968911188881e-06, + "loss": 1.0738, + "step": 363 + }, + { + "epoch": 0.019624757386241105, + "grad_norm": 0.9889073967933655, + 
"learning_rate": 9.999686743250783e-06, + "loss": 0.9458, + "step": 364 + }, + { + "epoch": 0.019678671554884623, + "grad_norm": 1.0398520231246948, + "learning_rate": 9.999684365623992e-06, + "loss": 0.9096, + "step": 365 + }, + { + "epoch": 0.019732585723528144, + "grad_norm": 1.0613081455230713, + "learning_rate": 9.999681979008442e-06, + "loss": 0.9312, + "step": 366 + }, + { + "epoch": 0.019786499892171662, + "grad_norm": 0.946211040019989, + "learning_rate": 9.99967958340414e-06, + "loss": 0.9208, + "step": 367 + }, + { + "epoch": 0.019840414060815183, + "grad_norm": 1.1298933029174805, + "learning_rate": 9.999677178811087e-06, + "loss": 0.9378, + "step": 368 + }, + { + "epoch": 0.0198943282294587, + "grad_norm": 1.1042351722717285, + "learning_rate": 9.999674765229288e-06, + "loss": 0.9487, + "step": 369 + }, + { + "epoch": 0.019948242398102223, + "grad_norm": 1.0717188119888306, + "learning_rate": 9.999672342658751e-06, + "loss": 0.939, + "step": 370 + }, + { + "epoch": 0.02000215656674574, + "grad_norm": 1.0936871767044067, + "learning_rate": 9.999669911099474e-06, + "loss": 1.1361, + "step": 371 + }, + { + "epoch": 0.02005607073538926, + "grad_norm": 1.0650005340576172, + "learning_rate": 9.999667470551466e-06, + "loss": 0.9709, + "step": 372 + }, + { + "epoch": 0.02010998490403278, + "grad_norm": 1.0154083967208862, + "learning_rate": 9.999665021014731e-06, + "loss": 0.9422, + "step": 373 + }, + { + "epoch": 0.020163899072676298, + "grad_norm": 1.1382607221603394, + "learning_rate": 9.999662562489272e-06, + "loss": 0.984, + "step": 374 + }, + { + "epoch": 0.02021781324131982, + "grad_norm": 0.9372896552085876, + "learning_rate": 9.999660094975095e-06, + "loss": 0.9857, + "step": 375 + }, + { + "epoch": 0.020271727409963337, + "grad_norm": 1.1777011156082153, + "learning_rate": 9.999657618472203e-06, + "loss": 0.9731, + "step": 376 + }, + { + "epoch": 0.02032564157860686, + "grad_norm": 0.9054237604141235, + "learning_rate": 9.9996551329806e-06, + "loss": 
0.9104, + "step": 377 + }, + { + "epoch": 0.020379555747250377, + "grad_norm": 0.9255661964416504, + "learning_rate": 9.999652638500292e-06, + "loss": 0.8632, + "step": 378 + }, + { + "epoch": 0.020433469915893898, + "grad_norm": 0.9440998435020447, + "learning_rate": 9.999650135031282e-06, + "loss": 0.8945, + "step": 379 + }, + { + "epoch": 0.020487384084537416, + "grad_norm": 0.9822732210159302, + "learning_rate": 9.999647622573577e-06, + "loss": 0.8874, + "step": 380 + }, + { + "epoch": 0.020541298253180938, + "grad_norm": 1.1294387578964233, + "learning_rate": 9.999645101127179e-06, + "loss": 0.9892, + "step": 381 + }, + { + "epoch": 0.020595212421824455, + "grad_norm": 1.0458290576934814, + "learning_rate": 9.999642570692094e-06, + "loss": 0.9163, + "step": 382 + }, + { + "epoch": 0.020649126590467973, + "grad_norm": 0.8124557733535767, + "learning_rate": 9.999640031268326e-06, + "loss": 0.6927, + "step": 383 + }, + { + "epoch": 0.020703040759111495, + "grad_norm": 1.1053259372711182, + "learning_rate": 9.999637482855878e-06, + "loss": 0.8651, + "step": 384 + }, + { + "epoch": 0.020756954927755013, + "grad_norm": 1.1280632019042969, + "learning_rate": 9.999634925454757e-06, + "loss": 0.9708, + "step": 385 + }, + { + "epoch": 0.020810869096398534, + "grad_norm": 0.9916180372238159, + "learning_rate": 9.999632359064965e-06, + "loss": 0.9081, + "step": 386 + }, + { + "epoch": 0.020864783265042052, + "grad_norm": 1.0430771112442017, + "learning_rate": 9.99962978368651e-06, + "loss": 0.9837, + "step": 387 + }, + { + "epoch": 0.020918697433685574, + "grad_norm": 1.031343698501587, + "learning_rate": 9.999627199319398e-06, + "loss": 0.9156, + "step": 388 + }, + { + "epoch": 0.02097261160232909, + "grad_norm": 1.0157191753387451, + "learning_rate": 9.999624605963627e-06, + "loss": 0.9379, + "step": 389 + }, + { + "epoch": 0.021026525770972613, + "grad_norm": 0.9524544477462769, + "learning_rate": 9.999622003619204e-06, + "loss": 0.8448, + "step": 390 + }, + { + 
"epoch": 0.02108043993961613, + "grad_norm": 1.091670036315918, + "learning_rate": 9.999619392286137e-06, + "loss": 0.9794, + "step": 391 + }, + { + "epoch": 0.021134354108259652, + "grad_norm": 1.0502233505249023, + "learning_rate": 9.999616771964429e-06, + "loss": 1.0047, + "step": 392 + }, + { + "epoch": 0.02118826827690317, + "grad_norm": 1.2087476253509521, + "learning_rate": 9.999614142654084e-06, + "loss": 0.8964, + "step": 393 + }, + { + "epoch": 0.021242182445546688, + "grad_norm": 1.0264590978622437, + "learning_rate": 9.999611504355106e-06, + "loss": 0.8608, + "step": 394 + }, + { + "epoch": 0.02129609661419021, + "grad_norm": 0.9883281588554382, + "learning_rate": 9.999608857067503e-06, + "loss": 0.9109, + "step": 395 + }, + { + "epoch": 0.021350010782833728, + "grad_norm": 0.9913623332977295, + "learning_rate": 9.999606200791276e-06, + "loss": 0.8993, + "step": 396 + }, + { + "epoch": 0.02140392495147725, + "grad_norm": 1.019178867340088, + "learning_rate": 9.999603535526432e-06, + "loss": 0.9115, + "step": 397 + }, + { + "epoch": 0.021457839120120767, + "grad_norm": 0.9756026864051819, + "learning_rate": 9.999600861272974e-06, + "loss": 0.834, + "step": 398 + }, + { + "epoch": 0.02151175328876429, + "grad_norm": 0.9956341981887817, + "learning_rate": 9.999598178030909e-06, + "loss": 0.8756, + "step": 399 + }, + { + "epoch": 0.021565667457407806, + "grad_norm": 1.0267717838287354, + "learning_rate": 9.999595485800239e-06, + "loss": 0.9427, + "step": 400 + }, + { + "epoch": 0.021619581626051328, + "grad_norm": 1.061139464378357, + "learning_rate": 9.999592784580974e-06, + "loss": 0.9835, + "step": 401 + }, + { + "epoch": 0.021673495794694846, + "grad_norm": 0.9970353245735168, + "learning_rate": 9.999590074373114e-06, + "loss": 0.8946, + "step": 402 + }, + { + "epoch": 0.021727409963338367, + "grad_norm": 1.056242823600769, + "learning_rate": 9.999587355176664e-06, + "loss": 0.9076, + "step": 403 + }, + { + "epoch": 0.021781324131981885, + "grad_norm": 
1.0285427570343018, + "learning_rate": 9.999584626991632e-06, + "loss": 0.8506, + "step": 404 + }, + { + "epoch": 0.021835238300625403, + "grad_norm": 1.0026901960372925, + "learning_rate": 9.99958188981802e-06, + "loss": 0.8457, + "step": 405 + }, + { + "epoch": 0.021889152469268924, + "grad_norm": 0.8921003341674805, + "learning_rate": 9.999579143655833e-06, + "loss": 0.8215, + "step": 406 + }, + { + "epoch": 0.021943066637912442, + "grad_norm": 1.2816855907440186, + "learning_rate": 9.99957638850508e-06, + "loss": 0.8779, + "step": 407 + }, + { + "epoch": 0.021996980806555964, + "grad_norm": 1.4713681936264038, + "learning_rate": 9.99957362436576e-06, + "loss": 0.8581, + "step": 408 + }, + { + "epoch": 0.02205089497519948, + "grad_norm": 1.0117568969726562, + "learning_rate": 9.999570851237883e-06, + "loss": 0.8865, + "step": 409 + }, + { + "epoch": 0.022104809143843003, + "grad_norm": 0.9530962705612183, + "learning_rate": 9.99956806912145e-06, + "loss": 0.8888, + "step": 410 + }, + { + "epoch": 0.02215872331248652, + "grad_norm": 0.865692675113678, + "learning_rate": 9.99956527801647e-06, + "loss": 0.8075, + "step": 411 + }, + { + "epoch": 0.022212637481130042, + "grad_norm": 0.9613220691680908, + "learning_rate": 9.999562477922944e-06, + "loss": 0.9289, + "step": 412 + }, + { + "epoch": 0.02226655164977356, + "grad_norm": 0.9419745802879333, + "learning_rate": 9.99955966884088e-06, + "loss": 0.8758, + "step": 413 + }, + { + "epoch": 0.02232046581841708, + "grad_norm": 1.0120573043823242, + "learning_rate": 9.999556850770282e-06, + "loss": 0.9014, + "step": 414 + }, + { + "epoch": 0.0223743799870606, + "grad_norm": 0.9833963513374329, + "learning_rate": 9.999554023711155e-06, + "loss": 0.9354, + "step": 415 + }, + { + "epoch": 0.022428294155704118, + "grad_norm": 0.9058681130409241, + "learning_rate": 9.999551187663505e-06, + "loss": 0.9201, + "step": 416 + }, + { + "epoch": 0.02248220832434764, + "grad_norm": 1.0103633403778076, + "learning_rate": 
9.999548342627334e-06, + "loss": 0.9023, + "step": 417 + }, + { + "epoch": 0.022536122492991157, + "grad_norm": 0.8671039342880249, + "learning_rate": 9.99954548860265e-06, + "loss": 0.7263, + "step": 418 + }, + { + "epoch": 0.02259003666163468, + "grad_norm": 1.0967090129852295, + "learning_rate": 9.999542625589461e-06, + "loss": 1.0616, + "step": 419 + }, + { + "epoch": 0.022643950830278196, + "grad_norm": 0.9032139778137207, + "learning_rate": 9.999539753587764e-06, + "loss": 0.782, + "step": 420 + }, + { + "epoch": 0.022697864998921718, + "grad_norm": 0.9532387256622314, + "learning_rate": 9.99953687259757e-06, + "loss": 0.9628, + "step": 421 + }, + { + "epoch": 0.022751779167565236, + "grad_norm": 0.9732246994972229, + "learning_rate": 9.999533982618885e-06, + "loss": 0.8682, + "step": 422 + }, + { + "epoch": 0.022805693336208757, + "grad_norm": 0.9160019159317017, + "learning_rate": 9.99953108365171e-06, + "loss": 0.9051, + "step": 423 + }, + { + "epoch": 0.022859607504852275, + "grad_norm": 1.0100488662719727, + "learning_rate": 9.999528175696054e-06, + "loss": 0.9836, + "step": 424 + }, + { + "epoch": 0.022913521673495793, + "grad_norm": 1.0130014419555664, + "learning_rate": 9.99952525875192e-06, + "loss": 0.8653, + "step": 425 + }, + { + "epoch": 0.022967435842139314, + "grad_norm": 0.9726247787475586, + "learning_rate": 9.999522332819313e-06, + "loss": 0.8761, + "step": 426 + }, + { + "epoch": 0.023021350010782832, + "grad_norm": 0.9457972049713135, + "learning_rate": 9.99951939789824e-06, + "loss": 0.8792, + "step": 427 + }, + { + "epoch": 0.023075264179426354, + "grad_norm": 1.083130121231079, + "learning_rate": 9.999516453988706e-06, + "loss": 0.9035, + "step": 428 + }, + { + "epoch": 0.023129178348069872, + "grad_norm": 0.9195771217346191, + "learning_rate": 9.999513501090714e-06, + "loss": 0.8586, + "step": 429 + }, + { + "epoch": 0.023183092516713393, + "grad_norm": 0.983346700668335, + "learning_rate": 9.999510539204273e-06, + "loss": 0.8335, + 
"step": 430 + }, + { + "epoch": 0.02323700668535691, + "grad_norm": 1.0524029731750488, + "learning_rate": 9.999507568329386e-06, + "loss": 0.838, + "step": 431 + }, + { + "epoch": 0.023290920854000433, + "grad_norm": 1.0267860889434814, + "learning_rate": 9.999504588466058e-06, + "loss": 0.9345, + "step": 432 + }, + { + "epoch": 0.02334483502264395, + "grad_norm": 1.025707483291626, + "learning_rate": 9.999501599614294e-06, + "loss": 0.9042, + "step": 433 + }, + { + "epoch": 0.023398749191287472, + "grad_norm": 0.9739174842834473, + "learning_rate": 9.999498601774101e-06, + "loss": 0.7433, + "step": 434 + }, + { + "epoch": 0.02345266335993099, + "grad_norm": 0.9468310475349426, + "learning_rate": 9.999495594945486e-06, + "loss": 0.8447, + "step": 435 + }, + { + "epoch": 0.023506577528574508, + "grad_norm": 0.9820529818534851, + "learning_rate": 9.99949257912845e-06, + "loss": 0.8842, + "step": 436 + }, + { + "epoch": 0.02356049169721803, + "grad_norm": 0.998515784740448, + "learning_rate": 9.999489554323e-06, + "loss": 0.9226, + "step": 437 + }, + { + "epoch": 0.023614405865861547, + "grad_norm": 0.9819791316986084, + "learning_rate": 9.999486520529144e-06, + "loss": 0.8559, + "step": 438 + }, + { + "epoch": 0.02366832003450507, + "grad_norm": 0.9468326568603516, + "learning_rate": 9.999483477746884e-06, + "loss": 0.8064, + "step": 439 + }, + { + "epoch": 0.023722234203148587, + "grad_norm": 1.0087614059448242, + "learning_rate": 9.999480425976229e-06, + "loss": 0.9232, + "step": 440 + }, + { + "epoch": 0.023776148371792108, + "grad_norm": 0.9446098208427429, + "learning_rate": 9.99947736521718e-06, + "loss": 0.8511, + "step": 441 + }, + { + "epoch": 0.023830062540435626, + "grad_norm": 1.0966850519180298, + "learning_rate": 9.999474295469746e-06, + "loss": 0.9929, + "step": 442 + }, + { + "epoch": 0.023883976709079147, + "grad_norm": 0.8858770728111267, + "learning_rate": 9.99947121673393e-06, + "loss": 0.8492, + "step": 443 + }, + { + "epoch": 
0.023937890877722665, + "grad_norm": 1.083717703819275, + "learning_rate": 9.999468129009742e-06, + "loss": 0.9948, + "step": 444 + }, + { + "epoch": 0.023991805046366187, + "grad_norm": 1.0251178741455078, + "learning_rate": 9.999465032297184e-06, + "loss": 0.8769, + "step": 445 + }, + { + "epoch": 0.024045719215009705, + "grad_norm": 0.9331875443458557, + "learning_rate": 9.999461926596261e-06, + "loss": 0.8663, + "step": 446 + }, + { + "epoch": 0.024099633383653223, + "grad_norm": 0.8941493034362793, + "learning_rate": 9.999458811906979e-06, + "loss": 0.8172, + "step": 447 + }, + { + "epoch": 0.024153547552296744, + "grad_norm": 0.9978699684143066, + "learning_rate": 9.999455688229347e-06, + "loss": 0.9303, + "step": 448 + }, + { + "epoch": 0.024207461720940262, + "grad_norm": 0.8835211992263794, + "learning_rate": 9.999452555563366e-06, + "loss": 0.8921, + "step": 449 + }, + { + "epoch": 0.024261375889583783, + "grad_norm": 0.9061810970306396, + "learning_rate": 9.999449413909043e-06, + "loss": 0.8201, + "step": 450 + }, + { + "epoch": 0.0243152900582273, + "grad_norm": 1.0061571598052979, + "learning_rate": 9.999446263266385e-06, + "loss": 0.8506, + "step": 451 + }, + { + "epoch": 0.024369204226870823, + "grad_norm": 0.9286402463912964, + "learning_rate": 9.999443103635398e-06, + "loss": 0.8532, + "step": 452 + }, + { + "epoch": 0.02442311839551434, + "grad_norm": 1.0919772386550903, + "learning_rate": 9.999439935016087e-06, + "loss": 0.9466, + "step": 453 + }, + { + "epoch": 0.024477032564157862, + "grad_norm": 1.0552513599395752, + "learning_rate": 9.999436757408453e-06, + "loss": 0.8406, + "step": 454 + }, + { + "epoch": 0.02453094673280138, + "grad_norm": 0.9604331851005554, + "learning_rate": 9.999433570812511e-06, + "loss": 0.8928, + "step": 455 + }, + { + "epoch": 0.0245848609014449, + "grad_norm": 1.0126323699951172, + "learning_rate": 9.999430375228259e-06, + "loss": 0.924, + "step": 456 + }, + { + "epoch": 0.02463877507008842, + "grad_norm": 
1.0540791749954224, + "learning_rate": 9.999427170655707e-06, + "loss": 0.9656, + "step": 457 + }, + { + "epoch": 0.024692689238731937, + "grad_norm": 0.8622417449951172, + "learning_rate": 9.999423957094857e-06, + "loss": 0.7428, + "step": 458 + }, + { + "epoch": 0.02474660340737546, + "grad_norm": 1.106581211090088, + "learning_rate": 9.999420734545719e-06, + "loss": 0.9258, + "step": 459 + }, + { + "epoch": 0.024800517576018977, + "grad_norm": 0.990807294845581, + "learning_rate": 9.999417503008296e-06, + "loss": 0.9083, + "step": 460 + }, + { + "epoch": 0.024854431744662498, + "grad_norm": 0.9302589893341064, + "learning_rate": 9.999414262482594e-06, + "loss": 0.8654, + "step": 461 + }, + { + "epoch": 0.024908345913306016, + "grad_norm": 1.0218255519866943, + "learning_rate": 9.999411012968621e-06, + "loss": 0.8996, + "step": 462 + }, + { + "epoch": 0.024962260081949537, + "grad_norm": 0.976108193397522, + "learning_rate": 9.99940775446638e-06, + "loss": 0.9423, + "step": 463 + }, + { + "epoch": 0.025016174250593055, + "grad_norm": 1.1027617454528809, + "learning_rate": 9.99940448697588e-06, + "loss": 1.0407, + "step": 464 + }, + { + "epoch": 0.025070088419236577, + "grad_norm": 1.0148764848709106, + "learning_rate": 9.999401210497122e-06, + "loss": 0.9418, + "step": 465 + }, + { + "epoch": 0.025124002587880095, + "grad_norm": 1.0120681524276733, + "learning_rate": 9.999397925030116e-06, + "loss": 0.92, + "step": 466 + }, + { + "epoch": 0.025177916756523613, + "grad_norm": 1.1855127811431885, + "learning_rate": 9.999394630574868e-06, + "loss": 0.9285, + "step": 467 + }, + { + "epoch": 0.025231830925167134, + "grad_norm": 1.8014320135116577, + "learning_rate": 9.999391327131383e-06, + "loss": 0.979, + "step": 468 + }, + { + "epoch": 0.025285745093810652, + "grad_norm": 1.1568403244018555, + "learning_rate": 9.999388014699664e-06, + "loss": 0.9574, + "step": 469 + }, + { + "epoch": 0.025339659262454173, + "grad_norm": 1.2544865608215332, + "learning_rate": 
9.99938469327972e-06, + "loss": 0.8356, + "step": 470 + }, + { + "epoch": 0.02539357343109769, + "grad_norm": 1.8647997379302979, + "learning_rate": 9.99938136287156e-06, + "loss": 0.9181, + "step": 471 + }, + { + "epoch": 0.025447487599741213, + "grad_norm": 0.9942222237586975, + "learning_rate": 9.999378023475184e-06, + "loss": 0.9297, + "step": 472 + }, + { + "epoch": 0.02550140176838473, + "grad_norm": 0.9839766621589661, + "learning_rate": 9.9993746750906e-06, + "loss": 0.9181, + "step": 473 + }, + { + "epoch": 0.025555315937028252, + "grad_norm": 0.9353258609771729, + "learning_rate": 9.999371317717817e-06, + "loss": 0.8789, + "step": 474 + }, + { + "epoch": 0.02560923010567177, + "grad_norm": 0.9256170988082886, + "learning_rate": 9.999367951356838e-06, + "loss": 0.8725, + "step": 475 + }, + { + "epoch": 0.02566314427431529, + "grad_norm": 1.1102124452590942, + "learning_rate": 9.999364576007669e-06, + "loss": 0.9818, + "step": 476 + }, + { + "epoch": 0.02571705844295881, + "grad_norm": 1.04171884059906, + "learning_rate": 9.999361191670316e-06, + "loss": 0.9275, + "step": 477 + }, + { + "epoch": 0.025770972611602327, + "grad_norm": 0.9670290350914001, + "learning_rate": 9.999357798344787e-06, + "loss": 0.8919, + "step": 478 + }, + { + "epoch": 0.02582488678024585, + "grad_norm": 1.0543723106384277, + "learning_rate": 9.999354396031085e-06, + "loss": 0.9356, + "step": 479 + }, + { + "epoch": 0.025878800948889367, + "grad_norm": 1.1368457078933716, + "learning_rate": 9.99935098472922e-06, + "loss": 0.9387, + "step": 480 + }, + { + "epoch": 0.025932715117532888, + "grad_norm": 1.0627872943878174, + "learning_rate": 9.999347564439196e-06, + "loss": 1.0047, + "step": 481 + }, + { + "epoch": 0.025986629286176406, + "grad_norm": 0.9553730487823486, + "learning_rate": 9.999344135161018e-06, + "loss": 0.8845, + "step": 482 + }, + { + "epoch": 0.026040543454819928, + "grad_norm": 0.9605830907821655, + "learning_rate": 9.999340696894694e-06, + "loss": 0.8816, + 
"step": 483 + }, + { + "epoch": 0.026094457623463446, + "grad_norm": 1.0464140176773071, + "learning_rate": 9.999337249640232e-06, + "loss": 0.9344, + "step": 484 + }, + { + "epoch": 0.026148371792106967, + "grad_norm": 1.0667988061904907, + "learning_rate": 9.999333793397635e-06, + "loss": 0.8834, + "step": 485 + }, + { + "epoch": 0.026202285960750485, + "grad_norm": 0.8996486663818359, + "learning_rate": 9.999330328166908e-06, + "loss": 0.8247, + "step": 486 + }, + { + "epoch": 0.026256200129394006, + "grad_norm": 1.0483838319778442, + "learning_rate": 9.99932685394806e-06, + "loss": 0.9414, + "step": 487 + }, + { + "epoch": 0.026310114298037524, + "grad_norm": 1.2089953422546387, + "learning_rate": 9.999323370741097e-06, + "loss": 1.0913, + "step": 488 + }, + { + "epoch": 0.026364028466681042, + "grad_norm": 1.074291467666626, + "learning_rate": 9.999319878546025e-06, + "loss": 0.8882, + "step": 489 + }, + { + "epoch": 0.026417942635324564, + "grad_norm": 1.0076494216918945, + "learning_rate": 9.99931637736285e-06, + "loss": 0.8393, + "step": 490 + }, + { + "epoch": 0.02647185680396808, + "grad_norm": 1.2263407707214355, + "learning_rate": 9.99931286719158e-06, + "loss": 0.955, + "step": 491 + }, + { + "epoch": 0.026525770972611603, + "grad_norm": 0.9093664884567261, + "learning_rate": 9.999309348032218e-06, + "loss": 0.8366, + "step": 492 + }, + { + "epoch": 0.02657968514125512, + "grad_norm": 1.0704407691955566, + "learning_rate": 9.999305819884772e-06, + "loss": 0.981, + "step": 493 + }, + { + "epoch": 0.026633599309898642, + "grad_norm": 1.2105270624160767, + "learning_rate": 9.999302282749249e-06, + "loss": 0.8896, + "step": 494 + }, + { + "epoch": 0.02668751347854216, + "grad_norm": 1.0142449140548706, + "learning_rate": 9.999298736625654e-06, + "loss": 0.8627, + "step": 495 + }, + { + "epoch": 0.02674142764718568, + "grad_norm": 1.0887057781219482, + "learning_rate": 9.999295181513994e-06, + "loss": 0.8884, + "step": 496 + }, + { + "epoch": 
0.0267953418158292, + "grad_norm": 0.9958952069282532, + "learning_rate": 9.999291617414277e-06, + "loss": 0.7768, + "step": 497 + }, + { + "epoch": 0.02684925598447272, + "grad_norm": 0.8576722741127014, + "learning_rate": 9.999288044326508e-06, + "loss": 0.715, + "step": 498 + }, + { + "epoch": 0.02690317015311624, + "grad_norm": 1.058148741722107, + "learning_rate": 9.999284462250691e-06, + "loss": 0.8693, + "step": 499 + }, + { + "epoch": 0.026957084321759757, + "grad_norm": 0.9429569244384766, + "learning_rate": 9.999280871186837e-06, + "loss": 0.8883, + "step": 500 + }, + { + "epoch": 0.02701099849040328, + "grad_norm": 0.9450993537902832, + "learning_rate": 9.999277271134948e-06, + "loss": 0.9376, + "step": 501 + }, + { + "epoch": 0.027064912659046796, + "grad_norm": 1.0307891368865967, + "learning_rate": 9.999273662095035e-06, + "loss": 0.9098, + "step": 502 + }, + { + "epoch": 0.027118826827690318, + "grad_norm": 0.9515891671180725, + "learning_rate": 9.999270044067101e-06, + "loss": 0.8854, + "step": 503 + }, + { + "epoch": 0.027172740996333836, + "grad_norm": 1.1173255443572998, + "learning_rate": 9.999266417051154e-06, + "loss": 0.7977, + "step": 504 + }, + { + "epoch": 0.027226655164977357, + "grad_norm": 1.028194785118103, + "learning_rate": 9.9992627810472e-06, + "loss": 0.9585, + "step": 505 + }, + { + "epoch": 0.027280569333620875, + "grad_norm": 1.0855528116226196, + "learning_rate": 9.999259136055245e-06, + "loss": 0.9807, + "step": 506 + }, + { + "epoch": 0.027334483502264396, + "grad_norm": 1.1148236989974976, + "learning_rate": 9.999255482075298e-06, + "loss": 0.9672, + "step": 507 + }, + { + "epoch": 0.027388397670907914, + "grad_norm": 0.9697713255882263, + "learning_rate": 9.999251819107364e-06, + "loss": 0.9073, + "step": 508 + }, + { + "epoch": 0.027442311839551436, + "grad_norm": 0.9802384972572327, + "learning_rate": 9.999248147151448e-06, + "loss": 0.8704, + "step": 509 + }, + { + "epoch": 0.027496226008194954, + "grad_norm": 
0.963330090045929, + "learning_rate": 9.999244466207559e-06, + "loss": 0.9312, + "step": 510 + }, + { + "epoch": 0.02755014017683847, + "grad_norm": 0.8776309490203857, + "learning_rate": 9.999240776275703e-06, + "loss": 0.8068, + "step": 511 + }, + { + "epoch": 0.027604054345481993, + "grad_norm": 1.1159353256225586, + "learning_rate": 9.999237077355886e-06, + "loss": 0.8164, + "step": 512 + }, + { + "epoch": 0.02765796851412551, + "grad_norm": 1.004232406616211, + "learning_rate": 9.999233369448115e-06, + "loss": 0.8666, + "step": 513 + }, + { + "epoch": 0.027711882682769032, + "grad_norm": 1.0300110578536987, + "learning_rate": 9.999229652552395e-06, + "loss": 0.8774, + "step": 514 + }, + { + "epoch": 0.02776579685141255, + "grad_norm": 0.8823155164718628, + "learning_rate": 9.999225926668736e-06, + "loss": 0.7579, + "step": 515 + }, + { + "epoch": 0.027819711020056072, + "grad_norm": 0.938956618309021, + "learning_rate": 9.999222191797144e-06, + "loss": 0.8749, + "step": 516 + }, + { + "epoch": 0.02787362518869959, + "grad_norm": 0.9111800789833069, + "learning_rate": 9.999218447937624e-06, + "loss": 0.8915, + "step": 517 + }, + { + "epoch": 0.02792753935734311, + "grad_norm": 0.971813440322876, + "learning_rate": 9.999214695090182e-06, + "loss": 0.9038, + "step": 518 + }, + { + "epoch": 0.02798145352598663, + "grad_norm": 0.9159868359565735, + "learning_rate": 9.999210933254828e-06, + "loss": 0.8726, + "step": 519 + }, + { + "epoch": 0.028035367694630147, + "grad_norm": 1.0223439931869507, + "learning_rate": 9.999207162431566e-06, + "loss": 0.8738, + "step": 520 + }, + { + "epoch": 0.02808928186327367, + "grad_norm": 0.9844004511833191, + "learning_rate": 9.999203382620404e-06, + "loss": 0.8815, + "step": 521 + }, + { + "epoch": 0.028143196031917186, + "grad_norm": 1.1636719703674316, + "learning_rate": 9.99919959382135e-06, + "loss": 0.8781, + "step": 522 + }, + { + "epoch": 0.028197110200560708, + "grad_norm": 0.9637702703475952, + "learning_rate": 
9.999195796034407e-06, + "loss": 0.8491, + "step": 523 + }, + { + "epoch": 0.028251024369204226, + "grad_norm": 0.975931704044342, + "learning_rate": 9.999191989259584e-06, + "loss": 0.9983, + "step": 524 + }, + { + "epoch": 0.028304938537847747, + "grad_norm": 0.9855527877807617, + "learning_rate": 9.999188173496889e-06, + "loss": 0.9587, + "step": 525 + }, + { + "epoch": 0.028358852706491265, + "grad_norm": 0.9925652742385864, + "learning_rate": 9.99918434874633e-06, + "loss": 0.8408, + "step": 526 + }, + { + "epoch": 0.028412766875134787, + "grad_norm": 0.9272180795669556, + "learning_rate": 9.999180515007908e-06, + "loss": 0.8267, + "step": 527 + }, + { + "epoch": 0.028466681043778305, + "grad_norm": 1.161076307296753, + "learning_rate": 9.999176672281636e-06, + "loss": 0.9282, + "step": 528 + }, + { + "epoch": 0.028520595212421826, + "grad_norm": 0.8953909277915955, + "learning_rate": 9.99917282056752e-06, + "loss": 0.8078, + "step": 529 + }, + { + "epoch": 0.028574509381065344, + "grad_norm": 0.9194382429122925, + "learning_rate": 9.999168959865562e-06, + "loss": 0.8385, + "step": 530 + }, + { + "epoch": 0.028628423549708862, + "grad_norm": 1.0351816415786743, + "learning_rate": 9.999165090175775e-06, + "loss": 0.8155, + "step": 531 + }, + { + "epoch": 0.028682337718352383, + "grad_norm": 0.9233224391937256, + "learning_rate": 9.999161211498163e-06, + "loss": 0.8825, + "step": 532 + }, + { + "epoch": 0.0287362518869959, + "grad_norm": 1.0415356159210205, + "learning_rate": 9.999157323832732e-06, + "loss": 0.7844, + "step": 533 + }, + { + "epoch": 0.028790166055639423, + "grad_norm": 1.0329923629760742, + "learning_rate": 9.999153427179492e-06, + "loss": 0.893, + "step": 534 + }, + { + "epoch": 0.02884408022428294, + "grad_norm": 1.237291932106018, + "learning_rate": 9.999149521538448e-06, + "loss": 0.9786, + "step": 535 + }, + { + "epoch": 0.028897994392926462, + "grad_norm": 0.9952654242515564, + "learning_rate": 9.999145606909607e-06, + "loss": 0.9262, + 
"step": 536 + }, + { + "epoch": 0.02895190856156998, + "grad_norm": 1.016533374786377, + "learning_rate": 9.999141683292977e-06, + "loss": 0.9854, + "step": 537 + }, + { + "epoch": 0.0290058227302135, + "grad_norm": 1.0334454774856567, + "learning_rate": 9.999137750688564e-06, + "loss": 0.8928, + "step": 538 + }, + { + "epoch": 0.02905973689885702, + "grad_norm": 0.941662609577179, + "learning_rate": 9.999133809096374e-06, + "loss": 0.8698, + "step": 539 + }, + { + "epoch": 0.02911365106750054, + "grad_norm": 0.9454428553581238, + "learning_rate": 9.999129858516418e-06, + "loss": 0.9261, + "step": 540 + }, + { + "epoch": 0.02916756523614406, + "grad_norm": 1.0921217203140259, + "learning_rate": 9.9991258989487e-06, + "loss": 0.9163, + "step": 541 + }, + { + "epoch": 0.029221479404787577, + "grad_norm": 0.8999170064926147, + "learning_rate": 9.999121930393227e-06, + "loss": 0.883, + "step": 542 + }, + { + "epoch": 0.029275393573431098, + "grad_norm": 0.9732702970504761, + "learning_rate": 9.999117952850009e-06, + "loss": 0.9168, + "step": 543 + }, + { + "epoch": 0.029329307742074616, + "grad_norm": 1.00196373462677, + "learning_rate": 9.99911396631905e-06, + "loss": 0.826, + "step": 544 + }, + { + "epoch": 0.029383221910718137, + "grad_norm": 0.9776156544685364, + "learning_rate": 9.999109970800358e-06, + "loss": 0.8176, + "step": 545 + }, + { + "epoch": 0.029437136079361655, + "grad_norm": 1.0503387451171875, + "learning_rate": 9.99910596629394e-06, + "loss": 0.8617, + "step": 546 + }, + { + "epoch": 0.029491050248005177, + "grad_norm": 0.9195687174797058, + "learning_rate": 9.999101952799805e-06, + "loss": 0.8224, + "step": 547 + }, + { + "epoch": 0.029544964416648695, + "grad_norm": 0.8746809959411621, + "learning_rate": 9.999097930317959e-06, + "loss": 0.8407, + "step": 548 + }, + { + "epoch": 0.029598878585292216, + "grad_norm": 0.9035898447036743, + "learning_rate": 9.999093898848407e-06, + "loss": 0.8344, + "step": 549 + }, + { + "epoch": 
0.029652792753935734, + "grad_norm": 0.8764795064926147, + "learning_rate": 9.99908985839116e-06, + "loss": 0.8323, + "step": 550 + }, + { + "epoch": 0.029706706922579255, + "grad_norm": 0.9654614329338074, + "learning_rate": 9.999085808946224e-06, + "loss": 0.8696, + "step": 551 + }, + { + "epoch": 0.029760621091222773, + "grad_norm": 1.1295796632766724, + "learning_rate": 9.999081750513606e-06, + "loss": 0.9608, + "step": 552 + }, + { + "epoch": 0.02981453525986629, + "grad_norm": 0.9591107368469238, + "learning_rate": 9.999077683093313e-06, + "loss": 0.8762, + "step": 553 + }, + { + "epoch": 0.029868449428509813, + "grad_norm": 0.8287899494171143, + "learning_rate": 9.999073606685353e-06, + "loss": 0.7265, + "step": 554 + }, + { + "epoch": 0.02992236359715333, + "grad_norm": 0.9429282546043396, + "learning_rate": 9.99906952128973e-06, + "loss": 0.8835, + "step": 555 + }, + { + "epoch": 0.029976277765796852, + "grad_norm": 0.9617370963096619, + "learning_rate": 9.999065426906459e-06, + "loss": 0.9138, + "step": 556 + }, + { + "epoch": 0.03003019193444037, + "grad_norm": 1.2346372604370117, + "learning_rate": 9.999061323535538e-06, + "loss": 0.831, + "step": 557 + }, + { + "epoch": 0.03008410610308389, + "grad_norm": 1.2413623332977295, + "learning_rate": 9.999057211176982e-06, + "loss": 1.0211, + "step": 558 + }, + { + "epoch": 0.03013802027172741, + "grad_norm": 0.98906010389328, + "learning_rate": 9.999053089830794e-06, + "loss": 0.7821, + "step": 559 + }, + { + "epoch": 0.03019193444037093, + "grad_norm": 0.96706622838974, + "learning_rate": 9.999048959496983e-06, + "loss": 0.8593, + "step": 560 + }, + { + "epoch": 0.03024584860901445, + "grad_norm": 0.9400071501731873, + "learning_rate": 9.999044820175556e-06, + "loss": 0.8731, + "step": 561 + }, + { + "epoch": 0.03029976277765797, + "grad_norm": 1.1276499032974243, + "learning_rate": 9.999040671866522e-06, + "loss": 0.86, + "step": 562 + }, + { + "epoch": 0.030353676946301488, + "grad_norm": 
0.8859087228775024, + "learning_rate": 9.999036514569885e-06, + "loss": 0.8274, + "step": 563 + }, + { + "epoch": 0.030407591114945006, + "grad_norm": 1.1617575883865356, + "learning_rate": 9.999032348285656e-06, + "loss": 1.0519, + "step": 564 + }, + { + "epoch": 0.030461505283588527, + "grad_norm": 0.9717594385147095, + "learning_rate": 9.99902817301384e-06, + "loss": 0.9276, + "step": 565 + }, + { + "epoch": 0.030515419452232045, + "grad_norm": 1.000722050666809, + "learning_rate": 9.999023988754446e-06, + "loss": 0.8714, + "step": 566 + }, + { + "epoch": 0.030569333620875567, + "grad_norm": 1.1744625568389893, + "learning_rate": 9.999019795507481e-06, + "loss": 1.0087, + "step": 567 + }, + { + "epoch": 0.030623247789519085, + "grad_norm": 1.0199978351593018, + "learning_rate": 9.999015593272953e-06, + "loss": 0.8537, + "step": 568 + }, + { + "epoch": 0.030677161958162606, + "grad_norm": 0.9232216477394104, + "learning_rate": 9.999011382050869e-06, + "loss": 0.8488, + "step": 569 + }, + { + "epoch": 0.030731076126806124, + "grad_norm": 0.9905959367752075, + "learning_rate": 9.99900716184124e-06, + "loss": 0.9048, + "step": 570 + }, + { + "epoch": 0.030784990295449646, + "grad_norm": 0.9921644330024719, + "learning_rate": 9.999002932644066e-06, + "loss": 0.9294, + "step": 571 + }, + { + "epoch": 0.030838904464093164, + "grad_norm": 1.1583740711212158, + "learning_rate": 9.99899869445936e-06, + "loss": 0.727, + "step": 572 + }, + { + "epoch": 0.03089281863273668, + "grad_norm": 0.906736433506012, + "learning_rate": 9.998994447287127e-06, + "loss": 0.7889, + "step": 573 + }, + { + "epoch": 0.030946732801380203, + "grad_norm": 0.9060770869255066, + "learning_rate": 9.998990191127379e-06, + "loss": 0.8493, + "step": 574 + }, + { + "epoch": 0.03100064697002372, + "grad_norm": 0.9094041585922241, + "learning_rate": 9.99898592598012e-06, + "loss": 0.8604, + "step": 575 + }, + { + "epoch": 0.031054561138667242, + "grad_norm": 1.0964977741241455, + "learning_rate": 
9.998981651845358e-06, + "loss": 0.8481, + "step": 576 + }, + { + "epoch": 0.03110847530731076, + "grad_norm": 0.9509627223014832, + "learning_rate": 9.998977368723102e-06, + "loss": 0.8601, + "step": 577 + }, + { + "epoch": 0.03116238947595428, + "grad_norm": 1.0108642578125, + "learning_rate": 9.998973076613359e-06, + "loss": 0.9076, + "step": 578 + }, + { + "epoch": 0.0312163036445978, + "grad_norm": 1.0268129110336304, + "learning_rate": 9.998968775516136e-06, + "loss": 0.8273, + "step": 579 + }, + { + "epoch": 0.03127021781324132, + "grad_norm": 0.968941867351532, + "learning_rate": 9.99896446543144e-06, + "loss": 0.8859, + "step": 580 + }, + { + "epoch": 0.03132413198188484, + "grad_norm": 0.936779260635376, + "learning_rate": 9.998960146359283e-06, + "loss": 0.8589, + "step": 581 + }, + { + "epoch": 0.03137804615052836, + "grad_norm": 0.9675167202949524, + "learning_rate": 9.998955818299667e-06, + "loss": 0.973, + "step": 582 + }, + { + "epoch": 0.03143196031917188, + "grad_norm": 0.9475553035736084, + "learning_rate": 9.998951481252604e-06, + "loss": 0.8936, + "step": 583 + }, + { + "epoch": 0.031485874487815396, + "grad_norm": 0.9130968451499939, + "learning_rate": 9.9989471352181e-06, + "loss": 0.7668, + "step": 584 + }, + { + "epoch": 0.031539788656458914, + "grad_norm": 0.8890071511268616, + "learning_rate": 9.998942780196164e-06, + "loss": 0.8971, + "step": 585 + }, + { + "epoch": 0.03159370282510244, + "grad_norm": 0.9298738837242126, + "learning_rate": 9.998938416186803e-06, + "loss": 0.9313, + "step": 586 + }, + { + "epoch": 0.03164761699374596, + "grad_norm": 1.0683361291885376, + "learning_rate": 9.998934043190025e-06, + "loss": 0.9018, + "step": 587 + }, + { + "epoch": 0.031701531162389475, + "grad_norm": 0.939253568649292, + "learning_rate": 9.99892966120584e-06, + "loss": 0.9119, + "step": 588 + }, + { + "epoch": 0.03175544533103299, + "grad_norm": 0.9245349764823914, + "learning_rate": 9.99892527023425e-06, + "loss": 0.9258, + "step": 589 + }, 
+ { + "epoch": 0.03180935949967652, + "grad_norm": 0.9318797588348389, + "learning_rate": 9.998920870275267e-06, + "loss": 0.9557, + "step": 590 + }, + { + "epoch": 0.031863273668320036, + "grad_norm": 0.8909592628479004, + "learning_rate": 9.998916461328899e-06, + "loss": 0.8122, + "step": 591 + }, + { + "epoch": 0.031917187836963554, + "grad_norm": 1.0637080669403076, + "learning_rate": 9.998912043395154e-06, + "loss": 0.9517, + "step": 592 + }, + { + "epoch": 0.03197110200560707, + "grad_norm": 0.881934642791748, + "learning_rate": 9.99890761647404e-06, + "loss": 0.8729, + "step": 593 + }, + { + "epoch": 0.032025016174250596, + "grad_norm": 0.8882094025611877, + "learning_rate": 9.998903180565562e-06, + "loss": 0.7943, + "step": 594 + }, + { + "epoch": 0.032078930342894114, + "grad_norm": 0.965085506439209, + "learning_rate": 9.99889873566973e-06, + "loss": 0.8894, + "step": 595 + }, + { + "epoch": 0.03213284451153763, + "grad_norm": 0.9679432511329651, + "learning_rate": 9.998894281786556e-06, + "loss": 0.854, + "step": 596 + }, + { + "epoch": 0.03218675868018115, + "grad_norm": 1.4454354047775269, + "learning_rate": 9.998889818916043e-06, + "loss": 0.9944, + "step": 597 + }, + { + "epoch": 0.03224067284882467, + "grad_norm": 0.9369311928749084, + "learning_rate": 9.998885347058198e-06, + "loss": 0.8699, + "step": 598 + }, + { + "epoch": 0.03229458701746819, + "grad_norm": 0.9014303088188171, + "learning_rate": 9.998880866213033e-06, + "loss": 0.8735, + "step": 599 + }, + { + "epoch": 0.03234850118611171, + "grad_norm": 0.989251971244812, + "learning_rate": 9.998876376380555e-06, + "loss": 0.8872, + "step": 600 + }, + { + "epoch": 0.03240241535475523, + "grad_norm": 1.0256885290145874, + "learning_rate": 9.99887187756077e-06, + "loss": 0.8787, + "step": 601 + }, + { + "epoch": 0.03245632952339875, + "grad_norm": 0.9560148119926453, + "learning_rate": 9.998867369753688e-06, + "loss": 0.8301, + "step": 602 + }, + { + "epoch": 0.03251024369204227, + "grad_norm": 
1.044754147529602, + "learning_rate": 9.998862852959316e-06, + "loss": 0.9286, + "step": 603 + }, + { + "epoch": 0.03256415786068579, + "grad_norm": 0.8769629597663879, + "learning_rate": 9.998858327177665e-06, + "loss": 0.7927, + "step": 604 + }, + { + "epoch": 0.03261807202932931, + "grad_norm": 0.9217430949211121, + "learning_rate": 9.99885379240874e-06, + "loss": 0.8327, + "step": 605 + }, + { + "epoch": 0.032671986197972826, + "grad_norm": 0.8202590942382812, + "learning_rate": 9.99884924865255e-06, + "loss": 0.7269, + "step": 606 + }, + { + "epoch": 0.032725900366616344, + "grad_norm": 0.9598796367645264, + "learning_rate": 9.998844695909102e-06, + "loss": 0.9329, + "step": 607 + }, + { + "epoch": 0.03277981453525987, + "grad_norm": 1.1016643047332764, + "learning_rate": 9.998840134178407e-06, + "loss": 0.9836, + "step": 608 + }, + { + "epoch": 0.032833728703903386, + "grad_norm": 0.9639281630516052, + "learning_rate": 9.998835563460471e-06, + "loss": 0.8475, + "step": 609 + }, + { + "epoch": 0.032887642872546904, + "grad_norm": 0.9266204833984375, + "learning_rate": 9.998830983755304e-06, + "loss": 0.7307, + "step": 610 + }, + { + "epoch": 0.03294155704119042, + "grad_norm": 0.9282877445220947, + "learning_rate": 9.99882639506291e-06, + "loss": 0.8163, + "step": 611 + }, + { + "epoch": 0.03299547120983395, + "grad_norm": 0.8939738869667053, + "learning_rate": 9.998821797383302e-06, + "loss": 0.6902, + "step": 612 + }, + { + "epoch": 0.033049385378477465, + "grad_norm": 0.9041041731834412, + "learning_rate": 9.998817190716488e-06, + "loss": 0.8735, + "step": 613 + }, + { + "epoch": 0.03310329954712098, + "grad_norm": 0.9973318576812744, + "learning_rate": 9.998812575062473e-06, + "loss": 0.9017, + "step": 614 + }, + { + "epoch": 0.0331572137157645, + "grad_norm": 1.0416412353515625, + "learning_rate": 9.998807950421268e-06, + "loss": 0.9293, + "step": 615 + }, + { + "epoch": 0.03321112788440802, + "grad_norm": 0.8686584234237671, + "learning_rate": 
9.998803316792882e-06, + "loss": 0.8585, + "step": 616 + }, + { + "epoch": 0.033265042053051544, + "grad_norm": 0.9907833337783813, + "learning_rate": 9.998798674177319e-06, + "loss": 0.9264, + "step": 617 + }, + { + "epoch": 0.03331895622169506, + "grad_norm": 0.9927001595497131, + "learning_rate": 9.998794022574592e-06, + "loss": 0.895, + "step": 618 + }, + { + "epoch": 0.03337287039033858, + "grad_norm": 0.9314623475074768, + "learning_rate": 9.998789361984707e-06, + "loss": 0.8353, + "step": 619 + }, + { + "epoch": 0.0334267845589821, + "grad_norm": 0.9768248796463013, + "learning_rate": 9.998784692407673e-06, + "loss": 0.8917, + "step": 620 + }, + { + "epoch": 0.03348069872762562, + "grad_norm": 0.9487942457199097, + "learning_rate": 9.998780013843498e-06, + "loss": 0.9022, + "step": 621 + }, + { + "epoch": 0.03353461289626914, + "grad_norm": 1.0376895666122437, + "learning_rate": 9.99877532629219e-06, + "loss": 0.7692, + "step": 622 + }, + { + "epoch": 0.03358852706491266, + "grad_norm": 1.021345853805542, + "learning_rate": 9.99877062975376e-06, + "loss": 1.0386, + "step": 623 + }, + { + "epoch": 0.033642441233556176, + "grad_norm": 0.9979421496391296, + "learning_rate": 9.998765924228214e-06, + "loss": 0.9209, + "step": 624 + }, + { + "epoch": 0.0336963554021997, + "grad_norm": 0.8552166819572449, + "learning_rate": 9.998761209715559e-06, + "loss": 0.8765, + "step": 625 + }, + { + "epoch": 0.03375026957084322, + "grad_norm": 0.9737898707389832, + "learning_rate": 9.998756486215809e-06, + "loss": 0.7459, + "step": 626 + }, + { + "epoch": 0.03380418373948674, + "grad_norm": 1.1067259311676025, + "learning_rate": 9.998751753728967e-06, + "loss": 0.8582, + "step": 627 + }, + { + "epoch": 0.033858097908130255, + "grad_norm": 1.0689613819122314, + "learning_rate": 9.998747012255044e-06, + "loss": 0.8523, + "step": 628 + }, + { + "epoch": 0.03391201207677377, + "grad_norm": 1.1880419254302979, + "learning_rate": 9.998742261794048e-06, + "loss": 0.9085, + "step": 
629 + }, + { + "epoch": 0.0339659262454173, + "grad_norm": 0.9569217562675476, + "learning_rate": 9.998737502345987e-06, + "loss": 0.9112, + "step": 630 + }, + { + "epoch": 0.034019840414060816, + "grad_norm": 0.9955928921699524, + "learning_rate": 9.99873273391087e-06, + "loss": 0.9166, + "step": 631 + }, + { + "epoch": 0.034073754582704334, + "grad_norm": 0.8906963467597961, + "learning_rate": 9.998727956488708e-06, + "loss": 0.882, + "step": 632 + }, + { + "epoch": 0.03412766875134785, + "grad_norm": 0.9241589307785034, + "learning_rate": 9.998723170079506e-06, + "loss": 0.8488, + "step": 633 + }, + { + "epoch": 0.03418158291999138, + "grad_norm": 0.9666005969047546, + "learning_rate": 9.998718374683271e-06, + "loss": 0.8432, + "step": 634 + }, + { + "epoch": 0.034235497088634895, + "grad_norm": 0.9036918878555298, + "learning_rate": 9.998713570300018e-06, + "loss": 0.7979, + "step": 635 + }, + { + "epoch": 0.03428941125727841, + "grad_norm": 0.8946508765220642, + "learning_rate": 9.998708756929751e-06, + "loss": 0.8854, + "step": 636 + }, + { + "epoch": 0.03434332542592193, + "grad_norm": 1.0300164222717285, + "learning_rate": 9.99870393457248e-06, + "loss": 0.9116, + "step": 637 + }, + { + "epoch": 0.03439723959456545, + "grad_norm": 1.0635035037994385, + "learning_rate": 9.998699103228214e-06, + "loss": 0.9138, + "step": 638 + }, + { + "epoch": 0.03445115376320897, + "grad_norm": 1.0362621545791626, + "learning_rate": 9.998694262896962e-06, + "loss": 1.0177, + "step": 639 + }, + { + "epoch": 0.03450506793185249, + "grad_norm": 0.9081454873085022, + "learning_rate": 9.99868941357873e-06, + "loss": 0.7802, + "step": 640 + }, + { + "epoch": 0.03455898210049601, + "grad_norm": 0.9943915605545044, + "learning_rate": 9.998684555273529e-06, + "loss": 0.9356, + "step": 641 + }, + { + "epoch": 0.03461289626913953, + "grad_norm": 0.9647786021232605, + "learning_rate": 9.998679687981367e-06, + "loss": 0.741, + "step": 642 + }, + { + "epoch": 0.03466681043778305, + 
"grad_norm": 0.9655315279960632, + "learning_rate": 9.998674811702255e-06, + "loss": 0.8644, + "step": 643 + }, + { + "epoch": 0.03472072460642657, + "grad_norm": 0.9162091612815857, + "learning_rate": 9.998669926436197e-06, + "loss": 0.8383, + "step": 644 + }, + { + "epoch": 0.03477463877507009, + "grad_norm": 0.9509754776954651, + "learning_rate": 9.998665032183207e-06, + "loss": 0.8066, + "step": 645 + }, + { + "epoch": 0.034828552943713606, + "grad_norm": 1.0545740127563477, + "learning_rate": 9.998660128943292e-06, + "loss": 0.8455, + "step": 646 + }, + { + "epoch": 0.03488246711235713, + "grad_norm": 1.0928760766983032, + "learning_rate": 9.998655216716458e-06, + "loss": 0.8708, + "step": 647 + }, + { + "epoch": 0.03493638128100065, + "grad_norm": 0.9743762016296387, + "learning_rate": 9.998650295502717e-06, + "loss": 0.878, + "step": 648 + }, + { + "epoch": 0.03499029544964417, + "grad_norm": 1.016741156578064, + "learning_rate": 9.998645365302077e-06, + "loss": 0.867, + "step": 649 + }, + { + "epoch": 0.035044209618287685, + "grad_norm": 1.125252366065979, + "learning_rate": 9.998640426114548e-06, + "loss": 0.9443, + "step": 650 + }, + { + "epoch": 0.0350981237869312, + "grad_norm": 0.9555762410163879, + "learning_rate": 9.998635477940135e-06, + "loss": 0.8353, + "step": 651 + }, + { + "epoch": 0.03515203795557473, + "grad_norm": 0.930173397064209, + "learning_rate": 9.998630520778851e-06, + "loss": 0.8383, + "step": 652 + }, + { + "epoch": 0.035205952124218245, + "grad_norm": 1.1592127084732056, + "learning_rate": 9.998625554630704e-06, + "loss": 0.9708, + "step": 653 + }, + { + "epoch": 0.03525986629286176, + "grad_norm": 0.9333894848823547, + "learning_rate": 9.998620579495701e-06, + "loss": 0.9055, + "step": 654 + }, + { + "epoch": 0.03531378046150528, + "grad_norm": 0.9495646357536316, + "learning_rate": 9.998615595373853e-06, + "loss": 0.7993, + "step": 655 + }, + { + "epoch": 0.035367694630148806, + "grad_norm": 1.0919233560562134, + "learning_rate": 
9.99861060226517e-06, + "loss": 0.8852, + "step": 656 + }, + { + "epoch": 0.035421608798792324, + "grad_norm": 0.907940685749054, + "learning_rate": 9.998605600169657e-06, + "loss": 0.8294, + "step": 657 + }, + { + "epoch": 0.03547552296743584, + "grad_norm": 1.0423756837844849, + "learning_rate": 9.998600589087328e-06, + "loss": 0.8758, + "step": 658 + }, + { + "epoch": 0.03552943713607936, + "grad_norm": 1.0387269258499146, + "learning_rate": 9.998595569018186e-06, + "loss": 0.9099, + "step": 659 + }, + { + "epoch": 0.03558335130472288, + "grad_norm": 0.9186104536056519, + "learning_rate": 9.998590539962245e-06, + "loss": 0.9025, + "step": 660 + }, + { + "epoch": 0.0356372654733664, + "grad_norm": 1.0173289775848389, + "learning_rate": 9.998585501919514e-06, + "loss": 0.8468, + "step": 661 + }, + { + "epoch": 0.03569117964200992, + "grad_norm": 0.9579570889472961, + "learning_rate": 9.998580454889996e-06, + "loss": 0.8542, + "step": 662 + }, + { + "epoch": 0.03574509381065344, + "grad_norm": 1.093515396118164, + "learning_rate": 9.99857539887371e-06, + "loss": 0.8932, + "step": 663 + }, + { + "epoch": 0.03579900797929696, + "grad_norm": 1.0651243925094604, + "learning_rate": 9.998570333870656e-06, + "loss": 0.8822, + "step": 664 + }, + { + "epoch": 0.03585292214794048, + "grad_norm": 0.973278284072876, + "learning_rate": 9.998565259880845e-06, + "loss": 0.8724, + "step": 665 + }, + { + "epoch": 0.035906836316584, + "grad_norm": 0.961321234703064, + "learning_rate": 9.998560176904291e-06, + "loss": 0.947, + "step": 666 + }, + { + "epoch": 0.03596075048522752, + "grad_norm": 1.0216654539108276, + "learning_rate": 9.998555084940999e-06, + "loss": 0.8528, + "step": 667 + }, + { + "epoch": 0.036014664653871035, + "grad_norm": 0.9917817711830139, + "learning_rate": 9.99854998399098e-06, + "loss": 0.8608, + "step": 668 + }, + { + "epoch": 0.03606857882251455, + "grad_norm": 1.0164326429367065, + "learning_rate": 9.998544874054243e-06, + "loss": 0.8752, + "step": 669 + 
}, + { + "epoch": 0.03612249299115808, + "grad_norm": 0.9181317687034607, + "learning_rate": 9.998539755130793e-06, + "loss": 0.8032, + "step": 670 + }, + { + "epoch": 0.036176407159801596, + "grad_norm": 1.0100011825561523, + "learning_rate": 9.998534627220646e-06, + "loss": 0.9205, + "step": 671 + }, + { + "epoch": 0.036230321328445114, + "grad_norm": 0.9306463599205017, + "learning_rate": 9.998529490323807e-06, + "loss": 0.8209, + "step": 672 + }, + { + "epoch": 0.03628423549708863, + "grad_norm": 1.8988754749298096, + "learning_rate": 9.998524344440286e-06, + "loss": 0.8455, + "step": 673 + }, + { + "epoch": 0.03633814966573216, + "grad_norm": 0.9742317795753479, + "learning_rate": 9.998519189570091e-06, + "loss": 0.8733, + "step": 674 + }, + { + "epoch": 0.036392063834375675, + "grad_norm": 0.9334224462509155, + "learning_rate": 9.998514025713234e-06, + "loss": 0.8761, + "step": 675 + }, + { + "epoch": 0.03644597800301919, + "grad_norm": 0.9729838371276855, + "learning_rate": 9.998508852869724e-06, + "loss": 0.8916, + "step": 676 + }, + { + "epoch": 0.03649989217166271, + "grad_norm": 0.9721505641937256, + "learning_rate": 9.998503671039568e-06, + "loss": 0.8735, + "step": 677 + }, + { + "epoch": 0.036553806340306236, + "grad_norm": 0.9600850939750671, + "learning_rate": 9.998498480222775e-06, + "loss": 0.9157, + "step": 678 + }, + { + "epoch": 0.036607720508949754, + "grad_norm": 0.9010732173919678, + "learning_rate": 9.998493280419358e-06, + "loss": 0.9215, + "step": 679 + }, + { + "epoch": 0.03666163467759327, + "grad_norm": 0.8708087801933289, + "learning_rate": 9.998488071629324e-06, + "loss": 0.7218, + "step": 680 + }, + { + "epoch": 0.03671554884623679, + "grad_norm": 0.9739180207252502, + "learning_rate": 9.998482853852682e-06, + "loss": 0.8845, + "step": 681 + }, + { + "epoch": 0.03676946301488031, + "grad_norm": 0.9823595881462097, + "learning_rate": 9.998477627089443e-06, + "loss": 0.896, + "step": 682 + }, + { + "epoch": 0.03682337718352383, + 
"grad_norm": 0.9629859328269958, + "learning_rate": 9.998472391339612e-06, + "loss": 0.8636, + "step": 683 + }, + { + "epoch": 0.03687729135216735, + "grad_norm": 0.8644251823425293, + "learning_rate": 9.998467146603206e-06, + "loss": 0.9124, + "step": 684 + }, + { + "epoch": 0.03693120552081087, + "grad_norm": 0.8987632989883423, + "learning_rate": 9.99846189288023e-06, + "loss": 0.801, + "step": 685 + }, + { + "epoch": 0.036985119689454386, + "grad_norm": 0.9017630219459534, + "learning_rate": 9.99845663017069e-06, + "loss": 0.8675, + "step": 686 + }, + { + "epoch": 0.03703903385809791, + "grad_norm": 0.8905850648880005, + "learning_rate": 9.998451358474603e-06, + "loss": 0.8512, + "step": 687 + }, + { + "epoch": 0.03709294802674143, + "grad_norm": 0.9807800650596619, + "learning_rate": 9.998446077791972e-06, + "loss": 0.9258, + "step": 688 + }, + { + "epoch": 0.03714686219538495, + "grad_norm": 0.8916336894035339, + "learning_rate": 9.99844078812281e-06, + "loss": 0.8236, + "step": 689 + }, + { + "epoch": 0.037200776364028465, + "grad_norm": 0.9330187439918518, + "learning_rate": 9.998435489467126e-06, + "loss": 0.7812, + "step": 690 + }, + { + "epoch": 0.03725469053267198, + "grad_norm": 0.9859142899513245, + "learning_rate": 9.99843018182493e-06, + "loss": 0.8699, + "step": 691 + }, + { + "epoch": 0.03730860470131551, + "grad_norm": 0.9277002215385437, + "learning_rate": 9.998424865196228e-06, + "loss": 0.9276, + "step": 692 + }, + { + "epoch": 0.037362518869959026, + "grad_norm": 0.9764281511306763, + "learning_rate": 9.998419539581034e-06, + "loss": 0.9482, + "step": 693 + }, + { + "epoch": 0.037416433038602544, + "grad_norm": 1.0108616352081299, + "learning_rate": 9.998414204979357e-06, + "loss": 0.8582, + "step": 694 + }, + { + "epoch": 0.03747034720724606, + "grad_norm": 1.2767362594604492, + "learning_rate": 9.998408861391202e-06, + "loss": 0.7833, + "step": 695 + }, + { + "epoch": 0.03752426137588959, + "grad_norm": 0.8874560594558716, + 
"learning_rate": 9.998403508816585e-06, + "loss": 0.8935, + "step": 696 + }, + { + "epoch": 0.037578175544533104, + "grad_norm": 0.8549458980560303, + "learning_rate": 9.998398147255511e-06, + "loss": 0.7747, + "step": 697 + }, + { + "epoch": 0.03763208971317662, + "grad_norm": 0.9971988201141357, + "learning_rate": 9.998392776707993e-06, + "loss": 0.753, + "step": 698 + }, + { + "epoch": 0.03768600388182014, + "grad_norm": 0.9822113513946533, + "learning_rate": 9.998387397174037e-06, + "loss": 0.9121, + "step": 699 + }, + { + "epoch": 0.037739918050463665, + "grad_norm": 0.996151864528656, + "learning_rate": 9.998382008653656e-06, + "loss": 0.9356, + "step": 700 + }, + { + "epoch": 0.03779383221910718, + "grad_norm": 1.7505156993865967, + "learning_rate": 9.998376611146857e-06, + "loss": 0.8351, + "step": 701 + }, + { + "epoch": 0.0378477463877507, + "grad_norm": 1.070356011390686, + "learning_rate": 9.998371204653651e-06, + "loss": 0.9153, + "step": 702 + }, + { + "epoch": 0.03790166055639422, + "grad_norm": 0.9383741617202759, + "learning_rate": 9.998365789174048e-06, + "loss": 0.8904, + "step": 703 + }, + { + "epoch": 0.03795557472503774, + "grad_norm": 0.8444882035255432, + "learning_rate": 9.998360364708058e-06, + "loss": 0.8243, + "step": 704 + }, + { + "epoch": 0.03800948889368126, + "grad_norm": 1.0012257099151611, + "learning_rate": 9.99835493125569e-06, + "loss": 0.9439, + "step": 705 + }, + { + "epoch": 0.03806340306232478, + "grad_norm": 0.9745193719863892, + "learning_rate": 9.998349488816954e-06, + "loss": 0.8667, + "step": 706 + }, + { + "epoch": 0.0381173172309683, + "grad_norm": 0.8363852500915527, + "learning_rate": 9.998344037391859e-06, + "loss": 0.8082, + "step": 707 + }, + { + "epoch": 0.038171231399611816, + "grad_norm": 0.9389918446540833, + "learning_rate": 9.998338576980417e-06, + "loss": 0.8113, + "step": 708 + }, + { + "epoch": 0.03822514556825534, + "grad_norm": 0.9216110110282898, + "learning_rate": 9.998333107582635e-06, + "loss": 
0.8179, + "step": 709 + }, + { + "epoch": 0.03827905973689886, + "grad_norm": 1.0292471647262573, + "learning_rate": 9.998327629198526e-06, + "loss": 0.8605, + "step": 710 + }, + { + "epoch": 0.03833297390554238, + "grad_norm": 0.9812708497047424, + "learning_rate": 9.998322141828097e-06, + "loss": 0.9279, + "step": 711 + }, + { + "epoch": 0.038386888074185894, + "grad_norm": 0.8186620473861694, + "learning_rate": 9.998316645471358e-06, + "loss": 0.7877, + "step": 712 + }, + { + "epoch": 0.03844080224282941, + "grad_norm": 1.034134864807129, + "learning_rate": 9.99831114012832e-06, + "loss": 0.9867, + "step": 713 + }, + { + "epoch": 0.03849471641147294, + "grad_norm": 1.1604938507080078, + "learning_rate": 9.998305625798993e-06, + "loss": 0.9134, + "step": 714 + }, + { + "epoch": 0.038548630580116455, + "grad_norm": 0.8452483415603638, + "learning_rate": 9.998300102483388e-06, + "loss": 0.8732, + "step": 715 + }, + { + "epoch": 0.03860254474875997, + "grad_norm": 0.8881269693374634, + "learning_rate": 9.998294570181512e-06, + "loss": 0.847, + "step": 716 + }, + { + "epoch": 0.03865645891740349, + "grad_norm": 0.8822013735771179, + "learning_rate": 9.998289028893375e-06, + "loss": 0.8404, + "step": 717 + }, + { + "epoch": 0.038710373086047016, + "grad_norm": 1.0011916160583496, + "learning_rate": 9.998283478618991e-06, + "loss": 0.8133, + "step": 718 + }, + { + "epoch": 0.038764287254690534, + "grad_norm": 1.0004018545150757, + "learning_rate": 9.998277919358367e-06, + "loss": 0.9556, + "step": 719 + }, + { + "epoch": 0.03881820142333405, + "grad_norm": 0.8176954984664917, + "learning_rate": 9.998272351111513e-06, + "loss": 0.7977, + "step": 720 + }, + { + "epoch": 0.03887211559197757, + "grad_norm": 0.9160690307617188, + "learning_rate": 9.99826677387844e-06, + "loss": 0.9239, + "step": 721 + }, + { + "epoch": 0.03892602976062109, + "grad_norm": 1.2158405780792236, + "learning_rate": 9.998261187659157e-06, + "loss": 0.9023, + "step": 722 + }, + { + "epoch": 
0.03897994392926461, + "grad_norm": 0.9564448595046997, + "learning_rate": 9.998255592453674e-06, + "loss": 0.8585, + "step": 723 + }, + { + "epoch": 0.03903385809790813, + "grad_norm": 0.8902252316474915, + "learning_rate": 9.998249988262002e-06, + "loss": 0.8388, + "step": 724 + }, + { + "epoch": 0.03908777226655165, + "grad_norm": 0.8738620281219482, + "learning_rate": 9.998244375084152e-06, + "loss": 0.9545, + "step": 725 + }, + { + "epoch": 0.03914168643519517, + "grad_norm": 0.9670735001564026, + "learning_rate": 9.99823875292013e-06, + "loss": 0.8335, + "step": 726 + }, + { + "epoch": 0.03919560060383869, + "grad_norm": 0.8719429969787598, + "learning_rate": 9.998233121769952e-06, + "loss": 0.8546, + "step": 727 + }, + { + "epoch": 0.03924951477248221, + "grad_norm": 1.318429708480835, + "learning_rate": 9.998227481633622e-06, + "loss": 1.0658, + "step": 728 + }, + { + "epoch": 0.03930342894112573, + "grad_norm": 0.962630569934845, + "learning_rate": 9.998221832511155e-06, + "loss": 0.9049, + "step": 729 + }, + { + "epoch": 0.039357343109769245, + "grad_norm": 0.9639857411384583, + "learning_rate": 9.998216174402558e-06, + "loss": 0.9114, + "step": 730 + }, + { + "epoch": 0.03941125727841277, + "grad_norm": 1.1621571779251099, + "learning_rate": 9.998210507307843e-06, + "loss": 0.8776, + "step": 731 + }, + { + "epoch": 0.03946517144705629, + "grad_norm": 1.170089840888977, + "learning_rate": 9.998204831227019e-06, + "loss": 0.9928, + "step": 732 + }, + { + "epoch": 0.039519085615699806, + "grad_norm": 0.8257297873497009, + "learning_rate": 9.998199146160098e-06, + "loss": 0.7885, + "step": 733 + }, + { + "epoch": 0.039572999784343324, + "grad_norm": 0.8887513279914856, + "learning_rate": 9.998193452107088e-06, + "loss": 0.8389, + "step": 734 + }, + { + "epoch": 0.03962691395298684, + "grad_norm": 0.9321185350418091, + "learning_rate": 9.998187749068001e-06, + "loss": 0.9083, + "step": 735 + }, + { + "epoch": 0.03968082812163037, + "grad_norm": 
0.9926772713661194, + "learning_rate": 9.998182037042847e-06, + "loss": 0.9102, + "step": 736 + }, + { + "epoch": 0.039734742290273885, + "grad_norm": 1.0760009288787842, + "learning_rate": 9.998176316031634e-06, + "loss": 0.7781, + "step": 737 + }, + { + "epoch": 0.0397886564589174, + "grad_norm": 1.0998133420944214, + "learning_rate": 9.998170586034376e-06, + "loss": 0.9725, + "step": 738 + }, + { + "epoch": 0.03984257062756092, + "grad_norm": 0.9367475509643555, + "learning_rate": 9.99816484705108e-06, + "loss": 0.8277, + "step": 739 + }, + { + "epoch": 0.039896484796204446, + "grad_norm": 0.942954957485199, + "learning_rate": 9.998159099081758e-06, + "loss": 0.8542, + "step": 740 + }, + { + "epoch": 0.039950398964847963, + "grad_norm": 0.9841166138648987, + "learning_rate": 9.998153342126421e-06, + "loss": 0.9179, + "step": 741 + }, + { + "epoch": 0.04000431313349148, + "grad_norm": 0.9215245246887207, + "learning_rate": 9.998147576185077e-06, + "loss": 0.8899, + "step": 742 + }, + { + "epoch": 0.040058227302135, + "grad_norm": 1.0368192195892334, + "learning_rate": 9.998141801257739e-06, + "loss": 0.9828, + "step": 743 + }, + { + "epoch": 0.04011214147077852, + "grad_norm": 0.9696660041809082, + "learning_rate": 9.998136017344416e-06, + "loss": 0.9431, + "step": 744 + }, + { + "epoch": 0.04016605563942204, + "grad_norm": 1.111257791519165, + "learning_rate": 9.998130224445117e-06, + "loss": 0.9666, + "step": 745 + }, + { + "epoch": 0.04021996980806556, + "grad_norm": 0.9260644316673279, + "learning_rate": 9.998124422559856e-06, + "loss": 0.8941, + "step": 746 + }, + { + "epoch": 0.04027388397670908, + "grad_norm": 0.8622020483016968, + "learning_rate": 9.99811861168864e-06, + "loss": 0.8148, + "step": 747 + }, + { + "epoch": 0.040327798145352596, + "grad_norm": 0.8767471313476562, + "learning_rate": 9.998112791831483e-06, + "loss": 0.7093, + "step": 748 + }, + { + "epoch": 0.04038171231399612, + "grad_norm": 0.902917206287384, + "learning_rate": 
9.998106962988391e-06, + "loss": 0.7677, + "step": 749 + }, + { + "epoch": 0.04043562648263964, + "grad_norm": 1.351694941520691, + "learning_rate": 9.998101125159377e-06, + "loss": 1.0382, + "step": 750 + }, + { + "epoch": 0.04048954065128316, + "grad_norm": 0.8547930121421814, + "learning_rate": 9.998095278344452e-06, + "loss": 0.7974, + "step": 751 + }, + { + "epoch": 0.040543454819926675, + "grad_norm": 0.941149115562439, + "learning_rate": 9.998089422543626e-06, + "loss": 0.8518, + "step": 752 + }, + { + "epoch": 0.0405973689885702, + "grad_norm": 0.8671521544456482, + "learning_rate": 9.998083557756908e-06, + "loss": 0.8049, + "step": 753 + }, + { + "epoch": 0.04065128315721372, + "grad_norm": 0.9877942800521851, + "learning_rate": 9.998077683984311e-06, + "loss": 0.8874, + "step": 754 + }, + { + "epoch": 0.040705197325857236, + "grad_norm": 1.2130393981933594, + "learning_rate": 9.998071801225843e-06, + "loss": 0.9794, + "step": 755 + }, + { + "epoch": 0.040759111494500753, + "grad_norm": 0.9422823786735535, + "learning_rate": 9.998065909481518e-06, + "loss": 0.899, + "step": 756 + }, + { + "epoch": 0.04081302566314427, + "grad_norm": 0.9770492911338806, + "learning_rate": 9.998060008751343e-06, + "loss": 0.8434, + "step": 757 + }, + { + "epoch": 0.040866939831787796, + "grad_norm": 0.9227531552314758, + "learning_rate": 9.998054099035332e-06, + "loss": 0.8797, + "step": 758 + }, + { + "epoch": 0.040920854000431314, + "grad_norm": 1.0452102422714233, + "learning_rate": 9.998048180333492e-06, + "loss": 0.8702, + "step": 759 + }, + { + "epoch": 0.04097476816907483, + "grad_norm": 1.034125566482544, + "learning_rate": 9.998042252645837e-06, + "loss": 0.9041, + "step": 760 + }, + { + "epoch": 0.04102868233771835, + "grad_norm": 0.886029064655304, + "learning_rate": 9.998036315972375e-06, + "loss": 0.7805, + "step": 761 + }, + { + "epoch": 0.041082596506361875, + "grad_norm": 0.9845888614654541, + "learning_rate": 9.998030370313116e-06, + "loss": 0.9836, + 
"step": 762 + }, + { + "epoch": 0.04113651067500539, + "grad_norm": 0.9223973155021667, + "learning_rate": 9.998024415668075e-06, + "loss": 0.768, + "step": 763 + }, + { + "epoch": 0.04119042484364891, + "grad_norm": 1.0607362985610962, + "learning_rate": 9.99801845203726e-06, + "loss": 0.865, + "step": 764 + }, + { + "epoch": 0.04124433901229243, + "grad_norm": 0.9620907306671143, + "learning_rate": 9.998012479420683e-06, + "loss": 0.7645, + "step": 765 + }, + { + "epoch": 0.04129825318093595, + "grad_norm": 0.9490310549736023, + "learning_rate": 9.99800649781835e-06, + "loss": 0.9124, + "step": 766 + }, + { + "epoch": 0.04135216734957947, + "grad_norm": 0.9684557914733887, + "learning_rate": 9.99800050723028e-06, + "loss": 0.876, + "step": 767 + }, + { + "epoch": 0.04140608151822299, + "grad_norm": 0.9633080959320068, + "learning_rate": 9.997994507656476e-06, + "loss": 0.8976, + "step": 768 + }, + { + "epoch": 0.04145999568686651, + "grad_norm": 0.9495208263397217, + "learning_rate": 9.997988499096953e-06, + "loss": 0.9049, + "step": 769 + }, + { + "epoch": 0.041513909855510026, + "grad_norm": 1.0614326000213623, + "learning_rate": 9.997982481551721e-06, + "loss": 0.905, + "step": 770 + }, + { + "epoch": 0.04156782402415355, + "grad_norm": 0.820672869682312, + "learning_rate": 9.99797645502079e-06, + "loss": 0.8306, + "step": 771 + }, + { + "epoch": 0.04162173819279707, + "grad_norm": 0.9719771146774292, + "learning_rate": 9.997970419504171e-06, + "loss": 0.828, + "step": 772 + }, + { + "epoch": 0.041675652361440586, + "grad_norm": 0.893326997756958, + "learning_rate": 9.997964375001875e-06, + "loss": 0.8416, + "step": 773 + }, + { + "epoch": 0.041729566530084104, + "grad_norm": 0.858121395111084, + "learning_rate": 9.997958321513915e-06, + "loss": 0.8779, + "step": 774 + }, + { + "epoch": 0.04178348069872762, + "grad_norm": 0.9703636765480042, + "learning_rate": 9.997952259040297e-06, + "loss": 0.8623, + "step": 775 + }, + { + "epoch": 0.04183739486737115, + 
"grad_norm": 0.9626398086547852, + "learning_rate": 9.997946187581039e-06, + "loss": 0.8309, + "step": 776 + }, + { + "epoch": 0.041891309036014665, + "grad_norm": 0.9132344722747803, + "learning_rate": 9.997940107136143e-06, + "loss": 0.8798, + "step": 777 + }, + { + "epoch": 0.04194522320465818, + "grad_norm": 0.9608821272850037, + "learning_rate": 9.997934017705629e-06, + "loss": 0.8764, + "step": 778 + }, + { + "epoch": 0.0419991373733017, + "grad_norm": 1.0852513313293457, + "learning_rate": 9.997927919289501e-06, + "loss": 0.8908, + "step": 779 + }, + { + "epoch": 0.042053051541945226, + "grad_norm": 0.9690573215484619, + "learning_rate": 9.997921811887774e-06, + "loss": 0.8556, + "step": 780 + }, + { + "epoch": 0.042106965710588744, + "grad_norm": 0.9107050895690918, + "learning_rate": 9.997915695500458e-06, + "loss": 0.9249, + "step": 781 + }, + { + "epoch": 0.04216087987923226, + "grad_norm": 1.029974102973938, + "learning_rate": 9.997909570127564e-06, + "loss": 0.8369, + "step": 782 + }, + { + "epoch": 0.04221479404787578, + "grad_norm": 0.8179258704185486, + "learning_rate": 9.997903435769101e-06, + "loss": 0.7729, + "step": 783 + }, + { + "epoch": 0.042268708216519305, + "grad_norm": 1.0664961338043213, + "learning_rate": 9.997897292425082e-06, + "loss": 0.8815, + "step": 784 + }, + { + "epoch": 0.04232262238516282, + "grad_norm": 0.9794465899467468, + "learning_rate": 9.997891140095519e-06, + "loss": 0.9244, + "step": 785 + }, + { + "epoch": 0.04237653655380634, + "grad_norm": 0.875953197479248, + "learning_rate": 9.99788497878042e-06, + "loss": 0.9191, + "step": 786 + }, + { + "epoch": 0.04243045072244986, + "grad_norm": 0.9880902767181396, + "learning_rate": 9.9978788084798e-06, + "loss": 0.8639, + "step": 787 + }, + { + "epoch": 0.042484364891093376, + "grad_norm": 1.0391566753387451, + "learning_rate": 9.997872629193666e-06, + "loss": 0.9943, + "step": 788 + }, + { + "epoch": 0.0425382790597369, + "grad_norm": 0.9321290850639343, + "learning_rate": 
9.997866440922033e-06, + "loss": 0.7809, + "step": 789 + }, + { + "epoch": 0.04259219322838042, + "grad_norm": 0.8898556232452393, + "learning_rate": 9.99786024366491e-06, + "loss": 0.9353, + "step": 790 + }, + { + "epoch": 0.04264610739702394, + "grad_norm": 1.1177983283996582, + "learning_rate": 9.997854037422306e-06, + "loss": 0.8157, + "step": 791 + }, + { + "epoch": 0.042700021565667455, + "grad_norm": 0.8821296691894531, + "learning_rate": 9.997847822194236e-06, + "loss": 0.8729, + "step": 792 + }, + { + "epoch": 0.04275393573431098, + "grad_norm": 0.8545325398445129, + "learning_rate": 9.997841597980709e-06, + "loss": 0.8415, + "step": 793 + }, + { + "epoch": 0.0428078499029545, + "grad_norm": 0.9313606023788452, + "learning_rate": 9.997835364781739e-06, + "loss": 0.8411, + "step": 794 + }, + { + "epoch": 0.042861764071598016, + "grad_norm": 0.9587781429290771, + "learning_rate": 9.997829122597332e-06, + "loss": 0.8086, + "step": 795 + }, + { + "epoch": 0.042915678240241534, + "grad_norm": 0.9708360433578491, + "learning_rate": 9.997822871427504e-06, + "loss": 0.8715, + "step": 796 + }, + { + "epoch": 0.04296959240888505, + "grad_norm": 0.8868080973625183, + "learning_rate": 9.997816611272265e-06, + "loss": 0.8549, + "step": 797 + }, + { + "epoch": 0.04302350657752858, + "grad_norm": 0.9147778153419495, + "learning_rate": 9.997810342131624e-06, + "loss": 0.7854, + "step": 798 + }, + { + "epoch": 0.043077420746172095, + "grad_norm": 0.9853960275650024, + "learning_rate": 9.997804064005596e-06, + "loss": 0.8243, + "step": 799 + }, + { + "epoch": 0.04313133491481561, + "grad_norm": 1.0076130628585815, + "learning_rate": 9.997797776894189e-06, + "loss": 0.9077, + "step": 800 + }, + { + "epoch": 0.04318524908345913, + "grad_norm": 0.9694076776504517, + "learning_rate": 9.997791480797417e-06, + "loss": 0.8767, + "step": 801 + }, + { + "epoch": 0.043239163252102655, + "grad_norm": 1.114001750946045, + "learning_rate": 9.99778517571529e-06, + "loss": 0.8211, + 
"step": 802 + }, + { + "epoch": 0.04329307742074617, + "grad_norm": 0.9701128005981445, + "learning_rate": 9.997778861647817e-06, + "loss": 0.9084, + "step": 803 + }, + { + "epoch": 0.04334699158938969, + "grad_norm": 0.868299126625061, + "learning_rate": 9.997772538595015e-06, + "loss": 0.7556, + "step": 804 + }, + { + "epoch": 0.04340090575803321, + "grad_norm": 0.9160446524620056, + "learning_rate": 9.997766206556888e-06, + "loss": 0.821, + "step": 805 + }, + { + "epoch": 0.043454819926676734, + "grad_norm": 0.934198260307312, + "learning_rate": 9.997759865533454e-06, + "loss": 0.9113, + "step": 806 + }, + { + "epoch": 0.04350873409532025, + "grad_norm": 0.8949079513549805, + "learning_rate": 9.997753515524722e-06, + "loss": 0.7821, + "step": 807 + }, + { + "epoch": 0.04356264826396377, + "grad_norm": 0.9035944938659668, + "learning_rate": 9.997747156530702e-06, + "loss": 0.8233, + "step": 808 + }, + { + "epoch": 0.04361656243260729, + "grad_norm": 0.9681552052497864, + "learning_rate": 9.99774078855141e-06, + "loss": 0.9241, + "step": 809 + }, + { + "epoch": 0.043670476601250806, + "grad_norm": 0.906092643737793, + "learning_rate": 9.99773441158685e-06, + "loss": 0.8948, + "step": 810 + }, + { + "epoch": 0.04372439076989433, + "grad_norm": 0.9229143261909485, + "learning_rate": 9.997728025637039e-06, + "loss": 0.8897, + "step": 811 + }, + { + "epoch": 0.04377830493853785, + "grad_norm": 0.9263061881065369, + "learning_rate": 9.997721630701986e-06, + "loss": 0.7923, + "step": 812 + }, + { + "epoch": 0.04383221910718137, + "grad_norm": 0.8474372029304504, + "learning_rate": 9.997715226781706e-06, + "loss": 0.796, + "step": 813 + }, + { + "epoch": 0.043886133275824885, + "grad_norm": 0.9960548877716064, + "learning_rate": 9.997708813876206e-06, + "loss": 0.9166, + "step": 814 + }, + { + "epoch": 0.04394004744446841, + "grad_norm": 0.9843032956123352, + "learning_rate": 9.997702391985499e-06, + "loss": 0.9354, + "step": 815 + }, + { + "epoch": 0.04399396161311193, 
+ "grad_norm": 0.9313154220581055, + "learning_rate": 9.997695961109599e-06, + "loss": 0.8972, + "step": 816 + }, + { + "epoch": 0.044047875781755445, + "grad_norm": 0.8846973180770874, + "learning_rate": 9.997689521248515e-06, + "loss": 0.8599, + "step": 817 + }, + { + "epoch": 0.04410178995039896, + "grad_norm": 0.8113641738891602, + "learning_rate": 9.99768307240226e-06, + "loss": 0.8509, + "step": 818 + }, + { + "epoch": 0.04415570411904248, + "grad_norm": 1.0659984350204468, + "learning_rate": 9.997676614570844e-06, + "loss": 0.938, + "step": 819 + }, + { + "epoch": 0.044209618287686006, + "grad_norm": 0.9183745384216309, + "learning_rate": 9.99767014775428e-06, + "loss": 0.8761, + "step": 820 + }, + { + "epoch": 0.044263532456329524, + "grad_norm": 0.87090003490448, + "learning_rate": 9.997663671952578e-06, + "loss": 0.8535, + "step": 821 + }, + { + "epoch": 0.04431744662497304, + "grad_norm": 0.9857214093208313, + "learning_rate": 9.997657187165753e-06, + "loss": 0.9434, + "step": 822 + }, + { + "epoch": 0.04437136079361656, + "grad_norm": 1.0443209409713745, + "learning_rate": 9.997650693393812e-06, + "loss": 0.8994, + "step": 823 + }, + { + "epoch": 0.044425274962260085, + "grad_norm": 0.8348391652107239, + "learning_rate": 9.99764419063677e-06, + "loss": 0.8383, + "step": 824 + }, + { + "epoch": 0.0444791891309036, + "grad_norm": 1.2708821296691895, + "learning_rate": 9.997637678894639e-06, + "loss": 0.8733, + "step": 825 + }, + { + "epoch": 0.04453310329954712, + "grad_norm": 0.9863126277923584, + "learning_rate": 9.997631158167428e-06, + "loss": 0.9364, + "step": 826 + }, + { + "epoch": 0.04458701746819064, + "grad_norm": 1.0223352909088135, + "learning_rate": 9.99762462845515e-06, + "loss": 0.9139, + "step": 827 + }, + { + "epoch": 0.04464093163683416, + "grad_norm": 0.8559738397598267, + "learning_rate": 9.997618089757818e-06, + "loss": 0.7461, + "step": 828 + }, + { + "epoch": 0.04469484580547768, + "grad_norm": 0.9347368478775024, + "learning_rate": 
9.997611542075442e-06, + "loss": 0.9275, + "step": 829 + }, + { + "epoch": 0.0447487599741212, + "grad_norm": 1.0208019018173218, + "learning_rate": 9.997604985408036e-06, + "loss": 0.8338, + "step": 830 + }, + { + "epoch": 0.04480267414276472, + "grad_norm": 0.9792174100875854, + "learning_rate": 9.997598419755607e-06, + "loss": 0.9437, + "step": 831 + }, + { + "epoch": 0.044856588311408235, + "grad_norm": 0.851665198802948, + "learning_rate": 9.997591845118173e-06, + "loss": 0.8008, + "step": 832 + }, + { + "epoch": 0.04491050248005176, + "grad_norm": 0.9315025806427002, + "learning_rate": 9.997585261495742e-06, + "loss": 0.8389, + "step": 833 + }, + { + "epoch": 0.04496441664869528, + "grad_norm": 0.9658921360969543, + "learning_rate": 9.997578668888326e-06, + "loss": 0.9252, + "step": 834 + }, + { + "epoch": 0.045018330817338796, + "grad_norm": 0.8989397287368774, + "learning_rate": 9.997572067295938e-06, + "loss": 0.8648, + "step": 835 + }, + { + "epoch": 0.045072244985982314, + "grad_norm": 0.8874988555908203, + "learning_rate": 9.99756545671859e-06, + "loss": 0.7801, + "step": 836 + }, + { + "epoch": 0.04512615915462584, + "grad_norm": 0.9186223745346069, + "learning_rate": 9.997558837156293e-06, + "loss": 0.767, + "step": 837 + }, + { + "epoch": 0.04518007332326936, + "grad_norm": 1.163044810295105, + "learning_rate": 9.997552208609059e-06, + "loss": 0.8938, + "step": 838 + }, + { + "epoch": 0.045233987491912875, + "grad_norm": 0.8315468430519104, + "learning_rate": 9.997545571076901e-06, + "loss": 0.725, + "step": 839 + }, + { + "epoch": 0.04528790166055639, + "grad_norm": 1.0088660717010498, + "learning_rate": 9.99753892455983e-06, + "loss": 0.8533, + "step": 840 + }, + { + "epoch": 0.04534181582919991, + "grad_norm": 0.9268692135810852, + "learning_rate": 9.997532269057857e-06, + "loss": 0.8739, + "step": 841 + }, + { + "epoch": 0.045395729997843436, + "grad_norm": 1.0793242454528809, + "learning_rate": 9.997525604570995e-06, + "loss": 0.9605, + "step": 
842 + }, + { + "epoch": 0.045449644166486954, + "grad_norm": 1.101798176765442, + "learning_rate": 9.997518931099258e-06, + "loss": 0.9525, + "step": 843 + }, + { + "epoch": 0.04550355833513047, + "grad_norm": 0.9046466946601868, + "learning_rate": 9.997512248642654e-06, + "loss": 0.8853, + "step": 844 + }, + { + "epoch": 0.04555747250377399, + "grad_norm": 0.9629097580909729, + "learning_rate": 9.997505557201198e-06, + "loss": 0.8882, + "step": 845 + }, + { + "epoch": 0.045611386672417514, + "grad_norm": 1.1880977153778076, + "learning_rate": 9.997498856774898e-06, + "loss": 0.8812, + "step": 846 + }, + { + "epoch": 0.04566530084106103, + "grad_norm": 0.8678451180458069, + "learning_rate": 9.997492147363772e-06, + "loss": 0.887, + "step": 847 + }, + { + "epoch": 0.04571921500970455, + "grad_norm": 1.3359739780426025, + "learning_rate": 9.99748542896783e-06, + "loss": 0.8141, + "step": 848 + }, + { + "epoch": 0.04577312917834807, + "grad_norm": 0.9263296127319336, + "learning_rate": 9.99747870158708e-06, + "loss": 0.9357, + "step": 849 + }, + { + "epoch": 0.045827043346991586, + "grad_norm": 0.9199776649475098, + "learning_rate": 9.997471965221541e-06, + "loss": 0.8352, + "step": 850 + }, + { + "epoch": 0.04588095751563511, + "grad_norm": 0.8880730867385864, + "learning_rate": 9.997465219871218e-06, + "loss": 0.7802, + "step": 851 + }, + { + "epoch": 0.04593487168427863, + "grad_norm": 0.8561250567436218, + "learning_rate": 9.99745846553613e-06, + "loss": 0.7987, + "step": 852 + }, + { + "epoch": 0.04598878585292215, + "grad_norm": 0.8975661396980286, + "learning_rate": 9.997451702216283e-06, + "loss": 0.8325, + "step": 853 + }, + { + "epoch": 0.046042700021565665, + "grad_norm": 0.9350215196609497, + "learning_rate": 9.997444929911693e-06, + "loss": 0.7708, + "step": 854 + }, + { + "epoch": 0.04609661419020919, + "grad_norm": 1.0229014158248901, + "learning_rate": 9.99743814862237e-06, + "loss": 0.9643, + "step": 855 + }, + { + "epoch": 0.04615052835885271, + 
"grad_norm": 0.9249217510223389, + "learning_rate": 9.997431358348329e-06, + "loss": 0.8411, + "step": 856 + }, + { + "epoch": 0.046204442527496226, + "grad_norm": 0.9823042154312134, + "learning_rate": 9.99742455908958e-06, + "loss": 0.9406, + "step": 857 + }, + { + "epoch": 0.046258356696139744, + "grad_norm": 1.2525794506072998, + "learning_rate": 9.997417750846134e-06, + "loss": 0.8507, + "step": 858 + }, + { + "epoch": 0.04631227086478327, + "grad_norm": 0.9583309888839722, + "learning_rate": 9.997410933618006e-06, + "loss": 0.8504, + "step": 859 + }, + { + "epoch": 0.046366185033426786, + "grad_norm": 0.9264401793479919, + "learning_rate": 9.997404107405207e-06, + "loss": 0.8595, + "step": 860 + }, + { + "epoch": 0.046420099202070304, + "grad_norm": 0.9833316206932068, + "learning_rate": 9.99739727220775e-06, + "loss": 0.9025, + "step": 861 + }, + { + "epoch": 0.04647401337071382, + "grad_norm": 1.0220664739608765, + "learning_rate": 9.997390428025645e-06, + "loss": 0.8671, + "step": 862 + }, + { + "epoch": 0.04652792753935734, + "grad_norm": 1.0774664878845215, + "learning_rate": 9.997383574858908e-06, + "loss": 0.8463, + "step": 863 + }, + { + "epoch": 0.046581841708000865, + "grad_norm": 0.8821879029273987, + "learning_rate": 9.997376712707547e-06, + "loss": 0.7565, + "step": 864 + }, + { + "epoch": 0.04663575587664438, + "grad_norm": 0.9233925938606262, + "learning_rate": 9.997369841571577e-06, + "loss": 0.9151, + "step": 865 + }, + { + "epoch": 0.0466896700452879, + "grad_norm": 1.0006109476089478, + "learning_rate": 9.997362961451015e-06, + "loss": 0.8339, + "step": 866 + }, + { + "epoch": 0.04674358421393142, + "grad_norm": 0.865035891532898, + "learning_rate": 9.997356072345863e-06, + "loss": 0.8997, + "step": 867 + }, + { + "epoch": 0.046797498382574944, + "grad_norm": 1.0450654029846191, + "learning_rate": 9.99734917425614e-06, + "loss": 0.7966, + "step": 868 + }, + { + "epoch": 0.04685141255121846, + "grad_norm": 0.8878824710845947, + 
"learning_rate": 9.997342267181857e-06, + "loss": 0.831, + "step": 869 + }, + { + "epoch": 0.04690532671986198, + "grad_norm": 1.0056546926498413, + "learning_rate": 9.997335351123028e-06, + "loss": 0.8178, + "step": 870 + }, + { + "epoch": 0.0469592408885055, + "grad_norm": 1.0531659126281738, + "learning_rate": 9.997328426079661e-06, + "loss": 0.7773, + "step": 871 + }, + { + "epoch": 0.047013155057149016, + "grad_norm": 0.911021888256073, + "learning_rate": 9.997321492051775e-06, + "loss": 0.9001, + "step": 872 + }, + { + "epoch": 0.04706706922579254, + "grad_norm": 0.920103132724762, + "learning_rate": 9.997314549039379e-06, + "loss": 0.7222, + "step": 873 + }, + { + "epoch": 0.04712098339443606, + "grad_norm": 0.9449265599250793, + "learning_rate": 9.997307597042483e-06, + "loss": 0.9197, + "step": 874 + }, + { + "epoch": 0.047174897563079576, + "grad_norm": 1.013066291809082, + "learning_rate": 9.997300636061103e-06, + "loss": 0.8854, + "step": 875 + }, + { + "epoch": 0.047228811731723094, + "grad_norm": 0.8990256786346436, + "learning_rate": 9.99729366609525e-06, + "loss": 0.81, + "step": 876 + }, + { + "epoch": 0.04728272590036662, + "grad_norm": 1.0211769342422485, + "learning_rate": 9.997286687144938e-06, + "loss": 0.8335, + "step": 877 + }, + { + "epoch": 0.04733664006901014, + "grad_norm": 1.14606773853302, + "learning_rate": 9.997279699210178e-06, + "loss": 1.0956, + "step": 878 + }, + { + "epoch": 0.047390554237653655, + "grad_norm": 0.982725977897644, + "learning_rate": 9.997272702290981e-06, + "loss": 0.8289, + "step": 879 + }, + { + "epoch": 0.04744446840629717, + "grad_norm": 0.8667361736297607, + "learning_rate": 9.997265696387364e-06, + "loss": 0.8056, + "step": 880 + }, + { + "epoch": 0.04749838257494069, + "grad_norm": 0.9029837250709534, + "learning_rate": 9.997258681499338e-06, + "loss": 0.8461, + "step": 881 + }, + { + "epoch": 0.047552296743584216, + "grad_norm": 0.8767060041427612, + "learning_rate": 9.997251657626915e-06, + "loss": 
0.8162, + "step": 882 + }, + { + "epoch": 0.047606210912227734, + "grad_norm": 1.4750713109970093, + "learning_rate": 9.997244624770104e-06, + "loss": 0.8677, + "step": 883 + }, + { + "epoch": 0.04766012508087125, + "grad_norm": 1.001286506652832, + "learning_rate": 9.997237582928924e-06, + "loss": 0.7673, + "step": 884 + }, + { + "epoch": 0.04771403924951477, + "grad_norm": 0.9560269713401794, + "learning_rate": 9.997230532103384e-06, + "loss": 0.8597, + "step": 885 + }, + { + "epoch": 0.047767953418158295, + "grad_norm": 0.834237277507782, + "learning_rate": 9.997223472293499e-06, + "loss": 0.7629, + "step": 886 + }, + { + "epoch": 0.04782186758680181, + "grad_norm": 0.9642406702041626, + "learning_rate": 9.997216403499278e-06, + "loss": 0.83, + "step": 887 + }, + { + "epoch": 0.04787578175544533, + "grad_norm": 1.2931480407714844, + "learning_rate": 9.997209325720736e-06, + "loss": 1.0333, + "step": 888 + }, + { + "epoch": 0.04792969592408885, + "grad_norm": 0.8024531602859497, + "learning_rate": 9.997202238957886e-06, + "loss": 0.7166, + "step": 889 + }, + { + "epoch": 0.04798361009273237, + "grad_norm": 0.9585899710655212, + "learning_rate": 9.997195143210741e-06, + "loss": 0.8099, + "step": 890 + }, + { + "epoch": 0.04803752426137589, + "grad_norm": 0.9917063117027283, + "learning_rate": 9.997188038479313e-06, + "loss": 0.8486, + "step": 891 + }, + { + "epoch": 0.04809143843001941, + "grad_norm": 1.6290080547332764, + "learning_rate": 9.997180924763616e-06, + "loss": 0.863, + "step": 892 + }, + { + "epoch": 0.04814535259866293, + "grad_norm": 0.9488585591316223, + "learning_rate": 9.99717380206366e-06, + "loss": 0.8277, + "step": 893 + }, + { + "epoch": 0.048199266767306445, + "grad_norm": 1.0710817575454712, + "learning_rate": 9.997166670379459e-06, + "loss": 0.8898, + "step": 894 + }, + { + "epoch": 0.04825318093594997, + "grad_norm": 0.9916248917579651, + "learning_rate": 9.997159529711026e-06, + "loss": 0.9144, + "step": 895 + }, + { + "epoch": 
0.04830709510459349, + "grad_norm": 1.0074565410614014, + "learning_rate": 9.997152380058378e-06, + "loss": 0.8391, + "step": 896 + }, + { + "epoch": 0.048361009273237006, + "grad_norm": 1.0258312225341797, + "learning_rate": 9.99714522142152e-06, + "loss": 0.973, + "step": 897 + }, + { + "epoch": 0.048414923441880524, + "grad_norm": 0.9497826099395752, + "learning_rate": 9.99713805380047e-06, + "loss": 0.9221, + "step": 898 + }, + { + "epoch": 0.04846883761052405, + "grad_norm": 0.9103115200996399, + "learning_rate": 9.99713087719524e-06, + "loss": 0.7942, + "step": 899 + }, + { + "epoch": 0.04852275177916757, + "grad_norm": 0.9810470938682556, + "learning_rate": 9.997123691605843e-06, + "loss": 0.8673, + "step": 900 + }, + { + "epoch": 0.048576665947811085, + "grad_norm": 1.0422937870025635, + "learning_rate": 9.997116497032291e-06, + "loss": 0.9263, + "step": 901 + }, + { + "epoch": 0.0486305801164546, + "grad_norm": 0.8522017002105713, + "learning_rate": 9.997109293474596e-06, + "loss": 0.8296, + "step": 902 + }, + { + "epoch": 0.04868449428509812, + "grad_norm": 0.818270742893219, + "learning_rate": 9.997102080932775e-06, + "loss": 0.7898, + "step": 903 + }, + { + "epoch": 0.048738408453741645, + "grad_norm": 0.9286766648292542, + "learning_rate": 9.997094859406838e-06, + "loss": 0.8751, + "step": 904 + }, + { + "epoch": 0.04879232262238516, + "grad_norm": 1.0779087543487549, + "learning_rate": 9.997087628896797e-06, + "loss": 0.8377, + "step": 905 + }, + { + "epoch": 0.04884623679102868, + "grad_norm": 0.8711867928504944, + "learning_rate": 9.997080389402667e-06, + "loss": 0.8547, + "step": 906 + }, + { + "epoch": 0.0489001509596722, + "grad_norm": 0.8919721245765686, + "learning_rate": 9.99707314092446e-06, + "loss": 0.8178, + "step": 907 + }, + { + "epoch": 0.048954065128315724, + "grad_norm": 0.9084917306900024, + "learning_rate": 9.997065883462192e-06, + "loss": 0.8618, + "step": 908 + }, + { + "epoch": 0.04900797929695924, + "grad_norm": 
0.869216799736023, + "learning_rate": 9.997058617015871e-06, + "loss": 0.8636, + "step": 909 + }, + { + "epoch": 0.04906189346560276, + "grad_norm": 0.9376553893089294, + "learning_rate": 9.997051341585513e-06, + "loss": 0.8986, + "step": 910 + }, + { + "epoch": 0.04911580763424628, + "grad_norm": 0.9041107892990112, + "learning_rate": 9.99704405717113e-06, + "loss": 0.817, + "step": 911 + }, + { + "epoch": 0.0491697218028898, + "grad_norm": 0.9530431628227234, + "learning_rate": 9.997036763772737e-06, + "loss": 0.9464, + "step": 912 + }, + { + "epoch": 0.04922363597153332, + "grad_norm": 0.9601117968559265, + "learning_rate": 9.997029461390344e-06, + "loss": 0.9014, + "step": 913 + }, + { + "epoch": 0.04927755014017684, + "grad_norm": 0.9162781834602356, + "learning_rate": 9.997022150023968e-06, + "loss": 0.8851, + "step": 914 + }, + { + "epoch": 0.04933146430882036, + "grad_norm": 0.9514605402946472, + "learning_rate": 9.99701482967362e-06, + "loss": 0.8975, + "step": 915 + }, + { + "epoch": 0.049385378477463875, + "grad_norm": 0.897203803062439, + "learning_rate": 9.997007500339313e-06, + "loss": 0.8371, + "step": 916 + }, + { + "epoch": 0.0494392926461074, + "grad_norm": 0.9372673630714417, + "learning_rate": 9.99700016202106e-06, + "loss": 0.9432, + "step": 917 + }, + { + "epoch": 0.04949320681475092, + "grad_norm": 0.8993443846702576, + "learning_rate": 9.996992814718875e-06, + "loss": 0.8528, + "step": 918 + }, + { + "epoch": 0.049547120983394435, + "grad_norm": 0.9300720691680908, + "learning_rate": 9.996985458432771e-06, + "loss": 0.873, + "step": 919 + }, + { + "epoch": 0.04960103515203795, + "grad_norm": 0.9311426281929016, + "learning_rate": 9.996978093162761e-06, + "loss": 0.9092, + "step": 920 + }, + { + "epoch": 0.04965494932068148, + "grad_norm": 0.9244507551193237, + "learning_rate": 9.996970718908859e-06, + "loss": 0.764, + "step": 921 + }, + { + "epoch": 0.049708863489324996, + "grad_norm": 0.915512204170227, + "learning_rate": 
9.996963335671074e-06, + "loss": 0.8328, + "step": 922 + }, + { + "epoch": 0.049762777657968514, + "grad_norm": 0.889994740486145, + "learning_rate": 9.996955943449426e-06, + "loss": 0.8491, + "step": 923 + }, + { + "epoch": 0.04981669182661203, + "grad_norm": 0.8676478266716003, + "learning_rate": 9.996948542243925e-06, + "loss": 0.7677, + "step": 924 + }, + { + "epoch": 0.04987060599525555, + "grad_norm": 0.9795013070106506, + "learning_rate": 9.996941132054586e-06, + "loss": 0.9279, + "step": 925 + }, + { + "epoch": 0.049924520163899075, + "grad_norm": 0.940078854560852, + "learning_rate": 9.996933712881419e-06, + "loss": 0.8685, + "step": 926 + }, + { + "epoch": 0.04997843433254259, + "grad_norm": 0.9440926313400269, + "learning_rate": 9.996926284724437e-06, + "loss": 0.9634, + "step": 927 + }, + { + "epoch": 0.05003234850118611, + "grad_norm": 0.9120537638664246, + "learning_rate": 9.99691884758366e-06, + "loss": 0.7656, + "step": 928 + }, + { + "epoch": 0.05008626266982963, + "grad_norm": 1.1514596939086914, + "learning_rate": 9.996911401459093e-06, + "loss": 0.864, + "step": 929 + }, + { + "epoch": 0.050140176838473154, + "grad_norm": 0.8924434185028076, + "learning_rate": 9.996903946350756e-06, + "loss": 0.877, + "step": 930 + }, + { + "epoch": 0.05019409100711667, + "grad_norm": 0.9884456992149353, + "learning_rate": 9.996896482258657e-06, + "loss": 0.94, + "step": 931 + }, + { + "epoch": 0.05024800517576019, + "grad_norm": 0.9282665252685547, + "learning_rate": 9.996889009182814e-06, + "loss": 0.8443, + "step": 932 + }, + { + "epoch": 0.05030191934440371, + "grad_norm": 1.1029064655303955, + "learning_rate": 9.996881527123237e-06, + "loss": 0.9168, + "step": 933 + }, + { + "epoch": 0.050355833513047225, + "grad_norm": 0.839625358581543, + "learning_rate": 9.996874036079942e-06, + "loss": 0.8261, + "step": 934 + }, + { + "epoch": 0.05040974768169075, + "grad_norm": 0.8612869381904602, + "learning_rate": 9.996866536052942e-06, + "loss": 0.8197, + "step": 
935 + }, + { + "epoch": 0.05046366185033427, + "grad_norm": 0.9483891129493713, + "learning_rate": 9.996859027042249e-06, + "loss": 0.8374, + "step": 936 + }, + { + "epoch": 0.050517576018977786, + "grad_norm": 0.9374566674232483, + "learning_rate": 9.996851509047877e-06, + "loss": 0.8884, + "step": 937 + }, + { + "epoch": 0.050571490187621304, + "grad_norm": 0.9164647459983826, + "learning_rate": 9.99684398206984e-06, + "loss": 0.8419, + "step": 938 + }, + { + "epoch": 0.05062540435626483, + "grad_norm": 1.0109184980392456, + "learning_rate": 9.996836446108153e-06, + "loss": 0.8912, + "step": 939 + }, + { + "epoch": 0.05067931852490835, + "grad_norm": 0.8549674153327942, + "learning_rate": 9.996828901162825e-06, + "loss": 0.8043, + "step": 940 + }, + { + "epoch": 0.050733232693551865, + "grad_norm": 0.9618684649467468, + "learning_rate": 9.996821347233875e-06, + "loss": 0.8246, + "step": 941 + }, + { + "epoch": 0.05078714686219538, + "grad_norm": 0.9777100682258606, + "learning_rate": 9.996813784321314e-06, + "loss": 0.887, + "step": 942 + }, + { + "epoch": 0.05084106103083891, + "grad_norm": 0.8675182461738586, + "learning_rate": 9.996806212425157e-06, + "loss": 0.7584, + "step": 943 + }, + { + "epoch": 0.050894975199482426, + "grad_norm": 0.9174523949623108, + "learning_rate": 9.996798631545414e-06, + "loss": 0.8911, + "step": 944 + }, + { + "epoch": 0.050948889368125944, + "grad_norm": 0.9269078373908997, + "learning_rate": 9.996791041682101e-06, + "loss": 0.8049, + "step": 945 + }, + { + "epoch": 0.05100280353676946, + "grad_norm": 0.8447721600532532, + "learning_rate": 9.996783442835233e-06, + "loss": 0.7781, + "step": 946 + }, + { + "epoch": 0.05105671770541298, + "grad_norm": 0.9178231954574585, + "learning_rate": 9.99677583500482e-06, + "loss": 0.8107, + "step": 947 + }, + { + "epoch": 0.051110631874056504, + "grad_norm": 0.8741039633750916, + "learning_rate": 9.996768218190879e-06, + "loss": 0.9278, + "step": 948 + }, + { + "epoch": 0.05116454604270002, + 
"grad_norm": 0.7997228503227234, + "learning_rate": 9.996760592393425e-06, + "loss": 0.7706, + "step": 949 + }, + { + "epoch": 0.05121846021134354, + "grad_norm": 1.003300428390503, + "learning_rate": 9.996752957612468e-06, + "loss": 0.8464, + "step": 950 + }, + { + "epoch": 0.05127237437998706, + "grad_norm": 0.9237748980522156, + "learning_rate": 9.996745313848021e-06, + "loss": 0.9088, + "step": 951 + }, + { + "epoch": 0.05132628854863058, + "grad_norm": 0.8565654754638672, + "learning_rate": 9.996737661100103e-06, + "loss": 0.8208, + "step": 952 + }, + { + "epoch": 0.0513802027172741, + "grad_norm": 1.0590770244598389, + "learning_rate": 9.996729999368722e-06, + "loss": 0.9272, + "step": 953 + }, + { + "epoch": 0.05143411688591762, + "grad_norm": 0.8888198733329773, + "learning_rate": 9.996722328653897e-06, + "loss": 0.8264, + "step": 954 + }, + { + "epoch": 0.05148803105456114, + "grad_norm": 0.9211130142211914, + "learning_rate": 9.996714648955636e-06, + "loss": 0.8807, + "step": 955 + }, + { + "epoch": 0.051541945223204655, + "grad_norm": 1.0241321325302124, + "learning_rate": 9.996706960273958e-06, + "loss": 0.7638, + "step": 956 + }, + { + "epoch": 0.05159585939184818, + "grad_norm": 0.903762698173523, + "learning_rate": 9.996699262608875e-06, + "loss": 0.8583, + "step": 957 + }, + { + "epoch": 0.0516497735604917, + "grad_norm": 0.9271189570426941, + "learning_rate": 9.9966915559604e-06, + "loss": 0.8341, + "step": 958 + }, + { + "epoch": 0.051703687729135216, + "grad_norm": 0.865260899066925, + "learning_rate": 9.996683840328546e-06, + "loss": 0.9136, + "step": 959 + }, + { + "epoch": 0.051757601897778734, + "grad_norm": 0.8903625011444092, + "learning_rate": 9.996676115713332e-06, + "loss": 0.8706, + "step": 960 + }, + { + "epoch": 0.05181151606642226, + "grad_norm": 0.9228227138519287, + "learning_rate": 9.996668382114765e-06, + "loss": 0.8825, + "step": 961 + }, + { + "epoch": 0.051865430235065776, + "grad_norm": 0.9146421551704407, + "learning_rate": 
9.996660639532863e-06, + "loss": 0.8347, + "step": 962 + }, + { + "epoch": 0.051919344403709294, + "grad_norm": 0.9010991454124451, + "learning_rate": 9.99665288796764e-06, + "loss": 0.8016, + "step": 963 + }, + { + "epoch": 0.05197325857235281, + "grad_norm": 0.8763105869293213, + "learning_rate": 9.996645127419107e-06, + "loss": 0.8651, + "step": 964 + }, + { + "epoch": 0.05202717274099634, + "grad_norm": 0.9506256580352783, + "learning_rate": 9.996637357887281e-06, + "loss": 0.9429, + "step": 965 + }, + { + "epoch": 0.052081086909639855, + "grad_norm": 0.9484269022941589, + "learning_rate": 9.996629579372175e-06, + "loss": 0.855, + "step": 966 + }, + { + "epoch": 0.05213500107828337, + "grad_norm": 0.8970646262168884, + "learning_rate": 9.996621791873804e-06, + "loss": 0.8611, + "step": 967 + }, + { + "epoch": 0.05218891524692689, + "grad_norm": 0.8925203680992126, + "learning_rate": 9.99661399539218e-06, + "loss": 0.8206, + "step": 968 + }, + { + "epoch": 0.05224282941557041, + "grad_norm": 1.069669246673584, + "learning_rate": 9.996606189927318e-06, + "loss": 0.876, + "step": 969 + }, + { + "epoch": 0.052296743584213934, + "grad_norm": 0.8456307649612427, + "learning_rate": 9.996598375479232e-06, + "loss": 0.7514, + "step": 970 + }, + { + "epoch": 0.05235065775285745, + "grad_norm": 0.9182801246643066, + "learning_rate": 9.996590552047936e-06, + "loss": 0.8915, + "step": 971 + }, + { + "epoch": 0.05240457192150097, + "grad_norm": 0.7616676688194275, + "learning_rate": 9.996582719633445e-06, + "loss": 0.7106, + "step": 972 + }, + { + "epoch": 0.05245848609014449, + "grad_norm": 0.8873127102851868, + "learning_rate": 9.99657487823577e-06, + "loss": 0.9171, + "step": 973 + }, + { + "epoch": 0.05251240025878801, + "grad_norm": 0.9724618792533875, + "learning_rate": 9.996567027854929e-06, + "loss": 0.9765, + "step": 974 + }, + { + "epoch": 0.05256631442743153, + "grad_norm": 0.9106513857841492, + "learning_rate": 9.996559168490933e-06, + "loss": 0.8332, + "step": 
975 + }, + { + "epoch": 0.05262022859607505, + "grad_norm": 0.8551159501075745, + "learning_rate": 9.996551300143798e-06, + "loss": 0.8128, + "step": 976 + }, + { + "epoch": 0.052674142764718566, + "grad_norm": 0.9829822182655334, + "learning_rate": 9.996543422813539e-06, + "loss": 0.9088, + "step": 977 + }, + { + "epoch": 0.052728056933362084, + "grad_norm": 0.8281888961791992, + "learning_rate": 9.996535536500166e-06, + "loss": 0.8338, + "step": 978 + }, + { + "epoch": 0.05278197110200561, + "grad_norm": 0.951319694519043, + "learning_rate": 9.9965276412037e-06, + "loss": 0.9359, + "step": 979 + }, + { + "epoch": 0.05283588527064913, + "grad_norm": 0.841390073299408, + "learning_rate": 9.996519736924148e-06, + "loss": 0.7952, + "step": 980 + }, + { + "epoch": 0.052889799439292645, + "grad_norm": 0.8847686648368835, + "learning_rate": 9.996511823661528e-06, + "loss": 0.8435, + "step": 981 + }, + { + "epoch": 0.05294371360793616, + "grad_norm": 0.9261316061019897, + "learning_rate": 9.996503901415855e-06, + "loss": 0.8646, + "step": 982 + }, + { + "epoch": 0.05299762777657969, + "grad_norm": 0.9366586804389954, + "learning_rate": 9.99649597018714e-06, + "loss": 0.8586, + "step": 983 + }, + { + "epoch": 0.053051541945223206, + "grad_norm": 0.8916764259338379, + "learning_rate": 9.9964880299754e-06, + "loss": 0.8215, + "step": 984 + }, + { + "epoch": 0.053105456113866724, + "grad_norm": 0.9496534466743469, + "learning_rate": 9.996480080780648e-06, + "loss": 0.7984, + "step": 985 + }, + { + "epoch": 0.05315937028251024, + "grad_norm": 0.9736526608467102, + "learning_rate": 9.9964721226029e-06, + "loss": 0.7881, + "step": 986 + }, + { + "epoch": 0.05321328445115376, + "grad_norm": 0.9533856511116028, + "learning_rate": 9.996464155442167e-06, + "loss": 0.9855, + "step": 987 + }, + { + "epoch": 0.053267198619797285, + "grad_norm": 0.9656437039375305, + "learning_rate": 9.996456179298467e-06, + "loss": 0.9571, + "step": 988 + }, + { + "epoch": 0.0533211127884408, + 
"grad_norm": 0.8887313008308411, + "learning_rate": 9.996448194171813e-06, + "loss": 0.9381, + "step": 989 + }, + { + "epoch": 0.05337502695708432, + "grad_norm": 1.0181535482406616, + "learning_rate": 9.996440200062217e-06, + "loss": 0.8834, + "step": 990 + }, + { + "epoch": 0.05342894112572784, + "grad_norm": 0.9083503484725952, + "learning_rate": 9.996432196969696e-06, + "loss": 0.9733, + "step": 991 + }, + { + "epoch": 0.05348285529437136, + "grad_norm": 0.9051093459129333, + "learning_rate": 9.996424184894264e-06, + "loss": 0.8531, + "step": 992 + }, + { + "epoch": 0.05353676946301488, + "grad_norm": 1.0264357328414917, + "learning_rate": 9.996416163835935e-06, + "loss": 0.9212, + "step": 993 + }, + { + "epoch": 0.0535906836316584, + "grad_norm": 1.0350812673568726, + "learning_rate": 9.996408133794726e-06, + "loss": 0.7843, + "step": 994 + }, + { + "epoch": 0.05364459780030192, + "grad_norm": 0.9610341787338257, + "learning_rate": 9.996400094770647e-06, + "loss": 0.8561, + "step": 995 + }, + { + "epoch": 0.05369851196894544, + "grad_norm": 0.8123961687088013, + "learning_rate": 9.996392046763714e-06, + "loss": 0.8296, + "step": 996 + }, + { + "epoch": 0.05375242613758896, + "grad_norm": 0.9337920546531677, + "learning_rate": 9.996383989773942e-06, + "loss": 0.8525, + "step": 997 + }, + { + "epoch": 0.05380634030623248, + "grad_norm": 1.1319444179534912, + "learning_rate": 9.996375923801347e-06, + "loss": 0.9127, + "step": 998 + }, + { + "epoch": 0.053860254474875996, + "grad_norm": 0.8506798148155212, + "learning_rate": 9.996367848845941e-06, + "loss": 0.884, + "step": 999 + }, + { + "epoch": 0.053914168643519514, + "grad_norm": 0.8248615860939026, + "learning_rate": 9.996359764907739e-06, + "loss": 0.7579, + "step": 1000 + }, + { + "epoch": 0.05396808281216304, + "grad_norm": 0.9258946180343628, + "learning_rate": 9.996351671986756e-06, + "loss": 0.8632, + "step": 1001 + }, + { + "epoch": 0.05402199698080656, + "grad_norm": 0.8891279101371765, + 
"learning_rate": 9.996343570083006e-06, + "loss": 0.8758, + "step": 1002 + }, + { + "epoch": 0.054075911149450075, + "grad_norm": 0.9592086672782898, + "learning_rate": 9.996335459196505e-06, + "loss": 0.8962, + "step": 1003 + }, + { + "epoch": 0.05412982531809359, + "grad_norm": 0.8937798738479614, + "learning_rate": 9.996327339327267e-06, + "loss": 0.8434, + "step": 1004 + }, + { + "epoch": 0.05418373948673712, + "grad_norm": 0.9602083563804626, + "learning_rate": 9.996319210475307e-06, + "loss": 0.9692, + "step": 1005 + }, + { + "epoch": 0.054237653655380635, + "grad_norm": 0.870637834072113, + "learning_rate": 9.996311072640637e-06, + "loss": 0.9146, + "step": 1006 + }, + { + "epoch": 0.05429156782402415, + "grad_norm": 0.9330273866653442, + "learning_rate": 9.996302925823276e-06, + "loss": 0.8584, + "step": 1007 + }, + { + "epoch": 0.05434548199266767, + "grad_norm": 0.8185963034629822, + "learning_rate": 9.996294770023234e-06, + "loss": 0.7854, + "step": 1008 + }, + { + "epoch": 0.05439939616131119, + "grad_norm": 0.8727489113807678, + "learning_rate": 9.996286605240528e-06, + "loss": 0.7388, + "step": 1009 + }, + { + "epoch": 0.054453310329954714, + "grad_norm": 1.0858477354049683, + "learning_rate": 9.996278431475172e-06, + "loss": 0.9201, + "step": 1010 + }, + { + "epoch": 0.05450722449859823, + "grad_norm": 0.9749255776405334, + "learning_rate": 9.996270248727184e-06, + "loss": 0.9041, + "step": 1011 + }, + { + "epoch": 0.05456113866724175, + "grad_norm": 0.9460576176643372, + "learning_rate": 9.996262056996575e-06, + "loss": 0.8553, + "step": 1012 + }, + { + "epoch": 0.05461505283588527, + "grad_norm": 0.9379808306694031, + "learning_rate": 9.99625385628336e-06, + "loss": 0.9253, + "step": 1013 + }, + { + "epoch": 0.05466896700452879, + "grad_norm": 0.8154170513153076, + "learning_rate": 9.996245646587553e-06, + "loss": 0.8703, + "step": 1014 + }, + { + "epoch": 0.05472288117317231, + "grad_norm": 0.9122161269187927, + "learning_rate": 
9.996237427909172e-06, + "loss": 0.7734, + "step": 1015 + }, + { + "epoch": 0.05477679534181583, + "grad_norm": 0.9049486517906189, + "learning_rate": 9.996229200248228e-06, + "loss": 0.8991, + "step": 1016 + }, + { + "epoch": 0.05483070951045935, + "grad_norm": 0.9244295358657837, + "learning_rate": 9.996220963604741e-06, + "loss": 0.8514, + "step": 1017 + }, + { + "epoch": 0.05488462367910287, + "grad_norm": 0.9817934036254883, + "learning_rate": 9.99621271797872e-06, + "loss": 0.8641, + "step": 1018 + }, + { + "epoch": 0.05493853784774639, + "grad_norm": 0.9253972768783569, + "learning_rate": 9.996204463370182e-06, + "loss": 0.9199, + "step": 1019 + }, + { + "epoch": 0.05499245201638991, + "grad_norm": 0.9114319682121277, + "learning_rate": 9.996196199779145e-06, + "loss": 0.8063, + "step": 1020 + }, + { + "epoch": 0.055046366185033425, + "grad_norm": 0.9643195867538452, + "learning_rate": 9.996187927205619e-06, + "loss": 0.9668, + "step": 1021 + }, + { + "epoch": 0.05510028035367694, + "grad_norm": 0.8127598166465759, + "learning_rate": 9.996179645649622e-06, + "loss": 0.764, + "step": 1022 + }, + { + "epoch": 0.05515419452232047, + "grad_norm": 0.8728108406066895, + "learning_rate": 9.996171355111167e-06, + "loss": 0.7703, + "step": 1023 + }, + { + "epoch": 0.055208108690963986, + "grad_norm": 0.8554317355155945, + "learning_rate": 9.996163055590269e-06, + "loss": 0.8266, + "step": 1024 + }, + { + "epoch": 0.055262022859607504, + "grad_norm": 0.7951076030731201, + "learning_rate": 9.996154747086946e-06, + "loss": 0.7601, + "step": 1025 + }, + { + "epoch": 0.05531593702825102, + "grad_norm": 0.8916927576065063, + "learning_rate": 9.996146429601208e-06, + "loss": 0.8936, + "step": 1026 + }, + { + "epoch": 0.05536985119689455, + "grad_norm": 1.0242576599121094, + "learning_rate": 9.996138103133075e-06, + "loss": 0.8868, + "step": 1027 + }, + { + "epoch": 0.055423765365538065, + "grad_norm": 0.9273019433021545, + "learning_rate": 9.996129767682557e-06, + "loss": 
0.8622, + "step": 1028 + }, + { + "epoch": 0.05547767953418158, + "grad_norm": 0.9547039866447449, + "learning_rate": 9.996121423249673e-06, + "loss": 0.7814, + "step": 1029 + }, + { + "epoch": 0.0555315937028251, + "grad_norm": 0.8750621676445007, + "learning_rate": 9.996113069834437e-06, + "loss": 0.7717, + "step": 1030 + }, + { + "epoch": 0.05558550787146862, + "grad_norm": 0.9547988176345825, + "learning_rate": 9.996104707436862e-06, + "loss": 0.8877, + "step": 1031 + }, + { + "epoch": 0.055639422040112144, + "grad_norm": 0.8856480717658997, + "learning_rate": 9.996096336056966e-06, + "loss": 0.7927, + "step": 1032 + }, + { + "epoch": 0.05569333620875566, + "grad_norm": 0.8311342000961304, + "learning_rate": 9.99608795569476e-06, + "loss": 0.7847, + "step": 1033 + }, + { + "epoch": 0.05574725037739918, + "grad_norm": 1.0720731019973755, + "learning_rate": 9.996079566350266e-06, + "loss": 0.9243, + "step": 1034 + }, + { + "epoch": 0.0558011645460427, + "grad_norm": 0.9498684406280518, + "learning_rate": 9.996071168023491e-06, + "loss": 0.8605, + "step": 1035 + }, + { + "epoch": 0.05585507871468622, + "grad_norm": 0.9043952822685242, + "learning_rate": 9.996062760714456e-06, + "loss": 0.8488, + "step": 1036 + }, + { + "epoch": 0.05590899288332974, + "grad_norm": 0.8051116466522217, + "learning_rate": 9.996054344423173e-06, + "loss": 0.8275, + "step": 1037 + }, + { + "epoch": 0.05596290705197326, + "grad_norm": 0.857120156288147, + "learning_rate": 9.996045919149658e-06, + "loss": 0.8837, + "step": 1038 + }, + { + "epoch": 0.056016821220616776, + "grad_norm": 0.8810911774635315, + "learning_rate": 9.996037484893926e-06, + "loss": 0.8179, + "step": 1039 + }, + { + "epoch": 0.056070735389260294, + "grad_norm": 0.8783093690872192, + "learning_rate": 9.996029041655994e-06, + "loss": 0.7734, + "step": 1040 + }, + { + "epoch": 0.05612464955790382, + "grad_norm": 0.9281952977180481, + "learning_rate": 9.996020589435874e-06, + "loss": 0.8747, + "step": 1041 + }, + { + 
"epoch": 0.05617856372654734, + "grad_norm": 0.8307299613952637, + "learning_rate": 9.996012128233583e-06, + "loss": 0.8055, + "step": 1042 + }, + { + "epoch": 0.056232477895190855, + "grad_norm": 0.9520873427391052, + "learning_rate": 9.996003658049136e-06, + "loss": 0.8181, + "step": 1043 + }, + { + "epoch": 0.05628639206383437, + "grad_norm": 0.8753806948661804, + "learning_rate": 9.995995178882549e-06, + "loss": 0.808, + "step": 1044 + }, + { + "epoch": 0.0563403062324779, + "grad_norm": 1.067691683769226, + "learning_rate": 9.995986690733836e-06, + "loss": 0.8048, + "step": 1045 + }, + { + "epoch": 0.056394220401121416, + "grad_norm": 0.8575261235237122, + "learning_rate": 9.995978193603013e-06, + "loss": 0.9231, + "step": 1046 + }, + { + "epoch": 0.056448134569764934, + "grad_norm": 0.9857104420661926, + "learning_rate": 9.995969687490096e-06, + "loss": 0.8883, + "step": 1047 + }, + { + "epoch": 0.05650204873840845, + "grad_norm": 0.9203484654426575, + "learning_rate": 9.995961172395098e-06, + "loss": 0.7634, + "step": 1048 + }, + { + "epoch": 0.056555962907051976, + "grad_norm": 0.8741904497146606, + "learning_rate": 9.995952648318036e-06, + "loss": 0.8061, + "step": 1049 + }, + { + "epoch": 0.056609877075695494, + "grad_norm": 0.9495588541030884, + "learning_rate": 9.995944115258925e-06, + "loss": 0.8922, + "step": 1050 + }, + { + "epoch": 0.05666379124433901, + "grad_norm": 0.9306020140647888, + "learning_rate": 9.99593557321778e-06, + "loss": 0.8454, + "step": 1051 + }, + { + "epoch": 0.05671770541298253, + "grad_norm": 0.9457784295082092, + "learning_rate": 9.995927022194615e-06, + "loss": 0.8701, + "step": 1052 + }, + { + "epoch": 0.05677161958162605, + "grad_norm": 0.88719242811203, + "learning_rate": 9.99591846218945e-06, + "loss": 0.8416, + "step": 1053 + }, + { + "epoch": 0.05682553375026957, + "grad_norm": 0.8740848302841187, + "learning_rate": 9.995909893202296e-06, + "loss": 0.7962, + "step": 1054 + }, + { + "epoch": 0.05687944791891309, + 
"grad_norm": 1.0149377584457397, + "learning_rate": 9.99590131523317e-06, + "loss": 0.8352, + "step": 1055 + }, + { + "epoch": 0.05693336208755661, + "grad_norm": 0.9014917016029358, + "learning_rate": 9.995892728282088e-06, + "loss": 0.9244, + "step": 1056 + }, + { + "epoch": 0.05698727625620013, + "grad_norm": 0.9351898431777954, + "learning_rate": 9.995884132349062e-06, + "loss": 0.865, + "step": 1057 + }, + { + "epoch": 0.05704119042484365, + "grad_norm": 0.8656749129295349, + "learning_rate": 9.995875527434113e-06, + "loss": 0.8836, + "step": 1058 + }, + { + "epoch": 0.05709510459348717, + "grad_norm": 0.9120789170265198, + "learning_rate": 9.995866913537254e-06, + "loss": 0.8772, + "step": 1059 + }, + { + "epoch": 0.05714901876213069, + "grad_norm": 1.0019149780273438, + "learning_rate": 9.995858290658497e-06, + "loss": 0.9338, + "step": 1060 + }, + { + "epoch": 0.057202932930774206, + "grad_norm": 0.8492977023124695, + "learning_rate": 9.995849658797863e-06, + "loss": 0.742, + "step": 1061 + }, + { + "epoch": 0.057256847099417724, + "grad_norm": 1.000607967376709, + "learning_rate": 9.995841017955363e-06, + "loss": 0.8498, + "step": 1062 + }, + { + "epoch": 0.05731076126806125, + "grad_norm": 1.0268487930297852, + "learning_rate": 9.995832368131016e-06, + "loss": 0.8937, + "step": 1063 + }, + { + "epoch": 0.057364675436704766, + "grad_norm": 0.9388830661773682, + "learning_rate": 9.995823709324836e-06, + "loss": 0.877, + "step": 1064 + }, + { + "epoch": 0.057418589605348284, + "grad_norm": 0.9747199416160583, + "learning_rate": 9.99581504153684e-06, + "loss": 0.8436, + "step": 1065 + }, + { + "epoch": 0.0574725037739918, + "grad_norm": 0.9125073552131653, + "learning_rate": 9.99580636476704e-06, + "loss": 0.8853, + "step": 1066 + }, + { + "epoch": 0.05752641794263533, + "grad_norm": 0.8910282254219055, + "learning_rate": 9.995797679015455e-06, + "loss": 0.8566, + "step": 1067 + }, + { + "epoch": 0.057580332111278845, + "grad_norm": 0.8546010255813599, + 
"learning_rate": 9.995788984282101e-06, + "loss": 0.8209, + "step": 1068 + }, + { + "epoch": 0.05763424627992236, + "grad_norm": 0.9205883145332336, + "learning_rate": 9.99578028056699e-06, + "loss": 0.7814, + "step": 1069 + }, + { + "epoch": 0.05768816044856588, + "grad_norm": 0.9627780914306641, + "learning_rate": 9.995771567870142e-06, + "loss": 0.8686, + "step": 1070 + }, + { + "epoch": 0.057742074617209406, + "grad_norm": 0.9917465448379517, + "learning_rate": 9.995762846191569e-06, + "loss": 0.9672, + "step": 1071 + }, + { + "epoch": 0.057795988785852924, + "grad_norm": 0.9396706223487854, + "learning_rate": 9.995754115531288e-06, + "loss": 0.8631, + "step": 1072 + }, + { + "epoch": 0.05784990295449644, + "grad_norm": 0.8310922980308533, + "learning_rate": 9.995745375889317e-06, + "loss": 0.8637, + "step": 1073 + }, + { + "epoch": 0.05790381712313996, + "grad_norm": 0.9085954427719116, + "learning_rate": 9.995736627265667e-06, + "loss": 0.8821, + "step": 1074 + }, + { + "epoch": 0.05795773129178348, + "grad_norm": 0.8529816269874573, + "learning_rate": 9.995727869660357e-06, + "loss": 0.8426, + "step": 1075 + }, + { + "epoch": 0.058011645460427, + "grad_norm": 0.8288499116897583, + "learning_rate": 9.995719103073403e-06, + "loss": 0.8415, + "step": 1076 + }, + { + "epoch": 0.05806555962907052, + "grad_norm": 0.9105609059333801, + "learning_rate": 9.995710327504819e-06, + "loss": 0.7683, + "step": 1077 + }, + { + "epoch": 0.05811947379771404, + "grad_norm": 0.9578274488449097, + "learning_rate": 9.995701542954622e-06, + "loss": 0.8796, + "step": 1078 + }, + { + "epoch": 0.058173387966357556, + "grad_norm": 0.8542460799217224, + "learning_rate": 9.995692749422827e-06, + "loss": 0.8363, + "step": 1079 + }, + { + "epoch": 0.05822730213500108, + "grad_norm": 0.8723183274269104, + "learning_rate": 9.99568394690945e-06, + "loss": 0.8434, + "step": 1080 + }, + { + "epoch": 0.0582812163036446, + "grad_norm": 0.9157887697219849, + "learning_rate": 
9.995675135414507e-06, + "loss": 0.6532, + "step": 1081 + }, + { + "epoch": 0.05833513047228812, + "grad_norm": 0.9055691361427307, + "learning_rate": 9.995666314938014e-06, + "loss": 0.8762, + "step": 1082 + }, + { + "epoch": 0.058389044640931635, + "grad_norm": 0.8224693536758423, + "learning_rate": 9.995657485479987e-06, + "loss": 0.7976, + "step": 1083 + }, + { + "epoch": 0.05844295880957515, + "grad_norm": 0.925414502620697, + "learning_rate": 9.995648647040441e-06, + "loss": 0.8673, + "step": 1084 + }, + { + "epoch": 0.05849687297821868, + "grad_norm": 0.9194141626358032, + "learning_rate": 9.995639799619395e-06, + "loss": 0.7916, + "step": 1085 + }, + { + "epoch": 0.058550787146862196, + "grad_norm": 1.08795166015625, + "learning_rate": 9.995630943216859e-06, + "loss": 0.9135, + "step": 1086 + }, + { + "epoch": 0.058604701315505714, + "grad_norm": 0.9648925065994263, + "learning_rate": 9.995622077832854e-06, + "loss": 0.8442, + "step": 1087 + }, + { + "epoch": 0.05865861548414923, + "grad_norm": 1.0012339353561401, + "learning_rate": 9.995613203467394e-06, + "loss": 0.9543, + "step": 1088 + }, + { + "epoch": 0.05871252965279276, + "grad_norm": 0.9333881735801697, + "learning_rate": 9.995604320120496e-06, + "loss": 0.9267, + "step": 1089 + }, + { + "epoch": 0.058766443821436275, + "grad_norm": 0.8566498160362244, + "learning_rate": 9.995595427792173e-06, + "loss": 0.8539, + "step": 1090 + }, + { + "epoch": 0.05882035799007979, + "grad_norm": 0.8766364455223083, + "learning_rate": 9.995586526482446e-06, + "loss": 0.9293, + "step": 1091 + }, + { + "epoch": 0.05887427215872331, + "grad_norm": 0.9181047677993774, + "learning_rate": 9.995577616191326e-06, + "loss": 0.8333, + "step": 1092 + }, + { + "epoch": 0.05892818632736683, + "grad_norm": 0.8831031918525696, + "learning_rate": 9.995568696918833e-06, + "loss": 0.8016, + "step": 1093 + }, + { + "epoch": 0.05898210049601035, + "grad_norm": 0.8618754148483276, + "learning_rate": 9.99555976866498e-06, + "loss": 
0.8988, + "step": 1094 + }, + { + "epoch": 0.05903601466465387, + "grad_norm": 0.9083183407783508, + "learning_rate": 9.995550831429785e-06, + "loss": 0.8626, + "step": 1095 + }, + { + "epoch": 0.05908992883329739, + "grad_norm": 0.8423884510993958, + "learning_rate": 9.995541885213262e-06, + "loss": 0.9121, + "step": 1096 + }, + { + "epoch": 0.05914384300194091, + "grad_norm": 0.7747607827186584, + "learning_rate": 9.99553293001543e-06, + "loss": 0.8087, + "step": 1097 + }, + { + "epoch": 0.05919775717058443, + "grad_norm": 0.8828368186950684, + "learning_rate": 9.995523965836302e-06, + "loss": 0.8284, + "step": 1098 + }, + { + "epoch": 0.05925167133922795, + "grad_norm": 0.9448524713516235, + "learning_rate": 9.995514992675896e-06, + "loss": 0.9565, + "step": 1099 + }, + { + "epoch": 0.05930558550787147, + "grad_norm": 0.8967006206512451, + "learning_rate": 9.99550601053423e-06, + "loss": 0.8412, + "step": 1100 + }, + { + "epoch": 0.059359499676514986, + "grad_norm": 0.9394551515579224, + "learning_rate": 9.995497019411315e-06, + "loss": 0.929, + "step": 1101 + }, + { + "epoch": 0.05941341384515851, + "grad_norm": 0.9002842903137207, + "learning_rate": 9.995488019307172e-06, + "loss": 0.734, + "step": 1102 + }, + { + "epoch": 0.05946732801380203, + "grad_norm": 1.3590562343597412, + "learning_rate": 9.995479010221816e-06, + "loss": 0.8843, + "step": 1103 + }, + { + "epoch": 0.05952124218244555, + "grad_norm": 1.041528582572937, + "learning_rate": 9.99546999215526e-06, + "loss": 0.9001, + "step": 1104 + }, + { + "epoch": 0.059575156351089065, + "grad_norm": 0.9846720099449158, + "learning_rate": 9.995460965107524e-06, + "loss": 0.8174, + "step": 1105 + }, + { + "epoch": 0.05962907051973258, + "grad_norm": 0.9171685576438904, + "learning_rate": 9.995451929078624e-06, + "loss": 0.8756, + "step": 1106 + }, + { + "epoch": 0.05968298468837611, + "grad_norm": 0.9155516028404236, + "learning_rate": 9.995442884068574e-06, + "loss": 0.7327, + "step": 1107 + }, + { + 
"epoch": 0.059736898857019625, + "grad_norm": 0.8734007477760315, + "learning_rate": 9.99543383007739e-06, + "loss": 0.8385, + "step": 1108 + }, + { + "epoch": 0.05979081302566314, + "grad_norm": 0.8580977320671082, + "learning_rate": 9.99542476710509e-06, + "loss": 0.885, + "step": 1109 + }, + { + "epoch": 0.05984472719430666, + "grad_norm": 0.8499299883842468, + "learning_rate": 9.995415695151692e-06, + "loss": 0.8323, + "step": 1110 + }, + { + "epoch": 0.059898641362950186, + "grad_norm": 0.8348694443702698, + "learning_rate": 9.99540661421721e-06, + "loss": 0.7947, + "step": 1111 + }, + { + "epoch": 0.059952555531593704, + "grad_norm": 0.8865199685096741, + "learning_rate": 9.99539752430166e-06, + "loss": 0.9363, + "step": 1112 + }, + { + "epoch": 0.06000646970023722, + "grad_norm": 0.9492315649986267, + "learning_rate": 9.995388425405059e-06, + "loss": 0.913, + "step": 1113 + }, + { + "epoch": 0.06006038386888074, + "grad_norm": 0.938252329826355, + "learning_rate": 9.995379317527422e-06, + "loss": 0.861, + "step": 1114 + }, + { + "epoch": 0.06011429803752426, + "grad_norm": 1.2601032257080078, + "learning_rate": 9.995370200668768e-06, + "loss": 0.9435, + "step": 1115 + }, + { + "epoch": 0.06016821220616778, + "grad_norm": 0.915830671787262, + "learning_rate": 9.995361074829112e-06, + "loss": 0.9372, + "step": 1116 + }, + { + "epoch": 0.0602221263748113, + "grad_norm": 1.4548465013504028, + "learning_rate": 9.995351940008473e-06, + "loss": 0.9055, + "step": 1117 + }, + { + "epoch": 0.06027604054345482, + "grad_norm": 0.9090906381607056, + "learning_rate": 9.995342796206861e-06, + "loss": 0.8849, + "step": 1118 + }, + { + "epoch": 0.06032995471209834, + "grad_norm": 0.9860616326332092, + "learning_rate": 9.995333643424298e-06, + "loss": 0.8304, + "step": 1119 + }, + { + "epoch": 0.06038386888074186, + "grad_norm": 0.8320879340171814, + "learning_rate": 9.9953244816608e-06, + "loss": 0.8432, + "step": 1120 + }, + { + "epoch": 0.06043778304938538, + "grad_norm": 
0.8633564114570618, + "learning_rate": 9.995315310916381e-06, + "loss": 0.7461, + "step": 1121 + }, + { + "epoch": 0.0604916972180289, + "grad_norm": 0.881287693977356, + "learning_rate": 9.995306131191059e-06, + "loss": 0.8512, + "step": 1122 + }, + { + "epoch": 0.060545611386672415, + "grad_norm": 0.8888201713562012, + "learning_rate": 9.99529694248485e-06, + "loss": 0.8416, + "step": 1123 + }, + { + "epoch": 0.06059952555531594, + "grad_norm": 0.8073605895042419, + "learning_rate": 9.99528774479777e-06, + "loss": 0.8369, + "step": 1124 + }, + { + "epoch": 0.06065343972395946, + "grad_norm": 0.9260549545288086, + "learning_rate": 9.995278538129837e-06, + "loss": 0.8548, + "step": 1125 + }, + { + "epoch": 0.060707353892602976, + "grad_norm": 0.9169156551361084, + "learning_rate": 9.99526932248107e-06, + "loss": 0.9149, + "step": 1126 + }, + { + "epoch": 0.060761268061246494, + "grad_norm": 0.8481706380844116, + "learning_rate": 9.995260097851478e-06, + "loss": 0.8591, + "step": 1127 + }, + { + "epoch": 0.06081518222989001, + "grad_norm": 0.8934486508369446, + "learning_rate": 9.995250864241085e-06, + "loss": 0.9322, + "step": 1128 + }, + { + "epoch": 0.06086909639853354, + "grad_norm": 0.947390615940094, + "learning_rate": 9.995241621649902e-06, + "loss": 1.0015, + "step": 1129 + }, + { + "epoch": 0.060923010567177055, + "grad_norm": 0.9185096025466919, + "learning_rate": 9.995232370077949e-06, + "loss": 0.9293, + "step": 1130 + }, + { + "epoch": 0.06097692473582057, + "grad_norm": 0.9517882466316223, + "learning_rate": 9.995223109525245e-06, + "loss": 0.8673, + "step": 1131 + }, + { + "epoch": 0.06103083890446409, + "grad_norm": 1.065699815750122, + "learning_rate": 9.9952138399918e-06, + "loss": 0.9144, + "step": 1132 + }, + { + "epoch": 0.061084753073107616, + "grad_norm": 0.9048404693603516, + "learning_rate": 9.995204561477635e-06, + "loss": 0.7773, + "step": 1133 + }, + { + "epoch": 0.061138667241751134, + "grad_norm": 1.104457139968872, + "learning_rate": 
9.995195273982768e-06, + "loss": 0.8847, + "step": 1134 + }, + { + "epoch": 0.06119258141039465, + "grad_norm": 0.9009587168693542, + "learning_rate": 9.995185977507212e-06, + "loss": 0.8118, + "step": 1135 + }, + { + "epoch": 0.06124649557903817, + "grad_norm": 1.0740209817886353, + "learning_rate": 9.995176672050983e-06, + "loss": 0.9173, + "step": 1136 + }, + { + "epoch": 0.06130040974768169, + "grad_norm": 0.9820743203163147, + "learning_rate": 9.995167357614104e-06, + "loss": 0.8555, + "step": 1137 + }, + { + "epoch": 0.06135432391632521, + "grad_norm": 0.9250825047492981, + "learning_rate": 9.995158034196586e-06, + "loss": 0.8771, + "step": 1138 + }, + { + "epoch": 0.06140823808496873, + "grad_norm": 0.8952597379684448, + "learning_rate": 9.995148701798447e-06, + "loss": 0.8598, + "step": 1139 + }, + { + "epoch": 0.06146215225361225, + "grad_norm": 0.8485212922096252, + "learning_rate": 9.995139360419706e-06, + "loss": 0.8557, + "step": 1140 + }, + { + "epoch": 0.061516066422255766, + "grad_norm": 0.9676715731620789, + "learning_rate": 9.995130010060377e-06, + "loss": 0.7748, + "step": 1141 + }, + { + "epoch": 0.06156998059089929, + "grad_norm": 0.7896347045898438, + "learning_rate": 9.995120650720478e-06, + "loss": 0.6183, + "step": 1142 + }, + { + "epoch": 0.06162389475954281, + "grad_norm": 0.8746615052223206, + "learning_rate": 9.995111282400024e-06, + "loss": 0.8321, + "step": 1143 + }, + { + "epoch": 0.06167780892818633, + "grad_norm": 0.9029875993728638, + "learning_rate": 9.995101905099036e-06, + "loss": 0.8686, + "step": 1144 + }, + { + "epoch": 0.061731723096829845, + "grad_norm": 0.9529547095298767, + "learning_rate": 9.995092518817528e-06, + "loss": 0.8878, + "step": 1145 + }, + { + "epoch": 0.06178563726547336, + "grad_norm": 0.8280455470085144, + "learning_rate": 9.995083123555517e-06, + "loss": 0.8232, + "step": 1146 + }, + { + "epoch": 0.06183955143411689, + "grad_norm": 0.908881664276123, + "learning_rate": 9.995073719313021e-06, + "loss": 
0.8387, + "step": 1147 + }, + { + "epoch": 0.061893465602760406, + "grad_norm": 0.9137653708457947, + "learning_rate": 9.995064306090055e-06, + "loss": 0.8943, + "step": 1148 + }, + { + "epoch": 0.061947379771403924, + "grad_norm": 0.863861620426178, + "learning_rate": 9.995054883886639e-06, + "loss": 0.7435, + "step": 1149 + }, + { + "epoch": 0.06200129394004744, + "grad_norm": 0.8534915447235107, + "learning_rate": 9.995045452702786e-06, + "loss": 0.941, + "step": 1150 + }, + { + "epoch": 0.06205520810869097, + "grad_norm": 0.9469791650772095, + "learning_rate": 9.995036012538515e-06, + "loss": 0.9137, + "step": 1151 + }, + { + "epoch": 0.062109122277334484, + "grad_norm": 0.9044890999794006, + "learning_rate": 9.995026563393844e-06, + "loss": 0.9117, + "step": 1152 + }, + { + "epoch": 0.062163036445978, + "grad_norm": 0.989772379398346, + "learning_rate": 9.995017105268789e-06, + "loss": 0.8306, + "step": 1153 + }, + { + "epoch": 0.06221695061462152, + "grad_norm": 0.8586496114730835, + "learning_rate": 9.995007638163365e-06, + "loss": 0.8012, + "step": 1154 + }, + { + "epoch": 0.062270864783265045, + "grad_norm": 0.9221116304397583, + "learning_rate": 9.994998162077594e-06, + "loss": 0.7935, + "step": 1155 + }, + { + "epoch": 0.06232477895190856, + "grad_norm": 0.9453061819076538, + "learning_rate": 9.994988677011489e-06, + "loss": 0.8257, + "step": 1156 + }, + { + "epoch": 0.06237869312055208, + "grad_norm": 0.8065335154533386, + "learning_rate": 9.994979182965065e-06, + "loss": 0.86, + "step": 1157 + }, + { + "epoch": 0.0624326072891956, + "grad_norm": 0.9597793817520142, + "learning_rate": 9.994969679938346e-06, + "loss": 0.862, + "step": 1158 + }, + { + "epoch": 0.06248652145783912, + "grad_norm": 0.9118353128433228, + "learning_rate": 9.994960167931342e-06, + "loss": 0.8925, + "step": 1159 + }, + { + "epoch": 0.06254043562648263, + "grad_norm": 1.0216273069381714, + "learning_rate": 9.994950646944077e-06, + "loss": 0.7078, + "step": 1160 + }, + { + 
"epoch": 0.06259434979512615, + "grad_norm": 0.960182785987854, + "learning_rate": 9.994941116976562e-06, + "loss": 0.8936, + "step": 1161 + }, + { + "epoch": 0.06264826396376968, + "grad_norm": 0.9551856517791748, + "learning_rate": 9.994931578028817e-06, + "loss": 0.8053, + "step": 1162 + }, + { + "epoch": 0.0627021781324132, + "grad_norm": 0.9419867992401123, + "learning_rate": 9.994922030100857e-06, + "loss": 0.8333, + "step": 1163 + }, + { + "epoch": 0.06275609230105672, + "grad_norm": 0.9780306816101074, + "learning_rate": 9.994912473192702e-06, + "loss": 0.88, + "step": 1164 + }, + { + "epoch": 0.06281000646970024, + "grad_norm": 0.9320577383041382, + "learning_rate": 9.99490290730437e-06, + "loss": 0.8859, + "step": 1165 + }, + { + "epoch": 0.06286392063834376, + "grad_norm": 0.7692422270774841, + "learning_rate": 9.994893332435874e-06, + "loss": 0.8093, + "step": 1166 + }, + { + "epoch": 0.06291783480698727, + "grad_norm": 1.0622048377990723, + "learning_rate": 9.994883748587234e-06, + "loss": 0.8959, + "step": 1167 + }, + { + "epoch": 0.06297174897563079, + "grad_norm": 0.9598555564880371, + "learning_rate": 9.994874155758467e-06, + "loss": 0.8153, + "step": 1168 + }, + { + "epoch": 0.06302566314427431, + "grad_norm": 0.9207014441490173, + "learning_rate": 9.994864553949591e-06, + "loss": 0.9383, + "step": 1169 + }, + { + "epoch": 0.06307957731291783, + "grad_norm": 1.0074093341827393, + "learning_rate": 9.99485494316062e-06, + "loss": 0.9999, + "step": 1170 + }, + { + "epoch": 0.06313349148156136, + "grad_norm": 0.8454248905181885, + "learning_rate": 9.994845323391575e-06, + "loss": 0.7946, + "step": 1171 + }, + { + "epoch": 0.06318740565020488, + "grad_norm": 0.847578763961792, + "learning_rate": 9.99483569464247e-06, + "loss": 0.7144, + "step": 1172 + }, + { + "epoch": 0.0632413198188484, + "grad_norm": 0.9083126187324524, + "learning_rate": 9.994826056913325e-06, + "loss": 0.774, + "step": 1173 + }, + { + "epoch": 0.06329523398749191, + "grad_norm": 
0.8995345830917358, + "learning_rate": 9.994816410204158e-06, + "loss": 0.8995, + "step": 1174 + }, + { + "epoch": 0.06334914815613543, + "grad_norm": 1.0547746419906616, + "learning_rate": 9.994806754514983e-06, + "loss": 0.8142, + "step": 1175 + }, + { + "epoch": 0.06340306232477895, + "grad_norm": 0.946854829788208, + "learning_rate": 9.99479708984582e-06, + "loss": 0.8639, + "step": 1176 + }, + { + "epoch": 0.06345697649342247, + "grad_norm": 0.8746247291564941, + "learning_rate": 9.994787416196683e-06, + "loss": 0.8601, + "step": 1177 + }, + { + "epoch": 0.06351089066206599, + "grad_norm": 0.9075024127960205, + "learning_rate": 9.994777733567595e-06, + "loss": 0.7969, + "step": 1178 + }, + { + "epoch": 0.0635648048307095, + "grad_norm": 0.9435486197471619, + "learning_rate": 9.994768041958569e-06, + "loss": 0.8199, + "step": 1179 + }, + { + "epoch": 0.06361871899935304, + "grad_norm": 0.8597564697265625, + "learning_rate": 9.994758341369624e-06, + "loss": 0.8791, + "step": 1180 + }, + { + "epoch": 0.06367263316799655, + "grad_norm": 0.7960480451583862, + "learning_rate": 9.994748631800777e-06, + "loss": 0.8035, + "step": 1181 + }, + { + "epoch": 0.06372654733664007, + "grad_norm": 1.1984984874725342, + "learning_rate": 9.994738913252045e-06, + "loss": 0.7372, + "step": 1182 + }, + { + "epoch": 0.06378046150528359, + "grad_norm": 0.8532997369766235, + "learning_rate": 9.994729185723446e-06, + "loss": 0.9094, + "step": 1183 + }, + { + "epoch": 0.06383437567392711, + "grad_norm": 0.8327267169952393, + "learning_rate": 9.994719449214999e-06, + "loss": 0.809, + "step": 1184 + }, + { + "epoch": 0.06388828984257063, + "grad_norm": 0.9086306691169739, + "learning_rate": 9.99470970372672e-06, + "loss": 0.8278, + "step": 1185 + }, + { + "epoch": 0.06394220401121414, + "grad_norm": 0.8422104716300964, + "learning_rate": 9.994699949258626e-06, + "loss": 0.7754, + "step": 1186 + }, + { + "epoch": 0.06399611817985766, + "grad_norm": 1.0434929132461548, + "learning_rate": 
9.994690185810733e-06, + "loss": 0.908, + "step": 1187 + }, + { + "epoch": 0.06405003234850119, + "grad_norm": 1.1625720262527466, + "learning_rate": 9.994680413383064e-06, + "loss": 0.8814, + "step": 1188 + }, + { + "epoch": 0.06410394651714471, + "grad_norm": 0.9940767288208008, + "learning_rate": 9.994670631975631e-06, + "loss": 0.7846, + "step": 1189 + }, + { + "epoch": 0.06415786068578823, + "grad_norm": 0.8356907963752747, + "learning_rate": 9.994660841588457e-06, + "loss": 0.798, + "step": 1190 + }, + { + "epoch": 0.06421177485443175, + "grad_norm": 0.830348014831543, + "learning_rate": 9.994651042221552e-06, + "loss": 0.7875, + "step": 1191 + }, + { + "epoch": 0.06426568902307526, + "grad_norm": 1.1060880422592163, + "learning_rate": 9.994641233874943e-06, + "loss": 0.8893, + "step": 1192 + }, + { + "epoch": 0.06431960319171878, + "grad_norm": 0.9319590926170349, + "learning_rate": 9.994631416548637e-06, + "loss": 0.791, + "step": 1193 + }, + { + "epoch": 0.0643735173603623, + "grad_norm": 0.8345780968666077, + "learning_rate": 9.994621590242661e-06, + "loss": 0.8213, + "step": 1194 + }, + { + "epoch": 0.06442743152900582, + "grad_norm": 0.9848359227180481, + "learning_rate": 9.99461175495703e-06, + "loss": 0.735, + "step": 1195 + }, + { + "epoch": 0.06448134569764934, + "grad_norm": 0.9134055972099304, + "learning_rate": 9.994601910691758e-06, + "loss": 0.8415, + "step": 1196 + }, + { + "epoch": 0.06453525986629287, + "grad_norm": 0.8084586262702942, + "learning_rate": 9.994592057446866e-06, + "loss": 0.8702, + "step": 1197 + }, + { + "epoch": 0.06458917403493639, + "grad_norm": 0.9168767333030701, + "learning_rate": 9.994582195222371e-06, + "loss": 0.8921, + "step": 1198 + }, + { + "epoch": 0.0646430882035799, + "grad_norm": 0.8380446434020996, + "learning_rate": 9.994572324018292e-06, + "loss": 0.7705, + "step": 1199 + }, + { + "epoch": 0.06469700237222342, + "grad_norm": 0.8120049238204956, + "learning_rate": 9.994562443834646e-06, + "loss": 0.7576, + 
"step": 1200 + }, + { + "epoch": 0.06475091654086694, + "grad_norm": 0.9559764266014099, + "learning_rate": 9.994552554671448e-06, + "loss": 0.8427, + "step": 1201 + }, + { + "epoch": 0.06480483070951046, + "grad_norm": 0.9473673105239868, + "learning_rate": 9.99454265652872e-06, + "loss": 0.9988, + "step": 1202 + }, + { + "epoch": 0.06485874487815398, + "grad_norm": 1.0704870223999023, + "learning_rate": 9.994532749406477e-06, + "loss": 0.9499, + "step": 1203 + }, + { + "epoch": 0.0649126590467975, + "grad_norm": 0.9905646443367004, + "learning_rate": 9.994522833304738e-06, + "loss": 0.8801, + "step": 1204 + }, + { + "epoch": 0.06496657321544101, + "grad_norm": 1.194190502166748, + "learning_rate": 9.99451290822352e-06, + "loss": 0.9051, + "step": 1205 + }, + { + "epoch": 0.06502048738408454, + "grad_norm": 0.8571314811706543, + "learning_rate": 9.994502974162843e-06, + "loss": 0.8131, + "step": 1206 + }, + { + "epoch": 0.06507440155272806, + "grad_norm": 0.9769417643547058, + "learning_rate": 9.994493031122721e-06, + "loss": 0.8524, + "step": 1207 + }, + { + "epoch": 0.06512831572137158, + "grad_norm": 0.8106759786605835, + "learning_rate": 9.994483079103176e-06, + "loss": 0.8142, + "step": 1208 + }, + { + "epoch": 0.0651822298900151, + "grad_norm": 0.8817846775054932, + "learning_rate": 9.994473118104223e-06, + "loss": 0.9076, + "step": 1209 + }, + { + "epoch": 0.06523614405865862, + "grad_norm": 0.8271930813789368, + "learning_rate": 9.994463148125882e-06, + "loss": 0.7914, + "step": 1210 + }, + { + "epoch": 0.06529005822730213, + "grad_norm": 0.9060614705085754, + "learning_rate": 9.994453169168169e-06, + "loss": 0.8375, + "step": 1211 + }, + { + "epoch": 0.06534397239594565, + "grad_norm": 0.880614697933197, + "learning_rate": 9.994443181231103e-06, + "loss": 0.7751, + "step": 1212 + }, + { + "epoch": 0.06539788656458917, + "grad_norm": 0.9420819282531738, + "learning_rate": 9.994433184314702e-06, + "loss": 0.8532, + "step": 1213 + }, + { + "epoch": 
0.06545180073323269, + "grad_norm": 0.8587054014205933, + "learning_rate": 9.994423178418984e-06, + "loss": 0.8804, + "step": 1214 + }, + { + "epoch": 0.06550571490187622, + "grad_norm": 0.9624550938606262, + "learning_rate": 9.994413163543965e-06, + "loss": 0.9782, + "step": 1215 + }, + { + "epoch": 0.06555962907051974, + "grad_norm": 0.9458224773406982, + "learning_rate": 9.994403139689665e-06, + "loss": 0.8274, + "step": 1216 + }, + { + "epoch": 0.06561354323916326, + "grad_norm": 1.0417940616607666, + "learning_rate": 9.994393106856104e-06, + "loss": 0.9065, + "step": 1217 + }, + { + "epoch": 0.06566745740780677, + "grad_norm": 1.0225417613983154, + "learning_rate": 9.994383065043296e-06, + "loss": 0.8642, + "step": 1218 + }, + { + "epoch": 0.06572137157645029, + "grad_norm": 0.9015594720840454, + "learning_rate": 9.994373014251261e-06, + "loss": 0.8775, + "step": 1219 + }, + { + "epoch": 0.06577528574509381, + "grad_norm": 0.8473883271217346, + "learning_rate": 9.994362954480018e-06, + "loss": 0.8566, + "step": 1220 + }, + { + "epoch": 0.06582919991373733, + "grad_norm": 0.8571242690086365, + "learning_rate": 9.994352885729584e-06, + "loss": 0.8502, + "step": 1221 + }, + { + "epoch": 0.06588311408238084, + "grad_norm": 0.8793268799781799, + "learning_rate": 9.994342807999977e-06, + "loss": 0.9062, + "step": 1222 + }, + { + "epoch": 0.06593702825102436, + "grad_norm": 0.8866230249404907, + "learning_rate": 9.994332721291214e-06, + "loss": 0.9026, + "step": 1223 + }, + { + "epoch": 0.0659909424196679, + "grad_norm": 0.9135996103286743, + "learning_rate": 9.994322625603314e-06, + "loss": 0.8558, + "step": 1224 + }, + { + "epoch": 0.06604485658831141, + "grad_norm": 0.9904530048370361, + "learning_rate": 9.994312520936297e-06, + "loss": 0.8823, + "step": 1225 + }, + { + "epoch": 0.06609877075695493, + "grad_norm": 0.8590260148048401, + "learning_rate": 9.99430240729018e-06, + "loss": 0.8344, + "step": 1226 + }, + { + "epoch": 0.06615268492559845, + "grad_norm": 
1.1669397354125977, + "learning_rate": 9.99429228466498e-06, + "loss": 0.9459, + "step": 1227 + }, + { + "epoch": 0.06620659909424197, + "grad_norm": 0.9290857315063477, + "learning_rate": 9.994282153060715e-06, + "loss": 0.8723, + "step": 1228 + }, + { + "epoch": 0.06626051326288548, + "grad_norm": 0.9619696140289307, + "learning_rate": 9.994272012477405e-06, + "loss": 0.8986, + "step": 1229 + }, + { + "epoch": 0.066314427431529, + "grad_norm": 0.8312071561813354, + "learning_rate": 9.994261862915068e-06, + "loss": 0.7291, + "step": 1230 + }, + { + "epoch": 0.06636834160017252, + "grad_norm": 1.0099300146102905, + "learning_rate": 9.994251704373721e-06, + "loss": 0.8725, + "step": 1231 + }, + { + "epoch": 0.06642225576881604, + "grad_norm": 0.8522336483001709, + "learning_rate": 9.994241536853384e-06, + "loss": 0.8656, + "step": 1232 + }, + { + "epoch": 0.06647616993745957, + "grad_norm": 0.919360339641571, + "learning_rate": 9.994231360354074e-06, + "loss": 0.8854, + "step": 1233 + }, + { + "epoch": 0.06653008410610309, + "grad_norm": 0.8002495169639587, + "learning_rate": 9.994221174875809e-06, + "loss": 0.7879, + "step": 1234 + }, + { + "epoch": 0.0665839982747466, + "grad_norm": 0.9539757370948792, + "learning_rate": 9.994210980418607e-06, + "loss": 0.9027, + "step": 1235 + }, + { + "epoch": 0.06663791244339012, + "grad_norm": 0.9222649335861206, + "learning_rate": 9.99420077698249e-06, + "loss": 0.7611, + "step": 1236 + }, + { + "epoch": 0.06669182661203364, + "grad_norm": 0.8629900813102722, + "learning_rate": 9.994190564567472e-06, + "loss": 0.8122, + "step": 1237 + }, + { + "epoch": 0.06674574078067716, + "grad_norm": 0.8339203000068665, + "learning_rate": 9.994180343173574e-06, + "loss": 0.7873, + "step": 1238 + }, + { + "epoch": 0.06679965494932068, + "grad_norm": 0.8844656348228455, + "learning_rate": 9.994170112800812e-06, + "loss": 0.8176, + "step": 1239 + }, + { + "epoch": 0.0668535691179642, + "grad_norm": 1.0024579763412476, + "learning_rate": 
9.994159873449206e-06, + "loss": 0.844, + "step": 1240 + }, + { + "epoch": 0.06690748328660773, + "grad_norm": 0.8317261338233948, + "learning_rate": 9.994149625118774e-06, + "loss": 0.9103, + "step": 1241 + }, + { + "epoch": 0.06696139745525125, + "grad_norm": 0.8915300965309143, + "learning_rate": 9.994139367809534e-06, + "loss": 0.9084, + "step": 1242 + }, + { + "epoch": 0.06701531162389476, + "grad_norm": 0.9270803332328796, + "learning_rate": 9.994129101521506e-06, + "loss": 0.7634, + "step": 1243 + }, + { + "epoch": 0.06706922579253828, + "grad_norm": 0.9891652464866638, + "learning_rate": 9.994118826254708e-06, + "loss": 0.9776, + "step": 1244 + }, + { + "epoch": 0.0671231399611818, + "grad_norm": 0.7778229713439941, + "learning_rate": 9.994108542009156e-06, + "loss": 0.7481, + "step": 1245 + }, + { + "epoch": 0.06717705412982532, + "grad_norm": 0.8451201319694519, + "learning_rate": 9.994098248784872e-06, + "loss": 0.8012, + "step": 1246 + }, + { + "epoch": 0.06723096829846884, + "grad_norm": 0.8115825057029724, + "learning_rate": 9.994087946581873e-06, + "loss": 0.874, + "step": 1247 + }, + { + "epoch": 0.06728488246711235, + "grad_norm": 0.815934419631958, + "learning_rate": 9.994077635400175e-06, + "loss": 0.8114, + "step": 1248 + }, + { + "epoch": 0.06733879663575587, + "grad_norm": 1.1179388761520386, + "learning_rate": 9.9940673152398e-06, + "loss": 0.9078, + "step": 1249 + }, + { + "epoch": 0.0673927108043994, + "grad_norm": 0.9235454201698303, + "learning_rate": 9.994056986100767e-06, + "loss": 0.7511, + "step": 1250 + }, + { + "epoch": 0.06744662497304292, + "grad_norm": 0.8568270206451416, + "learning_rate": 9.994046647983093e-06, + "loss": 0.7805, + "step": 1251 + }, + { + "epoch": 0.06750053914168644, + "grad_norm": 1.1337388753890991, + "learning_rate": 9.994036300886796e-06, + "loss": 0.8835, + "step": 1252 + }, + { + "epoch": 0.06755445331032996, + "grad_norm": 0.9154239892959595, + "learning_rate": 9.994025944811896e-06, + "loss": 0.8804, + 
"step": 1253 + }, + { + "epoch": 0.06760836747897347, + "grad_norm": 0.8301606774330139, + "learning_rate": 9.99401557975841e-06, + "loss": 0.7905, + "step": 1254 + }, + { + "epoch": 0.06766228164761699, + "grad_norm": 0.9907017350196838, + "learning_rate": 9.994005205726358e-06, + "loss": 0.9091, + "step": 1255 + }, + { + "epoch": 0.06771619581626051, + "grad_norm": 0.8883876204490662, + "learning_rate": 9.993994822715758e-06, + "loss": 0.8815, + "step": 1256 + }, + { + "epoch": 0.06777010998490403, + "grad_norm": 0.9746614098548889, + "learning_rate": 9.993984430726627e-06, + "loss": 0.7897, + "step": 1257 + }, + { + "epoch": 0.06782402415354755, + "grad_norm": 0.9773344993591309, + "learning_rate": 9.993974029758988e-06, + "loss": 0.8499, + "step": 1258 + }, + { + "epoch": 0.06787793832219108, + "grad_norm": 0.9552164077758789, + "learning_rate": 9.993963619812856e-06, + "loss": 0.711, + "step": 1259 + }, + { + "epoch": 0.0679318524908346, + "grad_norm": 0.9146968126296997, + "learning_rate": 9.993953200888252e-06, + "loss": 0.9016, + "step": 1260 + }, + { + "epoch": 0.06798576665947811, + "grad_norm": 0.924244225025177, + "learning_rate": 9.993942772985192e-06, + "loss": 0.7534, + "step": 1261 + }, + { + "epoch": 0.06803968082812163, + "grad_norm": 1.2963265180587769, + "learning_rate": 9.993932336103699e-06, + "loss": 0.9409, + "step": 1262 + }, + { + "epoch": 0.06809359499676515, + "grad_norm": 0.7954462766647339, + "learning_rate": 9.993921890243788e-06, + "loss": 0.7669, + "step": 1263 + }, + { + "epoch": 0.06814750916540867, + "grad_norm": 0.9115849137306213, + "learning_rate": 9.993911435405478e-06, + "loss": 0.7567, + "step": 1264 + }, + { + "epoch": 0.06820142333405219, + "grad_norm": 1.0030237436294556, + "learning_rate": 9.99390097158879e-06, + "loss": 0.8952, + "step": 1265 + }, + { + "epoch": 0.0682553375026957, + "grad_norm": 0.8897690773010254, + "learning_rate": 9.993890498793742e-06, + "loss": 0.7993, + "step": 1266 + }, + { + "epoch": 
0.06830925167133922, + "grad_norm": 0.9283807277679443, + "learning_rate": 9.993880017020349e-06, + "loss": 0.8808, + "step": 1267 + }, + { + "epoch": 0.06836316583998275, + "grad_norm": 0.848922848701477, + "learning_rate": 9.993869526268637e-06, + "loss": 0.7979, + "step": 1268 + }, + { + "epoch": 0.06841708000862627, + "grad_norm": 0.8896105289459229, + "learning_rate": 9.993859026538618e-06, + "loss": 0.8886, + "step": 1269 + }, + { + "epoch": 0.06847099417726979, + "grad_norm": 0.8602685928344727, + "learning_rate": 9.993848517830318e-06, + "loss": 0.8209, + "step": 1270 + }, + { + "epoch": 0.06852490834591331, + "grad_norm": 0.9300077557563782, + "learning_rate": 9.99383800014375e-06, + "loss": 0.9261, + "step": 1271 + }, + { + "epoch": 0.06857882251455683, + "grad_norm": 0.8691270351409912, + "learning_rate": 9.993827473478934e-06, + "loss": 0.9217, + "step": 1272 + }, + { + "epoch": 0.06863273668320034, + "grad_norm": 0.7943814992904663, + "learning_rate": 9.99381693783589e-06, + "loss": 0.8557, + "step": 1273 + }, + { + "epoch": 0.06868665085184386, + "grad_norm": 0.9060125946998596, + "learning_rate": 9.993806393214638e-06, + "loss": 0.8314, + "step": 1274 + }, + { + "epoch": 0.06874056502048738, + "grad_norm": 0.8014434576034546, + "learning_rate": 9.993795839615194e-06, + "loss": 0.8047, + "step": 1275 + }, + { + "epoch": 0.0687944791891309, + "grad_norm": 1.0498815774917603, + "learning_rate": 9.993785277037578e-06, + "loss": 0.7125, + "step": 1276 + }, + { + "epoch": 0.06884839335777443, + "grad_norm": 0.8868438005447388, + "learning_rate": 9.993774705481812e-06, + "loss": 0.8594, + "step": 1277 + }, + { + "epoch": 0.06890230752641795, + "grad_norm": 0.8213896155357361, + "learning_rate": 9.993764124947911e-06, + "loss": 0.7995, + "step": 1278 + }, + { + "epoch": 0.06895622169506146, + "grad_norm": 0.9007741212844849, + "learning_rate": 9.993753535435895e-06, + "loss": 0.8982, + "step": 1279 + }, + { + "epoch": 0.06901013586370498, + "grad_norm": 
0.8377478122711182, + "learning_rate": 9.993742936945785e-06, + "loss": 0.7387, + "step": 1280 + }, + { + "epoch": 0.0690640500323485, + "grad_norm": 0.8009492754936218, + "learning_rate": 9.993732329477598e-06, + "loss": 0.8079, + "step": 1281 + }, + { + "epoch": 0.06911796420099202, + "grad_norm": 0.8478789925575256, + "learning_rate": 9.993721713031354e-06, + "loss": 0.8682, + "step": 1282 + }, + { + "epoch": 0.06917187836963554, + "grad_norm": 0.7498561143875122, + "learning_rate": 9.993711087607072e-06, + "loss": 0.8107, + "step": 1283 + }, + { + "epoch": 0.06922579253827905, + "grad_norm": 0.8972634077072144, + "learning_rate": 9.99370045320477e-06, + "loss": 0.8494, + "step": 1284 + }, + { + "epoch": 0.06927970670692257, + "grad_norm": 0.942449152469635, + "learning_rate": 9.99368980982447e-06, + "loss": 0.8487, + "step": 1285 + }, + { + "epoch": 0.0693336208755661, + "grad_norm": 0.8752795457839966, + "learning_rate": 9.993679157466188e-06, + "loss": 0.8859, + "step": 1286 + }, + { + "epoch": 0.06938753504420962, + "grad_norm": 0.8289507031440735, + "learning_rate": 9.993668496129945e-06, + "loss": 0.8726, + "step": 1287 + }, + { + "epoch": 0.06944144921285314, + "grad_norm": 0.9452151656150818, + "learning_rate": 9.993657825815759e-06, + "loss": 0.9266, + "step": 1288 + }, + { + "epoch": 0.06949536338149666, + "grad_norm": 0.8697348237037659, + "learning_rate": 9.993647146523651e-06, + "loss": 0.8946, + "step": 1289 + }, + { + "epoch": 0.06954927755014018, + "grad_norm": 0.8712061643600464, + "learning_rate": 9.993636458253637e-06, + "loss": 0.8551, + "step": 1290 + }, + { + "epoch": 0.0696031917187837, + "grad_norm": 0.9295617938041687, + "learning_rate": 9.993625761005739e-06, + "loss": 0.8963, + "step": 1291 + }, + { + "epoch": 0.06965710588742721, + "grad_norm": 0.9441055059432983, + "learning_rate": 9.993615054779975e-06, + "loss": 0.9567, + "step": 1292 + }, + { + "epoch": 0.06971102005607073, + "grad_norm": 0.8742032051086426, + "learning_rate": 
9.993604339576365e-06, + "loss": 0.8341, + "step": 1293 + }, + { + "epoch": 0.06976493422471426, + "grad_norm": 0.8596220016479492, + "learning_rate": 9.993593615394928e-06, + "loss": 0.8576, + "step": 1294 + }, + { + "epoch": 0.06981884839335778, + "grad_norm": 0.8011770844459534, + "learning_rate": 9.993582882235682e-06, + "loss": 0.7317, + "step": 1295 + }, + { + "epoch": 0.0698727625620013, + "grad_norm": 0.8578245043754578, + "learning_rate": 9.993572140098648e-06, + "loss": 0.8853, + "step": 1296 + }, + { + "epoch": 0.06992667673064482, + "grad_norm": 1.1155178546905518, + "learning_rate": 9.993561388983845e-06, + "loss": 0.8199, + "step": 1297 + }, + { + "epoch": 0.06998059089928833, + "grad_norm": 1.035699486732483, + "learning_rate": 9.993550628891293e-06, + "loss": 0.9498, + "step": 1298 + }, + { + "epoch": 0.07003450506793185, + "grad_norm": 0.8635748028755188, + "learning_rate": 9.99353985982101e-06, + "loss": 0.8741, + "step": 1299 + }, + { + "epoch": 0.07008841923657537, + "grad_norm": 0.8650850653648376, + "learning_rate": 9.993529081773016e-06, + "loss": 0.7337, + "step": 1300 + }, + { + "epoch": 0.07014233340521889, + "grad_norm": 0.8334539532661438, + "learning_rate": 9.99351829474733e-06, + "loss": 0.8927, + "step": 1301 + }, + { + "epoch": 0.0701962475738624, + "grad_norm": 0.9150926470756531, + "learning_rate": 9.993507498743971e-06, + "loss": 0.8464, + "step": 1302 + }, + { + "epoch": 0.07025016174250594, + "grad_norm": 0.8916522860527039, + "learning_rate": 9.993496693762958e-06, + "loss": 0.7899, + "step": 1303 + }, + { + "epoch": 0.07030407591114946, + "grad_norm": 1.0224976539611816, + "learning_rate": 9.993485879804314e-06, + "loss": 0.8256, + "step": 1304 + }, + { + "epoch": 0.07035799007979297, + "grad_norm": 0.921816885471344, + "learning_rate": 9.993475056868054e-06, + "loss": 0.7944, + "step": 1305 + }, + { + "epoch": 0.07041190424843649, + "grad_norm": 0.8775705099105835, + "learning_rate": 9.9934642249542e-06, + "loss": 0.9098, + 
"step": 1306 + }, + { + "epoch": 0.07046581841708001, + "grad_norm": 0.9802567362785339, + "learning_rate": 9.99345338406277e-06, + "loss": 0.9756, + "step": 1307 + }, + { + "epoch": 0.07051973258572353, + "grad_norm": 0.9785491228103638, + "learning_rate": 9.993442534193786e-06, + "loss": 1.0017, + "step": 1308 + }, + { + "epoch": 0.07057364675436704, + "grad_norm": 0.8796840906143188, + "learning_rate": 9.993431675347265e-06, + "loss": 0.7202, + "step": 1309 + }, + { + "epoch": 0.07062756092301056, + "grad_norm": 0.878099799156189, + "learning_rate": 9.993420807523227e-06, + "loss": 0.8655, + "step": 1310 + }, + { + "epoch": 0.07068147509165408, + "grad_norm": 0.8361509442329407, + "learning_rate": 9.99340993072169e-06, + "loss": 0.8522, + "step": 1311 + }, + { + "epoch": 0.07073538926029761, + "grad_norm": 0.8556873798370361, + "learning_rate": 9.99339904494268e-06, + "loss": 0.8603, + "step": 1312 + }, + { + "epoch": 0.07078930342894113, + "grad_norm": 0.8434461355209351, + "learning_rate": 9.993388150186208e-06, + "loss": 0.8571, + "step": 1313 + }, + { + "epoch": 0.07084321759758465, + "grad_norm": 0.8545907139778137, + "learning_rate": 9.9933772464523e-06, + "loss": 0.8145, + "step": 1314 + }, + { + "epoch": 0.07089713176622817, + "grad_norm": 0.9502561092376709, + "learning_rate": 9.993366333740971e-06, + "loss": 0.8068, + "step": 1315 + }, + { + "epoch": 0.07095104593487168, + "grad_norm": 0.848628580570221, + "learning_rate": 9.993355412052244e-06, + "loss": 0.8793, + "step": 1316 + }, + { + "epoch": 0.0710049601035152, + "grad_norm": 0.9699797630310059, + "learning_rate": 9.993344481386137e-06, + "loss": 0.9904, + "step": 1317 + }, + { + "epoch": 0.07105887427215872, + "grad_norm": 0.8888396620750427, + "learning_rate": 9.993333541742671e-06, + "loss": 0.8363, + "step": 1318 + }, + { + "epoch": 0.07111278844080224, + "grad_norm": 0.8805423974990845, + "learning_rate": 9.993322593121863e-06, + "loss": 0.8905, + "step": 1319 + }, + { + "epoch": 
0.07116670260944576, + "grad_norm": 0.8875272274017334, + "learning_rate": 9.993311635523736e-06, + "loss": 0.7717, + "step": 1320 + }, + { + "epoch": 0.07122061677808929, + "grad_norm": 0.8853299617767334, + "learning_rate": 9.993300668948308e-06, + "loss": 0.9077, + "step": 1321 + }, + { + "epoch": 0.0712745309467328, + "grad_norm": 0.8847644329071045, + "learning_rate": 9.993289693395599e-06, + "loss": 0.8362, + "step": 1322 + }, + { + "epoch": 0.07132844511537632, + "grad_norm": 0.9531683325767517, + "learning_rate": 9.993278708865629e-06, + "loss": 0.8848, + "step": 1323 + }, + { + "epoch": 0.07138235928401984, + "grad_norm": 0.8573325276374817, + "learning_rate": 9.993267715358414e-06, + "loss": 0.8367, + "step": 1324 + }, + { + "epoch": 0.07143627345266336, + "grad_norm": 0.8920298218727112, + "learning_rate": 9.99325671287398e-06, + "loss": 0.8838, + "step": 1325 + }, + { + "epoch": 0.07149018762130688, + "grad_norm": 0.8472782969474792, + "learning_rate": 9.993245701412343e-06, + "loss": 0.8313, + "step": 1326 + }, + { + "epoch": 0.0715441017899504, + "grad_norm": 1.047664761543274, + "learning_rate": 9.993234680973525e-06, + "loss": 0.8663, + "step": 1327 + }, + { + "epoch": 0.07159801595859391, + "grad_norm": 0.9395570158958435, + "learning_rate": 9.993223651557542e-06, + "loss": 0.7703, + "step": 1328 + }, + { + "epoch": 0.07165193012723743, + "grad_norm": 0.9125472903251648, + "learning_rate": 9.993212613164419e-06, + "loss": 0.9335, + "step": 1329 + }, + { + "epoch": 0.07170584429588096, + "grad_norm": 0.9043323397636414, + "learning_rate": 9.993201565794172e-06, + "loss": 0.9185, + "step": 1330 + }, + { + "epoch": 0.07175975846452448, + "grad_norm": 0.8764339089393616, + "learning_rate": 9.993190509446821e-06, + "loss": 0.8807, + "step": 1331 + }, + { + "epoch": 0.071813672633168, + "grad_norm": 0.9123268723487854, + "learning_rate": 9.99317944412239e-06, + "loss": 0.8134, + "step": 1332 + }, + { + "epoch": 0.07186758680181152, + "grad_norm": 
0.9625567197799683, + "learning_rate": 9.993168369820892e-06, + "loss": 0.8132, + "step": 1333 + }, + { + "epoch": 0.07192150097045504, + "grad_norm": 0.880536675453186, + "learning_rate": 9.993157286542352e-06, + "loss": 0.8107, + "step": 1334 + }, + { + "epoch": 0.07197541513909855, + "grad_norm": 0.9165224432945251, + "learning_rate": 9.99314619428679e-06, + "loss": 0.8376, + "step": 1335 + }, + { + "epoch": 0.07202932930774207, + "grad_norm": 0.8278066515922546, + "learning_rate": 9.993135093054223e-06, + "loss": 0.8075, + "step": 1336 + }, + { + "epoch": 0.07208324347638559, + "grad_norm": 0.9237795472145081, + "learning_rate": 9.993123982844674e-06, + "loss": 0.7838, + "step": 1337 + }, + { + "epoch": 0.0721371576450291, + "grad_norm": 0.8200939297676086, + "learning_rate": 9.993112863658161e-06, + "loss": 0.8475, + "step": 1338 + }, + { + "epoch": 0.07219107181367264, + "grad_norm": 0.8505958318710327, + "learning_rate": 9.993101735494704e-06, + "loss": 0.7891, + "step": 1339 + }, + { + "epoch": 0.07224498598231616, + "grad_norm": 0.8407264351844788, + "learning_rate": 9.993090598354323e-06, + "loss": 0.8128, + "step": 1340 + }, + { + "epoch": 0.07229890015095967, + "grad_norm": 0.8039887547492981, + "learning_rate": 9.993079452237038e-06, + "loss": 0.8504, + "step": 1341 + }, + { + "epoch": 0.07235281431960319, + "grad_norm": 0.7590643167495728, + "learning_rate": 9.993068297142871e-06, + "loss": 0.7402, + "step": 1342 + }, + { + "epoch": 0.07240672848824671, + "grad_norm": 0.7866249680519104, + "learning_rate": 9.993057133071842e-06, + "loss": 0.7076, + "step": 1343 + }, + { + "epoch": 0.07246064265689023, + "grad_norm": 0.9846029281616211, + "learning_rate": 9.993045960023967e-06, + "loss": 0.9179, + "step": 1344 + }, + { + "epoch": 0.07251455682553375, + "grad_norm": 0.8918319940567017, + "learning_rate": 9.99303477799927e-06, + "loss": 0.8087, + "step": 1345 + }, + { + "epoch": 0.07256847099417726, + "grad_norm": 0.8407700061798096, + "learning_rate": 
9.99302358699777e-06, + "loss": 0.7272, + "step": 1346 + }, + { + "epoch": 0.0726223851628208, + "grad_norm": 0.9637326598167419, + "learning_rate": 9.993012387019486e-06, + "loss": 0.8613, + "step": 1347 + }, + { + "epoch": 0.07267629933146431, + "grad_norm": 0.8362317681312561, + "learning_rate": 9.99300117806444e-06, + "loss": 0.917, + "step": 1348 + }, + { + "epoch": 0.07273021350010783, + "grad_norm": 0.8584982752799988, + "learning_rate": 9.992989960132651e-06, + "loss": 0.8857, + "step": 1349 + }, + { + "epoch": 0.07278412766875135, + "grad_norm": 0.8341198563575745, + "learning_rate": 9.992978733224139e-06, + "loss": 0.802, + "step": 1350 + }, + { + "epoch": 0.07283804183739487, + "grad_norm": 1.6860167980194092, + "learning_rate": 9.992967497338926e-06, + "loss": 0.8789, + "step": 1351 + }, + { + "epoch": 0.07289195600603839, + "grad_norm": 0.8399189114570618, + "learning_rate": 9.99295625247703e-06, + "loss": 0.6338, + "step": 1352 + }, + { + "epoch": 0.0729458701746819, + "grad_norm": 0.9616976976394653, + "learning_rate": 9.992944998638473e-06, + "loss": 0.9735, + "step": 1353 + }, + { + "epoch": 0.07299978434332542, + "grad_norm": 0.8592861890792847, + "learning_rate": 9.992933735823272e-06, + "loss": 0.8159, + "step": 1354 + }, + { + "epoch": 0.07305369851196894, + "grad_norm": 0.8448725342750549, + "learning_rate": 9.992922464031451e-06, + "loss": 0.7942, + "step": 1355 + }, + { + "epoch": 0.07310761268061247, + "grad_norm": 0.8015927672386169, + "learning_rate": 9.99291118326303e-06, + "loss": 0.7429, + "step": 1356 + }, + { + "epoch": 0.07316152684925599, + "grad_norm": 0.8255912065505981, + "learning_rate": 9.992899893518025e-06, + "loss": 0.8532, + "step": 1357 + }, + { + "epoch": 0.07321544101789951, + "grad_norm": 0.8764085173606873, + "learning_rate": 9.992888594796462e-06, + "loss": 0.7989, + "step": 1358 + }, + { + "epoch": 0.07326935518654303, + "grad_norm": 0.8405522704124451, + "learning_rate": 9.992877287098357e-06, + "loss": 0.8709, + 
"step": 1359 + }, + { + "epoch": 0.07332326935518654, + "grad_norm": 0.8657836318016052, + "learning_rate": 9.992865970423733e-06, + "loss": 0.8236, + "step": 1360 + }, + { + "epoch": 0.07337718352383006, + "grad_norm": 0.8817959427833557, + "learning_rate": 9.992854644772609e-06, + "loss": 0.902, + "step": 1361 + }, + { + "epoch": 0.07343109769247358, + "grad_norm": 0.8290701508522034, + "learning_rate": 9.992843310145006e-06, + "loss": 0.8454, + "step": 1362 + }, + { + "epoch": 0.0734850118611171, + "grad_norm": 0.9637642502784729, + "learning_rate": 9.992831966540946e-06, + "loss": 0.9414, + "step": 1363 + }, + { + "epoch": 0.07353892602976062, + "grad_norm": 0.9220197200775146, + "learning_rate": 9.992820613960446e-06, + "loss": 0.9827, + "step": 1364 + }, + { + "epoch": 0.07359284019840415, + "grad_norm": 0.9008362889289856, + "learning_rate": 9.992809252403526e-06, + "loss": 0.8388, + "step": 1365 + }, + { + "epoch": 0.07364675436704766, + "grad_norm": 0.9517331123352051, + "learning_rate": 9.992797881870212e-06, + "loss": 0.8758, + "step": 1366 + }, + { + "epoch": 0.07370066853569118, + "grad_norm": 0.7811571359634399, + "learning_rate": 9.992786502360517e-06, + "loss": 0.6984, + "step": 1367 + }, + { + "epoch": 0.0737545827043347, + "grad_norm": 0.9887184500694275, + "learning_rate": 9.992775113874466e-06, + "loss": 0.7832, + "step": 1368 + }, + { + "epoch": 0.07380849687297822, + "grad_norm": 1.025869607925415, + "learning_rate": 9.99276371641208e-06, + "loss": 0.8417, + "step": 1369 + }, + { + "epoch": 0.07386241104162174, + "grad_norm": 0.8479165434837341, + "learning_rate": 9.99275230997338e-06, + "loss": 0.7862, + "step": 1370 + }, + { + "epoch": 0.07391632521026525, + "grad_norm": 0.9213555455207825, + "learning_rate": 9.992740894558381e-06, + "loss": 0.915, + "step": 1371 + }, + { + "epoch": 0.07397023937890877, + "grad_norm": 0.832306444644928, + "learning_rate": 9.992729470167109e-06, + "loss": 0.7566, + "step": 1372 + }, + { + "epoch": 
0.07402415354755229, + "grad_norm": 1.0360348224639893, + "learning_rate": 9.992718036799583e-06, + "loss": 0.9096, + "step": 1373 + }, + { + "epoch": 0.07407806771619582, + "grad_norm": 0.8898483514785767, + "learning_rate": 9.992706594455823e-06, + "loss": 0.8738, + "step": 1374 + }, + { + "epoch": 0.07413198188483934, + "grad_norm": 0.8813758492469788, + "learning_rate": 9.992695143135849e-06, + "loss": 0.8736, + "step": 1375 + }, + { + "epoch": 0.07418589605348286, + "grad_norm": 1.1480571031570435, + "learning_rate": 9.992683682839683e-06, + "loss": 0.915, + "step": 1376 + }, + { + "epoch": 0.07423981022212638, + "grad_norm": 0.8588376641273499, + "learning_rate": 9.992672213567345e-06, + "loss": 0.8295, + "step": 1377 + }, + { + "epoch": 0.0742937243907699, + "grad_norm": 0.8729918599128723, + "learning_rate": 9.992660735318858e-06, + "loss": 0.9058, + "step": 1378 + }, + { + "epoch": 0.07434763855941341, + "grad_norm": 0.7953224778175354, + "learning_rate": 9.992649248094236e-06, + "loss": 0.7857, + "step": 1379 + }, + { + "epoch": 0.07440155272805693, + "grad_norm": 0.8485717177391052, + "learning_rate": 9.992637751893508e-06, + "loss": 0.7641, + "step": 1380 + }, + { + "epoch": 0.07445546689670045, + "grad_norm": 0.8630878329277039, + "learning_rate": 9.99262624671669e-06, + "loss": 0.8624, + "step": 1381 + }, + { + "epoch": 0.07450938106534397, + "grad_norm": 0.8655185103416443, + "learning_rate": 9.992614732563802e-06, + "loss": 0.8428, + "step": 1382 + }, + { + "epoch": 0.0745632952339875, + "grad_norm": 0.7875732779502869, + "learning_rate": 9.992603209434868e-06, + "loss": 0.7272, + "step": 1383 + }, + { + "epoch": 0.07461720940263102, + "grad_norm": 0.875879168510437, + "learning_rate": 9.992591677329905e-06, + "loss": 0.8539, + "step": 1384 + }, + { + "epoch": 0.07467112357127453, + "grad_norm": 0.8618319034576416, + "learning_rate": 9.992580136248934e-06, + "loss": 0.879, + "step": 1385 + }, + { + "epoch": 0.07472503773991805, + "grad_norm": 
0.8695591688156128, + "learning_rate": 9.992568586191981e-06, + "loss": 0.8477, + "step": 1386 + }, + { + "epoch": 0.07477895190856157, + "grad_norm": 0.8539825677871704, + "learning_rate": 9.992557027159062e-06, + "loss": 0.7347, + "step": 1387 + }, + { + "epoch": 0.07483286607720509, + "grad_norm": 0.9625217914581299, + "learning_rate": 9.992545459150197e-06, + "loss": 0.8561, + "step": 1388 + }, + { + "epoch": 0.0748867802458486, + "grad_norm": 0.9862298369407654, + "learning_rate": 9.992533882165409e-06, + "loss": 0.9583, + "step": 1389 + }, + { + "epoch": 0.07494069441449212, + "grad_norm": 0.8217719793319702, + "learning_rate": 9.99252229620472e-06, + "loss": 0.7995, + "step": 1390 + }, + { + "epoch": 0.07499460858313564, + "grad_norm": 0.8668621182441711, + "learning_rate": 9.992510701268147e-06, + "loss": 0.8484, + "step": 1391 + }, + { + "epoch": 0.07504852275177917, + "grad_norm": 0.8549453616142273, + "learning_rate": 9.992499097355716e-06, + "loss": 0.8552, + "step": 1392 + }, + { + "epoch": 0.07510243692042269, + "grad_norm": 0.8262618184089661, + "learning_rate": 9.992487484467444e-06, + "loss": 0.7054, + "step": 1393 + }, + { + "epoch": 0.07515635108906621, + "grad_norm": 0.8524961471557617, + "learning_rate": 9.992475862603352e-06, + "loss": 0.8231, + "step": 1394 + }, + { + "epoch": 0.07521026525770973, + "grad_norm": 0.7805570363998413, + "learning_rate": 9.99246423176346e-06, + "loss": 0.7778, + "step": 1395 + }, + { + "epoch": 0.07526417942635324, + "grad_norm": 0.950484037399292, + "learning_rate": 9.992452591947794e-06, + "loss": 0.8662, + "step": 1396 + }, + { + "epoch": 0.07531809359499676, + "grad_norm": 0.8746458888053894, + "learning_rate": 9.99244094315637e-06, + "loss": 0.7854, + "step": 1397 + }, + { + "epoch": 0.07537200776364028, + "grad_norm": 0.9450538754463196, + "learning_rate": 9.992429285389212e-06, + "loss": 0.954, + "step": 1398 + }, + { + "epoch": 0.0754259219322838, + "grad_norm": 0.9048300385475159, + "learning_rate": 
9.992417618646337e-06, + "loss": 0.8915, + "step": 1399 + }, + { + "epoch": 0.07547983610092733, + "grad_norm": 0.8735381364822388, + "learning_rate": 9.99240594292777e-06, + "loss": 0.8391, + "step": 1400 + }, + { + "epoch": 0.07553375026957085, + "grad_norm": 1.0980675220489502, + "learning_rate": 9.99239425823353e-06, + "loss": 0.8892, + "step": 1401 + }, + { + "epoch": 0.07558766443821437, + "grad_norm": 0.9016425013542175, + "learning_rate": 9.992382564563638e-06, + "loss": 0.8192, + "step": 1402 + }, + { + "epoch": 0.07564157860685788, + "grad_norm": 0.801419198513031, + "learning_rate": 9.992370861918117e-06, + "loss": 0.7914, + "step": 1403 + }, + { + "epoch": 0.0756954927755014, + "grad_norm": 0.9043407440185547, + "learning_rate": 9.992359150296985e-06, + "loss": 0.8767, + "step": 1404 + }, + { + "epoch": 0.07574940694414492, + "grad_norm": 0.9703086018562317, + "learning_rate": 9.992347429700266e-06, + "loss": 0.9173, + "step": 1405 + }, + { + "epoch": 0.07580332111278844, + "grad_norm": 0.8154104351997375, + "learning_rate": 9.992335700127978e-06, + "loss": 0.8453, + "step": 1406 + }, + { + "epoch": 0.07585723528143196, + "grad_norm": 0.8551482558250427, + "learning_rate": 9.992323961580146e-06, + "loss": 0.9132, + "step": 1407 + }, + { + "epoch": 0.07591114945007547, + "grad_norm": 0.9425063729286194, + "learning_rate": 9.992312214056785e-06, + "loss": 0.8171, + "step": 1408 + }, + { + "epoch": 0.075965063618719, + "grad_norm": 0.8958794474601746, + "learning_rate": 9.992300457557922e-06, + "loss": 0.7983, + "step": 1409 + }, + { + "epoch": 0.07601897778736252, + "grad_norm": 0.873874843120575, + "learning_rate": 9.992288692083579e-06, + "loss": 0.798, + "step": 1410 + }, + { + "epoch": 0.07607289195600604, + "grad_norm": 0.7951189279556274, + "learning_rate": 9.99227691763377e-06, + "loss": 0.8671, + "step": 1411 + }, + { + "epoch": 0.07612680612464956, + "grad_norm": 0.8073802590370178, + "learning_rate": 9.992265134208522e-06, + "loss": 0.8214, + 
"step": 1412 + }, + { + "epoch": 0.07618072029329308, + "grad_norm": 0.918222188949585, + "learning_rate": 9.992253341807854e-06, + "loss": 0.807, + "step": 1413 + }, + { + "epoch": 0.0762346344619366, + "grad_norm": 0.834381103515625, + "learning_rate": 9.992241540431789e-06, + "loss": 0.8737, + "step": 1414 + }, + { + "epoch": 0.07628854863058011, + "grad_norm": 0.808437168598175, + "learning_rate": 9.992229730080347e-06, + "loss": 0.7982, + "step": 1415 + }, + { + "epoch": 0.07634246279922363, + "grad_norm": 0.7868708968162537, + "learning_rate": 9.992217910753547e-06, + "loss": 0.7071, + "step": 1416 + }, + { + "epoch": 0.07639637696786715, + "grad_norm": 0.8445919156074524, + "learning_rate": 9.992206082451416e-06, + "loss": 0.8353, + "step": 1417 + }, + { + "epoch": 0.07645029113651068, + "grad_norm": 0.8283419609069824, + "learning_rate": 9.992194245173969e-06, + "loss": 0.867, + "step": 1418 + }, + { + "epoch": 0.0765042053051542, + "grad_norm": 0.8390635251998901, + "learning_rate": 9.99218239892123e-06, + "loss": 0.822, + "step": 1419 + }, + { + "epoch": 0.07655811947379772, + "grad_norm": 0.9037001132965088, + "learning_rate": 9.992170543693222e-06, + "loss": 0.8759, + "step": 1420 + }, + { + "epoch": 0.07661203364244124, + "grad_norm": 0.9708169102668762, + "learning_rate": 9.992158679489965e-06, + "loss": 0.875, + "step": 1421 + }, + { + "epoch": 0.07666594781108475, + "grad_norm": 0.8712205290794373, + "learning_rate": 9.992146806311479e-06, + "loss": 0.8711, + "step": 1422 + }, + { + "epoch": 0.07671986197972827, + "grad_norm": 0.953936755657196, + "learning_rate": 9.992134924157786e-06, + "loss": 0.8117, + "step": 1423 + }, + { + "epoch": 0.07677377614837179, + "grad_norm": 1.3178669214248657, + "learning_rate": 9.992123033028908e-06, + "loss": 0.8932, + "step": 1424 + }, + { + "epoch": 0.0768276903170153, + "grad_norm": 0.8657799959182739, + "learning_rate": 9.992111132924867e-06, + "loss": 0.8429, + "step": 1425 + }, + { + "epoch": 
0.07688160448565882, + "grad_norm": 0.8979378938674927, + "learning_rate": 9.992099223845681e-06, + "loss": 0.9165, + "step": 1426 + }, + { + "epoch": 0.07693551865430236, + "grad_norm": 0.797493040561676, + "learning_rate": 9.992087305791376e-06, + "loss": 0.8139, + "step": 1427 + }, + { + "epoch": 0.07698943282294587, + "grad_norm": 0.9762497544288635, + "learning_rate": 9.99207537876197e-06, + "loss": 0.8006, + "step": 1428 + }, + { + "epoch": 0.07704334699158939, + "grad_norm": 0.9322238564491272, + "learning_rate": 9.992063442757487e-06, + "loss": 0.8708, + "step": 1429 + }, + { + "epoch": 0.07709726116023291, + "grad_norm": 0.9208402037620544, + "learning_rate": 9.992051497777947e-06, + "loss": 0.9137, + "step": 1430 + }, + { + "epoch": 0.07715117532887643, + "grad_norm": 0.9262849688529968, + "learning_rate": 9.99203954382337e-06, + "loss": 0.8043, + "step": 1431 + }, + { + "epoch": 0.07720508949751995, + "grad_norm": 1.0556507110595703, + "learning_rate": 9.992027580893781e-06, + "loss": 0.8321, + "step": 1432 + }, + { + "epoch": 0.07725900366616346, + "grad_norm": 1.0503417253494263, + "learning_rate": 9.9920156089892e-06, + "loss": 0.8875, + "step": 1433 + }, + { + "epoch": 0.07731291783480698, + "grad_norm": 0.8772387504577637, + "learning_rate": 9.992003628109647e-06, + "loss": 0.7407, + "step": 1434 + }, + { + "epoch": 0.0773668320034505, + "grad_norm": 0.942286491394043, + "learning_rate": 9.991991638255146e-06, + "loss": 0.8493, + "step": 1435 + }, + { + "epoch": 0.07742074617209403, + "grad_norm": 0.8584794998168945, + "learning_rate": 9.991979639425717e-06, + "loss": 0.8003, + "step": 1436 + }, + { + "epoch": 0.07747466034073755, + "grad_norm": 0.8247780203819275, + "learning_rate": 9.99196763162138e-06, + "loss": 0.9156, + "step": 1437 + }, + { + "epoch": 0.07752857450938107, + "grad_norm": 0.859018862247467, + "learning_rate": 9.99195561484216e-06, + "loss": 0.8255, + "step": 1438 + }, + { + "epoch": 0.07758248867802459, + "grad_norm": 
0.9073282480239868, + "learning_rate": 9.991943589088078e-06, + "loss": 0.903, + "step": 1439 + }, + { + "epoch": 0.0776364028466681, + "grad_norm": 0.9324385523796082, + "learning_rate": 9.991931554359154e-06, + "loss": 0.8618, + "step": 1440 + }, + { + "epoch": 0.07769031701531162, + "grad_norm": 0.8038938045501709, + "learning_rate": 9.991919510655409e-06, + "loss": 0.7545, + "step": 1441 + }, + { + "epoch": 0.07774423118395514, + "grad_norm": 0.7999526858329773, + "learning_rate": 9.991907457976866e-06, + "loss": 0.6804, + "step": 1442 + }, + { + "epoch": 0.07779814535259866, + "grad_norm": 1.0165048837661743, + "learning_rate": 9.991895396323548e-06, + "loss": 0.7664, + "step": 1443 + }, + { + "epoch": 0.07785205952124218, + "grad_norm": 0.9513073563575745, + "learning_rate": 9.991883325695475e-06, + "loss": 0.8115, + "step": 1444 + }, + { + "epoch": 0.07790597368988571, + "grad_norm": 1.0391769409179688, + "learning_rate": 9.991871246092669e-06, + "loss": 0.9197, + "step": 1445 + }, + { + "epoch": 0.07795988785852923, + "grad_norm": 0.8990768194198608, + "learning_rate": 9.991859157515151e-06, + "loss": 0.9507, + "step": 1446 + }, + { + "epoch": 0.07801380202717274, + "grad_norm": 0.9990912079811096, + "learning_rate": 9.991847059962945e-06, + "loss": 0.7951, + "step": 1447 + }, + { + "epoch": 0.07806771619581626, + "grad_norm": 1.0030032396316528, + "learning_rate": 9.99183495343607e-06, + "loss": 0.7237, + "step": 1448 + }, + { + "epoch": 0.07812163036445978, + "grad_norm": 0.889561116695404, + "learning_rate": 9.991822837934551e-06, + "loss": 0.9061, + "step": 1449 + }, + { + "epoch": 0.0781755445331033, + "grad_norm": 0.8766982555389404, + "learning_rate": 9.991810713458405e-06, + "loss": 0.7952, + "step": 1450 + }, + { + "epoch": 0.07822945870174682, + "grad_norm": 0.9144406914710999, + "learning_rate": 9.991798580007658e-06, + "loss": 0.9235, + "step": 1451 + }, + { + "epoch": 0.07828337287039033, + "grad_norm": 0.895516037940979, + "learning_rate": 
9.99178643758233e-06, + "loss": 0.9469, + "step": 1452 + }, + { + "epoch": 0.07833728703903386, + "grad_norm": 0.8802943229675293, + "learning_rate": 9.991774286182443e-06, + "loss": 0.8548, + "step": 1453 + }, + { + "epoch": 0.07839120120767738, + "grad_norm": 1.2773913145065308, + "learning_rate": 9.99176212580802e-06, + "loss": 0.794, + "step": 1454 + }, + { + "epoch": 0.0784451153763209, + "grad_norm": 0.9501168131828308, + "learning_rate": 9.99174995645908e-06, + "loss": 0.8711, + "step": 1455 + }, + { + "epoch": 0.07849902954496442, + "grad_norm": 0.9047390222549438, + "learning_rate": 9.991737778135649e-06, + "loss": 0.8419, + "step": 1456 + }, + { + "epoch": 0.07855294371360794, + "grad_norm": 0.9492837190628052, + "learning_rate": 9.991725590837747e-06, + "loss": 0.9832, + "step": 1457 + }, + { + "epoch": 0.07860685788225145, + "grad_norm": 0.9585106372833252, + "learning_rate": 9.991713394565394e-06, + "loss": 0.8393, + "step": 1458 + }, + { + "epoch": 0.07866077205089497, + "grad_norm": 0.9568297266960144, + "learning_rate": 9.991701189318615e-06, + "loss": 0.8711, + "step": 1459 + }, + { + "epoch": 0.07871468621953849, + "grad_norm": 0.9201347231864929, + "learning_rate": 9.991688975097429e-06, + "loss": 0.7947, + "step": 1460 + }, + { + "epoch": 0.07876860038818201, + "grad_norm": 0.8375768661499023, + "learning_rate": 9.99167675190186e-06, + "loss": 0.8051, + "step": 1461 + }, + { + "epoch": 0.07882251455682554, + "grad_norm": 0.8397765755653381, + "learning_rate": 9.99166451973193e-06, + "loss": 0.7727, + "step": 1462 + }, + { + "epoch": 0.07887642872546906, + "grad_norm": 0.8697947859764099, + "learning_rate": 9.99165227858766e-06, + "loss": 0.8171, + "step": 1463 + }, + { + "epoch": 0.07893034289411258, + "grad_norm": 0.8894750475883484, + "learning_rate": 9.991640028469073e-06, + "loss": 0.8773, + "step": 1464 + }, + { + "epoch": 0.0789842570627561, + "grad_norm": 0.8817871809005737, + "learning_rate": 9.991627769376189e-06, + "loss": 0.8983, + 
"step": 1465 + }, + { + "epoch": 0.07903817123139961, + "grad_norm": 0.9241123795509338, + "learning_rate": 9.99161550130903e-06, + "loss": 0.8967, + "step": 1466 + }, + { + "epoch": 0.07909208540004313, + "grad_norm": 0.852982223033905, + "learning_rate": 9.991603224267623e-06, + "loss": 0.9054, + "step": 1467 + }, + { + "epoch": 0.07914599956868665, + "grad_norm": 0.7719098925590515, + "learning_rate": 9.991590938251986e-06, + "loss": 0.7845, + "step": 1468 + }, + { + "epoch": 0.07919991373733017, + "grad_norm": 0.8700329661369324, + "learning_rate": 9.99157864326214e-06, + "loss": 0.9664, + "step": 1469 + }, + { + "epoch": 0.07925382790597368, + "grad_norm": 0.880553126335144, + "learning_rate": 9.991566339298112e-06, + "loss": 0.8803, + "step": 1470 + }, + { + "epoch": 0.07930774207461722, + "grad_norm": 0.9425762295722961, + "learning_rate": 9.991554026359918e-06, + "loss": 0.8259, + "step": 1471 + }, + { + "epoch": 0.07936165624326073, + "grad_norm": 0.8611294031143188, + "learning_rate": 9.991541704447585e-06, + "loss": 0.8693, + "step": 1472 + }, + { + "epoch": 0.07941557041190425, + "grad_norm": 0.856023907661438, + "learning_rate": 9.99152937356113e-06, + "loss": 0.7073, + "step": 1473 + }, + { + "epoch": 0.07946948458054777, + "grad_norm": 0.7763693332672119, + "learning_rate": 9.991517033700582e-06, + "loss": 0.6815, + "step": 1474 + }, + { + "epoch": 0.07952339874919129, + "grad_norm": 0.8417321443557739, + "learning_rate": 9.991504684865959e-06, + "loss": 0.8239, + "step": 1475 + }, + { + "epoch": 0.0795773129178348, + "grad_norm": 0.9151323437690735, + "learning_rate": 9.991492327057282e-06, + "loss": 0.8327, + "step": 1476 + }, + { + "epoch": 0.07963122708647832, + "grad_norm": 0.8285405039787292, + "learning_rate": 9.991479960274576e-06, + "loss": 0.8623, + "step": 1477 + }, + { + "epoch": 0.07968514125512184, + "grad_norm": 0.8204792141914368, + "learning_rate": 9.991467584517863e-06, + "loss": 0.8494, + "step": 1478 + }, + { + "epoch": 
0.07973905542376536, + "grad_norm": 0.8516230583190918, + "learning_rate": 9.991455199787164e-06, + "loss": 0.8219, + "step": 1479 + }, + { + "epoch": 0.07979296959240889, + "grad_norm": 0.9418333172798157, + "learning_rate": 9.991442806082501e-06, + "loss": 0.9293, + "step": 1480 + }, + { + "epoch": 0.07984688376105241, + "grad_norm": 0.8852763175964355, + "learning_rate": 9.991430403403898e-06, + "loss": 0.8124, + "step": 1481 + }, + { + "epoch": 0.07990079792969593, + "grad_norm": 0.8435791730880737, + "learning_rate": 9.991417991751376e-06, + "loss": 0.8634, + "step": 1482 + }, + { + "epoch": 0.07995471209833944, + "grad_norm": 0.7795083522796631, + "learning_rate": 9.991405571124957e-06, + "loss": 0.802, + "step": 1483 + }, + { + "epoch": 0.08000862626698296, + "grad_norm": 0.8102303743362427, + "learning_rate": 9.991393141524663e-06, + "loss": 0.7492, + "step": 1484 + }, + { + "epoch": 0.08006254043562648, + "grad_norm": 0.8433593511581421, + "learning_rate": 9.99138070295052e-06, + "loss": 0.7926, + "step": 1485 + }, + { + "epoch": 0.08011645460427, + "grad_norm": 0.8992267847061157, + "learning_rate": 9.991368255402546e-06, + "loss": 0.7859, + "step": 1486 + }, + { + "epoch": 0.08017036877291352, + "grad_norm": 0.8748059868812561, + "learning_rate": 9.991355798880765e-06, + "loss": 0.8245, + "step": 1487 + }, + { + "epoch": 0.08022428294155703, + "grad_norm": 0.8456832766532898, + "learning_rate": 9.9913433333852e-06, + "loss": 0.9009, + "step": 1488 + }, + { + "epoch": 0.08027819711020057, + "grad_norm": 0.8582474589347839, + "learning_rate": 9.991330858915873e-06, + "loss": 0.7607, + "step": 1489 + }, + { + "epoch": 0.08033211127884408, + "grad_norm": 0.8157060146331787, + "learning_rate": 9.991318375472807e-06, + "loss": 0.8426, + "step": 1490 + }, + { + "epoch": 0.0803860254474876, + "grad_norm": 0.7474784851074219, + "learning_rate": 9.991305883056021e-06, + "loss": 0.8014, + "step": 1491 + }, + { + "epoch": 0.08043993961613112, + "grad_norm": 
0.8432475924491882, + "learning_rate": 9.991293381665543e-06, + "loss": 0.8254, + "step": 1492 + }, + { + "epoch": 0.08049385378477464, + "grad_norm": 0.8733057379722595, + "learning_rate": 9.991280871301392e-06, + "loss": 0.8694, + "step": 1493 + }, + { + "epoch": 0.08054776795341816, + "grad_norm": 0.8694074153900146, + "learning_rate": 9.991268351963592e-06, + "loss": 0.7306, + "step": 1494 + }, + { + "epoch": 0.08060168212206167, + "grad_norm": 0.8981258869171143, + "learning_rate": 9.991255823652162e-06, + "loss": 0.7821, + "step": 1495 + }, + { + "epoch": 0.08065559629070519, + "grad_norm": 0.9740719795227051, + "learning_rate": 9.99124328636713e-06, + "loss": 0.7678, + "step": 1496 + }, + { + "epoch": 0.08070951045934871, + "grad_norm": 0.8847763538360596, + "learning_rate": 9.991230740108515e-06, + "loss": 0.73, + "step": 1497 + }, + { + "epoch": 0.08076342462799224, + "grad_norm": 0.8909339308738708, + "learning_rate": 9.99121818487634e-06, + "loss": 0.7713, + "step": 1498 + }, + { + "epoch": 0.08081733879663576, + "grad_norm": 0.8183975219726562, + "learning_rate": 9.991205620670626e-06, + "loss": 0.8234, + "step": 1499 + }, + { + "epoch": 0.08087125296527928, + "grad_norm": 1.241355299949646, + "learning_rate": 9.991193047491399e-06, + "loss": 0.8135, + "step": 1500 + }, + { + "epoch": 0.0809251671339228, + "grad_norm": 0.9039500951766968, + "learning_rate": 9.991180465338682e-06, + "loss": 0.8642, + "step": 1501 + }, + { + "epoch": 0.08097908130256631, + "grad_norm": 1.1762068271636963, + "learning_rate": 9.991167874212493e-06, + "loss": 0.7892, + "step": 1502 + }, + { + "epoch": 0.08103299547120983, + "grad_norm": 0.8402833938598633, + "learning_rate": 9.991155274112857e-06, + "loss": 0.9054, + "step": 1503 + }, + { + "epoch": 0.08108690963985335, + "grad_norm": 0.9271976351737976, + "learning_rate": 9.991142665039799e-06, + "loss": 0.8902, + "step": 1504 + }, + { + "epoch": 0.08114082380849687, + "grad_norm": 0.9105845093727112, + "learning_rate": 
9.991130046993337e-06, + "loss": 0.8522, + "step": 1505 + }, + { + "epoch": 0.0811947379771404, + "grad_norm": 0.8248290419578552, + "learning_rate": 9.991117419973499e-06, + "loss": 0.882, + "step": 1506 + }, + { + "epoch": 0.08124865214578392, + "grad_norm": 1.0726820230484009, + "learning_rate": 9.991104783980305e-06, + "loss": 0.8001, + "step": 1507 + }, + { + "epoch": 0.08130256631442744, + "grad_norm": 1.296281337738037, + "learning_rate": 9.991092139013776e-06, + "loss": 1.0022, + "step": 1508 + }, + { + "epoch": 0.08135648048307095, + "grad_norm": 1.7287628650665283, + "learning_rate": 9.991079485073938e-06, + "loss": 0.914, + "step": 1509 + }, + { + "epoch": 0.08141039465171447, + "grad_norm": 0.8731694221496582, + "learning_rate": 9.991066822160813e-06, + "loss": 0.8672, + "step": 1510 + }, + { + "epoch": 0.08146430882035799, + "grad_norm": 0.875747799873352, + "learning_rate": 9.99105415027442e-06, + "loss": 0.8044, + "step": 1511 + }, + { + "epoch": 0.08151822298900151, + "grad_norm": 0.9055120348930359, + "learning_rate": 9.991041469414787e-06, + "loss": 0.8312, + "step": 1512 + }, + { + "epoch": 0.08157213715764502, + "grad_norm": 0.8849499821662903, + "learning_rate": 9.991028779581935e-06, + "loss": 0.889, + "step": 1513 + }, + { + "epoch": 0.08162605132628854, + "grad_norm": 0.9549855589866638, + "learning_rate": 9.991016080775884e-06, + "loss": 0.8929, + "step": 1514 + }, + { + "epoch": 0.08167996549493207, + "grad_norm": 0.8395527005195618, + "learning_rate": 9.991003372996662e-06, + "loss": 0.6774, + "step": 1515 + }, + { + "epoch": 0.08173387966357559, + "grad_norm": 0.7791672945022583, + "learning_rate": 9.990990656244287e-06, + "loss": 0.7178, + "step": 1516 + }, + { + "epoch": 0.08178779383221911, + "grad_norm": 0.91841721534729, + "learning_rate": 9.990977930518785e-06, + "loss": 0.8372, + "step": 1517 + }, + { + "epoch": 0.08184170800086263, + "grad_norm": 0.923937976360321, + "learning_rate": 9.990965195820178e-06, + "loss": 0.8467, + 
"step": 1518 + }, + { + "epoch": 0.08189562216950615, + "grad_norm": 0.9804415106773376, + "learning_rate": 9.990952452148488e-06, + "loss": 0.9281, + "step": 1519 + }, + { + "epoch": 0.08194953633814966, + "grad_norm": 0.9396255016326904, + "learning_rate": 9.99093969950374e-06, + "loss": 0.8606, + "step": 1520 + }, + { + "epoch": 0.08200345050679318, + "grad_norm": 0.8492118120193481, + "learning_rate": 9.990926937885953e-06, + "loss": 0.8253, + "step": 1521 + }, + { + "epoch": 0.0820573646754367, + "grad_norm": 0.8482204079627991, + "learning_rate": 9.990914167295154e-06, + "loss": 0.7361, + "step": 1522 + }, + { + "epoch": 0.08211127884408022, + "grad_norm": 1.1302778720855713, + "learning_rate": 9.990901387731365e-06, + "loss": 0.7511, + "step": 1523 + }, + { + "epoch": 0.08216519301272375, + "grad_norm": 0.9285756945610046, + "learning_rate": 9.990888599194607e-06, + "loss": 0.8329, + "step": 1524 + }, + { + "epoch": 0.08221910718136727, + "grad_norm": 0.8932104110717773, + "learning_rate": 9.990875801684905e-06, + "loss": 0.8146, + "step": 1525 + }, + { + "epoch": 0.08227302135001079, + "grad_norm": 0.8232647180557251, + "learning_rate": 9.990862995202282e-06, + "loss": 0.763, + "step": 1526 + }, + { + "epoch": 0.0823269355186543, + "grad_norm": 0.8582163453102112, + "learning_rate": 9.990850179746759e-06, + "loss": 0.7675, + "step": 1527 + }, + { + "epoch": 0.08238084968729782, + "grad_norm": 0.9890977144241333, + "learning_rate": 9.990837355318362e-06, + "loss": 0.8438, + "step": 1528 + }, + { + "epoch": 0.08243476385594134, + "grad_norm": 0.9228235483169556, + "learning_rate": 9.990824521917113e-06, + "loss": 0.9324, + "step": 1529 + }, + { + "epoch": 0.08248867802458486, + "grad_norm": 0.8286252617835999, + "learning_rate": 9.990811679543033e-06, + "loss": 0.872, + "step": 1530 + }, + { + "epoch": 0.08254259219322838, + "grad_norm": 0.8546530604362488, + "learning_rate": 9.990798828196146e-06, + "loss": 0.7256, + "step": 1531 + }, + { + "epoch": 
0.0825965063618719, + "grad_norm": 0.8240640759468079, + "learning_rate": 9.990785967876478e-06, + "loss": 0.8083, + "step": 1532 + }, + { + "epoch": 0.08265042053051543, + "grad_norm": 0.8650565147399902, + "learning_rate": 9.99077309858405e-06, + "loss": 0.8274, + "step": 1533 + }, + { + "epoch": 0.08270433469915894, + "grad_norm": 0.7865849137306213, + "learning_rate": 9.990760220318884e-06, + "loss": 0.7978, + "step": 1534 + }, + { + "epoch": 0.08275824886780246, + "grad_norm": 0.8567995429039001, + "learning_rate": 9.990747333081005e-06, + "loss": 0.8172, + "step": 1535 + }, + { + "epoch": 0.08281216303644598, + "grad_norm": 0.8242521286010742, + "learning_rate": 9.990734436870435e-06, + "loss": 0.8045, + "step": 1536 + }, + { + "epoch": 0.0828660772050895, + "grad_norm": 0.801266074180603, + "learning_rate": 9.990721531687197e-06, + "loss": 0.8312, + "step": 1537 + }, + { + "epoch": 0.08291999137373302, + "grad_norm": 0.8027862906455994, + "learning_rate": 9.990708617531314e-06, + "loss": 0.7227, + "step": 1538 + }, + { + "epoch": 0.08297390554237653, + "grad_norm": 1.0332401990890503, + "learning_rate": 9.990695694402811e-06, + "loss": 0.9091, + "step": 1539 + }, + { + "epoch": 0.08302781971102005, + "grad_norm": 0.8537373542785645, + "learning_rate": 9.99068276230171e-06, + "loss": 0.7573, + "step": 1540 + }, + { + "epoch": 0.08308173387966357, + "grad_norm": 0.8734087944030762, + "learning_rate": 9.990669821228037e-06, + "loss": 0.901, + "step": 1541 + }, + { + "epoch": 0.0831356480483071, + "grad_norm": 0.8546577095985413, + "learning_rate": 9.99065687118181e-06, + "loss": 0.8294, + "step": 1542 + }, + { + "epoch": 0.08318956221695062, + "grad_norm": 0.9555438756942749, + "learning_rate": 9.990643912163055e-06, + "loss": 0.83, + "step": 1543 + }, + { + "epoch": 0.08324347638559414, + "grad_norm": 0.8778670430183411, + "learning_rate": 9.990630944171798e-06, + "loss": 0.8694, + "step": 1544 + }, + { + "epoch": 0.08329739055423765, + "grad_norm": 
0.973791241645813, + "learning_rate": 9.990617967208058e-06, + "loss": 0.8348, + "step": 1545 + }, + { + "epoch": 0.08335130472288117, + "grad_norm": 0.7933714389801025, + "learning_rate": 9.990604981271858e-06, + "loss": 0.8208, + "step": 1546 + }, + { + "epoch": 0.08340521889152469, + "grad_norm": 0.9328469634056091, + "learning_rate": 9.990591986363226e-06, + "loss": 0.8188, + "step": 1547 + }, + { + "epoch": 0.08345913306016821, + "grad_norm": 0.8217103481292725, + "learning_rate": 9.990578982482183e-06, + "loss": 0.7948, + "step": 1548 + }, + { + "epoch": 0.08351304722881173, + "grad_norm": 0.8556894659996033, + "learning_rate": 9.990565969628749e-06, + "loss": 0.8129, + "step": 1549 + }, + { + "epoch": 0.08356696139745524, + "grad_norm": 0.901633083820343, + "learning_rate": 9.990552947802954e-06, + "loss": 0.9025, + "step": 1550 + }, + { + "epoch": 0.08362087556609878, + "grad_norm": 0.9021494388580322, + "learning_rate": 9.990539917004815e-06, + "loss": 0.8882, + "step": 1551 + }, + { + "epoch": 0.0836747897347423, + "grad_norm": 0.8187722563743591, + "learning_rate": 9.990526877234359e-06, + "loss": 0.7385, + "step": 1552 + }, + { + "epoch": 0.08372870390338581, + "grad_norm": 0.9237630367279053, + "learning_rate": 9.990513828491609e-06, + "loss": 0.851, + "step": 1553 + }, + { + "epoch": 0.08378261807202933, + "grad_norm": 1.1868582963943481, + "learning_rate": 9.990500770776589e-06, + "loss": 0.7701, + "step": 1554 + }, + { + "epoch": 0.08383653224067285, + "grad_norm": 0.9831421971321106, + "learning_rate": 9.990487704089322e-06, + "loss": 0.836, + "step": 1555 + }, + { + "epoch": 0.08389044640931637, + "grad_norm": 0.9255663752555847, + "learning_rate": 9.99047462842983e-06, + "loss": 0.7916, + "step": 1556 + }, + { + "epoch": 0.08394436057795988, + "grad_norm": 1.0069084167480469, + "learning_rate": 9.990461543798137e-06, + "loss": 0.8652, + "step": 1557 + }, + { + "epoch": 0.0839982747466034, + "grad_norm": 0.943044900894165, + "learning_rate": 
9.990448450194267e-06, + "loss": 0.9511, + "step": 1558 + }, + { + "epoch": 0.08405218891524693, + "grad_norm": 0.9996150135993958, + "learning_rate": 9.990435347618246e-06, + "loss": 0.8751, + "step": 1559 + }, + { + "epoch": 0.08410610308389045, + "grad_norm": 0.9531681537628174, + "learning_rate": 9.990422236070094e-06, + "loss": 0.8988, + "step": 1560 + }, + { + "epoch": 0.08416001725253397, + "grad_norm": 0.9504678249359131, + "learning_rate": 9.990409115549837e-06, + "loss": 0.808, + "step": 1561 + }, + { + "epoch": 0.08421393142117749, + "grad_norm": 0.9796282052993774, + "learning_rate": 9.990395986057496e-06, + "loss": 0.778, + "step": 1562 + }, + { + "epoch": 0.084267845589821, + "grad_norm": 0.8871618509292603, + "learning_rate": 9.990382847593096e-06, + "loss": 0.8945, + "step": 1563 + }, + { + "epoch": 0.08432175975846452, + "grad_norm": 0.8253110647201538, + "learning_rate": 9.990369700156662e-06, + "loss": 0.8206, + "step": 1564 + }, + { + "epoch": 0.08437567392710804, + "grad_norm": 0.8799824118614197, + "learning_rate": 9.990356543748216e-06, + "loss": 0.7665, + "step": 1565 + }, + { + "epoch": 0.08442958809575156, + "grad_norm": 0.8275637626647949, + "learning_rate": 9.990343378367782e-06, + "loss": 0.8468, + "step": 1566 + }, + { + "epoch": 0.08448350226439508, + "grad_norm": 1.0431691408157349, + "learning_rate": 9.990330204015382e-06, + "loss": 0.8539, + "step": 1567 + }, + { + "epoch": 0.08453741643303861, + "grad_norm": 1.298999547958374, + "learning_rate": 9.990317020691043e-06, + "loss": 0.8989, + "step": 1568 + }, + { + "epoch": 0.08459133060168213, + "grad_norm": 0.865868866443634, + "learning_rate": 9.990303828394787e-06, + "loss": 0.8296, + "step": 1569 + }, + { + "epoch": 0.08464524477032564, + "grad_norm": 0.9162652492523193, + "learning_rate": 9.990290627126637e-06, + "loss": 0.8617, + "step": 1570 + }, + { + "epoch": 0.08469915893896916, + "grad_norm": 0.9753283858299255, + "learning_rate": 9.990277416886618e-06, + "loss": 0.8082, + 
"step": 1571 + }, + { + "epoch": 0.08475307310761268, + "grad_norm": 0.9561176300048828, + "learning_rate": 9.990264197674754e-06, + "loss": 0.8678, + "step": 1572 + }, + { + "epoch": 0.0848069872762562, + "grad_norm": 0.833341658115387, + "learning_rate": 9.990250969491067e-06, + "loss": 0.8164, + "step": 1573 + }, + { + "epoch": 0.08486090144489972, + "grad_norm": 0.9928603172302246, + "learning_rate": 9.990237732335581e-06, + "loss": 0.6889, + "step": 1574 + }, + { + "epoch": 0.08491481561354323, + "grad_norm": 1.0163367986679077, + "learning_rate": 9.990224486208322e-06, + "loss": 0.8278, + "step": 1575 + }, + { + "epoch": 0.08496872978218675, + "grad_norm": 0.9905970096588135, + "learning_rate": 9.990211231109312e-06, + "loss": 0.8094, + "step": 1576 + }, + { + "epoch": 0.08502264395083028, + "grad_norm": 0.9112648963928223, + "learning_rate": 9.990197967038574e-06, + "loss": 0.8782, + "step": 1577 + }, + { + "epoch": 0.0850765581194738, + "grad_norm": 1.1176974773406982, + "learning_rate": 9.990184693996136e-06, + "loss": 0.8826, + "step": 1578 + }, + { + "epoch": 0.08513047228811732, + "grad_norm": 0.7696222066879272, + "learning_rate": 9.990171411982016e-06, + "loss": 0.8025, + "step": 1579 + }, + { + "epoch": 0.08518438645676084, + "grad_norm": 0.9288634061813354, + "learning_rate": 9.990158120996242e-06, + "loss": 0.8777, + "step": 1580 + }, + { + "epoch": 0.08523830062540436, + "grad_norm": 0.9235022068023682, + "learning_rate": 9.990144821038839e-06, + "loss": 0.9339, + "step": 1581 + }, + { + "epoch": 0.08529221479404787, + "grad_norm": 0.9124205708503723, + "learning_rate": 9.990131512109826e-06, + "loss": 0.8368, + "step": 1582 + }, + { + "epoch": 0.08534612896269139, + "grad_norm": 0.8409048914909363, + "learning_rate": 9.990118194209229e-06, + "loss": 0.7772, + "step": 1583 + }, + { + "epoch": 0.08540004313133491, + "grad_norm": 0.8279136419296265, + "learning_rate": 9.990104867337074e-06, + "loss": 0.738, + "step": 1584 + }, + { + "epoch": 
0.08545395729997843, + "grad_norm": 0.8895745873451233, + "learning_rate": 9.990091531493382e-06, + "loss": 0.7669, + "step": 1585 + }, + { + "epoch": 0.08550787146862196, + "grad_norm": 0.9280734062194824, + "learning_rate": 9.99007818667818e-06, + "loss": 0.9052, + "step": 1586 + }, + { + "epoch": 0.08556178563726548, + "grad_norm": 0.7676610350608826, + "learning_rate": 9.990064832891491e-06, + "loss": 0.807, + "step": 1587 + }, + { + "epoch": 0.085615699805909, + "grad_norm": 0.9035676121711731, + "learning_rate": 9.990051470133337e-06, + "loss": 0.8848, + "step": 1588 + }, + { + "epoch": 0.08566961397455251, + "grad_norm": 1.0960334539413452, + "learning_rate": 9.990038098403742e-06, + "loss": 0.8279, + "step": 1589 + }, + { + "epoch": 0.08572352814319603, + "grad_norm": 0.87922203540802, + "learning_rate": 9.990024717702736e-06, + "loss": 0.8325, + "step": 1590 + }, + { + "epoch": 0.08577744231183955, + "grad_norm": 0.922815203666687, + "learning_rate": 9.990011328030335e-06, + "loss": 0.881, + "step": 1591 + }, + { + "epoch": 0.08583135648048307, + "grad_norm": 0.9880780577659607, + "learning_rate": 9.989997929386567e-06, + "loss": 0.7506, + "step": 1592 + }, + { + "epoch": 0.08588527064912659, + "grad_norm": 0.8827483057975769, + "learning_rate": 9.989984521771456e-06, + "loss": 0.8961, + "step": 1593 + }, + { + "epoch": 0.0859391848177701, + "grad_norm": 0.8395072817802429, + "learning_rate": 9.989971105185026e-06, + "loss": 0.8564, + "step": 1594 + }, + { + "epoch": 0.08599309898641364, + "grad_norm": 0.8731534481048584, + "learning_rate": 9.989957679627302e-06, + "loss": 0.8209, + "step": 1595 + }, + { + "epoch": 0.08604701315505715, + "grad_norm": 0.7969424724578857, + "learning_rate": 9.989944245098305e-06, + "loss": 0.8031, + "step": 1596 + }, + { + "epoch": 0.08610092732370067, + "grad_norm": 0.8420547246932983, + "learning_rate": 9.989930801598062e-06, + "loss": 0.8027, + "step": 1597 + }, + { + "epoch": 0.08615484149234419, + "grad_norm": 
0.7900253534317017, + "learning_rate": 9.989917349126597e-06, + "loss": 0.8246, + "step": 1598 + }, + { + "epoch": 0.08620875566098771, + "grad_norm": 0.8860716819763184, + "learning_rate": 9.989903887683934e-06, + "loss": 0.7846, + "step": 1599 + }, + { + "epoch": 0.08626266982963122, + "grad_norm": 0.907744288444519, + "learning_rate": 9.989890417270097e-06, + "loss": 0.7813, + "step": 1600 + }, + { + "epoch": 0.08631658399827474, + "grad_norm": 0.764076828956604, + "learning_rate": 9.989876937885108e-06, + "loss": 0.7953, + "step": 1601 + }, + { + "epoch": 0.08637049816691826, + "grad_norm": 1.0143790245056152, + "learning_rate": 9.989863449528994e-06, + "loss": 0.8854, + "step": 1602 + }, + { + "epoch": 0.08642441233556178, + "grad_norm": 0.8605815172195435, + "learning_rate": 9.989849952201779e-06, + "loss": 0.9289, + "step": 1603 + }, + { + "epoch": 0.08647832650420531, + "grad_norm": 0.8897641897201538, + "learning_rate": 9.989836445903487e-06, + "loss": 0.8659, + "step": 1604 + }, + { + "epoch": 0.08653224067284883, + "grad_norm": 0.8893518447875977, + "learning_rate": 9.989822930634141e-06, + "loss": 0.8724, + "step": 1605 + }, + { + "epoch": 0.08658615484149235, + "grad_norm": 0.8152129054069519, + "learning_rate": 9.989809406393767e-06, + "loss": 0.8321, + "step": 1606 + }, + { + "epoch": 0.08664006901013586, + "grad_norm": 0.8394732475280762, + "learning_rate": 9.98979587318239e-06, + "loss": 0.8074, + "step": 1607 + }, + { + "epoch": 0.08669398317877938, + "grad_norm": 0.8038346767425537, + "learning_rate": 9.989782331000031e-06, + "loss": 0.8132, + "step": 1608 + }, + { + "epoch": 0.0867478973474229, + "grad_norm": 0.8574134111404419, + "learning_rate": 9.989768779846717e-06, + "loss": 0.8191, + "step": 1609 + }, + { + "epoch": 0.08680181151606642, + "grad_norm": 1.0049889087677002, + "learning_rate": 9.989755219722472e-06, + "loss": 0.8771, + "step": 1610 + }, + { + "epoch": 0.08685572568470994, + "grad_norm": 0.9765112996101379, + "learning_rate": 
9.989741650627319e-06, + "loss": 0.839, + "step": 1611 + }, + { + "epoch": 0.08690963985335347, + "grad_norm": 0.9430082440376282, + "learning_rate": 9.989728072561284e-06, + "loss": 1.0316, + "step": 1612 + }, + { + "epoch": 0.08696355402199699, + "grad_norm": 0.841590404510498, + "learning_rate": 9.989714485524391e-06, + "loss": 0.8727, + "step": 1613 + }, + { + "epoch": 0.0870174681906405, + "grad_norm": 0.9475975632667542, + "learning_rate": 9.989700889516664e-06, + "loss": 0.8131, + "step": 1614 + }, + { + "epoch": 0.08707138235928402, + "grad_norm": 0.8059530258178711, + "learning_rate": 9.98968728453813e-06, + "loss": 0.8297, + "step": 1615 + }, + { + "epoch": 0.08712529652792754, + "grad_norm": 0.8513601422309875, + "learning_rate": 9.989673670588808e-06, + "loss": 0.8016, + "step": 1616 + }, + { + "epoch": 0.08717921069657106, + "grad_norm": 0.8434658646583557, + "learning_rate": 9.989660047668728e-06, + "loss": 0.866, + "step": 1617 + }, + { + "epoch": 0.08723312486521458, + "grad_norm": 0.9081484079360962, + "learning_rate": 9.989646415777912e-06, + "loss": 0.816, + "step": 1618 + }, + { + "epoch": 0.0872870390338581, + "grad_norm": 0.7941877841949463, + "learning_rate": 9.989632774916385e-06, + "loss": 0.7191, + "step": 1619 + }, + { + "epoch": 0.08734095320250161, + "grad_norm": 0.8800172209739685, + "learning_rate": 9.98961912508417e-06, + "loss": 0.8135, + "step": 1620 + }, + { + "epoch": 0.08739486737114514, + "grad_norm": 0.7940575480461121, + "learning_rate": 9.989605466281292e-06, + "loss": 0.8124, + "step": 1621 + }, + { + "epoch": 0.08744878153978866, + "grad_norm": 0.9570618271827698, + "learning_rate": 9.989591798507779e-06, + "loss": 0.9043, + "step": 1622 + }, + { + "epoch": 0.08750269570843218, + "grad_norm": 0.8635395169258118, + "learning_rate": 9.98957812176365e-06, + "loss": 0.835, + "step": 1623 + }, + { + "epoch": 0.0875566098770757, + "grad_norm": 0.8289955258369446, + "learning_rate": 9.989564436048932e-06, + "loss": 0.8265, + 
"step": 1624 + }, + { + "epoch": 0.08761052404571922, + "grad_norm": 0.9519028663635254, + "learning_rate": 9.989550741363654e-06, + "loss": 0.8127, + "step": 1625 + }, + { + "epoch": 0.08766443821436273, + "grad_norm": 0.9611422419548035, + "learning_rate": 9.989537037707834e-06, + "loss": 0.8422, + "step": 1626 + }, + { + "epoch": 0.08771835238300625, + "grad_norm": 0.8824746608734131, + "learning_rate": 9.9895233250815e-06, + "loss": 0.8669, + "step": 1627 + }, + { + "epoch": 0.08777226655164977, + "grad_norm": 0.8402838706970215, + "learning_rate": 9.989509603484676e-06, + "loss": 0.8072, + "step": 1628 + }, + { + "epoch": 0.08782618072029329, + "grad_norm": 0.7537099719047546, + "learning_rate": 9.989495872917386e-06, + "loss": 0.7127, + "step": 1629 + }, + { + "epoch": 0.08788009488893682, + "grad_norm": 0.78285151720047, + "learning_rate": 9.989482133379656e-06, + "loss": 0.819, + "step": 1630 + }, + { + "epoch": 0.08793400905758034, + "grad_norm": 0.9339445233345032, + "learning_rate": 9.98946838487151e-06, + "loss": 0.8694, + "step": 1631 + }, + { + "epoch": 0.08798792322622385, + "grad_norm": 0.8022040128707886, + "learning_rate": 9.989454627392973e-06, + "loss": 0.7601, + "step": 1632 + }, + { + "epoch": 0.08804183739486737, + "grad_norm": 0.8593827486038208, + "learning_rate": 9.98944086094407e-06, + "loss": 0.8536, + "step": 1633 + }, + { + "epoch": 0.08809575156351089, + "grad_norm": 0.8415039777755737, + "learning_rate": 9.989427085524824e-06, + "loss": 0.9027, + "step": 1634 + }, + { + "epoch": 0.08814966573215441, + "grad_norm": 0.9551103711128235, + "learning_rate": 9.989413301135263e-06, + "loss": 0.8063, + "step": 1635 + }, + { + "epoch": 0.08820357990079793, + "grad_norm": 0.8554351925849915, + "learning_rate": 9.989399507775407e-06, + "loss": 0.7694, + "step": 1636 + }, + { + "epoch": 0.08825749406944144, + "grad_norm": 0.8688547015190125, + "learning_rate": 9.989385705445285e-06, + "loss": 0.8862, + "step": 1637 + }, + { + "epoch": 
0.08831140823808496, + "grad_norm": 0.816558837890625, + "learning_rate": 9.98937189414492e-06, + "loss": 0.7302, + "step": 1638 + }, + { + "epoch": 0.0883653224067285, + "grad_norm": 0.8164445757865906, + "learning_rate": 9.989358073874337e-06, + "loss": 0.8724, + "step": 1639 + }, + { + "epoch": 0.08841923657537201, + "grad_norm": 0.8909460306167603, + "learning_rate": 9.989344244633564e-06, + "loss": 0.7618, + "step": 1640 + }, + { + "epoch": 0.08847315074401553, + "grad_norm": 1.0117470026016235, + "learning_rate": 9.98933040642262e-06, + "loss": 0.8191, + "step": 1641 + }, + { + "epoch": 0.08852706491265905, + "grad_norm": 0.8317937850952148, + "learning_rate": 9.989316559241533e-06, + "loss": 0.8339, + "step": 1642 + }, + { + "epoch": 0.08858097908130257, + "grad_norm": 0.7955135107040405, + "learning_rate": 9.98930270309033e-06, + "loss": 0.7799, + "step": 1643 + }, + { + "epoch": 0.08863489324994608, + "grad_norm": 0.996306300163269, + "learning_rate": 9.98928883796903e-06, + "loss": 0.8547, + "step": 1644 + }, + { + "epoch": 0.0886888074185896, + "grad_norm": 0.9679511189460754, + "learning_rate": 9.989274963877664e-06, + "loss": 1.0831, + "step": 1645 + }, + { + "epoch": 0.08874272158723312, + "grad_norm": 0.8471615314483643, + "learning_rate": 9.989261080816253e-06, + "loss": 0.7765, + "step": 1646 + }, + { + "epoch": 0.08879663575587664, + "grad_norm": 0.8662555813789368, + "learning_rate": 9.989247188784826e-06, + "loss": 0.8894, + "step": 1647 + }, + { + "epoch": 0.08885054992452017, + "grad_norm": 0.9549373388290405, + "learning_rate": 9.989233287783402e-06, + "loss": 0.8341, + "step": 1648 + }, + { + "epoch": 0.08890446409316369, + "grad_norm": 0.8179014325141907, + "learning_rate": 9.989219377812014e-06, + "loss": 0.8653, + "step": 1649 + }, + { + "epoch": 0.0889583782618072, + "grad_norm": 0.9237802624702454, + "learning_rate": 9.989205458870678e-06, + "loss": 0.8206, + "step": 1650 + }, + { + "epoch": 0.08901229243045072, + "grad_norm": 
0.940217137336731, + "learning_rate": 9.989191530959426e-06, + "loss": 0.8695, + "step": 1651 + }, + { + "epoch": 0.08906620659909424, + "grad_norm": 0.9200409054756165, + "learning_rate": 9.98917759407828e-06, + "loss": 0.7984, + "step": 1652 + }, + { + "epoch": 0.08912012076773776, + "grad_norm": 0.9270562529563904, + "learning_rate": 9.989163648227265e-06, + "loss": 0.8265, + "step": 1653 + }, + { + "epoch": 0.08917403493638128, + "grad_norm": 0.9945223331451416, + "learning_rate": 9.989149693406408e-06, + "loss": 0.84, + "step": 1654 + }, + { + "epoch": 0.0892279491050248, + "grad_norm": 0.826195478439331, + "learning_rate": 9.98913572961573e-06, + "loss": 0.7862, + "step": 1655 + }, + { + "epoch": 0.08928186327366831, + "grad_norm": 0.9132022857666016, + "learning_rate": 9.989121756855263e-06, + "loss": 0.826, + "step": 1656 + }, + { + "epoch": 0.08933577744231185, + "grad_norm": 0.8559401631355286, + "learning_rate": 9.989107775125023e-06, + "loss": 0.8007, + "step": 1657 + }, + { + "epoch": 0.08938969161095536, + "grad_norm": 0.8000867366790771, + "learning_rate": 9.989093784425044e-06, + "loss": 0.7547, + "step": 1658 + }, + { + "epoch": 0.08944360577959888, + "grad_norm": 0.7761433720588684, + "learning_rate": 9.989079784755346e-06, + "loss": 0.8083, + "step": 1659 + }, + { + "epoch": 0.0894975199482424, + "grad_norm": 0.8072230815887451, + "learning_rate": 9.989065776115956e-06, + "loss": 0.892, + "step": 1660 + }, + { + "epoch": 0.08955143411688592, + "grad_norm": 0.9021360874176025, + "learning_rate": 9.989051758506898e-06, + "loss": 0.8715, + "step": 1661 + }, + { + "epoch": 0.08960534828552943, + "grad_norm": 0.7585147023200989, + "learning_rate": 9.989037731928197e-06, + "loss": 0.7115, + "step": 1662 + }, + { + "epoch": 0.08965926245417295, + "grad_norm": 0.9388399124145508, + "learning_rate": 9.98902369637988e-06, + "loss": 0.8976, + "step": 1663 + }, + { + "epoch": 0.08971317662281647, + "grad_norm": 0.8454418778419495, + "learning_rate": 
9.989009651861972e-06, + "loss": 0.8063, + "step": 1664 + }, + { + "epoch": 0.08976709079146, + "grad_norm": 0.82308030128479, + "learning_rate": 9.988995598374496e-06, + "loss": 0.8044, + "step": 1665 + }, + { + "epoch": 0.08982100496010352, + "grad_norm": 1.006800651550293, + "learning_rate": 9.98898153591748e-06, + "loss": 0.8609, + "step": 1666 + }, + { + "epoch": 0.08987491912874704, + "grad_norm": 0.8325724601745605, + "learning_rate": 9.988967464490947e-06, + "loss": 0.8295, + "step": 1667 + }, + { + "epoch": 0.08992883329739056, + "grad_norm": 0.7575547695159912, + "learning_rate": 9.988953384094923e-06, + "loss": 0.8252, + "step": 1668 + }, + { + "epoch": 0.08998274746603407, + "grad_norm": 0.869877278804779, + "learning_rate": 9.988939294729436e-06, + "loss": 0.8304, + "step": 1669 + }, + { + "epoch": 0.09003666163467759, + "grad_norm": 0.7840037941932678, + "learning_rate": 9.988925196394508e-06, + "loss": 0.7742, + "step": 1670 + }, + { + "epoch": 0.09009057580332111, + "grad_norm": 0.8044409155845642, + "learning_rate": 9.988911089090163e-06, + "loss": 0.8371, + "step": 1671 + }, + { + "epoch": 0.09014448997196463, + "grad_norm": 0.8635613322257996, + "learning_rate": 9.988896972816431e-06, + "loss": 0.7693, + "step": 1672 + }, + { + "epoch": 0.09019840414060815, + "grad_norm": 0.7780656814575195, + "learning_rate": 9.988882847573335e-06, + "loss": 0.841, + "step": 1673 + }, + { + "epoch": 0.09025231830925168, + "grad_norm": 0.8938048481941223, + "learning_rate": 9.9888687133609e-06, + "loss": 0.8149, + "step": 1674 + }, + { + "epoch": 0.0903062324778952, + "grad_norm": 0.8432002663612366, + "learning_rate": 9.988854570179152e-06, + "loss": 0.853, + "step": 1675 + }, + { + "epoch": 0.09036014664653871, + "grad_norm": 0.8222450613975525, + "learning_rate": 9.988840418028118e-06, + "loss": 0.897, + "step": 1676 + }, + { + "epoch": 0.09041406081518223, + "grad_norm": 0.8370371460914612, + "learning_rate": 9.98882625690782e-06, + "loss": 0.8288, + "step": 
1677 + }, + { + "epoch": 0.09046797498382575, + "grad_norm": 0.8510713577270508, + "learning_rate": 9.988812086818285e-06, + "loss": 0.7637, + "step": 1678 + }, + { + "epoch": 0.09052188915246927, + "grad_norm": 0.8271141648292542, + "learning_rate": 9.98879790775954e-06, + "loss": 0.853, + "step": 1679 + }, + { + "epoch": 0.09057580332111279, + "grad_norm": 1.0627025365829468, + "learning_rate": 9.988783719731607e-06, + "loss": 0.7569, + "step": 1680 + }, + { + "epoch": 0.0906297174897563, + "grad_norm": 0.880283534526825, + "learning_rate": 9.988769522734517e-06, + "loss": 0.8362, + "step": 1681 + }, + { + "epoch": 0.09068363165839982, + "grad_norm": 0.8721734881401062, + "learning_rate": 9.988755316768288e-06, + "loss": 0.8585, + "step": 1682 + }, + { + "epoch": 0.09073754582704335, + "grad_norm": 0.8830682039260864, + "learning_rate": 9.988741101832952e-06, + "loss": 0.8853, + "step": 1683 + }, + { + "epoch": 0.09079145999568687, + "grad_norm": 0.7676220536231995, + "learning_rate": 9.988726877928534e-06, + "loss": 0.7832, + "step": 1684 + }, + { + "epoch": 0.09084537416433039, + "grad_norm": 0.866149365901947, + "learning_rate": 9.988712645055055e-06, + "loss": 0.8534, + "step": 1685 + }, + { + "epoch": 0.09089928833297391, + "grad_norm": 0.8467028141021729, + "learning_rate": 9.988698403212546e-06, + "loss": 0.8637, + "step": 1686 + }, + { + "epoch": 0.09095320250161743, + "grad_norm": 0.913436770439148, + "learning_rate": 9.988684152401028e-06, + "loss": 0.855, + "step": 1687 + }, + { + "epoch": 0.09100711667026094, + "grad_norm": 0.8307977914810181, + "learning_rate": 9.98866989262053e-06, + "loss": 0.8538, + "step": 1688 + }, + { + "epoch": 0.09106103083890446, + "grad_norm": 1.13442862033844, + "learning_rate": 9.988655623871075e-06, + "loss": 0.8129, + "step": 1689 + }, + { + "epoch": 0.09111494500754798, + "grad_norm": 0.8950080871582031, + "learning_rate": 9.988641346152692e-06, + "loss": 0.8674, + "step": 1690 + }, + { + "epoch": 0.0911688591761915, + 
"grad_norm": 0.9107043147087097, + "learning_rate": 9.988627059465403e-06, + "loss": 0.9507, + "step": 1691 + }, + { + "epoch": 0.09122277334483503, + "grad_norm": 0.8210874795913696, + "learning_rate": 9.988612763809237e-06, + "loss": 0.8913, + "step": 1692 + }, + { + "epoch": 0.09127668751347855, + "grad_norm": 1.0306476354599, + "learning_rate": 9.988598459184217e-06, + "loss": 0.8589, + "step": 1693 + }, + { + "epoch": 0.09133060168212206, + "grad_norm": 0.7582615613937378, + "learning_rate": 9.98858414559037e-06, + "loss": 0.7482, + "step": 1694 + }, + { + "epoch": 0.09138451585076558, + "grad_norm": 0.8572216629981995, + "learning_rate": 9.98856982302772e-06, + "loss": 0.822, + "step": 1695 + }, + { + "epoch": 0.0914384300194091, + "grad_norm": 0.9358139038085938, + "learning_rate": 9.988555491496297e-06, + "loss": 0.8298, + "step": 1696 + }, + { + "epoch": 0.09149234418805262, + "grad_norm": 0.8705672025680542, + "learning_rate": 9.988541150996123e-06, + "loss": 0.8818, + "step": 1697 + }, + { + "epoch": 0.09154625835669614, + "grad_norm": 0.9081273674964905, + "learning_rate": 9.988526801527224e-06, + "loss": 0.8994, + "step": 1698 + }, + { + "epoch": 0.09160017252533965, + "grad_norm": 0.7358905076980591, + "learning_rate": 9.988512443089627e-06, + "loss": 0.7752, + "step": 1699 + }, + { + "epoch": 0.09165408669398317, + "grad_norm": 0.8570963740348816, + "learning_rate": 9.988498075683357e-06, + "loss": 0.908, + "step": 1700 + }, + { + "epoch": 0.0917080008626267, + "grad_norm": 0.8998208045959473, + "learning_rate": 9.988483699308442e-06, + "loss": 0.8561, + "step": 1701 + }, + { + "epoch": 0.09176191503127022, + "grad_norm": 0.7481779456138611, + "learning_rate": 9.988469313964903e-06, + "loss": 0.7184, + "step": 1702 + }, + { + "epoch": 0.09181582919991374, + "grad_norm": 1.052809238433838, + "learning_rate": 9.988454919652772e-06, + "loss": 0.8579, + "step": 1703 + }, + { + "epoch": 0.09186974336855726, + "grad_norm": 0.8492130637168884, + 
"learning_rate": 9.988440516372071e-06, + "loss": 0.8796, + "step": 1704 + }, + { + "epoch": 0.09192365753720078, + "grad_norm": 0.884483277797699, + "learning_rate": 9.988426104122826e-06, + "loss": 0.8781, + "step": 1705 + }, + { + "epoch": 0.0919775717058443, + "grad_norm": 0.8844857811927795, + "learning_rate": 9.988411682905065e-06, + "loss": 0.8981, + "step": 1706 + }, + { + "epoch": 0.09203148587448781, + "grad_norm": 0.906216025352478, + "learning_rate": 9.988397252718811e-06, + "loss": 0.8741, + "step": 1707 + }, + { + "epoch": 0.09208540004313133, + "grad_norm": 0.8565787076950073, + "learning_rate": 9.988382813564092e-06, + "loss": 0.7358, + "step": 1708 + }, + { + "epoch": 0.09213931421177485, + "grad_norm": 0.8036391139030457, + "learning_rate": 9.988368365440935e-06, + "loss": 0.7966, + "step": 1709 + }, + { + "epoch": 0.09219322838041838, + "grad_norm": 1.1708556413650513, + "learning_rate": 9.988353908349361e-06, + "loss": 0.8385, + "step": 1710 + }, + { + "epoch": 0.0922471425490619, + "grad_norm": 0.8536746501922607, + "learning_rate": 9.988339442289403e-06, + "loss": 0.7387, + "step": 1711 + }, + { + "epoch": 0.09230105671770542, + "grad_norm": 0.8376518487930298, + "learning_rate": 9.988324967261083e-06, + "loss": 0.8537, + "step": 1712 + }, + { + "epoch": 0.09235497088634893, + "grad_norm": 0.8793227672576904, + "learning_rate": 9.988310483264426e-06, + "loss": 0.8028, + "step": 1713 + }, + { + "epoch": 0.09240888505499245, + "grad_norm": 0.8186830282211304, + "learning_rate": 9.98829599029946e-06, + "loss": 0.8478, + "step": 1714 + }, + { + "epoch": 0.09246279922363597, + "grad_norm": 0.8845428824424744, + "learning_rate": 9.98828148836621e-06, + "loss": 0.8524, + "step": 1715 + }, + { + "epoch": 0.09251671339227949, + "grad_norm": 1.0494492053985596, + "learning_rate": 9.988266977464704e-06, + "loss": 0.8542, + "step": 1716 + }, + { + "epoch": 0.092570627560923, + "grad_norm": 0.8876493573188782, + "learning_rate": 9.988252457594966e-06, + 
"loss": 0.8989, + "step": 1717 + }, + { + "epoch": 0.09262454172956654, + "grad_norm": 0.8787088394165039, + "learning_rate": 9.988237928757024e-06, + "loss": 0.8214, + "step": 1718 + }, + { + "epoch": 0.09267845589821005, + "grad_norm": 1.069684624671936, + "learning_rate": 9.988223390950901e-06, + "loss": 0.9714, + "step": 1719 + }, + { + "epoch": 0.09273237006685357, + "grad_norm": 0.7957501411437988, + "learning_rate": 9.988208844176626e-06, + "loss": 0.7562, + "step": 1720 + }, + { + "epoch": 0.09278628423549709, + "grad_norm": 0.8354908227920532, + "learning_rate": 9.988194288434225e-06, + "loss": 0.7494, + "step": 1721 + }, + { + "epoch": 0.09284019840414061, + "grad_norm": 0.8205936551094055, + "learning_rate": 9.988179723723722e-06, + "loss": 0.7727, + "step": 1722 + }, + { + "epoch": 0.09289411257278413, + "grad_norm": 0.8364951014518738, + "learning_rate": 9.988165150045146e-06, + "loss": 0.861, + "step": 1723 + }, + { + "epoch": 0.09294802674142764, + "grad_norm": 0.8664119243621826, + "learning_rate": 9.98815056739852e-06, + "loss": 0.8512, + "step": 1724 + }, + { + "epoch": 0.09300194091007116, + "grad_norm": 0.9565482139587402, + "learning_rate": 9.988135975783874e-06, + "loss": 0.8606, + "step": 1725 + }, + { + "epoch": 0.09305585507871468, + "grad_norm": 0.8696085214614868, + "learning_rate": 9.988121375201232e-06, + "loss": 0.8614, + "step": 1726 + }, + { + "epoch": 0.09310976924735821, + "grad_norm": 0.8623467683792114, + "learning_rate": 9.98810676565062e-06, + "loss": 0.8547, + "step": 1727 + }, + { + "epoch": 0.09316368341600173, + "grad_norm": 0.8284831047058105, + "learning_rate": 9.988092147132064e-06, + "loss": 0.8376, + "step": 1728 + }, + { + "epoch": 0.09321759758464525, + "grad_norm": 0.7768245339393616, + "learning_rate": 9.988077519645591e-06, + "loss": 0.7472, + "step": 1729 + }, + { + "epoch": 0.09327151175328877, + "grad_norm": 1.221225619316101, + "learning_rate": 9.988062883191228e-06, + "loss": 0.9052, + "step": 1730 + }, + { + 
"epoch": 0.09332542592193228, + "grad_norm": 1.0027954578399658, + "learning_rate": 9.988048237769002e-06, + "loss": 0.9411, + "step": 1731 + }, + { + "epoch": 0.0933793400905758, + "grad_norm": 0.8029824495315552, + "learning_rate": 9.988033583378937e-06, + "loss": 0.8141, + "step": 1732 + }, + { + "epoch": 0.09343325425921932, + "grad_norm": 0.8081389665603638, + "learning_rate": 9.98801892002106e-06, + "loss": 0.7977, + "step": 1733 + }, + { + "epoch": 0.09348716842786284, + "grad_norm": 0.887438952922821, + "learning_rate": 9.988004247695398e-06, + "loss": 0.8574, + "step": 1734 + }, + { + "epoch": 0.09354108259650636, + "grad_norm": 0.887238085269928, + "learning_rate": 9.987989566401977e-06, + "loss": 0.9041, + "step": 1735 + }, + { + "epoch": 0.09359499676514989, + "grad_norm": 0.9135997891426086, + "learning_rate": 9.987974876140822e-06, + "loss": 0.738, + "step": 1736 + }, + { + "epoch": 0.0936489109337934, + "grad_norm": 0.7749861478805542, + "learning_rate": 9.987960176911964e-06, + "loss": 0.773, + "step": 1737 + }, + { + "epoch": 0.09370282510243692, + "grad_norm": 0.7850096225738525, + "learning_rate": 9.987945468715425e-06, + "loss": 0.7924, + "step": 1738 + }, + { + "epoch": 0.09375673927108044, + "grad_norm": 0.8044145107269287, + "learning_rate": 9.987930751551231e-06, + "loss": 0.8196, + "step": 1739 + }, + { + "epoch": 0.09381065343972396, + "grad_norm": 0.8781464695930481, + "learning_rate": 9.987916025419413e-06, + "loss": 0.9337, + "step": 1740 + }, + { + "epoch": 0.09386456760836748, + "grad_norm": 1.0839952230453491, + "learning_rate": 9.987901290319993e-06, + "loss": 0.8092, + "step": 1741 + }, + { + "epoch": 0.093918481777011, + "grad_norm": 0.7910736203193665, + "learning_rate": 9.987886546253e-06, + "loss": 0.8775, + "step": 1742 + }, + { + "epoch": 0.09397239594565451, + "grad_norm": 0.887287974357605, + "learning_rate": 9.98787179321846e-06, + "loss": 0.8271, + "step": 1743 + }, + { + "epoch": 0.09402631011429803, + "grad_norm": 
1.1318427324295044, + "learning_rate": 9.987857031216397e-06, + "loss": 0.8328, + "step": 1744 + }, + { + "epoch": 0.09408022428294156, + "grad_norm": 0.8660401105880737, + "learning_rate": 9.987842260246842e-06, + "loss": 0.8647, + "step": 1745 + }, + { + "epoch": 0.09413413845158508, + "grad_norm": 0.9396790266036987, + "learning_rate": 9.98782748030982e-06, + "loss": 0.9373, + "step": 1746 + }, + { + "epoch": 0.0941880526202286, + "grad_norm": 0.8715323209762573, + "learning_rate": 9.987812691405353e-06, + "loss": 0.8621, + "step": 1747 + }, + { + "epoch": 0.09424196678887212, + "grad_norm": 0.7882347106933594, + "learning_rate": 9.987797893533475e-06, + "loss": 0.7283, + "step": 1748 + }, + { + "epoch": 0.09429588095751563, + "grad_norm": 0.9641733765602112, + "learning_rate": 9.987783086694208e-06, + "loss": 0.8038, + "step": 1749 + }, + { + "epoch": 0.09434979512615915, + "grad_norm": 0.8808518648147583, + "learning_rate": 9.98776827088758e-06, + "loss": 0.8072, + "step": 1750 + }, + { + "epoch": 0.09440370929480267, + "grad_norm": 0.7720713019371033, + "learning_rate": 9.987753446113618e-06, + "loss": 0.7786, + "step": 1751 + }, + { + "epoch": 0.09445762346344619, + "grad_norm": 1.0507936477661133, + "learning_rate": 9.987738612372346e-06, + "loss": 0.9302, + "step": 1752 + }, + { + "epoch": 0.0945115376320897, + "grad_norm": 0.7705017328262329, + "learning_rate": 9.987723769663795e-06, + "loss": 0.7366, + "step": 1753 + }, + { + "epoch": 0.09456545180073324, + "grad_norm": 0.82464200258255, + "learning_rate": 9.987708917987989e-06, + "loss": 0.8063, + "step": 1754 + }, + { + "epoch": 0.09461936596937676, + "grad_norm": 0.9387272000312805, + "learning_rate": 9.987694057344953e-06, + "loss": 0.8108, + "step": 1755 + }, + { + "epoch": 0.09467328013802027, + "grad_norm": 0.9161933064460754, + "learning_rate": 9.987679187734717e-06, + "loss": 0.8331, + "step": 1756 + }, + { + "epoch": 0.09472719430666379, + "grad_norm": 0.9379769563674927, + "learning_rate": 
9.987664309157306e-06, + "loss": 0.9064, + "step": 1757 + }, + { + "epoch": 0.09478110847530731, + "grad_norm": 0.9597976803779602, + "learning_rate": 9.987649421612748e-06, + "loss": 0.7785, + "step": 1758 + }, + { + "epoch": 0.09483502264395083, + "grad_norm": 0.8689720630645752, + "learning_rate": 9.98763452510107e-06, + "loss": 0.7828, + "step": 1759 + }, + { + "epoch": 0.09488893681259435, + "grad_norm": 0.9207726716995239, + "learning_rate": 9.987619619622296e-06, + "loss": 0.7853, + "step": 1760 + }, + { + "epoch": 0.09494285098123786, + "grad_norm": 0.8130320310592651, + "learning_rate": 9.987604705176455e-06, + "loss": 0.858, + "step": 1761 + }, + { + "epoch": 0.09499676514988138, + "grad_norm": 0.9004638195037842, + "learning_rate": 9.987589781763574e-06, + "loss": 0.8148, + "step": 1762 + }, + { + "epoch": 0.09505067931852491, + "grad_norm": 0.8554181456565857, + "learning_rate": 9.987574849383678e-06, + "loss": 0.8103, + "step": 1763 + }, + { + "epoch": 0.09510459348716843, + "grad_norm": 0.9148527979850769, + "learning_rate": 9.987559908036797e-06, + "loss": 0.9467, + "step": 1764 + }, + { + "epoch": 0.09515850765581195, + "grad_norm": 0.890083909034729, + "learning_rate": 9.987544957722956e-06, + "loss": 0.8338, + "step": 1765 + }, + { + "epoch": 0.09521242182445547, + "grad_norm": 0.8118012547492981, + "learning_rate": 9.98752999844218e-06, + "loss": 0.8355, + "step": 1766 + }, + { + "epoch": 0.09526633599309899, + "grad_norm": 0.8115151524543762, + "learning_rate": 9.987515030194498e-06, + "loss": 0.9172, + "step": 1767 + }, + { + "epoch": 0.0953202501617425, + "grad_norm": 0.8750082850456238, + "learning_rate": 9.987500052979938e-06, + "loss": 0.8301, + "step": 1768 + }, + { + "epoch": 0.09537416433038602, + "grad_norm": 0.9008756875991821, + "learning_rate": 9.987485066798525e-06, + "loss": 0.8642, + "step": 1769 + }, + { + "epoch": 0.09542807849902954, + "grad_norm": 0.8335922956466675, + "learning_rate": 9.987470071650287e-06, + "loss": 0.8466, 
+ "step": 1770 + }, + { + "epoch": 0.09548199266767307, + "grad_norm": 0.8604272603988647, + "learning_rate": 9.987455067535249e-06, + "loss": 0.8801, + "step": 1771 + }, + { + "epoch": 0.09553590683631659, + "grad_norm": 0.889854371547699, + "learning_rate": 9.98744005445344e-06, + "loss": 0.8804, + "step": 1772 + }, + { + "epoch": 0.09558982100496011, + "grad_norm": 0.8756876587867737, + "learning_rate": 9.987425032404887e-06, + "loss": 0.8367, + "step": 1773 + }, + { + "epoch": 0.09564373517360363, + "grad_norm": 0.9071298837661743, + "learning_rate": 9.987410001389616e-06, + "loss": 0.8875, + "step": 1774 + }, + { + "epoch": 0.09569764934224714, + "grad_norm": 0.8214284777641296, + "learning_rate": 9.987394961407654e-06, + "loss": 0.7859, + "step": 1775 + }, + { + "epoch": 0.09575156351089066, + "grad_norm": 0.940034806728363, + "learning_rate": 9.98737991245903e-06, + "loss": 0.8272, + "step": 1776 + }, + { + "epoch": 0.09580547767953418, + "grad_norm": 0.8156501054763794, + "learning_rate": 9.987364854543768e-06, + "loss": 0.7831, + "step": 1777 + }, + { + "epoch": 0.0958593918481777, + "grad_norm": 0.8450450301170349, + "learning_rate": 9.987349787661898e-06, + "loss": 0.7888, + "step": 1778 + }, + { + "epoch": 0.09591330601682121, + "grad_norm": 0.8143148422241211, + "learning_rate": 9.987334711813446e-06, + "loss": 0.7593, + "step": 1779 + }, + { + "epoch": 0.09596722018546475, + "grad_norm": 1.0489457845687866, + "learning_rate": 9.987319626998437e-06, + "loss": 0.8248, + "step": 1780 + }, + { + "epoch": 0.09602113435410826, + "grad_norm": 0.9584689140319824, + "learning_rate": 9.987304533216901e-06, + "loss": 0.9025, + "step": 1781 + }, + { + "epoch": 0.09607504852275178, + "grad_norm": 0.8366501331329346, + "learning_rate": 9.987289430468862e-06, + "loss": 0.7513, + "step": 1782 + }, + { + "epoch": 0.0961289626913953, + "grad_norm": 0.9896461963653564, + "learning_rate": 9.987274318754352e-06, + "loss": 0.8598, + "step": 1783 + }, + { + "epoch": 
0.09618287686003882, + "grad_norm": 1.1904568672180176, + "learning_rate": 9.987259198073396e-06, + "loss": 0.9143, + "step": 1784 + }, + { + "epoch": 0.09623679102868234, + "grad_norm": 0.8100086450576782, + "learning_rate": 9.987244068426019e-06, + "loss": 0.7733, + "step": 1785 + }, + { + "epoch": 0.09629070519732585, + "grad_norm": 0.7814387083053589, + "learning_rate": 9.987228929812249e-06, + "loss": 0.7735, + "step": 1786 + }, + { + "epoch": 0.09634461936596937, + "grad_norm": 0.8880924582481384, + "learning_rate": 9.987213782232115e-06, + "loss": 0.8377, + "step": 1787 + }, + { + "epoch": 0.09639853353461289, + "grad_norm": 0.8739203810691833, + "learning_rate": 9.987198625685643e-06, + "loss": 0.8851, + "step": 1788 + }, + { + "epoch": 0.09645244770325642, + "grad_norm": 0.8984062671661377, + "learning_rate": 9.987183460172861e-06, + "loss": 0.8773, + "step": 1789 + }, + { + "epoch": 0.09650636187189994, + "grad_norm": 1.2485296726226807, + "learning_rate": 9.987168285693795e-06, + "loss": 0.787, + "step": 1790 + }, + { + "epoch": 0.09656027604054346, + "grad_norm": 0.8414161205291748, + "learning_rate": 9.987153102248474e-06, + "loss": 0.7895, + "step": 1791 + }, + { + "epoch": 0.09661419020918698, + "grad_norm": 0.7895180583000183, + "learning_rate": 9.987137909836924e-06, + "loss": 0.7592, + "step": 1792 + }, + { + "epoch": 0.0966681043778305, + "grad_norm": 1.0752787590026855, + "learning_rate": 9.987122708459173e-06, + "loss": 0.8472, + "step": 1793 + }, + { + "epoch": 0.09672201854647401, + "grad_norm": 0.9069424271583557, + "learning_rate": 9.987107498115247e-06, + "loss": 0.8746, + "step": 1794 + }, + { + "epoch": 0.09677593271511753, + "grad_norm": 0.8566716909408569, + "learning_rate": 9.987092278805175e-06, + "loss": 0.7604, + "step": 1795 + }, + { + "epoch": 0.09682984688376105, + "grad_norm": 0.833852231502533, + "learning_rate": 9.987077050528983e-06, + "loss": 0.8645, + "step": 1796 + }, + { + "epoch": 0.09688376105240457, + "grad_norm": 
0.8439596891403198, + "learning_rate": 9.9870618132867e-06, + "loss": 0.7673, + "step": 1797 + }, + { + "epoch": 0.0969376752210481, + "grad_norm": 0.9743669629096985, + "learning_rate": 9.987046567078352e-06, + "loss": 0.7754, + "step": 1798 + }, + { + "epoch": 0.09699158938969162, + "grad_norm": 0.9291634559631348, + "learning_rate": 9.987031311903968e-06, + "loss": 0.8431, + "step": 1799 + }, + { + "epoch": 0.09704550355833513, + "grad_norm": 1.169450283050537, + "learning_rate": 9.987016047763571e-06, + "loss": 0.9321, + "step": 1800 + }, + { + "epoch": 0.09709941772697865, + "grad_norm": 0.7758163809776306, + "learning_rate": 9.987000774657195e-06, + "loss": 0.7832, + "step": 1801 + }, + { + "epoch": 0.09715333189562217, + "grad_norm": 0.9673672914505005, + "learning_rate": 9.986985492584863e-06, + "loss": 0.9822, + "step": 1802 + }, + { + "epoch": 0.09720724606426569, + "grad_norm": 1.1516417264938354, + "learning_rate": 9.986970201546605e-06, + "loss": 0.9956, + "step": 1803 + }, + { + "epoch": 0.0972611602329092, + "grad_norm": 0.9660587906837463, + "learning_rate": 9.986954901542445e-06, + "loss": 0.8248, + "step": 1804 + }, + { + "epoch": 0.09731507440155272, + "grad_norm": 0.9452739953994751, + "learning_rate": 9.986939592572413e-06, + "loss": 0.8805, + "step": 1805 + }, + { + "epoch": 0.09736898857019624, + "grad_norm": 0.9339364171028137, + "learning_rate": 9.986924274636538e-06, + "loss": 0.8819, + "step": 1806 + }, + { + "epoch": 0.09742290273883977, + "grad_norm": 0.9344542026519775, + "learning_rate": 9.986908947734844e-06, + "loss": 0.8531, + "step": 1807 + }, + { + "epoch": 0.09747681690748329, + "grad_norm": 0.8910528421401978, + "learning_rate": 9.986893611867362e-06, + "loss": 0.8949, + "step": 1808 + }, + { + "epoch": 0.09753073107612681, + "grad_norm": 0.8484895825386047, + "learning_rate": 9.986878267034115e-06, + "loss": 0.8028, + "step": 1809 + }, + { + "epoch": 0.09758464524477033, + "grad_norm": 1.0784810781478882, + "learning_rate": 
9.986862913235135e-06, + "loss": 0.9564, + "step": 1810 + }, + { + "epoch": 0.09763855941341384, + "grad_norm": 0.8350296020507812, + "learning_rate": 9.98684755047045e-06, + "loss": 0.8672, + "step": 1811 + }, + { + "epoch": 0.09769247358205736, + "grad_norm": 0.8558050990104675, + "learning_rate": 9.986832178740084e-06, + "loss": 0.8538, + "step": 1812 + }, + { + "epoch": 0.09774638775070088, + "grad_norm": 0.8633396029472351, + "learning_rate": 9.986816798044066e-06, + "loss": 0.8356, + "step": 1813 + }, + { + "epoch": 0.0978003019193444, + "grad_norm": 0.8256344199180603, + "learning_rate": 9.986801408382424e-06, + "loss": 0.7552, + "step": 1814 + }, + { + "epoch": 0.09785421608798792, + "grad_norm": 0.872844398021698, + "learning_rate": 9.986786009755186e-06, + "loss": 0.9153, + "step": 1815 + }, + { + "epoch": 0.09790813025663145, + "grad_norm": 0.842241108417511, + "learning_rate": 9.986770602162378e-06, + "loss": 0.7965, + "step": 1816 + }, + { + "epoch": 0.09796204442527497, + "grad_norm": 0.9673634171485901, + "learning_rate": 9.98675518560403e-06, + "loss": 0.8317, + "step": 1817 + }, + { + "epoch": 0.09801595859391848, + "grad_norm": 0.8744896650314331, + "learning_rate": 9.98673976008017e-06, + "loss": 0.7342, + "step": 1818 + }, + { + "epoch": 0.098069872762562, + "grad_norm": 0.7830422520637512, + "learning_rate": 9.986724325590825e-06, + "loss": 0.721, + "step": 1819 + }, + { + "epoch": 0.09812378693120552, + "grad_norm": 1.0335441827774048, + "learning_rate": 9.986708882136021e-06, + "loss": 0.8088, + "step": 1820 + }, + { + "epoch": 0.09817770109984904, + "grad_norm": 0.841342568397522, + "learning_rate": 9.986693429715785e-06, + "loss": 0.8847, + "step": 1821 + }, + { + "epoch": 0.09823161526849256, + "grad_norm": 0.9405834674835205, + "learning_rate": 9.98667796833015e-06, + "loss": 0.8878, + "step": 1822 + }, + { + "epoch": 0.09828552943713607, + "grad_norm": 0.8358225226402283, + "learning_rate": 9.986662497979138e-06, + "loss": 0.7377, + 
"step": 1823 + }, + { + "epoch": 0.0983394436057796, + "grad_norm": 0.8844004273414612, + "learning_rate": 9.98664701866278e-06, + "loss": 0.7236, + "step": 1824 + }, + { + "epoch": 0.09839335777442312, + "grad_norm": 0.8165417313575745, + "learning_rate": 9.986631530381105e-06, + "loss": 0.819, + "step": 1825 + }, + { + "epoch": 0.09844727194306664, + "grad_norm": 0.9569553732872009, + "learning_rate": 9.986616033134137e-06, + "loss": 0.9337, + "step": 1826 + }, + { + "epoch": 0.09850118611171016, + "grad_norm": 0.8311771750450134, + "learning_rate": 9.986600526921907e-06, + "loss": 0.8516, + "step": 1827 + }, + { + "epoch": 0.09855510028035368, + "grad_norm": 0.9444357752799988, + "learning_rate": 9.986585011744441e-06, + "loss": 0.805, + "step": 1828 + }, + { + "epoch": 0.0986090144489972, + "grad_norm": 1.0128875970840454, + "learning_rate": 9.986569487601769e-06, + "loss": 0.8514, + "step": 1829 + }, + { + "epoch": 0.09866292861764071, + "grad_norm": 0.8973994255065918, + "learning_rate": 9.986553954493917e-06, + "loss": 0.7938, + "step": 1830 + }, + { + "epoch": 0.09871684278628423, + "grad_norm": 0.8571779131889343, + "learning_rate": 9.986538412420912e-06, + "loss": 0.7506, + "step": 1831 + }, + { + "epoch": 0.09877075695492775, + "grad_norm": 0.9053436517715454, + "learning_rate": 9.986522861382785e-06, + "loss": 0.8551, + "step": 1832 + }, + { + "epoch": 0.09882467112357128, + "grad_norm": 0.9941746592521667, + "learning_rate": 9.986507301379562e-06, + "loss": 0.8828, + "step": 1833 + }, + { + "epoch": 0.0988785852922148, + "grad_norm": 0.9620066285133362, + "learning_rate": 9.986491732411272e-06, + "loss": 0.8982, + "step": 1834 + }, + { + "epoch": 0.09893249946085832, + "grad_norm": 0.9470074772834778, + "learning_rate": 9.986476154477941e-06, + "loss": 0.8295, + "step": 1835 + }, + { + "epoch": 0.09898641362950183, + "grad_norm": 0.9962137937545776, + "learning_rate": 9.986460567579599e-06, + "loss": 0.8714, + "step": 1836 + }, + { + "epoch": 
0.09904032779814535, + "grad_norm": 0.8492829203605652, + "learning_rate": 9.986444971716273e-06, + "loss": 0.8234, + "step": 1837 + }, + { + "epoch": 0.09909424196678887, + "grad_norm": 0.9463719725608826, + "learning_rate": 9.986429366887994e-06, + "loss": 0.7769, + "step": 1838 + }, + { + "epoch": 0.09914815613543239, + "grad_norm": 0.8588153123855591, + "learning_rate": 9.986413753094786e-06, + "loss": 0.8883, + "step": 1839 + }, + { + "epoch": 0.0992020703040759, + "grad_norm": 0.7692183256149292, + "learning_rate": 9.986398130336677e-06, + "loss": 0.7691, + "step": 1840 + }, + { + "epoch": 0.09925598447271942, + "grad_norm": 0.8377199172973633, + "learning_rate": 9.986382498613699e-06, + "loss": 0.789, + "step": 1841 + }, + { + "epoch": 0.09930989864136296, + "grad_norm": 0.9783869385719299, + "learning_rate": 9.986366857925876e-06, + "loss": 0.8517, + "step": 1842 + }, + { + "epoch": 0.09936381281000647, + "grad_norm": 0.8233169913291931, + "learning_rate": 9.986351208273239e-06, + "loss": 0.8701, + "step": 1843 + }, + { + "epoch": 0.09941772697864999, + "grad_norm": 0.9393780827522278, + "learning_rate": 9.986335549655814e-06, + "loss": 0.8837, + "step": 1844 + }, + { + "epoch": 0.09947164114729351, + "grad_norm": 0.8517693877220154, + "learning_rate": 9.986319882073631e-06, + "loss": 0.9043, + "step": 1845 + }, + { + "epoch": 0.09952555531593703, + "grad_norm": 0.8296724557876587, + "learning_rate": 9.986304205526718e-06, + "loss": 0.7406, + "step": 1846 + }, + { + "epoch": 0.09957946948458055, + "grad_norm": 0.8372161388397217, + "learning_rate": 9.986288520015102e-06, + "loss": 0.7763, + "step": 1847 + }, + { + "epoch": 0.09963338365322406, + "grad_norm": 0.8086470365524292, + "learning_rate": 9.986272825538812e-06, + "loss": 0.8786, + "step": 1848 + }, + { + "epoch": 0.09968729782186758, + "grad_norm": 0.8562842011451721, + "learning_rate": 9.986257122097875e-06, + "loss": 0.8391, + "step": 1849 + }, + { + "epoch": 0.0997412119905111, + "grad_norm": 
0.9052720665931702, + "learning_rate": 9.986241409692321e-06, + "loss": 0.948, + "step": 1850 + }, + { + "epoch": 0.09979512615915463, + "grad_norm": 0.8220609426498413, + "learning_rate": 9.986225688322178e-06, + "loss": 0.8039, + "step": 1851 + }, + { + "epoch": 0.09984904032779815, + "grad_norm": 0.8018030524253845, + "learning_rate": 9.98620995798747e-06, + "loss": 0.7748, + "step": 1852 + }, + { + "epoch": 0.09990295449644167, + "grad_norm": 0.8150879144668579, + "learning_rate": 9.986194218688235e-06, + "loss": 0.7304, + "step": 1853 + }, + { + "epoch": 0.09995686866508519, + "grad_norm": 0.8677535653114319, + "learning_rate": 9.98617847042449e-06, + "loss": 0.8756, + "step": 1854 + }, + { + "epoch": 0.1000107828337287, + "grad_norm": 0.8889294862747192, + "learning_rate": 9.986162713196272e-06, + "loss": 0.8926, + "step": 1855 + }, + { + "epoch": 0.10006469700237222, + "grad_norm": 0.7618375420570374, + "learning_rate": 9.986146947003603e-06, + "loss": 0.7317, + "step": 1856 + }, + { + "epoch": 0.10011861117101574, + "grad_norm": 0.8775038719177246, + "learning_rate": 9.986131171846518e-06, + "loss": 0.8318, + "step": 1857 + }, + { + "epoch": 0.10017252533965926, + "grad_norm": 0.9671807289123535, + "learning_rate": 9.986115387725039e-06, + "loss": 0.7412, + "step": 1858 + }, + { + "epoch": 0.10022643950830278, + "grad_norm": 0.8808870911598206, + "learning_rate": 9.986099594639197e-06, + "loss": 0.8213, + "step": 1859 + }, + { + "epoch": 0.10028035367694631, + "grad_norm": 0.8104208707809448, + "learning_rate": 9.986083792589021e-06, + "loss": 0.8108, + "step": 1860 + }, + { + "epoch": 0.10033426784558983, + "grad_norm": 0.839911937713623, + "learning_rate": 9.986067981574538e-06, + "loss": 0.8391, + "step": 1861 + }, + { + "epoch": 0.10038818201423334, + "grad_norm": 0.8402823805809021, + "learning_rate": 9.986052161595778e-06, + "loss": 0.7434, + "step": 1862 + }, + { + "epoch": 0.10044209618287686, + "grad_norm": 0.7591431140899658, + "learning_rate": 
9.986036332652768e-06, + "loss": 0.763, + "step": 1863 + }, + { + "epoch": 0.10049601035152038, + "grad_norm": 0.8613053560256958, + "learning_rate": 9.986020494745538e-06, + "loss": 0.8324, + "step": 1864 + }, + { + "epoch": 0.1005499245201639, + "grad_norm": 0.8467068076133728, + "learning_rate": 9.986004647874117e-06, + "loss": 0.882, + "step": 1865 + }, + { + "epoch": 0.10060383868880741, + "grad_norm": 1.0717257261276245, + "learning_rate": 9.98598879203853e-06, + "loss": 0.9305, + "step": 1866 + }, + { + "epoch": 0.10065775285745093, + "grad_norm": 0.8680382370948792, + "learning_rate": 9.985972927238808e-06, + "loss": 0.7521, + "step": 1867 + }, + { + "epoch": 0.10071166702609445, + "grad_norm": 0.8465799689292908, + "learning_rate": 9.98595705347498e-06, + "loss": 0.8562, + "step": 1868 + }, + { + "epoch": 0.10076558119473798, + "grad_norm": 0.938218355178833, + "learning_rate": 9.985941170747072e-06, + "loss": 0.7737, + "step": 1869 + }, + { + "epoch": 0.1008194953633815, + "grad_norm": 0.8189761638641357, + "learning_rate": 9.985925279055117e-06, + "loss": 0.8502, + "step": 1870 + }, + { + "epoch": 0.10087340953202502, + "grad_norm": 0.915703535079956, + "learning_rate": 9.985909378399138e-06, + "loss": 0.9576, + "step": 1871 + }, + { + "epoch": 0.10092732370066854, + "grad_norm": 0.7837297916412354, + "learning_rate": 9.985893468779168e-06, + "loss": 0.7091, + "step": 1872 + }, + { + "epoch": 0.10098123786931205, + "grad_norm": 0.7426577806472778, + "learning_rate": 9.985877550195234e-06, + "loss": 0.768, + "step": 1873 + }, + { + "epoch": 0.10103515203795557, + "grad_norm": 0.9437102675437927, + "learning_rate": 9.985861622647364e-06, + "loss": 0.8308, + "step": 1874 + }, + { + "epoch": 0.10108906620659909, + "grad_norm": 0.7381339073181152, + "learning_rate": 9.985845686135586e-06, + "loss": 0.7206, + "step": 1875 + }, + { + "epoch": 0.10114298037524261, + "grad_norm": 0.8478738069534302, + "learning_rate": 9.985829740659932e-06, + "loss": 0.7512, + 
"step": 1876 + }, + { + "epoch": 0.10119689454388614, + "grad_norm": 0.8331673741340637, + "learning_rate": 9.985813786220428e-06, + "loss": 0.8281, + "step": 1877 + }, + { + "epoch": 0.10125080871252966, + "grad_norm": 0.7703354954719543, + "learning_rate": 9.985797822817102e-06, + "loss": 0.7313, + "step": 1878 + }, + { + "epoch": 0.10130472288117318, + "grad_norm": 0.9182866811752319, + "learning_rate": 9.985781850449985e-06, + "loss": 0.8365, + "step": 1879 + }, + { + "epoch": 0.1013586370498167, + "grad_norm": 0.8285559415817261, + "learning_rate": 9.985765869119104e-06, + "loss": 0.8439, + "step": 1880 + }, + { + "epoch": 0.10141255121846021, + "grad_norm": 0.8400557041168213, + "learning_rate": 9.985749878824488e-06, + "loss": 0.8011, + "step": 1881 + }, + { + "epoch": 0.10146646538710373, + "grad_norm": 0.9225326776504517, + "learning_rate": 9.985733879566168e-06, + "loss": 0.8402, + "step": 1882 + }, + { + "epoch": 0.10152037955574725, + "grad_norm": 0.9194371700286865, + "learning_rate": 9.985717871344172e-06, + "loss": 0.8245, + "step": 1883 + }, + { + "epoch": 0.10157429372439077, + "grad_norm": 0.7443274259567261, + "learning_rate": 9.985701854158525e-06, + "loss": 0.7708, + "step": 1884 + }, + { + "epoch": 0.10162820789303428, + "grad_norm": 1.1139355897903442, + "learning_rate": 9.985685828009259e-06, + "loss": 0.8384, + "step": 1885 + }, + { + "epoch": 0.10168212206167782, + "grad_norm": 0.8835493326187134, + "learning_rate": 9.985669792896402e-06, + "loss": 0.8063, + "step": 1886 + }, + { + "epoch": 0.10173603623032133, + "grad_norm": 0.8012663125991821, + "learning_rate": 9.985653748819983e-06, + "loss": 0.8393, + "step": 1887 + }, + { + "epoch": 0.10178995039896485, + "grad_norm": 0.8092807531356812, + "learning_rate": 9.985637695780033e-06, + "loss": 0.7631, + "step": 1888 + }, + { + "epoch": 0.10184386456760837, + "grad_norm": 1.7357290983200073, + "learning_rate": 9.985621633776577e-06, + "loss": 0.8067, + "step": 1889 + }, + { + "epoch": 
0.10189777873625189, + "grad_norm": 0.8562015891075134, + "learning_rate": 9.985605562809646e-06, + "loss": 0.8543, + "step": 1890 + }, + { + "epoch": 0.1019516929048954, + "grad_norm": 0.9570844769477844, + "learning_rate": 9.98558948287927e-06, + "loss": 0.7778, + "step": 1891 + }, + { + "epoch": 0.10200560707353892, + "grad_norm": 0.748468279838562, + "learning_rate": 9.985573393985475e-06, + "loss": 0.6559, + "step": 1892 + }, + { + "epoch": 0.10205952124218244, + "grad_norm": 1.004490852355957, + "learning_rate": 9.98555729612829e-06, + "loss": 0.8453, + "step": 1893 + }, + { + "epoch": 0.10211343541082596, + "grad_norm": 0.9566166996955872, + "learning_rate": 9.985541189307749e-06, + "loss": 0.8984, + "step": 1894 + }, + { + "epoch": 0.10216734957946949, + "grad_norm": 0.8624017834663391, + "learning_rate": 9.985525073523874e-06, + "loss": 0.7442, + "step": 1895 + }, + { + "epoch": 0.10222126374811301, + "grad_norm": 1.0596553087234497, + "learning_rate": 9.9855089487767e-06, + "loss": 0.778, + "step": 1896 + }, + { + "epoch": 0.10227517791675653, + "grad_norm": 0.8003553152084351, + "learning_rate": 9.985492815066252e-06, + "loss": 0.7513, + "step": 1897 + }, + { + "epoch": 0.10232909208540004, + "grad_norm": 1.0067185163497925, + "learning_rate": 9.98547667239256e-06, + "loss": 0.8878, + "step": 1898 + }, + { + "epoch": 0.10238300625404356, + "grad_norm": 0.8398754596710205, + "learning_rate": 9.985460520755654e-06, + "loss": 0.8222, + "step": 1899 + }, + { + "epoch": 0.10243692042268708, + "grad_norm": 0.9688541293144226, + "learning_rate": 9.985444360155563e-06, + "loss": 0.8304, + "step": 1900 + }, + { + "epoch": 0.1024908345913306, + "grad_norm": 0.8848011493682861, + "learning_rate": 9.985428190592314e-06, + "loss": 0.7853, + "step": 1901 + }, + { + "epoch": 0.10254474875997412, + "grad_norm": 0.9240403771400452, + "learning_rate": 9.985412012065937e-06, + "loss": 0.9058, + "step": 1902 + }, + { + "epoch": 0.10259866292861763, + "grad_norm": 
0.814194917678833, + "learning_rate": 9.985395824576463e-06, + "loss": 0.7775, + "step": 1903 + }, + { + "epoch": 0.10265257709726117, + "grad_norm": 0.9210302233695984, + "learning_rate": 9.98537962812392e-06, + "loss": 0.9288, + "step": 1904 + }, + { + "epoch": 0.10270649126590468, + "grad_norm": 0.8850705027580261, + "learning_rate": 9.985363422708336e-06, + "loss": 0.9036, + "step": 1905 + }, + { + "epoch": 0.1027604054345482, + "grad_norm": 0.8312196731567383, + "learning_rate": 9.985347208329742e-06, + "loss": 0.811, + "step": 1906 + }, + { + "epoch": 0.10281431960319172, + "grad_norm": 1.1294670104980469, + "learning_rate": 9.985330984988164e-06, + "loss": 0.9775, + "step": 1907 + }, + { + "epoch": 0.10286823377183524, + "grad_norm": 0.7980399131774902, + "learning_rate": 9.985314752683635e-06, + "loss": 0.7786, + "step": 1908 + }, + { + "epoch": 0.10292214794047876, + "grad_norm": 0.8291264176368713, + "learning_rate": 9.985298511416181e-06, + "loss": 0.7028, + "step": 1909 + }, + { + "epoch": 0.10297606210912227, + "grad_norm": 0.8284684419631958, + "learning_rate": 9.985282261185833e-06, + "loss": 0.8043, + "step": 1910 + }, + { + "epoch": 0.10302997627776579, + "grad_norm": 0.8680904507637024, + "learning_rate": 9.985266001992622e-06, + "loss": 0.8274, + "step": 1911 + }, + { + "epoch": 0.10308389044640931, + "grad_norm": 0.7380900979042053, + "learning_rate": 9.985249733836573e-06, + "loss": 0.6991, + "step": 1912 + }, + { + "epoch": 0.10313780461505284, + "grad_norm": 0.8572129011154175, + "learning_rate": 9.985233456717718e-06, + "loss": 0.7751, + "step": 1913 + }, + { + "epoch": 0.10319171878369636, + "grad_norm": 0.8797627687454224, + "learning_rate": 9.985217170636085e-06, + "loss": 0.8681, + "step": 1914 + }, + { + "epoch": 0.10324563295233988, + "grad_norm": 0.9301999807357788, + "learning_rate": 9.985200875591704e-06, + "loss": 0.6208, + "step": 1915 + }, + { + "epoch": 0.1032995471209834, + "grad_norm": 0.8296228647232056, + "learning_rate": 
9.985184571584606e-06, + "loss": 0.8027, + "step": 1916 + }, + { + "epoch": 0.10335346128962691, + "grad_norm": 0.8241246342658997, + "learning_rate": 9.985168258614815e-06, + "loss": 0.8223, + "step": 1917 + }, + { + "epoch": 0.10340737545827043, + "grad_norm": 0.9633389115333557, + "learning_rate": 9.985151936682367e-06, + "loss": 0.9037, + "step": 1918 + }, + { + "epoch": 0.10346128962691395, + "grad_norm": 0.8903288245201111, + "learning_rate": 9.985135605787286e-06, + "loss": 0.8949, + "step": 1919 + }, + { + "epoch": 0.10351520379555747, + "grad_norm": 0.8670981526374817, + "learning_rate": 9.985119265929604e-06, + "loss": 0.7094, + "step": 1920 + }, + { + "epoch": 0.10356911796420099, + "grad_norm": 0.9681735038757324, + "learning_rate": 9.985102917109351e-06, + "loss": 0.9617, + "step": 1921 + }, + { + "epoch": 0.10362303213284452, + "grad_norm": 0.9229291081428528, + "learning_rate": 9.985086559326555e-06, + "loss": 0.9384, + "step": 1922 + }, + { + "epoch": 0.10367694630148803, + "grad_norm": 0.8501392602920532, + "learning_rate": 9.985070192581245e-06, + "loss": 0.8647, + "step": 1923 + }, + { + "epoch": 0.10373086047013155, + "grad_norm": 1.4047728776931763, + "learning_rate": 9.985053816873452e-06, + "loss": 0.7905, + "step": 1924 + }, + { + "epoch": 0.10378477463877507, + "grad_norm": 1.154661774635315, + "learning_rate": 9.985037432203204e-06, + "loss": 0.8666, + "step": 1925 + }, + { + "epoch": 0.10383868880741859, + "grad_norm": 1.042126178741455, + "learning_rate": 9.985021038570532e-06, + "loss": 0.7736, + "step": 1926 + }, + { + "epoch": 0.1038926029760621, + "grad_norm": 0.7904629111289978, + "learning_rate": 9.985004635975464e-06, + "loss": 0.7247, + "step": 1927 + }, + { + "epoch": 0.10394651714470562, + "grad_norm": 0.8718095421791077, + "learning_rate": 9.984988224418029e-06, + "loss": 0.7792, + "step": 1928 + }, + { + "epoch": 0.10400043131334914, + "grad_norm": 0.870330274105072, + "learning_rate": 9.984971803898258e-06, + "loss": 0.7992, 
+ "step": 1929 + }, + { + "epoch": 0.10405434548199267, + "grad_norm": 0.8473007678985596, + "learning_rate": 9.98495537441618e-06, + "loss": 0.883, + "step": 1930 + }, + { + "epoch": 0.10410825965063619, + "grad_norm": 1.0333232879638672, + "learning_rate": 9.984938935971824e-06, + "loss": 0.9228, + "step": 1931 + }, + { + "epoch": 0.10416217381927971, + "grad_norm": 0.9389268159866333, + "learning_rate": 9.984922488565221e-06, + "loss": 0.7792, + "step": 1932 + }, + { + "epoch": 0.10421608798792323, + "grad_norm": 0.9977405667304993, + "learning_rate": 9.9849060321964e-06, + "loss": 0.7971, + "step": 1933 + }, + { + "epoch": 0.10427000215656675, + "grad_norm": 0.7879780530929565, + "learning_rate": 9.98488956686539e-06, + "loss": 0.8149, + "step": 1934 + }, + { + "epoch": 0.10432391632521026, + "grad_norm": 0.8149437308311462, + "learning_rate": 9.98487309257222e-06, + "loss": 0.8391, + "step": 1935 + }, + { + "epoch": 0.10437783049385378, + "grad_norm": 0.9226745367050171, + "learning_rate": 9.984856609316921e-06, + "loss": 0.9581, + "step": 1936 + }, + { + "epoch": 0.1044317446624973, + "grad_norm": 0.9190924167633057, + "learning_rate": 9.984840117099524e-06, + "loss": 0.8859, + "step": 1937 + }, + { + "epoch": 0.10448565883114082, + "grad_norm": 0.7996852397918701, + "learning_rate": 9.984823615920054e-06, + "loss": 0.7377, + "step": 1938 + }, + { + "epoch": 0.10453957299978435, + "grad_norm": 1.0055615901947021, + "learning_rate": 9.984807105778544e-06, + "loss": 1.4365, + "step": 1939 + }, + { + "epoch": 0.10459348716842787, + "grad_norm": 0.8595201969146729, + "learning_rate": 9.984790586675023e-06, + "loss": 0.807, + "step": 1940 + }, + { + "epoch": 0.10464740133707139, + "grad_norm": 0.9500923156738281, + "learning_rate": 9.984774058609522e-06, + "loss": 0.9378, + "step": 1941 + }, + { + "epoch": 0.1047013155057149, + "grad_norm": 0.8677893877029419, + "learning_rate": 9.98475752158207e-06, + "loss": 0.8399, + "step": 1942 + }, + { + "epoch": 
0.10475522967435842, + "grad_norm": 0.8256751298904419, + "learning_rate": 9.984740975592695e-06, + "loss": 0.8552, + "step": 1943 + }, + { + "epoch": 0.10480914384300194, + "grad_norm": 0.8910439610481262, + "learning_rate": 9.984724420641427e-06, + "loss": 0.9704, + "step": 1944 + }, + { + "epoch": 0.10486305801164546, + "grad_norm": 0.8732389807701111, + "learning_rate": 9.9847078567283e-06, + "loss": 0.8448, + "step": 1945 + }, + { + "epoch": 0.10491697218028898, + "grad_norm": 0.856151282787323, + "learning_rate": 9.984691283853338e-06, + "loss": 0.7403, + "step": 1946 + }, + { + "epoch": 0.1049708863489325, + "grad_norm": 0.8741405010223389, + "learning_rate": 9.984674702016573e-06, + "loss": 0.8913, + "step": 1947 + }, + { + "epoch": 0.10502480051757603, + "grad_norm": 0.9214139580726624, + "learning_rate": 9.984658111218036e-06, + "loss": 0.8901, + "step": 1948 + }, + { + "epoch": 0.10507871468621954, + "grad_norm": 0.9773908853530884, + "learning_rate": 9.984641511457757e-06, + "loss": 0.7979, + "step": 1949 + }, + { + "epoch": 0.10513262885486306, + "grad_norm": 0.9136568903923035, + "learning_rate": 9.984624902735765e-06, + "loss": 0.9019, + "step": 1950 + }, + { + "epoch": 0.10518654302350658, + "grad_norm": 0.857468843460083, + "learning_rate": 9.984608285052087e-06, + "loss": 0.7663, + "step": 1951 + }, + { + "epoch": 0.1052404571921501, + "grad_norm": 0.8473180532455444, + "learning_rate": 9.984591658406756e-06, + "loss": 0.8137, + "step": 1952 + }, + { + "epoch": 0.10529437136079361, + "grad_norm": 0.8932186961174011, + "learning_rate": 9.984575022799805e-06, + "loss": 0.8859, + "step": 1953 + }, + { + "epoch": 0.10534828552943713, + "grad_norm": 0.8191091418266296, + "learning_rate": 9.984558378231257e-06, + "loss": 0.8111, + "step": 1954 + }, + { + "epoch": 0.10540219969808065, + "grad_norm": 0.8452546000480652, + "learning_rate": 9.984541724701147e-06, + "loss": 0.8563, + "step": 1955 + }, + { + "epoch": 0.10545611386672417, + "grad_norm": 
0.8053101897239685, + "learning_rate": 9.984525062209502e-06, + "loss": 0.8166, + "step": 1956 + }, + { + "epoch": 0.1055100280353677, + "grad_norm": 0.7936314344406128, + "learning_rate": 9.984508390756354e-06, + "loss": 0.8446, + "step": 1957 + }, + { + "epoch": 0.10556394220401122, + "grad_norm": 0.7867884635925293, + "learning_rate": 9.984491710341733e-06, + "loss": 0.7719, + "step": 1958 + }, + { + "epoch": 0.10561785637265474, + "grad_norm": 0.8387873768806458, + "learning_rate": 9.984475020965667e-06, + "loss": 0.842, + "step": 1959 + }, + { + "epoch": 0.10567177054129825, + "grad_norm": 0.8028631806373596, + "learning_rate": 9.984458322628188e-06, + "loss": 0.7673, + "step": 1960 + }, + { + "epoch": 0.10572568470994177, + "grad_norm": 0.765836238861084, + "learning_rate": 9.984441615329323e-06, + "loss": 0.7383, + "step": 1961 + }, + { + "epoch": 0.10577959887858529, + "grad_norm": 0.8619019389152527, + "learning_rate": 9.984424899069106e-06, + "loss": 0.8076, + "step": 1962 + }, + { + "epoch": 0.10583351304722881, + "grad_norm": 1.1085911989212036, + "learning_rate": 9.984408173847565e-06, + "loss": 0.9379, + "step": 1963 + }, + { + "epoch": 0.10588742721587233, + "grad_norm": 0.7861249446868896, + "learning_rate": 9.98439143966473e-06, + "loss": 0.7531, + "step": 1964 + }, + { + "epoch": 0.10594134138451584, + "grad_norm": 0.8964807391166687, + "learning_rate": 9.984374696520633e-06, + "loss": 0.7991, + "step": 1965 + }, + { + "epoch": 0.10599525555315938, + "grad_norm": 0.720808207988739, + "learning_rate": 9.984357944415302e-06, + "loss": 0.7171, + "step": 1966 + }, + { + "epoch": 0.1060491697218029, + "grad_norm": 0.9870907068252563, + "learning_rate": 9.984341183348766e-06, + "loss": 0.8168, + "step": 1967 + }, + { + "epoch": 0.10610308389044641, + "grad_norm": 0.7987208366394043, + "learning_rate": 9.984324413321057e-06, + "loss": 0.817, + "step": 1968 + }, + { + "epoch": 0.10615699805908993, + "grad_norm": 0.7737677097320557, + "learning_rate": 
9.984307634332206e-06, + "loss": 0.855, + "step": 1969 + }, + { + "epoch": 0.10621091222773345, + "grad_norm": 0.9125123620033264, + "learning_rate": 9.984290846382243e-06, + "loss": 0.8059, + "step": 1970 + }, + { + "epoch": 0.10626482639637697, + "grad_norm": 0.8460454344749451, + "learning_rate": 9.984274049471197e-06, + "loss": 0.7415, + "step": 1971 + }, + { + "epoch": 0.10631874056502048, + "grad_norm": 0.8322888016700745, + "learning_rate": 9.984257243599096e-06, + "loss": 0.793, + "step": 1972 + }, + { + "epoch": 0.106372654733664, + "grad_norm": 0.7797715067863464, + "learning_rate": 9.984240428765975e-06, + "loss": 0.7324, + "step": 1973 + }, + { + "epoch": 0.10642656890230752, + "grad_norm": 0.847457766532898, + "learning_rate": 9.98422360497186e-06, + "loss": 0.7949, + "step": 1974 + }, + { + "epoch": 0.10648048307095105, + "grad_norm": 0.8471247553825378, + "learning_rate": 9.984206772216785e-06, + "loss": 0.8368, + "step": 1975 + }, + { + "epoch": 0.10653439723959457, + "grad_norm": 0.879416823387146, + "learning_rate": 9.984189930500778e-06, + "loss": 0.7779, + "step": 1976 + }, + { + "epoch": 0.10658831140823809, + "grad_norm": 0.8355580568313599, + "learning_rate": 9.98417307982387e-06, + "loss": 0.7741, + "step": 1977 + }, + { + "epoch": 0.1066422255768816, + "grad_norm": 0.8388553857803345, + "learning_rate": 9.98415622018609e-06, + "loss": 0.7839, + "step": 1978 + }, + { + "epoch": 0.10669613974552512, + "grad_norm": 0.7899215221405029, + "learning_rate": 9.98413935158747e-06, + "loss": 0.7419, + "step": 1979 + }, + { + "epoch": 0.10675005391416864, + "grad_norm": 0.9422525763511658, + "learning_rate": 9.98412247402804e-06, + "loss": 0.7977, + "step": 1980 + }, + { + "epoch": 0.10680396808281216, + "grad_norm": 0.8084313869476318, + "learning_rate": 9.984105587507831e-06, + "loss": 0.6813, + "step": 1981 + }, + { + "epoch": 0.10685788225145568, + "grad_norm": 0.9860095977783203, + "learning_rate": 9.98408869202687e-06, + "loss": 0.8934, + 
"step": 1982 + }, + { + "epoch": 0.10691179642009921, + "grad_norm": 0.9511064887046814, + "learning_rate": 9.98407178758519e-06, + "loss": 0.8438, + "step": 1983 + }, + { + "epoch": 0.10696571058874273, + "grad_norm": 0.9021103978157043, + "learning_rate": 9.984054874182822e-06, + "loss": 0.854, + "step": 1984 + }, + { + "epoch": 0.10701962475738624, + "grad_norm": 0.8343318104743958, + "learning_rate": 9.984037951819796e-06, + "loss": 0.8075, + "step": 1985 + }, + { + "epoch": 0.10707353892602976, + "grad_norm": 0.8592053651809692, + "learning_rate": 9.984021020496141e-06, + "loss": 0.8431, + "step": 1986 + }, + { + "epoch": 0.10712745309467328, + "grad_norm": 0.8554633259773254, + "learning_rate": 9.98400408021189e-06, + "loss": 0.797, + "step": 1987 + }, + { + "epoch": 0.1071813672633168, + "grad_norm": 0.8476511240005493, + "learning_rate": 9.98398713096707e-06, + "loss": 0.834, + "step": 1988 + }, + { + "epoch": 0.10723528143196032, + "grad_norm": 0.8374871611595154, + "learning_rate": 9.983970172761715e-06, + "loss": 0.7934, + "step": 1989 + }, + { + "epoch": 0.10728919560060383, + "grad_norm": 0.8740583658218384, + "learning_rate": 9.983953205595853e-06, + "loss": 0.8945, + "step": 1990 + }, + { + "epoch": 0.10734310976924735, + "grad_norm": 0.8888646364212036, + "learning_rate": 9.983936229469514e-06, + "loss": 0.8582, + "step": 1991 + }, + { + "epoch": 0.10739702393789088, + "grad_norm": 0.7999173402786255, + "learning_rate": 9.983919244382732e-06, + "loss": 0.7906, + "step": 1992 + }, + { + "epoch": 0.1074509381065344, + "grad_norm": 0.8284609913825989, + "learning_rate": 9.983902250335532e-06, + "loss": 0.8282, + "step": 1993 + }, + { + "epoch": 0.10750485227517792, + "grad_norm": 0.8933084607124329, + "learning_rate": 9.98388524732795e-06, + "loss": 0.8332, + "step": 1994 + }, + { + "epoch": 0.10755876644382144, + "grad_norm": 1.1771386861801147, + "learning_rate": 9.983868235360017e-06, + "loss": 0.6624, + "step": 1995 + }, + { + "epoch": 
0.10761268061246496, + "grad_norm": 0.7977056503295898, + "learning_rate": 9.98385121443176e-06, + "loss": 0.7169, + "step": 1996 + }, + { + "epoch": 0.10766659478110847, + "grad_norm": 1.1132346391677856, + "learning_rate": 9.98383418454321e-06, + "loss": 0.8448, + "step": 1997 + }, + { + "epoch": 0.10772050894975199, + "grad_norm": 0.8148393034934998, + "learning_rate": 9.983817145694396e-06, + "loss": 0.7313, + "step": 1998 + }, + { + "epoch": 0.10777442311839551, + "grad_norm": 1.0594265460968018, + "learning_rate": 9.983800097885353e-06, + "loss": 0.9795, + "step": 1999 + }, + { + "epoch": 0.10782833728703903, + "grad_norm": 0.8699034452438354, + "learning_rate": 9.983783041116109e-06, + "loss": 0.8717, + "step": 2000 + }, + { + "epoch": 0.10788225145568256, + "grad_norm": 1.0455189943313599, + "learning_rate": 9.983765975386696e-06, + "loss": 0.898, + "step": 2001 + }, + { + "epoch": 0.10793616562432608, + "grad_norm": 1.0363630056381226, + "learning_rate": 9.983748900697143e-06, + "loss": 0.8404, + "step": 2002 + }, + { + "epoch": 0.1079900797929696, + "grad_norm": 0.7753402590751648, + "learning_rate": 9.983731817047482e-06, + "loss": 0.8416, + "step": 2003 + }, + { + "epoch": 0.10804399396161311, + "grad_norm": 0.7321370244026184, + "learning_rate": 9.983714724437744e-06, + "loss": 0.7051, + "step": 2004 + }, + { + "epoch": 0.10809790813025663, + "grad_norm": 0.8907992839813232, + "learning_rate": 9.983697622867959e-06, + "loss": 0.8347, + "step": 2005 + }, + { + "epoch": 0.10815182229890015, + "grad_norm": 0.8662189841270447, + "learning_rate": 9.983680512338157e-06, + "loss": 0.7704, + "step": 2006 + }, + { + "epoch": 0.10820573646754367, + "grad_norm": 0.9187548756599426, + "learning_rate": 9.983663392848371e-06, + "loss": 0.8926, + "step": 2007 + }, + { + "epoch": 0.10825965063618719, + "grad_norm": 1.0350191593170166, + "learning_rate": 9.983646264398629e-06, + "loss": 0.8253, + "step": 2008 + }, + { + "epoch": 0.1083135648048307, + "grad_norm": 
0.9566621780395508, + "learning_rate": 9.983629126988963e-06, + "loss": 0.8545, + "step": 2009 + }, + { + "epoch": 0.10836747897347423, + "grad_norm": 0.7644455432891846, + "learning_rate": 9.983611980619405e-06, + "loss": 0.707, + "step": 2010 + }, + { + "epoch": 0.10842139314211775, + "grad_norm": 0.7929621934890747, + "learning_rate": 9.983594825289983e-06, + "loss": 0.8123, + "step": 2011 + }, + { + "epoch": 0.10847530731076127, + "grad_norm": 0.8667447566986084, + "learning_rate": 9.983577661000732e-06, + "loss": 0.8371, + "step": 2012 + }, + { + "epoch": 0.10852922147940479, + "grad_norm": 0.9008684158325195, + "learning_rate": 9.98356048775168e-06, + "loss": 0.8088, + "step": 2013 + }, + { + "epoch": 0.1085831356480483, + "grad_norm": 0.8797710537910461, + "learning_rate": 9.983543305542858e-06, + "loss": 0.8315, + "step": 2014 + }, + { + "epoch": 0.10863704981669182, + "grad_norm": 1.0082249641418457, + "learning_rate": 9.983526114374296e-06, + "loss": 0.6944, + "step": 2015 + }, + { + "epoch": 0.10869096398533534, + "grad_norm": 0.8216932415962219, + "learning_rate": 9.983508914246027e-06, + "loss": 0.7704, + "step": 2016 + }, + { + "epoch": 0.10874487815397886, + "grad_norm": 0.7873802781105042, + "learning_rate": 9.983491705158082e-06, + "loss": 0.8269, + "step": 2017 + }, + { + "epoch": 0.10879879232262238, + "grad_norm": 0.9200018644332886, + "learning_rate": 9.983474487110492e-06, + "loss": 0.8736, + "step": 2018 + }, + { + "epoch": 0.10885270649126591, + "grad_norm": 0.8780434727668762, + "learning_rate": 9.983457260103284e-06, + "loss": 0.8959, + "step": 2019 + }, + { + "epoch": 0.10890662065990943, + "grad_norm": 0.8503702878952026, + "learning_rate": 9.983440024136493e-06, + "loss": 0.874, + "step": 2020 + }, + { + "epoch": 0.10896053482855295, + "grad_norm": 0.8003312349319458, + "learning_rate": 9.98342277921015e-06, + "loss": 0.8053, + "step": 2021 + }, + { + "epoch": 0.10901444899719646, + "grad_norm": 0.8508152961730957, + "learning_rate": 
9.983405525324284e-06, + "loss": 0.8349, + "step": 2022 + }, + { + "epoch": 0.10906836316583998, + "grad_norm": 0.7947866320610046, + "learning_rate": 9.983388262478928e-06, + "loss": 0.7969, + "step": 2023 + }, + { + "epoch": 0.1091222773344835, + "grad_norm": 0.7566391229629517, + "learning_rate": 9.98337099067411e-06, + "loss": 0.7485, + "step": 2024 + }, + { + "epoch": 0.10917619150312702, + "grad_norm": 0.7484708428382874, + "learning_rate": 9.983353709909865e-06, + "loss": 0.7223, + "step": 2025 + }, + { + "epoch": 0.10923010567177054, + "grad_norm": 0.7474842667579651, + "learning_rate": 9.983336420186223e-06, + "loss": 0.7643, + "step": 2026 + }, + { + "epoch": 0.10928401984041405, + "grad_norm": 0.9116804003715515, + "learning_rate": 9.983319121503212e-06, + "loss": 0.9259, + "step": 2027 + }, + { + "epoch": 0.10933793400905759, + "grad_norm": 0.7918151617050171, + "learning_rate": 9.983301813860866e-06, + "loss": 0.8006, + "step": 2028 + }, + { + "epoch": 0.1093918481777011, + "grad_norm": 0.8043256998062134, + "learning_rate": 9.983284497259216e-06, + "loss": 0.7776, + "step": 2029 + }, + { + "epoch": 0.10944576234634462, + "grad_norm": 0.7829573154449463, + "learning_rate": 9.983267171698292e-06, + "loss": 0.7518, + "step": 2030 + }, + { + "epoch": 0.10949967651498814, + "grad_norm": 0.9080957174301147, + "learning_rate": 9.983249837178126e-06, + "loss": 0.777, + "step": 2031 + }, + { + "epoch": 0.10955359068363166, + "grad_norm": 0.9077693223953247, + "learning_rate": 9.983232493698748e-06, + "loss": 0.7412, + "step": 2032 + }, + { + "epoch": 0.10960750485227518, + "grad_norm": 0.7891800403594971, + "learning_rate": 9.98321514126019e-06, + "loss": 0.8089, + "step": 2033 + }, + { + "epoch": 0.1096614190209187, + "grad_norm": 0.8350703716278076, + "learning_rate": 9.983197779862485e-06, + "loss": 0.8414, + "step": 2034 + }, + { + "epoch": 0.10971533318956221, + "grad_norm": 0.8714777231216431, + "learning_rate": 9.983180409505663e-06, + "loss": 0.7355, + 
"step": 2035 + }, + { + "epoch": 0.10976924735820574, + "grad_norm": 0.8524130582809448, + "learning_rate": 9.98316303018975e-06, + "loss": 0.8611, + "step": 2036 + }, + { + "epoch": 0.10982316152684926, + "grad_norm": 0.8570566177368164, + "learning_rate": 9.983145641914787e-06, + "loss": 0.799, + "step": 2037 + }, + { + "epoch": 0.10987707569549278, + "grad_norm": 0.8222963213920593, + "learning_rate": 9.983128244680797e-06, + "loss": 0.8302, + "step": 2038 + }, + { + "epoch": 0.1099309898641363, + "grad_norm": 0.7977816462516785, + "learning_rate": 9.983110838487818e-06, + "loss": 0.8475, + "step": 2039 + }, + { + "epoch": 0.10998490403277981, + "grad_norm": 0.7925818562507629, + "learning_rate": 9.983093423335875e-06, + "loss": 0.7176, + "step": 2040 + }, + { + "epoch": 0.11003881820142333, + "grad_norm": 0.8456152081489563, + "learning_rate": 9.983075999225002e-06, + "loss": 0.785, + "step": 2041 + }, + { + "epoch": 0.11009273237006685, + "grad_norm": 0.8691622018814087, + "learning_rate": 9.98305856615523e-06, + "loss": 0.8871, + "step": 2042 + }, + { + "epoch": 0.11014664653871037, + "grad_norm": 0.9402886629104614, + "learning_rate": 9.983041124126593e-06, + "loss": 0.8239, + "step": 2043 + }, + { + "epoch": 0.11020056070735389, + "grad_norm": 0.7975844144821167, + "learning_rate": 9.98302367313912e-06, + "loss": 0.7336, + "step": 2044 + }, + { + "epoch": 0.11025447487599742, + "grad_norm": 0.8384075164794922, + "learning_rate": 9.98300621319284e-06, + "loss": 0.9003, + "step": 2045 + }, + { + "epoch": 0.11030838904464094, + "grad_norm": 0.847994327545166, + "learning_rate": 9.98298874428779e-06, + "loss": 0.8611, + "step": 2046 + }, + { + "epoch": 0.11036230321328445, + "grad_norm": 0.801159143447876, + "learning_rate": 9.982971266423996e-06, + "loss": 0.7967, + "step": 2047 + }, + { + "epoch": 0.11041621738192797, + "grad_norm": 0.8316680192947388, + "learning_rate": 9.982953779601492e-06, + "loss": 0.8644, + "step": 2048 + }, + { + "epoch": 
0.11047013155057149, + "grad_norm": 0.9387392401695251, + "learning_rate": 9.982936283820311e-06, + "loss": 0.916, + "step": 2049 + }, + { + "epoch": 0.11052404571921501, + "grad_norm": 0.8682491779327393, + "learning_rate": 9.982918779080481e-06, + "loss": 0.8267, + "step": 2050 + }, + { + "epoch": 0.11057795988785853, + "grad_norm": 0.8443827629089355, + "learning_rate": 9.982901265382034e-06, + "loss": 0.8129, + "step": 2051 + }, + { + "epoch": 0.11063187405650204, + "grad_norm": 0.8612427115440369, + "learning_rate": 9.982883742725005e-06, + "loss": 0.9203, + "step": 2052 + }, + { + "epoch": 0.11068578822514556, + "grad_norm": 0.786834716796875, + "learning_rate": 9.98286621110942e-06, + "loss": 0.7731, + "step": 2053 + }, + { + "epoch": 0.1107397023937891, + "grad_norm": 0.8566606044769287, + "learning_rate": 9.982848670535316e-06, + "loss": 0.8111, + "step": 2054 + }, + { + "epoch": 0.11079361656243261, + "grad_norm": 0.7485222816467285, + "learning_rate": 9.982831121002722e-06, + "loss": 0.722, + "step": 2055 + }, + { + "epoch": 0.11084753073107613, + "grad_norm": 0.7441151738166809, + "learning_rate": 9.98281356251167e-06, + "loss": 0.7081, + "step": 2056 + }, + { + "epoch": 0.11090144489971965, + "grad_norm": 0.8212536573410034, + "learning_rate": 9.98279599506219e-06, + "loss": 0.8572, + "step": 2057 + }, + { + "epoch": 0.11095535906836317, + "grad_norm": 0.8686707019805908, + "learning_rate": 9.982778418654315e-06, + "loss": 0.8553, + "step": 2058 + }, + { + "epoch": 0.11100927323700668, + "grad_norm": 0.8908647298812866, + "learning_rate": 9.982760833288079e-06, + "loss": 0.9059, + "step": 2059 + }, + { + "epoch": 0.1110631874056502, + "grad_norm": 0.9393401741981506, + "learning_rate": 9.982743238963508e-06, + "loss": 0.8574, + "step": 2060 + }, + { + "epoch": 0.11111710157429372, + "grad_norm": 0.9027063250541687, + "learning_rate": 9.982725635680638e-06, + "loss": 0.7717, + "step": 2061 + }, + { + "epoch": 0.11117101574293724, + "grad_norm": 
0.7742587924003601, + "learning_rate": 9.982708023439498e-06, + "loss": 0.6618, + "step": 2062 + }, + { + "epoch": 0.11122492991158077, + "grad_norm": 0.8025707602500916, + "learning_rate": 9.982690402240124e-06, + "loss": 0.7263, + "step": 2063 + }, + { + "epoch": 0.11127884408022429, + "grad_norm": 0.8629397749900818, + "learning_rate": 9.982672772082541e-06, + "loss": 0.8222, + "step": 2064 + }, + { + "epoch": 0.1113327582488678, + "grad_norm": 0.8332691788673401, + "learning_rate": 9.982655132966785e-06, + "loss": 0.8302, + "step": 2065 + }, + { + "epoch": 0.11138667241751132, + "grad_norm": 0.8381907939910889, + "learning_rate": 9.982637484892889e-06, + "loss": 0.8638, + "step": 2066 + }, + { + "epoch": 0.11144058658615484, + "grad_norm": 1.0945167541503906, + "learning_rate": 9.982619827860882e-06, + "loss": 0.8866, + "step": 2067 + }, + { + "epoch": 0.11149450075479836, + "grad_norm": 0.8755025267601013, + "learning_rate": 9.982602161870795e-06, + "loss": 0.8587, + "step": 2068 + }, + { + "epoch": 0.11154841492344188, + "grad_norm": 0.8665636777877808, + "learning_rate": 9.982584486922664e-06, + "loss": 0.8309, + "step": 2069 + }, + { + "epoch": 0.1116023290920854, + "grad_norm": 0.8764104247093201, + "learning_rate": 9.982566803016516e-06, + "loss": 0.9003, + "step": 2070 + }, + { + "epoch": 0.11165624326072891, + "grad_norm": 1.1225675344467163, + "learning_rate": 9.982549110152387e-06, + "loss": 0.8897, + "step": 2071 + }, + { + "epoch": 0.11171015742937244, + "grad_norm": 0.7883412837982178, + "learning_rate": 9.982531408330304e-06, + "loss": 0.7104, + "step": 2072 + }, + { + "epoch": 0.11176407159801596, + "grad_norm": 0.8683668971061707, + "learning_rate": 9.982513697550303e-06, + "loss": 0.831, + "step": 2073 + }, + { + "epoch": 0.11181798576665948, + "grad_norm": 0.9139745831489563, + "learning_rate": 9.982495977812415e-06, + "loss": 0.7492, + "step": 2074 + }, + { + "epoch": 0.111871899935303, + "grad_norm": 0.8651925921440125, + "learning_rate": 
9.98247824911667e-06, + "loss": 0.8385, + "step": 2075 + }, + { + "epoch": 0.11192581410394652, + "grad_norm": 0.9110192656517029, + "learning_rate": 9.982460511463102e-06, + "loss": 0.8513, + "step": 2076 + }, + { + "epoch": 0.11197972827259003, + "grad_norm": 0.8511810302734375, + "learning_rate": 9.982442764851742e-06, + "loss": 0.8352, + "step": 2077 + }, + { + "epoch": 0.11203364244123355, + "grad_norm": 0.8981106877326965, + "learning_rate": 9.982425009282622e-06, + "loss": 0.7837, + "step": 2078 + }, + { + "epoch": 0.11208755660987707, + "grad_norm": 0.7660240530967712, + "learning_rate": 9.982407244755771e-06, + "loss": 0.6994, + "step": 2079 + }, + { + "epoch": 0.11214147077852059, + "grad_norm": 0.830569863319397, + "learning_rate": 9.982389471271228e-06, + "loss": 0.7756, + "step": 2080 + }, + { + "epoch": 0.11219538494716412, + "grad_norm": 0.8888838887214661, + "learning_rate": 9.982371688829018e-06, + "loss": 0.7302, + "step": 2081 + }, + { + "epoch": 0.11224929911580764, + "grad_norm": 0.823513388633728, + "learning_rate": 9.982353897429176e-06, + "loss": 0.8357, + "step": 2082 + }, + { + "epoch": 0.11230321328445116, + "grad_norm": 0.8353226780891418, + "learning_rate": 9.982336097071734e-06, + "loss": 0.7939, + "step": 2083 + }, + { + "epoch": 0.11235712745309467, + "grad_norm": 1.0246703624725342, + "learning_rate": 9.982318287756725e-06, + "loss": 0.9416, + "step": 2084 + }, + { + "epoch": 0.11241104162173819, + "grad_norm": 0.9405194520950317, + "learning_rate": 9.982300469484178e-06, + "loss": 0.8296, + "step": 2085 + }, + { + "epoch": 0.11246495579038171, + "grad_norm": 0.905885636806488, + "learning_rate": 9.982282642254126e-06, + "loss": 0.8181, + "step": 2086 + }, + { + "epoch": 0.11251886995902523, + "grad_norm": 0.8098746538162231, + "learning_rate": 9.982264806066604e-06, + "loss": 0.7372, + "step": 2087 + }, + { + "epoch": 0.11257278412766875, + "grad_norm": 1.2416350841522217, + "learning_rate": 9.98224696092164e-06, + "loss": 0.8984, 
+ "step": 2088 + }, + { + "epoch": 0.11262669829631228, + "grad_norm": 0.8675969839096069, + "learning_rate": 9.98222910681927e-06, + "loss": 0.8417, + "step": 2089 + }, + { + "epoch": 0.1126806124649558, + "grad_norm": 1.063124179840088, + "learning_rate": 9.982211243759522e-06, + "loss": 0.9227, + "step": 2090 + }, + { + "epoch": 0.11273452663359931, + "grad_norm": 0.9010531902313232, + "learning_rate": 9.98219337174243e-06, + "loss": 0.9547, + "step": 2091 + }, + { + "epoch": 0.11278844080224283, + "grad_norm": 0.7843347191810608, + "learning_rate": 9.982175490768027e-06, + "loss": 0.8607, + "step": 2092 + }, + { + "epoch": 0.11284235497088635, + "grad_norm": 0.8451966643333435, + "learning_rate": 9.982157600836344e-06, + "loss": 0.8788, + "step": 2093 + }, + { + "epoch": 0.11289626913952987, + "grad_norm": 0.7359250783920288, + "learning_rate": 9.982139701947415e-06, + "loss": 0.7916, + "step": 2094 + }, + { + "epoch": 0.11295018330817339, + "grad_norm": 0.8133944869041443, + "learning_rate": 9.98212179410127e-06, + "loss": 0.8327, + "step": 2095 + }, + { + "epoch": 0.1130040974768169, + "grad_norm": 0.8658613562583923, + "learning_rate": 9.982103877297941e-06, + "loss": 0.7648, + "step": 2096 + }, + { + "epoch": 0.11305801164546042, + "grad_norm": 0.8523211479187012, + "learning_rate": 9.982085951537463e-06, + "loss": 0.8618, + "step": 2097 + }, + { + "epoch": 0.11311192581410395, + "grad_norm": 0.9494971632957458, + "learning_rate": 9.982068016819867e-06, + "loss": 0.8116, + "step": 2098 + }, + { + "epoch": 0.11316583998274747, + "grad_norm": 0.797603964805603, + "learning_rate": 9.982050073145182e-06, + "loss": 0.7268, + "step": 2099 + }, + { + "epoch": 0.11321975415139099, + "grad_norm": 0.8662691712379456, + "learning_rate": 9.982032120513443e-06, + "loss": 0.8007, + "step": 2100 + }, + { + "epoch": 0.1132736683200345, + "grad_norm": 0.8377127051353455, + "learning_rate": 9.982014158924684e-06, + "loss": 0.813, + "step": 2101 + }, + { + "epoch": 
0.11332758248867802, + "grad_norm": 1.0051186084747314, + "learning_rate": 9.981996188378934e-06, + "loss": 0.921, + "step": 2102 + }, + { + "epoch": 0.11338149665732154, + "grad_norm": 0.7831799983978271, + "learning_rate": 9.981978208876228e-06, + "loss": 0.9197, + "step": 2103 + }, + { + "epoch": 0.11343541082596506, + "grad_norm": 1.0273268222808838, + "learning_rate": 9.981960220416595e-06, + "loss": 0.9144, + "step": 2104 + }, + { + "epoch": 0.11348932499460858, + "grad_norm": 0.8754317164421082, + "learning_rate": 9.981942223000072e-06, + "loss": 0.8359, + "step": 2105 + }, + { + "epoch": 0.1135432391632521, + "grad_norm": 0.7923420071601868, + "learning_rate": 9.981924216626686e-06, + "loss": 0.737, + "step": 2106 + }, + { + "epoch": 0.11359715333189563, + "grad_norm": 0.8651608824729919, + "learning_rate": 9.981906201296475e-06, + "loss": 0.7588, + "step": 2107 + }, + { + "epoch": 0.11365106750053915, + "grad_norm": 0.9219616651535034, + "learning_rate": 9.981888177009468e-06, + "loss": 0.8598, + "step": 2108 + }, + { + "epoch": 0.11370498166918266, + "grad_norm": 0.8936532139778137, + "learning_rate": 9.981870143765697e-06, + "loss": 0.7718, + "step": 2109 + }, + { + "epoch": 0.11375889583782618, + "grad_norm": 0.8959317803382874, + "learning_rate": 9.981852101565195e-06, + "loss": 0.794, + "step": 2110 + }, + { + "epoch": 0.1138128100064697, + "grad_norm": 0.8781943917274475, + "learning_rate": 9.981834050407997e-06, + "loss": 0.8045, + "step": 2111 + }, + { + "epoch": 0.11386672417511322, + "grad_norm": 0.8148792386054993, + "learning_rate": 9.981815990294131e-06, + "loss": 0.7398, + "step": 2112 + }, + { + "epoch": 0.11392063834375674, + "grad_norm": 0.8491646647453308, + "learning_rate": 9.981797921223633e-06, + "loss": 0.878, + "step": 2113 + }, + { + "epoch": 0.11397455251240025, + "grad_norm": 0.8166778087615967, + "learning_rate": 9.981779843196533e-06, + "loss": 0.918, + "step": 2114 + }, + { + "epoch": 0.11402846668104377, + "grad_norm": 
0.8016941547393799, + "learning_rate": 9.981761756212867e-06, + "loss": 0.7958, + "step": 2115 + }, + { + "epoch": 0.1140823808496873, + "grad_norm": 0.9108608961105347, + "learning_rate": 9.981743660272663e-06, + "loss": 0.8645, + "step": 2116 + }, + { + "epoch": 0.11413629501833082, + "grad_norm": 0.8930072784423828, + "learning_rate": 9.981725555375956e-06, + "loss": 0.842, + "step": 2117 + }, + { + "epoch": 0.11419020918697434, + "grad_norm": 0.75871342420578, + "learning_rate": 9.981707441522778e-06, + "loss": 0.7513, + "step": 2118 + }, + { + "epoch": 0.11424412335561786, + "grad_norm": 0.9924628734588623, + "learning_rate": 9.981689318713163e-06, + "loss": 0.8248, + "step": 2119 + }, + { + "epoch": 0.11429803752426138, + "grad_norm": 0.9345909953117371, + "learning_rate": 9.981671186947145e-06, + "loss": 0.7963, + "step": 2120 + }, + { + "epoch": 0.1143519516929049, + "grad_norm": 0.8094825148582458, + "learning_rate": 9.98165304622475e-06, + "loss": 0.8189, + "step": 2121 + }, + { + "epoch": 0.11440586586154841, + "grad_norm": 0.789262056350708, + "learning_rate": 9.981634896546017e-06, + "loss": 0.721, + "step": 2122 + }, + { + "epoch": 0.11445978003019193, + "grad_norm": 0.9279952645301819, + "learning_rate": 9.981616737910975e-06, + "loss": 0.8499, + "step": 2123 + }, + { + "epoch": 0.11451369419883545, + "grad_norm": 0.8332392573356628, + "learning_rate": 9.981598570319657e-06, + "loss": 0.8296, + "step": 2124 + }, + { + "epoch": 0.11456760836747898, + "grad_norm": 0.7957965731620789, + "learning_rate": 9.981580393772098e-06, + "loss": 0.7872, + "step": 2125 + }, + { + "epoch": 0.1146215225361225, + "grad_norm": 0.7587382197380066, + "learning_rate": 9.981562208268331e-06, + "loss": 0.721, + "step": 2126 + }, + { + "epoch": 0.11467543670476602, + "grad_norm": 0.7246111631393433, + "learning_rate": 9.981544013808385e-06, + "loss": 0.7965, + "step": 2127 + }, + { + "epoch": 0.11472935087340953, + "grad_norm": 0.9953028559684753, + "learning_rate": 
9.981525810392295e-06, + "loss": 0.7129, + "step": 2128 + }, + { + "epoch": 0.11478326504205305, + "grad_norm": 1.0731823444366455, + "learning_rate": 9.981507598020094e-06, + "loss": 0.8532, + "step": 2129 + }, + { + "epoch": 0.11483717921069657, + "grad_norm": 0.8425208926200867, + "learning_rate": 9.981489376691814e-06, + "loss": 0.8191, + "step": 2130 + }, + { + "epoch": 0.11489109337934009, + "grad_norm": 0.7841627597808838, + "learning_rate": 9.981471146407487e-06, + "loss": 0.7946, + "step": 2131 + }, + { + "epoch": 0.1149450075479836, + "grad_norm": 0.8923974633216858, + "learning_rate": 9.981452907167148e-06, + "loss": 0.8445, + "step": 2132 + }, + { + "epoch": 0.11499892171662712, + "grad_norm": 0.7729552984237671, + "learning_rate": 9.981434658970828e-06, + "loss": 0.7566, + "step": 2133 + }, + { + "epoch": 0.11505283588527065, + "grad_norm": 0.910899817943573, + "learning_rate": 9.98141640181856e-06, + "loss": 0.8236, + "step": 2134 + }, + { + "epoch": 0.11510675005391417, + "grad_norm": 0.8768936395645142, + "learning_rate": 9.981398135710377e-06, + "loss": 0.8929, + "step": 2135 + }, + { + "epoch": 0.11516066422255769, + "grad_norm": 0.9078627824783325, + "learning_rate": 9.981379860646313e-06, + "loss": 0.745, + "step": 2136 + }, + { + "epoch": 0.11521457839120121, + "grad_norm": 0.8225182890892029, + "learning_rate": 9.981361576626399e-06, + "loss": 0.8349, + "step": 2137 + }, + { + "epoch": 0.11526849255984473, + "grad_norm": 0.8092076778411865, + "learning_rate": 9.981343283650668e-06, + "loss": 0.8157, + "step": 2138 + }, + { + "epoch": 0.11532240672848824, + "grad_norm": 0.8253282308578491, + "learning_rate": 9.981324981719156e-06, + "loss": 0.7412, + "step": 2139 + }, + { + "epoch": 0.11537632089713176, + "grad_norm": 0.9668901562690735, + "learning_rate": 9.981306670831892e-06, + "loss": 0.7868, + "step": 2140 + }, + { + "epoch": 0.11543023506577528, + "grad_norm": 0.7919616103172302, + "learning_rate": 9.981288350988911e-06, + "loss": 0.7384, 
+ "step": 2141 + }, + { + "epoch": 0.11548414923441881, + "grad_norm": 0.8589178919792175, + "learning_rate": 9.981270022190244e-06, + "loss": 0.8352, + "step": 2142 + }, + { + "epoch": 0.11553806340306233, + "grad_norm": 0.8211520910263062, + "learning_rate": 9.981251684435926e-06, + "loss": 0.8124, + "step": 2143 + }, + { + "epoch": 0.11559197757170585, + "grad_norm": 0.911702573299408, + "learning_rate": 9.98123333772599e-06, + "loss": 0.8468, + "step": 2144 + }, + { + "epoch": 0.11564589174034937, + "grad_norm": 0.7934874892234802, + "learning_rate": 9.981214982060469e-06, + "loss": 0.8091, + "step": 2145 + }, + { + "epoch": 0.11569980590899288, + "grad_norm": 0.7407031655311584, + "learning_rate": 9.981196617439394e-06, + "loss": 0.7755, + "step": 2146 + }, + { + "epoch": 0.1157537200776364, + "grad_norm": 0.757688581943512, + "learning_rate": 9.9811782438628e-06, + "loss": 0.7468, + "step": 2147 + }, + { + "epoch": 0.11580763424627992, + "grad_norm": 1.0007857084274292, + "learning_rate": 9.981159861330717e-06, + "loss": 0.9108, + "step": 2148 + }, + { + "epoch": 0.11586154841492344, + "grad_norm": 1.300113558769226, + "learning_rate": 9.981141469843183e-06, + "loss": 0.8099, + "step": 2149 + }, + { + "epoch": 0.11591546258356696, + "grad_norm": 1.0352274179458618, + "learning_rate": 9.981123069400226e-06, + "loss": 0.801, + "step": 2150 + }, + { + "epoch": 0.11596937675221049, + "grad_norm": 0.9033756256103516, + "learning_rate": 9.981104660001885e-06, + "loss": 0.8789, + "step": 2151 + }, + { + "epoch": 0.116023290920854, + "grad_norm": 0.9051264524459839, + "learning_rate": 9.981086241648188e-06, + "loss": 0.8737, + "step": 2152 + }, + { + "epoch": 0.11607720508949752, + "grad_norm": 0.7855859398841858, + "learning_rate": 9.98106781433917e-06, + "loss": 0.7508, + "step": 2153 + }, + { + "epoch": 0.11613111925814104, + "grad_norm": 0.9001717567443848, + "learning_rate": 9.981049378074862e-06, + "loss": 0.6852, + "step": 2154 + }, + { + "epoch": 
0.11618503342678456, + "grad_norm": 0.8165149092674255, + "learning_rate": 9.9810309328553e-06, + "loss": 0.8755, + "step": 2155 + }, + { + "epoch": 0.11623894759542808, + "grad_norm": 0.8920814990997314, + "learning_rate": 9.981012478680517e-06, + "loss": 0.753, + "step": 2156 + }, + { + "epoch": 0.1162928617640716, + "grad_norm": 0.8186051249504089, + "learning_rate": 9.980994015550544e-06, + "loss": 0.8341, + "step": 2157 + }, + { + "epoch": 0.11634677593271511, + "grad_norm": 0.8103832602500916, + "learning_rate": 9.980975543465417e-06, + "loss": 0.8276, + "step": 2158 + }, + { + "epoch": 0.11640069010135863, + "grad_norm": 0.8752830028533936, + "learning_rate": 9.980957062425167e-06, + "loss": 0.8449, + "step": 2159 + }, + { + "epoch": 0.11645460427000216, + "grad_norm": 0.9748302698135376, + "learning_rate": 9.98093857242983e-06, + "loss": 0.8323, + "step": 2160 + }, + { + "epoch": 0.11650851843864568, + "grad_norm": 0.8948556184768677, + "learning_rate": 9.980920073479435e-06, + "loss": 0.7836, + "step": 2161 + }, + { + "epoch": 0.1165624326072892, + "grad_norm": 0.8715651035308838, + "learning_rate": 9.980901565574017e-06, + "loss": 0.7942, + "step": 2162 + }, + { + "epoch": 0.11661634677593272, + "grad_norm": 0.7667563557624817, + "learning_rate": 9.980883048713612e-06, + "loss": 0.7517, + "step": 2163 + }, + { + "epoch": 0.11667026094457623, + "grad_norm": 0.8058063387870789, + "learning_rate": 9.980864522898247e-06, + "loss": 0.7997, + "step": 2164 + }, + { + "epoch": 0.11672417511321975, + "grad_norm": 0.9300008416175842, + "learning_rate": 9.980845988127963e-06, + "loss": 0.856, + "step": 2165 + }, + { + "epoch": 0.11677808928186327, + "grad_norm": 0.8321848511695862, + "learning_rate": 9.98082744440279e-06, + "loss": 0.7483, + "step": 2166 + }, + { + "epoch": 0.11683200345050679, + "grad_norm": 0.9346274137496948, + "learning_rate": 9.98080889172276e-06, + "loss": 0.8149, + "step": 2167 + }, + { + "epoch": 0.1168859176191503, + "grad_norm": 
0.9119831919670105, + "learning_rate": 9.980790330087906e-06, + "loss": 0.8384, + "step": 2168 + }, + { + "epoch": 0.11693983178779384, + "grad_norm": 0.8416613936424255, + "learning_rate": 9.980771759498264e-06, + "loss": 0.776, + "step": 2169 + }, + { + "epoch": 0.11699374595643736, + "grad_norm": 0.765889048576355, + "learning_rate": 9.980753179953867e-06, + "loss": 0.7413, + "step": 2170 + }, + { + "epoch": 0.11704766012508087, + "grad_norm": 1.3491352796554565, + "learning_rate": 9.980734591454746e-06, + "loss": 0.7444, + "step": 2171 + }, + { + "epoch": 0.11710157429372439, + "grad_norm": 0.926618218421936, + "learning_rate": 9.980715994000936e-06, + "loss": 0.8495, + "step": 2172 + }, + { + "epoch": 0.11715548846236791, + "grad_norm": 0.7720175981521606, + "learning_rate": 9.98069738759247e-06, + "loss": 0.8238, + "step": 2173 + }, + { + "epoch": 0.11720940263101143, + "grad_norm": 0.9114102125167847, + "learning_rate": 9.980678772229385e-06, + "loss": 0.7805, + "step": 2174 + }, + { + "epoch": 0.11726331679965495, + "grad_norm": 0.778404712677002, + "learning_rate": 9.980660147911709e-06, + "loss": 0.7705, + "step": 2175 + }, + { + "epoch": 0.11731723096829846, + "grad_norm": 0.7945864200592041, + "learning_rate": 9.980641514639478e-06, + "loss": 0.7052, + "step": 2176 + }, + { + "epoch": 0.11737114513694198, + "grad_norm": 0.8246831297874451, + "learning_rate": 9.980622872412723e-06, + "loss": 0.8514, + "step": 2177 + }, + { + "epoch": 0.11742505930558551, + "grad_norm": 0.899563193321228, + "learning_rate": 9.980604221231482e-06, + "loss": 0.761, + "step": 2178 + }, + { + "epoch": 0.11747897347422903, + "grad_norm": 0.7277782559394836, + "learning_rate": 9.980585561095788e-06, + "loss": 0.6671, + "step": 2179 + }, + { + "epoch": 0.11753288764287255, + "grad_norm": 0.7977896928787231, + "learning_rate": 9.98056689200567e-06, + "loss": 0.8045, + "step": 2180 + }, + { + "epoch": 0.11758680181151607, + "grad_norm": 0.8606321811676025, + "learning_rate": 
9.980548213961165e-06, + "loss": 0.8232, + "step": 2181 + }, + { + "epoch": 0.11764071598015959, + "grad_norm": 0.769458532333374, + "learning_rate": 9.980529526962308e-06, + "loss": 0.729, + "step": 2182 + }, + { + "epoch": 0.1176946301488031, + "grad_norm": 1.1045739650726318, + "learning_rate": 9.98051083100913e-06, + "loss": 0.802, + "step": 2183 + }, + { + "epoch": 0.11774854431744662, + "grad_norm": 0.7568592429161072, + "learning_rate": 9.980492126101664e-06, + "loss": 0.7427, + "step": 2184 + }, + { + "epoch": 0.11780245848609014, + "grad_norm": 0.7503477931022644, + "learning_rate": 9.980473412239946e-06, + "loss": 0.7857, + "step": 2185 + }, + { + "epoch": 0.11785637265473366, + "grad_norm": 0.8330819606781006, + "learning_rate": 9.980454689424007e-06, + "loss": 0.7561, + "step": 2186 + }, + { + "epoch": 0.11791028682337719, + "grad_norm": 0.792736291885376, + "learning_rate": 9.980435957653884e-06, + "loss": 0.837, + "step": 2187 + }, + { + "epoch": 0.1179642009920207, + "grad_norm": 0.8983330130577087, + "learning_rate": 9.980417216929608e-06, + "loss": 0.8499, + "step": 2188 + }, + { + "epoch": 0.11801811516066422, + "grad_norm": 0.8700925707817078, + "learning_rate": 9.980398467251214e-06, + "loss": 0.9048, + "step": 2189 + }, + { + "epoch": 0.11807202932930774, + "grad_norm": 0.8873588442802429, + "learning_rate": 9.980379708618734e-06, + "loss": 0.7617, + "step": 2190 + }, + { + "epoch": 0.11812594349795126, + "grad_norm": 0.7786865234375, + "learning_rate": 9.980360941032204e-06, + "loss": 0.7828, + "step": 2191 + }, + { + "epoch": 0.11817985766659478, + "grad_norm": 0.796852171421051, + "learning_rate": 9.980342164491657e-06, + "loss": 0.7739, + "step": 2192 + }, + { + "epoch": 0.1182337718352383, + "grad_norm": 0.7752018570899963, + "learning_rate": 9.980323378997126e-06, + "loss": 0.6969, + "step": 2193 + }, + { + "epoch": 0.11828768600388181, + "grad_norm": 0.8607134819030762, + "learning_rate": 9.980304584548644e-06, + "loss": 0.8623, + 
"step": 2194 + }, + { + "epoch": 0.11834160017252535, + "grad_norm": 0.8624950051307678, + "learning_rate": 9.980285781146248e-06, + "loss": 0.8124, + "step": 2195 + }, + { + "epoch": 0.11839551434116886, + "grad_norm": 0.8951582908630371, + "learning_rate": 9.98026696878997e-06, + "loss": 0.8491, + "step": 2196 + }, + { + "epoch": 0.11844942850981238, + "grad_norm": 0.8373478055000305, + "learning_rate": 9.980248147479843e-06, + "loss": 0.7166, + "step": 2197 + }, + { + "epoch": 0.1185033426784559, + "grad_norm": 0.8007619976997375, + "learning_rate": 9.980229317215901e-06, + "loss": 0.8137, + "step": 2198 + }, + { + "epoch": 0.11855725684709942, + "grad_norm": 0.8464154601097107, + "learning_rate": 9.980210477998177e-06, + "loss": 0.7803, + "step": 2199 + }, + { + "epoch": 0.11861117101574294, + "grad_norm": 0.8384450078010559, + "learning_rate": 9.98019162982671e-06, + "loss": 0.8511, + "step": 2200 + }, + { + "epoch": 0.11866508518438645, + "grad_norm": 0.9059091210365295, + "learning_rate": 9.980172772701527e-06, + "loss": 0.8538, + "step": 2201 + }, + { + "epoch": 0.11871899935302997, + "grad_norm": 1.1080526113510132, + "learning_rate": 9.980153906622667e-06, + "loss": 1.0067, + "step": 2202 + }, + { + "epoch": 0.11877291352167349, + "grad_norm": 0.8379873633384705, + "learning_rate": 9.980135031590162e-06, + "loss": 0.8285, + "step": 2203 + }, + { + "epoch": 0.11882682769031702, + "grad_norm": 0.9143814444541931, + "learning_rate": 9.980116147604044e-06, + "loss": 0.8286, + "step": 2204 + }, + { + "epoch": 0.11888074185896054, + "grad_norm": 0.8619917631149292, + "learning_rate": 9.98009725466435e-06, + "loss": 0.8304, + "step": 2205 + }, + { + "epoch": 0.11893465602760406, + "grad_norm": 0.8470893502235413, + "learning_rate": 9.980078352771112e-06, + "loss": 0.8245, + "step": 2206 + }, + { + "epoch": 0.11898857019624758, + "grad_norm": 0.9560073614120483, + "learning_rate": 9.980059441924365e-06, + "loss": 0.8821, + "step": 2207 + }, + { + "epoch": 
0.1190424843648911, + "grad_norm": 0.8186134696006775, + "learning_rate": 9.980040522124143e-06, + "loss": 0.7166, + "step": 2208 + }, + { + "epoch": 0.11909639853353461, + "grad_norm": 0.8410859704017639, + "learning_rate": 9.980021593370481e-06, + "loss": 0.7465, + "step": 2209 + }, + { + "epoch": 0.11915031270217813, + "grad_norm": 0.9180718660354614, + "learning_rate": 9.980002655663412e-06, + "loss": 0.8508, + "step": 2210 + }, + { + "epoch": 0.11920422687082165, + "grad_norm": 0.8384451270103455, + "learning_rate": 9.979983709002967e-06, + "loss": 0.7723, + "step": 2211 + }, + { + "epoch": 0.11925814103946517, + "grad_norm": 0.815075159072876, + "learning_rate": 9.979964753389187e-06, + "loss": 0.7769, + "step": 2212 + }, + { + "epoch": 0.1193120552081087, + "grad_norm": 0.9130523800849915, + "learning_rate": 9.9799457888221e-06, + "loss": 0.8616, + "step": 2213 + }, + { + "epoch": 0.11936596937675222, + "grad_norm": 0.8262661099433899, + "learning_rate": 9.97992681530174e-06, + "loss": 0.7507, + "step": 2214 + }, + { + "epoch": 0.11941988354539573, + "grad_norm": 0.8962772488594055, + "learning_rate": 9.979907832828145e-06, + "loss": 0.8387, + "step": 2215 + }, + { + "epoch": 0.11947379771403925, + "grad_norm": 0.8966812491416931, + "learning_rate": 9.979888841401348e-06, + "loss": 0.8095, + "step": 2216 + }, + { + "epoch": 0.11952771188268277, + "grad_norm": 0.8484013676643372, + "learning_rate": 9.979869841021381e-06, + "loss": 0.8475, + "step": 2217 + }, + { + "epoch": 0.11958162605132629, + "grad_norm": 0.8858511447906494, + "learning_rate": 9.979850831688282e-06, + "loss": 0.8576, + "step": 2218 + }, + { + "epoch": 0.1196355402199698, + "grad_norm": 0.8044704794883728, + "learning_rate": 9.97983181340208e-06, + "loss": 0.8195, + "step": 2219 + }, + { + "epoch": 0.11968945438861332, + "grad_norm": 0.8463665246963501, + "learning_rate": 9.979812786162815e-06, + "loss": 0.8177, + "step": 2220 + }, + { + "epoch": 0.11974336855725684, + "grad_norm": 
0.8145734071731567, + "learning_rate": 9.979793749970517e-06, + "loss": 0.8307, + "step": 2221 + }, + { + "epoch": 0.11979728272590037, + "grad_norm": 0.7789961695671082, + "learning_rate": 9.97977470482522e-06, + "loss": 0.7854, + "step": 2222 + }, + { + "epoch": 0.11985119689454389, + "grad_norm": 0.858213484287262, + "learning_rate": 9.97975565072696e-06, + "loss": 0.8914, + "step": 2223 + }, + { + "epoch": 0.11990511106318741, + "grad_norm": 0.8503074645996094, + "learning_rate": 9.979736587675772e-06, + "loss": 0.8731, + "step": 2224 + }, + { + "epoch": 0.11995902523183093, + "grad_norm": 0.9815833568572998, + "learning_rate": 9.97971751567169e-06, + "loss": 0.8769, + "step": 2225 + }, + { + "epoch": 0.12001293940047444, + "grad_norm": 0.7897947430610657, + "learning_rate": 9.979698434714747e-06, + "loss": 0.8308, + "step": 2226 + }, + { + "epoch": 0.12006685356911796, + "grad_norm": 0.9122232794761658, + "learning_rate": 9.979679344804976e-06, + "loss": 0.8934, + "step": 2227 + }, + { + "epoch": 0.12012076773776148, + "grad_norm": 0.7640379071235657, + "learning_rate": 9.979660245942416e-06, + "loss": 0.8205, + "step": 2228 + }, + { + "epoch": 0.120174681906405, + "grad_norm": 0.8736944198608398, + "learning_rate": 9.979641138127097e-06, + "loss": 0.8522, + "step": 2229 + }, + { + "epoch": 0.12022859607504852, + "grad_norm": 0.8782697916030884, + "learning_rate": 9.979622021359054e-06, + "loss": 0.812, + "step": 2230 + }, + { + "epoch": 0.12028251024369205, + "grad_norm": 0.8260065317153931, + "learning_rate": 9.979602895638322e-06, + "loss": 0.768, + "step": 2231 + }, + { + "epoch": 0.12033642441233557, + "grad_norm": 0.8338255286216736, + "learning_rate": 9.979583760964939e-06, + "loss": 0.7747, + "step": 2232 + }, + { + "epoch": 0.12039033858097908, + "grad_norm": 0.8310086131095886, + "learning_rate": 9.979564617338933e-06, + "loss": 0.8206, + "step": 2233 + }, + { + "epoch": 0.1204442527496226, + "grad_norm": 0.8234529495239258, + "learning_rate": 
9.979545464760342e-06, + "loss": 0.847, + "step": 2234 + }, + { + "epoch": 0.12049816691826612, + "grad_norm": 0.9490135908126831, + "learning_rate": 9.9795263032292e-06, + "loss": 0.7277, + "step": 2235 + }, + { + "epoch": 0.12055208108690964, + "grad_norm": 0.8937979340553284, + "learning_rate": 9.97950713274554e-06, + "loss": 0.8714, + "step": 2236 + }, + { + "epoch": 0.12060599525555316, + "grad_norm": 0.7739347219467163, + "learning_rate": 9.9794879533094e-06, + "loss": 0.8009, + "step": 2237 + }, + { + "epoch": 0.12065990942419667, + "grad_norm": 0.8843472003936768, + "learning_rate": 9.979468764920812e-06, + "loss": 0.7748, + "step": 2238 + }, + { + "epoch": 0.12071382359284019, + "grad_norm": 0.815528154373169, + "learning_rate": 9.979449567579809e-06, + "loss": 0.7896, + "step": 2239 + }, + { + "epoch": 0.12076773776148372, + "grad_norm": 0.8802885413169861, + "learning_rate": 9.979430361286428e-06, + "loss": 0.8468, + "step": 2240 + }, + { + "epoch": 0.12082165193012724, + "grad_norm": 0.7907035946846008, + "learning_rate": 9.979411146040703e-06, + "loss": 0.7742, + "step": 2241 + }, + { + "epoch": 0.12087556609877076, + "grad_norm": 0.8344926238059998, + "learning_rate": 9.979391921842669e-06, + "loss": 0.8242, + "step": 2242 + }, + { + "epoch": 0.12092948026741428, + "grad_norm": 0.8011842370033264, + "learning_rate": 9.979372688692359e-06, + "loss": 0.7697, + "step": 2243 + }, + { + "epoch": 0.1209833944360578, + "grad_norm": 0.9063104391098022, + "learning_rate": 9.97935344658981e-06, + "loss": 0.8487, + "step": 2244 + }, + { + "epoch": 0.12103730860470131, + "grad_norm": 0.8313894867897034, + "learning_rate": 9.979334195535053e-06, + "loss": 0.8601, + "step": 2245 + }, + { + "epoch": 0.12109122277334483, + "grad_norm": 0.7892987728118896, + "learning_rate": 9.979314935528125e-06, + "loss": 0.7539, + "step": 2246 + }, + { + "epoch": 0.12114513694198835, + "grad_norm": 0.8141210079193115, + "learning_rate": 9.979295666569062e-06, + "loss": 0.8749, + 
"step": 2247 + }, + { + "epoch": 0.12119905111063188, + "grad_norm": 0.8218675851821899, + "learning_rate": 9.979276388657895e-06, + "loss": 0.743, + "step": 2248 + }, + { + "epoch": 0.1212529652792754, + "grad_norm": 0.8640784025192261, + "learning_rate": 9.979257101794661e-06, + "loss": 0.8876, + "step": 2249 + }, + { + "epoch": 0.12130687944791892, + "grad_norm": 0.8411698341369629, + "learning_rate": 9.979237805979395e-06, + "loss": 0.8692, + "step": 2250 + }, + { + "epoch": 0.12136079361656243, + "grad_norm": 0.9402859210968018, + "learning_rate": 9.97921850121213e-06, + "loss": 0.9362, + "step": 2251 + }, + { + "epoch": 0.12141470778520595, + "grad_norm": 0.8132252097129822, + "learning_rate": 9.979199187492903e-06, + "loss": 0.8119, + "step": 2252 + }, + { + "epoch": 0.12146862195384947, + "grad_norm": 0.9142205119132996, + "learning_rate": 9.979179864821747e-06, + "loss": 0.8219, + "step": 2253 + }, + { + "epoch": 0.12152253612249299, + "grad_norm": 0.9614750742912292, + "learning_rate": 9.979160533198697e-06, + "loss": 0.8342, + "step": 2254 + }, + { + "epoch": 0.1215764502911365, + "grad_norm": 0.7893047332763672, + "learning_rate": 9.979141192623787e-06, + "loss": 0.7111, + "step": 2255 + }, + { + "epoch": 0.12163036445978002, + "grad_norm": 0.8807032704353333, + "learning_rate": 9.979121843097053e-06, + "loss": 0.7677, + "step": 2256 + }, + { + "epoch": 0.12168427862842356, + "grad_norm": 1.1099025011062622, + "learning_rate": 9.97910248461853e-06, + "loss": 0.9548, + "step": 2257 + }, + { + "epoch": 0.12173819279706707, + "grad_norm": 0.9182586669921875, + "learning_rate": 9.979083117188253e-06, + "loss": 0.8734, + "step": 2258 + }, + { + "epoch": 0.12179210696571059, + "grad_norm": 0.9201869964599609, + "learning_rate": 9.979063740806253e-06, + "loss": 0.823, + "step": 2259 + }, + { + "epoch": 0.12184602113435411, + "grad_norm": 1.0309760570526123, + "learning_rate": 9.979044355472571e-06, + "loss": 0.7175, + "step": 2260 + }, + { + "epoch": 
0.12189993530299763, + "grad_norm": 0.8577457070350647, + "learning_rate": 9.979024961187238e-06, + "loss": 0.8963, + "step": 2261 + }, + { + "epoch": 0.12195384947164115, + "grad_norm": 0.8203986883163452, + "learning_rate": 9.97900555795029e-06, + "loss": 0.736, + "step": 2262 + }, + { + "epoch": 0.12200776364028466, + "grad_norm": 0.8232439160346985, + "learning_rate": 9.97898614576176e-06, + "loss": 0.8104, + "step": 2263 + }, + { + "epoch": 0.12206167780892818, + "grad_norm": 1.276479959487915, + "learning_rate": 9.978966724621686e-06, + "loss": 0.7975, + "step": 2264 + }, + { + "epoch": 0.1221155919775717, + "grad_norm": 1.0115424394607544, + "learning_rate": 9.978947294530102e-06, + "loss": 1.0566, + "step": 2265 + }, + { + "epoch": 0.12216950614621523, + "grad_norm": 0.8645843863487244, + "learning_rate": 9.97892785548704e-06, + "loss": 0.8772, + "step": 2266 + }, + { + "epoch": 0.12222342031485875, + "grad_norm": 0.8335905075073242, + "learning_rate": 9.978908407492539e-06, + "loss": 0.7735, + "step": 2267 + }, + { + "epoch": 0.12227733448350227, + "grad_norm": 0.7752977013587952, + "learning_rate": 9.978888950546632e-06, + "loss": 0.725, + "step": 2268 + }, + { + "epoch": 0.12233124865214579, + "grad_norm": 0.9533143639564514, + "learning_rate": 9.978869484649354e-06, + "loss": 0.7845, + "step": 2269 + }, + { + "epoch": 0.1223851628207893, + "grad_norm": 1.2071044445037842, + "learning_rate": 9.978850009800739e-06, + "loss": 0.8394, + "step": 2270 + }, + { + "epoch": 0.12243907698943282, + "grad_norm": 0.8296889662742615, + "learning_rate": 9.978830526000825e-06, + "loss": 0.8088, + "step": 2271 + }, + { + "epoch": 0.12249299115807634, + "grad_norm": 0.7804126739501953, + "learning_rate": 9.978811033249643e-06, + "loss": 0.8174, + "step": 2272 + }, + { + "epoch": 0.12254690532671986, + "grad_norm": 0.9114241600036621, + "learning_rate": 9.978791531547232e-06, + "loss": 0.8601, + "step": 2273 + }, + { + "epoch": 0.12260081949536338, + "grad_norm": 
0.9482108354568481, + "learning_rate": 9.978772020893626e-06, + "loss": 0.8063, + "step": 2274 + }, + { + "epoch": 0.1226547336640069, + "grad_norm": 0.7750483751296997, + "learning_rate": 9.978752501288857e-06, + "loss": 0.7875, + "step": 2275 + }, + { + "epoch": 0.12270864783265042, + "grad_norm": 0.838796854019165, + "learning_rate": 9.978732972732964e-06, + "loss": 0.7617, + "step": 2276 + }, + { + "epoch": 0.12276256200129394, + "grad_norm": 0.8419491052627563, + "learning_rate": 9.97871343522598e-06, + "loss": 0.8438, + "step": 2277 + }, + { + "epoch": 0.12281647616993746, + "grad_norm": 0.8125029802322388, + "learning_rate": 9.97869388876794e-06, + "loss": 0.8376, + "step": 2278 + }, + { + "epoch": 0.12287039033858098, + "grad_norm": 0.8310109972953796, + "learning_rate": 9.978674333358882e-06, + "loss": 0.8159, + "step": 2279 + }, + { + "epoch": 0.1229243045072245, + "grad_norm": 0.9533166289329529, + "learning_rate": 9.978654768998838e-06, + "loss": 0.8911, + "step": 2280 + }, + { + "epoch": 0.12297821867586801, + "grad_norm": 0.7564504742622375, + "learning_rate": 9.978635195687845e-06, + "loss": 0.7685, + "step": 2281 + }, + { + "epoch": 0.12303213284451153, + "grad_norm": 0.7912551760673523, + "learning_rate": 9.978615613425937e-06, + "loss": 0.7392, + "step": 2282 + }, + { + "epoch": 0.12308604701315505, + "grad_norm": 0.8196814656257629, + "learning_rate": 9.978596022213148e-06, + "loss": 0.8619, + "step": 2283 + }, + { + "epoch": 0.12313996118179858, + "grad_norm": 0.9053134918212891, + "learning_rate": 9.978576422049515e-06, + "loss": 0.8822, + "step": 2284 + }, + { + "epoch": 0.1231938753504421, + "grad_norm": 0.7988365292549133, + "learning_rate": 9.978556812935074e-06, + "loss": 0.7993, + "step": 2285 + }, + { + "epoch": 0.12324778951908562, + "grad_norm": 0.7595045566558838, + "learning_rate": 9.978537194869859e-06, + "loss": 0.7589, + "step": 2286 + }, + { + "epoch": 0.12330170368772914, + "grad_norm": 0.872302234172821, + "learning_rate": 
9.978517567853908e-06, + "loss": 0.8315, + "step": 2287 + }, + { + "epoch": 0.12335561785637265, + "grad_norm": 0.8375674486160278, + "learning_rate": 9.97849793188725e-06, + "loss": 0.8348, + "step": 2288 + }, + { + "epoch": 0.12340953202501617, + "grad_norm": 0.8239575624465942, + "learning_rate": 9.978478286969927e-06, + "loss": 0.7636, + "step": 2289 + }, + { + "epoch": 0.12346344619365969, + "grad_norm": 0.8614348769187927, + "learning_rate": 9.97845863310197e-06, + "loss": 0.8162, + "step": 2290 + }, + { + "epoch": 0.12351736036230321, + "grad_norm": 0.8609321713447571, + "learning_rate": 9.978438970283417e-06, + "loss": 0.7776, + "step": 2291 + }, + { + "epoch": 0.12357127453094673, + "grad_norm": 0.9590173959732056, + "learning_rate": 9.978419298514302e-06, + "loss": 0.8761, + "step": 2292 + }, + { + "epoch": 0.12362518869959026, + "grad_norm": 0.8345216512680054, + "learning_rate": 9.978399617794659e-06, + "loss": 0.8353, + "step": 2293 + }, + { + "epoch": 0.12367910286823378, + "grad_norm": 0.8771556615829468, + "learning_rate": 9.978379928124526e-06, + "loss": 0.773, + "step": 2294 + }, + { + "epoch": 0.1237330170368773, + "grad_norm": 0.8305835127830505, + "learning_rate": 9.978360229503936e-06, + "loss": 0.7898, + "step": 2295 + }, + { + "epoch": 0.12378693120552081, + "grad_norm": 0.8536269664764404, + "learning_rate": 9.978340521932927e-06, + "loss": 0.8261, + "step": 2296 + }, + { + "epoch": 0.12384084537416433, + "grad_norm": 0.9008522629737854, + "learning_rate": 9.978320805411534e-06, + "loss": 0.7114, + "step": 2297 + }, + { + "epoch": 0.12389475954280785, + "grad_norm": 0.7834939956665039, + "learning_rate": 9.97830107993979e-06, + "loss": 0.7338, + "step": 2298 + }, + { + "epoch": 0.12394867371145137, + "grad_norm": 0.8269515037536621, + "learning_rate": 9.978281345517733e-06, + "loss": 0.7676, + "step": 2299 + }, + { + "epoch": 0.12400258788009488, + "grad_norm": 0.8482736945152283, + "learning_rate": 9.978261602145398e-06, + "loss": 0.8185, 
+ "step": 2300 + }, + { + "epoch": 0.12405650204873842, + "grad_norm": 0.8833953142166138, + "learning_rate": 9.978241849822819e-06, + "loss": 0.7776, + "step": 2301 + }, + { + "epoch": 0.12411041621738193, + "grad_norm": 0.8089832067489624, + "learning_rate": 9.978222088550033e-06, + "loss": 0.7697, + "step": 2302 + }, + { + "epoch": 0.12416433038602545, + "grad_norm": 0.8204466104507446, + "learning_rate": 9.978202318327075e-06, + "loss": 0.839, + "step": 2303 + }, + { + "epoch": 0.12421824455466897, + "grad_norm": 0.8547719120979309, + "learning_rate": 9.97818253915398e-06, + "loss": 0.9022, + "step": 2304 + }, + { + "epoch": 0.12427215872331249, + "grad_norm": 1.090289831161499, + "learning_rate": 9.978162751030787e-06, + "loss": 0.7154, + "step": 2305 + }, + { + "epoch": 0.124326072891956, + "grad_norm": 0.88922518491745, + "learning_rate": 9.978142953957526e-06, + "loss": 0.8962, + "step": 2306 + }, + { + "epoch": 0.12437998706059952, + "grad_norm": 0.8741730451583862, + "learning_rate": 9.978123147934236e-06, + "loss": 0.7742, + "step": 2307 + }, + { + "epoch": 0.12443390122924304, + "grad_norm": 1.2885240316390991, + "learning_rate": 9.97810333296095e-06, + "loss": 0.7256, + "step": 2308 + }, + { + "epoch": 0.12448781539788656, + "grad_norm": 0.7973229885101318, + "learning_rate": 9.978083509037711e-06, + "loss": 0.8433, + "step": 2309 + }, + { + "epoch": 0.12454172956653009, + "grad_norm": 0.8328043222427368, + "learning_rate": 9.978063676164544e-06, + "loss": 0.8617, + "step": 2310 + }, + { + "epoch": 0.12459564373517361, + "grad_norm": 0.8093283176422119, + "learning_rate": 9.978043834341493e-06, + "loss": 0.8407, + "step": 2311 + }, + { + "epoch": 0.12464955790381713, + "grad_norm": 0.7566602826118469, + "learning_rate": 9.978023983568588e-06, + "loss": 0.7602, + "step": 2312 + }, + { + "epoch": 0.12470347207246064, + "grad_norm": 0.7731996178627014, + "learning_rate": 9.97800412384587e-06, + "loss": 0.8323, + "step": 2313 + }, + { + "epoch": 
0.12475738624110416, + "grad_norm": 0.9148348569869995, + "learning_rate": 9.97798425517337e-06, + "loss": 0.7886, + "step": 2314 + }, + { + "epoch": 0.12481130040974768, + "grad_norm": 0.8546224236488342, + "learning_rate": 9.977964377551126e-06, + "loss": 0.8116, + "step": 2315 + }, + { + "epoch": 0.1248652145783912, + "grad_norm": 1.0733944177627563, + "learning_rate": 9.977944490979175e-06, + "loss": 0.8255, + "step": 2316 + }, + { + "epoch": 0.12491912874703472, + "grad_norm": 0.8404545783996582, + "learning_rate": 9.977924595457549e-06, + "loss": 0.8542, + "step": 2317 + }, + { + "epoch": 0.12497304291567823, + "grad_norm": 0.8276603817939758, + "learning_rate": 9.977904690986286e-06, + "loss": 0.8242, + "step": 2318 + }, + { + "epoch": 0.12502695708432177, + "grad_norm": 0.8703106641769409, + "learning_rate": 9.977884777565423e-06, + "loss": 0.8525, + "step": 2319 + }, + { + "epoch": 0.12508087125296527, + "grad_norm": 0.8353367447853088, + "learning_rate": 9.977864855194994e-06, + "loss": 0.7921, + "step": 2320 + }, + { + "epoch": 0.1251347854216088, + "grad_norm": 0.8283559083938599, + "learning_rate": 9.977844923875036e-06, + "loss": 0.8262, + "step": 2321 + }, + { + "epoch": 0.1251886995902523, + "grad_norm": 0.8737161755561829, + "learning_rate": 9.977824983605584e-06, + "loss": 0.9117, + "step": 2322 + }, + { + "epoch": 0.12524261375889584, + "grad_norm": 0.8616884350776672, + "learning_rate": 9.977805034386675e-06, + "loss": 0.8178, + "step": 2323 + }, + { + "epoch": 0.12529652792753937, + "grad_norm": 0.9863162636756897, + "learning_rate": 9.977785076218342e-06, + "loss": 0.8671, + "step": 2324 + }, + { + "epoch": 0.12535044209618287, + "grad_norm": 0.9636940360069275, + "learning_rate": 9.977765109100624e-06, + "loss": 0.894, + "step": 2325 + }, + { + "epoch": 0.1254043562648264, + "grad_norm": 0.741320013999939, + "learning_rate": 9.977745133033554e-06, + "loss": 0.7474, + "step": 2326 + }, + { + "epoch": 0.1254582704334699, + "grad_norm": 
0.7776119709014893, + "learning_rate": 9.97772514801717e-06, + "loss": 0.7867, + "step": 2327 + }, + { + "epoch": 0.12551218460211344, + "grad_norm": 0.8219690918922424, + "learning_rate": 9.97770515405151e-06, + "loss": 0.8443, + "step": 2328 + }, + { + "epoch": 0.12556609877075695, + "grad_norm": 0.8977565765380859, + "learning_rate": 9.977685151136605e-06, + "loss": 0.7831, + "step": 2329 + }, + { + "epoch": 0.12562001293940048, + "grad_norm": 0.8503162264823914, + "learning_rate": 9.977665139272495e-06, + "loss": 0.8733, + "step": 2330 + }, + { + "epoch": 0.12567392710804398, + "grad_norm": 0.7666327953338623, + "learning_rate": 9.977645118459213e-06, + "loss": 0.7165, + "step": 2331 + }, + { + "epoch": 0.1257278412766875, + "grad_norm": 0.8265602588653564, + "learning_rate": 9.977625088696797e-06, + "loss": 0.8894, + "step": 2332 + }, + { + "epoch": 0.12578175544533104, + "grad_norm": 0.9852930307388306, + "learning_rate": 9.977605049985282e-06, + "loss": 0.9223, + "step": 2333 + }, + { + "epoch": 0.12583566961397455, + "grad_norm": 0.9563886523246765, + "learning_rate": 9.977585002324705e-06, + "loss": 0.8275, + "step": 2334 + }, + { + "epoch": 0.12588958378261808, + "grad_norm": 0.8098574876785278, + "learning_rate": 9.977564945715102e-06, + "loss": 0.8831, + "step": 2335 + }, + { + "epoch": 0.12594349795126158, + "grad_norm": 0.8795431852340698, + "learning_rate": 9.977544880156507e-06, + "loss": 0.8079, + "step": 2336 + }, + { + "epoch": 0.12599741211990512, + "grad_norm": 0.7483893036842346, + "learning_rate": 9.97752480564896e-06, + "loss": 0.7734, + "step": 2337 + }, + { + "epoch": 0.12605132628854862, + "grad_norm": 0.7988960146903992, + "learning_rate": 9.977504722192493e-06, + "loss": 0.6936, + "step": 2338 + }, + { + "epoch": 0.12610524045719215, + "grad_norm": 0.7945669293403625, + "learning_rate": 9.977484629787143e-06, + "loss": 0.8608, + "step": 2339 + }, + { + "epoch": 0.12615915462583566, + "grad_norm": 0.8720629215240479, + "learning_rate": 
9.977464528432948e-06, + "loss": 0.8656, + "step": 2340 + }, + { + "epoch": 0.1262130687944792, + "grad_norm": 0.8935837745666504, + "learning_rate": 9.977444418129943e-06, + "loss": 0.8854, + "step": 2341 + }, + { + "epoch": 0.12626698296312272, + "grad_norm": 0.8034403324127197, + "learning_rate": 9.977424298878165e-06, + "loss": 0.8422, + "step": 2342 + }, + { + "epoch": 0.12632089713176622, + "grad_norm": 1.0071096420288086, + "learning_rate": 9.977404170677648e-06, + "loss": 0.9105, + "step": 2343 + }, + { + "epoch": 0.12637481130040976, + "grad_norm": 1.0757510662078857, + "learning_rate": 9.97738403352843e-06, + "loss": 0.7454, + "step": 2344 + }, + { + "epoch": 0.12642872546905326, + "grad_norm": 0.7133142352104187, + "learning_rate": 9.977363887430548e-06, + "loss": 0.6814, + "step": 2345 + }, + { + "epoch": 0.1264826396376968, + "grad_norm": 0.769752025604248, + "learning_rate": 9.977343732384035e-06, + "loss": 0.7209, + "step": 2346 + }, + { + "epoch": 0.1265365538063403, + "grad_norm": 0.8043524622917175, + "learning_rate": 9.977323568388933e-06, + "loss": 0.8379, + "step": 2347 + }, + { + "epoch": 0.12659046797498383, + "grad_norm": 0.9236345887184143, + "learning_rate": 9.97730339544527e-06, + "loss": 0.8091, + "step": 2348 + }, + { + "epoch": 0.12664438214362733, + "grad_norm": 0.8852472305297852, + "learning_rate": 9.97728321355309e-06, + "loss": 0.8527, + "step": 2349 + }, + { + "epoch": 0.12669829631227086, + "grad_norm": 0.8866454362869263, + "learning_rate": 9.977263022712425e-06, + "loss": 0.7412, + "step": 2350 + }, + { + "epoch": 0.1267522104809144, + "grad_norm": 0.7950204014778137, + "learning_rate": 9.977242822923311e-06, + "loss": 0.7778, + "step": 2351 + }, + { + "epoch": 0.1268061246495579, + "grad_norm": 0.8775694966316223, + "learning_rate": 9.977222614185787e-06, + "loss": 0.7437, + "step": 2352 + }, + { + "epoch": 0.12686003881820143, + "grad_norm": 0.8059643507003784, + "learning_rate": 9.977202396499889e-06, + "loss": 0.7935, + 
"step": 2353 + }, + { + "epoch": 0.12691395298684494, + "grad_norm": 0.8250171542167664, + "learning_rate": 9.977182169865652e-06, + "loss": 0.7936, + "step": 2354 + }, + { + "epoch": 0.12696786715548847, + "grad_norm": 0.8618381023406982, + "learning_rate": 9.97716193428311e-06, + "loss": 0.7884, + "step": 2355 + }, + { + "epoch": 0.12702178132413197, + "grad_norm": 0.8977087140083313, + "learning_rate": 9.977141689752306e-06, + "loss": 0.7764, + "step": 2356 + }, + { + "epoch": 0.1270756954927755, + "grad_norm": 0.7616862058639526, + "learning_rate": 9.97712143627327e-06, + "loss": 0.7222, + "step": 2357 + }, + { + "epoch": 0.127129609661419, + "grad_norm": 0.8255194425582886, + "learning_rate": 9.977101173846042e-06, + "loss": 0.8015, + "step": 2358 + }, + { + "epoch": 0.12718352383006254, + "grad_norm": 0.7783398628234863, + "learning_rate": 9.977080902470657e-06, + "loss": 0.7403, + "step": 2359 + }, + { + "epoch": 0.12723743799870607, + "grad_norm": 1.201339840888977, + "learning_rate": 9.977060622147152e-06, + "loss": 0.8994, + "step": 2360 + }, + { + "epoch": 0.12729135216734958, + "grad_norm": 0.906428337097168, + "learning_rate": 9.977040332875563e-06, + "loss": 0.7791, + "step": 2361 + }, + { + "epoch": 0.1273452663359931, + "grad_norm": 0.8238182663917542, + "learning_rate": 9.977020034655927e-06, + "loss": 0.728, + "step": 2362 + }, + { + "epoch": 0.1273991805046366, + "grad_norm": 0.9390681385993958, + "learning_rate": 9.976999727488279e-06, + "loss": 0.8697, + "step": 2363 + }, + { + "epoch": 0.12745309467328014, + "grad_norm": 0.8595122694969177, + "learning_rate": 9.976979411372658e-06, + "loss": 0.8481, + "step": 2364 + }, + { + "epoch": 0.12750700884192365, + "grad_norm": 0.8220391273498535, + "learning_rate": 9.976959086309099e-06, + "loss": 0.709, + "step": 2365 + }, + { + "epoch": 0.12756092301056718, + "grad_norm": 0.9712308645248413, + "learning_rate": 9.976938752297638e-06, + "loss": 0.8898, + "step": 2366 + }, + { + "epoch": 
0.12761483717921068, + "grad_norm": 0.8864933848381042, + "learning_rate": 9.976918409338315e-06, + "loss": 0.8798, + "step": 2367 + }, + { + "epoch": 0.12766875134785421, + "grad_norm": 0.7780918478965759, + "learning_rate": 9.976898057431162e-06, + "loss": 0.8123, + "step": 2368 + }, + { + "epoch": 0.12772266551649775, + "grad_norm": 0.8338439464569092, + "learning_rate": 9.976877696576218e-06, + "loss": 0.8177, + "step": 2369 + }, + { + "epoch": 0.12777657968514125, + "grad_norm": 0.9967712759971619, + "learning_rate": 9.976857326773517e-06, + "loss": 0.8613, + "step": 2370 + }, + { + "epoch": 0.12783049385378478, + "grad_norm": 0.7666492462158203, + "learning_rate": 9.976836948023099e-06, + "loss": 0.7226, + "step": 2371 + }, + { + "epoch": 0.1278844080224283, + "grad_norm": 0.9783684611320496, + "learning_rate": 9.976816560325e-06, + "loss": 0.8616, + "step": 2372 + }, + { + "epoch": 0.12793832219107182, + "grad_norm": 1.0170663595199585, + "learning_rate": 9.976796163679256e-06, + "loss": 0.8211, + "step": 2373 + }, + { + "epoch": 0.12799223635971532, + "grad_norm": 0.8657981157302856, + "learning_rate": 9.976775758085903e-06, + "loss": 0.867, + "step": 2374 + }, + { + "epoch": 0.12804615052835885, + "grad_norm": 0.8487955927848816, + "learning_rate": 9.976755343544979e-06, + "loss": 0.8056, + "step": 2375 + }, + { + "epoch": 0.12810006469700239, + "grad_norm": 0.90731281042099, + "learning_rate": 9.976734920056522e-06, + "loss": 0.8492, + "step": 2376 + }, + { + "epoch": 0.1281539788656459, + "grad_norm": 0.9684501886367798, + "learning_rate": 9.976714487620565e-06, + "loss": 0.8023, + "step": 2377 + }, + { + "epoch": 0.12820789303428942, + "grad_norm": 0.8361303806304932, + "learning_rate": 9.976694046237146e-06, + "loss": 0.8132, + "step": 2378 + }, + { + "epoch": 0.12826180720293293, + "grad_norm": 0.9570466876029968, + "learning_rate": 9.976673595906303e-06, + "loss": 0.8991, + "step": 2379 + }, + { + "epoch": 0.12831572137157646, + "grad_norm": 
0.8944576978683472, + "learning_rate": 9.976653136628071e-06, + "loss": 0.8163, + "step": 2380 + }, + { + "epoch": 0.12836963554021996, + "grad_norm": 0.7991742491722107, + "learning_rate": 9.976632668402489e-06, + "loss": 0.7962, + "step": 2381 + }, + { + "epoch": 0.1284235497088635, + "grad_norm": 0.9284802079200745, + "learning_rate": 9.976612191229594e-06, + "loss": 1.0115, + "step": 2382 + }, + { + "epoch": 0.128477463877507, + "grad_norm": 0.8092453479766846, + "learning_rate": 9.97659170510942e-06, + "loss": 0.705, + "step": 2383 + }, + { + "epoch": 0.12853137804615053, + "grad_norm": 0.8068677186965942, + "learning_rate": 9.976571210042005e-06, + "loss": 0.8283, + "step": 2384 + }, + { + "epoch": 0.12858529221479406, + "grad_norm": 0.8636525869369507, + "learning_rate": 9.976550706027386e-06, + "loss": 0.7824, + "step": 2385 + }, + { + "epoch": 0.12863920638343757, + "grad_norm": 0.9768033027648926, + "learning_rate": 9.9765301930656e-06, + "loss": 0.8317, + "step": 2386 + }, + { + "epoch": 0.1286931205520811, + "grad_norm": 0.8494508862495422, + "learning_rate": 9.976509671156684e-06, + "loss": 0.9464, + "step": 2387 + }, + { + "epoch": 0.1287470347207246, + "grad_norm": 0.8336171507835388, + "learning_rate": 9.976489140300676e-06, + "loss": 0.8003, + "step": 2388 + }, + { + "epoch": 0.12880094888936813, + "grad_norm": 0.819869339466095, + "learning_rate": 9.97646860049761e-06, + "loss": 0.6779, + "step": 2389 + }, + { + "epoch": 0.12885486305801164, + "grad_norm": 1.179028868675232, + "learning_rate": 9.976448051747526e-06, + "loss": 0.8183, + "step": 2390 + }, + { + "epoch": 0.12890877722665517, + "grad_norm": 0.8214680552482605, + "learning_rate": 9.97642749405046e-06, + "loss": 0.7659, + "step": 2391 + }, + { + "epoch": 0.12896269139529867, + "grad_norm": 0.8303862810134888, + "learning_rate": 9.976406927406446e-06, + "loss": 0.8993, + "step": 2392 + }, + { + "epoch": 0.1290166055639422, + "grad_norm": 0.8043105006217957, + "learning_rate": 
9.976386351815526e-06, + "loss": 0.7948, + "step": 2393 + }, + { + "epoch": 0.12907051973258574, + "grad_norm": 0.7988419532775879, + "learning_rate": 9.976365767277734e-06, + "loss": 0.8042, + "step": 2394 + }, + { + "epoch": 0.12912443390122924, + "grad_norm": 0.8145790696144104, + "learning_rate": 9.976345173793107e-06, + "loss": 0.7214, + "step": 2395 + }, + { + "epoch": 0.12917834806987277, + "grad_norm": 0.8323239088058472, + "learning_rate": 9.976324571361682e-06, + "loss": 0.8692, + "step": 2396 + }, + { + "epoch": 0.12923226223851628, + "grad_norm": 1.5968064069747925, + "learning_rate": 9.976303959983498e-06, + "loss": 0.8573, + "step": 2397 + }, + { + "epoch": 0.1292861764071598, + "grad_norm": 0.8523521423339844, + "learning_rate": 9.976283339658589e-06, + "loss": 0.8856, + "step": 2398 + }, + { + "epoch": 0.1293400905758033, + "grad_norm": 1.3875633478164673, + "learning_rate": 9.976262710386994e-06, + "loss": 0.829, + "step": 2399 + }, + { + "epoch": 0.12939400474444684, + "grad_norm": 0.8131827712059021, + "learning_rate": 9.976242072168751e-06, + "loss": 0.7787, + "step": 2400 + }, + { + "epoch": 0.12944791891309035, + "grad_norm": 0.8347164392471313, + "learning_rate": 9.976221425003896e-06, + "loss": 0.9119, + "step": 2401 + }, + { + "epoch": 0.12950183308173388, + "grad_norm": 0.791674792766571, + "learning_rate": 9.976200768892465e-06, + "loss": 0.8483, + "step": 2402 + }, + { + "epoch": 0.1295557472503774, + "grad_norm": 0.8207666277885437, + "learning_rate": 9.976180103834496e-06, + "loss": 0.7688, + "step": 2403 + }, + { + "epoch": 0.12960966141902092, + "grad_norm": 0.8335880041122437, + "learning_rate": 9.976159429830027e-06, + "loss": 0.8943, + "step": 2404 + }, + { + "epoch": 0.12966357558766445, + "grad_norm": 0.8273102045059204, + "learning_rate": 9.976138746879094e-06, + "loss": 0.7847, + "step": 2405 + }, + { + "epoch": 0.12971748975630795, + "grad_norm": 0.9029181003570557, + "learning_rate": 9.976118054981735e-06, + "loss": 0.9779, 
+ "step": 2406 + }, + { + "epoch": 0.12977140392495148, + "grad_norm": 1.0253269672393799, + "learning_rate": 9.976097354137986e-06, + "loss": 0.8301, + "step": 2407 + }, + { + "epoch": 0.129825318093595, + "grad_norm": 0.859992265701294, + "learning_rate": 9.976076644347887e-06, + "loss": 0.7809, + "step": 2408 + }, + { + "epoch": 0.12987923226223852, + "grad_norm": 0.8313273787498474, + "learning_rate": 9.976055925611472e-06, + "loss": 0.8435, + "step": 2409 + }, + { + "epoch": 0.12993314643088202, + "grad_norm": 0.8921852707862854, + "learning_rate": 9.976035197928779e-06, + "loss": 0.8407, + "step": 2410 + }, + { + "epoch": 0.12998706059952556, + "grad_norm": 0.9168267846107483, + "learning_rate": 9.976014461299848e-06, + "loss": 0.8428, + "step": 2411 + }, + { + "epoch": 0.1300409747681691, + "grad_norm": 0.8943728804588318, + "learning_rate": 9.975993715724712e-06, + "loss": 0.8953, + "step": 2412 + }, + { + "epoch": 0.1300948889368126, + "grad_norm": 0.8288392424583435, + "learning_rate": 9.975972961203411e-06, + "loss": 0.8008, + "step": 2413 + }, + { + "epoch": 0.13014880310545612, + "grad_norm": 0.8432718515396118, + "learning_rate": 9.975952197735982e-06, + "loss": 0.775, + "step": 2414 + }, + { + "epoch": 0.13020271727409963, + "grad_norm": 1.029341220855713, + "learning_rate": 9.975931425322462e-06, + "loss": 0.9086, + "step": 2415 + }, + { + "epoch": 0.13025663144274316, + "grad_norm": 0.8342422842979431, + "learning_rate": 9.975910643962888e-06, + "loss": 0.8867, + "step": 2416 + }, + { + "epoch": 0.13031054561138666, + "grad_norm": 0.7766898274421692, + "learning_rate": 9.975889853657298e-06, + "loss": 0.7597, + "step": 2417 + }, + { + "epoch": 0.1303644597800302, + "grad_norm": 0.865112841129303, + "learning_rate": 9.97586905440573e-06, + "loss": 0.8164, + "step": 2418 + }, + { + "epoch": 0.1304183739486737, + "grad_norm": 0.7938675880432129, + "learning_rate": 9.97584824620822e-06, + "loss": 0.8053, + "step": 2419 + }, + { + "epoch": 
0.13047228811731723, + "grad_norm": 0.8813329339027405, + "learning_rate": 9.975827429064805e-06, + "loss": 0.8662, + "step": 2420 + }, + { + "epoch": 0.13052620228596076, + "grad_norm": 0.8217114210128784, + "learning_rate": 9.975806602975525e-06, + "loss": 0.8647, + "step": 2421 + }, + { + "epoch": 0.13058011645460427, + "grad_norm": 1.0177736282348633, + "learning_rate": 9.975785767940413e-06, + "loss": 0.813, + "step": 2422 + }, + { + "epoch": 0.1306340306232478, + "grad_norm": 0.7887234687805176, + "learning_rate": 9.975764923959512e-06, + "loss": 0.7759, + "step": 2423 + }, + { + "epoch": 0.1306879447918913, + "grad_norm": 0.7670013904571533, + "learning_rate": 9.975744071032856e-06, + "loss": 0.7534, + "step": 2424 + }, + { + "epoch": 0.13074185896053483, + "grad_norm": 0.7348708510398865, + "learning_rate": 9.975723209160483e-06, + "loss": 0.7955, + "step": 2425 + }, + { + "epoch": 0.13079577312917834, + "grad_norm": 0.8183468580245972, + "learning_rate": 9.97570233834243e-06, + "loss": 0.8664, + "step": 2426 + }, + { + "epoch": 0.13084968729782187, + "grad_norm": 0.8783697485923767, + "learning_rate": 9.975681458578736e-06, + "loss": 0.8399, + "step": 2427 + }, + { + "epoch": 0.13090360146646537, + "grad_norm": 0.7653324007987976, + "learning_rate": 9.975660569869439e-06, + "loss": 0.7723, + "step": 2428 + }, + { + "epoch": 0.1309575156351089, + "grad_norm": 0.9938413500785828, + "learning_rate": 9.975639672214574e-06, + "loss": 0.7439, + "step": 2429 + }, + { + "epoch": 0.13101142980375244, + "grad_norm": 0.7844074368476868, + "learning_rate": 9.975618765614181e-06, + "loss": 0.8234, + "step": 2430 + }, + { + "epoch": 0.13106534397239594, + "grad_norm": 0.8992919325828552, + "learning_rate": 9.975597850068295e-06, + "loss": 0.7485, + "step": 2431 + }, + { + "epoch": 0.13111925814103947, + "grad_norm": 0.8023738265037537, + "learning_rate": 9.975576925576956e-06, + "loss": 0.7986, + "step": 2432 + }, + { + "epoch": 0.13117317230968298, + "grad_norm": 
0.8369026184082031, + "learning_rate": 9.9755559921402e-06, + "loss": 0.8695, + "step": 2433 + }, + { + "epoch": 0.1312270864783265, + "grad_norm": 0.812224805355072, + "learning_rate": 9.975535049758067e-06, + "loss": 0.834, + "step": 2434 + }, + { + "epoch": 0.13128100064697001, + "grad_norm": 0.7718735337257385, + "learning_rate": 9.975514098430591e-06, + "loss": 0.8055, + "step": 2435 + }, + { + "epoch": 0.13133491481561355, + "grad_norm": 0.8709392547607422, + "learning_rate": 9.975493138157813e-06, + "loss": 0.899, + "step": 2436 + }, + { + "epoch": 0.13138882898425705, + "grad_norm": 0.8817125558853149, + "learning_rate": 9.97547216893977e-06, + "loss": 0.7908, + "step": 2437 + }, + { + "epoch": 0.13144274315290058, + "grad_norm": 0.9631084203720093, + "learning_rate": 9.975451190776498e-06, + "loss": 0.9153, + "step": 2438 + }, + { + "epoch": 0.1314966573215441, + "grad_norm": 0.998906672000885, + "learning_rate": 9.975430203668037e-06, + "loss": 0.971, + "step": 2439 + }, + { + "epoch": 0.13155057149018762, + "grad_norm": 0.9689096212387085, + "learning_rate": 9.975409207614422e-06, + "loss": 0.8316, + "step": 2440 + }, + { + "epoch": 0.13160448565883115, + "grad_norm": 0.7694187760353088, + "learning_rate": 9.975388202615692e-06, + "loss": 0.757, + "step": 2441 + }, + { + "epoch": 0.13165839982747465, + "grad_norm": 0.8082549571990967, + "learning_rate": 9.975367188671885e-06, + "loss": 0.8704, + "step": 2442 + }, + { + "epoch": 0.13171231399611819, + "grad_norm": 0.8493963479995728, + "learning_rate": 9.97534616578304e-06, + "loss": 0.8171, + "step": 2443 + }, + { + "epoch": 0.1317662281647617, + "grad_norm": 0.972273588180542, + "learning_rate": 9.975325133949195e-06, + "loss": 0.9834, + "step": 2444 + }, + { + "epoch": 0.13182014233340522, + "grad_norm": 0.8235988616943359, + "learning_rate": 9.975304093170384e-06, + "loss": 0.8896, + "step": 2445 + }, + { + "epoch": 0.13187405650204873, + "grad_norm": 0.8405951261520386, + "learning_rate": 
9.975283043446649e-06, + "loss": 0.8362, + "step": 2446 + }, + { + "epoch": 0.13192797067069226, + "grad_norm": 0.765640377998352, + "learning_rate": 9.975261984778024e-06, + "loss": 0.7543, + "step": 2447 + }, + { + "epoch": 0.1319818848393358, + "grad_norm": 0.9431920051574707, + "learning_rate": 9.97524091716455e-06, + "loss": 0.8322, + "step": 2448 + }, + { + "epoch": 0.1320357990079793, + "grad_norm": 0.8060823082923889, + "learning_rate": 9.975219840606265e-06, + "loss": 0.8153, + "step": 2449 + }, + { + "epoch": 0.13208971317662282, + "grad_norm": 1.1293737888336182, + "learning_rate": 9.975198755103203e-06, + "loss": 0.8969, + "step": 2450 + }, + { + "epoch": 0.13214362734526633, + "grad_norm": 0.8462950587272644, + "learning_rate": 9.975177660655407e-06, + "loss": 0.7758, + "step": 2451 + }, + { + "epoch": 0.13219754151390986, + "grad_norm": 0.8241791725158691, + "learning_rate": 9.975156557262914e-06, + "loss": 0.8046, + "step": 2452 + }, + { + "epoch": 0.13225145568255336, + "grad_norm": 0.8260864615440369, + "learning_rate": 9.975135444925756e-06, + "loss": 0.7559, + "step": 2453 + }, + { + "epoch": 0.1323053698511969, + "grad_norm": 0.8952769637107849, + "learning_rate": 9.975114323643978e-06, + "loss": 0.8292, + "step": 2454 + }, + { + "epoch": 0.1323592840198404, + "grad_norm": 0.8182158470153809, + "learning_rate": 9.975093193417615e-06, + "loss": 0.7137, + "step": 2455 + }, + { + "epoch": 0.13241319818848393, + "grad_norm": 0.9926600456237793, + "learning_rate": 9.975072054246706e-06, + "loss": 0.7935, + "step": 2456 + }, + { + "epoch": 0.13246711235712746, + "grad_norm": 0.872171938419342, + "learning_rate": 9.97505090613129e-06, + "loss": 0.882, + "step": 2457 + }, + { + "epoch": 0.13252102652577097, + "grad_norm": 0.8218923807144165, + "learning_rate": 9.975029749071401e-06, + "loss": 0.7675, + "step": 2458 + }, + { + "epoch": 0.1325749406944145, + "grad_norm": 0.8250816464424133, + "learning_rate": 9.97500858306708e-06, + "loss": 0.8404, + 
"step": 2459 + }, + { + "epoch": 0.132628854863058, + "grad_norm": 0.8135029673576355, + "learning_rate": 9.974987408118365e-06, + "loss": 0.8387, + "step": 2460 + }, + { + "epoch": 0.13268276903170154, + "grad_norm": 1.3989582061767578, + "learning_rate": 9.974966224225293e-06, + "loss": 0.817, + "step": 2461 + }, + { + "epoch": 0.13273668320034504, + "grad_norm": 0.8212644457817078, + "learning_rate": 9.974945031387902e-06, + "loss": 0.8377, + "step": 2462 + }, + { + "epoch": 0.13279059736898857, + "grad_norm": 1.5513782501220703, + "learning_rate": 9.974923829606232e-06, + "loss": 0.7645, + "step": 2463 + }, + { + "epoch": 0.13284451153763208, + "grad_norm": 0.9355224370956421, + "learning_rate": 9.97490261888032e-06, + "loss": 0.7943, + "step": 2464 + }, + { + "epoch": 0.1328984257062756, + "grad_norm": 0.8264141082763672, + "learning_rate": 9.974881399210204e-06, + "loss": 0.7868, + "step": 2465 + }, + { + "epoch": 0.13295233987491914, + "grad_norm": 0.8267685770988464, + "learning_rate": 9.974860170595921e-06, + "loss": 0.8482, + "step": 2466 + }, + { + "epoch": 0.13300625404356264, + "grad_norm": 0.7816182374954224, + "learning_rate": 9.974838933037512e-06, + "loss": 0.6735, + "step": 2467 + }, + { + "epoch": 0.13306016821220618, + "grad_norm": 0.8686188459396362, + "learning_rate": 9.974817686535013e-06, + "loss": 0.7639, + "step": 2468 + }, + { + "epoch": 0.13311408238084968, + "grad_norm": 0.8006383776664734, + "learning_rate": 9.974796431088462e-06, + "loss": 0.9035, + "step": 2469 + }, + { + "epoch": 0.1331679965494932, + "grad_norm": 0.829788327217102, + "learning_rate": 9.974775166697898e-06, + "loss": 0.7724, + "step": 2470 + }, + { + "epoch": 0.13322191071813672, + "grad_norm": 0.7149111032485962, + "learning_rate": 9.97475389336336e-06, + "loss": 0.7543, + "step": 2471 + }, + { + "epoch": 0.13327582488678025, + "grad_norm": 0.8626448512077332, + "learning_rate": 9.974732611084886e-06, + "loss": 0.8903, + "step": 2472 + }, + { + "epoch": 
0.13332973905542375, + "grad_norm": 0.818778395652771, + "learning_rate": 9.974711319862514e-06, + "loss": 0.7862, + "step": 2473 + }, + { + "epoch": 0.13338365322406728, + "grad_norm": 0.8285005688667297, + "learning_rate": 9.97469001969628e-06, + "loss": 0.8186, + "step": 2474 + }, + { + "epoch": 0.13343756739271082, + "grad_norm": 0.9331484436988831, + "learning_rate": 9.974668710586226e-06, + "loss": 0.7278, + "step": 2475 + }, + { + "epoch": 0.13349148156135432, + "grad_norm": 0.7760492563247681, + "learning_rate": 9.974647392532387e-06, + "loss": 0.82, + "step": 2476 + }, + { + "epoch": 0.13354539572999785, + "grad_norm": 0.9858410358428955, + "learning_rate": 9.974626065534804e-06, + "loss": 0.9733, + "step": 2477 + }, + { + "epoch": 0.13359930989864136, + "grad_norm": 0.774960458278656, + "learning_rate": 9.974604729593513e-06, + "loss": 0.7899, + "step": 2478 + }, + { + "epoch": 0.1336532240672849, + "grad_norm": 0.7779082655906677, + "learning_rate": 9.974583384708556e-06, + "loss": 0.7727, + "step": 2479 + }, + { + "epoch": 0.1337071382359284, + "grad_norm": 0.8611405491828918, + "learning_rate": 9.974562030879967e-06, + "loss": 0.8341, + "step": 2480 + }, + { + "epoch": 0.13376105240457192, + "grad_norm": 0.9042904376983643, + "learning_rate": 9.974540668107788e-06, + "loss": 0.8015, + "step": 2481 + }, + { + "epoch": 0.13381496657321545, + "grad_norm": 1.067806601524353, + "learning_rate": 9.974519296392054e-06, + "loss": 0.8583, + "step": 2482 + }, + { + "epoch": 0.13386888074185896, + "grad_norm": 0.8079432845115662, + "learning_rate": 9.974497915732806e-06, + "loss": 0.7246, + "step": 2483 + }, + { + "epoch": 0.1339227949105025, + "grad_norm": 0.7360541224479675, + "learning_rate": 9.974476526130082e-06, + "loss": 0.7228, + "step": 2484 + }, + { + "epoch": 0.133976709079146, + "grad_norm": 0.7532739639282227, + "learning_rate": 9.97445512758392e-06, + "loss": 0.7472, + "step": 2485 + }, + { + "epoch": 0.13403062324778953, + "grad_norm": 
0.794747531414032, + "learning_rate": 9.974433720094358e-06, + "loss": 0.8288, + "step": 2486 + }, + { + "epoch": 0.13408453741643303, + "grad_norm": 0.9305081367492676, + "learning_rate": 9.974412303661435e-06, + "loss": 0.9414, + "step": 2487 + }, + { + "epoch": 0.13413845158507656, + "grad_norm": 0.9857872128486633, + "learning_rate": 9.97439087828519e-06, + "loss": 0.9123, + "step": 2488 + }, + { + "epoch": 0.13419236575372007, + "grad_norm": 0.9159066081047058, + "learning_rate": 9.97436944396566e-06, + "loss": 0.815, + "step": 2489 + }, + { + "epoch": 0.1342462799223636, + "grad_norm": 0.920803427696228, + "learning_rate": 9.974348000702887e-06, + "loss": 0.855, + "step": 2490 + }, + { + "epoch": 0.13430019409100713, + "grad_norm": 0.8599058389663696, + "learning_rate": 9.974326548496906e-06, + "loss": 0.8944, + "step": 2491 + }, + { + "epoch": 0.13435410825965063, + "grad_norm": 0.7708035111427307, + "learning_rate": 9.974305087347758e-06, + "loss": 0.7733, + "step": 2492 + }, + { + "epoch": 0.13440802242829417, + "grad_norm": 0.771906852722168, + "learning_rate": 9.974283617255478e-06, + "loss": 0.8555, + "step": 2493 + }, + { + "epoch": 0.13446193659693767, + "grad_norm": 0.7494363188743591, + "learning_rate": 9.974262138220108e-06, + "loss": 0.7575, + "step": 2494 + }, + { + "epoch": 0.1345158507655812, + "grad_norm": 0.8488510251045227, + "learning_rate": 9.974240650241687e-06, + "loss": 0.8423, + "step": 2495 + }, + { + "epoch": 0.1345697649342247, + "grad_norm": 0.7665607929229736, + "learning_rate": 9.97421915332025e-06, + "loss": 0.8221, + "step": 2496 + }, + { + "epoch": 0.13462367910286824, + "grad_norm": 0.83452969789505, + "learning_rate": 9.974197647455839e-06, + "loss": 0.8192, + "step": 2497 + }, + { + "epoch": 0.13467759327151174, + "grad_norm": 0.8927843570709229, + "learning_rate": 9.97417613264849e-06, + "loss": 0.8041, + "step": 2498 + }, + { + "epoch": 0.13473150744015527, + "grad_norm": 0.8050754070281982, + "learning_rate": 
9.974154608898246e-06, + "loss": 0.7374, + "step": 2499 + }, + { + "epoch": 0.1347854216087988, + "grad_norm": 0.8286676406860352, + "learning_rate": 9.97413307620514e-06, + "loss": 0.7603, + "step": 2500 + }, + { + "epoch": 0.1348393357774423, + "grad_norm": 0.8953397870063782, + "learning_rate": 9.974111534569215e-06, + "loss": 0.8419, + "step": 2501 + }, + { + "epoch": 0.13489324994608584, + "grad_norm": 0.8619454503059387, + "learning_rate": 9.974089983990507e-06, + "loss": 0.7231, + "step": 2502 + }, + { + "epoch": 0.13494716411472935, + "grad_norm": 0.8102728724479675, + "learning_rate": 9.974068424469058e-06, + "loss": 0.8701, + "step": 2503 + }, + { + "epoch": 0.13500107828337288, + "grad_norm": 0.7568274736404419, + "learning_rate": 9.974046856004904e-06, + "loss": 0.7864, + "step": 2504 + }, + { + "epoch": 0.13505499245201638, + "grad_norm": 0.7835590839385986, + "learning_rate": 9.974025278598086e-06, + "loss": 0.8595, + "step": 2505 + }, + { + "epoch": 0.1351089066206599, + "grad_norm": 0.854015052318573, + "learning_rate": 9.974003692248638e-06, + "loss": 0.7683, + "step": 2506 + }, + { + "epoch": 0.13516282078930342, + "grad_norm": 0.7973034977912903, + "learning_rate": 9.973982096956604e-06, + "loss": 0.7332, + "step": 2507 + }, + { + "epoch": 0.13521673495794695, + "grad_norm": 0.8860466480255127, + "learning_rate": 9.973960492722022e-06, + "loss": 0.8312, + "step": 2508 + }, + { + "epoch": 0.13527064912659048, + "grad_norm": 0.8370612263679504, + "learning_rate": 9.973938879544928e-06, + "loss": 0.8307, + "step": 2509 + }, + { + "epoch": 0.13532456329523398, + "grad_norm": 0.9102504253387451, + "learning_rate": 9.973917257425365e-06, + "loss": 0.8276, + "step": 2510 + }, + { + "epoch": 0.13537847746387752, + "grad_norm": 0.9040873646736145, + "learning_rate": 9.973895626363367e-06, + "loss": 0.7717, + "step": 2511 + }, + { + "epoch": 0.13543239163252102, + "grad_norm": 0.7447285056114197, + "learning_rate": 9.973873986358977e-06, + "loss": 0.7836, 
+ "step": 2512 + }, + { + "epoch": 0.13548630580116455, + "grad_norm": 0.7533379197120667, + "learning_rate": 9.973852337412234e-06, + "loss": 0.8308, + "step": 2513 + }, + { + "epoch": 0.13554021996980806, + "grad_norm": 0.7503568530082703, + "learning_rate": 9.973830679523173e-06, + "loss": 0.7893, + "step": 2514 + }, + { + "epoch": 0.1355941341384516, + "grad_norm": 0.786011815071106, + "learning_rate": 9.973809012691836e-06, + "loss": 0.7562, + "step": 2515 + }, + { + "epoch": 0.1356480483070951, + "grad_norm": 0.9311261773109436, + "learning_rate": 9.973787336918262e-06, + "loss": 0.7295, + "step": 2516 + }, + { + "epoch": 0.13570196247573862, + "grad_norm": 0.8217887878417969, + "learning_rate": 9.973765652202488e-06, + "loss": 0.8399, + "step": 2517 + }, + { + "epoch": 0.13575587664438216, + "grad_norm": 0.8265646696090698, + "learning_rate": 9.973743958544554e-06, + "loss": 0.8146, + "step": 2518 + }, + { + "epoch": 0.13580979081302566, + "grad_norm": 0.9443806409835815, + "learning_rate": 9.9737222559445e-06, + "loss": 0.9217, + "step": 2519 + }, + { + "epoch": 0.1358637049816692, + "grad_norm": 0.807623028755188, + "learning_rate": 9.973700544402362e-06, + "loss": 0.8266, + "step": 2520 + }, + { + "epoch": 0.1359176191503127, + "grad_norm": 0.819793164730072, + "learning_rate": 9.973678823918184e-06, + "loss": 0.755, + "step": 2521 + }, + { + "epoch": 0.13597153331895623, + "grad_norm": 0.7608258724212646, + "learning_rate": 9.973657094492002e-06, + "loss": 0.7707, + "step": 2522 + }, + { + "epoch": 0.13602544748759973, + "grad_norm": 0.795218825340271, + "learning_rate": 9.973635356123854e-06, + "loss": 0.7235, + "step": 2523 + }, + { + "epoch": 0.13607936165624326, + "grad_norm": 0.7893292307853699, + "learning_rate": 9.973613608813782e-06, + "loss": 0.8698, + "step": 2524 + }, + { + "epoch": 0.13613327582488677, + "grad_norm": 0.8091539144515991, + "learning_rate": 9.973591852561822e-06, + "loss": 0.8492, + "step": 2525 + }, + { + "epoch": 
0.1361871899935303, + "grad_norm": 0.9144110679626465, + "learning_rate": 9.973570087368015e-06, + "loss": 0.7952, + "step": 2526 + }, + { + "epoch": 0.13624110416217383, + "grad_norm": 0.761695921421051, + "learning_rate": 9.9735483132324e-06, + "loss": 0.7841, + "step": 2527 + }, + { + "epoch": 0.13629501833081734, + "grad_norm": 0.887026846408844, + "learning_rate": 9.973526530155016e-06, + "loss": 0.8855, + "step": 2528 + }, + { + "epoch": 0.13634893249946087, + "grad_norm": 0.8282152414321899, + "learning_rate": 9.973504738135903e-06, + "loss": 0.8857, + "step": 2529 + }, + { + "epoch": 0.13640284666810437, + "grad_norm": 0.7782665491104126, + "learning_rate": 9.973482937175098e-06, + "loss": 0.8076, + "step": 2530 + }, + { + "epoch": 0.1364567608367479, + "grad_norm": 0.8865575194358826, + "learning_rate": 9.973461127272642e-06, + "loss": 0.8596, + "step": 2531 + }, + { + "epoch": 0.1365106750053914, + "grad_norm": 0.7215422987937927, + "learning_rate": 9.973439308428572e-06, + "loss": 0.7437, + "step": 2532 + }, + { + "epoch": 0.13656458917403494, + "grad_norm": 0.7932387590408325, + "learning_rate": 9.97341748064293e-06, + "loss": 0.8439, + "step": 2533 + }, + { + "epoch": 0.13661850334267844, + "grad_norm": 0.8260403871536255, + "learning_rate": 9.973395643915756e-06, + "loss": 0.7956, + "step": 2534 + }, + { + "epoch": 0.13667241751132198, + "grad_norm": 0.7879858016967773, + "learning_rate": 9.973373798247085e-06, + "loss": 0.8501, + "step": 2535 + }, + { + "epoch": 0.1367263316799655, + "grad_norm": 0.7268496751785278, + "learning_rate": 9.97335194363696e-06, + "loss": 0.78, + "step": 2536 + }, + { + "epoch": 0.136780245848609, + "grad_norm": 0.8170067071914673, + "learning_rate": 9.973330080085417e-06, + "loss": 0.829, + "step": 2537 + }, + { + "epoch": 0.13683416001725254, + "grad_norm": 0.8400061726570129, + "learning_rate": 9.973308207592498e-06, + "loss": 0.8576, + "step": 2538 + }, + { + "epoch": 0.13688807418589605, + "grad_norm": 
0.9156914353370667, + "learning_rate": 9.973286326158244e-06, + "loss": 0.8633, + "step": 2539 + }, + { + "epoch": 0.13694198835453958, + "grad_norm": 0.7413343191146851, + "learning_rate": 9.97326443578269e-06, + "loss": 0.8128, + "step": 2540 + }, + { + "epoch": 0.13699590252318308, + "grad_norm": 0.8003092408180237, + "learning_rate": 9.973242536465877e-06, + "loss": 0.7743, + "step": 2541 + }, + { + "epoch": 0.13704981669182661, + "grad_norm": 0.8532862067222595, + "learning_rate": 9.973220628207844e-06, + "loss": 0.8526, + "step": 2542 + }, + { + "epoch": 0.13710373086047012, + "grad_norm": 0.7677969336509705, + "learning_rate": 9.973198711008634e-06, + "loss": 0.8493, + "step": 2543 + }, + { + "epoch": 0.13715764502911365, + "grad_norm": 0.8414867520332336, + "learning_rate": 9.973176784868282e-06, + "loss": 0.7674, + "step": 2544 + }, + { + "epoch": 0.13721155919775718, + "grad_norm": 0.825450599193573, + "learning_rate": 9.973154849786828e-06, + "loss": 0.8328, + "step": 2545 + }, + { + "epoch": 0.1372654733664007, + "grad_norm": 0.8429614305496216, + "learning_rate": 9.973132905764313e-06, + "loss": 0.787, + "step": 2546 + }, + { + "epoch": 0.13731938753504422, + "grad_norm": 0.9791093468666077, + "learning_rate": 9.973110952800776e-06, + "loss": 0.7836, + "step": 2547 + }, + { + "epoch": 0.13737330170368772, + "grad_norm": 0.8728508353233337, + "learning_rate": 9.973088990896255e-06, + "loss": 0.8897, + "step": 2548 + }, + { + "epoch": 0.13742721587233125, + "grad_norm": 0.9933381080627441, + "learning_rate": 9.973067020050792e-06, + "loss": 0.8679, + "step": 2549 + }, + { + "epoch": 0.13748113004097476, + "grad_norm": 0.8786694407463074, + "learning_rate": 9.973045040264423e-06, + "loss": 0.8599, + "step": 2550 + }, + { + "epoch": 0.1375350442096183, + "grad_norm": 0.7714465260505676, + "learning_rate": 9.973023051537193e-06, + "loss": 0.6355, + "step": 2551 + }, + { + "epoch": 0.1375889583782618, + "grad_norm": 0.9043986201286316, + "learning_rate": 
9.973001053869138e-06, + "loss": 0.7445, + "step": 2552 + }, + { + "epoch": 0.13764287254690533, + "grad_norm": 0.879623532295227, + "learning_rate": 9.972979047260297e-06, + "loss": 0.8086, + "step": 2553 + }, + { + "epoch": 0.13769678671554886, + "grad_norm": 0.8384745121002197, + "learning_rate": 9.972957031710708e-06, + "loss": 0.6832, + "step": 2554 + }, + { + "epoch": 0.13775070088419236, + "grad_norm": 0.8574655055999756, + "learning_rate": 9.972935007220415e-06, + "loss": 0.8326, + "step": 2555 + }, + { + "epoch": 0.1378046150528359, + "grad_norm": 0.8241353034973145, + "learning_rate": 9.972912973789458e-06, + "loss": 0.7526, + "step": 2556 + }, + { + "epoch": 0.1378585292214794, + "grad_norm": 0.8306788802146912, + "learning_rate": 9.97289093141787e-06, + "loss": 0.9423, + "step": 2557 + }, + { + "epoch": 0.13791244339012293, + "grad_norm": 0.7930428385734558, + "learning_rate": 9.972868880105696e-06, + "loss": 0.8635, + "step": 2558 + }, + { + "epoch": 0.13796635755876643, + "grad_norm": 0.856482207775116, + "learning_rate": 9.972846819852974e-06, + "loss": 0.7902, + "step": 2559 + }, + { + "epoch": 0.13802027172740997, + "grad_norm": 0.8513977527618408, + "learning_rate": 9.972824750659747e-06, + "loss": 0.8485, + "step": 2560 + }, + { + "epoch": 0.13807418589605347, + "grad_norm": 0.7595572471618652, + "learning_rate": 9.97280267252605e-06, + "loss": 0.7294, + "step": 2561 + }, + { + "epoch": 0.138128100064697, + "grad_norm": 0.9774705767631531, + "learning_rate": 9.972780585451923e-06, + "loss": 0.8758, + "step": 2562 + }, + { + "epoch": 0.13818201423334053, + "grad_norm": 0.8011289834976196, + "learning_rate": 9.972758489437408e-06, + "loss": 0.7649, + "step": 2563 + }, + { + "epoch": 0.13823592840198404, + "grad_norm": 0.8921117186546326, + "learning_rate": 9.972736384482545e-06, + "loss": 0.8745, + "step": 2564 + }, + { + "epoch": 0.13828984257062757, + "grad_norm": 0.8739173412322998, + "learning_rate": 9.972714270587372e-06, + "loss": 0.841, + 
"step": 2565 + }, + { + "epoch": 0.13834375673927107, + "grad_norm": 0.7379958033561707, + "learning_rate": 9.97269214775193e-06, + "loss": 0.813, + "step": 2566 + }, + { + "epoch": 0.1383976709079146, + "grad_norm": 0.8068973422050476, + "learning_rate": 9.972670015976258e-06, + "loss": 0.8319, + "step": 2567 + }, + { + "epoch": 0.1384515850765581, + "grad_norm": 0.7312106490135193, + "learning_rate": 9.972647875260395e-06, + "loss": 0.7494, + "step": 2568 + }, + { + "epoch": 0.13850549924520164, + "grad_norm": 0.8182246088981628, + "learning_rate": 9.972625725604383e-06, + "loss": 0.9543, + "step": 2569 + }, + { + "epoch": 0.13855941341384514, + "grad_norm": 0.8153319358825684, + "learning_rate": 9.97260356700826e-06, + "loss": 0.8411, + "step": 2570 + }, + { + "epoch": 0.13861332758248868, + "grad_norm": 0.7589008212089539, + "learning_rate": 9.972581399472066e-06, + "loss": 0.7576, + "step": 2571 + }, + { + "epoch": 0.1386672417511322, + "grad_norm": 0.8160014748573303, + "learning_rate": 9.972559222995841e-06, + "loss": 0.8801, + "step": 2572 + }, + { + "epoch": 0.1387211559197757, + "grad_norm": 0.752868115901947, + "learning_rate": 9.972537037579626e-06, + "loss": 0.7504, + "step": 2573 + }, + { + "epoch": 0.13877507008841924, + "grad_norm": 0.8015901446342468, + "learning_rate": 9.97251484322346e-06, + "loss": 0.7468, + "step": 2574 + }, + { + "epoch": 0.13882898425706275, + "grad_norm": 0.815352737903595, + "learning_rate": 9.972492639927384e-06, + "loss": 0.8526, + "step": 2575 + }, + { + "epoch": 0.13888289842570628, + "grad_norm": 0.7475571036338806, + "learning_rate": 9.972470427691436e-06, + "loss": 0.7653, + "step": 2576 + }, + { + "epoch": 0.13893681259434978, + "grad_norm": 1.1950535774230957, + "learning_rate": 9.972448206515656e-06, + "loss": 0.9106, + "step": 2577 + }, + { + "epoch": 0.13899072676299332, + "grad_norm": 0.843235194683075, + "learning_rate": 9.972425976400086e-06, + "loss": 0.8922, + "step": 2578 + }, + { + "epoch": 
0.13904464093163682, + "grad_norm": 0.8039982914924622, + "learning_rate": 9.972403737344763e-06, + "loss": 0.6855, + "step": 2579 + }, + { + "epoch": 0.13909855510028035, + "grad_norm": 0.7598289251327515, + "learning_rate": 9.97238148934973e-06, + "loss": 0.832, + "step": 2580 + }, + { + "epoch": 0.13915246926892388, + "grad_norm": 0.7986323237419128, + "learning_rate": 9.972359232415025e-06, + "loss": 0.7886, + "step": 2581 + }, + { + "epoch": 0.1392063834375674, + "grad_norm": 0.7465773820877075, + "learning_rate": 9.97233696654069e-06, + "loss": 0.7875, + "step": 2582 + }, + { + "epoch": 0.13926029760621092, + "grad_norm": 0.8853508830070496, + "learning_rate": 9.972314691726764e-06, + "loss": 0.9263, + "step": 2583 + }, + { + "epoch": 0.13931421177485442, + "grad_norm": 0.7267711162567139, + "learning_rate": 9.972292407973286e-06, + "loss": 0.78, + "step": 2584 + }, + { + "epoch": 0.13936812594349796, + "grad_norm": 0.7631322145462036, + "learning_rate": 9.972270115280295e-06, + "loss": 0.7726, + "step": 2585 + }, + { + "epoch": 0.13942204011214146, + "grad_norm": 0.8661205768585205, + "learning_rate": 9.972247813647836e-06, + "loss": 0.977, + "step": 2586 + }, + { + "epoch": 0.139475954280785, + "grad_norm": 0.7955568432807922, + "learning_rate": 9.972225503075943e-06, + "loss": 0.8481, + "step": 2587 + }, + { + "epoch": 0.13952986844942852, + "grad_norm": 0.8810243606567383, + "learning_rate": 9.972203183564661e-06, + "loss": 0.8938, + "step": 2588 + }, + { + "epoch": 0.13958378261807203, + "grad_norm": 0.783968985080719, + "learning_rate": 9.972180855114029e-06, + "loss": 0.7565, + "step": 2589 + }, + { + "epoch": 0.13963769678671556, + "grad_norm": 0.749191164970398, + "learning_rate": 9.972158517724084e-06, + "loss": 0.7283, + "step": 2590 + }, + { + "epoch": 0.13969161095535906, + "grad_norm": 0.7926847338676453, + "learning_rate": 9.972136171394871e-06, + "loss": 0.9073, + "step": 2591 + }, + { + "epoch": 0.1397455251240026, + "grad_norm": 
0.7621777653694153, + "learning_rate": 9.972113816126427e-06, + "loss": 0.7176, + "step": 2592 + }, + { + "epoch": 0.1397994392926461, + "grad_norm": 0.8856351375579834, + "learning_rate": 9.972091451918792e-06, + "loss": 0.7428, + "step": 2593 + }, + { + "epoch": 0.13985335346128963, + "grad_norm": 0.8027200698852539, + "learning_rate": 9.972069078772008e-06, + "loss": 0.7794, + "step": 2594 + }, + { + "epoch": 0.13990726762993314, + "grad_norm": 0.8776759505271912, + "learning_rate": 9.972046696686115e-06, + "loss": 0.9087, + "step": 2595 + }, + { + "epoch": 0.13996118179857667, + "grad_norm": 0.8979713320732117, + "learning_rate": 9.972024305661152e-06, + "loss": 0.8031, + "step": 2596 + }, + { + "epoch": 0.1400150959672202, + "grad_norm": 0.8233299851417542, + "learning_rate": 9.97200190569716e-06, + "loss": 0.8462, + "step": 2597 + }, + { + "epoch": 0.1400690101358637, + "grad_norm": 0.8777962327003479, + "learning_rate": 9.971979496794178e-06, + "loss": 0.8464, + "step": 2598 + }, + { + "epoch": 0.14012292430450723, + "grad_norm": 0.7185937166213989, + "learning_rate": 9.971957078952249e-06, + "loss": 0.7423, + "step": 2599 + }, + { + "epoch": 0.14017683847315074, + "grad_norm": 0.8226794600486755, + "learning_rate": 9.971934652171412e-06, + "loss": 0.8017, + "step": 2600 + }, + { + "epoch": 0.14023075264179427, + "grad_norm": 0.8021965622901917, + "learning_rate": 9.971912216451705e-06, + "loss": 0.8018, + "step": 2601 + }, + { + "epoch": 0.14028466681043777, + "grad_norm": 1.0516051054000854, + "learning_rate": 9.971889771793172e-06, + "loss": 0.8894, + "step": 2602 + }, + { + "epoch": 0.1403385809790813, + "grad_norm": 0.8212647438049316, + "learning_rate": 9.971867318195851e-06, + "loss": 0.826, + "step": 2603 + }, + { + "epoch": 0.1403924951477248, + "grad_norm": 0.8427513241767883, + "learning_rate": 9.971844855659783e-06, + "loss": 0.815, + "step": 2604 + }, + { + "epoch": 0.14044640931636834, + "grad_norm": 0.779569149017334, + "learning_rate": 
9.97182238418501e-06, + "loss": 0.797, + "step": 2605 + }, + { + "epoch": 0.14050032348501187, + "grad_norm": 0.7430607080459595, + "learning_rate": 9.97179990377157e-06, + "loss": 0.7925, + "step": 2606 + }, + { + "epoch": 0.14055423765365538, + "grad_norm": 0.8079801797866821, + "learning_rate": 9.971777414419503e-06, + "loss": 0.8259, + "step": 2607 + }, + { + "epoch": 0.1406081518222989, + "grad_norm": 0.794086754322052, + "learning_rate": 9.971754916128853e-06, + "loss": 0.833, + "step": 2608 + }, + { + "epoch": 0.14066206599094241, + "grad_norm": 0.8177362680435181, + "learning_rate": 9.971732408899657e-06, + "loss": 0.8543, + "step": 2609 + }, + { + "epoch": 0.14071598015958595, + "grad_norm": 0.8591805100440979, + "learning_rate": 9.971709892731956e-06, + "loss": 0.9323, + "step": 2610 + }, + { + "epoch": 0.14076989432822945, + "grad_norm": 0.8102341890335083, + "learning_rate": 9.971687367625793e-06, + "loss": 0.7679, + "step": 2611 + }, + { + "epoch": 0.14082380849687298, + "grad_norm": 0.8556869626045227, + "learning_rate": 9.971664833581205e-06, + "loss": 0.8458, + "step": 2612 + }, + { + "epoch": 0.14087772266551649, + "grad_norm": 0.7998070120811462, + "learning_rate": 9.971642290598235e-06, + "loss": 0.7663, + "step": 2613 + }, + { + "epoch": 0.14093163683416002, + "grad_norm": 0.8800550103187561, + "learning_rate": 9.971619738676923e-06, + "loss": 0.8653, + "step": 2614 + }, + { + "epoch": 0.14098555100280355, + "grad_norm": 0.8199629187583923, + "learning_rate": 9.971597177817308e-06, + "loss": 0.8804, + "step": 2615 + }, + { + "epoch": 0.14103946517144705, + "grad_norm": 0.8774363398551941, + "learning_rate": 9.971574608019432e-06, + "loss": 0.8468, + "step": 2616 + }, + { + "epoch": 0.14109337934009059, + "grad_norm": 0.7911790013313293, + "learning_rate": 9.971552029283335e-06, + "loss": 0.7841, + "step": 2617 + }, + { + "epoch": 0.1411472935087341, + "grad_norm": 0.8152750134468079, + "learning_rate": 9.97152944160906e-06, + "loss": 0.7753, + 
"step": 2618 + }, + { + "epoch": 0.14120120767737762, + "grad_norm": 0.8709943890571594, + "learning_rate": 9.971506844996645e-06, + "loss": 0.7259, + "step": 2619 + }, + { + "epoch": 0.14125512184602113, + "grad_norm": 1.1131712198257446, + "learning_rate": 9.97148423944613e-06, + "loss": 0.9422, + "step": 2620 + }, + { + "epoch": 0.14130903601466466, + "grad_norm": 0.8992665410041809, + "learning_rate": 9.971461624957557e-06, + "loss": 0.733, + "step": 2621 + }, + { + "epoch": 0.14136295018330816, + "grad_norm": 0.7548032402992249, + "learning_rate": 9.971439001530967e-06, + "loss": 0.7733, + "step": 2622 + }, + { + "epoch": 0.1414168643519517, + "grad_norm": 0.7988988161087036, + "learning_rate": 9.9714163691664e-06, + "loss": 0.8218, + "step": 2623 + }, + { + "epoch": 0.14147077852059523, + "grad_norm": 0.7697865962982178, + "learning_rate": 9.971393727863899e-06, + "loss": 0.7882, + "step": 2624 + }, + { + "epoch": 0.14152469268923873, + "grad_norm": 0.993664026260376, + "learning_rate": 9.9713710776235e-06, + "loss": 0.8331, + "step": 2625 + }, + { + "epoch": 0.14157860685788226, + "grad_norm": 1.0097055435180664, + "learning_rate": 9.971348418445245e-06, + "loss": 0.8959, + "step": 2626 + }, + { + "epoch": 0.14163252102652577, + "grad_norm": 0.7682481408119202, + "learning_rate": 9.97132575032918e-06, + "loss": 0.7425, + "step": 2627 + }, + { + "epoch": 0.1416864351951693, + "grad_norm": 0.790695309638977, + "learning_rate": 9.971303073275338e-06, + "loss": 0.6887, + "step": 2628 + }, + { + "epoch": 0.1417403493638128, + "grad_norm": 0.9672498106956482, + "learning_rate": 9.971280387283766e-06, + "loss": 0.8617, + "step": 2629 + }, + { + "epoch": 0.14179426353245633, + "grad_norm": 0.8538743853569031, + "learning_rate": 9.971257692354502e-06, + "loss": 0.7826, + "step": 2630 + }, + { + "epoch": 0.14184817770109984, + "grad_norm": 0.7527078986167908, + "learning_rate": 9.971234988487587e-06, + "loss": 0.7542, + "step": 2631 + }, + { + "epoch": 
0.14190209186974337, + "grad_norm": 0.9390487670898438, + "learning_rate": 9.97121227568306e-06, + "loss": 0.8415, + "step": 2632 + }, + { + "epoch": 0.1419560060383869, + "grad_norm": 0.8717443346977234, + "learning_rate": 9.971189553940966e-06, + "loss": 0.7969, + "step": 2633 + }, + { + "epoch": 0.1420099202070304, + "grad_norm": 0.7848197817802429, + "learning_rate": 9.971166823261343e-06, + "loss": 0.8049, + "step": 2634 + }, + { + "epoch": 0.14206383437567394, + "grad_norm": 0.8002238273620605, + "learning_rate": 9.971144083644233e-06, + "loss": 0.8681, + "step": 2635 + }, + { + "epoch": 0.14211774854431744, + "grad_norm": 0.7699506282806396, + "learning_rate": 9.971121335089676e-06, + "loss": 0.7815, + "step": 2636 + }, + { + "epoch": 0.14217166271296097, + "grad_norm": 0.9187048673629761, + "learning_rate": 9.971098577597713e-06, + "loss": 0.8611, + "step": 2637 + }, + { + "epoch": 0.14222557688160448, + "grad_norm": 0.802859365940094, + "learning_rate": 9.971075811168385e-06, + "loss": 0.7991, + "step": 2638 + }, + { + "epoch": 0.142279491050248, + "grad_norm": 1.0536410808563232, + "learning_rate": 9.971053035801735e-06, + "loss": 0.9726, + "step": 2639 + }, + { + "epoch": 0.1423334052188915, + "grad_norm": 0.8278898000717163, + "learning_rate": 9.9710302514978e-06, + "loss": 0.8636, + "step": 2640 + }, + { + "epoch": 0.14238731938753504, + "grad_norm": 0.7639529705047607, + "learning_rate": 9.971007458256623e-06, + "loss": 0.7849, + "step": 2641 + }, + { + "epoch": 0.14244123355617858, + "grad_norm": 0.9108867049217224, + "learning_rate": 9.970984656078246e-06, + "loss": 0.891, + "step": 2642 + }, + { + "epoch": 0.14249514772482208, + "grad_norm": 0.8182162046432495, + "learning_rate": 9.97096184496271e-06, + "loss": 0.7975, + "step": 2643 + }, + { + "epoch": 0.1425490618934656, + "grad_norm": 0.848781168460846, + "learning_rate": 9.970939024910053e-06, + "loss": 0.8677, + "step": 2644 + }, + { + "epoch": 0.14260297606210912, + "grad_norm": 
0.8322750926017761, + "learning_rate": 9.97091619592032e-06, + "loss": 0.776, + "step": 2645 + }, + { + "epoch": 0.14265689023075265, + "grad_norm": 0.8054049611091614, + "learning_rate": 9.970893357993548e-06, + "loss": 0.804, + "step": 2646 + }, + { + "epoch": 0.14271080439939615, + "grad_norm": 0.8162119388580322, + "learning_rate": 9.970870511129782e-06, + "loss": 0.7856, + "step": 2647 + }, + { + "epoch": 0.14276471856803968, + "grad_norm": 0.73929363489151, + "learning_rate": 9.97084765532906e-06, + "loss": 0.7687, + "step": 2648 + }, + { + "epoch": 0.1428186327366832, + "grad_norm": 0.866688072681427, + "learning_rate": 9.970824790591425e-06, + "loss": 0.8751, + "step": 2649 + }, + { + "epoch": 0.14287254690532672, + "grad_norm": 0.7772359251976013, + "learning_rate": 9.970801916916917e-06, + "loss": 0.7232, + "step": 2650 + }, + { + "epoch": 0.14292646107397025, + "grad_norm": 0.8912346363067627, + "learning_rate": 9.970779034305578e-06, + "loss": 0.8393, + "step": 2651 + }, + { + "epoch": 0.14298037524261376, + "grad_norm": 0.7827256917953491, + "learning_rate": 9.970756142757448e-06, + "loss": 0.7924, + "step": 2652 + }, + { + "epoch": 0.1430342894112573, + "grad_norm": 0.7557843923568726, + "learning_rate": 9.97073324227257e-06, + "loss": 0.8032, + "step": 2653 + }, + { + "epoch": 0.1430882035799008, + "grad_norm": 0.7939576506614685, + "learning_rate": 9.970710332850983e-06, + "loss": 0.7251, + "step": 2654 + }, + { + "epoch": 0.14314211774854432, + "grad_norm": 0.8175502419471741, + "learning_rate": 9.97068741449273e-06, + "loss": 0.7685, + "step": 2655 + }, + { + "epoch": 0.14319603191718783, + "grad_norm": 0.7537406086921692, + "learning_rate": 9.970664487197851e-06, + "loss": 0.7354, + "step": 2656 + }, + { + "epoch": 0.14324994608583136, + "grad_norm": 0.8045641779899597, + "learning_rate": 9.970641550966388e-06, + "loss": 0.7581, + "step": 2657 + }, + { + "epoch": 0.14330386025447486, + "grad_norm": 0.69786137342453, + "learning_rate": 
9.97061860579838e-06, + "loss": 0.6923, + "step": 2658 + }, + { + "epoch": 0.1433577744231184, + "grad_norm": 0.7913051843643188, + "learning_rate": 9.970595651693874e-06, + "loss": 0.7579, + "step": 2659 + }, + { + "epoch": 0.14341168859176193, + "grad_norm": 0.7890749573707581, + "learning_rate": 9.970572688652905e-06, + "loss": 0.7843, + "step": 2660 + }, + { + "epoch": 0.14346560276040543, + "grad_norm": 0.913074791431427, + "learning_rate": 9.970549716675516e-06, + "loss": 0.8318, + "step": 2661 + }, + { + "epoch": 0.14351951692904896, + "grad_norm": 0.757522463798523, + "learning_rate": 9.97052673576175e-06, + "loss": 0.6803, + "step": 2662 + }, + { + "epoch": 0.14357343109769247, + "grad_norm": 0.9279198050498962, + "learning_rate": 9.970503745911645e-06, + "loss": 0.8591, + "step": 2663 + }, + { + "epoch": 0.143627345266336, + "grad_norm": 0.8218236565589905, + "learning_rate": 9.97048074712525e-06, + "loss": 0.8253, + "step": 2664 + }, + { + "epoch": 0.1436812594349795, + "grad_norm": 0.7562058568000793, + "learning_rate": 9.970457739402596e-06, + "loss": 0.8114, + "step": 2665 + }, + { + "epoch": 0.14373517360362303, + "grad_norm": 0.7626449465751648, + "learning_rate": 9.970434722743732e-06, + "loss": 0.7932, + "step": 2666 + }, + { + "epoch": 0.14378908777226654, + "grad_norm": 0.8287700414657593, + "learning_rate": 9.970411697148696e-06, + "loss": 0.754, + "step": 2667 + }, + { + "epoch": 0.14384300194091007, + "grad_norm": 1.0403661727905273, + "learning_rate": 9.97038866261753e-06, + "loss": 0.9062, + "step": 2668 + }, + { + "epoch": 0.1438969161095536, + "grad_norm": 0.8278779983520508, + "learning_rate": 9.970365619150276e-06, + "loss": 0.9181, + "step": 2669 + }, + { + "epoch": 0.1439508302781971, + "grad_norm": 0.950964629650116, + "learning_rate": 9.970342566746973e-06, + "loss": 0.9235, + "step": 2670 + }, + { + "epoch": 0.14400474444684064, + "grad_norm": 0.9529917240142822, + "learning_rate": 9.970319505407667e-06, + "loss": 0.7929, + "step": 
2671 + }, + { + "epoch": 0.14405865861548414, + "grad_norm": 0.7601970434188843, + "learning_rate": 9.970296435132395e-06, + "loss": 0.7133, + "step": 2672 + }, + { + "epoch": 0.14411257278412767, + "grad_norm": 0.8906385898590088, + "learning_rate": 9.970273355921201e-06, + "loss": 0.8679, + "step": 2673 + }, + { + "epoch": 0.14416648695277118, + "grad_norm": 0.8250144720077515, + "learning_rate": 9.970250267774126e-06, + "loss": 0.7871, + "step": 2674 + }, + { + "epoch": 0.1442204011214147, + "grad_norm": 0.8182716965675354, + "learning_rate": 9.970227170691212e-06, + "loss": 0.7391, + "step": 2675 + }, + { + "epoch": 0.1442743152900582, + "grad_norm": 0.8261950016021729, + "learning_rate": 9.970204064672498e-06, + "loss": 0.8914, + "step": 2676 + }, + { + "epoch": 0.14432822945870175, + "grad_norm": 1.248270869255066, + "learning_rate": 9.97018094971803e-06, + "loss": 0.7834, + "step": 2677 + }, + { + "epoch": 0.14438214362734528, + "grad_norm": 0.7821226119995117, + "learning_rate": 9.970157825827844e-06, + "loss": 0.7436, + "step": 2678 + }, + { + "epoch": 0.14443605779598878, + "grad_norm": 0.9708791375160217, + "learning_rate": 9.970134693001987e-06, + "loss": 0.9038, + "step": 2679 + }, + { + "epoch": 0.1444899719646323, + "grad_norm": 0.8178976774215698, + "learning_rate": 9.970111551240499e-06, + "loss": 0.8748, + "step": 2680 + }, + { + "epoch": 0.14454388613327582, + "grad_norm": 0.8477594256401062, + "learning_rate": 9.970088400543417e-06, + "loss": 0.8169, + "step": 2681 + }, + { + "epoch": 0.14459780030191935, + "grad_norm": 0.9478195309638977, + "learning_rate": 9.970065240910789e-06, + "loss": 0.789, + "step": 2682 + }, + { + "epoch": 0.14465171447056285, + "grad_norm": 0.9151026010513306, + "learning_rate": 9.970042072342652e-06, + "loss": 0.8804, + "step": 2683 + }, + { + "epoch": 0.14470562863920639, + "grad_norm": 0.8062365651130676, + "learning_rate": 9.970018894839052e-06, + "loss": 0.8329, + "step": 2684 + }, + { + "epoch": 
0.1447595428078499, + "grad_norm": 0.8029241561889648, + "learning_rate": 9.969995708400028e-06, + "loss": 0.7053, + "step": 2685 + }, + { + "epoch": 0.14481345697649342, + "grad_norm": 0.8023892641067505, + "learning_rate": 9.969972513025621e-06, + "loss": 0.7921, + "step": 2686 + }, + { + "epoch": 0.14486737114513695, + "grad_norm": 0.9224045276641846, + "learning_rate": 9.969949308715874e-06, + "loss": 0.7416, + "step": 2687 + }, + { + "epoch": 0.14492128531378046, + "grad_norm": 0.7767837047576904, + "learning_rate": 9.969926095470829e-06, + "loss": 0.7844, + "step": 2688 + }, + { + "epoch": 0.144975199482424, + "grad_norm": 0.7804312109947205, + "learning_rate": 9.969902873290526e-06, + "loss": 0.712, + "step": 2689 + }, + { + "epoch": 0.1450291136510675, + "grad_norm": 0.9595988988876343, + "learning_rate": 9.969879642175009e-06, + "loss": 0.7686, + "step": 2690 + }, + { + "epoch": 0.14508302781971102, + "grad_norm": 1.0414133071899414, + "learning_rate": 9.969856402124318e-06, + "loss": 0.8833, + "step": 2691 + }, + { + "epoch": 0.14513694198835453, + "grad_norm": 0.9321674108505249, + "learning_rate": 9.969833153138498e-06, + "loss": 0.7576, + "step": 2692 + }, + { + "epoch": 0.14519085615699806, + "grad_norm": 0.7715985774993896, + "learning_rate": 9.969809895217586e-06, + "loss": 0.7371, + "step": 2693 + }, + { + "epoch": 0.1452447703256416, + "grad_norm": 1.0257316827774048, + "learning_rate": 9.969786628361625e-06, + "loss": 0.8394, + "step": 2694 + }, + { + "epoch": 0.1452986844942851, + "grad_norm": 0.7823453545570374, + "learning_rate": 9.969763352570659e-06, + "loss": 0.7974, + "step": 2695 + }, + { + "epoch": 0.14535259866292863, + "grad_norm": 0.8257505893707275, + "learning_rate": 9.969740067844728e-06, + "loss": 0.7948, + "step": 2696 + }, + { + "epoch": 0.14540651283157213, + "grad_norm": 0.6493780016899109, + "learning_rate": 9.969716774183878e-06, + "loss": 0.6531, + "step": 2697 + }, + { + "epoch": 0.14546042700021566, + "grad_norm": 
0.8953896760940552, + "learning_rate": 9.969693471588144e-06, + "loss": 0.7414, + "step": 2698 + }, + { + "epoch": 0.14551434116885917, + "grad_norm": 0.7177074551582336, + "learning_rate": 9.969670160057572e-06, + "loss": 0.65, + "step": 2699 + }, + { + "epoch": 0.1455682553375027, + "grad_norm": 0.8214414715766907, + "learning_rate": 9.969646839592204e-06, + "loss": 0.7605, + "step": 2700 + }, + { + "epoch": 0.1456221695061462, + "grad_norm": 0.8062289953231812, + "learning_rate": 9.969623510192081e-06, + "loss": 0.8275, + "step": 2701 + }, + { + "epoch": 0.14567608367478974, + "grad_norm": 0.9606921076774597, + "learning_rate": 9.969600171857246e-06, + "loss": 0.8472, + "step": 2702 + }, + { + "epoch": 0.14572999784343327, + "grad_norm": 1.0146433115005493, + "learning_rate": 9.96957682458774e-06, + "loss": 0.8398, + "step": 2703 + }, + { + "epoch": 0.14578391201207677, + "grad_norm": 0.8463965058326721, + "learning_rate": 9.969553468383604e-06, + "loss": 0.7563, + "step": 2704 + }, + { + "epoch": 0.1458378261807203, + "grad_norm": 0.8125115633010864, + "learning_rate": 9.96953010324488e-06, + "loss": 0.8042, + "step": 2705 + }, + { + "epoch": 0.1458917403493638, + "grad_norm": 0.9350455403327942, + "learning_rate": 9.969506729171612e-06, + "loss": 0.9067, + "step": 2706 + }, + { + "epoch": 0.14594565451800734, + "grad_norm": 0.9979991316795349, + "learning_rate": 9.969483346163843e-06, + "loss": 0.778, + "step": 2707 + }, + { + "epoch": 0.14599956868665084, + "grad_norm": 0.8236498236656189, + "learning_rate": 9.969459954221612e-06, + "loss": 0.9011, + "step": 2708 + }, + { + "epoch": 0.14605348285529438, + "grad_norm": 0.6965605616569519, + "learning_rate": 9.969436553344962e-06, + "loss": 0.6657, + "step": 2709 + }, + { + "epoch": 0.14610739702393788, + "grad_norm": 0.810246467590332, + "learning_rate": 9.969413143533936e-06, + "loss": 0.8099, + "step": 2710 + }, + { + "epoch": 0.1461613111925814, + "grad_norm": 1.1437804698944092, + "learning_rate": 
9.969389724788574e-06, + "loss": 0.7457, + "step": 2711 + }, + { + "epoch": 0.14621522536122494, + "grad_norm": 0.8632565140724182, + "learning_rate": 9.96936629710892e-06, + "loss": 0.8549, + "step": 2712 + }, + { + "epoch": 0.14626913952986845, + "grad_norm": 0.9616119265556335, + "learning_rate": 9.969342860495018e-06, + "loss": 0.6219, + "step": 2713 + }, + { + "epoch": 0.14632305369851198, + "grad_norm": 0.9943077564239502, + "learning_rate": 9.969319414946906e-06, + "loss": 0.8676, + "step": 2714 + }, + { + "epoch": 0.14637696786715548, + "grad_norm": 0.861070454120636, + "learning_rate": 9.969295960464627e-06, + "loss": 0.7235, + "step": 2715 + }, + { + "epoch": 0.14643088203579901, + "grad_norm": 0.9375396370887756, + "learning_rate": 9.969272497048225e-06, + "loss": 0.9169, + "step": 2716 + }, + { + "epoch": 0.14648479620444252, + "grad_norm": 0.8180664777755737, + "learning_rate": 9.969249024697741e-06, + "loss": 0.8109, + "step": 2717 + }, + { + "epoch": 0.14653871037308605, + "grad_norm": 0.8574398159980774, + "learning_rate": 9.969225543413218e-06, + "loss": 0.767, + "step": 2718 + }, + { + "epoch": 0.14659262454172955, + "grad_norm": 1.0249319076538086, + "learning_rate": 9.969202053194697e-06, + "loss": 0.902, + "step": 2719 + }, + { + "epoch": 0.1466465387103731, + "grad_norm": 0.8045467734336853, + "learning_rate": 9.96917855404222e-06, + "loss": 0.7797, + "step": 2720 + }, + { + "epoch": 0.14670045287901662, + "grad_norm": 0.880533754825592, + "learning_rate": 9.969155045955831e-06, + "loss": 0.8071, + "step": 2721 + }, + { + "epoch": 0.14675436704766012, + "grad_norm": 0.8733983635902405, + "learning_rate": 9.969131528935572e-06, + "loss": 0.8309, + "step": 2722 + }, + { + "epoch": 0.14680828121630365, + "grad_norm": 0.8205264210700989, + "learning_rate": 9.969108002981484e-06, + "loss": 0.8126, + "step": 2723 + }, + { + "epoch": 0.14686219538494716, + "grad_norm": 0.8250916600227356, + "learning_rate": 9.96908446809361e-06, + "loss": 0.7488, + 
"step": 2724 + }, + { + "epoch": 0.1469161095535907, + "grad_norm": 0.8082099556922913, + "learning_rate": 9.969060924271994e-06, + "loss": 0.8039, + "step": 2725 + }, + { + "epoch": 0.1469700237222342, + "grad_norm": 0.8376840353012085, + "learning_rate": 9.969037371516674e-06, + "loss": 0.7603, + "step": 2726 + }, + { + "epoch": 0.14702393789087773, + "grad_norm": 1.2106066942214966, + "learning_rate": 9.969013809827697e-06, + "loss": 0.8187, + "step": 2727 + }, + { + "epoch": 0.14707785205952123, + "grad_norm": 0.8828561305999756, + "learning_rate": 9.968990239205103e-06, + "loss": 0.7249, + "step": 2728 + }, + { + "epoch": 0.14713176622816476, + "grad_norm": 0.8182427883148193, + "learning_rate": 9.968966659648935e-06, + "loss": 0.8353, + "step": 2729 + }, + { + "epoch": 0.1471856803968083, + "grad_norm": 0.8091077208518982, + "learning_rate": 9.968943071159234e-06, + "loss": 0.8261, + "step": 2730 + }, + { + "epoch": 0.1472395945654518, + "grad_norm": 0.9515360593795776, + "learning_rate": 9.968919473736043e-06, + "loss": 0.9099, + "step": 2731 + }, + { + "epoch": 0.14729350873409533, + "grad_norm": 0.7404700517654419, + "learning_rate": 9.968895867379407e-06, + "loss": 0.7793, + "step": 2732 + }, + { + "epoch": 0.14734742290273883, + "grad_norm": 0.7887243032455444, + "learning_rate": 9.968872252089365e-06, + "loss": 0.8749, + "step": 2733 + }, + { + "epoch": 0.14740133707138237, + "grad_norm": 1.1335293054580688, + "learning_rate": 9.968848627865962e-06, + "loss": 0.8428, + "step": 2734 + }, + { + "epoch": 0.14745525124002587, + "grad_norm": 0.787325382232666, + "learning_rate": 9.968824994709238e-06, + "loss": 0.8026, + "step": 2735 + }, + { + "epoch": 0.1475091654086694, + "grad_norm": 0.8006013035774231, + "learning_rate": 9.968801352619238e-06, + "loss": 0.9083, + "step": 2736 + }, + { + "epoch": 0.1475630795773129, + "grad_norm": 0.8923180103302002, + "learning_rate": 9.968777701596002e-06, + "loss": 0.8628, + "step": 2737 + }, + { + "epoch": 
0.14761699374595644, + "grad_norm": 0.798041582107544, + "learning_rate": 9.968754041639573e-06, + "loss": 0.7519, + "step": 2738 + }, + { + "epoch": 0.14767090791459997, + "grad_norm": 0.8984145522117615, + "learning_rate": 9.968730372749996e-06, + "loss": 0.7624, + "step": 2739 + }, + { + "epoch": 0.14772482208324347, + "grad_norm": 0.8182528018951416, + "learning_rate": 9.968706694927312e-06, + "loss": 0.8442, + "step": 2740 + }, + { + "epoch": 0.147778736251887, + "grad_norm": 0.8047756552696228, + "learning_rate": 9.968683008171562e-06, + "loss": 0.847, + "step": 2741 + }, + { + "epoch": 0.1478326504205305, + "grad_norm": 0.7935258150100708, + "learning_rate": 9.968659312482792e-06, + "loss": 0.8072, + "step": 2742 + }, + { + "epoch": 0.14788656458917404, + "grad_norm": 0.8043146729469299, + "learning_rate": 9.968635607861042e-06, + "loss": 0.7769, + "step": 2743 + }, + { + "epoch": 0.14794047875781755, + "grad_norm": 0.7826459407806396, + "learning_rate": 9.968611894306356e-06, + "loss": 0.8418, + "step": 2744 + }, + { + "epoch": 0.14799439292646108, + "grad_norm": 0.9293491244316101, + "learning_rate": 9.968588171818775e-06, + "loss": 0.8704, + "step": 2745 + }, + { + "epoch": 0.14804830709510458, + "grad_norm": 0.8281397223472595, + "learning_rate": 9.968564440398343e-06, + "loss": 0.9288, + "step": 2746 + }, + { + "epoch": 0.1481022212637481, + "grad_norm": 0.8558036684989929, + "learning_rate": 9.968540700045101e-06, + "loss": 0.8406, + "step": 2747 + }, + { + "epoch": 0.14815613543239164, + "grad_norm": 0.8167025446891785, + "learning_rate": 9.968516950759096e-06, + "loss": 0.8268, + "step": 2748 + }, + { + "epoch": 0.14821004960103515, + "grad_norm": 0.8612670302391052, + "learning_rate": 9.968493192540364e-06, + "loss": 0.8265, + "step": 2749 + }, + { + "epoch": 0.14826396376967868, + "grad_norm": 0.9208493232727051, + "learning_rate": 9.968469425388953e-06, + "loss": 0.8555, + "step": 2750 + }, + { + "epoch": 0.14831787793832218, + "grad_norm": 
0.756591260433197, + "learning_rate": 9.968445649304904e-06, + "loss": 0.7655, + "step": 2751 + }, + { + "epoch": 0.14837179210696572, + "grad_norm": 0.8566586375236511, + "learning_rate": 9.96842186428826e-06, + "loss": 0.8125, + "step": 2752 + }, + { + "epoch": 0.14842570627560922, + "grad_norm": 0.7984357476234436, + "learning_rate": 9.968398070339063e-06, + "loss": 0.7307, + "step": 2753 + }, + { + "epoch": 0.14847962044425275, + "grad_norm": 0.8943261504173279, + "learning_rate": 9.968374267457356e-06, + "loss": 0.757, + "step": 2754 + }, + { + "epoch": 0.14853353461289626, + "grad_norm": 0.9466004967689514, + "learning_rate": 9.968350455643184e-06, + "loss": 0.8271, + "step": 2755 + }, + { + "epoch": 0.1485874487815398, + "grad_norm": 0.7604812383651733, + "learning_rate": 9.968326634896585e-06, + "loss": 0.7654, + "step": 2756 + }, + { + "epoch": 0.14864136295018332, + "grad_norm": 0.7803215384483337, + "learning_rate": 9.968302805217609e-06, + "loss": 0.7691, + "step": 2757 + }, + { + "epoch": 0.14869527711882682, + "grad_norm": 0.8579596281051636, + "learning_rate": 9.96827896660629e-06, + "loss": 0.859, + "step": 2758 + }, + { + "epoch": 0.14874919128747036, + "grad_norm": 0.8205640316009521, + "learning_rate": 9.968255119062679e-06, + "loss": 0.8588, + "step": 2759 + }, + { + "epoch": 0.14880310545611386, + "grad_norm": 0.8601415753364563, + "learning_rate": 9.968231262586814e-06, + "loss": 0.8399, + "step": 2760 + }, + { + "epoch": 0.1488570196247574, + "grad_norm": 0.8827456831932068, + "learning_rate": 9.96820739717874e-06, + "loss": 0.8413, + "step": 2761 + }, + { + "epoch": 0.1489109337934009, + "grad_norm": 0.7422264218330383, + "learning_rate": 9.968183522838499e-06, + "loss": 0.7451, + "step": 2762 + }, + { + "epoch": 0.14896484796204443, + "grad_norm": 0.9764127135276794, + "learning_rate": 9.968159639566133e-06, + "loss": 0.8436, + "step": 2763 + }, + { + "epoch": 0.14901876213068793, + "grad_norm": 0.7435232400894165, + "learning_rate": 
9.968135747361687e-06, + "loss": 0.7553, + "step": 2764 + }, + { + "epoch": 0.14907267629933146, + "grad_norm": 0.7399751543998718, + "learning_rate": 9.968111846225202e-06, + "loss": 0.7695, + "step": 2765 + }, + { + "epoch": 0.149126590467975, + "grad_norm": 0.882901668548584, + "learning_rate": 9.968087936156722e-06, + "loss": 0.8418, + "step": 2766 + }, + { + "epoch": 0.1491805046366185, + "grad_norm": 0.840501606464386, + "learning_rate": 9.968064017156292e-06, + "loss": 0.83, + "step": 2767 + }, + { + "epoch": 0.14923441880526203, + "grad_norm": 0.9809413552284241, + "learning_rate": 9.96804008922395e-06, + "loss": 0.8029, + "step": 2768 + }, + { + "epoch": 0.14928833297390554, + "grad_norm": 0.7534085512161255, + "learning_rate": 9.968016152359744e-06, + "loss": 0.7201, + "step": 2769 + }, + { + "epoch": 0.14934224714254907, + "grad_norm": 0.813582718372345, + "learning_rate": 9.967992206563714e-06, + "loss": 0.8533, + "step": 2770 + }, + { + "epoch": 0.14939616131119257, + "grad_norm": 0.9827276468276978, + "learning_rate": 9.967968251835905e-06, + "loss": 0.8097, + "step": 2771 + }, + { + "epoch": 0.1494500754798361, + "grad_norm": 0.828959047794342, + "learning_rate": 9.967944288176359e-06, + "loss": 0.859, + "step": 2772 + }, + { + "epoch": 0.1495039896484796, + "grad_norm": 0.8123818039894104, + "learning_rate": 9.967920315585118e-06, + "loss": 0.7044, + "step": 2773 + }, + { + "epoch": 0.14955790381712314, + "grad_norm": 0.7503589987754822, + "learning_rate": 9.967896334062228e-06, + "loss": 0.7255, + "step": 2774 + }, + { + "epoch": 0.14961181798576667, + "grad_norm": 0.7414034605026245, + "learning_rate": 9.96787234360773e-06, + "loss": 0.7599, + "step": 2775 + }, + { + "epoch": 0.14966573215441017, + "grad_norm": 0.7467254400253296, + "learning_rate": 9.967848344221667e-06, + "loss": 0.6835, + "step": 2776 + }, + { + "epoch": 0.1497196463230537, + "grad_norm": 0.8653414249420166, + "learning_rate": 9.967824335904082e-06, + "loss": 0.8205, + "step": 
2777 + }, + { + "epoch": 0.1497735604916972, + "grad_norm": 0.9113380312919617, + "learning_rate": 9.96780031865502e-06, + "loss": 0.8758, + "step": 2778 + }, + { + "epoch": 0.14982747466034074, + "grad_norm": 0.8330965042114258, + "learning_rate": 9.967776292474523e-06, + "loss": 0.8696, + "step": 2779 + }, + { + "epoch": 0.14988138882898425, + "grad_norm": 0.9087555408477783, + "learning_rate": 9.967752257362633e-06, + "loss": 0.8381, + "step": 2780 + }, + { + "epoch": 0.14993530299762778, + "grad_norm": 0.856777548789978, + "learning_rate": 9.967728213319394e-06, + "loss": 0.8365, + "step": 2781 + }, + { + "epoch": 0.14998921716627128, + "grad_norm": 0.8314496874809265, + "learning_rate": 9.967704160344852e-06, + "loss": 0.7403, + "step": 2782 + }, + { + "epoch": 0.15004313133491481, + "grad_norm": 0.8357448577880859, + "learning_rate": 9.967680098439047e-06, + "loss": 0.8256, + "step": 2783 + }, + { + "epoch": 0.15009704550355835, + "grad_norm": 0.8366092443466187, + "learning_rate": 9.967656027602023e-06, + "loss": 0.8221, + "step": 2784 + }, + { + "epoch": 0.15015095967220185, + "grad_norm": 0.7944943904876709, + "learning_rate": 9.967631947833823e-06, + "loss": 0.813, + "step": 2785 + }, + { + "epoch": 0.15020487384084538, + "grad_norm": 0.8407523036003113, + "learning_rate": 9.967607859134492e-06, + "loss": 0.8237, + "step": 2786 + }, + { + "epoch": 0.1502587880094889, + "grad_norm": 0.7879778146743774, + "learning_rate": 9.967583761504071e-06, + "loss": 0.777, + "step": 2787 + }, + { + "epoch": 0.15031270217813242, + "grad_norm": 0.8307899832725525, + "learning_rate": 9.967559654942604e-06, + "loss": 0.8394, + "step": 2788 + }, + { + "epoch": 0.15036661634677592, + "grad_norm": 0.8068673610687256, + "learning_rate": 9.967535539450135e-06, + "loss": 0.8435, + "step": 2789 + }, + { + "epoch": 0.15042053051541945, + "grad_norm": 0.8473932147026062, + "learning_rate": 9.967511415026709e-06, + "loss": 0.8698, + "step": 2790 + }, + { + "epoch": 
0.15047444468406296, + "grad_norm": 0.8352688550949097, + "learning_rate": 9.967487281672365e-06, + "loss": 0.8617, + "step": 2791 + }, + { + "epoch": 0.1505283588527065, + "grad_norm": 0.7729620337486267, + "learning_rate": 9.96746313938715e-06, + "loss": 0.779, + "step": 2792 + }, + { + "epoch": 0.15058227302135002, + "grad_norm": 0.8704085946083069, + "learning_rate": 9.967438988171106e-06, + "loss": 0.833, + "step": 2793 + }, + { + "epoch": 0.15063618718999353, + "grad_norm": 0.7538182735443115, + "learning_rate": 9.967414828024276e-06, + "loss": 0.7479, + "step": 2794 + }, + { + "epoch": 0.15069010135863706, + "grad_norm": 0.7672195434570312, + "learning_rate": 9.967390658946704e-06, + "loss": 0.7778, + "step": 2795 + }, + { + "epoch": 0.15074401552728056, + "grad_norm": 0.8245819211006165, + "learning_rate": 9.967366480938435e-06, + "loss": 0.6898, + "step": 2796 + }, + { + "epoch": 0.1507979296959241, + "grad_norm": 0.8197571635246277, + "learning_rate": 9.967342293999512e-06, + "loss": 0.8714, + "step": 2797 + }, + { + "epoch": 0.1508518438645676, + "grad_norm": 0.8135389685630798, + "learning_rate": 9.967318098129974e-06, + "loss": 0.8906, + "step": 2798 + }, + { + "epoch": 0.15090575803321113, + "grad_norm": 0.7287562489509583, + "learning_rate": 9.96729389332987e-06, + "loss": 0.7834, + "step": 2799 + }, + { + "epoch": 0.15095967220185466, + "grad_norm": 0.8642309904098511, + "learning_rate": 9.967269679599242e-06, + "loss": 0.7912, + "step": 2800 + }, + { + "epoch": 0.15101358637049817, + "grad_norm": 0.886060893535614, + "learning_rate": 9.967245456938132e-06, + "loss": 0.8614, + "step": 2801 + }, + { + "epoch": 0.1510675005391417, + "grad_norm": 0.8505488038063049, + "learning_rate": 9.967221225346584e-06, + "loss": 0.8323, + "step": 2802 + }, + { + "epoch": 0.1511214147077852, + "grad_norm": 0.8862965703010559, + "learning_rate": 9.967196984824644e-06, + "loss": 0.8292, + "step": 2803 + }, + { + "epoch": 0.15117532887642873, + "grad_norm": 
0.8016111254692078, + "learning_rate": 9.967172735372353e-06, + "loss": 0.643, + "step": 2804 + }, + { + "epoch": 0.15122924304507224, + "grad_norm": 0.7599527835845947, + "learning_rate": 9.967148476989755e-06, + "loss": 0.8166, + "step": 2805 + }, + { + "epoch": 0.15128315721371577, + "grad_norm": 0.9574166536331177, + "learning_rate": 9.967124209676894e-06, + "loss": 0.8867, + "step": 2806 + }, + { + "epoch": 0.15133707138235927, + "grad_norm": 0.8384936451911926, + "learning_rate": 9.967099933433815e-06, + "loss": 0.9021, + "step": 2807 + }, + { + "epoch": 0.1513909855510028, + "grad_norm": 0.7779715061187744, + "learning_rate": 9.967075648260559e-06, + "loss": 0.7672, + "step": 2808 + }, + { + "epoch": 0.15144489971964634, + "grad_norm": 0.7783359885215759, + "learning_rate": 9.96705135415717e-06, + "loss": 0.8012, + "step": 2809 + }, + { + "epoch": 0.15149881388828984, + "grad_norm": 0.9124150276184082, + "learning_rate": 9.967027051123695e-06, + "loss": 0.8803, + "step": 2810 + }, + { + "epoch": 0.15155272805693337, + "grad_norm": 0.8135334849357605, + "learning_rate": 9.967002739160173e-06, + "loss": 0.7764, + "step": 2811 + }, + { + "epoch": 0.15160664222557688, + "grad_norm": 0.8082837462425232, + "learning_rate": 9.966978418266651e-06, + "loss": 0.8552, + "step": 2812 + }, + { + "epoch": 0.1516605563942204, + "grad_norm": 0.7978013753890991, + "learning_rate": 9.966954088443171e-06, + "loss": 0.7321, + "step": 2813 + }, + { + "epoch": 0.1517144705628639, + "grad_norm": 0.7845378518104553, + "learning_rate": 9.966929749689778e-06, + "loss": 0.7694, + "step": 2814 + }, + { + "epoch": 0.15176838473150744, + "grad_norm": 0.8671941161155701, + "learning_rate": 9.966905402006516e-06, + "loss": 0.886, + "step": 2815 + }, + { + "epoch": 0.15182229890015095, + "grad_norm": 0.8316017389297485, + "learning_rate": 9.966881045393426e-06, + "loss": 0.8844, + "step": 2816 + }, + { + "epoch": 0.15187621306879448, + "grad_norm": 0.7372319102287292, + "learning_rate": 
9.966856679850554e-06, + "loss": 0.739, + "step": 2817 + }, + { + "epoch": 0.151930127237438, + "grad_norm": 0.7547122240066528, + "learning_rate": 9.966832305377944e-06, + "loss": 0.7518, + "step": 2818 + }, + { + "epoch": 0.15198404140608152, + "grad_norm": 0.8701632022857666, + "learning_rate": 9.96680792197564e-06, + "loss": 0.8632, + "step": 2819 + }, + { + "epoch": 0.15203795557472505, + "grad_norm": 0.7842714786529541, + "learning_rate": 9.966783529643686e-06, + "loss": 0.8161, + "step": 2820 + }, + { + "epoch": 0.15209186974336855, + "grad_norm": 0.858406126499176, + "learning_rate": 9.966759128382125e-06, + "loss": 0.7742, + "step": 2821 + }, + { + "epoch": 0.15214578391201208, + "grad_norm": 1.02357816696167, + "learning_rate": 9.966734718190998e-06, + "loss": 0.9142, + "step": 2822 + }, + { + "epoch": 0.1521996980806556, + "grad_norm": 0.81562739610672, + "learning_rate": 9.966710299070355e-06, + "loss": 0.8426, + "step": 2823 + }, + { + "epoch": 0.15225361224929912, + "grad_norm": 0.8576202988624573, + "learning_rate": 9.966685871020236e-06, + "loss": 0.7546, + "step": 2824 + }, + { + "epoch": 0.15230752641794262, + "grad_norm": 0.8974374532699585, + "learning_rate": 9.966661434040684e-06, + "loss": 0.7236, + "step": 2825 + }, + { + "epoch": 0.15236144058658616, + "grad_norm": 0.7306199073791504, + "learning_rate": 9.966636988131745e-06, + "loss": 0.7581, + "step": 2826 + }, + { + "epoch": 0.1524153547552297, + "grad_norm": 0.9296971559524536, + "learning_rate": 9.966612533293465e-06, + "loss": 0.9214, + "step": 2827 + }, + { + "epoch": 0.1524692689238732, + "grad_norm": 1.029969573020935, + "learning_rate": 9.966588069525885e-06, + "loss": 0.8371, + "step": 2828 + }, + { + "epoch": 0.15252318309251672, + "grad_norm": 0.869320809841156, + "learning_rate": 9.966563596829046e-06, + "loss": 0.6396, + "step": 2829 + }, + { + "epoch": 0.15257709726116023, + "grad_norm": 0.8893983960151672, + "learning_rate": 9.966539115202998e-06, + "loss": 0.8423, + "step": 
2830 + }, + { + "epoch": 0.15263101142980376, + "grad_norm": 0.823639452457428, + "learning_rate": 9.966514624647783e-06, + "loss": 0.7924, + "step": 2831 + }, + { + "epoch": 0.15268492559844726, + "grad_norm": 0.805551290512085, + "learning_rate": 9.966490125163444e-06, + "loss": 0.8091, + "step": 2832 + }, + { + "epoch": 0.1527388397670908, + "grad_norm": 0.9040341377258301, + "learning_rate": 9.966465616750025e-06, + "loss": 0.8924, + "step": 2833 + }, + { + "epoch": 0.1527927539357343, + "grad_norm": 0.8297836780548096, + "learning_rate": 9.966441099407572e-06, + "loss": 0.7538, + "step": 2834 + }, + { + "epoch": 0.15284666810437783, + "grad_norm": 0.8824244141578674, + "learning_rate": 9.966416573136127e-06, + "loss": 0.8892, + "step": 2835 + }, + { + "epoch": 0.15290058227302136, + "grad_norm": 1.0663546323776245, + "learning_rate": 9.966392037935734e-06, + "loss": 0.7809, + "step": 2836 + }, + { + "epoch": 0.15295449644166487, + "grad_norm": 0.8324514627456665, + "learning_rate": 9.966367493806439e-06, + "loss": 0.8308, + "step": 2837 + }, + { + "epoch": 0.1530084106103084, + "grad_norm": 0.7742459177970886, + "learning_rate": 9.966342940748286e-06, + "loss": 0.8269, + "step": 2838 + }, + { + "epoch": 0.1530623247789519, + "grad_norm": 0.9513984322547913, + "learning_rate": 9.966318378761317e-06, + "loss": 0.8538, + "step": 2839 + }, + { + "epoch": 0.15311623894759543, + "grad_norm": 0.8030692934989929, + "learning_rate": 9.966293807845577e-06, + "loss": 0.7752, + "step": 2840 + }, + { + "epoch": 0.15317015311623894, + "grad_norm": 0.8903285264968872, + "learning_rate": 9.966269228001112e-06, + "loss": 0.8556, + "step": 2841 + }, + { + "epoch": 0.15322406728488247, + "grad_norm": 0.8221173286437988, + "learning_rate": 9.966244639227962e-06, + "loss": 0.7249, + "step": 2842 + }, + { + "epoch": 0.15327798145352597, + "grad_norm": 0.9883365035057068, + "learning_rate": 9.966220041526176e-06, + "loss": 0.961, + "step": 2843 + }, + { + "epoch": 
0.1533318956221695, + "grad_norm": 0.8654862642288208, + "learning_rate": 9.966195434895796e-06, + "loss": 0.7779, + "step": 2844 + }, + { + "epoch": 0.15338580979081304, + "grad_norm": 0.7924084663391113, + "learning_rate": 9.966170819336866e-06, + "loss": 0.7706, + "step": 2845 + }, + { + "epoch": 0.15343972395945654, + "grad_norm": 0.8227209448814392, + "learning_rate": 9.96614619484943e-06, + "loss": 0.8659, + "step": 2846 + }, + { + "epoch": 0.15349363812810007, + "grad_norm": 0.9436708688735962, + "learning_rate": 9.966121561433534e-06, + "loss": 0.87, + "step": 2847 + }, + { + "epoch": 0.15354755229674358, + "grad_norm": 1.137171983718872, + "learning_rate": 9.96609691908922e-06, + "loss": 0.7883, + "step": 2848 + }, + { + "epoch": 0.1536014664653871, + "grad_norm": 0.8868550658226013, + "learning_rate": 9.966072267816535e-06, + "loss": 0.8309, + "step": 2849 + }, + { + "epoch": 0.1536553806340306, + "grad_norm": 0.7190971970558167, + "learning_rate": 9.966047607615521e-06, + "loss": 0.6938, + "step": 2850 + }, + { + "epoch": 0.15370929480267415, + "grad_norm": 0.883866548538208, + "learning_rate": 9.966022938486223e-06, + "loss": 0.8368, + "step": 2851 + }, + { + "epoch": 0.15376320897131765, + "grad_norm": 0.9433422684669495, + "learning_rate": 9.965998260428686e-06, + "loss": 0.7739, + "step": 2852 + }, + { + "epoch": 0.15381712313996118, + "grad_norm": 0.9166012406349182, + "learning_rate": 9.965973573442956e-06, + "loss": 0.8308, + "step": 2853 + }, + { + "epoch": 0.1538710373086047, + "grad_norm": 0.8955514430999756, + "learning_rate": 9.965948877529071e-06, + "loss": 0.8403, + "step": 2854 + }, + { + "epoch": 0.15392495147724822, + "grad_norm": 0.8281451463699341, + "learning_rate": 9.965924172687083e-06, + "loss": 0.8127, + "step": 2855 + }, + { + "epoch": 0.15397886564589175, + "grad_norm": 0.8765435218811035, + "learning_rate": 9.965899458917031e-06, + "loss": 0.87, + "step": 2856 + }, + { + "epoch": 0.15403277981453525, + "grad_norm": 
0.9525101780891418, + "learning_rate": 9.965874736218964e-06, + "loss": 0.8665, + "step": 2857 + }, + { + "epoch": 0.15408669398317879, + "grad_norm": 0.7836191654205322, + "learning_rate": 9.965850004592921e-06, + "loss": 0.8261, + "step": 2858 + }, + { + "epoch": 0.1541406081518223, + "grad_norm": 0.7918692827224731, + "learning_rate": 9.96582526403895e-06, + "loss": 0.8422, + "step": 2859 + }, + { + "epoch": 0.15419452232046582, + "grad_norm": 0.8489586710929871, + "learning_rate": 9.965800514557096e-06, + "loss": 0.8871, + "step": 2860 + }, + { + "epoch": 0.15424843648910933, + "grad_norm": 0.9581596255302429, + "learning_rate": 9.965775756147402e-06, + "loss": 0.9346, + "step": 2861 + }, + { + "epoch": 0.15430235065775286, + "grad_norm": 1.0253969430923462, + "learning_rate": 9.965750988809913e-06, + "loss": 0.8381, + "step": 2862 + }, + { + "epoch": 0.1543562648263964, + "grad_norm": 0.8403491377830505, + "learning_rate": 9.965726212544674e-06, + "loss": 0.8307, + "step": 2863 + }, + { + "epoch": 0.1544101789950399, + "grad_norm": 0.729560375213623, + "learning_rate": 9.965701427351728e-06, + "loss": 0.8021, + "step": 2864 + }, + { + "epoch": 0.15446409316368342, + "grad_norm": 0.7576143741607666, + "learning_rate": 9.965676633231121e-06, + "loss": 0.7896, + "step": 2865 + }, + { + "epoch": 0.15451800733232693, + "grad_norm": 1.100948452949524, + "learning_rate": 9.965651830182898e-06, + "loss": 0.797, + "step": 2866 + }, + { + "epoch": 0.15457192150097046, + "grad_norm": 1.0760526657104492, + "learning_rate": 9.965627018207102e-06, + "loss": 0.7875, + "step": 2867 + }, + { + "epoch": 0.15462583566961396, + "grad_norm": 0.8553655743598938, + "learning_rate": 9.96560219730378e-06, + "loss": 0.872, + "step": 2868 + }, + { + "epoch": 0.1546797498382575, + "grad_norm": 1.1357450485229492, + "learning_rate": 9.965577367472971e-06, + "loss": 0.7306, + "step": 2869 + }, + { + "epoch": 0.154733664006901, + "grad_norm": 0.8308514952659607, + "learning_rate": 
9.965552528714725e-06, + "loss": 0.8106, + "step": 2870 + }, + { + "epoch": 0.15478757817554453, + "grad_norm": 0.8406074047088623, + "learning_rate": 9.965527681029088e-06, + "loss": 0.9085, + "step": 2871 + }, + { + "epoch": 0.15484149234418806, + "grad_norm": 0.8215218186378479, + "learning_rate": 9.9655028244161e-06, + "loss": 0.733, + "step": 2872 + }, + { + "epoch": 0.15489540651283157, + "grad_norm": 1.0004653930664062, + "learning_rate": 9.965477958875806e-06, + "loss": 0.8625, + "step": 2873 + }, + { + "epoch": 0.1549493206814751, + "grad_norm": 0.8359742760658264, + "learning_rate": 9.965453084408256e-06, + "loss": 0.7847, + "step": 2874 + }, + { + "epoch": 0.1550032348501186, + "grad_norm": 1.0257774591445923, + "learning_rate": 9.965428201013488e-06, + "loss": 0.8654, + "step": 2875 + }, + { + "epoch": 0.15505714901876214, + "grad_norm": 0.7931713461875916, + "learning_rate": 9.96540330869155e-06, + "loss": 0.7498, + "step": 2876 + }, + { + "epoch": 0.15511106318740564, + "grad_norm": 0.7873162031173706, + "learning_rate": 9.965378407442488e-06, + "loss": 0.7617, + "step": 2877 + }, + { + "epoch": 0.15516497735604917, + "grad_norm": 0.8008442521095276, + "learning_rate": 9.965353497266346e-06, + "loss": 0.8464, + "step": 2878 + }, + { + "epoch": 0.15521889152469268, + "grad_norm": 0.798004686832428, + "learning_rate": 9.965328578163166e-06, + "loss": 0.8519, + "step": 2879 + }, + { + "epoch": 0.1552728056933362, + "grad_norm": 0.8730151057243347, + "learning_rate": 9.965303650132996e-06, + "loss": 0.8257, + "step": 2880 + }, + { + "epoch": 0.15532671986197974, + "grad_norm": 0.7465460896492004, + "learning_rate": 9.965278713175879e-06, + "loss": 0.7786, + "step": 2881 + }, + { + "epoch": 0.15538063403062324, + "grad_norm": 0.9565917253494263, + "learning_rate": 9.96525376729186e-06, + "loss": 0.8694, + "step": 2882 + }, + { + "epoch": 0.15543454819926678, + "grad_norm": 0.880181074142456, + "learning_rate": 9.965228812480987e-06, + "loss": 0.813, + 
"step": 2883 + }, + { + "epoch": 0.15548846236791028, + "grad_norm": 0.7912368774414062, + "learning_rate": 9.965203848743299e-06, + "loss": 0.7764, + "step": 2884 + }, + { + "epoch": 0.1555423765365538, + "grad_norm": 0.8370791077613831, + "learning_rate": 9.965178876078846e-06, + "loss": 0.8591, + "step": 2885 + }, + { + "epoch": 0.15559629070519732, + "grad_norm": 0.8508057594299316, + "learning_rate": 9.965153894487672e-06, + "loss": 0.8535, + "step": 2886 + }, + { + "epoch": 0.15565020487384085, + "grad_norm": 1.0393366813659668, + "learning_rate": 9.965128903969818e-06, + "loss": 0.8032, + "step": 2887 + }, + { + "epoch": 0.15570411904248435, + "grad_norm": 0.7545601725578308, + "learning_rate": 9.965103904525334e-06, + "loss": 0.7024, + "step": 2888 + }, + { + "epoch": 0.15575803321112788, + "grad_norm": 0.7933251261711121, + "learning_rate": 9.965078896154262e-06, + "loss": 0.8325, + "step": 2889 + }, + { + "epoch": 0.15581194737977141, + "grad_norm": 0.8319270610809326, + "learning_rate": 9.965053878856648e-06, + "loss": 0.7781, + "step": 2890 + }, + { + "epoch": 0.15586586154841492, + "grad_norm": 1.0789637565612793, + "learning_rate": 9.965028852632537e-06, + "loss": 0.7931, + "step": 2891 + }, + { + "epoch": 0.15591977571705845, + "grad_norm": 0.9561448097229004, + "learning_rate": 9.965003817481974e-06, + "loss": 0.7472, + "step": 2892 + }, + { + "epoch": 0.15597368988570195, + "grad_norm": 0.9099969267845154, + "learning_rate": 9.964978773405003e-06, + "loss": 0.9154, + "step": 2893 + }, + { + "epoch": 0.1560276040543455, + "grad_norm": 0.9164708852767944, + "learning_rate": 9.96495372040167e-06, + "loss": 0.8552, + "step": 2894 + }, + { + "epoch": 0.156081518222989, + "grad_norm": 0.9367608428001404, + "learning_rate": 9.96492865847202e-06, + "loss": 0.7926, + "step": 2895 + }, + { + "epoch": 0.15613543239163252, + "grad_norm": 0.8970937728881836, + "learning_rate": 9.9649035876161e-06, + "loss": 0.8798, + "step": 2896 + }, + { + "epoch": 
0.15618934656027603, + "grad_norm": 0.8037889003753662, + "learning_rate": 9.96487850783395e-06, + "loss": 0.8157, + "step": 2897 + }, + { + "epoch": 0.15624326072891956, + "grad_norm": 0.906944215297699, + "learning_rate": 9.964853419125619e-06, + "loss": 0.8191, + "step": 2898 + }, + { + "epoch": 0.1562971748975631, + "grad_norm": 0.8197054266929626, + "learning_rate": 9.964828321491152e-06, + "loss": 0.7899, + "step": 2899 + }, + { + "epoch": 0.1563510890662066, + "grad_norm": 0.7816088795661926, + "learning_rate": 9.96480321493059e-06, + "loss": 0.8113, + "step": 2900 + }, + { + "epoch": 0.15640500323485013, + "grad_norm": 0.8319717645645142, + "learning_rate": 9.964778099443985e-06, + "loss": 0.7835, + "step": 2901 + }, + { + "epoch": 0.15645891740349363, + "grad_norm": 0.7739672660827637, + "learning_rate": 9.964752975031378e-06, + "loss": 0.7813, + "step": 2902 + }, + { + "epoch": 0.15651283157213716, + "grad_norm": 0.8002716898918152, + "learning_rate": 9.964727841692815e-06, + "loss": 0.7971, + "step": 2903 + }, + { + "epoch": 0.15656674574078067, + "grad_norm": 0.8796008229255676, + "learning_rate": 9.964702699428339e-06, + "loss": 0.7462, + "step": 2904 + }, + { + "epoch": 0.1566206599094242, + "grad_norm": 0.837027907371521, + "learning_rate": 9.964677548237998e-06, + "loss": 0.864, + "step": 2905 + }, + { + "epoch": 0.15667457407806773, + "grad_norm": 0.9098290205001831, + "learning_rate": 9.964652388121837e-06, + "loss": 0.9079, + "step": 2906 + }, + { + "epoch": 0.15672848824671123, + "grad_norm": 0.7707619071006775, + "learning_rate": 9.964627219079898e-06, + "loss": 0.7472, + "step": 2907 + }, + { + "epoch": 0.15678240241535477, + "grad_norm": 1.0109550952911377, + "learning_rate": 9.964602041112233e-06, + "loss": 0.8981, + "step": 2908 + }, + { + "epoch": 0.15683631658399827, + "grad_norm": 0.8410045504570007, + "learning_rate": 9.964576854218882e-06, + "loss": 0.8488, + "step": 2909 + }, + { + "epoch": 0.1568902307526418, + "grad_norm": 
0.8624899983406067, + "learning_rate": 9.96455165839989e-06, + "loss": 0.817, + "step": 2910 + }, + { + "epoch": 0.1569441449212853, + "grad_norm": 0.9060286283493042, + "learning_rate": 9.964526453655304e-06, + "loss": 0.8171, + "step": 2911 + }, + { + "epoch": 0.15699805908992884, + "grad_norm": 0.7718086838722229, + "learning_rate": 9.96450123998517e-06, + "loss": 0.7158, + "step": 2912 + }, + { + "epoch": 0.15705197325857234, + "grad_norm": 0.8690425157546997, + "learning_rate": 9.96447601738953e-06, + "loss": 0.8347, + "step": 2913 + }, + { + "epoch": 0.15710588742721587, + "grad_norm": 0.782656192779541, + "learning_rate": 9.964450785868433e-06, + "loss": 0.7581, + "step": 2914 + }, + { + "epoch": 0.1571598015958594, + "grad_norm": 1.0090769529342651, + "learning_rate": 9.964425545421924e-06, + "loss": 0.8179, + "step": 2915 + }, + { + "epoch": 0.1572137157645029, + "grad_norm": 0.8786135911941528, + "learning_rate": 9.964400296050047e-06, + "loss": 0.8733, + "step": 2916 + }, + { + "epoch": 0.15726762993314644, + "grad_norm": 0.8163133859634399, + "learning_rate": 9.964375037752847e-06, + "loss": 0.8091, + "step": 2917 + }, + { + "epoch": 0.15732154410178995, + "grad_norm": 0.8213543891906738, + "learning_rate": 9.964349770530371e-06, + "loss": 0.7978, + "step": 2918 + }, + { + "epoch": 0.15737545827043348, + "grad_norm": 0.849274218082428, + "learning_rate": 9.964324494382663e-06, + "loss": 0.8168, + "step": 2919 + }, + { + "epoch": 0.15742937243907698, + "grad_norm": 0.8099618554115295, + "learning_rate": 9.964299209309769e-06, + "loss": 0.8372, + "step": 2920 + }, + { + "epoch": 0.1574832866077205, + "grad_norm": 0.9064434766769409, + "learning_rate": 9.964273915311734e-06, + "loss": 0.8681, + "step": 2921 + }, + { + "epoch": 0.15753720077636402, + "grad_norm": 0.7269558310508728, + "learning_rate": 9.964248612388607e-06, + "loss": 0.7179, + "step": 2922 + }, + { + "epoch": 0.15759111494500755, + "grad_norm": 0.8115706443786621, + "learning_rate": 
9.964223300540427e-06, + "loss": 0.8572, + "step": 2923 + }, + { + "epoch": 0.15764502911365108, + "grad_norm": 0.8180872797966003, + "learning_rate": 9.964197979767246e-06, + "loss": 0.7463, + "step": 2924 + }, + { + "epoch": 0.15769894328229458, + "grad_norm": 0.741603434085846, + "learning_rate": 9.964172650069105e-06, + "loss": 0.7646, + "step": 2925 + }, + { + "epoch": 0.15775285745093812, + "grad_norm": 0.7558543682098389, + "learning_rate": 9.964147311446051e-06, + "loss": 0.7363, + "step": 2926 + }, + { + "epoch": 0.15780677161958162, + "grad_norm": 0.8128615617752075, + "learning_rate": 9.96412196389813e-06, + "loss": 0.8515, + "step": 2927 + }, + { + "epoch": 0.15786068578822515, + "grad_norm": 0.9731131196022034, + "learning_rate": 9.964096607425388e-06, + "loss": 0.8847, + "step": 2928 + }, + { + "epoch": 0.15791459995686866, + "grad_norm": 1.136883020401001, + "learning_rate": 9.964071242027868e-06, + "loss": 0.8457, + "step": 2929 + }, + { + "epoch": 0.1579685141255122, + "grad_norm": 0.7780461311340332, + "learning_rate": 9.964045867705618e-06, + "loss": 0.737, + "step": 2930 + }, + { + "epoch": 0.1580224282941557, + "grad_norm": 0.801013708114624, + "learning_rate": 9.964020484458684e-06, + "loss": 0.8164, + "step": 2931 + }, + { + "epoch": 0.15807634246279922, + "grad_norm": 0.8851730823516846, + "learning_rate": 9.96399509228711e-06, + "loss": 0.8762, + "step": 2932 + }, + { + "epoch": 0.15813025663144276, + "grad_norm": 0.9501338005065918, + "learning_rate": 9.963969691190942e-06, + "loss": 0.7788, + "step": 2933 + }, + { + "epoch": 0.15818417080008626, + "grad_norm": 0.9714099168777466, + "learning_rate": 9.963944281170227e-06, + "loss": 0.9207, + "step": 2934 + }, + { + "epoch": 0.1582380849687298, + "grad_norm": 0.764689564704895, + "learning_rate": 9.963918862225009e-06, + "loss": 0.737, + "step": 2935 + }, + { + "epoch": 0.1582919991373733, + "grad_norm": 1.1618343591690063, + "learning_rate": 9.963893434355335e-06, + "loss": 0.8055, + 
"step": 2936 + }, + { + "epoch": 0.15834591330601683, + "grad_norm": 0.8724596500396729, + "learning_rate": 9.96386799756125e-06, + "loss": 0.8449, + "step": 2937 + }, + { + "epoch": 0.15839982747466033, + "grad_norm": 0.7769358158111572, + "learning_rate": 9.963842551842798e-06, + "loss": 0.8155, + "step": 2938 + }, + { + "epoch": 0.15845374164330386, + "grad_norm": 0.8337542414665222, + "learning_rate": 9.963817097200028e-06, + "loss": 0.7331, + "step": 2939 + }, + { + "epoch": 0.15850765581194737, + "grad_norm": 0.8240610957145691, + "learning_rate": 9.963791633632984e-06, + "loss": 0.8076, + "step": 2940 + }, + { + "epoch": 0.1585615699805909, + "grad_norm": 0.7781216502189636, + "learning_rate": 9.963766161141713e-06, + "loss": 0.7274, + "step": 2941 + }, + { + "epoch": 0.15861548414923443, + "grad_norm": 0.8469343781471252, + "learning_rate": 9.96374067972626e-06, + "loss": 0.8364, + "step": 2942 + }, + { + "epoch": 0.15866939831787794, + "grad_norm": 0.7859261631965637, + "learning_rate": 9.963715189386669e-06, + "loss": 0.8006, + "step": 2943 + }, + { + "epoch": 0.15872331248652147, + "grad_norm": 0.8646130561828613, + "learning_rate": 9.963689690122988e-06, + "loss": 0.808, + "step": 2944 + }, + { + "epoch": 0.15877722665516497, + "grad_norm": 0.8905766010284424, + "learning_rate": 9.963664181935263e-06, + "loss": 0.8406, + "step": 2945 + }, + { + "epoch": 0.1588311408238085, + "grad_norm": 0.8756605982780457, + "learning_rate": 9.963638664823539e-06, + "loss": 0.8643, + "step": 2946 + }, + { + "epoch": 0.158885054992452, + "grad_norm": 0.899135410785675, + "learning_rate": 9.963613138787862e-06, + "loss": 0.9063, + "step": 2947 + }, + { + "epoch": 0.15893896916109554, + "grad_norm": 0.8382771015167236, + "learning_rate": 9.96358760382828e-06, + "loss": 0.8004, + "step": 2948 + }, + { + "epoch": 0.15899288332973904, + "grad_norm": 0.7687328457832336, + "learning_rate": 9.963562059944833e-06, + "loss": 0.7695, + "step": 2949 + }, + { + "epoch": 
0.15904679749838257, + "grad_norm": 0.807344913482666, + "learning_rate": 9.963536507137574e-06, + "loss": 0.7514, + "step": 2950 + }, + { + "epoch": 0.1591007116670261, + "grad_norm": 0.7882648706436157, + "learning_rate": 9.963510945406545e-06, + "loss": 0.7537, + "step": 2951 + }, + { + "epoch": 0.1591546258356696, + "grad_norm": 0.8422887921333313, + "learning_rate": 9.963485374751793e-06, + "loss": 0.7937, + "step": 2952 + }, + { + "epoch": 0.15920854000431314, + "grad_norm": 0.7578607797622681, + "learning_rate": 9.963459795173362e-06, + "loss": 0.8071, + "step": 2953 + }, + { + "epoch": 0.15926245417295665, + "grad_norm": 0.8854062557220459, + "learning_rate": 9.963434206671302e-06, + "loss": 0.9078, + "step": 2954 + }, + { + "epoch": 0.15931636834160018, + "grad_norm": 0.8705536723136902, + "learning_rate": 9.963408609245654e-06, + "loss": 0.7971, + "step": 2955 + }, + { + "epoch": 0.15937028251024368, + "grad_norm": 0.8247761726379395, + "learning_rate": 9.96338300289647e-06, + "loss": 0.7889, + "step": 2956 + }, + { + "epoch": 0.15942419667888721, + "grad_norm": 0.8216410279273987, + "learning_rate": 9.96335738762379e-06, + "loss": 0.9097, + "step": 2957 + }, + { + "epoch": 0.15947811084753072, + "grad_norm": 0.9624109268188477, + "learning_rate": 9.963331763427666e-06, + "loss": 0.8562, + "step": 2958 + }, + { + "epoch": 0.15953202501617425, + "grad_norm": 0.8426920175552368, + "learning_rate": 9.96330613030814e-06, + "loss": 0.8011, + "step": 2959 + }, + { + "epoch": 0.15958593918481778, + "grad_norm": 0.8987439870834351, + "learning_rate": 9.963280488265256e-06, + "loss": 0.7965, + "step": 2960 + }, + { + "epoch": 0.1596398533534613, + "grad_norm": 0.8105943202972412, + "learning_rate": 9.963254837299066e-06, + "loss": 0.8178, + "step": 2961 + }, + { + "epoch": 0.15969376752210482, + "grad_norm": 0.928841769695282, + "learning_rate": 9.963229177409612e-06, + "loss": 0.8106, + "step": 2962 + }, + { + "epoch": 0.15974768169074832, + "grad_norm": 
0.7369773983955383, + "learning_rate": 9.963203508596942e-06, + "loss": 0.7401, + "step": 2963 + }, + { + "epoch": 0.15980159585939185, + "grad_norm": 0.7476964592933655, + "learning_rate": 9.9631778308611e-06, + "loss": 0.8112, + "step": 2964 + }, + { + "epoch": 0.15985551002803536, + "grad_norm": 0.8257710337638855, + "learning_rate": 9.963152144202135e-06, + "loss": 0.8489, + "step": 2965 + }, + { + "epoch": 0.1599094241966789, + "grad_norm": 0.8324301242828369, + "learning_rate": 9.963126448620091e-06, + "loss": 0.8511, + "step": 2966 + }, + { + "epoch": 0.1599633383653224, + "grad_norm": 0.8221176266670227, + "learning_rate": 9.963100744115017e-06, + "loss": 0.7924, + "step": 2967 + }, + { + "epoch": 0.16001725253396593, + "grad_norm": 0.7942221164703369, + "learning_rate": 9.963075030686955e-06, + "loss": 0.7936, + "step": 2968 + }, + { + "epoch": 0.16007116670260946, + "grad_norm": 0.7341020107269287, + "learning_rate": 9.963049308335954e-06, + "loss": 0.7381, + "step": 2969 + }, + { + "epoch": 0.16012508087125296, + "grad_norm": 0.8118404746055603, + "learning_rate": 9.963023577062062e-06, + "loss": 0.756, + "step": 2970 + }, + { + "epoch": 0.1601789950398965, + "grad_norm": 0.7517318725585938, + "learning_rate": 9.96299783686532e-06, + "loss": 0.7051, + "step": 2971 + }, + { + "epoch": 0.16023290920854, + "grad_norm": 0.7982935905456543, + "learning_rate": 9.962972087745777e-06, + "loss": 0.8412, + "step": 2972 + }, + { + "epoch": 0.16028682337718353, + "grad_norm": 0.8397754430770874, + "learning_rate": 9.962946329703482e-06, + "loss": 0.8314, + "step": 2973 + }, + { + "epoch": 0.16034073754582703, + "grad_norm": 0.8342095613479614, + "learning_rate": 9.962920562738477e-06, + "loss": 0.7649, + "step": 2974 + }, + { + "epoch": 0.16039465171447057, + "grad_norm": 0.8053215742111206, + "learning_rate": 9.96289478685081e-06, + "loss": 0.7315, + "step": 2975 + }, + { + "epoch": 0.16044856588311407, + "grad_norm": 0.8931438326835632, + "learning_rate": 
9.962869002040529e-06, + "loss": 0.9241, + "step": 2976 + }, + { + "epoch": 0.1605024800517576, + "grad_norm": 0.8217912316322327, + "learning_rate": 9.962843208307677e-06, + "loss": 0.7551, + "step": 2977 + }, + { + "epoch": 0.16055639422040113, + "grad_norm": 0.7592090964317322, + "learning_rate": 9.962817405652305e-06, + "loss": 0.7243, + "step": 2978 + }, + { + "epoch": 0.16061030838904464, + "grad_norm": 0.8466029167175293, + "learning_rate": 9.962791594074455e-06, + "loss": 0.785, + "step": 2979 + }, + { + "epoch": 0.16066422255768817, + "grad_norm": 0.859207272529602, + "learning_rate": 9.962765773574174e-06, + "loss": 0.8344, + "step": 2980 + }, + { + "epoch": 0.16071813672633167, + "grad_norm": 0.8134403824806213, + "learning_rate": 9.962739944151511e-06, + "loss": 0.7595, + "step": 2981 + }, + { + "epoch": 0.1607720508949752, + "grad_norm": 0.7411110401153564, + "learning_rate": 9.962714105806511e-06, + "loss": 0.7751, + "step": 2982 + }, + { + "epoch": 0.1608259650636187, + "grad_norm": 0.7976831793785095, + "learning_rate": 9.962688258539219e-06, + "loss": 0.7353, + "step": 2983 + }, + { + "epoch": 0.16087987923226224, + "grad_norm": 0.8306836485862732, + "learning_rate": 9.962662402349684e-06, + "loss": 0.7903, + "step": 2984 + }, + { + "epoch": 0.16093379340090574, + "grad_norm": 0.794691264629364, + "learning_rate": 9.96263653723795e-06, + "loss": 0.7972, + "step": 2985 + }, + { + "epoch": 0.16098770756954928, + "grad_norm": 0.7471837401390076, + "learning_rate": 9.962610663204066e-06, + "loss": 0.7994, + "step": 2986 + }, + { + "epoch": 0.1610416217381928, + "grad_norm": 0.8046342134475708, + "learning_rate": 9.962584780248079e-06, + "loss": 0.7912, + "step": 2987 + }, + { + "epoch": 0.1610955359068363, + "grad_norm": 0.7935966849327087, + "learning_rate": 9.96255888837003e-06, + "loss": 0.8053, + "step": 2988 + }, + { + "epoch": 0.16114945007547984, + "grad_norm": 0.7403679490089417, + "learning_rate": 9.962532987569973e-06, + "loss": 0.6707, + 
"step": 2989 + }, + { + "epoch": 0.16120336424412335, + "grad_norm": 0.8277058005332947, + "learning_rate": 9.96250707784795e-06, + "loss": 0.8074, + "step": 2990 + }, + { + "epoch": 0.16125727841276688, + "grad_norm": 1.0225850343704224, + "learning_rate": 9.962481159204008e-06, + "loss": 0.8475, + "step": 2991 + }, + { + "epoch": 0.16131119258141038, + "grad_norm": 0.8091806769371033, + "learning_rate": 9.962455231638193e-06, + "loss": 0.7714, + "step": 2992 + }, + { + "epoch": 0.16136510675005392, + "grad_norm": 0.7496880292892456, + "learning_rate": 9.962429295150554e-06, + "loss": 0.7449, + "step": 2993 + }, + { + "epoch": 0.16141902091869742, + "grad_norm": 0.7799220085144043, + "learning_rate": 9.962403349741137e-06, + "loss": 0.7241, + "step": 2994 + }, + { + "epoch": 0.16147293508734095, + "grad_norm": 0.92058926820755, + "learning_rate": 9.962377395409986e-06, + "loss": 0.8374, + "step": 2995 + }, + { + "epoch": 0.16152684925598448, + "grad_norm": 0.7713897228240967, + "learning_rate": 9.96235143215715e-06, + "loss": 0.7571, + "step": 2996 + }, + { + "epoch": 0.161580763424628, + "grad_norm": 0.779852032661438, + "learning_rate": 9.962325459982678e-06, + "loss": 0.796, + "step": 2997 + }, + { + "epoch": 0.16163467759327152, + "grad_norm": 0.8362038731575012, + "learning_rate": 9.962299478886613e-06, + "loss": 0.8645, + "step": 2998 + }, + { + "epoch": 0.16168859176191502, + "grad_norm": 0.8759078979492188, + "learning_rate": 9.962273488869003e-06, + "loss": 0.8192, + "step": 2999 + }, + { + "epoch": 0.16174250593055856, + "grad_norm": 0.7853894233703613, + "learning_rate": 9.962247489929892e-06, + "loss": 0.81, + "step": 3000 + }, + { + "epoch": 0.16179642009920206, + "grad_norm": 0.8752580881118774, + "learning_rate": 9.962221482069332e-06, + "loss": 0.8172, + "step": 3001 + }, + { + "epoch": 0.1618503342678456, + "grad_norm": 0.8129578828811646, + "learning_rate": 9.962195465287367e-06, + "loss": 0.698, + "step": 3002 + }, + { + "epoch": 
0.1619042484364891, + "grad_norm": 0.7905570268630981, + "learning_rate": 9.962169439584043e-06, + "loss": 0.7755, + "step": 3003 + }, + { + "epoch": 0.16195816260513263, + "grad_norm": 1.1296168565750122, + "learning_rate": 9.962143404959408e-06, + "loss": 0.829, + "step": 3004 + }, + { + "epoch": 0.16201207677377616, + "grad_norm": 0.8880928158760071, + "learning_rate": 9.962117361413508e-06, + "loss": 0.8542, + "step": 3005 + }, + { + "epoch": 0.16206599094241966, + "grad_norm": 0.7933239936828613, + "learning_rate": 9.96209130894639e-06, + "loss": 0.714, + "step": 3006 + }, + { + "epoch": 0.1621199051110632, + "grad_norm": 0.8112434148788452, + "learning_rate": 9.962065247558101e-06, + "loss": 0.7967, + "step": 3007 + }, + { + "epoch": 0.1621738192797067, + "grad_norm": 0.7101603150367737, + "learning_rate": 9.962039177248689e-06, + "loss": 0.7054, + "step": 3008 + }, + { + "epoch": 0.16222773344835023, + "grad_norm": 0.9327304363250732, + "learning_rate": 9.962013098018198e-06, + "loss": 0.7683, + "step": 3009 + }, + { + "epoch": 0.16228164761699373, + "grad_norm": 0.8223574161529541, + "learning_rate": 9.961987009866678e-06, + "loss": 0.7174, + "step": 3010 + }, + { + "epoch": 0.16233556178563727, + "grad_norm": 0.889711856842041, + "learning_rate": 9.961960912794176e-06, + "loss": 0.8562, + "step": 3011 + }, + { + "epoch": 0.1623894759542808, + "grad_norm": 0.9297184348106384, + "learning_rate": 9.961934806800736e-06, + "loss": 0.8887, + "step": 3012 + }, + { + "epoch": 0.1624433901229243, + "grad_norm": 0.8206717371940613, + "learning_rate": 9.961908691886404e-06, + "loss": 0.8272, + "step": 3013 + }, + { + "epoch": 0.16249730429156783, + "grad_norm": 0.7833002805709839, + "learning_rate": 9.961882568051233e-06, + "loss": 0.848, + "step": 3014 + }, + { + "epoch": 0.16255121846021134, + "grad_norm": 0.8386265635490417, + "learning_rate": 9.961856435295265e-06, + "loss": 0.7528, + "step": 3015 + }, + { + "epoch": 0.16260513262885487, + "grad_norm": 
0.8227097392082214, + "learning_rate": 9.961830293618547e-06, + "loss": 0.8181, + "step": 3016 + }, + { + "epoch": 0.16265904679749837, + "grad_norm": 0.7938892245292664, + "learning_rate": 9.96180414302113e-06, + "loss": 0.8293, + "step": 3017 + }, + { + "epoch": 0.1627129609661419, + "grad_norm": 1.1556557416915894, + "learning_rate": 9.961777983503056e-06, + "loss": 0.9544, + "step": 3018 + }, + { + "epoch": 0.1627668751347854, + "grad_norm": 0.8379788994789124, + "learning_rate": 9.961751815064375e-06, + "loss": 0.7168, + "step": 3019 + }, + { + "epoch": 0.16282078930342894, + "grad_norm": 0.9397227764129639, + "learning_rate": 9.961725637705134e-06, + "loss": 0.8804, + "step": 3020 + }, + { + "epoch": 0.16287470347207247, + "grad_norm": 0.8950162529945374, + "learning_rate": 9.96169945142538e-06, + "loss": 0.8652, + "step": 3021 + }, + { + "epoch": 0.16292861764071598, + "grad_norm": 0.8643755912780762, + "learning_rate": 9.961673256225159e-06, + "loss": 0.9041, + "step": 3022 + }, + { + "epoch": 0.1629825318093595, + "grad_norm": 0.8658211827278137, + "learning_rate": 9.961647052104517e-06, + "loss": 0.8721, + "step": 3023 + }, + { + "epoch": 0.16303644597800301, + "grad_norm": 0.812038242816925, + "learning_rate": 9.961620839063507e-06, + "loss": 0.8715, + "step": 3024 + }, + { + "epoch": 0.16309036014664655, + "grad_norm": 0.7646269798278809, + "learning_rate": 9.961594617102169e-06, + "loss": 0.7805, + "step": 3025 + }, + { + "epoch": 0.16314427431529005, + "grad_norm": 0.7684099674224854, + "learning_rate": 9.961568386220553e-06, + "loss": 0.8214, + "step": 3026 + }, + { + "epoch": 0.16319818848393358, + "grad_norm": 0.888566255569458, + "learning_rate": 9.961542146418706e-06, + "loss": 0.8972, + "step": 3027 + }, + { + "epoch": 0.16325210265257709, + "grad_norm": 0.8100109100341797, + "learning_rate": 9.961515897696675e-06, + "loss": 0.7337, + "step": 3028 + }, + { + "epoch": 0.16330601682122062, + "grad_norm": 0.8838690519332886, + "learning_rate": 
9.96148964005451e-06, + "loss": 0.7148, + "step": 3029 + }, + { + "epoch": 0.16335993098986415, + "grad_norm": 0.7518458962440491, + "learning_rate": 9.961463373492253e-06, + "loss": 0.7127, + "step": 3030 + }, + { + "epoch": 0.16341384515850765, + "grad_norm": 0.8280466198921204, + "learning_rate": 9.961437098009956e-06, + "loss": 0.7569, + "step": 3031 + }, + { + "epoch": 0.16346775932715119, + "grad_norm": 0.7333472371101379, + "learning_rate": 9.961410813607663e-06, + "loss": 0.7984, + "step": 3032 + }, + { + "epoch": 0.1635216734957947, + "grad_norm": 0.8064109086990356, + "learning_rate": 9.961384520285423e-06, + "loss": 0.8255, + "step": 3033 + }, + { + "epoch": 0.16357558766443822, + "grad_norm": 0.8310550451278687, + "learning_rate": 9.961358218043282e-06, + "loss": 0.828, + "step": 3034 + }, + { + "epoch": 0.16362950183308173, + "grad_norm": 0.8141489028930664, + "learning_rate": 9.961331906881289e-06, + "loss": 0.8121, + "step": 3035 + }, + { + "epoch": 0.16368341600172526, + "grad_norm": 0.9229308366775513, + "learning_rate": 9.96130558679949e-06, + "loss": 0.9288, + "step": 3036 + }, + { + "epoch": 0.16373733017036876, + "grad_norm": 0.9087804555892944, + "learning_rate": 9.961279257797933e-06, + "loss": 0.8725, + "step": 3037 + }, + { + "epoch": 0.1637912443390123, + "grad_norm": 0.8357719779014587, + "learning_rate": 9.961252919876665e-06, + "loss": 0.8413, + "step": 3038 + }, + { + "epoch": 0.16384515850765582, + "grad_norm": 0.8311809301376343, + "learning_rate": 9.961226573035734e-06, + "loss": 0.885, + "step": 3039 + }, + { + "epoch": 0.16389907267629933, + "grad_norm": 0.7797298431396484, + "learning_rate": 9.961200217275185e-06, + "loss": 0.8767, + "step": 3040 + }, + { + "epoch": 0.16395298684494286, + "grad_norm": 0.8659999370574951, + "learning_rate": 9.961173852595069e-06, + "loss": 0.7852, + "step": 3041 + }, + { + "epoch": 0.16400690101358636, + "grad_norm": 0.8036298155784607, + "learning_rate": 9.96114747899543e-06, + "loss": 0.8122, + 
"step": 3042 + }, + { + "epoch": 0.1640608151822299, + "grad_norm": 0.8683627843856812, + "learning_rate": 9.961121096476318e-06, + "loss": 0.8197, + "step": 3043 + }, + { + "epoch": 0.1641147293508734, + "grad_norm": 0.8885881900787354, + "learning_rate": 9.96109470503778e-06, + "loss": 0.7302, + "step": 3044 + }, + { + "epoch": 0.16416864351951693, + "grad_norm": 0.7480132579803467, + "learning_rate": 9.961068304679861e-06, + "loss": 0.7938, + "step": 3045 + }, + { + "epoch": 0.16422255768816044, + "grad_norm": 0.680261492729187, + "learning_rate": 9.96104189540261e-06, + "loss": 0.7016, + "step": 3046 + }, + { + "epoch": 0.16427647185680397, + "grad_norm": 0.8690764904022217, + "learning_rate": 9.961015477206078e-06, + "loss": 0.7716, + "step": 3047 + }, + { + "epoch": 0.1643303860254475, + "grad_norm": 0.8533129692077637, + "learning_rate": 9.960989050090306e-06, + "loss": 0.8561, + "step": 3048 + }, + { + "epoch": 0.164384300194091, + "grad_norm": 0.6941283345222473, + "learning_rate": 9.960962614055345e-06, + "loss": 0.6501, + "step": 3049 + }, + { + "epoch": 0.16443821436273454, + "grad_norm": 0.9178086519241333, + "learning_rate": 9.960936169101244e-06, + "loss": 0.8511, + "step": 3050 + }, + { + "epoch": 0.16449212853137804, + "grad_norm": 0.7419497966766357, + "learning_rate": 9.960909715228049e-06, + "loss": 0.7331, + "step": 3051 + }, + { + "epoch": 0.16454604270002157, + "grad_norm": 0.879289984703064, + "learning_rate": 9.960883252435807e-06, + "loss": 0.8969, + "step": 3052 + }, + { + "epoch": 0.16459995686866508, + "grad_norm": 0.7679347991943359, + "learning_rate": 9.960856780724563e-06, + "loss": 0.7467, + "step": 3053 + }, + { + "epoch": 0.1646538710373086, + "grad_norm": 0.7927586436271667, + "learning_rate": 9.960830300094371e-06, + "loss": 0.7479, + "step": 3054 + }, + { + "epoch": 0.1647077852059521, + "grad_norm": 0.7693600058555603, + "learning_rate": 9.960803810545275e-06, + "loss": 0.8421, + "step": 3055 + }, + { + "epoch": 
0.16476169937459564, + "grad_norm": 0.8548445105552673, + "learning_rate": 9.96077731207732e-06, + "loss": 0.8104, + "step": 3056 + }, + { + "epoch": 0.16481561354323918, + "grad_norm": 0.8420791029930115, + "learning_rate": 9.960750804690559e-06, + "loss": 0.6974, + "step": 3057 + }, + { + "epoch": 0.16486952771188268, + "grad_norm": 0.7880173921585083, + "learning_rate": 9.960724288385037e-06, + "loss": 0.7723, + "step": 3058 + }, + { + "epoch": 0.1649234418805262, + "grad_norm": 0.8810162544250488, + "learning_rate": 9.960697763160803e-06, + "loss": 0.7488, + "step": 3059 + }, + { + "epoch": 0.16497735604916972, + "grad_norm": 0.9951279759407043, + "learning_rate": 9.9606712290179e-06, + "loss": 0.8119, + "step": 3060 + }, + { + "epoch": 0.16503127021781325, + "grad_norm": 0.755189836025238, + "learning_rate": 9.960644685956383e-06, + "loss": 0.7568, + "step": 3061 + }, + { + "epoch": 0.16508518438645675, + "grad_norm": 0.99064040184021, + "learning_rate": 9.960618133976292e-06, + "loss": 0.8493, + "step": 3062 + }, + { + "epoch": 0.16513909855510028, + "grad_norm": 0.8672367334365845, + "learning_rate": 9.960591573077682e-06, + "loss": 0.7961, + "step": 3063 + }, + { + "epoch": 0.1651930127237438, + "grad_norm": 0.9614015817642212, + "learning_rate": 9.960565003260596e-06, + "loss": 0.8894, + "step": 3064 + }, + { + "epoch": 0.16524692689238732, + "grad_norm": 0.7433729767799377, + "learning_rate": 9.960538424525083e-06, + "loss": 0.7586, + "step": 3065 + }, + { + "epoch": 0.16530084106103085, + "grad_norm": 0.8151267766952515, + "learning_rate": 9.96051183687119e-06, + "loss": 0.8311, + "step": 3066 + }, + { + "epoch": 0.16535475522967436, + "grad_norm": 0.9241605401039124, + "learning_rate": 9.960485240298967e-06, + "loss": 0.8526, + "step": 3067 + }, + { + "epoch": 0.1654086693983179, + "grad_norm": 0.8612751364707947, + "learning_rate": 9.96045863480846e-06, + "loss": 0.7672, + "step": 3068 + }, + { + "epoch": 0.1654625835669614, + "grad_norm": 
0.8707523345947266, + "learning_rate": 9.960432020399719e-06, + "loss": 0.7862, + "step": 3069 + }, + { + "epoch": 0.16551649773560492, + "grad_norm": 0.8456318378448486, + "learning_rate": 9.960405397072788e-06, + "loss": 0.8221, + "step": 3070 + }, + { + "epoch": 0.16557041190424843, + "grad_norm": 0.7929409742355347, + "learning_rate": 9.960378764827719e-06, + "loss": 0.8438, + "step": 3071 + }, + { + "epoch": 0.16562432607289196, + "grad_norm": 0.8241098523139954, + "learning_rate": 9.960352123664556e-06, + "loss": 0.7769, + "step": 3072 + }, + { + "epoch": 0.16567824024153546, + "grad_norm": 0.9634597301483154, + "learning_rate": 9.96032547358335e-06, + "loss": 0.8323, + "step": 3073 + }, + { + "epoch": 0.165732154410179, + "grad_norm": 0.6783578395843506, + "learning_rate": 9.960298814584148e-06, + "loss": 0.6585, + "step": 3074 + }, + { + "epoch": 0.16578606857882253, + "grad_norm": 0.756289005279541, + "learning_rate": 9.960272146666997e-06, + "loss": 0.7109, + "step": 3075 + }, + { + "epoch": 0.16583998274746603, + "grad_norm": 0.8414442539215088, + "learning_rate": 9.960245469831947e-06, + "loss": 0.7543, + "step": 3076 + }, + { + "epoch": 0.16589389691610956, + "grad_norm": 0.7551240921020508, + "learning_rate": 9.960218784079044e-06, + "loss": 0.7131, + "step": 3077 + }, + { + "epoch": 0.16594781108475307, + "grad_norm": 0.8211004137992859, + "learning_rate": 9.960192089408335e-06, + "loss": 0.8335, + "step": 3078 + }, + { + "epoch": 0.1660017252533966, + "grad_norm": 0.7540998458862305, + "learning_rate": 9.960165385819873e-06, + "loss": 0.7557, + "step": 3079 + }, + { + "epoch": 0.1660556394220401, + "grad_norm": 0.7917600274085999, + "learning_rate": 9.9601386733137e-06, + "loss": 0.7522, + "step": 3080 + }, + { + "epoch": 0.16610955359068363, + "grad_norm": 0.9180947542190552, + "learning_rate": 9.960111951889868e-06, + "loss": 0.7943, + "step": 3081 + }, + { + "epoch": 0.16616346775932714, + "grad_norm": 0.8169807195663452, + "learning_rate": 
9.960085221548422e-06, + "loss": 0.8633, + "step": 3082 + }, + { + "epoch": 0.16621738192797067, + "grad_norm": 0.8790155649185181, + "learning_rate": 9.960058482289413e-06, + "loss": 0.8265, + "step": 3083 + }, + { + "epoch": 0.1662712960966142, + "grad_norm": 0.8958606123924255, + "learning_rate": 9.960031734112887e-06, + "loss": 0.8601, + "step": 3084 + }, + { + "epoch": 0.1663252102652577, + "grad_norm": 0.8116661906242371, + "learning_rate": 9.960004977018893e-06, + "loss": 0.8203, + "step": 3085 + }, + { + "epoch": 0.16637912443390124, + "grad_norm": 0.771135687828064, + "learning_rate": 9.95997821100748e-06, + "loss": 0.7258, + "step": 3086 + }, + { + "epoch": 0.16643303860254474, + "grad_norm": 0.9094653725624084, + "learning_rate": 9.959951436078696e-06, + "loss": 0.9094, + "step": 3087 + }, + { + "epoch": 0.16648695277118827, + "grad_norm": 0.9042958617210388, + "learning_rate": 9.959924652232586e-06, + "loss": 0.7434, + "step": 3088 + }, + { + "epoch": 0.16654086693983178, + "grad_norm": 0.7170906662940979, + "learning_rate": 9.959897859469201e-06, + "loss": 0.7134, + "step": 3089 + }, + { + "epoch": 0.1665947811084753, + "grad_norm": 0.7896520495414734, + "learning_rate": 9.959871057788589e-06, + "loss": 0.7727, + "step": 3090 + }, + { + "epoch": 0.1666486952771188, + "grad_norm": 0.9295204281806946, + "learning_rate": 9.959844247190797e-06, + "loss": 0.8928, + "step": 3091 + }, + { + "epoch": 0.16670260944576235, + "grad_norm": 0.8025391101837158, + "learning_rate": 9.959817427675875e-06, + "loss": 0.7808, + "step": 3092 + }, + { + "epoch": 0.16675652361440588, + "grad_norm": 0.9727420210838318, + "learning_rate": 9.95979059924387e-06, + "loss": 0.9677, + "step": 3093 + }, + { + "epoch": 0.16681043778304938, + "grad_norm": 0.8534692525863647, + "learning_rate": 9.95976376189483e-06, + "loss": 0.8642, + "step": 3094 + }, + { + "epoch": 0.1668643519516929, + "grad_norm": 0.8361443877220154, + "learning_rate": 9.959736915628803e-06, + "loss": 0.8746, + 
"step": 3095 + }, + { + "epoch": 0.16691826612033642, + "grad_norm": 0.8551936745643616, + "learning_rate": 9.95971006044584e-06, + "loss": 0.7973, + "step": 3096 + }, + { + "epoch": 0.16697218028897995, + "grad_norm": 0.6986585259437561, + "learning_rate": 9.959683196345987e-06, + "loss": 0.6689, + "step": 3097 + }, + { + "epoch": 0.16702609445762345, + "grad_norm": 0.9048603773117065, + "learning_rate": 9.959656323329291e-06, + "loss": 0.7924, + "step": 3098 + }, + { + "epoch": 0.16708000862626698, + "grad_norm": 0.8295788764953613, + "learning_rate": 9.959629441395802e-06, + "loss": 0.843, + "step": 3099 + }, + { + "epoch": 0.1671339227949105, + "grad_norm": 0.838590681552887, + "learning_rate": 9.959602550545568e-06, + "loss": 0.7615, + "step": 3100 + }, + { + "epoch": 0.16718783696355402, + "grad_norm": 0.8323560357093811, + "learning_rate": 9.959575650778639e-06, + "loss": 0.8375, + "step": 3101 + }, + { + "epoch": 0.16724175113219755, + "grad_norm": 0.8825474381446838, + "learning_rate": 9.959548742095062e-06, + "loss": 0.7701, + "step": 3102 + }, + { + "epoch": 0.16729566530084106, + "grad_norm": 0.8911004662513733, + "learning_rate": 9.959521824494884e-06, + "loss": 0.8, + "step": 3103 + }, + { + "epoch": 0.1673495794694846, + "grad_norm": 0.76695317029953, + "learning_rate": 9.959494897978154e-06, + "loss": 0.7177, + "step": 3104 + }, + { + "epoch": 0.1674034936381281, + "grad_norm": 0.9462987184524536, + "learning_rate": 9.959467962544922e-06, + "loss": 0.8479, + "step": 3105 + }, + { + "epoch": 0.16745740780677162, + "grad_norm": 0.7185036540031433, + "learning_rate": 9.959441018195235e-06, + "loss": 0.6444, + "step": 3106 + }, + { + "epoch": 0.16751132197541513, + "grad_norm": 0.9797527194023132, + "learning_rate": 9.959414064929143e-06, + "loss": 0.916, + "step": 3107 + }, + { + "epoch": 0.16756523614405866, + "grad_norm": 0.7815739512443542, + "learning_rate": 9.959387102746693e-06, + "loss": 0.7315, + "step": 3108 + }, + { + "epoch": 
0.1676191503127022, + "grad_norm": 0.9536890387535095, + "learning_rate": 9.959360131647933e-06, + "loss": 0.7795, + "step": 3109 + }, + { + "epoch": 0.1676730644813457, + "grad_norm": 0.7770065069198608, + "learning_rate": 9.959333151632913e-06, + "loss": 0.8203, + "step": 3110 + }, + { + "epoch": 0.16772697864998923, + "grad_norm": 0.8031367659568787, + "learning_rate": 9.959306162701681e-06, + "loss": 0.8362, + "step": 3111 + }, + { + "epoch": 0.16778089281863273, + "grad_norm": 0.8009032011032104, + "learning_rate": 9.959279164854286e-06, + "loss": 0.8113, + "step": 3112 + }, + { + "epoch": 0.16783480698727626, + "grad_norm": 0.8091812133789062, + "learning_rate": 9.959252158090775e-06, + "loss": 0.84, + "step": 3113 + }, + { + "epoch": 0.16788872115591977, + "grad_norm": 0.7102682590484619, + "learning_rate": 9.959225142411197e-06, + "loss": 0.7378, + "step": 3114 + }, + { + "epoch": 0.1679426353245633, + "grad_norm": 0.8190940618515015, + "learning_rate": 9.959198117815602e-06, + "loss": 0.8478, + "step": 3115 + }, + { + "epoch": 0.1679965494932068, + "grad_norm": 0.7320457696914673, + "learning_rate": 9.959171084304037e-06, + "loss": 0.8358, + "step": 3116 + }, + { + "epoch": 0.16805046366185034, + "grad_norm": 0.8222710490226746, + "learning_rate": 9.959144041876551e-06, + "loss": 0.809, + "step": 3117 + }, + { + "epoch": 0.16810437783049387, + "grad_norm": 0.7939282059669495, + "learning_rate": 9.959116990533195e-06, + "loss": 0.8562, + "step": 3118 + }, + { + "epoch": 0.16815829199913737, + "grad_norm": 0.7231613993644714, + "learning_rate": 9.959089930274013e-06, + "loss": 0.7656, + "step": 3119 + }, + { + "epoch": 0.1682122061677809, + "grad_norm": 0.8997424840927124, + "learning_rate": 9.959062861099058e-06, + "loss": 0.8831, + "step": 3120 + }, + { + "epoch": 0.1682661203364244, + "grad_norm": 0.80366450548172, + "learning_rate": 9.959035783008374e-06, + "loss": 0.8044, + "step": 3121 + }, + { + "epoch": 0.16832003450506794, + "grad_norm": 
0.8153119683265686, + "learning_rate": 9.959008696002015e-06, + "loss": 0.8325, + "step": 3122 + }, + { + "epoch": 0.16837394867371144, + "grad_norm": 0.8638020157814026, + "learning_rate": 9.958981600080026e-06, + "loss": 0.8197, + "step": 3123 + }, + { + "epoch": 0.16842786284235498, + "grad_norm": 0.8430980443954468, + "learning_rate": 9.95895449524246e-06, + "loss": 0.8212, + "step": 3124 + }, + { + "epoch": 0.16848177701099848, + "grad_norm": 0.9273066520690918, + "learning_rate": 9.958927381489358e-06, + "loss": 0.8145, + "step": 3125 + }, + { + "epoch": 0.168535691179642, + "grad_norm": 0.8697495460510254, + "learning_rate": 9.958900258820777e-06, + "loss": 0.8519, + "step": 3126 + }, + { + "epoch": 0.16858960534828554, + "grad_norm": 0.7957634925842285, + "learning_rate": 9.95887312723676e-06, + "loss": 0.8065, + "step": 3127 + }, + { + "epoch": 0.16864351951692905, + "grad_norm": 0.8890637755393982, + "learning_rate": 9.958845986737357e-06, + "loss": 0.822, + "step": 3128 + }, + { + "epoch": 0.16869743368557258, + "grad_norm": 0.7979970574378967, + "learning_rate": 9.95881883732262e-06, + "loss": 0.8346, + "step": 3129 + }, + { + "epoch": 0.16875134785421608, + "grad_norm": 0.8589211106300354, + "learning_rate": 9.958791678992594e-06, + "loss": 0.7498, + "step": 3130 + }, + { + "epoch": 0.16880526202285961, + "grad_norm": 0.7819254398345947, + "learning_rate": 9.95876451174733e-06, + "loss": 0.7515, + "step": 3131 + }, + { + "epoch": 0.16885917619150312, + "grad_norm": 0.9037144184112549, + "learning_rate": 9.958737335586877e-06, + "loss": 0.7684, + "step": 3132 + }, + { + "epoch": 0.16891309036014665, + "grad_norm": 0.9139670133590698, + "learning_rate": 9.958710150511282e-06, + "loss": 0.7848, + "step": 3133 + }, + { + "epoch": 0.16896700452879015, + "grad_norm": 0.8177505135536194, + "learning_rate": 9.958682956520596e-06, + "loss": 0.8656, + "step": 3134 + }, + { + "epoch": 0.1690209186974337, + "grad_norm": 0.7351679801940918, + "learning_rate": 
9.958655753614865e-06, + "loss": 0.769, + "step": 3135 + }, + { + "epoch": 0.16907483286607722, + "grad_norm": 0.8661699891090393, + "learning_rate": 9.958628541794142e-06, + "loss": 0.8523, + "step": 3136 + }, + { + "epoch": 0.16912874703472072, + "grad_norm": 0.7755950689315796, + "learning_rate": 9.958601321058471e-06, + "loss": 0.7737, + "step": 3137 + }, + { + "epoch": 0.16918266120336425, + "grad_norm": 0.8523197174072266, + "learning_rate": 9.958574091407906e-06, + "loss": 0.8508, + "step": 3138 + }, + { + "epoch": 0.16923657537200776, + "grad_norm": 0.7154935598373413, + "learning_rate": 9.958546852842493e-06, + "loss": 0.6725, + "step": 3139 + }, + { + "epoch": 0.1692904895406513, + "grad_norm": 0.8140445947647095, + "learning_rate": 9.95851960536228e-06, + "loss": 0.92, + "step": 3140 + }, + { + "epoch": 0.1693444037092948, + "grad_norm": 0.7320675849914551, + "learning_rate": 9.95849234896732e-06, + "loss": 0.8091, + "step": 3141 + }, + { + "epoch": 0.16939831787793833, + "grad_norm": 0.7761030197143555, + "learning_rate": 9.958465083657659e-06, + "loss": 0.7444, + "step": 3142 + }, + { + "epoch": 0.16945223204658183, + "grad_norm": 0.8432923555374146, + "learning_rate": 9.958437809433345e-06, + "loss": 0.8112, + "step": 3143 + }, + { + "epoch": 0.16950614621522536, + "grad_norm": 0.8015188574790955, + "learning_rate": 9.958410526294428e-06, + "loss": 0.8383, + "step": 3144 + }, + { + "epoch": 0.1695600603838689, + "grad_norm": 0.7635226845741272, + "learning_rate": 9.95838323424096e-06, + "loss": 0.7942, + "step": 3145 + }, + { + "epoch": 0.1696139745525124, + "grad_norm": 0.942131757736206, + "learning_rate": 9.958355933272986e-06, + "loss": 0.8877, + "step": 3146 + }, + { + "epoch": 0.16966788872115593, + "grad_norm": 1.1072907447814941, + "learning_rate": 9.958328623390558e-06, + "loss": 0.7369, + "step": 3147 + }, + { + "epoch": 0.16972180288979943, + "grad_norm": 0.8342657685279846, + "learning_rate": 9.958301304593722e-06, + "loss": 0.7946, + 
"step": 3148 + }, + { + "epoch": 0.16977571705844297, + "grad_norm": 0.7320284843444824, + "learning_rate": 9.958273976882531e-06, + "loss": 0.754, + "step": 3149 + }, + { + "epoch": 0.16982963122708647, + "grad_norm": 0.7840715646743774, + "learning_rate": 9.958246640257031e-06, + "loss": 0.7897, + "step": 3150 + }, + { + "epoch": 0.16988354539573, + "grad_norm": 0.7383304834365845, + "learning_rate": 9.958219294717273e-06, + "loss": 0.8205, + "step": 3151 + }, + { + "epoch": 0.1699374595643735, + "grad_norm": 0.7597193121910095, + "learning_rate": 9.958191940263305e-06, + "loss": 0.8016, + "step": 3152 + }, + { + "epoch": 0.16999137373301704, + "grad_norm": 0.7770809531211853, + "learning_rate": 9.958164576895176e-06, + "loss": 0.7228, + "step": 3153 + }, + { + "epoch": 0.17004528790166057, + "grad_norm": 0.891514241695404, + "learning_rate": 9.958137204612936e-06, + "loss": 0.8598, + "step": 3154 + }, + { + "epoch": 0.17009920207030407, + "grad_norm": 0.8025946021080017, + "learning_rate": 9.958109823416635e-06, + "loss": 0.8979, + "step": 3155 + }, + { + "epoch": 0.1701531162389476, + "grad_norm": 0.7912386059761047, + "learning_rate": 9.95808243330632e-06, + "loss": 0.7562, + "step": 3156 + }, + { + "epoch": 0.1702070304075911, + "grad_norm": 0.8642987608909607, + "learning_rate": 9.958055034282043e-06, + "loss": 0.7916, + "step": 3157 + }, + { + "epoch": 0.17026094457623464, + "grad_norm": 0.8047364950180054, + "learning_rate": 9.958027626343852e-06, + "loss": 0.7598, + "step": 3158 + }, + { + "epoch": 0.17031485874487814, + "grad_norm": 0.8402281999588013, + "learning_rate": 9.958000209491794e-06, + "loss": 0.8572, + "step": 3159 + }, + { + "epoch": 0.17036877291352168, + "grad_norm": 0.7486295700073242, + "learning_rate": 9.95797278372592e-06, + "loss": 0.7221, + "step": 3160 + }, + { + "epoch": 0.17042268708216518, + "grad_norm": 0.7889320254325867, + "learning_rate": 9.95794534904628e-06, + "loss": 0.7734, + "step": 3161 + }, + { + "epoch": 
0.1704766012508087, + "grad_norm": 0.7864039540290833, + "learning_rate": 9.957917905452925e-06, + "loss": 0.7763, + "step": 3162 + }, + { + "epoch": 0.17053051541945224, + "grad_norm": 0.8366582989692688, + "learning_rate": 9.957890452945903e-06, + "loss": 0.8594, + "step": 3163 + }, + { + "epoch": 0.17058442958809575, + "grad_norm": 0.8014213442802429, + "learning_rate": 9.95786299152526e-06, + "loss": 0.7802, + "step": 3164 + }, + { + "epoch": 0.17063834375673928, + "grad_norm": 0.8158774375915527, + "learning_rate": 9.957835521191048e-06, + "loss": 0.7693, + "step": 3165 + }, + { + "epoch": 0.17069225792538278, + "grad_norm": 1.0622320175170898, + "learning_rate": 9.957808041943316e-06, + "loss": 0.8949, + "step": 3166 + }, + { + "epoch": 0.17074617209402632, + "grad_norm": 0.7825013399124146, + "learning_rate": 9.957780553782114e-06, + "loss": 0.7681, + "step": 3167 + }, + { + "epoch": 0.17080008626266982, + "grad_norm": 1.0727826356887817, + "learning_rate": 9.957753056707493e-06, + "loss": 0.876, + "step": 3168 + }, + { + "epoch": 0.17085400043131335, + "grad_norm": 0.7952837944030762, + "learning_rate": 9.9577255507195e-06, + "loss": 0.7671, + "step": 3169 + }, + { + "epoch": 0.17090791459995686, + "grad_norm": 0.7251336574554443, + "learning_rate": 9.957698035818185e-06, + "loss": 0.7938, + "step": 3170 + }, + { + "epoch": 0.1709618287686004, + "grad_norm": 0.8674930930137634, + "learning_rate": 9.957670512003598e-06, + "loss": 0.9387, + "step": 3171 + }, + { + "epoch": 0.17101574293724392, + "grad_norm": 0.7578595876693726, + "learning_rate": 9.957642979275787e-06, + "loss": 0.8295, + "step": 3172 + }, + { + "epoch": 0.17106965710588742, + "grad_norm": 0.8236204385757446, + "learning_rate": 9.957615437634802e-06, + "loss": 0.871, + "step": 3173 + }, + { + "epoch": 0.17112357127453096, + "grad_norm": 0.7528506517410278, + "learning_rate": 9.957587887080696e-06, + "loss": 0.7034, + "step": 3174 + }, + { + "epoch": 0.17117748544317446, + "grad_norm": 
0.8170275092124939, + "learning_rate": 9.957560327613514e-06, + "loss": 0.7412, + "step": 3175 + }, + { + "epoch": 0.171231399611818, + "grad_norm": 0.91305011510849, + "learning_rate": 9.957532759233307e-06, + "loss": 0.8861, + "step": 3176 + }, + { + "epoch": 0.1712853137804615, + "grad_norm": 0.7793359756469727, + "learning_rate": 9.957505181940124e-06, + "loss": 0.8106, + "step": 3177 + }, + { + "epoch": 0.17133922794910503, + "grad_norm": 0.9424631595611572, + "learning_rate": 9.957477595734016e-06, + "loss": 0.8271, + "step": 3178 + }, + { + "epoch": 0.17139314211774853, + "grad_norm": 0.8909611701965332, + "learning_rate": 9.957450000615031e-06, + "loss": 0.8711, + "step": 3179 + }, + { + "epoch": 0.17144705628639206, + "grad_norm": 0.703960657119751, + "learning_rate": 9.95742239658322e-06, + "loss": 0.6693, + "step": 3180 + }, + { + "epoch": 0.1715009704550356, + "grad_norm": 0.8511449098587036, + "learning_rate": 9.957394783638632e-06, + "loss": 0.8075, + "step": 3181 + }, + { + "epoch": 0.1715548846236791, + "grad_norm": 0.93243008852005, + "learning_rate": 9.957367161781318e-06, + "loss": 0.8663, + "step": 3182 + }, + { + "epoch": 0.17160879879232263, + "grad_norm": 0.926092803478241, + "learning_rate": 9.957339531011325e-06, + "loss": 0.8973, + "step": 3183 + }, + { + "epoch": 0.17166271296096614, + "grad_norm": 0.8564586043357849, + "learning_rate": 9.957311891328705e-06, + "loss": 0.7561, + "step": 3184 + }, + { + "epoch": 0.17171662712960967, + "grad_norm": 0.8317960500717163, + "learning_rate": 9.957284242733507e-06, + "loss": 0.817, + "step": 3185 + }, + { + "epoch": 0.17177054129825317, + "grad_norm": 0.7291557788848877, + "learning_rate": 9.95725658522578e-06, + "loss": 0.6963, + "step": 3186 + }, + { + "epoch": 0.1718244554668967, + "grad_norm": 0.8154743313789368, + "learning_rate": 9.957228918805574e-06, + "loss": 0.8005, + "step": 3187 + }, + { + "epoch": 0.1718783696355402, + "grad_norm": 0.7985217571258545, + "learning_rate": 
9.95720124347294e-06, + "loss": 0.8471, + "step": 3188 + }, + { + "epoch": 0.17193228380418374, + "grad_norm": 0.7928630709648132, + "learning_rate": 9.957173559227926e-06, + "loss": 0.8809, + "step": 3189 + }, + { + "epoch": 0.17198619797282727, + "grad_norm": 0.800392210483551, + "learning_rate": 9.957145866070583e-06, + "loss": 0.8031, + "step": 3190 + }, + { + "epoch": 0.17204011214147077, + "grad_norm": 0.8904628157615662, + "learning_rate": 9.95711816400096e-06, + "loss": 0.7583, + "step": 3191 + }, + { + "epoch": 0.1720940263101143, + "grad_norm": 0.7246114611625671, + "learning_rate": 9.957090453019106e-06, + "loss": 0.7365, + "step": 3192 + }, + { + "epoch": 0.1721479404787578, + "grad_norm": 0.8280320763587952, + "learning_rate": 9.957062733125074e-06, + "loss": 0.7723, + "step": 3193 + }, + { + "epoch": 0.17220185464740134, + "grad_norm": 0.929804265499115, + "learning_rate": 9.957035004318911e-06, + "loss": 0.8412, + "step": 3194 + }, + { + "epoch": 0.17225576881604485, + "grad_norm": 0.815108060836792, + "learning_rate": 9.957007266600666e-06, + "loss": 0.8076, + "step": 3195 + }, + { + "epoch": 0.17230968298468838, + "grad_norm": 0.7849567532539368, + "learning_rate": 9.956979519970393e-06, + "loss": 0.8245, + "step": 3196 + }, + { + "epoch": 0.17236359715333188, + "grad_norm": 1.458945393562317, + "learning_rate": 9.956951764428138e-06, + "loss": 0.7647, + "step": 3197 + }, + { + "epoch": 0.17241751132197541, + "grad_norm": 0.8327317833900452, + "learning_rate": 9.956923999973954e-06, + "loss": 0.8824, + "step": 3198 + }, + { + "epoch": 0.17247142549061895, + "grad_norm": 0.7398284077644348, + "learning_rate": 9.956896226607887e-06, + "loss": 0.7907, + "step": 3199 + }, + { + "epoch": 0.17252533965926245, + "grad_norm": 0.8546818494796753, + "learning_rate": 9.95686844432999e-06, + "loss": 0.8723, + "step": 3200 + }, + { + "epoch": 0.17257925382790598, + "grad_norm": 0.7967200875282288, + "learning_rate": 9.956840653140311e-06, + "loss": 0.8156, + 
"step": 3201 + }, + { + "epoch": 0.17263316799654949, + "grad_norm": 0.9093504548072815, + "learning_rate": 9.956812853038903e-06, + "loss": 0.8002, + "step": 3202 + }, + { + "epoch": 0.17268708216519302, + "grad_norm": 0.7995857000350952, + "learning_rate": 9.956785044025811e-06, + "loss": 0.8413, + "step": 3203 + }, + { + "epoch": 0.17274099633383652, + "grad_norm": 0.828748881816864, + "learning_rate": 9.95675722610109e-06, + "loss": 0.7162, + "step": 3204 + }, + { + "epoch": 0.17279491050248005, + "grad_norm": 0.7679111361503601, + "learning_rate": 9.956729399264789e-06, + "loss": 0.7909, + "step": 3205 + }, + { + "epoch": 0.17284882467112356, + "grad_norm": 0.9187313318252563, + "learning_rate": 9.956701563516956e-06, + "loss": 0.8537, + "step": 3206 + }, + { + "epoch": 0.1729027388397671, + "grad_norm": 0.7859029173851013, + "learning_rate": 9.956673718857642e-06, + "loss": 0.7392, + "step": 3207 + }, + { + "epoch": 0.17295665300841062, + "grad_norm": 0.8365893363952637, + "learning_rate": 9.956645865286897e-06, + "loss": 0.7921, + "step": 3208 + }, + { + "epoch": 0.17301056717705413, + "grad_norm": 0.912382960319519, + "learning_rate": 9.956618002804771e-06, + "loss": 0.8651, + "step": 3209 + }, + { + "epoch": 0.17306448134569766, + "grad_norm": 0.7380210757255554, + "learning_rate": 9.956590131411314e-06, + "loss": 0.7031, + "step": 3210 + }, + { + "epoch": 0.17311839551434116, + "grad_norm": 0.7943229675292969, + "learning_rate": 9.956562251106578e-06, + "loss": 0.7725, + "step": 3211 + }, + { + "epoch": 0.1731723096829847, + "grad_norm": 0.8835777640342712, + "learning_rate": 9.95653436189061e-06, + "loss": 0.8633, + "step": 3212 + }, + { + "epoch": 0.1732262238516282, + "grad_norm": 0.8082174062728882, + "learning_rate": 9.956506463763464e-06, + "loss": 0.8833, + "step": 3213 + }, + { + "epoch": 0.17328013802027173, + "grad_norm": 0.8236085772514343, + "learning_rate": 9.956478556725186e-06, + "loss": 0.8517, + "step": 3214 + }, + { + "epoch": 
0.17333405218891526, + "grad_norm": 0.8428922891616821, + "learning_rate": 9.956450640775829e-06, + "loss": 0.8659, + "step": 3215 + }, + { + "epoch": 0.17338796635755876, + "grad_norm": 0.8443105220794678, + "learning_rate": 9.95642271591544e-06, + "loss": 0.9589, + "step": 3216 + }, + { + "epoch": 0.1734418805262023, + "grad_norm": 0.7856699228286743, + "learning_rate": 9.956394782144074e-06, + "loss": 0.787, + "step": 3217 + }, + { + "epoch": 0.1734957946948458, + "grad_norm": 0.8537113666534424, + "learning_rate": 9.95636683946178e-06, + "loss": 0.9339, + "step": 3218 + }, + { + "epoch": 0.17354970886348933, + "grad_norm": 0.8206045627593994, + "learning_rate": 9.956338887868603e-06, + "loss": 0.832, + "step": 3219 + }, + { + "epoch": 0.17360362303213284, + "grad_norm": 0.7913991808891296, + "learning_rate": 9.956310927364599e-06, + "loss": 0.7647, + "step": 3220 + }, + { + "epoch": 0.17365753720077637, + "grad_norm": 0.9481332302093506, + "learning_rate": 9.956282957949817e-06, + "loss": 0.7113, + "step": 3221 + }, + { + "epoch": 0.17371145136941987, + "grad_norm": 0.9326061606407166, + "learning_rate": 9.956254979624304e-06, + "loss": 0.8324, + "step": 3222 + }, + { + "epoch": 0.1737653655380634, + "grad_norm": 1.0496339797973633, + "learning_rate": 9.956226992388117e-06, + "loss": 0.7959, + "step": 3223 + }, + { + "epoch": 0.17381927970670694, + "grad_norm": 0.8025851249694824, + "learning_rate": 9.9561989962413e-06, + "loss": 0.811, + "step": 3224 + }, + { + "epoch": 0.17387319387535044, + "grad_norm": 0.9083681106567383, + "learning_rate": 9.956170991183905e-06, + "loss": 0.7957, + "step": 3225 + }, + { + "epoch": 0.17392710804399397, + "grad_norm": 0.8242226243019104, + "learning_rate": 9.956142977215983e-06, + "loss": 0.8224, + "step": 3226 + }, + { + "epoch": 0.17398102221263748, + "grad_norm": 0.8805774450302124, + "learning_rate": 9.956114954337586e-06, + "loss": 0.8847, + "step": 3227 + }, + { + "epoch": 0.174034936381281, + "grad_norm": 
0.748651921749115, + "learning_rate": 9.956086922548761e-06, + "loss": 0.7719, + "step": 3228 + }, + { + "epoch": 0.1740888505499245, + "grad_norm": 0.7385552525520325, + "learning_rate": 9.956058881849562e-06, + "loss": 0.7591, + "step": 3229 + }, + { + "epoch": 0.17414276471856804, + "grad_norm": 0.7795779705047607, + "learning_rate": 9.956030832240037e-06, + "loss": 0.8071, + "step": 3230 + }, + { + "epoch": 0.17419667888721155, + "grad_norm": 9.106490135192871, + "learning_rate": 9.956002773720236e-06, + "loss": 0.7915, + "step": 3231 + }, + { + "epoch": 0.17425059305585508, + "grad_norm": 0.861794650554657, + "learning_rate": 9.955974706290212e-06, + "loss": 0.8293, + "step": 3232 + }, + { + "epoch": 0.1743045072244986, + "grad_norm": 0.8002027869224548, + "learning_rate": 9.955946629950012e-06, + "loss": 0.8404, + "step": 3233 + }, + { + "epoch": 0.17435842139314212, + "grad_norm": 0.8162701725959778, + "learning_rate": 9.95591854469969e-06, + "loss": 0.8362, + "step": 3234 + }, + { + "epoch": 0.17441233556178565, + "grad_norm": 0.7436956763267517, + "learning_rate": 9.955890450539295e-06, + "loss": 0.8339, + "step": 3235 + }, + { + "epoch": 0.17446624973042915, + "grad_norm": 0.8074719309806824, + "learning_rate": 9.955862347468875e-06, + "loss": 0.8403, + "step": 3236 + }, + { + "epoch": 0.17452016389907268, + "grad_norm": 0.8527933955192566, + "learning_rate": 9.955834235488485e-06, + "loss": 0.8201, + "step": 3237 + }, + { + "epoch": 0.1745740780677162, + "grad_norm": 0.792177140712738, + "learning_rate": 9.955806114598173e-06, + "loss": 0.8304, + "step": 3238 + }, + { + "epoch": 0.17462799223635972, + "grad_norm": 0.8211845755577087, + "learning_rate": 9.95577798479799e-06, + "loss": 0.8013, + "step": 3239 + }, + { + "epoch": 0.17468190640500322, + "grad_norm": 0.906973659992218, + "learning_rate": 9.955749846087986e-06, + "loss": 0.823, + "step": 3240 + }, + { + "epoch": 0.17473582057364676, + "grad_norm": 0.904077410697937, + "learning_rate": 
9.955721698468213e-06, + "loss": 0.7651, + "step": 3241 + }, + { + "epoch": 0.1747897347422903, + "grad_norm": 0.8147358298301697, + "learning_rate": 9.95569354193872e-06, + "loss": 0.9268, + "step": 3242 + }, + { + "epoch": 0.1748436489109338, + "grad_norm": 0.8664659857749939, + "learning_rate": 9.95566537649956e-06, + "loss": 0.8366, + "step": 3243 + }, + { + "epoch": 0.17489756307957732, + "grad_norm": 0.6882225871086121, + "learning_rate": 9.95563720215078e-06, + "loss": 0.7152, + "step": 3244 + }, + { + "epoch": 0.17495147724822083, + "grad_norm": 0.7605637907981873, + "learning_rate": 9.955609018892434e-06, + "loss": 0.7864, + "step": 3245 + }, + { + "epoch": 0.17500539141686436, + "grad_norm": 0.7316586375236511, + "learning_rate": 9.95558082672457e-06, + "loss": 0.7175, + "step": 3246 + }, + { + "epoch": 0.17505930558550786, + "grad_norm": 0.8258477449417114, + "learning_rate": 9.955552625647241e-06, + "loss": 0.8463, + "step": 3247 + }, + { + "epoch": 0.1751132197541514, + "grad_norm": 0.7658422589302063, + "learning_rate": 9.955524415660498e-06, + "loss": 0.9477, + "step": 3248 + }, + { + "epoch": 0.1751671339227949, + "grad_norm": 0.9374455809593201, + "learning_rate": 9.955496196764387e-06, + "loss": 0.8725, + "step": 3249 + }, + { + "epoch": 0.17522104809143843, + "grad_norm": 0.7676389813423157, + "learning_rate": 9.955467968958965e-06, + "loss": 0.7868, + "step": 3250 + }, + { + "epoch": 0.17527496226008196, + "grad_norm": 0.9800841808319092, + "learning_rate": 9.955439732244279e-06, + "loss": 0.7787, + "step": 3251 + }, + { + "epoch": 0.17532887642872547, + "grad_norm": 0.7501618266105652, + "learning_rate": 9.95541148662038e-06, + "loss": 0.7703, + "step": 3252 + }, + { + "epoch": 0.175382790597369, + "grad_norm": 0.8019260168075562, + "learning_rate": 9.95538323208732e-06, + "loss": 0.7635, + "step": 3253 + }, + { + "epoch": 0.1754367047660125, + "grad_norm": 0.7791414260864258, + "learning_rate": 9.95535496864515e-06, + "loss": 0.7372, + "step": 
3254 + }, + { + "epoch": 0.17549061893465603, + "grad_norm": 0.7667005658149719, + "learning_rate": 9.955326696293921e-06, + "loss": 0.8481, + "step": 3255 + }, + { + "epoch": 0.17554453310329954, + "grad_norm": 0.7585765719413757, + "learning_rate": 9.955298415033681e-06, + "loss": 0.7933, + "step": 3256 + }, + { + "epoch": 0.17559844727194307, + "grad_norm": 0.8037384152412415, + "learning_rate": 9.955270124864485e-06, + "loss": 0.8716, + "step": 3257 + }, + { + "epoch": 0.17565236144058657, + "grad_norm": 0.7610961198806763, + "learning_rate": 9.955241825786379e-06, + "loss": 0.7647, + "step": 3258 + }, + { + "epoch": 0.1757062756092301, + "grad_norm": 0.7867752909660339, + "learning_rate": 9.955213517799418e-06, + "loss": 0.7685, + "step": 3259 + }, + { + "epoch": 0.17576018977787364, + "grad_norm": 1.1530165672302246, + "learning_rate": 9.955185200903652e-06, + "loss": 0.9032, + "step": 3260 + }, + { + "epoch": 0.17581410394651714, + "grad_norm": 0.7161276936531067, + "learning_rate": 9.955156875099129e-06, + "loss": 0.7367, + "step": 3261 + }, + { + "epoch": 0.17586801811516067, + "grad_norm": 0.7634873390197754, + "learning_rate": 9.955128540385903e-06, + "loss": 0.6914, + "step": 3262 + }, + { + "epoch": 0.17592193228380418, + "grad_norm": 0.8375166654586792, + "learning_rate": 9.955100196764025e-06, + "loss": 0.965, + "step": 3263 + }, + { + "epoch": 0.1759758464524477, + "grad_norm": 0.784824788570404, + "learning_rate": 9.955071844233545e-06, + "loss": 0.7825, + "step": 3264 + }, + { + "epoch": 0.1760297606210912, + "grad_norm": 0.7765333652496338, + "learning_rate": 9.955043482794514e-06, + "loss": 0.9057, + "step": 3265 + }, + { + "epoch": 0.17608367478973475, + "grad_norm": 0.9159989356994629, + "learning_rate": 9.955015112446985e-06, + "loss": 0.8055, + "step": 3266 + }, + { + "epoch": 0.17613758895837825, + "grad_norm": 0.8813021183013916, + "learning_rate": 9.954986733191003e-06, + "loss": 0.8811, + "step": 3267 + }, + { + "epoch": 
0.17619150312702178, + "grad_norm": 0.7664482593536377, + "learning_rate": 9.954958345026627e-06, + "loss": 0.7138, + "step": 3268 + }, + { + "epoch": 0.1762454172956653, + "grad_norm": 0.8903096914291382, + "learning_rate": 9.954929947953902e-06, + "loss": 0.8884, + "step": 3269 + }, + { + "epoch": 0.17629933146430882, + "grad_norm": 0.750549852848053, + "learning_rate": 9.95490154197288e-06, + "loss": 0.7948, + "step": 3270 + }, + { + "epoch": 0.17635324563295235, + "grad_norm": 0.8723561763763428, + "learning_rate": 9.954873127083615e-06, + "loss": 0.8896, + "step": 3271 + }, + { + "epoch": 0.17640715980159585, + "grad_norm": 0.8852900862693787, + "learning_rate": 9.954844703286157e-06, + "loss": 0.8504, + "step": 3272 + }, + { + "epoch": 0.17646107397023938, + "grad_norm": 0.8535251021385193, + "learning_rate": 9.954816270580555e-06, + "loss": 0.7198, + "step": 3273 + }, + { + "epoch": 0.1765149881388829, + "grad_norm": 0.8378668427467346, + "learning_rate": 9.954787828966864e-06, + "loss": 0.8361, + "step": 3274 + }, + { + "epoch": 0.17656890230752642, + "grad_norm": 0.7617664337158203, + "learning_rate": 9.954759378445132e-06, + "loss": 0.8147, + "step": 3275 + }, + { + "epoch": 0.17662281647616992, + "grad_norm": 0.8433284163475037, + "learning_rate": 9.95473091901541e-06, + "loss": 0.9083, + "step": 3276 + }, + { + "epoch": 0.17667673064481346, + "grad_norm": 0.82453453540802, + "learning_rate": 9.954702450677749e-06, + "loss": 0.8646, + "step": 3277 + }, + { + "epoch": 0.176730644813457, + "grad_norm": 0.8066715598106384, + "learning_rate": 9.954673973432202e-06, + "loss": 0.7837, + "step": 3278 + }, + { + "epoch": 0.1767845589821005, + "grad_norm": 0.7899057865142822, + "learning_rate": 9.95464548727882e-06, + "loss": 0.8418, + "step": 3279 + }, + { + "epoch": 0.17683847315074402, + "grad_norm": 0.7744193077087402, + "learning_rate": 9.954616992217654e-06, + "loss": 0.7316, + "step": 3280 + }, + { + "epoch": 0.17689238731938753, + "grad_norm": 
0.9195299744606018, + "learning_rate": 9.954588488248756e-06, + "loss": 0.9387, + "step": 3281 + }, + { + "epoch": 0.17694630148803106, + "grad_norm": 0.9263700246810913, + "learning_rate": 9.954559975372173e-06, + "loss": 0.7165, + "step": 3282 + }, + { + "epoch": 0.17700021565667456, + "grad_norm": 0.7949888706207275, + "learning_rate": 9.954531453587962e-06, + "loss": 0.7981, + "step": 3283 + }, + { + "epoch": 0.1770541298253181, + "grad_norm": 0.9938671588897705, + "learning_rate": 9.95450292289617e-06, + "loss": 0.754, + "step": 3284 + }, + { + "epoch": 0.1771080439939616, + "grad_norm": 0.7466611862182617, + "learning_rate": 9.95447438329685e-06, + "loss": 0.8182, + "step": 3285 + }, + { + "epoch": 0.17716195816260513, + "grad_norm": 0.7918881177902222, + "learning_rate": 9.954445834790054e-06, + "loss": 0.6938, + "step": 3286 + }, + { + "epoch": 0.17721587233124866, + "grad_norm": 0.7867146730422974, + "learning_rate": 9.954417277375832e-06, + "loss": 0.7999, + "step": 3287 + }, + { + "epoch": 0.17726978649989217, + "grad_norm": 0.7873522043228149, + "learning_rate": 9.954388711054237e-06, + "loss": 0.7822, + "step": 3288 + }, + { + "epoch": 0.1773237006685357, + "grad_norm": 0.7909482717514038, + "learning_rate": 9.954360135825319e-06, + "loss": 0.724, + "step": 3289 + }, + { + "epoch": 0.1773776148371792, + "grad_norm": 0.7893263697624207, + "learning_rate": 9.954331551689129e-06, + "loss": 0.8892, + "step": 3290 + }, + { + "epoch": 0.17743152900582274, + "grad_norm": 0.813908040523529, + "learning_rate": 9.954302958645719e-06, + "loss": 0.8261, + "step": 3291 + }, + { + "epoch": 0.17748544317446624, + "grad_norm": 1.0279232263565063, + "learning_rate": 9.95427435669514e-06, + "loss": 0.8383, + "step": 3292 + }, + { + "epoch": 0.17753935734310977, + "grad_norm": 0.7427249550819397, + "learning_rate": 9.954245745837445e-06, + "loss": 0.7883, + "step": 3293 + }, + { + "epoch": 0.17759327151175328, + "grad_norm": 0.7699581980705261, + "learning_rate": 
9.954217126072686e-06, + "loss": 0.749, + "step": 3294 + }, + { + "epoch": 0.1776471856803968, + "grad_norm": 0.8005263209342957, + "learning_rate": 9.954188497400909e-06, + "loss": 0.7886, + "step": 3295 + }, + { + "epoch": 0.17770109984904034, + "grad_norm": 0.8718039393424988, + "learning_rate": 9.95415985982217e-06, + "loss": 0.8397, + "step": 3296 + }, + { + "epoch": 0.17775501401768384, + "grad_norm": 0.7747098207473755, + "learning_rate": 9.954131213336522e-06, + "loss": 0.7193, + "step": 3297 + }, + { + "epoch": 0.17780892818632738, + "grad_norm": 0.8327599167823792, + "learning_rate": 9.954102557944013e-06, + "loss": 0.8484, + "step": 3298 + }, + { + "epoch": 0.17786284235497088, + "grad_norm": 0.7737470269203186, + "learning_rate": 9.954073893644696e-06, + "loss": 0.7638, + "step": 3299 + }, + { + "epoch": 0.1779167565236144, + "grad_norm": 0.8054937124252319, + "learning_rate": 9.954045220438622e-06, + "loss": 0.7772, + "step": 3300 + }, + { + "epoch": 0.17797067069225792, + "grad_norm": 0.7954006195068359, + "learning_rate": 9.954016538325844e-06, + "loss": 0.7746, + "step": 3301 + }, + { + "epoch": 0.17802458486090145, + "grad_norm": 0.8075349926948547, + "learning_rate": 9.95398784730641e-06, + "loss": 0.794, + "step": 3302 + }, + { + "epoch": 0.17807849902954495, + "grad_norm": 0.8701021075248718, + "learning_rate": 9.953959147380376e-06, + "loss": 0.8493, + "step": 3303 + }, + { + "epoch": 0.17813241319818848, + "grad_norm": 0.9046748876571655, + "learning_rate": 9.953930438547792e-06, + "loss": 0.8491, + "step": 3304 + }, + { + "epoch": 0.17818632736683201, + "grad_norm": 0.8041692972183228, + "learning_rate": 9.953901720808708e-06, + "loss": 0.7422, + "step": 3305 + }, + { + "epoch": 0.17824024153547552, + "grad_norm": 0.8486021757125854, + "learning_rate": 9.953872994163176e-06, + "loss": 0.7876, + "step": 3306 + }, + { + "epoch": 0.17829415570411905, + "grad_norm": 0.7282015085220337, + "learning_rate": 9.95384425861125e-06, + "loss": 0.7729, + 
"step": 3307 + }, + { + "epoch": 0.17834806987276255, + "grad_norm": 0.8199304342269897, + "learning_rate": 9.953815514152979e-06, + "loss": 0.8046, + "step": 3308 + }, + { + "epoch": 0.1784019840414061, + "grad_norm": 0.9033650755882263, + "learning_rate": 9.953786760788416e-06, + "loss": 0.735, + "step": 3309 + }, + { + "epoch": 0.1784558982100496, + "grad_norm": 1.1363990306854248, + "learning_rate": 9.953757998517614e-06, + "loss": 0.8351, + "step": 3310 + }, + { + "epoch": 0.17850981237869312, + "grad_norm": 0.747763454914093, + "learning_rate": 9.953729227340621e-06, + "loss": 0.7603, + "step": 3311 + }, + { + "epoch": 0.17856372654733663, + "grad_norm": 0.8733643293380737, + "learning_rate": 9.953700447257493e-06, + "loss": 0.8538, + "step": 3312 + }, + { + "epoch": 0.17861764071598016, + "grad_norm": 0.8054553270339966, + "learning_rate": 9.953671658268279e-06, + "loss": 0.6782, + "step": 3313 + }, + { + "epoch": 0.1786715548846237, + "grad_norm": 0.8797160387039185, + "learning_rate": 9.953642860373032e-06, + "loss": 0.613, + "step": 3314 + }, + { + "epoch": 0.1787254690532672, + "grad_norm": 0.7065737843513489, + "learning_rate": 9.953614053571802e-06, + "loss": 0.7912, + "step": 3315 + }, + { + "epoch": 0.17877938322191073, + "grad_norm": 0.8206682205200195, + "learning_rate": 9.953585237864642e-06, + "loss": 0.8505, + "step": 3316 + }, + { + "epoch": 0.17883329739055423, + "grad_norm": 0.7129380702972412, + "learning_rate": 9.953556413251605e-06, + "loss": 0.7242, + "step": 3317 + }, + { + "epoch": 0.17888721155919776, + "grad_norm": 0.8084376454353333, + "learning_rate": 9.953527579732742e-06, + "loss": 0.7626, + "step": 3318 + }, + { + "epoch": 0.17894112572784127, + "grad_norm": 0.8610605001449585, + "learning_rate": 9.953498737308103e-06, + "loss": 0.8255, + "step": 3319 + }, + { + "epoch": 0.1789950398964848, + "grad_norm": 0.7437496185302734, + "learning_rate": 9.953469885977742e-06, + "loss": 0.677, + "step": 3320 + }, + { + "epoch": 
0.17904895406512833, + "grad_norm": 0.7540122270584106, + "learning_rate": 9.95344102574171e-06, + "loss": 0.7094, + "step": 3321 + }, + { + "epoch": 0.17910286823377183, + "grad_norm": 0.8017913699150085, + "learning_rate": 9.95341215660006e-06, + "loss": 0.8882, + "step": 3322 + }, + { + "epoch": 0.17915678240241537, + "grad_norm": 1.0244393348693848, + "learning_rate": 9.953383278552841e-06, + "loss": 0.7879, + "step": 3323 + }, + { + "epoch": 0.17921069657105887, + "grad_norm": 0.7007571458816528, + "learning_rate": 9.953354391600109e-06, + "loss": 0.757, + "step": 3324 + }, + { + "epoch": 0.1792646107397024, + "grad_norm": 0.8408647775650024, + "learning_rate": 9.953325495741913e-06, + "loss": 0.7772, + "step": 3325 + }, + { + "epoch": 0.1793185249083459, + "grad_norm": 0.718988299369812, + "learning_rate": 9.953296590978305e-06, + "loss": 0.7885, + "step": 3326 + }, + { + "epoch": 0.17937243907698944, + "grad_norm": 0.7917525768280029, + "learning_rate": 9.95326767730934e-06, + "loss": 0.8321, + "step": 3327 + }, + { + "epoch": 0.17942635324563294, + "grad_norm": 0.9516105055809021, + "learning_rate": 9.953238754735066e-06, + "loss": 0.8124, + "step": 3328 + }, + { + "epoch": 0.17948026741427647, + "grad_norm": 0.8829317688941956, + "learning_rate": 9.953209823255536e-06, + "loss": 0.7426, + "step": 3329 + }, + { + "epoch": 0.17953418158292, + "grad_norm": 0.83402019739151, + "learning_rate": 9.953180882870805e-06, + "loss": 0.7358, + "step": 3330 + }, + { + "epoch": 0.1795880957515635, + "grad_norm": 0.819425106048584, + "learning_rate": 9.953151933580923e-06, + "loss": 0.8002, + "step": 3331 + }, + { + "epoch": 0.17964200992020704, + "grad_norm": 0.8458916544914246, + "learning_rate": 9.95312297538594e-06, + "loss": 0.8305, + "step": 3332 + }, + { + "epoch": 0.17969592408885054, + "grad_norm": 0.8235782980918884, + "learning_rate": 9.95309400828591e-06, + "loss": 0.8228, + "step": 3333 + }, + { + "epoch": 0.17974983825749408, + "grad_norm": 
0.7924965023994446, + "learning_rate": 9.953065032280885e-06, + "loss": 0.7369, + "step": 3334 + }, + { + "epoch": 0.17980375242613758, + "grad_norm": 0.7931050658226013, + "learning_rate": 9.953036047370919e-06, + "loss": 0.8337, + "step": 3335 + }, + { + "epoch": 0.1798576665947811, + "grad_norm": 0.7998207211494446, + "learning_rate": 9.95300705355606e-06, + "loss": 0.7341, + "step": 3336 + }, + { + "epoch": 0.17991158076342462, + "grad_norm": 0.713846743106842, + "learning_rate": 9.952978050836364e-06, + "loss": 0.6958, + "step": 3337 + }, + { + "epoch": 0.17996549493206815, + "grad_norm": 0.807744026184082, + "learning_rate": 9.95294903921188e-06, + "loss": 0.7723, + "step": 3338 + }, + { + "epoch": 0.18001940910071168, + "grad_norm": 0.865696370601654, + "learning_rate": 9.95292001868266e-06, + "loss": 0.8957, + "step": 3339 + }, + { + "epoch": 0.18007332326935518, + "grad_norm": 0.7955803871154785, + "learning_rate": 9.952890989248763e-06, + "loss": 0.7632, + "step": 3340 + }, + { + "epoch": 0.18012723743799872, + "grad_norm": 0.8028436303138733, + "learning_rate": 9.952861950910233e-06, + "loss": 0.8642, + "step": 3341 + }, + { + "epoch": 0.18018115160664222, + "grad_norm": 0.8755636215209961, + "learning_rate": 9.952832903667125e-06, + "loss": 0.8521, + "step": 3342 + }, + { + "epoch": 0.18023506577528575, + "grad_norm": 0.8018125891685486, + "learning_rate": 9.952803847519492e-06, + "loss": 0.8719, + "step": 3343 + }, + { + "epoch": 0.18028897994392926, + "grad_norm": 0.6923267245292664, + "learning_rate": 9.952774782467384e-06, + "loss": 0.718, + "step": 3344 + }, + { + "epoch": 0.1803428941125728, + "grad_norm": 0.7926875948905945, + "learning_rate": 9.952745708510856e-06, + "loss": 0.8657, + "step": 3345 + }, + { + "epoch": 0.1803968082812163, + "grad_norm": 0.8815774917602539, + "learning_rate": 9.95271662564996e-06, + "loss": 0.8196, + "step": 3346 + }, + { + "epoch": 0.18045072244985982, + "grad_norm": 0.8497309684753418, + "learning_rate": 
9.952687533884748e-06, + "loss": 0.7563, + "step": 3347 + }, + { + "epoch": 0.18050463661850336, + "grad_norm": 0.7040117979049683, + "learning_rate": 9.952658433215269e-06, + "loss": 0.687, + "step": 3348 + }, + { + "epoch": 0.18055855078714686, + "grad_norm": 0.8446635007858276, + "learning_rate": 9.95262932364158e-06, + "loss": 0.895, + "step": 3349 + }, + { + "epoch": 0.1806124649557904, + "grad_norm": 0.821702778339386, + "learning_rate": 9.952600205163733e-06, + "loss": 0.8387, + "step": 3350 + }, + { + "epoch": 0.1806663791244339, + "grad_norm": 0.9755251407623291, + "learning_rate": 9.952571077781776e-06, + "loss": 0.9119, + "step": 3351 + }, + { + "epoch": 0.18072029329307743, + "grad_norm": 0.8260585069656372, + "learning_rate": 9.952541941495766e-06, + "loss": 0.7827, + "step": 3352 + }, + { + "epoch": 0.18077420746172093, + "grad_norm": 0.7443965673446655, + "learning_rate": 9.952512796305753e-06, + "loss": 0.7331, + "step": 3353 + }, + { + "epoch": 0.18082812163036446, + "grad_norm": 0.8674094676971436, + "learning_rate": 9.95248364221179e-06, + "loss": 0.8789, + "step": 3354 + }, + { + "epoch": 0.18088203579900797, + "grad_norm": 0.7950018644332886, + "learning_rate": 9.952454479213929e-06, + "loss": 0.7802, + "step": 3355 + }, + { + "epoch": 0.1809359499676515, + "grad_norm": 0.8740068078041077, + "learning_rate": 9.952425307312223e-06, + "loss": 0.9354, + "step": 3356 + }, + { + "epoch": 0.18098986413629503, + "grad_norm": 0.8254936933517456, + "learning_rate": 9.952396126506724e-06, + "loss": 0.8903, + "step": 3357 + }, + { + "epoch": 0.18104377830493854, + "grad_norm": 0.7814514636993408, + "learning_rate": 9.952366936797484e-06, + "loss": 0.7214, + "step": 3358 + }, + { + "epoch": 0.18109769247358207, + "grad_norm": 0.7647988796234131, + "learning_rate": 9.952337738184557e-06, + "loss": 0.7591, + "step": 3359 + }, + { + "epoch": 0.18115160664222557, + "grad_norm": 0.8247759938240051, + "learning_rate": 9.952308530667996e-06, + "loss": 0.7825, + 
"step": 3360 + }, + { + "epoch": 0.1812055208108691, + "grad_norm": 0.724585771560669, + "learning_rate": 9.95227931424785e-06, + "loss": 0.7828, + "step": 3361 + }, + { + "epoch": 0.1812594349795126, + "grad_norm": 0.8304919004440308, + "learning_rate": 9.952250088924175e-06, + "loss": 0.8071, + "step": 3362 + }, + { + "epoch": 0.18131334914815614, + "grad_norm": 0.8318499326705933, + "learning_rate": 9.95222085469702e-06, + "loss": 0.7571, + "step": 3363 + }, + { + "epoch": 0.18136726331679964, + "grad_norm": 0.7315414547920227, + "learning_rate": 9.952191611566443e-06, + "loss": 0.7644, + "step": 3364 + }, + { + "epoch": 0.18142117748544317, + "grad_norm": 0.853285551071167, + "learning_rate": 9.952162359532493e-06, + "loss": 0.8946, + "step": 3365 + }, + { + "epoch": 0.1814750916540867, + "grad_norm": 0.8418978452682495, + "learning_rate": 9.95213309859522e-06, + "loss": 0.7892, + "step": 3366 + }, + { + "epoch": 0.1815290058227302, + "grad_norm": 0.7926337122917175, + "learning_rate": 9.952103828754682e-06, + "loss": 0.7182, + "step": 3367 + }, + { + "epoch": 0.18158291999137374, + "grad_norm": 0.9103478193283081, + "learning_rate": 9.95207455001093e-06, + "loss": 0.8474, + "step": 3368 + }, + { + "epoch": 0.18163683416001725, + "grad_norm": 0.8050599098205566, + "learning_rate": 9.952045262364014e-06, + "loss": 0.7581, + "step": 3369 + }, + { + "epoch": 0.18169074832866078, + "grad_norm": 0.7441660165786743, + "learning_rate": 9.952015965813988e-06, + "loss": 0.7713, + "step": 3370 + }, + { + "epoch": 0.18174466249730428, + "grad_norm": 0.7210862636566162, + "learning_rate": 9.951986660360906e-06, + "loss": 0.7732, + "step": 3371 + }, + { + "epoch": 0.18179857666594781, + "grad_norm": 0.8199747204780579, + "learning_rate": 9.951957346004822e-06, + "loss": 0.8697, + "step": 3372 + }, + { + "epoch": 0.18185249083459132, + "grad_norm": 0.7781465649604797, + "learning_rate": 9.951928022745784e-06, + "loss": 0.8011, + "step": 3373 + }, + { + "epoch": 
0.18190640500323485, + "grad_norm": 0.8713019490242004, + "learning_rate": 9.951898690583848e-06, + "loss": 0.8328, + "step": 3374 + }, + { + "epoch": 0.18196031917187838, + "grad_norm": 0.7194361686706543, + "learning_rate": 9.951869349519066e-06, + "loss": 0.7291, + "step": 3375 + }, + { + "epoch": 0.18201423334052189, + "grad_norm": 0.7940298914909363, + "learning_rate": 9.95183999955149e-06, + "loss": 0.8128, + "step": 3376 + }, + { + "epoch": 0.18206814750916542, + "grad_norm": 0.8048009872436523, + "learning_rate": 9.951810640681175e-06, + "loss": 0.7627, + "step": 3377 + }, + { + "epoch": 0.18212206167780892, + "grad_norm": 0.8479227423667908, + "learning_rate": 9.951781272908173e-06, + "loss": 0.7587, + "step": 3378 + }, + { + "epoch": 0.18217597584645245, + "grad_norm": 0.8620457053184509, + "learning_rate": 9.951751896232534e-06, + "loss": 0.7409, + "step": 3379 + }, + { + "epoch": 0.18222989001509596, + "grad_norm": 0.8283497095108032, + "learning_rate": 9.951722510654314e-06, + "loss": 0.7953, + "step": 3380 + }, + { + "epoch": 0.1822838041837395, + "grad_norm": 0.9071113467216492, + "learning_rate": 9.951693116173565e-06, + "loss": 0.8476, + "step": 3381 + }, + { + "epoch": 0.182337718352383, + "grad_norm": 0.8383519053459167, + "learning_rate": 9.951663712790338e-06, + "loss": 0.8388, + "step": 3382 + }, + { + "epoch": 0.18239163252102653, + "grad_norm": 0.8026612997055054, + "learning_rate": 9.951634300504689e-06, + "loss": 0.8848, + "step": 3383 + }, + { + "epoch": 0.18244554668967006, + "grad_norm": 0.8395872116088867, + "learning_rate": 9.951604879316667e-06, + "loss": 0.7759, + "step": 3384 + }, + { + "epoch": 0.18249946085831356, + "grad_norm": 1.1459238529205322, + "learning_rate": 9.95157544922633e-06, + "loss": 0.8005, + "step": 3385 + }, + { + "epoch": 0.1825533750269571, + "grad_norm": 0.8083657026290894, + "learning_rate": 9.951546010233729e-06, + "loss": 0.8298, + "step": 3386 + }, + { + "epoch": 0.1826072891956006, + "grad_norm": 
0.8329801559448242, + "learning_rate": 9.951516562338912e-06, + "loss": 0.7743, + "step": 3387 + }, + { + "epoch": 0.18266120336424413, + "grad_norm": 0.7916942834854126, + "learning_rate": 9.951487105541939e-06, + "loss": 0.7934, + "step": 3388 + }, + { + "epoch": 0.18271511753288763, + "grad_norm": 0.8752714395523071, + "learning_rate": 9.951457639842861e-06, + "loss": 0.8031, + "step": 3389 + }, + { + "epoch": 0.18276903170153116, + "grad_norm": 0.7645601630210876, + "learning_rate": 9.951428165241728e-06, + "loss": 0.6987, + "step": 3390 + }, + { + "epoch": 0.18282294587017467, + "grad_norm": 0.9860275983810425, + "learning_rate": 9.951398681738595e-06, + "loss": 0.8027, + "step": 3391 + }, + { + "epoch": 0.1828768600388182, + "grad_norm": 0.8548283576965332, + "learning_rate": 9.951369189333515e-06, + "loss": 0.8595, + "step": 3392 + }, + { + "epoch": 0.18293077420746173, + "grad_norm": 0.843217670917511, + "learning_rate": 9.95133968802654e-06, + "loss": 0.8437, + "step": 3393 + }, + { + "epoch": 0.18298468837610524, + "grad_norm": 0.7996432781219482, + "learning_rate": 9.951310177817726e-06, + "loss": 0.7229, + "step": 3394 + }, + { + "epoch": 0.18303860254474877, + "grad_norm": 0.8908971548080444, + "learning_rate": 9.951280658707124e-06, + "loss": 0.8639, + "step": 3395 + }, + { + "epoch": 0.18309251671339227, + "grad_norm": 0.9041224718093872, + "learning_rate": 9.951251130694787e-06, + "loss": 0.8026, + "step": 3396 + }, + { + "epoch": 0.1831464308820358, + "grad_norm": 0.7458503842353821, + "learning_rate": 9.951221593780768e-06, + "loss": 0.8228, + "step": 3397 + }, + { + "epoch": 0.1832003450506793, + "grad_norm": 0.8241537809371948, + "learning_rate": 9.95119204796512e-06, + "loss": 0.7937, + "step": 3398 + }, + { + "epoch": 0.18325425921932284, + "grad_norm": 0.8728781342506409, + "learning_rate": 9.951162493247897e-06, + "loss": 0.8829, + "step": 3399 + }, + { + "epoch": 0.18330817338796634, + "grad_norm": 0.843101978302002, + "learning_rate": 
9.95113292962915e-06, + "loss": 0.9562, + "step": 3400 + }, + { + "epoch": 0.18336208755660988, + "grad_norm": 1.031156301498413, + "learning_rate": 9.951103357108935e-06, + "loss": 0.6757, + "step": 3401 + }, + { + "epoch": 0.1834160017252534, + "grad_norm": 0.9858013391494751, + "learning_rate": 9.951073775687304e-06, + "loss": 0.7922, + "step": 3402 + }, + { + "epoch": 0.1834699158938969, + "grad_norm": 0.9532352685928345, + "learning_rate": 9.95104418536431e-06, + "loss": 0.8979, + "step": 3403 + }, + { + "epoch": 0.18352383006254044, + "grad_norm": 0.9552246332168579, + "learning_rate": 9.951014586140006e-06, + "loss": 0.8682, + "step": 3404 + }, + { + "epoch": 0.18357774423118395, + "grad_norm": 0.8952224850654602, + "learning_rate": 9.950984978014446e-06, + "loss": 0.9064, + "step": 3405 + }, + { + "epoch": 0.18363165839982748, + "grad_norm": 0.8228804469108582, + "learning_rate": 9.950955360987684e-06, + "loss": 0.8337, + "step": 3406 + }, + { + "epoch": 0.18368557256847098, + "grad_norm": 0.8621776103973389, + "learning_rate": 9.95092573505977e-06, + "loss": 0.8418, + "step": 3407 + }, + { + "epoch": 0.18373948673711452, + "grad_norm": 0.8312029242515564, + "learning_rate": 9.95089610023076e-06, + "loss": 0.8453, + "step": 3408 + }, + { + "epoch": 0.18379340090575802, + "grad_norm": 0.8212811350822449, + "learning_rate": 9.950866456500706e-06, + "loss": 0.7226, + "step": 3409 + }, + { + "epoch": 0.18384731507440155, + "grad_norm": 0.7918773293495178, + "learning_rate": 9.950836803869663e-06, + "loss": 0.7546, + "step": 3410 + }, + { + "epoch": 0.18390122924304508, + "grad_norm": 0.8544521331787109, + "learning_rate": 9.950807142337682e-06, + "loss": 0.8975, + "step": 3411 + }, + { + "epoch": 0.1839551434116886, + "grad_norm": 0.7909727692604065, + "learning_rate": 9.950777471904818e-06, + "loss": 0.8266, + "step": 3412 + }, + { + "epoch": 0.18400905758033212, + "grad_norm": 0.7834721207618713, + "learning_rate": 9.950747792571122e-06, + "loss": 0.7647, + 
"step": 3413 + }, + { + "epoch": 0.18406297174897562, + "grad_norm": 1.0084491968154907, + "learning_rate": 9.950718104336651e-06, + "loss": 0.8954, + "step": 3414 + }, + { + "epoch": 0.18411688591761916, + "grad_norm": 0.9300922155380249, + "learning_rate": 9.950688407201457e-06, + "loss": 0.8106, + "step": 3415 + }, + { + "epoch": 0.18417080008626266, + "grad_norm": 0.7957245111465454, + "learning_rate": 9.950658701165593e-06, + "loss": 0.7556, + "step": 3416 + }, + { + "epoch": 0.1842247142549062, + "grad_norm": 0.7386512160301208, + "learning_rate": 9.950628986229111e-06, + "loss": 0.7384, + "step": 3417 + }, + { + "epoch": 0.1842786284235497, + "grad_norm": 0.8791146874427795, + "learning_rate": 9.950599262392067e-06, + "loss": 0.7681, + "step": 3418 + }, + { + "epoch": 0.18433254259219323, + "grad_norm": 0.78180330991745, + "learning_rate": 9.950569529654512e-06, + "loss": 0.7641, + "step": 3419 + }, + { + "epoch": 0.18438645676083676, + "grad_norm": 0.7648051977157593, + "learning_rate": 9.950539788016502e-06, + "loss": 0.7782, + "step": 3420 + }, + { + "epoch": 0.18444037092948026, + "grad_norm": 0.8135426640510559, + "learning_rate": 9.950510037478089e-06, + "loss": 0.8313, + "step": 3421 + }, + { + "epoch": 0.1844942850981238, + "grad_norm": 0.8623054623603821, + "learning_rate": 9.950480278039325e-06, + "loss": 0.8142, + "step": 3422 + }, + { + "epoch": 0.1845481992667673, + "grad_norm": 0.774558424949646, + "learning_rate": 9.950450509700267e-06, + "loss": 0.7747, + "step": 3423 + }, + { + "epoch": 0.18460211343541083, + "grad_norm": 0.7947419285774231, + "learning_rate": 9.950420732460965e-06, + "loss": 0.8757, + "step": 3424 + }, + { + "epoch": 0.18465602760405433, + "grad_norm": 0.8677110075950623, + "learning_rate": 9.950390946321475e-06, + "loss": 0.8527, + "step": 3425 + }, + { + "epoch": 0.18470994177269787, + "grad_norm": 0.8350674510002136, + "learning_rate": 9.950361151281852e-06, + "loss": 0.7209, + "step": 3426 + }, + { + "epoch": 
0.1847638559413414, + "grad_norm": 0.7326707243919373, + "learning_rate": 9.950331347342143e-06, + "loss": 0.749, + "step": 3427 + }, + { + "epoch": 0.1848177701099849, + "grad_norm": 0.8775684237480164, + "learning_rate": 9.95030153450241e-06, + "loss": 0.762, + "step": 3428 + }, + { + "epoch": 0.18487168427862843, + "grad_norm": 0.8116014003753662, + "learning_rate": 9.9502717127627e-06, + "loss": 0.7592, + "step": 3429 + }, + { + "epoch": 0.18492559844727194, + "grad_norm": 0.7852542996406555, + "learning_rate": 9.950241882123068e-06, + "loss": 0.8254, + "step": 3430 + }, + { + "epoch": 0.18497951261591547, + "grad_norm": 0.761076807975769, + "learning_rate": 9.950212042583571e-06, + "loss": 0.7444, + "step": 3431 + }, + { + "epoch": 0.18503342678455897, + "grad_norm": 0.914729118347168, + "learning_rate": 9.95018219414426e-06, + "loss": 0.8847, + "step": 3432 + }, + { + "epoch": 0.1850873409532025, + "grad_norm": 0.7256419062614441, + "learning_rate": 9.950152336805188e-06, + "loss": 0.7069, + "step": 3433 + }, + { + "epoch": 0.185141255121846, + "grad_norm": 0.7481849193572998, + "learning_rate": 9.950122470566411e-06, + "loss": 0.7921, + "step": 3434 + }, + { + "epoch": 0.18519516929048954, + "grad_norm": 0.7878799438476562, + "learning_rate": 9.95009259542798e-06, + "loss": 0.7422, + "step": 3435 + }, + { + "epoch": 0.18524908345913307, + "grad_norm": 0.8083212375640869, + "learning_rate": 9.950062711389953e-06, + "loss": 0.8445, + "step": 3436 + }, + { + "epoch": 0.18530299762777658, + "grad_norm": 0.9458408355712891, + "learning_rate": 9.950032818452377e-06, + "loss": 0.771, + "step": 3437 + }, + { + "epoch": 0.1853569117964201, + "grad_norm": 0.7575398087501526, + "learning_rate": 9.950002916615311e-06, + "loss": 0.765, + "step": 3438 + }, + { + "epoch": 0.1854108259650636, + "grad_norm": 0.8672422766685486, + "learning_rate": 9.94997300587881e-06, + "loss": 0.8499, + "step": 3439 + }, + { + "epoch": 0.18546474013370715, + "grad_norm": 0.7971605658531189, 
+ "learning_rate": 9.949943086242923e-06, + "loss": 0.8617, + "step": 3440 + }, + { + "epoch": 0.18551865430235065, + "grad_norm": 1.0215446949005127, + "learning_rate": 9.949913157707704e-06, + "loss": 0.8224, + "step": 3441 + }, + { + "epoch": 0.18557256847099418, + "grad_norm": 0.7983795404434204, + "learning_rate": 9.949883220273211e-06, + "loss": 0.7497, + "step": 3442 + }, + { + "epoch": 0.18562648263963769, + "grad_norm": 0.8548665642738342, + "learning_rate": 9.949853273939496e-06, + "loss": 0.856, + "step": 3443 + }, + { + "epoch": 0.18568039680828122, + "grad_norm": 0.7996117472648621, + "learning_rate": 9.949823318706611e-06, + "loss": 0.7344, + "step": 3444 + }, + { + "epoch": 0.18573431097692475, + "grad_norm": 0.9108440279960632, + "learning_rate": 9.949793354574612e-06, + "loss": 0.8229, + "step": 3445 + }, + { + "epoch": 0.18578822514556825, + "grad_norm": 0.8484078049659729, + "learning_rate": 9.949763381543553e-06, + "loss": 0.7366, + "step": 3446 + }, + { + "epoch": 0.18584213931421179, + "grad_norm": 0.7617974877357483, + "learning_rate": 9.949733399613486e-06, + "loss": 0.777, + "step": 3447 + }, + { + "epoch": 0.1858960534828553, + "grad_norm": 1.0613569021224976, + "learning_rate": 9.949703408784465e-06, + "loss": 0.9028, + "step": 3448 + }, + { + "epoch": 0.18594996765149882, + "grad_norm": 0.7503539323806763, + "learning_rate": 9.949673409056546e-06, + "loss": 0.797, + "step": 3449 + }, + { + "epoch": 0.18600388182014232, + "grad_norm": 0.8162353038787842, + "learning_rate": 9.949643400429782e-06, + "loss": 0.8698, + "step": 3450 + }, + { + "epoch": 0.18605779598878586, + "grad_norm": 0.8876883387565613, + "learning_rate": 9.949613382904226e-06, + "loss": 0.8422, + "step": 3451 + }, + { + "epoch": 0.18611171015742936, + "grad_norm": 0.7412144541740417, + "learning_rate": 9.949583356479934e-06, + "loss": 0.7977, + "step": 3452 + }, + { + "epoch": 0.1861656243260729, + "grad_norm": 0.7515407204627991, + "learning_rate": 9.949553321156957e-06, 
+ "loss": 0.8046, + "step": 3453 + }, + { + "epoch": 0.18621953849471642, + "grad_norm": 0.8171376585960388, + "learning_rate": 9.949523276935352e-06, + "loss": 0.7121, + "step": 3454 + }, + { + "epoch": 0.18627345266335993, + "grad_norm": 0.838368833065033, + "learning_rate": 9.94949322381517e-06, + "loss": 0.833, + "step": 3455 + }, + { + "epoch": 0.18632736683200346, + "grad_norm": 1.0004788637161255, + "learning_rate": 9.949463161796468e-06, + "loss": 0.7967, + "step": 3456 + }, + { + "epoch": 0.18638128100064696, + "grad_norm": 0.8949950337409973, + "learning_rate": 9.949433090879298e-06, + "loss": 0.815, + "step": 3457 + }, + { + "epoch": 0.1864351951692905, + "grad_norm": 0.8611262440681458, + "learning_rate": 9.949403011063716e-06, + "loss": 0.8998, + "step": 3458 + }, + { + "epoch": 0.186489109337934, + "grad_norm": 0.7873225212097168, + "learning_rate": 9.949372922349775e-06, + "loss": 0.8011, + "step": 3459 + }, + { + "epoch": 0.18654302350657753, + "grad_norm": 0.7770752310752869, + "learning_rate": 9.949342824737529e-06, + "loss": 0.7687, + "step": 3460 + }, + { + "epoch": 0.18659693767522104, + "grad_norm": 0.7723278403282166, + "learning_rate": 9.949312718227031e-06, + "loss": 0.8047, + "step": 3461 + }, + { + "epoch": 0.18665085184386457, + "grad_norm": 0.8038878440856934, + "learning_rate": 9.949282602818335e-06, + "loss": 0.6522, + "step": 3462 + }, + { + "epoch": 0.1867047660125081, + "grad_norm": 0.8243177533149719, + "learning_rate": 9.949252478511499e-06, + "loss": 0.7859, + "step": 3463 + }, + { + "epoch": 0.1867586801811516, + "grad_norm": 0.8061205744743347, + "learning_rate": 9.949222345306574e-06, + "loss": 0.8, + "step": 3464 + }, + { + "epoch": 0.18681259434979514, + "grad_norm": 0.8916036486625671, + "learning_rate": 9.949192203203615e-06, + "loss": 0.7831, + "step": 3465 + }, + { + "epoch": 0.18686650851843864, + "grad_norm": 0.7694443464279175, + "learning_rate": 9.949162052202675e-06, + "loss": 0.753, + "step": 3466 + }, + { + 
"epoch": 0.18692042268708217, + "grad_norm": 0.8028594255447388, + "learning_rate": 9.94913189230381e-06, + "loss": 0.7834, + "step": 3467 + }, + { + "epoch": 0.18697433685572568, + "grad_norm": 0.8558024764060974, + "learning_rate": 9.94910172350707e-06, + "loss": 0.8479, + "step": 3468 + }, + { + "epoch": 0.1870282510243692, + "grad_norm": 0.8418707251548767, + "learning_rate": 9.949071545812517e-06, + "loss": 0.7841, + "step": 3469 + }, + { + "epoch": 0.1870821651930127, + "grad_norm": 0.9143140316009521, + "learning_rate": 9.9490413592202e-06, + "loss": 0.7803, + "step": 3470 + }, + { + "epoch": 0.18713607936165624, + "grad_norm": 0.927670419216156, + "learning_rate": 9.949011163730172e-06, + "loss": 0.7969, + "step": 3471 + }, + { + "epoch": 0.18718999353029978, + "grad_norm": 0.7614530324935913, + "learning_rate": 9.948980959342492e-06, + "loss": 0.7541, + "step": 3472 + }, + { + "epoch": 0.18724390769894328, + "grad_norm": 0.7719544172286987, + "learning_rate": 9.948950746057208e-06, + "loss": 0.6996, + "step": 3473 + }, + { + "epoch": 0.1872978218675868, + "grad_norm": 0.8512967824935913, + "learning_rate": 9.94892052387438e-06, + "loss": 0.8749, + "step": 3474 + }, + { + "epoch": 0.18735173603623032, + "grad_norm": 0.7408632636070251, + "learning_rate": 9.948890292794062e-06, + "loss": 0.7646, + "step": 3475 + }, + { + "epoch": 0.18740565020487385, + "grad_norm": 0.7667837142944336, + "learning_rate": 9.948860052816305e-06, + "loss": 0.7721, + "step": 3476 + }, + { + "epoch": 0.18745956437351735, + "grad_norm": 0.8099546432495117, + "learning_rate": 9.948829803941167e-06, + "loss": 0.8604, + "step": 3477 + }, + { + "epoch": 0.18751347854216088, + "grad_norm": 0.7130147814750671, + "learning_rate": 9.948799546168699e-06, + "loss": 0.7215, + "step": 3478 + }, + { + "epoch": 0.1875673927108044, + "grad_norm": 0.7442251443862915, + "learning_rate": 9.948769279498955e-06, + "loss": 0.7691, + "step": 3479 + }, + { + "epoch": 0.18762130687944792, + "grad_norm": 
0.8528403043746948, + "learning_rate": 9.948739003931995e-06, + "loss": 0.8738, + "step": 3480 + }, + { + "epoch": 0.18767522104809145, + "grad_norm": 0.7217040061950684, + "learning_rate": 9.948708719467868e-06, + "loss": 0.6989, + "step": 3481 + }, + { + "epoch": 0.18772913521673495, + "grad_norm": 1.0738893747329712, + "learning_rate": 9.94867842610663e-06, + "loss": 0.7464, + "step": 3482 + }, + { + "epoch": 0.1877830493853785, + "grad_norm": 0.7653424739837646, + "learning_rate": 9.948648123848334e-06, + "loss": 0.8552, + "step": 3483 + }, + { + "epoch": 0.187836963554022, + "grad_norm": 0.791019856929779, + "learning_rate": 9.948617812693037e-06, + "loss": 0.8548, + "step": 3484 + }, + { + "epoch": 0.18789087772266552, + "grad_norm": 0.8527680039405823, + "learning_rate": 9.948587492640796e-06, + "loss": 0.7717, + "step": 3485 + }, + { + "epoch": 0.18794479189130903, + "grad_norm": 1.0001403093338013, + "learning_rate": 9.948557163691659e-06, + "loss": 0.8061, + "step": 3486 + }, + { + "epoch": 0.18799870605995256, + "grad_norm": 0.7622776627540588, + "learning_rate": 9.948526825845683e-06, + "loss": 0.7082, + "step": 3487 + }, + { + "epoch": 0.18805262022859606, + "grad_norm": 0.7377861142158508, + "learning_rate": 9.948496479102925e-06, + "loss": 0.7776, + "step": 3488 + }, + { + "epoch": 0.1881065343972396, + "grad_norm": 0.9017737507820129, + "learning_rate": 9.948466123463436e-06, + "loss": 0.7676, + "step": 3489 + }, + { + "epoch": 0.18816044856588313, + "grad_norm": 0.7733216881752014, + "learning_rate": 9.948435758927274e-06, + "loss": 0.7503, + "step": 3490 + }, + { + "epoch": 0.18821436273452663, + "grad_norm": 0.9103933572769165, + "learning_rate": 9.948405385494491e-06, + "loss": 0.8696, + "step": 3491 + }, + { + "epoch": 0.18826827690317016, + "grad_norm": 0.7228747010231018, + "learning_rate": 9.948375003165143e-06, + "loss": 0.8396, + "step": 3492 + }, + { + "epoch": 0.18832219107181367, + "grad_norm": 0.9336891174316406, + "learning_rate": 
9.948344611939283e-06, + "loss": 0.7994, + "step": 3493 + }, + { + "epoch": 0.1883761052404572, + "grad_norm": 0.8534504175186157, + "learning_rate": 9.948314211816968e-06, + "loss": 0.7627, + "step": 3494 + }, + { + "epoch": 0.1884300194091007, + "grad_norm": 0.867060661315918, + "learning_rate": 9.94828380279825e-06, + "loss": 0.8503, + "step": 3495 + }, + { + "epoch": 0.18848393357774423, + "grad_norm": 0.7721019983291626, + "learning_rate": 9.948253384883188e-06, + "loss": 0.7409, + "step": 3496 + }, + { + "epoch": 0.18853784774638774, + "grad_norm": 0.7308738827705383, + "learning_rate": 9.948222958071832e-06, + "loss": 0.7579, + "step": 3497 + }, + { + "epoch": 0.18859176191503127, + "grad_norm": 1.1277705430984497, + "learning_rate": 9.948192522364237e-06, + "loss": 0.8288, + "step": 3498 + }, + { + "epoch": 0.1886456760836748, + "grad_norm": 0.8183790445327759, + "learning_rate": 9.948162077760462e-06, + "loss": 0.7819, + "step": 3499 + }, + { + "epoch": 0.1886995902523183, + "grad_norm": 0.7458687424659729, + "learning_rate": 9.948131624260557e-06, + "loss": 0.7482, + "step": 3500 + }, + { + "epoch": 0.18875350442096184, + "grad_norm": 0.9347942471504211, + "learning_rate": 9.94810116186458e-06, + "loss": 0.8208, + "step": 3501 + }, + { + "epoch": 0.18880741858960534, + "grad_norm": 0.7442129254341125, + "learning_rate": 9.948070690572582e-06, + "loss": 0.7843, + "step": 3502 + }, + { + "epoch": 0.18886133275824887, + "grad_norm": 0.8121855854988098, + "learning_rate": 9.948040210384622e-06, + "loss": 0.738, + "step": 3503 + }, + { + "epoch": 0.18891524692689238, + "grad_norm": 0.8118747472763062, + "learning_rate": 9.948009721300754e-06, + "loss": 0.8792, + "step": 3504 + }, + { + "epoch": 0.1889691610955359, + "grad_norm": 0.8263816833496094, + "learning_rate": 9.94797922332103e-06, + "loss": 0.7759, + "step": 3505 + }, + { + "epoch": 0.1890230752641794, + "grad_norm": 0.7452372908592224, + "learning_rate": 9.947948716445508e-06, + "loss": 0.7588, + 
"step": 3506 + }, + { + "epoch": 0.18907698943282295, + "grad_norm": 0.7385339736938477, + "learning_rate": 9.94791820067424e-06, + "loss": 0.8412, + "step": 3507 + }, + { + "epoch": 0.18913090360146648, + "grad_norm": 0.7456401586532593, + "learning_rate": 9.947887676007284e-06, + "loss": 0.7539, + "step": 3508 + }, + { + "epoch": 0.18918481777010998, + "grad_norm": 0.8101776242256165, + "learning_rate": 9.947857142444693e-06, + "loss": 0.8006, + "step": 3509 + }, + { + "epoch": 0.1892387319387535, + "grad_norm": 0.7587085962295532, + "learning_rate": 9.947826599986523e-06, + "loss": 0.7958, + "step": 3510 + }, + { + "epoch": 0.18929264610739702, + "grad_norm": 0.7974298596382141, + "learning_rate": 9.947796048632826e-06, + "loss": 0.7954, + "step": 3511 + }, + { + "epoch": 0.18934656027604055, + "grad_norm": 0.8407479524612427, + "learning_rate": 9.94776548838366e-06, + "loss": 0.825, + "step": 3512 + }, + { + "epoch": 0.18940047444468405, + "grad_norm": 0.7465969324111938, + "learning_rate": 9.94773491923908e-06, + "loss": 0.7725, + "step": 3513 + }, + { + "epoch": 0.18945438861332758, + "grad_norm": 0.9324356913566589, + "learning_rate": 9.947704341199137e-06, + "loss": 0.755, + "step": 3514 + }, + { + "epoch": 0.1895083027819711, + "grad_norm": 0.8157918453216553, + "learning_rate": 9.94767375426389e-06, + "loss": 0.8678, + "step": 3515 + }, + { + "epoch": 0.18956221695061462, + "grad_norm": 0.8501976132392883, + "learning_rate": 9.947643158433395e-06, + "loss": 0.8431, + "step": 3516 + }, + { + "epoch": 0.18961613111925815, + "grad_norm": 0.7773411273956299, + "learning_rate": 9.947612553707703e-06, + "loss": 0.748, + "step": 3517 + }, + { + "epoch": 0.18967004528790166, + "grad_norm": 0.7716071605682373, + "learning_rate": 9.947581940086873e-06, + "loss": 0.7563, + "step": 3518 + }, + { + "epoch": 0.1897239594565452, + "grad_norm": 0.9465253353118896, + "learning_rate": 9.947551317570957e-06, + "loss": 0.9289, + "step": 3519 + }, + { + "epoch": 
0.1897778736251887, + "grad_norm": 0.7123626470565796, + "learning_rate": 9.94752068616001e-06, + "loss": 0.7012, + "step": 3520 + }, + { + "epoch": 0.18983178779383222, + "grad_norm": 0.7318246960639954, + "learning_rate": 9.94749004585409e-06, + "loss": 0.8247, + "step": 3521 + }, + { + "epoch": 0.18988570196247573, + "grad_norm": 0.8028656244277954, + "learning_rate": 9.947459396653248e-06, + "loss": 0.8606, + "step": 3522 + }, + { + "epoch": 0.18993961613111926, + "grad_norm": 0.7580826282501221, + "learning_rate": 9.947428738557541e-06, + "loss": 0.7801, + "step": 3523 + }, + { + "epoch": 0.18999353029976276, + "grad_norm": 0.7612492442131042, + "learning_rate": 9.947398071567025e-06, + "loss": 0.8298, + "step": 3524 + }, + { + "epoch": 0.1900474444684063, + "grad_norm": 0.7892666459083557, + "learning_rate": 9.947367395681755e-06, + "loss": 0.739, + "step": 3525 + }, + { + "epoch": 0.19010135863704983, + "grad_norm": 0.7531749606132507, + "learning_rate": 9.947336710901785e-06, + "loss": 0.7804, + "step": 3526 + }, + { + "epoch": 0.19015527280569333, + "grad_norm": 0.7833613753318787, + "learning_rate": 9.947306017227171e-06, + "loss": 0.6541, + "step": 3527 + }, + { + "epoch": 0.19020918697433686, + "grad_norm": 0.749286413192749, + "learning_rate": 9.94727531465797e-06, + "loss": 0.6982, + "step": 3528 + }, + { + "epoch": 0.19026310114298037, + "grad_norm": 0.9150011539459229, + "learning_rate": 9.947244603194233e-06, + "loss": 0.8681, + "step": 3529 + }, + { + "epoch": 0.1903170153116239, + "grad_norm": 0.8265007138252258, + "learning_rate": 9.947213882836018e-06, + "loss": 0.9088, + "step": 3530 + }, + { + "epoch": 0.1903709294802674, + "grad_norm": 0.7807170152664185, + "learning_rate": 9.947183153583379e-06, + "loss": 0.7875, + "step": 3531 + }, + { + "epoch": 0.19042484364891094, + "grad_norm": 1.0078792572021484, + "learning_rate": 9.947152415436375e-06, + "loss": 1.2045, + "step": 3532 + }, + { + "epoch": 0.19047875781755447, + "grad_norm": 
0.7661539912223816, + "learning_rate": 9.947121668395055e-06, + "loss": 0.8202, + "step": 3533 + }, + { + "epoch": 0.19053267198619797, + "grad_norm": 0.7419549226760864, + "learning_rate": 9.947090912459479e-06, + "loss": 0.7775, + "step": 3534 + }, + { + "epoch": 0.1905865861548415, + "grad_norm": 0.9671319723129272, + "learning_rate": 9.947060147629698e-06, + "loss": 0.8328, + "step": 3535 + }, + { + "epoch": 0.190640500323485, + "grad_norm": 0.9418153762817383, + "learning_rate": 9.947029373905773e-06, + "loss": 0.8476, + "step": 3536 + }, + { + "epoch": 0.19069441449212854, + "grad_norm": 0.8007176518440247, + "learning_rate": 9.946998591287755e-06, + "loss": 0.8379, + "step": 3537 + }, + { + "epoch": 0.19074832866077204, + "grad_norm": 1.0271466970443726, + "learning_rate": 9.946967799775701e-06, + "loss": 0.7789, + "step": 3538 + }, + { + "epoch": 0.19080224282941557, + "grad_norm": 0.7577568888664246, + "learning_rate": 9.946936999369668e-06, + "loss": 0.7749, + "step": 3539 + }, + { + "epoch": 0.19085615699805908, + "grad_norm": 0.7766523361206055, + "learning_rate": 9.946906190069707e-06, + "loss": 0.7143, + "step": 3540 + }, + { + "epoch": 0.1909100711667026, + "grad_norm": 0.798589825630188, + "learning_rate": 9.946875371875876e-06, + "loss": 0.8481, + "step": 3541 + }, + { + "epoch": 0.19096398533534614, + "grad_norm": 0.8279602527618408, + "learning_rate": 9.946844544788232e-06, + "loss": 0.8369, + "step": 3542 + }, + { + "epoch": 0.19101789950398965, + "grad_norm": 0.7607479691505432, + "learning_rate": 9.946813708806828e-06, + "loss": 0.8088, + "step": 3543 + }, + { + "epoch": 0.19107181367263318, + "grad_norm": 0.7722266912460327, + "learning_rate": 9.946782863931719e-06, + "loss": 0.704, + "step": 3544 + }, + { + "epoch": 0.19112572784127668, + "grad_norm": 0.8101015686988831, + "learning_rate": 9.946752010162964e-06, + "loss": 0.7828, + "step": 3545 + }, + { + "epoch": 0.19117964200992021, + "grad_norm": 0.8161671161651611, + "learning_rate": 
9.946721147500613e-06, + "loss": 0.8875, + "step": 3546 + }, + { + "epoch": 0.19123355617856372, + "grad_norm": 0.9234161972999573, + "learning_rate": 9.946690275944727e-06, + "loss": 0.8846, + "step": 3547 + }, + { + "epoch": 0.19128747034720725, + "grad_norm": 0.7948644757270813, + "learning_rate": 9.946659395495357e-06, + "loss": 0.8331, + "step": 3548 + }, + { + "epoch": 0.19134138451585075, + "grad_norm": 0.9087135791778564, + "learning_rate": 9.946628506152563e-06, + "loss": 0.7462, + "step": 3549 + }, + { + "epoch": 0.19139529868449429, + "grad_norm": 0.7624903917312622, + "learning_rate": 9.946597607916396e-06, + "loss": 0.6431, + "step": 3550 + }, + { + "epoch": 0.19144921285313782, + "grad_norm": 0.9236660003662109, + "learning_rate": 9.946566700786914e-06, + "loss": 0.921, + "step": 3551 + }, + { + "epoch": 0.19150312702178132, + "grad_norm": 0.8824177980422974, + "learning_rate": 9.946535784764173e-06, + "loss": 0.805, + "step": 3552 + }, + { + "epoch": 0.19155704119042485, + "grad_norm": 0.7843056917190552, + "learning_rate": 9.946504859848227e-06, + "loss": 0.8528, + "step": 3553 + }, + { + "epoch": 0.19161095535906836, + "grad_norm": 1.2314038276672363, + "learning_rate": 9.946473926039134e-06, + "loss": 0.8141, + "step": 3554 + }, + { + "epoch": 0.1916648695277119, + "grad_norm": 0.7956500053405762, + "learning_rate": 9.946442983336945e-06, + "loss": 0.7946, + "step": 3555 + }, + { + "epoch": 0.1917187836963554, + "grad_norm": 0.850674033164978, + "learning_rate": 9.94641203174172e-06, + "loss": 0.8965, + "step": 3556 + }, + { + "epoch": 0.19177269786499893, + "grad_norm": 0.8371244668960571, + "learning_rate": 9.946381071253514e-06, + "loss": 0.7859, + "step": 3557 + }, + { + "epoch": 0.19182661203364243, + "grad_norm": 0.7423365712165833, + "learning_rate": 9.946350101872382e-06, + "loss": 0.8012, + "step": 3558 + }, + { + "epoch": 0.19188052620228596, + "grad_norm": 0.8446981310844421, + "learning_rate": 9.946319123598379e-06, + "loss": 0.9037, + 
"step": 3559 + }, + { + "epoch": 0.1919344403709295, + "grad_norm": 0.8565588593482971, + "learning_rate": 9.946288136431562e-06, + "loss": 0.7398, + "step": 3560 + }, + { + "epoch": 0.191988354539573, + "grad_norm": 0.8087875843048096, + "learning_rate": 9.946257140371985e-06, + "loss": 0.7214, + "step": 3561 + }, + { + "epoch": 0.19204226870821653, + "grad_norm": 0.7951125502586365, + "learning_rate": 9.946226135419705e-06, + "loss": 0.7988, + "step": 3562 + }, + { + "epoch": 0.19209618287686003, + "grad_norm": 0.8709264397621155, + "learning_rate": 9.946195121574779e-06, + "loss": 0.8563, + "step": 3563 + }, + { + "epoch": 0.19215009704550357, + "grad_norm": 0.7908393740653992, + "learning_rate": 9.94616409883726e-06, + "loss": 0.7874, + "step": 3564 + }, + { + "epoch": 0.19220401121414707, + "grad_norm": 1.0512382984161377, + "learning_rate": 9.946133067207204e-06, + "loss": 0.9174, + "step": 3565 + }, + { + "epoch": 0.1922579253827906, + "grad_norm": 0.7937822937965393, + "learning_rate": 9.94610202668467e-06, + "loss": 0.6863, + "step": 3566 + }, + { + "epoch": 0.1923118395514341, + "grad_norm": 0.9130533337593079, + "learning_rate": 9.94607097726971e-06, + "loss": 0.8287, + "step": 3567 + }, + { + "epoch": 0.19236575372007764, + "grad_norm": 1.1604489088058472, + "learning_rate": 9.946039918962383e-06, + "loss": 0.6922, + "step": 3568 + }, + { + "epoch": 0.19241966788872117, + "grad_norm": 1.0400906801223755, + "learning_rate": 9.946008851762743e-06, + "loss": 0.7978, + "step": 3569 + }, + { + "epoch": 0.19247358205736467, + "grad_norm": 0.8068282008171082, + "learning_rate": 9.945977775670845e-06, + "loss": 0.7365, + "step": 3570 + }, + { + "epoch": 0.1925274962260082, + "grad_norm": 0.8328807353973389, + "learning_rate": 9.945946690686747e-06, + "loss": 0.7308, + "step": 3571 + }, + { + "epoch": 0.1925814103946517, + "grad_norm": 0.946949303150177, + "learning_rate": 9.945915596810502e-06, + "loss": 0.9117, + "step": 3572 + }, + { + "epoch": 
0.19263532456329524, + "grad_norm": 0.8421696424484253, + "learning_rate": 9.94588449404217e-06, + "loss": 0.7132, + "step": 3573 + }, + { + "epoch": 0.19268923873193874, + "grad_norm": 0.7321984171867371, + "learning_rate": 9.945853382381805e-06, + "loss": 0.752, + "step": 3574 + }, + { + "epoch": 0.19274315290058228, + "grad_norm": 0.8039024472236633, + "learning_rate": 9.94582226182946e-06, + "loss": 0.7952, + "step": 3575 + }, + { + "epoch": 0.19279706706922578, + "grad_norm": 0.8612285256385803, + "learning_rate": 9.945791132385196e-06, + "loss": 0.7944, + "step": 3576 + }, + { + "epoch": 0.1928509812378693, + "grad_norm": 1.0525864362716675, + "learning_rate": 9.945759994049066e-06, + "loss": 0.8078, + "step": 3577 + }, + { + "epoch": 0.19290489540651284, + "grad_norm": 0.8032466769218445, + "learning_rate": 9.945728846821128e-06, + "loss": 0.8522, + "step": 3578 + }, + { + "epoch": 0.19295880957515635, + "grad_norm": 1.324041485786438, + "learning_rate": 9.945697690701435e-06, + "loss": 0.7705, + "step": 3579 + }, + { + "epoch": 0.19301272374379988, + "grad_norm": 0.8733030557632446, + "learning_rate": 9.945666525690044e-06, + "loss": 0.8115, + "step": 3580 + }, + { + "epoch": 0.19306663791244338, + "grad_norm": 0.8208357095718384, + "learning_rate": 9.945635351787012e-06, + "loss": 0.7975, + "step": 3581 + }, + { + "epoch": 0.19312055208108692, + "grad_norm": 0.744498074054718, + "learning_rate": 9.945604168992395e-06, + "loss": 0.8088, + "step": 3582 + }, + { + "epoch": 0.19317446624973042, + "grad_norm": 0.9391197562217712, + "learning_rate": 9.945572977306249e-06, + "loss": 0.8403, + "step": 3583 + }, + { + "epoch": 0.19322838041837395, + "grad_norm": 0.8050488829612732, + "learning_rate": 9.945541776728629e-06, + "loss": 0.769, + "step": 3584 + }, + { + "epoch": 0.19328229458701746, + "grad_norm": 0.8373685479164124, + "learning_rate": 9.945510567259592e-06, + "loss": 0.7803, + "step": 3585 + }, + { + "epoch": 0.193336208755661, + "grad_norm": 
0.8766368627548218, + "learning_rate": 9.945479348899194e-06, + "loss": 0.8325, + "step": 3586 + }, + { + "epoch": 0.19339012292430452, + "grad_norm": 0.8029547333717346, + "learning_rate": 9.945448121647492e-06, + "loss": 0.6647, + "step": 3587 + }, + { + "epoch": 0.19344403709294802, + "grad_norm": 0.7231468558311462, + "learning_rate": 9.94541688550454e-06, + "loss": 0.6939, + "step": 3588 + }, + { + "epoch": 0.19349795126159156, + "grad_norm": 0.8487125039100647, + "learning_rate": 9.945385640470397e-06, + "loss": 0.8097, + "step": 3589 + }, + { + "epoch": 0.19355186543023506, + "grad_norm": 0.7813920378684998, + "learning_rate": 9.945354386545116e-06, + "loss": 0.8023, + "step": 3590 + }, + { + "epoch": 0.1936057795988786, + "grad_norm": 0.8754404783248901, + "learning_rate": 9.945323123728756e-06, + "loss": 0.8401, + "step": 3591 + }, + { + "epoch": 0.1936596937675221, + "grad_norm": 0.8191613554954529, + "learning_rate": 9.945291852021371e-06, + "loss": 0.8151, + "step": 3592 + }, + { + "epoch": 0.19371360793616563, + "grad_norm": 0.7882266044616699, + "learning_rate": 9.945260571423019e-06, + "loss": 0.77, + "step": 3593 + }, + { + "epoch": 0.19376752210480913, + "grad_norm": 0.816411554813385, + "learning_rate": 9.945229281933756e-06, + "loss": 0.7378, + "step": 3594 + }, + { + "epoch": 0.19382143627345266, + "grad_norm": 0.8545891046524048, + "learning_rate": 9.945197983553636e-06, + "loss": 0.7563, + "step": 3595 + }, + { + "epoch": 0.1938753504420962, + "grad_norm": 0.8293501138687134, + "learning_rate": 9.945166676282717e-06, + "loss": 0.893, + "step": 3596 + }, + { + "epoch": 0.1939292646107397, + "grad_norm": 0.7536304593086243, + "learning_rate": 9.945135360121058e-06, + "loss": 0.7101, + "step": 3597 + }, + { + "epoch": 0.19398317877938323, + "grad_norm": 0.96649569272995, + "learning_rate": 9.94510403506871e-06, + "loss": 0.8027, + "step": 3598 + }, + { + "epoch": 0.19403709294802673, + "grad_norm": 0.7543211579322815, + "learning_rate": 
9.945072701125733e-06, + "loss": 0.8144, + "step": 3599 + }, + { + "epoch": 0.19409100711667027, + "grad_norm": 0.7223193049430847, + "learning_rate": 9.945041358292183e-06, + "loss": 0.7585, + "step": 3600 + }, + { + "epoch": 0.19414492128531377, + "grad_norm": 0.8515756726264954, + "learning_rate": 9.945010006568115e-06, + "loss": 0.9114, + "step": 3601 + }, + { + "epoch": 0.1941988354539573, + "grad_norm": 0.7318340539932251, + "learning_rate": 9.944978645953585e-06, + "loss": 0.7554, + "step": 3602 + }, + { + "epoch": 0.1942527496226008, + "grad_norm": 0.8565723299980164, + "learning_rate": 9.944947276448649e-06, + "loss": 0.8918, + "step": 3603 + }, + { + "epoch": 0.19430666379124434, + "grad_norm": 0.8536270260810852, + "learning_rate": 9.944915898053367e-06, + "loss": 0.8184, + "step": 3604 + }, + { + "epoch": 0.19436057795988787, + "grad_norm": 0.7093652486801147, + "learning_rate": 9.944884510767792e-06, + "loss": 0.8031, + "step": 3605 + }, + { + "epoch": 0.19441449212853137, + "grad_norm": 0.7644805312156677, + "learning_rate": 9.944853114591984e-06, + "loss": 0.8546, + "step": 3606 + }, + { + "epoch": 0.1944684062971749, + "grad_norm": 0.6533430218696594, + "learning_rate": 9.944821709525994e-06, + "loss": 0.6453, + "step": 3607 + }, + { + "epoch": 0.1945223204658184, + "grad_norm": 0.8608343005180359, + "learning_rate": 9.944790295569883e-06, + "loss": 0.8539, + "step": 3608 + }, + { + "epoch": 0.19457623463446194, + "grad_norm": 0.777740478515625, + "learning_rate": 9.944758872723706e-06, + "loss": 0.7414, + "step": 3609 + }, + { + "epoch": 0.19463014880310545, + "grad_norm": 0.7757480144500732, + "learning_rate": 9.944727440987518e-06, + "loss": 0.7394, + "step": 3610 + }, + { + "epoch": 0.19468406297174898, + "grad_norm": 0.7862492203712463, + "learning_rate": 9.944696000361379e-06, + "loss": 0.8264, + "step": 3611 + }, + { + "epoch": 0.19473797714039248, + "grad_norm": 0.72691410779953, + "learning_rate": 9.944664550845342e-06, + "loss": 0.6876, + 
"step": 3612 + }, + { + "epoch": 0.194791891309036, + "grad_norm": 0.8702194094657898, + "learning_rate": 9.944633092439467e-06, + "loss": 0.7286, + "step": 3613 + }, + { + "epoch": 0.19484580547767955, + "grad_norm": 1.1160287857055664, + "learning_rate": 9.944601625143806e-06, + "loss": 0.8619, + "step": 3614 + }, + { + "epoch": 0.19489971964632305, + "grad_norm": 0.8278397917747498, + "learning_rate": 9.944570148958419e-06, + "loss": 0.7458, + "step": 3615 + }, + { + "epoch": 0.19495363381496658, + "grad_norm": 0.8430503606796265, + "learning_rate": 9.944538663883362e-06, + "loss": 0.7681, + "step": 3616 + }, + { + "epoch": 0.19500754798361009, + "grad_norm": 0.8198543190956116, + "learning_rate": 9.94450716991869e-06, + "loss": 0.6681, + "step": 3617 + }, + { + "epoch": 0.19506146215225362, + "grad_norm": 0.7874541282653809, + "learning_rate": 9.944475667064464e-06, + "loss": 0.813, + "step": 3618 + }, + { + "epoch": 0.19511537632089712, + "grad_norm": 0.76181960105896, + "learning_rate": 9.944444155320736e-06, + "loss": 0.7443, + "step": 3619 + }, + { + "epoch": 0.19516929048954065, + "grad_norm": 0.7647060751914978, + "learning_rate": 9.944412634687563e-06, + "loss": 0.8232, + "step": 3620 + }, + { + "epoch": 0.19522320465818416, + "grad_norm": 0.7609487771987915, + "learning_rate": 9.944381105165006e-06, + "loss": 0.8134, + "step": 3621 + }, + { + "epoch": 0.1952771188268277, + "grad_norm": 0.8139258027076721, + "learning_rate": 9.944349566753116e-06, + "loss": 0.8053, + "step": 3622 + }, + { + "epoch": 0.19533103299547122, + "grad_norm": 0.7404879927635193, + "learning_rate": 9.944318019451952e-06, + "loss": 0.7774, + "step": 3623 + }, + { + "epoch": 0.19538494716411473, + "grad_norm": 0.863972008228302, + "learning_rate": 9.944286463261573e-06, + "loss": 0.8824, + "step": 3624 + }, + { + "epoch": 0.19543886133275826, + "grad_norm": 0.907744824886322, + "learning_rate": 9.944254898182033e-06, + "loss": 0.7537, + "step": 3625 + }, + { + "epoch": 
0.19549277550140176, + "grad_norm": 0.8722240328788757, + "learning_rate": 9.944223324213389e-06, + "loss": 0.8688, + "step": 3626 + }, + { + "epoch": 0.1955466896700453, + "grad_norm": 0.7386543154716492, + "learning_rate": 9.9441917413557e-06, + "loss": 0.6962, + "step": 3627 + }, + { + "epoch": 0.1956006038386888, + "grad_norm": 0.7577354907989502, + "learning_rate": 9.944160149609018e-06, + "loss": 0.7261, + "step": 3628 + }, + { + "epoch": 0.19565451800733233, + "grad_norm": 0.8413889408111572, + "learning_rate": 9.944128548973407e-06, + "loss": 0.8369, + "step": 3629 + }, + { + "epoch": 0.19570843217597583, + "grad_norm": 0.8649793863296509, + "learning_rate": 9.944096939448917e-06, + "loss": 0.8363, + "step": 3630 + }, + { + "epoch": 0.19576234634461936, + "grad_norm": 0.7515233755111694, + "learning_rate": 9.944065321035607e-06, + "loss": 0.7634, + "step": 3631 + }, + { + "epoch": 0.1958162605132629, + "grad_norm": 0.9059920310974121, + "learning_rate": 9.944033693733535e-06, + "loss": 0.9312, + "step": 3632 + }, + { + "epoch": 0.1958701746819064, + "grad_norm": 0.780707597732544, + "learning_rate": 9.944002057542757e-06, + "loss": 0.7545, + "step": 3633 + }, + { + "epoch": 0.19592408885054993, + "grad_norm": 0.7543255686759949, + "learning_rate": 9.94397041246333e-06, + "loss": 0.7496, + "step": 3634 + }, + { + "epoch": 0.19597800301919344, + "grad_norm": 0.7795106172561646, + "learning_rate": 9.943938758495313e-06, + "loss": 0.6734, + "step": 3635 + }, + { + "epoch": 0.19603191718783697, + "grad_norm": 0.9682700037956238, + "learning_rate": 9.943907095638758e-06, + "loss": 0.8928, + "step": 3636 + }, + { + "epoch": 0.19608583135648047, + "grad_norm": 0.7332949638366699, + "learning_rate": 9.943875423893727e-06, + "loss": 0.7507, + "step": 3637 + }, + { + "epoch": 0.196139745525124, + "grad_norm": 0.8316323161125183, + "learning_rate": 9.943843743260275e-06, + "loss": 0.7492, + "step": 3638 + }, + { + "epoch": 0.19619365969376754, + "grad_norm": 
0.7973113059997559, + "learning_rate": 9.943812053738458e-06, + "loss": 0.8381, + "step": 3639 + }, + { + "epoch": 0.19624757386241104, + "grad_norm": 0.7654823064804077, + "learning_rate": 9.943780355328332e-06, + "loss": 0.8497, + "step": 3640 + }, + { + "epoch": 0.19630148803105457, + "grad_norm": 0.7055602073669434, + "learning_rate": 9.943748648029958e-06, + "loss": 0.7949, + "step": 3641 + }, + { + "epoch": 0.19635540219969808, + "grad_norm": 0.9971569180488586, + "learning_rate": 9.94371693184339e-06, + "loss": 0.8311, + "step": 3642 + }, + { + "epoch": 0.1964093163683416, + "grad_norm": 0.7608943581581116, + "learning_rate": 9.943685206768686e-06, + "loss": 0.8303, + "step": 3643 + }, + { + "epoch": 0.1964632305369851, + "grad_norm": 0.9169919490814209, + "learning_rate": 9.943653472805901e-06, + "loss": 0.8314, + "step": 3644 + }, + { + "epoch": 0.19651714470562864, + "grad_norm": 0.8501203656196594, + "learning_rate": 9.943621729955096e-06, + "loss": 0.8765, + "step": 3645 + }, + { + "epoch": 0.19657105887427215, + "grad_norm": 0.7438945770263672, + "learning_rate": 9.943589978216325e-06, + "loss": 0.7323, + "step": 3646 + }, + { + "epoch": 0.19662497304291568, + "grad_norm": 0.8795550465583801, + "learning_rate": 9.943558217589646e-06, + "loss": 0.7916, + "step": 3647 + }, + { + "epoch": 0.1966788872115592, + "grad_norm": 0.7928707003593445, + "learning_rate": 9.943526448075117e-06, + "loss": 0.8621, + "step": 3648 + }, + { + "epoch": 0.19673280138020272, + "grad_norm": 0.8225892782211304, + "learning_rate": 9.943494669672792e-06, + "loss": 0.8718, + "step": 3649 + }, + { + "epoch": 0.19678671554884625, + "grad_norm": 0.8227444291114807, + "learning_rate": 9.943462882382732e-06, + "loss": 0.8374, + "step": 3650 + }, + { + "epoch": 0.19684062971748975, + "grad_norm": 0.7860620021820068, + "learning_rate": 9.943431086204991e-06, + "loss": 0.8919, + "step": 3651 + }, + { + "epoch": 0.19689454388613328, + "grad_norm": 0.8000875115394592, + "learning_rate": 
9.94339928113963e-06, + "loss": 0.7822, + "step": 3652 + }, + { + "epoch": 0.1969484580547768, + "grad_norm": 0.796389639377594, + "learning_rate": 9.943367467186702e-06, + "loss": 0.7149, + "step": 3653 + }, + { + "epoch": 0.19700237222342032, + "grad_norm": 0.8032622337341309, + "learning_rate": 9.943335644346267e-06, + "loss": 0.8442, + "step": 3654 + }, + { + "epoch": 0.19705628639206382, + "grad_norm": 0.8624833226203918, + "learning_rate": 9.94330381261838e-06, + "loss": 0.8681, + "step": 3655 + }, + { + "epoch": 0.19711020056070735, + "grad_norm": 0.9663752317428589, + "learning_rate": 9.9432719720031e-06, + "loss": 0.8749, + "step": 3656 + }, + { + "epoch": 0.1971641147293509, + "grad_norm": 0.6869292259216309, + "learning_rate": 9.943240122500484e-06, + "loss": 0.7288, + "step": 3657 + }, + { + "epoch": 0.1972180288979944, + "grad_norm": 0.7496824264526367, + "learning_rate": 9.943208264110589e-06, + "loss": 0.7191, + "step": 3658 + }, + { + "epoch": 0.19727194306663792, + "grad_norm": 0.7637088894844055, + "learning_rate": 9.943176396833471e-06, + "loss": 0.7602, + "step": 3659 + }, + { + "epoch": 0.19732585723528143, + "grad_norm": 0.7049651741981506, + "learning_rate": 9.94314452066919e-06, + "loss": 0.7097, + "step": 3660 + }, + { + "epoch": 0.19737977140392496, + "grad_norm": 0.8979986310005188, + "learning_rate": 9.943112635617802e-06, + "loss": 0.7953, + "step": 3661 + }, + { + "epoch": 0.19743368557256846, + "grad_norm": 0.7865282893180847, + "learning_rate": 9.943080741679364e-06, + "loss": 0.7394, + "step": 3662 + }, + { + "epoch": 0.197487599741212, + "grad_norm": 0.7790982723236084, + "learning_rate": 9.943048838853932e-06, + "loss": 0.8587, + "step": 3663 + }, + { + "epoch": 0.1975415139098555, + "grad_norm": 0.8486214876174927, + "learning_rate": 9.943016927141566e-06, + "loss": 0.9232, + "step": 3664 + }, + { + "epoch": 0.19759542807849903, + "grad_norm": 0.7729238867759705, + "learning_rate": 9.942985006542322e-06, + "loss": 0.7704, + 
"step": 3665 + }, + { + "epoch": 0.19764934224714256, + "grad_norm": 0.7827340960502625, + "learning_rate": 9.942953077056259e-06, + "loss": 0.7834, + "step": 3666 + }, + { + "epoch": 0.19770325641578607, + "grad_norm": 0.8735725283622742, + "learning_rate": 9.94292113868343e-06, + "loss": 0.7521, + "step": 3667 + }, + { + "epoch": 0.1977571705844296, + "grad_norm": 0.803302526473999, + "learning_rate": 9.942889191423897e-06, + "loss": 0.7475, + "step": 3668 + }, + { + "epoch": 0.1978110847530731, + "grad_norm": 0.7523918747901917, + "learning_rate": 9.942857235277716e-06, + "loss": 0.7882, + "step": 3669 + }, + { + "epoch": 0.19786499892171663, + "grad_norm": 0.891010582447052, + "learning_rate": 9.942825270244944e-06, + "loss": 0.6855, + "step": 3670 + }, + { + "epoch": 0.19791891309036014, + "grad_norm": 0.8103521466255188, + "learning_rate": 9.94279329632564e-06, + "loss": 0.7604, + "step": 3671 + }, + { + "epoch": 0.19797282725900367, + "grad_norm": 0.7801117897033691, + "learning_rate": 9.94276131351986e-06, + "loss": 0.757, + "step": 3672 + }, + { + "epoch": 0.19802674142764717, + "grad_norm": 0.8760844469070435, + "learning_rate": 9.942729321827661e-06, + "loss": 0.9507, + "step": 3673 + }, + { + "epoch": 0.1980806555962907, + "grad_norm": 0.7129818201065063, + "learning_rate": 9.942697321249101e-06, + "loss": 0.7118, + "step": 3674 + }, + { + "epoch": 0.19813456976493424, + "grad_norm": 0.7223137021064758, + "learning_rate": 9.942665311784239e-06, + "loss": 0.6911, + "step": 3675 + }, + { + "epoch": 0.19818848393357774, + "grad_norm": 0.7100752592086792, + "learning_rate": 9.94263329343313e-06, + "loss": 0.7569, + "step": 3676 + }, + { + "epoch": 0.19824239810222127, + "grad_norm": 0.955298662185669, + "learning_rate": 9.942601266195834e-06, + "loss": 0.8562, + "step": 3677 + }, + { + "epoch": 0.19829631227086478, + "grad_norm": 0.7367860078811646, + "learning_rate": 9.942569230072408e-06, + "loss": 0.7184, + "step": 3678 + }, + { + "epoch": 
0.1983502264395083, + "grad_norm": 0.7822328805923462, + "learning_rate": 9.942537185062909e-06, + "loss": 0.7111, + "step": 3679 + }, + { + "epoch": 0.1984041406081518, + "grad_norm": 0.8836474418640137, + "learning_rate": 9.942505131167394e-06, + "loss": 0.731, + "step": 3680 + }, + { + "epoch": 0.19845805477679535, + "grad_norm": 0.7033706903457642, + "learning_rate": 9.942473068385921e-06, + "loss": 0.7228, + "step": 3681 + }, + { + "epoch": 0.19851196894543885, + "grad_norm": 0.7241103649139404, + "learning_rate": 9.942440996718549e-06, + "loss": 0.7045, + "step": 3682 + }, + { + "epoch": 0.19856588311408238, + "grad_norm": 0.8266516923904419, + "learning_rate": 9.942408916165334e-06, + "loss": 0.781, + "step": 3683 + }, + { + "epoch": 0.1986197972827259, + "grad_norm": 0.9639707207679749, + "learning_rate": 9.942376826726334e-06, + "loss": 0.8136, + "step": 3684 + }, + { + "epoch": 0.19867371145136942, + "grad_norm": 0.874279797077179, + "learning_rate": 9.942344728401609e-06, + "loss": 0.8147, + "step": 3685 + }, + { + "epoch": 0.19872762562001295, + "grad_norm": 0.7670862674713135, + "learning_rate": 9.942312621191213e-06, + "loss": 0.8134, + "step": 3686 + }, + { + "epoch": 0.19878153978865645, + "grad_norm": 0.8974711894989014, + "learning_rate": 9.942280505095206e-06, + "loss": 0.8211, + "step": 3687 + }, + { + "epoch": 0.19883545395729998, + "grad_norm": 0.8174877762794495, + "learning_rate": 9.942248380113646e-06, + "loss": 0.8641, + "step": 3688 + }, + { + "epoch": 0.1988893681259435, + "grad_norm": 0.7798371315002441, + "learning_rate": 9.942216246246588e-06, + "loss": 0.7226, + "step": 3689 + }, + { + "epoch": 0.19894328229458702, + "grad_norm": 0.8269854784011841, + "learning_rate": 9.942184103494093e-06, + "loss": 0.8789, + "step": 3690 + }, + { + "epoch": 0.19899719646323052, + "grad_norm": 0.8148782253265381, + "learning_rate": 9.942151951856217e-06, + "loss": 0.8436, + "step": 3691 + }, + { + "epoch": 0.19905111063187406, + "grad_norm": 
0.823692262172699, + "learning_rate": 9.942119791333017e-06, + "loss": 0.6935, + "step": 3692 + }, + { + "epoch": 0.1991050248005176, + "grad_norm": 0.8396292924880981, + "learning_rate": 9.942087621924555e-06, + "loss": 0.8814, + "step": 3693 + }, + { + "epoch": 0.1991589389691611, + "grad_norm": 0.7293786406517029, + "learning_rate": 9.942055443630885e-06, + "loss": 0.7735, + "step": 3694 + }, + { + "epoch": 0.19921285313780462, + "grad_norm": 0.7367222905158997, + "learning_rate": 9.942023256452066e-06, + "loss": 0.7797, + "step": 3695 + }, + { + "epoch": 0.19926676730644813, + "grad_norm": 0.7078450322151184, + "learning_rate": 9.941991060388155e-06, + "loss": 0.7192, + "step": 3696 + }, + { + "epoch": 0.19932068147509166, + "grad_norm": 0.7927302718162537, + "learning_rate": 9.941958855439211e-06, + "loss": 0.8249, + "step": 3697 + }, + { + "epoch": 0.19937459564373516, + "grad_norm": 0.806266725063324, + "learning_rate": 9.941926641605292e-06, + "loss": 0.7829, + "step": 3698 + }, + { + "epoch": 0.1994285098123787, + "grad_norm": 0.8022493720054626, + "learning_rate": 9.941894418886455e-06, + "loss": 0.7843, + "step": 3699 + }, + { + "epoch": 0.1994824239810222, + "grad_norm": 0.8877873420715332, + "learning_rate": 9.941862187282759e-06, + "loss": 0.7266, + "step": 3700 + }, + { + "epoch": 0.19953633814966573, + "grad_norm": 0.7944962382316589, + "learning_rate": 9.94182994679426e-06, + "loss": 0.8078, + "step": 3701 + }, + { + "epoch": 0.19959025231830926, + "grad_norm": 0.8684442639350891, + "learning_rate": 9.941797697421017e-06, + "loss": 0.7445, + "step": 3702 + }, + { + "epoch": 0.19964416648695277, + "grad_norm": 0.7841063141822815, + "learning_rate": 9.94176543916309e-06, + "loss": 0.7231, + "step": 3703 + }, + { + "epoch": 0.1996980806555963, + "grad_norm": 0.7657507658004761, + "learning_rate": 9.941733172020533e-06, + "loss": 0.7018, + "step": 3704 + }, + { + "epoch": 0.1997519948242398, + "grad_norm": 1.086627721786499, + "learning_rate": 
9.94170089599341e-06, + "loss": 0.7914, + "step": 3705 + }, + { + "epoch": 0.19980590899288334, + "grad_norm": 0.7400459051132202, + "learning_rate": 9.941668611081771e-06, + "loss": 0.7841, + "step": 3706 + }, + { + "epoch": 0.19985982316152684, + "grad_norm": 1.0587258338928223, + "learning_rate": 9.94163631728568e-06, + "loss": 0.923, + "step": 3707 + }, + { + "epoch": 0.19991373733017037, + "grad_norm": 0.8322579264640808, + "learning_rate": 9.941604014605193e-06, + "loss": 0.8095, + "step": 3708 + }, + { + "epoch": 0.19996765149881388, + "grad_norm": 0.6660327911376953, + "learning_rate": 9.94157170304037e-06, + "loss": 0.6977, + "step": 3709 + }, + { + "epoch": 0.2000215656674574, + "grad_norm": 0.8063632249832153, + "learning_rate": 9.941539382591267e-06, + "loss": 0.7693, + "step": 3710 + } + ], + "logging_steps": 1, + "max_steps": 74192, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 1855, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.0948419465951314e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3710/training_args.bin b/checkpoint-3710/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..736549377f516c4bc25a43293c6f37ec549a9a60 --- /dev/null +++ b/checkpoint-3710/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb97268504007caea6a1175a54f08b974d7fa47a1a5fb4547021d5b9d223b4a4 +size 7928 diff --git a/checkpoint-3710/zero_to_fp32.py b/checkpoint-3710/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-3710/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, 
"zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = 
zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, 
zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + 
full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + 
wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # 
recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. 
+ + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. 
(one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-5565/config.json b/checkpoint-5565/config.json new file mode 100644 index 0000000000000000000000000000000000000000..7f34bbd5159c9a132258ecf79562e79459cb64d9 --- /dev/null +++ b/checkpoint-5565/config.json @@ -0,0 +1,36 @@ +{ + "_name_or_path": "./meta-llama_Llama-3.1-8B-Instruct/", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128001, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.46.1", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/checkpoint-5565/generation_config.json b/checkpoint-5565/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0484b997a9ea9b5b6d711db644716bfd32d5470e --- /dev/null +++ b/checkpoint-5565/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + 
"transformers_version": "4.46.1" +} diff --git a/checkpoint-5565/global_step5565/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-5565/global_step5565/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5e7de65d1f2c6b49017ee4efd2402145f65a3842 --- /dev/null +++ b/checkpoint-5565/global_step5565/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b071f292e6c6bf5c790fc0250f83924c7ccfbfaee2f52d0afde7c1b9f53aa12 +size 12045398464 diff --git a/checkpoint-5565/global_step5565/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-5565/global_step5565/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ee6c07d6b08d664b624a5c98edebe31e5165ee38 --- /dev/null +++ b/checkpoint-5565/global_step5565/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1897f2c7a94ff34817b29c8a8324beb342769d6a4973477b2c059230e887337f +size 12045399232 diff --git a/checkpoint-5565/global_step5565/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-5565/global_step5565/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ba088b040c7e9ea3b9a771ecfac21693d911eb6a --- /dev/null +++ b/checkpoint-5565/global_step5565/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62dd056ed83f51fca522ba4a55062449157f523edf13f3bf7f50d17cd76b5877 +size 12045399488 diff --git a/checkpoint-5565/global_step5565/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-5565/global_step5565/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7d0a67dfc66f43dc56fc1ec974fa703eddad8116 --- /dev/null +++ 
b/checkpoint-5565/global_step5565/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f64fcb07354f3a7e88391f20967b8f716f889ef791ba3d6b2f62b822d43c0f9 +size 12045399232 diff --git a/checkpoint-5565/global_step5565/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-5565/global_step5565/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5f1e34aecfd5d06911aa2b35d8ae5d55b15ca693 --- /dev/null +++ b/checkpoint-5565/global_step5565/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab0630db7d6de28a43ccf68d7c438dd8678fb11e029a1d3ae9760366d2731ad9 +size 12045399488 diff --git a/checkpoint-5565/global_step5565/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-5565/global_step5565/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b3d1942a1dd327c82848030b30b3cb0e914f516 --- /dev/null +++ b/checkpoint-5565/global_step5565/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399daf4b95b4214682bc3cc51007986b62c5f65a642863b0afc4f90a61ee8161 +size 12045399552 diff --git a/checkpoint-5565/global_step5565/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-5565/global_step5565/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d879cb0effab60e898dfbfb24257ed0179f32e09 --- /dev/null +++ b/checkpoint-5565/global_step5565/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e10c7265bd0fe6926e0180365698a798ea10f4ccade66f15894cc3fbcd1e52d +size 12045399232 diff --git a/checkpoint-5565/global_step5565/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt 
b/checkpoint-5565/global_step5565/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7087de4162c688cdbb9cfdf23d286f5b6cc5c3c7 --- /dev/null +++ b/checkpoint-5565/global_step5565/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74be9feb732f740ca8341184235451ab55af9c0a72e7cb04fb44aab28c0351eb +size 12045398144 diff --git a/checkpoint-5565/global_step5565/mp_rank_00_model_states.pt b/checkpoint-5565/global_step5565/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..358b60a665e2db3a0b06d49333177114cbfa8f30 --- /dev/null +++ b/checkpoint-5565/global_step5565/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5fe941df635844a3ad0e682f6e9795e40d16c80a985e9d78c0e55cd767c98bf +size 16060610552 diff --git a/checkpoint-5565/latest b/checkpoint-5565/latest new file mode 100644 index 0000000000000000000000000000000000000000..f0742f11a78d5454850fa89b17273e75e450c01c --- /dev/null +++ b/checkpoint-5565/latest @@ -0,0 +1 @@ +global_step5565 \ No newline at end of file diff --git a/checkpoint-5565/model-00001-of-00004.safetensors b/checkpoint-5565/model-00001-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3ef1418a246de1b3e43b70e3c2c828e5e81a5714 --- /dev/null +++ b/checkpoint-5565/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:363bd3b7096ce6daa7c186b3ca056a8b0de5eef789e74123ce491ba8fce06f4d +size 4976698672 diff --git a/checkpoint-5565/model-00002-of-00004.safetensors b/checkpoint-5565/model-00002-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..36f5619802d1ee8fb3f70c1075be119bd9987b48 --- /dev/null +++ b/checkpoint-5565/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:e735a5d9a6a9b0617798be17fe7aef4380326f7e62f6a2e591615aa2468d58ff +size 4999802720 diff --git a/checkpoint-5565/model-00003-of-00004.safetensors b/checkpoint-5565/model-00003-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..70e6d050f695f487e2246d156e1914a56e2f212a --- /dev/null +++ b/checkpoint-5565/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cd64010b9d960b468e43a9675b6a0c8643562f88b35371b8dc87122b94bad3f +size 4915916176 diff --git a/checkpoint-5565/model-00004-of-00004.safetensors b/checkpoint-5565/model-00004-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c025dd47a0e0d981c3d7c98e6ddf3909147a2cd4 --- /dev/null +++ b/checkpoint-5565/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae3335748613723eff551bf4812a75c861433a095c35de7620f9b9c942b31673 +size 1168138808 diff --git a/checkpoint-5565/model.safetensors.index.json b/checkpoint-5565/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0fd8120f1c6acddc268ebc2583058efaf699a771 --- /dev/null +++ b/checkpoint-5565/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 16060522496 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + 
"model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + 
"model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", 
+ "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", 
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00004-of-00004.safetensors" + } +} diff --git a/checkpoint-5565/rng_state_0.pth b/checkpoint-5565/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..b6473612e41c5cfd6973c2e71fa5f3ad2b2bcad1 --- /dev/null +++ b/checkpoint-5565/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:575119a228f98110923ffa2dedcb50e3317251b26054355d015e0b2240d566f2 +size 15984 diff --git a/checkpoint-5565/rng_state_1.pth b/checkpoint-5565/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..8506e00431b6ac7067699c0ea4f59adb6fa0ba20 --- /dev/null +++ b/checkpoint-5565/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0728b56dab7abb5ef8a0d4bae3519c5767c97467bdd886d26bf19cc8599d0312 +size 15984 diff --git a/checkpoint-5565/rng_state_2.pth b/checkpoint-5565/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..ea499e285c97cca07fedd34662c3d4ab44ff6f47 --- /dev/null +++ b/checkpoint-5565/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:f4e481d4ef1546694da7337f6bb6c658b866dcb79b85deeb477da0d27ebe851e +size 15984 diff --git a/checkpoint-5565/rng_state_3.pth b/checkpoint-5565/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..aeb38f92f106ac3f08bae4f82179a8a12243bccb --- /dev/null +++ b/checkpoint-5565/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:353c60be37ea56fc992fca446598ceca5d1fd002aa3bd6dbb9ad740e6f47ebb3 +size 15984 diff --git a/checkpoint-5565/rng_state_4.pth b/checkpoint-5565/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..9d5856cb7a3f15092fa5593507022316916f648e --- /dev/null +++ b/checkpoint-5565/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9107fe964ba7205e354084b85210e5a5ea1c98cfd4d38adb9cd3926945dcae4 +size 15984 diff --git a/checkpoint-5565/rng_state_5.pth b/checkpoint-5565/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b824ee24d256695aad4a69a62d8e7125f51a17f2 --- /dev/null +++ b/checkpoint-5565/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69d1bb1abee38b92e53f3f23549b642ce0f1edcdccf7b6129847ac61636e96d5 +size 15984 diff --git a/checkpoint-5565/rng_state_6.pth b/checkpoint-5565/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..a9fd0364bb8f1a8e91eca45be5e1b6672b4d9afd --- /dev/null +++ b/checkpoint-5565/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afd5516048e20f36959601574e29e40106085a7d3cdc7bf425ce5e84633490e6 +size 15984 diff --git a/checkpoint-5565/rng_state_7.pth b/checkpoint-5565/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..4e80125fd18efcb1097384319888b699f4dce7e7 --- /dev/null +++ b/checkpoint-5565/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8e2c46927fc06939b4c976a01e4b95dec1f8b98ceaea86d31a5d756fc30ff006 +size 15984 diff --git a/checkpoint-5565/scheduler.pt b/checkpoint-5565/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..7abd7c38e2912e4516666eecd3917dc2f5034a90 --- /dev/null +++ b/checkpoint-5565/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6780e1d2e5c661abdebda0c1750f016ccd9e268d936dc75bb155dfe1c9bbca88 +size 1064 diff --git a/checkpoint-5565/special_tokens_map.json b/checkpoint-5565/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..e5b39b6305d89284b04934011c68dbb26bf588ca --- /dev/null +++ b/checkpoint-5565/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-5565/tokenizer.json b/checkpoint-5565/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1a5a81eb733cae803b39ffc7644de0048c3a26c3 --- /dev/null +++ b/checkpoint-5565/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07d7990a7c3f12081b24b3d098ab366211161e43494d2368211815c164b5f2b7 +size 17209828 diff --git a/checkpoint-5565/tokenizer_config.json b/checkpoint-5565/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5cd68a680b8f949dba64516158c30db7ea52c3cd --- /dev/null +++ b/checkpoint-5565/tokenizer_config.json @@ -0,0 +1,2062 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, 
+ "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", 
+ "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|im_pseudo|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|end_pseudo|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|im_date|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|end_date|>", + "lstrip": false, + "normalized": false, + "rstrip": 
false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|begin_of_post|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|end_of_post|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": 
"<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": 
false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": 
"<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": 
"<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-5565/trainer_state.json b/checkpoint-5565/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c5503c4f537b6f635377853a4e974e731ffa4367 --- /dev/null +++ b/checkpoint-5565/trainer_state.json @@ -0,0 +1,38988 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.3000323485011861, + "eval_steps": 500, + "global_step": 5565, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 5.391416864351952e-05, + "grad_norm": 53.75010299682617, + "learning_rate": 1.0000000000000001e-07, + "loss": 2.5864, + "step": 1 + }, + { + "epoch": 0.00010782833728703904, + "grad_norm": 45.00067138671875, + 
"learning_rate": 2.0000000000000002e-07, + "loss": 2.3757, + "step": 2 + }, + { + "epoch": 0.00016174250593055855, + "grad_norm": 51.22366714477539, + "learning_rate": 3.0000000000000004e-07, + "loss": 2.4653, + "step": 3 + }, + { + "epoch": 0.00021565667457407807, + "grad_norm": 62.225242614746094, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.5819, + "step": 4 + }, + { + "epoch": 0.0002695708432175976, + "grad_norm": 54.67008590698242, + "learning_rate": 5.000000000000001e-07, + "loss": 2.6368, + "step": 5 + }, + { + "epoch": 0.0003234850118611171, + "grad_norm": 51.261009216308594, + "learning_rate": 6.000000000000001e-07, + "loss": 2.3245, + "step": 6 + }, + { + "epoch": 0.0003773991805046366, + "grad_norm": 53.58714294433594, + "learning_rate": 7.000000000000001e-07, + "loss": 2.7622, + "step": 7 + }, + { + "epoch": 0.00043131334914815614, + "grad_norm": 41.32997131347656, + "learning_rate": 8.000000000000001e-07, + "loss": 2.6444, + "step": 8 + }, + { + "epoch": 0.00048522751779167566, + "grad_norm": 33.232242584228516, + "learning_rate": 9.000000000000001e-07, + "loss": 2.1475, + "step": 9 + }, + { + "epoch": 0.0005391416864351952, + "grad_norm": 34.1890983581543, + "learning_rate": 1.0000000000000002e-06, + "loss": 2.7256, + "step": 10 + }, + { + "epoch": 0.0005930558550787146, + "grad_norm": 19.263437271118164, + "learning_rate": 1.1e-06, + "loss": 2.4132, + "step": 11 + }, + { + "epoch": 0.0006469700237222342, + "grad_norm": 15.612638473510742, + "learning_rate": 1.2000000000000002e-06, + "loss": 2.0422, + "step": 12 + }, + { + "epoch": 0.0007008841923657537, + "grad_norm": 13.81751537322998, + "learning_rate": 1.3e-06, + "loss": 1.9663, + "step": 13 + }, + { + "epoch": 0.0007547983610092732, + "grad_norm": 16.390897750854492, + "learning_rate": 1.4000000000000001e-06, + "loss": 2.1135, + "step": 14 + }, + { + "epoch": 0.0008087125296527927, + "grad_norm": 21.830646514892578, + "learning_rate": 1.5e-06, + "loss": 2.217, + "step": 15 + }, + { + 
"epoch": 0.0008626266982963123, + "grad_norm": 18.630046844482422, + "learning_rate": 1.6000000000000001e-06, + "loss": 2.1612, + "step": 16 + }, + { + "epoch": 0.0009165408669398317, + "grad_norm": 12.403571128845215, + "learning_rate": 1.7000000000000002e-06, + "loss": 1.9358, + "step": 17 + }, + { + "epoch": 0.0009704550355833513, + "grad_norm": 7.713366508483887, + "learning_rate": 1.8000000000000001e-06, + "loss": 1.8522, + "step": 18 + }, + { + "epoch": 0.001024369204226871, + "grad_norm": 7.731616973876953, + "learning_rate": 1.9000000000000002e-06, + "loss": 1.7984, + "step": 19 + }, + { + "epoch": 0.0010782833728703904, + "grad_norm": 7.5799174308776855, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.701, + "step": 20 + }, + { + "epoch": 0.0011321975415139098, + "grad_norm": 5.5428080558776855, + "learning_rate": 2.1000000000000002e-06, + "loss": 1.624, + "step": 21 + }, + { + "epoch": 0.0011861117101574293, + "grad_norm": 5.851474285125732, + "learning_rate": 2.2e-06, + "loss": 1.8064, + "step": 22 + }, + { + "epoch": 0.001240025878800949, + "grad_norm": 5.243111610412598, + "learning_rate": 2.3000000000000004e-06, + "loss": 1.7246, + "step": 23 + }, + { + "epoch": 0.0012939400474444684, + "grad_norm": 4.835971832275391, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.763, + "step": 24 + }, + { + "epoch": 0.0013478542160879879, + "grad_norm": 4.127845287322998, + "learning_rate": 2.5e-06, + "loss": 1.5869, + "step": 25 + }, + { + "epoch": 0.0014017683847315074, + "grad_norm": 3.7648322582244873, + "learning_rate": 2.6e-06, + "loss": 1.5599, + "step": 26 + }, + { + "epoch": 0.001455682553375027, + "grad_norm": 3.5424962043762207, + "learning_rate": 2.7000000000000004e-06, + "loss": 1.4703, + "step": 27 + }, + { + "epoch": 0.0015095967220185465, + "grad_norm": 3.3707985877990723, + "learning_rate": 2.8000000000000003e-06, + "loss": 1.354, + "step": 28 + }, + { + "epoch": 0.001563510890662066, + "grad_norm": 4.71254825592041, + 
"learning_rate": 2.9e-06, + "loss": 1.8162, + "step": 29 + }, + { + "epoch": 0.0016174250593055854, + "grad_norm": 3.7660300731658936, + "learning_rate": 3e-06, + "loss": 1.5951, + "step": 30 + }, + { + "epoch": 0.001671339227949105, + "grad_norm": 3.4810571670532227, + "learning_rate": 3.1000000000000004e-06, + "loss": 1.5183, + "step": 31 + }, + { + "epoch": 0.0017252533965926246, + "grad_norm": 3.672693967819214, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.6374, + "step": 32 + }, + { + "epoch": 0.001779167565236144, + "grad_norm": 3.3589682579040527, + "learning_rate": 3.3000000000000006e-06, + "loss": 1.4371, + "step": 33 + }, + { + "epoch": 0.0018330817338796635, + "grad_norm": 3.6365807056427, + "learning_rate": 3.4000000000000005e-06, + "loss": 1.595, + "step": 34 + }, + { + "epoch": 0.0018869959025231832, + "grad_norm": 3.6467039585113525, + "learning_rate": 3.5e-06, + "loss": 1.5714, + "step": 35 + }, + { + "epoch": 0.0019409100711667026, + "grad_norm": 3.4684648513793945, + "learning_rate": 3.6000000000000003e-06, + "loss": 1.4897, + "step": 36 + }, + { + "epoch": 0.001994824239810222, + "grad_norm": 3.70845627784729, + "learning_rate": 3.7e-06, + "loss": 1.5954, + "step": 37 + }, + { + "epoch": 0.002048738408453742, + "grad_norm": 3.1803395748138428, + "learning_rate": 3.8000000000000005e-06, + "loss": 1.3976, + "step": 38 + }, + { + "epoch": 0.002102652577097261, + "grad_norm": 2.851703405380249, + "learning_rate": 3.900000000000001e-06, + "loss": 1.1894, + "step": 39 + }, + { + "epoch": 0.0021565667457407807, + "grad_norm": 2.832003593444824, + "learning_rate": 4.000000000000001e-06, + "loss": 1.353, + "step": 40 + }, + { + "epoch": 0.0022104809143843004, + "grad_norm": 3.397498607635498, + "learning_rate": 4.1e-06, + "loss": 1.4541, + "step": 41 + }, + { + "epoch": 0.0022643950830278196, + "grad_norm": 3.4537954330444336, + "learning_rate": 4.2000000000000004e-06, + "loss": 1.4475, + "step": 42 + }, + { + "epoch": 0.0023183092516713393, + 
"grad_norm": 3.1131632328033447, + "learning_rate": 4.3e-06, + "loss": 1.2707, + "step": 43 + }, + { + "epoch": 0.0023722234203148586, + "grad_norm": 3.0421881675720215, + "learning_rate": 4.4e-06, + "loss": 1.3418, + "step": 44 + }, + { + "epoch": 0.0024261375889583782, + "grad_norm": 3.528514862060547, + "learning_rate": 4.5e-06, + "loss": 1.4432, + "step": 45 + }, + { + "epoch": 0.002480051757601898, + "grad_norm": 3.6783225536346436, + "learning_rate": 4.600000000000001e-06, + "loss": 1.4863, + "step": 46 + }, + { + "epoch": 0.002533965926245417, + "grad_norm": 2.9829189777374268, + "learning_rate": 4.7e-06, + "loss": 1.2856, + "step": 47 + }, + { + "epoch": 0.002587880094888937, + "grad_norm": 3.4480350017547607, + "learning_rate": 4.800000000000001e-06, + "loss": 1.4129, + "step": 48 + }, + { + "epoch": 0.0026417942635324565, + "grad_norm": 3.4247214794158936, + "learning_rate": 4.9000000000000005e-06, + "loss": 1.3467, + "step": 49 + }, + { + "epoch": 0.0026957084321759758, + "grad_norm": 3.5268948078155518, + "learning_rate": 5e-06, + "loss": 1.4795, + "step": 50 + }, + { + "epoch": 0.0027496226008194955, + "grad_norm": 3.3228304386138916, + "learning_rate": 5.1e-06, + "loss": 1.461, + "step": 51 + }, + { + "epoch": 0.0028035367694630147, + "grad_norm": 3.365630865097046, + "learning_rate": 5.2e-06, + "loss": 1.2947, + "step": 52 + }, + { + "epoch": 0.0028574509381065344, + "grad_norm": 3.4889328479766846, + "learning_rate": 5.300000000000001e-06, + "loss": 1.432, + "step": 53 + }, + { + "epoch": 0.002911365106750054, + "grad_norm": 3.5767273902893066, + "learning_rate": 5.400000000000001e-06, + "loss": 1.3773, + "step": 54 + }, + { + "epoch": 0.0029652792753935733, + "grad_norm": 3.499298095703125, + "learning_rate": 5.500000000000001e-06, + "loss": 1.4132, + "step": 55 + }, + { + "epoch": 0.003019193444037093, + "grad_norm": 3.6990244388580322, + "learning_rate": 5.600000000000001e-06, + "loss": 1.4595, + "step": 56 + }, + { + "epoch": 
0.0030731076126806127, + "grad_norm": 3.0908327102661133, + "learning_rate": 5.7e-06, + "loss": 1.1873, + "step": 57 + }, + { + "epoch": 0.003127021781324132, + "grad_norm": 3.149425745010376, + "learning_rate": 5.8e-06, + "loss": 1.3306, + "step": 58 + }, + { + "epoch": 0.0031809359499676516, + "grad_norm": 3.193023204803467, + "learning_rate": 5.9e-06, + "loss": 1.3326, + "step": 59 + }, + { + "epoch": 0.003234850118611171, + "grad_norm": 3.610344409942627, + "learning_rate": 6e-06, + "loss": 1.4527, + "step": 60 + }, + { + "epoch": 0.0032887642872546905, + "grad_norm": 2.9877095222473145, + "learning_rate": 6.1e-06, + "loss": 1.2029, + "step": 61 + }, + { + "epoch": 0.00334267845589821, + "grad_norm": 3.0241923332214355, + "learning_rate": 6.200000000000001e-06, + "loss": 1.3413, + "step": 62 + }, + { + "epoch": 0.0033965926245417295, + "grad_norm": 3.212700366973877, + "learning_rate": 6.300000000000001e-06, + "loss": 1.3471, + "step": 63 + }, + { + "epoch": 0.003450506793185249, + "grad_norm": 2.7138960361480713, + "learning_rate": 6.4000000000000006e-06, + "loss": 1.0885, + "step": 64 + }, + { + "epoch": 0.0035044209618287684, + "grad_norm": 2.5690340995788574, + "learning_rate": 6.5000000000000004e-06, + "loss": 1.1168, + "step": 65 + }, + { + "epoch": 0.003558335130472288, + "grad_norm": 3.0344784259796143, + "learning_rate": 6.600000000000001e-06, + "loss": 1.2828, + "step": 66 + }, + { + "epoch": 0.0036122492991158077, + "grad_norm": 3.0589816570281982, + "learning_rate": 6.700000000000001e-06, + "loss": 1.2604, + "step": 67 + }, + { + "epoch": 0.003666163467759327, + "grad_norm": 2.676417112350464, + "learning_rate": 6.800000000000001e-06, + "loss": 1.1679, + "step": 68 + }, + { + "epoch": 0.0037200776364028467, + "grad_norm": 2.6590960025787354, + "learning_rate": 6.9e-06, + "loss": 1.2283, + "step": 69 + }, + { + "epoch": 0.0037739918050463664, + "grad_norm": 2.6973354816436768, + "learning_rate": 7e-06, + "loss": 1.2028, + "step": 70 + }, + { + 
"epoch": 0.0038279059736898856, + "grad_norm": 2.7046608924865723, + "learning_rate": 7.100000000000001e-06, + "loss": 1.2629, + "step": 71 + }, + { + "epoch": 0.0038818201423334053, + "grad_norm": 2.2172696590423584, + "learning_rate": 7.2000000000000005e-06, + "loss": 1.1367, + "step": 72 + }, + { + "epoch": 0.0039357343109769245, + "grad_norm": 2.6138789653778076, + "learning_rate": 7.3e-06, + "loss": 1.3167, + "step": 73 + }, + { + "epoch": 0.003989648479620444, + "grad_norm": 2.2926838397979736, + "learning_rate": 7.4e-06, + "loss": 1.2909, + "step": 74 + }, + { + "epoch": 0.004043562648263964, + "grad_norm": 2.0647220611572266, + "learning_rate": 7.500000000000001e-06, + "loss": 1.2054, + "step": 75 + }, + { + "epoch": 0.004097476816907484, + "grad_norm": 2.1190452575683594, + "learning_rate": 7.600000000000001e-06, + "loss": 1.1497, + "step": 76 + }, + { + "epoch": 0.004151390985551002, + "grad_norm": 1.9973243474960327, + "learning_rate": 7.7e-06, + "loss": 1.1997, + "step": 77 + }, + { + "epoch": 0.004205305154194522, + "grad_norm": 2.11751651763916, + "learning_rate": 7.800000000000002e-06, + "loss": 1.2181, + "step": 78 + }, + { + "epoch": 0.004259219322838042, + "grad_norm": 1.8975950479507446, + "learning_rate": 7.9e-06, + "loss": 1.1582, + "step": 79 + }, + { + "epoch": 0.004313133491481561, + "grad_norm": 1.8368147611618042, + "learning_rate": 8.000000000000001e-06, + "loss": 1.1389, + "step": 80 + }, + { + "epoch": 0.004367047660125081, + "grad_norm": 1.7472988367080688, + "learning_rate": 8.1e-06, + "loss": 1.0959, + "step": 81 + }, + { + "epoch": 0.004420961828768601, + "grad_norm": 1.7325443029403687, + "learning_rate": 8.2e-06, + "loss": 1.1847, + "step": 82 + }, + { + "epoch": 0.00447487599741212, + "grad_norm": 1.6171561479568481, + "learning_rate": 8.3e-06, + "loss": 0.9834, + "step": 83 + }, + { + "epoch": 0.004528790166055639, + "grad_norm": 1.6583327054977417, + "learning_rate": 8.400000000000001e-06, + "loss": 1.0413, + "step": 84 + }, + 
{ + "epoch": 0.004582704334699159, + "grad_norm": 1.8914967775344849, + "learning_rate": 8.5e-06, + "loss": 1.2413, + "step": 85 + }, + { + "epoch": 0.004636618503342679, + "grad_norm": 1.6018317937850952, + "learning_rate": 8.6e-06, + "loss": 1.0577, + "step": 86 + }, + { + "epoch": 0.004690532671986198, + "grad_norm": 1.9170053005218506, + "learning_rate": 8.700000000000001e-06, + "loss": 1.2463, + "step": 87 + }, + { + "epoch": 0.004744446840629717, + "grad_norm": 1.666536569595337, + "learning_rate": 8.8e-06, + "loss": 1.0532, + "step": 88 + }, + { + "epoch": 0.004798361009273237, + "grad_norm": 1.660115361213684, + "learning_rate": 8.900000000000001e-06, + "loss": 1.0514, + "step": 89 + }, + { + "epoch": 0.0048522751779167565, + "grad_norm": 1.8667477369308472, + "learning_rate": 9e-06, + "loss": 1.2039, + "step": 90 + }, + { + "epoch": 0.004906189346560276, + "grad_norm": 1.9490039348602295, + "learning_rate": 9.100000000000001e-06, + "loss": 1.1804, + "step": 91 + }, + { + "epoch": 0.004960103515203796, + "grad_norm": 1.8415377140045166, + "learning_rate": 9.200000000000002e-06, + "loss": 1.1435, + "step": 92 + }, + { + "epoch": 0.005014017683847315, + "grad_norm": 1.8571438789367676, + "learning_rate": 9.3e-06, + "loss": 1.0974, + "step": 93 + }, + { + "epoch": 0.005067931852490834, + "grad_norm": 1.8480113744735718, + "learning_rate": 9.4e-06, + "loss": 1.149, + "step": 94 + }, + { + "epoch": 0.005121846021134354, + "grad_norm": 2.003490447998047, + "learning_rate": 9.5e-06, + "loss": 1.1954, + "step": 95 + }, + { + "epoch": 0.005175760189777874, + "grad_norm": 1.8002668619155884, + "learning_rate": 9.600000000000001e-06, + "loss": 0.9953, + "step": 96 + }, + { + "epoch": 0.005229674358421393, + "grad_norm": 1.9040817022323608, + "learning_rate": 9.7e-06, + "loss": 1.1195, + "step": 97 + }, + { + "epoch": 0.005283588527064913, + "grad_norm": 1.8311433792114258, + "learning_rate": 9.800000000000001e-06, + "loss": 1.083, + "step": 98 + }, + { + "epoch": 
0.005337502695708432, + "grad_norm": 1.9509624242782593, + "learning_rate": 9.9e-06, + "loss": 1.176, + "step": 99 + }, + { + "epoch": 0.0053914168643519516, + "grad_norm": 2.0624589920043945, + "learning_rate": 1e-05, + "loss": 1.119, + "step": 100 + }, + { + "epoch": 0.005445331032995471, + "grad_norm": 1.9618796110153198, + "learning_rate": 9.999999995505339e-06, + "loss": 1.1371, + "step": 101 + }, + { + "epoch": 0.005499245201638991, + "grad_norm": 1.946245551109314, + "learning_rate": 9.999999982021349e-06, + "loss": 0.9736, + "step": 102 + }, + { + "epoch": 0.005553159370282511, + "grad_norm": 1.9871301651000977, + "learning_rate": 9.999999959548035e-06, + "loss": 1.1077, + "step": 103 + }, + { + "epoch": 0.005607073538926029, + "grad_norm": 1.86216402053833, + "learning_rate": 9.999999928085396e-06, + "loss": 1.0882, + "step": 104 + }, + { + "epoch": 0.005660987707569549, + "grad_norm": 1.8447723388671875, + "learning_rate": 9.999999887633432e-06, + "loss": 1.0344, + "step": 105 + }, + { + "epoch": 0.005714901876213069, + "grad_norm": 1.8345638513565063, + "learning_rate": 9.99999983819214e-06, + "loss": 1.1077, + "step": 106 + }, + { + "epoch": 0.0057688160448565885, + "grad_norm": 1.8410178422927856, + "learning_rate": 9.999999779761524e-06, + "loss": 1.0824, + "step": 107 + }, + { + "epoch": 0.005822730213500108, + "grad_norm": 1.5881969928741455, + "learning_rate": 9.999999712341583e-06, + "loss": 0.9439, + "step": 108 + }, + { + "epoch": 0.005876644382143627, + "grad_norm": 1.6704047918319702, + "learning_rate": 9.999999635932316e-06, + "loss": 1.033, + "step": 109 + }, + { + "epoch": 0.005930558550787147, + "grad_norm": 1.792449712753296, + "learning_rate": 9.999999550533726e-06, + "loss": 1.0279, + "step": 110 + }, + { + "epoch": 0.005984472719430666, + "grad_norm": 1.6515668630599976, + "learning_rate": 9.999999456145809e-06, + "loss": 1.0301, + "step": 111 + }, + { + "epoch": 0.006038386888074186, + "grad_norm": 1.8541395664215088, + 
"learning_rate": 9.999999352768568e-06, + "loss": 1.1057, + "step": 112 + }, + { + "epoch": 0.006092301056717706, + "grad_norm": 1.6490236520767212, + "learning_rate": 9.999999240402002e-06, + "loss": 1.0523, + "step": 113 + }, + { + "epoch": 0.006146215225361225, + "grad_norm": 1.655333161354065, + "learning_rate": 9.999999119046113e-06, + "loss": 1.0448, + "step": 114 + }, + { + "epoch": 0.006200129394004744, + "grad_norm": 1.5721609592437744, + "learning_rate": 9.999998988700899e-06, + "loss": 0.9883, + "step": 115 + }, + { + "epoch": 0.006254043562648264, + "grad_norm": 1.6411349773406982, + "learning_rate": 9.99999884936636e-06, + "loss": 1.0255, + "step": 116 + }, + { + "epoch": 0.0063079577312917835, + "grad_norm": 1.6399502754211426, + "learning_rate": 9.999998701042501e-06, + "loss": 1.0146, + "step": 117 + }, + { + "epoch": 0.006361871899935303, + "grad_norm": 1.615026831626892, + "learning_rate": 9.999998543729316e-06, + "loss": 1.0022, + "step": 118 + }, + { + "epoch": 0.006415786068578823, + "grad_norm": 1.4867664575576782, + "learning_rate": 9.99999837742681e-06, + "loss": 1.0164, + "step": 119 + }, + { + "epoch": 0.006469700237222342, + "grad_norm": 1.540153980255127, + "learning_rate": 9.999998202134979e-06, + "loss": 0.989, + "step": 120 + }, + { + "epoch": 0.006523614405865861, + "grad_norm": 1.5535691976547241, + "learning_rate": 9.999998017853825e-06, + "loss": 0.9942, + "step": 121 + }, + { + "epoch": 0.006577528574509381, + "grad_norm": 1.4892929792404175, + "learning_rate": 9.999997824583351e-06, + "loss": 1.0537, + "step": 122 + }, + { + "epoch": 0.006631442743152901, + "grad_norm": 1.4674094915390015, + "learning_rate": 9.999997622323554e-06, + "loss": 1.0239, + "step": 123 + }, + { + "epoch": 0.00668535691179642, + "grad_norm": 1.394027590751648, + "learning_rate": 9.999997411074436e-06, + "loss": 0.9781, + "step": 124 + }, + { + "epoch": 0.006739271080439939, + "grad_norm": 1.372728705406189, + "learning_rate": 9.999997190835999e-06, + 
"loss": 1.0433, + "step": 125 + }, + { + "epoch": 0.006793185249083459, + "grad_norm": 1.2535908222198486, + "learning_rate": 9.999996961608238e-06, + "loss": 0.958, + "step": 126 + }, + { + "epoch": 0.006847099417726979, + "grad_norm": 1.337633490562439, + "learning_rate": 9.999996723391158e-06, + "loss": 1.0213, + "step": 127 + }, + { + "epoch": 0.006901013586370498, + "grad_norm": 1.3640319108963013, + "learning_rate": 9.999996476184759e-06, + "loss": 1.0432, + "step": 128 + }, + { + "epoch": 0.006954927755014018, + "grad_norm": 1.2663391828536987, + "learning_rate": 9.99999621998904e-06, + "loss": 1.0154, + "step": 129 + }, + { + "epoch": 0.007008841923657537, + "grad_norm": 1.450737476348877, + "learning_rate": 9.999995954804004e-06, + "loss": 1.0074, + "step": 130 + }, + { + "epoch": 0.0070627560923010565, + "grad_norm": 1.2757987976074219, + "learning_rate": 9.999995680629649e-06, + "loss": 0.9996, + "step": 131 + }, + { + "epoch": 0.007116670260944576, + "grad_norm": 1.3978132009506226, + "learning_rate": 9.999995397465974e-06, + "loss": 1.04, + "step": 132 + }, + { + "epoch": 0.007170584429588096, + "grad_norm": 1.3167297840118408, + "learning_rate": 9.999995105312982e-06, + "loss": 1.0069, + "step": 133 + }, + { + "epoch": 0.0072244985982316155, + "grad_norm": 1.1626744270324707, + "learning_rate": 9.999994804170674e-06, + "loss": 0.9722, + "step": 134 + }, + { + "epoch": 0.007278412766875135, + "grad_norm": 1.354797601699829, + "learning_rate": 9.99999449403905e-06, + "loss": 0.9019, + "step": 135 + }, + { + "epoch": 0.007332326935518654, + "grad_norm": 1.2605732679367065, + "learning_rate": 9.99999417491811e-06, + "loss": 1.0038, + "step": 136 + }, + { + "epoch": 0.007386241104162174, + "grad_norm": 1.3804657459259033, + "learning_rate": 9.999993846807855e-06, + "loss": 1.0139, + "step": 137 + }, + { + "epoch": 0.007440155272805693, + "grad_norm": 1.3001742362976074, + "learning_rate": 9.999993509708286e-06, + "loss": 1.1436, + "step": 138 + }, + { + 
"epoch": 0.007494069441449213, + "grad_norm": 1.2776422500610352, + "learning_rate": 9.999993163619401e-06, + "loss": 0.9792, + "step": 139 + }, + { + "epoch": 0.007547983610092733, + "grad_norm": 1.2149187326431274, + "learning_rate": 9.999992808541204e-06, + "loss": 0.963, + "step": 140 + }, + { + "epoch": 0.0076018977787362515, + "grad_norm": 1.341806173324585, + "learning_rate": 9.999992444473694e-06, + "loss": 0.9639, + "step": 141 + }, + { + "epoch": 0.007655811947379771, + "grad_norm": 1.2565757036209106, + "learning_rate": 9.999992071416874e-06, + "loss": 0.9193, + "step": 142 + }, + { + "epoch": 0.007709726116023291, + "grad_norm": 1.3059918880462646, + "learning_rate": 9.99999168937074e-06, + "loss": 0.9632, + "step": 143 + }, + { + "epoch": 0.0077636402846668106, + "grad_norm": 1.1719332933425903, + "learning_rate": 9.999991298335295e-06, + "loss": 0.9687, + "step": 144 + }, + { + "epoch": 0.00781755445331033, + "grad_norm": 1.125950813293457, + "learning_rate": 9.999990898310542e-06, + "loss": 0.968, + "step": 145 + }, + { + "epoch": 0.007871468621953849, + "grad_norm": 1.2400416135787964, + "learning_rate": 9.999990489296478e-06, + "loss": 0.972, + "step": 146 + }, + { + "epoch": 0.007925382790597369, + "grad_norm": 1.172117829322815, + "learning_rate": 9.999990071293106e-06, + "loss": 0.9243, + "step": 147 + }, + { + "epoch": 0.007979296959240888, + "grad_norm": 1.240317463874817, + "learning_rate": 9.999989644300427e-06, + "loss": 1.0655, + "step": 148 + }, + { + "epoch": 0.008033211127884408, + "grad_norm": 1.1535708904266357, + "learning_rate": 9.999989208318438e-06, + "loss": 0.9871, + "step": 149 + }, + { + "epoch": 0.008087125296527928, + "grad_norm": 1.2711198329925537, + "learning_rate": 9.999988763347145e-06, + "loss": 1.0307, + "step": 150 + }, + { + "epoch": 0.008141039465171447, + "grad_norm": 1.2345954179763794, + "learning_rate": 9.999988309386548e-06, + "loss": 1.1343, + "step": 151 + }, + { + "epoch": 0.008194953633814967, + 
"grad_norm": 1.2489601373672485, + "learning_rate": 9.999987846436645e-06, + "loss": 1.0303, + "step": 152 + }, + { + "epoch": 0.008248867802458487, + "grad_norm": 1.264240026473999, + "learning_rate": 9.999987374497439e-06, + "loss": 0.9562, + "step": 153 + }, + { + "epoch": 0.008302781971102005, + "grad_norm": 1.2613575458526611, + "learning_rate": 9.99998689356893e-06, + "loss": 0.954, + "step": 154 + }, + { + "epoch": 0.008356696139745524, + "grad_norm": 1.2091072797775269, + "learning_rate": 9.999986403651116e-06, + "loss": 1.0734, + "step": 155 + }, + { + "epoch": 0.008410610308389044, + "grad_norm": 1.18421471118927, + "learning_rate": 9.999985904744002e-06, + "loss": 0.9167, + "step": 156 + }, + { + "epoch": 0.008464524477032564, + "grad_norm": 1.0399659872055054, + "learning_rate": 9.99998539684759e-06, + "loss": 0.9068, + "step": 157 + }, + { + "epoch": 0.008518438645676083, + "grad_norm": 1.1292288303375244, + "learning_rate": 9.999984879961877e-06, + "loss": 1.0027, + "step": 158 + }, + { + "epoch": 0.008572352814319603, + "grad_norm": 1.2592105865478516, + "learning_rate": 9.999984354086867e-06, + "loss": 1.0794, + "step": 159 + }, + { + "epoch": 0.008626266982963123, + "grad_norm": 1.1646504402160645, + "learning_rate": 9.999983819222558e-06, + "loss": 1.0468, + "step": 160 + }, + { + "epoch": 0.008680181151606643, + "grad_norm": 1.156711220741272, + "learning_rate": 9.999983275368952e-06, + "loss": 0.9053, + "step": 161 + }, + { + "epoch": 0.008734095320250162, + "grad_norm": 1.1169341802597046, + "learning_rate": 9.999982722526051e-06, + "loss": 0.97, + "step": 162 + }, + { + "epoch": 0.008788009488893682, + "grad_norm": 1.3474149703979492, + "learning_rate": 9.999982160693856e-06, + "loss": 1.0221, + "step": 163 + }, + { + "epoch": 0.008841923657537202, + "grad_norm": 1.2021468877792358, + "learning_rate": 9.999981589872368e-06, + "loss": 0.9303, + "step": 164 + }, + { + "epoch": 0.00889583782618072, + "grad_norm": 1.0625534057617188, + 
"learning_rate": 9.999981010061586e-06, + "loss": 0.8765, + "step": 165 + }, + { + "epoch": 0.00894975199482424, + "grad_norm": 1.2688498497009277, + "learning_rate": 9.999980421261512e-06, + "loss": 1.0163, + "step": 166 + }, + { + "epoch": 0.009003666163467759, + "grad_norm": 1.122948408126831, + "learning_rate": 9.999979823472148e-06, + "loss": 0.9953, + "step": 167 + }, + { + "epoch": 0.009057580332111279, + "grad_norm": 1.1817872524261475, + "learning_rate": 9.999979216693495e-06, + "loss": 1.0774, + "step": 168 + }, + { + "epoch": 0.009111494500754798, + "grad_norm": 1.1483280658721924, + "learning_rate": 9.999978600925553e-06, + "loss": 1.0105, + "step": 169 + }, + { + "epoch": 0.009165408669398318, + "grad_norm": 1.4039335250854492, + "learning_rate": 9.999977976168325e-06, + "loss": 0.944, + "step": 170 + }, + { + "epoch": 0.009219322838041838, + "grad_norm": 1.1459723711013794, + "learning_rate": 9.999977342421812e-06, + "loss": 0.9208, + "step": 171 + }, + { + "epoch": 0.009273237006685357, + "grad_norm": 1.0897774696350098, + "learning_rate": 9.999976699686011e-06, + "loss": 0.8719, + "step": 172 + }, + { + "epoch": 0.009327151175328877, + "grad_norm": 1.206467866897583, + "learning_rate": 9.999976047960928e-06, + "loss": 1.0645, + "step": 173 + }, + { + "epoch": 0.009381065343972397, + "grad_norm": 1.004550814628601, + "learning_rate": 9.999975387246563e-06, + "loss": 0.9317, + "step": 174 + }, + { + "epoch": 0.009434979512615916, + "grad_norm": 1.2359992265701294, + "learning_rate": 9.999974717542916e-06, + "loss": 1.1136, + "step": 175 + }, + { + "epoch": 0.009488893681259434, + "grad_norm": 1.1922352313995361, + "learning_rate": 9.999974038849989e-06, + "loss": 1.0307, + "step": 176 + }, + { + "epoch": 0.009542807849902954, + "grad_norm": 1.1597613096237183, + "learning_rate": 9.999973351167782e-06, + "loss": 1.0275, + "step": 177 + }, + { + "epoch": 0.009596722018546474, + "grad_norm": 1.172133445739746, + "learning_rate": 9.999972654496298e-06, + 
"loss": 0.9269, + "step": 178 + }, + { + "epoch": 0.009650636187189993, + "grad_norm": 1.1879733800888062, + "learning_rate": 9.999971948835538e-06, + "loss": 0.9547, + "step": 179 + }, + { + "epoch": 0.009704550355833513, + "grad_norm": 1.0029833316802979, + "learning_rate": 9.999971234185502e-06, + "loss": 0.8994, + "step": 180 + }, + { + "epoch": 0.009758464524477033, + "grad_norm": 1.0769891738891602, + "learning_rate": 9.999970510546194e-06, + "loss": 0.9107, + "step": 181 + }, + { + "epoch": 0.009812378693120552, + "grad_norm": 1.3288064002990723, + "learning_rate": 9.99996977791761e-06, + "loss": 1.0116, + "step": 182 + }, + { + "epoch": 0.009866292861764072, + "grad_norm": 1.142452597618103, + "learning_rate": 9.999969036299757e-06, + "loss": 0.9367, + "step": 183 + }, + { + "epoch": 0.009920207030407592, + "grad_norm": 1.2458518743515015, + "learning_rate": 9.999968285692632e-06, + "loss": 1.1398, + "step": 184 + }, + { + "epoch": 0.009974121199051111, + "grad_norm": 1.3373422622680664, + "learning_rate": 9.99996752609624e-06, + "loss": 0.959, + "step": 185 + }, + { + "epoch": 0.01002803536769463, + "grad_norm": 1.2288920879364014, + "learning_rate": 9.99996675751058e-06, + "loss": 0.9908, + "step": 186 + }, + { + "epoch": 0.010081949536338149, + "grad_norm": 1.1954001188278198, + "learning_rate": 9.999965979935656e-06, + "loss": 0.9332, + "step": 187 + }, + { + "epoch": 0.010135863704981669, + "grad_norm": 1.171021819114685, + "learning_rate": 9.999965193371466e-06, + "loss": 0.9119, + "step": 188 + }, + { + "epoch": 0.010189777873625188, + "grad_norm": 1.025169014930725, + "learning_rate": 9.999964397818013e-06, + "loss": 0.784, + "step": 189 + }, + { + "epoch": 0.010243692042268708, + "grad_norm": 1.1340326070785522, + "learning_rate": 9.999963593275298e-06, + "loss": 1.0036, + "step": 190 + }, + { + "epoch": 0.010297606210912228, + "grad_norm": 1.0302847623825073, + "learning_rate": 9.999962779743324e-06, + "loss": 0.8293, + "step": 191 + }, + { + 
"epoch": 0.010351520379555747, + "grad_norm": 1.2410109043121338, + "learning_rate": 9.99996195722209e-06, + "loss": 0.9507, + "step": 192 + }, + { + "epoch": 0.010405434548199267, + "grad_norm": 1.2054308652877808, + "learning_rate": 9.9999611257116e-06, + "loss": 0.9356, + "step": 193 + }, + { + "epoch": 0.010459348716842787, + "grad_norm": 1.2046679258346558, + "learning_rate": 9.999960285211853e-06, + "loss": 1.0638, + "step": 194 + }, + { + "epoch": 0.010513262885486306, + "grad_norm": 1.4594306945800781, + "learning_rate": 9.999959435722852e-06, + "loss": 0.9624, + "step": 195 + }, + { + "epoch": 0.010567177054129826, + "grad_norm": 1.0909247398376465, + "learning_rate": 9.999958577244598e-06, + "loss": 0.9503, + "step": 196 + }, + { + "epoch": 0.010621091222773344, + "grad_norm": 1.1524754762649536, + "learning_rate": 9.999957709777094e-06, + "loss": 0.8954, + "step": 197 + }, + { + "epoch": 0.010675005391416864, + "grad_norm": 1.4128906726837158, + "learning_rate": 9.99995683332034e-06, + "loss": 0.8903, + "step": 198 + }, + { + "epoch": 0.010728919560060383, + "grad_norm": 1.1304652690887451, + "learning_rate": 9.999955947874338e-06, + "loss": 0.9247, + "step": 199 + }, + { + "epoch": 0.010782833728703903, + "grad_norm": 1.2978957891464233, + "learning_rate": 9.99995505343909e-06, + "loss": 0.9473, + "step": 200 + }, + { + "epoch": 0.010836747897347423, + "grad_norm": 1.0742554664611816, + "learning_rate": 9.999954150014595e-06, + "loss": 0.9626, + "step": 201 + }, + { + "epoch": 0.010890662065990942, + "grad_norm": 1.0707745552062988, + "learning_rate": 9.999953237600859e-06, + "loss": 0.8721, + "step": 202 + }, + { + "epoch": 0.010944576234634462, + "grad_norm": 1.17974853515625, + "learning_rate": 9.99995231619788e-06, + "loss": 1.0059, + "step": 203 + }, + { + "epoch": 0.010998490403277982, + "grad_norm": 1.0108370780944824, + "learning_rate": 9.999951385805662e-06, + "loss": 0.9527, + "step": 204 + }, + { + "epoch": 0.011052404571921502, + 
"grad_norm": 0.9983445405960083, + "learning_rate": 9.999950446424204e-06, + "loss": 0.7626, + "step": 205 + }, + { + "epoch": 0.011106318740565021, + "grad_norm": 1.0860002040863037, + "learning_rate": 9.99994949805351e-06, + "loss": 0.9591, + "step": 206 + }, + { + "epoch": 0.01116023290920854, + "grad_norm": 1.0447322130203247, + "learning_rate": 9.999948540693584e-06, + "loss": 0.9861, + "step": 207 + }, + { + "epoch": 0.011214147077852059, + "grad_norm": 1.2582998275756836, + "learning_rate": 9.999947574344423e-06, + "loss": 0.8949, + "step": 208 + }, + { + "epoch": 0.011268061246495579, + "grad_norm": 1.1507002115249634, + "learning_rate": 9.99994659900603e-06, + "loss": 0.918, + "step": 209 + }, + { + "epoch": 0.011321975415139098, + "grad_norm": 1.135169267654419, + "learning_rate": 9.999945614678408e-06, + "loss": 0.9891, + "step": 210 + }, + { + "epoch": 0.011375889583782618, + "grad_norm": 1.1746275424957275, + "learning_rate": 9.999944621361558e-06, + "loss": 1.0186, + "step": 211 + }, + { + "epoch": 0.011429803752426138, + "grad_norm": 1.1137248277664185, + "learning_rate": 9.999943619055483e-06, + "loss": 0.9584, + "step": 212 + }, + { + "epoch": 0.011483717921069657, + "grad_norm": 1.336651086807251, + "learning_rate": 9.999942607760182e-06, + "loss": 1.091, + "step": 213 + }, + { + "epoch": 0.011537632089713177, + "grad_norm": 1.1966856718063354, + "learning_rate": 9.999941587475658e-06, + "loss": 0.9761, + "step": 214 + }, + { + "epoch": 0.011591546258356697, + "grad_norm": 1.0843144655227661, + "learning_rate": 9.999940558201915e-06, + "loss": 0.8917, + "step": 215 + }, + { + "epoch": 0.011645460427000216, + "grad_norm": 1.2089293003082275, + "learning_rate": 9.999939519938953e-06, + "loss": 0.9704, + "step": 216 + }, + { + "epoch": 0.011699374595643736, + "grad_norm": 1.2409982681274414, + "learning_rate": 9.999938472686775e-06, + "loss": 0.9949, + "step": 217 + }, + { + "epoch": 0.011753288764287254, + "grad_norm": 1.1310094594955444, + 
"learning_rate": 9.99993741644538e-06, + "loss": 0.9666, + "step": 218 + }, + { + "epoch": 0.011807202932930774, + "grad_norm": 1.120510220527649, + "learning_rate": 9.999936351214772e-06, + "loss": 0.8844, + "step": 219 + }, + { + "epoch": 0.011861117101574293, + "grad_norm": 1.0931518077850342, + "learning_rate": 9.999935276994954e-06, + "loss": 0.9647, + "step": 220 + }, + { + "epoch": 0.011915031270217813, + "grad_norm": 1.2821122407913208, + "learning_rate": 9.999934193785926e-06, + "loss": 1.0533, + "step": 221 + }, + { + "epoch": 0.011968945438861333, + "grad_norm": 1.183580756187439, + "learning_rate": 9.999933101587691e-06, + "loss": 0.9196, + "step": 222 + }, + { + "epoch": 0.012022859607504852, + "grad_norm": 1.045825719833374, + "learning_rate": 9.99993200040025e-06, + "loss": 0.8953, + "step": 223 + }, + { + "epoch": 0.012076773776148372, + "grad_norm": 1.0963969230651855, + "learning_rate": 9.999930890223605e-06, + "loss": 0.9723, + "step": 224 + }, + { + "epoch": 0.012130687944791892, + "grad_norm": 1.0356731414794922, + "learning_rate": 9.999929771057761e-06, + "loss": 1.0215, + "step": 225 + }, + { + "epoch": 0.012184602113435411, + "grad_norm": 1.112277626991272, + "learning_rate": 9.999928642902717e-06, + "loss": 0.9886, + "step": 226 + }, + { + "epoch": 0.012238516282078931, + "grad_norm": 0.9969072937965393, + "learning_rate": 9.999927505758475e-06, + "loss": 0.8601, + "step": 227 + }, + { + "epoch": 0.01229243045072245, + "grad_norm": 1.123781442642212, + "learning_rate": 9.999926359625036e-06, + "loss": 0.9894, + "step": 228 + }, + { + "epoch": 0.012346344619365969, + "grad_norm": 1.2122100591659546, + "learning_rate": 9.999925204502406e-06, + "loss": 1.0783, + "step": 229 + }, + { + "epoch": 0.012400258788009488, + "grad_norm": 1.1256672143936157, + "learning_rate": 9.999924040390584e-06, + "loss": 0.9116, + "step": 230 + }, + { + "epoch": 0.012454172956653008, + "grad_norm": 1.0646952390670776, + "learning_rate": 9.999922867289573e-06, + 
"loss": 0.8993, + "step": 231 + }, + { + "epoch": 0.012508087125296528, + "grad_norm": 1.194676399230957, + "learning_rate": 9.999921685199376e-06, + "loss": 1.0377, + "step": 232 + }, + { + "epoch": 0.012562001293940047, + "grad_norm": 1.0519152879714966, + "learning_rate": 9.999920494119992e-06, + "loss": 0.8283, + "step": 233 + }, + { + "epoch": 0.012615915462583567, + "grad_norm": 1.243249773979187, + "learning_rate": 9.999919294051427e-06, + "loss": 0.9741, + "step": 234 + }, + { + "epoch": 0.012669829631227087, + "grad_norm": 1.1071687936782837, + "learning_rate": 9.999918084993681e-06, + "loss": 1.0402, + "step": 235 + }, + { + "epoch": 0.012723743799870606, + "grad_norm": 1.1224809885025024, + "learning_rate": 9.999916866946757e-06, + "loss": 0.8793, + "step": 236 + }, + { + "epoch": 0.012777657968514126, + "grad_norm": 1.0458532571792603, + "learning_rate": 9.999915639910656e-06, + "loss": 0.9855, + "step": 237 + }, + { + "epoch": 0.012831572137157646, + "grad_norm": 1.0610811710357666, + "learning_rate": 9.999914403885383e-06, + "loss": 0.8092, + "step": 238 + }, + { + "epoch": 0.012885486305801164, + "grad_norm": 1.2818992137908936, + "learning_rate": 9.999913158870936e-06, + "loss": 1.0101, + "step": 239 + }, + { + "epoch": 0.012939400474444683, + "grad_norm": 1.110400915145874, + "learning_rate": 9.999911904867319e-06, + "loss": 0.9782, + "step": 240 + }, + { + "epoch": 0.012993314643088203, + "grad_norm": 1.3290835618972778, + "learning_rate": 9.999910641874537e-06, + "loss": 1.0683, + "step": 241 + }, + { + "epoch": 0.013047228811731723, + "grad_norm": 1.1448980569839478, + "learning_rate": 9.999909369892588e-06, + "loss": 0.9223, + "step": 242 + }, + { + "epoch": 0.013101142980375242, + "grad_norm": 1.1710877418518066, + "learning_rate": 9.999908088921477e-06, + "loss": 0.8022, + "step": 243 + }, + { + "epoch": 0.013155057149018762, + "grad_norm": 1.1242793798446655, + "learning_rate": 9.999906798961207e-06, + "loss": 0.9238, + "step": 244 + }, + { 
+ "epoch": 0.013208971317662282, + "grad_norm": 1.0338802337646484, + "learning_rate": 9.999905500011778e-06, + "loss": 0.8386, + "step": 245 + }, + { + "epoch": 0.013262885486305801, + "grad_norm": 1.0910224914550781, + "learning_rate": 9.999904192073193e-06, + "loss": 0.937, + "step": 246 + }, + { + "epoch": 0.013316799654949321, + "grad_norm": 1.297788143157959, + "learning_rate": 9.999902875145453e-06, + "loss": 0.9054, + "step": 247 + }, + { + "epoch": 0.01337071382359284, + "grad_norm": 1.1317543983459473, + "learning_rate": 9.999901549228564e-06, + "loss": 0.9418, + "step": 248 + }, + { + "epoch": 0.01342462799223636, + "grad_norm": 1.0944132804870605, + "learning_rate": 9.999900214322526e-06, + "loss": 0.9445, + "step": 249 + }, + { + "epoch": 0.013478542160879878, + "grad_norm": 1.4942843914031982, + "learning_rate": 9.999898870427342e-06, + "loss": 0.8956, + "step": 250 + }, + { + "epoch": 0.013532456329523398, + "grad_norm": 1.0630019903182983, + "learning_rate": 9.999897517543013e-06, + "loss": 0.8381, + "step": 251 + }, + { + "epoch": 0.013586370498166918, + "grad_norm": 1.65073561668396, + "learning_rate": 9.999896155669544e-06, + "loss": 1.0148, + "step": 252 + }, + { + "epoch": 0.013640284666810438, + "grad_norm": 1.035731315612793, + "learning_rate": 9.999894784806936e-06, + "loss": 0.8092, + "step": 253 + }, + { + "epoch": 0.013694198835453957, + "grad_norm": 1.308863639831543, + "learning_rate": 9.99989340495519e-06, + "loss": 0.9742, + "step": 254 + }, + { + "epoch": 0.013748113004097477, + "grad_norm": 1.1512938737869263, + "learning_rate": 9.999892016114313e-06, + "loss": 0.8747, + "step": 255 + }, + { + "epoch": 0.013802027172740997, + "grad_norm": 0.9977009296417236, + "learning_rate": 9.9998906182843e-06, + "loss": 0.8183, + "step": 256 + }, + { + "epoch": 0.013855941341384516, + "grad_norm": 1.2228175401687622, + "learning_rate": 9.99988921146516e-06, + "loss": 0.9917, + "step": 257 + }, + { + "epoch": 0.013909855510028036, + "grad_norm": 
1.0753847360610962, + "learning_rate": 9.999887795656896e-06, + "loss": 1.0063, + "step": 258 + }, + { + "epoch": 0.013963769678671556, + "grad_norm": 1.0010429620742798, + "learning_rate": 9.999886370859506e-06, + "loss": 0.9315, + "step": 259 + }, + { + "epoch": 0.014017683847315074, + "grad_norm": 1.2038911581039429, + "learning_rate": 9.999884937072995e-06, + "loss": 0.8764, + "step": 260 + }, + { + "epoch": 0.014071598015958593, + "grad_norm": 1.1268917322158813, + "learning_rate": 9.999883494297365e-06, + "loss": 1.0059, + "step": 261 + }, + { + "epoch": 0.014125512184602113, + "grad_norm": 1.1053709983825684, + "learning_rate": 9.999882042532619e-06, + "loss": 0.8866, + "step": 262 + }, + { + "epoch": 0.014179426353245633, + "grad_norm": 1.091145396232605, + "learning_rate": 9.999880581778758e-06, + "loss": 1.0415, + "step": 263 + }, + { + "epoch": 0.014233340521889152, + "grad_norm": 1.0019958019256592, + "learning_rate": 9.999879112035786e-06, + "loss": 0.8177, + "step": 264 + }, + { + "epoch": 0.014287254690532672, + "grad_norm": 1.1044156551361084, + "learning_rate": 9.999877633303708e-06, + "loss": 0.9508, + "step": 265 + }, + { + "epoch": 0.014341168859176192, + "grad_norm": 0.9750218391418457, + "learning_rate": 9.999876145582524e-06, + "loss": 0.8501, + "step": 266 + }, + { + "epoch": 0.014395083027819711, + "grad_norm": 1.4015804529190063, + "learning_rate": 9.999874648872235e-06, + "loss": 0.9491, + "step": 267 + }, + { + "epoch": 0.014448997196463231, + "grad_norm": 1.066422939300537, + "learning_rate": 9.999873143172848e-06, + "loss": 1.0104, + "step": 268 + }, + { + "epoch": 0.01450291136510675, + "grad_norm": 1.1133167743682861, + "learning_rate": 9.99987162848436e-06, + "loss": 1.0142, + "step": 269 + }, + { + "epoch": 0.01455682553375027, + "grad_norm": 1.1259140968322754, + "learning_rate": 9.999870104806782e-06, + "loss": 0.9803, + "step": 270 + }, + { + "epoch": 0.014610739702393788, + "grad_norm": 1.0813393592834473, + "learning_rate": 
9.999868572140108e-06, + "loss": 0.8728, + "step": 271 + }, + { + "epoch": 0.014664653871037308, + "grad_norm": 0.9939939379692078, + "learning_rate": 9.999867030484347e-06, + "loss": 0.8826, + "step": 272 + }, + { + "epoch": 0.014718568039680828, + "grad_norm": 1.0081939697265625, + "learning_rate": 9.999865479839499e-06, + "loss": 0.8682, + "step": 273 + }, + { + "epoch": 0.014772482208324347, + "grad_norm": 1.0190658569335938, + "learning_rate": 9.999863920205567e-06, + "loss": 0.9094, + "step": 274 + }, + { + "epoch": 0.014826396376967867, + "grad_norm": 1.0702111721038818, + "learning_rate": 9.999862351582553e-06, + "loss": 0.9244, + "step": 275 + }, + { + "epoch": 0.014880310545611387, + "grad_norm": 1.0891972780227661, + "learning_rate": 9.999860773970461e-06, + "loss": 1.0318, + "step": 276 + }, + { + "epoch": 0.014934224714254906, + "grad_norm": 0.9788139462471008, + "learning_rate": 9.999859187369294e-06, + "loss": 0.8779, + "step": 277 + }, + { + "epoch": 0.014988138882898426, + "grad_norm": 1.0678125619888306, + "learning_rate": 9.999857591779055e-06, + "loss": 0.8962, + "step": 278 + }, + { + "epoch": 0.015042053051541946, + "grad_norm": 0.9882293343544006, + "learning_rate": 9.999855987199747e-06, + "loss": 0.9082, + "step": 279 + }, + { + "epoch": 0.015095967220185465, + "grad_norm": 0.9987571835517883, + "learning_rate": 9.999854373631371e-06, + "loss": 0.9708, + "step": 280 + }, + { + "epoch": 0.015149881388828985, + "grad_norm": 1.0238722562789917, + "learning_rate": 9.99985275107393e-06, + "loss": 0.9461, + "step": 281 + }, + { + "epoch": 0.015203795557472503, + "grad_norm": 0.9628013372421265, + "learning_rate": 9.999851119527431e-06, + "loss": 0.9412, + "step": 282 + }, + { + "epoch": 0.015257709726116023, + "grad_norm": 1.0021862983703613, + "learning_rate": 9.999849478991873e-06, + "loss": 0.8461, + "step": 283 + }, + { + "epoch": 0.015311623894759542, + "grad_norm": 0.9776142239570618, + "learning_rate": 9.99984782946726e-06, + "loss": 
0.962, + "step": 284 + }, + { + "epoch": 0.015365538063403062, + "grad_norm": 1.0114799737930298, + "learning_rate": 9.999846170953593e-06, + "loss": 0.8732, + "step": 285 + }, + { + "epoch": 0.015419452232046582, + "grad_norm": 0.9860401749610901, + "learning_rate": 9.999844503450879e-06, + "loss": 0.8204, + "step": 286 + }, + { + "epoch": 0.015473366400690101, + "grad_norm": 1.0743263959884644, + "learning_rate": 9.999842826959119e-06, + "loss": 0.9445, + "step": 287 + }, + { + "epoch": 0.015527280569333621, + "grad_norm": 1.0456606149673462, + "learning_rate": 9.999841141478315e-06, + "loss": 0.8869, + "step": 288 + }, + { + "epoch": 0.01558119473797714, + "grad_norm": 1.0299748182296753, + "learning_rate": 9.99983944700847e-06, + "loss": 0.9543, + "step": 289 + }, + { + "epoch": 0.01563510890662066, + "grad_norm": 1.0176036357879639, + "learning_rate": 9.99983774354959e-06, + "loss": 0.9672, + "step": 290 + }, + { + "epoch": 0.01568902307526418, + "grad_norm": 1.0023303031921387, + "learning_rate": 9.999836031101675e-06, + "loss": 0.9417, + "step": 291 + }, + { + "epoch": 0.015742937243907698, + "grad_norm": 0.9801005721092224, + "learning_rate": 9.99983430966473e-06, + "loss": 0.9376, + "step": 292 + }, + { + "epoch": 0.01579685141255122, + "grad_norm": 1.002906322479248, + "learning_rate": 9.999832579238756e-06, + "loss": 0.8973, + "step": 293 + }, + { + "epoch": 0.015850765581194737, + "grad_norm": 1.0014845132827759, + "learning_rate": 9.999830839823759e-06, + "loss": 0.9583, + "step": 294 + }, + { + "epoch": 0.01590467974983826, + "grad_norm": 1.0173449516296387, + "learning_rate": 9.999829091419739e-06, + "loss": 0.9006, + "step": 295 + }, + { + "epoch": 0.015958593918481777, + "grad_norm": 0.9779545664787292, + "learning_rate": 9.999827334026702e-06, + "loss": 0.9342, + "step": 296 + }, + { + "epoch": 0.016012508087125298, + "grad_norm": 0.9800315499305725, + "learning_rate": 9.999825567644648e-06, + "loss": 0.7948, + "step": 297 + }, + { + "epoch": 
0.016066422255768816, + "grad_norm": 0.9628249406814575, + "learning_rate": 9.999823792273583e-06, + "loss": 0.8415, + "step": 298 + }, + { + "epoch": 0.016120336424412334, + "grad_norm": 1.1227449178695679, + "learning_rate": 9.99982200791351e-06, + "loss": 0.9646, + "step": 299 + }, + { + "epoch": 0.016174250593055856, + "grad_norm": 1.1018567085266113, + "learning_rate": 9.99982021456443e-06, + "loss": 0.8647, + "step": 300 + }, + { + "epoch": 0.016228164761699373, + "grad_norm": 1.1017298698425293, + "learning_rate": 9.999818412226347e-06, + "loss": 0.8708, + "step": 301 + }, + { + "epoch": 0.016282078930342895, + "grad_norm": 1.084594488143921, + "learning_rate": 9.999816600899267e-06, + "loss": 0.9765, + "step": 302 + }, + { + "epoch": 0.016335993098986413, + "grad_norm": 1.3735941648483276, + "learning_rate": 9.99981478058319e-06, + "loss": 1.0253, + "step": 303 + }, + { + "epoch": 0.016389907267629934, + "grad_norm": 1.1644489765167236, + "learning_rate": 9.999812951278119e-06, + "loss": 0.8519, + "step": 304 + }, + { + "epoch": 0.016443821436273452, + "grad_norm": 1.0079474449157715, + "learning_rate": 9.99981111298406e-06, + "loss": 0.9422, + "step": 305 + }, + { + "epoch": 0.016497735604916974, + "grad_norm": 1.0046736001968384, + "learning_rate": 9.999809265701015e-06, + "loss": 0.7766, + "step": 306 + }, + { + "epoch": 0.01655164977356049, + "grad_norm": 1.0312374830245972, + "learning_rate": 9.999807409428987e-06, + "loss": 0.8844, + "step": 307 + }, + { + "epoch": 0.01660556394220401, + "grad_norm": 1.0419421195983887, + "learning_rate": 9.99980554416798e-06, + "loss": 0.8902, + "step": 308 + }, + { + "epoch": 0.01665947811084753, + "grad_norm": 1.2056832313537598, + "learning_rate": 9.999803669917996e-06, + "loss": 0.9842, + "step": 309 + }, + { + "epoch": 0.01671339227949105, + "grad_norm": 0.9645346403121948, + "learning_rate": 9.999801786679039e-06, + "loss": 0.7837, + "step": 310 + }, + { + "epoch": 0.01676730644813457, + "grad_norm": 
1.0259841680526733, + "learning_rate": 9.999799894451115e-06, + "loss": 0.8927, + "step": 311 + }, + { + "epoch": 0.016821220616778088, + "grad_norm": 0.9932212233543396, + "learning_rate": 9.999797993234224e-06, + "loss": 0.815, + "step": 312 + }, + { + "epoch": 0.01687513478542161, + "grad_norm": 1.0666078329086304, + "learning_rate": 9.99979608302837e-06, + "loss": 0.8245, + "step": 313 + }, + { + "epoch": 0.016929048954065128, + "grad_norm": 0.9566568732261658, + "learning_rate": 9.999794163833557e-06, + "loss": 0.851, + "step": 314 + }, + { + "epoch": 0.01698296312270865, + "grad_norm": 1.0056332349777222, + "learning_rate": 9.999792235649789e-06, + "loss": 0.8704, + "step": 315 + }, + { + "epoch": 0.017036877291352167, + "grad_norm": 1.036537528038025, + "learning_rate": 9.999790298477068e-06, + "loss": 0.9512, + "step": 316 + }, + { + "epoch": 0.01709079145999569, + "grad_norm": 1.1026023626327515, + "learning_rate": 9.9997883523154e-06, + "loss": 1.0007, + "step": 317 + }, + { + "epoch": 0.017144705628639206, + "grad_norm": 1.006659984588623, + "learning_rate": 9.999786397164786e-06, + "loss": 0.8992, + "step": 318 + }, + { + "epoch": 0.017198619797282724, + "grad_norm": 1.0100573301315308, + "learning_rate": 9.99978443302523e-06, + "loss": 0.9545, + "step": 319 + }, + { + "epoch": 0.017252533965926246, + "grad_norm": 1.000086784362793, + "learning_rate": 9.999782459896735e-06, + "loss": 0.8732, + "step": 320 + }, + { + "epoch": 0.017306448134569764, + "grad_norm": 1.2039650678634644, + "learning_rate": 9.999780477779306e-06, + "loss": 0.9881, + "step": 321 + }, + { + "epoch": 0.017360362303213285, + "grad_norm": 1.0316474437713623, + "learning_rate": 9.999778486672948e-06, + "loss": 0.8686, + "step": 322 + }, + { + "epoch": 0.017414276471856803, + "grad_norm": 1.1697666645050049, + "learning_rate": 9.999776486577661e-06, + "loss": 0.9185, + "step": 323 + }, + { + "epoch": 0.017468190640500324, + "grad_norm": 0.9523053169250488, + "learning_rate": 
9.999774477493451e-06, + "loss": 0.858, + "step": 324 + }, + { + "epoch": 0.017522104809143842, + "grad_norm": 0.9660015106201172, + "learning_rate": 9.999772459420319e-06, + "loss": 0.9964, + "step": 325 + }, + { + "epoch": 0.017576018977787364, + "grad_norm": 0.971128523349762, + "learning_rate": 9.999770432358271e-06, + "loss": 0.8999, + "step": 326 + }, + { + "epoch": 0.01762993314643088, + "grad_norm": 1.221969485282898, + "learning_rate": 9.999768396307312e-06, + "loss": 0.8628, + "step": 327 + }, + { + "epoch": 0.017683847315074403, + "grad_norm": 1.0868507623672485, + "learning_rate": 9.999766351267442e-06, + "loss": 1.0732, + "step": 328 + }, + { + "epoch": 0.01773776148371792, + "grad_norm": 0.9527992606163025, + "learning_rate": 9.999764297238666e-06, + "loss": 0.8221, + "step": 329 + }, + { + "epoch": 0.01779167565236144, + "grad_norm": 0.9969122409820557, + "learning_rate": 9.99976223422099e-06, + "loss": 0.9234, + "step": 330 + }, + { + "epoch": 0.01784558982100496, + "grad_norm": 0.9291784763336182, + "learning_rate": 9.999760162214415e-06, + "loss": 0.7839, + "step": 331 + }, + { + "epoch": 0.01789950398964848, + "grad_norm": 0.9766960144042969, + "learning_rate": 9.999758081218944e-06, + "loss": 0.7929, + "step": 332 + }, + { + "epoch": 0.017953418158292, + "grad_norm": 0.9536904692649841, + "learning_rate": 9.999755991234585e-06, + "loss": 0.9136, + "step": 333 + }, + { + "epoch": 0.018007332326935518, + "grad_norm": 1.0325372219085693, + "learning_rate": 9.999753892261337e-06, + "loss": 0.8367, + "step": 334 + }, + { + "epoch": 0.01806124649557904, + "grad_norm": 0.9486141800880432, + "learning_rate": 9.999751784299207e-06, + "loss": 0.8802, + "step": 335 + }, + { + "epoch": 0.018115160664222557, + "grad_norm": 0.9880577921867371, + "learning_rate": 9.999749667348198e-06, + "loss": 0.8597, + "step": 336 + }, + { + "epoch": 0.01816907483286608, + "grad_norm": 1.043199896812439, + "learning_rate": 9.999747541408312e-06, + "loss": 0.9142, + "step": 
337 + }, + { + "epoch": 0.018222989001509596, + "grad_norm": 1.0606465339660645, + "learning_rate": 9.999745406479554e-06, + "loss": 0.9876, + "step": 338 + }, + { + "epoch": 0.018276903170153118, + "grad_norm": 1.139449954032898, + "learning_rate": 9.999743262561929e-06, + "loss": 0.7773, + "step": 339 + }, + { + "epoch": 0.018330817338796636, + "grad_norm": 1.1416115760803223, + "learning_rate": 9.99974110965544e-06, + "loss": 0.9566, + "step": 340 + }, + { + "epoch": 0.018384731507440154, + "grad_norm": 1.0145153999328613, + "learning_rate": 9.99973894776009e-06, + "loss": 0.9543, + "step": 341 + }, + { + "epoch": 0.018438645676083675, + "grad_norm": 0.950528621673584, + "learning_rate": 9.999736776875885e-06, + "loss": 0.8007, + "step": 342 + }, + { + "epoch": 0.018492559844727193, + "grad_norm": 0.9080097079277039, + "learning_rate": 9.999734597002826e-06, + "loss": 0.8273, + "step": 343 + }, + { + "epoch": 0.018546474013370715, + "grad_norm": 1.0038888454437256, + "learning_rate": 9.99973240814092e-06, + "loss": 0.9394, + "step": 344 + }, + { + "epoch": 0.018600388182014232, + "grad_norm": 1.05253267288208, + "learning_rate": 9.999730210290168e-06, + "loss": 0.9485, + "step": 345 + }, + { + "epoch": 0.018654302350657754, + "grad_norm": 0.9396592974662781, + "learning_rate": 9.999728003450577e-06, + "loss": 0.8943, + "step": 346 + }, + { + "epoch": 0.018708216519301272, + "grad_norm": 1.149387240409851, + "learning_rate": 9.999725787622148e-06, + "loss": 0.8566, + "step": 347 + }, + { + "epoch": 0.018762130687944793, + "grad_norm": 1.1573290824890137, + "learning_rate": 9.999723562804887e-06, + "loss": 0.9641, + "step": 348 + }, + { + "epoch": 0.01881604485658831, + "grad_norm": 1.0217385292053223, + "learning_rate": 9.999721328998797e-06, + "loss": 0.9555, + "step": 349 + }, + { + "epoch": 0.018869959025231833, + "grad_norm": 1.034690499305725, + "learning_rate": 9.999719086203884e-06, + "loss": 0.9407, + "step": 350 + }, + { + "epoch": 0.01892387319387535, + 
"grad_norm": 0.9819002151489258, + "learning_rate": 9.999716834420148e-06, + "loss": 0.9104, + "step": 351 + }, + { + "epoch": 0.01897778736251887, + "grad_norm": 1.0459688901901245, + "learning_rate": 9.999714573647597e-06, + "loss": 0.9296, + "step": 352 + }, + { + "epoch": 0.01903170153116239, + "grad_norm": 0.9575183391571045, + "learning_rate": 9.999712303886232e-06, + "loss": 0.8517, + "step": 353 + }, + { + "epoch": 0.019085615699805908, + "grad_norm": 1.0018881559371948, + "learning_rate": 9.99971002513606e-06, + "loss": 0.9208, + "step": 354 + }, + { + "epoch": 0.01913952986844943, + "grad_norm": 1.0291972160339355, + "learning_rate": 9.999707737397085e-06, + "loss": 0.8765, + "step": 355 + }, + { + "epoch": 0.019193444037092947, + "grad_norm": 1.0081498622894287, + "learning_rate": 9.999705440669306e-06, + "loss": 0.9204, + "step": 356 + }, + { + "epoch": 0.01924735820573647, + "grad_norm": 0.956950843334198, + "learning_rate": 9.999703134952733e-06, + "loss": 0.8058, + "step": 357 + }, + { + "epoch": 0.019301272374379987, + "grad_norm": 1.1130229234695435, + "learning_rate": 9.999700820247369e-06, + "loss": 0.8202, + "step": 358 + }, + { + "epoch": 0.019355186543023508, + "grad_norm": 1.047211766242981, + "learning_rate": 9.999698496553216e-06, + "loss": 0.9357, + "step": 359 + }, + { + "epoch": 0.019409100711667026, + "grad_norm": 1.0225415229797363, + "learning_rate": 9.99969616387028e-06, + "loss": 0.8306, + "step": 360 + }, + { + "epoch": 0.019463014880310544, + "grad_norm": 1.060727596282959, + "learning_rate": 9.999693822198564e-06, + "loss": 0.9178, + "step": 361 + }, + { + "epoch": 0.019516929048954065, + "grad_norm": 1.0743412971496582, + "learning_rate": 9.999691471538074e-06, + "loss": 0.8761, + "step": 362 + }, + { + "epoch": 0.019570843217597583, + "grad_norm": 1.2229491472244263, + "learning_rate": 9.99968911188881e-06, + "loss": 1.0738, + "step": 363 + }, + { + "epoch": 0.019624757386241105, + "grad_norm": 0.9889073967933655, + 
"learning_rate": 9.999686743250783e-06, + "loss": 0.9458, + "step": 364 + }, + { + "epoch": 0.019678671554884623, + "grad_norm": 1.0398520231246948, + "learning_rate": 9.999684365623992e-06, + "loss": 0.9096, + "step": 365 + }, + { + "epoch": 0.019732585723528144, + "grad_norm": 1.0613081455230713, + "learning_rate": 9.999681979008442e-06, + "loss": 0.9312, + "step": 366 + }, + { + "epoch": 0.019786499892171662, + "grad_norm": 0.946211040019989, + "learning_rate": 9.99967958340414e-06, + "loss": 0.9208, + "step": 367 + }, + { + "epoch": 0.019840414060815183, + "grad_norm": 1.1298933029174805, + "learning_rate": 9.999677178811087e-06, + "loss": 0.9378, + "step": 368 + }, + { + "epoch": 0.0198943282294587, + "grad_norm": 1.1042351722717285, + "learning_rate": 9.999674765229288e-06, + "loss": 0.9487, + "step": 369 + }, + { + "epoch": 0.019948242398102223, + "grad_norm": 1.0717188119888306, + "learning_rate": 9.999672342658751e-06, + "loss": 0.939, + "step": 370 + }, + { + "epoch": 0.02000215656674574, + "grad_norm": 1.0936871767044067, + "learning_rate": 9.999669911099474e-06, + "loss": 1.1361, + "step": 371 + }, + { + "epoch": 0.02005607073538926, + "grad_norm": 1.0650005340576172, + "learning_rate": 9.999667470551466e-06, + "loss": 0.9709, + "step": 372 + }, + { + "epoch": 0.02010998490403278, + "grad_norm": 1.0154083967208862, + "learning_rate": 9.999665021014731e-06, + "loss": 0.9422, + "step": 373 + }, + { + "epoch": 0.020163899072676298, + "grad_norm": 1.1382607221603394, + "learning_rate": 9.999662562489272e-06, + "loss": 0.984, + "step": 374 + }, + { + "epoch": 0.02021781324131982, + "grad_norm": 0.9372896552085876, + "learning_rate": 9.999660094975095e-06, + "loss": 0.9857, + "step": 375 + }, + { + "epoch": 0.020271727409963337, + "grad_norm": 1.1777011156082153, + "learning_rate": 9.999657618472203e-06, + "loss": 0.9731, + "step": 376 + }, + { + "epoch": 0.02032564157860686, + "grad_norm": 0.9054237604141235, + "learning_rate": 9.9996551329806e-06, + "loss": 
0.9104, + "step": 377 + }, + { + "epoch": 0.020379555747250377, + "grad_norm": 0.9255661964416504, + "learning_rate": 9.999652638500292e-06, + "loss": 0.8632, + "step": 378 + }, + { + "epoch": 0.020433469915893898, + "grad_norm": 0.9440998435020447, + "learning_rate": 9.999650135031282e-06, + "loss": 0.8945, + "step": 379 + }, + { + "epoch": 0.020487384084537416, + "grad_norm": 0.9822732210159302, + "learning_rate": 9.999647622573577e-06, + "loss": 0.8874, + "step": 380 + }, + { + "epoch": 0.020541298253180938, + "grad_norm": 1.1294387578964233, + "learning_rate": 9.999645101127179e-06, + "loss": 0.9892, + "step": 381 + }, + { + "epoch": 0.020595212421824455, + "grad_norm": 1.0458290576934814, + "learning_rate": 9.999642570692094e-06, + "loss": 0.9163, + "step": 382 + }, + { + "epoch": 0.020649126590467973, + "grad_norm": 0.8124557733535767, + "learning_rate": 9.999640031268326e-06, + "loss": 0.6927, + "step": 383 + }, + { + "epoch": 0.020703040759111495, + "grad_norm": 1.1053259372711182, + "learning_rate": 9.999637482855878e-06, + "loss": 0.8651, + "step": 384 + }, + { + "epoch": 0.020756954927755013, + "grad_norm": 1.1280632019042969, + "learning_rate": 9.999634925454757e-06, + "loss": 0.9708, + "step": 385 + }, + { + "epoch": 0.020810869096398534, + "grad_norm": 0.9916180372238159, + "learning_rate": 9.999632359064965e-06, + "loss": 0.9081, + "step": 386 + }, + { + "epoch": 0.020864783265042052, + "grad_norm": 1.0430771112442017, + "learning_rate": 9.99962978368651e-06, + "loss": 0.9837, + "step": 387 + }, + { + "epoch": 0.020918697433685574, + "grad_norm": 1.031343698501587, + "learning_rate": 9.999627199319398e-06, + "loss": 0.9156, + "step": 388 + }, + { + "epoch": 0.02097261160232909, + "grad_norm": 1.0157191753387451, + "learning_rate": 9.999624605963627e-06, + "loss": 0.9379, + "step": 389 + }, + { + "epoch": 0.021026525770972613, + "grad_norm": 0.9524544477462769, + "learning_rate": 9.999622003619204e-06, + "loss": 0.8448, + "step": 390 + }, + { + 
"epoch": 0.02108043993961613, + "grad_norm": 1.091670036315918, + "learning_rate": 9.999619392286137e-06, + "loss": 0.9794, + "step": 391 + }, + { + "epoch": 0.021134354108259652, + "grad_norm": 1.0502233505249023, + "learning_rate": 9.999616771964429e-06, + "loss": 1.0047, + "step": 392 + }, + { + "epoch": 0.02118826827690317, + "grad_norm": 1.2087476253509521, + "learning_rate": 9.999614142654084e-06, + "loss": 0.8964, + "step": 393 + }, + { + "epoch": 0.021242182445546688, + "grad_norm": 1.0264590978622437, + "learning_rate": 9.999611504355106e-06, + "loss": 0.8608, + "step": 394 + }, + { + "epoch": 0.02129609661419021, + "grad_norm": 0.9883281588554382, + "learning_rate": 9.999608857067503e-06, + "loss": 0.9109, + "step": 395 + }, + { + "epoch": 0.021350010782833728, + "grad_norm": 0.9913623332977295, + "learning_rate": 9.999606200791276e-06, + "loss": 0.8993, + "step": 396 + }, + { + "epoch": 0.02140392495147725, + "grad_norm": 1.019178867340088, + "learning_rate": 9.999603535526432e-06, + "loss": 0.9115, + "step": 397 + }, + { + "epoch": 0.021457839120120767, + "grad_norm": 0.9756026864051819, + "learning_rate": 9.999600861272974e-06, + "loss": 0.834, + "step": 398 + }, + { + "epoch": 0.02151175328876429, + "grad_norm": 0.9956341981887817, + "learning_rate": 9.999598178030909e-06, + "loss": 0.8756, + "step": 399 + }, + { + "epoch": 0.021565667457407806, + "grad_norm": 1.0267717838287354, + "learning_rate": 9.999595485800239e-06, + "loss": 0.9427, + "step": 400 + }, + { + "epoch": 0.021619581626051328, + "grad_norm": 1.061139464378357, + "learning_rate": 9.999592784580974e-06, + "loss": 0.9835, + "step": 401 + }, + { + "epoch": 0.021673495794694846, + "grad_norm": 0.9970353245735168, + "learning_rate": 9.999590074373114e-06, + "loss": 0.8946, + "step": 402 + }, + { + "epoch": 0.021727409963338367, + "grad_norm": 1.056242823600769, + "learning_rate": 9.999587355176664e-06, + "loss": 0.9076, + "step": 403 + }, + { + "epoch": 0.021781324131981885, + "grad_norm": 
1.0285427570343018, + "learning_rate": 9.999584626991632e-06, + "loss": 0.8506, + "step": 404 + }, + { + "epoch": 0.021835238300625403, + "grad_norm": 1.0026901960372925, + "learning_rate": 9.99958188981802e-06, + "loss": 0.8457, + "step": 405 + }, + { + "epoch": 0.021889152469268924, + "grad_norm": 0.8921003341674805, + "learning_rate": 9.999579143655833e-06, + "loss": 0.8215, + "step": 406 + }, + { + "epoch": 0.021943066637912442, + "grad_norm": 1.2816855907440186, + "learning_rate": 9.99957638850508e-06, + "loss": 0.8779, + "step": 407 + }, + { + "epoch": 0.021996980806555964, + "grad_norm": 1.4713681936264038, + "learning_rate": 9.99957362436576e-06, + "loss": 0.8581, + "step": 408 + }, + { + "epoch": 0.02205089497519948, + "grad_norm": 1.0117568969726562, + "learning_rate": 9.999570851237883e-06, + "loss": 0.8865, + "step": 409 + }, + { + "epoch": 0.022104809143843003, + "grad_norm": 0.9530962705612183, + "learning_rate": 9.99956806912145e-06, + "loss": 0.8888, + "step": 410 + }, + { + "epoch": 0.02215872331248652, + "grad_norm": 0.865692675113678, + "learning_rate": 9.99956527801647e-06, + "loss": 0.8075, + "step": 411 + }, + { + "epoch": 0.022212637481130042, + "grad_norm": 0.9613220691680908, + "learning_rate": 9.999562477922944e-06, + "loss": 0.9289, + "step": 412 + }, + { + "epoch": 0.02226655164977356, + "grad_norm": 0.9419745802879333, + "learning_rate": 9.99955966884088e-06, + "loss": 0.8758, + "step": 413 + }, + { + "epoch": 0.02232046581841708, + "grad_norm": 1.0120573043823242, + "learning_rate": 9.999556850770282e-06, + "loss": 0.9014, + "step": 414 + }, + { + "epoch": 0.0223743799870606, + "grad_norm": 0.9833963513374329, + "learning_rate": 9.999554023711155e-06, + "loss": 0.9354, + "step": 415 + }, + { + "epoch": 0.022428294155704118, + "grad_norm": 0.9058681130409241, + "learning_rate": 9.999551187663505e-06, + "loss": 0.9201, + "step": 416 + }, + { + "epoch": 0.02248220832434764, + "grad_norm": 1.0103633403778076, + "learning_rate": 
9.999548342627334e-06, + "loss": 0.9023, + "step": 417 + }, + { + "epoch": 0.022536122492991157, + "grad_norm": 0.8671039342880249, + "learning_rate": 9.99954548860265e-06, + "loss": 0.7263, + "step": 418 + }, + { + "epoch": 0.02259003666163468, + "grad_norm": 1.0967090129852295, + "learning_rate": 9.999542625589461e-06, + "loss": 1.0616, + "step": 419 + }, + { + "epoch": 0.022643950830278196, + "grad_norm": 0.9032139778137207, + "learning_rate": 9.999539753587764e-06, + "loss": 0.782, + "step": 420 + }, + { + "epoch": 0.022697864998921718, + "grad_norm": 0.9532387256622314, + "learning_rate": 9.99953687259757e-06, + "loss": 0.9628, + "step": 421 + }, + { + "epoch": 0.022751779167565236, + "grad_norm": 0.9732246994972229, + "learning_rate": 9.999533982618885e-06, + "loss": 0.8682, + "step": 422 + }, + { + "epoch": 0.022805693336208757, + "grad_norm": 0.9160019159317017, + "learning_rate": 9.99953108365171e-06, + "loss": 0.9051, + "step": 423 + }, + { + "epoch": 0.022859607504852275, + "grad_norm": 1.0100488662719727, + "learning_rate": 9.999528175696054e-06, + "loss": 0.9836, + "step": 424 + }, + { + "epoch": 0.022913521673495793, + "grad_norm": 1.0130014419555664, + "learning_rate": 9.99952525875192e-06, + "loss": 0.8653, + "step": 425 + }, + { + "epoch": 0.022967435842139314, + "grad_norm": 0.9726247787475586, + "learning_rate": 9.999522332819313e-06, + "loss": 0.8761, + "step": 426 + }, + { + "epoch": 0.023021350010782832, + "grad_norm": 0.9457972049713135, + "learning_rate": 9.99951939789824e-06, + "loss": 0.8792, + "step": 427 + }, + { + "epoch": 0.023075264179426354, + "grad_norm": 1.083130121231079, + "learning_rate": 9.999516453988706e-06, + "loss": 0.9035, + "step": 428 + }, + { + "epoch": 0.023129178348069872, + "grad_norm": 0.9195771217346191, + "learning_rate": 9.999513501090714e-06, + "loss": 0.8586, + "step": 429 + }, + { + "epoch": 0.023183092516713393, + "grad_norm": 0.983346700668335, + "learning_rate": 9.999510539204273e-06, + "loss": 0.8335, + 
"step": 430 + }, + { + "epoch": 0.02323700668535691, + "grad_norm": 1.0524029731750488, + "learning_rate": 9.999507568329386e-06, + "loss": 0.838, + "step": 431 + }, + { + "epoch": 0.023290920854000433, + "grad_norm": 1.0267860889434814, + "learning_rate": 9.999504588466058e-06, + "loss": 0.9345, + "step": 432 + }, + { + "epoch": 0.02334483502264395, + "grad_norm": 1.025707483291626, + "learning_rate": 9.999501599614294e-06, + "loss": 0.9042, + "step": 433 + }, + { + "epoch": 0.023398749191287472, + "grad_norm": 0.9739174842834473, + "learning_rate": 9.999498601774101e-06, + "loss": 0.7433, + "step": 434 + }, + { + "epoch": 0.02345266335993099, + "grad_norm": 0.9468310475349426, + "learning_rate": 9.999495594945486e-06, + "loss": 0.8447, + "step": 435 + }, + { + "epoch": 0.023506577528574508, + "grad_norm": 0.9820529818534851, + "learning_rate": 9.99949257912845e-06, + "loss": 0.8842, + "step": 436 + }, + { + "epoch": 0.02356049169721803, + "grad_norm": 0.998515784740448, + "learning_rate": 9.999489554323e-06, + "loss": 0.9226, + "step": 437 + }, + { + "epoch": 0.023614405865861547, + "grad_norm": 0.9819791316986084, + "learning_rate": 9.999486520529144e-06, + "loss": 0.8559, + "step": 438 + }, + { + "epoch": 0.02366832003450507, + "grad_norm": 0.9468326568603516, + "learning_rate": 9.999483477746884e-06, + "loss": 0.8064, + "step": 439 + }, + { + "epoch": 0.023722234203148587, + "grad_norm": 1.0087614059448242, + "learning_rate": 9.999480425976229e-06, + "loss": 0.9232, + "step": 440 + }, + { + "epoch": 0.023776148371792108, + "grad_norm": 0.9446098208427429, + "learning_rate": 9.99947736521718e-06, + "loss": 0.8511, + "step": 441 + }, + { + "epoch": 0.023830062540435626, + "grad_norm": 1.0966850519180298, + "learning_rate": 9.999474295469746e-06, + "loss": 0.9929, + "step": 442 + }, + { + "epoch": 0.023883976709079147, + "grad_norm": 0.8858770728111267, + "learning_rate": 9.99947121673393e-06, + "loss": 0.8492, + "step": 443 + }, + { + "epoch": 
0.023937890877722665, + "grad_norm": 1.083717703819275, + "learning_rate": 9.999468129009742e-06, + "loss": 0.9948, + "step": 444 + }, + { + "epoch": 0.023991805046366187, + "grad_norm": 1.0251178741455078, + "learning_rate": 9.999465032297184e-06, + "loss": 0.8769, + "step": 445 + }, + { + "epoch": 0.024045719215009705, + "grad_norm": 0.9331875443458557, + "learning_rate": 9.999461926596261e-06, + "loss": 0.8663, + "step": 446 + }, + { + "epoch": 0.024099633383653223, + "grad_norm": 0.8941493034362793, + "learning_rate": 9.999458811906979e-06, + "loss": 0.8172, + "step": 447 + }, + { + "epoch": 0.024153547552296744, + "grad_norm": 0.9978699684143066, + "learning_rate": 9.999455688229347e-06, + "loss": 0.9303, + "step": 448 + }, + { + "epoch": 0.024207461720940262, + "grad_norm": 0.8835211992263794, + "learning_rate": 9.999452555563366e-06, + "loss": 0.8921, + "step": 449 + }, + { + "epoch": 0.024261375889583783, + "grad_norm": 0.9061810970306396, + "learning_rate": 9.999449413909043e-06, + "loss": 0.8201, + "step": 450 + }, + { + "epoch": 0.0243152900582273, + "grad_norm": 1.0061571598052979, + "learning_rate": 9.999446263266385e-06, + "loss": 0.8506, + "step": 451 + }, + { + "epoch": 0.024369204226870823, + "grad_norm": 0.9286402463912964, + "learning_rate": 9.999443103635398e-06, + "loss": 0.8532, + "step": 452 + }, + { + "epoch": 0.02442311839551434, + "grad_norm": 1.0919772386550903, + "learning_rate": 9.999439935016087e-06, + "loss": 0.9466, + "step": 453 + }, + { + "epoch": 0.024477032564157862, + "grad_norm": 1.0552513599395752, + "learning_rate": 9.999436757408453e-06, + "loss": 0.8406, + "step": 454 + }, + { + "epoch": 0.02453094673280138, + "grad_norm": 0.9604331851005554, + "learning_rate": 9.999433570812511e-06, + "loss": 0.8928, + "step": 455 + }, + { + "epoch": 0.0245848609014449, + "grad_norm": 1.0126323699951172, + "learning_rate": 9.999430375228259e-06, + "loss": 0.924, + "step": 456 + }, + { + "epoch": 0.02463877507008842, + "grad_norm": 
1.0540791749954224, + "learning_rate": 9.999427170655707e-06, + "loss": 0.9656, + "step": 457 + }, + { + "epoch": 0.024692689238731937, + "grad_norm": 0.8622417449951172, + "learning_rate": 9.999423957094857e-06, + "loss": 0.7428, + "step": 458 + }, + { + "epoch": 0.02474660340737546, + "grad_norm": 1.106581211090088, + "learning_rate": 9.999420734545719e-06, + "loss": 0.9258, + "step": 459 + }, + { + "epoch": 0.024800517576018977, + "grad_norm": 0.990807294845581, + "learning_rate": 9.999417503008296e-06, + "loss": 0.9083, + "step": 460 + }, + { + "epoch": 0.024854431744662498, + "grad_norm": 0.9302589893341064, + "learning_rate": 9.999414262482594e-06, + "loss": 0.8654, + "step": 461 + }, + { + "epoch": 0.024908345913306016, + "grad_norm": 1.0218255519866943, + "learning_rate": 9.999411012968621e-06, + "loss": 0.8996, + "step": 462 + }, + { + "epoch": 0.024962260081949537, + "grad_norm": 0.976108193397522, + "learning_rate": 9.99940775446638e-06, + "loss": 0.9423, + "step": 463 + }, + { + "epoch": 0.025016174250593055, + "grad_norm": 1.1027617454528809, + "learning_rate": 9.99940448697588e-06, + "loss": 1.0407, + "step": 464 + }, + { + "epoch": 0.025070088419236577, + "grad_norm": 1.0148764848709106, + "learning_rate": 9.999401210497122e-06, + "loss": 0.9418, + "step": 465 + }, + { + "epoch": 0.025124002587880095, + "grad_norm": 1.0120681524276733, + "learning_rate": 9.999397925030116e-06, + "loss": 0.92, + "step": 466 + }, + { + "epoch": 0.025177916756523613, + "grad_norm": 1.1855127811431885, + "learning_rate": 9.999394630574868e-06, + "loss": 0.9285, + "step": 467 + }, + { + "epoch": 0.025231830925167134, + "grad_norm": 1.8014320135116577, + "learning_rate": 9.999391327131383e-06, + "loss": 0.979, + "step": 468 + }, + { + "epoch": 0.025285745093810652, + "grad_norm": 1.1568403244018555, + "learning_rate": 9.999388014699664e-06, + "loss": 0.9574, + "step": 469 + }, + { + "epoch": 0.025339659262454173, + "grad_norm": 1.2544865608215332, + "learning_rate": 
9.99938469327972e-06, + "loss": 0.8356, + "step": 470 + }, + { + "epoch": 0.02539357343109769, + "grad_norm": 1.8647997379302979, + "learning_rate": 9.99938136287156e-06, + "loss": 0.9181, + "step": 471 + }, + { + "epoch": 0.025447487599741213, + "grad_norm": 0.9942222237586975, + "learning_rate": 9.999378023475184e-06, + "loss": 0.9297, + "step": 472 + }, + { + "epoch": 0.02550140176838473, + "grad_norm": 0.9839766621589661, + "learning_rate": 9.9993746750906e-06, + "loss": 0.9181, + "step": 473 + }, + { + "epoch": 0.025555315937028252, + "grad_norm": 0.9353258609771729, + "learning_rate": 9.999371317717817e-06, + "loss": 0.8789, + "step": 474 + }, + { + "epoch": 0.02560923010567177, + "grad_norm": 0.9256170988082886, + "learning_rate": 9.999367951356838e-06, + "loss": 0.8725, + "step": 475 + }, + { + "epoch": 0.02566314427431529, + "grad_norm": 1.1102124452590942, + "learning_rate": 9.999364576007669e-06, + "loss": 0.9818, + "step": 476 + }, + { + "epoch": 0.02571705844295881, + "grad_norm": 1.04171884059906, + "learning_rate": 9.999361191670316e-06, + "loss": 0.9275, + "step": 477 + }, + { + "epoch": 0.025770972611602327, + "grad_norm": 0.9670290350914001, + "learning_rate": 9.999357798344787e-06, + "loss": 0.8919, + "step": 478 + }, + { + "epoch": 0.02582488678024585, + "grad_norm": 1.0543723106384277, + "learning_rate": 9.999354396031085e-06, + "loss": 0.9356, + "step": 479 + }, + { + "epoch": 0.025878800948889367, + "grad_norm": 1.1368457078933716, + "learning_rate": 9.99935098472922e-06, + "loss": 0.9387, + "step": 480 + }, + { + "epoch": 0.025932715117532888, + "grad_norm": 1.0627872943878174, + "learning_rate": 9.999347564439196e-06, + "loss": 1.0047, + "step": 481 + }, + { + "epoch": 0.025986629286176406, + "grad_norm": 0.9553730487823486, + "learning_rate": 9.999344135161018e-06, + "loss": 0.8845, + "step": 482 + }, + { + "epoch": 0.026040543454819928, + "grad_norm": 0.9605830907821655, + "learning_rate": 9.999340696894694e-06, + "loss": 0.8816, + 
"step": 483 + }, + { + "epoch": 0.026094457623463446, + "grad_norm": 1.0464140176773071, + "learning_rate": 9.999337249640232e-06, + "loss": 0.9344, + "step": 484 + }, + { + "epoch": 0.026148371792106967, + "grad_norm": 1.0667988061904907, + "learning_rate": 9.999333793397635e-06, + "loss": 0.8834, + "step": 485 + }, + { + "epoch": 0.026202285960750485, + "grad_norm": 0.8996486663818359, + "learning_rate": 9.999330328166908e-06, + "loss": 0.8247, + "step": 486 + }, + { + "epoch": 0.026256200129394006, + "grad_norm": 1.0483838319778442, + "learning_rate": 9.99932685394806e-06, + "loss": 0.9414, + "step": 487 + }, + { + "epoch": 0.026310114298037524, + "grad_norm": 1.2089953422546387, + "learning_rate": 9.999323370741097e-06, + "loss": 1.0913, + "step": 488 + }, + { + "epoch": 0.026364028466681042, + "grad_norm": 1.074291467666626, + "learning_rate": 9.999319878546025e-06, + "loss": 0.8882, + "step": 489 + }, + { + "epoch": 0.026417942635324564, + "grad_norm": 1.0076494216918945, + "learning_rate": 9.99931637736285e-06, + "loss": 0.8393, + "step": 490 + }, + { + "epoch": 0.02647185680396808, + "grad_norm": 1.2263407707214355, + "learning_rate": 9.99931286719158e-06, + "loss": 0.955, + "step": 491 + }, + { + "epoch": 0.026525770972611603, + "grad_norm": 0.9093664884567261, + "learning_rate": 9.999309348032218e-06, + "loss": 0.8366, + "step": 492 + }, + { + "epoch": 0.02657968514125512, + "grad_norm": 1.0704407691955566, + "learning_rate": 9.999305819884772e-06, + "loss": 0.981, + "step": 493 + }, + { + "epoch": 0.026633599309898642, + "grad_norm": 1.2105270624160767, + "learning_rate": 9.999302282749249e-06, + "loss": 0.8896, + "step": 494 + }, + { + "epoch": 0.02668751347854216, + "grad_norm": 1.0142449140548706, + "learning_rate": 9.999298736625654e-06, + "loss": 0.8627, + "step": 495 + }, + { + "epoch": 0.02674142764718568, + "grad_norm": 1.0887057781219482, + "learning_rate": 9.999295181513994e-06, + "loss": 0.8884, + "step": 496 + }, + { + "epoch": 
0.0267953418158292, + "grad_norm": 0.9958952069282532, + "learning_rate": 9.999291617414277e-06, + "loss": 0.7768, + "step": 497 + }, + { + "epoch": 0.02684925598447272, + "grad_norm": 0.8576722741127014, + "learning_rate": 9.999288044326508e-06, + "loss": 0.715, + "step": 498 + }, + { + "epoch": 0.02690317015311624, + "grad_norm": 1.058148741722107, + "learning_rate": 9.999284462250691e-06, + "loss": 0.8693, + "step": 499 + }, + { + "epoch": 0.026957084321759757, + "grad_norm": 0.9429569244384766, + "learning_rate": 9.999280871186837e-06, + "loss": 0.8883, + "step": 500 + }, + { + "epoch": 0.02701099849040328, + "grad_norm": 0.9450993537902832, + "learning_rate": 9.999277271134948e-06, + "loss": 0.9376, + "step": 501 + }, + { + "epoch": 0.027064912659046796, + "grad_norm": 1.0307891368865967, + "learning_rate": 9.999273662095035e-06, + "loss": 0.9098, + "step": 502 + }, + { + "epoch": 0.027118826827690318, + "grad_norm": 0.9515891671180725, + "learning_rate": 9.999270044067101e-06, + "loss": 0.8854, + "step": 503 + }, + { + "epoch": 0.027172740996333836, + "grad_norm": 1.1173255443572998, + "learning_rate": 9.999266417051154e-06, + "loss": 0.7977, + "step": 504 + }, + { + "epoch": 0.027226655164977357, + "grad_norm": 1.028194785118103, + "learning_rate": 9.9992627810472e-06, + "loss": 0.9585, + "step": 505 + }, + { + "epoch": 0.027280569333620875, + "grad_norm": 1.0855528116226196, + "learning_rate": 9.999259136055245e-06, + "loss": 0.9807, + "step": 506 + }, + { + "epoch": 0.027334483502264396, + "grad_norm": 1.1148236989974976, + "learning_rate": 9.999255482075298e-06, + "loss": 0.9672, + "step": 507 + }, + { + "epoch": 0.027388397670907914, + "grad_norm": 0.9697713255882263, + "learning_rate": 9.999251819107364e-06, + "loss": 0.9073, + "step": 508 + }, + { + "epoch": 0.027442311839551436, + "grad_norm": 0.9802384972572327, + "learning_rate": 9.999248147151448e-06, + "loss": 0.8704, + "step": 509 + }, + { + "epoch": 0.027496226008194954, + "grad_norm": 
0.963330090045929, + "learning_rate": 9.999244466207559e-06, + "loss": 0.9312, + "step": 510 + }, + { + "epoch": 0.02755014017683847, + "grad_norm": 0.8776309490203857, + "learning_rate": 9.999240776275703e-06, + "loss": 0.8068, + "step": 511 + }, + { + "epoch": 0.027604054345481993, + "grad_norm": 1.1159353256225586, + "learning_rate": 9.999237077355886e-06, + "loss": 0.8164, + "step": 512 + }, + { + "epoch": 0.02765796851412551, + "grad_norm": 1.004232406616211, + "learning_rate": 9.999233369448115e-06, + "loss": 0.8666, + "step": 513 + }, + { + "epoch": 0.027711882682769032, + "grad_norm": 1.0300110578536987, + "learning_rate": 9.999229652552395e-06, + "loss": 0.8774, + "step": 514 + }, + { + "epoch": 0.02776579685141255, + "grad_norm": 0.8823155164718628, + "learning_rate": 9.999225926668736e-06, + "loss": 0.7579, + "step": 515 + }, + { + "epoch": 0.027819711020056072, + "grad_norm": 0.938956618309021, + "learning_rate": 9.999222191797144e-06, + "loss": 0.8749, + "step": 516 + }, + { + "epoch": 0.02787362518869959, + "grad_norm": 0.9111800789833069, + "learning_rate": 9.999218447937624e-06, + "loss": 0.8915, + "step": 517 + }, + { + "epoch": 0.02792753935734311, + "grad_norm": 0.971813440322876, + "learning_rate": 9.999214695090182e-06, + "loss": 0.9038, + "step": 518 + }, + { + "epoch": 0.02798145352598663, + "grad_norm": 0.9159868359565735, + "learning_rate": 9.999210933254828e-06, + "loss": 0.8726, + "step": 519 + }, + { + "epoch": 0.028035367694630147, + "grad_norm": 1.0223439931869507, + "learning_rate": 9.999207162431566e-06, + "loss": 0.8738, + "step": 520 + }, + { + "epoch": 0.02808928186327367, + "grad_norm": 0.9844004511833191, + "learning_rate": 9.999203382620404e-06, + "loss": 0.8815, + "step": 521 + }, + { + "epoch": 0.028143196031917186, + "grad_norm": 1.1636719703674316, + "learning_rate": 9.99919959382135e-06, + "loss": 0.8781, + "step": 522 + }, + { + "epoch": 0.028197110200560708, + "grad_norm": 0.9637702703475952, + "learning_rate": 
9.999195796034407e-06, + "loss": 0.8491, + "step": 523 + }, + { + "epoch": 0.028251024369204226, + "grad_norm": 0.975931704044342, + "learning_rate": 9.999191989259584e-06, + "loss": 0.9983, + "step": 524 + }, + { + "epoch": 0.028304938537847747, + "grad_norm": 0.9855527877807617, + "learning_rate": 9.999188173496889e-06, + "loss": 0.9587, + "step": 525 + }, + { + "epoch": 0.028358852706491265, + "grad_norm": 0.9925652742385864, + "learning_rate": 9.99918434874633e-06, + "loss": 0.8408, + "step": 526 + }, + { + "epoch": 0.028412766875134787, + "grad_norm": 0.9272180795669556, + "learning_rate": 9.999180515007908e-06, + "loss": 0.8267, + "step": 527 + }, + { + "epoch": 0.028466681043778305, + "grad_norm": 1.161076307296753, + "learning_rate": 9.999176672281636e-06, + "loss": 0.9282, + "step": 528 + }, + { + "epoch": 0.028520595212421826, + "grad_norm": 0.8953909277915955, + "learning_rate": 9.99917282056752e-06, + "loss": 0.8078, + "step": 529 + }, + { + "epoch": 0.028574509381065344, + "grad_norm": 0.9194382429122925, + "learning_rate": 9.999168959865562e-06, + "loss": 0.8385, + "step": 530 + }, + { + "epoch": 0.028628423549708862, + "grad_norm": 1.0351816415786743, + "learning_rate": 9.999165090175775e-06, + "loss": 0.8155, + "step": 531 + }, + { + "epoch": 0.028682337718352383, + "grad_norm": 0.9233224391937256, + "learning_rate": 9.999161211498163e-06, + "loss": 0.8825, + "step": 532 + }, + { + "epoch": 0.0287362518869959, + "grad_norm": 1.0415356159210205, + "learning_rate": 9.999157323832732e-06, + "loss": 0.7844, + "step": 533 + }, + { + "epoch": 0.028790166055639423, + "grad_norm": 1.0329923629760742, + "learning_rate": 9.999153427179492e-06, + "loss": 0.893, + "step": 534 + }, + { + "epoch": 0.02884408022428294, + "grad_norm": 1.237291932106018, + "learning_rate": 9.999149521538448e-06, + "loss": 0.9786, + "step": 535 + }, + { + "epoch": 0.028897994392926462, + "grad_norm": 0.9952654242515564, + "learning_rate": 9.999145606909607e-06, + "loss": 0.9262, + 
"step": 536 + }, + { + "epoch": 0.02895190856156998, + "grad_norm": 1.016533374786377, + "learning_rate": 9.999141683292977e-06, + "loss": 0.9854, + "step": 537 + }, + { + "epoch": 0.0290058227302135, + "grad_norm": 1.0334454774856567, + "learning_rate": 9.999137750688564e-06, + "loss": 0.8928, + "step": 538 + }, + { + "epoch": 0.02905973689885702, + "grad_norm": 0.941662609577179, + "learning_rate": 9.999133809096374e-06, + "loss": 0.8698, + "step": 539 + }, + { + "epoch": 0.02911365106750054, + "grad_norm": 0.9454428553581238, + "learning_rate": 9.999129858516418e-06, + "loss": 0.9261, + "step": 540 + }, + { + "epoch": 0.02916756523614406, + "grad_norm": 1.0921217203140259, + "learning_rate": 9.9991258989487e-06, + "loss": 0.9163, + "step": 541 + }, + { + "epoch": 0.029221479404787577, + "grad_norm": 0.8999170064926147, + "learning_rate": 9.999121930393227e-06, + "loss": 0.883, + "step": 542 + }, + { + "epoch": 0.029275393573431098, + "grad_norm": 0.9732702970504761, + "learning_rate": 9.999117952850009e-06, + "loss": 0.9168, + "step": 543 + }, + { + "epoch": 0.029329307742074616, + "grad_norm": 1.00196373462677, + "learning_rate": 9.99911396631905e-06, + "loss": 0.826, + "step": 544 + }, + { + "epoch": 0.029383221910718137, + "grad_norm": 0.9776156544685364, + "learning_rate": 9.999109970800358e-06, + "loss": 0.8176, + "step": 545 + }, + { + "epoch": 0.029437136079361655, + "grad_norm": 1.0503387451171875, + "learning_rate": 9.99910596629394e-06, + "loss": 0.8617, + "step": 546 + }, + { + "epoch": 0.029491050248005177, + "grad_norm": 0.9195687174797058, + "learning_rate": 9.999101952799805e-06, + "loss": 0.8224, + "step": 547 + }, + { + "epoch": 0.029544964416648695, + "grad_norm": 0.8746809959411621, + "learning_rate": 9.999097930317959e-06, + "loss": 0.8407, + "step": 548 + }, + { + "epoch": 0.029598878585292216, + "grad_norm": 0.9035898447036743, + "learning_rate": 9.999093898848407e-06, + "loss": 0.8344, + "step": 549 + }, + { + "epoch": 
0.029652792753935734, + "grad_norm": 0.8764795064926147, + "learning_rate": 9.99908985839116e-06, + "loss": 0.8323, + "step": 550 + }, + { + "epoch": 0.029706706922579255, + "grad_norm": 0.9654614329338074, + "learning_rate": 9.999085808946224e-06, + "loss": 0.8696, + "step": 551 + }, + { + "epoch": 0.029760621091222773, + "grad_norm": 1.1295796632766724, + "learning_rate": 9.999081750513606e-06, + "loss": 0.9608, + "step": 552 + }, + { + "epoch": 0.02981453525986629, + "grad_norm": 0.9591107368469238, + "learning_rate": 9.999077683093313e-06, + "loss": 0.8762, + "step": 553 + }, + { + "epoch": 0.029868449428509813, + "grad_norm": 0.8287899494171143, + "learning_rate": 9.999073606685353e-06, + "loss": 0.7265, + "step": 554 + }, + { + "epoch": 0.02992236359715333, + "grad_norm": 0.9429282546043396, + "learning_rate": 9.99906952128973e-06, + "loss": 0.8835, + "step": 555 + }, + { + "epoch": 0.029976277765796852, + "grad_norm": 0.9617370963096619, + "learning_rate": 9.999065426906459e-06, + "loss": 0.9138, + "step": 556 + }, + { + "epoch": 0.03003019193444037, + "grad_norm": 1.2346372604370117, + "learning_rate": 9.999061323535538e-06, + "loss": 0.831, + "step": 557 + }, + { + "epoch": 0.03008410610308389, + "grad_norm": 1.2413623332977295, + "learning_rate": 9.999057211176982e-06, + "loss": 1.0211, + "step": 558 + }, + { + "epoch": 0.03013802027172741, + "grad_norm": 0.98906010389328, + "learning_rate": 9.999053089830794e-06, + "loss": 0.7821, + "step": 559 + }, + { + "epoch": 0.03019193444037093, + "grad_norm": 0.96706622838974, + "learning_rate": 9.999048959496983e-06, + "loss": 0.8593, + "step": 560 + }, + { + "epoch": 0.03024584860901445, + "grad_norm": 0.9400071501731873, + "learning_rate": 9.999044820175556e-06, + "loss": 0.8731, + "step": 561 + }, + { + "epoch": 0.03029976277765797, + "grad_norm": 1.1276499032974243, + "learning_rate": 9.999040671866522e-06, + "loss": 0.86, + "step": 562 + }, + { + "epoch": 0.030353676946301488, + "grad_norm": 
0.8859087228775024, + "learning_rate": 9.999036514569885e-06, + "loss": 0.8274, + "step": 563 + }, + { + "epoch": 0.030407591114945006, + "grad_norm": 1.1617575883865356, + "learning_rate": 9.999032348285656e-06, + "loss": 1.0519, + "step": 564 + }, + { + "epoch": 0.030461505283588527, + "grad_norm": 0.9717594385147095, + "learning_rate": 9.99902817301384e-06, + "loss": 0.9276, + "step": 565 + }, + { + "epoch": 0.030515419452232045, + "grad_norm": 1.000722050666809, + "learning_rate": 9.999023988754446e-06, + "loss": 0.8714, + "step": 566 + }, + { + "epoch": 0.030569333620875567, + "grad_norm": 1.1744625568389893, + "learning_rate": 9.999019795507481e-06, + "loss": 1.0087, + "step": 567 + }, + { + "epoch": 0.030623247789519085, + "grad_norm": 1.0199978351593018, + "learning_rate": 9.999015593272953e-06, + "loss": 0.8537, + "step": 568 + }, + { + "epoch": 0.030677161958162606, + "grad_norm": 0.9232216477394104, + "learning_rate": 9.999011382050869e-06, + "loss": 0.8488, + "step": 569 + }, + { + "epoch": 0.030731076126806124, + "grad_norm": 0.9905959367752075, + "learning_rate": 9.99900716184124e-06, + "loss": 0.9048, + "step": 570 + }, + { + "epoch": 0.030784990295449646, + "grad_norm": 0.9921644330024719, + "learning_rate": 9.999002932644066e-06, + "loss": 0.9294, + "step": 571 + }, + { + "epoch": 0.030838904464093164, + "grad_norm": 1.1583740711212158, + "learning_rate": 9.99899869445936e-06, + "loss": 0.727, + "step": 572 + }, + { + "epoch": 0.03089281863273668, + "grad_norm": 0.906736433506012, + "learning_rate": 9.998994447287127e-06, + "loss": 0.7889, + "step": 573 + }, + { + "epoch": 0.030946732801380203, + "grad_norm": 0.9060770869255066, + "learning_rate": 9.998990191127379e-06, + "loss": 0.8493, + "step": 574 + }, + { + "epoch": 0.03100064697002372, + "grad_norm": 0.9094041585922241, + "learning_rate": 9.99898592598012e-06, + "loss": 0.8604, + "step": 575 + }, + { + "epoch": 0.031054561138667242, + "grad_norm": 1.0964977741241455, + "learning_rate": 
9.998981651845358e-06, + "loss": 0.8481, + "step": 576 + }, + { + "epoch": 0.03110847530731076, + "grad_norm": 0.9509627223014832, + "learning_rate": 9.998977368723102e-06, + "loss": 0.8601, + "step": 577 + }, + { + "epoch": 0.03116238947595428, + "grad_norm": 1.0108642578125, + "learning_rate": 9.998973076613359e-06, + "loss": 0.9076, + "step": 578 + }, + { + "epoch": 0.0312163036445978, + "grad_norm": 1.0268129110336304, + "learning_rate": 9.998968775516136e-06, + "loss": 0.8273, + "step": 579 + }, + { + "epoch": 0.03127021781324132, + "grad_norm": 0.968941867351532, + "learning_rate": 9.99896446543144e-06, + "loss": 0.8859, + "step": 580 + }, + { + "epoch": 0.03132413198188484, + "grad_norm": 0.936779260635376, + "learning_rate": 9.998960146359283e-06, + "loss": 0.8589, + "step": 581 + }, + { + "epoch": 0.03137804615052836, + "grad_norm": 0.9675167202949524, + "learning_rate": 9.998955818299667e-06, + "loss": 0.973, + "step": 582 + }, + { + "epoch": 0.03143196031917188, + "grad_norm": 0.9475553035736084, + "learning_rate": 9.998951481252604e-06, + "loss": 0.8936, + "step": 583 + }, + { + "epoch": 0.031485874487815396, + "grad_norm": 0.9130968451499939, + "learning_rate": 9.9989471352181e-06, + "loss": 0.7668, + "step": 584 + }, + { + "epoch": 0.031539788656458914, + "grad_norm": 0.8890071511268616, + "learning_rate": 9.998942780196164e-06, + "loss": 0.8971, + "step": 585 + }, + { + "epoch": 0.03159370282510244, + "grad_norm": 0.9298738837242126, + "learning_rate": 9.998938416186803e-06, + "loss": 0.9313, + "step": 586 + }, + { + "epoch": 0.03164761699374596, + "grad_norm": 1.0683361291885376, + "learning_rate": 9.998934043190025e-06, + "loss": 0.9018, + "step": 587 + }, + { + "epoch": 0.031701531162389475, + "grad_norm": 0.939253568649292, + "learning_rate": 9.99892966120584e-06, + "loss": 0.9119, + "step": 588 + }, + { + "epoch": 0.03175544533103299, + "grad_norm": 0.9245349764823914, + "learning_rate": 9.99892527023425e-06, + "loss": 0.9258, + "step": 589 + }, 
+ { + "epoch": 0.03180935949967652, + "grad_norm": 0.9318797588348389, + "learning_rate": 9.998920870275267e-06, + "loss": 0.9557, + "step": 590 + }, + { + "epoch": 0.031863273668320036, + "grad_norm": 0.8909592628479004, + "learning_rate": 9.998916461328899e-06, + "loss": 0.8122, + "step": 591 + }, + { + "epoch": 0.031917187836963554, + "grad_norm": 1.0637080669403076, + "learning_rate": 9.998912043395154e-06, + "loss": 0.9517, + "step": 592 + }, + { + "epoch": 0.03197110200560707, + "grad_norm": 0.881934642791748, + "learning_rate": 9.99890761647404e-06, + "loss": 0.8729, + "step": 593 + }, + { + "epoch": 0.032025016174250596, + "grad_norm": 0.8882094025611877, + "learning_rate": 9.998903180565562e-06, + "loss": 0.7943, + "step": 594 + }, + { + "epoch": 0.032078930342894114, + "grad_norm": 0.965085506439209, + "learning_rate": 9.99889873566973e-06, + "loss": 0.8894, + "step": 595 + }, + { + "epoch": 0.03213284451153763, + "grad_norm": 0.9679432511329651, + "learning_rate": 9.998894281786556e-06, + "loss": 0.854, + "step": 596 + }, + { + "epoch": 0.03218675868018115, + "grad_norm": 1.4454354047775269, + "learning_rate": 9.998889818916043e-06, + "loss": 0.9944, + "step": 597 + }, + { + "epoch": 0.03224067284882467, + "grad_norm": 0.9369311928749084, + "learning_rate": 9.998885347058198e-06, + "loss": 0.8699, + "step": 598 + }, + { + "epoch": 0.03229458701746819, + "grad_norm": 0.9014303088188171, + "learning_rate": 9.998880866213033e-06, + "loss": 0.8735, + "step": 599 + }, + { + "epoch": 0.03234850118611171, + "grad_norm": 0.989251971244812, + "learning_rate": 9.998876376380555e-06, + "loss": 0.8872, + "step": 600 + }, + { + "epoch": 0.03240241535475523, + "grad_norm": 1.0256885290145874, + "learning_rate": 9.99887187756077e-06, + "loss": 0.8787, + "step": 601 + }, + { + "epoch": 0.03245632952339875, + "grad_norm": 0.9560148119926453, + "learning_rate": 9.998867369753688e-06, + "loss": 0.8301, + "step": 602 + }, + { + "epoch": 0.03251024369204227, + "grad_norm": 
1.044754147529602, + "learning_rate": 9.998862852959316e-06, + "loss": 0.9286, + "step": 603 + }, + { + "epoch": 0.03256415786068579, + "grad_norm": 0.8769629597663879, + "learning_rate": 9.998858327177665e-06, + "loss": 0.7927, + "step": 604 + }, + { + "epoch": 0.03261807202932931, + "grad_norm": 0.9217430949211121, + "learning_rate": 9.99885379240874e-06, + "loss": 0.8327, + "step": 605 + }, + { + "epoch": 0.032671986197972826, + "grad_norm": 0.8202590942382812, + "learning_rate": 9.99884924865255e-06, + "loss": 0.7269, + "step": 606 + }, + { + "epoch": 0.032725900366616344, + "grad_norm": 0.9598796367645264, + "learning_rate": 9.998844695909102e-06, + "loss": 0.9329, + "step": 607 + }, + { + "epoch": 0.03277981453525987, + "grad_norm": 1.1016643047332764, + "learning_rate": 9.998840134178407e-06, + "loss": 0.9836, + "step": 608 + }, + { + "epoch": 0.032833728703903386, + "grad_norm": 0.9639281630516052, + "learning_rate": 9.998835563460471e-06, + "loss": 0.8475, + "step": 609 + }, + { + "epoch": 0.032887642872546904, + "grad_norm": 0.9266204833984375, + "learning_rate": 9.998830983755304e-06, + "loss": 0.7307, + "step": 610 + }, + { + "epoch": 0.03294155704119042, + "grad_norm": 0.9282877445220947, + "learning_rate": 9.99882639506291e-06, + "loss": 0.8163, + "step": 611 + }, + { + "epoch": 0.03299547120983395, + "grad_norm": 0.8939738869667053, + "learning_rate": 9.998821797383302e-06, + "loss": 0.6902, + "step": 612 + }, + { + "epoch": 0.033049385378477465, + "grad_norm": 0.9041041731834412, + "learning_rate": 9.998817190716488e-06, + "loss": 0.8735, + "step": 613 + }, + { + "epoch": 0.03310329954712098, + "grad_norm": 0.9973318576812744, + "learning_rate": 9.998812575062473e-06, + "loss": 0.9017, + "step": 614 + }, + { + "epoch": 0.0331572137157645, + "grad_norm": 1.0416412353515625, + "learning_rate": 9.998807950421268e-06, + "loss": 0.9293, + "step": 615 + }, + { + "epoch": 0.03321112788440802, + "grad_norm": 0.8686584234237671, + "learning_rate": 
9.998803316792882e-06, + "loss": 0.8585, + "step": 616 + }, + { + "epoch": 0.033265042053051544, + "grad_norm": 0.9907833337783813, + "learning_rate": 9.998798674177319e-06, + "loss": 0.9264, + "step": 617 + }, + { + "epoch": 0.03331895622169506, + "grad_norm": 0.9927001595497131, + "learning_rate": 9.998794022574592e-06, + "loss": 0.895, + "step": 618 + }, + { + "epoch": 0.03337287039033858, + "grad_norm": 0.9314623475074768, + "learning_rate": 9.998789361984707e-06, + "loss": 0.8353, + "step": 619 + }, + { + "epoch": 0.0334267845589821, + "grad_norm": 0.9768248796463013, + "learning_rate": 9.998784692407673e-06, + "loss": 0.8917, + "step": 620 + }, + { + "epoch": 0.03348069872762562, + "grad_norm": 0.9487942457199097, + "learning_rate": 9.998780013843498e-06, + "loss": 0.9022, + "step": 621 + }, + { + "epoch": 0.03353461289626914, + "grad_norm": 1.0376895666122437, + "learning_rate": 9.99877532629219e-06, + "loss": 0.7692, + "step": 622 + }, + { + "epoch": 0.03358852706491266, + "grad_norm": 1.021345853805542, + "learning_rate": 9.99877062975376e-06, + "loss": 1.0386, + "step": 623 + }, + { + "epoch": 0.033642441233556176, + "grad_norm": 0.9979421496391296, + "learning_rate": 9.998765924228214e-06, + "loss": 0.9209, + "step": 624 + }, + { + "epoch": 0.0336963554021997, + "grad_norm": 0.8552166819572449, + "learning_rate": 9.998761209715559e-06, + "loss": 0.8765, + "step": 625 + }, + { + "epoch": 0.03375026957084322, + "grad_norm": 0.9737898707389832, + "learning_rate": 9.998756486215809e-06, + "loss": 0.7459, + "step": 626 + }, + { + "epoch": 0.03380418373948674, + "grad_norm": 1.1067259311676025, + "learning_rate": 9.998751753728967e-06, + "loss": 0.8582, + "step": 627 + }, + { + "epoch": 0.033858097908130255, + "grad_norm": 1.0689613819122314, + "learning_rate": 9.998747012255044e-06, + "loss": 0.8523, + "step": 628 + }, + { + "epoch": 0.03391201207677377, + "grad_norm": 1.1880419254302979, + "learning_rate": 9.998742261794048e-06, + "loss": 0.9085, + "step": 
629 + }, + { + "epoch": 0.0339659262454173, + "grad_norm": 0.9569217562675476, + "learning_rate": 9.998737502345987e-06, + "loss": 0.9112, + "step": 630 + }, + { + "epoch": 0.034019840414060816, + "grad_norm": 0.9955928921699524, + "learning_rate": 9.99873273391087e-06, + "loss": 0.9166, + "step": 631 + }, + { + "epoch": 0.034073754582704334, + "grad_norm": 0.8906963467597961, + "learning_rate": 9.998727956488708e-06, + "loss": 0.882, + "step": 632 + }, + { + "epoch": 0.03412766875134785, + "grad_norm": 0.9241589307785034, + "learning_rate": 9.998723170079506e-06, + "loss": 0.8488, + "step": 633 + }, + { + "epoch": 0.03418158291999138, + "grad_norm": 0.9666005969047546, + "learning_rate": 9.998718374683271e-06, + "loss": 0.8432, + "step": 634 + }, + { + "epoch": 0.034235497088634895, + "grad_norm": 0.9036918878555298, + "learning_rate": 9.998713570300018e-06, + "loss": 0.7979, + "step": 635 + }, + { + "epoch": 0.03428941125727841, + "grad_norm": 0.8946508765220642, + "learning_rate": 9.998708756929751e-06, + "loss": 0.8854, + "step": 636 + }, + { + "epoch": 0.03434332542592193, + "grad_norm": 1.0300164222717285, + "learning_rate": 9.99870393457248e-06, + "loss": 0.9116, + "step": 637 + }, + { + "epoch": 0.03439723959456545, + "grad_norm": 1.0635035037994385, + "learning_rate": 9.998699103228214e-06, + "loss": 0.9138, + "step": 638 + }, + { + "epoch": 0.03445115376320897, + "grad_norm": 1.0362621545791626, + "learning_rate": 9.998694262896962e-06, + "loss": 1.0177, + "step": 639 + }, + { + "epoch": 0.03450506793185249, + "grad_norm": 0.9081454873085022, + "learning_rate": 9.99868941357873e-06, + "loss": 0.7802, + "step": 640 + }, + { + "epoch": 0.03455898210049601, + "grad_norm": 0.9943915605545044, + "learning_rate": 9.998684555273529e-06, + "loss": 0.9356, + "step": 641 + }, + { + "epoch": 0.03461289626913953, + "grad_norm": 0.9647786021232605, + "learning_rate": 9.998679687981367e-06, + "loss": 0.741, + "step": 642 + }, + { + "epoch": 0.03466681043778305, + 
"grad_norm": 0.9655315279960632, + "learning_rate": 9.998674811702255e-06, + "loss": 0.8644, + "step": 643 + }, + { + "epoch": 0.03472072460642657, + "grad_norm": 0.9162091612815857, + "learning_rate": 9.998669926436197e-06, + "loss": 0.8383, + "step": 644 + }, + { + "epoch": 0.03477463877507009, + "grad_norm": 0.9509754776954651, + "learning_rate": 9.998665032183207e-06, + "loss": 0.8066, + "step": 645 + }, + { + "epoch": 0.034828552943713606, + "grad_norm": 1.0545740127563477, + "learning_rate": 9.998660128943292e-06, + "loss": 0.8455, + "step": 646 + }, + { + "epoch": 0.03488246711235713, + "grad_norm": 1.0928760766983032, + "learning_rate": 9.998655216716458e-06, + "loss": 0.8708, + "step": 647 + }, + { + "epoch": 0.03493638128100065, + "grad_norm": 0.9743762016296387, + "learning_rate": 9.998650295502717e-06, + "loss": 0.878, + "step": 648 + }, + { + "epoch": 0.03499029544964417, + "grad_norm": 1.016741156578064, + "learning_rate": 9.998645365302077e-06, + "loss": 0.867, + "step": 649 + }, + { + "epoch": 0.035044209618287685, + "grad_norm": 1.125252366065979, + "learning_rate": 9.998640426114548e-06, + "loss": 0.9443, + "step": 650 + }, + { + "epoch": 0.0350981237869312, + "grad_norm": 0.9555762410163879, + "learning_rate": 9.998635477940135e-06, + "loss": 0.8353, + "step": 651 + }, + { + "epoch": 0.03515203795557473, + "grad_norm": 0.930173397064209, + "learning_rate": 9.998630520778851e-06, + "loss": 0.8383, + "step": 652 + }, + { + "epoch": 0.035205952124218245, + "grad_norm": 1.1592127084732056, + "learning_rate": 9.998625554630704e-06, + "loss": 0.9708, + "step": 653 + }, + { + "epoch": 0.03525986629286176, + "grad_norm": 0.9333894848823547, + "learning_rate": 9.998620579495701e-06, + "loss": 0.9055, + "step": 654 + }, + { + "epoch": 0.03531378046150528, + "grad_norm": 0.9495646357536316, + "learning_rate": 9.998615595373853e-06, + "loss": 0.7993, + "step": 655 + }, + { + "epoch": 0.035367694630148806, + "grad_norm": 1.0919233560562134, + "learning_rate": 
9.99861060226517e-06, + "loss": 0.8852, + "step": 656 + }, + { + "epoch": 0.035421608798792324, + "grad_norm": 0.907940685749054, + "learning_rate": 9.998605600169657e-06, + "loss": 0.8294, + "step": 657 + }, + { + "epoch": 0.03547552296743584, + "grad_norm": 1.0423756837844849, + "learning_rate": 9.998600589087328e-06, + "loss": 0.8758, + "step": 658 + }, + { + "epoch": 0.03552943713607936, + "grad_norm": 1.0387269258499146, + "learning_rate": 9.998595569018186e-06, + "loss": 0.9099, + "step": 659 + }, + { + "epoch": 0.03558335130472288, + "grad_norm": 0.9186104536056519, + "learning_rate": 9.998590539962245e-06, + "loss": 0.9025, + "step": 660 + }, + { + "epoch": 0.0356372654733664, + "grad_norm": 1.0173289775848389, + "learning_rate": 9.998585501919514e-06, + "loss": 0.8468, + "step": 661 + }, + { + "epoch": 0.03569117964200992, + "grad_norm": 0.9579570889472961, + "learning_rate": 9.998580454889996e-06, + "loss": 0.8542, + "step": 662 + }, + { + "epoch": 0.03574509381065344, + "grad_norm": 1.093515396118164, + "learning_rate": 9.99857539887371e-06, + "loss": 0.8932, + "step": 663 + }, + { + "epoch": 0.03579900797929696, + "grad_norm": 1.0651243925094604, + "learning_rate": 9.998570333870656e-06, + "loss": 0.8822, + "step": 664 + }, + { + "epoch": 0.03585292214794048, + "grad_norm": 0.973278284072876, + "learning_rate": 9.998565259880845e-06, + "loss": 0.8724, + "step": 665 + }, + { + "epoch": 0.035906836316584, + "grad_norm": 0.961321234703064, + "learning_rate": 9.998560176904291e-06, + "loss": 0.947, + "step": 666 + }, + { + "epoch": 0.03596075048522752, + "grad_norm": 1.0216654539108276, + "learning_rate": 9.998555084940999e-06, + "loss": 0.8528, + "step": 667 + }, + { + "epoch": 0.036014664653871035, + "grad_norm": 0.9917817711830139, + "learning_rate": 9.99854998399098e-06, + "loss": 0.8608, + "step": 668 + }, + { + "epoch": 0.03606857882251455, + "grad_norm": 1.0164326429367065, + "learning_rate": 9.998544874054243e-06, + "loss": 0.8752, + "step": 669 + 
}, + { + "epoch": 0.03612249299115808, + "grad_norm": 0.9181317687034607, + "learning_rate": 9.998539755130793e-06, + "loss": 0.8032, + "step": 670 + }, + { + "epoch": 0.036176407159801596, + "grad_norm": 1.0100011825561523, + "learning_rate": 9.998534627220646e-06, + "loss": 0.9205, + "step": 671 + }, + { + "epoch": 0.036230321328445114, + "grad_norm": 0.9306463599205017, + "learning_rate": 9.998529490323807e-06, + "loss": 0.8209, + "step": 672 + }, + { + "epoch": 0.03628423549708863, + "grad_norm": 1.8988754749298096, + "learning_rate": 9.998524344440286e-06, + "loss": 0.8455, + "step": 673 + }, + { + "epoch": 0.03633814966573216, + "grad_norm": 0.9742317795753479, + "learning_rate": 9.998519189570091e-06, + "loss": 0.8733, + "step": 674 + }, + { + "epoch": 0.036392063834375675, + "grad_norm": 0.9334224462509155, + "learning_rate": 9.998514025713234e-06, + "loss": 0.8761, + "step": 675 + }, + { + "epoch": 0.03644597800301919, + "grad_norm": 0.9729838371276855, + "learning_rate": 9.998508852869724e-06, + "loss": 0.8916, + "step": 676 + }, + { + "epoch": 0.03649989217166271, + "grad_norm": 0.9721505641937256, + "learning_rate": 9.998503671039568e-06, + "loss": 0.8735, + "step": 677 + }, + { + "epoch": 0.036553806340306236, + "grad_norm": 0.9600850939750671, + "learning_rate": 9.998498480222775e-06, + "loss": 0.9157, + "step": 678 + }, + { + "epoch": 0.036607720508949754, + "grad_norm": 0.9010732173919678, + "learning_rate": 9.998493280419358e-06, + "loss": 0.9215, + "step": 679 + }, + { + "epoch": 0.03666163467759327, + "grad_norm": 0.8708087801933289, + "learning_rate": 9.998488071629324e-06, + "loss": 0.7218, + "step": 680 + }, + { + "epoch": 0.03671554884623679, + "grad_norm": 0.9739180207252502, + "learning_rate": 9.998482853852682e-06, + "loss": 0.8845, + "step": 681 + }, + { + "epoch": 0.03676946301488031, + "grad_norm": 0.9823595881462097, + "learning_rate": 9.998477627089443e-06, + "loss": 0.896, + "step": 682 + }, + { + "epoch": 0.03682337718352383, + 
"grad_norm": 0.9629859328269958, + "learning_rate": 9.998472391339612e-06, + "loss": 0.8636, + "step": 683 + }, + { + "epoch": 0.03687729135216735, + "grad_norm": 0.8644251823425293, + "learning_rate": 9.998467146603206e-06, + "loss": 0.9124, + "step": 684 + }, + { + "epoch": 0.03693120552081087, + "grad_norm": 0.8987632989883423, + "learning_rate": 9.99846189288023e-06, + "loss": 0.801, + "step": 685 + }, + { + "epoch": 0.036985119689454386, + "grad_norm": 0.9017630219459534, + "learning_rate": 9.99845663017069e-06, + "loss": 0.8675, + "step": 686 + }, + { + "epoch": 0.03703903385809791, + "grad_norm": 0.8905850648880005, + "learning_rate": 9.998451358474603e-06, + "loss": 0.8512, + "step": 687 + }, + { + "epoch": 0.03709294802674143, + "grad_norm": 0.9807800650596619, + "learning_rate": 9.998446077791972e-06, + "loss": 0.9258, + "step": 688 + }, + { + "epoch": 0.03714686219538495, + "grad_norm": 0.8916336894035339, + "learning_rate": 9.99844078812281e-06, + "loss": 0.8236, + "step": 689 + }, + { + "epoch": 0.037200776364028465, + "grad_norm": 0.9330187439918518, + "learning_rate": 9.998435489467126e-06, + "loss": 0.7812, + "step": 690 + }, + { + "epoch": 0.03725469053267198, + "grad_norm": 0.9859142899513245, + "learning_rate": 9.99843018182493e-06, + "loss": 0.8699, + "step": 691 + }, + { + "epoch": 0.03730860470131551, + "grad_norm": 0.9277002215385437, + "learning_rate": 9.998424865196228e-06, + "loss": 0.9276, + "step": 692 + }, + { + "epoch": 0.037362518869959026, + "grad_norm": 0.9764281511306763, + "learning_rate": 9.998419539581034e-06, + "loss": 0.9482, + "step": 693 + }, + { + "epoch": 0.037416433038602544, + "grad_norm": 1.0108616352081299, + "learning_rate": 9.998414204979357e-06, + "loss": 0.8582, + "step": 694 + }, + { + "epoch": 0.03747034720724606, + "grad_norm": 1.2767362594604492, + "learning_rate": 9.998408861391202e-06, + "loss": 0.7833, + "step": 695 + }, + { + "epoch": 0.03752426137588959, + "grad_norm": 0.8874560594558716, + 
"learning_rate": 9.998403508816585e-06, + "loss": 0.8935, + "step": 696 + }, + { + "epoch": 0.037578175544533104, + "grad_norm": 0.8549458980560303, + "learning_rate": 9.998398147255511e-06, + "loss": 0.7747, + "step": 697 + }, + { + "epoch": 0.03763208971317662, + "grad_norm": 0.9971988201141357, + "learning_rate": 9.998392776707993e-06, + "loss": 0.753, + "step": 698 + }, + { + "epoch": 0.03768600388182014, + "grad_norm": 0.9822113513946533, + "learning_rate": 9.998387397174037e-06, + "loss": 0.9121, + "step": 699 + }, + { + "epoch": 0.037739918050463665, + "grad_norm": 0.996151864528656, + "learning_rate": 9.998382008653656e-06, + "loss": 0.9356, + "step": 700 + }, + { + "epoch": 0.03779383221910718, + "grad_norm": 1.7505156993865967, + "learning_rate": 9.998376611146857e-06, + "loss": 0.8351, + "step": 701 + }, + { + "epoch": 0.0378477463877507, + "grad_norm": 1.070356011390686, + "learning_rate": 9.998371204653651e-06, + "loss": 0.9153, + "step": 702 + }, + { + "epoch": 0.03790166055639422, + "grad_norm": 0.9383741617202759, + "learning_rate": 9.998365789174048e-06, + "loss": 0.8904, + "step": 703 + }, + { + "epoch": 0.03795557472503774, + "grad_norm": 0.8444882035255432, + "learning_rate": 9.998360364708058e-06, + "loss": 0.8243, + "step": 704 + }, + { + "epoch": 0.03800948889368126, + "grad_norm": 1.0012257099151611, + "learning_rate": 9.99835493125569e-06, + "loss": 0.9439, + "step": 705 + }, + { + "epoch": 0.03806340306232478, + "grad_norm": 0.9745193719863892, + "learning_rate": 9.998349488816954e-06, + "loss": 0.8667, + "step": 706 + }, + { + "epoch": 0.0381173172309683, + "grad_norm": 0.8363852500915527, + "learning_rate": 9.998344037391859e-06, + "loss": 0.8082, + "step": 707 + }, + { + "epoch": 0.038171231399611816, + "grad_norm": 0.9389918446540833, + "learning_rate": 9.998338576980417e-06, + "loss": 0.8113, + "step": 708 + }, + { + "epoch": 0.03822514556825534, + "grad_norm": 0.9216110110282898, + "learning_rate": 9.998333107582635e-06, + "loss": 
0.8179, + "step": 709 + }, + { + "epoch": 0.03827905973689886, + "grad_norm": 1.0292471647262573, + "learning_rate": 9.998327629198526e-06, + "loss": 0.8605, + "step": 710 + }, + { + "epoch": 0.03833297390554238, + "grad_norm": 0.9812708497047424, + "learning_rate": 9.998322141828097e-06, + "loss": 0.9279, + "step": 711 + }, + { + "epoch": 0.038386888074185894, + "grad_norm": 0.8186620473861694, + "learning_rate": 9.998316645471358e-06, + "loss": 0.7877, + "step": 712 + }, + { + "epoch": 0.03844080224282941, + "grad_norm": 1.034134864807129, + "learning_rate": 9.99831114012832e-06, + "loss": 0.9867, + "step": 713 + }, + { + "epoch": 0.03849471641147294, + "grad_norm": 1.1604938507080078, + "learning_rate": 9.998305625798993e-06, + "loss": 0.9134, + "step": 714 + }, + { + "epoch": 0.038548630580116455, + "grad_norm": 0.8452483415603638, + "learning_rate": 9.998300102483388e-06, + "loss": 0.8732, + "step": 715 + }, + { + "epoch": 0.03860254474875997, + "grad_norm": 0.8881269693374634, + "learning_rate": 9.998294570181512e-06, + "loss": 0.847, + "step": 716 + }, + { + "epoch": 0.03865645891740349, + "grad_norm": 0.8822013735771179, + "learning_rate": 9.998289028893375e-06, + "loss": 0.8404, + "step": 717 + }, + { + "epoch": 0.038710373086047016, + "grad_norm": 1.0011916160583496, + "learning_rate": 9.998283478618991e-06, + "loss": 0.8133, + "step": 718 + }, + { + "epoch": 0.038764287254690534, + "grad_norm": 1.0004018545150757, + "learning_rate": 9.998277919358367e-06, + "loss": 0.9556, + "step": 719 + }, + { + "epoch": 0.03881820142333405, + "grad_norm": 0.8176954984664917, + "learning_rate": 9.998272351111513e-06, + "loss": 0.7977, + "step": 720 + }, + { + "epoch": 0.03887211559197757, + "grad_norm": 0.9160690307617188, + "learning_rate": 9.99826677387844e-06, + "loss": 0.9239, + "step": 721 + }, + { + "epoch": 0.03892602976062109, + "grad_norm": 1.2158405780792236, + "learning_rate": 9.998261187659157e-06, + "loss": 0.9023, + "step": 722 + }, + { + "epoch": 
0.03897994392926461, + "grad_norm": 0.9564448595046997, + "learning_rate": 9.998255592453674e-06, + "loss": 0.8585, + "step": 723 + }, + { + "epoch": 0.03903385809790813, + "grad_norm": 0.8902252316474915, + "learning_rate": 9.998249988262002e-06, + "loss": 0.8388, + "step": 724 + }, + { + "epoch": 0.03908777226655165, + "grad_norm": 0.8738620281219482, + "learning_rate": 9.998244375084152e-06, + "loss": 0.9545, + "step": 725 + }, + { + "epoch": 0.03914168643519517, + "grad_norm": 0.9670735001564026, + "learning_rate": 9.99823875292013e-06, + "loss": 0.8335, + "step": 726 + }, + { + "epoch": 0.03919560060383869, + "grad_norm": 0.8719429969787598, + "learning_rate": 9.998233121769952e-06, + "loss": 0.8546, + "step": 727 + }, + { + "epoch": 0.03924951477248221, + "grad_norm": 1.318429708480835, + "learning_rate": 9.998227481633622e-06, + "loss": 1.0658, + "step": 728 + }, + { + "epoch": 0.03930342894112573, + "grad_norm": 0.962630569934845, + "learning_rate": 9.998221832511155e-06, + "loss": 0.9049, + "step": 729 + }, + { + "epoch": 0.039357343109769245, + "grad_norm": 0.9639857411384583, + "learning_rate": 9.998216174402558e-06, + "loss": 0.9114, + "step": 730 + }, + { + "epoch": 0.03941125727841277, + "grad_norm": 1.1621571779251099, + "learning_rate": 9.998210507307843e-06, + "loss": 0.8776, + "step": 731 + }, + { + "epoch": 0.03946517144705629, + "grad_norm": 1.170089840888977, + "learning_rate": 9.998204831227019e-06, + "loss": 0.9928, + "step": 732 + }, + { + "epoch": 0.039519085615699806, + "grad_norm": 0.8257297873497009, + "learning_rate": 9.998199146160098e-06, + "loss": 0.7885, + "step": 733 + }, + { + "epoch": 0.039572999784343324, + "grad_norm": 0.8887513279914856, + "learning_rate": 9.998193452107088e-06, + "loss": 0.8389, + "step": 734 + }, + { + "epoch": 0.03962691395298684, + "grad_norm": 0.9321185350418091, + "learning_rate": 9.998187749068001e-06, + "loss": 0.9083, + "step": 735 + }, + { + "epoch": 0.03968082812163037, + "grad_norm": 
0.9926772713661194, + "learning_rate": 9.998182037042847e-06, + "loss": 0.9102, + "step": 736 + }, + { + "epoch": 0.039734742290273885, + "grad_norm": 1.0760009288787842, + "learning_rate": 9.998176316031634e-06, + "loss": 0.7781, + "step": 737 + }, + { + "epoch": 0.0397886564589174, + "grad_norm": 1.0998133420944214, + "learning_rate": 9.998170586034376e-06, + "loss": 0.9725, + "step": 738 + }, + { + "epoch": 0.03984257062756092, + "grad_norm": 0.9367475509643555, + "learning_rate": 9.99816484705108e-06, + "loss": 0.8277, + "step": 739 + }, + { + "epoch": 0.039896484796204446, + "grad_norm": 0.942954957485199, + "learning_rate": 9.998159099081758e-06, + "loss": 0.8542, + "step": 740 + }, + { + "epoch": 0.039950398964847963, + "grad_norm": 0.9841166138648987, + "learning_rate": 9.998153342126421e-06, + "loss": 0.9179, + "step": 741 + }, + { + "epoch": 0.04000431313349148, + "grad_norm": 0.9215245246887207, + "learning_rate": 9.998147576185077e-06, + "loss": 0.8899, + "step": 742 + }, + { + "epoch": 0.040058227302135, + "grad_norm": 1.0368192195892334, + "learning_rate": 9.998141801257739e-06, + "loss": 0.9828, + "step": 743 + }, + { + "epoch": 0.04011214147077852, + "grad_norm": 0.9696660041809082, + "learning_rate": 9.998136017344416e-06, + "loss": 0.9431, + "step": 744 + }, + { + "epoch": 0.04016605563942204, + "grad_norm": 1.111257791519165, + "learning_rate": 9.998130224445117e-06, + "loss": 0.9666, + "step": 745 + }, + { + "epoch": 0.04021996980806556, + "grad_norm": 0.9260644316673279, + "learning_rate": 9.998124422559856e-06, + "loss": 0.8941, + "step": 746 + }, + { + "epoch": 0.04027388397670908, + "grad_norm": 0.8622020483016968, + "learning_rate": 9.99811861168864e-06, + "loss": 0.8148, + "step": 747 + }, + { + "epoch": 0.040327798145352596, + "grad_norm": 0.8767471313476562, + "learning_rate": 9.998112791831483e-06, + "loss": 0.7093, + "step": 748 + }, + { + "epoch": 0.04038171231399612, + "grad_norm": 0.902917206287384, + "learning_rate": 
9.998106962988391e-06, + "loss": 0.7677, + "step": 749 + }, + { + "epoch": 0.04043562648263964, + "grad_norm": 1.351694941520691, + "learning_rate": 9.998101125159377e-06, + "loss": 1.0382, + "step": 750 + }, + { + "epoch": 0.04048954065128316, + "grad_norm": 0.8547930121421814, + "learning_rate": 9.998095278344452e-06, + "loss": 0.7974, + "step": 751 + }, + { + "epoch": 0.040543454819926675, + "grad_norm": 0.941149115562439, + "learning_rate": 9.998089422543626e-06, + "loss": 0.8518, + "step": 752 + }, + { + "epoch": 0.0405973689885702, + "grad_norm": 0.8671521544456482, + "learning_rate": 9.998083557756908e-06, + "loss": 0.8049, + "step": 753 + }, + { + "epoch": 0.04065128315721372, + "grad_norm": 0.9877942800521851, + "learning_rate": 9.998077683984311e-06, + "loss": 0.8874, + "step": 754 + }, + { + "epoch": 0.040705197325857236, + "grad_norm": 1.2130393981933594, + "learning_rate": 9.998071801225843e-06, + "loss": 0.9794, + "step": 755 + }, + { + "epoch": 0.040759111494500753, + "grad_norm": 0.9422823786735535, + "learning_rate": 9.998065909481518e-06, + "loss": 0.899, + "step": 756 + }, + { + "epoch": 0.04081302566314427, + "grad_norm": 0.9770492911338806, + "learning_rate": 9.998060008751343e-06, + "loss": 0.8434, + "step": 757 + }, + { + "epoch": 0.040866939831787796, + "grad_norm": 0.9227531552314758, + "learning_rate": 9.998054099035332e-06, + "loss": 0.8797, + "step": 758 + }, + { + "epoch": 0.040920854000431314, + "grad_norm": 1.0452102422714233, + "learning_rate": 9.998048180333492e-06, + "loss": 0.8702, + "step": 759 + }, + { + "epoch": 0.04097476816907483, + "grad_norm": 1.034125566482544, + "learning_rate": 9.998042252645837e-06, + "loss": 0.9041, + "step": 760 + }, + { + "epoch": 0.04102868233771835, + "grad_norm": 0.886029064655304, + "learning_rate": 9.998036315972375e-06, + "loss": 0.7805, + "step": 761 + }, + { + "epoch": 0.041082596506361875, + "grad_norm": 0.9845888614654541, + "learning_rate": 9.998030370313116e-06, + "loss": 0.9836, + 
"step": 762 + }, + { + "epoch": 0.04113651067500539, + "grad_norm": 0.9223973155021667, + "learning_rate": 9.998024415668075e-06, + "loss": 0.768, + "step": 763 + }, + { + "epoch": 0.04119042484364891, + "grad_norm": 1.0607362985610962, + "learning_rate": 9.99801845203726e-06, + "loss": 0.865, + "step": 764 + }, + { + "epoch": 0.04124433901229243, + "grad_norm": 0.9620907306671143, + "learning_rate": 9.998012479420683e-06, + "loss": 0.7645, + "step": 765 + }, + { + "epoch": 0.04129825318093595, + "grad_norm": 0.9490310549736023, + "learning_rate": 9.99800649781835e-06, + "loss": 0.9124, + "step": 766 + }, + { + "epoch": 0.04135216734957947, + "grad_norm": 0.9684557914733887, + "learning_rate": 9.99800050723028e-06, + "loss": 0.876, + "step": 767 + }, + { + "epoch": 0.04140608151822299, + "grad_norm": 0.9633080959320068, + "learning_rate": 9.997994507656476e-06, + "loss": 0.8976, + "step": 768 + }, + { + "epoch": 0.04145999568686651, + "grad_norm": 0.9495208263397217, + "learning_rate": 9.997988499096953e-06, + "loss": 0.9049, + "step": 769 + }, + { + "epoch": 0.041513909855510026, + "grad_norm": 1.0614326000213623, + "learning_rate": 9.997982481551721e-06, + "loss": 0.905, + "step": 770 + }, + { + "epoch": 0.04156782402415355, + "grad_norm": 0.820672869682312, + "learning_rate": 9.99797645502079e-06, + "loss": 0.8306, + "step": 771 + }, + { + "epoch": 0.04162173819279707, + "grad_norm": 0.9719771146774292, + "learning_rate": 9.997970419504171e-06, + "loss": 0.828, + "step": 772 + }, + { + "epoch": 0.041675652361440586, + "grad_norm": 0.893326997756958, + "learning_rate": 9.997964375001875e-06, + "loss": 0.8416, + "step": 773 + }, + { + "epoch": 0.041729566530084104, + "grad_norm": 0.858121395111084, + "learning_rate": 9.997958321513915e-06, + "loss": 0.8779, + "step": 774 + }, + { + "epoch": 0.04178348069872762, + "grad_norm": 0.9703636765480042, + "learning_rate": 9.997952259040297e-06, + "loss": 0.8623, + "step": 775 + }, + { + "epoch": 0.04183739486737115, + 
"grad_norm": 0.9626398086547852, + "learning_rate": 9.997946187581039e-06, + "loss": 0.8309, + "step": 776 + }, + { + "epoch": 0.041891309036014665, + "grad_norm": 0.9132344722747803, + "learning_rate": 9.997940107136143e-06, + "loss": 0.8798, + "step": 777 + }, + { + "epoch": 0.04194522320465818, + "grad_norm": 0.9608821272850037, + "learning_rate": 9.997934017705629e-06, + "loss": 0.8764, + "step": 778 + }, + { + "epoch": 0.0419991373733017, + "grad_norm": 1.0852513313293457, + "learning_rate": 9.997927919289501e-06, + "loss": 0.8908, + "step": 779 + }, + { + "epoch": 0.042053051541945226, + "grad_norm": 0.9690573215484619, + "learning_rate": 9.997921811887774e-06, + "loss": 0.8556, + "step": 780 + }, + { + "epoch": 0.042106965710588744, + "grad_norm": 0.9107050895690918, + "learning_rate": 9.997915695500458e-06, + "loss": 0.9249, + "step": 781 + }, + { + "epoch": 0.04216087987923226, + "grad_norm": 1.029974102973938, + "learning_rate": 9.997909570127564e-06, + "loss": 0.8369, + "step": 782 + }, + { + "epoch": 0.04221479404787578, + "grad_norm": 0.8179258704185486, + "learning_rate": 9.997903435769101e-06, + "loss": 0.7729, + "step": 783 + }, + { + "epoch": 0.042268708216519305, + "grad_norm": 1.0664961338043213, + "learning_rate": 9.997897292425082e-06, + "loss": 0.8815, + "step": 784 + }, + { + "epoch": 0.04232262238516282, + "grad_norm": 0.9794465899467468, + "learning_rate": 9.997891140095519e-06, + "loss": 0.9244, + "step": 785 + }, + { + "epoch": 0.04237653655380634, + "grad_norm": 0.875953197479248, + "learning_rate": 9.99788497878042e-06, + "loss": 0.9191, + "step": 786 + }, + { + "epoch": 0.04243045072244986, + "grad_norm": 0.9880902767181396, + "learning_rate": 9.9978788084798e-06, + "loss": 0.8639, + "step": 787 + }, + { + "epoch": 0.042484364891093376, + "grad_norm": 1.0391566753387451, + "learning_rate": 9.997872629193666e-06, + "loss": 0.9943, + "step": 788 + }, + { + "epoch": 0.0425382790597369, + "grad_norm": 0.9321290850639343, + "learning_rate": 
9.997866440922033e-06, + "loss": 0.7809, + "step": 789 + }, + { + "epoch": 0.04259219322838042, + "grad_norm": 0.8898556232452393, + "learning_rate": 9.99786024366491e-06, + "loss": 0.9353, + "step": 790 + }, + { + "epoch": 0.04264610739702394, + "grad_norm": 1.1177983283996582, + "learning_rate": 9.997854037422306e-06, + "loss": 0.8157, + "step": 791 + }, + { + "epoch": 0.042700021565667455, + "grad_norm": 0.8821296691894531, + "learning_rate": 9.997847822194236e-06, + "loss": 0.8729, + "step": 792 + }, + { + "epoch": 0.04275393573431098, + "grad_norm": 0.8545325398445129, + "learning_rate": 9.997841597980709e-06, + "loss": 0.8415, + "step": 793 + }, + { + "epoch": 0.0428078499029545, + "grad_norm": 0.9313606023788452, + "learning_rate": 9.997835364781739e-06, + "loss": 0.8411, + "step": 794 + }, + { + "epoch": 0.042861764071598016, + "grad_norm": 0.9587781429290771, + "learning_rate": 9.997829122597332e-06, + "loss": 0.8086, + "step": 795 + }, + { + "epoch": 0.042915678240241534, + "grad_norm": 0.9708360433578491, + "learning_rate": 9.997822871427504e-06, + "loss": 0.8715, + "step": 796 + }, + { + "epoch": 0.04296959240888505, + "grad_norm": 0.8868080973625183, + "learning_rate": 9.997816611272265e-06, + "loss": 0.8549, + "step": 797 + }, + { + "epoch": 0.04302350657752858, + "grad_norm": 0.9147778153419495, + "learning_rate": 9.997810342131624e-06, + "loss": 0.7854, + "step": 798 + }, + { + "epoch": 0.043077420746172095, + "grad_norm": 0.9853960275650024, + "learning_rate": 9.997804064005596e-06, + "loss": 0.8243, + "step": 799 + }, + { + "epoch": 0.04313133491481561, + "grad_norm": 1.0076130628585815, + "learning_rate": 9.997797776894189e-06, + "loss": 0.9077, + "step": 800 + }, + { + "epoch": 0.04318524908345913, + "grad_norm": 0.9694076776504517, + "learning_rate": 9.997791480797417e-06, + "loss": 0.8767, + "step": 801 + }, + { + "epoch": 0.043239163252102655, + "grad_norm": 1.114001750946045, + "learning_rate": 9.99778517571529e-06, + "loss": 0.8211, + 
"step": 802 + }, + { + "epoch": 0.04329307742074617, + "grad_norm": 0.9701128005981445, + "learning_rate": 9.997778861647817e-06, + "loss": 0.9084, + "step": 803 + }, + { + "epoch": 0.04334699158938969, + "grad_norm": 0.868299126625061, + "learning_rate": 9.997772538595015e-06, + "loss": 0.7556, + "step": 804 + }, + { + "epoch": 0.04340090575803321, + "grad_norm": 0.9160446524620056, + "learning_rate": 9.997766206556888e-06, + "loss": 0.821, + "step": 805 + }, + { + "epoch": 0.043454819926676734, + "grad_norm": 0.934198260307312, + "learning_rate": 9.997759865533454e-06, + "loss": 0.9113, + "step": 806 + }, + { + "epoch": 0.04350873409532025, + "grad_norm": 0.8949079513549805, + "learning_rate": 9.997753515524722e-06, + "loss": 0.7821, + "step": 807 + }, + { + "epoch": 0.04356264826396377, + "grad_norm": 0.9035944938659668, + "learning_rate": 9.997747156530702e-06, + "loss": 0.8233, + "step": 808 + }, + { + "epoch": 0.04361656243260729, + "grad_norm": 0.9681552052497864, + "learning_rate": 9.99774078855141e-06, + "loss": 0.9241, + "step": 809 + }, + { + "epoch": 0.043670476601250806, + "grad_norm": 0.906092643737793, + "learning_rate": 9.99773441158685e-06, + "loss": 0.8948, + "step": 810 + }, + { + "epoch": 0.04372439076989433, + "grad_norm": 0.9229143261909485, + "learning_rate": 9.997728025637039e-06, + "loss": 0.8897, + "step": 811 + }, + { + "epoch": 0.04377830493853785, + "grad_norm": 0.9263061881065369, + "learning_rate": 9.997721630701986e-06, + "loss": 0.7923, + "step": 812 + }, + { + "epoch": 0.04383221910718137, + "grad_norm": 0.8474372029304504, + "learning_rate": 9.997715226781706e-06, + "loss": 0.796, + "step": 813 + }, + { + "epoch": 0.043886133275824885, + "grad_norm": 0.9960548877716064, + "learning_rate": 9.997708813876206e-06, + "loss": 0.9166, + "step": 814 + }, + { + "epoch": 0.04394004744446841, + "grad_norm": 0.9843032956123352, + "learning_rate": 9.997702391985499e-06, + "loss": 0.9354, + "step": 815 + }, + { + "epoch": 0.04399396161311193, 
+ "grad_norm": 0.9313154220581055, + "learning_rate": 9.997695961109599e-06, + "loss": 0.8972, + "step": 816 + }, + { + "epoch": 0.044047875781755445, + "grad_norm": 0.8846973180770874, + "learning_rate": 9.997689521248515e-06, + "loss": 0.8599, + "step": 817 + }, + { + "epoch": 0.04410178995039896, + "grad_norm": 0.8113641738891602, + "learning_rate": 9.99768307240226e-06, + "loss": 0.8509, + "step": 818 + }, + { + "epoch": 0.04415570411904248, + "grad_norm": 1.0659984350204468, + "learning_rate": 9.997676614570844e-06, + "loss": 0.938, + "step": 819 + }, + { + "epoch": 0.044209618287686006, + "grad_norm": 0.9183745384216309, + "learning_rate": 9.99767014775428e-06, + "loss": 0.8761, + "step": 820 + }, + { + "epoch": 0.044263532456329524, + "grad_norm": 0.87090003490448, + "learning_rate": 9.997663671952578e-06, + "loss": 0.8535, + "step": 821 + }, + { + "epoch": 0.04431744662497304, + "grad_norm": 0.9857214093208313, + "learning_rate": 9.997657187165753e-06, + "loss": 0.9434, + "step": 822 + }, + { + "epoch": 0.04437136079361656, + "grad_norm": 1.0443209409713745, + "learning_rate": 9.997650693393812e-06, + "loss": 0.8994, + "step": 823 + }, + { + "epoch": 0.044425274962260085, + "grad_norm": 0.8348391652107239, + "learning_rate": 9.99764419063677e-06, + "loss": 0.8383, + "step": 824 + }, + { + "epoch": 0.0444791891309036, + "grad_norm": 1.2708821296691895, + "learning_rate": 9.997637678894639e-06, + "loss": 0.8733, + "step": 825 + }, + { + "epoch": 0.04453310329954712, + "grad_norm": 0.9863126277923584, + "learning_rate": 9.997631158167428e-06, + "loss": 0.9364, + "step": 826 + }, + { + "epoch": 0.04458701746819064, + "grad_norm": 1.0223352909088135, + "learning_rate": 9.99762462845515e-06, + "loss": 0.9139, + "step": 827 + }, + { + "epoch": 0.04464093163683416, + "grad_norm": 0.8559738397598267, + "learning_rate": 9.997618089757818e-06, + "loss": 0.7461, + "step": 828 + }, + { + "epoch": 0.04469484580547768, + "grad_norm": 0.9347368478775024, + "learning_rate": 
9.997611542075442e-06, + "loss": 0.9275, + "step": 829 + }, + { + "epoch": 0.0447487599741212, + "grad_norm": 1.0208019018173218, + "learning_rate": 9.997604985408036e-06, + "loss": 0.8338, + "step": 830 + }, + { + "epoch": 0.04480267414276472, + "grad_norm": 0.9792174100875854, + "learning_rate": 9.997598419755607e-06, + "loss": 0.9437, + "step": 831 + }, + { + "epoch": 0.044856588311408235, + "grad_norm": 0.851665198802948, + "learning_rate": 9.997591845118173e-06, + "loss": 0.8008, + "step": 832 + }, + { + "epoch": 0.04491050248005176, + "grad_norm": 0.9315025806427002, + "learning_rate": 9.997585261495742e-06, + "loss": 0.8389, + "step": 833 + }, + { + "epoch": 0.04496441664869528, + "grad_norm": 0.9658921360969543, + "learning_rate": 9.997578668888326e-06, + "loss": 0.9252, + "step": 834 + }, + { + "epoch": 0.045018330817338796, + "grad_norm": 0.8989397287368774, + "learning_rate": 9.997572067295938e-06, + "loss": 0.8648, + "step": 835 + }, + { + "epoch": 0.045072244985982314, + "grad_norm": 0.8874988555908203, + "learning_rate": 9.99756545671859e-06, + "loss": 0.7801, + "step": 836 + }, + { + "epoch": 0.04512615915462584, + "grad_norm": 0.9186223745346069, + "learning_rate": 9.997558837156293e-06, + "loss": 0.767, + "step": 837 + }, + { + "epoch": 0.04518007332326936, + "grad_norm": 1.163044810295105, + "learning_rate": 9.997552208609059e-06, + "loss": 0.8938, + "step": 838 + }, + { + "epoch": 0.045233987491912875, + "grad_norm": 0.8315468430519104, + "learning_rate": 9.997545571076901e-06, + "loss": 0.725, + "step": 839 + }, + { + "epoch": 0.04528790166055639, + "grad_norm": 1.0088660717010498, + "learning_rate": 9.99753892455983e-06, + "loss": 0.8533, + "step": 840 + }, + { + "epoch": 0.04534181582919991, + "grad_norm": 0.9268692135810852, + "learning_rate": 9.997532269057857e-06, + "loss": 0.8739, + "step": 841 + }, + { + "epoch": 0.045395729997843436, + "grad_norm": 1.0793242454528809, + "learning_rate": 9.997525604570995e-06, + "loss": 0.9605, + "step": 
842 + }, + { + "epoch": 0.045449644166486954, + "grad_norm": 1.101798176765442, + "learning_rate": 9.997518931099258e-06, + "loss": 0.9525, + "step": 843 + }, + { + "epoch": 0.04550355833513047, + "grad_norm": 0.9046466946601868, + "learning_rate": 9.997512248642654e-06, + "loss": 0.8853, + "step": 844 + }, + { + "epoch": 0.04555747250377399, + "grad_norm": 0.9629097580909729, + "learning_rate": 9.997505557201198e-06, + "loss": 0.8882, + "step": 845 + }, + { + "epoch": 0.045611386672417514, + "grad_norm": 1.1880977153778076, + "learning_rate": 9.997498856774898e-06, + "loss": 0.8812, + "step": 846 + }, + { + "epoch": 0.04566530084106103, + "grad_norm": 0.8678451180458069, + "learning_rate": 9.997492147363772e-06, + "loss": 0.887, + "step": 847 + }, + { + "epoch": 0.04571921500970455, + "grad_norm": 1.3359739780426025, + "learning_rate": 9.99748542896783e-06, + "loss": 0.8141, + "step": 848 + }, + { + "epoch": 0.04577312917834807, + "grad_norm": 0.9263296127319336, + "learning_rate": 9.99747870158708e-06, + "loss": 0.9357, + "step": 849 + }, + { + "epoch": 0.045827043346991586, + "grad_norm": 0.9199776649475098, + "learning_rate": 9.997471965221541e-06, + "loss": 0.8352, + "step": 850 + }, + { + "epoch": 0.04588095751563511, + "grad_norm": 0.8880730867385864, + "learning_rate": 9.997465219871218e-06, + "loss": 0.7802, + "step": 851 + }, + { + "epoch": 0.04593487168427863, + "grad_norm": 0.8561250567436218, + "learning_rate": 9.99745846553613e-06, + "loss": 0.7987, + "step": 852 + }, + { + "epoch": 0.04598878585292215, + "grad_norm": 0.8975661396980286, + "learning_rate": 9.997451702216283e-06, + "loss": 0.8325, + "step": 853 + }, + { + "epoch": 0.046042700021565665, + "grad_norm": 0.9350215196609497, + "learning_rate": 9.997444929911693e-06, + "loss": 0.7708, + "step": 854 + }, + { + "epoch": 0.04609661419020919, + "grad_norm": 1.0229014158248901, + "learning_rate": 9.99743814862237e-06, + "loss": 0.9643, + "step": 855 + }, + { + "epoch": 0.04615052835885271, + 
"grad_norm": 0.9249217510223389, + "learning_rate": 9.997431358348329e-06, + "loss": 0.8411, + "step": 856 + }, + { + "epoch": 0.046204442527496226, + "grad_norm": 0.9823042154312134, + "learning_rate": 9.99742455908958e-06, + "loss": 0.9406, + "step": 857 + }, + { + "epoch": 0.046258356696139744, + "grad_norm": 1.2525794506072998, + "learning_rate": 9.997417750846134e-06, + "loss": 0.8507, + "step": 858 + }, + { + "epoch": 0.04631227086478327, + "grad_norm": 0.9583309888839722, + "learning_rate": 9.997410933618006e-06, + "loss": 0.8504, + "step": 859 + }, + { + "epoch": 0.046366185033426786, + "grad_norm": 0.9264401793479919, + "learning_rate": 9.997404107405207e-06, + "loss": 0.8595, + "step": 860 + }, + { + "epoch": 0.046420099202070304, + "grad_norm": 0.9833316206932068, + "learning_rate": 9.99739727220775e-06, + "loss": 0.9025, + "step": 861 + }, + { + "epoch": 0.04647401337071382, + "grad_norm": 1.0220664739608765, + "learning_rate": 9.997390428025645e-06, + "loss": 0.8671, + "step": 862 + }, + { + "epoch": 0.04652792753935734, + "grad_norm": 1.0774664878845215, + "learning_rate": 9.997383574858908e-06, + "loss": 0.8463, + "step": 863 + }, + { + "epoch": 0.046581841708000865, + "grad_norm": 0.8821879029273987, + "learning_rate": 9.997376712707547e-06, + "loss": 0.7565, + "step": 864 + }, + { + "epoch": 0.04663575587664438, + "grad_norm": 0.9233925938606262, + "learning_rate": 9.997369841571577e-06, + "loss": 0.9151, + "step": 865 + }, + { + "epoch": 0.0466896700452879, + "grad_norm": 1.0006109476089478, + "learning_rate": 9.997362961451015e-06, + "loss": 0.8339, + "step": 866 + }, + { + "epoch": 0.04674358421393142, + "grad_norm": 0.865035891532898, + "learning_rate": 9.997356072345863e-06, + "loss": 0.8997, + "step": 867 + }, + { + "epoch": 0.046797498382574944, + "grad_norm": 1.0450654029846191, + "learning_rate": 9.99734917425614e-06, + "loss": 0.7966, + "step": 868 + }, + { + "epoch": 0.04685141255121846, + "grad_norm": 0.8878824710845947, + 
"learning_rate": 9.997342267181857e-06, + "loss": 0.831, + "step": 869 + }, + { + "epoch": 0.04690532671986198, + "grad_norm": 1.0056546926498413, + "learning_rate": 9.997335351123028e-06, + "loss": 0.8178, + "step": 870 + }, + { + "epoch": 0.0469592408885055, + "grad_norm": 1.0531659126281738, + "learning_rate": 9.997328426079661e-06, + "loss": 0.7773, + "step": 871 + }, + { + "epoch": 0.047013155057149016, + "grad_norm": 0.911021888256073, + "learning_rate": 9.997321492051775e-06, + "loss": 0.9001, + "step": 872 + }, + { + "epoch": 0.04706706922579254, + "grad_norm": 0.920103132724762, + "learning_rate": 9.997314549039379e-06, + "loss": 0.7222, + "step": 873 + }, + { + "epoch": 0.04712098339443606, + "grad_norm": 0.9449265599250793, + "learning_rate": 9.997307597042483e-06, + "loss": 0.9197, + "step": 874 + }, + { + "epoch": 0.047174897563079576, + "grad_norm": 1.013066291809082, + "learning_rate": 9.997300636061103e-06, + "loss": 0.8854, + "step": 875 + }, + { + "epoch": 0.047228811731723094, + "grad_norm": 0.8990256786346436, + "learning_rate": 9.99729366609525e-06, + "loss": 0.81, + "step": 876 + }, + { + "epoch": 0.04728272590036662, + "grad_norm": 1.0211769342422485, + "learning_rate": 9.997286687144938e-06, + "loss": 0.8335, + "step": 877 + }, + { + "epoch": 0.04733664006901014, + "grad_norm": 1.14606773853302, + "learning_rate": 9.997279699210178e-06, + "loss": 1.0956, + "step": 878 + }, + { + "epoch": 0.047390554237653655, + "grad_norm": 0.982725977897644, + "learning_rate": 9.997272702290981e-06, + "loss": 0.8289, + "step": 879 + }, + { + "epoch": 0.04744446840629717, + "grad_norm": 0.8667361736297607, + "learning_rate": 9.997265696387364e-06, + "loss": 0.8056, + "step": 880 + }, + { + "epoch": 0.04749838257494069, + "grad_norm": 0.9029837250709534, + "learning_rate": 9.997258681499338e-06, + "loss": 0.8461, + "step": 881 + }, + { + "epoch": 0.047552296743584216, + "grad_norm": 0.8767060041427612, + "learning_rate": 9.997251657626915e-06, + "loss": 
0.8162, + "step": 882 + }, + { + "epoch": 0.047606210912227734, + "grad_norm": 1.4750713109970093, + "learning_rate": 9.997244624770104e-06, + "loss": 0.8677, + "step": 883 + }, + { + "epoch": 0.04766012508087125, + "grad_norm": 1.001286506652832, + "learning_rate": 9.997237582928924e-06, + "loss": 0.7673, + "step": 884 + }, + { + "epoch": 0.04771403924951477, + "grad_norm": 0.9560269713401794, + "learning_rate": 9.997230532103384e-06, + "loss": 0.8597, + "step": 885 + }, + { + "epoch": 0.047767953418158295, + "grad_norm": 0.834237277507782, + "learning_rate": 9.997223472293499e-06, + "loss": 0.7629, + "step": 886 + }, + { + "epoch": 0.04782186758680181, + "grad_norm": 0.9642406702041626, + "learning_rate": 9.997216403499278e-06, + "loss": 0.83, + "step": 887 + }, + { + "epoch": 0.04787578175544533, + "grad_norm": 1.2931480407714844, + "learning_rate": 9.997209325720736e-06, + "loss": 1.0333, + "step": 888 + }, + { + "epoch": 0.04792969592408885, + "grad_norm": 0.8024531602859497, + "learning_rate": 9.997202238957886e-06, + "loss": 0.7166, + "step": 889 + }, + { + "epoch": 0.04798361009273237, + "grad_norm": 0.9585899710655212, + "learning_rate": 9.997195143210741e-06, + "loss": 0.8099, + "step": 890 + }, + { + "epoch": 0.04803752426137589, + "grad_norm": 0.9917063117027283, + "learning_rate": 9.997188038479313e-06, + "loss": 0.8486, + "step": 891 + }, + { + "epoch": 0.04809143843001941, + "grad_norm": 1.6290080547332764, + "learning_rate": 9.997180924763616e-06, + "loss": 0.863, + "step": 892 + }, + { + "epoch": 0.04814535259866293, + "grad_norm": 0.9488585591316223, + "learning_rate": 9.99717380206366e-06, + "loss": 0.8277, + "step": 893 + }, + { + "epoch": 0.048199266767306445, + "grad_norm": 1.0710817575454712, + "learning_rate": 9.997166670379459e-06, + "loss": 0.8898, + "step": 894 + }, + { + "epoch": 0.04825318093594997, + "grad_norm": 0.9916248917579651, + "learning_rate": 9.997159529711026e-06, + "loss": 0.9144, + "step": 895 + }, + { + "epoch": 
0.04830709510459349, + "grad_norm": 1.0074565410614014, + "learning_rate": 9.997152380058378e-06, + "loss": 0.8391, + "step": 896 + }, + { + "epoch": 0.048361009273237006, + "grad_norm": 1.0258312225341797, + "learning_rate": 9.99714522142152e-06, + "loss": 0.973, + "step": 897 + }, + { + "epoch": 0.048414923441880524, + "grad_norm": 0.9497826099395752, + "learning_rate": 9.99713805380047e-06, + "loss": 0.9221, + "step": 898 + }, + { + "epoch": 0.04846883761052405, + "grad_norm": 0.9103115200996399, + "learning_rate": 9.99713087719524e-06, + "loss": 0.7942, + "step": 899 + }, + { + "epoch": 0.04852275177916757, + "grad_norm": 0.9810470938682556, + "learning_rate": 9.997123691605843e-06, + "loss": 0.8673, + "step": 900 + }, + { + "epoch": 0.048576665947811085, + "grad_norm": 1.0422937870025635, + "learning_rate": 9.997116497032291e-06, + "loss": 0.9263, + "step": 901 + }, + { + "epoch": 0.0486305801164546, + "grad_norm": 0.8522017002105713, + "learning_rate": 9.997109293474596e-06, + "loss": 0.8296, + "step": 902 + }, + { + "epoch": 0.04868449428509812, + "grad_norm": 0.818270742893219, + "learning_rate": 9.997102080932775e-06, + "loss": 0.7898, + "step": 903 + }, + { + "epoch": 0.048738408453741645, + "grad_norm": 0.9286766648292542, + "learning_rate": 9.997094859406838e-06, + "loss": 0.8751, + "step": 904 + }, + { + "epoch": 0.04879232262238516, + "grad_norm": 1.0779087543487549, + "learning_rate": 9.997087628896797e-06, + "loss": 0.8377, + "step": 905 + }, + { + "epoch": 0.04884623679102868, + "grad_norm": 0.8711867928504944, + "learning_rate": 9.997080389402667e-06, + "loss": 0.8547, + "step": 906 + }, + { + "epoch": 0.0489001509596722, + "grad_norm": 0.8919721245765686, + "learning_rate": 9.99707314092446e-06, + "loss": 0.8178, + "step": 907 + }, + { + "epoch": 0.048954065128315724, + "grad_norm": 0.9084917306900024, + "learning_rate": 9.997065883462192e-06, + "loss": 0.8618, + "step": 908 + }, + { + "epoch": 0.04900797929695924, + "grad_norm": 
0.869216799736023, + "learning_rate": 9.997058617015871e-06, + "loss": 0.8636, + "step": 909 + }, + { + "epoch": 0.04906189346560276, + "grad_norm": 0.9376553893089294, + "learning_rate": 9.997051341585513e-06, + "loss": 0.8986, + "step": 910 + }, + { + "epoch": 0.04911580763424628, + "grad_norm": 0.9041107892990112, + "learning_rate": 9.99704405717113e-06, + "loss": 0.817, + "step": 911 + }, + { + "epoch": 0.0491697218028898, + "grad_norm": 0.9530431628227234, + "learning_rate": 9.997036763772737e-06, + "loss": 0.9464, + "step": 912 + }, + { + "epoch": 0.04922363597153332, + "grad_norm": 0.9601117968559265, + "learning_rate": 9.997029461390344e-06, + "loss": 0.9014, + "step": 913 + }, + { + "epoch": 0.04927755014017684, + "grad_norm": 0.9162781834602356, + "learning_rate": 9.997022150023968e-06, + "loss": 0.8851, + "step": 914 + }, + { + "epoch": 0.04933146430882036, + "grad_norm": 0.9514605402946472, + "learning_rate": 9.99701482967362e-06, + "loss": 0.8975, + "step": 915 + }, + { + "epoch": 0.049385378477463875, + "grad_norm": 0.897203803062439, + "learning_rate": 9.997007500339313e-06, + "loss": 0.8371, + "step": 916 + }, + { + "epoch": 0.0494392926461074, + "grad_norm": 0.9372673630714417, + "learning_rate": 9.99700016202106e-06, + "loss": 0.9432, + "step": 917 + }, + { + "epoch": 0.04949320681475092, + "grad_norm": 0.8993443846702576, + "learning_rate": 9.996992814718875e-06, + "loss": 0.8528, + "step": 918 + }, + { + "epoch": 0.049547120983394435, + "grad_norm": 0.9300720691680908, + "learning_rate": 9.996985458432771e-06, + "loss": 0.873, + "step": 919 + }, + { + "epoch": 0.04960103515203795, + "grad_norm": 0.9311426281929016, + "learning_rate": 9.996978093162761e-06, + "loss": 0.9092, + "step": 920 + }, + { + "epoch": 0.04965494932068148, + "grad_norm": 0.9244507551193237, + "learning_rate": 9.996970718908859e-06, + "loss": 0.764, + "step": 921 + }, + { + "epoch": 0.049708863489324996, + "grad_norm": 0.915512204170227, + "learning_rate": 
9.996963335671074e-06, + "loss": 0.8328, + "step": 922 + }, + { + "epoch": 0.049762777657968514, + "grad_norm": 0.889994740486145, + "learning_rate": 9.996955943449426e-06, + "loss": 0.8491, + "step": 923 + }, + { + "epoch": 0.04981669182661203, + "grad_norm": 0.8676478266716003, + "learning_rate": 9.996948542243925e-06, + "loss": 0.7677, + "step": 924 + }, + { + "epoch": 0.04987060599525555, + "grad_norm": 0.9795013070106506, + "learning_rate": 9.996941132054586e-06, + "loss": 0.9279, + "step": 925 + }, + { + "epoch": 0.049924520163899075, + "grad_norm": 0.940078854560852, + "learning_rate": 9.996933712881419e-06, + "loss": 0.8685, + "step": 926 + }, + { + "epoch": 0.04997843433254259, + "grad_norm": 0.9440926313400269, + "learning_rate": 9.996926284724437e-06, + "loss": 0.9634, + "step": 927 + }, + { + "epoch": 0.05003234850118611, + "grad_norm": 0.9120537638664246, + "learning_rate": 9.99691884758366e-06, + "loss": 0.7656, + "step": 928 + }, + { + "epoch": 0.05008626266982963, + "grad_norm": 1.1514596939086914, + "learning_rate": 9.996911401459093e-06, + "loss": 0.864, + "step": 929 + }, + { + "epoch": 0.050140176838473154, + "grad_norm": 0.8924434185028076, + "learning_rate": 9.996903946350756e-06, + "loss": 0.877, + "step": 930 + }, + { + "epoch": 0.05019409100711667, + "grad_norm": 0.9884456992149353, + "learning_rate": 9.996896482258657e-06, + "loss": 0.94, + "step": 931 + }, + { + "epoch": 0.05024800517576019, + "grad_norm": 0.9282665252685547, + "learning_rate": 9.996889009182814e-06, + "loss": 0.8443, + "step": 932 + }, + { + "epoch": 0.05030191934440371, + "grad_norm": 1.1029064655303955, + "learning_rate": 9.996881527123237e-06, + "loss": 0.9168, + "step": 933 + }, + { + "epoch": 0.050355833513047225, + "grad_norm": 0.839625358581543, + "learning_rate": 9.996874036079942e-06, + "loss": 0.8261, + "step": 934 + }, + { + "epoch": 0.05040974768169075, + "grad_norm": 0.8612869381904602, + "learning_rate": 9.996866536052942e-06, + "loss": 0.8197, + "step": 
935 + }, + { + "epoch": 0.05046366185033427, + "grad_norm": 0.9483891129493713, + "learning_rate": 9.996859027042249e-06, + "loss": 0.8374, + "step": 936 + }, + { + "epoch": 0.050517576018977786, + "grad_norm": 0.9374566674232483, + "learning_rate": 9.996851509047877e-06, + "loss": 0.8884, + "step": 937 + }, + { + "epoch": 0.050571490187621304, + "grad_norm": 0.9164647459983826, + "learning_rate": 9.99684398206984e-06, + "loss": 0.8419, + "step": 938 + }, + { + "epoch": 0.05062540435626483, + "grad_norm": 1.0109184980392456, + "learning_rate": 9.996836446108153e-06, + "loss": 0.8912, + "step": 939 + }, + { + "epoch": 0.05067931852490835, + "grad_norm": 0.8549674153327942, + "learning_rate": 9.996828901162825e-06, + "loss": 0.8043, + "step": 940 + }, + { + "epoch": 0.050733232693551865, + "grad_norm": 0.9618684649467468, + "learning_rate": 9.996821347233875e-06, + "loss": 0.8246, + "step": 941 + }, + { + "epoch": 0.05078714686219538, + "grad_norm": 0.9777100682258606, + "learning_rate": 9.996813784321314e-06, + "loss": 0.887, + "step": 942 + }, + { + "epoch": 0.05084106103083891, + "grad_norm": 0.8675182461738586, + "learning_rate": 9.996806212425157e-06, + "loss": 0.7584, + "step": 943 + }, + { + "epoch": 0.050894975199482426, + "grad_norm": 0.9174523949623108, + "learning_rate": 9.996798631545414e-06, + "loss": 0.8911, + "step": 944 + }, + { + "epoch": 0.050948889368125944, + "grad_norm": 0.9269078373908997, + "learning_rate": 9.996791041682101e-06, + "loss": 0.8049, + "step": 945 + }, + { + "epoch": 0.05100280353676946, + "grad_norm": 0.8447721600532532, + "learning_rate": 9.996783442835233e-06, + "loss": 0.7781, + "step": 946 + }, + { + "epoch": 0.05105671770541298, + "grad_norm": 0.9178231954574585, + "learning_rate": 9.99677583500482e-06, + "loss": 0.8107, + "step": 947 + }, + { + "epoch": 0.051110631874056504, + "grad_norm": 0.8741039633750916, + "learning_rate": 9.996768218190879e-06, + "loss": 0.9278, + "step": 948 + }, + { + "epoch": 0.05116454604270002, + 
"grad_norm": 0.7997228503227234, + "learning_rate": 9.996760592393425e-06, + "loss": 0.7706, + "step": 949 + }, + { + "epoch": 0.05121846021134354, + "grad_norm": 1.003300428390503, + "learning_rate": 9.996752957612468e-06, + "loss": 0.8464, + "step": 950 + }, + { + "epoch": 0.05127237437998706, + "grad_norm": 0.9237748980522156, + "learning_rate": 9.996745313848021e-06, + "loss": 0.9088, + "step": 951 + }, + { + "epoch": 0.05132628854863058, + "grad_norm": 0.8565654754638672, + "learning_rate": 9.996737661100103e-06, + "loss": 0.8208, + "step": 952 + }, + { + "epoch": 0.0513802027172741, + "grad_norm": 1.0590770244598389, + "learning_rate": 9.996729999368722e-06, + "loss": 0.9272, + "step": 953 + }, + { + "epoch": 0.05143411688591762, + "grad_norm": 0.8888198733329773, + "learning_rate": 9.996722328653897e-06, + "loss": 0.8264, + "step": 954 + }, + { + "epoch": 0.05148803105456114, + "grad_norm": 0.9211130142211914, + "learning_rate": 9.996714648955636e-06, + "loss": 0.8807, + "step": 955 + }, + { + "epoch": 0.051541945223204655, + "grad_norm": 1.0241321325302124, + "learning_rate": 9.996706960273958e-06, + "loss": 0.7638, + "step": 956 + }, + { + "epoch": 0.05159585939184818, + "grad_norm": 0.903762698173523, + "learning_rate": 9.996699262608875e-06, + "loss": 0.8583, + "step": 957 + }, + { + "epoch": 0.0516497735604917, + "grad_norm": 0.9271189570426941, + "learning_rate": 9.9966915559604e-06, + "loss": 0.8341, + "step": 958 + }, + { + "epoch": 0.051703687729135216, + "grad_norm": 0.865260899066925, + "learning_rate": 9.996683840328546e-06, + "loss": 0.9136, + "step": 959 + }, + { + "epoch": 0.051757601897778734, + "grad_norm": 0.8903625011444092, + "learning_rate": 9.996676115713332e-06, + "loss": 0.8706, + "step": 960 + }, + { + "epoch": 0.05181151606642226, + "grad_norm": 0.9228227138519287, + "learning_rate": 9.996668382114765e-06, + "loss": 0.8825, + "step": 961 + }, + { + "epoch": 0.051865430235065776, + "grad_norm": 0.9146421551704407, + "learning_rate": 
9.996660639532863e-06, + "loss": 0.8347, + "step": 962 + }, + { + "epoch": 0.051919344403709294, + "grad_norm": 0.9010991454124451, + "learning_rate": 9.99665288796764e-06, + "loss": 0.8016, + "step": 963 + }, + { + "epoch": 0.05197325857235281, + "grad_norm": 0.8763105869293213, + "learning_rate": 9.996645127419107e-06, + "loss": 0.8651, + "step": 964 + }, + { + "epoch": 0.05202717274099634, + "grad_norm": 0.9506256580352783, + "learning_rate": 9.996637357887281e-06, + "loss": 0.9429, + "step": 965 + }, + { + "epoch": 0.052081086909639855, + "grad_norm": 0.9484269022941589, + "learning_rate": 9.996629579372175e-06, + "loss": 0.855, + "step": 966 + }, + { + "epoch": 0.05213500107828337, + "grad_norm": 0.8970646262168884, + "learning_rate": 9.996621791873804e-06, + "loss": 0.8611, + "step": 967 + }, + { + "epoch": 0.05218891524692689, + "grad_norm": 0.8925203680992126, + "learning_rate": 9.99661399539218e-06, + "loss": 0.8206, + "step": 968 + }, + { + "epoch": 0.05224282941557041, + "grad_norm": 1.069669246673584, + "learning_rate": 9.996606189927318e-06, + "loss": 0.876, + "step": 969 + }, + { + "epoch": 0.052296743584213934, + "grad_norm": 0.8456307649612427, + "learning_rate": 9.996598375479232e-06, + "loss": 0.7514, + "step": 970 + }, + { + "epoch": 0.05235065775285745, + "grad_norm": 0.9182801246643066, + "learning_rate": 9.996590552047936e-06, + "loss": 0.8915, + "step": 971 + }, + { + "epoch": 0.05240457192150097, + "grad_norm": 0.7616676688194275, + "learning_rate": 9.996582719633445e-06, + "loss": 0.7106, + "step": 972 + }, + { + "epoch": 0.05245848609014449, + "grad_norm": 0.8873127102851868, + "learning_rate": 9.99657487823577e-06, + "loss": 0.9171, + "step": 973 + }, + { + "epoch": 0.05251240025878801, + "grad_norm": 0.9724618792533875, + "learning_rate": 9.996567027854929e-06, + "loss": 0.9765, + "step": 974 + }, + { + "epoch": 0.05256631442743153, + "grad_norm": 0.9106513857841492, + "learning_rate": 9.996559168490933e-06, + "loss": 0.8332, + "step": 
975 + }, + { + "epoch": 0.05262022859607505, + "grad_norm": 0.8551159501075745, + "learning_rate": 9.996551300143798e-06, + "loss": 0.8128, + "step": 976 + }, + { + "epoch": 0.052674142764718566, + "grad_norm": 0.9829822182655334, + "learning_rate": 9.996543422813539e-06, + "loss": 0.9088, + "step": 977 + }, + { + "epoch": 0.052728056933362084, + "grad_norm": 0.8281888961791992, + "learning_rate": 9.996535536500166e-06, + "loss": 0.8338, + "step": 978 + }, + { + "epoch": 0.05278197110200561, + "grad_norm": 0.951319694519043, + "learning_rate": 9.9965276412037e-06, + "loss": 0.9359, + "step": 979 + }, + { + "epoch": 0.05283588527064913, + "grad_norm": 0.841390073299408, + "learning_rate": 9.996519736924148e-06, + "loss": 0.7952, + "step": 980 + }, + { + "epoch": 0.052889799439292645, + "grad_norm": 0.8847686648368835, + "learning_rate": 9.996511823661528e-06, + "loss": 0.8435, + "step": 981 + }, + { + "epoch": 0.05294371360793616, + "grad_norm": 0.9261316061019897, + "learning_rate": 9.996503901415855e-06, + "loss": 0.8646, + "step": 982 + }, + { + "epoch": 0.05299762777657969, + "grad_norm": 0.9366586804389954, + "learning_rate": 9.99649597018714e-06, + "loss": 0.8586, + "step": 983 + }, + { + "epoch": 0.053051541945223206, + "grad_norm": 0.8916764259338379, + "learning_rate": 9.9964880299754e-06, + "loss": 0.8215, + "step": 984 + }, + { + "epoch": 0.053105456113866724, + "grad_norm": 0.9496534466743469, + "learning_rate": 9.996480080780648e-06, + "loss": 0.7984, + "step": 985 + }, + { + "epoch": 0.05315937028251024, + "grad_norm": 0.9736526608467102, + "learning_rate": 9.9964721226029e-06, + "loss": 0.7881, + "step": 986 + }, + { + "epoch": 0.05321328445115376, + "grad_norm": 0.9533856511116028, + "learning_rate": 9.996464155442167e-06, + "loss": 0.9855, + "step": 987 + }, + { + "epoch": 0.053267198619797285, + "grad_norm": 0.9656437039375305, + "learning_rate": 9.996456179298467e-06, + "loss": 0.9571, + "step": 988 + }, + { + "epoch": 0.0533211127884408, + 
"grad_norm": 0.8887313008308411, + "learning_rate": 9.996448194171813e-06, + "loss": 0.9381, + "step": 989 + }, + { + "epoch": 0.05337502695708432, + "grad_norm": 1.0181535482406616, + "learning_rate": 9.996440200062217e-06, + "loss": 0.8834, + "step": 990 + }, + { + "epoch": 0.05342894112572784, + "grad_norm": 0.9083503484725952, + "learning_rate": 9.996432196969696e-06, + "loss": 0.9733, + "step": 991 + }, + { + "epoch": 0.05348285529437136, + "grad_norm": 0.9051093459129333, + "learning_rate": 9.996424184894264e-06, + "loss": 0.8531, + "step": 992 + }, + { + "epoch": 0.05353676946301488, + "grad_norm": 1.0264357328414917, + "learning_rate": 9.996416163835935e-06, + "loss": 0.9212, + "step": 993 + }, + { + "epoch": 0.0535906836316584, + "grad_norm": 1.0350812673568726, + "learning_rate": 9.996408133794726e-06, + "loss": 0.7843, + "step": 994 + }, + { + "epoch": 0.05364459780030192, + "grad_norm": 0.9610341787338257, + "learning_rate": 9.996400094770647e-06, + "loss": 0.8561, + "step": 995 + }, + { + "epoch": 0.05369851196894544, + "grad_norm": 0.8123961687088013, + "learning_rate": 9.996392046763714e-06, + "loss": 0.8296, + "step": 996 + }, + { + "epoch": 0.05375242613758896, + "grad_norm": 0.9337920546531677, + "learning_rate": 9.996383989773942e-06, + "loss": 0.8525, + "step": 997 + }, + { + "epoch": 0.05380634030623248, + "grad_norm": 1.1319444179534912, + "learning_rate": 9.996375923801347e-06, + "loss": 0.9127, + "step": 998 + }, + { + "epoch": 0.053860254474875996, + "grad_norm": 0.8506798148155212, + "learning_rate": 9.996367848845941e-06, + "loss": 0.884, + "step": 999 + }, + { + "epoch": 0.053914168643519514, + "grad_norm": 0.8248615860939026, + "learning_rate": 9.996359764907739e-06, + "loss": 0.7579, + "step": 1000 + }, + { + "epoch": 0.05396808281216304, + "grad_norm": 0.9258946180343628, + "learning_rate": 9.996351671986756e-06, + "loss": 0.8632, + "step": 1001 + }, + { + "epoch": 0.05402199698080656, + "grad_norm": 0.8891279101371765, + 
"learning_rate": 9.996343570083006e-06, + "loss": 0.8758, + "step": 1002 + }, + { + "epoch": 0.054075911149450075, + "grad_norm": 0.9592086672782898, + "learning_rate": 9.996335459196505e-06, + "loss": 0.8962, + "step": 1003 + }, + { + "epoch": 0.05412982531809359, + "grad_norm": 0.8937798738479614, + "learning_rate": 9.996327339327267e-06, + "loss": 0.8434, + "step": 1004 + }, + { + "epoch": 0.05418373948673712, + "grad_norm": 0.9602083563804626, + "learning_rate": 9.996319210475307e-06, + "loss": 0.9692, + "step": 1005 + }, + { + "epoch": 0.054237653655380635, + "grad_norm": 0.870637834072113, + "learning_rate": 9.996311072640637e-06, + "loss": 0.9146, + "step": 1006 + }, + { + "epoch": 0.05429156782402415, + "grad_norm": 0.9330273866653442, + "learning_rate": 9.996302925823276e-06, + "loss": 0.8584, + "step": 1007 + }, + { + "epoch": 0.05434548199266767, + "grad_norm": 0.8185963034629822, + "learning_rate": 9.996294770023234e-06, + "loss": 0.7854, + "step": 1008 + }, + { + "epoch": 0.05439939616131119, + "grad_norm": 0.8727489113807678, + "learning_rate": 9.996286605240528e-06, + "loss": 0.7388, + "step": 1009 + }, + { + "epoch": 0.054453310329954714, + "grad_norm": 1.0858477354049683, + "learning_rate": 9.996278431475172e-06, + "loss": 0.9201, + "step": 1010 + }, + { + "epoch": 0.05450722449859823, + "grad_norm": 0.9749255776405334, + "learning_rate": 9.996270248727184e-06, + "loss": 0.9041, + "step": 1011 + }, + { + "epoch": 0.05456113866724175, + "grad_norm": 0.9460576176643372, + "learning_rate": 9.996262056996575e-06, + "loss": 0.8553, + "step": 1012 + }, + { + "epoch": 0.05461505283588527, + "grad_norm": 0.9379808306694031, + "learning_rate": 9.99625385628336e-06, + "loss": 0.9253, + "step": 1013 + }, + { + "epoch": 0.05466896700452879, + "grad_norm": 0.8154170513153076, + "learning_rate": 9.996245646587553e-06, + "loss": 0.8703, + "step": 1014 + }, + { + "epoch": 0.05472288117317231, + "grad_norm": 0.9122161269187927, + "learning_rate": 
9.996237427909172e-06, + "loss": 0.7734, + "step": 1015 + }, + { + "epoch": 0.05477679534181583, + "grad_norm": 0.9049486517906189, + "learning_rate": 9.996229200248228e-06, + "loss": 0.8991, + "step": 1016 + }, + { + "epoch": 0.05483070951045935, + "grad_norm": 0.9244295358657837, + "learning_rate": 9.996220963604741e-06, + "loss": 0.8514, + "step": 1017 + }, + { + "epoch": 0.05488462367910287, + "grad_norm": 0.9817934036254883, + "learning_rate": 9.99621271797872e-06, + "loss": 0.8641, + "step": 1018 + }, + { + "epoch": 0.05493853784774639, + "grad_norm": 0.9253972768783569, + "learning_rate": 9.996204463370182e-06, + "loss": 0.9199, + "step": 1019 + }, + { + "epoch": 0.05499245201638991, + "grad_norm": 0.9114319682121277, + "learning_rate": 9.996196199779145e-06, + "loss": 0.8063, + "step": 1020 + }, + { + "epoch": 0.055046366185033425, + "grad_norm": 0.9643195867538452, + "learning_rate": 9.996187927205619e-06, + "loss": 0.9668, + "step": 1021 + }, + { + "epoch": 0.05510028035367694, + "grad_norm": 0.8127598166465759, + "learning_rate": 9.996179645649622e-06, + "loss": 0.764, + "step": 1022 + }, + { + "epoch": 0.05515419452232047, + "grad_norm": 0.8728108406066895, + "learning_rate": 9.996171355111167e-06, + "loss": 0.7703, + "step": 1023 + }, + { + "epoch": 0.055208108690963986, + "grad_norm": 0.8554317355155945, + "learning_rate": 9.996163055590269e-06, + "loss": 0.8266, + "step": 1024 + }, + { + "epoch": 0.055262022859607504, + "grad_norm": 0.7951076030731201, + "learning_rate": 9.996154747086946e-06, + "loss": 0.7601, + "step": 1025 + }, + { + "epoch": 0.05531593702825102, + "grad_norm": 0.8916927576065063, + "learning_rate": 9.996146429601208e-06, + "loss": 0.8936, + "step": 1026 + }, + { + "epoch": 0.05536985119689455, + "grad_norm": 1.0242576599121094, + "learning_rate": 9.996138103133075e-06, + "loss": 0.8868, + "step": 1027 + }, + { + "epoch": 0.055423765365538065, + "grad_norm": 0.9273019433021545, + "learning_rate": 9.996129767682557e-06, + "loss": 
0.8622, + "step": 1028 + }, + { + "epoch": 0.05547767953418158, + "grad_norm": 0.9547039866447449, + "learning_rate": 9.996121423249673e-06, + "loss": 0.7814, + "step": 1029 + }, + { + "epoch": 0.0555315937028251, + "grad_norm": 0.8750621676445007, + "learning_rate": 9.996113069834437e-06, + "loss": 0.7717, + "step": 1030 + }, + { + "epoch": 0.05558550787146862, + "grad_norm": 0.9547988176345825, + "learning_rate": 9.996104707436862e-06, + "loss": 0.8877, + "step": 1031 + }, + { + "epoch": 0.055639422040112144, + "grad_norm": 0.8856480717658997, + "learning_rate": 9.996096336056966e-06, + "loss": 0.7927, + "step": 1032 + }, + { + "epoch": 0.05569333620875566, + "grad_norm": 0.8311342000961304, + "learning_rate": 9.99608795569476e-06, + "loss": 0.7847, + "step": 1033 + }, + { + "epoch": 0.05574725037739918, + "grad_norm": 1.0720731019973755, + "learning_rate": 9.996079566350266e-06, + "loss": 0.9243, + "step": 1034 + }, + { + "epoch": 0.0558011645460427, + "grad_norm": 0.9498684406280518, + "learning_rate": 9.996071168023491e-06, + "loss": 0.8605, + "step": 1035 + }, + { + "epoch": 0.05585507871468622, + "grad_norm": 0.9043952822685242, + "learning_rate": 9.996062760714456e-06, + "loss": 0.8488, + "step": 1036 + }, + { + "epoch": 0.05590899288332974, + "grad_norm": 0.8051116466522217, + "learning_rate": 9.996054344423173e-06, + "loss": 0.8275, + "step": 1037 + }, + { + "epoch": 0.05596290705197326, + "grad_norm": 0.857120156288147, + "learning_rate": 9.996045919149658e-06, + "loss": 0.8837, + "step": 1038 + }, + { + "epoch": 0.056016821220616776, + "grad_norm": 0.8810911774635315, + "learning_rate": 9.996037484893926e-06, + "loss": 0.8179, + "step": 1039 + }, + { + "epoch": 0.056070735389260294, + "grad_norm": 0.8783093690872192, + "learning_rate": 9.996029041655994e-06, + "loss": 0.7734, + "step": 1040 + }, + { + "epoch": 0.05612464955790382, + "grad_norm": 0.9281952977180481, + "learning_rate": 9.996020589435874e-06, + "loss": 0.8747, + "step": 1041 + }, + { + 
"epoch": 0.05617856372654734, + "grad_norm": 0.8307299613952637, + "learning_rate": 9.996012128233583e-06, + "loss": 0.8055, + "step": 1042 + }, + { + "epoch": 0.056232477895190855, + "grad_norm": 0.9520873427391052, + "learning_rate": 9.996003658049136e-06, + "loss": 0.8181, + "step": 1043 + }, + { + "epoch": 0.05628639206383437, + "grad_norm": 0.8753806948661804, + "learning_rate": 9.995995178882549e-06, + "loss": 0.808, + "step": 1044 + }, + { + "epoch": 0.0563403062324779, + "grad_norm": 1.067691683769226, + "learning_rate": 9.995986690733836e-06, + "loss": 0.8048, + "step": 1045 + }, + { + "epoch": 0.056394220401121416, + "grad_norm": 0.8575261235237122, + "learning_rate": 9.995978193603013e-06, + "loss": 0.9231, + "step": 1046 + }, + { + "epoch": 0.056448134569764934, + "grad_norm": 0.9857104420661926, + "learning_rate": 9.995969687490096e-06, + "loss": 0.8883, + "step": 1047 + }, + { + "epoch": 0.05650204873840845, + "grad_norm": 0.9203484654426575, + "learning_rate": 9.995961172395098e-06, + "loss": 0.7634, + "step": 1048 + }, + { + "epoch": 0.056555962907051976, + "grad_norm": 0.8741904497146606, + "learning_rate": 9.995952648318036e-06, + "loss": 0.8061, + "step": 1049 + }, + { + "epoch": 0.056609877075695494, + "grad_norm": 0.9495588541030884, + "learning_rate": 9.995944115258925e-06, + "loss": 0.8922, + "step": 1050 + }, + { + "epoch": 0.05666379124433901, + "grad_norm": 0.9306020140647888, + "learning_rate": 9.99593557321778e-06, + "loss": 0.8454, + "step": 1051 + }, + { + "epoch": 0.05671770541298253, + "grad_norm": 0.9457784295082092, + "learning_rate": 9.995927022194615e-06, + "loss": 0.8701, + "step": 1052 + }, + { + "epoch": 0.05677161958162605, + "grad_norm": 0.88719242811203, + "learning_rate": 9.99591846218945e-06, + "loss": 0.8416, + "step": 1053 + }, + { + "epoch": 0.05682553375026957, + "grad_norm": 0.8740848302841187, + "learning_rate": 9.995909893202296e-06, + "loss": 0.7962, + "step": 1054 + }, + { + "epoch": 0.05687944791891309, + 
"grad_norm": 1.0149377584457397, + "learning_rate": 9.99590131523317e-06, + "loss": 0.8352, + "step": 1055 + }, + { + "epoch": 0.05693336208755661, + "grad_norm": 0.9014917016029358, + "learning_rate": 9.995892728282088e-06, + "loss": 0.9244, + "step": 1056 + }, + { + "epoch": 0.05698727625620013, + "grad_norm": 0.9351898431777954, + "learning_rate": 9.995884132349062e-06, + "loss": 0.865, + "step": 1057 + }, + { + "epoch": 0.05704119042484365, + "grad_norm": 0.8656749129295349, + "learning_rate": 9.995875527434113e-06, + "loss": 0.8836, + "step": 1058 + }, + { + "epoch": 0.05709510459348717, + "grad_norm": 0.9120789170265198, + "learning_rate": 9.995866913537254e-06, + "loss": 0.8772, + "step": 1059 + }, + { + "epoch": 0.05714901876213069, + "grad_norm": 1.0019149780273438, + "learning_rate": 9.995858290658497e-06, + "loss": 0.9338, + "step": 1060 + }, + { + "epoch": 0.057202932930774206, + "grad_norm": 0.8492977023124695, + "learning_rate": 9.995849658797863e-06, + "loss": 0.742, + "step": 1061 + }, + { + "epoch": 0.057256847099417724, + "grad_norm": 1.000607967376709, + "learning_rate": 9.995841017955363e-06, + "loss": 0.8498, + "step": 1062 + }, + { + "epoch": 0.05731076126806125, + "grad_norm": 1.0268487930297852, + "learning_rate": 9.995832368131016e-06, + "loss": 0.8937, + "step": 1063 + }, + { + "epoch": 0.057364675436704766, + "grad_norm": 0.9388830661773682, + "learning_rate": 9.995823709324836e-06, + "loss": 0.877, + "step": 1064 + }, + { + "epoch": 0.057418589605348284, + "grad_norm": 0.9747199416160583, + "learning_rate": 9.99581504153684e-06, + "loss": 0.8436, + "step": 1065 + }, + { + "epoch": 0.0574725037739918, + "grad_norm": 0.9125073552131653, + "learning_rate": 9.99580636476704e-06, + "loss": 0.8853, + "step": 1066 + }, + { + "epoch": 0.05752641794263533, + "grad_norm": 0.8910282254219055, + "learning_rate": 9.995797679015455e-06, + "loss": 0.8566, + "step": 1067 + }, + { + "epoch": 0.057580332111278845, + "grad_norm": 0.8546010255813599, + 
"learning_rate": 9.995788984282101e-06, + "loss": 0.8209, + "step": 1068 + }, + { + "epoch": 0.05763424627992236, + "grad_norm": 0.9205883145332336, + "learning_rate": 9.99578028056699e-06, + "loss": 0.7814, + "step": 1069 + }, + { + "epoch": 0.05768816044856588, + "grad_norm": 0.9627780914306641, + "learning_rate": 9.995771567870142e-06, + "loss": 0.8686, + "step": 1070 + }, + { + "epoch": 0.057742074617209406, + "grad_norm": 0.9917465448379517, + "learning_rate": 9.995762846191569e-06, + "loss": 0.9672, + "step": 1071 + }, + { + "epoch": 0.057795988785852924, + "grad_norm": 0.9396706223487854, + "learning_rate": 9.995754115531288e-06, + "loss": 0.8631, + "step": 1072 + }, + { + "epoch": 0.05784990295449644, + "grad_norm": 0.8310922980308533, + "learning_rate": 9.995745375889317e-06, + "loss": 0.8637, + "step": 1073 + }, + { + "epoch": 0.05790381712313996, + "grad_norm": 0.9085954427719116, + "learning_rate": 9.995736627265667e-06, + "loss": 0.8821, + "step": 1074 + }, + { + "epoch": 0.05795773129178348, + "grad_norm": 0.8529816269874573, + "learning_rate": 9.995727869660357e-06, + "loss": 0.8426, + "step": 1075 + }, + { + "epoch": 0.058011645460427, + "grad_norm": 0.8288499116897583, + "learning_rate": 9.995719103073403e-06, + "loss": 0.8415, + "step": 1076 + }, + { + "epoch": 0.05806555962907052, + "grad_norm": 0.9105609059333801, + "learning_rate": 9.995710327504819e-06, + "loss": 0.7683, + "step": 1077 + }, + { + "epoch": 0.05811947379771404, + "grad_norm": 0.9578274488449097, + "learning_rate": 9.995701542954622e-06, + "loss": 0.8796, + "step": 1078 + }, + { + "epoch": 0.058173387966357556, + "grad_norm": 0.8542460799217224, + "learning_rate": 9.995692749422827e-06, + "loss": 0.8363, + "step": 1079 + }, + { + "epoch": 0.05822730213500108, + "grad_norm": 0.8723183274269104, + "learning_rate": 9.99568394690945e-06, + "loss": 0.8434, + "step": 1080 + }, + { + "epoch": 0.0582812163036446, + "grad_norm": 0.9157887697219849, + "learning_rate": 
9.995675135414507e-06, + "loss": 0.6532, + "step": 1081 + }, + { + "epoch": 0.05833513047228812, + "grad_norm": 0.9055691361427307, + "learning_rate": 9.995666314938014e-06, + "loss": 0.8762, + "step": 1082 + }, + { + "epoch": 0.058389044640931635, + "grad_norm": 0.8224693536758423, + "learning_rate": 9.995657485479987e-06, + "loss": 0.7976, + "step": 1083 + }, + { + "epoch": 0.05844295880957515, + "grad_norm": 0.925414502620697, + "learning_rate": 9.995648647040441e-06, + "loss": 0.8673, + "step": 1084 + }, + { + "epoch": 0.05849687297821868, + "grad_norm": 0.9194141626358032, + "learning_rate": 9.995639799619395e-06, + "loss": 0.7916, + "step": 1085 + }, + { + "epoch": 0.058550787146862196, + "grad_norm": 1.08795166015625, + "learning_rate": 9.995630943216859e-06, + "loss": 0.9135, + "step": 1086 + }, + { + "epoch": 0.058604701315505714, + "grad_norm": 0.9648925065994263, + "learning_rate": 9.995622077832854e-06, + "loss": 0.8442, + "step": 1087 + }, + { + "epoch": 0.05865861548414923, + "grad_norm": 1.0012339353561401, + "learning_rate": 9.995613203467394e-06, + "loss": 0.9543, + "step": 1088 + }, + { + "epoch": 0.05871252965279276, + "grad_norm": 0.9333881735801697, + "learning_rate": 9.995604320120496e-06, + "loss": 0.9267, + "step": 1089 + }, + { + "epoch": 0.058766443821436275, + "grad_norm": 0.8566498160362244, + "learning_rate": 9.995595427792173e-06, + "loss": 0.8539, + "step": 1090 + }, + { + "epoch": 0.05882035799007979, + "grad_norm": 0.8766364455223083, + "learning_rate": 9.995586526482446e-06, + "loss": 0.9293, + "step": 1091 + }, + { + "epoch": 0.05887427215872331, + "grad_norm": 0.9181047677993774, + "learning_rate": 9.995577616191326e-06, + "loss": 0.8333, + "step": 1092 + }, + { + "epoch": 0.05892818632736683, + "grad_norm": 0.8831031918525696, + "learning_rate": 9.995568696918833e-06, + "loss": 0.8016, + "step": 1093 + }, + { + "epoch": 0.05898210049601035, + "grad_norm": 0.8618754148483276, + "learning_rate": 9.99555976866498e-06, + "loss": 
0.8988, + "step": 1094 + }, + { + "epoch": 0.05903601466465387, + "grad_norm": 0.9083183407783508, + "learning_rate": 9.995550831429785e-06, + "loss": 0.8626, + "step": 1095 + }, + { + "epoch": 0.05908992883329739, + "grad_norm": 0.8423884510993958, + "learning_rate": 9.995541885213262e-06, + "loss": 0.9121, + "step": 1096 + }, + { + "epoch": 0.05914384300194091, + "grad_norm": 0.7747607827186584, + "learning_rate": 9.99553293001543e-06, + "loss": 0.8087, + "step": 1097 + }, + { + "epoch": 0.05919775717058443, + "grad_norm": 0.8828368186950684, + "learning_rate": 9.995523965836302e-06, + "loss": 0.8284, + "step": 1098 + }, + { + "epoch": 0.05925167133922795, + "grad_norm": 0.9448524713516235, + "learning_rate": 9.995514992675896e-06, + "loss": 0.9565, + "step": 1099 + }, + { + "epoch": 0.05930558550787147, + "grad_norm": 0.8967006206512451, + "learning_rate": 9.99550601053423e-06, + "loss": 0.8412, + "step": 1100 + }, + { + "epoch": 0.059359499676514986, + "grad_norm": 0.9394551515579224, + "learning_rate": 9.995497019411315e-06, + "loss": 0.929, + "step": 1101 + }, + { + "epoch": 0.05941341384515851, + "grad_norm": 0.9002842903137207, + "learning_rate": 9.995488019307172e-06, + "loss": 0.734, + "step": 1102 + }, + { + "epoch": 0.05946732801380203, + "grad_norm": 1.3590562343597412, + "learning_rate": 9.995479010221816e-06, + "loss": 0.8843, + "step": 1103 + }, + { + "epoch": 0.05952124218244555, + "grad_norm": 1.041528582572937, + "learning_rate": 9.99546999215526e-06, + "loss": 0.9001, + "step": 1104 + }, + { + "epoch": 0.059575156351089065, + "grad_norm": 0.9846720099449158, + "learning_rate": 9.995460965107524e-06, + "loss": 0.8174, + "step": 1105 + }, + { + "epoch": 0.05962907051973258, + "grad_norm": 0.9171685576438904, + "learning_rate": 9.995451929078624e-06, + "loss": 0.8756, + "step": 1106 + }, + { + "epoch": 0.05968298468837611, + "grad_norm": 0.9155516028404236, + "learning_rate": 9.995442884068574e-06, + "loss": 0.7327, + "step": 1107 + }, + { + 
"epoch": 0.059736898857019625, + "grad_norm": 0.8734007477760315, + "learning_rate": 9.99543383007739e-06, + "loss": 0.8385, + "step": 1108 + }, + { + "epoch": 0.05979081302566314, + "grad_norm": 0.8580977320671082, + "learning_rate": 9.99542476710509e-06, + "loss": 0.885, + "step": 1109 + }, + { + "epoch": 0.05984472719430666, + "grad_norm": 0.8499299883842468, + "learning_rate": 9.995415695151692e-06, + "loss": 0.8323, + "step": 1110 + }, + { + "epoch": 0.059898641362950186, + "grad_norm": 0.8348694443702698, + "learning_rate": 9.99540661421721e-06, + "loss": 0.7947, + "step": 1111 + }, + { + "epoch": 0.059952555531593704, + "grad_norm": 0.8865199685096741, + "learning_rate": 9.99539752430166e-06, + "loss": 0.9363, + "step": 1112 + }, + { + "epoch": 0.06000646970023722, + "grad_norm": 0.9492315649986267, + "learning_rate": 9.995388425405059e-06, + "loss": 0.913, + "step": 1113 + }, + { + "epoch": 0.06006038386888074, + "grad_norm": 0.938252329826355, + "learning_rate": 9.995379317527422e-06, + "loss": 0.861, + "step": 1114 + }, + { + "epoch": 0.06011429803752426, + "grad_norm": 1.2601032257080078, + "learning_rate": 9.995370200668768e-06, + "loss": 0.9435, + "step": 1115 + }, + { + "epoch": 0.06016821220616778, + "grad_norm": 0.915830671787262, + "learning_rate": 9.995361074829112e-06, + "loss": 0.9372, + "step": 1116 + }, + { + "epoch": 0.0602221263748113, + "grad_norm": 1.4548465013504028, + "learning_rate": 9.995351940008473e-06, + "loss": 0.9055, + "step": 1117 + }, + { + "epoch": 0.06027604054345482, + "grad_norm": 0.9090906381607056, + "learning_rate": 9.995342796206861e-06, + "loss": 0.8849, + "step": 1118 + }, + { + "epoch": 0.06032995471209834, + "grad_norm": 0.9860616326332092, + "learning_rate": 9.995333643424298e-06, + "loss": 0.8304, + "step": 1119 + }, + { + "epoch": 0.06038386888074186, + "grad_norm": 0.8320879340171814, + "learning_rate": 9.9953244816608e-06, + "loss": 0.8432, + "step": 1120 + }, + { + "epoch": 0.06043778304938538, + "grad_norm": 
0.8633564114570618, + "learning_rate": 9.995315310916381e-06, + "loss": 0.7461, + "step": 1121 + }, + { + "epoch": 0.0604916972180289, + "grad_norm": 0.881287693977356, + "learning_rate": 9.995306131191059e-06, + "loss": 0.8512, + "step": 1122 + }, + { + "epoch": 0.060545611386672415, + "grad_norm": 0.8888201713562012, + "learning_rate": 9.99529694248485e-06, + "loss": 0.8416, + "step": 1123 + }, + { + "epoch": 0.06059952555531594, + "grad_norm": 0.8073605895042419, + "learning_rate": 9.99528774479777e-06, + "loss": 0.8369, + "step": 1124 + }, + { + "epoch": 0.06065343972395946, + "grad_norm": 0.9260549545288086, + "learning_rate": 9.995278538129837e-06, + "loss": 0.8548, + "step": 1125 + }, + { + "epoch": 0.060707353892602976, + "grad_norm": 0.9169156551361084, + "learning_rate": 9.99526932248107e-06, + "loss": 0.9149, + "step": 1126 + }, + { + "epoch": 0.060761268061246494, + "grad_norm": 0.8481706380844116, + "learning_rate": 9.995260097851478e-06, + "loss": 0.8591, + "step": 1127 + }, + { + "epoch": 0.06081518222989001, + "grad_norm": 0.8934486508369446, + "learning_rate": 9.995250864241085e-06, + "loss": 0.9322, + "step": 1128 + }, + { + "epoch": 0.06086909639853354, + "grad_norm": 0.947390615940094, + "learning_rate": 9.995241621649902e-06, + "loss": 1.0015, + "step": 1129 + }, + { + "epoch": 0.060923010567177055, + "grad_norm": 0.9185096025466919, + "learning_rate": 9.995232370077949e-06, + "loss": 0.9293, + "step": 1130 + }, + { + "epoch": 0.06097692473582057, + "grad_norm": 0.9517882466316223, + "learning_rate": 9.995223109525245e-06, + "loss": 0.8673, + "step": 1131 + }, + { + "epoch": 0.06103083890446409, + "grad_norm": 1.065699815750122, + "learning_rate": 9.9952138399918e-06, + "loss": 0.9144, + "step": 1132 + }, + { + "epoch": 0.061084753073107616, + "grad_norm": 0.9048404693603516, + "learning_rate": 9.995204561477635e-06, + "loss": 0.7773, + "step": 1133 + }, + { + "epoch": 0.061138667241751134, + "grad_norm": 1.104457139968872, + "learning_rate": 
9.995195273982768e-06, + "loss": 0.8847, + "step": 1134 + }, + { + "epoch": 0.06119258141039465, + "grad_norm": 0.9009587168693542, + "learning_rate": 9.995185977507212e-06, + "loss": 0.8118, + "step": 1135 + }, + { + "epoch": 0.06124649557903817, + "grad_norm": 1.0740209817886353, + "learning_rate": 9.995176672050983e-06, + "loss": 0.9173, + "step": 1136 + }, + { + "epoch": 0.06130040974768169, + "grad_norm": 0.9820743203163147, + "learning_rate": 9.995167357614104e-06, + "loss": 0.8555, + "step": 1137 + }, + { + "epoch": 0.06135432391632521, + "grad_norm": 0.9250825047492981, + "learning_rate": 9.995158034196586e-06, + "loss": 0.8771, + "step": 1138 + }, + { + "epoch": 0.06140823808496873, + "grad_norm": 0.8952597379684448, + "learning_rate": 9.995148701798447e-06, + "loss": 0.8598, + "step": 1139 + }, + { + "epoch": 0.06146215225361225, + "grad_norm": 0.8485212922096252, + "learning_rate": 9.995139360419706e-06, + "loss": 0.8557, + "step": 1140 + }, + { + "epoch": 0.061516066422255766, + "grad_norm": 0.9676715731620789, + "learning_rate": 9.995130010060377e-06, + "loss": 0.7748, + "step": 1141 + }, + { + "epoch": 0.06156998059089929, + "grad_norm": 0.7896347045898438, + "learning_rate": 9.995120650720478e-06, + "loss": 0.6183, + "step": 1142 + }, + { + "epoch": 0.06162389475954281, + "grad_norm": 0.8746615052223206, + "learning_rate": 9.995111282400024e-06, + "loss": 0.8321, + "step": 1143 + }, + { + "epoch": 0.06167780892818633, + "grad_norm": 0.9029875993728638, + "learning_rate": 9.995101905099036e-06, + "loss": 0.8686, + "step": 1144 + }, + { + "epoch": 0.061731723096829845, + "grad_norm": 0.9529547095298767, + "learning_rate": 9.995092518817528e-06, + "loss": 0.8878, + "step": 1145 + }, + { + "epoch": 0.06178563726547336, + "grad_norm": 0.8280455470085144, + "learning_rate": 9.995083123555517e-06, + "loss": 0.8232, + "step": 1146 + }, + { + "epoch": 0.06183955143411689, + "grad_norm": 0.908881664276123, + "learning_rate": 9.995073719313021e-06, + "loss": 
0.8387, + "step": 1147 + }, + { + "epoch": 0.061893465602760406, + "grad_norm": 0.9137653708457947, + "learning_rate": 9.995064306090055e-06, + "loss": 0.8943, + "step": 1148 + }, + { + "epoch": 0.061947379771403924, + "grad_norm": 0.863861620426178, + "learning_rate": 9.995054883886639e-06, + "loss": 0.7435, + "step": 1149 + }, + { + "epoch": 0.06200129394004744, + "grad_norm": 0.8534915447235107, + "learning_rate": 9.995045452702786e-06, + "loss": 0.941, + "step": 1150 + }, + { + "epoch": 0.06205520810869097, + "grad_norm": 0.9469791650772095, + "learning_rate": 9.995036012538515e-06, + "loss": 0.9137, + "step": 1151 + }, + { + "epoch": 0.062109122277334484, + "grad_norm": 0.9044890999794006, + "learning_rate": 9.995026563393844e-06, + "loss": 0.9117, + "step": 1152 + }, + { + "epoch": 0.062163036445978, + "grad_norm": 0.989772379398346, + "learning_rate": 9.995017105268789e-06, + "loss": 0.8306, + "step": 1153 + }, + { + "epoch": 0.06221695061462152, + "grad_norm": 0.8586496114730835, + "learning_rate": 9.995007638163365e-06, + "loss": 0.8012, + "step": 1154 + }, + { + "epoch": 0.062270864783265045, + "grad_norm": 0.9221116304397583, + "learning_rate": 9.994998162077594e-06, + "loss": 0.7935, + "step": 1155 + }, + { + "epoch": 0.06232477895190856, + "grad_norm": 0.9453061819076538, + "learning_rate": 9.994988677011489e-06, + "loss": 0.8257, + "step": 1156 + }, + { + "epoch": 0.06237869312055208, + "grad_norm": 0.8065335154533386, + "learning_rate": 9.994979182965065e-06, + "loss": 0.86, + "step": 1157 + }, + { + "epoch": 0.0624326072891956, + "grad_norm": 0.9597793817520142, + "learning_rate": 9.994969679938346e-06, + "loss": 0.862, + "step": 1158 + }, + { + "epoch": 0.06248652145783912, + "grad_norm": 0.9118353128433228, + "learning_rate": 9.994960167931342e-06, + "loss": 0.8925, + "step": 1159 + }, + { + "epoch": 0.06254043562648263, + "grad_norm": 1.0216273069381714, + "learning_rate": 9.994950646944077e-06, + "loss": 0.7078, + "step": 1160 + }, + { + 
"epoch": 0.06259434979512615, + "grad_norm": 0.960182785987854, + "learning_rate": 9.994941116976562e-06, + "loss": 0.8936, + "step": 1161 + }, + { + "epoch": 0.06264826396376968, + "grad_norm": 0.9551856517791748, + "learning_rate": 9.994931578028817e-06, + "loss": 0.8053, + "step": 1162 + }, + { + "epoch": 0.0627021781324132, + "grad_norm": 0.9419867992401123, + "learning_rate": 9.994922030100857e-06, + "loss": 0.8333, + "step": 1163 + }, + { + "epoch": 0.06275609230105672, + "grad_norm": 0.9780306816101074, + "learning_rate": 9.994912473192702e-06, + "loss": 0.88, + "step": 1164 + }, + { + "epoch": 0.06281000646970024, + "grad_norm": 0.9320577383041382, + "learning_rate": 9.99490290730437e-06, + "loss": 0.8859, + "step": 1165 + }, + { + "epoch": 0.06286392063834376, + "grad_norm": 0.7692422270774841, + "learning_rate": 9.994893332435874e-06, + "loss": 0.8093, + "step": 1166 + }, + { + "epoch": 0.06291783480698727, + "grad_norm": 1.0622048377990723, + "learning_rate": 9.994883748587234e-06, + "loss": 0.8959, + "step": 1167 + }, + { + "epoch": 0.06297174897563079, + "grad_norm": 0.9598555564880371, + "learning_rate": 9.994874155758467e-06, + "loss": 0.8153, + "step": 1168 + }, + { + "epoch": 0.06302566314427431, + "grad_norm": 0.9207014441490173, + "learning_rate": 9.994864553949591e-06, + "loss": 0.9383, + "step": 1169 + }, + { + "epoch": 0.06307957731291783, + "grad_norm": 1.0074093341827393, + "learning_rate": 9.99485494316062e-06, + "loss": 0.9999, + "step": 1170 + }, + { + "epoch": 0.06313349148156136, + "grad_norm": 0.8454248905181885, + "learning_rate": 9.994845323391575e-06, + "loss": 0.7946, + "step": 1171 + }, + { + "epoch": 0.06318740565020488, + "grad_norm": 0.847578763961792, + "learning_rate": 9.99483569464247e-06, + "loss": 0.7144, + "step": 1172 + }, + { + "epoch": 0.0632413198188484, + "grad_norm": 0.9083126187324524, + "learning_rate": 9.994826056913325e-06, + "loss": 0.774, + "step": 1173 + }, + { + "epoch": 0.06329523398749191, + "grad_norm": 
0.8995345830917358, + "learning_rate": 9.994816410204158e-06, + "loss": 0.8995, + "step": 1174 + }, + { + "epoch": 0.06334914815613543, + "grad_norm": 1.0547746419906616, + "learning_rate": 9.994806754514983e-06, + "loss": 0.8142, + "step": 1175 + }, + { + "epoch": 0.06340306232477895, + "grad_norm": 0.946854829788208, + "learning_rate": 9.99479708984582e-06, + "loss": 0.8639, + "step": 1176 + }, + { + "epoch": 0.06345697649342247, + "grad_norm": 0.8746247291564941, + "learning_rate": 9.994787416196683e-06, + "loss": 0.8601, + "step": 1177 + }, + { + "epoch": 0.06351089066206599, + "grad_norm": 0.9075024127960205, + "learning_rate": 9.994777733567595e-06, + "loss": 0.7969, + "step": 1178 + }, + { + "epoch": 0.0635648048307095, + "grad_norm": 0.9435486197471619, + "learning_rate": 9.994768041958569e-06, + "loss": 0.8199, + "step": 1179 + }, + { + "epoch": 0.06361871899935304, + "grad_norm": 0.8597564697265625, + "learning_rate": 9.994758341369624e-06, + "loss": 0.8791, + "step": 1180 + }, + { + "epoch": 0.06367263316799655, + "grad_norm": 0.7960480451583862, + "learning_rate": 9.994748631800777e-06, + "loss": 0.8035, + "step": 1181 + }, + { + "epoch": 0.06372654733664007, + "grad_norm": 1.1984984874725342, + "learning_rate": 9.994738913252045e-06, + "loss": 0.7372, + "step": 1182 + }, + { + "epoch": 0.06378046150528359, + "grad_norm": 0.8532997369766235, + "learning_rate": 9.994729185723446e-06, + "loss": 0.9094, + "step": 1183 + }, + { + "epoch": 0.06383437567392711, + "grad_norm": 0.8327267169952393, + "learning_rate": 9.994719449214999e-06, + "loss": 0.809, + "step": 1184 + }, + { + "epoch": 0.06388828984257063, + "grad_norm": 0.9086306691169739, + "learning_rate": 9.99470970372672e-06, + "loss": 0.8278, + "step": 1185 + }, + { + "epoch": 0.06394220401121414, + "grad_norm": 0.8422104716300964, + "learning_rate": 9.994699949258626e-06, + "loss": 0.7754, + "step": 1186 + }, + { + "epoch": 0.06399611817985766, + "grad_norm": 1.0434929132461548, + "learning_rate": 
9.994690185810733e-06, + "loss": 0.908, + "step": 1187 + }, + { + "epoch": 0.06405003234850119, + "grad_norm": 1.1625720262527466, + "learning_rate": 9.994680413383064e-06, + "loss": 0.8814, + "step": 1188 + }, + { + "epoch": 0.06410394651714471, + "grad_norm": 0.9940767288208008, + "learning_rate": 9.994670631975631e-06, + "loss": 0.7846, + "step": 1189 + }, + { + "epoch": 0.06415786068578823, + "grad_norm": 0.8356907963752747, + "learning_rate": 9.994660841588457e-06, + "loss": 0.798, + "step": 1190 + }, + { + "epoch": 0.06421177485443175, + "grad_norm": 0.830348014831543, + "learning_rate": 9.994651042221552e-06, + "loss": 0.7875, + "step": 1191 + }, + { + "epoch": 0.06426568902307526, + "grad_norm": 1.1060880422592163, + "learning_rate": 9.994641233874943e-06, + "loss": 0.8893, + "step": 1192 + }, + { + "epoch": 0.06431960319171878, + "grad_norm": 0.9319590926170349, + "learning_rate": 9.994631416548637e-06, + "loss": 0.791, + "step": 1193 + }, + { + "epoch": 0.0643735173603623, + "grad_norm": 0.8345780968666077, + "learning_rate": 9.994621590242661e-06, + "loss": 0.8213, + "step": 1194 + }, + { + "epoch": 0.06442743152900582, + "grad_norm": 0.9848359227180481, + "learning_rate": 9.99461175495703e-06, + "loss": 0.735, + "step": 1195 + }, + { + "epoch": 0.06448134569764934, + "grad_norm": 0.9134055972099304, + "learning_rate": 9.994601910691758e-06, + "loss": 0.8415, + "step": 1196 + }, + { + "epoch": 0.06453525986629287, + "grad_norm": 0.8084586262702942, + "learning_rate": 9.994592057446866e-06, + "loss": 0.8702, + "step": 1197 + }, + { + "epoch": 0.06458917403493639, + "grad_norm": 0.9168767333030701, + "learning_rate": 9.994582195222371e-06, + "loss": 0.8921, + "step": 1198 + }, + { + "epoch": 0.0646430882035799, + "grad_norm": 0.8380446434020996, + "learning_rate": 9.994572324018292e-06, + "loss": 0.7705, + "step": 1199 + }, + { + "epoch": 0.06469700237222342, + "grad_norm": 0.8120049238204956, + "learning_rate": 9.994562443834646e-06, + "loss": 0.7576, + 
"step": 1200 + }, + { + "epoch": 0.06475091654086694, + "grad_norm": 0.9559764266014099, + "learning_rate": 9.994552554671448e-06, + "loss": 0.8427, + "step": 1201 + }, + { + "epoch": 0.06480483070951046, + "grad_norm": 0.9473673105239868, + "learning_rate": 9.99454265652872e-06, + "loss": 0.9988, + "step": 1202 + }, + { + "epoch": 0.06485874487815398, + "grad_norm": 1.0704870223999023, + "learning_rate": 9.994532749406477e-06, + "loss": 0.9499, + "step": 1203 + }, + { + "epoch": 0.0649126590467975, + "grad_norm": 0.9905646443367004, + "learning_rate": 9.994522833304738e-06, + "loss": 0.8801, + "step": 1204 + }, + { + "epoch": 0.06496657321544101, + "grad_norm": 1.194190502166748, + "learning_rate": 9.99451290822352e-06, + "loss": 0.9051, + "step": 1205 + }, + { + "epoch": 0.06502048738408454, + "grad_norm": 0.8571314811706543, + "learning_rate": 9.994502974162843e-06, + "loss": 0.8131, + "step": 1206 + }, + { + "epoch": 0.06507440155272806, + "grad_norm": 0.9769417643547058, + "learning_rate": 9.994493031122721e-06, + "loss": 0.8524, + "step": 1207 + }, + { + "epoch": 0.06512831572137158, + "grad_norm": 0.8106759786605835, + "learning_rate": 9.994483079103176e-06, + "loss": 0.8142, + "step": 1208 + }, + { + "epoch": 0.0651822298900151, + "grad_norm": 0.8817846775054932, + "learning_rate": 9.994473118104223e-06, + "loss": 0.9076, + "step": 1209 + }, + { + "epoch": 0.06523614405865862, + "grad_norm": 0.8271930813789368, + "learning_rate": 9.994463148125882e-06, + "loss": 0.7914, + "step": 1210 + }, + { + "epoch": 0.06529005822730213, + "grad_norm": 0.9060614705085754, + "learning_rate": 9.994453169168169e-06, + "loss": 0.8375, + "step": 1211 + }, + { + "epoch": 0.06534397239594565, + "grad_norm": 0.880614697933197, + "learning_rate": 9.994443181231103e-06, + "loss": 0.7751, + "step": 1212 + }, + { + "epoch": 0.06539788656458917, + "grad_norm": 0.9420819282531738, + "learning_rate": 9.994433184314702e-06, + "loss": 0.8532, + "step": 1213 + }, + { + "epoch": 
0.06545180073323269, + "grad_norm": 0.8587054014205933, + "learning_rate": 9.994423178418984e-06, + "loss": 0.8804, + "step": 1214 + }, + { + "epoch": 0.06550571490187622, + "grad_norm": 0.9624550938606262, + "learning_rate": 9.994413163543965e-06, + "loss": 0.9782, + "step": 1215 + }, + { + "epoch": 0.06555962907051974, + "grad_norm": 0.9458224773406982, + "learning_rate": 9.994403139689665e-06, + "loss": 0.8274, + "step": 1216 + }, + { + "epoch": 0.06561354323916326, + "grad_norm": 1.0417940616607666, + "learning_rate": 9.994393106856104e-06, + "loss": 0.9065, + "step": 1217 + }, + { + "epoch": 0.06566745740780677, + "grad_norm": 1.0225417613983154, + "learning_rate": 9.994383065043296e-06, + "loss": 0.8642, + "step": 1218 + }, + { + "epoch": 0.06572137157645029, + "grad_norm": 0.9015594720840454, + "learning_rate": 9.994373014251261e-06, + "loss": 0.8775, + "step": 1219 + }, + { + "epoch": 0.06577528574509381, + "grad_norm": 0.8473883271217346, + "learning_rate": 9.994362954480018e-06, + "loss": 0.8566, + "step": 1220 + }, + { + "epoch": 0.06582919991373733, + "grad_norm": 0.8571242690086365, + "learning_rate": 9.994352885729584e-06, + "loss": 0.8502, + "step": 1221 + }, + { + "epoch": 0.06588311408238084, + "grad_norm": 0.8793268799781799, + "learning_rate": 9.994342807999977e-06, + "loss": 0.9062, + "step": 1222 + }, + { + "epoch": 0.06593702825102436, + "grad_norm": 0.8866230249404907, + "learning_rate": 9.994332721291214e-06, + "loss": 0.9026, + "step": 1223 + }, + { + "epoch": 0.0659909424196679, + "grad_norm": 0.9135996103286743, + "learning_rate": 9.994322625603314e-06, + "loss": 0.8558, + "step": 1224 + }, + { + "epoch": 0.06604485658831141, + "grad_norm": 0.9904530048370361, + "learning_rate": 9.994312520936297e-06, + "loss": 0.8823, + "step": 1225 + }, + { + "epoch": 0.06609877075695493, + "grad_norm": 0.8590260148048401, + "learning_rate": 9.99430240729018e-06, + "loss": 0.8344, + "step": 1226 + }, + { + "epoch": 0.06615268492559845, + "grad_norm": 
1.1669397354125977, + "learning_rate": 9.99429228466498e-06, + "loss": 0.9459, + "step": 1227 + }, + { + "epoch": 0.06620659909424197, + "grad_norm": 0.9290857315063477, + "learning_rate": 9.994282153060715e-06, + "loss": 0.8723, + "step": 1228 + }, + { + "epoch": 0.06626051326288548, + "grad_norm": 0.9619696140289307, + "learning_rate": 9.994272012477405e-06, + "loss": 0.8986, + "step": 1229 + }, + { + "epoch": 0.066314427431529, + "grad_norm": 0.8312071561813354, + "learning_rate": 9.994261862915068e-06, + "loss": 0.7291, + "step": 1230 + }, + { + "epoch": 0.06636834160017252, + "grad_norm": 1.0099300146102905, + "learning_rate": 9.994251704373721e-06, + "loss": 0.8725, + "step": 1231 + }, + { + "epoch": 0.06642225576881604, + "grad_norm": 0.8522336483001709, + "learning_rate": 9.994241536853384e-06, + "loss": 0.8656, + "step": 1232 + }, + { + "epoch": 0.06647616993745957, + "grad_norm": 0.919360339641571, + "learning_rate": 9.994231360354074e-06, + "loss": 0.8854, + "step": 1233 + }, + { + "epoch": 0.06653008410610309, + "grad_norm": 0.8002495169639587, + "learning_rate": 9.994221174875809e-06, + "loss": 0.7879, + "step": 1234 + }, + { + "epoch": 0.0665839982747466, + "grad_norm": 0.9539757370948792, + "learning_rate": 9.994210980418607e-06, + "loss": 0.9027, + "step": 1235 + }, + { + "epoch": 0.06663791244339012, + "grad_norm": 0.9222649335861206, + "learning_rate": 9.99420077698249e-06, + "loss": 0.7611, + "step": 1236 + }, + { + "epoch": 0.06669182661203364, + "grad_norm": 0.8629900813102722, + "learning_rate": 9.994190564567472e-06, + "loss": 0.8122, + "step": 1237 + }, + { + "epoch": 0.06674574078067716, + "grad_norm": 0.8339203000068665, + "learning_rate": 9.994180343173574e-06, + "loss": 0.7873, + "step": 1238 + }, + { + "epoch": 0.06679965494932068, + "grad_norm": 0.8844656348228455, + "learning_rate": 9.994170112800812e-06, + "loss": 0.8176, + "step": 1239 + }, + { + "epoch": 0.0668535691179642, + "grad_norm": 1.0024579763412476, + "learning_rate": 
9.994159873449206e-06, + "loss": 0.844, + "step": 1240 + }, + { + "epoch": 0.06690748328660773, + "grad_norm": 0.8317261338233948, + "learning_rate": 9.994149625118774e-06, + "loss": 0.9103, + "step": 1241 + }, + { + "epoch": 0.06696139745525125, + "grad_norm": 0.8915300965309143, + "learning_rate": 9.994139367809534e-06, + "loss": 0.9084, + "step": 1242 + }, + { + "epoch": 0.06701531162389476, + "grad_norm": 0.9270803332328796, + "learning_rate": 9.994129101521506e-06, + "loss": 0.7634, + "step": 1243 + }, + { + "epoch": 0.06706922579253828, + "grad_norm": 0.9891652464866638, + "learning_rate": 9.994118826254708e-06, + "loss": 0.9776, + "step": 1244 + }, + { + "epoch": 0.0671231399611818, + "grad_norm": 0.7778229713439941, + "learning_rate": 9.994108542009156e-06, + "loss": 0.7481, + "step": 1245 + }, + { + "epoch": 0.06717705412982532, + "grad_norm": 0.8451201319694519, + "learning_rate": 9.994098248784872e-06, + "loss": 0.8012, + "step": 1246 + }, + { + "epoch": 0.06723096829846884, + "grad_norm": 0.8115825057029724, + "learning_rate": 9.994087946581873e-06, + "loss": 0.874, + "step": 1247 + }, + { + "epoch": 0.06728488246711235, + "grad_norm": 0.815934419631958, + "learning_rate": 9.994077635400175e-06, + "loss": 0.8114, + "step": 1248 + }, + { + "epoch": 0.06733879663575587, + "grad_norm": 1.1179388761520386, + "learning_rate": 9.9940673152398e-06, + "loss": 0.9078, + "step": 1249 + }, + { + "epoch": 0.0673927108043994, + "grad_norm": 0.9235454201698303, + "learning_rate": 9.994056986100767e-06, + "loss": 0.7511, + "step": 1250 + }, + { + "epoch": 0.06744662497304292, + "grad_norm": 0.8568270206451416, + "learning_rate": 9.994046647983093e-06, + "loss": 0.7805, + "step": 1251 + }, + { + "epoch": 0.06750053914168644, + "grad_norm": 1.1337388753890991, + "learning_rate": 9.994036300886796e-06, + "loss": 0.8835, + "step": 1252 + }, + { + "epoch": 0.06755445331032996, + "grad_norm": 0.9154239892959595, + "learning_rate": 9.994025944811896e-06, + "loss": 0.8804, + 
"step": 1253 + }, + { + "epoch": 0.06760836747897347, + "grad_norm": 0.8301606774330139, + "learning_rate": 9.99401557975841e-06, + "loss": 0.7905, + "step": 1254 + }, + { + "epoch": 0.06766228164761699, + "grad_norm": 0.9907017350196838, + "learning_rate": 9.994005205726358e-06, + "loss": 0.9091, + "step": 1255 + }, + { + "epoch": 0.06771619581626051, + "grad_norm": 0.8883876204490662, + "learning_rate": 9.993994822715758e-06, + "loss": 0.8815, + "step": 1256 + }, + { + "epoch": 0.06777010998490403, + "grad_norm": 0.9746614098548889, + "learning_rate": 9.993984430726627e-06, + "loss": 0.7897, + "step": 1257 + }, + { + "epoch": 0.06782402415354755, + "grad_norm": 0.9773344993591309, + "learning_rate": 9.993974029758988e-06, + "loss": 0.8499, + "step": 1258 + }, + { + "epoch": 0.06787793832219108, + "grad_norm": 0.9552164077758789, + "learning_rate": 9.993963619812856e-06, + "loss": 0.711, + "step": 1259 + }, + { + "epoch": 0.0679318524908346, + "grad_norm": 0.9146968126296997, + "learning_rate": 9.993953200888252e-06, + "loss": 0.9016, + "step": 1260 + }, + { + "epoch": 0.06798576665947811, + "grad_norm": 0.924244225025177, + "learning_rate": 9.993942772985192e-06, + "loss": 0.7534, + "step": 1261 + }, + { + "epoch": 0.06803968082812163, + "grad_norm": 1.2963265180587769, + "learning_rate": 9.993932336103699e-06, + "loss": 0.9409, + "step": 1262 + }, + { + "epoch": 0.06809359499676515, + "grad_norm": 0.7954462766647339, + "learning_rate": 9.993921890243788e-06, + "loss": 0.7669, + "step": 1263 + }, + { + "epoch": 0.06814750916540867, + "grad_norm": 0.9115849137306213, + "learning_rate": 9.993911435405478e-06, + "loss": 0.7567, + "step": 1264 + }, + { + "epoch": 0.06820142333405219, + "grad_norm": 1.0030237436294556, + "learning_rate": 9.99390097158879e-06, + "loss": 0.8952, + "step": 1265 + }, + { + "epoch": 0.0682553375026957, + "grad_norm": 0.8897690773010254, + "learning_rate": 9.993890498793742e-06, + "loss": 0.7993, + "step": 1266 + }, + { + "epoch": 
0.06830925167133922, + "grad_norm": 0.9283807277679443, + "learning_rate": 9.993880017020349e-06, + "loss": 0.8808, + "step": 1267 + }, + { + "epoch": 0.06836316583998275, + "grad_norm": 0.848922848701477, + "learning_rate": 9.993869526268637e-06, + "loss": 0.7979, + "step": 1268 + }, + { + "epoch": 0.06841708000862627, + "grad_norm": 0.8896105289459229, + "learning_rate": 9.993859026538618e-06, + "loss": 0.8886, + "step": 1269 + }, + { + "epoch": 0.06847099417726979, + "grad_norm": 0.8602685928344727, + "learning_rate": 9.993848517830318e-06, + "loss": 0.8209, + "step": 1270 + }, + { + "epoch": 0.06852490834591331, + "grad_norm": 0.9300077557563782, + "learning_rate": 9.99383800014375e-06, + "loss": 0.9261, + "step": 1271 + }, + { + "epoch": 0.06857882251455683, + "grad_norm": 0.8691270351409912, + "learning_rate": 9.993827473478934e-06, + "loss": 0.9217, + "step": 1272 + }, + { + "epoch": 0.06863273668320034, + "grad_norm": 0.7943814992904663, + "learning_rate": 9.99381693783589e-06, + "loss": 0.8557, + "step": 1273 + }, + { + "epoch": 0.06868665085184386, + "grad_norm": 0.9060125946998596, + "learning_rate": 9.993806393214638e-06, + "loss": 0.8314, + "step": 1274 + }, + { + "epoch": 0.06874056502048738, + "grad_norm": 0.8014434576034546, + "learning_rate": 9.993795839615194e-06, + "loss": 0.8047, + "step": 1275 + }, + { + "epoch": 0.0687944791891309, + "grad_norm": 1.0498815774917603, + "learning_rate": 9.993785277037578e-06, + "loss": 0.7125, + "step": 1276 + }, + { + "epoch": 0.06884839335777443, + "grad_norm": 0.8868438005447388, + "learning_rate": 9.993774705481812e-06, + "loss": 0.8594, + "step": 1277 + }, + { + "epoch": 0.06890230752641795, + "grad_norm": 0.8213896155357361, + "learning_rate": 9.993764124947911e-06, + "loss": 0.7995, + "step": 1278 + }, + { + "epoch": 0.06895622169506146, + "grad_norm": 0.9007741212844849, + "learning_rate": 9.993753535435895e-06, + "loss": 0.8982, + "step": 1279 + }, + { + "epoch": 0.06901013586370498, + "grad_norm": 
0.8377478122711182, + "learning_rate": 9.993742936945785e-06, + "loss": 0.7387, + "step": 1280 + }, + { + "epoch": 0.0690640500323485, + "grad_norm": 0.8009492754936218, + "learning_rate": 9.993732329477598e-06, + "loss": 0.8079, + "step": 1281 + }, + { + "epoch": 0.06911796420099202, + "grad_norm": 0.8478789925575256, + "learning_rate": 9.993721713031354e-06, + "loss": 0.8682, + "step": 1282 + }, + { + "epoch": 0.06917187836963554, + "grad_norm": 0.7498561143875122, + "learning_rate": 9.993711087607072e-06, + "loss": 0.8107, + "step": 1283 + }, + { + "epoch": 0.06922579253827905, + "grad_norm": 0.8972634077072144, + "learning_rate": 9.99370045320477e-06, + "loss": 0.8494, + "step": 1284 + }, + { + "epoch": 0.06927970670692257, + "grad_norm": 0.942449152469635, + "learning_rate": 9.99368980982447e-06, + "loss": 0.8487, + "step": 1285 + }, + { + "epoch": 0.0693336208755661, + "grad_norm": 0.8752795457839966, + "learning_rate": 9.993679157466188e-06, + "loss": 0.8859, + "step": 1286 + }, + { + "epoch": 0.06938753504420962, + "grad_norm": 0.8289507031440735, + "learning_rate": 9.993668496129945e-06, + "loss": 0.8726, + "step": 1287 + }, + { + "epoch": 0.06944144921285314, + "grad_norm": 0.9452151656150818, + "learning_rate": 9.993657825815759e-06, + "loss": 0.9266, + "step": 1288 + }, + { + "epoch": 0.06949536338149666, + "grad_norm": 0.8697348237037659, + "learning_rate": 9.993647146523651e-06, + "loss": 0.8946, + "step": 1289 + }, + { + "epoch": 0.06954927755014018, + "grad_norm": 0.8712061643600464, + "learning_rate": 9.993636458253637e-06, + "loss": 0.8551, + "step": 1290 + }, + { + "epoch": 0.0696031917187837, + "grad_norm": 0.9295617938041687, + "learning_rate": 9.993625761005739e-06, + "loss": 0.8963, + "step": 1291 + }, + { + "epoch": 0.06965710588742721, + "grad_norm": 0.9441055059432983, + "learning_rate": 9.993615054779975e-06, + "loss": 0.9567, + "step": 1292 + }, + { + "epoch": 0.06971102005607073, + "grad_norm": 0.8742032051086426, + "learning_rate": 
9.993604339576365e-06, + "loss": 0.8341, + "step": 1293 + }, + { + "epoch": 0.06976493422471426, + "grad_norm": 0.8596220016479492, + "learning_rate": 9.993593615394928e-06, + "loss": 0.8576, + "step": 1294 + }, + { + "epoch": 0.06981884839335778, + "grad_norm": 0.8011770844459534, + "learning_rate": 9.993582882235682e-06, + "loss": 0.7317, + "step": 1295 + }, + { + "epoch": 0.0698727625620013, + "grad_norm": 0.8578245043754578, + "learning_rate": 9.993572140098648e-06, + "loss": 0.8853, + "step": 1296 + }, + { + "epoch": 0.06992667673064482, + "grad_norm": 1.1155178546905518, + "learning_rate": 9.993561388983845e-06, + "loss": 0.8199, + "step": 1297 + }, + { + "epoch": 0.06998059089928833, + "grad_norm": 1.035699486732483, + "learning_rate": 9.993550628891293e-06, + "loss": 0.9498, + "step": 1298 + }, + { + "epoch": 0.07003450506793185, + "grad_norm": 0.8635748028755188, + "learning_rate": 9.99353985982101e-06, + "loss": 0.8741, + "step": 1299 + }, + { + "epoch": 0.07008841923657537, + "grad_norm": 0.8650850653648376, + "learning_rate": 9.993529081773016e-06, + "loss": 0.7337, + "step": 1300 + }, + { + "epoch": 0.07014233340521889, + "grad_norm": 0.8334539532661438, + "learning_rate": 9.99351829474733e-06, + "loss": 0.8927, + "step": 1301 + }, + { + "epoch": 0.0701962475738624, + "grad_norm": 0.9150926470756531, + "learning_rate": 9.993507498743971e-06, + "loss": 0.8464, + "step": 1302 + }, + { + "epoch": 0.07025016174250594, + "grad_norm": 0.8916522860527039, + "learning_rate": 9.993496693762958e-06, + "loss": 0.7899, + "step": 1303 + }, + { + "epoch": 0.07030407591114946, + "grad_norm": 1.0224976539611816, + "learning_rate": 9.993485879804314e-06, + "loss": 0.8256, + "step": 1304 + }, + { + "epoch": 0.07035799007979297, + "grad_norm": 0.921816885471344, + "learning_rate": 9.993475056868054e-06, + "loss": 0.7944, + "step": 1305 + }, + { + "epoch": 0.07041190424843649, + "grad_norm": 0.8775705099105835, + "learning_rate": 9.9934642249542e-06, + "loss": 0.9098, + 
"step": 1306 + }, + { + "epoch": 0.07046581841708001, + "grad_norm": 0.9802567362785339, + "learning_rate": 9.99345338406277e-06, + "loss": 0.9756, + "step": 1307 + }, + { + "epoch": 0.07051973258572353, + "grad_norm": 0.9785491228103638, + "learning_rate": 9.993442534193786e-06, + "loss": 1.0017, + "step": 1308 + }, + { + "epoch": 0.07057364675436704, + "grad_norm": 0.8796840906143188, + "learning_rate": 9.993431675347265e-06, + "loss": 0.7202, + "step": 1309 + }, + { + "epoch": 0.07062756092301056, + "grad_norm": 0.878099799156189, + "learning_rate": 9.993420807523227e-06, + "loss": 0.8655, + "step": 1310 + }, + { + "epoch": 0.07068147509165408, + "grad_norm": 0.8361509442329407, + "learning_rate": 9.99340993072169e-06, + "loss": 0.8522, + "step": 1311 + }, + { + "epoch": 0.07073538926029761, + "grad_norm": 0.8556873798370361, + "learning_rate": 9.99339904494268e-06, + "loss": 0.8603, + "step": 1312 + }, + { + "epoch": 0.07078930342894113, + "grad_norm": 0.8434461355209351, + "learning_rate": 9.993388150186208e-06, + "loss": 0.8571, + "step": 1313 + }, + { + "epoch": 0.07084321759758465, + "grad_norm": 0.8545907139778137, + "learning_rate": 9.9933772464523e-06, + "loss": 0.8145, + "step": 1314 + }, + { + "epoch": 0.07089713176622817, + "grad_norm": 0.9502561092376709, + "learning_rate": 9.993366333740971e-06, + "loss": 0.8068, + "step": 1315 + }, + { + "epoch": 0.07095104593487168, + "grad_norm": 0.848628580570221, + "learning_rate": 9.993355412052244e-06, + "loss": 0.8793, + "step": 1316 + }, + { + "epoch": 0.0710049601035152, + "grad_norm": 0.9699797630310059, + "learning_rate": 9.993344481386137e-06, + "loss": 0.9904, + "step": 1317 + }, + { + "epoch": 0.07105887427215872, + "grad_norm": 0.8888396620750427, + "learning_rate": 9.993333541742671e-06, + "loss": 0.8363, + "step": 1318 + }, + { + "epoch": 0.07111278844080224, + "grad_norm": 0.8805423974990845, + "learning_rate": 9.993322593121863e-06, + "loss": 0.8905, + "step": 1319 + }, + { + "epoch": 
0.07116670260944576, + "grad_norm": 0.8875272274017334, + "learning_rate": 9.993311635523736e-06, + "loss": 0.7717, + "step": 1320 + }, + { + "epoch": 0.07122061677808929, + "grad_norm": 0.8853299617767334, + "learning_rate": 9.993300668948308e-06, + "loss": 0.9077, + "step": 1321 + }, + { + "epoch": 0.0712745309467328, + "grad_norm": 0.8847644329071045, + "learning_rate": 9.993289693395599e-06, + "loss": 0.8362, + "step": 1322 + }, + { + "epoch": 0.07132844511537632, + "grad_norm": 0.9531683325767517, + "learning_rate": 9.993278708865629e-06, + "loss": 0.8848, + "step": 1323 + }, + { + "epoch": 0.07138235928401984, + "grad_norm": 0.8573325276374817, + "learning_rate": 9.993267715358414e-06, + "loss": 0.8367, + "step": 1324 + }, + { + "epoch": 0.07143627345266336, + "grad_norm": 0.8920298218727112, + "learning_rate": 9.99325671287398e-06, + "loss": 0.8838, + "step": 1325 + }, + { + "epoch": 0.07149018762130688, + "grad_norm": 0.8472782969474792, + "learning_rate": 9.993245701412343e-06, + "loss": 0.8313, + "step": 1326 + }, + { + "epoch": 0.0715441017899504, + "grad_norm": 1.047664761543274, + "learning_rate": 9.993234680973525e-06, + "loss": 0.8663, + "step": 1327 + }, + { + "epoch": 0.07159801595859391, + "grad_norm": 0.9395570158958435, + "learning_rate": 9.993223651557542e-06, + "loss": 0.7703, + "step": 1328 + }, + { + "epoch": 0.07165193012723743, + "grad_norm": 0.9125472903251648, + "learning_rate": 9.993212613164419e-06, + "loss": 0.9335, + "step": 1329 + }, + { + "epoch": 0.07170584429588096, + "grad_norm": 0.9043323397636414, + "learning_rate": 9.993201565794172e-06, + "loss": 0.9185, + "step": 1330 + }, + { + "epoch": 0.07175975846452448, + "grad_norm": 0.8764339089393616, + "learning_rate": 9.993190509446821e-06, + "loss": 0.8807, + "step": 1331 + }, + { + "epoch": 0.071813672633168, + "grad_norm": 0.9123268723487854, + "learning_rate": 9.99317944412239e-06, + "loss": 0.8134, + "step": 1332 + }, + { + "epoch": 0.07186758680181152, + "grad_norm": 
0.9625567197799683, + "learning_rate": 9.993168369820892e-06, + "loss": 0.8132, + "step": 1333 + }, + { + "epoch": 0.07192150097045504, + "grad_norm": 0.880536675453186, + "learning_rate": 9.993157286542352e-06, + "loss": 0.8107, + "step": 1334 + }, + { + "epoch": 0.07197541513909855, + "grad_norm": 0.9165224432945251, + "learning_rate": 9.99314619428679e-06, + "loss": 0.8376, + "step": 1335 + }, + { + "epoch": 0.07202932930774207, + "grad_norm": 0.8278066515922546, + "learning_rate": 9.993135093054223e-06, + "loss": 0.8075, + "step": 1336 + }, + { + "epoch": 0.07208324347638559, + "grad_norm": 0.9237795472145081, + "learning_rate": 9.993123982844674e-06, + "loss": 0.7838, + "step": 1337 + }, + { + "epoch": 0.0721371576450291, + "grad_norm": 0.8200939297676086, + "learning_rate": 9.993112863658161e-06, + "loss": 0.8475, + "step": 1338 + }, + { + "epoch": 0.07219107181367264, + "grad_norm": 0.8505958318710327, + "learning_rate": 9.993101735494704e-06, + "loss": 0.7891, + "step": 1339 + }, + { + "epoch": 0.07224498598231616, + "grad_norm": 0.8407264351844788, + "learning_rate": 9.993090598354323e-06, + "loss": 0.8128, + "step": 1340 + }, + { + "epoch": 0.07229890015095967, + "grad_norm": 0.8039887547492981, + "learning_rate": 9.993079452237038e-06, + "loss": 0.8504, + "step": 1341 + }, + { + "epoch": 0.07235281431960319, + "grad_norm": 0.7590643167495728, + "learning_rate": 9.993068297142871e-06, + "loss": 0.7402, + "step": 1342 + }, + { + "epoch": 0.07240672848824671, + "grad_norm": 0.7866249680519104, + "learning_rate": 9.993057133071842e-06, + "loss": 0.7076, + "step": 1343 + }, + { + "epoch": 0.07246064265689023, + "grad_norm": 0.9846029281616211, + "learning_rate": 9.993045960023967e-06, + "loss": 0.9179, + "step": 1344 + }, + { + "epoch": 0.07251455682553375, + "grad_norm": 0.8918319940567017, + "learning_rate": 9.99303477799927e-06, + "loss": 0.8087, + "step": 1345 + }, + { + "epoch": 0.07256847099417726, + "grad_norm": 0.8407700061798096, + "learning_rate": 
9.99302358699777e-06, + "loss": 0.7272, + "step": 1346 + }, + { + "epoch": 0.0726223851628208, + "grad_norm": 0.9637326598167419, + "learning_rate": 9.993012387019486e-06, + "loss": 0.8613, + "step": 1347 + }, + { + "epoch": 0.07267629933146431, + "grad_norm": 0.8362317681312561, + "learning_rate": 9.99300117806444e-06, + "loss": 0.917, + "step": 1348 + }, + { + "epoch": 0.07273021350010783, + "grad_norm": 0.8584982752799988, + "learning_rate": 9.992989960132651e-06, + "loss": 0.8857, + "step": 1349 + }, + { + "epoch": 0.07278412766875135, + "grad_norm": 0.8341198563575745, + "learning_rate": 9.992978733224139e-06, + "loss": 0.802, + "step": 1350 + }, + { + "epoch": 0.07283804183739487, + "grad_norm": 1.6860167980194092, + "learning_rate": 9.992967497338926e-06, + "loss": 0.8789, + "step": 1351 + }, + { + "epoch": 0.07289195600603839, + "grad_norm": 0.8399189114570618, + "learning_rate": 9.99295625247703e-06, + "loss": 0.6338, + "step": 1352 + }, + { + "epoch": 0.0729458701746819, + "grad_norm": 0.9616976976394653, + "learning_rate": 9.992944998638473e-06, + "loss": 0.9735, + "step": 1353 + }, + { + "epoch": 0.07299978434332542, + "grad_norm": 0.8592861890792847, + "learning_rate": 9.992933735823272e-06, + "loss": 0.8159, + "step": 1354 + }, + { + "epoch": 0.07305369851196894, + "grad_norm": 0.8448725342750549, + "learning_rate": 9.992922464031451e-06, + "loss": 0.7942, + "step": 1355 + }, + { + "epoch": 0.07310761268061247, + "grad_norm": 0.8015927672386169, + "learning_rate": 9.99291118326303e-06, + "loss": 0.7429, + "step": 1356 + }, + { + "epoch": 0.07316152684925599, + "grad_norm": 0.8255912065505981, + "learning_rate": 9.992899893518025e-06, + "loss": 0.8532, + "step": 1357 + }, + { + "epoch": 0.07321544101789951, + "grad_norm": 0.8764085173606873, + "learning_rate": 9.992888594796462e-06, + "loss": 0.7989, + "step": 1358 + }, + { + "epoch": 0.07326935518654303, + "grad_norm": 0.8405522704124451, + "learning_rate": 9.992877287098357e-06, + "loss": 0.8709, + 
"step": 1359 + }, + { + "epoch": 0.07332326935518654, + "grad_norm": 0.8657836318016052, + "learning_rate": 9.992865970423733e-06, + "loss": 0.8236, + "step": 1360 + }, + { + "epoch": 0.07337718352383006, + "grad_norm": 0.8817959427833557, + "learning_rate": 9.992854644772609e-06, + "loss": 0.902, + "step": 1361 + }, + { + "epoch": 0.07343109769247358, + "grad_norm": 0.8290701508522034, + "learning_rate": 9.992843310145006e-06, + "loss": 0.8454, + "step": 1362 + }, + { + "epoch": 0.0734850118611171, + "grad_norm": 0.9637642502784729, + "learning_rate": 9.992831966540946e-06, + "loss": 0.9414, + "step": 1363 + }, + { + "epoch": 0.07353892602976062, + "grad_norm": 0.9220197200775146, + "learning_rate": 9.992820613960446e-06, + "loss": 0.9827, + "step": 1364 + }, + { + "epoch": 0.07359284019840415, + "grad_norm": 0.9008362889289856, + "learning_rate": 9.992809252403526e-06, + "loss": 0.8388, + "step": 1365 + }, + { + "epoch": 0.07364675436704766, + "grad_norm": 0.9517331123352051, + "learning_rate": 9.992797881870212e-06, + "loss": 0.8758, + "step": 1366 + }, + { + "epoch": 0.07370066853569118, + "grad_norm": 0.7811571359634399, + "learning_rate": 9.992786502360517e-06, + "loss": 0.6984, + "step": 1367 + }, + { + "epoch": 0.0737545827043347, + "grad_norm": 0.9887184500694275, + "learning_rate": 9.992775113874466e-06, + "loss": 0.7832, + "step": 1368 + }, + { + "epoch": 0.07380849687297822, + "grad_norm": 1.025869607925415, + "learning_rate": 9.99276371641208e-06, + "loss": 0.8417, + "step": 1369 + }, + { + "epoch": 0.07386241104162174, + "grad_norm": 0.8479165434837341, + "learning_rate": 9.99275230997338e-06, + "loss": 0.7862, + "step": 1370 + }, + { + "epoch": 0.07391632521026525, + "grad_norm": 0.9213555455207825, + "learning_rate": 9.992740894558381e-06, + "loss": 0.915, + "step": 1371 + }, + { + "epoch": 0.07397023937890877, + "grad_norm": 0.832306444644928, + "learning_rate": 9.992729470167109e-06, + "loss": 0.7566, + "step": 1372 + }, + { + "epoch": 
0.07402415354755229, + "grad_norm": 1.0360348224639893, + "learning_rate": 9.992718036799583e-06, + "loss": 0.9096, + "step": 1373 + }, + { + "epoch": 0.07407806771619582, + "grad_norm": 0.8898483514785767, + "learning_rate": 9.992706594455823e-06, + "loss": 0.8738, + "step": 1374 + }, + { + "epoch": 0.07413198188483934, + "grad_norm": 0.8813758492469788, + "learning_rate": 9.992695143135849e-06, + "loss": 0.8736, + "step": 1375 + }, + { + "epoch": 0.07418589605348286, + "grad_norm": 1.1480571031570435, + "learning_rate": 9.992683682839683e-06, + "loss": 0.915, + "step": 1376 + }, + { + "epoch": 0.07423981022212638, + "grad_norm": 0.8588376641273499, + "learning_rate": 9.992672213567345e-06, + "loss": 0.8295, + "step": 1377 + }, + { + "epoch": 0.0742937243907699, + "grad_norm": 0.8729918599128723, + "learning_rate": 9.992660735318858e-06, + "loss": 0.9058, + "step": 1378 + }, + { + "epoch": 0.07434763855941341, + "grad_norm": 0.7953224778175354, + "learning_rate": 9.992649248094236e-06, + "loss": 0.7857, + "step": 1379 + }, + { + "epoch": 0.07440155272805693, + "grad_norm": 0.8485717177391052, + "learning_rate": 9.992637751893508e-06, + "loss": 0.7641, + "step": 1380 + }, + { + "epoch": 0.07445546689670045, + "grad_norm": 0.8630878329277039, + "learning_rate": 9.99262624671669e-06, + "loss": 0.8624, + "step": 1381 + }, + { + "epoch": 0.07450938106534397, + "grad_norm": 0.8655185103416443, + "learning_rate": 9.992614732563802e-06, + "loss": 0.8428, + "step": 1382 + }, + { + "epoch": 0.0745632952339875, + "grad_norm": 0.7875732779502869, + "learning_rate": 9.992603209434868e-06, + "loss": 0.7272, + "step": 1383 + }, + { + "epoch": 0.07461720940263102, + "grad_norm": 0.875879168510437, + "learning_rate": 9.992591677329905e-06, + "loss": 0.8539, + "step": 1384 + }, + { + "epoch": 0.07467112357127453, + "grad_norm": 0.8618319034576416, + "learning_rate": 9.992580136248934e-06, + "loss": 0.879, + "step": 1385 + }, + { + "epoch": 0.07472503773991805, + "grad_norm": 
0.8695591688156128, + "learning_rate": 9.992568586191981e-06, + "loss": 0.8477, + "step": 1386 + }, + { + "epoch": 0.07477895190856157, + "grad_norm": 0.8539825677871704, + "learning_rate": 9.992557027159062e-06, + "loss": 0.7347, + "step": 1387 + }, + { + "epoch": 0.07483286607720509, + "grad_norm": 0.9625217914581299, + "learning_rate": 9.992545459150197e-06, + "loss": 0.8561, + "step": 1388 + }, + { + "epoch": 0.0748867802458486, + "grad_norm": 0.9862298369407654, + "learning_rate": 9.992533882165409e-06, + "loss": 0.9583, + "step": 1389 + }, + { + "epoch": 0.07494069441449212, + "grad_norm": 0.8217719793319702, + "learning_rate": 9.99252229620472e-06, + "loss": 0.7995, + "step": 1390 + }, + { + "epoch": 0.07499460858313564, + "grad_norm": 0.8668621182441711, + "learning_rate": 9.992510701268147e-06, + "loss": 0.8484, + "step": 1391 + }, + { + "epoch": 0.07504852275177917, + "grad_norm": 0.8549453616142273, + "learning_rate": 9.992499097355716e-06, + "loss": 0.8552, + "step": 1392 + }, + { + "epoch": 0.07510243692042269, + "grad_norm": 0.8262618184089661, + "learning_rate": 9.992487484467444e-06, + "loss": 0.7054, + "step": 1393 + }, + { + "epoch": 0.07515635108906621, + "grad_norm": 0.8524961471557617, + "learning_rate": 9.992475862603352e-06, + "loss": 0.8231, + "step": 1394 + }, + { + "epoch": 0.07521026525770973, + "grad_norm": 0.7805570363998413, + "learning_rate": 9.99246423176346e-06, + "loss": 0.7778, + "step": 1395 + }, + { + "epoch": 0.07526417942635324, + "grad_norm": 0.950484037399292, + "learning_rate": 9.992452591947794e-06, + "loss": 0.8662, + "step": 1396 + }, + { + "epoch": 0.07531809359499676, + "grad_norm": 0.8746458888053894, + "learning_rate": 9.99244094315637e-06, + "loss": 0.7854, + "step": 1397 + }, + { + "epoch": 0.07537200776364028, + "grad_norm": 0.9450538754463196, + "learning_rate": 9.992429285389212e-06, + "loss": 0.954, + "step": 1398 + }, + { + "epoch": 0.0754259219322838, + "grad_norm": 0.9048300385475159, + "learning_rate": 
9.992417618646337e-06, + "loss": 0.8915, + "step": 1399 + }, + { + "epoch": 0.07547983610092733, + "grad_norm": 0.8735381364822388, + "learning_rate": 9.99240594292777e-06, + "loss": 0.8391, + "step": 1400 + }, + { + "epoch": 0.07553375026957085, + "grad_norm": 1.0980675220489502, + "learning_rate": 9.99239425823353e-06, + "loss": 0.8892, + "step": 1401 + }, + { + "epoch": 0.07558766443821437, + "grad_norm": 0.9016425013542175, + "learning_rate": 9.992382564563638e-06, + "loss": 0.8192, + "step": 1402 + }, + { + "epoch": 0.07564157860685788, + "grad_norm": 0.801419198513031, + "learning_rate": 9.992370861918117e-06, + "loss": 0.7914, + "step": 1403 + }, + { + "epoch": 0.0756954927755014, + "grad_norm": 0.9043407440185547, + "learning_rate": 9.992359150296985e-06, + "loss": 0.8767, + "step": 1404 + }, + { + "epoch": 0.07574940694414492, + "grad_norm": 0.9703086018562317, + "learning_rate": 9.992347429700266e-06, + "loss": 0.9173, + "step": 1405 + }, + { + "epoch": 0.07580332111278844, + "grad_norm": 0.8154104351997375, + "learning_rate": 9.992335700127978e-06, + "loss": 0.8453, + "step": 1406 + }, + { + "epoch": 0.07585723528143196, + "grad_norm": 0.8551482558250427, + "learning_rate": 9.992323961580146e-06, + "loss": 0.9132, + "step": 1407 + }, + { + "epoch": 0.07591114945007547, + "grad_norm": 0.9425063729286194, + "learning_rate": 9.992312214056785e-06, + "loss": 0.8171, + "step": 1408 + }, + { + "epoch": 0.075965063618719, + "grad_norm": 0.8958794474601746, + "learning_rate": 9.992300457557922e-06, + "loss": 0.7983, + "step": 1409 + }, + { + "epoch": 0.07601897778736252, + "grad_norm": 0.873874843120575, + "learning_rate": 9.992288692083579e-06, + "loss": 0.798, + "step": 1410 + }, + { + "epoch": 0.07607289195600604, + "grad_norm": 0.7951189279556274, + "learning_rate": 9.99227691763377e-06, + "loss": 0.8671, + "step": 1411 + }, + { + "epoch": 0.07612680612464956, + "grad_norm": 0.8073802590370178, + "learning_rate": 9.992265134208522e-06, + "loss": 0.8214, + 
"step": 1412 + }, + { + "epoch": 0.07618072029329308, + "grad_norm": 0.918222188949585, + "learning_rate": 9.992253341807854e-06, + "loss": 0.807, + "step": 1413 + }, + { + "epoch": 0.0762346344619366, + "grad_norm": 0.834381103515625, + "learning_rate": 9.992241540431789e-06, + "loss": 0.8737, + "step": 1414 + }, + { + "epoch": 0.07628854863058011, + "grad_norm": 0.808437168598175, + "learning_rate": 9.992229730080347e-06, + "loss": 0.7982, + "step": 1415 + }, + { + "epoch": 0.07634246279922363, + "grad_norm": 0.7868708968162537, + "learning_rate": 9.992217910753547e-06, + "loss": 0.7071, + "step": 1416 + }, + { + "epoch": 0.07639637696786715, + "grad_norm": 0.8445919156074524, + "learning_rate": 9.992206082451416e-06, + "loss": 0.8353, + "step": 1417 + }, + { + "epoch": 0.07645029113651068, + "grad_norm": 0.8283419609069824, + "learning_rate": 9.992194245173969e-06, + "loss": 0.867, + "step": 1418 + }, + { + "epoch": 0.0765042053051542, + "grad_norm": 0.8390635251998901, + "learning_rate": 9.99218239892123e-06, + "loss": 0.822, + "step": 1419 + }, + { + "epoch": 0.07655811947379772, + "grad_norm": 0.9037001132965088, + "learning_rate": 9.992170543693222e-06, + "loss": 0.8759, + "step": 1420 + }, + { + "epoch": 0.07661203364244124, + "grad_norm": 0.9708169102668762, + "learning_rate": 9.992158679489965e-06, + "loss": 0.875, + "step": 1421 + }, + { + "epoch": 0.07666594781108475, + "grad_norm": 0.8712205290794373, + "learning_rate": 9.992146806311479e-06, + "loss": 0.8711, + "step": 1422 + }, + { + "epoch": 0.07671986197972827, + "grad_norm": 0.953936755657196, + "learning_rate": 9.992134924157786e-06, + "loss": 0.8117, + "step": 1423 + }, + { + "epoch": 0.07677377614837179, + "grad_norm": 1.3178669214248657, + "learning_rate": 9.992123033028908e-06, + "loss": 0.8932, + "step": 1424 + }, + { + "epoch": 0.0768276903170153, + "grad_norm": 0.8657799959182739, + "learning_rate": 9.992111132924867e-06, + "loss": 0.8429, + "step": 1425 + }, + { + "epoch": 
0.07688160448565882, + "grad_norm": 0.8979378938674927, + "learning_rate": 9.992099223845681e-06, + "loss": 0.9165, + "step": 1426 + }, + { + "epoch": 0.07693551865430236, + "grad_norm": 0.797493040561676, + "learning_rate": 9.992087305791376e-06, + "loss": 0.8139, + "step": 1427 + }, + { + "epoch": 0.07698943282294587, + "grad_norm": 0.9762497544288635, + "learning_rate": 9.99207537876197e-06, + "loss": 0.8006, + "step": 1428 + }, + { + "epoch": 0.07704334699158939, + "grad_norm": 0.9322238564491272, + "learning_rate": 9.992063442757487e-06, + "loss": 0.8708, + "step": 1429 + }, + { + "epoch": 0.07709726116023291, + "grad_norm": 0.9208402037620544, + "learning_rate": 9.992051497777947e-06, + "loss": 0.9137, + "step": 1430 + }, + { + "epoch": 0.07715117532887643, + "grad_norm": 0.9262849688529968, + "learning_rate": 9.99203954382337e-06, + "loss": 0.8043, + "step": 1431 + }, + { + "epoch": 0.07720508949751995, + "grad_norm": 1.0556507110595703, + "learning_rate": 9.992027580893781e-06, + "loss": 0.8321, + "step": 1432 + }, + { + "epoch": 0.07725900366616346, + "grad_norm": 1.0503417253494263, + "learning_rate": 9.9920156089892e-06, + "loss": 0.8875, + "step": 1433 + }, + { + "epoch": 0.07731291783480698, + "grad_norm": 0.8772387504577637, + "learning_rate": 9.992003628109647e-06, + "loss": 0.7407, + "step": 1434 + }, + { + "epoch": 0.0773668320034505, + "grad_norm": 0.942286491394043, + "learning_rate": 9.991991638255146e-06, + "loss": 0.8493, + "step": 1435 + }, + { + "epoch": 0.07742074617209403, + "grad_norm": 0.8584794998168945, + "learning_rate": 9.991979639425717e-06, + "loss": 0.8003, + "step": 1436 + }, + { + "epoch": 0.07747466034073755, + "grad_norm": 0.8247780203819275, + "learning_rate": 9.99196763162138e-06, + "loss": 0.9156, + "step": 1437 + }, + { + "epoch": 0.07752857450938107, + "grad_norm": 0.859018862247467, + "learning_rate": 9.99195561484216e-06, + "loss": 0.8255, + "step": 1438 + }, + { + "epoch": 0.07758248867802459, + "grad_norm": 
0.9073282480239868, + "learning_rate": 9.991943589088078e-06, + "loss": 0.903, + "step": 1439 + }, + { + "epoch": 0.0776364028466681, + "grad_norm": 0.9324385523796082, + "learning_rate": 9.991931554359154e-06, + "loss": 0.8618, + "step": 1440 + }, + { + "epoch": 0.07769031701531162, + "grad_norm": 0.8038938045501709, + "learning_rate": 9.991919510655409e-06, + "loss": 0.7545, + "step": 1441 + }, + { + "epoch": 0.07774423118395514, + "grad_norm": 0.7999526858329773, + "learning_rate": 9.991907457976866e-06, + "loss": 0.6804, + "step": 1442 + }, + { + "epoch": 0.07779814535259866, + "grad_norm": 1.0165048837661743, + "learning_rate": 9.991895396323548e-06, + "loss": 0.7664, + "step": 1443 + }, + { + "epoch": 0.07785205952124218, + "grad_norm": 0.9513073563575745, + "learning_rate": 9.991883325695475e-06, + "loss": 0.8115, + "step": 1444 + }, + { + "epoch": 0.07790597368988571, + "grad_norm": 1.0391769409179688, + "learning_rate": 9.991871246092669e-06, + "loss": 0.9197, + "step": 1445 + }, + { + "epoch": 0.07795988785852923, + "grad_norm": 0.8990768194198608, + "learning_rate": 9.991859157515151e-06, + "loss": 0.9507, + "step": 1446 + }, + { + "epoch": 0.07801380202717274, + "grad_norm": 0.9990912079811096, + "learning_rate": 9.991847059962945e-06, + "loss": 0.7951, + "step": 1447 + }, + { + "epoch": 0.07806771619581626, + "grad_norm": 1.0030032396316528, + "learning_rate": 9.99183495343607e-06, + "loss": 0.7237, + "step": 1448 + }, + { + "epoch": 0.07812163036445978, + "grad_norm": 0.889561116695404, + "learning_rate": 9.991822837934551e-06, + "loss": 0.9061, + "step": 1449 + }, + { + "epoch": 0.0781755445331033, + "grad_norm": 0.8766982555389404, + "learning_rate": 9.991810713458405e-06, + "loss": 0.7952, + "step": 1450 + }, + { + "epoch": 0.07822945870174682, + "grad_norm": 0.9144406914710999, + "learning_rate": 9.991798580007658e-06, + "loss": 0.9235, + "step": 1451 + }, + { + "epoch": 0.07828337287039033, + "grad_norm": 0.895516037940979, + "learning_rate": 
9.99178643758233e-06, + "loss": 0.9469, + "step": 1452 + }, + { + "epoch": 0.07833728703903386, + "grad_norm": 0.8802943229675293, + "learning_rate": 9.991774286182443e-06, + "loss": 0.8548, + "step": 1453 + }, + { + "epoch": 0.07839120120767738, + "grad_norm": 1.2773913145065308, + "learning_rate": 9.99176212580802e-06, + "loss": 0.794, + "step": 1454 + }, + { + "epoch": 0.0784451153763209, + "grad_norm": 0.9501168131828308, + "learning_rate": 9.99174995645908e-06, + "loss": 0.8711, + "step": 1455 + }, + { + "epoch": 0.07849902954496442, + "grad_norm": 0.9047390222549438, + "learning_rate": 9.991737778135649e-06, + "loss": 0.8419, + "step": 1456 + }, + { + "epoch": 0.07855294371360794, + "grad_norm": 0.9492837190628052, + "learning_rate": 9.991725590837747e-06, + "loss": 0.9832, + "step": 1457 + }, + { + "epoch": 0.07860685788225145, + "grad_norm": 0.9585106372833252, + "learning_rate": 9.991713394565394e-06, + "loss": 0.8393, + "step": 1458 + }, + { + "epoch": 0.07866077205089497, + "grad_norm": 0.9568297266960144, + "learning_rate": 9.991701189318615e-06, + "loss": 0.8711, + "step": 1459 + }, + { + "epoch": 0.07871468621953849, + "grad_norm": 0.9201347231864929, + "learning_rate": 9.991688975097429e-06, + "loss": 0.7947, + "step": 1460 + }, + { + "epoch": 0.07876860038818201, + "grad_norm": 0.8375768661499023, + "learning_rate": 9.99167675190186e-06, + "loss": 0.8051, + "step": 1461 + }, + { + "epoch": 0.07882251455682554, + "grad_norm": 0.8397765755653381, + "learning_rate": 9.99166451973193e-06, + "loss": 0.7727, + "step": 1462 + }, + { + "epoch": 0.07887642872546906, + "grad_norm": 0.8697947859764099, + "learning_rate": 9.99165227858766e-06, + "loss": 0.8171, + "step": 1463 + }, + { + "epoch": 0.07893034289411258, + "grad_norm": 0.8894750475883484, + "learning_rate": 9.991640028469073e-06, + "loss": 0.8773, + "step": 1464 + }, + { + "epoch": 0.0789842570627561, + "grad_norm": 0.8817871809005737, + "learning_rate": 9.991627769376189e-06, + "loss": 0.8983, + 
"step": 1465 + }, + { + "epoch": 0.07903817123139961, + "grad_norm": 0.9241123795509338, + "learning_rate": 9.99161550130903e-06, + "loss": 0.8967, + "step": 1466 + }, + { + "epoch": 0.07909208540004313, + "grad_norm": 0.852982223033905, + "learning_rate": 9.991603224267623e-06, + "loss": 0.9054, + "step": 1467 + }, + { + "epoch": 0.07914599956868665, + "grad_norm": 0.7719098925590515, + "learning_rate": 9.991590938251986e-06, + "loss": 0.7845, + "step": 1468 + }, + { + "epoch": 0.07919991373733017, + "grad_norm": 0.8700329661369324, + "learning_rate": 9.99157864326214e-06, + "loss": 0.9664, + "step": 1469 + }, + { + "epoch": 0.07925382790597368, + "grad_norm": 0.880553126335144, + "learning_rate": 9.991566339298112e-06, + "loss": 0.8803, + "step": 1470 + }, + { + "epoch": 0.07930774207461722, + "grad_norm": 0.9425762295722961, + "learning_rate": 9.991554026359918e-06, + "loss": 0.8259, + "step": 1471 + }, + { + "epoch": 0.07936165624326073, + "grad_norm": 0.8611294031143188, + "learning_rate": 9.991541704447585e-06, + "loss": 0.8693, + "step": 1472 + }, + { + "epoch": 0.07941557041190425, + "grad_norm": 0.856023907661438, + "learning_rate": 9.99152937356113e-06, + "loss": 0.7073, + "step": 1473 + }, + { + "epoch": 0.07946948458054777, + "grad_norm": 0.7763693332672119, + "learning_rate": 9.991517033700582e-06, + "loss": 0.6815, + "step": 1474 + }, + { + "epoch": 0.07952339874919129, + "grad_norm": 0.8417321443557739, + "learning_rate": 9.991504684865959e-06, + "loss": 0.8239, + "step": 1475 + }, + { + "epoch": 0.0795773129178348, + "grad_norm": 0.9151323437690735, + "learning_rate": 9.991492327057282e-06, + "loss": 0.8327, + "step": 1476 + }, + { + "epoch": 0.07963122708647832, + "grad_norm": 0.8285405039787292, + "learning_rate": 9.991479960274576e-06, + "loss": 0.8623, + "step": 1477 + }, + { + "epoch": 0.07968514125512184, + "grad_norm": 0.8204792141914368, + "learning_rate": 9.991467584517863e-06, + "loss": 0.8494, + "step": 1478 + }, + { + "epoch": 
0.07973905542376536, + "grad_norm": 0.8516230583190918, + "learning_rate": 9.991455199787164e-06, + "loss": 0.8219, + "step": 1479 + }, + { + "epoch": 0.07979296959240889, + "grad_norm": 0.9418333172798157, + "learning_rate": 9.991442806082501e-06, + "loss": 0.9293, + "step": 1480 + }, + { + "epoch": 0.07984688376105241, + "grad_norm": 0.8852763175964355, + "learning_rate": 9.991430403403898e-06, + "loss": 0.8124, + "step": 1481 + }, + { + "epoch": 0.07990079792969593, + "grad_norm": 0.8435791730880737, + "learning_rate": 9.991417991751376e-06, + "loss": 0.8634, + "step": 1482 + }, + { + "epoch": 0.07995471209833944, + "grad_norm": 0.7795083522796631, + "learning_rate": 9.991405571124957e-06, + "loss": 0.802, + "step": 1483 + }, + { + "epoch": 0.08000862626698296, + "grad_norm": 0.8102303743362427, + "learning_rate": 9.991393141524663e-06, + "loss": 0.7492, + "step": 1484 + }, + { + "epoch": 0.08006254043562648, + "grad_norm": 0.8433593511581421, + "learning_rate": 9.99138070295052e-06, + "loss": 0.7926, + "step": 1485 + }, + { + "epoch": 0.08011645460427, + "grad_norm": 0.8992267847061157, + "learning_rate": 9.991368255402546e-06, + "loss": 0.7859, + "step": 1486 + }, + { + "epoch": 0.08017036877291352, + "grad_norm": 0.8748059868812561, + "learning_rate": 9.991355798880765e-06, + "loss": 0.8245, + "step": 1487 + }, + { + "epoch": 0.08022428294155703, + "grad_norm": 0.8456832766532898, + "learning_rate": 9.9913433333852e-06, + "loss": 0.9009, + "step": 1488 + }, + { + "epoch": 0.08027819711020057, + "grad_norm": 0.8582474589347839, + "learning_rate": 9.991330858915873e-06, + "loss": 0.7607, + "step": 1489 + }, + { + "epoch": 0.08033211127884408, + "grad_norm": 0.8157060146331787, + "learning_rate": 9.991318375472807e-06, + "loss": 0.8426, + "step": 1490 + }, + { + "epoch": 0.0803860254474876, + "grad_norm": 0.7474784851074219, + "learning_rate": 9.991305883056021e-06, + "loss": 0.8014, + "step": 1491 + }, + { + "epoch": 0.08043993961613112, + "grad_norm": 
0.8432475924491882, + "learning_rate": 9.991293381665543e-06, + "loss": 0.8254, + "step": 1492 + }, + { + "epoch": 0.08049385378477464, + "grad_norm": 0.8733057379722595, + "learning_rate": 9.991280871301392e-06, + "loss": 0.8694, + "step": 1493 + }, + { + "epoch": 0.08054776795341816, + "grad_norm": 0.8694074153900146, + "learning_rate": 9.991268351963592e-06, + "loss": 0.7306, + "step": 1494 + }, + { + "epoch": 0.08060168212206167, + "grad_norm": 0.8981258869171143, + "learning_rate": 9.991255823652162e-06, + "loss": 0.7821, + "step": 1495 + }, + { + "epoch": 0.08065559629070519, + "grad_norm": 0.9740719795227051, + "learning_rate": 9.99124328636713e-06, + "loss": 0.7678, + "step": 1496 + }, + { + "epoch": 0.08070951045934871, + "grad_norm": 0.8847763538360596, + "learning_rate": 9.991230740108515e-06, + "loss": 0.73, + "step": 1497 + }, + { + "epoch": 0.08076342462799224, + "grad_norm": 0.8909339308738708, + "learning_rate": 9.99121818487634e-06, + "loss": 0.7713, + "step": 1498 + }, + { + "epoch": 0.08081733879663576, + "grad_norm": 0.8183975219726562, + "learning_rate": 9.991205620670626e-06, + "loss": 0.8234, + "step": 1499 + }, + { + "epoch": 0.08087125296527928, + "grad_norm": 1.241355299949646, + "learning_rate": 9.991193047491399e-06, + "loss": 0.8135, + "step": 1500 + }, + { + "epoch": 0.0809251671339228, + "grad_norm": 0.9039500951766968, + "learning_rate": 9.991180465338682e-06, + "loss": 0.8642, + "step": 1501 + }, + { + "epoch": 0.08097908130256631, + "grad_norm": 1.1762068271636963, + "learning_rate": 9.991167874212493e-06, + "loss": 0.7892, + "step": 1502 + }, + { + "epoch": 0.08103299547120983, + "grad_norm": 0.8402833938598633, + "learning_rate": 9.991155274112857e-06, + "loss": 0.9054, + "step": 1503 + }, + { + "epoch": 0.08108690963985335, + "grad_norm": 0.9271976351737976, + "learning_rate": 9.991142665039799e-06, + "loss": 0.8902, + "step": 1504 + }, + { + "epoch": 0.08114082380849687, + "grad_norm": 0.9105845093727112, + "learning_rate": 
9.991130046993337e-06, + "loss": 0.8522, + "step": 1505 + }, + { + "epoch": 0.0811947379771404, + "grad_norm": 0.8248290419578552, + "learning_rate": 9.991117419973499e-06, + "loss": 0.882, + "step": 1506 + }, + { + "epoch": 0.08124865214578392, + "grad_norm": 1.0726820230484009, + "learning_rate": 9.991104783980305e-06, + "loss": 0.8001, + "step": 1507 + }, + { + "epoch": 0.08130256631442744, + "grad_norm": 1.296281337738037, + "learning_rate": 9.991092139013776e-06, + "loss": 1.0022, + "step": 1508 + }, + { + "epoch": 0.08135648048307095, + "grad_norm": 1.7287628650665283, + "learning_rate": 9.991079485073938e-06, + "loss": 0.914, + "step": 1509 + }, + { + "epoch": 0.08141039465171447, + "grad_norm": 0.8731694221496582, + "learning_rate": 9.991066822160813e-06, + "loss": 0.8672, + "step": 1510 + }, + { + "epoch": 0.08146430882035799, + "grad_norm": 0.875747799873352, + "learning_rate": 9.99105415027442e-06, + "loss": 0.8044, + "step": 1511 + }, + { + "epoch": 0.08151822298900151, + "grad_norm": 0.9055120348930359, + "learning_rate": 9.991041469414787e-06, + "loss": 0.8312, + "step": 1512 + }, + { + "epoch": 0.08157213715764502, + "grad_norm": 0.8849499821662903, + "learning_rate": 9.991028779581935e-06, + "loss": 0.889, + "step": 1513 + }, + { + "epoch": 0.08162605132628854, + "grad_norm": 0.9549855589866638, + "learning_rate": 9.991016080775884e-06, + "loss": 0.8929, + "step": 1514 + }, + { + "epoch": 0.08167996549493207, + "grad_norm": 0.8395527005195618, + "learning_rate": 9.991003372996662e-06, + "loss": 0.6774, + "step": 1515 + }, + { + "epoch": 0.08173387966357559, + "grad_norm": 0.7791672945022583, + "learning_rate": 9.990990656244287e-06, + "loss": 0.7178, + "step": 1516 + }, + { + "epoch": 0.08178779383221911, + "grad_norm": 0.91841721534729, + "learning_rate": 9.990977930518785e-06, + "loss": 0.8372, + "step": 1517 + }, + { + "epoch": 0.08184170800086263, + "grad_norm": 0.923937976360321, + "learning_rate": 9.990965195820178e-06, + "loss": 0.8467, + 
"step": 1518 + }, + { + "epoch": 0.08189562216950615, + "grad_norm": 0.9804415106773376, + "learning_rate": 9.990952452148488e-06, + "loss": 0.9281, + "step": 1519 + }, + { + "epoch": 0.08194953633814966, + "grad_norm": 0.9396255016326904, + "learning_rate": 9.99093969950374e-06, + "loss": 0.8606, + "step": 1520 + }, + { + "epoch": 0.08200345050679318, + "grad_norm": 0.8492118120193481, + "learning_rate": 9.990926937885953e-06, + "loss": 0.8253, + "step": 1521 + }, + { + "epoch": 0.0820573646754367, + "grad_norm": 0.8482204079627991, + "learning_rate": 9.990914167295154e-06, + "loss": 0.7361, + "step": 1522 + }, + { + "epoch": 0.08211127884408022, + "grad_norm": 1.1302778720855713, + "learning_rate": 9.990901387731365e-06, + "loss": 0.7511, + "step": 1523 + }, + { + "epoch": 0.08216519301272375, + "grad_norm": 0.9285756945610046, + "learning_rate": 9.990888599194607e-06, + "loss": 0.8329, + "step": 1524 + }, + { + "epoch": 0.08221910718136727, + "grad_norm": 0.8932104110717773, + "learning_rate": 9.990875801684905e-06, + "loss": 0.8146, + "step": 1525 + }, + { + "epoch": 0.08227302135001079, + "grad_norm": 0.8232647180557251, + "learning_rate": 9.990862995202282e-06, + "loss": 0.763, + "step": 1526 + }, + { + "epoch": 0.0823269355186543, + "grad_norm": 0.8582163453102112, + "learning_rate": 9.990850179746759e-06, + "loss": 0.7675, + "step": 1527 + }, + { + "epoch": 0.08238084968729782, + "grad_norm": 0.9890977144241333, + "learning_rate": 9.990837355318362e-06, + "loss": 0.8438, + "step": 1528 + }, + { + "epoch": 0.08243476385594134, + "grad_norm": 0.9228235483169556, + "learning_rate": 9.990824521917113e-06, + "loss": 0.9324, + "step": 1529 + }, + { + "epoch": 0.08248867802458486, + "grad_norm": 0.8286252617835999, + "learning_rate": 9.990811679543033e-06, + "loss": 0.872, + "step": 1530 + }, + { + "epoch": 0.08254259219322838, + "grad_norm": 0.8546530604362488, + "learning_rate": 9.990798828196146e-06, + "loss": 0.7256, + "step": 1531 + }, + { + "epoch": 
0.0825965063618719, + "grad_norm": 0.8240640759468079, + "learning_rate": 9.990785967876478e-06, + "loss": 0.8083, + "step": 1532 + }, + { + "epoch": 0.08265042053051543, + "grad_norm": 0.8650565147399902, + "learning_rate": 9.99077309858405e-06, + "loss": 0.8274, + "step": 1533 + }, + { + "epoch": 0.08270433469915894, + "grad_norm": 0.7865849137306213, + "learning_rate": 9.990760220318884e-06, + "loss": 0.7978, + "step": 1534 + }, + { + "epoch": 0.08275824886780246, + "grad_norm": 0.8567995429039001, + "learning_rate": 9.990747333081005e-06, + "loss": 0.8172, + "step": 1535 + }, + { + "epoch": 0.08281216303644598, + "grad_norm": 0.8242521286010742, + "learning_rate": 9.990734436870435e-06, + "loss": 0.8045, + "step": 1536 + }, + { + "epoch": 0.0828660772050895, + "grad_norm": 0.801266074180603, + "learning_rate": 9.990721531687197e-06, + "loss": 0.8312, + "step": 1537 + }, + { + "epoch": 0.08291999137373302, + "grad_norm": 0.8027862906455994, + "learning_rate": 9.990708617531314e-06, + "loss": 0.7227, + "step": 1538 + }, + { + "epoch": 0.08297390554237653, + "grad_norm": 1.0332401990890503, + "learning_rate": 9.990695694402811e-06, + "loss": 0.9091, + "step": 1539 + }, + { + "epoch": 0.08302781971102005, + "grad_norm": 0.8537373542785645, + "learning_rate": 9.99068276230171e-06, + "loss": 0.7573, + "step": 1540 + }, + { + "epoch": 0.08308173387966357, + "grad_norm": 0.8734087944030762, + "learning_rate": 9.990669821228037e-06, + "loss": 0.901, + "step": 1541 + }, + { + "epoch": 0.0831356480483071, + "grad_norm": 0.8546577095985413, + "learning_rate": 9.99065687118181e-06, + "loss": 0.8294, + "step": 1542 + }, + { + "epoch": 0.08318956221695062, + "grad_norm": 0.9555438756942749, + "learning_rate": 9.990643912163055e-06, + "loss": 0.83, + "step": 1543 + }, + { + "epoch": 0.08324347638559414, + "grad_norm": 0.8778670430183411, + "learning_rate": 9.990630944171798e-06, + "loss": 0.8694, + "step": 1544 + }, + { + "epoch": 0.08329739055423765, + "grad_norm": 
0.973791241645813, + "learning_rate": 9.990617967208058e-06, + "loss": 0.8348, + "step": 1545 + }, + { + "epoch": 0.08335130472288117, + "grad_norm": 0.7933714389801025, + "learning_rate": 9.990604981271858e-06, + "loss": 0.8208, + "step": 1546 + }, + { + "epoch": 0.08340521889152469, + "grad_norm": 0.9328469634056091, + "learning_rate": 9.990591986363226e-06, + "loss": 0.8188, + "step": 1547 + }, + { + "epoch": 0.08345913306016821, + "grad_norm": 0.8217103481292725, + "learning_rate": 9.990578982482183e-06, + "loss": 0.7948, + "step": 1548 + }, + { + "epoch": 0.08351304722881173, + "grad_norm": 0.8556894659996033, + "learning_rate": 9.990565969628749e-06, + "loss": 0.8129, + "step": 1549 + }, + { + "epoch": 0.08356696139745524, + "grad_norm": 0.901633083820343, + "learning_rate": 9.990552947802954e-06, + "loss": 0.9025, + "step": 1550 + }, + { + "epoch": 0.08362087556609878, + "grad_norm": 0.9021494388580322, + "learning_rate": 9.990539917004815e-06, + "loss": 0.8882, + "step": 1551 + }, + { + "epoch": 0.0836747897347423, + "grad_norm": 0.8187722563743591, + "learning_rate": 9.990526877234359e-06, + "loss": 0.7385, + "step": 1552 + }, + { + "epoch": 0.08372870390338581, + "grad_norm": 0.9237630367279053, + "learning_rate": 9.990513828491609e-06, + "loss": 0.851, + "step": 1553 + }, + { + "epoch": 0.08378261807202933, + "grad_norm": 1.1868582963943481, + "learning_rate": 9.990500770776589e-06, + "loss": 0.7701, + "step": 1554 + }, + { + "epoch": 0.08383653224067285, + "grad_norm": 0.9831421971321106, + "learning_rate": 9.990487704089322e-06, + "loss": 0.836, + "step": 1555 + }, + { + "epoch": 0.08389044640931637, + "grad_norm": 0.9255663752555847, + "learning_rate": 9.99047462842983e-06, + "loss": 0.7916, + "step": 1556 + }, + { + "epoch": 0.08394436057795988, + "grad_norm": 1.0069084167480469, + "learning_rate": 9.990461543798137e-06, + "loss": 0.8652, + "step": 1557 + }, + { + "epoch": 0.0839982747466034, + "grad_norm": 0.943044900894165, + "learning_rate": 
9.990448450194267e-06, + "loss": 0.9511, + "step": 1558 + }, + { + "epoch": 0.08405218891524693, + "grad_norm": 0.9996150135993958, + "learning_rate": 9.990435347618246e-06, + "loss": 0.8751, + "step": 1559 + }, + { + "epoch": 0.08410610308389045, + "grad_norm": 0.9531681537628174, + "learning_rate": 9.990422236070094e-06, + "loss": 0.8988, + "step": 1560 + }, + { + "epoch": 0.08416001725253397, + "grad_norm": 0.9504678249359131, + "learning_rate": 9.990409115549837e-06, + "loss": 0.808, + "step": 1561 + }, + { + "epoch": 0.08421393142117749, + "grad_norm": 0.9796282052993774, + "learning_rate": 9.990395986057496e-06, + "loss": 0.778, + "step": 1562 + }, + { + "epoch": 0.084267845589821, + "grad_norm": 0.8871618509292603, + "learning_rate": 9.990382847593096e-06, + "loss": 0.8945, + "step": 1563 + }, + { + "epoch": 0.08432175975846452, + "grad_norm": 0.8253110647201538, + "learning_rate": 9.990369700156662e-06, + "loss": 0.8206, + "step": 1564 + }, + { + "epoch": 0.08437567392710804, + "grad_norm": 0.8799824118614197, + "learning_rate": 9.990356543748216e-06, + "loss": 0.7665, + "step": 1565 + }, + { + "epoch": 0.08442958809575156, + "grad_norm": 0.8275637626647949, + "learning_rate": 9.990343378367782e-06, + "loss": 0.8468, + "step": 1566 + }, + { + "epoch": 0.08448350226439508, + "grad_norm": 1.0431691408157349, + "learning_rate": 9.990330204015382e-06, + "loss": 0.8539, + "step": 1567 + }, + { + "epoch": 0.08453741643303861, + "grad_norm": 1.298999547958374, + "learning_rate": 9.990317020691043e-06, + "loss": 0.8989, + "step": 1568 + }, + { + "epoch": 0.08459133060168213, + "grad_norm": 0.865868866443634, + "learning_rate": 9.990303828394787e-06, + "loss": 0.8296, + "step": 1569 + }, + { + "epoch": 0.08464524477032564, + "grad_norm": 0.9162652492523193, + "learning_rate": 9.990290627126637e-06, + "loss": 0.8617, + "step": 1570 + }, + { + "epoch": 0.08469915893896916, + "grad_norm": 0.9753283858299255, + "learning_rate": 9.990277416886618e-06, + "loss": 0.8082, + 
"step": 1571 + }, + { + "epoch": 0.08475307310761268, + "grad_norm": 0.9561176300048828, + "learning_rate": 9.990264197674754e-06, + "loss": 0.8678, + "step": 1572 + }, + { + "epoch": 0.0848069872762562, + "grad_norm": 0.833341658115387, + "learning_rate": 9.990250969491067e-06, + "loss": 0.8164, + "step": 1573 + }, + { + "epoch": 0.08486090144489972, + "grad_norm": 0.9928603172302246, + "learning_rate": 9.990237732335581e-06, + "loss": 0.6889, + "step": 1574 + }, + { + "epoch": 0.08491481561354323, + "grad_norm": 1.0163367986679077, + "learning_rate": 9.990224486208322e-06, + "loss": 0.8278, + "step": 1575 + }, + { + "epoch": 0.08496872978218675, + "grad_norm": 0.9905970096588135, + "learning_rate": 9.990211231109312e-06, + "loss": 0.8094, + "step": 1576 + }, + { + "epoch": 0.08502264395083028, + "grad_norm": 0.9112648963928223, + "learning_rate": 9.990197967038574e-06, + "loss": 0.8782, + "step": 1577 + }, + { + "epoch": 0.0850765581194738, + "grad_norm": 1.1176974773406982, + "learning_rate": 9.990184693996136e-06, + "loss": 0.8826, + "step": 1578 + }, + { + "epoch": 0.08513047228811732, + "grad_norm": 0.7696222066879272, + "learning_rate": 9.990171411982016e-06, + "loss": 0.8025, + "step": 1579 + }, + { + "epoch": 0.08518438645676084, + "grad_norm": 0.9288634061813354, + "learning_rate": 9.990158120996242e-06, + "loss": 0.8777, + "step": 1580 + }, + { + "epoch": 0.08523830062540436, + "grad_norm": 0.9235022068023682, + "learning_rate": 9.990144821038839e-06, + "loss": 0.9339, + "step": 1581 + }, + { + "epoch": 0.08529221479404787, + "grad_norm": 0.9124205708503723, + "learning_rate": 9.990131512109826e-06, + "loss": 0.8368, + "step": 1582 + }, + { + "epoch": 0.08534612896269139, + "grad_norm": 0.8409048914909363, + "learning_rate": 9.990118194209229e-06, + "loss": 0.7772, + "step": 1583 + }, + { + "epoch": 0.08540004313133491, + "grad_norm": 0.8279136419296265, + "learning_rate": 9.990104867337074e-06, + "loss": 0.738, + "step": 1584 + }, + { + "epoch": 
0.08545395729997843, + "grad_norm": 0.8895745873451233, + "learning_rate": 9.990091531493382e-06, + "loss": 0.7669, + "step": 1585 + }, + { + "epoch": 0.08550787146862196, + "grad_norm": 0.9280734062194824, + "learning_rate": 9.99007818667818e-06, + "loss": 0.9052, + "step": 1586 + }, + { + "epoch": 0.08556178563726548, + "grad_norm": 0.7676610350608826, + "learning_rate": 9.990064832891491e-06, + "loss": 0.807, + "step": 1587 + }, + { + "epoch": 0.085615699805909, + "grad_norm": 0.9035676121711731, + "learning_rate": 9.990051470133337e-06, + "loss": 0.8848, + "step": 1588 + }, + { + "epoch": 0.08566961397455251, + "grad_norm": 1.0960334539413452, + "learning_rate": 9.990038098403742e-06, + "loss": 0.8279, + "step": 1589 + }, + { + "epoch": 0.08572352814319603, + "grad_norm": 0.87922203540802, + "learning_rate": 9.990024717702736e-06, + "loss": 0.8325, + "step": 1590 + }, + { + "epoch": 0.08577744231183955, + "grad_norm": 0.922815203666687, + "learning_rate": 9.990011328030335e-06, + "loss": 0.881, + "step": 1591 + }, + { + "epoch": 0.08583135648048307, + "grad_norm": 0.9880780577659607, + "learning_rate": 9.989997929386567e-06, + "loss": 0.7506, + "step": 1592 + }, + { + "epoch": 0.08588527064912659, + "grad_norm": 0.8827483057975769, + "learning_rate": 9.989984521771456e-06, + "loss": 0.8961, + "step": 1593 + }, + { + "epoch": 0.0859391848177701, + "grad_norm": 0.8395072817802429, + "learning_rate": 9.989971105185026e-06, + "loss": 0.8564, + "step": 1594 + }, + { + "epoch": 0.08599309898641364, + "grad_norm": 0.8731534481048584, + "learning_rate": 9.989957679627302e-06, + "loss": 0.8209, + "step": 1595 + }, + { + "epoch": 0.08604701315505715, + "grad_norm": 0.7969424724578857, + "learning_rate": 9.989944245098305e-06, + "loss": 0.8031, + "step": 1596 + }, + { + "epoch": 0.08610092732370067, + "grad_norm": 0.8420547246932983, + "learning_rate": 9.989930801598062e-06, + "loss": 0.8027, + "step": 1597 + }, + { + "epoch": 0.08615484149234419, + "grad_norm": 
0.7900253534317017, + "learning_rate": 9.989917349126597e-06, + "loss": 0.8246, + "step": 1598 + }, + { + "epoch": 0.08620875566098771, + "grad_norm": 0.8860716819763184, + "learning_rate": 9.989903887683934e-06, + "loss": 0.7846, + "step": 1599 + }, + { + "epoch": 0.08626266982963122, + "grad_norm": 0.907744288444519, + "learning_rate": 9.989890417270097e-06, + "loss": 0.7813, + "step": 1600 + }, + { + "epoch": 0.08631658399827474, + "grad_norm": 0.764076828956604, + "learning_rate": 9.989876937885108e-06, + "loss": 0.7953, + "step": 1601 + }, + { + "epoch": 0.08637049816691826, + "grad_norm": 1.0143790245056152, + "learning_rate": 9.989863449528994e-06, + "loss": 0.8854, + "step": 1602 + }, + { + "epoch": 0.08642441233556178, + "grad_norm": 0.8605815172195435, + "learning_rate": 9.989849952201779e-06, + "loss": 0.9289, + "step": 1603 + }, + { + "epoch": 0.08647832650420531, + "grad_norm": 0.8897641897201538, + "learning_rate": 9.989836445903487e-06, + "loss": 0.8659, + "step": 1604 + }, + { + "epoch": 0.08653224067284883, + "grad_norm": 0.8893518447875977, + "learning_rate": 9.989822930634141e-06, + "loss": 0.8724, + "step": 1605 + }, + { + "epoch": 0.08658615484149235, + "grad_norm": 0.8152129054069519, + "learning_rate": 9.989809406393767e-06, + "loss": 0.8321, + "step": 1606 + }, + { + "epoch": 0.08664006901013586, + "grad_norm": 0.8394732475280762, + "learning_rate": 9.98979587318239e-06, + "loss": 0.8074, + "step": 1607 + }, + { + "epoch": 0.08669398317877938, + "grad_norm": 0.8038346767425537, + "learning_rate": 9.989782331000031e-06, + "loss": 0.8132, + "step": 1608 + }, + { + "epoch": 0.0867478973474229, + "grad_norm": 0.8574134111404419, + "learning_rate": 9.989768779846717e-06, + "loss": 0.8191, + "step": 1609 + }, + { + "epoch": 0.08680181151606642, + "grad_norm": 1.0049889087677002, + "learning_rate": 9.989755219722472e-06, + "loss": 0.8771, + "step": 1610 + }, + { + "epoch": 0.08685572568470994, + "grad_norm": 0.9765112996101379, + "learning_rate": 
9.989741650627319e-06, + "loss": 0.839, + "step": 1611 + }, + { + "epoch": 0.08690963985335347, + "grad_norm": 0.9430082440376282, + "learning_rate": 9.989728072561284e-06, + "loss": 1.0316, + "step": 1612 + }, + { + "epoch": 0.08696355402199699, + "grad_norm": 0.841590404510498, + "learning_rate": 9.989714485524391e-06, + "loss": 0.8727, + "step": 1613 + }, + { + "epoch": 0.0870174681906405, + "grad_norm": 0.9475975632667542, + "learning_rate": 9.989700889516664e-06, + "loss": 0.8131, + "step": 1614 + }, + { + "epoch": 0.08707138235928402, + "grad_norm": 0.8059530258178711, + "learning_rate": 9.98968728453813e-06, + "loss": 0.8297, + "step": 1615 + }, + { + "epoch": 0.08712529652792754, + "grad_norm": 0.8513601422309875, + "learning_rate": 9.989673670588808e-06, + "loss": 0.8016, + "step": 1616 + }, + { + "epoch": 0.08717921069657106, + "grad_norm": 0.8434658646583557, + "learning_rate": 9.989660047668728e-06, + "loss": 0.866, + "step": 1617 + }, + { + "epoch": 0.08723312486521458, + "grad_norm": 0.9081484079360962, + "learning_rate": 9.989646415777912e-06, + "loss": 0.816, + "step": 1618 + }, + { + "epoch": 0.0872870390338581, + "grad_norm": 0.7941877841949463, + "learning_rate": 9.989632774916385e-06, + "loss": 0.7191, + "step": 1619 + }, + { + "epoch": 0.08734095320250161, + "grad_norm": 0.8800172209739685, + "learning_rate": 9.98961912508417e-06, + "loss": 0.8135, + "step": 1620 + }, + { + "epoch": 0.08739486737114514, + "grad_norm": 0.7940575480461121, + "learning_rate": 9.989605466281292e-06, + "loss": 0.8124, + "step": 1621 + }, + { + "epoch": 0.08744878153978866, + "grad_norm": 0.9570618271827698, + "learning_rate": 9.989591798507779e-06, + "loss": 0.9043, + "step": 1622 + }, + { + "epoch": 0.08750269570843218, + "grad_norm": 0.8635395169258118, + "learning_rate": 9.98957812176365e-06, + "loss": 0.835, + "step": 1623 + }, + { + "epoch": 0.0875566098770757, + "grad_norm": 0.8289955258369446, + "learning_rate": 9.989564436048932e-06, + "loss": 0.8265, + 
"step": 1624 + }, + { + "epoch": 0.08761052404571922, + "grad_norm": 0.9519028663635254, + "learning_rate": 9.989550741363654e-06, + "loss": 0.8127, + "step": 1625 + }, + { + "epoch": 0.08766443821436273, + "grad_norm": 0.9611422419548035, + "learning_rate": 9.989537037707834e-06, + "loss": 0.8422, + "step": 1626 + }, + { + "epoch": 0.08771835238300625, + "grad_norm": 0.8824746608734131, + "learning_rate": 9.9895233250815e-06, + "loss": 0.8669, + "step": 1627 + }, + { + "epoch": 0.08777226655164977, + "grad_norm": 0.8402838706970215, + "learning_rate": 9.989509603484676e-06, + "loss": 0.8072, + "step": 1628 + }, + { + "epoch": 0.08782618072029329, + "grad_norm": 0.7537099719047546, + "learning_rate": 9.989495872917386e-06, + "loss": 0.7127, + "step": 1629 + }, + { + "epoch": 0.08788009488893682, + "grad_norm": 0.78285151720047, + "learning_rate": 9.989482133379656e-06, + "loss": 0.819, + "step": 1630 + }, + { + "epoch": 0.08793400905758034, + "grad_norm": 0.9339445233345032, + "learning_rate": 9.98946838487151e-06, + "loss": 0.8694, + "step": 1631 + }, + { + "epoch": 0.08798792322622385, + "grad_norm": 0.8022040128707886, + "learning_rate": 9.989454627392973e-06, + "loss": 0.7601, + "step": 1632 + }, + { + "epoch": 0.08804183739486737, + "grad_norm": 0.8593827486038208, + "learning_rate": 9.98944086094407e-06, + "loss": 0.8536, + "step": 1633 + }, + { + "epoch": 0.08809575156351089, + "grad_norm": 0.8415039777755737, + "learning_rate": 9.989427085524824e-06, + "loss": 0.9027, + "step": 1634 + }, + { + "epoch": 0.08814966573215441, + "grad_norm": 0.9551103711128235, + "learning_rate": 9.989413301135263e-06, + "loss": 0.8063, + "step": 1635 + }, + { + "epoch": 0.08820357990079793, + "grad_norm": 0.8554351925849915, + "learning_rate": 9.989399507775407e-06, + "loss": 0.7694, + "step": 1636 + }, + { + "epoch": 0.08825749406944144, + "grad_norm": 0.8688547015190125, + "learning_rate": 9.989385705445285e-06, + "loss": 0.8862, + "step": 1637 + }, + { + "epoch": 
0.08831140823808496, + "grad_norm": 0.816558837890625, + "learning_rate": 9.98937189414492e-06, + "loss": 0.7302, + "step": 1638 + }, + { + "epoch": 0.0883653224067285, + "grad_norm": 0.8164445757865906, + "learning_rate": 9.989358073874337e-06, + "loss": 0.8724, + "step": 1639 + }, + { + "epoch": 0.08841923657537201, + "grad_norm": 0.8909460306167603, + "learning_rate": 9.989344244633564e-06, + "loss": 0.7618, + "step": 1640 + }, + { + "epoch": 0.08847315074401553, + "grad_norm": 1.0117470026016235, + "learning_rate": 9.98933040642262e-06, + "loss": 0.8191, + "step": 1641 + }, + { + "epoch": 0.08852706491265905, + "grad_norm": 0.8317937850952148, + "learning_rate": 9.989316559241533e-06, + "loss": 0.8339, + "step": 1642 + }, + { + "epoch": 0.08858097908130257, + "grad_norm": 0.7955135107040405, + "learning_rate": 9.98930270309033e-06, + "loss": 0.7799, + "step": 1643 + }, + { + "epoch": 0.08863489324994608, + "grad_norm": 0.996306300163269, + "learning_rate": 9.98928883796903e-06, + "loss": 0.8547, + "step": 1644 + }, + { + "epoch": 0.0886888074185896, + "grad_norm": 0.9679511189460754, + "learning_rate": 9.989274963877664e-06, + "loss": 1.0831, + "step": 1645 + }, + { + "epoch": 0.08874272158723312, + "grad_norm": 0.8471615314483643, + "learning_rate": 9.989261080816253e-06, + "loss": 0.7765, + "step": 1646 + }, + { + "epoch": 0.08879663575587664, + "grad_norm": 0.8662555813789368, + "learning_rate": 9.989247188784826e-06, + "loss": 0.8894, + "step": 1647 + }, + { + "epoch": 0.08885054992452017, + "grad_norm": 0.9549373388290405, + "learning_rate": 9.989233287783402e-06, + "loss": 0.8341, + "step": 1648 + }, + { + "epoch": 0.08890446409316369, + "grad_norm": 0.8179014325141907, + "learning_rate": 9.989219377812014e-06, + "loss": 0.8653, + "step": 1649 + }, + { + "epoch": 0.0889583782618072, + "grad_norm": 0.9237802624702454, + "learning_rate": 9.989205458870678e-06, + "loss": 0.8206, + "step": 1650 + }, + { + "epoch": 0.08901229243045072, + "grad_norm": 
0.940217137336731, + "learning_rate": 9.989191530959426e-06, + "loss": 0.8695, + "step": 1651 + }, + { + "epoch": 0.08906620659909424, + "grad_norm": 0.9200409054756165, + "learning_rate": 9.98917759407828e-06, + "loss": 0.7984, + "step": 1652 + }, + { + "epoch": 0.08912012076773776, + "grad_norm": 0.9270562529563904, + "learning_rate": 9.989163648227265e-06, + "loss": 0.8265, + "step": 1653 + }, + { + "epoch": 0.08917403493638128, + "grad_norm": 0.9945223331451416, + "learning_rate": 9.989149693406408e-06, + "loss": 0.84, + "step": 1654 + }, + { + "epoch": 0.0892279491050248, + "grad_norm": 0.826195478439331, + "learning_rate": 9.98913572961573e-06, + "loss": 0.7862, + "step": 1655 + }, + { + "epoch": 0.08928186327366831, + "grad_norm": 0.9132022857666016, + "learning_rate": 9.989121756855263e-06, + "loss": 0.826, + "step": 1656 + }, + { + "epoch": 0.08933577744231185, + "grad_norm": 0.8559401631355286, + "learning_rate": 9.989107775125023e-06, + "loss": 0.8007, + "step": 1657 + }, + { + "epoch": 0.08938969161095536, + "grad_norm": 0.8000867366790771, + "learning_rate": 9.989093784425044e-06, + "loss": 0.7547, + "step": 1658 + }, + { + "epoch": 0.08944360577959888, + "grad_norm": 0.7761433720588684, + "learning_rate": 9.989079784755346e-06, + "loss": 0.8083, + "step": 1659 + }, + { + "epoch": 0.0894975199482424, + "grad_norm": 0.8072230815887451, + "learning_rate": 9.989065776115956e-06, + "loss": 0.892, + "step": 1660 + }, + { + "epoch": 0.08955143411688592, + "grad_norm": 0.9021360874176025, + "learning_rate": 9.989051758506898e-06, + "loss": 0.8715, + "step": 1661 + }, + { + "epoch": 0.08960534828552943, + "grad_norm": 0.7585147023200989, + "learning_rate": 9.989037731928197e-06, + "loss": 0.7115, + "step": 1662 + }, + { + "epoch": 0.08965926245417295, + "grad_norm": 0.9388399124145508, + "learning_rate": 9.98902369637988e-06, + "loss": 0.8976, + "step": 1663 + }, + { + "epoch": 0.08971317662281647, + "grad_norm": 0.8454418778419495, + "learning_rate": 
9.989009651861972e-06, + "loss": 0.8063, + "step": 1664 + }, + { + "epoch": 0.08976709079146, + "grad_norm": 0.82308030128479, + "learning_rate": 9.988995598374496e-06, + "loss": 0.8044, + "step": 1665 + }, + { + "epoch": 0.08982100496010352, + "grad_norm": 1.006800651550293, + "learning_rate": 9.98898153591748e-06, + "loss": 0.8609, + "step": 1666 + }, + { + "epoch": 0.08987491912874704, + "grad_norm": 0.8325724601745605, + "learning_rate": 9.988967464490947e-06, + "loss": 0.8295, + "step": 1667 + }, + { + "epoch": 0.08992883329739056, + "grad_norm": 0.7575547695159912, + "learning_rate": 9.988953384094923e-06, + "loss": 0.8252, + "step": 1668 + }, + { + "epoch": 0.08998274746603407, + "grad_norm": 0.869877278804779, + "learning_rate": 9.988939294729436e-06, + "loss": 0.8304, + "step": 1669 + }, + { + "epoch": 0.09003666163467759, + "grad_norm": 0.7840037941932678, + "learning_rate": 9.988925196394508e-06, + "loss": 0.7742, + "step": 1670 + }, + { + "epoch": 0.09009057580332111, + "grad_norm": 0.8044409155845642, + "learning_rate": 9.988911089090163e-06, + "loss": 0.8371, + "step": 1671 + }, + { + "epoch": 0.09014448997196463, + "grad_norm": 0.8635613322257996, + "learning_rate": 9.988896972816431e-06, + "loss": 0.7693, + "step": 1672 + }, + { + "epoch": 0.09019840414060815, + "grad_norm": 0.7780656814575195, + "learning_rate": 9.988882847573335e-06, + "loss": 0.841, + "step": 1673 + }, + { + "epoch": 0.09025231830925168, + "grad_norm": 0.8938048481941223, + "learning_rate": 9.9888687133609e-06, + "loss": 0.8149, + "step": 1674 + }, + { + "epoch": 0.0903062324778952, + "grad_norm": 0.8432002663612366, + "learning_rate": 9.988854570179152e-06, + "loss": 0.853, + "step": 1675 + }, + { + "epoch": 0.09036014664653871, + "grad_norm": 0.8222450613975525, + "learning_rate": 9.988840418028118e-06, + "loss": 0.897, + "step": 1676 + }, + { + "epoch": 0.09041406081518223, + "grad_norm": 0.8370371460914612, + "learning_rate": 9.98882625690782e-06, + "loss": 0.8288, + "step": 
1677 + }, + { + "epoch": 0.09046797498382575, + "grad_norm": 0.8510713577270508, + "learning_rate": 9.988812086818285e-06, + "loss": 0.7637, + "step": 1678 + }, + { + "epoch": 0.09052188915246927, + "grad_norm": 0.8271141648292542, + "learning_rate": 9.98879790775954e-06, + "loss": 0.853, + "step": 1679 + }, + { + "epoch": 0.09057580332111279, + "grad_norm": 1.0627025365829468, + "learning_rate": 9.988783719731607e-06, + "loss": 0.7569, + "step": 1680 + }, + { + "epoch": 0.0906297174897563, + "grad_norm": 0.880283534526825, + "learning_rate": 9.988769522734517e-06, + "loss": 0.8362, + "step": 1681 + }, + { + "epoch": 0.09068363165839982, + "grad_norm": 0.8721734881401062, + "learning_rate": 9.988755316768288e-06, + "loss": 0.8585, + "step": 1682 + }, + { + "epoch": 0.09073754582704335, + "grad_norm": 0.8830682039260864, + "learning_rate": 9.988741101832952e-06, + "loss": 0.8853, + "step": 1683 + }, + { + "epoch": 0.09079145999568687, + "grad_norm": 0.7676220536231995, + "learning_rate": 9.988726877928534e-06, + "loss": 0.7832, + "step": 1684 + }, + { + "epoch": 0.09084537416433039, + "grad_norm": 0.866149365901947, + "learning_rate": 9.988712645055055e-06, + "loss": 0.8534, + "step": 1685 + }, + { + "epoch": 0.09089928833297391, + "grad_norm": 0.8467028141021729, + "learning_rate": 9.988698403212546e-06, + "loss": 0.8637, + "step": 1686 + }, + { + "epoch": 0.09095320250161743, + "grad_norm": 0.913436770439148, + "learning_rate": 9.988684152401028e-06, + "loss": 0.855, + "step": 1687 + }, + { + "epoch": 0.09100711667026094, + "grad_norm": 0.8307977914810181, + "learning_rate": 9.98866989262053e-06, + "loss": 0.8538, + "step": 1688 + }, + { + "epoch": 0.09106103083890446, + "grad_norm": 1.13442862033844, + "learning_rate": 9.988655623871075e-06, + "loss": 0.8129, + "step": 1689 + }, + { + "epoch": 0.09111494500754798, + "grad_norm": 0.8950080871582031, + "learning_rate": 9.988641346152692e-06, + "loss": 0.8674, + "step": 1690 + }, + { + "epoch": 0.0911688591761915, + 
"grad_norm": 0.9107043147087097, + "learning_rate": 9.988627059465403e-06, + "loss": 0.9507, + "step": 1691 + }, + { + "epoch": 0.09122277334483503, + "grad_norm": 0.8210874795913696, + "learning_rate": 9.988612763809237e-06, + "loss": 0.8913, + "step": 1692 + }, + { + "epoch": 0.09127668751347855, + "grad_norm": 1.0306476354599, + "learning_rate": 9.988598459184217e-06, + "loss": 0.8589, + "step": 1693 + }, + { + "epoch": 0.09133060168212206, + "grad_norm": 0.7582615613937378, + "learning_rate": 9.98858414559037e-06, + "loss": 0.7482, + "step": 1694 + }, + { + "epoch": 0.09138451585076558, + "grad_norm": 0.8572216629981995, + "learning_rate": 9.98856982302772e-06, + "loss": 0.822, + "step": 1695 + }, + { + "epoch": 0.0914384300194091, + "grad_norm": 0.9358139038085938, + "learning_rate": 9.988555491496297e-06, + "loss": 0.8298, + "step": 1696 + }, + { + "epoch": 0.09149234418805262, + "grad_norm": 0.8705672025680542, + "learning_rate": 9.988541150996123e-06, + "loss": 0.8818, + "step": 1697 + }, + { + "epoch": 0.09154625835669614, + "grad_norm": 0.9081273674964905, + "learning_rate": 9.988526801527224e-06, + "loss": 0.8994, + "step": 1698 + }, + { + "epoch": 0.09160017252533965, + "grad_norm": 0.7358905076980591, + "learning_rate": 9.988512443089627e-06, + "loss": 0.7752, + "step": 1699 + }, + { + "epoch": 0.09165408669398317, + "grad_norm": 0.8570963740348816, + "learning_rate": 9.988498075683357e-06, + "loss": 0.908, + "step": 1700 + }, + { + "epoch": 0.0917080008626267, + "grad_norm": 0.8998208045959473, + "learning_rate": 9.988483699308442e-06, + "loss": 0.8561, + "step": 1701 + }, + { + "epoch": 0.09176191503127022, + "grad_norm": 0.7481779456138611, + "learning_rate": 9.988469313964903e-06, + "loss": 0.7184, + "step": 1702 + }, + { + "epoch": 0.09181582919991374, + "grad_norm": 1.052809238433838, + "learning_rate": 9.988454919652772e-06, + "loss": 0.8579, + "step": 1703 + }, + { + "epoch": 0.09186974336855726, + "grad_norm": 0.8492130637168884, + 
"learning_rate": 9.988440516372071e-06, + "loss": 0.8796, + "step": 1704 + }, + { + "epoch": 0.09192365753720078, + "grad_norm": 0.884483277797699, + "learning_rate": 9.988426104122826e-06, + "loss": 0.8781, + "step": 1705 + }, + { + "epoch": 0.0919775717058443, + "grad_norm": 0.8844857811927795, + "learning_rate": 9.988411682905065e-06, + "loss": 0.8981, + "step": 1706 + }, + { + "epoch": 0.09203148587448781, + "grad_norm": 0.906216025352478, + "learning_rate": 9.988397252718811e-06, + "loss": 0.8741, + "step": 1707 + }, + { + "epoch": 0.09208540004313133, + "grad_norm": 0.8565787076950073, + "learning_rate": 9.988382813564092e-06, + "loss": 0.7358, + "step": 1708 + }, + { + "epoch": 0.09213931421177485, + "grad_norm": 0.8036391139030457, + "learning_rate": 9.988368365440935e-06, + "loss": 0.7966, + "step": 1709 + }, + { + "epoch": 0.09219322838041838, + "grad_norm": 1.1708556413650513, + "learning_rate": 9.988353908349361e-06, + "loss": 0.8385, + "step": 1710 + }, + { + "epoch": 0.0922471425490619, + "grad_norm": 0.8536746501922607, + "learning_rate": 9.988339442289403e-06, + "loss": 0.7387, + "step": 1711 + }, + { + "epoch": 0.09230105671770542, + "grad_norm": 0.8376518487930298, + "learning_rate": 9.988324967261083e-06, + "loss": 0.8537, + "step": 1712 + }, + { + "epoch": 0.09235497088634893, + "grad_norm": 0.8793227672576904, + "learning_rate": 9.988310483264426e-06, + "loss": 0.8028, + "step": 1713 + }, + { + "epoch": 0.09240888505499245, + "grad_norm": 0.8186830282211304, + "learning_rate": 9.98829599029946e-06, + "loss": 0.8478, + "step": 1714 + }, + { + "epoch": 0.09246279922363597, + "grad_norm": 0.8845428824424744, + "learning_rate": 9.98828148836621e-06, + "loss": 0.8524, + "step": 1715 + }, + { + "epoch": 0.09251671339227949, + "grad_norm": 1.0494492053985596, + "learning_rate": 9.988266977464704e-06, + "loss": 0.8542, + "step": 1716 + }, + { + "epoch": 0.092570627560923, + "grad_norm": 0.8876493573188782, + "learning_rate": 9.988252457594966e-06, + 
"loss": 0.8989, + "step": 1717 + }, + { + "epoch": 0.09262454172956654, + "grad_norm": 0.8787088394165039, + "learning_rate": 9.988237928757024e-06, + "loss": 0.8214, + "step": 1718 + }, + { + "epoch": 0.09267845589821005, + "grad_norm": 1.069684624671936, + "learning_rate": 9.988223390950901e-06, + "loss": 0.9714, + "step": 1719 + }, + { + "epoch": 0.09273237006685357, + "grad_norm": 0.7957501411437988, + "learning_rate": 9.988208844176626e-06, + "loss": 0.7562, + "step": 1720 + }, + { + "epoch": 0.09278628423549709, + "grad_norm": 0.8354908227920532, + "learning_rate": 9.988194288434225e-06, + "loss": 0.7494, + "step": 1721 + }, + { + "epoch": 0.09284019840414061, + "grad_norm": 0.8205936551094055, + "learning_rate": 9.988179723723722e-06, + "loss": 0.7727, + "step": 1722 + }, + { + "epoch": 0.09289411257278413, + "grad_norm": 0.8364951014518738, + "learning_rate": 9.988165150045146e-06, + "loss": 0.861, + "step": 1723 + }, + { + "epoch": 0.09294802674142764, + "grad_norm": 0.8664119243621826, + "learning_rate": 9.98815056739852e-06, + "loss": 0.8512, + "step": 1724 + }, + { + "epoch": 0.09300194091007116, + "grad_norm": 0.9565482139587402, + "learning_rate": 9.988135975783874e-06, + "loss": 0.8606, + "step": 1725 + }, + { + "epoch": 0.09305585507871468, + "grad_norm": 0.8696085214614868, + "learning_rate": 9.988121375201232e-06, + "loss": 0.8614, + "step": 1726 + }, + { + "epoch": 0.09310976924735821, + "grad_norm": 0.8623467683792114, + "learning_rate": 9.98810676565062e-06, + "loss": 0.8547, + "step": 1727 + }, + { + "epoch": 0.09316368341600173, + "grad_norm": 0.8284831047058105, + "learning_rate": 9.988092147132064e-06, + "loss": 0.8376, + "step": 1728 + }, + { + "epoch": 0.09321759758464525, + "grad_norm": 0.7768245339393616, + "learning_rate": 9.988077519645591e-06, + "loss": 0.7472, + "step": 1729 + }, + { + "epoch": 0.09327151175328877, + "grad_norm": 1.221225619316101, + "learning_rate": 9.988062883191228e-06, + "loss": 0.9052, + "step": 1730 + }, + { + 
"epoch": 0.09332542592193228, + "grad_norm": 1.0027954578399658, + "learning_rate": 9.988048237769002e-06, + "loss": 0.9411, + "step": 1731 + }, + { + "epoch": 0.0933793400905758, + "grad_norm": 0.8029824495315552, + "learning_rate": 9.988033583378937e-06, + "loss": 0.8141, + "step": 1732 + }, + { + "epoch": 0.09343325425921932, + "grad_norm": 0.8081389665603638, + "learning_rate": 9.98801892002106e-06, + "loss": 0.7977, + "step": 1733 + }, + { + "epoch": 0.09348716842786284, + "grad_norm": 0.887438952922821, + "learning_rate": 9.988004247695398e-06, + "loss": 0.8574, + "step": 1734 + }, + { + "epoch": 0.09354108259650636, + "grad_norm": 0.887238085269928, + "learning_rate": 9.987989566401977e-06, + "loss": 0.9041, + "step": 1735 + }, + { + "epoch": 0.09359499676514989, + "grad_norm": 0.9135997891426086, + "learning_rate": 9.987974876140822e-06, + "loss": 0.738, + "step": 1736 + }, + { + "epoch": 0.0936489109337934, + "grad_norm": 0.7749861478805542, + "learning_rate": 9.987960176911964e-06, + "loss": 0.773, + "step": 1737 + }, + { + "epoch": 0.09370282510243692, + "grad_norm": 0.7850096225738525, + "learning_rate": 9.987945468715425e-06, + "loss": 0.7924, + "step": 1738 + }, + { + "epoch": 0.09375673927108044, + "grad_norm": 0.8044145107269287, + "learning_rate": 9.987930751551231e-06, + "loss": 0.8196, + "step": 1739 + }, + { + "epoch": 0.09381065343972396, + "grad_norm": 0.8781464695930481, + "learning_rate": 9.987916025419413e-06, + "loss": 0.9337, + "step": 1740 + }, + { + "epoch": 0.09386456760836748, + "grad_norm": 1.0839952230453491, + "learning_rate": 9.987901290319993e-06, + "loss": 0.8092, + "step": 1741 + }, + { + "epoch": 0.093918481777011, + "grad_norm": 0.7910736203193665, + "learning_rate": 9.987886546253e-06, + "loss": 0.8775, + "step": 1742 + }, + { + "epoch": 0.09397239594565451, + "grad_norm": 0.887287974357605, + "learning_rate": 9.98787179321846e-06, + "loss": 0.8271, + "step": 1743 + }, + { + "epoch": 0.09402631011429803, + "grad_norm": 
1.1318427324295044, + "learning_rate": 9.987857031216397e-06, + "loss": 0.8328, + "step": 1744 + }, + { + "epoch": 0.09408022428294156, + "grad_norm": 0.8660401105880737, + "learning_rate": 9.987842260246842e-06, + "loss": 0.8647, + "step": 1745 + }, + { + "epoch": 0.09413413845158508, + "grad_norm": 0.9396790266036987, + "learning_rate": 9.98782748030982e-06, + "loss": 0.9373, + "step": 1746 + }, + { + "epoch": 0.0941880526202286, + "grad_norm": 0.8715323209762573, + "learning_rate": 9.987812691405353e-06, + "loss": 0.8621, + "step": 1747 + }, + { + "epoch": 0.09424196678887212, + "grad_norm": 0.7882347106933594, + "learning_rate": 9.987797893533475e-06, + "loss": 0.7283, + "step": 1748 + }, + { + "epoch": 0.09429588095751563, + "grad_norm": 0.9641733765602112, + "learning_rate": 9.987783086694208e-06, + "loss": 0.8038, + "step": 1749 + }, + { + "epoch": 0.09434979512615915, + "grad_norm": 0.8808518648147583, + "learning_rate": 9.98776827088758e-06, + "loss": 0.8072, + "step": 1750 + }, + { + "epoch": 0.09440370929480267, + "grad_norm": 0.7720713019371033, + "learning_rate": 9.987753446113618e-06, + "loss": 0.7786, + "step": 1751 + }, + { + "epoch": 0.09445762346344619, + "grad_norm": 1.0507936477661133, + "learning_rate": 9.987738612372346e-06, + "loss": 0.9302, + "step": 1752 + }, + { + "epoch": 0.0945115376320897, + "grad_norm": 0.7705017328262329, + "learning_rate": 9.987723769663795e-06, + "loss": 0.7366, + "step": 1753 + }, + { + "epoch": 0.09456545180073324, + "grad_norm": 0.82464200258255, + "learning_rate": 9.987708917987989e-06, + "loss": 0.8063, + "step": 1754 + }, + { + "epoch": 0.09461936596937676, + "grad_norm": 0.9387272000312805, + "learning_rate": 9.987694057344953e-06, + "loss": 0.8108, + "step": 1755 + }, + { + "epoch": 0.09467328013802027, + "grad_norm": 0.9161933064460754, + "learning_rate": 9.987679187734717e-06, + "loss": 0.8331, + "step": 1756 + }, + { + "epoch": 0.09472719430666379, + "grad_norm": 0.9379769563674927, + "learning_rate": 
9.987664309157306e-06, + "loss": 0.9064, + "step": 1757 + }, + { + "epoch": 0.09478110847530731, + "grad_norm": 0.9597976803779602, + "learning_rate": 9.987649421612748e-06, + "loss": 0.7785, + "step": 1758 + }, + { + "epoch": 0.09483502264395083, + "grad_norm": 0.8689720630645752, + "learning_rate": 9.98763452510107e-06, + "loss": 0.7828, + "step": 1759 + }, + { + "epoch": 0.09488893681259435, + "grad_norm": 0.9207726716995239, + "learning_rate": 9.987619619622296e-06, + "loss": 0.7853, + "step": 1760 + }, + { + "epoch": 0.09494285098123786, + "grad_norm": 0.8130320310592651, + "learning_rate": 9.987604705176455e-06, + "loss": 0.858, + "step": 1761 + }, + { + "epoch": 0.09499676514988138, + "grad_norm": 0.9004638195037842, + "learning_rate": 9.987589781763574e-06, + "loss": 0.8148, + "step": 1762 + }, + { + "epoch": 0.09505067931852491, + "grad_norm": 0.8554181456565857, + "learning_rate": 9.987574849383678e-06, + "loss": 0.8103, + "step": 1763 + }, + { + "epoch": 0.09510459348716843, + "grad_norm": 0.9148527979850769, + "learning_rate": 9.987559908036797e-06, + "loss": 0.9467, + "step": 1764 + }, + { + "epoch": 0.09515850765581195, + "grad_norm": 0.890083909034729, + "learning_rate": 9.987544957722956e-06, + "loss": 0.8338, + "step": 1765 + }, + { + "epoch": 0.09521242182445547, + "grad_norm": 0.8118012547492981, + "learning_rate": 9.98752999844218e-06, + "loss": 0.8355, + "step": 1766 + }, + { + "epoch": 0.09526633599309899, + "grad_norm": 0.8115151524543762, + "learning_rate": 9.987515030194498e-06, + "loss": 0.9172, + "step": 1767 + }, + { + "epoch": 0.0953202501617425, + "grad_norm": 0.8750082850456238, + "learning_rate": 9.987500052979938e-06, + "loss": 0.8301, + "step": 1768 + }, + { + "epoch": 0.09537416433038602, + "grad_norm": 0.9008756875991821, + "learning_rate": 9.987485066798525e-06, + "loss": 0.8642, + "step": 1769 + }, + { + "epoch": 0.09542807849902954, + "grad_norm": 0.8335922956466675, + "learning_rate": 9.987470071650287e-06, + "loss": 0.8466, 
+ "step": 1770 + }, + { + "epoch": 0.09548199266767307, + "grad_norm": 0.8604272603988647, + "learning_rate": 9.987455067535249e-06, + "loss": 0.8801, + "step": 1771 + }, + { + "epoch": 0.09553590683631659, + "grad_norm": 0.889854371547699, + "learning_rate": 9.98744005445344e-06, + "loss": 0.8804, + "step": 1772 + }, + { + "epoch": 0.09558982100496011, + "grad_norm": 0.8756876587867737, + "learning_rate": 9.987425032404887e-06, + "loss": 0.8367, + "step": 1773 + }, + { + "epoch": 0.09564373517360363, + "grad_norm": 0.9071298837661743, + "learning_rate": 9.987410001389616e-06, + "loss": 0.8875, + "step": 1774 + }, + { + "epoch": 0.09569764934224714, + "grad_norm": 0.8214284777641296, + "learning_rate": 9.987394961407654e-06, + "loss": 0.7859, + "step": 1775 + }, + { + "epoch": 0.09575156351089066, + "grad_norm": 0.940034806728363, + "learning_rate": 9.98737991245903e-06, + "loss": 0.8272, + "step": 1776 + }, + { + "epoch": 0.09580547767953418, + "grad_norm": 0.8156501054763794, + "learning_rate": 9.987364854543768e-06, + "loss": 0.7831, + "step": 1777 + }, + { + "epoch": 0.0958593918481777, + "grad_norm": 0.8450450301170349, + "learning_rate": 9.987349787661898e-06, + "loss": 0.7888, + "step": 1778 + }, + { + "epoch": 0.09591330601682121, + "grad_norm": 0.8143148422241211, + "learning_rate": 9.987334711813446e-06, + "loss": 0.7593, + "step": 1779 + }, + { + "epoch": 0.09596722018546475, + "grad_norm": 1.0489457845687866, + "learning_rate": 9.987319626998437e-06, + "loss": 0.8248, + "step": 1780 + }, + { + "epoch": 0.09602113435410826, + "grad_norm": 0.9584689140319824, + "learning_rate": 9.987304533216901e-06, + "loss": 0.9025, + "step": 1781 + }, + { + "epoch": 0.09607504852275178, + "grad_norm": 0.8366501331329346, + "learning_rate": 9.987289430468862e-06, + "loss": 0.7513, + "step": 1782 + }, + { + "epoch": 0.0961289626913953, + "grad_norm": 0.9896461963653564, + "learning_rate": 9.987274318754352e-06, + "loss": 0.8598, + "step": 1783 + }, + { + "epoch": 
0.09618287686003882, + "grad_norm": 1.1904568672180176, + "learning_rate": 9.987259198073396e-06, + "loss": 0.9143, + "step": 1784 + }, + { + "epoch": 0.09623679102868234, + "grad_norm": 0.8100086450576782, + "learning_rate": 9.987244068426019e-06, + "loss": 0.7733, + "step": 1785 + }, + { + "epoch": 0.09629070519732585, + "grad_norm": 0.7814387083053589, + "learning_rate": 9.987228929812249e-06, + "loss": 0.7735, + "step": 1786 + }, + { + "epoch": 0.09634461936596937, + "grad_norm": 0.8880924582481384, + "learning_rate": 9.987213782232115e-06, + "loss": 0.8377, + "step": 1787 + }, + { + "epoch": 0.09639853353461289, + "grad_norm": 0.8739203810691833, + "learning_rate": 9.987198625685643e-06, + "loss": 0.8851, + "step": 1788 + }, + { + "epoch": 0.09645244770325642, + "grad_norm": 0.8984062671661377, + "learning_rate": 9.987183460172861e-06, + "loss": 0.8773, + "step": 1789 + }, + { + "epoch": 0.09650636187189994, + "grad_norm": 1.2485296726226807, + "learning_rate": 9.987168285693795e-06, + "loss": 0.787, + "step": 1790 + }, + { + "epoch": 0.09656027604054346, + "grad_norm": 0.8414161205291748, + "learning_rate": 9.987153102248474e-06, + "loss": 0.7895, + "step": 1791 + }, + { + "epoch": 0.09661419020918698, + "grad_norm": 0.7895180583000183, + "learning_rate": 9.987137909836924e-06, + "loss": 0.7592, + "step": 1792 + }, + { + "epoch": 0.0966681043778305, + "grad_norm": 1.0752787590026855, + "learning_rate": 9.987122708459173e-06, + "loss": 0.8472, + "step": 1793 + }, + { + "epoch": 0.09672201854647401, + "grad_norm": 0.9069424271583557, + "learning_rate": 9.987107498115247e-06, + "loss": 0.8746, + "step": 1794 + }, + { + "epoch": 0.09677593271511753, + "grad_norm": 0.8566716909408569, + "learning_rate": 9.987092278805175e-06, + "loss": 0.7604, + "step": 1795 + }, + { + "epoch": 0.09682984688376105, + "grad_norm": 0.833852231502533, + "learning_rate": 9.987077050528983e-06, + "loss": 0.8645, + "step": 1796 + }, + { + "epoch": 0.09688376105240457, + "grad_norm": 
0.8439596891403198, + "learning_rate": 9.9870618132867e-06, + "loss": 0.7673, + "step": 1797 + }, + { + "epoch": 0.0969376752210481, + "grad_norm": 0.9743669629096985, + "learning_rate": 9.987046567078352e-06, + "loss": 0.7754, + "step": 1798 + }, + { + "epoch": 0.09699158938969162, + "grad_norm": 0.9291634559631348, + "learning_rate": 9.987031311903968e-06, + "loss": 0.8431, + "step": 1799 + }, + { + "epoch": 0.09704550355833513, + "grad_norm": 1.169450283050537, + "learning_rate": 9.987016047763571e-06, + "loss": 0.9321, + "step": 1800 + }, + { + "epoch": 0.09709941772697865, + "grad_norm": 0.7758163809776306, + "learning_rate": 9.987000774657195e-06, + "loss": 0.7832, + "step": 1801 + }, + { + "epoch": 0.09715333189562217, + "grad_norm": 0.9673672914505005, + "learning_rate": 9.986985492584863e-06, + "loss": 0.9822, + "step": 1802 + }, + { + "epoch": 0.09720724606426569, + "grad_norm": 1.1516417264938354, + "learning_rate": 9.986970201546605e-06, + "loss": 0.9956, + "step": 1803 + }, + { + "epoch": 0.0972611602329092, + "grad_norm": 0.9660587906837463, + "learning_rate": 9.986954901542445e-06, + "loss": 0.8248, + "step": 1804 + }, + { + "epoch": 0.09731507440155272, + "grad_norm": 0.9452739953994751, + "learning_rate": 9.986939592572413e-06, + "loss": 0.8805, + "step": 1805 + }, + { + "epoch": 0.09736898857019624, + "grad_norm": 0.9339364171028137, + "learning_rate": 9.986924274636538e-06, + "loss": 0.8819, + "step": 1806 + }, + { + "epoch": 0.09742290273883977, + "grad_norm": 0.9344542026519775, + "learning_rate": 9.986908947734844e-06, + "loss": 0.8531, + "step": 1807 + }, + { + "epoch": 0.09747681690748329, + "grad_norm": 0.8910528421401978, + "learning_rate": 9.986893611867362e-06, + "loss": 0.8949, + "step": 1808 + }, + { + "epoch": 0.09753073107612681, + "grad_norm": 0.8484895825386047, + "learning_rate": 9.986878267034115e-06, + "loss": 0.8028, + "step": 1809 + }, + { + "epoch": 0.09758464524477033, + "grad_norm": 1.0784810781478882, + "learning_rate": 
9.986862913235135e-06, + "loss": 0.9564, + "step": 1810 + }, + { + "epoch": 0.09763855941341384, + "grad_norm": 0.8350296020507812, + "learning_rate": 9.98684755047045e-06, + "loss": 0.8672, + "step": 1811 + }, + { + "epoch": 0.09769247358205736, + "grad_norm": 0.8558050990104675, + "learning_rate": 9.986832178740084e-06, + "loss": 0.8538, + "step": 1812 + }, + { + "epoch": 0.09774638775070088, + "grad_norm": 0.8633396029472351, + "learning_rate": 9.986816798044066e-06, + "loss": 0.8356, + "step": 1813 + }, + { + "epoch": 0.0978003019193444, + "grad_norm": 0.8256344199180603, + "learning_rate": 9.986801408382424e-06, + "loss": 0.7552, + "step": 1814 + }, + { + "epoch": 0.09785421608798792, + "grad_norm": 0.872844398021698, + "learning_rate": 9.986786009755186e-06, + "loss": 0.9153, + "step": 1815 + }, + { + "epoch": 0.09790813025663145, + "grad_norm": 0.842241108417511, + "learning_rate": 9.986770602162378e-06, + "loss": 0.7965, + "step": 1816 + }, + { + "epoch": 0.09796204442527497, + "grad_norm": 0.9673634171485901, + "learning_rate": 9.98675518560403e-06, + "loss": 0.8317, + "step": 1817 + }, + { + "epoch": 0.09801595859391848, + "grad_norm": 0.8744896650314331, + "learning_rate": 9.98673976008017e-06, + "loss": 0.7342, + "step": 1818 + }, + { + "epoch": 0.098069872762562, + "grad_norm": 0.7830422520637512, + "learning_rate": 9.986724325590825e-06, + "loss": 0.721, + "step": 1819 + }, + { + "epoch": 0.09812378693120552, + "grad_norm": 1.0335441827774048, + "learning_rate": 9.986708882136021e-06, + "loss": 0.8088, + "step": 1820 + }, + { + "epoch": 0.09817770109984904, + "grad_norm": 0.841342568397522, + "learning_rate": 9.986693429715785e-06, + "loss": 0.8847, + "step": 1821 + }, + { + "epoch": 0.09823161526849256, + "grad_norm": 0.9405834674835205, + "learning_rate": 9.98667796833015e-06, + "loss": 0.8878, + "step": 1822 + }, + { + "epoch": 0.09828552943713607, + "grad_norm": 0.8358225226402283, + "learning_rate": 9.986662497979138e-06, + "loss": 0.7377, + 
"step": 1823 + }, + { + "epoch": 0.0983394436057796, + "grad_norm": 0.8844004273414612, + "learning_rate": 9.98664701866278e-06, + "loss": 0.7236, + "step": 1824 + }, + { + "epoch": 0.09839335777442312, + "grad_norm": 0.8165417313575745, + "learning_rate": 9.986631530381105e-06, + "loss": 0.819, + "step": 1825 + }, + { + "epoch": 0.09844727194306664, + "grad_norm": 0.9569553732872009, + "learning_rate": 9.986616033134137e-06, + "loss": 0.9337, + "step": 1826 + }, + { + "epoch": 0.09850118611171016, + "grad_norm": 0.8311771750450134, + "learning_rate": 9.986600526921907e-06, + "loss": 0.8516, + "step": 1827 + }, + { + "epoch": 0.09855510028035368, + "grad_norm": 0.9444357752799988, + "learning_rate": 9.986585011744441e-06, + "loss": 0.805, + "step": 1828 + }, + { + "epoch": 0.0986090144489972, + "grad_norm": 1.0128875970840454, + "learning_rate": 9.986569487601769e-06, + "loss": 0.8514, + "step": 1829 + }, + { + "epoch": 0.09866292861764071, + "grad_norm": 0.8973994255065918, + "learning_rate": 9.986553954493917e-06, + "loss": 0.7938, + "step": 1830 + }, + { + "epoch": 0.09871684278628423, + "grad_norm": 0.8571779131889343, + "learning_rate": 9.986538412420912e-06, + "loss": 0.7506, + "step": 1831 + }, + { + "epoch": 0.09877075695492775, + "grad_norm": 0.9053436517715454, + "learning_rate": 9.986522861382785e-06, + "loss": 0.8551, + "step": 1832 + }, + { + "epoch": 0.09882467112357128, + "grad_norm": 0.9941746592521667, + "learning_rate": 9.986507301379562e-06, + "loss": 0.8828, + "step": 1833 + }, + { + "epoch": 0.0988785852922148, + "grad_norm": 0.9620066285133362, + "learning_rate": 9.986491732411272e-06, + "loss": 0.8982, + "step": 1834 + }, + { + "epoch": 0.09893249946085832, + "grad_norm": 0.9470074772834778, + "learning_rate": 9.986476154477941e-06, + "loss": 0.8295, + "step": 1835 + }, + { + "epoch": 0.09898641362950183, + "grad_norm": 0.9962137937545776, + "learning_rate": 9.986460567579599e-06, + "loss": 0.8714, + "step": 1836 + }, + { + "epoch": 
0.09904032779814535, + "grad_norm": 0.8492829203605652, + "learning_rate": 9.986444971716273e-06, + "loss": 0.8234, + "step": 1837 + }, + { + "epoch": 0.09909424196678887, + "grad_norm": 0.9463719725608826, + "learning_rate": 9.986429366887994e-06, + "loss": 0.7769, + "step": 1838 + }, + { + "epoch": 0.09914815613543239, + "grad_norm": 0.8588153123855591, + "learning_rate": 9.986413753094786e-06, + "loss": 0.8883, + "step": 1839 + }, + { + "epoch": 0.0992020703040759, + "grad_norm": 0.7692183256149292, + "learning_rate": 9.986398130336677e-06, + "loss": 0.7691, + "step": 1840 + }, + { + "epoch": 0.09925598447271942, + "grad_norm": 0.8377199172973633, + "learning_rate": 9.986382498613699e-06, + "loss": 0.789, + "step": 1841 + }, + { + "epoch": 0.09930989864136296, + "grad_norm": 0.9783869385719299, + "learning_rate": 9.986366857925876e-06, + "loss": 0.8517, + "step": 1842 + }, + { + "epoch": 0.09936381281000647, + "grad_norm": 0.8233169913291931, + "learning_rate": 9.986351208273239e-06, + "loss": 0.8701, + "step": 1843 + }, + { + "epoch": 0.09941772697864999, + "grad_norm": 0.9393780827522278, + "learning_rate": 9.986335549655814e-06, + "loss": 0.8837, + "step": 1844 + }, + { + "epoch": 0.09947164114729351, + "grad_norm": 0.8517693877220154, + "learning_rate": 9.986319882073631e-06, + "loss": 0.9043, + "step": 1845 + }, + { + "epoch": 0.09952555531593703, + "grad_norm": 0.8296724557876587, + "learning_rate": 9.986304205526718e-06, + "loss": 0.7406, + "step": 1846 + }, + { + "epoch": 0.09957946948458055, + "grad_norm": 0.8372161388397217, + "learning_rate": 9.986288520015102e-06, + "loss": 0.7763, + "step": 1847 + }, + { + "epoch": 0.09963338365322406, + "grad_norm": 0.8086470365524292, + "learning_rate": 9.986272825538812e-06, + "loss": 0.8786, + "step": 1848 + }, + { + "epoch": 0.09968729782186758, + "grad_norm": 0.8562842011451721, + "learning_rate": 9.986257122097875e-06, + "loss": 0.8391, + "step": 1849 + }, + { + "epoch": 0.0997412119905111, + "grad_norm": 
0.9052720665931702, + "learning_rate": 9.986241409692321e-06, + "loss": 0.948, + "step": 1850 + }, + { + "epoch": 0.09979512615915463, + "grad_norm": 0.8220609426498413, + "learning_rate": 9.986225688322178e-06, + "loss": 0.8039, + "step": 1851 + }, + { + "epoch": 0.09984904032779815, + "grad_norm": 0.8018030524253845, + "learning_rate": 9.98620995798747e-06, + "loss": 0.7748, + "step": 1852 + }, + { + "epoch": 0.09990295449644167, + "grad_norm": 0.8150879144668579, + "learning_rate": 9.986194218688235e-06, + "loss": 0.7304, + "step": 1853 + }, + { + "epoch": 0.09995686866508519, + "grad_norm": 0.8677535653114319, + "learning_rate": 9.98617847042449e-06, + "loss": 0.8756, + "step": 1854 + }, + { + "epoch": 0.1000107828337287, + "grad_norm": 0.8889294862747192, + "learning_rate": 9.986162713196272e-06, + "loss": 0.8926, + "step": 1855 + }, + { + "epoch": 0.10006469700237222, + "grad_norm": 0.7618375420570374, + "learning_rate": 9.986146947003603e-06, + "loss": 0.7317, + "step": 1856 + }, + { + "epoch": 0.10011861117101574, + "grad_norm": 0.8775038719177246, + "learning_rate": 9.986131171846518e-06, + "loss": 0.8318, + "step": 1857 + }, + { + "epoch": 0.10017252533965926, + "grad_norm": 0.9671807289123535, + "learning_rate": 9.986115387725039e-06, + "loss": 0.7412, + "step": 1858 + }, + { + "epoch": 0.10022643950830278, + "grad_norm": 0.8808870911598206, + "learning_rate": 9.986099594639197e-06, + "loss": 0.8213, + "step": 1859 + }, + { + "epoch": 0.10028035367694631, + "grad_norm": 0.8104208707809448, + "learning_rate": 9.986083792589021e-06, + "loss": 0.8108, + "step": 1860 + }, + { + "epoch": 0.10033426784558983, + "grad_norm": 0.839911937713623, + "learning_rate": 9.986067981574538e-06, + "loss": 0.8391, + "step": 1861 + }, + { + "epoch": 0.10038818201423334, + "grad_norm": 0.8402823805809021, + "learning_rate": 9.986052161595778e-06, + "loss": 0.7434, + "step": 1862 + }, + { + "epoch": 0.10044209618287686, + "grad_norm": 0.7591431140899658, + "learning_rate": 
9.986036332652768e-06, + "loss": 0.763, + "step": 1863 + }, + { + "epoch": 0.10049601035152038, + "grad_norm": 0.8613053560256958, + "learning_rate": 9.986020494745538e-06, + "loss": 0.8324, + "step": 1864 + }, + { + "epoch": 0.1005499245201639, + "grad_norm": 0.8467068076133728, + "learning_rate": 9.986004647874117e-06, + "loss": 0.882, + "step": 1865 + }, + { + "epoch": 0.10060383868880741, + "grad_norm": 1.0717257261276245, + "learning_rate": 9.98598879203853e-06, + "loss": 0.9305, + "step": 1866 + }, + { + "epoch": 0.10065775285745093, + "grad_norm": 0.8680382370948792, + "learning_rate": 9.985972927238808e-06, + "loss": 0.7521, + "step": 1867 + }, + { + "epoch": 0.10071166702609445, + "grad_norm": 0.8465799689292908, + "learning_rate": 9.98595705347498e-06, + "loss": 0.8562, + "step": 1868 + }, + { + "epoch": 0.10076558119473798, + "grad_norm": 0.938218355178833, + "learning_rate": 9.985941170747072e-06, + "loss": 0.7737, + "step": 1869 + }, + { + "epoch": 0.1008194953633815, + "grad_norm": 0.8189761638641357, + "learning_rate": 9.985925279055117e-06, + "loss": 0.8502, + "step": 1870 + }, + { + "epoch": 0.10087340953202502, + "grad_norm": 0.915703535079956, + "learning_rate": 9.985909378399138e-06, + "loss": 0.9576, + "step": 1871 + }, + { + "epoch": 0.10092732370066854, + "grad_norm": 0.7837297916412354, + "learning_rate": 9.985893468779168e-06, + "loss": 0.7091, + "step": 1872 + }, + { + "epoch": 0.10098123786931205, + "grad_norm": 0.7426577806472778, + "learning_rate": 9.985877550195234e-06, + "loss": 0.768, + "step": 1873 + }, + { + "epoch": 0.10103515203795557, + "grad_norm": 0.9437102675437927, + "learning_rate": 9.985861622647364e-06, + "loss": 0.8308, + "step": 1874 + }, + { + "epoch": 0.10108906620659909, + "grad_norm": 0.7381339073181152, + "learning_rate": 9.985845686135586e-06, + "loss": 0.7206, + "step": 1875 + }, + { + "epoch": 0.10114298037524261, + "grad_norm": 0.8478738069534302, + "learning_rate": 9.985829740659932e-06, + "loss": 0.7512, + 
"step": 1876 + }, + { + "epoch": 0.10119689454388614, + "grad_norm": 0.8331673741340637, + "learning_rate": 9.985813786220428e-06, + "loss": 0.8281, + "step": 1877 + }, + { + "epoch": 0.10125080871252966, + "grad_norm": 0.7703354954719543, + "learning_rate": 9.985797822817102e-06, + "loss": 0.7313, + "step": 1878 + }, + { + "epoch": 0.10130472288117318, + "grad_norm": 0.9182866811752319, + "learning_rate": 9.985781850449985e-06, + "loss": 0.8365, + "step": 1879 + }, + { + "epoch": 0.1013586370498167, + "grad_norm": 0.8285559415817261, + "learning_rate": 9.985765869119104e-06, + "loss": 0.8439, + "step": 1880 + }, + { + "epoch": 0.10141255121846021, + "grad_norm": 0.8400557041168213, + "learning_rate": 9.985749878824488e-06, + "loss": 0.8011, + "step": 1881 + }, + { + "epoch": 0.10146646538710373, + "grad_norm": 0.9225326776504517, + "learning_rate": 9.985733879566168e-06, + "loss": 0.8402, + "step": 1882 + }, + { + "epoch": 0.10152037955574725, + "grad_norm": 0.9194371700286865, + "learning_rate": 9.985717871344172e-06, + "loss": 0.8245, + "step": 1883 + }, + { + "epoch": 0.10157429372439077, + "grad_norm": 0.7443274259567261, + "learning_rate": 9.985701854158525e-06, + "loss": 0.7708, + "step": 1884 + }, + { + "epoch": 0.10162820789303428, + "grad_norm": 1.1139355897903442, + "learning_rate": 9.985685828009259e-06, + "loss": 0.8384, + "step": 1885 + }, + { + "epoch": 0.10168212206167782, + "grad_norm": 0.8835493326187134, + "learning_rate": 9.985669792896402e-06, + "loss": 0.8063, + "step": 1886 + }, + { + "epoch": 0.10173603623032133, + "grad_norm": 0.8012663125991821, + "learning_rate": 9.985653748819983e-06, + "loss": 0.8393, + "step": 1887 + }, + { + "epoch": 0.10178995039896485, + "grad_norm": 0.8092807531356812, + "learning_rate": 9.985637695780033e-06, + "loss": 0.7631, + "step": 1888 + }, + { + "epoch": 0.10184386456760837, + "grad_norm": 1.7357290983200073, + "learning_rate": 9.985621633776577e-06, + "loss": 0.8067, + "step": 1889 + }, + { + "epoch": 
0.10189777873625189, + "grad_norm": 0.8562015891075134, + "learning_rate": 9.985605562809646e-06, + "loss": 0.8543, + "step": 1890 + }, + { + "epoch": 0.1019516929048954, + "grad_norm": 0.9570844769477844, + "learning_rate": 9.98558948287927e-06, + "loss": 0.7778, + "step": 1891 + }, + { + "epoch": 0.10200560707353892, + "grad_norm": 0.748468279838562, + "learning_rate": 9.985573393985475e-06, + "loss": 0.6559, + "step": 1892 + }, + { + "epoch": 0.10205952124218244, + "grad_norm": 1.004490852355957, + "learning_rate": 9.98555729612829e-06, + "loss": 0.8453, + "step": 1893 + }, + { + "epoch": 0.10211343541082596, + "grad_norm": 0.9566166996955872, + "learning_rate": 9.985541189307749e-06, + "loss": 0.8984, + "step": 1894 + }, + { + "epoch": 0.10216734957946949, + "grad_norm": 0.8624017834663391, + "learning_rate": 9.985525073523874e-06, + "loss": 0.7442, + "step": 1895 + }, + { + "epoch": 0.10222126374811301, + "grad_norm": 1.0596553087234497, + "learning_rate": 9.9855089487767e-06, + "loss": 0.778, + "step": 1896 + }, + { + "epoch": 0.10227517791675653, + "grad_norm": 0.8003553152084351, + "learning_rate": 9.985492815066252e-06, + "loss": 0.7513, + "step": 1897 + }, + { + "epoch": 0.10232909208540004, + "grad_norm": 1.0067185163497925, + "learning_rate": 9.98547667239256e-06, + "loss": 0.8878, + "step": 1898 + }, + { + "epoch": 0.10238300625404356, + "grad_norm": 0.8398754596710205, + "learning_rate": 9.985460520755654e-06, + "loss": 0.8222, + "step": 1899 + }, + { + "epoch": 0.10243692042268708, + "grad_norm": 0.9688541293144226, + "learning_rate": 9.985444360155563e-06, + "loss": 0.8304, + "step": 1900 + }, + { + "epoch": 0.1024908345913306, + "grad_norm": 0.8848011493682861, + "learning_rate": 9.985428190592314e-06, + "loss": 0.7853, + "step": 1901 + }, + { + "epoch": 0.10254474875997412, + "grad_norm": 0.9240403771400452, + "learning_rate": 9.985412012065937e-06, + "loss": 0.9058, + "step": 1902 + }, + { + "epoch": 0.10259866292861763, + "grad_norm": 
0.814194917678833, + "learning_rate": 9.985395824576463e-06, + "loss": 0.7775, + "step": 1903 + }, + { + "epoch": 0.10265257709726117, + "grad_norm": 0.9210302233695984, + "learning_rate": 9.98537962812392e-06, + "loss": 0.9288, + "step": 1904 + }, + { + "epoch": 0.10270649126590468, + "grad_norm": 0.8850705027580261, + "learning_rate": 9.985363422708336e-06, + "loss": 0.9036, + "step": 1905 + }, + { + "epoch": 0.1027604054345482, + "grad_norm": 0.8312196731567383, + "learning_rate": 9.985347208329742e-06, + "loss": 0.811, + "step": 1906 + }, + { + "epoch": 0.10281431960319172, + "grad_norm": 1.1294670104980469, + "learning_rate": 9.985330984988164e-06, + "loss": 0.9775, + "step": 1907 + }, + { + "epoch": 0.10286823377183524, + "grad_norm": 0.7980399131774902, + "learning_rate": 9.985314752683635e-06, + "loss": 0.7786, + "step": 1908 + }, + { + "epoch": 0.10292214794047876, + "grad_norm": 0.8291264176368713, + "learning_rate": 9.985298511416181e-06, + "loss": 0.7028, + "step": 1909 + }, + { + "epoch": 0.10297606210912227, + "grad_norm": 0.8284684419631958, + "learning_rate": 9.985282261185833e-06, + "loss": 0.8043, + "step": 1910 + }, + { + "epoch": 0.10302997627776579, + "grad_norm": 0.8680904507637024, + "learning_rate": 9.985266001992622e-06, + "loss": 0.8274, + "step": 1911 + }, + { + "epoch": 0.10308389044640931, + "grad_norm": 0.7380900979042053, + "learning_rate": 9.985249733836573e-06, + "loss": 0.6991, + "step": 1912 + }, + { + "epoch": 0.10313780461505284, + "grad_norm": 0.8572129011154175, + "learning_rate": 9.985233456717718e-06, + "loss": 0.7751, + "step": 1913 + }, + { + "epoch": 0.10319171878369636, + "grad_norm": 0.8797627687454224, + "learning_rate": 9.985217170636085e-06, + "loss": 0.8681, + "step": 1914 + }, + { + "epoch": 0.10324563295233988, + "grad_norm": 0.9301999807357788, + "learning_rate": 9.985200875591704e-06, + "loss": 0.6208, + "step": 1915 + }, + { + "epoch": 0.1032995471209834, + "grad_norm": 0.8296228647232056, + "learning_rate": 
9.985184571584606e-06, + "loss": 0.8027, + "step": 1916 + }, + { + "epoch": 0.10335346128962691, + "grad_norm": 0.8241246342658997, + "learning_rate": 9.985168258614815e-06, + "loss": 0.8223, + "step": 1917 + }, + { + "epoch": 0.10340737545827043, + "grad_norm": 0.9633389115333557, + "learning_rate": 9.985151936682367e-06, + "loss": 0.9037, + "step": 1918 + }, + { + "epoch": 0.10346128962691395, + "grad_norm": 0.8903288245201111, + "learning_rate": 9.985135605787286e-06, + "loss": 0.8949, + "step": 1919 + }, + { + "epoch": 0.10351520379555747, + "grad_norm": 0.8670981526374817, + "learning_rate": 9.985119265929604e-06, + "loss": 0.7094, + "step": 1920 + }, + { + "epoch": 0.10356911796420099, + "grad_norm": 0.9681735038757324, + "learning_rate": 9.985102917109351e-06, + "loss": 0.9617, + "step": 1921 + }, + { + "epoch": 0.10362303213284452, + "grad_norm": 0.9229291081428528, + "learning_rate": 9.985086559326555e-06, + "loss": 0.9384, + "step": 1922 + }, + { + "epoch": 0.10367694630148803, + "grad_norm": 0.8501392602920532, + "learning_rate": 9.985070192581245e-06, + "loss": 0.8647, + "step": 1923 + }, + { + "epoch": 0.10373086047013155, + "grad_norm": 1.4047728776931763, + "learning_rate": 9.985053816873452e-06, + "loss": 0.7905, + "step": 1924 + }, + { + "epoch": 0.10378477463877507, + "grad_norm": 1.154661774635315, + "learning_rate": 9.985037432203204e-06, + "loss": 0.8666, + "step": 1925 + }, + { + "epoch": 0.10383868880741859, + "grad_norm": 1.042126178741455, + "learning_rate": 9.985021038570532e-06, + "loss": 0.7736, + "step": 1926 + }, + { + "epoch": 0.1038926029760621, + "grad_norm": 0.7904629111289978, + "learning_rate": 9.985004635975464e-06, + "loss": 0.7247, + "step": 1927 + }, + { + "epoch": 0.10394651714470562, + "grad_norm": 0.8718095421791077, + "learning_rate": 9.984988224418029e-06, + "loss": 0.7792, + "step": 1928 + }, + { + "epoch": 0.10400043131334914, + "grad_norm": 0.870330274105072, + "learning_rate": 9.984971803898258e-06, + "loss": 0.7992, 
+ "step": 1929 + }, + { + "epoch": 0.10405434548199267, + "grad_norm": 0.8473007678985596, + "learning_rate": 9.98495537441618e-06, + "loss": 0.883, + "step": 1930 + }, + { + "epoch": 0.10410825965063619, + "grad_norm": 1.0333232879638672, + "learning_rate": 9.984938935971824e-06, + "loss": 0.9228, + "step": 1931 + }, + { + "epoch": 0.10416217381927971, + "grad_norm": 0.9389268159866333, + "learning_rate": 9.984922488565221e-06, + "loss": 0.7792, + "step": 1932 + }, + { + "epoch": 0.10421608798792323, + "grad_norm": 0.9977405667304993, + "learning_rate": 9.9849060321964e-06, + "loss": 0.7971, + "step": 1933 + }, + { + "epoch": 0.10427000215656675, + "grad_norm": 0.7879780530929565, + "learning_rate": 9.98488956686539e-06, + "loss": 0.8149, + "step": 1934 + }, + { + "epoch": 0.10432391632521026, + "grad_norm": 0.8149437308311462, + "learning_rate": 9.98487309257222e-06, + "loss": 0.8391, + "step": 1935 + }, + { + "epoch": 0.10437783049385378, + "grad_norm": 0.9226745367050171, + "learning_rate": 9.984856609316921e-06, + "loss": 0.9581, + "step": 1936 + }, + { + "epoch": 0.1044317446624973, + "grad_norm": 0.9190924167633057, + "learning_rate": 9.984840117099524e-06, + "loss": 0.8859, + "step": 1937 + }, + { + "epoch": 0.10448565883114082, + "grad_norm": 0.7996852397918701, + "learning_rate": 9.984823615920054e-06, + "loss": 0.7377, + "step": 1938 + }, + { + "epoch": 0.10453957299978435, + "grad_norm": 1.0055615901947021, + "learning_rate": 9.984807105778544e-06, + "loss": 1.4365, + "step": 1939 + }, + { + "epoch": 0.10459348716842787, + "grad_norm": 0.8595201969146729, + "learning_rate": 9.984790586675023e-06, + "loss": 0.807, + "step": 1940 + }, + { + "epoch": 0.10464740133707139, + "grad_norm": 0.9500923156738281, + "learning_rate": 9.984774058609522e-06, + "loss": 0.9378, + "step": 1941 + }, + { + "epoch": 0.1047013155057149, + "grad_norm": 0.8677893877029419, + "learning_rate": 9.98475752158207e-06, + "loss": 0.8399, + "step": 1942 + }, + { + "epoch": 
0.10475522967435842, + "grad_norm": 0.8256751298904419, + "learning_rate": 9.984740975592695e-06, + "loss": 0.8552, + "step": 1943 + }, + { + "epoch": 0.10480914384300194, + "grad_norm": 0.8910439610481262, + "learning_rate": 9.984724420641427e-06, + "loss": 0.9704, + "step": 1944 + }, + { + "epoch": 0.10486305801164546, + "grad_norm": 0.8732389807701111, + "learning_rate": 9.9847078567283e-06, + "loss": 0.8448, + "step": 1945 + }, + { + "epoch": 0.10491697218028898, + "grad_norm": 0.856151282787323, + "learning_rate": 9.984691283853338e-06, + "loss": 0.7403, + "step": 1946 + }, + { + "epoch": 0.1049708863489325, + "grad_norm": 0.8741405010223389, + "learning_rate": 9.984674702016573e-06, + "loss": 0.8913, + "step": 1947 + }, + { + "epoch": 0.10502480051757603, + "grad_norm": 0.9214139580726624, + "learning_rate": 9.984658111218036e-06, + "loss": 0.8901, + "step": 1948 + }, + { + "epoch": 0.10507871468621954, + "grad_norm": 0.9773908853530884, + "learning_rate": 9.984641511457757e-06, + "loss": 0.7979, + "step": 1949 + }, + { + "epoch": 0.10513262885486306, + "grad_norm": 0.9136568903923035, + "learning_rate": 9.984624902735765e-06, + "loss": 0.9019, + "step": 1950 + }, + { + "epoch": 0.10518654302350658, + "grad_norm": 0.857468843460083, + "learning_rate": 9.984608285052087e-06, + "loss": 0.7663, + "step": 1951 + }, + { + "epoch": 0.1052404571921501, + "grad_norm": 0.8473180532455444, + "learning_rate": 9.984591658406756e-06, + "loss": 0.8137, + "step": 1952 + }, + { + "epoch": 0.10529437136079361, + "grad_norm": 0.8932186961174011, + "learning_rate": 9.984575022799805e-06, + "loss": 0.8859, + "step": 1953 + }, + { + "epoch": 0.10534828552943713, + "grad_norm": 0.8191091418266296, + "learning_rate": 9.984558378231257e-06, + "loss": 0.8111, + "step": 1954 + }, + { + "epoch": 0.10540219969808065, + "grad_norm": 0.8452546000480652, + "learning_rate": 9.984541724701147e-06, + "loss": 0.8563, + "step": 1955 + }, + { + "epoch": 0.10545611386672417, + "grad_norm": 
0.8053101897239685, + "learning_rate": 9.984525062209502e-06, + "loss": 0.8166, + "step": 1956 + }, + { + "epoch": 0.1055100280353677, + "grad_norm": 0.7936314344406128, + "learning_rate": 9.984508390756354e-06, + "loss": 0.8446, + "step": 1957 + }, + { + "epoch": 0.10556394220401122, + "grad_norm": 0.7867884635925293, + "learning_rate": 9.984491710341733e-06, + "loss": 0.7719, + "step": 1958 + }, + { + "epoch": 0.10561785637265474, + "grad_norm": 0.8387873768806458, + "learning_rate": 9.984475020965667e-06, + "loss": 0.842, + "step": 1959 + }, + { + "epoch": 0.10567177054129825, + "grad_norm": 0.8028631806373596, + "learning_rate": 9.984458322628188e-06, + "loss": 0.7673, + "step": 1960 + }, + { + "epoch": 0.10572568470994177, + "grad_norm": 0.765836238861084, + "learning_rate": 9.984441615329323e-06, + "loss": 0.7383, + "step": 1961 + }, + { + "epoch": 0.10577959887858529, + "grad_norm": 0.8619019389152527, + "learning_rate": 9.984424899069106e-06, + "loss": 0.8076, + "step": 1962 + }, + { + "epoch": 0.10583351304722881, + "grad_norm": 1.1085911989212036, + "learning_rate": 9.984408173847565e-06, + "loss": 0.9379, + "step": 1963 + }, + { + "epoch": 0.10588742721587233, + "grad_norm": 0.7861249446868896, + "learning_rate": 9.98439143966473e-06, + "loss": 0.7531, + "step": 1964 + }, + { + "epoch": 0.10594134138451584, + "grad_norm": 0.8964807391166687, + "learning_rate": 9.984374696520633e-06, + "loss": 0.7991, + "step": 1965 + }, + { + "epoch": 0.10599525555315938, + "grad_norm": 0.720808207988739, + "learning_rate": 9.984357944415302e-06, + "loss": 0.7171, + "step": 1966 + }, + { + "epoch": 0.1060491697218029, + "grad_norm": 0.9870907068252563, + "learning_rate": 9.984341183348766e-06, + "loss": 0.8168, + "step": 1967 + }, + { + "epoch": 0.10610308389044641, + "grad_norm": 0.7987208366394043, + "learning_rate": 9.984324413321057e-06, + "loss": 0.817, + "step": 1968 + }, + { + "epoch": 0.10615699805908993, + "grad_norm": 0.7737677097320557, + "learning_rate": 
9.984307634332206e-06, + "loss": 0.855, + "step": 1969 + }, + { + "epoch": 0.10621091222773345, + "grad_norm": 0.9125123620033264, + "learning_rate": 9.984290846382243e-06, + "loss": 0.8059, + "step": 1970 + }, + { + "epoch": 0.10626482639637697, + "grad_norm": 0.8460454344749451, + "learning_rate": 9.984274049471197e-06, + "loss": 0.7415, + "step": 1971 + }, + { + "epoch": 0.10631874056502048, + "grad_norm": 0.8322888016700745, + "learning_rate": 9.984257243599096e-06, + "loss": 0.793, + "step": 1972 + }, + { + "epoch": 0.106372654733664, + "grad_norm": 0.7797715067863464, + "learning_rate": 9.984240428765975e-06, + "loss": 0.7324, + "step": 1973 + }, + { + "epoch": 0.10642656890230752, + "grad_norm": 0.847457766532898, + "learning_rate": 9.98422360497186e-06, + "loss": 0.7949, + "step": 1974 + }, + { + "epoch": 0.10648048307095105, + "grad_norm": 0.8471247553825378, + "learning_rate": 9.984206772216785e-06, + "loss": 0.8368, + "step": 1975 + }, + { + "epoch": 0.10653439723959457, + "grad_norm": 0.879416823387146, + "learning_rate": 9.984189930500778e-06, + "loss": 0.7779, + "step": 1976 + }, + { + "epoch": 0.10658831140823809, + "grad_norm": 0.8355580568313599, + "learning_rate": 9.98417307982387e-06, + "loss": 0.7741, + "step": 1977 + }, + { + "epoch": 0.1066422255768816, + "grad_norm": 0.8388553857803345, + "learning_rate": 9.98415622018609e-06, + "loss": 0.7839, + "step": 1978 + }, + { + "epoch": 0.10669613974552512, + "grad_norm": 0.7899215221405029, + "learning_rate": 9.98413935158747e-06, + "loss": 0.7419, + "step": 1979 + }, + { + "epoch": 0.10675005391416864, + "grad_norm": 0.9422525763511658, + "learning_rate": 9.98412247402804e-06, + "loss": 0.7977, + "step": 1980 + }, + { + "epoch": 0.10680396808281216, + "grad_norm": 0.8084313869476318, + "learning_rate": 9.984105587507831e-06, + "loss": 0.6813, + "step": 1981 + }, + { + "epoch": 0.10685788225145568, + "grad_norm": 0.9860095977783203, + "learning_rate": 9.98408869202687e-06, + "loss": 0.8934, + 
"step": 1982 + }, + { + "epoch": 0.10691179642009921, + "grad_norm": 0.9511064887046814, + "learning_rate": 9.98407178758519e-06, + "loss": 0.8438, + "step": 1983 + }, + { + "epoch": 0.10696571058874273, + "grad_norm": 0.9021103978157043, + "learning_rate": 9.984054874182822e-06, + "loss": 0.854, + "step": 1984 + }, + { + "epoch": 0.10701962475738624, + "grad_norm": 0.8343318104743958, + "learning_rate": 9.984037951819796e-06, + "loss": 0.8075, + "step": 1985 + }, + { + "epoch": 0.10707353892602976, + "grad_norm": 0.8592053651809692, + "learning_rate": 9.984021020496141e-06, + "loss": 0.8431, + "step": 1986 + }, + { + "epoch": 0.10712745309467328, + "grad_norm": 0.8554633259773254, + "learning_rate": 9.98400408021189e-06, + "loss": 0.797, + "step": 1987 + }, + { + "epoch": 0.1071813672633168, + "grad_norm": 0.8476511240005493, + "learning_rate": 9.98398713096707e-06, + "loss": 0.834, + "step": 1988 + }, + { + "epoch": 0.10723528143196032, + "grad_norm": 0.8374871611595154, + "learning_rate": 9.983970172761715e-06, + "loss": 0.7934, + "step": 1989 + }, + { + "epoch": 0.10728919560060383, + "grad_norm": 0.8740583658218384, + "learning_rate": 9.983953205595853e-06, + "loss": 0.8945, + "step": 1990 + }, + { + "epoch": 0.10734310976924735, + "grad_norm": 0.8888646364212036, + "learning_rate": 9.983936229469514e-06, + "loss": 0.8582, + "step": 1991 + }, + { + "epoch": 0.10739702393789088, + "grad_norm": 0.7999173402786255, + "learning_rate": 9.983919244382732e-06, + "loss": 0.7906, + "step": 1992 + }, + { + "epoch": 0.1074509381065344, + "grad_norm": 0.8284609913825989, + "learning_rate": 9.983902250335532e-06, + "loss": 0.8282, + "step": 1993 + }, + { + "epoch": 0.10750485227517792, + "grad_norm": 0.8933084607124329, + "learning_rate": 9.98388524732795e-06, + "loss": 0.8332, + "step": 1994 + }, + { + "epoch": 0.10755876644382144, + "grad_norm": 1.1771386861801147, + "learning_rate": 9.983868235360017e-06, + "loss": 0.6624, + "step": 1995 + }, + { + "epoch": 
0.10761268061246496, + "grad_norm": 0.7977056503295898, + "learning_rate": 9.98385121443176e-06, + "loss": 0.7169, + "step": 1996 + }, + { + "epoch": 0.10766659478110847, + "grad_norm": 1.1132346391677856, + "learning_rate": 9.98383418454321e-06, + "loss": 0.8448, + "step": 1997 + }, + { + "epoch": 0.10772050894975199, + "grad_norm": 0.8148393034934998, + "learning_rate": 9.983817145694396e-06, + "loss": 0.7313, + "step": 1998 + }, + { + "epoch": 0.10777442311839551, + "grad_norm": 1.0594265460968018, + "learning_rate": 9.983800097885353e-06, + "loss": 0.9795, + "step": 1999 + }, + { + "epoch": 0.10782833728703903, + "grad_norm": 0.8699034452438354, + "learning_rate": 9.983783041116109e-06, + "loss": 0.8717, + "step": 2000 + }, + { + "epoch": 0.10788225145568256, + "grad_norm": 1.0455189943313599, + "learning_rate": 9.983765975386696e-06, + "loss": 0.898, + "step": 2001 + }, + { + "epoch": 0.10793616562432608, + "grad_norm": 1.0363630056381226, + "learning_rate": 9.983748900697143e-06, + "loss": 0.8404, + "step": 2002 + }, + { + "epoch": 0.1079900797929696, + "grad_norm": 0.7753402590751648, + "learning_rate": 9.983731817047482e-06, + "loss": 0.8416, + "step": 2003 + }, + { + "epoch": 0.10804399396161311, + "grad_norm": 0.7321370244026184, + "learning_rate": 9.983714724437744e-06, + "loss": 0.7051, + "step": 2004 + }, + { + "epoch": 0.10809790813025663, + "grad_norm": 0.8907992839813232, + "learning_rate": 9.983697622867959e-06, + "loss": 0.8347, + "step": 2005 + }, + { + "epoch": 0.10815182229890015, + "grad_norm": 0.8662189841270447, + "learning_rate": 9.983680512338157e-06, + "loss": 0.7704, + "step": 2006 + }, + { + "epoch": 0.10820573646754367, + "grad_norm": 0.9187548756599426, + "learning_rate": 9.983663392848371e-06, + "loss": 0.8926, + "step": 2007 + }, + { + "epoch": 0.10825965063618719, + "grad_norm": 1.0350191593170166, + "learning_rate": 9.983646264398629e-06, + "loss": 0.8253, + "step": 2008 + }, + { + "epoch": 0.1083135648048307, + "grad_norm": 
0.9566621780395508, + "learning_rate": 9.983629126988963e-06, + "loss": 0.8545, + "step": 2009 + }, + { + "epoch": 0.10836747897347423, + "grad_norm": 0.7644455432891846, + "learning_rate": 9.983611980619405e-06, + "loss": 0.707, + "step": 2010 + }, + { + "epoch": 0.10842139314211775, + "grad_norm": 0.7929621934890747, + "learning_rate": 9.983594825289983e-06, + "loss": 0.8123, + "step": 2011 + }, + { + "epoch": 0.10847530731076127, + "grad_norm": 0.8667447566986084, + "learning_rate": 9.983577661000732e-06, + "loss": 0.8371, + "step": 2012 + }, + { + "epoch": 0.10852922147940479, + "grad_norm": 0.9008684158325195, + "learning_rate": 9.98356048775168e-06, + "loss": 0.8088, + "step": 2013 + }, + { + "epoch": 0.1085831356480483, + "grad_norm": 0.8797710537910461, + "learning_rate": 9.983543305542858e-06, + "loss": 0.8315, + "step": 2014 + }, + { + "epoch": 0.10863704981669182, + "grad_norm": 1.0082249641418457, + "learning_rate": 9.983526114374296e-06, + "loss": 0.6944, + "step": 2015 + }, + { + "epoch": 0.10869096398533534, + "grad_norm": 0.8216932415962219, + "learning_rate": 9.983508914246027e-06, + "loss": 0.7704, + "step": 2016 + }, + { + "epoch": 0.10874487815397886, + "grad_norm": 0.7873802781105042, + "learning_rate": 9.983491705158082e-06, + "loss": 0.8269, + "step": 2017 + }, + { + "epoch": 0.10879879232262238, + "grad_norm": 0.9200018644332886, + "learning_rate": 9.983474487110492e-06, + "loss": 0.8736, + "step": 2018 + }, + { + "epoch": 0.10885270649126591, + "grad_norm": 0.8780434727668762, + "learning_rate": 9.983457260103284e-06, + "loss": 0.8959, + "step": 2019 + }, + { + "epoch": 0.10890662065990943, + "grad_norm": 0.8503702878952026, + "learning_rate": 9.983440024136493e-06, + "loss": 0.874, + "step": 2020 + }, + { + "epoch": 0.10896053482855295, + "grad_norm": 0.8003312349319458, + "learning_rate": 9.98342277921015e-06, + "loss": 0.8053, + "step": 2021 + }, + { + "epoch": 0.10901444899719646, + "grad_norm": 0.8508152961730957, + "learning_rate": 
9.983405525324284e-06, + "loss": 0.8349, + "step": 2022 + }, + { + "epoch": 0.10906836316583998, + "grad_norm": 0.7947866320610046, + "learning_rate": 9.983388262478928e-06, + "loss": 0.7969, + "step": 2023 + }, + { + "epoch": 0.1091222773344835, + "grad_norm": 0.7566391229629517, + "learning_rate": 9.98337099067411e-06, + "loss": 0.7485, + "step": 2024 + }, + { + "epoch": 0.10917619150312702, + "grad_norm": 0.7484708428382874, + "learning_rate": 9.983353709909865e-06, + "loss": 0.7223, + "step": 2025 + }, + { + "epoch": 0.10923010567177054, + "grad_norm": 0.7474842667579651, + "learning_rate": 9.983336420186223e-06, + "loss": 0.7643, + "step": 2026 + }, + { + "epoch": 0.10928401984041405, + "grad_norm": 0.9116804003715515, + "learning_rate": 9.983319121503212e-06, + "loss": 0.9259, + "step": 2027 + }, + { + "epoch": 0.10933793400905759, + "grad_norm": 0.7918151617050171, + "learning_rate": 9.983301813860866e-06, + "loss": 0.8006, + "step": 2028 + }, + { + "epoch": 0.1093918481777011, + "grad_norm": 0.8043256998062134, + "learning_rate": 9.983284497259216e-06, + "loss": 0.7776, + "step": 2029 + }, + { + "epoch": 0.10944576234634462, + "grad_norm": 0.7829573154449463, + "learning_rate": 9.983267171698292e-06, + "loss": 0.7518, + "step": 2030 + }, + { + "epoch": 0.10949967651498814, + "grad_norm": 0.9080957174301147, + "learning_rate": 9.983249837178126e-06, + "loss": 0.777, + "step": 2031 + }, + { + "epoch": 0.10955359068363166, + "grad_norm": 0.9077693223953247, + "learning_rate": 9.983232493698748e-06, + "loss": 0.7412, + "step": 2032 + }, + { + "epoch": 0.10960750485227518, + "grad_norm": 0.7891800403594971, + "learning_rate": 9.98321514126019e-06, + "loss": 0.8089, + "step": 2033 + }, + { + "epoch": 0.1096614190209187, + "grad_norm": 0.8350703716278076, + "learning_rate": 9.983197779862485e-06, + "loss": 0.8414, + "step": 2034 + }, + { + "epoch": 0.10971533318956221, + "grad_norm": 0.8714777231216431, + "learning_rate": 9.983180409505663e-06, + "loss": 0.7355, + 
"step": 2035 + }, + { + "epoch": 0.10976924735820574, + "grad_norm": 0.8524130582809448, + "learning_rate": 9.98316303018975e-06, + "loss": 0.8611, + "step": 2036 + }, + { + "epoch": 0.10982316152684926, + "grad_norm": 0.8570566177368164, + "learning_rate": 9.983145641914787e-06, + "loss": 0.799, + "step": 2037 + }, + { + "epoch": 0.10987707569549278, + "grad_norm": 0.8222963213920593, + "learning_rate": 9.983128244680797e-06, + "loss": 0.8302, + "step": 2038 + }, + { + "epoch": 0.1099309898641363, + "grad_norm": 0.7977816462516785, + "learning_rate": 9.983110838487818e-06, + "loss": 0.8475, + "step": 2039 + }, + { + "epoch": 0.10998490403277981, + "grad_norm": 0.7925818562507629, + "learning_rate": 9.983093423335875e-06, + "loss": 0.7176, + "step": 2040 + }, + { + "epoch": 0.11003881820142333, + "grad_norm": 0.8456152081489563, + "learning_rate": 9.983075999225002e-06, + "loss": 0.785, + "step": 2041 + }, + { + "epoch": 0.11009273237006685, + "grad_norm": 0.8691622018814087, + "learning_rate": 9.98305856615523e-06, + "loss": 0.8871, + "step": 2042 + }, + { + "epoch": 0.11014664653871037, + "grad_norm": 0.9402886629104614, + "learning_rate": 9.983041124126593e-06, + "loss": 0.8239, + "step": 2043 + }, + { + "epoch": 0.11020056070735389, + "grad_norm": 0.7975844144821167, + "learning_rate": 9.98302367313912e-06, + "loss": 0.7336, + "step": 2044 + }, + { + "epoch": 0.11025447487599742, + "grad_norm": 0.8384075164794922, + "learning_rate": 9.98300621319284e-06, + "loss": 0.9003, + "step": 2045 + }, + { + "epoch": 0.11030838904464094, + "grad_norm": 0.847994327545166, + "learning_rate": 9.98298874428779e-06, + "loss": 0.8611, + "step": 2046 + }, + { + "epoch": 0.11036230321328445, + "grad_norm": 0.801159143447876, + "learning_rate": 9.982971266423996e-06, + "loss": 0.7967, + "step": 2047 + }, + { + "epoch": 0.11041621738192797, + "grad_norm": 0.8316680192947388, + "learning_rate": 9.982953779601492e-06, + "loss": 0.8644, + "step": 2048 + }, + { + "epoch": 
0.11047013155057149, + "grad_norm": 0.9387392401695251, + "learning_rate": 9.982936283820311e-06, + "loss": 0.916, + "step": 2049 + }, + { + "epoch": 0.11052404571921501, + "grad_norm": 0.8682491779327393, + "learning_rate": 9.982918779080481e-06, + "loss": 0.8267, + "step": 2050 + }, + { + "epoch": 0.11057795988785853, + "grad_norm": 0.8443827629089355, + "learning_rate": 9.982901265382034e-06, + "loss": 0.8129, + "step": 2051 + }, + { + "epoch": 0.11063187405650204, + "grad_norm": 0.8612427115440369, + "learning_rate": 9.982883742725005e-06, + "loss": 0.9203, + "step": 2052 + }, + { + "epoch": 0.11068578822514556, + "grad_norm": 0.786834716796875, + "learning_rate": 9.98286621110942e-06, + "loss": 0.7731, + "step": 2053 + }, + { + "epoch": 0.1107397023937891, + "grad_norm": 0.8566606044769287, + "learning_rate": 9.982848670535316e-06, + "loss": 0.8111, + "step": 2054 + }, + { + "epoch": 0.11079361656243261, + "grad_norm": 0.7485222816467285, + "learning_rate": 9.982831121002722e-06, + "loss": 0.722, + "step": 2055 + }, + { + "epoch": 0.11084753073107613, + "grad_norm": 0.7441151738166809, + "learning_rate": 9.98281356251167e-06, + "loss": 0.7081, + "step": 2056 + }, + { + "epoch": 0.11090144489971965, + "grad_norm": 0.8212536573410034, + "learning_rate": 9.98279599506219e-06, + "loss": 0.8572, + "step": 2057 + }, + { + "epoch": 0.11095535906836317, + "grad_norm": 0.8686707019805908, + "learning_rate": 9.982778418654315e-06, + "loss": 0.8553, + "step": 2058 + }, + { + "epoch": 0.11100927323700668, + "grad_norm": 0.8908647298812866, + "learning_rate": 9.982760833288079e-06, + "loss": 0.9059, + "step": 2059 + }, + { + "epoch": 0.1110631874056502, + "grad_norm": 0.9393401741981506, + "learning_rate": 9.982743238963508e-06, + "loss": 0.8574, + "step": 2060 + }, + { + "epoch": 0.11111710157429372, + "grad_norm": 0.9027063250541687, + "learning_rate": 9.982725635680638e-06, + "loss": 0.7717, + "step": 2061 + }, + { + "epoch": 0.11117101574293724, + "grad_norm": 
0.7742587924003601, + "learning_rate": 9.982708023439498e-06, + "loss": 0.6618, + "step": 2062 + }, + { + "epoch": 0.11122492991158077, + "grad_norm": 0.8025707602500916, + "learning_rate": 9.982690402240124e-06, + "loss": 0.7263, + "step": 2063 + }, + { + "epoch": 0.11127884408022429, + "grad_norm": 0.8629397749900818, + "learning_rate": 9.982672772082541e-06, + "loss": 0.8222, + "step": 2064 + }, + { + "epoch": 0.1113327582488678, + "grad_norm": 0.8332691788673401, + "learning_rate": 9.982655132966785e-06, + "loss": 0.8302, + "step": 2065 + }, + { + "epoch": 0.11138667241751132, + "grad_norm": 0.8381907939910889, + "learning_rate": 9.982637484892889e-06, + "loss": 0.8638, + "step": 2066 + }, + { + "epoch": 0.11144058658615484, + "grad_norm": 1.0945167541503906, + "learning_rate": 9.982619827860882e-06, + "loss": 0.8866, + "step": 2067 + }, + { + "epoch": 0.11149450075479836, + "grad_norm": 0.8755025267601013, + "learning_rate": 9.982602161870795e-06, + "loss": 0.8587, + "step": 2068 + }, + { + "epoch": 0.11154841492344188, + "grad_norm": 0.8665636777877808, + "learning_rate": 9.982584486922664e-06, + "loss": 0.8309, + "step": 2069 + }, + { + "epoch": 0.1116023290920854, + "grad_norm": 0.8764104247093201, + "learning_rate": 9.982566803016516e-06, + "loss": 0.9003, + "step": 2070 + }, + { + "epoch": 0.11165624326072891, + "grad_norm": 1.1225675344467163, + "learning_rate": 9.982549110152387e-06, + "loss": 0.8897, + "step": 2071 + }, + { + "epoch": 0.11171015742937244, + "grad_norm": 0.7883412837982178, + "learning_rate": 9.982531408330304e-06, + "loss": 0.7104, + "step": 2072 + }, + { + "epoch": 0.11176407159801596, + "grad_norm": 0.8683668971061707, + "learning_rate": 9.982513697550303e-06, + "loss": 0.831, + "step": 2073 + }, + { + "epoch": 0.11181798576665948, + "grad_norm": 0.9139745831489563, + "learning_rate": 9.982495977812415e-06, + "loss": 0.7492, + "step": 2074 + }, + { + "epoch": 0.111871899935303, + "grad_norm": 0.8651925921440125, + "learning_rate": 
9.98247824911667e-06, + "loss": 0.8385, + "step": 2075 + }, + { + "epoch": 0.11192581410394652, + "grad_norm": 0.9110192656517029, + "learning_rate": 9.982460511463102e-06, + "loss": 0.8513, + "step": 2076 + }, + { + "epoch": 0.11197972827259003, + "grad_norm": 0.8511810302734375, + "learning_rate": 9.982442764851742e-06, + "loss": 0.8352, + "step": 2077 + }, + { + "epoch": 0.11203364244123355, + "grad_norm": 0.8981106877326965, + "learning_rate": 9.982425009282622e-06, + "loss": 0.7837, + "step": 2078 + }, + { + "epoch": 0.11208755660987707, + "grad_norm": 0.7660240530967712, + "learning_rate": 9.982407244755771e-06, + "loss": 0.6994, + "step": 2079 + }, + { + "epoch": 0.11214147077852059, + "grad_norm": 0.830569863319397, + "learning_rate": 9.982389471271228e-06, + "loss": 0.7756, + "step": 2080 + }, + { + "epoch": 0.11219538494716412, + "grad_norm": 0.8888838887214661, + "learning_rate": 9.982371688829018e-06, + "loss": 0.7302, + "step": 2081 + }, + { + "epoch": 0.11224929911580764, + "grad_norm": 0.823513388633728, + "learning_rate": 9.982353897429176e-06, + "loss": 0.8357, + "step": 2082 + }, + { + "epoch": 0.11230321328445116, + "grad_norm": 0.8353226780891418, + "learning_rate": 9.982336097071734e-06, + "loss": 0.7939, + "step": 2083 + }, + { + "epoch": 0.11235712745309467, + "grad_norm": 1.0246703624725342, + "learning_rate": 9.982318287756725e-06, + "loss": 0.9416, + "step": 2084 + }, + { + "epoch": 0.11241104162173819, + "grad_norm": 0.9405194520950317, + "learning_rate": 9.982300469484178e-06, + "loss": 0.8296, + "step": 2085 + }, + { + "epoch": 0.11246495579038171, + "grad_norm": 0.905885636806488, + "learning_rate": 9.982282642254126e-06, + "loss": 0.8181, + "step": 2086 + }, + { + "epoch": 0.11251886995902523, + "grad_norm": 0.8098746538162231, + "learning_rate": 9.982264806066604e-06, + "loss": 0.7372, + "step": 2087 + }, + { + "epoch": 0.11257278412766875, + "grad_norm": 1.2416350841522217, + "learning_rate": 9.98224696092164e-06, + "loss": 0.8984, 
+ "step": 2088 + }, + { + "epoch": 0.11262669829631228, + "grad_norm": 0.8675969839096069, + "learning_rate": 9.98222910681927e-06, + "loss": 0.8417, + "step": 2089 + }, + { + "epoch": 0.1126806124649558, + "grad_norm": 1.063124179840088, + "learning_rate": 9.982211243759522e-06, + "loss": 0.9227, + "step": 2090 + }, + { + "epoch": 0.11273452663359931, + "grad_norm": 0.9010531902313232, + "learning_rate": 9.98219337174243e-06, + "loss": 0.9547, + "step": 2091 + }, + { + "epoch": 0.11278844080224283, + "grad_norm": 0.7843347191810608, + "learning_rate": 9.982175490768027e-06, + "loss": 0.8607, + "step": 2092 + }, + { + "epoch": 0.11284235497088635, + "grad_norm": 0.8451966643333435, + "learning_rate": 9.982157600836344e-06, + "loss": 0.8788, + "step": 2093 + }, + { + "epoch": 0.11289626913952987, + "grad_norm": 0.7359250783920288, + "learning_rate": 9.982139701947415e-06, + "loss": 0.7916, + "step": 2094 + }, + { + "epoch": 0.11295018330817339, + "grad_norm": 0.8133944869041443, + "learning_rate": 9.98212179410127e-06, + "loss": 0.8327, + "step": 2095 + }, + { + "epoch": 0.1130040974768169, + "grad_norm": 0.8658613562583923, + "learning_rate": 9.982103877297941e-06, + "loss": 0.7648, + "step": 2096 + }, + { + "epoch": 0.11305801164546042, + "grad_norm": 0.8523211479187012, + "learning_rate": 9.982085951537463e-06, + "loss": 0.8618, + "step": 2097 + }, + { + "epoch": 0.11311192581410395, + "grad_norm": 0.9494971632957458, + "learning_rate": 9.982068016819867e-06, + "loss": 0.8116, + "step": 2098 + }, + { + "epoch": 0.11316583998274747, + "grad_norm": 0.797603964805603, + "learning_rate": 9.982050073145182e-06, + "loss": 0.7268, + "step": 2099 + }, + { + "epoch": 0.11321975415139099, + "grad_norm": 0.8662691712379456, + "learning_rate": 9.982032120513443e-06, + "loss": 0.8007, + "step": 2100 + }, + { + "epoch": 0.1132736683200345, + "grad_norm": 0.8377127051353455, + "learning_rate": 9.982014158924684e-06, + "loss": 0.813, + "step": 2101 + }, + { + "epoch": 
0.11332758248867802, + "grad_norm": 1.0051186084747314, + "learning_rate": 9.981996188378934e-06, + "loss": 0.921, + "step": 2102 + }, + { + "epoch": 0.11338149665732154, + "grad_norm": 0.7831799983978271, + "learning_rate": 9.981978208876228e-06, + "loss": 0.9197, + "step": 2103 + }, + { + "epoch": 0.11343541082596506, + "grad_norm": 1.0273268222808838, + "learning_rate": 9.981960220416595e-06, + "loss": 0.9144, + "step": 2104 + }, + { + "epoch": 0.11348932499460858, + "grad_norm": 0.8754317164421082, + "learning_rate": 9.981942223000072e-06, + "loss": 0.8359, + "step": 2105 + }, + { + "epoch": 0.1135432391632521, + "grad_norm": 0.7923420071601868, + "learning_rate": 9.981924216626686e-06, + "loss": 0.737, + "step": 2106 + }, + { + "epoch": 0.11359715333189563, + "grad_norm": 0.8651608824729919, + "learning_rate": 9.981906201296475e-06, + "loss": 0.7588, + "step": 2107 + }, + { + "epoch": 0.11365106750053915, + "grad_norm": 0.9219616651535034, + "learning_rate": 9.981888177009468e-06, + "loss": 0.8598, + "step": 2108 + }, + { + "epoch": 0.11370498166918266, + "grad_norm": 0.8936532139778137, + "learning_rate": 9.981870143765697e-06, + "loss": 0.7718, + "step": 2109 + }, + { + "epoch": 0.11375889583782618, + "grad_norm": 0.8959317803382874, + "learning_rate": 9.981852101565195e-06, + "loss": 0.794, + "step": 2110 + }, + { + "epoch": 0.1138128100064697, + "grad_norm": 0.8781943917274475, + "learning_rate": 9.981834050407997e-06, + "loss": 0.8045, + "step": 2111 + }, + { + "epoch": 0.11386672417511322, + "grad_norm": 0.8148792386054993, + "learning_rate": 9.981815990294131e-06, + "loss": 0.7398, + "step": 2112 + }, + { + "epoch": 0.11392063834375674, + "grad_norm": 0.8491646647453308, + "learning_rate": 9.981797921223633e-06, + "loss": 0.878, + "step": 2113 + }, + { + "epoch": 0.11397455251240025, + "grad_norm": 0.8166778087615967, + "learning_rate": 9.981779843196533e-06, + "loss": 0.918, + "step": 2114 + }, + { + "epoch": 0.11402846668104377, + "grad_norm": 
0.8016941547393799, + "learning_rate": 9.981761756212867e-06, + "loss": 0.7958, + "step": 2115 + }, + { + "epoch": 0.1140823808496873, + "grad_norm": 0.9108608961105347, + "learning_rate": 9.981743660272663e-06, + "loss": 0.8645, + "step": 2116 + }, + { + "epoch": 0.11413629501833082, + "grad_norm": 0.8930072784423828, + "learning_rate": 9.981725555375956e-06, + "loss": 0.842, + "step": 2117 + }, + { + "epoch": 0.11419020918697434, + "grad_norm": 0.75871342420578, + "learning_rate": 9.981707441522778e-06, + "loss": 0.7513, + "step": 2118 + }, + { + "epoch": 0.11424412335561786, + "grad_norm": 0.9924628734588623, + "learning_rate": 9.981689318713163e-06, + "loss": 0.8248, + "step": 2119 + }, + { + "epoch": 0.11429803752426138, + "grad_norm": 0.9345909953117371, + "learning_rate": 9.981671186947145e-06, + "loss": 0.7963, + "step": 2120 + }, + { + "epoch": 0.1143519516929049, + "grad_norm": 0.8094825148582458, + "learning_rate": 9.98165304622475e-06, + "loss": 0.8189, + "step": 2121 + }, + { + "epoch": 0.11440586586154841, + "grad_norm": 0.789262056350708, + "learning_rate": 9.981634896546017e-06, + "loss": 0.721, + "step": 2122 + }, + { + "epoch": 0.11445978003019193, + "grad_norm": 0.9279952645301819, + "learning_rate": 9.981616737910975e-06, + "loss": 0.8499, + "step": 2123 + }, + { + "epoch": 0.11451369419883545, + "grad_norm": 0.8332392573356628, + "learning_rate": 9.981598570319657e-06, + "loss": 0.8296, + "step": 2124 + }, + { + "epoch": 0.11456760836747898, + "grad_norm": 0.7957965731620789, + "learning_rate": 9.981580393772098e-06, + "loss": 0.7872, + "step": 2125 + }, + { + "epoch": 0.1146215225361225, + "grad_norm": 0.7587382197380066, + "learning_rate": 9.981562208268331e-06, + "loss": 0.721, + "step": 2126 + }, + { + "epoch": 0.11467543670476602, + "grad_norm": 0.7246111631393433, + "learning_rate": 9.981544013808385e-06, + "loss": 0.7965, + "step": 2127 + }, + { + "epoch": 0.11472935087340953, + "grad_norm": 0.9953028559684753, + "learning_rate": 
9.981525810392295e-06, + "loss": 0.7129, + "step": 2128 + }, + { + "epoch": 0.11478326504205305, + "grad_norm": 1.0731823444366455, + "learning_rate": 9.981507598020094e-06, + "loss": 0.8532, + "step": 2129 + }, + { + "epoch": 0.11483717921069657, + "grad_norm": 0.8425208926200867, + "learning_rate": 9.981489376691814e-06, + "loss": 0.8191, + "step": 2130 + }, + { + "epoch": 0.11489109337934009, + "grad_norm": 0.7841627597808838, + "learning_rate": 9.981471146407487e-06, + "loss": 0.7946, + "step": 2131 + }, + { + "epoch": 0.1149450075479836, + "grad_norm": 0.8923974633216858, + "learning_rate": 9.981452907167148e-06, + "loss": 0.8445, + "step": 2132 + }, + { + "epoch": 0.11499892171662712, + "grad_norm": 0.7729552984237671, + "learning_rate": 9.981434658970828e-06, + "loss": 0.7566, + "step": 2133 + }, + { + "epoch": 0.11505283588527065, + "grad_norm": 0.910899817943573, + "learning_rate": 9.98141640181856e-06, + "loss": 0.8236, + "step": 2134 + }, + { + "epoch": 0.11510675005391417, + "grad_norm": 0.8768936395645142, + "learning_rate": 9.981398135710377e-06, + "loss": 0.8929, + "step": 2135 + }, + { + "epoch": 0.11516066422255769, + "grad_norm": 0.9078627824783325, + "learning_rate": 9.981379860646313e-06, + "loss": 0.745, + "step": 2136 + }, + { + "epoch": 0.11521457839120121, + "grad_norm": 0.8225182890892029, + "learning_rate": 9.981361576626399e-06, + "loss": 0.8349, + "step": 2137 + }, + { + "epoch": 0.11526849255984473, + "grad_norm": 0.8092076778411865, + "learning_rate": 9.981343283650668e-06, + "loss": 0.8157, + "step": 2138 + }, + { + "epoch": 0.11532240672848824, + "grad_norm": 0.8253282308578491, + "learning_rate": 9.981324981719156e-06, + "loss": 0.7412, + "step": 2139 + }, + { + "epoch": 0.11537632089713176, + "grad_norm": 0.9668901562690735, + "learning_rate": 9.981306670831892e-06, + "loss": 0.7868, + "step": 2140 + }, + { + "epoch": 0.11543023506577528, + "grad_norm": 0.7919616103172302, + "learning_rate": 9.981288350988911e-06, + "loss": 0.7384, 
+ "step": 2141 + }, + { + "epoch": 0.11548414923441881, + "grad_norm": 0.8589178919792175, + "learning_rate": 9.981270022190244e-06, + "loss": 0.8352, + "step": 2142 + }, + { + "epoch": 0.11553806340306233, + "grad_norm": 0.8211520910263062, + "learning_rate": 9.981251684435926e-06, + "loss": 0.8124, + "step": 2143 + }, + { + "epoch": 0.11559197757170585, + "grad_norm": 0.911702573299408, + "learning_rate": 9.98123333772599e-06, + "loss": 0.8468, + "step": 2144 + }, + { + "epoch": 0.11564589174034937, + "grad_norm": 0.7934874892234802, + "learning_rate": 9.981214982060469e-06, + "loss": 0.8091, + "step": 2145 + }, + { + "epoch": 0.11569980590899288, + "grad_norm": 0.7407031655311584, + "learning_rate": 9.981196617439394e-06, + "loss": 0.7755, + "step": 2146 + }, + { + "epoch": 0.1157537200776364, + "grad_norm": 0.757688581943512, + "learning_rate": 9.9811782438628e-06, + "loss": 0.7468, + "step": 2147 + }, + { + "epoch": 0.11580763424627992, + "grad_norm": 1.0007857084274292, + "learning_rate": 9.981159861330717e-06, + "loss": 0.9108, + "step": 2148 + }, + { + "epoch": 0.11586154841492344, + "grad_norm": 1.300113558769226, + "learning_rate": 9.981141469843183e-06, + "loss": 0.8099, + "step": 2149 + }, + { + "epoch": 0.11591546258356696, + "grad_norm": 1.0352274179458618, + "learning_rate": 9.981123069400226e-06, + "loss": 0.801, + "step": 2150 + }, + { + "epoch": 0.11596937675221049, + "grad_norm": 0.9033756256103516, + "learning_rate": 9.981104660001885e-06, + "loss": 0.8789, + "step": 2151 + }, + { + "epoch": 0.116023290920854, + "grad_norm": 0.9051264524459839, + "learning_rate": 9.981086241648188e-06, + "loss": 0.8737, + "step": 2152 + }, + { + "epoch": 0.11607720508949752, + "grad_norm": 0.7855859398841858, + "learning_rate": 9.98106781433917e-06, + "loss": 0.7508, + "step": 2153 + }, + { + "epoch": 0.11613111925814104, + "grad_norm": 0.9001717567443848, + "learning_rate": 9.981049378074862e-06, + "loss": 0.6852, + "step": 2154 + }, + { + "epoch": 
0.11618503342678456, + "grad_norm": 0.8165149092674255, + "learning_rate": 9.9810309328553e-06, + "loss": 0.8755, + "step": 2155 + }, + { + "epoch": 0.11623894759542808, + "grad_norm": 0.8920814990997314, + "learning_rate": 9.981012478680517e-06, + "loss": 0.753, + "step": 2156 + }, + { + "epoch": 0.1162928617640716, + "grad_norm": 0.8186051249504089, + "learning_rate": 9.980994015550544e-06, + "loss": 0.8341, + "step": 2157 + }, + { + "epoch": 0.11634677593271511, + "grad_norm": 0.8103832602500916, + "learning_rate": 9.980975543465417e-06, + "loss": 0.8276, + "step": 2158 + }, + { + "epoch": 0.11640069010135863, + "grad_norm": 0.8752830028533936, + "learning_rate": 9.980957062425167e-06, + "loss": 0.8449, + "step": 2159 + }, + { + "epoch": 0.11645460427000216, + "grad_norm": 0.9748302698135376, + "learning_rate": 9.98093857242983e-06, + "loss": 0.8323, + "step": 2160 + }, + { + "epoch": 0.11650851843864568, + "grad_norm": 0.8948556184768677, + "learning_rate": 9.980920073479435e-06, + "loss": 0.7836, + "step": 2161 + }, + { + "epoch": 0.1165624326072892, + "grad_norm": 0.8715651035308838, + "learning_rate": 9.980901565574017e-06, + "loss": 0.7942, + "step": 2162 + }, + { + "epoch": 0.11661634677593272, + "grad_norm": 0.7667563557624817, + "learning_rate": 9.980883048713612e-06, + "loss": 0.7517, + "step": 2163 + }, + { + "epoch": 0.11667026094457623, + "grad_norm": 0.8058063387870789, + "learning_rate": 9.980864522898247e-06, + "loss": 0.7997, + "step": 2164 + }, + { + "epoch": 0.11672417511321975, + "grad_norm": 0.9300008416175842, + "learning_rate": 9.980845988127963e-06, + "loss": 0.856, + "step": 2165 + }, + { + "epoch": 0.11677808928186327, + "grad_norm": 0.8321848511695862, + "learning_rate": 9.98082744440279e-06, + "loss": 0.7483, + "step": 2166 + }, + { + "epoch": 0.11683200345050679, + "grad_norm": 0.9346274137496948, + "learning_rate": 9.98080889172276e-06, + "loss": 0.8149, + "step": 2167 + }, + { + "epoch": 0.1168859176191503, + "grad_norm": 
0.9119831919670105, + "learning_rate": 9.980790330087906e-06, + "loss": 0.8384, + "step": 2168 + }, + { + "epoch": 0.11693983178779384, + "grad_norm": 0.8416613936424255, + "learning_rate": 9.980771759498264e-06, + "loss": 0.776, + "step": 2169 + }, + { + "epoch": 0.11699374595643736, + "grad_norm": 0.765889048576355, + "learning_rate": 9.980753179953867e-06, + "loss": 0.7413, + "step": 2170 + }, + { + "epoch": 0.11704766012508087, + "grad_norm": 1.3491352796554565, + "learning_rate": 9.980734591454746e-06, + "loss": 0.7444, + "step": 2171 + }, + { + "epoch": 0.11710157429372439, + "grad_norm": 0.926618218421936, + "learning_rate": 9.980715994000936e-06, + "loss": 0.8495, + "step": 2172 + }, + { + "epoch": 0.11715548846236791, + "grad_norm": 0.7720175981521606, + "learning_rate": 9.98069738759247e-06, + "loss": 0.8238, + "step": 2173 + }, + { + "epoch": 0.11720940263101143, + "grad_norm": 0.9114102125167847, + "learning_rate": 9.980678772229385e-06, + "loss": 0.7805, + "step": 2174 + }, + { + "epoch": 0.11726331679965495, + "grad_norm": 0.778404712677002, + "learning_rate": 9.980660147911709e-06, + "loss": 0.7705, + "step": 2175 + }, + { + "epoch": 0.11731723096829846, + "grad_norm": 0.7945864200592041, + "learning_rate": 9.980641514639478e-06, + "loss": 0.7052, + "step": 2176 + }, + { + "epoch": 0.11737114513694198, + "grad_norm": 0.8246831297874451, + "learning_rate": 9.980622872412723e-06, + "loss": 0.8514, + "step": 2177 + }, + { + "epoch": 0.11742505930558551, + "grad_norm": 0.899563193321228, + "learning_rate": 9.980604221231482e-06, + "loss": 0.761, + "step": 2178 + }, + { + "epoch": 0.11747897347422903, + "grad_norm": 0.7277782559394836, + "learning_rate": 9.980585561095788e-06, + "loss": 0.6671, + "step": 2179 + }, + { + "epoch": 0.11753288764287255, + "grad_norm": 0.7977896928787231, + "learning_rate": 9.98056689200567e-06, + "loss": 0.8045, + "step": 2180 + }, + { + "epoch": 0.11758680181151607, + "grad_norm": 0.8606321811676025, + "learning_rate": 
9.980548213961165e-06, + "loss": 0.8232, + "step": 2181 + }, + { + "epoch": 0.11764071598015959, + "grad_norm": 0.769458532333374, + "learning_rate": 9.980529526962308e-06, + "loss": 0.729, + "step": 2182 + }, + { + "epoch": 0.1176946301488031, + "grad_norm": 1.1045739650726318, + "learning_rate": 9.98051083100913e-06, + "loss": 0.802, + "step": 2183 + }, + { + "epoch": 0.11774854431744662, + "grad_norm": 0.7568592429161072, + "learning_rate": 9.980492126101664e-06, + "loss": 0.7427, + "step": 2184 + }, + { + "epoch": 0.11780245848609014, + "grad_norm": 0.7503477931022644, + "learning_rate": 9.980473412239946e-06, + "loss": 0.7857, + "step": 2185 + }, + { + "epoch": 0.11785637265473366, + "grad_norm": 0.8330819606781006, + "learning_rate": 9.980454689424007e-06, + "loss": 0.7561, + "step": 2186 + }, + { + "epoch": 0.11791028682337719, + "grad_norm": 0.792736291885376, + "learning_rate": 9.980435957653884e-06, + "loss": 0.837, + "step": 2187 + }, + { + "epoch": 0.1179642009920207, + "grad_norm": 0.8983330130577087, + "learning_rate": 9.980417216929608e-06, + "loss": 0.8499, + "step": 2188 + }, + { + "epoch": 0.11801811516066422, + "grad_norm": 0.8700925707817078, + "learning_rate": 9.980398467251214e-06, + "loss": 0.9048, + "step": 2189 + }, + { + "epoch": 0.11807202932930774, + "grad_norm": 0.8873588442802429, + "learning_rate": 9.980379708618734e-06, + "loss": 0.7617, + "step": 2190 + }, + { + "epoch": 0.11812594349795126, + "grad_norm": 0.7786865234375, + "learning_rate": 9.980360941032204e-06, + "loss": 0.7828, + "step": 2191 + }, + { + "epoch": 0.11817985766659478, + "grad_norm": 0.796852171421051, + "learning_rate": 9.980342164491657e-06, + "loss": 0.7739, + "step": 2192 + }, + { + "epoch": 0.1182337718352383, + "grad_norm": 0.7752018570899963, + "learning_rate": 9.980323378997126e-06, + "loss": 0.6969, + "step": 2193 + }, + { + "epoch": 0.11828768600388181, + "grad_norm": 0.8607134819030762, + "learning_rate": 9.980304584548644e-06, + "loss": 0.8623, + 
"step": 2194 + }, + { + "epoch": 0.11834160017252535, + "grad_norm": 0.8624950051307678, + "learning_rate": 9.980285781146248e-06, + "loss": 0.8124, + "step": 2195 + }, + { + "epoch": 0.11839551434116886, + "grad_norm": 0.8951582908630371, + "learning_rate": 9.98026696878997e-06, + "loss": 0.8491, + "step": 2196 + }, + { + "epoch": 0.11844942850981238, + "grad_norm": 0.8373478055000305, + "learning_rate": 9.980248147479843e-06, + "loss": 0.7166, + "step": 2197 + }, + { + "epoch": 0.1185033426784559, + "grad_norm": 0.8007619976997375, + "learning_rate": 9.980229317215901e-06, + "loss": 0.8137, + "step": 2198 + }, + { + "epoch": 0.11855725684709942, + "grad_norm": 0.8464154601097107, + "learning_rate": 9.980210477998177e-06, + "loss": 0.7803, + "step": 2199 + }, + { + "epoch": 0.11861117101574294, + "grad_norm": 0.8384450078010559, + "learning_rate": 9.98019162982671e-06, + "loss": 0.8511, + "step": 2200 + }, + { + "epoch": 0.11866508518438645, + "grad_norm": 0.9059091210365295, + "learning_rate": 9.980172772701527e-06, + "loss": 0.8538, + "step": 2201 + }, + { + "epoch": 0.11871899935302997, + "grad_norm": 1.1080526113510132, + "learning_rate": 9.980153906622667e-06, + "loss": 1.0067, + "step": 2202 + }, + { + "epoch": 0.11877291352167349, + "grad_norm": 0.8379873633384705, + "learning_rate": 9.980135031590162e-06, + "loss": 0.8285, + "step": 2203 + }, + { + "epoch": 0.11882682769031702, + "grad_norm": 0.9143814444541931, + "learning_rate": 9.980116147604044e-06, + "loss": 0.8286, + "step": 2204 + }, + { + "epoch": 0.11888074185896054, + "grad_norm": 0.8619917631149292, + "learning_rate": 9.98009725466435e-06, + "loss": 0.8304, + "step": 2205 + }, + { + "epoch": 0.11893465602760406, + "grad_norm": 0.8470893502235413, + "learning_rate": 9.980078352771112e-06, + "loss": 0.8245, + "step": 2206 + }, + { + "epoch": 0.11898857019624758, + "grad_norm": 0.9560073614120483, + "learning_rate": 9.980059441924365e-06, + "loss": 0.8821, + "step": 2207 + }, + { + "epoch": 
0.1190424843648911, + "grad_norm": 0.8186134696006775, + "learning_rate": 9.980040522124143e-06, + "loss": 0.7166, + "step": 2208 + }, + { + "epoch": 0.11909639853353461, + "grad_norm": 0.8410859704017639, + "learning_rate": 9.980021593370481e-06, + "loss": 0.7465, + "step": 2209 + }, + { + "epoch": 0.11915031270217813, + "grad_norm": 0.9180718660354614, + "learning_rate": 9.980002655663412e-06, + "loss": 0.8508, + "step": 2210 + }, + { + "epoch": 0.11920422687082165, + "grad_norm": 0.8384451270103455, + "learning_rate": 9.979983709002967e-06, + "loss": 0.7723, + "step": 2211 + }, + { + "epoch": 0.11925814103946517, + "grad_norm": 0.815075159072876, + "learning_rate": 9.979964753389187e-06, + "loss": 0.7769, + "step": 2212 + }, + { + "epoch": 0.1193120552081087, + "grad_norm": 0.9130523800849915, + "learning_rate": 9.9799457888221e-06, + "loss": 0.8616, + "step": 2213 + }, + { + "epoch": 0.11936596937675222, + "grad_norm": 0.8262661099433899, + "learning_rate": 9.97992681530174e-06, + "loss": 0.7507, + "step": 2214 + }, + { + "epoch": 0.11941988354539573, + "grad_norm": 0.8962772488594055, + "learning_rate": 9.979907832828145e-06, + "loss": 0.8387, + "step": 2215 + }, + { + "epoch": 0.11947379771403925, + "grad_norm": 0.8966812491416931, + "learning_rate": 9.979888841401348e-06, + "loss": 0.8095, + "step": 2216 + }, + { + "epoch": 0.11952771188268277, + "grad_norm": 0.8484013676643372, + "learning_rate": 9.979869841021381e-06, + "loss": 0.8475, + "step": 2217 + }, + { + "epoch": 0.11958162605132629, + "grad_norm": 0.8858511447906494, + "learning_rate": 9.979850831688282e-06, + "loss": 0.8576, + "step": 2218 + }, + { + "epoch": 0.1196355402199698, + "grad_norm": 0.8044704794883728, + "learning_rate": 9.97983181340208e-06, + "loss": 0.8195, + "step": 2219 + }, + { + "epoch": 0.11968945438861332, + "grad_norm": 0.8463665246963501, + "learning_rate": 9.979812786162815e-06, + "loss": 0.8177, + "step": 2220 + }, + { + "epoch": 0.11974336855725684, + "grad_norm": 
0.8145734071731567, + "learning_rate": 9.979793749970517e-06, + "loss": 0.8307, + "step": 2221 + }, + { + "epoch": 0.11979728272590037, + "grad_norm": 0.7789961695671082, + "learning_rate": 9.97977470482522e-06, + "loss": 0.7854, + "step": 2222 + }, + { + "epoch": 0.11985119689454389, + "grad_norm": 0.858213484287262, + "learning_rate": 9.97975565072696e-06, + "loss": 0.8914, + "step": 2223 + }, + { + "epoch": 0.11990511106318741, + "grad_norm": 0.8503074645996094, + "learning_rate": 9.979736587675772e-06, + "loss": 0.8731, + "step": 2224 + }, + { + "epoch": 0.11995902523183093, + "grad_norm": 0.9815833568572998, + "learning_rate": 9.97971751567169e-06, + "loss": 0.8769, + "step": 2225 + }, + { + "epoch": 0.12001293940047444, + "grad_norm": 0.7897947430610657, + "learning_rate": 9.979698434714747e-06, + "loss": 0.8308, + "step": 2226 + }, + { + "epoch": 0.12006685356911796, + "grad_norm": 0.9122232794761658, + "learning_rate": 9.979679344804976e-06, + "loss": 0.8934, + "step": 2227 + }, + { + "epoch": 0.12012076773776148, + "grad_norm": 0.7640379071235657, + "learning_rate": 9.979660245942416e-06, + "loss": 0.8205, + "step": 2228 + }, + { + "epoch": 0.120174681906405, + "grad_norm": 0.8736944198608398, + "learning_rate": 9.979641138127097e-06, + "loss": 0.8522, + "step": 2229 + }, + { + "epoch": 0.12022859607504852, + "grad_norm": 0.8782697916030884, + "learning_rate": 9.979622021359054e-06, + "loss": 0.812, + "step": 2230 + }, + { + "epoch": 0.12028251024369205, + "grad_norm": 0.8260065317153931, + "learning_rate": 9.979602895638322e-06, + "loss": 0.768, + "step": 2231 + }, + { + "epoch": 0.12033642441233557, + "grad_norm": 0.8338255286216736, + "learning_rate": 9.979583760964939e-06, + "loss": 0.7747, + "step": 2232 + }, + { + "epoch": 0.12039033858097908, + "grad_norm": 0.8310086131095886, + "learning_rate": 9.979564617338933e-06, + "loss": 0.8206, + "step": 2233 + }, + { + "epoch": 0.1204442527496226, + "grad_norm": 0.8234529495239258, + "learning_rate": 
9.979545464760342e-06, + "loss": 0.847, + "step": 2234 + }, + { + "epoch": 0.12049816691826612, + "grad_norm": 0.9490135908126831, + "learning_rate": 9.9795263032292e-06, + "loss": 0.7277, + "step": 2235 + }, + { + "epoch": 0.12055208108690964, + "grad_norm": 0.8937979340553284, + "learning_rate": 9.97950713274554e-06, + "loss": 0.8714, + "step": 2236 + }, + { + "epoch": 0.12060599525555316, + "grad_norm": 0.7739347219467163, + "learning_rate": 9.9794879533094e-06, + "loss": 0.8009, + "step": 2237 + }, + { + "epoch": 0.12065990942419667, + "grad_norm": 0.8843472003936768, + "learning_rate": 9.979468764920812e-06, + "loss": 0.7748, + "step": 2238 + }, + { + "epoch": 0.12071382359284019, + "grad_norm": 0.815528154373169, + "learning_rate": 9.979449567579809e-06, + "loss": 0.7896, + "step": 2239 + }, + { + "epoch": 0.12076773776148372, + "grad_norm": 0.8802885413169861, + "learning_rate": 9.979430361286428e-06, + "loss": 0.8468, + "step": 2240 + }, + { + "epoch": 0.12082165193012724, + "grad_norm": 0.7907035946846008, + "learning_rate": 9.979411146040703e-06, + "loss": 0.7742, + "step": 2241 + }, + { + "epoch": 0.12087556609877076, + "grad_norm": 0.8344926238059998, + "learning_rate": 9.979391921842669e-06, + "loss": 0.8242, + "step": 2242 + }, + { + "epoch": 0.12092948026741428, + "grad_norm": 0.8011842370033264, + "learning_rate": 9.979372688692359e-06, + "loss": 0.7697, + "step": 2243 + }, + { + "epoch": 0.1209833944360578, + "grad_norm": 0.9063104391098022, + "learning_rate": 9.97935344658981e-06, + "loss": 0.8487, + "step": 2244 + }, + { + "epoch": 0.12103730860470131, + "grad_norm": 0.8313894867897034, + "learning_rate": 9.979334195535053e-06, + "loss": 0.8601, + "step": 2245 + }, + { + "epoch": 0.12109122277334483, + "grad_norm": 0.7892987728118896, + "learning_rate": 9.979314935528125e-06, + "loss": 0.7539, + "step": 2246 + }, + { + "epoch": 0.12114513694198835, + "grad_norm": 0.8141210079193115, + "learning_rate": 9.979295666569062e-06, + "loss": 0.8749, + 
"step": 2247 + }, + { + "epoch": 0.12119905111063188, + "grad_norm": 0.8218675851821899, + "learning_rate": 9.979276388657895e-06, + "loss": 0.743, + "step": 2248 + }, + { + "epoch": 0.1212529652792754, + "grad_norm": 0.8640784025192261, + "learning_rate": 9.979257101794661e-06, + "loss": 0.8876, + "step": 2249 + }, + { + "epoch": 0.12130687944791892, + "grad_norm": 0.8411698341369629, + "learning_rate": 9.979237805979395e-06, + "loss": 0.8692, + "step": 2250 + }, + { + "epoch": 0.12136079361656243, + "grad_norm": 0.9402859210968018, + "learning_rate": 9.97921850121213e-06, + "loss": 0.9362, + "step": 2251 + }, + { + "epoch": 0.12141470778520595, + "grad_norm": 0.8132252097129822, + "learning_rate": 9.979199187492903e-06, + "loss": 0.8119, + "step": 2252 + }, + { + "epoch": 0.12146862195384947, + "grad_norm": 0.9142205119132996, + "learning_rate": 9.979179864821747e-06, + "loss": 0.8219, + "step": 2253 + }, + { + "epoch": 0.12152253612249299, + "grad_norm": 0.9614750742912292, + "learning_rate": 9.979160533198697e-06, + "loss": 0.8342, + "step": 2254 + }, + { + "epoch": 0.1215764502911365, + "grad_norm": 0.7893047332763672, + "learning_rate": 9.979141192623787e-06, + "loss": 0.7111, + "step": 2255 + }, + { + "epoch": 0.12163036445978002, + "grad_norm": 0.8807032704353333, + "learning_rate": 9.979121843097053e-06, + "loss": 0.7677, + "step": 2256 + }, + { + "epoch": 0.12168427862842356, + "grad_norm": 1.1099025011062622, + "learning_rate": 9.97910248461853e-06, + "loss": 0.9548, + "step": 2257 + }, + { + "epoch": 0.12173819279706707, + "grad_norm": 0.9182586669921875, + "learning_rate": 9.979083117188253e-06, + "loss": 0.8734, + "step": 2258 + }, + { + "epoch": 0.12179210696571059, + "grad_norm": 0.9201869964599609, + "learning_rate": 9.979063740806253e-06, + "loss": 0.823, + "step": 2259 + }, + { + "epoch": 0.12184602113435411, + "grad_norm": 1.0309760570526123, + "learning_rate": 9.979044355472571e-06, + "loss": 0.7175, + "step": 2260 + }, + { + "epoch": 
0.12189993530299763, + "grad_norm": 0.8577457070350647, + "learning_rate": 9.979024961187238e-06, + "loss": 0.8963, + "step": 2261 + }, + { + "epoch": 0.12195384947164115, + "grad_norm": 0.8203986883163452, + "learning_rate": 9.97900555795029e-06, + "loss": 0.736, + "step": 2262 + }, + { + "epoch": 0.12200776364028466, + "grad_norm": 0.8232439160346985, + "learning_rate": 9.97898614576176e-06, + "loss": 0.8104, + "step": 2263 + }, + { + "epoch": 0.12206167780892818, + "grad_norm": 1.276479959487915, + "learning_rate": 9.978966724621686e-06, + "loss": 0.7975, + "step": 2264 + }, + { + "epoch": 0.1221155919775717, + "grad_norm": 1.0115424394607544, + "learning_rate": 9.978947294530102e-06, + "loss": 1.0566, + "step": 2265 + }, + { + "epoch": 0.12216950614621523, + "grad_norm": 0.8645843863487244, + "learning_rate": 9.97892785548704e-06, + "loss": 0.8772, + "step": 2266 + }, + { + "epoch": 0.12222342031485875, + "grad_norm": 0.8335905075073242, + "learning_rate": 9.978908407492539e-06, + "loss": 0.7735, + "step": 2267 + }, + { + "epoch": 0.12227733448350227, + "grad_norm": 0.7752977013587952, + "learning_rate": 9.978888950546632e-06, + "loss": 0.725, + "step": 2268 + }, + { + "epoch": 0.12233124865214579, + "grad_norm": 0.9533143639564514, + "learning_rate": 9.978869484649354e-06, + "loss": 0.7845, + "step": 2269 + }, + { + "epoch": 0.1223851628207893, + "grad_norm": 1.2071044445037842, + "learning_rate": 9.978850009800739e-06, + "loss": 0.8394, + "step": 2270 + }, + { + "epoch": 0.12243907698943282, + "grad_norm": 0.8296889662742615, + "learning_rate": 9.978830526000825e-06, + "loss": 0.8088, + "step": 2271 + }, + { + "epoch": 0.12249299115807634, + "grad_norm": 0.7804126739501953, + "learning_rate": 9.978811033249643e-06, + "loss": 0.8174, + "step": 2272 + }, + { + "epoch": 0.12254690532671986, + "grad_norm": 0.9114241600036621, + "learning_rate": 9.978791531547232e-06, + "loss": 0.8601, + "step": 2273 + }, + { + "epoch": 0.12260081949536338, + "grad_norm": 
0.9482108354568481, + "learning_rate": 9.978772020893626e-06, + "loss": 0.8063, + "step": 2274 + }, + { + "epoch": 0.1226547336640069, + "grad_norm": 0.7750483751296997, + "learning_rate": 9.978752501288857e-06, + "loss": 0.7875, + "step": 2275 + }, + { + "epoch": 0.12270864783265042, + "grad_norm": 0.838796854019165, + "learning_rate": 9.978732972732964e-06, + "loss": 0.7617, + "step": 2276 + }, + { + "epoch": 0.12276256200129394, + "grad_norm": 0.8419491052627563, + "learning_rate": 9.97871343522598e-06, + "loss": 0.8438, + "step": 2277 + }, + { + "epoch": 0.12281647616993746, + "grad_norm": 0.8125029802322388, + "learning_rate": 9.97869388876794e-06, + "loss": 0.8376, + "step": 2278 + }, + { + "epoch": 0.12287039033858098, + "grad_norm": 0.8310109972953796, + "learning_rate": 9.978674333358882e-06, + "loss": 0.8159, + "step": 2279 + }, + { + "epoch": 0.1229243045072245, + "grad_norm": 0.9533166289329529, + "learning_rate": 9.978654768998838e-06, + "loss": 0.8911, + "step": 2280 + }, + { + "epoch": 0.12297821867586801, + "grad_norm": 0.7564504742622375, + "learning_rate": 9.978635195687845e-06, + "loss": 0.7685, + "step": 2281 + }, + { + "epoch": 0.12303213284451153, + "grad_norm": 0.7912551760673523, + "learning_rate": 9.978615613425937e-06, + "loss": 0.7392, + "step": 2282 + }, + { + "epoch": 0.12308604701315505, + "grad_norm": 0.8196814656257629, + "learning_rate": 9.978596022213148e-06, + "loss": 0.8619, + "step": 2283 + }, + { + "epoch": 0.12313996118179858, + "grad_norm": 0.9053134918212891, + "learning_rate": 9.978576422049515e-06, + "loss": 0.8822, + "step": 2284 + }, + { + "epoch": 0.1231938753504421, + "grad_norm": 0.7988365292549133, + "learning_rate": 9.978556812935074e-06, + "loss": 0.7993, + "step": 2285 + }, + { + "epoch": 0.12324778951908562, + "grad_norm": 0.7595045566558838, + "learning_rate": 9.978537194869859e-06, + "loss": 0.7589, + "step": 2286 + }, + { + "epoch": 0.12330170368772914, + "grad_norm": 0.872302234172821, + "learning_rate": 
9.978517567853908e-06, + "loss": 0.8315, + "step": 2287 + }, + { + "epoch": 0.12335561785637265, + "grad_norm": 0.8375674486160278, + "learning_rate": 9.97849793188725e-06, + "loss": 0.8348, + "step": 2288 + }, + { + "epoch": 0.12340953202501617, + "grad_norm": 0.8239575624465942, + "learning_rate": 9.978478286969927e-06, + "loss": 0.7636, + "step": 2289 + }, + { + "epoch": 0.12346344619365969, + "grad_norm": 0.8614348769187927, + "learning_rate": 9.97845863310197e-06, + "loss": 0.8162, + "step": 2290 + }, + { + "epoch": 0.12351736036230321, + "grad_norm": 0.8609321713447571, + "learning_rate": 9.978438970283417e-06, + "loss": 0.7776, + "step": 2291 + }, + { + "epoch": 0.12357127453094673, + "grad_norm": 0.9590173959732056, + "learning_rate": 9.978419298514302e-06, + "loss": 0.8761, + "step": 2292 + }, + { + "epoch": 0.12362518869959026, + "grad_norm": 0.8345216512680054, + "learning_rate": 9.978399617794659e-06, + "loss": 0.8353, + "step": 2293 + }, + { + "epoch": 0.12367910286823378, + "grad_norm": 0.8771556615829468, + "learning_rate": 9.978379928124526e-06, + "loss": 0.773, + "step": 2294 + }, + { + "epoch": 0.1237330170368773, + "grad_norm": 0.8305835127830505, + "learning_rate": 9.978360229503936e-06, + "loss": 0.7898, + "step": 2295 + }, + { + "epoch": 0.12378693120552081, + "grad_norm": 0.8536269664764404, + "learning_rate": 9.978340521932927e-06, + "loss": 0.8261, + "step": 2296 + }, + { + "epoch": 0.12384084537416433, + "grad_norm": 0.9008522629737854, + "learning_rate": 9.978320805411534e-06, + "loss": 0.7114, + "step": 2297 + }, + { + "epoch": 0.12389475954280785, + "grad_norm": 0.7834939956665039, + "learning_rate": 9.97830107993979e-06, + "loss": 0.7338, + "step": 2298 + }, + { + "epoch": 0.12394867371145137, + "grad_norm": 0.8269515037536621, + "learning_rate": 9.978281345517733e-06, + "loss": 0.7676, + "step": 2299 + }, + { + "epoch": 0.12400258788009488, + "grad_norm": 0.8482736945152283, + "learning_rate": 9.978261602145398e-06, + "loss": 0.8185, 
+ "step": 2300 + }, + { + "epoch": 0.12405650204873842, + "grad_norm": 0.8833953142166138, + "learning_rate": 9.978241849822819e-06, + "loss": 0.7776, + "step": 2301 + }, + { + "epoch": 0.12411041621738193, + "grad_norm": 0.8089832067489624, + "learning_rate": 9.978222088550033e-06, + "loss": 0.7697, + "step": 2302 + }, + { + "epoch": 0.12416433038602545, + "grad_norm": 0.8204466104507446, + "learning_rate": 9.978202318327075e-06, + "loss": 0.839, + "step": 2303 + }, + { + "epoch": 0.12421824455466897, + "grad_norm": 0.8547719120979309, + "learning_rate": 9.97818253915398e-06, + "loss": 0.9022, + "step": 2304 + }, + { + "epoch": 0.12427215872331249, + "grad_norm": 1.090289831161499, + "learning_rate": 9.978162751030787e-06, + "loss": 0.7154, + "step": 2305 + }, + { + "epoch": 0.124326072891956, + "grad_norm": 0.88922518491745, + "learning_rate": 9.978142953957526e-06, + "loss": 0.8962, + "step": 2306 + }, + { + "epoch": 0.12437998706059952, + "grad_norm": 0.8741730451583862, + "learning_rate": 9.978123147934236e-06, + "loss": 0.7742, + "step": 2307 + }, + { + "epoch": 0.12443390122924304, + "grad_norm": 1.2885240316390991, + "learning_rate": 9.97810333296095e-06, + "loss": 0.7256, + "step": 2308 + }, + { + "epoch": 0.12448781539788656, + "grad_norm": 0.7973229885101318, + "learning_rate": 9.978083509037711e-06, + "loss": 0.8433, + "step": 2309 + }, + { + "epoch": 0.12454172956653009, + "grad_norm": 0.8328043222427368, + "learning_rate": 9.978063676164544e-06, + "loss": 0.8617, + "step": 2310 + }, + { + "epoch": 0.12459564373517361, + "grad_norm": 0.8093283176422119, + "learning_rate": 9.978043834341493e-06, + "loss": 0.8407, + "step": 2311 + }, + { + "epoch": 0.12464955790381713, + "grad_norm": 0.7566602826118469, + "learning_rate": 9.978023983568588e-06, + "loss": 0.7602, + "step": 2312 + }, + { + "epoch": 0.12470347207246064, + "grad_norm": 0.7731996178627014, + "learning_rate": 9.97800412384587e-06, + "loss": 0.8323, + "step": 2313 + }, + { + "epoch": 
0.12475738624110416, + "grad_norm": 0.9148348569869995, + "learning_rate": 9.97798425517337e-06, + "loss": 0.7886, + "step": 2314 + }, + { + "epoch": 0.12481130040974768, + "grad_norm": 0.8546224236488342, + "learning_rate": 9.977964377551126e-06, + "loss": 0.8116, + "step": 2315 + }, + { + "epoch": 0.1248652145783912, + "grad_norm": 1.0733944177627563, + "learning_rate": 9.977944490979175e-06, + "loss": 0.8255, + "step": 2316 + }, + { + "epoch": 0.12491912874703472, + "grad_norm": 0.8404545783996582, + "learning_rate": 9.977924595457549e-06, + "loss": 0.8542, + "step": 2317 + }, + { + "epoch": 0.12497304291567823, + "grad_norm": 0.8276603817939758, + "learning_rate": 9.977904690986286e-06, + "loss": 0.8242, + "step": 2318 + }, + { + "epoch": 0.12502695708432177, + "grad_norm": 0.8703106641769409, + "learning_rate": 9.977884777565423e-06, + "loss": 0.8525, + "step": 2319 + }, + { + "epoch": 0.12508087125296527, + "grad_norm": 0.8353367447853088, + "learning_rate": 9.977864855194994e-06, + "loss": 0.7921, + "step": 2320 + }, + { + "epoch": 0.1251347854216088, + "grad_norm": 0.8283559083938599, + "learning_rate": 9.977844923875036e-06, + "loss": 0.8262, + "step": 2321 + }, + { + "epoch": 0.1251886995902523, + "grad_norm": 0.8737161755561829, + "learning_rate": 9.977824983605584e-06, + "loss": 0.9117, + "step": 2322 + }, + { + "epoch": 0.12524261375889584, + "grad_norm": 0.8616884350776672, + "learning_rate": 9.977805034386675e-06, + "loss": 0.8178, + "step": 2323 + }, + { + "epoch": 0.12529652792753937, + "grad_norm": 0.9863162636756897, + "learning_rate": 9.977785076218342e-06, + "loss": 0.8671, + "step": 2324 + }, + { + "epoch": 0.12535044209618287, + "grad_norm": 0.9636940360069275, + "learning_rate": 9.977765109100624e-06, + "loss": 0.894, + "step": 2325 + }, + { + "epoch": 0.1254043562648264, + "grad_norm": 0.741320013999939, + "learning_rate": 9.977745133033554e-06, + "loss": 0.7474, + "step": 2326 + }, + { + "epoch": 0.1254582704334699, + "grad_norm": 
0.7776119709014893, + "learning_rate": 9.97772514801717e-06, + "loss": 0.7867, + "step": 2327 + }, + { + "epoch": 0.12551218460211344, + "grad_norm": 0.8219690918922424, + "learning_rate": 9.97770515405151e-06, + "loss": 0.8443, + "step": 2328 + }, + { + "epoch": 0.12556609877075695, + "grad_norm": 0.8977565765380859, + "learning_rate": 9.977685151136605e-06, + "loss": 0.7831, + "step": 2329 + }, + { + "epoch": 0.12562001293940048, + "grad_norm": 0.8503162264823914, + "learning_rate": 9.977665139272495e-06, + "loss": 0.8733, + "step": 2330 + }, + { + "epoch": 0.12567392710804398, + "grad_norm": 0.7666327953338623, + "learning_rate": 9.977645118459213e-06, + "loss": 0.7165, + "step": 2331 + }, + { + "epoch": 0.1257278412766875, + "grad_norm": 0.8265602588653564, + "learning_rate": 9.977625088696797e-06, + "loss": 0.8894, + "step": 2332 + }, + { + "epoch": 0.12578175544533104, + "grad_norm": 0.9852930307388306, + "learning_rate": 9.977605049985282e-06, + "loss": 0.9223, + "step": 2333 + }, + { + "epoch": 0.12583566961397455, + "grad_norm": 0.9563886523246765, + "learning_rate": 9.977585002324705e-06, + "loss": 0.8275, + "step": 2334 + }, + { + "epoch": 0.12588958378261808, + "grad_norm": 0.8098574876785278, + "learning_rate": 9.977564945715102e-06, + "loss": 0.8831, + "step": 2335 + }, + { + "epoch": 0.12594349795126158, + "grad_norm": 0.8795431852340698, + "learning_rate": 9.977544880156507e-06, + "loss": 0.8079, + "step": 2336 + }, + { + "epoch": 0.12599741211990512, + "grad_norm": 0.7483893036842346, + "learning_rate": 9.97752480564896e-06, + "loss": 0.7734, + "step": 2337 + }, + { + "epoch": 0.12605132628854862, + "grad_norm": 0.7988960146903992, + "learning_rate": 9.977504722192493e-06, + "loss": 0.6936, + "step": 2338 + }, + { + "epoch": 0.12610524045719215, + "grad_norm": 0.7945669293403625, + "learning_rate": 9.977484629787143e-06, + "loss": 0.8608, + "step": 2339 + }, + { + "epoch": 0.12615915462583566, + "grad_norm": 0.8720629215240479, + "learning_rate": 
9.977464528432948e-06, + "loss": 0.8656, + "step": 2340 + }, + { + "epoch": 0.1262130687944792, + "grad_norm": 0.8935837745666504, + "learning_rate": 9.977444418129943e-06, + "loss": 0.8854, + "step": 2341 + }, + { + "epoch": 0.12626698296312272, + "grad_norm": 0.8034403324127197, + "learning_rate": 9.977424298878165e-06, + "loss": 0.8422, + "step": 2342 + }, + { + "epoch": 0.12632089713176622, + "grad_norm": 1.0071096420288086, + "learning_rate": 9.977404170677648e-06, + "loss": 0.9105, + "step": 2343 + }, + { + "epoch": 0.12637481130040976, + "grad_norm": 1.0757510662078857, + "learning_rate": 9.97738403352843e-06, + "loss": 0.7454, + "step": 2344 + }, + { + "epoch": 0.12642872546905326, + "grad_norm": 0.7133142352104187, + "learning_rate": 9.977363887430548e-06, + "loss": 0.6814, + "step": 2345 + }, + { + "epoch": 0.1264826396376968, + "grad_norm": 0.769752025604248, + "learning_rate": 9.977343732384035e-06, + "loss": 0.7209, + "step": 2346 + }, + { + "epoch": 0.1265365538063403, + "grad_norm": 0.8043524622917175, + "learning_rate": 9.977323568388933e-06, + "loss": 0.8379, + "step": 2347 + }, + { + "epoch": 0.12659046797498383, + "grad_norm": 0.9236345887184143, + "learning_rate": 9.97730339544527e-06, + "loss": 0.8091, + "step": 2348 + }, + { + "epoch": 0.12664438214362733, + "grad_norm": 0.8852472305297852, + "learning_rate": 9.97728321355309e-06, + "loss": 0.8527, + "step": 2349 + }, + { + "epoch": 0.12669829631227086, + "grad_norm": 0.8866454362869263, + "learning_rate": 9.977263022712425e-06, + "loss": 0.7412, + "step": 2350 + }, + { + "epoch": 0.1267522104809144, + "grad_norm": 0.7950204014778137, + "learning_rate": 9.977242822923311e-06, + "loss": 0.7778, + "step": 2351 + }, + { + "epoch": 0.1268061246495579, + "grad_norm": 0.8775694966316223, + "learning_rate": 9.977222614185787e-06, + "loss": 0.7437, + "step": 2352 + }, + { + "epoch": 0.12686003881820143, + "grad_norm": 0.8059643507003784, + "learning_rate": 9.977202396499889e-06, + "loss": 0.7935, + 
"step": 2353 + }, + { + "epoch": 0.12691395298684494, + "grad_norm": 0.8250171542167664, + "learning_rate": 9.977182169865652e-06, + "loss": 0.7936, + "step": 2354 + }, + { + "epoch": 0.12696786715548847, + "grad_norm": 0.8618381023406982, + "learning_rate": 9.97716193428311e-06, + "loss": 0.7884, + "step": 2355 + }, + { + "epoch": 0.12702178132413197, + "grad_norm": 0.8977087140083313, + "learning_rate": 9.977141689752306e-06, + "loss": 0.7764, + "step": 2356 + }, + { + "epoch": 0.1270756954927755, + "grad_norm": 0.7616862058639526, + "learning_rate": 9.97712143627327e-06, + "loss": 0.7222, + "step": 2357 + }, + { + "epoch": 0.127129609661419, + "grad_norm": 0.8255194425582886, + "learning_rate": 9.977101173846042e-06, + "loss": 0.8015, + "step": 2358 + }, + { + "epoch": 0.12718352383006254, + "grad_norm": 0.7783398628234863, + "learning_rate": 9.977080902470657e-06, + "loss": 0.7403, + "step": 2359 + }, + { + "epoch": 0.12723743799870607, + "grad_norm": 1.201339840888977, + "learning_rate": 9.977060622147152e-06, + "loss": 0.8994, + "step": 2360 + }, + { + "epoch": 0.12729135216734958, + "grad_norm": 0.906428337097168, + "learning_rate": 9.977040332875563e-06, + "loss": 0.7791, + "step": 2361 + }, + { + "epoch": 0.1273452663359931, + "grad_norm": 0.8238182663917542, + "learning_rate": 9.977020034655927e-06, + "loss": 0.728, + "step": 2362 + }, + { + "epoch": 0.1273991805046366, + "grad_norm": 0.9390681385993958, + "learning_rate": 9.976999727488279e-06, + "loss": 0.8697, + "step": 2363 + }, + { + "epoch": 0.12745309467328014, + "grad_norm": 0.8595122694969177, + "learning_rate": 9.976979411372658e-06, + "loss": 0.8481, + "step": 2364 + }, + { + "epoch": 0.12750700884192365, + "grad_norm": 0.8220391273498535, + "learning_rate": 9.976959086309099e-06, + "loss": 0.709, + "step": 2365 + }, + { + "epoch": 0.12756092301056718, + "grad_norm": 0.9712308645248413, + "learning_rate": 9.976938752297638e-06, + "loss": 0.8898, + "step": 2366 + }, + { + "epoch": 
0.12761483717921068, + "grad_norm": 0.8864933848381042, + "learning_rate": 9.976918409338315e-06, + "loss": 0.8798, + "step": 2367 + }, + { + "epoch": 0.12766875134785421, + "grad_norm": 0.7780918478965759, + "learning_rate": 9.976898057431162e-06, + "loss": 0.8123, + "step": 2368 + }, + { + "epoch": 0.12772266551649775, + "grad_norm": 0.8338439464569092, + "learning_rate": 9.976877696576218e-06, + "loss": 0.8177, + "step": 2369 + }, + { + "epoch": 0.12777657968514125, + "grad_norm": 0.9967712759971619, + "learning_rate": 9.976857326773517e-06, + "loss": 0.8613, + "step": 2370 + }, + { + "epoch": 0.12783049385378478, + "grad_norm": 0.7666492462158203, + "learning_rate": 9.976836948023099e-06, + "loss": 0.7226, + "step": 2371 + }, + { + "epoch": 0.1278844080224283, + "grad_norm": 0.9783684611320496, + "learning_rate": 9.976816560325e-06, + "loss": 0.8616, + "step": 2372 + }, + { + "epoch": 0.12793832219107182, + "grad_norm": 1.0170663595199585, + "learning_rate": 9.976796163679256e-06, + "loss": 0.8211, + "step": 2373 + }, + { + "epoch": 0.12799223635971532, + "grad_norm": 0.8657981157302856, + "learning_rate": 9.976775758085903e-06, + "loss": 0.867, + "step": 2374 + }, + { + "epoch": 0.12804615052835885, + "grad_norm": 0.8487955927848816, + "learning_rate": 9.976755343544979e-06, + "loss": 0.8056, + "step": 2375 + }, + { + "epoch": 0.12810006469700239, + "grad_norm": 0.90731281042099, + "learning_rate": 9.976734920056522e-06, + "loss": 0.8492, + "step": 2376 + }, + { + "epoch": 0.1281539788656459, + "grad_norm": 0.9684501886367798, + "learning_rate": 9.976714487620565e-06, + "loss": 0.8023, + "step": 2377 + }, + { + "epoch": 0.12820789303428942, + "grad_norm": 0.8361303806304932, + "learning_rate": 9.976694046237146e-06, + "loss": 0.8132, + "step": 2378 + }, + { + "epoch": 0.12826180720293293, + "grad_norm": 0.9570466876029968, + "learning_rate": 9.976673595906303e-06, + "loss": 0.8991, + "step": 2379 + }, + { + "epoch": 0.12831572137157646, + "grad_norm": 
0.8944576978683472, + "learning_rate": 9.976653136628071e-06, + "loss": 0.8163, + "step": 2380 + }, + { + "epoch": 0.12836963554021996, + "grad_norm": 0.7991742491722107, + "learning_rate": 9.976632668402489e-06, + "loss": 0.7962, + "step": 2381 + }, + { + "epoch": 0.1284235497088635, + "grad_norm": 0.9284802079200745, + "learning_rate": 9.976612191229594e-06, + "loss": 1.0115, + "step": 2382 + }, + { + "epoch": 0.128477463877507, + "grad_norm": 0.8092453479766846, + "learning_rate": 9.97659170510942e-06, + "loss": 0.705, + "step": 2383 + }, + { + "epoch": 0.12853137804615053, + "grad_norm": 0.8068677186965942, + "learning_rate": 9.976571210042005e-06, + "loss": 0.8283, + "step": 2384 + }, + { + "epoch": 0.12858529221479406, + "grad_norm": 0.8636525869369507, + "learning_rate": 9.976550706027386e-06, + "loss": 0.7824, + "step": 2385 + }, + { + "epoch": 0.12863920638343757, + "grad_norm": 0.9768033027648926, + "learning_rate": 9.9765301930656e-06, + "loss": 0.8317, + "step": 2386 + }, + { + "epoch": 0.1286931205520811, + "grad_norm": 0.8494508862495422, + "learning_rate": 9.976509671156684e-06, + "loss": 0.9464, + "step": 2387 + }, + { + "epoch": 0.1287470347207246, + "grad_norm": 0.8336171507835388, + "learning_rate": 9.976489140300676e-06, + "loss": 0.8003, + "step": 2388 + }, + { + "epoch": 0.12880094888936813, + "grad_norm": 0.819869339466095, + "learning_rate": 9.97646860049761e-06, + "loss": 0.6779, + "step": 2389 + }, + { + "epoch": 0.12885486305801164, + "grad_norm": 1.179028868675232, + "learning_rate": 9.976448051747526e-06, + "loss": 0.8183, + "step": 2390 + }, + { + "epoch": 0.12890877722665517, + "grad_norm": 0.8214680552482605, + "learning_rate": 9.97642749405046e-06, + "loss": 0.7659, + "step": 2391 + }, + { + "epoch": 0.12896269139529867, + "grad_norm": 0.8303862810134888, + "learning_rate": 9.976406927406446e-06, + "loss": 0.8993, + "step": 2392 + }, + { + "epoch": 0.1290166055639422, + "grad_norm": 0.8043105006217957, + "learning_rate": 
9.976386351815526e-06, + "loss": 0.7948, + "step": 2393 + }, + { + "epoch": 0.12907051973258574, + "grad_norm": 0.7988419532775879, + "learning_rate": 9.976365767277734e-06, + "loss": 0.8042, + "step": 2394 + }, + { + "epoch": 0.12912443390122924, + "grad_norm": 0.8145790696144104, + "learning_rate": 9.976345173793107e-06, + "loss": 0.7214, + "step": 2395 + }, + { + "epoch": 0.12917834806987277, + "grad_norm": 0.8323239088058472, + "learning_rate": 9.976324571361682e-06, + "loss": 0.8692, + "step": 2396 + }, + { + "epoch": 0.12923226223851628, + "grad_norm": 1.5968064069747925, + "learning_rate": 9.976303959983498e-06, + "loss": 0.8573, + "step": 2397 + }, + { + "epoch": 0.1292861764071598, + "grad_norm": 0.8523521423339844, + "learning_rate": 9.976283339658589e-06, + "loss": 0.8856, + "step": 2398 + }, + { + "epoch": 0.1293400905758033, + "grad_norm": 1.3875633478164673, + "learning_rate": 9.976262710386994e-06, + "loss": 0.829, + "step": 2399 + }, + { + "epoch": 0.12939400474444684, + "grad_norm": 0.8131827712059021, + "learning_rate": 9.976242072168751e-06, + "loss": 0.7787, + "step": 2400 + }, + { + "epoch": 0.12944791891309035, + "grad_norm": 0.8347164392471313, + "learning_rate": 9.976221425003896e-06, + "loss": 0.9119, + "step": 2401 + }, + { + "epoch": 0.12950183308173388, + "grad_norm": 0.791674792766571, + "learning_rate": 9.976200768892465e-06, + "loss": 0.8483, + "step": 2402 + }, + { + "epoch": 0.1295557472503774, + "grad_norm": 0.8207666277885437, + "learning_rate": 9.976180103834496e-06, + "loss": 0.7688, + "step": 2403 + }, + { + "epoch": 0.12960966141902092, + "grad_norm": 0.8335880041122437, + "learning_rate": 9.976159429830027e-06, + "loss": 0.8943, + "step": 2404 + }, + { + "epoch": 0.12966357558766445, + "grad_norm": 0.8273102045059204, + "learning_rate": 9.976138746879094e-06, + "loss": 0.7847, + "step": 2405 + }, + { + "epoch": 0.12971748975630795, + "grad_norm": 0.9029181003570557, + "learning_rate": 9.976118054981735e-06, + "loss": 0.9779, 
+ "step": 2406 + }, + { + "epoch": 0.12977140392495148, + "grad_norm": 1.0253269672393799, + "learning_rate": 9.976097354137986e-06, + "loss": 0.8301, + "step": 2407 + }, + { + "epoch": 0.129825318093595, + "grad_norm": 0.859992265701294, + "learning_rate": 9.976076644347887e-06, + "loss": 0.7809, + "step": 2408 + }, + { + "epoch": 0.12987923226223852, + "grad_norm": 0.8313273787498474, + "learning_rate": 9.976055925611472e-06, + "loss": 0.8435, + "step": 2409 + }, + { + "epoch": 0.12993314643088202, + "grad_norm": 0.8921852707862854, + "learning_rate": 9.976035197928779e-06, + "loss": 0.8407, + "step": 2410 + }, + { + "epoch": 0.12998706059952556, + "grad_norm": 0.9168267846107483, + "learning_rate": 9.976014461299848e-06, + "loss": 0.8428, + "step": 2411 + }, + { + "epoch": 0.1300409747681691, + "grad_norm": 0.8943728804588318, + "learning_rate": 9.975993715724712e-06, + "loss": 0.8953, + "step": 2412 + }, + { + "epoch": 0.1300948889368126, + "grad_norm": 0.8288392424583435, + "learning_rate": 9.975972961203411e-06, + "loss": 0.8008, + "step": 2413 + }, + { + "epoch": 0.13014880310545612, + "grad_norm": 0.8432718515396118, + "learning_rate": 9.975952197735982e-06, + "loss": 0.775, + "step": 2414 + }, + { + "epoch": 0.13020271727409963, + "grad_norm": 1.029341220855713, + "learning_rate": 9.975931425322462e-06, + "loss": 0.9086, + "step": 2415 + }, + { + "epoch": 0.13025663144274316, + "grad_norm": 0.8342422842979431, + "learning_rate": 9.975910643962888e-06, + "loss": 0.8867, + "step": 2416 + }, + { + "epoch": 0.13031054561138666, + "grad_norm": 0.7766898274421692, + "learning_rate": 9.975889853657298e-06, + "loss": 0.7597, + "step": 2417 + }, + { + "epoch": 0.1303644597800302, + "grad_norm": 0.865112841129303, + "learning_rate": 9.97586905440573e-06, + "loss": 0.8164, + "step": 2418 + }, + { + "epoch": 0.1304183739486737, + "grad_norm": 0.7938675880432129, + "learning_rate": 9.97584824620822e-06, + "loss": 0.8053, + "step": 2419 + }, + { + "epoch": 
0.13047228811731723, + "grad_norm": 0.8813329339027405, + "learning_rate": 9.975827429064805e-06, + "loss": 0.8662, + "step": 2420 + }, + { + "epoch": 0.13052620228596076, + "grad_norm": 0.8217114210128784, + "learning_rate": 9.975806602975525e-06, + "loss": 0.8647, + "step": 2421 + }, + { + "epoch": 0.13058011645460427, + "grad_norm": 1.0177736282348633, + "learning_rate": 9.975785767940413e-06, + "loss": 0.813, + "step": 2422 + }, + { + "epoch": 0.1306340306232478, + "grad_norm": 0.7887234687805176, + "learning_rate": 9.975764923959512e-06, + "loss": 0.7759, + "step": 2423 + }, + { + "epoch": 0.1306879447918913, + "grad_norm": 0.7670013904571533, + "learning_rate": 9.975744071032856e-06, + "loss": 0.7534, + "step": 2424 + }, + { + "epoch": 0.13074185896053483, + "grad_norm": 0.7348708510398865, + "learning_rate": 9.975723209160483e-06, + "loss": 0.7955, + "step": 2425 + }, + { + "epoch": 0.13079577312917834, + "grad_norm": 0.8183468580245972, + "learning_rate": 9.97570233834243e-06, + "loss": 0.8664, + "step": 2426 + }, + { + "epoch": 0.13084968729782187, + "grad_norm": 0.8783697485923767, + "learning_rate": 9.975681458578736e-06, + "loss": 0.8399, + "step": 2427 + }, + { + "epoch": 0.13090360146646537, + "grad_norm": 0.7653324007987976, + "learning_rate": 9.975660569869439e-06, + "loss": 0.7723, + "step": 2428 + }, + { + "epoch": 0.1309575156351089, + "grad_norm": 0.9938413500785828, + "learning_rate": 9.975639672214574e-06, + "loss": 0.7439, + "step": 2429 + }, + { + "epoch": 0.13101142980375244, + "grad_norm": 0.7844074368476868, + "learning_rate": 9.975618765614181e-06, + "loss": 0.8234, + "step": 2430 + }, + { + "epoch": 0.13106534397239594, + "grad_norm": 0.8992919325828552, + "learning_rate": 9.975597850068295e-06, + "loss": 0.7485, + "step": 2431 + }, + { + "epoch": 0.13111925814103947, + "grad_norm": 0.8023738265037537, + "learning_rate": 9.975576925576956e-06, + "loss": 0.7986, + "step": 2432 + }, + { + "epoch": 0.13117317230968298, + "grad_norm": 
0.8369026184082031, + "learning_rate": 9.9755559921402e-06, + "loss": 0.8695, + "step": 2433 + }, + { + "epoch": 0.1312270864783265, + "grad_norm": 0.812224805355072, + "learning_rate": 9.975535049758067e-06, + "loss": 0.834, + "step": 2434 + }, + { + "epoch": 0.13128100064697001, + "grad_norm": 0.7718735337257385, + "learning_rate": 9.975514098430591e-06, + "loss": 0.8055, + "step": 2435 + }, + { + "epoch": 0.13133491481561355, + "grad_norm": 0.8709392547607422, + "learning_rate": 9.975493138157813e-06, + "loss": 0.899, + "step": 2436 + }, + { + "epoch": 0.13138882898425705, + "grad_norm": 0.8817125558853149, + "learning_rate": 9.97547216893977e-06, + "loss": 0.7908, + "step": 2437 + }, + { + "epoch": 0.13144274315290058, + "grad_norm": 0.9631084203720093, + "learning_rate": 9.975451190776498e-06, + "loss": 0.9153, + "step": 2438 + }, + { + "epoch": 0.1314966573215441, + "grad_norm": 0.998906672000885, + "learning_rate": 9.975430203668037e-06, + "loss": 0.971, + "step": 2439 + }, + { + "epoch": 0.13155057149018762, + "grad_norm": 0.9689096212387085, + "learning_rate": 9.975409207614422e-06, + "loss": 0.8316, + "step": 2440 + }, + { + "epoch": 0.13160448565883115, + "grad_norm": 0.7694187760353088, + "learning_rate": 9.975388202615692e-06, + "loss": 0.757, + "step": 2441 + }, + { + "epoch": 0.13165839982747465, + "grad_norm": 0.8082549571990967, + "learning_rate": 9.975367188671885e-06, + "loss": 0.8704, + "step": 2442 + }, + { + "epoch": 0.13171231399611819, + "grad_norm": 0.8493963479995728, + "learning_rate": 9.97534616578304e-06, + "loss": 0.8171, + "step": 2443 + }, + { + "epoch": 0.1317662281647617, + "grad_norm": 0.972273588180542, + "learning_rate": 9.975325133949195e-06, + "loss": 0.9834, + "step": 2444 + }, + { + "epoch": 0.13182014233340522, + "grad_norm": 0.8235988616943359, + "learning_rate": 9.975304093170384e-06, + "loss": 0.8896, + "step": 2445 + }, + { + "epoch": 0.13187405650204873, + "grad_norm": 0.8405951261520386, + "learning_rate": 
9.975283043446649e-06, + "loss": 0.8362, + "step": 2446 + }, + { + "epoch": 0.13192797067069226, + "grad_norm": 0.765640377998352, + "learning_rate": 9.975261984778024e-06, + "loss": 0.7543, + "step": 2447 + }, + { + "epoch": 0.1319818848393358, + "grad_norm": 0.9431920051574707, + "learning_rate": 9.97524091716455e-06, + "loss": 0.8322, + "step": 2448 + }, + { + "epoch": 0.1320357990079793, + "grad_norm": 0.8060823082923889, + "learning_rate": 9.975219840606265e-06, + "loss": 0.8153, + "step": 2449 + }, + { + "epoch": 0.13208971317662282, + "grad_norm": 1.1293737888336182, + "learning_rate": 9.975198755103203e-06, + "loss": 0.8969, + "step": 2450 + }, + { + "epoch": 0.13214362734526633, + "grad_norm": 0.8462950587272644, + "learning_rate": 9.975177660655407e-06, + "loss": 0.7758, + "step": 2451 + }, + { + "epoch": 0.13219754151390986, + "grad_norm": 0.8241791725158691, + "learning_rate": 9.975156557262914e-06, + "loss": 0.8046, + "step": 2452 + }, + { + "epoch": 0.13225145568255336, + "grad_norm": 0.8260864615440369, + "learning_rate": 9.975135444925756e-06, + "loss": 0.7559, + "step": 2453 + }, + { + "epoch": 0.1323053698511969, + "grad_norm": 0.8952769637107849, + "learning_rate": 9.975114323643978e-06, + "loss": 0.8292, + "step": 2454 + }, + { + "epoch": 0.1323592840198404, + "grad_norm": 0.8182158470153809, + "learning_rate": 9.975093193417615e-06, + "loss": 0.7137, + "step": 2455 + }, + { + "epoch": 0.13241319818848393, + "grad_norm": 0.9926600456237793, + "learning_rate": 9.975072054246706e-06, + "loss": 0.7935, + "step": 2456 + }, + { + "epoch": 0.13246711235712746, + "grad_norm": 0.872171938419342, + "learning_rate": 9.97505090613129e-06, + "loss": 0.882, + "step": 2457 + }, + { + "epoch": 0.13252102652577097, + "grad_norm": 0.8218923807144165, + "learning_rate": 9.975029749071401e-06, + "loss": 0.7675, + "step": 2458 + }, + { + "epoch": 0.1325749406944145, + "grad_norm": 0.8250816464424133, + "learning_rate": 9.97500858306708e-06, + "loss": 0.8404, + 
"step": 2459 + }, + { + "epoch": 0.132628854863058, + "grad_norm": 0.8135029673576355, + "learning_rate": 9.974987408118365e-06, + "loss": 0.8387, + "step": 2460 + }, + { + "epoch": 0.13268276903170154, + "grad_norm": 1.3989582061767578, + "learning_rate": 9.974966224225293e-06, + "loss": 0.817, + "step": 2461 + }, + { + "epoch": 0.13273668320034504, + "grad_norm": 0.8212644457817078, + "learning_rate": 9.974945031387902e-06, + "loss": 0.8377, + "step": 2462 + }, + { + "epoch": 0.13279059736898857, + "grad_norm": 1.5513782501220703, + "learning_rate": 9.974923829606232e-06, + "loss": 0.7645, + "step": 2463 + }, + { + "epoch": 0.13284451153763208, + "grad_norm": 0.9355224370956421, + "learning_rate": 9.97490261888032e-06, + "loss": 0.7943, + "step": 2464 + }, + { + "epoch": 0.1328984257062756, + "grad_norm": 0.8264141082763672, + "learning_rate": 9.974881399210204e-06, + "loss": 0.7868, + "step": 2465 + }, + { + "epoch": 0.13295233987491914, + "grad_norm": 0.8267685770988464, + "learning_rate": 9.974860170595921e-06, + "loss": 0.8482, + "step": 2466 + }, + { + "epoch": 0.13300625404356264, + "grad_norm": 0.7816182374954224, + "learning_rate": 9.974838933037512e-06, + "loss": 0.6735, + "step": 2467 + }, + { + "epoch": 0.13306016821220618, + "grad_norm": 0.8686188459396362, + "learning_rate": 9.974817686535013e-06, + "loss": 0.7639, + "step": 2468 + }, + { + "epoch": 0.13311408238084968, + "grad_norm": 0.8006383776664734, + "learning_rate": 9.974796431088462e-06, + "loss": 0.9035, + "step": 2469 + }, + { + "epoch": 0.1331679965494932, + "grad_norm": 0.829788327217102, + "learning_rate": 9.974775166697898e-06, + "loss": 0.7724, + "step": 2470 + }, + { + "epoch": 0.13322191071813672, + "grad_norm": 0.7149111032485962, + "learning_rate": 9.97475389336336e-06, + "loss": 0.7543, + "step": 2471 + }, + { + "epoch": 0.13327582488678025, + "grad_norm": 0.8626448512077332, + "learning_rate": 9.974732611084886e-06, + "loss": 0.8903, + "step": 2472 + }, + { + "epoch": 
0.13332973905542375, + "grad_norm": 0.818778395652771, + "learning_rate": 9.974711319862514e-06, + "loss": 0.7862, + "step": 2473 + }, + { + "epoch": 0.13338365322406728, + "grad_norm": 0.8285005688667297, + "learning_rate": 9.97469001969628e-06, + "loss": 0.8186, + "step": 2474 + }, + { + "epoch": 0.13343756739271082, + "grad_norm": 0.9331484436988831, + "learning_rate": 9.974668710586226e-06, + "loss": 0.7278, + "step": 2475 + }, + { + "epoch": 0.13349148156135432, + "grad_norm": 0.7760492563247681, + "learning_rate": 9.974647392532387e-06, + "loss": 0.82, + "step": 2476 + }, + { + "epoch": 0.13354539572999785, + "grad_norm": 0.9858410358428955, + "learning_rate": 9.974626065534804e-06, + "loss": 0.9733, + "step": 2477 + }, + { + "epoch": 0.13359930989864136, + "grad_norm": 0.774960458278656, + "learning_rate": 9.974604729593513e-06, + "loss": 0.7899, + "step": 2478 + }, + { + "epoch": 0.1336532240672849, + "grad_norm": 0.7779082655906677, + "learning_rate": 9.974583384708556e-06, + "loss": 0.7727, + "step": 2479 + }, + { + "epoch": 0.1337071382359284, + "grad_norm": 0.8611405491828918, + "learning_rate": 9.974562030879967e-06, + "loss": 0.8341, + "step": 2480 + }, + { + "epoch": 0.13376105240457192, + "grad_norm": 0.9042904376983643, + "learning_rate": 9.974540668107788e-06, + "loss": 0.8015, + "step": 2481 + }, + { + "epoch": 0.13381496657321545, + "grad_norm": 1.067806601524353, + "learning_rate": 9.974519296392054e-06, + "loss": 0.8583, + "step": 2482 + }, + { + "epoch": 0.13386888074185896, + "grad_norm": 0.8079432845115662, + "learning_rate": 9.974497915732806e-06, + "loss": 0.7246, + "step": 2483 + }, + { + "epoch": 0.1339227949105025, + "grad_norm": 0.7360541224479675, + "learning_rate": 9.974476526130082e-06, + "loss": 0.7228, + "step": 2484 + }, + { + "epoch": 0.133976709079146, + "grad_norm": 0.7532739639282227, + "learning_rate": 9.97445512758392e-06, + "loss": 0.7472, + "step": 2485 + }, + { + "epoch": 0.13403062324778953, + "grad_norm": 
0.794747531414032, + "learning_rate": 9.974433720094358e-06, + "loss": 0.8288, + "step": 2486 + }, + { + "epoch": 0.13408453741643303, + "grad_norm": 0.9305081367492676, + "learning_rate": 9.974412303661435e-06, + "loss": 0.9414, + "step": 2487 + }, + { + "epoch": 0.13413845158507656, + "grad_norm": 0.9857872128486633, + "learning_rate": 9.97439087828519e-06, + "loss": 0.9123, + "step": 2488 + }, + { + "epoch": 0.13419236575372007, + "grad_norm": 0.9159066081047058, + "learning_rate": 9.97436944396566e-06, + "loss": 0.815, + "step": 2489 + }, + { + "epoch": 0.1342462799223636, + "grad_norm": 0.920803427696228, + "learning_rate": 9.974348000702887e-06, + "loss": 0.855, + "step": 2490 + }, + { + "epoch": 0.13430019409100713, + "grad_norm": 0.8599058389663696, + "learning_rate": 9.974326548496906e-06, + "loss": 0.8944, + "step": 2491 + }, + { + "epoch": 0.13435410825965063, + "grad_norm": 0.7708035111427307, + "learning_rate": 9.974305087347758e-06, + "loss": 0.7733, + "step": 2492 + }, + { + "epoch": 0.13440802242829417, + "grad_norm": 0.771906852722168, + "learning_rate": 9.974283617255478e-06, + "loss": 0.8555, + "step": 2493 + }, + { + "epoch": 0.13446193659693767, + "grad_norm": 0.7494363188743591, + "learning_rate": 9.974262138220108e-06, + "loss": 0.7575, + "step": 2494 + }, + { + "epoch": 0.1345158507655812, + "grad_norm": 0.8488510251045227, + "learning_rate": 9.974240650241687e-06, + "loss": 0.8423, + "step": 2495 + }, + { + "epoch": 0.1345697649342247, + "grad_norm": 0.7665607929229736, + "learning_rate": 9.97421915332025e-06, + "loss": 0.8221, + "step": 2496 + }, + { + "epoch": 0.13462367910286824, + "grad_norm": 0.83452969789505, + "learning_rate": 9.974197647455839e-06, + "loss": 0.8192, + "step": 2497 + }, + { + "epoch": 0.13467759327151174, + "grad_norm": 0.8927843570709229, + "learning_rate": 9.97417613264849e-06, + "loss": 0.8041, + "step": 2498 + }, + { + "epoch": 0.13473150744015527, + "grad_norm": 0.8050754070281982, + "learning_rate": 
9.974154608898246e-06, + "loss": 0.7374, + "step": 2499 + }, + { + "epoch": 0.1347854216087988, + "grad_norm": 0.8286676406860352, + "learning_rate": 9.97413307620514e-06, + "loss": 0.7603, + "step": 2500 + }, + { + "epoch": 0.1348393357774423, + "grad_norm": 0.8953397870063782, + "learning_rate": 9.974111534569215e-06, + "loss": 0.8419, + "step": 2501 + }, + { + "epoch": 0.13489324994608584, + "grad_norm": 0.8619454503059387, + "learning_rate": 9.974089983990507e-06, + "loss": 0.7231, + "step": 2502 + }, + { + "epoch": 0.13494716411472935, + "grad_norm": 0.8102728724479675, + "learning_rate": 9.974068424469058e-06, + "loss": 0.8701, + "step": 2503 + }, + { + "epoch": 0.13500107828337288, + "grad_norm": 0.7568274736404419, + "learning_rate": 9.974046856004904e-06, + "loss": 0.7864, + "step": 2504 + }, + { + "epoch": 0.13505499245201638, + "grad_norm": 0.7835590839385986, + "learning_rate": 9.974025278598086e-06, + "loss": 0.8595, + "step": 2505 + }, + { + "epoch": 0.1351089066206599, + "grad_norm": 0.854015052318573, + "learning_rate": 9.974003692248638e-06, + "loss": 0.7683, + "step": 2506 + }, + { + "epoch": 0.13516282078930342, + "grad_norm": 0.7973034977912903, + "learning_rate": 9.973982096956604e-06, + "loss": 0.7332, + "step": 2507 + }, + { + "epoch": 0.13521673495794695, + "grad_norm": 0.8860466480255127, + "learning_rate": 9.973960492722022e-06, + "loss": 0.8312, + "step": 2508 + }, + { + "epoch": 0.13527064912659048, + "grad_norm": 0.8370612263679504, + "learning_rate": 9.973938879544928e-06, + "loss": 0.8307, + "step": 2509 + }, + { + "epoch": 0.13532456329523398, + "grad_norm": 0.9102504253387451, + "learning_rate": 9.973917257425365e-06, + "loss": 0.8276, + "step": 2510 + }, + { + "epoch": 0.13537847746387752, + "grad_norm": 0.9040873646736145, + "learning_rate": 9.973895626363367e-06, + "loss": 0.7717, + "step": 2511 + }, + { + "epoch": 0.13543239163252102, + "grad_norm": 0.7447285056114197, + "learning_rate": 9.973873986358977e-06, + "loss": 0.7836, 
+ "step": 2512 + }, + { + "epoch": 0.13548630580116455, + "grad_norm": 0.7533379197120667, + "learning_rate": 9.973852337412234e-06, + "loss": 0.8308, + "step": 2513 + }, + { + "epoch": 0.13554021996980806, + "grad_norm": 0.7503568530082703, + "learning_rate": 9.973830679523173e-06, + "loss": 0.7893, + "step": 2514 + }, + { + "epoch": 0.1355941341384516, + "grad_norm": 0.786011815071106, + "learning_rate": 9.973809012691836e-06, + "loss": 0.7562, + "step": 2515 + }, + { + "epoch": 0.1356480483070951, + "grad_norm": 0.9311261773109436, + "learning_rate": 9.973787336918262e-06, + "loss": 0.7295, + "step": 2516 + }, + { + "epoch": 0.13570196247573862, + "grad_norm": 0.8217887878417969, + "learning_rate": 9.973765652202488e-06, + "loss": 0.8399, + "step": 2517 + }, + { + "epoch": 0.13575587664438216, + "grad_norm": 0.8265646696090698, + "learning_rate": 9.973743958544554e-06, + "loss": 0.8146, + "step": 2518 + }, + { + "epoch": 0.13580979081302566, + "grad_norm": 0.9443806409835815, + "learning_rate": 9.9737222559445e-06, + "loss": 0.9217, + "step": 2519 + }, + { + "epoch": 0.1358637049816692, + "grad_norm": 0.807623028755188, + "learning_rate": 9.973700544402362e-06, + "loss": 0.8266, + "step": 2520 + }, + { + "epoch": 0.1359176191503127, + "grad_norm": 0.819793164730072, + "learning_rate": 9.973678823918184e-06, + "loss": 0.755, + "step": 2521 + }, + { + "epoch": 0.13597153331895623, + "grad_norm": 0.7608258724212646, + "learning_rate": 9.973657094492002e-06, + "loss": 0.7707, + "step": 2522 + }, + { + "epoch": 0.13602544748759973, + "grad_norm": 0.795218825340271, + "learning_rate": 9.973635356123854e-06, + "loss": 0.7235, + "step": 2523 + }, + { + "epoch": 0.13607936165624326, + "grad_norm": 0.7893292307853699, + "learning_rate": 9.973613608813782e-06, + "loss": 0.8698, + "step": 2524 + }, + { + "epoch": 0.13613327582488677, + "grad_norm": 0.8091539144515991, + "learning_rate": 9.973591852561822e-06, + "loss": 0.8492, + "step": 2525 + }, + { + "epoch": 
0.1361871899935303, + "grad_norm": 0.9144110679626465, + "learning_rate": 9.973570087368015e-06, + "loss": 0.7952, + "step": 2526 + }, + { + "epoch": 0.13624110416217383, + "grad_norm": 0.761695921421051, + "learning_rate": 9.9735483132324e-06, + "loss": 0.7841, + "step": 2527 + }, + { + "epoch": 0.13629501833081734, + "grad_norm": 0.887026846408844, + "learning_rate": 9.973526530155016e-06, + "loss": 0.8855, + "step": 2528 + }, + { + "epoch": 0.13634893249946087, + "grad_norm": 0.8282152414321899, + "learning_rate": 9.973504738135903e-06, + "loss": 0.8857, + "step": 2529 + }, + { + "epoch": 0.13640284666810437, + "grad_norm": 0.7782665491104126, + "learning_rate": 9.973482937175098e-06, + "loss": 0.8076, + "step": 2530 + }, + { + "epoch": 0.1364567608367479, + "grad_norm": 0.8865575194358826, + "learning_rate": 9.973461127272642e-06, + "loss": 0.8596, + "step": 2531 + }, + { + "epoch": 0.1365106750053914, + "grad_norm": 0.7215422987937927, + "learning_rate": 9.973439308428572e-06, + "loss": 0.7437, + "step": 2532 + }, + { + "epoch": 0.13656458917403494, + "grad_norm": 0.7932387590408325, + "learning_rate": 9.97341748064293e-06, + "loss": 0.8439, + "step": 2533 + }, + { + "epoch": 0.13661850334267844, + "grad_norm": 0.8260403871536255, + "learning_rate": 9.973395643915756e-06, + "loss": 0.7956, + "step": 2534 + }, + { + "epoch": 0.13667241751132198, + "grad_norm": 0.7879858016967773, + "learning_rate": 9.973373798247085e-06, + "loss": 0.8501, + "step": 2535 + }, + { + "epoch": 0.1367263316799655, + "grad_norm": 0.7268496751785278, + "learning_rate": 9.97335194363696e-06, + "loss": 0.78, + "step": 2536 + }, + { + "epoch": 0.136780245848609, + "grad_norm": 0.8170067071914673, + "learning_rate": 9.973330080085417e-06, + "loss": 0.829, + "step": 2537 + }, + { + "epoch": 0.13683416001725254, + "grad_norm": 0.8400061726570129, + "learning_rate": 9.973308207592498e-06, + "loss": 0.8576, + "step": 2538 + }, + { + "epoch": 0.13688807418589605, + "grad_norm": 
0.9156914353370667, + "learning_rate": 9.973286326158244e-06, + "loss": 0.8633, + "step": 2539 + }, + { + "epoch": 0.13694198835453958, + "grad_norm": 0.7413343191146851, + "learning_rate": 9.97326443578269e-06, + "loss": 0.8128, + "step": 2540 + }, + { + "epoch": 0.13699590252318308, + "grad_norm": 0.8003092408180237, + "learning_rate": 9.973242536465877e-06, + "loss": 0.7743, + "step": 2541 + }, + { + "epoch": 0.13704981669182661, + "grad_norm": 0.8532862067222595, + "learning_rate": 9.973220628207844e-06, + "loss": 0.8526, + "step": 2542 + }, + { + "epoch": 0.13710373086047012, + "grad_norm": 0.7677969336509705, + "learning_rate": 9.973198711008634e-06, + "loss": 0.8493, + "step": 2543 + }, + { + "epoch": 0.13715764502911365, + "grad_norm": 0.8414867520332336, + "learning_rate": 9.973176784868282e-06, + "loss": 0.7674, + "step": 2544 + }, + { + "epoch": 0.13721155919775718, + "grad_norm": 0.825450599193573, + "learning_rate": 9.973154849786828e-06, + "loss": 0.8328, + "step": 2545 + }, + { + "epoch": 0.1372654733664007, + "grad_norm": 0.8429614305496216, + "learning_rate": 9.973132905764313e-06, + "loss": 0.787, + "step": 2546 + }, + { + "epoch": 0.13731938753504422, + "grad_norm": 0.9791093468666077, + "learning_rate": 9.973110952800776e-06, + "loss": 0.7836, + "step": 2547 + }, + { + "epoch": 0.13737330170368772, + "grad_norm": 0.8728508353233337, + "learning_rate": 9.973088990896255e-06, + "loss": 0.8897, + "step": 2548 + }, + { + "epoch": 0.13742721587233125, + "grad_norm": 0.9933381080627441, + "learning_rate": 9.973067020050792e-06, + "loss": 0.8679, + "step": 2549 + }, + { + "epoch": 0.13748113004097476, + "grad_norm": 0.8786694407463074, + "learning_rate": 9.973045040264423e-06, + "loss": 0.8599, + "step": 2550 + }, + { + "epoch": 0.1375350442096183, + "grad_norm": 0.7714465260505676, + "learning_rate": 9.973023051537193e-06, + "loss": 0.6355, + "step": 2551 + }, + { + "epoch": 0.1375889583782618, + "grad_norm": 0.9043986201286316, + "learning_rate": 
9.973001053869138e-06, + "loss": 0.7445, + "step": 2552 + }, + { + "epoch": 0.13764287254690533, + "grad_norm": 0.879623532295227, + "learning_rate": 9.972979047260297e-06, + "loss": 0.8086, + "step": 2553 + }, + { + "epoch": 0.13769678671554886, + "grad_norm": 0.8384745121002197, + "learning_rate": 9.972957031710708e-06, + "loss": 0.6832, + "step": 2554 + }, + { + "epoch": 0.13775070088419236, + "grad_norm": 0.8574655055999756, + "learning_rate": 9.972935007220415e-06, + "loss": 0.8326, + "step": 2555 + }, + { + "epoch": 0.1378046150528359, + "grad_norm": 0.8241353034973145, + "learning_rate": 9.972912973789458e-06, + "loss": 0.7526, + "step": 2556 + }, + { + "epoch": 0.1378585292214794, + "grad_norm": 0.8306788802146912, + "learning_rate": 9.97289093141787e-06, + "loss": 0.9423, + "step": 2557 + }, + { + "epoch": 0.13791244339012293, + "grad_norm": 0.7930428385734558, + "learning_rate": 9.972868880105696e-06, + "loss": 0.8635, + "step": 2558 + }, + { + "epoch": 0.13796635755876643, + "grad_norm": 0.856482207775116, + "learning_rate": 9.972846819852974e-06, + "loss": 0.7902, + "step": 2559 + }, + { + "epoch": 0.13802027172740997, + "grad_norm": 0.8513977527618408, + "learning_rate": 9.972824750659747e-06, + "loss": 0.8485, + "step": 2560 + }, + { + "epoch": 0.13807418589605347, + "grad_norm": 0.7595572471618652, + "learning_rate": 9.97280267252605e-06, + "loss": 0.7294, + "step": 2561 + }, + { + "epoch": 0.138128100064697, + "grad_norm": 0.9774705767631531, + "learning_rate": 9.972780585451923e-06, + "loss": 0.8758, + "step": 2562 + }, + { + "epoch": 0.13818201423334053, + "grad_norm": 0.8011289834976196, + "learning_rate": 9.972758489437408e-06, + "loss": 0.7649, + "step": 2563 + }, + { + "epoch": 0.13823592840198404, + "grad_norm": 0.8921117186546326, + "learning_rate": 9.972736384482545e-06, + "loss": 0.8745, + "step": 2564 + }, + { + "epoch": 0.13828984257062757, + "grad_norm": 0.8739173412322998, + "learning_rate": 9.972714270587372e-06, + "loss": 0.841, + 
"step": 2565 + }, + { + "epoch": 0.13834375673927107, + "grad_norm": 0.7379958033561707, + "learning_rate": 9.97269214775193e-06, + "loss": 0.813, + "step": 2566 + }, + { + "epoch": 0.1383976709079146, + "grad_norm": 0.8068973422050476, + "learning_rate": 9.972670015976258e-06, + "loss": 0.8319, + "step": 2567 + }, + { + "epoch": 0.1384515850765581, + "grad_norm": 0.7312106490135193, + "learning_rate": 9.972647875260395e-06, + "loss": 0.7494, + "step": 2568 + }, + { + "epoch": 0.13850549924520164, + "grad_norm": 0.8182246088981628, + "learning_rate": 9.972625725604383e-06, + "loss": 0.9543, + "step": 2569 + }, + { + "epoch": 0.13855941341384514, + "grad_norm": 0.8153319358825684, + "learning_rate": 9.97260356700826e-06, + "loss": 0.8411, + "step": 2570 + }, + { + "epoch": 0.13861332758248868, + "grad_norm": 0.7589008212089539, + "learning_rate": 9.972581399472066e-06, + "loss": 0.7576, + "step": 2571 + }, + { + "epoch": 0.1386672417511322, + "grad_norm": 0.8160014748573303, + "learning_rate": 9.972559222995841e-06, + "loss": 0.8801, + "step": 2572 + }, + { + "epoch": 0.1387211559197757, + "grad_norm": 0.752868115901947, + "learning_rate": 9.972537037579626e-06, + "loss": 0.7504, + "step": 2573 + }, + { + "epoch": 0.13877507008841924, + "grad_norm": 0.8015901446342468, + "learning_rate": 9.97251484322346e-06, + "loss": 0.7468, + "step": 2574 + }, + { + "epoch": 0.13882898425706275, + "grad_norm": 0.815352737903595, + "learning_rate": 9.972492639927384e-06, + "loss": 0.8526, + "step": 2575 + }, + { + "epoch": 0.13888289842570628, + "grad_norm": 0.7475571036338806, + "learning_rate": 9.972470427691436e-06, + "loss": 0.7653, + "step": 2576 + }, + { + "epoch": 0.13893681259434978, + "grad_norm": 1.1950535774230957, + "learning_rate": 9.972448206515656e-06, + "loss": 0.9106, + "step": 2577 + }, + { + "epoch": 0.13899072676299332, + "grad_norm": 0.843235194683075, + "learning_rate": 9.972425976400086e-06, + "loss": 0.8922, + "step": 2578 + }, + { + "epoch": 
0.13904464093163682, + "grad_norm": 0.8039982914924622, + "learning_rate": 9.972403737344763e-06, + "loss": 0.6855, + "step": 2579 + }, + { + "epoch": 0.13909855510028035, + "grad_norm": 0.7598289251327515, + "learning_rate": 9.97238148934973e-06, + "loss": 0.832, + "step": 2580 + }, + { + "epoch": 0.13915246926892388, + "grad_norm": 0.7986323237419128, + "learning_rate": 9.972359232415025e-06, + "loss": 0.7886, + "step": 2581 + }, + { + "epoch": 0.1392063834375674, + "grad_norm": 0.7465773820877075, + "learning_rate": 9.97233696654069e-06, + "loss": 0.7875, + "step": 2582 + }, + { + "epoch": 0.13926029760621092, + "grad_norm": 0.8853508830070496, + "learning_rate": 9.972314691726764e-06, + "loss": 0.9263, + "step": 2583 + }, + { + "epoch": 0.13931421177485442, + "grad_norm": 0.7267711162567139, + "learning_rate": 9.972292407973286e-06, + "loss": 0.78, + "step": 2584 + }, + { + "epoch": 0.13936812594349796, + "grad_norm": 0.7631322145462036, + "learning_rate": 9.972270115280295e-06, + "loss": 0.7726, + "step": 2585 + }, + { + "epoch": 0.13942204011214146, + "grad_norm": 0.8661205768585205, + "learning_rate": 9.972247813647836e-06, + "loss": 0.977, + "step": 2586 + }, + { + "epoch": 0.139475954280785, + "grad_norm": 0.7955568432807922, + "learning_rate": 9.972225503075943e-06, + "loss": 0.8481, + "step": 2587 + }, + { + "epoch": 0.13952986844942852, + "grad_norm": 0.8810243606567383, + "learning_rate": 9.972203183564661e-06, + "loss": 0.8938, + "step": 2588 + }, + { + "epoch": 0.13958378261807203, + "grad_norm": 0.783968985080719, + "learning_rate": 9.972180855114029e-06, + "loss": 0.7565, + "step": 2589 + }, + { + "epoch": 0.13963769678671556, + "grad_norm": 0.749191164970398, + "learning_rate": 9.972158517724084e-06, + "loss": 0.7283, + "step": 2590 + }, + { + "epoch": 0.13969161095535906, + "grad_norm": 0.7926847338676453, + "learning_rate": 9.972136171394871e-06, + "loss": 0.9073, + "step": 2591 + }, + { + "epoch": 0.1397455251240026, + "grad_norm": 
0.7621777653694153, + "learning_rate": 9.972113816126427e-06, + "loss": 0.7176, + "step": 2592 + }, + { + "epoch": 0.1397994392926461, + "grad_norm": 0.8856351375579834, + "learning_rate": 9.972091451918792e-06, + "loss": 0.7428, + "step": 2593 + }, + { + "epoch": 0.13985335346128963, + "grad_norm": 0.8027200698852539, + "learning_rate": 9.972069078772008e-06, + "loss": 0.7794, + "step": 2594 + }, + { + "epoch": 0.13990726762993314, + "grad_norm": 0.8776759505271912, + "learning_rate": 9.972046696686115e-06, + "loss": 0.9087, + "step": 2595 + }, + { + "epoch": 0.13996118179857667, + "grad_norm": 0.8979713320732117, + "learning_rate": 9.972024305661152e-06, + "loss": 0.8031, + "step": 2596 + }, + { + "epoch": 0.1400150959672202, + "grad_norm": 0.8233299851417542, + "learning_rate": 9.97200190569716e-06, + "loss": 0.8462, + "step": 2597 + }, + { + "epoch": 0.1400690101358637, + "grad_norm": 0.8777962327003479, + "learning_rate": 9.971979496794178e-06, + "loss": 0.8464, + "step": 2598 + }, + { + "epoch": 0.14012292430450723, + "grad_norm": 0.7185937166213989, + "learning_rate": 9.971957078952249e-06, + "loss": 0.7423, + "step": 2599 + }, + { + "epoch": 0.14017683847315074, + "grad_norm": 0.8226794600486755, + "learning_rate": 9.971934652171412e-06, + "loss": 0.8017, + "step": 2600 + }, + { + "epoch": 0.14023075264179427, + "grad_norm": 0.8021965622901917, + "learning_rate": 9.971912216451705e-06, + "loss": 0.8018, + "step": 2601 + }, + { + "epoch": 0.14028466681043777, + "grad_norm": 1.0516051054000854, + "learning_rate": 9.971889771793172e-06, + "loss": 0.8894, + "step": 2602 + }, + { + "epoch": 0.1403385809790813, + "grad_norm": 0.8212647438049316, + "learning_rate": 9.971867318195851e-06, + "loss": 0.826, + "step": 2603 + }, + { + "epoch": 0.1403924951477248, + "grad_norm": 0.8427513241767883, + "learning_rate": 9.971844855659783e-06, + "loss": 0.815, + "step": 2604 + }, + { + "epoch": 0.14044640931636834, + "grad_norm": 0.779569149017334, + "learning_rate": 
9.97182238418501e-06, + "loss": 0.797, + "step": 2605 + }, + { + "epoch": 0.14050032348501187, + "grad_norm": 0.7430607080459595, + "learning_rate": 9.97179990377157e-06, + "loss": 0.7925, + "step": 2606 + }, + { + "epoch": 0.14055423765365538, + "grad_norm": 0.8079801797866821, + "learning_rate": 9.971777414419503e-06, + "loss": 0.8259, + "step": 2607 + }, + { + "epoch": 0.1406081518222989, + "grad_norm": 0.794086754322052, + "learning_rate": 9.971754916128853e-06, + "loss": 0.833, + "step": 2608 + }, + { + "epoch": 0.14066206599094241, + "grad_norm": 0.8177362680435181, + "learning_rate": 9.971732408899657e-06, + "loss": 0.8543, + "step": 2609 + }, + { + "epoch": 0.14071598015958595, + "grad_norm": 0.8591805100440979, + "learning_rate": 9.971709892731956e-06, + "loss": 0.9323, + "step": 2610 + }, + { + "epoch": 0.14076989432822945, + "grad_norm": 0.8102341890335083, + "learning_rate": 9.971687367625793e-06, + "loss": 0.7679, + "step": 2611 + }, + { + "epoch": 0.14082380849687298, + "grad_norm": 0.8556869626045227, + "learning_rate": 9.971664833581205e-06, + "loss": 0.8458, + "step": 2612 + }, + { + "epoch": 0.14087772266551649, + "grad_norm": 0.7998070120811462, + "learning_rate": 9.971642290598235e-06, + "loss": 0.7663, + "step": 2613 + }, + { + "epoch": 0.14093163683416002, + "grad_norm": 0.8800550103187561, + "learning_rate": 9.971619738676923e-06, + "loss": 0.8653, + "step": 2614 + }, + { + "epoch": 0.14098555100280355, + "grad_norm": 0.8199629187583923, + "learning_rate": 9.971597177817308e-06, + "loss": 0.8804, + "step": 2615 + }, + { + "epoch": 0.14103946517144705, + "grad_norm": 0.8774363398551941, + "learning_rate": 9.971574608019432e-06, + "loss": 0.8468, + "step": 2616 + }, + { + "epoch": 0.14109337934009059, + "grad_norm": 0.7911790013313293, + "learning_rate": 9.971552029283335e-06, + "loss": 0.7841, + "step": 2617 + }, + { + "epoch": 0.1411472935087341, + "grad_norm": 0.8152750134468079, + "learning_rate": 9.97152944160906e-06, + "loss": 0.7753, + 
"step": 2618 + }, + { + "epoch": 0.14120120767737762, + "grad_norm": 0.8709943890571594, + "learning_rate": 9.971506844996645e-06, + "loss": 0.7259, + "step": 2619 + }, + { + "epoch": 0.14125512184602113, + "grad_norm": 1.1131712198257446, + "learning_rate": 9.97148423944613e-06, + "loss": 0.9422, + "step": 2620 + }, + { + "epoch": 0.14130903601466466, + "grad_norm": 0.8992665410041809, + "learning_rate": 9.971461624957557e-06, + "loss": 0.733, + "step": 2621 + }, + { + "epoch": 0.14136295018330816, + "grad_norm": 0.7548032402992249, + "learning_rate": 9.971439001530967e-06, + "loss": 0.7733, + "step": 2622 + }, + { + "epoch": 0.1414168643519517, + "grad_norm": 0.7988988161087036, + "learning_rate": 9.9714163691664e-06, + "loss": 0.8218, + "step": 2623 + }, + { + "epoch": 0.14147077852059523, + "grad_norm": 0.7697865962982178, + "learning_rate": 9.971393727863899e-06, + "loss": 0.7882, + "step": 2624 + }, + { + "epoch": 0.14152469268923873, + "grad_norm": 0.993664026260376, + "learning_rate": 9.9713710776235e-06, + "loss": 0.8331, + "step": 2625 + }, + { + "epoch": 0.14157860685788226, + "grad_norm": 1.0097055435180664, + "learning_rate": 9.971348418445245e-06, + "loss": 0.8959, + "step": 2626 + }, + { + "epoch": 0.14163252102652577, + "grad_norm": 0.7682481408119202, + "learning_rate": 9.97132575032918e-06, + "loss": 0.7425, + "step": 2627 + }, + { + "epoch": 0.1416864351951693, + "grad_norm": 0.790695309638977, + "learning_rate": 9.971303073275338e-06, + "loss": 0.6887, + "step": 2628 + }, + { + "epoch": 0.1417403493638128, + "grad_norm": 0.9672498106956482, + "learning_rate": 9.971280387283766e-06, + "loss": 0.8617, + "step": 2629 + }, + { + "epoch": 0.14179426353245633, + "grad_norm": 0.8538743853569031, + "learning_rate": 9.971257692354502e-06, + "loss": 0.7826, + "step": 2630 + }, + { + "epoch": 0.14184817770109984, + "grad_norm": 0.7527078986167908, + "learning_rate": 9.971234988487587e-06, + "loss": 0.7542, + "step": 2631 + }, + { + "epoch": 
0.14190209186974337, + "grad_norm": 0.9390487670898438, + "learning_rate": 9.97121227568306e-06, + "loss": 0.8415, + "step": 2632 + }, + { + "epoch": 0.1419560060383869, + "grad_norm": 0.8717443346977234, + "learning_rate": 9.971189553940966e-06, + "loss": 0.7969, + "step": 2633 + }, + { + "epoch": 0.1420099202070304, + "grad_norm": 0.7848197817802429, + "learning_rate": 9.971166823261343e-06, + "loss": 0.8049, + "step": 2634 + }, + { + "epoch": 0.14206383437567394, + "grad_norm": 0.8002238273620605, + "learning_rate": 9.971144083644233e-06, + "loss": 0.8681, + "step": 2635 + }, + { + "epoch": 0.14211774854431744, + "grad_norm": 0.7699506282806396, + "learning_rate": 9.971121335089676e-06, + "loss": 0.7815, + "step": 2636 + }, + { + "epoch": 0.14217166271296097, + "grad_norm": 0.9187048673629761, + "learning_rate": 9.971098577597713e-06, + "loss": 0.8611, + "step": 2637 + }, + { + "epoch": 0.14222557688160448, + "grad_norm": 0.802859365940094, + "learning_rate": 9.971075811168385e-06, + "loss": 0.7991, + "step": 2638 + }, + { + "epoch": 0.142279491050248, + "grad_norm": 1.0536410808563232, + "learning_rate": 9.971053035801735e-06, + "loss": 0.9726, + "step": 2639 + }, + { + "epoch": 0.1423334052188915, + "grad_norm": 0.8278898000717163, + "learning_rate": 9.9710302514978e-06, + "loss": 0.8636, + "step": 2640 + }, + { + "epoch": 0.14238731938753504, + "grad_norm": 0.7639529705047607, + "learning_rate": 9.971007458256623e-06, + "loss": 0.7849, + "step": 2641 + }, + { + "epoch": 0.14244123355617858, + "grad_norm": 0.9108867049217224, + "learning_rate": 9.970984656078246e-06, + "loss": 0.891, + "step": 2642 + }, + { + "epoch": 0.14249514772482208, + "grad_norm": 0.8182162046432495, + "learning_rate": 9.97096184496271e-06, + "loss": 0.7975, + "step": 2643 + }, + { + "epoch": 0.1425490618934656, + "grad_norm": 0.848781168460846, + "learning_rate": 9.970939024910053e-06, + "loss": 0.8677, + "step": 2644 + }, + { + "epoch": 0.14260297606210912, + "grad_norm": 
0.8322750926017761, + "learning_rate": 9.97091619592032e-06, + "loss": 0.776, + "step": 2645 + }, + { + "epoch": 0.14265689023075265, + "grad_norm": 0.8054049611091614, + "learning_rate": 9.970893357993548e-06, + "loss": 0.804, + "step": 2646 + }, + { + "epoch": 0.14271080439939615, + "grad_norm": 0.8162119388580322, + "learning_rate": 9.970870511129782e-06, + "loss": 0.7856, + "step": 2647 + }, + { + "epoch": 0.14276471856803968, + "grad_norm": 0.73929363489151, + "learning_rate": 9.97084765532906e-06, + "loss": 0.7687, + "step": 2648 + }, + { + "epoch": 0.1428186327366832, + "grad_norm": 0.866688072681427, + "learning_rate": 9.970824790591425e-06, + "loss": 0.8751, + "step": 2649 + }, + { + "epoch": 0.14287254690532672, + "grad_norm": 0.7772359251976013, + "learning_rate": 9.970801916916917e-06, + "loss": 0.7232, + "step": 2650 + }, + { + "epoch": 0.14292646107397025, + "grad_norm": 0.8912346363067627, + "learning_rate": 9.970779034305578e-06, + "loss": 0.8393, + "step": 2651 + }, + { + "epoch": 0.14298037524261376, + "grad_norm": 0.7827256917953491, + "learning_rate": 9.970756142757448e-06, + "loss": 0.7924, + "step": 2652 + }, + { + "epoch": 0.1430342894112573, + "grad_norm": 0.7557843923568726, + "learning_rate": 9.97073324227257e-06, + "loss": 0.8032, + "step": 2653 + }, + { + "epoch": 0.1430882035799008, + "grad_norm": 0.7939576506614685, + "learning_rate": 9.970710332850983e-06, + "loss": 0.7251, + "step": 2654 + }, + { + "epoch": 0.14314211774854432, + "grad_norm": 0.8175502419471741, + "learning_rate": 9.97068741449273e-06, + "loss": 0.7685, + "step": 2655 + }, + { + "epoch": 0.14319603191718783, + "grad_norm": 0.7537406086921692, + "learning_rate": 9.970664487197851e-06, + "loss": 0.7354, + "step": 2656 + }, + { + "epoch": 0.14324994608583136, + "grad_norm": 0.8045641779899597, + "learning_rate": 9.970641550966388e-06, + "loss": 0.7581, + "step": 2657 + }, + { + "epoch": 0.14330386025447486, + "grad_norm": 0.69786137342453, + "learning_rate": 
9.97061860579838e-06, + "loss": 0.6923, + "step": 2658 + }, + { + "epoch": 0.1433577744231184, + "grad_norm": 0.7913051843643188, + "learning_rate": 9.970595651693874e-06, + "loss": 0.7579, + "step": 2659 + }, + { + "epoch": 0.14341168859176193, + "grad_norm": 0.7890749573707581, + "learning_rate": 9.970572688652905e-06, + "loss": 0.7843, + "step": 2660 + }, + { + "epoch": 0.14346560276040543, + "grad_norm": 0.913074791431427, + "learning_rate": 9.970549716675516e-06, + "loss": 0.8318, + "step": 2661 + }, + { + "epoch": 0.14351951692904896, + "grad_norm": 0.757522463798523, + "learning_rate": 9.97052673576175e-06, + "loss": 0.6803, + "step": 2662 + }, + { + "epoch": 0.14357343109769247, + "grad_norm": 0.9279198050498962, + "learning_rate": 9.970503745911645e-06, + "loss": 0.8591, + "step": 2663 + }, + { + "epoch": 0.143627345266336, + "grad_norm": 0.8218236565589905, + "learning_rate": 9.97048074712525e-06, + "loss": 0.8253, + "step": 2664 + }, + { + "epoch": 0.1436812594349795, + "grad_norm": 0.7562058568000793, + "learning_rate": 9.970457739402596e-06, + "loss": 0.8114, + "step": 2665 + }, + { + "epoch": 0.14373517360362303, + "grad_norm": 0.7626449465751648, + "learning_rate": 9.970434722743732e-06, + "loss": 0.7932, + "step": 2666 + }, + { + "epoch": 0.14378908777226654, + "grad_norm": 0.8287700414657593, + "learning_rate": 9.970411697148696e-06, + "loss": 0.754, + "step": 2667 + }, + { + "epoch": 0.14384300194091007, + "grad_norm": 1.0403661727905273, + "learning_rate": 9.97038866261753e-06, + "loss": 0.9062, + "step": 2668 + }, + { + "epoch": 0.1438969161095536, + "grad_norm": 0.8278779983520508, + "learning_rate": 9.970365619150276e-06, + "loss": 0.9181, + "step": 2669 + }, + { + "epoch": 0.1439508302781971, + "grad_norm": 0.950964629650116, + "learning_rate": 9.970342566746973e-06, + "loss": 0.9235, + "step": 2670 + }, + { + "epoch": 0.14400474444684064, + "grad_norm": 0.9529917240142822, + "learning_rate": 9.970319505407667e-06, + "loss": 0.7929, + "step": 
2671 + }, + { + "epoch": 0.14405865861548414, + "grad_norm": 0.7601970434188843, + "learning_rate": 9.970296435132395e-06, + "loss": 0.7133, + "step": 2672 + }, + { + "epoch": 0.14411257278412767, + "grad_norm": 0.8906385898590088, + "learning_rate": 9.970273355921201e-06, + "loss": 0.8679, + "step": 2673 + }, + { + "epoch": 0.14416648695277118, + "grad_norm": 0.8250144720077515, + "learning_rate": 9.970250267774126e-06, + "loss": 0.7871, + "step": 2674 + }, + { + "epoch": 0.1442204011214147, + "grad_norm": 0.8182716965675354, + "learning_rate": 9.970227170691212e-06, + "loss": 0.7391, + "step": 2675 + }, + { + "epoch": 0.1442743152900582, + "grad_norm": 0.8261950016021729, + "learning_rate": 9.970204064672498e-06, + "loss": 0.8914, + "step": 2676 + }, + { + "epoch": 0.14432822945870175, + "grad_norm": 1.248270869255066, + "learning_rate": 9.97018094971803e-06, + "loss": 0.7834, + "step": 2677 + }, + { + "epoch": 0.14438214362734528, + "grad_norm": 0.7821226119995117, + "learning_rate": 9.970157825827844e-06, + "loss": 0.7436, + "step": 2678 + }, + { + "epoch": 0.14443605779598878, + "grad_norm": 0.9708791375160217, + "learning_rate": 9.970134693001987e-06, + "loss": 0.9038, + "step": 2679 + }, + { + "epoch": 0.1444899719646323, + "grad_norm": 0.8178976774215698, + "learning_rate": 9.970111551240499e-06, + "loss": 0.8748, + "step": 2680 + }, + { + "epoch": 0.14454388613327582, + "grad_norm": 0.8477594256401062, + "learning_rate": 9.970088400543417e-06, + "loss": 0.8169, + "step": 2681 + }, + { + "epoch": 0.14459780030191935, + "grad_norm": 0.9478195309638977, + "learning_rate": 9.970065240910789e-06, + "loss": 0.789, + "step": 2682 + }, + { + "epoch": 0.14465171447056285, + "grad_norm": 0.9151026010513306, + "learning_rate": 9.970042072342652e-06, + "loss": 0.8804, + "step": 2683 + }, + { + "epoch": 0.14470562863920639, + "grad_norm": 0.8062365651130676, + "learning_rate": 9.970018894839052e-06, + "loss": 0.8329, + "step": 2684 + }, + { + "epoch": 
0.1447595428078499, + "grad_norm": 0.8029241561889648, + "learning_rate": 9.969995708400028e-06, + "loss": 0.7053, + "step": 2685 + }, + { + "epoch": 0.14481345697649342, + "grad_norm": 0.8023892641067505, + "learning_rate": 9.969972513025621e-06, + "loss": 0.7921, + "step": 2686 + }, + { + "epoch": 0.14486737114513695, + "grad_norm": 0.9224045276641846, + "learning_rate": 9.969949308715874e-06, + "loss": 0.7416, + "step": 2687 + }, + { + "epoch": 0.14492128531378046, + "grad_norm": 0.7767837047576904, + "learning_rate": 9.969926095470829e-06, + "loss": 0.7844, + "step": 2688 + }, + { + "epoch": 0.144975199482424, + "grad_norm": 0.7804312109947205, + "learning_rate": 9.969902873290526e-06, + "loss": 0.712, + "step": 2689 + }, + { + "epoch": 0.1450291136510675, + "grad_norm": 0.9595988988876343, + "learning_rate": 9.969879642175009e-06, + "loss": 0.7686, + "step": 2690 + }, + { + "epoch": 0.14508302781971102, + "grad_norm": 1.0414133071899414, + "learning_rate": 9.969856402124318e-06, + "loss": 0.8833, + "step": 2691 + }, + { + "epoch": 0.14513694198835453, + "grad_norm": 0.9321674108505249, + "learning_rate": 9.969833153138498e-06, + "loss": 0.7576, + "step": 2692 + }, + { + "epoch": 0.14519085615699806, + "grad_norm": 0.7715985774993896, + "learning_rate": 9.969809895217586e-06, + "loss": 0.7371, + "step": 2693 + }, + { + "epoch": 0.1452447703256416, + "grad_norm": 1.0257316827774048, + "learning_rate": 9.969786628361625e-06, + "loss": 0.8394, + "step": 2694 + }, + { + "epoch": 0.1452986844942851, + "grad_norm": 0.7823453545570374, + "learning_rate": 9.969763352570659e-06, + "loss": 0.7974, + "step": 2695 + }, + { + "epoch": 0.14535259866292863, + "grad_norm": 0.8257505893707275, + "learning_rate": 9.969740067844728e-06, + "loss": 0.7948, + "step": 2696 + }, + { + "epoch": 0.14540651283157213, + "grad_norm": 0.6493780016899109, + "learning_rate": 9.969716774183878e-06, + "loss": 0.6531, + "step": 2697 + }, + { + "epoch": 0.14546042700021566, + "grad_norm": 
0.8953896760940552, + "learning_rate": 9.969693471588144e-06, + "loss": 0.7414, + "step": 2698 + }, + { + "epoch": 0.14551434116885917, + "grad_norm": 0.7177074551582336, + "learning_rate": 9.969670160057572e-06, + "loss": 0.65, + "step": 2699 + }, + { + "epoch": 0.1455682553375027, + "grad_norm": 0.8214414715766907, + "learning_rate": 9.969646839592204e-06, + "loss": 0.7605, + "step": 2700 + }, + { + "epoch": 0.1456221695061462, + "grad_norm": 0.8062289953231812, + "learning_rate": 9.969623510192081e-06, + "loss": 0.8275, + "step": 2701 + }, + { + "epoch": 0.14567608367478974, + "grad_norm": 0.9606921076774597, + "learning_rate": 9.969600171857246e-06, + "loss": 0.8472, + "step": 2702 + }, + { + "epoch": 0.14572999784343327, + "grad_norm": 1.0146433115005493, + "learning_rate": 9.96957682458774e-06, + "loss": 0.8398, + "step": 2703 + }, + { + "epoch": 0.14578391201207677, + "grad_norm": 0.8463965058326721, + "learning_rate": 9.969553468383604e-06, + "loss": 0.7563, + "step": 2704 + }, + { + "epoch": 0.1458378261807203, + "grad_norm": 0.8125115633010864, + "learning_rate": 9.96953010324488e-06, + "loss": 0.8042, + "step": 2705 + }, + { + "epoch": 0.1458917403493638, + "grad_norm": 0.9350455403327942, + "learning_rate": 9.969506729171612e-06, + "loss": 0.9067, + "step": 2706 + }, + { + "epoch": 0.14594565451800734, + "grad_norm": 0.9979991316795349, + "learning_rate": 9.969483346163843e-06, + "loss": 0.778, + "step": 2707 + }, + { + "epoch": 0.14599956868665084, + "grad_norm": 0.8236498236656189, + "learning_rate": 9.969459954221612e-06, + "loss": 0.9011, + "step": 2708 + }, + { + "epoch": 0.14605348285529438, + "grad_norm": 0.6965605616569519, + "learning_rate": 9.969436553344962e-06, + "loss": 0.6657, + "step": 2709 + }, + { + "epoch": 0.14610739702393788, + "grad_norm": 0.810246467590332, + "learning_rate": 9.969413143533936e-06, + "loss": 0.8099, + "step": 2710 + }, + { + "epoch": 0.1461613111925814, + "grad_norm": 1.1437804698944092, + "learning_rate": 
9.969389724788574e-06, + "loss": 0.7457, + "step": 2711 + }, + { + "epoch": 0.14621522536122494, + "grad_norm": 0.8632565140724182, + "learning_rate": 9.96936629710892e-06, + "loss": 0.8549, + "step": 2712 + }, + { + "epoch": 0.14626913952986845, + "grad_norm": 0.9616119265556335, + "learning_rate": 9.969342860495018e-06, + "loss": 0.6219, + "step": 2713 + }, + { + "epoch": 0.14632305369851198, + "grad_norm": 0.9943077564239502, + "learning_rate": 9.969319414946906e-06, + "loss": 0.8676, + "step": 2714 + }, + { + "epoch": 0.14637696786715548, + "grad_norm": 0.861070454120636, + "learning_rate": 9.969295960464627e-06, + "loss": 0.7235, + "step": 2715 + }, + { + "epoch": 0.14643088203579901, + "grad_norm": 0.9375396370887756, + "learning_rate": 9.969272497048225e-06, + "loss": 0.9169, + "step": 2716 + }, + { + "epoch": 0.14648479620444252, + "grad_norm": 0.8180664777755737, + "learning_rate": 9.969249024697741e-06, + "loss": 0.8109, + "step": 2717 + }, + { + "epoch": 0.14653871037308605, + "grad_norm": 0.8574398159980774, + "learning_rate": 9.969225543413218e-06, + "loss": 0.767, + "step": 2718 + }, + { + "epoch": 0.14659262454172955, + "grad_norm": 1.0249319076538086, + "learning_rate": 9.969202053194697e-06, + "loss": 0.902, + "step": 2719 + }, + { + "epoch": 0.1466465387103731, + "grad_norm": 0.8045467734336853, + "learning_rate": 9.96917855404222e-06, + "loss": 0.7797, + "step": 2720 + }, + { + "epoch": 0.14670045287901662, + "grad_norm": 0.880533754825592, + "learning_rate": 9.969155045955831e-06, + "loss": 0.8071, + "step": 2721 + }, + { + "epoch": 0.14675436704766012, + "grad_norm": 0.8733983635902405, + "learning_rate": 9.969131528935572e-06, + "loss": 0.8309, + "step": 2722 + }, + { + "epoch": 0.14680828121630365, + "grad_norm": 0.8205264210700989, + "learning_rate": 9.969108002981484e-06, + "loss": 0.8126, + "step": 2723 + }, + { + "epoch": 0.14686219538494716, + "grad_norm": 0.8250916600227356, + "learning_rate": 9.96908446809361e-06, + "loss": 0.7488, + 
"step": 2724 + }, + { + "epoch": 0.1469161095535907, + "grad_norm": 0.8082099556922913, + "learning_rate": 9.969060924271994e-06, + "loss": 0.8039, + "step": 2725 + }, + { + "epoch": 0.1469700237222342, + "grad_norm": 0.8376840353012085, + "learning_rate": 9.969037371516674e-06, + "loss": 0.7603, + "step": 2726 + }, + { + "epoch": 0.14702393789087773, + "grad_norm": 1.2106066942214966, + "learning_rate": 9.969013809827697e-06, + "loss": 0.8187, + "step": 2727 + }, + { + "epoch": 0.14707785205952123, + "grad_norm": 0.8828561305999756, + "learning_rate": 9.968990239205103e-06, + "loss": 0.7249, + "step": 2728 + }, + { + "epoch": 0.14713176622816476, + "grad_norm": 0.8182427883148193, + "learning_rate": 9.968966659648935e-06, + "loss": 0.8353, + "step": 2729 + }, + { + "epoch": 0.1471856803968083, + "grad_norm": 0.8091077208518982, + "learning_rate": 9.968943071159234e-06, + "loss": 0.8261, + "step": 2730 + }, + { + "epoch": 0.1472395945654518, + "grad_norm": 0.9515360593795776, + "learning_rate": 9.968919473736043e-06, + "loss": 0.9099, + "step": 2731 + }, + { + "epoch": 0.14729350873409533, + "grad_norm": 0.7404700517654419, + "learning_rate": 9.968895867379407e-06, + "loss": 0.7793, + "step": 2732 + }, + { + "epoch": 0.14734742290273883, + "grad_norm": 0.7887243032455444, + "learning_rate": 9.968872252089365e-06, + "loss": 0.8749, + "step": 2733 + }, + { + "epoch": 0.14740133707138237, + "grad_norm": 1.1335293054580688, + "learning_rate": 9.968848627865962e-06, + "loss": 0.8428, + "step": 2734 + }, + { + "epoch": 0.14745525124002587, + "grad_norm": 0.787325382232666, + "learning_rate": 9.968824994709238e-06, + "loss": 0.8026, + "step": 2735 + }, + { + "epoch": 0.1475091654086694, + "grad_norm": 0.8006013035774231, + "learning_rate": 9.968801352619238e-06, + "loss": 0.9083, + "step": 2736 + }, + { + "epoch": 0.1475630795773129, + "grad_norm": 0.8923180103302002, + "learning_rate": 9.968777701596002e-06, + "loss": 0.8628, + "step": 2737 + }, + { + "epoch": 
0.14761699374595644, + "grad_norm": 0.798041582107544, + "learning_rate": 9.968754041639573e-06, + "loss": 0.7519, + "step": 2738 + }, + { + "epoch": 0.14767090791459997, + "grad_norm": 0.8984145522117615, + "learning_rate": 9.968730372749996e-06, + "loss": 0.7624, + "step": 2739 + }, + { + "epoch": 0.14772482208324347, + "grad_norm": 0.8182528018951416, + "learning_rate": 9.968706694927312e-06, + "loss": 0.8442, + "step": 2740 + }, + { + "epoch": 0.147778736251887, + "grad_norm": 0.8047756552696228, + "learning_rate": 9.968683008171562e-06, + "loss": 0.847, + "step": 2741 + }, + { + "epoch": 0.1478326504205305, + "grad_norm": 0.7935258150100708, + "learning_rate": 9.968659312482792e-06, + "loss": 0.8072, + "step": 2742 + }, + { + "epoch": 0.14788656458917404, + "grad_norm": 0.8043146729469299, + "learning_rate": 9.968635607861042e-06, + "loss": 0.7769, + "step": 2743 + }, + { + "epoch": 0.14794047875781755, + "grad_norm": 0.7826459407806396, + "learning_rate": 9.968611894306356e-06, + "loss": 0.8418, + "step": 2744 + }, + { + "epoch": 0.14799439292646108, + "grad_norm": 0.9293491244316101, + "learning_rate": 9.968588171818775e-06, + "loss": 0.8704, + "step": 2745 + }, + { + "epoch": 0.14804830709510458, + "grad_norm": 0.8281397223472595, + "learning_rate": 9.968564440398343e-06, + "loss": 0.9288, + "step": 2746 + }, + { + "epoch": 0.1481022212637481, + "grad_norm": 0.8558036684989929, + "learning_rate": 9.968540700045101e-06, + "loss": 0.8406, + "step": 2747 + }, + { + "epoch": 0.14815613543239164, + "grad_norm": 0.8167025446891785, + "learning_rate": 9.968516950759096e-06, + "loss": 0.8268, + "step": 2748 + }, + { + "epoch": 0.14821004960103515, + "grad_norm": 0.8612670302391052, + "learning_rate": 9.968493192540364e-06, + "loss": 0.8265, + "step": 2749 + }, + { + "epoch": 0.14826396376967868, + "grad_norm": 0.9208493232727051, + "learning_rate": 9.968469425388953e-06, + "loss": 0.8555, + "step": 2750 + }, + { + "epoch": 0.14831787793832218, + "grad_norm": 
0.756591260433197, + "learning_rate": 9.968445649304904e-06, + "loss": 0.7655, + "step": 2751 + }, + { + "epoch": 0.14837179210696572, + "grad_norm": 0.8566586375236511, + "learning_rate": 9.96842186428826e-06, + "loss": 0.8125, + "step": 2752 + }, + { + "epoch": 0.14842570627560922, + "grad_norm": 0.7984357476234436, + "learning_rate": 9.968398070339063e-06, + "loss": 0.7307, + "step": 2753 + }, + { + "epoch": 0.14847962044425275, + "grad_norm": 0.8943261504173279, + "learning_rate": 9.968374267457356e-06, + "loss": 0.757, + "step": 2754 + }, + { + "epoch": 0.14853353461289626, + "grad_norm": 0.9466004967689514, + "learning_rate": 9.968350455643184e-06, + "loss": 0.8271, + "step": 2755 + }, + { + "epoch": 0.1485874487815398, + "grad_norm": 0.7604812383651733, + "learning_rate": 9.968326634896585e-06, + "loss": 0.7654, + "step": 2756 + }, + { + "epoch": 0.14864136295018332, + "grad_norm": 0.7803215384483337, + "learning_rate": 9.968302805217609e-06, + "loss": 0.7691, + "step": 2757 + }, + { + "epoch": 0.14869527711882682, + "grad_norm": 0.8579596281051636, + "learning_rate": 9.96827896660629e-06, + "loss": 0.859, + "step": 2758 + }, + { + "epoch": 0.14874919128747036, + "grad_norm": 0.8205640316009521, + "learning_rate": 9.968255119062679e-06, + "loss": 0.8588, + "step": 2759 + }, + { + "epoch": 0.14880310545611386, + "grad_norm": 0.8601415753364563, + "learning_rate": 9.968231262586814e-06, + "loss": 0.8399, + "step": 2760 + }, + { + "epoch": 0.1488570196247574, + "grad_norm": 0.8827456831932068, + "learning_rate": 9.96820739717874e-06, + "loss": 0.8413, + "step": 2761 + }, + { + "epoch": 0.1489109337934009, + "grad_norm": 0.7422264218330383, + "learning_rate": 9.968183522838499e-06, + "loss": 0.7451, + "step": 2762 + }, + { + "epoch": 0.14896484796204443, + "grad_norm": 0.9764127135276794, + "learning_rate": 9.968159639566133e-06, + "loss": 0.8436, + "step": 2763 + }, + { + "epoch": 0.14901876213068793, + "grad_norm": 0.7435232400894165, + "learning_rate": 
9.968135747361687e-06, + "loss": 0.7553, + "step": 2764 + }, + { + "epoch": 0.14907267629933146, + "grad_norm": 0.7399751543998718, + "learning_rate": 9.968111846225202e-06, + "loss": 0.7695, + "step": 2765 + }, + { + "epoch": 0.149126590467975, + "grad_norm": 0.882901668548584, + "learning_rate": 9.968087936156722e-06, + "loss": 0.8418, + "step": 2766 + }, + { + "epoch": 0.1491805046366185, + "grad_norm": 0.840501606464386, + "learning_rate": 9.968064017156292e-06, + "loss": 0.83, + "step": 2767 + }, + { + "epoch": 0.14923441880526203, + "grad_norm": 0.9809413552284241, + "learning_rate": 9.96804008922395e-06, + "loss": 0.8029, + "step": 2768 + }, + { + "epoch": 0.14928833297390554, + "grad_norm": 0.7534085512161255, + "learning_rate": 9.968016152359744e-06, + "loss": 0.7201, + "step": 2769 + }, + { + "epoch": 0.14934224714254907, + "grad_norm": 0.813582718372345, + "learning_rate": 9.967992206563714e-06, + "loss": 0.8533, + "step": 2770 + }, + { + "epoch": 0.14939616131119257, + "grad_norm": 0.9827276468276978, + "learning_rate": 9.967968251835905e-06, + "loss": 0.8097, + "step": 2771 + }, + { + "epoch": 0.1494500754798361, + "grad_norm": 0.828959047794342, + "learning_rate": 9.967944288176359e-06, + "loss": 0.859, + "step": 2772 + }, + { + "epoch": 0.1495039896484796, + "grad_norm": 0.8123818039894104, + "learning_rate": 9.967920315585118e-06, + "loss": 0.7044, + "step": 2773 + }, + { + "epoch": 0.14955790381712314, + "grad_norm": 0.7503589987754822, + "learning_rate": 9.967896334062228e-06, + "loss": 0.7255, + "step": 2774 + }, + { + "epoch": 0.14961181798576667, + "grad_norm": 0.7414034605026245, + "learning_rate": 9.96787234360773e-06, + "loss": 0.7599, + "step": 2775 + }, + { + "epoch": 0.14966573215441017, + "grad_norm": 0.7467254400253296, + "learning_rate": 9.967848344221667e-06, + "loss": 0.6835, + "step": 2776 + }, + { + "epoch": 0.1497196463230537, + "grad_norm": 0.8653414249420166, + "learning_rate": 9.967824335904082e-06, + "loss": 0.8205, + "step": 
2777 + }, + { + "epoch": 0.1497735604916972, + "grad_norm": 0.9113380312919617, + "learning_rate": 9.96780031865502e-06, + "loss": 0.8758, + "step": 2778 + }, + { + "epoch": 0.14982747466034074, + "grad_norm": 0.8330965042114258, + "learning_rate": 9.967776292474523e-06, + "loss": 0.8696, + "step": 2779 + }, + { + "epoch": 0.14988138882898425, + "grad_norm": 0.9087555408477783, + "learning_rate": 9.967752257362633e-06, + "loss": 0.8381, + "step": 2780 + }, + { + "epoch": 0.14993530299762778, + "grad_norm": 0.856777548789978, + "learning_rate": 9.967728213319394e-06, + "loss": 0.8365, + "step": 2781 + }, + { + "epoch": 0.14998921716627128, + "grad_norm": 0.8314496874809265, + "learning_rate": 9.967704160344852e-06, + "loss": 0.7403, + "step": 2782 + }, + { + "epoch": 0.15004313133491481, + "grad_norm": 0.8357448577880859, + "learning_rate": 9.967680098439047e-06, + "loss": 0.8256, + "step": 2783 + }, + { + "epoch": 0.15009704550355835, + "grad_norm": 0.8366092443466187, + "learning_rate": 9.967656027602023e-06, + "loss": 0.8221, + "step": 2784 + }, + { + "epoch": 0.15015095967220185, + "grad_norm": 0.7944943904876709, + "learning_rate": 9.967631947833823e-06, + "loss": 0.813, + "step": 2785 + }, + { + "epoch": 0.15020487384084538, + "grad_norm": 0.8407523036003113, + "learning_rate": 9.967607859134492e-06, + "loss": 0.8237, + "step": 2786 + }, + { + "epoch": 0.1502587880094889, + "grad_norm": 0.7879778146743774, + "learning_rate": 9.967583761504071e-06, + "loss": 0.777, + "step": 2787 + }, + { + "epoch": 0.15031270217813242, + "grad_norm": 0.8307899832725525, + "learning_rate": 9.967559654942604e-06, + "loss": 0.8394, + "step": 2788 + }, + { + "epoch": 0.15036661634677592, + "grad_norm": 0.8068673610687256, + "learning_rate": 9.967535539450135e-06, + "loss": 0.8435, + "step": 2789 + }, + { + "epoch": 0.15042053051541945, + "grad_norm": 0.8473932147026062, + "learning_rate": 9.967511415026709e-06, + "loss": 0.8698, + "step": 2790 + }, + { + "epoch": 
0.15047444468406296, + "grad_norm": 0.8352688550949097, + "learning_rate": 9.967487281672365e-06, + "loss": 0.8617, + "step": 2791 + }, + { + "epoch": 0.1505283588527065, + "grad_norm": 0.7729620337486267, + "learning_rate": 9.96746313938715e-06, + "loss": 0.779, + "step": 2792 + }, + { + "epoch": 0.15058227302135002, + "grad_norm": 0.8704085946083069, + "learning_rate": 9.967438988171106e-06, + "loss": 0.833, + "step": 2793 + }, + { + "epoch": 0.15063618718999353, + "grad_norm": 0.7538182735443115, + "learning_rate": 9.967414828024276e-06, + "loss": 0.7479, + "step": 2794 + }, + { + "epoch": 0.15069010135863706, + "grad_norm": 0.7672195434570312, + "learning_rate": 9.967390658946704e-06, + "loss": 0.7778, + "step": 2795 + }, + { + "epoch": 0.15074401552728056, + "grad_norm": 0.8245819211006165, + "learning_rate": 9.967366480938435e-06, + "loss": 0.6898, + "step": 2796 + }, + { + "epoch": 0.1507979296959241, + "grad_norm": 0.8197571635246277, + "learning_rate": 9.967342293999512e-06, + "loss": 0.8714, + "step": 2797 + }, + { + "epoch": 0.1508518438645676, + "grad_norm": 0.8135389685630798, + "learning_rate": 9.967318098129974e-06, + "loss": 0.8906, + "step": 2798 + }, + { + "epoch": 0.15090575803321113, + "grad_norm": 0.7287562489509583, + "learning_rate": 9.96729389332987e-06, + "loss": 0.7834, + "step": 2799 + }, + { + "epoch": 0.15095967220185466, + "grad_norm": 0.8642309904098511, + "learning_rate": 9.967269679599242e-06, + "loss": 0.7912, + "step": 2800 + }, + { + "epoch": 0.15101358637049817, + "grad_norm": 0.886060893535614, + "learning_rate": 9.967245456938132e-06, + "loss": 0.8614, + "step": 2801 + }, + { + "epoch": 0.1510675005391417, + "grad_norm": 0.8505488038063049, + "learning_rate": 9.967221225346584e-06, + "loss": 0.8323, + "step": 2802 + }, + { + "epoch": 0.1511214147077852, + "grad_norm": 0.8862965703010559, + "learning_rate": 9.967196984824644e-06, + "loss": 0.8292, + "step": 2803 + }, + { + "epoch": 0.15117532887642873, + "grad_norm": 
0.8016111254692078, + "learning_rate": 9.967172735372353e-06, + "loss": 0.643, + "step": 2804 + }, + { + "epoch": 0.15122924304507224, + "grad_norm": 0.7599527835845947, + "learning_rate": 9.967148476989755e-06, + "loss": 0.8166, + "step": 2805 + }, + { + "epoch": 0.15128315721371577, + "grad_norm": 0.9574166536331177, + "learning_rate": 9.967124209676894e-06, + "loss": 0.8867, + "step": 2806 + }, + { + "epoch": 0.15133707138235927, + "grad_norm": 0.8384936451911926, + "learning_rate": 9.967099933433815e-06, + "loss": 0.9021, + "step": 2807 + }, + { + "epoch": 0.1513909855510028, + "grad_norm": 0.7779715061187744, + "learning_rate": 9.967075648260559e-06, + "loss": 0.7672, + "step": 2808 + }, + { + "epoch": 0.15144489971964634, + "grad_norm": 0.7783359885215759, + "learning_rate": 9.96705135415717e-06, + "loss": 0.8012, + "step": 2809 + }, + { + "epoch": 0.15149881388828984, + "grad_norm": 0.9124150276184082, + "learning_rate": 9.967027051123695e-06, + "loss": 0.8803, + "step": 2810 + }, + { + "epoch": 0.15155272805693337, + "grad_norm": 0.8135334849357605, + "learning_rate": 9.967002739160173e-06, + "loss": 0.7764, + "step": 2811 + }, + { + "epoch": 0.15160664222557688, + "grad_norm": 0.8082837462425232, + "learning_rate": 9.966978418266651e-06, + "loss": 0.8552, + "step": 2812 + }, + { + "epoch": 0.1516605563942204, + "grad_norm": 0.7978013753890991, + "learning_rate": 9.966954088443171e-06, + "loss": 0.7321, + "step": 2813 + }, + { + "epoch": 0.1517144705628639, + "grad_norm": 0.7845378518104553, + "learning_rate": 9.966929749689778e-06, + "loss": 0.7694, + "step": 2814 + }, + { + "epoch": 0.15176838473150744, + "grad_norm": 0.8671941161155701, + "learning_rate": 9.966905402006516e-06, + "loss": 0.886, + "step": 2815 + }, + { + "epoch": 0.15182229890015095, + "grad_norm": 0.8316017389297485, + "learning_rate": 9.966881045393426e-06, + "loss": 0.8844, + "step": 2816 + }, + { + "epoch": 0.15187621306879448, + "grad_norm": 0.7372319102287292, + "learning_rate": 
9.966856679850554e-06, + "loss": 0.739, + "step": 2817 + }, + { + "epoch": 0.151930127237438, + "grad_norm": 0.7547122240066528, + "learning_rate": 9.966832305377944e-06, + "loss": 0.7518, + "step": 2818 + }, + { + "epoch": 0.15198404140608152, + "grad_norm": 0.8701632022857666, + "learning_rate": 9.96680792197564e-06, + "loss": 0.8632, + "step": 2819 + }, + { + "epoch": 0.15203795557472505, + "grad_norm": 0.7842714786529541, + "learning_rate": 9.966783529643686e-06, + "loss": 0.8161, + "step": 2820 + }, + { + "epoch": 0.15209186974336855, + "grad_norm": 0.858406126499176, + "learning_rate": 9.966759128382125e-06, + "loss": 0.7742, + "step": 2821 + }, + { + "epoch": 0.15214578391201208, + "grad_norm": 1.02357816696167, + "learning_rate": 9.966734718190998e-06, + "loss": 0.9142, + "step": 2822 + }, + { + "epoch": 0.1521996980806556, + "grad_norm": 0.81562739610672, + "learning_rate": 9.966710299070355e-06, + "loss": 0.8426, + "step": 2823 + }, + { + "epoch": 0.15225361224929912, + "grad_norm": 0.8576202988624573, + "learning_rate": 9.966685871020236e-06, + "loss": 0.7546, + "step": 2824 + }, + { + "epoch": 0.15230752641794262, + "grad_norm": 0.8974374532699585, + "learning_rate": 9.966661434040684e-06, + "loss": 0.7236, + "step": 2825 + }, + { + "epoch": 0.15236144058658616, + "grad_norm": 0.7306199073791504, + "learning_rate": 9.966636988131745e-06, + "loss": 0.7581, + "step": 2826 + }, + { + "epoch": 0.1524153547552297, + "grad_norm": 0.9296971559524536, + "learning_rate": 9.966612533293465e-06, + "loss": 0.9214, + "step": 2827 + }, + { + "epoch": 0.1524692689238732, + "grad_norm": 1.029969573020935, + "learning_rate": 9.966588069525885e-06, + "loss": 0.8371, + "step": 2828 + }, + { + "epoch": 0.15252318309251672, + "grad_norm": 0.869320809841156, + "learning_rate": 9.966563596829046e-06, + "loss": 0.6396, + "step": 2829 + }, + { + "epoch": 0.15257709726116023, + "grad_norm": 0.8893983960151672, + "learning_rate": 9.966539115202998e-06, + "loss": 0.8423, + "step": 
2830 + }, + { + "epoch": 0.15263101142980376, + "grad_norm": 0.823639452457428, + "learning_rate": 9.966514624647783e-06, + "loss": 0.7924, + "step": 2831 + }, + { + "epoch": 0.15268492559844726, + "grad_norm": 0.805551290512085, + "learning_rate": 9.966490125163444e-06, + "loss": 0.8091, + "step": 2832 + }, + { + "epoch": 0.1527388397670908, + "grad_norm": 0.9040341377258301, + "learning_rate": 9.966465616750025e-06, + "loss": 0.8924, + "step": 2833 + }, + { + "epoch": 0.1527927539357343, + "grad_norm": 0.8297836780548096, + "learning_rate": 9.966441099407572e-06, + "loss": 0.7538, + "step": 2834 + }, + { + "epoch": 0.15284666810437783, + "grad_norm": 0.8824244141578674, + "learning_rate": 9.966416573136127e-06, + "loss": 0.8892, + "step": 2835 + }, + { + "epoch": 0.15290058227302136, + "grad_norm": 1.0663546323776245, + "learning_rate": 9.966392037935734e-06, + "loss": 0.7809, + "step": 2836 + }, + { + "epoch": 0.15295449644166487, + "grad_norm": 0.8324514627456665, + "learning_rate": 9.966367493806439e-06, + "loss": 0.8308, + "step": 2837 + }, + { + "epoch": 0.1530084106103084, + "grad_norm": 0.7742459177970886, + "learning_rate": 9.966342940748286e-06, + "loss": 0.8269, + "step": 2838 + }, + { + "epoch": 0.1530623247789519, + "grad_norm": 0.9513984322547913, + "learning_rate": 9.966318378761317e-06, + "loss": 0.8538, + "step": 2839 + }, + { + "epoch": 0.15311623894759543, + "grad_norm": 0.8030692934989929, + "learning_rate": 9.966293807845577e-06, + "loss": 0.7752, + "step": 2840 + }, + { + "epoch": 0.15317015311623894, + "grad_norm": 0.8903285264968872, + "learning_rate": 9.966269228001112e-06, + "loss": 0.8556, + "step": 2841 + }, + { + "epoch": 0.15322406728488247, + "grad_norm": 0.8221173286437988, + "learning_rate": 9.966244639227962e-06, + "loss": 0.7249, + "step": 2842 + }, + { + "epoch": 0.15327798145352597, + "grad_norm": 0.9883365035057068, + "learning_rate": 9.966220041526176e-06, + "loss": 0.961, + "step": 2843 + }, + { + "epoch": 
0.1533318956221695, + "grad_norm": 0.8654862642288208, + "learning_rate": 9.966195434895796e-06, + "loss": 0.7779, + "step": 2844 + }, + { + "epoch": 0.15338580979081304, + "grad_norm": 0.7924084663391113, + "learning_rate": 9.966170819336866e-06, + "loss": 0.7706, + "step": 2845 + }, + { + "epoch": 0.15343972395945654, + "grad_norm": 0.8227209448814392, + "learning_rate": 9.96614619484943e-06, + "loss": 0.8659, + "step": 2846 + }, + { + "epoch": 0.15349363812810007, + "grad_norm": 0.9436708688735962, + "learning_rate": 9.966121561433534e-06, + "loss": 0.87, + "step": 2847 + }, + { + "epoch": 0.15354755229674358, + "grad_norm": 1.137171983718872, + "learning_rate": 9.96609691908922e-06, + "loss": 0.7883, + "step": 2848 + }, + { + "epoch": 0.1536014664653871, + "grad_norm": 0.8868550658226013, + "learning_rate": 9.966072267816535e-06, + "loss": 0.8309, + "step": 2849 + }, + { + "epoch": 0.1536553806340306, + "grad_norm": 0.7190971970558167, + "learning_rate": 9.966047607615521e-06, + "loss": 0.6938, + "step": 2850 + }, + { + "epoch": 0.15370929480267415, + "grad_norm": 0.883866548538208, + "learning_rate": 9.966022938486223e-06, + "loss": 0.8368, + "step": 2851 + }, + { + "epoch": 0.15376320897131765, + "grad_norm": 0.9433422684669495, + "learning_rate": 9.965998260428686e-06, + "loss": 0.7739, + "step": 2852 + }, + { + "epoch": 0.15381712313996118, + "grad_norm": 0.9166012406349182, + "learning_rate": 9.965973573442956e-06, + "loss": 0.8308, + "step": 2853 + }, + { + "epoch": 0.1538710373086047, + "grad_norm": 0.8955514430999756, + "learning_rate": 9.965948877529071e-06, + "loss": 0.8403, + "step": 2854 + }, + { + "epoch": 0.15392495147724822, + "grad_norm": 0.8281451463699341, + "learning_rate": 9.965924172687083e-06, + "loss": 0.8127, + "step": 2855 + }, + { + "epoch": 0.15397886564589175, + "grad_norm": 0.8765435218811035, + "learning_rate": 9.965899458917031e-06, + "loss": 0.87, + "step": 2856 + }, + { + "epoch": 0.15403277981453525, + "grad_norm": 
0.9525101780891418, + "learning_rate": 9.965874736218964e-06, + "loss": 0.8665, + "step": 2857 + }, + { + "epoch": 0.15408669398317879, + "grad_norm": 0.7836191654205322, + "learning_rate": 9.965850004592921e-06, + "loss": 0.8261, + "step": 2858 + }, + { + "epoch": 0.1541406081518223, + "grad_norm": 0.7918692827224731, + "learning_rate": 9.96582526403895e-06, + "loss": 0.8422, + "step": 2859 + }, + { + "epoch": 0.15419452232046582, + "grad_norm": 0.8489586710929871, + "learning_rate": 9.965800514557096e-06, + "loss": 0.8871, + "step": 2860 + }, + { + "epoch": 0.15424843648910933, + "grad_norm": 0.9581596255302429, + "learning_rate": 9.965775756147402e-06, + "loss": 0.9346, + "step": 2861 + }, + { + "epoch": 0.15430235065775286, + "grad_norm": 1.0253969430923462, + "learning_rate": 9.965750988809913e-06, + "loss": 0.8381, + "step": 2862 + }, + { + "epoch": 0.1543562648263964, + "grad_norm": 0.8403491377830505, + "learning_rate": 9.965726212544674e-06, + "loss": 0.8307, + "step": 2863 + }, + { + "epoch": 0.1544101789950399, + "grad_norm": 0.729560375213623, + "learning_rate": 9.965701427351728e-06, + "loss": 0.8021, + "step": 2864 + }, + { + "epoch": 0.15446409316368342, + "grad_norm": 0.7576143741607666, + "learning_rate": 9.965676633231121e-06, + "loss": 0.7896, + "step": 2865 + }, + { + "epoch": 0.15451800733232693, + "grad_norm": 1.100948452949524, + "learning_rate": 9.965651830182898e-06, + "loss": 0.797, + "step": 2866 + }, + { + "epoch": 0.15457192150097046, + "grad_norm": 1.0760526657104492, + "learning_rate": 9.965627018207102e-06, + "loss": 0.7875, + "step": 2867 + }, + { + "epoch": 0.15462583566961396, + "grad_norm": 0.8553655743598938, + "learning_rate": 9.96560219730378e-06, + "loss": 0.872, + "step": 2868 + }, + { + "epoch": 0.1546797498382575, + "grad_norm": 1.1357450485229492, + "learning_rate": 9.965577367472971e-06, + "loss": 0.7306, + "step": 2869 + }, + { + "epoch": 0.154733664006901, + "grad_norm": 0.8308514952659607, + "learning_rate": 
9.965552528714725e-06, + "loss": 0.8106, + "step": 2870 + }, + { + "epoch": 0.15478757817554453, + "grad_norm": 0.8406074047088623, + "learning_rate": 9.965527681029088e-06, + "loss": 0.9085, + "step": 2871 + }, + { + "epoch": 0.15484149234418806, + "grad_norm": 0.8215218186378479, + "learning_rate": 9.9655028244161e-06, + "loss": 0.733, + "step": 2872 + }, + { + "epoch": 0.15489540651283157, + "grad_norm": 1.0004653930664062, + "learning_rate": 9.965477958875806e-06, + "loss": 0.8625, + "step": 2873 + }, + { + "epoch": 0.1549493206814751, + "grad_norm": 0.8359742760658264, + "learning_rate": 9.965453084408256e-06, + "loss": 0.7847, + "step": 2874 + }, + { + "epoch": 0.1550032348501186, + "grad_norm": 1.0257774591445923, + "learning_rate": 9.965428201013488e-06, + "loss": 0.8654, + "step": 2875 + }, + { + "epoch": 0.15505714901876214, + "grad_norm": 0.7931713461875916, + "learning_rate": 9.96540330869155e-06, + "loss": 0.7498, + "step": 2876 + }, + { + "epoch": 0.15511106318740564, + "grad_norm": 0.7873162031173706, + "learning_rate": 9.965378407442488e-06, + "loss": 0.7617, + "step": 2877 + }, + { + "epoch": 0.15516497735604917, + "grad_norm": 0.8008442521095276, + "learning_rate": 9.965353497266346e-06, + "loss": 0.8464, + "step": 2878 + }, + { + "epoch": 0.15521889152469268, + "grad_norm": 0.798004686832428, + "learning_rate": 9.965328578163166e-06, + "loss": 0.8519, + "step": 2879 + }, + { + "epoch": 0.1552728056933362, + "grad_norm": 0.8730151057243347, + "learning_rate": 9.965303650132996e-06, + "loss": 0.8257, + "step": 2880 + }, + { + "epoch": 0.15532671986197974, + "grad_norm": 0.7465460896492004, + "learning_rate": 9.965278713175879e-06, + "loss": 0.7786, + "step": 2881 + }, + { + "epoch": 0.15538063403062324, + "grad_norm": 0.9565917253494263, + "learning_rate": 9.96525376729186e-06, + "loss": 0.8694, + "step": 2882 + }, + { + "epoch": 0.15543454819926678, + "grad_norm": 0.880181074142456, + "learning_rate": 9.965228812480987e-06, + "loss": 0.813, + 
"step": 2883 + }, + { + "epoch": 0.15548846236791028, + "grad_norm": 0.7912368774414062, + "learning_rate": 9.965203848743299e-06, + "loss": 0.7764, + "step": 2884 + }, + { + "epoch": 0.1555423765365538, + "grad_norm": 0.8370791077613831, + "learning_rate": 9.965178876078846e-06, + "loss": 0.8591, + "step": 2885 + }, + { + "epoch": 0.15559629070519732, + "grad_norm": 0.8508057594299316, + "learning_rate": 9.965153894487672e-06, + "loss": 0.8535, + "step": 2886 + }, + { + "epoch": 0.15565020487384085, + "grad_norm": 1.0393366813659668, + "learning_rate": 9.965128903969818e-06, + "loss": 0.8032, + "step": 2887 + }, + { + "epoch": 0.15570411904248435, + "grad_norm": 0.7545601725578308, + "learning_rate": 9.965103904525334e-06, + "loss": 0.7024, + "step": 2888 + }, + { + "epoch": 0.15575803321112788, + "grad_norm": 0.7933251261711121, + "learning_rate": 9.965078896154262e-06, + "loss": 0.8325, + "step": 2889 + }, + { + "epoch": 0.15581194737977141, + "grad_norm": 0.8319270610809326, + "learning_rate": 9.965053878856648e-06, + "loss": 0.7781, + "step": 2890 + }, + { + "epoch": 0.15586586154841492, + "grad_norm": 1.0789637565612793, + "learning_rate": 9.965028852632537e-06, + "loss": 0.7931, + "step": 2891 + }, + { + "epoch": 0.15591977571705845, + "grad_norm": 0.9561448097229004, + "learning_rate": 9.965003817481974e-06, + "loss": 0.7472, + "step": 2892 + }, + { + "epoch": 0.15597368988570195, + "grad_norm": 0.9099969267845154, + "learning_rate": 9.964978773405003e-06, + "loss": 0.9154, + "step": 2893 + }, + { + "epoch": 0.1560276040543455, + "grad_norm": 0.9164708852767944, + "learning_rate": 9.96495372040167e-06, + "loss": 0.8552, + "step": 2894 + }, + { + "epoch": 0.156081518222989, + "grad_norm": 0.9367608428001404, + "learning_rate": 9.96492865847202e-06, + "loss": 0.7926, + "step": 2895 + }, + { + "epoch": 0.15613543239163252, + "grad_norm": 0.8970937728881836, + "learning_rate": 9.9649035876161e-06, + "loss": 0.8798, + "step": 2896 + }, + { + "epoch": 
0.15618934656027603, + "grad_norm": 0.8037889003753662, + "learning_rate": 9.96487850783395e-06, + "loss": 0.8157, + "step": 2897 + }, + { + "epoch": 0.15624326072891956, + "grad_norm": 0.906944215297699, + "learning_rate": 9.964853419125619e-06, + "loss": 0.8191, + "step": 2898 + }, + { + "epoch": 0.1562971748975631, + "grad_norm": 0.8197054266929626, + "learning_rate": 9.964828321491152e-06, + "loss": 0.7899, + "step": 2899 + }, + { + "epoch": 0.1563510890662066, + "grad_norm": 0.7816088795661926, + "learning_rate": 9.96480321493059e-06, + "loss": 0.8113, + "step": 2900 + }, + { + "epoch": 0.15640500323485013, + "grad_norm": 0.8319717645645142, + "learning_rate": 9.964778099443985e-06, + "loss": 0.7835, + "step": 2901 + }, + { + "epoch": 0.15645891740349363, + "grad_norm": 0.7739672660827637, + "learning_rate": 9.964752975031378e-06, + "loss": 0.7813, + "step": 2902 + }, + { + "epoch": 0.15651283157213716, + "grad_norm": 0.8002716898918152, + "learning_rate": 9.964727841692815e-06, + "loss": 0.7971, + "step": 2903 + }, + { + "epoch": 0.15656674574078067, + "grad_norm": 0.8796008229255676, + "learning_rate": 9.964702699428339e-06, + "loss": 0.7462, + "step": 2904 + }, + { + "epoch": 0.1566206599094242, + "grad_norm": 0.837027907371521, + "learning_rate": 9.964677548237998e-06, + "loss": 0.864, + "step": 2905 + }, + { + "epoch": 0.15667457407806773, + "grad_norm": 0.9098290205001831, + "learning_rate": 9.964652388121837e-06, + "loss": 0.9079, + "step": 2906 + }, + { + "epoch": 0.15672848824671123, + "grad_norm": 0.7707619071006775, + "learning_rate": 9.964627219079898e-06, + "loss": 0.7472, + "step": 2907 + }, + { + "epoch": 0.15678240241535477, + "grad_norm": 1.0109550952911377, + "learning_rate": 9.964602041112233e-06, + "loss": 0.8981, + "step": 2908 + }, + { + "epoch": 0.15683631658399827, + "grad_norm": 0.8410045504570007, + "learning_rate": 9.964576854218882e-06, + "loss": 0.8488, + "step": 2909 + }, + { + "epoch": 0.1568902307526418, + "grad_norm": 
0.8624899983406067, + "learning_rate": 9.96455165839989e-06, + "loss": 0.817, + "step": 2910 + }, + { + "epoch": 0.1569441449212853, + "grad_norm": 0.9060286283493042, + "learning_rate": 9.964526453655304e-06, + "loss": 0.8171, + "step": 2911 + }, + { + "epoch": 0.15699805908992884, + "grad_norm": 0.7718086838722229, + "learning_rate": 9.96450123998517e-06, + "loss": 0.7158, + "step": 2912 + }, + { + "epoch": 0.15705197325857234, + "grad_norm": 0.8690425157546997, + "learning_rate": 9.96447601738953e-06, + "loss": 0.8347, + "step": 2913 + }, + { + "epoch": 0.15710588742721587, + "grad_norm": 0.782656192779541, + "learning_rate": 9.964450785868433e-06, + "loss": 0.7581, + "step": 2914 + }, + { + "epoch": 0.1571598015958594, + "grad_norm": 1.0090769529342651, + "learning_rate": 9.964425545421924e-06, + "loss": 0.8179, + "step": 2915 + }, + { + "epoch": 0.1572137157645029, + "grad_norm": 0.8786135911941528, + "learning_rate": 9.964400296050047e-06, + "loss": 0.8733, + "step": 2916 + }, + { + "epoch": 0.15726762993314644, + "grad_norm": 0.8163133859634399, + "learning_rate": 9.964375037752847e-06, + "loss": 0.8091, + "step": 2917 + }, + { + "epoch": 0.15732154410178995, + "grad_norm": 0.8213543891906738, + "learning_rate": 9.964349770530371e-06, + "loss": 0.7978, + "step": 2918 + }, + { + "epoch": 0.15737545827043348, + "grad_norm": 0.849274218082428, + "learning_rate": 9.964324494382663e-06, + "loss": 0.8168, + "step": 2919 + }, + { + "epoch": 0.15742937243907698, + "grad_norm": 0.8099618554115295, + "learning_rate": 9.964299209309769e-06, + "loss": 0.8372, + "step": 2920 + }, + { + "epoch": 0.1574832866077205, + "grad_norm": 0.9064434766769409, + "learning_rate": 9.964273915311734e-06, + "loss": 0.8681, + "step": 2921 + }, + { + "epoch": 0.15753720077636402, + "grad_norm": 0.7269558310508728, + "learning_rate": 9.964248612388607e-06, + "loss": 0.7179, + "step": 2922 + }, + { + "epoch": 0.15759111494500755, + "grad_norm": 0.8115706443786621, + "learning_rate": 
9.964223300540427e-06, + "loss": 0.8572, + "step": 2923 + }, + { + "epoch": 0.15764502911365108, + "grad_norm": 0.8180872797966003, + "learning_rate": 9.964197979767246e-06, + "loss": 0.7463, + "step": 2924 + }, + { + "epoch": 0.15769894328229458, + "grad_norm": 0.741603434085846, + "learning_rate": 9.964172650069105e-06, + "loss": 0.7646, + "step": 2925 + }, + { + "epoch": 0.15775285745093812, + "grad_norm": 0.7558543682098389, + "learning_rate": 9.964147311446051e-06, + "loss": 0.7363, + "step": 2926 + }, + { + "epoch": 0.15780677161958162, + "grad_norm": 0.8128615617752075, + "learning_rate": 9.96412196389813e-06, + "loss": 0.8515, + "step": 2927 + }, + { + "epoch": 0.15786068578822515, + "grad_norm": 0.9731131196022034, + "learning_rate": 9.964096607425388e-06, + "loss": 0.8847, + "step": 2928 + }, + { + "epoch": 0.15791459995686866, + "grad_norm": 1.136883020401001, + "learning_rate": 9.964071242027868e-06, + "loss": 0.8457, + "step": 2929 + }, + { + "epoch": 0.1579685141255122, + "grad_norm": 0.7780461311340332, + "learning_rate": 9.964045867705618e-06, + "loss": 0.737, + "step": 2930 + }, + { + "epoch": 0.1580224282941557, + "grad_norm": 0.801013708114624, + "learning_rate": 9.964020484458684e-06, + "loss": 0.8164, + "step": 2931 + }, + { + "epoch": 0.15807634246279922, + "grad_norm": 0.8851730823516846, + "learning_rate": 9.96399509228711e-06, + "loss": 0.8762, + "step": 2932 + }, + { + "epoch": 0.15813025663144276, + "grad_norm": 0.9501338005065918, + "learning_rate": 9.963969691190942e-06, + "loss": 0.7788, + "step": 2933 + }, + { + "epoch": 0.15818417080008626, + "grad_norm": 0.9714099168777466, + "learning_rate": 9.963944281170227e-06, + "loss": 0.9207, + "step": 2934 + }, + { + "epoch": 0.1582380849687298, + "grad_norm": 0.764689564704895, + "learning_rate": 9.963918862225009e-06, + "loss": 0.737, + "step": 2935 + }, + { + "epoch": 0.1582919991373733, + "grad_norm": 1.1618343591690063, + "learning_rate": 9.963893434355335e-06, + "loss": 0.8055, + 
"step": 2936 + }, + { + "epoch": 0.15834591330601683, + "grad_norm": 0.8724596500396729, + "learning_rate": 9.96386799756125e-06, + "loss": 0.8449, + "step": 2937 + }, + { + "epoch": 0.15839982747466033, + "grad_norm": 0.7769358158111572, + "learning_rate": 9.963842551842798e-06, + "loss": 0.8155, + "step": 2938 + }, + { + "epoch": 0.15845374164330386, + "grad_norm": 0.8337542414665222, + "learning_rate": 9.963817097200028e-06, + "loss": 0.7331, + "step": 2939 + }, + { + "epoch": 0.15850765581194737, + "grad_norm": 0.8240610957145691, + "learning_rate": 9.963791633632984e-06, + "loss": 0.8076, + "step": 2940 + }, + { + "epoch": 0.1585615699805909, + "grad_norm": 0.7781216502189636, + "learning_rate": 9.963766161141713e-06, + "loss": 0.7274, + "step": 2941 + }, + { + "epoch": 0.15861548414923443, + "grad_norm": 0.8469343781471252, + "learning_rate": 9.96374067972626e-06, + "loss": 0.8364, + "step": 2942 + }, + { + "epoch": 0.15866939831787794, + "grad_norm": 0.7859261631965637, + "learning_rate": 9.963715189386669e-06, + "loss": 0.8006, + "step": 2943 + }, + { + "epoch": 0.15872331248652147, + "grad_norm": 0.8646130561828613, + "learning_rate": 9.963689690122988e-06, + "loss": 0.808, + "step": 2944 + }, + { + "epoch": 0.15877722665516497, + "grad_norm": 0.8905766010284424, + "learning_rate": 9.963664181935263e-06, + "loss": 0.8406, + "step": 2945 + }, + { + "epoch": 0.1588311408238085, + "grad_norm": 0.8756605982780457, + "learning_rate": 9.963638664823539e-06, + "loss": 0.8643, + "step": 2946 + }, + { + "epoch": 0.158885054992452, + "grad_norm": 0.899135410785675, + "learning_rate": 9.963613138787862e-06, + "loss": 0.9063, + "step": 2947 + }, + { + "epoch": 0.15893896916109554, + "grad_norm": 0.8382771015167236, + "learning_rate": 9.96358760382828e-06, + "loss": 0.8004, + "step": 2948 + }, + { + "epoch": 0.15899288332973904, + "grad_norm": 0.7687328457832336, + "learning_rate": 9.963562059944833e-06, + "loss": 0.7695, + "step": 2949 + }, + { + "epoch": 
0.15904679749838257, + "grad_norm": 0.807344913482666, + "learning_rate": 9.963536507137574e-06, + "loss": 0.7514, + "step": 2950 + }, + { + "epoch": 0.1591007116670261, + "grad_norm": 0.7882648706436157, + "learning_rate": 9.963510945406545e-06, + "loss": 0.7537, + "step": 2951 + }, + { + "epoch": 0.1591546258356696, + "grad_norm": 0.8422887921333313, + "learning_rate": 9.963485374751793e-06, + "loss": 0.7937, + "step": 2952 + }, + { + "epoch": 0.15920854000431314, + "grad_norm": 0.7578607797622681, + "learning_rate": 9.963459795173362e-06, + "loss": 0.8071, + "step": 2953 + }, + { + "epoch": 0.15926245417295665, + "grad_norm": 0.8854062557220459, + "learning_rate": 9.963434206671302e-06, + "loss": 0.9078, + "step": 2954 + }, + { + "epoch": 0.15931636834160018, + "grad_norm": 0.8705536723136902, + "learning_rate": 9.963408609245654e-06, + "loss": 0.7971, + "step": 2955 + }, + { + "epoch": 0.15937028251024368, + "grad_norm": 0.8247761726379395, + "learning_rate": 9.96338300289647e-06, + "loss": 0.7889, + "step": 2956 + }, + { + "epoch": 0.15942419667888721, + "grad_norm": 0.8216410279273987, + "learning_rate": 9.96335738762379e-06, + "loss": 0.9097, + "step": 2957 + }, + { + "epoch": 0.15947811084753072, + "grad_norm": 0.9624109268188477, + "learning_rate": 9.963331763427666e-06, + "loss": 0.8562, + "step": 2958 + }, + { + "epoch": 0.15953202501617425, + "grad_norm": 0.8426920175552368, + "learning_rate": 9.96330613030814e-06, + "loss": 0.8011, + "step": 2959 + }, + { + "epoch": 0.15958593918481778, + "grad_norm": 0.8987439870834351, + "learning_rate": 9.963280488265256e-06, + "loss": 0.7965, + "step": 2960 + }, + { + "epoch": 0.1596398533534613, + "grad_norm": 0.8105943202972412, + "learning_rate": 9.963254837299066e-06, + "loss": 0.8178, + "step": 2961 + }, + { + "epoch": 0.15969376752210482, + "grad_norm": 0.928841769695282, + "learning_rate": 9.963229177409612e-06, + "loss": 0.8106, + "step": 2962 + }, + { + "epoch": 0.15974768169074832, + "grad_norm": 
0.7369773983955383, + "learning_rate": 9.963203508596942e-06, + "loss": 0.7401, + "step": 2963 + }, + { + "epoch": 0.15980159585939185, + "grad_norm": 0.7476964592933655, + "learning_rate": 9.9631778308611e-06, + "loss": 0.8112, + "step": 2964 + }, + { + "epoch": 0.15985551002803536, + "grad_norm": 0.8257710337638855, + "learning_rate": 9.963152144202135e-06, + "loss": 0.8489, + "step": 2965 + }, + { + "epoch": 0.1599094241966789, + "grad_norm": 0.8324301242828369, + "learning_rate": 9.963126448620091e-06, + "loss": 0.8511, + "step": 2966 + }, + { + "epoch": 0.1599633383653224, + "grad_norm": 0.8221176266670227, + "learning_rate": 9.963100744115017e-06, + "loss": 0.7924, + "step": 2967 + }, + { + "epoch": 0.16001725253396593, + "grad_norm": 0.7942221164703369, + "learning_rate": 9.963075030686955e-06, + "loss": 0.7936, + "step": 2968 + }, + { + "epoch": 0.16007116670260946, + "grad_norm": 0.7341020107269287, + "learning_rate": 9.963049308335954e-06, + "loss": 0.7381, + "step": 2969 + }, + { + "epoch": 0.16012508087125296, + "grad_norm": 0.8118404746055603, + "learning_rate": 9.963023577062062e-06, + "loss": 0.756, + "step": 2970 + }, + { + "epoch": 0.1601789950398965, + "grad_norm": 0.7517318725585938, + "learning_rate": 9.96299783686532e-06, + "loss": 0.7051, + "step": 2971 + }, + { + "epoch": 0.16023290920854, + "grad_norm": 0.7982935905456543, + "learning_rate": 9.962972087745777e-06, + "loss": 0.8412, + "step": 2972 + }, + { + "epoch": 0.16028682337718353, + "grad_norm": 0.8397754430770874, + "learning_rate": 9.962946329703482e-06, + "loss": 0.8314, + "step": 2973 + }, + { + "epoch": 0.16034073754582703, + "grad_norm": 0.8342095613479614, + "learning_rate": 9.962920562738477e-06, + "loss": 0.7649, + "step": 2974 + }, + { + "epoch": 0.16039465171447057, + "grad_norm": 0.8053215742111206, + "learning_rate": 9.96289478685081e-06, + "loss": 0.7315, + "step": 2975 + }, + { + "epoch": 0.16044856588311407, + "grad_norm": 0.8931438326835632, + "learning_rate": 
9.962869002040529e-06, + "loss": 0.9241, + "step": 2976 + }, + { + "epoch": 0.1605024800517576, + "grad_norm": 0.8217912316322327, + "learning_rate": 9.962843208307677e-06, + "loss": 0.7551, + "step": 2977 + }, + { + "epoch": 0.16055639422040113, + "grad_norm": 0.7592090964317322, + "learning_rate": 9.962817405652305e-06, + "loss": 0.7243, + "step": 2978 + }, + { + "epoch": 0.16061030838904464, + "grad_norm": 0.8466029167175293, + "learning_rate": 9.962791594074455e-06, + "loss": 0.785, + "step": 2979 + }, + { + "epoch": 0.16066422255768817, + "grad_norm": 0.859207272529602, + "learning_rate": 9.962765773574174e-06, + "loss": 0.8344, + "step": 2980 + }, + { + "epoch": 0.16071813672633167, + "grad_norm": 0.8134403824806213, + "learning_rate": 9.962739944151511e-06, + "loss": 0.7595, + "step": 2981 + }, + { + "epoch": 0.1607720508949752, + "grad_norm": 0.7411110401153564, + "learning_rate": 9.962714105806511e-06, + "loss": 0.7751, + "step": 2982 + }, + { + "epoch": 0.1608259650636187, + "grad_norm": 0.7976831793785095, + "learning_rate": 9.962688258539219e-06, + "loss": 0.7353, + "step": 2983 + }, + { + "epoch": 0.16087987923226224, + "grad_norm": 0.8306836485862732, + "learning_rate": 9.962662402349684e-06, + "loss": 0.7903, + "step": 2984 + }, + { + "epoch": 0.16093379340090574, + "grad_norm": 0.794691264629364, + "learning_rate": 9.96263653723795e-06, + "loss": 0.7972, + "step": 2985 + }, + { + "epoch": 0.16098770756954928, + "grad_norm": 0.7471837401390076, + "learning_rate": 9.962610663204066e-06, + "loss": 0.7994, + "step": 2986 + }, + { + "epoch": 0.1610416217381928, + "grad_norm": 0.8046342134475708, + "learning_rate": 9.962584780248079e-06, + "loss": 0.7912, + "step": 2987 + }, + { + "epoch": 0.1610955359068363, + "grad_norm": 0.7935966849327087, + "learning_rate": 9.96255888837003e-06, + "loss": 0.8053, + "step": 2988 + }, + { + "epoch": 0.16114945007547984, + "grad_norm": 0.7403679490089417, + "learning_rate": 9.962532987569973e-06, + "loss": 0.6707, + 
"step": 2989 + }, + { + "epoch": 0.16120336424412335, + "grad_norm": 0.8277058005332947, + "learning_rate": 9.96250707784795e-06, + "loss": 0.8074, + "step": 2990 + }, + { + "epoch": 0.16125727841276688, + "grad_norm": 1.0225850343704224, + "learning_rate": 9.962481159204008e-06, + "loss": 0.8475, + "step": 2991 + }, + { + "epoch": 0.16131119258141038, + "grad_norm": 0.8091806769371033, + "learning_rate": 9.962455231638193e-06, + "loss": 0.7714, + "step": 2992 + }, + { + "epoch": 0.16136510675005392, + "grad_norm": 0.7496880292892456, + "learning_rate": 9.962429295150554e-06, + "loss": 0.7449, + "step": 2993 + }, + { + "epoch": 0.16141902091869742, + "grad_norm": 0.7799220085144043, + "learning_rate": 9.962403349741137e-06, + "loss": 0.7241, + "step": 2994 + }, + { + "epoch": 0.16147293508734095, + "grad_norm": 0.92058926820755, + "learning_rate": 9.962377395409986e-06, + "loss": 0.8374, + "step": 2995 + }, + { + "epoch": 0.16152684925598448, + "grad_norm": 0.7713897228240967, + "learning_rate": 9.96235143215715e-06, + "loss": 0.7571, + "step": 2996 + }, + { + "epoch": 0.161580763424628, + "grad_norm": 0.779852032661438, + "learning_rate": 9.962325459982678e-06, + "loss": 0.796, + "step": 2997 + }, + { + "epoch": 0.16163467759327152, + "grad_norm": 0.8362038731575012, + "learning_rate": 9.962299478886613e-06, + "loss": 0.8645, + "step": 2998 + }, + { + "epoch": 0.16168859176191502, + "grad_norm": 0.8759078979492188, + "learning_rate": 9.962273488869003e-06, + "loss": 0.8192, + "step": 2999 + }, + { + "epoch": 0.16174250593055856, + "grad_norm": 0.7853894233703613, + "learning_rate": 9.962247489929892e-06, + "loss": 0.81, + "step": 3000 + }, + { + "epoch": 0.16179642009920206, + "grad_norm": 0.8752580881118774, + "learning_rate": 9.962221482069332e-06, + "loss": 0.8172, + "step": 3001 + }, + { + "epoch": 0.1618503342678456, + "grad_norm": 0.8129578828811646, + "learning_rate": 9.962195465287367e-06, + "loss": 0.698, + "step": 3002 + }, + { + "epoch": 
0.1619042484364891, + "grad_norm": 0.7905570268630981, + "learning_rate": 9.962169439584043e-06, + "loss": 0.7755, + "step": 3003 + }, + { + "epoch": 0.16195816260513263, + "grad_norm": 1.1296168565750122, + "learning_rate": 9.962143404959408e-06, + "loss": 0.829, + "step": 3004 + }, + { + "epoch": 0.16201207677377616, + "grad_norm": 0.8880928158760071, + "learning_rate": 9.962117361413508e-06, + "loss": 0.8542, + "step": 3005 + }, + { + "epoch": 0.16206599094241966, + "grad_norm": 0.7933239936828613, + "learning_rate": 9.96209130894639e-06, + "loss": 0.714, + "step": 3006 + }, + { + "epoch": 0.1621199051110632, + "grad_norm": 0.8112434148788452, + "learning_rate": 9.962065247558101e-06, + "loss": 0.7967, + "step": 3007 + }, + { + "epoch": 0.1621738192797067, + "grad_norm": 0.7101603150367737, + "learning_rate": 9.962039177248689e-06, + "loss": 0.7054, + "step": 3008 + }, + { + "epoch": 0.16222773344835023, + "grad_norm": 0.9327304363250732, + "learning_rate": 9.962013098018198e-06, + "loss": 0.7683, + "step": 3009 + }, + { + "epoch": 0.16228164761699373, + "grad_norm": 0.8223574161529541, + "learning_rate": 9.961987009866678e-06, + "loss": 0.7174, + "step": 3010 + }, + { + "epoch": 0.16233556178563727, + "grad_norm": 0.889711856842041, + "learning_rate": 9.961960912794176e-06, + "loss": 0.8562, + "step": 3011 + }, + { + "epoch": 0.1623894759542808, + "grad_norm": 0.9297184348106384, + "learning_rate": 9.961934806800736e-06, + "loss": 0.8887, + "step": 3012 + }, + { + "epoch": 0.1624433901229243, + "grad_norm": 0.8206717371940613, + "learning_rate": 9.961908691886404e-06, + "loss": 0.8272, + "step": 3013 + }, + { + "epoch": 0.16249730429156783, + "grad_norm": 0.7833002805709839, + "learning_rate": 9.961882568051233e-06, + "loss": 0.848, + "step": 3014 + }, + { + "epoch": 0.16255121846021134, + "grad_norm": 0.8386265635490417, + "learning_rate": 9.961856435295265e-06, + "loss": 0.7528, + "step": 3015 + }, + { + "epoch": 0.16260513262885487, + "grad_norm": 
0.8227097392082214, + "learning_rate": 9.961830293618547e-06, + "loss": 0.8181, + "step": 3016 + }, + { + "epoch": 0.16265904679749837, + "grad_norm": 0.7938892245292664, + "learning_rate": 9.96180414302113e-06, + "loss": 0.8293, + "step": 3017 + }, + { + "epoch": 0.1627129609661419, + "grad_norm": 1.1556557416915894, + "learning_rate": 9.961777983503056e-06, + "loss": 0.9544, + "step": 3018 + }, + { + "epoch": 0.1627668751347854, + "grad_norm": 0.8379788994789124, + "learning_rate": 9.961751815064375e-06, + "loss": 0.7168, + "step": 3019 + }, + { + "epoch": 0.16282078930342894, + "grad_norm": 0.9397227764129639, + "learning_rate": 9.961725637705134e-06, + "loss": 0.8804, + "step": 3020 + }, + { + "epoch": 0.16287470347207247, + "grad_norm": 0.8950162529945374, + "learning_rate": 9.96169945142538e-06, + "loss": 0.8652, + "step": 3021 + }, + { + "epoch": 0.16292861764071598, + "grad_norm": 0.8643755912780762, + "learning_rate": 9.961673256225159e-06, + "loss": 0.9041, + "step": 3022 + }, + { + "epoch": 0.1629825318093595, + "grad_norm": 0.8658211827278137, + "learning_rate": 9.961647052104517e-06, + "loss": 0.8721, + "step": 3023 + }, + { + "epoch": 0.16303644597800301, + "grad_norm": 0.812038242816925, + "learning_rate": 9.961620839063507e-06, + "loss": 0.8715, + "step": 3024 + }, + { + "epoch": 0.16309036014664655, + "grad_norm": 0.7646269798278809, + "learning_rate": 9.961594617102169e-06, + "loss": 0.7805, + "step": 3025 + }, + { + "epoch": 0.16314427431529005, + "grad_norm": 0.7684099674224854, + "learning_rate": 9.961568386220553e-06, + "loss": 0.8214, + "step": 3026 + }, + { + "epoch": 0.16319818848393358, + "grad_norm": 0.888566255569458, + "learning_rate": 9.961542146418706e-06, + "loss": 0.8972, + "step": 3027 + }, + { + "epoch": 0.16325210265257709, + "grad_norm": 0.8100109100341797, + "learning_rate": 9.961515897696675e-06, + "loss": 0.7337, + "step": 3028 + }, + { + "epoch": 0.16330601682122062, + "grad_norm": 0.8838690519332886, + "learning_rate": 
9.96148964005451e-06, + "loss": 0.7148, + "step": 3029 + }, + { + "epoch": 0.16335993098986415, + "grad_norm": 0.7518458962440491, + "learning_rate": 9.961463373492253e-06, + "loss": 0.7127, + "step": 3030 + }, + { + "epoch": 0.16341384515850765, + "grad_norm": 0.8280466198921204, + "learning_rate": 9.961437098009956e-06, + "loss": 0.7569, + "step": 3031 + }, + { + "epoch": 0.16346775932715119, + "grad_norm": 0.7333472371101379, + "learning_rate": 9.961410813607663e-06, + "loss": 0.7984, + "step": 3032 + }, + { + "epoch": 0.1635216734957947, + "grad_norm": 0.8064109086990356, + "learning_rate": 9.961384520285423e-06, + "loss": 0.8255, + "step": 3033 + }, + { + "epoch": 0.16357558766443822, + "grad_norm": 0.8310550451278687, + "learning_rate": 9.961358218043282e-06, + "loss": 0.828, + "step": 3034 + }, + { + "epoch": 0.16362950183308173, + "grad_norm": 0.8141489028930664, + "learning_rate": 9.961331906881289e-06, + "loss": 0.8121, + "step": 3035 + }, + { + "epoch": 0.16368341600172526, + "grad_norm": 0.9229308366775513, + "learning_rate": 9.96130558679949e-06, + "loss": 0.9288, + "step": 3036 + }, + { + "epoch": 0.16373733017036876, + "grad_norm": 0.9087804555892944, + "learning_rate": 9.961279257797933e-06, + "loss": 0.8725, + "step": 3037 + }, + { + "epoch": 0.1637912443390123, + "grad_norm": 0.8357719779014587, + "learning_rate": 9.961252919876665e-06, + "loss": 0.8413, + "step": 3038 + }, + { + "epoch": 0.16384515850765582, + "grad_norm": 0.8311809301376343, + "learning_rate": 9.961226573035734e-06, + "loss": 0.885, + "step": 3039 + }, + { + "epoch": 0.16389907267629933, + "grad_norm": 0.7797298431396484, + "learning_rate": 9.961200217275185e-06, + "loss": 0.8767, + "step": 3040 + }, + { + "epoch": 0.16395298684494286, + "grad_norm": 0.8659999370574951, + "learning_rate": 9.961173852595069e-06, + "loss": 0.7852, + "step": 3041 + }, + { + "epoch": 0.16400690101358636, + "grad_norm": 0.8036298155784607, + "learning_rate": 9.96114747899543e-06, + "loss": 0.8122, + 
"step": 3042 + }, + { + "epoch": 0.1640608151822299, + "grad_norm": 0.8683627843856812, + "learning_rate": 9.961121096476318e-06, + "loss": 0.8197, + "step": 3043 + }, + { + "epoch": 0.1641147293508734, + "grad_norm": 0.8885881900787354, + "learning_rate": 9.96109470503778e-06, + "loss": 0.7302, + "step": 3044 + }, + { + "epoch": 0.16416864351951693, + "grad_norm": 0.7480132579803467, + "learning_rate": 9.961068304679861e-06, + "loss": 0.7938, + "step": 3045 + }, + { + "epoch": 0.16422255768816044, + "grad_norm": 0.680261492729187, + "learning_rate": 9.96104189540261e-06, + "loss": 0.7016, + "step": 3046 + }, + { + "epoch": 0.16427647185680397, + "grad_norm": 0.8690764904022217, + "learning_rate": 9.961015477206078e-06, + "loss": 0.7716, + "step": 3047 + }, + { + "epoch": 0.1643303860254475, + "grad_norm": 0.8533129692077637, + "learning_rate": 9.960989050090306e-06, + "loss": 0.8561, + "step": 3048 + }, + { + "epoch": 0.164384300194091, + "grad_norm": 0.6941283345222473, + "learning_rate": 9.960962614055345e-06, + "loss": 0.6501, + "step": 3049 + }, + { + "epoch": 0.16443821436273454, + "grad_norm": 0.9178086519241333, + "learning_rate": 9.960936169101244e-06, + "loss": 0.8511, + "step": 3050 + }, + { + "epoch": 0.16449212853137804, + "grad_norm": 0.7419497966766357, + "learning_rate": 9.960909715228049e-06, + "loss": 0.7331, + "step": 3051 + }, + { + "epoch": 0.16454604270002157, + "grad_norm": 0.879289984703064, + "learning_rate": 9.960883252435807e-06, + "loss": 0.8969, + "step": 3052 + }, + { + "epoch": 0.16459995686866508, + "grad_norm": 0.7679347991943359, + "learning_rate": 9.960856780724563e-06, + "loss": 0.7467, + "step": 3053 + }, + { + "epoch": 0.1646538710373086, + "grad_norm": 0.7927586436271667, + "learning_rate": 9.960830300094371e-06, + "loss": 0.7479, + "step": 3054 + }, + { + "epoch": 0.1647077852059521, + "grad_norm": 0.7693600058555603, + "learning_rate": 9.960803810545275e-06, + "loss": 0.8421, + "step": 3055 + }, + { + "epoch": 
0.16476169937459564, + "grad_norm": 0.8548445105552673, + "learning_rate": 9.96077731207732e-06, + "loss": 0.8104, + "step": 3056 + }, + { + "epoch": 0.16481561354323918, + "grad_norm": 0.8420791029930115, + "learning_rate": 9.960750804690559e-06, + "loss": 0.6974, + "step": 3057 + }, + { + "epoch": 0.16486952771188268, + "grad_norm": 0.7880173921585083, + "learning_rate": 9.960724288385037e-06, + "loss": 0.7723, + "step": 3058 + }, + { + "epoch": 0.1649234418805262, + "grad_norm": 0.8810162544250488, + "learning_rate": 9.960697763160803e-06, + "loss": 0.7488, + "step": 3059 + }, + { + "epoch": 0.16497735604916972, + "grad_norm": 0.9951279759407043, + "learning_rate": 9.9606712290179e-06, + "loss": 0.8119, + "step": 3060 + }, + { + "epoch": 0.16503127021781325, + "grad_norm": 0.755189836025238, + "learning_rate": 9.960644685956383e-06, + "loss": 0.7568, + "step": 3061 + }, + { + "epoch": 0.16508518438645675, + "grad_norm": 0.99064040184021, + "learning_rate": 9.960618133976292e-06, + "loss": 0.8493, + "step": 3062 + }, + { + "epoch": 0.16513909855510028, + "grad_norm": 0.8672367334365845, + "learning_rate": 9.960591573077682e-06, + "loss": 0.7961, + "step": 3063 + }, + { + "epoch": 0.1651930127237438, + "grad_norm": 0.9614015817642212, + "learning_rate": 9.960565003260596e-06, + "loss": 0.8894, + "step": 3064 + }, + { + "epoch": 0.16524692689238732, + "grad_norm": 0.7433729767799377, + "learning_rate": 9.960538424525083e-06, + "loss": 0.7586, + "step": 3065 + }, + { + "epoch": 0.16530084106103085, + "grad_norm": 0.8151267766952515, + "learning_rate": 9.96051183687119e-06, + "loss": 0.8311, + "step": 3066 + }, + { + "epoch": 0.16535475522967436, + "grad_norm": 0.9241605401039124, + "learning_rate": 9.960485240298967e-06, + "loss": 0.8526, + "step": 3067 + }, + { + "epoch": 0.1654086693983179, + "grad_norm": 0.8612751364707947, + "learning_rate": 9.96045863480846e-06, + "loss": 0.7672, + "step": 3068 + }, + { + "epoch": 0.1654625835669614, + "grad_norm": 
0.8707523345947266, + "learning_rate": 9.960432020399719e-06, + "loss": 0.7862, + "step": 3069 + }, + { + "epoch": 0.16551649773560492, + "grad_norm": 0.8456318378448486, + "learning_rate": 9.960405397072788e-06, + "loss": 0.8221, + "step": 3070 + }, + { + "epoch": 0.16557041190424843, + "grad_norm": 0.7929409742355347, + "learning_rate": 9.960378764827719e-06, + "loss": 0.8438, + "step": 3071 + }, + { + "epoch": 0.16562432607289196, + "grad_norm": 0.8241098523139954, + "learning_rate": 9.960352123664556e-06, + "loss": 0.7769, + "step": 3072 + }, + { + "epoch": 0.16567824024153546, + "grad_norm": 0.9634597301483154, + "learning_rate": 9.96032547358335e-06, + "loss": 0.8323, + "step": 3073 + }, + { + "epoch": 0.165732154410179, + "grad_norm": 0.6783578395843506, + "learning_rate": 9.960298814584148e-06, + "loss": 0.6585, + "step": 3074 + }, + { + "epoch": 0.16578606857882253, + "grad_norm": 0.756289005279541, + "learning_rate": 9.960272146666997e-06, + "loss": 0.7109, + "step": 3075 + }, + { + "epoch": 0.16583998274746603, + "grad_norm": 0.8414442539215088, + "learning_rate": 9.960245469831947e-06, + "loss": 0.7543, + "step": 3076 + }, + { + "epoch": 0.16589389691610956, + "grad_norm": 0.7551240921020508, + "learning_rate": 9.960218784079044e-06, + "loss": 0.7131, + "step": 3077 + }, + { + "epoch": 0.16594781108475307, + "grad_norm": 0.8211004137992859, + "learning_rate": 9.960192089408335e-06, + "loss": 0.8335, + "step": 3078 + }, + { + "epoch": 0.1660017252533966, + "grad_norm": 0.7540998458862305, + "learning_rate": 9.960165385819873e-06, + "loss": 0.7557, + "step": 3079 + }, + { + "epoch": 0.1660556394220401, + "grad_norm": 0.7917600274085999, + "learning_rate": 9.9601386733137e-06, + "loss": 0.7522, + "step": 3080 + }, + { + "epoch": 0.16610955359068363, + "grad_norm": 0.9180947542190552, + "learning_rate": 9.960111951889868e-06, + "loss": 0.7943, + "step": 3081 + }, + { + "epoch": 0.16616346775932714, + "grad_norm": 0.8169807195663452, + "learning_rate": 
9.960085221548422e-06, + "loss": 0.8633, + "step": 3082 + }, + { + "epoch": 0.16621738192797067, + "grad_norm": 0.8790155649185181, + "learning_rate": 9.960058482289413e-06, + "loss": 0.8265, + "step": 3083 + }, + { + "epoch": 0.1662712960966142, + "grad_norm": 0.8958606123924255, + "learning_rate": 9.960031734112887e-06, + "loss": 0.8601, + "step": 3084 + }, + { + "epoch": 0.1663252102652577, + "grad_norm": 0.8116661906242371, + "learning_rate": 9.960004977018893e-06, + "loss": 0.8203, + "step": 3085 + }, + { + "epoch": 0.16637912443390124, + "grad_norm": 0.771135687828064, + "learning_rate": 9.95997821100748e-06, + "loss": 0.7258, + "step": 3086 + }, + { + "epoch": 0.16643303860254474, + "grad_norm": 0.9094653725624084, + "learning_rate": 9.959951436078696e-06, + "loss": 0.9094, + "step": 3087 + }, + { + "epoch": 0.16648695277118827, + "grad_norm": 0.9042958617210388, + "learning_rate": 9.959924652232586e-06, + "loss": 0.7434, + "step": 3088 + }, + { + "epoch": 0.16654086693983178, + "grad_norm": 0.7170906662940979, + "learning_rate": 9.959897859469201e-06, + "loss": 0.7134, + "step": 3089 + }, + { + "epoch": 0.1665947811084753, + "grad_norm": 0.7896520495414734, + "learning_rate": 9.959871057788589e-06, + "loss": 0.7727, + "step": 3090 + }, + { + "epoch": 0.1666486952771188, + "grad_norm": 0.9295204281806946, + "learning_rate": 9.959844247190797e-06, + "loss": 0.8928, + "step": 3091 + }, + { + "epoch": 0.16670260944576235, + "grad_norm": 0.8025391101837158, + "learning_rate": 9.959817427675875e-06, + "loss": 0.7808, + "step": 3092 + }, + { + "epoch": 0.16675652361440588, + "grad_norm": 0.9727420210838318, + "learning_rate": 9.95979059924387e-06, + "loss": 0.9677, + "step": 3093 + }, + { + "epoch": 0.16681043778304938, + "grad_norm": 0.8534692525863647, + "learning_rate": 9.95976376189483e-06, + "loss": 0.8642, + "step": 3094 + }, + { + "epoch": 0.1668643519516929, + "grad_norm": 0.8361443877220154, + "learning_rate": 9.959736915628803e-06, + "loss": 0.8746, + 
"step": 3095 + }, + { + "epoch": 0.16691826612033642, + "grad_norm": 0.8551936745643616, + "learning_rate": 9.95971006044584e-06, + "loss": 0.7973, + "step": 3096 + }, + { + "epoch": 0.16697218028897995, + "grad_norm": 0.6986585259437561, + "learning_rate": 9.959683196345987e-06, + "loss": 0.6689, + "step": 3097 + }, + { + "epoch": 0.16702609445762345, + "grad_norm": 0.9048603773117065, + "learning_rate": 9.959656323329291e-06, + "loss": 0.7924, + "step": 3098 + }, + { + "epoch": 0.16708000862626698, + "grad_norm": 0.8295788764953613, + "learning_rate": 9.959629441395802e-06, + "loss": 0.843, + "step": 3099 + }, + { + "epoch": 0.1671339227949105, + "grad_norm": 0.838590681552887, + "learning_rate": 9.959602550545568e-06, + "loss": 0.7615, + "step": 3100 + }, + { + "epoch": 0.16718783696355402, + "grad_norm": 0.8323560357093811, + "learning_rate": 9.959575650778639e-06, + "loss": 0.8375, + "step": 3101 + }, + { + "epoch": 0.16724175113219755, + "grad_norm": 0.8825474381446838, + "learning_rate": 9.959548742095062e-06, + "loss": 0.7701, + "step": 3102 + }, + { + "epoch": 0.16729566530084106, + "grad_norm": 0.8911004662513733, + "learning_rate": 9.959521824494884e-06, + "loss": 0.8, + "step": 3103 + }, + { + "epoch": 0.1673495794694846, + "grad_norm": 0.76695317029953, + "learning_rate": 9.959494897978154e-06, + "loss": 0.7177, + "step": 3104 + }, + { + "epoch": 0.1674034936381281, + "grad_norm": 0.9462987184524536, + "learning_rate": 9.959467962544922e-06, + "loss": 0.8479, + "step": 3105 + }, + { + "epoch": 0.16745740780677162, + "grad_norm": 0.7185036540031433, + "learning_rate": 9.959441018195235e-06, + "loss": 0.6444, + "step": 3106 + }, + { + "epoch": 0.16751132197541513, + "grad_norm": 0.9797527194023132, + "learning_rate": 9.959414064929143e-06, + "loss": 0.916, + "step": 3107 + }, + { + "epoch": 0.16756523614405866, + "grad_norm": 0.7815739512443542, + "learning_rate": 9.959387102746693e-06, + "loss": 0.7315, + "step": 3108 + }, + { + "epoch": 
0.1676191503127022, + "grad_norm": 0.9536890387535095, + "learning_rate": 9.959360131647933e-06, + "loss": 0.7795, + "step": 3109 + }, + { + "epoch": 0.1676730644813457, + "grad_norm": 0.7770065069198608, + "learning_rate": 9.959333151632913e-06, + "loss": 0.8203, + "step": 3110 + }, + { + "epoch": 0.16772697864998923, + "grad_norm": 0.8031367659568787, + "learning_rate": 9.959306162701681e-06, + "loss": 0.8362, + "step": 3111 + }, + { + "epoch": 0.16778089281863273, + "grad_norm": 0.8009032011032104, + "learning_rate": 9.959279164854286e-06, + "loss": 0.8113, + "step": 3112 + }, + { + "epoch": 0.16783480698727626, + "grad_norm": 0.8091812133789062, + "learning_rate": 9.959252158090775e-06, + "loss": 0.84, + "step": 3113 + }, + { + "epoch": 0.16788872115591977, + "grad_norm": 0.7102682590484619, + "learning_rate": 9.959225142411197e-06, + "loss": 0.7378, + "step": 3114 + }, + { + "epoch": 0.1679426353245633, + "grad_norm": 0.8190940618515015, + "learning_rate": 9.959198117815602e-06, + "loss": 0.8478, + "step": 3115 + }, + { + "epoch": 0.1679965494932068, + "grad_norm": 0.7320457696914673, + "learning_rate": 9.959171084304037e-06, + "loss": 0.8358, + "step": 3116 + }, + { + "epoch": 0.16805046366185034, + "grad_norm": 0.8222710490226746, + "learning_rate": 9.959144041876551e-06, + "loss": 0.809, + "step": 3117 + }, + { + "epoch": 0.16810437783049387, + "grad_norm": 0.7939282059669495, + "learning_rate": 9.959116990533195e-06, + "loss": 0.8562, + "step": 3118 + }, + { + "epoch": 0.16815829199913737, + "grad_norm": 0.7231613993644714, + "learning_rate": 9.959089930274013e-06, + "loss": 0.7656, + "step": 3119 + }, + { + "epoch": 0.1682122061677809, + "grad_norm": 0.8997424840927124, + "learning_rate": 9.959062861099058e-06, + "loss": 0.8831, + "step": 3120 + }, + { + "epoch": 0.1682661203364244, + "grad_norm": 0.80366450548172, + "learning_rate": 9.959035783008374e-06, + "loss": 0.8044, + "step": 3121 + }, + { + "epoch": 0.16832003450506794, + "grad_norm": 
0.8153119683265686, + "learning_rate": 9.959008696002015e-06, + "loss": 0.8325, + "step": 3122 + }, + { + "epoch": 0.16837394867371144, + "grad_norm": 0.8638020157814026, + "learning_rate": 9.958981600080026e-06, + "loss": 0.8197, + "step": 3123 + }, + { + "epoch": 0.16842786284235498, + "grad_norm": 0.8430980443954468, + "learning_rate": 9.95895449524246e-06, + "loss": 0.8212, + "step": 3124 + }, + { + "epoch": 0.16848177701099848, + "grad_norm": 0.9273066520690918, + "learning_rate": 9.958927381489358e-06, + "loss": 0.8145, + "step": 3125 + }, + { + "epoch": 0.168535691179642, + "grad_norm": 0.8697495460510254, + "learning_rate": 9.958900258820777e-06, + "loss": 0.8519, + "step": 3126 + }, + { + "epoch": 0.16858960534828554, + "grad_norm": 0.7957634925842285, + "learning_rate": 9.95887312723676e-06, + "loss": 0.8065, + "step": 3127 + }, + { + "epoch": 0.16864351951692905, + "grad_norm": 0.8890637755393982, + "learning_rate": 9.958845986737357e-06, + "loss": 0.822, + "step": 3128 + }, + { + "epoch": 0.16869743368557258, + "grad_norm": 0.7979970574378967, + "learning_rate": 9.95881883732262e-06, + "loss": 0.8346, + "step": 3129 + }, + { + "epoch": 0.16875134785421608, + "grad_norm": 0.8589211106300354, + "learning_rate": 9.958791678992594e-06, + "loss": 0.7498, + "step": 3130 + }, + { + "epoch": 0.16880526202285961, + "grad_norm": 0.7819254398345947, + "learning_rate": 9.95876451174733e-06, + "loss": 0.7515, + "step": 3131 + }, + { + "epoch": 0.16885917619150312, + "grad_norm": 0.9037144184112549, + "learning_rate": 9.958737335586877e-06, + "loss": 0.7684, + "step": 3132 + }, + { + "epoch": 0.16891309036014665, + "grad_norm": 0.9139670133590698, + "learning_rate": 9.958710150511282e-06, + "loss": 0.7848, + "step": 3133 + }, + { + "epoch": 0.16896700452879015, + "grad_norm": 0.8177505135536194, + "learning_rate": 9.958682956520596e-06, + "loss": 0.8656, + "step": 3134 + }, + { + "epoch": 0.1690209186974337, + "grad_norm": 0.7351679801940918, + "learning_rate": 
9.958655753614865e-06, + "loss": 0.769, + "step": 3135 + }, + { + "epoch": 0.16907483286607722, + "grad_norm": 0.8661699891090393, + "learning_rate": 9.958628541794142e-06, + "loss": 0.8523, + "step": 3136 + }, + { + "epoch": 0.16912874703472072, + "grad_norm": 0.7755950689315796, + "learning_rate": 9.958601321058471e-06, + "loss": 0.7737, + "step": 3137 + }, + { + "epoch": 0.16918266120336425, + "grad_norm": 0.8523197174072266, + "learning_rate": 9.958574091407906e-06, + "loss": 0.8508, + "step": 3138 + }, + { + "epoch": 0.16923657537200776, + "grad_norm": 0.7154935598373413, + "learning_rate": 9.958546852842493e-06, + "loss": 0.6725, + "step": 3139 + }, + { + "epoch": 0.1692904895406513, + "grad_norm": 0.8140445947647095, + "learning_rate": 9.95851960536228e-06, + "loss": 0.92, + "step": 3140 + }, + { + "epoch": 0.1693444037092948, + "grad_norm": 0.7320675849914551, + "learning_rate": 9.95849234896732e-06, + "loss": 0.8091, + "step": 3141 + }, + { + "epoch": 0.16939831787793833, + "grad_norm": 0.7761030197143555, + "learning_rate": 9.958465083657659e-06, + "loss": 0.7444, + "step": 3142 + }, + { + "epoch": 0.16945223204658183, + "grad_norm": 0.8432923555374146, + "learning_rate": 9.958437809433345e-06, + "loss": 0.8112, + "step": 3143 + }, + { + "epoch": 0.16950614621522536, + "grad_norm": 0.8015188574790955, + "learning_rate": 9.958410526294428e-06, + "loss": 0.8383, + "step": 3144 + }, + { + "epoch": 0.1695600603838689, + "grad_norm": 0.7635226845741272, + "learning_rate": 9.95838323424096e-06, + "loss": 0.7942, + "step": 3145 + }, + { + "epoch": 0.1696139745525124, + "grad_norm": 0.942131757736206, + "learning_rate": 9.958355933272986e-06, + "loss": 0.8877, + "step": 3146 + }, + { + "epoch": 0.16966788872115593, + "grad_norm": 1.1072907447814941, + "learning_rate": 9.958328623390558e-06, + "loss": 0.7369, + "step": 3147 + }, + { + "epoch": 0.16972180288979943, + "grad_norm": 0.8342657685279846, + "learning_rate": 9.958301304593722e-06, + "loss": 0.7946, + 
"step": 3148 + }, + { + "epoch": 0.16977571705844297, + "grad_norm": 0.7320284843444824, + "learning_rate": 9.958273976882531e-06, + "loss": 0.754, + "step": 3149 + }, + { + "epoch": 0.16982963122708647, + "grad_norm": 0.7840715646743774, + "learning_rate": 9.958246640257031e-06, + "loss": 0.7897, + "step": 3150 + }, + { + "epoch": 0.16988354539573, + "grad_norm": 0.7383304834365845, + "learning_rate": 9.958219294717273e-06, + "loss": 0.8205, + "step": 3151 + }, + { + "epoch": 0.1699374595643735, + "grad_norm": 0.7597193121910095, + "learning_rate": 9.958191940263305e-06, + "loss": 0.8016, + "step": 3152 + }, + { + "epoch": 0.16999137373301704, + "grad_norm": 0.7770809531211853, + "learning_rate": 9.958164576895176e-06, + "loss": 0.7228, + "step": 3153 + }, + { + "epoch": 0.17004528790166057, + "grad_norm": 0.891514241695404, + "learning_rate": 9.958137204612936e-06, + "loss": 0.8598, + "step": 3154 + }, + { + "epoch": 0.17009920207030407, + "grad_norm": 0.8025946021080017, + "learning_rate": 9.958109823416635e-06, + "loss": 0.8979, + "step": 3155 + }, + { + "epoch": 0.1701531162389476, + "grad_norm": 0.7912386059761047, + "learning_rate": 9.95808243330632e-06, + "loss": 0.7562, + "step": 3156 + }, + { + "epoch": 0.1702070304075911, + "grad_norm": 0.8642987608909607, + "learning_rate": 9.958055034282043e-06, + "loss": 0.7916, + "step": 3157 + }, + { + "epoch": 0.17026094457623464, + "grad_norm": 0.8047364950180054, + "learning_rate": 9.958027626343852e-06, + "loss": 0.7598, + "step": 3158 + }, + { + "epoch": 0.17031485874487814, + "grad_norm": 0.8402281999588013, + "learning_rate": 9.958000209491794e-06, + "loss": 0.8572, + "step": 3159 + }, + { + "epoch": 0.17036877291352168, + "grad_norm": 0.7486295700073242, + "learning_rate": 9.95797278372592e-06, + "loss": 0.7221, + "step": 3160 + }, + { + "epoch": 0.17042268708216518, + "grad_norm": 0.7889320254325867, + "learning_rate": 9.95794534904628e-06, + "loss": 0.7734, + "step": 3161 + }, + { + "epoch": 
0.1704766012508087, + "grad_norm": 0.7864039540290833, + "learning_rate": 9.957917905452925e-06, + "loss": 0.7763, + "step": 3162 + }, + { + "epoch": 0.17053051541945224, + "grad_norm": 0.8366582989692688, + "learning_rate": 9.957890452945903e-06, + "loss": 0.8594, + "step": 3163 + }, + { + "epoch": 0.17058442958809575, + "grad_norm": 0.8014213442802429, + "learning_rate": 9.95786299152526e-06, + "loss": 0.7802, + "step": 3164 + }, + { + "epoch": 0.17063834375673928, + "grad_norm": 0.8158774375915527, + "learning_rate": 9.957835521191048e-06, + "loss": 0.7693, + "step": 3165 + }, + { + "epoch": 0.17069225792538278, + "grad_norm": 1.0622320175170898, + "learning_rate": 9.957808041943316e-06, + "loss": 0.8949, + "step": 3166 + }, + { + "epoch": 0.17074617209402632, + "grad_norm": 0.7825013399124146, + "learning_rate": 9.957780553782114e-06, + "loss": 0.7681, + "step": 3167 + }, + { + "epoch": 0.17080008626266982, + "grad_norm": 1.0727826356887817, + "learning_rate": 9.957753056707493e-06, + "loss": 0.876, + "step": 3168 + }, + { + "epoch": 0.17085400043131335, + "grad_norm": 0.7952837944030762, + "learning_rate": 9.9577255507195e-06, + "loss": 0.7671, + "step": 3169 + }, + { + "epoch": 0.17090791459995686, + "grad_norm": 0.7251336574554443, + "learning_rate": 9.957698035818185e-06, + "loss": 0.7938, + "step": 3170 + }, + { + "epoch": 0.1709618287686004, + "grad_norm": 0.8674930930137634, + "learning_rate": 9.957670512003598e-06, + "loss": 0.9387, + "step": 3171 + }, + { + "epoch": 0.17101574293724392, + "grad_norm": 0.7578595876693726, + "learning_rate": 9.957642979275787e-06, + "loss": 0.8295, + "step": 3172 + }, + { + "epoch": 0.17106965710588742, + "grad_norm": 0.8236204385757446, + "learning_rate": 9.957615437634802e-06, + "loss": 0.871, + "step": 3173 + }, + { + "epoch": 0.17112357127453096, + "grad_norm": 0.7528506517410278, + "learning_rate": 9.957587887080696e-06, + "loss": 0.7034, + "step": 3174 + }, + { + "epoch": 0.17117748544317446, + "grad_norm": 
0.8170275092124939, + "learning_rate": 9.957560327613514e-06, + "loss": 0.7412, + "step": 3175 + }, + { + "epoch": 0.171231399611818, + "grad_norm": 0.91305011510849, + "learning_rate": 9.957532759233307e-06, + "loss": 0.8861, + "step": 3176 + }, + { + "epoch": 0.1712853137804615, + "grad_norm": 0.7793359756469727, + "learning_rate": 9.957505181940124e-06, + "loss": 0.8106, + "step": 3177 + }, + { + "epoch": 0.17133922794910503, + "grad_norm": 0.9424631595611572, + "learning_rate": 9.957477595734016e-06, + "loss": 0.8271, + "step": 3178 + }, + { + "epoch": 0.17139314211774853, + "grad_norm": 0.8909611701965332, + "learning_rate": 9.957450000615031e-06, + "loss": 0.8711, + "step": 3179 + }, + { + "epoch": 0.17144705628639206, + "grad_norm": 0.703960657119751, + "learning_rate": 9.95742239658322e-06, + "loss": 0.6693, + "step": 3180 + }, + { + "epoch": 0.1715009704550356, + "grad_norm": 0.8511449098587036, + "learning_rate": 9.957394783638632e-06, + "loss": 0.8075, + "step": 3181 + }, + { + "epoch": 0.1715548846236791, + "grad_norm": 0.93243008852005, + "learning_rate": 9.957367161781318e-06, + "loss": 0.8663, + "step": 3182 + }, + { + "epoch": 0.17160879879232263, + "grad_norm": 0.926092803478241, + "learning_rate": 9.957339531011325e-06, + "loss": 0.8973, + "step": 3183 + }, + { + "epoch": 0.17166271296096614, + "grad_norm": 0.8564586043357849, + "learning_rate": 9.957311891328705e-06, + "loss": 0.7561, + "step": 3184 + }, + { + "epoch": 0.17171662712960967, + "grad_norm": 0.8317960500717163, + "learning_rate": 9.957284242733507e-06, + "loss": 0.817, + "step": 3185 + }, + { + "epoch": 0.17177054129825317, + "grad_norm": 0.7291557788848877, + "learning_rate": 9.95725658522578e-06, + "loss": 0.6963, + "step": 3186 + }, + { + "epoch": 0.1718244554668967, + "grad_norm": 0.8154743313789368, + "learning_rate": 9.957228918805574e-06, + "loss": 0.8005, + "step": 3187 + }, + { + "epoch": 0.1718783696355402, + "grad_norm": 0.7985217571258545, + "learning_rate": 
9.95720124347294e-06, + "loss": 0.8471, + "step": 3188 + }, + { + "epoch": 0.17193228380418374, + "grad_norm": 0.7928630709648132, + "learning_rate": 9.957173559227926e-06, + "loss": 0.8809, + "step": 3189 + }, + { + "epoch": 0.17198619797282727, + "grad_norm": 0.800392210483551, + "learning_rate": 9.957145866070583e-06, + "loss": 0.8031, + "step": 3190 + }, + { + "epoch": 0.17204011214147077, + "grad_norm": 0.8904628157615662, + "learning_rate": 9.95711816400096e-06, + "loss": 0.7583, + "step": 3191 + }, + { + "epoch": 0.1720940263101143, + "grad_norm": 0.7246114611625671, + "learning_rate": 9.957090453019106e-06, + "loss": 0.7365, + "step": 3192 + }, + { + "epoch": 0.1721479404787578, + "grad_norm": 0.8280320763587952, + "learning_rate": 9.957062733125074e-06, + "loss": 0.7723, + "step": 3193 + }, + { + "epoch": 0.17220185464740134, + "grad_norm": 0.929804265499115, + "learning_rate": 9.957035004318911e-06, + "loss": 0.8412, + "step": 3194 + }, + { + "epoch": 0.17225576881604485, + "grad_norm": 0.815108060836792, + "learning_rate": 9.957007266600666e-06, + "loss": 0.8076, + "step": 3195 + }, + { + "epoch": 0.17230968298468838, + "grad_norm": 0.7849567532539368, + "learning_rate": 9.956979519970393e-06, + "loss": 0.8245, + "step": 3196 + }, + { + "epoch": 0.17236359715333188, + "grad_norm": 1.458945393562317, + "learning_rate": 9.956951764428138e-06, + "loss": 0.7647, + "step": 3197 + }, + { + "epoch": 0.17241751132197541, + "grad_norm": 0.8327317833900452, + "learning_rate": 9.956923999973954e-06, + "loss": 0.8824, + "step": 3198 + }, + { + "epoch": 0.17247142549061895, + "grad_norm": 0.7398284077644348, + "learning_rate": 9.956896226607887e-06, + "loss": 0.7907, + "step": 3199 + }, + { + "epoch": 0.17252533965926245, + "grad_norm": 0.8546818494796753, + "learning_rate": 9.95686844432999e-06, + "loss": 0.8723, + "step": 3200 + }, + { + "epoch": 0.17257925382790598, + "grad_norm": 0.7967200875282288, + "learning_rate": 9.956840653140311e-06, + "loss": 0.8156, + 
"step": 3201 + }, + { + "epoch": 0.17263316799654949, + "grad_norm": 0.9093504548072815, + "learning_rate": 9.956812853038903e-06, + "loss": 0.8002, + "step": 3202 + }, + { + "epoch": 0.17268708216519302, + "grad_norm": 0.7995857000350952, + "learning_rate": 9.956785044025811e-06, + "loss": 0.8413, + "step": 3203 + }, + { + "epoch": 0.17274099633383652, + "grad_norm": 0.828748881816864, + "learning_rate": 9.95675722610109e-06, + "loss": 0.7162, + "step": 3204 + }, + { + "epoch": 0.17279491050248005, + "grad_norm": 0.7679111361503601, + "learning_rate": 9.956729399264789e-06, + "loss": 0.7909, + "step": 3205 + }, + { + "epoch": 0.17284882467112356, + "grad_norm": 0.9187313318252563, + "learning_rate": 9.956701563516956e-06, + "loss": 0.8537, + "step": 3206 + }, + { + "epoch": 0.1729027388397671, + "grad_norm": 0.7859029173851013, + "learning_rate": 9.956673718857642e-06, + "loss": 0.7392, + "step": 3207 + }, + { + "epoch": 0.17295665300841062, + "grad_norm": 0.8365893363952637, + "learning_rate": 9.956645865286897e-06, + "loss": 0.7921, + "step": 3208 + }, + { + "epoch": 0.17301056717705413, + "grad_norm": 0.912382960319519, + "learning_rate": 9.956618002804771e-06, + "loss": 0.8651, + "step": 3209 + }, + { + "epoch": 0.17306448134569766, + "grad_norm": 0.7380210757255554, + "learning_rate": 9.956590131411314e-06, + "loss": 0.7031, + "step": 3210 + }, + { + "epoch": 0.17311839551434116, + "grad_norm": 0.7943229675292969, + "learning_rate": 9.956562251106578e-06, + "loss": 0.7725, + "step": 3211 + }, + { + "epoch": 0.1731723096829847, + "grad_norm": 0.8835777640342712, + "learning_rate": 9.95653436189061e-06, + "loss": 0.8633, + "step": 3212 + }, + { + "epoch": 0.1732262238516282, + "grad_norm": 0.8082174062728882, + "learning_rate": 9.956506463763464e-06, + "loss": 0.8833, + "step": 3213 + }, + { + "epoch": 0.17328013802027173, + "grad_norm": 0.8236085772514343, + "learning_rate": 9.956478556725186e-06, + "loss": 0.8517, + "step": 3214 + }, + { + "epoch": 
0.17333405218891526, + "grad_norm": 0.8428922891616821, + "learning_rate": 9.956450640775829e-06, + "loss": 0.8659, + "step": 3215 + }, + { + "epoch": 0.17338796635755876, + "grad_norm": 0.8443105220794678, + "learning_rate": 9.95642271591544e-06, + "loss": 0.9589, + "step": 3216 + }, + { + "epoch": 0.1734418805262023, + "grad_norm": 0.7856699228286743, + "learning_rate": 9.956394782144074e-06, + "loss": 0.787, + "step": 3217 + }, + { + "epoch": 0.1734957946948458, + "grad_norm": 0.8537113666534424, + "learning_rate": 9.95636683946178e-06, + "loss": 0.9339, + "step": 3218 + }, + { + "epoch": 0.17354970886348933, + "grad_norm": 0.8206045627593994, + "learning_rate": 9.956338887868603e-06, + "loss": 0.832, + "step": 3219 + }, + { + "epoch": 0.17360362303213284, + "grad_norm": 0.7913991808891296, + "learning_rate": 9.956310927364599e-06, + "loss": 0.7647, + "step": 3220 + }, + { + "epoch": 0.17365753720077637, + "grad_norm": 0.9481332302093506, + "learning_rate": 9.956282957949817e-06, + "loss": 0.7113, + "step": 3221 + }, + { + "epoch": 0.17371145136941987, + "grad_norm": 0.9326061606407166, + "learning_rate": 9.956254979624304e-06, + "loss": 0.8324, + "step": 3222 + }, + { + "epoch": 0.1737653655380634, + "grad_norm": 1.0496339797973633, + "learning_rate": 9.956226992388117e-06, + "loss": 0.7959, + "step": 3223 + }, + { + "epoch": 0.17381927970670694, + "grad_norm": 0.8025851249694824, + "learning_rate": 9.9561989962413e-06, + "loss": 0.811, + "step": 3224 + }, + { + "epoch": 0.17387319387535044, + "grad_norm": 0.9083681106567383, + "learning_rate": 9.956170991183905e-06, + "loss": 0.7957, + "step": 3225 + }, + { + "epoch": 0.17392710804399397, + "grad_norm": 0.8242226243019104, + "learning_rate": 9.956142977215983e-06, + "loss": 0.8224, + "step": 3226 + }, + { + "epoch": 0.17398102221263748, + "grad_norm": 0.8805774450302124, + "learning_rate": 9.956114954337586e-06, + "loss": 0.8847, + "step": 3227 + }, + { + "epoch": 0.174034936381281, + "grad_norm": 
0.748651921749115, + "learning_rate": 9.956086922548761e-06, + "loss": 0.7719, + "step": 3228 + }, + { + "epoch": 0.1740888505499245, + "grad_norm": 0.7385552525520325, + "learning_rate": 9.956058881849562e-06, + "loss": 0.7591, + "step": 3229 + }, + { + "epoch": 0.17414276471856804, + "grad_norm": 0.7795779705047607, + "learning_rate": 9.956030832240037e-06, + "loss": 0.8071, + "step": 3230 + }, + { + "epoch": 0.17419667888721155, + "grad_norm": 9.106490135192871, + "learning_rate": 9.956002773720236e-06, + "loss": 0.7915, + "step": 3231 + }, + { + "epoch": 0.17425059305585508, + "grad_norm": 0.861794650554657, + "learning_rate": 9.955974706290212e-06, + "loss": 0.8293, + "step": 3232 + }, + { + "epoch": 0.1743045072244986, + "grad_norm": 0.8002027869224548, + "learning_rate": 9.955946629950012e-06, + "loss": 0.8404, + "step": 3233 + }, + { + "epoch": 0.17435842139314212, + "grad_norm": 0.8162701725959778, + "learning_rate": 9.95591854469969e-06, + "loss": 0.8362, + "step": 3234 + }, + { + "epoch": 0.17441233556178565, + "grad_norm": 0.7436956763267517, + "learning_rate": 9.955890450539295e-06, + "loss": 0.8339, + "step": 3235 + }, + { + "epoch": 0.17446624973042915, + "grad_norm": 0.8074719309806824, + "learning_rate": 9.955862347468875e-06, + "loss": 0.8403, + "step": 3236 + }, + { + "epoch": 0.17452016389907268, + "grad_norm": 0.8527933955192566, + "learning_rate": 9.955834235488485e-06, + "loss": 0.8201, + "step": 3237 + }, + { + "epoch": 0.1745740780677162, + "grad_norm": 0.792177140712738, + "learning_rate": 9.955806114598173e-06, + "loss": 0.8304, + "step": 3238 + }, + { + "epoch": 0.17462799223635972, + "grad_norm": 0.8211845755577087, + "learning_rate": 9.95577798479799e-06, + "loss": 0.8013, + "step": 3239 + }, + { + "epoch": 0.17468190640500322, + "grad_norm": 0.906973659992218, + "learning_rate": 9.955749846087986e-06, + "loss": 0.823, + "step": 3240 + }, + { + "epoch": 0.17473582057364676, + "grad_norm": 0.904077410697937, + "learning_rate": 
9.955721698468213e-06, + "loss": 0.7651, + "step": 3241 + }, + { + "epoch": 0.1747897347422903, + "grad_norm": 0.8147358298301697, + "learning_rate": 9.95569354193872e-06, + "loss": 0.9268, + "step": 3242 + }, + { + "epoch": 0.1748436489109338, + "grad_norm": 0.8664659857749939, + "learning_rate": 9.95566537649956e-06, + "loss": 0.8366, + "step": 3243 + }, + { + "epoch": 0.17489756307957732, + "grad_norm": 0.6882225871086121, + "learning_rate": 9.95563720215078e-06, + "loss": 0.7152, + "step": 3244 + }, + { + "epoch": 0.17495147724822083, + "grad_norm": 0.7605637907981873, + "learning_rate": 9.955609018892434e-06, + "loss": 0.7864, + "step": 3245 + }, + { + "epoch": 0.17500539141686436, + "grad_norm": 0.7316586375236511, + "learning_rate": 9.95558082672457e-06, + "loss": 0.7175, + "step": 3246 + }, + { + "epoch": 0.17505930558550786, + "grad_norm": 0.8258477449417114, + "learning_rate": 9.955552625647241e-06, + "loss": 0.8463, + "step": 3247 + }, + { + "epoch": 0.1751132197541514, + "grad_norm": 0.7658422589302063, + "learning_rate": 9.955524415660498e-06, + "loss": 0.9477, + "step": 3248 + }, + { + "epoch": 0.1751671339227949, + "grad_norm": 0.9374455809593201, + "learning_rate": 9.955496196764387e-06, + "loss": 0.8725, + "step": 3249 + }, + { + "epoch": 0.17522104809143843, + "grad_norm": 0.7676389813423157, + "learning_rate": 9.955467968958965e-06, + "loss": 0.7868, + "step": 3250 + }, + { + "epoch": 0.17527496226008196, + "grad_norm": 0.9800841808319092, + "learning_rate": 9.955439732244279e-06, + "loss": 0.7787, + "step": 3251 + }, + { + "epoch": 0.17532887642872547, + "grad_norm": 0.7501618266105652, + "learning_rate": 9.95541148662038e-06, + "loss": 0.7703, + "step": 3252 + }, + { + "epoch": 0.175382790597369, + "grad_norm": 0.8019260168075562, + "learning_rate": 9.95538323208732e-06, + "loss": 0.7635, + "step": 3253 + }, + { + "epoch": 0.1754367047660125, + "grad_norm": 0.7791414260864258, + "learning_rate": 9.95535496864515e-06, + "loss": 0.7372, + "step": 
3254 + }, + { + "epoch": 0.17549061893465603, + "grad_norm": 0.7667005658149719, + "learning_rate": 9.955326696293921e-06, + "loss": 0.8481, + "step": 3255 + }, + { + "epoch": 0.17554453310329954, + "grad_norm": 0.7585765719413757, + "learning_rate": 9.955298415033681e-06, + "loss": 0.7933, + "step": 3256 + }, + { + "epoch": 0.17559844727194307, + "grad_norm": 0.8037384152412415, + "learning_rate": 9.955270124864485e-06, + "loss": 0.8716, + "step": 3257 + }, + { + "epoch": 0.17565236144058657, + "grad_norm": 0.7610961198806763, + "learning_rate": 9.955241825786379e-06, + "loss": 0.7647, + "step": 3258 + }, + { + "epoch": 0.1757062756092301, + "grad_norm": 0.7867752909660339, + "learning_rate": 9.955213517799418e-06, + "loss": 0.7685, + "step": 3259 + }, + { + "epoch": 0.17576018977787364, + "grad_norm": 1.1530165672302246, + "learning_rate": 9.955185200903652e-06, + "loss": 0.9032, + "step": 3260 + }, + { + "epoch": 0.17581410394651714, + "grad_norm": 0.7161276936531067, + "learning_rate": 9.955156875099129e-06, + "loss": 0.7367, + "step": 3261 + }, + { + "epoch": 0.17586801811516067, + "grad_norm": 0.7634873390197754, + "learning_rate": 9.955128540385903e-06, + "loss": 0.6914, + "step": 3262 + }, + { + "epoch": 0.17592193228380418, + "grad_norm": 0.8375166654586792, + "learning_rate": 9.955100196764025e-06, + "loss": 0.965, + "step": 3263 + }, + { + "epoch": 0.1759758464524477, + "grad_norm": 0.784824788570404, + "learning_rate": 9.955071844233545e-06, + "loss": 0.7825, + "step": 3264 + }, + { + "epoch": 0.1760297606210912, + "grad_norm": 0.7765333652496338, + "learning_rate": 9.955043482794514e-06, + "loss": 0.9057, + "step": 3265 + }, + { + "epoch": 0.17608367478973475, + "grad_norm": 0.9159989356994629, + "learning_rate": 9.955015112446985e-06, + "loss": 0.8055, + "step": 3266 + }, + { + "epoch": 0.17613758895837825, + "grad_norm": 0.8813021183013916, + "learning_rate": 9.954986733191003e-06, + "loss": 0.8811, + "step": 3267 + }, + { + "epoch": 
0.17619150312702178, + "grad_norm": 0.7664482593536377, + "learning_rate": 9.954958345026627e-06, + "loss": 0.7138, + "step": 3268 + }, + { + "epoch": 0.1762454172956653, + "grad_norm": 0.8903096914291382, + "learning_rate": 9.954929947953902e-06, + "loss": 0.8884, + "step": 3269 + }, + { + "epoch": 0.17629933146430882, + "grad_norm": 0.750549852848053, + "learning_rate": 9.95490154197288e-06, + "loss": 0.7948, + "step": 3270 + }, + { + "epoch": 0.17635324563295235, + "grad_norm": 0.8723561763763428, + "learning_rate": 9.954873127083615e-06, + "loss": 0.8896, + "step": 3271 + }, + { + "epoch": 0.17640715980159585, + "grad_norm": 0.8852900862693787, + "learning_rate": 9.954844703286157e-06, + "loss": 0.8504, + "step": 3272 + }, + { + "epoch": 0.17646107397023938, + "grad_norm": 0.8535251021385193, + "learning_rate": 9.954816270580555e-06, + "loss": 0.7198, + "step": 3273 + }, + { + "epoch": 0.1765149881388829, + "grad_norm": 0.8378668427467346, + "learning_rate": 9.954787828966864e-06, + "loss": 0.8361, + "step": 3274 + }, + { + "epoch": 0.17656890230752642, + "grad_norm": 0.7617664337158203, + "learning_rate": 9.954759378445132e-06, + "loss": 0.8147, + "step": 3275 + }, + { + "epoch": 0.17662281647616992, + "grad_norm": 0.8433284163475037, + "learning_rate": 9.95473091901541e-06, + "loss": 0.9083, + "step": 3276 + }, + { + "epoch": 0.17667673064481346, + "grad_norm": 0.82453453540802, + "learning_rate": 9.954702450677749e-06, + "loss": 0.8646, + "step": 3277 + }, + { + "epoch": 0.176730644813457, + "grad_norm": 0.8066715598106384, + "learning_rate": 9.954673973432202e-06, + "loss": 0.7837, + "step": 3278 + }, + { + "epoch": 0.1767845589821005, + "grad_norm": 0.7899057865142822, + "learning_rate": 9.95464548727882e-06, + "loss": 0.8418, + "step": 3279 + }, + { + "epoch": 0.17683847315074402, + "grad_norm": 0.7744193077087402, + "learning_rate": 9.954616992217654e-06, + "loss": 0.7316, + "step": 3280 + }, + { + "epoch": 0.17689238731938753, + "grad_norm": 
0.9195299744606018, + "learning_rate": 9.954588488248756e-06, + "loss": 0.9387, + "step": 3281 + }, + { + "epoch": 0.17694630148803106, + "grad_norm": 0.9263700246810913, + "learning_rate": 9.954559975372173e-06, + "loss": 0.7165, + "step": 3282 + }, + { + "epoch": 0.17700021565667456, + "grad_norm": 0.7949888706207275, + "learning_rate": 9.954531453587962e-06, + "loss": 0.7981, + "step": 3283 + }, + { + "epoch": 0.1770541298253181, + "grad_norm": 0.9938671588897705, + "learning_rate": 9.95450292289617e-06, + "loss": 0.754, + "step": 3284 + }, + { + "epoch": 0.1771080439939616, + "grad_norm": 0.7466611862182617, + "learning_rate": 9.95447438329685e-06, + "loss": 0.8182, + "step": 3285 + }, + { + "epoch": 0.17716195816260513, + "grad_norm": 0.7918881177902222, + "learning_rate": 9.954445834790054e-06, + "loss": 0.6938, + "step": 3286 + }, + { + "epoch": 0.17721587233124866, + "grad_norm": 0.7867146730422974, + "learning_rate": 9.954417277375832e-06, + "loss": 0.7999, + "step": 3287 + }, + { + "epoch": 0.17726978649989217, + "grad_norm": 0.7873522043228149, + "learning_rate": 9.954388711054237e-06, + "loss": 0.7822, + "step": 3288 + }, + { + "epoch": 0.1773237006685357, + "grad_norm": 0.7909482717514038, + "learning_rate": 9.954360135825319e-06, + "loss": 0.724, + "step": 3289 + }, + { + "epoch": 0.1773776148371792, + "grad_norm": 0.7893263697624207, + "learning_rate": 9.954331551689129e-06, + "loss": 0.8892, + "step": 3290 + }, + { + "epoch": 0.17743152900582274, + "grad_norm": 0.813908040523529, + "learning_rate": 9.954302958645719e-06, + "loss": 0.8261, + "step": 3291 + }, + { + "epoch": 0.17748544317446624, + "grad_norm": 1.0279232263565063, + "learning_rate": 9.95427435669514e-06, + "loss": 0.8383, + "step": 3292 + }, + { + "epoch": 0.17753935734310977, + "grad_norm": 0.7427249550819397, + "learning_rate": 9.954245745837445e-06, + "loss": 0.7883, + "step": 3293 + }, + { + "epoch": 0.17759327151175328, + "grad_norm": 0.7699581980705261, + "learning_rate": 
9.954217126072686e-06, + "loss": 0.749, + "step": 3294 + }, + { + "epoch": 0.1776471856803968, + "grad_norm": 0.8005263209342957, + "learning_rate": 9.954188497400909e-06, + "loss": 0.7886, + "step": 3295 + }, + { + "epoch": 0.17770109984904034, + "grad_norm": 0.8718039393424988, + "learning_rate": 9.95415985982217e-06, + "loss": 0.8397, + "step": 3296 + }, + { + "epoch": 0.17775501401768384, + "grad_norm": 0.7747098207473755, + "learning_rate": 9.954131213336522e-06, + "loss": 0.7193, + "step": 3297 + }, + { + "epoch": 0.17780892818632738, + "grad_norm": 0.8327599167823792, + "learning_rate": 9.954102557944013e-06, + "loss": 0.8484, + "step": 3298 + }, + { + "epoch": 0.17786284235497088, + "grad_norm": 0.7737470269203186, + "learning_rate": 9.954073893644696e-06, + "loss": 0.7638, + "step": 3299 + }, + { + "epoch": 0.1779167565236144, + "grad_norm": 0.8054937124252319, + "learning_rate": 9.954045220438622e-06, + "loss": 0.7772, + "step": 3300 + }, + { + "epoch": 0.17797067069225792, + "grad_norm": 0.7954006195068359, + "learning_rate": 9.954016538325844e-06, + "loss": 0.7746, + "step": 3301 + }, + { + "epoch": 0.17802458486090145, + "grad_norm": 0.8075349926948547, + "learning_rate": 9.95398784730641e-06, + "loss": 0.794, + "step": 3302 + }, + { + "epoch": 0.17807849902954495, + "grad_norm": 0.8701021075248718, + "learning_rate": 9.953959147380376e-06, + "loss": 0.8493, + "step": 3303 + }, + { + "epoch": 0.17813241319818848, + "grad_norm": 0.9046748876571655, + "learning_rate": 9.953930438547792e-06, + "loss": 0.8491, + "step": 3304 + }, + { + "epoch": 0.17818632736683201, + "grad_norm": 0.8041692972183228, + "learning_rate": 9.953901720808708e-06, + "loss": 0.7422, + "step": 3305 + }, + { + "epoch": 0.17824024153547552, + "grad_norm": 0.8486021757125854, + "learning_rate": 9.953872994163176e-06, + "loss": 0.7876, + "step": 3306 + }, + { + "epoch": 0.17829415570411905, + "grad_norm": 0.7282015085220337, + "learning_rate": 9.95384425861125e-06, + "loss": 0.7729, + 
"step": 3307 + }, + { + "epoch": 0.17834806987276255, + "grad_norm": 0.8199304342269897, + "learning_rate": 9.953815514152979e-06, + "loss": 0.8046, + "step": 3308 + }, + { + "epoch": 0.1784019840414061, + "grad_norm": 0.9033650755882263, + "learning_rate": 9.953786760788416e-06, + "loss": 0.735, + "step": 3309 + }, + { + "epoch": 0.1784558982100496, + "grad_norm": 1.1363990306854248, + "learning_rate": 9.953757998517614e-06, + "loss": 0.8351, + "step": 3310 + }, + { + "epoch": 0.17850981237869312, + "grad_norm": 0.747763454914093, + "learning_rate": 9.953729227340621e-06, + "loss": 0.7603, + "step": 3311 + }, + { + "epoch": 0.17856372654733663, + "grad_norm": 0.8733643293380737, + "learning_rate": 9.953700447257493e-06, + "loss": 0.8538, + "step": 3312 + }, + { + "epoch": 0.17861764071598016, + "grad_norm": 0.8054553270339966, + "learning_rate": 9.953671658268279e-06, + "loss": 0.6782, + "step": 3313 + }, + { + "epoch": 0.1786715548846237, + "grad_norm": 0.8797160387039185, + "learning_rate": 9.953642860373032e-06, + "loss": 0.613, + "step": 3314 + }, + { + "epoch": 0.1787254690532672, + "grad_norm": 0.7065737843513489, + "learning_rate": 9.953614053571802e-06, + "loss": 0.7912, + "step": 3315 + }, + { + "epoch": 0.17877938322191073, + "grad_norm": 0.8206682205200195, + "learning_rate": 9.953585237864642e-06, + "loss": 0.8505, + "step": 3316 + }, + { + "epoch": 0.17883329739055423, + "grad_norm": 0.7129380702972412, + "learning_rate": 9.953556413251605e-06, + "loss": 0.7242, + "step": 3317 + }, + { + "epoch": 0.17888721155919776, + "grad_norm": 0.8084376454353333, + "learning_rate": 9.953527579732742e-06, + "loss": 0.7626, + "step": 3318 + }, + { + "epoch": 0.17894112572784127, + "grad_norm": 0.8610605001449585, + "learning_rate": 9.953498737308103e-06, + "loss": 0.8255, + "step": 3319 + }, + { + "epoch": 0.1789950398964848, + "grad_norm": 0.7437496185302734, + "learning_rate": 9.953469885977742e-06, + "loss": 0.677, + "step": 3320 + }, + { + "epoch": 
0.17904895406512833, + "grad_norm": 0.7540122270584106, + "learning_rate": 9.95344102574171e-06, + "loss": 0.7094, + "step": 3321 + }, + { + "epoch": 0.17910286823377183, + "grad_norm": 0.8017913699150085, + "learning_rate": 9.95341215660006e-06, + "loss": 0.8882, + "step": 3322 + }, + { + "epoch": 0.17915678240241537, + "grad_norm": 1.0244393348693848, + "learning_rate": 9.953383278552841e-06, + "loss": 0.7879, + "step": 3323 + }, + { + "epoch": 0.17921069657105887, + "grad_norm": 0.7007571458816528, + "learning_rate": 9.953354391600109e-06, + "loss": 0.757, + "step": 3324 + }, + { + "epoch": 0.1792646107397024, + "grad_norm": 0.8408647775650024, + "learning_rate": 9.953325495741913e-06, + "loss": 0.7772, + "step": 3325 + }, + { + "epoch": 0.1793185249083459, + "grad_norm": 0.718988299369812, + "learning_rate": 9.953296590978305e-06, + "loss": 0.7885, + "step": 3326 + }, + { + "epoch": 0.17937243907698944, + "grad_norm": 0.7917525768280029, + "learning_rate": 9.95326767730934e-06, + "loss": 0.8321, + "step": 3327 + }, + { + "epoch": 0.17942635324563294, + "grad_norm": 0.9516105055809021, + "learning_rate": 9.953238754735066e-06, + "loss": 0.8124, + "step": 3328 + }, + { + "epoch": 0.17948026741427647, + "grad_norm": 0.8829317688941956, + "learning_rate": 9.953209823255536e-06, + "loss": 0.7426, + "step": 3329 + }, + { + "epoch": 0.17953418158292, + "grad_norm": 0.83402019739151, + "learning_rate": 9.953180882870805e-06, + "loss": 0.7358, + "step": 3330 + }, + { + "epoch": 0.1795880957515635, + "grad_norm": 0.819425106048584, + "learning_rate": 9.953151933580923e-06, + "loss": 0.8002, + "step": 3331 + }, + { + "epoch": 0.17964200992020704, + "grad_norm": 0.8458916544914246, + "learning_rate": 9.95312297538594e-06, + "loss": 0.8305, + "step": 3332 + }, + { + "epoch": 0.17969592408885054, + "grad_norm": 0.8235782980918884, + "learning_rate": 9.95309400828591e-06, + "loss": 0.8228, + "step": 3333 + }, + { + "epoch": 0.17974983825749408, + "grad_norm": 
0.7924965023994446, + "learning_rate": 9.953065032280885e-06, + "loss": 0.7369, + "step": 3334 + }, + { + "epoch": 0.17980375242613758, + "grad_norm": 0.7931050658226013, + "learning_rate": 9.953036047370919e-06, + "loss": 0.8337, + "step": 3335 + }, + { + "epoch": 0.1798576665947811, + "grad_norm": 0.7998207211494446, + "learning_rate": 9.95300705355606e-06, + "loss": 0.7341, + "step": 3336 + }, + { + "epoch": 0.17991158076342462, + "grad_norm": 0.713846743106842, + "learning_rate": 9.952978050836364e-06, + "loss": 0.6958, + "step": 3337 + }, + { + "epoch": 0.17996549493206815, + "grad_norm": 0.807744026184082, + "learning_rate": 9.95294903921188e-06, + "loss": 0.7723, + "step": 3338 + }, + { + "epoch": 0.18001940910071168, + "grad_norm": 0.865696370601654, + "learning_rate": 9.95292001868266e-06, + "loss": 0.8957, + "step": 3339 + }, + { + "epoch": 0.18007332326935518, + "grad_norm": 0.7955803871154785, + "learning_rate": 9.952890989248763e-06, + "loss": 0.7632, + "step": 3340 + }, + { + "epoch": 0.18012723743799872, + "grad_norm": 0.8028436303138733, + "learning_rate": 9.952861950910233e-06, + "loss": 0.8642, + "step": 3341 + }, + { + "epoch": 0.18018115160664222, + "grad_norm": 0.8755636215209961, + "learning_rate": 9.952832903667125e-06, + "loss": 0.8521, + "step": 3342 + }, + { + "epoch": 0.18023506577528575, + "grad_norm": 0.8018125891685486, + "learning_rate": 9.952803847519492e-06, + "loss": 0.8719, + "step": 3343 + }, + { + "epoch": 0.18028897994392926, + "grad_norm": 0.6923267245292664, + "learning_rate": 9.952774782467384e-06, + "loss": 0.718, + "step": 3344 + }, + { + "epoch": 0.1803428941125728, + "grad_norm": 0.7926875948905945, + "learning_rate": 9.952745708510856e-06, + "loss": 0.8657, + "step": 3345 + }, + { + "epoch": 0.1803968082812163, + "grad_norm": 0.8815774917602539, + "learning_rate": 9.95271662564996e-06, + "loss": 0.8196, + "step": 3346 + }, + { + "epoch": 0.18045072244985982, + "grad_norm": 0.8497309684753418, + "learning_rate": 
9.952687533884748e-06, + "loss": 0.7563, + "step": 3347 + }, + { + "epoch": 0.18050463661850336, + "grad_norm": 0.7040117979049683, + "learning_rate": 9.952658433215269e-06, + "loss": 0.687, + "step": 3348 + }, + { + "epoch": 0.18055855078714686, + "grad_norm": 0.8446635007858276, + "learning_rate": 9.95262932364158e-06, + "loss": 0.895, + "step": 3349 + }, + { + "epoch": 0.1806124649557904, + "grad_norm": 0.821702778339386, + "learning_rate": 9.952600205163733e-06, + "loss": 0.8387, + "step": 3350 + }, + { + "epoch": 0.1806663791244339, + "grad_norm": 0.9755251407623291, + "learning_rate": 9.952571077781776e-06, + "loss": 0.9119, + "step": 3351 + }, + { + "epoch": 0.18072029329307743, + "grad_norm": 0.8260585069656372, + "learning_rate": 9.952541941495766e-06, + "loss": 0.7827, + "step": 3352 + }, + { + "epoch": 0.18077420746172093, + "grad_norm": 0.7443965673446655, + "learning_rate": 9.952512796305753e-06, + "loss": 0.7331, + "step": 3353 + }, + { + "epoch": 0.18082812163036446, + "grad_norm": 0.8674094676971436, + "learning_rate": 9.95248364221179e-06, + "loss": 0.8789, + "step": 3354 + }, + { + "epoch": 0.18088203579900797, + "grad_norm": 0.7950018644332886, + "learning_rate": 9.952454479213929e-06, + "loss": 0.7802, + "step": 3355 + }, + { + "epoch": 0.1809359499676515, + "grad_norm": 0.8740068078041077, + "learning_rate": 9.952425307312223e-06, + "loss": 0.9354, + "step": 3356 + }, + { + "epoch": 0.18098986413629503, + "grad_norm": 0.8254936933517456, + "learning_rate": 9.952396126506724e-06, + "loss": 0.8903, + "step": 3357 + }, + { + "epoch": 0.18104377830493854, + "grad_norm": 0.7814514636993408, + "learning_rate": 9.952366936797484e-06, + "loss": 0.7214, + "step": 3358 + }, + { + "epoch": 0.18109769247358207, + "grad_norm": 0.7647988796234131, + "learning_rate": 9.952337738184557e-06, + "loss": 0.7591, + "step": 3359 + }, + { + "epoch": 0.18115160664222557, + "grad_norm": 0.8247759938240051, + "learning_rate": 9.952308530667996e-06, + "loss": 0.7825, + 
"step": 3360 + }, + { + "epoch": 0.1812055208108691, + "grad_norm": 0.724585771560669, + "learning_rate": 9.95227931424785e-06, + "loss": 0.7828, + "step": 3361 + }, + { + "epoch": 0.1812594349795126, + "grad_norm": 0.8304919004440308, + "learning_rate": 9.952250088924175e-06, + "loss": 0.8071, + "step": 3362 + }, + { + "epoch": 0.18131334914815614, + "grad_norm": 0.8318499326705933, + "learning_rate": 9.95222085469702e-06, + "loss": 0.7571, + "step": 3363 + }, + { + "epoch": 0.18136726331679964, + "grad_norm": 0.7315414547920227, + "learning_rate": 9.952191611566443e-06, + "loss": 0.7644, + "step": 3364 + }, + { + "epoch": 0.18142117748544317, + "grad_norm": 0.853285551071167, + "learning_rate": 9.952162359532493e-06, + "loss": 0.8946, + "step": 3365 + }, + { + "epoch": 0.1814750916540867, + "grad_norm": 0.8418978452682495, + "learning_rate": 9.95213309859522e-06, + "loss": 0.7892, + "step": 3366 + }, + { + "epoch": 0.1815290058227302, + "grad_norm": 0.7926337122917175, + "learning_rate": 9.952103828754682e-06, + "loss": 0.7182, + "step": 3367 + }, + { + "epoch": 0.18158291999137374, + "grad_norm": 0.9103478193283081, + "learning_rate": 9.95207455001093e-06, + "loss": 0.8474, + "step": 3368 + }, + { + "epoch": 0.18163683416001725, + "grad_norm": 0.8050599098205566, + "learning_rate": 9.952045262364014e-06, + "loss": 0.7581, + "step": 3369 + }, + { + "epoch": 0.18169074832866078, + "grad_norm": 0.7441660165786743, + "learning_rate": 9.952015965813988e-06, + "loss": 0.7713, + "step": 3370 + }, + { + "epoch": 0.18174466249730428, + "grad_norm": 0.7210862636566162, + "learning_rate": 9.951986660360906e-06, + "loss": 0.7732, + "step": 3371 + }, + { + "epoch": 0.18179857666594781, + "grad_norm": 0.8199747204780579, + "learning_rate": 9.951957346004822e-06, + "loss": 0.8697, + "step": 3372 + }, + { + "epoch": 0.18185249083459132, + "grad_norm": 0.7781465649604797, + "learning_rate": 9.951928022745784e-06, + "loss": 0.8011, + "step": 3373 + }, + { + "epoch": 
0.18190640500323485, + "grad_norm": 0.8713019490242004, + "learning_rate": 9.951898690583848e-06, + "loss": 0.8328, + "step": 3374 + }, + { + "epoch": 0.18196031917187838, + "grad_norm": 0.7194361686706543, + "learning_rate": 9.951869349519066e-06, + "loss": 0.7291, + "step": 3375 + }, + { + "epoch": 0.18201423334052189, + "grad_norm": 0.7940298914909363, + "learning_rate": 9.95183999955149e-06, + "loss": 0.8128, + "step": 3376 + }, + { + "epoch": 0.18206814750916542, + "grad_norm": 0.8048009872436523, + "learning_rate": 9.951810640681175e-06, + "loss": 0.7627, + "step": 3377 + }, + { + "epoch": 0.18212206167780892, + "grad_norm": 0.8479227423667908, + "learning_rate": 9.951781272908173e-06, + "loss": 0.7587, + "step": 3378 + }, + { + "epoch": 0.18217597584645245, + "grad_norm": 0.8620457053184509, + "learning_rate": 9.951751896232534e-06, + "loss": 0.7409, + "step": 3379 + }, + { + "epoch": 0.18222989001509596, + "grad_norm": 0.8283497095108032, + "learning_rate": 9.951722510654314e-06, + "loss": 0.7953, + "step": 3380 + }, + { + "epoch": 0.1822838041837395, + "grad_norm": 0.9071113467216492, + "learning_rate": 9.951693116173565e-06, + "loss": 0.8476, + "step": 3381 + }, + { + "epoch": 0.182337718352383, + "grad_norm": 0.8383519053459167, + "learning_rate": 9.951663712790338e-06, + "loss": 0.8388, + "step": 3382 + }, + { + "epoch": 0.18239163252102653, + "grad_norm": 0.8026612997055054, + "learning_rate": 9.951634300504689e-06, + "loss": 0.8848, + "step": 3383 + }, + { + "epoch": 0.18244554668967006, + "grad_norm": 0.8395872116088867, + "learning_rate": 9.951604879316667e-06, + "loss": 0.7759, + "step": 3384 + }, + { + "epoch": 0.18249946085831356, + "grad_norm": 1.1459238529205322, + "learning_rate": 9.95157544922633e-06, + "loss": 0.8005, + "step": 3385 + }, + { + "epoch": 0.1825533750269571, + "grad_norm": 0.8083657026290894, + "learning_rate": 9.951546010233729e-06, + "loss": 0.8298, + "step": 3386 + }, + { + "epoch": 0.1826072891956006, + "grad_norm": 
0.8329801559448242, + "learning_rate": 9.951516562338912e-06, + "loss": 0.7743, + "step": 3387 + }, + { + "epoch": 0.18266120336424413, + "grad_norm": 0.7916942834854126, + "learning_rate": 9.951487105541939e-06, + "loss": 0.7934, + "step": 3388 + }, + { + "epoch": 0.18271511753288763, + "grad_norm": 0.8752714395523071, + "learning_rate": 9.951457639842861e-06, + "loss": 0.8031, + "step": 3389 + }, + { + "epoch": 0.18276903170153116, + "grad_norm": 0.7645601630210876, + "learning_rate": 9.951428165241728e-06, + "loss": 0.6987, + "step": 3390 + }, + { + "epoch": 0.18282294587017467, + "grad_norm": 0.9860275983810425, + "learning_rate": 9.951398681738595e-06, + "loss": 0.8027, + "step": 3391 + }, + { + "epoch": 0.1828768600388182, + "grad_norm": 0.8548283576965332, + "learning_rate": 9.951369189333515e-06, + "loss": 0.8595, + "step": 3392 + }, + { + "epoch": 0.18293077420746173, + "grad_norm": 0.843217670917511, + "learning_rate": 9.95133968802654e-06, + "loss": 0.8437, + "step": 3393 + }, + { + "epoch": 0.18298468837610524, + "grad_norm": 0.7996432781219482, + "learning_rate": 9.951310177817726e-06, + "loss": 0.7229, + "step": 3394 + }, + { + "epoch": 0.18303860254474877, + "grad_norm": 0.8908971548080444, + "learning_rate": 9.951280658707124e-06, + "loss": 0.8639, + "step": 3395 + }, + { + "epoch": 0.18309251671339227, + "grad_norm": 0.9041224718093872, + "learning_rate": 9.951251130694787e-06, + "loss": 0.8026, + "step": 3396 + }, + { + "epoch": 0.1831464308820358, + "grad_norm": 0.7458503842353821, + "learning_rate": 9.951221593780768e-06, + "loss": 0.8228, + "step": 3397 + }, + { + "epoch": 0.1832003450506793, + "grad_norm": 0.8241537809371948, + "learning_rate": 9.95119204796512e-06, + "loss": 0.7937, + "step": 3398 + }, + { + "epoch": 0.18325425921932284, + "grad_norm": 0.8728781342506409, + "learning_rate": 9.951162493247897e-06, + "loss": 0.8829, + "step": 3399 + }, + { + "epoch": 0.18330817338796634, + "grad_norm": 0.843101978302002, + "learning_rate": 
9.95113292962915e-06, + "loss": 0.9562, + "step": 3400 + }, + { + "epoch": 0.18336208755660988, + "grad_norm": 1.031156301498413, + "learning_rate": 9.951103357108935e-06, + "loss": 0.6757, + "step": 3401 + }, + { + "epoch": 0.1834160017252534, + "grad_norm": 0.9858013391494751, + "learning_rate": 9.951073775687304e-06, + "loss": 0.7922, + "step": 3402 + }, + { + "epoch": 0.1834699158938969, + "grad_norm": 0.9532352685928345, + "learning_rate": 9.95104418536431e-06, + "loss": 0.8979, + "step": 3403 + }, + { + "epoch": 0.18352383006254044, + "grad_norm": 0.9552246332168579, + "learning_rate": 9.951014586140006e-06, + "loss": 0.8682, + "step": 3404 + }, + { + "epoch": 0.18357774423118395, + "grad_norm": 0.8952224850654602, + "learning_rate": 9.950984978014446e-06, + "loss": 0.9064, + "step": 3405 + }, + { + "epoch": 0.18363165839982748, + "grad_norm": 0.8228804469108582, + "learning_rate": 9.950955360987684e-06, + "loss": 0.8337, + "step": 3406 + }, + { + "epoch": 0.18368557256847098, + "grad_norm": 0.8621776103973389, + "learning_rate": 9.95092573505977e-06, + "loss": 0.8418, + "step": 3407 + }, + { + "epoch": 0.18373948673711452, + "grad_norm": 0.8312029242515564, + "learning_rate": 9.95089610023076e-06, + "loss": 0.8453, + "step": 3408 + }, + { + "epoch": 0.18379340090575802, + "grad_norm": 0.8212811350822449, + "learning_rate": 9.950866456500706e-06, + "loss": 0.7226, + "step": 3409 + }, + { + "epoch": 0.18384731507440155, + "grad_norm": 0.7918773293495178, + "learning_rate": 9.950836803869663e-06, + "loss": 0.7546, + "step": 3410 + }, + { + "epoch": 0.18390122924304508, + "grad_norm": 0.8544521331787109, + "learning_rate": 9.950807142337682e-06, + "loss": 0.8975, + "step": 3411 + }, + { + "epoch": 0.1839551434116886, + "grad_norm": 0.7909727692604065, + "learning_rate": 9.950777471904818e-06, + "loss": 0.8266, + "step": 3412 + }, + { + "epoch": 0.18400905758033212, + "grad_norm": 0.7834721207618713, + "learning_rate": 9.950747792571122e-06, + "loss": 0.7647, + 
"step": 3413 + }, + { + "epoch": 0.18406297174897562, + "grad_norm": 1.0084491968154907, + "learning_rate": 9.950718104336651e-06, + "loss": 0.8954, + "step": 3414 + }, + { + "epoch": 0.18411688591761916, + "grad_norm": 0.9300922155380249, + "learning_rate": 9.950688407201457e-06, + "loss": 0.8106, + "step": 3415 + }, + { + "epoch": 0.18417080008626266, + "grad_norm": 0.7957245111465454, + "learning_rate": 9.950658701165593e-06, + "loss": 0.7556, + "step": 3416 + }, + { + "epoch": 0.1842247142549062, + "grad_norm": 0.7386512160301208, + "learning_rate": 9.950628986229111e-06, + "loss": 0.7384, + "step": 3417 + }, + { + "epoch": 0.1842786284235497, + "grad_norm": 0.8791146874427795, + "learning_rate": 9.950599262392067e-06, + "loss": 0.7681, + "step": 3418 + }, + { + "epoch": 0.18433254259219323, + "grad_norm": 0.78180330991745, + "learning_rate": 9.950569529654512e-06, + "loss": 0.7641, + "step": 3419 + }, + { + "epoch": 0.18438645676083676, + "grad_norm": 0.7648051977157593, + "learning_rate": 9.950539788016502e-06, + "loss": 0.7782, + "step": 3420 + }, + { + "epoch": 0.18444037092948026, + "grad_norm": 0.8135426640510559, + "learning_rate": 9.950510037478089e-06, + "loss": 0.8313, + "step": 3421 + }, + { + "epoch": 0.1844942850981238, + "grad_norm": 0.8623054623603821, + "learning_rate": 9.950480278039325e-06, + "loss": 0.8142, + "step": 3422 + }, + { + "epoch": 0.1845481992667673, + "grad_norm": 0.774558424949646, + "learning_rate": 9.950450509700267e-06, + "loss": 0.7747, + "step": 3423 + }, + { + "epoch": 0.18460211343541083, + "grad_norm": 0.7947419285774231, + "learning_rate": 9.950420732460965e-06, + "loss": 0.8757, + "step": 3424 + }, + { + "epoch": 0.18465602760405433, + "grad_norm": 0.8677110075950623, + "learning_rate": 9.950390946321475e-06, + "loss": 0.8527, + "step": 3425 + }, + { + "epoch": 0.18470994177269787, + "grad_norm": 0.8350674510002136, + "learning_rate": 9.950361151281852e-06, + "loss": 0.7209, + "step": 3426 + }, + { + "epoch": 
0.1847638559413414, + "grad_norm": 0.7326707243919373, + "learning_rate": 9.950331347342143e-06, + "loss": 0.749, + "step": 3427 + }, + { + "epoch": 0.1848177701099849, + "grad_norm": 0.8775684237480164, + "learning_rate": 9.95030153450241e-06, + "loss": 0.762, + "step": 3428 + }, + { + "epoch": 0.18487168427862843, + "grad_norm": 0.8116014003753662, + "learning_rate": 9.9502717127627e-06, + "loss": 0.7592, + "step": 3429 + }, + { + "epoch": 0.18492559844727194, + "grad_norm": 0.7852542996406555, + "learning_rate": 9.950241882123068e-06, + "loss": 0.8254, + "step": 3430 + }, + { + "epoch": 0.18497951261591547, + "grad_norm": 0.761076807975769, + "learning_rate": 9.950212042583571e-06, + "loss": 0.7444, + "step": 3431 + }, + { + "epoch": 0.18503342678455897, + "grad_norm": 0.914729118347168, + "learning_rate": 9.95018219414426e-06, + "loss": 0.8847, + "step": 3432 + }, + { + "epoch": 0.1850873409532025, + "grad_norm": 0.7256419062614441, + "learning_rate": 9.950152336805188e-06, + "loss": 0.7069, + "step": 3433 + }, + { + "epoch": 0.185141255121846, + "grad_norm": 0.7481849193572998, + "learning_rate": 9.950122470566411e-06, + "loss": 0.7921, + "step": 3434 + }, + { + "epoch": 0.18519516929048954, + "grad_norm": 0.7878799438476562, + "learning_rate": 9.95009259542798e-06, + "loss": 0.7422, + "step": 3435 + }, + { + "epoch": 0.18524908345913307, + "grad_norm": 0.8083212375640869, + "learning_rate": 9.950062711389953e-06, + "loss": 0.8445, + "step": 3436 + }, + { + "epoch": 0.18530299762777658, + "grad_norm": 0.9458408355712891, + "learning_rate": 9.950032818452377e-06, + "loss": 0.771, + "step": 3437 + }, + { + "epoch": 0.1853569117964201, + "grad_norm": 0.7575398087501526, + "learning_rate": 9.950002916615311e-06, + "loss": 0.765, + "step": 3438 + }, + { + "epoch": 0.1854108259650636, + "grad_norm": 0.8672422766685486, + "learning_rate": 9.94997300587881e-06, + "loss": 0.8499, + "step": 3439 + }, + { + "epoch": 0.18546474013370715, + "grad_norm": 0.7971605658531189, 
+ "learning_rate": 9.949943086242923e-06, + "loss": 0.8617, + "step": 3440 + }, + { + "epoch": 0.18551865430235065, + "grad_norm": 1.0215446949005127, + "learning_rate": 9.949913157707704e-06, + "loss": 0.8224, + "step": 3441 + }, + { + "epoch": 0.18557256847099418, + "grad_norm": 0.7983795404434204, + "learning_rate": 9.949883220273211e-06, + "loss": 0.7497, + "step": 3442 + }, + { + "epoch": 0.18562648263963769, + "grad_norm": 0.8548665642738342, + "learning_rate": 9.949853273939496e-06, + "loss": 0.856, + "step": 3443 + }, + { + "epoch": 0.18568039680828122, + "grad_norm": 0.7996117472648621, + "learning_rate": 9.949823318706611e-06, + "loss": 0.7344, + "step": 3444 + }, + { + "epoch": 0.18573431097692475, + "grad_norm": 0.9108440279960632, + "learning_rate": 9.949793354574612e-06, + "loss": 0.8229, + "step": 3445 + }, + { + "epoch": 0.18578822514556825, + "grad_norm": 0.8484078049659729, + "learning_rate": 9.949763381543553e-06, + "loss": 0.7366, + "step": 3446 + }, + { + "epoch": 0.18584213931421179, + "grad_norm": 0.7617974877357483, + "learning_rate": 9.949733399613486e-06, + "loss": 0.777, + "step": 3447 + }, + { + "epoch": 0.1858960534828553, + "grad_norm": 1.0613569021224976, + "learning_rate": 9.949703408784465e-06, + "loss": 0.9028, + "step": 3448 + }, + { + "epoch": 0.18594996765149882, + "grad_norm": 0.7503539323806763, + "learning_rate": 9.949673409056546e-06, + "loss": 0.797, + "step": 3449 + }, + { + "epoch": 0.18600388182014232, + "grad_norm": 0.8162353038787842, + "learning_rate": 9.949643400429782e-06, + "loss": 0.8698, + "step": 3450 + }, + { + "epoch": 0.18605779598878586, + "grad_norm": 0.8876883387565613, + "learning_rate": 9.949613382904226e-06, + "loss": 0.8422, + "step": 3451 + }, + { + "epoch": 0.18611171015742936, + "grad_norm": 0.7412144541740417, + "learning_rate": 9.949583356479934e-06, + "loss": 0.7977, + "step": 3452 + }, + { + "epoch": 0.1861656243260729, + "grad_norm": 0.7515407204627991, + "learning_rate": 9.949553321156957e-06, 
+ "loss": 0.8046, + "step": 3453 + }, + { + "epoch": 0.18621953849471642, + "grad_norm": 0.8171376585960388, + "learning_rate": 9.949523276935352e-06, + "loss": 0.7121, + "step": 3454 + }, + { + "epoch": 0.18627345266335993, + "grad_norm": 0.838368833065033, + "learning_rate": 9.94949322381517e-06, + "loss": 0.833, + "step": 3455 + }, + { + "epoch": 0.18632736683200346, + "grad_norm": 1.0004788637161255, + "learning_rate": 9.949463161796468e-06, + "loss": 0.7967, + "step": 3456 + }, + { + "epoch": 0.18638128100064696, + "grad_norm": 0.8949950337409973, + "learning_rate": 9.949433090879298e-06, + "loss": 0.815, + "step": 3457 + }, + { + "epoch": 0.1864351951692905, + "grad_norm": 0.8611262440681458, + "learning_rate": 9.949403011063716e-06, + "loss": 0.8998, + "step": 3458 + }, + { + "epoch": 0.186489109337934, + "grad_norm": 0.7873225212097168, + "learning_rate": 9.949372922349775e-06, + "loss": 0.8011, + "step": 3459 + }, + { + "epoch": 0.18654302350657753, + "grad_norm": 0.7770752310752869, + "learning_rate": 9.949342824737529e-06, + "loss": 0.7687, + "step": 3460 + }, + { + "epoch": 0.18659693767522104, + "grad_norm": 0.7723278403282166, + "learning_rate": 9.949312718227031e-06, + "loss": 0.8047, + "step": 3461 + }, + { + "epoch": 0.18665085184386457, + "grad_norm": 0.8038878440856934, + "learning_rate": 9.949282602818335e-06, + "loss": 0.6522, + "step": 3462 + }, + { + "epoch": 0.1867047660125081, + "grad_norm": 0.8243177533149719, + "learning_rate": 9.949252478511499e-06, + "loss": 0.7859, + "step": 3463 + }, + { + "epoch": 0.1867586801811516, + "grad_norm": 0.8061205744743347, + "learning_rate": 9.949222345306574e-06, + "loss": 0.8, + "step": 3464 + }, + { + "epoch": 0.18681259434979514, + "grad_norm": 0.8916036486625671, + "learning_rate": 9.949192203203615e-06, + "loss": 0.7831, + "step": 3465 + }, + { + "epoch": 0.18686650851843864, + "grad_norm": 0.7694443464279175, + "learning_rate": 9.949162052202675e-06, + "loss": 0.753, + "step": 3466 + }, + { + 
"epoch": 0.18692042268708217, + "grad_norm": 0.8028594255447388, + "learning_rate": 9.94913189230381e-06, + "loss": 0.7834, + "step": 3467 + }, + { + "epoch": 0.18697433685572568, + "grad_norm": 0.8558024764060974, + "learning_rate": 9.94910172350707e-06, + "loss": 0.8479, + "step": 3468 + }, + { + "epoch": 0.1870282510243692, + "grad_norm": 0.8418707251548767, + "learning_rate": 9.949071545812517e-06, + "loss": 0.7841, + "step": 3469 + }, + { + "epoch": 0.1870821651930127, + "grad_norm": 0.9143140316009521, + "learning_rate": 9.9490413592202e-06, + "loss": 0.7803, + "step": 3470 + }, + { + "epoch": 0.18713607936165624, + "grad_norm": 0.927670419216156, + "learning_rate": 9.949011163730172e-06, + "loss": 0.7969, + "step": 3471 + }, + { + "epoch": 0.18718999353029978, + "grad_norm": 0.7614530324935913, + "learning_rate": 9.948980959342492e-06, + "loss": 0.7541, + "step": 3472 + }, + { + "epoch": 0.18724390769894328, + "grad_norm": 0.7719544172286987, + "learning_rate": 9.948950746057208e-06, + "loss": 0.6996, + "step": 3473 + }, + { + "epoch": 0.1872978218675868, + "grad_norm": 0.8512967824935913, + "learning_rate": 9.94892052387438e-06, + "loss": 0.8749, + "step": 3474 + }, + { + "epoch": 0.18735173603623032, + "grad_norm": 0.7408632636070251, + "learning_rate": 9.948890292794062e-06, + "loss": 0.7646, + "step": 3475 + }, + { + "epoch": 0.18740565020487385, + "grad_norm": 0.7667837142944336, + "learning_rate": 9.948860052816305e-06, + "loss": 0.7721, + "step": 3476 + }, + { + "epoch": 0.18745956437351735, + "grad_norm": 0.8099546432495117, + "learning_rate": 9.948829803941167e-06, + "loss": 0.8604, + "step": 3477 + }, + { + "epoch": 0.18751347854216088, + "grad_norm": 0.7130147814750671, + "learning_rate": 9.948799546168699e-06, + "loss": 0.7215, + "step": 3478 + }, + { + "epoch": 0.1875673927108044, + "grad_norm": 0.7442251443862915, + "learning_rate": 9.948769279498955e-06, + "loss": 0.7691, + "step": 3479 + }, + { + "epoch": 0.18762130687944792, + "grad_norm": 
0.8528403043746948, + "learning_rate": 9.948739003931995e-06, + "loss": 0.8738, + "step": 3480 + }, + { + "epoch": 0.18767522104809145, + "grad_norm": 0.7217040061950684, + "learning_rate": 9.948708719467868e-06, + "loss": 0.6989, + "step": 3481 + }, + { + "epoch": 0.18772913521673495, + "grad_norm": 1.0738893747329712, + "learning_rate": 9.94867842610663e-06, + "loss": 0.7464, + "step": 3482 + }, + { + "epoch": 0.1877830493853785, + "grad_norm": 0.7653424739837646, + "learning_rate": 9.948648123848334e-06, + "loss": 0.8552, + "step": 3483 + }, + { + "epoch": 0.187836963554022, + "grad_norm": 0.791019856929779, + "learning_rate": 9.948617812693037e-06, + "loss": 0.8548, + "step": 3484 + }, + { + "epoch": 0.18789087772266552, + "grad_norm": 0.8527680039405823, + "learning_rate": 9.948587492640796e-06, + "loss": 0.7717, + "step": 3485 + }, + { + "epoch": 0.18794479189130903, + "grad_norm": 1.0001403093338013, + "learning_rate": 9.948557163691659e-06, + "loss": 0.8061, + "step": 3486 + }, + { + "epoch": 0.18799870605995256, + "grad_norm": 0.7622776627540588, + "learning_rate": 9.948526825845683e-06, + "loss": 0.7082, + "step": 3487 + }, + { + "epoch": 0.18805262022859606, + "grad_norm": 0.7377861142158508, + "learning_rate": 9.948496479102925e-06, + "loss": 0.7776, + "step": 3488 + }, + { + "epoch": 0.1881065343972396, + "grad_norm": 0.9017737507820129, + "learning_rate": 9.948466123463436e-06, + "loss": 0.7676, + "step": 3489 + }, + { + "epoch": 0.18816044856588313, + "grad_norm": 0.7733216881752014, + "learning_rate": 9.948435758927274e-06, + "loss": 0.7503, + "step": 3490 + }, + { + "epoch": 0.18821436273452663, + "grad_norm": 0.9103933572769165, + "learning_rate": 9.948405385494491e-06, + "loss": 0.8696, + "step": 3491 + }, + { + "epoch": 0.18826827690317016, + "grad_norm": 0.7228747010231018, + "learning_rate": 9.948375003165143e-06, + "loss": 0.8396, + "step": 3492 + }, + { + "epoch": 0.18832219107181367, + "grad_norm": 0.9336891174316406, + "learning_rate": 
9.948344611939283e-06, + "loss": 0.7994, + "step": 3493 + }, + { + "epoch": 0.1883761052404572, + "grad_norm": 0.8534504175186157, + "learning_rate": 9.948314211816968e-06, + "loss": 0.7627, + "step": 3494 + }, + { + "epoch": 0.1884300194091007, + "grad_norm": 0.867060661315918, + "learning_rate": 9.94828380279825e-06, + "loss": 0.8503, + "step": 3495 + }, + { + "epoch": 0.18848393357774423, + "grad_norm": 0.7721019983291626, + "learning_rate": 9.948253384883188e-06, + "loss": 0.7409, + "step": 3496 + }, + { + "epoch": 0.18853784774638774, + "grad_norm": 0.7308738827705383, + "learning_rate": 9.948222958071832e-06, + "loss": 0.7579, + "step": 3497 + }, + { + "epoch": 0.18859176191503127, + "grad_norm": 1.1277705430984497, + "learning_rate": 9.948192522364237e-06, + "loss": 0.8288, + "step": 3498 + }, + { + "epoch": 0.1886456760836748, + "grad_norm": 0.8183790445327759, + "learning_rate": 9.948162077760462e-06, + "loss": 0.7819, + "step": 3499 + }, + { + "epoch": 0.1886995902523183, + "grad_norm": 0.7458687424659729, + "learning_rate": 9.948131624260557e-06, + "loss": 0.7482, + "step": 3500 + }, + { + "epoch": 0.18875350442096184, + "grad_norm": 0.9347942471504211, + "learning_rate": 9.94810116186458e-06, + "loss": 0.8208, + "step": 3501 + }, + { + "epoch": 0.18880741858960534, + "grad_norm": 0.7442129254341125, + "learning_rate": 9.948070690572582e-06, + "loss": 0.7843, + "step": 3502 + }, + { + "epoch": 0.18886133275824887, + "grad_norm": 0.8121855854988098, + "learning_rate": 9.948040210384622e-06, + "loss": 0.738, + "step": 3503 + }, + { + "epoch": 0.18891524692689238, + "grad_norm": 0.8118747472763062, + "learning_rate": 9.948009721300754e-06, + "loss": 0.8792, + "step": 3504 + }, + { + "epoch": 0.1889691610955359, + "grad_norm": 0.8263816833496094, + "learning_rate": 9.94797922332103e-06, + "loss": 0.7759, + "step": 3505 + }, + { + "epoch": 0.1890230752641794, + "grad_norm": 0.7452372908592224, + "learning_rate": 9.947948716445508e-06, + "loss": 0.7588, + 
"step": 3506 + }, + { + "epoch": 0.18907698943282295, + "grad_norm": 0.7385339736938477, + "learning_rate": 9.94791820067424e-06, + "loss": 0.8412, + "step": 3507 + }, + { + "epoch": 0.18913090360146648, + "grad_norm": 0.7456401586532593, + "learning_rate": 9.947887676007284e-06, + "loss": 0.7539, + "step": 3508 + }, + { + "epoch": 0.18918481777010998, + "grad_norm": 0.8101776242256165, + "learning_rate": 9.947857142444693e-06, + "loss": 0.8006, + "step": 3509 + }, + { + "epoch": 0.1892387319387535, + "grad_norm": 0.7587085962295532, + "learning_rate": 9.947826599986523e-06, + "loss": 0.7958, + "step": 3510 + }, + { + "epoch": 0.18929264610739702, + "grad_norm": 0.7974298596382141, + "learning_rate": 9.947796048632826e-06, + "loss": 0.7954, + "step": 3511 + }, + { + "epoch": 0.18934656027604055, + "grad_norm": 0.8407479524612427, + "learning_rate": 9.94776548838366e-06, + "loss": 0.825, + "step": 3512 + }, + { + "epoch": 0.18940047444468405, + "grad_norm": 0.7465969324111938, + "learning_rate": 9.94773491923908e-06, + "loss": 0.7725, + "step": 3513 + }, + { + "epoch": 0.18945438861332758, + "grad_norm": 0.9324356913566589, + "learning_rate": 9.947704341199137e-06, + "loss": 0.755, + "step": 3514 + }, + { + "epoch": 0.1895083027819711, + "grad_norm": 0.8157918453216553, + "learning_rate": 9.94767375426389e-06, + "loss": 0.8678, + "step": 3515 + }, + { + "epoch": 0.18956221695061462, + "grad_norm": 0.8501976132392883, + "learning_rate": 9.947643158433395e-06, + "loss": 0.8431, + "step": 3516 + }, + { + "epoch": 0.18961613111925815, + "grad_norm": 0.7773411273956299, + "learning_rate": 9.947612553707703e-06, + "loss": 0.748, + "step": 3517 + }, + { + "epoch": 0.18967004528790166, + "grad_norm": 0.7716071605682373, + "learning_rate": 9.947581940086873e-06, + "loss": 0.7563, + "step": 3518 + }, + { + "epoch": 0.1897239594565452, + "grad_norm": 0.9465253353118896, + "learning_rate": 9.947551317570957e-06, + "loss": 0.9289, + "step": 3519 + }, + { + "epoch": 
0.1897778736251887, + "grad_norm": 0.7123626470565796, + "learning_rate": 9.94752068616001e-06, + "loss": 0.7012, + "step": 3520 + }, + { + "epoch": 0.18983178779383222, + "grad_norm": 0.7318246960639954, + "learning_rate": 9.94749004585409e-06, + "loss": 0.8247, + "step": 3521 + }, + { + "epoch": 0.18988570196247573, + "grad_norm": 0.8028656244277954, + "learning_rate": 9.947459396653248e-06, + "loss": 0.8606, + "step": 3522 + }, + { + "epoch": 0.18993961613111926, + "grad_norm": 0.7580826282501221, + "learning_rate": 9.947428738557541e-06, + "loss": 0.7801, + "step": 3523 + }, + { + "epoch": 0.18999353029976276, + "grad_norm": 0.7612492442131042, + "learning_rate": 9.947398071567025e-06, + "loss": 0.8298, + "step": 3524 + }, + { + "epoch": 0.1900474444684063, + "grad_norm": 0.7892666459083557, + "learning_rate": 9.947367395681755e-06, + "loss": 0.739, + "step": 3525 + }, + { + "epoch": 0.19010135863704983, + "grad_norm": 0.7531749606132507, + "learning_rate": 9.947336710901785e-06, + "loss": 0.7804, + "step": 3526 + }, + { + "epoch": 0.19015527280569333, + "grad_norm": 0.7833613753318787, + "learning_rate": 9.947306017227171e-06, + "loss": 0.6541, + "step": 3527 + }, + { + "epoch": 0.19020918697433686, + "grad_norm": 0.749286413192749, + "learning_rate": 9.94727531465797e-06, + "loss": 0.6982, + "step": 3528 + }, + { + "epoch": 0.19026310114298037, + "grad_norm": 0.9150011539459229, + "learning_rate": 9.947244603194233e-06, + "loss": 0.8681, + "step": 3529 + }, + { + "epoch": 0.1903170153116239, + "grad_norm": 0.8265007138252258, + "learning_rate": 9.947213882836018e-06, + "loss": 0.9088, + "step": 3530 + }, + { + "epoch": 0.1903709294802674, + "grad_norm": 0.7807170152664185, + "learning_rate": 9.947183153583379e-06, + "loss": 0.7875, + "step": 3531 + }, + { + "epoch": 0.19042484364891094, + "grad_norm": 1.0078792572021484, + "learning_rate": 9.947152415436375e-06, + "loss": 1.2045, + "step": 3532 + }, + { + "epoch": 0.19047875781755447, + "grad_norm": 
0.7661539912223816, + "learning_rate": 9.947121668395055e-06, + "loss": 0.8202, + "step": 3533 + }, + { + "epoch": 0.19053267198619797, + "grad_norm": 0.7419549226760864, + "learning_rate": 9.947090912459479e-06, + "loss": 0.7775, + "step": 3534 + }, + { + "epoch": 0.1905865861548415, + "grad_norm": 0.9671319723129272, + "learning_rate": 9.947060147629698e-06, + "loss": 0.8328, + "step": 3535 + }, + { + "epoch": 0.190640500323485, + "grad_norm": 0.9418153762817383, + "learning_rate": 9.947029373905773e-06, + "loss": 0.8476, + "step": 3536 + }, + { + "epoch": 0.19069441449212854, + "grad_norm": 0.8007176518440247, + "learning_rate": 9.946998591287755e-06, + "loss": 0.8379, + "step": 3537 + }, + { + "epoch": 0.19074832866077204, + "grad_norm": 1.0271466970443726, + "learning_rate": 9.946967799775701e-06, + "loss": 0.7789, + "step": 3538 + }, + { + "epoch": 0.19080224282941557, + "grad_norm": 0.7577568888664246, + "learning_rate": 9.946936999369668e-06, + "loss": 0.7749, + "step": 3539 + }, + { + "epoch": 0.19085615699805908, + "grad_norm": 0.7766523361206055, + "learning_rate": 9.946906190069707e-06, + "loss": 0.7143, + "step": 3540 + }, + { + "epoch": 0.1909100711667026, + "grad_norm": 0.798589825630188, + "learning_rate": 9.946875371875876e-06, + "loss": 0.8481, + "step": 3541 + }, + { + "epoch": 0.19096398533534614, + "grad_norm": 0.8279602527618408, + "learning_rate": 9.946844544788232e-06, + "loss": 0.8369, + "step": 3542 + }, + { + "epoch": 0.19101789950398965, + "grad_norm": 0.7607479691505432, + "learning_rate": 9.946813708806828e-06, + "loss": 0.8088, + "step": 3543 + }, + { + "epoch": 0.19107181367263318, + "grad_norm": 0.7722266912460327, + "learning_rate": 9.946782863931719e-06, + "loss": 0.704, + "step": 3544 + }, + { + "epoch": 0.19112572784127668, + "grad_norm": 0.8101015686988831, + "learning_rate": 9.946752010162964e-06, + "loss": 0.7828, + "step": 3545 + }, + { + "epoch": 0.19117964200992021, + "grad_norm": 0.8161671161651611, + "learning_rate": 
9.946721147500613e-06, + "loss": 0.8875, + "step": 3546 + }, + { + "epoch": 0.19123355617856372, + "grad_norm": 0.9234161972999573, + "learning_rate": 9.946690275944727e-06, + "loss": 0.8846, + "step": 3547 + }, + { + "epoch": 0.19128747034720725, + "grad_norm": 0.7948644757270813, + "learning_rate": 9.946659395495357e-06, + "loss": 0.8331, + "step": 3548 + }, + { + "epoch": 0.19134138451585075, + "grad_norm": 0.9087135791778564, + "learning_rate": 9.946628506152563e-06, + "loss": 0.7462, + "step": 3549 + }, + { + "epoch": 0.19139529868449429, + "grad_norm": 0.7624903917312622, + "learning_rate": 9.946597607916396e-06, + "loss": 0.6431, + "step": 3550 + }, + { + "epoch": 0.19144921285313782, + "grad_norm": 0.9236660003662109, + "learning_rate": 9.946566700786914e-06, + "loss": 0.921, + "step": 3551 + }, + { + "epoch": 0.19150312702178132, + "grad_norm": 0.8824177980422974, + "learning_rate": 9.946535784764173e-06, + "loss": 0.805, + "step": 3552 + }, + { + "epoch": 0.19155704119042485, + "grad_norm": 0.7843056917190552, + "learning_rate": 9.946504859848227e-06, + "loss": 0.8528, + "step": 3553 + }, + { + "epoch": 0.19161095535906836, + "grad_norm": 1.2314038276672363, + "learning_rate": 9.946473926039134e-06, + "loss": 0.8141, + "step": 3554 + }, + { + "epoch": 0.1916648695277119, + "grad_norm": 0.7956500053405762, + "learning_rate": 9.946442983336945e-06, + "loss": 0.7946, + "step": 3555 + }, + { + "epoch": 0.1917187836963554, + "grad_norm": 0.850674033164978, + "learning_rate": 9.94641203174172e-06, + "loss": 0.8965, + "step": 3556 + }, + { + "epoch": 0.19177269786499893, + "grad_norm": 0.8371244668960571, + "learning_rate": 9.946381071253514e-06, + "loss": 0.7859, + "step": 3557 + }, + { + "epoch": 0.19182661203364243, + "grad_norm": 0.7423365712165833, + "learning_rate": 9.946350101872382e-06, + "loss": 0.8012, + "step": 3558 + }, + { + "epoch": 0.19188052620228596, + "grad_norm": 0.8446981310844421, + "learning_rate": 9.946319123598379e-06, + "loss": 0.9037, + 
"step": 3559 + }, + { + "epoch": 0.1919344403709295, + "grad_norm": 0.8565588593482971, + "learning_rate": 9.946288136431562e-06, + "loss": 0.7398, + "step": 3560 + }, + { + "epoch": 0.191988354539573, + "grad_norm": 0.8087875843048096, + "learning_rate": 9.946257140371985e-06, + "loss": 0.7214, + "step": 3561 + }, + { + "epoch": 0.19204226870821653, + "grad_norm": 0.7951125502586365, + "learning_rate": 9.946226135419705e-06, + "loss": 0.7988, + "step": 3562 + }, + { + "epoch": 0.19209618287686003, + "grad_norm": 0.8709264397621155, + "learning_rate": 9.946195121574779e-06, + "loss": 0.8563, + "step": 3563 + }, + { + "epoch": 0.19215009704550357, + "grad_norm": 0.7908393740653992, + "learning_rate": 9.94616409883726e-06, + "loss": 0.7874, + "step": 3564 + }, + { + "epoch": 0.19220401121414707, + "grad_norm": 1.0512382984161377, + "learning_rate": 9.946133067207204e-06, + "loss": 0.9174, + "step": 3565 + }, + { + "epoch": 0.1922579253827906, + "grad_norm": 0.7937822937965393, + "learning_rate": 9.94610202668467e-06, + "loss": 0.6863, + "step": 3566 + }, + { + "epoch": 0.1923118395514341, + "grad_norm": 0.9130533337593079, + "learning_rate": 9.94607097726971e-06, + "loss": 0.8287, + "step": 3567 + }, + { + "epoch": 0.19236575372007764, + "grad_norm": 1.1604489088058472, + "learning_rate": 9.946039918962383e-06, + "loss": 0.6922, + "step": 3568 + }, + { + "epoch": 0.19241966788872117, + "grad_norm": 1.0400906801223755, + "learning_rate": 9.946008851762743e-06, + "loss": 0.7978, + "step": 3569 + }, + { + "epoch": 0.19247358205736467, + "grad_norm": 0.8068282008171082, + "learning_rate": 9.945977775670845e-06, + "loss": 0.7365, + "step": 3570 + }, + { + "epoch": 0.1925274962260082, + "grad_norm": 0.8328807353973389, + "learning_rate": 9.945946690686747e-06, + "loss": 0.7308, + "step": 3571 + }, + { + "epoch": 0.1925814103946517, + "grad_norm": 0.946949303150177, + "learning_rate": 9.945915596810502e-06, + "loss": 0.9117, + "step": 3572 + }, + { + "epoch": 
0.19263532456329524, + "grad_norm": 0.8421696424484253, + "learning_rate": 9.94588449404217e-06, + "loss": 0.7132, + "step": 3573 + }, + { + "epoch": 0.19268923873193874, + "grad_norm": 0.7321984171867371, + "learning_rate": 9.945853382381805e-06, + "loss": 0.752, + "step": 3574 + }, + { + "epoch": 0.19274315290058228, + "grad_norm": 0.8039024472236633, + "learning_rate": 9.94582226182946e-06, + "loss": 0.7952, + "step": 3575 + }, + { + "epoch": 0.19279706706922578, + "grad_norm": 0.8612285256385803, + "learning_rate": 9.945791132385196e-06, + "loss": 0.7944, + "step": 3576 + }, + { + "epoch": 0.1928509812378693, + "grad_norm": 1.0525864362716675, + "learning_rate": 9.945759994049066e-06, + "loss": 0.8078, + "step": 3577 + }, + { + "epoch": 0.19290489540651284, + "grad_norm": 0.8032466769218445, + "learning_rate": 9.945728846821128e-06, + "loss": 0.8522, + "step": 3578 + }, + { + "epoch": 0.19295880957515635, + "grad_norm": 1.324041485786438, + "learning_rate": 9.945697690701435e-06, + "loss": 0.7705, + "step": 3579 + }, + { + "epoch": 0.19301272374379988, + "grad_norm": 0.8733030557632446, + "learning_rate": 9.945666525690044e-06, + "loss": 0.8115, + "step": 3580 + }, + { + "epoch": 0.19306663791244338, + "grad_norm": 0.8208357095718384, + "learning_rate": 9.945635351787012e-06, + "loss": 0.7975, + "step": 3581 + }, + { + "epoch": 0.19312055208108692, + "grad_norm": 0.744498074054718, + "learning_rate": 9.945604168992395e-06, + "loss": 0.8088, + "step": 3582 + }, + { + "epoch": 0.19317446624973042, + "grad_norm": 0.9391197562217712, + "learning_rate": 9.945572977306249e-06, + "loss": 0.8403, + "step": 3583 + }, + { + "epoch": 0.19322838041837395, + "grad_norm": 0.8050488829612732, + "learning_rate": 9.945541776728629e-06, + "loss": 0.769, + "step": 3584 + }, + { + "epoch": 0.19328229458701746, + "grad_norm": 0.8373685479164124, + "learning_rate": 9.945510567259592e-06, + "loss": 0.7803, + "step": 3585 + }, + { + "epoch": 0.193336208755661, + "grad_norm": 
0.8766368627548218, + "learning_rate": 9.945479348899194e-06, + "loss": 0.8325, + "step": 3586 + }, + { + "epoch": 0.19339012292430452, + "grad_norm": 0.8029547333717346, + "learning_rate": 9.945448121647492e-06, + "loss": 0.6647, + "step": 3587 + }, + { + "epoch": 0.19344403709294802, + "grad_norm": 0.7231468558311462, + "learning_rate": 9.94541688550454e-06, + "loss": 0.6939, + "step": 3588 + }, + { + "epoch": 0.19349795126159156, + "grad_norm": 0.8487125039100647, + "learning_rate": 9.945385640470397e-06, + "loss": 0.8097, + "step": 3589 + }, + { + "epoch": 0.19355186543023506, + "grad_norm": 0.7813920378684998, + "learning_rate": 9.945354386545116e-06, + "loss": 0.8023, + "step": 3590 + }, + { + "epoch": 0.1936057795988786, + "grad_norm": 0.8754404783248901, + "learning_rate": 9.945323123728756e-06, + "loss": 0.8401, + "step": 3591 + }, + { + "epoch": 0.1936596937675221, + "grad_norm": 0.8191613554954529, + "learning_rate": 9.945291852021371e-06, + "loss": 0.8151, + "step": 3592 + }, + { + "epoch": 0.19371360793616563, + "grad_norm": 0.7882266044616699, + "learning_rate": 9.945260571423019e-06, + "loss": 0.77, + "step": 3593 + }, + { + "epoch": 0.19376752210480913, + "grad_norm": 0.816411554813385, + "learning_rate": 9.945229281933756e-06, + "loss": 0.7378, + "step": 3594 + }, + { + "epoch": 0.19382143627345266, + "grad_norm": 0.8545891046524048, + "learning_rate": 9.945197983553636e-06, + "loss": 0.7563, + "step": 3595 + }, + { + "epoch": 0.1938753504420962, + "grad_norm": 0.8293501138687134, + "learning_rate": 9.945166676282717e-06, + "loss": 0.893, + "step": 3596 + }, + { + "epoch": 0.1939292646107397, + "grad_norm": 0.7536304593086243, + "learning_rate": 9.945135360121058e-06, + "loss": 0.7101, + "step": 3597 + }, + { + "epoch": 0.19398317877938323, + "grad_norm": 0.96649569272995, + "learning_rate": 9.94510403506871e-06, + "loss": 0.8027, + "step": 3598 + }, + { + "epoch": 0.19403709294802673, + "grad_norm": 0.7543211579322815, + "learning_rate": 
9.945072701125733e-06, + "loss": 0.8144, + "step": 3599 + }, + { + "epoch": 0.19409100711667027, + "grad_norm": 0.7223193049430847, + "learning_rate": 9.945041358292183e-06, + "loss": 0.7585, + "step": 3600 + }, + { + "epoch": 0.19414492128531377, + "grad_norm": 0.8515756726264954, + "learning_rate": 9.945010006568115e-06, + "loss": 0.9114, + "step": 3601 + }, + { + "epoch": 0.1941988354539573, + "grad_norm": 0.7318340539932251, + "learning_rate": 9.944978645953585e-06, + "loss": 0.7554, + "step": 3602 + }, + { + "epoch": 0.1942527496226008, + "grad_norm": 0.8565723299980164, + "learning_rate": 9.944947276448649e-06, + "loss": 0.8918, + "step": 3603 + }, + { + "epoch": 0.19430666379124434, + "grad_norm": 0.8536270260810852, + "learning_rate": 9.944915898053367e-06, + "loss": 0.8184, + "step": 3604 + }, + { + "epoch": 0.19436057795988787, + "grad_norm": 0.7093652486801147, + "learning_rate": 9.944884510767792e-06, + "loss": 0.8031, + "step": 3605 + }, + { + "epoch": 0.19441449212853137, + "grad_norm": 0.7644805312156677, + "learning_rate": 9.944853114591984e-06, + "loss": 0.8546, + "step": 3606 + }, + { + "epoch": 0.1944684062971749, + "grad_norm": 0.6533430218696594, + "learning_rate": 9.944821709525994e-06, + "loss": 0.6453, + "step": 3607 + }, + { + "epoch": 0.1945223204658184, + "grad_norm": 0.8608343005180359, + "learning_rate": 9.944790295569883e-06, + "loss": 0.8539, + "step": 3608 + }, + { + "epoch": 0.19457623463446194, + "grad_norm": 0.777740478515625, + "learning_rate": 9.944758872723706e-06, + "loss": 0.7414, + "step": 3609 + }, + { + "epoch": 0.19463014880310545, + "grad_norm": 0.7757480144500732, + "learning_rate": 9.944727440987518e-06, + "loss": 0.7394, + "step": 3610 + }, + { + "epoch": 0.19468406297174898, + "grad_norm": 0.7862492203712463, + "learning_rate": 9.944696000361379e-06, + "loss": 0.8264, + "step": 3611 + }, + { + "epoch": 0.19473797714039248, + "grad_norm": 0.72691410779953, + "learning_rate": 9.944664550845342e-06, + "loss": 0.6876, + 
"step": 3612 + }, + { + "epoch": 0.194791891309036, + "grad_norm": 0.8702194094657898, + "learning_rate": 9.944633092439467e-06, + "loss": 0.7286, + "step": 3613 + }, + { + "epoch": 0.19484580547767955, + "grad_norm": 1.1160287857055664, + "learning_rate": 9.944601625143806e-06, + "loss": 0.8619, + "step": 3614 + }, + { + "epoch": 0.19489971964632305, + "grad_norm": 0.8278397917747498, + "learning_rate": 9.944570148958419e-06, + "loss": 0.7458, + "step": 3615 + }, + { + "epoch": 0.19495363381496658, + "grad_norm": 0.8430503606796265, + "learning_rate": 9.944538663883362e-06, + "loss": 0.7681, + "step": 3616 + }, + { + "epoch": 0.19500754798361009, + "grad_norm": 0.8198543190956116, + "learning_rate": 9.94450716991869e-06, + "loss": 0.6681, + "step": 3617 + }, + { + "epoch": 0.19506146215225362, + "grad_norm": 0.7874541282653809, + "learning_rate": 9.944475667064464e-06, + "loss": 0.813, + "step": 3618 + }, + { + "epoch": 0.19511537632089712, + "grad_norm": 0.76181960105896, + "learning_rate": 9.944444155320736e-06, + "loss": 0.7443, + "step": 3619 + }, + { + "epoch": 0.19516929048954065, + "grad_norm": 0.7647060751914978, + "learning_rate": 9.944412634687563e-06, + "loss": 0.8232, + "step": 3620 + }, + { + "epoch": 0.19522320465818416, + "grad_norm": 0.7609487771987915, + "learning_rate": 9.944381105165006e-06, + "loss": 0.8134, + "step": 3621 + }, + { + "epoch": 0.1952771188268277, + "grad_norm": 0.8139258027076721, + "learning_rate": 9.944349566753116e-06, + "loss": 0.8053, + "step": 3622 + }, + { + "epoch": 0.19533103299547122, + "grad_norm": 0.7404879927635193, + "learning_rate": 9.944318019451952e-06, + "loss": 0.7774, + "step": 3623 + }, + { + "epoch": 0.19538494716411473, + "grad_norm": 0.863972008228302, + "learning_rate": 9.944286463261573e-06, + "loss": 0.8824, + "step": 3624 + }, + { + "epoch": 0.19543886133275826, + "grad_norm": 0.907744824886322, + "learning_rate": 9.944254898182033e-06, + "loss": 0.7537, + "step": 3625 + }, + { + "epoch": 
0.19549277550140176, + "grad_norm": 0.8722240328788757, + "learning_rate": 9.944223324213389e-06, + "loss": 0.8688, + "step": 3626 + }, + { + "epoch": 0.1955466896700453, + "grad_norm": 0.7386543154716492, + "learning_rate": 9.9441917413557e-06, + "loss": 0.6962, + "step": 3627 + }, + { + "epoch": 0.1956006038386888, + "grad_norm": 0.7577354907989502, + "learning_rate": 9.944160149609018e-06, + "loss": 0.7261, + "step": 3628 + }, + { + "epoch": 0.19565451800733233, + "grad_norm": 0.8413889408111572, + "learning_rate": 9.944128548973407e-06, + "loss": 0.8369, + "step": 3629 + }, + { + "epoch": 0.19570843217597583, + "grad_norm": 0.8649793863296509, + "learning_rate": 9.944096939448917e-06, + "loss": 0.8363, + "step": 3630 + }, + { + "epoch": 0.19576234634461936, + "grad_norm": 0.7515233755111694, + "learning_rate": 9.944065321035607e-06, + "loss": 0.7634, + "step": 3631 + }, + { + "epoch": 0.1958162605132629, + "grad_norm": 0.9059920310974121, + "learning_rate": 9.944033693733535e-06, + "loss": 0.9312, + "step": 3632 + }, + { + "epoch": 0.1958701746819064, + "grad_norm": 0.780707597732544, + "learning_rate": 9.944002057542757e-06, + "loss": 0.7545, + "step": 3633 + }, + { + "epoch": 0.19592408885054993, + "grad_norm": 0.7543255686759949, + "learning_rate": 9.94397041246333e-06, + "loss": 0.7496, + "step": 3634 + }, + { + "epoch": 0.19597800301919344, + "grad_norm": 0.7795106172561646, + "learning_rate": 9.943938758495313e-06, + "loss": 0.6734, + "step": 3635 + }, + { + "epoch": 0.19603191718783697, + "grad_norm": 0.9682700037956238, + "learning_rate": 9.943907095638758e-06, + "loss": 0.8928, + "step": 3636 + }, + { + "epoch": 0.19608583135648047, + "grad_norm": 0.7332949638366699, + "learning_rate": 9.943875423893727e-06, + "loss": 0.7507, + "step": 3637 + }, + { + "epoch": 0.196139745525124, + "grad_norm": 0.8316323161125183, + "learning_rate": 9.943843743260275e-06, + "loss": 0.7492, + "step": 3638 + }, + { + "epoch": 0.19619365969376754, + "grad_norm": 
0.7973113059997559, + "learning_rate": 9.943812053738458e-06, + "loss": 0.8381, + "step": 3639 + }, + { + "epoch": 0.19624757386241104, + "grad_norm": 0.7654823064804077, + "learning_rate": 9.943780355328332e-06, + "loss": 0.8497, + "step": 3640 + }, + { + "epoch": 0.19630148803105457, + "grad_norm": 0.7055602073669434, + "learning_rate": 9.943748648029958e-06, + "loss": 0.7949, + "step": 3641 + }, + { + "epoch": 0.19635540219969808, + "grad_norm": 0.9971569180488586, + "learning_rate": 9.94371693184339e-06, + "loss": 0.8311, + "step": 3642 + }, + { + "epoch": 0.1964093163683416, + "grad_norm": 0.7608943581581116, + "learning_rate": 9.943685206768686e-06, + "loss": 0.8303, + "step": 3643 + }, + { + "epoch": 0.1964632305369851, + "grad_norm": 0.9169919490814209, + "learning_rate": 9.943653472805901e-06, + "loss": 0.8314, + "step": 3644 + }, + { + "epoch": 0.19651714470562864, + "grad_norm": 0.8501203656196594, + "learning_rate": 9.943621729955096e-06, + "loss": 0.8765, + "step": 3645 + }, + { + "epoch": 0.19657105887427215, + "grad_norm": 0.7438945770263672, + "learning_rate": 9.943589978216325e-06, + "loss": 0.7323, + "step": 3646 + }, + { + "epoch": 0.19662497304291568, + "grad_norm": 0.8795550465583801, + "learning_rate": 9.943558217589646e-06, + "loss": 0.7916, + "step": 3647 + }, + { + "epoch": 0.1966788872115592, + "grad_norm": 0.7928707003593445, + "learning_rate": 9.943526448075117e-06, + "loss": 0.8621, + "step": 3648 + }, + { + "epoch": 0.19673280138020272, + "grad_norm": 0.8225892782211304, + "learning_rate": 9.943494669672792e-06, + "loss": 0.8718, + "step": 3649 + }, + { + "epoch": 0.19678671554884625, + "grad_norm": 0.8227444291114807, + "learning_rate": 9.943462882382732e-06, + "loss": 0.8374, + "step": 3650 + }, + { + "epoch": 0.19684062971748975, + "grad_norm": 0.7860620021820068, + "learning_rate": 9.943431086204991e-06, + "loss": 0.8919, + "step": 3651 + }, + { + "epoch": 0.19689454388613328, + "grad_norm": 0.8000875115394592, + "learning_rate": 
9.94339928113963e-06, + "loss": 0.7822, + "step": 3652 + }, + { + "epoch": 0.1969484580547768, + "grad_norm": 0.796389639377594, + "learning_rate": 9.943367467186702e-06, + "loss": 0.7149, + "step": 3653 + }, + { + "epoch": 0.19700237222342032, + "grad_norm": 0.8032622337341309, + "learning_rate": 9.943335644346267e-06, + "loss": 0.8442, + "step": 3654 + }, + { + "epoch": 0.19705628639206382, + "grad_norm": 0.8624833226203918, + "learning_rate": 9.94330381261838e-06, + "loss": 0.8681, + "step": 3655 + }, + { + "epoch": 0.19711020056070735, + "grad_norm": 0.9663752317428589, + "learning_rate": 9.9432719720031e-06, + "loss": 0.8749, + "step": 3656 + }, + { + "epoch": 0.1971641147293509, + "grad_norm": 0.6869292259216309, + "learning_rate": 9.943240122500484e-06, + "loss": 0.7288, + "step": 3657 + }, + { + "epoch": 0.1972180288979944, + "grad_norm": 0.7496824264526367, + "learning_rate": 9.943208264110589e-06, + "loss": 0.7191, + "step": 3658 + }, + { + "epoch": 0.19727194306663792, + "grad_norm": 0.7637088894844055, + "learning_rate": 9.943176396833471e-06, + "loss": 0.7602, + "step": 3659 + }, + { + "epoch": 0.19732585723528143, + "grad_norm": 0.7049651741981506, + "learning_rate": 9.94314452066919e-06, + "loss": 0.7097, + "step": 3660 + }, + { + "epoch": 0.19737977140392496, + "grad_norm": 0.8979986310005188, + "learning_rate": 9.943112635617802e-06, + "loss": 0.7953, + "step": 3661 + }, + { + "epoch": 0.19743368557256846, + "grad_norm": 0.7865282893180847, + "learning_rate": 9.943080741679364e-06, + "loss": 0.7394, + "step": 3662 + }, + { + "epoch": 0.197487599741212, + "grad_norm": 0.7790982723236084, + "learning_rate": 9.943048838853932e-06, + "loss": 0.8587, + "step": 3663 + }, + { + "epoch": 0.1975415139098555, + "grad_norm": 0.8486214876174927, + "learning_rate": 9.943016927141566e-06, + "loss": 0.9232, + "step": 3664 + }, + { + "epoch": 0.19759542807849903, + "grad_norm": 0.7729238867759705, + "learning_rate": 9.942985006542322e-06, + "loss": 0.7704, + 
"step": 3665 + }, + { + "epoch": 0.19764934224714256, + "grad_norm": 0.7827340960502625, + "learning_rate": 9.942953077056259e-06, + "loss": 0.7834, + "step": 3666 + }, + { + "epoch": 0.19770325641578607, + "grad_norm": 0.8735725283622742, + "learning_rate": 9.94292113868343e-06, + "loss": 0.7521, + "step": 3667 + }, + { + "epoch": 0.1977571705844296, + "grad_norm": 0.803302526473999, + "learning_rate": 9.942889191423897e-06, + "loss": 0.7475, + "step": 3668 + }, + { + "epoch": 0.1978110847530731, + "grad_norm": 0.7523918747901917, + "learning_rate": 9.942857235277716e-06, + "loss": 0.7882, + "step": 3669 + }, + { + "epoch": 0.19786499892171663, + "grad_norm": 0.891010582447052, + "learning_rate": 9.942825270244944e-06, + "loss": 0.6855, + "step": 3670 + }, + { + "epoch": 0.19791891309036014, + "grad_norm": 0.8103521466255188, + "learning_rate": 9.94279329632564e-06, + "loss": 0.7604, + "step": 3671 + }, + { + "epoch": 0.19797282725900367, + "grad_norm": 0.7801117897033691, + "learning_rate": 9.94276131351986e-06, + "loss": 0.757, + "step": 3672 + }, + { + "epoch": 0.19802674142764717, + "grad_norm": 0.8760844469070435, + "learning_rate": 9.942729321827661e-06, + "loss": 0.9507, + "step": 3673 + }, + { + "epoch": 0.1980806555962907, + "grad_norm": 0.7129818201065063, + "learning_rate": 9.942697321249101e-06, + "loss": 0.7118, + "step": 3674 + }, + { + "epoch": 0.19813456976493424, + "grad_norm": 0.7223137021064758, + "learning_rate": 9.942665311784239e-06, + "loss": 0.6911, + "step": 3675 + }, + { + "epoch": 0.19818848393357774, + "grad_norm": 0.7100752592086792, + "learning_rate": 9.94263329343313e-06, + "loss": 0.7569, + "step": 3676 + }, + { + "epoch": 0.19824239810222127, + "grad_norm": 0.955298662185669, + "learning_rate": 9.942601266195834e-06, + "loss": 0.8562, + "step": 3677 + }, + { + "epoch": 0.19829631227086478, + "grad_norm": 0.7367860078811646, + "learning_rate": 9.942569230072408e-06, + "loss": 0.7184, + "step": 3678 + }, + { + "epoch": 
0.1983502264395083, + "grad_norm": 0.7822328805923462, + "learning_rate": 9.942537185062909e-06, + "loss": 0.7111, + "step": 3679 + }, + { + "epoch": 0.1984041406081518, + "grad_norm": 0.8836474418640137, + "learning_rate": 9.942505131167394e-06, + "loss": 0.731, + "step": 3680 + }, + { + "epoch": 0.19845805477679535, + "grad_norm": 0.7033706903457642, + "learning_rate": 9.942473068385921e-06, + "loss": 0.7228, + "step": 3681 + }, + { + "epoch": 0.19851196894543885, + "grad_norm": 0.7241103649139404, + "learning_rate": 9.942440996718549e-06, + "loss": 0.7045, + "step": 3682 + }, + { + "epoch": 0.19856588311408238, + "grad_norm": 0.8266516923904419, + "learning_rate": 9.942408916165334e-06, + "loss": 0.781, + "step": 3683 + }, + { + "epoch": 0.1986197972827259, + "grad_norm": 0.9639707207679749, + "learning_rate": 9.942376826726334e-06, + "loss": 0.8136, + "step": 3684 + }, + { + "epoch": 0.19867371145136942, + "grad_norm": 0.874279797077179, + "learning_rate": 9.942344728401609e-06, + "loss": 0.8147, + "step": 3685 + }, + { + "epoch": 0.19872762562001295, + "grad_norm": 0.7670862674713135, + "learning_rate": 9.942312621191213e-06, + "loss": 0.8134, + "step": 3686 + }, + { + "epoch": 0.19878153978865645, + "grad_norm": 0.8974711894989014, + "learning_rate": 9.942280505095206e-06, + "loss": 0.8211, + "step": 3687 + }, + { + "epoch": 0.19883545395729998, + "grad_norm": 0.8174877762794495, + "learning_rate": 9.942248380113646e-06, + "loss": 0.8641, + "step": 3688 + }, + { + "epoch": 0.1988893681259435, + "grad_norm": 0.7798371315002441, + "learning_rate": 9.942216246246588e-06, + "loss": 0.7226, + "step": 3689 + }, + { + "epoch": 0.19894328229458702, + "grad_norm": 0.8269854784011841, + "learning_rate": 9.942184103494093e-06, + "loss": 0.8789, + "step": 3690 + }, + { + "epoch": 0.19899719646323052, + "grad_norm": 0.8148782253265381, + "learning_rate": 9.942151951856217e-06, + "loss": 0.8436, + "step": 3691 + }, + { + "epoch": 0.19905111063187406, + "grad_norm": 
0.823692262172699, + "learning_rate": 9.942119791333017e-06, + "loss": 0.6935, + "step": 3692 + }, + { + "epoch": 0.1991050248005176, + "grad_norm": 0.8396292924880981, + "learning_rate": 9.942087621924555e-06, + "loss": 0.8814, + "step": 3693 + }, + { + "epoch": 0.1991589389691611, + "grad_norm": 0.7293786406517029, + "learning_rate": 9.942055443630885e-06, + "loss": 0.7735, + "step": 3694 + }, + { + "epoch": 0.19921285313780462, + "grad_norm": 0.7367222905158997, + "learning_rate": 9.942023256452066e-06, + "loss": 0.7797, + "step": 3695 + }, + { + "epoch": 0.19926676730644813, + "grad_norm": 0.7078450322151184, + "learning_rate": 9.941991060388155e-06, + "loss": 0.7192, + "step": 3696 + }, + { + "epoch": 0.19932068147509166, + "grad_norm": 0.7927302718162537, + "learning_rate": 9.941958855439211e-06, + "loss": 0.8249, + "step": 3697 + }, + { + "epoch": 0.19937459564373516, + "grad_norm": 0.806266725063324, + "learning_rate": 9.941926641605292e-06, + "loss": 0.7829, + "step": 3698 + }, + { + "epoch": 0.1994285098123787, + "grad_norm": 0.8022493720054626, + "learning_rate": 9.941894418886455e-06, + "loss": 0.7843, + "step": 3699 + }, + { + "epoch": 0.1994824239810222, + "grad_norm": 0.8877873420715332, + "learning_rate": 9.941862187282759e-06, + "loss": 0.7266, + "step": 3700 + }, + { + "epoch": 0.19953633814966573, + "grad_norm": 0.7944962382316589, + "learning_rate": 9.94182994679426e-06, + "loss": 0.8078, + "step": 3701 + }, + { + "epoch": 0.19959025231830926, + "grad_norm": 0.8684442639350891, + "learning_rate": 9.941797697421017e-06, + "loss": 0.7445, + "step": 3702 + }, + { + "epoch": 0.19964416648695277, + "grad_norm": 0.7841063141822815, + "learning_rate": 9.94176543916309e-06, + "loss": 0.7231, + "step": 3703 + }, + { + "epoch": 0.1996980806555963, + "grad_norm": 0.7657507658004761, + "learning_rate": 9.941733172020533e-06, + "loss": 0.7018, + "step": 3704 + }, + { + "epoch": 0.1997519948242398, + "grad_norm": 1.086627721786499, + "learning_rate": 
9.94170089599341e-06, + "loss": 0.7914, + "step": 3705 + }, + { + "epoch": 0.19980590899288334, + "grad_norm": 0.7400459051132202, + "learning_rate": 9.941668611081771e-06, + "loss": 0.7841, + "step": 3706 + }, + { + "epoch": 0.19985982316152684, + "grad_norm": 1.0587258338928223, + "learning_rate": 9.94163631728568e-06, + "loss": 0.923, + "step": 3707 + }, + { + "epoch": 0.19991373733017037, + "grad_norm": 0.8322579264640808, + "learning_rate": 9.941604014605193e-06, + "loss": 0.8095, + "step": 3708 + }, + { + "epoch": 0.19996765149881388, + "grad_norm": 0.6660327911376953, + "learning_rate": 9.94157170304037e-06, + "loss": 0.6977, + "step": 3709 + }, + { + "epoch": 0.2000215656674574, + "grad_norm": 0.8063632249832153, + "learning_rate": 9.941539382591267e-06, + "loss": 0.7693, + "step": 3710 + }, + { + "epoch": 0.20007547983610094, + "grad_norm": 0.7367355227470398, + "learning_rate": 9.941507053257942e-06, + "loss": 0.7312, + "step": 3711 + }, + { + "epoch": 0.20012939400474444, + "grad_norm": 0.7430408596992493, + "learning_rate": 9.941474715040454e-06, + "loss": 0.8077, + "step": 3712 + }, + { + "epoch": 0.20018330817338797, + "grad_norm": 0.8141972422599792, + "learning_rate": 9.94144236793886e-06, + "loss": 0.8017, + "step": 3713 + }, + { + "epoch": 0.20023722234203148, + "grad_norm": 0.7599862217903137, + "learning_rate": 9.94141001195322e-06, + "loss": 0.8644, + "step": 3714 + }, + { + "epoch": 0.200291136510675, + "grad_norm": 0.8302745819091797, + "learning_rate": 9.941377647083591e-06, + "loss": 0.8996, + "step": 3715 + }, + { + "epoch": 0.20034505067931851, + "grad_norm": 0.8288695812225342, + "learning_rate": 9.941345273330031e-06, + "loss": 0.7727, + "step": 3716 + }, + { + "epoch": 0.20039896484796205, + "grad_norm": 0.7157832980155945, + "learning_rate": 9.9413128906926e-06, + "loss": 0.7619, + "step": 3717 + }, + { + "epoch": 0.20045287901660555, + "grad_norm": 0.7811874151229858, + "learning_rate": 9.941280499171355e-06, + "loss": 0.7905, + 
"step": 3718 + }, + { + "epoch": 0.20050679318524908, + "grad_norm": 0.7507179975509644, + "learning_rate": 9.941248098766354e-06, + "loss": 0.7023, + "step": 3719 + }, + { + "epoch": 0.20056070735389261, + "grad_norm": 0.7824770212173462, + "learning_rate": 9.941215689477655e-06, + "loss": 0.8233, + "step": 3720 + }, + { + "epoch": 0.20061462152253612, + "grad_norm": 0.7690337896347046, + "learning_rate": 9.941183271305314e-06, + "loss": 0.7162, + "step": 3721 + }, + { + "epoch": 0.20066853569117965, + "grad_norm": 0.8605464696884155, + "learning_rate": 9.941150844249396e-06, + "loss": 0.8073, + "step": 3722 + }, + { + "epoch": 0.20072244985982315, + "grad_norm": 0.8741899132728577, + "learning_rate": 9.941118408309953e-06, + "loss": 0.8131, + "step": 3723 + }, + { + "epoch": 0.2007763640284667, + "grad_norm": 0.8655528426170349, + "learning_rate": 9.941085963487044e-06, + "loss": 0.8162, + "step": 3724 + }, + { + "epoch": 0.2008302781971102, + "grad_norm": 0.7617276310920715, + "learning_rate": 9.941053509780732e-06, + "loss": 0.8257, + "step": 3725 + }, + { + "epoch": 0.20088419236575372, + "grad_norm": 0.7816554307937622, + "learning_rate": 9.941021047191071e-06, + "loss": 0.7722, + "step": 3726 + }, + { + "epoch": 0.20093810653439723, + "grad_norm": 0.7922171354293823, + "learning_rate": 9.94098857571812e-06, + "loss": 0.8267, + "step": 3727 + }, + { + "epoch": 0.20099202070304076, + "grad_norm": 0.7950446009635925, + "learning_rate": 9.940956095361939e-06, + "loss": 0.7743, + "step": 3728 + }, + { + "epoch": 0.2010459348716843, + "grad_norm": 1.154969573020935, + "learning_rate": 9.940923606122584e-06, + "loss": 0.7542, + "step": 3729 + }, + { + "epoch": 0.2010998490403278, + "grad_norm": 0.9842036962509155, + "learning_rate": 9.940891108000116e-06, + "loss": 0.8469, + "step": 3730 + }, + { + "epoch": 0.20115376320897133, + "grad_norm": 0.7800561785697937, + "learning_rate": 9.940858600994593e-06, + "loss": 0.7894, + "step": 3731 + }, + { + "epoch": 
0.20120767737761483, + "grad_norm": 0.8366021513938904, + "learning_rate": 9.94082608510607e-06, + "loss": 0.8298, + "step": 3732 + }, + { + "epoch": 0.20126159154625836, + "grad_norm": 0.8020085692405701, + "learning_rate": 9.940793560334608e-06, + "loss": 0.8874, + "step": 3733 + }, + { + "epoch": 0.20131550571490187, + "grad_norm": 0.7151523232460022, + "learning_rate": 9.940761026680269e-06, + "loss": 0.697, + "step": 3734 + }, + { + "epoch": 0.2013694198835454, + "grad_norm": 0.8671187162399292, + "learning_rate": 9.940728484143105e-06, + "loss": 0.9408, + "step": 3735 + }, + { + "epoch": 0.2014233340521889, + "grad_norm": 0.8134783506393433, + "learning_rate": 9.940695932723179e-06, + "loss": 0.7751, + "step": 3736 + }, + { + "epoch": 0.20147724822083243, + "grad_norm": 0.8050068616867065, + "learning_rate": 9.940663372420546e-06, + "loss": 0.8676, + "step": 3737 + }, + { + "epoch": 0.20153116238947597, + "grad_norm": 0.9040514230728149, + "learning_rate": 9.940630803235269e-06, + "loss": 0.8499, + "step": 3738 + }, + { + "epoch": 0.20158507655811947, + "grad_norm": 0.8492094874382019, + "learning_rate": 9.9405982251674e-06, + "loss": 0.7006, + "step": 3739 + }, + { + "epoch": 0.201638990726763, + "grad_norm": 0.6991918683052063, + "learning_rate": 9.940565638217008e-06, + "loss": 0.73, + "step": 3740 + }, + { + "epoch": 0.2016929048954065, + "grad_norm": 0.8373433947563171, + "learning_rate": 9.940533042384142e-06, + "loss": 0.8514, + "step": 3741 + }, + { + "epoch": 0.20174681906405004, + "grad_norm": 0.8045080304145813, + "learning_rate": 9.940500437668864e-06, + "loss": 0.7678, + "step": 3742 + }, + { + "epoch": 0.20180073323269354, + "grad_norm": 0.8632493019104004, + "learning_rate": 9.940467824071233e-06, + "loss": 0.8541, + "step": 3743 + }, + { + "epoch": 0.20185464740133707, + "grad_norm": 0.8510474562644958, + "learning_rate": 9.940435201591307e-06, + "loss": 0.8124, + "step": 3744 + }, + { + "epoch": 0.2019085615699806, + "grad_norm": 
0.8647206425666809, + "learning_rate": 9.940402570229144e-06, + "loss": 0.8553, + "step": 3745 + }, + { + "epoch": 0.2019624757386241, + "grad_norm": 0.8359355330467224, + "learning_rate": 9.940369929984804e-06, + "loss": 0.7459, + "step": 3746 + }, + { + "epoch": 0.20201638990726764, + "grad_norm": 0.7150790691375732, + "learning_rate": 9.940337280858346e-06, + "loss": 0.7155, + "step": 3747 + }, + { + "epoch": 0.20207030407591114, + "grad_norm": 0.8442468047142029, + "learning_rate": 9.940304622849826e-06, + "loss": 0.8139, + "step": 3748 + }, + { + "epoch": 0.20212421824455468, + "grad_norm": 0.8318220973014832, + "learning_rate": 9.940271955959307e-06, + "loss": 0.7255, + "step": 3749 + }, + { + "epoch": 0.20217813241319818, + "grad_norm": 0.802943229675293, + "learning_rate": 9.940239280186842e-06, + "loss": 0.7781, + "step": 3750 + }, + { + "epoch": 0.2022320465818417, + "grad_norm": 0.7529780268669128, + "learning_rate": 9.940206595532497e-06, + "loss": 0.7723, + "step": 3751 + }, + { + "epoch": 0.20228596075048522, + "grad_norm": 0.748574435710907, + "learning_rate": 9.940173901996325e-06, + "loss": 0.7911, + "step": 3752 + }, + { + "epoch": 0.20233987491912875, + "grad_norm": 0.800564706325531, + "learning_rate": 9.940141199578386e-06, + "loss": 0.7973, + "step": 3753 + }, + { + "epoch": 0.20239378908777228, + "grad_norm": 0.7890446186065674, + "learning_rate": 9.940108488278741e-06, + "loss": 0.8618, + "step": 3754 + }, + { + "epoch": 0.20244770325641578, + "grad_norm": 0.8168792128562927, + "learning_rate": 9.940075768097445e-06, + "loss": 0.7948, + "step": 3755 + }, + { + "epoch": 0.20250161742505932, + "grad_norm": 0.7742816209793091, + "learning_rate": 9.940043039034562e-06, + "loss": 0.8215, + "step": 3756 + }, + { + "epoch": 0.20255553159370282, + "grad_norm": 0.7921069860458374, + "learning_rate": 9.940010301090147e-06, + "loss": 0.7379, + "step": 3757 + }, + { + "epoch": 0.20260944576234635, + "grad_norm": 0.7375590205192566, + "learning_rate": 
9.939977554264258e-06, + "loss": 0.7829, + "step": 3758 + }, + { + "epoch": 0.20266335993098986, + "grad_norm": 0.8653424382209778, + "learning_rate": 9.939944798556955e-06, + "loss": 0.8414, + "step": 3759 + }, + { + "epoch": 0.2027172740996334, + "grad_norm": 0.862486720085144, + "learning_rate": 9.9399120339683e-06, + "loss": 0.8531, + "step": 3760 + }, + { + "epoch": 0.2027711882682769, + "grad_norm": 0.737153947353363, + "learning_rate": 9.93987926049835e-06, + "loss": 0.8192, + "step": 3761 + }, + { + "epoch": 0.20282510243692042, + "grad_norm": 0.8391088843345642, + "learning_rate": 9.93984647814716e-06, + "loss": 0.7868, + "step": 3762 + }, + { + "epoch": 0.20287901660556396, + "grad_norm": 0.7767393589019775, + "learning_rate": 9.939813686914794e-06, + "loss": 0.7491, + "step": 3763 + }, + { + "epoch": 0.20293293077420746, + "grad_norm": 0.8916594982147217, + "learning_rate": 9.93978088680131e-06, + "loss": 0.7636, + "step": 3764 + }, + { + "epoch": 0.202986844942851, + "grad_norm": 0.8313565254211426, + "learning_rate": 9.939748077806766e-06, + "loss": 0.777, + "step": 3765 + }, + { + "epoch": 0.2030407591114945, + "grad_norm": 1.0501350164413452, + "learning_rate": 9.93971525993122e-06, + "loss": 0.8355, + "step": 3766 + }, + { + "epoch": 0.20309467328013803, + "grad_norm": 0.8451823592185974, + "learning_rate": 9.939682433174733e-06, + "loss": 0.7977, + "step": 3767 + }, + { + "epoch": 0.20314858744878153, + "grad_norm": 0.7655192613601685, + "learning_rate": 9.939649597537363e-06, + "loss": 0.803, + "step": 3768 + }, + { + "epoch": 0.20320250161742506, + "grad_norm": 0.885886549949646, + "learning_rate": 9.939616753019169e-06, + "loss": 0.7001, + "step": 3769 + }, + { + "epoch": 0.20325641578606857, + "grad_norm": 0.7583027482032776, + "learning_rate": 9.939583899620211e-06, + "loss": 0.7477, + "step": 3770 + }, + { + "epoch": 0.2033103299547121, + "grad_norm": 0.7712547779083252, + "learning_rate": 9.939551037340546e-06, + "loss": 0.7587, + "step": 
3771 + }, + { + "epoch": 0.20336424412335563, + "grad_norm": 0.8146941065788269, + "learning_rate": 9.939518166180235e-06, + "loss": 0.8707, + "step": 3772 + }, + { + "epoch": 0.20341815829199913, + "grad_norm": 0.813261866569519, + "learning_rate": 9.939485286139338e-06, + "loss": 0.793, + "step": 3773 + }, + { + "epoch": 0.20347207246064267, + "grad_norm": 0.8719590306282043, + "learning_rate": 9.93945239721791e-06, + "loss": 0.7468, + "step": 3774 + }, + { + "epoch": 0.20352598662928617, + "grad_norm": 0.7224612236022949, + "learning_rate": 9.939419499416015e-06, + "loss": 0.7042, + "step": 3775 + }, + { + "epoch": 0.2035799007979297, + "grad_norm": 0.9211709499359131, + "learning_rate": 9.939386592733709e-06, + "loss": 0.859, + "step": 3776 + }, + { + "epoch": 0.2036338149665732, + "grad_norm": 0.7238151431083679, + "learning_rate": 9.939353677171054e-06, + "loss": 0.7656, + "step": 3777 + }, + { + "epoch": 0.20368772913521674, + "grad_norm": 0.7677724957466125, + "learning_rate": 9.939320752728105e-06, + "loss": 0.7827, + "step": 3778 + }, + { + "epoch": 0.20374164330386024, + "grad_norm": 0.8721383213996887, + "learning_rate": 9.939287819404924e-06, + "loss": 0.8731, + "step": 3779 + }, + { + "epoch": 0.20379555747250377, + "grad_norm": 0.815819501876831, + "learning_rate": 9.93925487720157e-06, + "loss": 0.8022, + "step": 3780 + }, + { + "epoch": 0.2038494716411473, + "grad_norm": 0.7322037816047668, + "learning_rate": 9.939221926118102e-06, + "loss": 0.7593, + "step": 3781 + }, + { + "epoch": 0.2039033858097908, + "grad_norm": 0.787909984588623, + "learning_rate": 9.939188966154577e-06, + "loss": 0.6648, + "step": 3782 + }, + { + "epoch": 0.20395729997843434, + "grad_norm": 0.8070237636566162, + "learning_rate": 9.93915599731106e-06, + "loss": 0.8062, + "step": 3783 + }, + { + "epoch": 0.20401121414707785, + "grad_norm": 0.8590712547302246, + "learning_rate": 9.939123019587604e-06, + "loss": 0.7494, + "step": 3784 + }, + { + "epoch": 0.20406512831572138, + 
"grad_norm": 0.7172074317932129, + "learning_rate": 9.939090032984271e-06, + "loss": 0.7305, + "step": 3785 + }, + { + "epoch": 0.20411904248436488, + "grad_norm": 0.7950757145881653, + "learning_rate": 9.93905703750112e-06, + "loss": 0.8096, + "step": 3786 + }, + { + "epoch": 0.20417295665300841, + "grad_norm": 0.76169353723526, + "learning_rate": 9.939024033138212e-06, + "loss": 0.856, + "step": 3787 + }, + { + "epoch": 0.20422687082165192, + "grad_norm": 0.7239205241203308, + "learning_rate": 9.938991019895606e-06, + "loss": 0.7194, + "step": 3788 + }, + { + "epoch": 0.20428078499029545, + "grad_norm": 0.9215821027755737, + "learning_rate": 9.938957997773358e-06, + "loss": 0.9972, + "step": 3789 + }, + { + "epoch": 0.20433469915893898, + "grad_norm": 0.6955212950706482, + "learning_rate": 9.93892496677153e-06, + "loss": 0.7032, + "step": 3790 + }, + { + "epoch": 0.20438861332758249, + "grad_norm": 0.7507944107055664, + "learning_rate": 9.938891926890181e-06, + "loss": 0.6664, + "step": 3791 + }, + { + "epoch": 0.20444252749622602, + "grad_norm": 0.8046016097068787, + "learning_rate": 9.938858878129372e-06, + "loss": 0.8034, + "step": 3792 + }, + { + "epoch": 0.20449644166486952, + "grad_norm": 0.7906206250190735, + "learning_rate": 9.938825820489158e-06, + "loss": 0.745, + "step": 3793 + }, + { + "epoch": 0.20455035583351305, + "grad_norm": 0.820650577545166, + "learning_rate": 9.938792753969604e-06, + "loss": 0.8695, + "step": 3794 + }, + { + "epoch": 0.20460427000215656, + "grad_norm": 0.7975518107414246, + "learning_rate": 9.938759678570766e-06, + "loss": 0.7835, + "step": 3795 + }, + { + "epoch": 0.2046581841708001, + "grad_norm": 0.7118270993232727, + "learning_rate": 9.938726594292703e-06, + "loss": 0.6961, + "step": 3796 + }, + { + "epoch": 0.2047120983394436, + "grad_norm": 0.7525848746299744, + "learning_rate": 9.938693501135477e-06, + "loss": 0.7714, + "step": 3797 + }, + { + "epoch": 0.20476601250808713, + "grad_norm": 0.7751832604408264, + 
"learning_rate": 9.938660399099145e-06, + "loss": 0.8213, + "step": 3798 + }, + { + "epoch": 0.20481992667673066, + "grad_norm": 0.7307599186897278, + "learning_rate": 9.938627288183769e-06, + "loss": 0.7164, + "step": 3799 + }, + { + "epoch": 0.20487384084537416, + "grad_norm": 0.7432039380073547, + "learning_rate": 9.938594168389406e-06, + "loss": 0.8215, + "step": 3800 + }, + { + "epoch": 0.2049277550140177, + "grad_norm": 0.8611830472946167, + "learning_rate": 9.938561039716116e-06, + "loss": 0.829, + "step": 3801 + }, + { + "epoch": 0.2049816691826612, + "grad_norm": 0.8893013596534729, + "learning_rate": 9.93852790216396e-06, + "loss": 0.7413, + "step": 3802 + }, + { + "epoch": 0.20503558335130473, + "grad_norm": 0.7722970843315125, + "learning_rate": 9.938494755732999e-06, + "loss": 0.7398, + "step": 3803 + }, + { + "epoch": 0.20508949751994823, + "grad_norm": 0.762994110584259, + "learning_rate": 9.938461600423289e-06, + "loss": 0.8195, + "step": 3804 + }, + { + "epoch": 0.20514341168859176, + "grad_norm": 0.7434782981872559, + "learning_rate": 9.938428436234891e-06, + "loss": 0.7917, + "step": 3805 + }, + { + "epoch": 0.20519732585723527, + "grad_norm": 0.7441586852073669, + "learning_rate": 9.938395263167866e-06, + "loss": 0.7852, + "step": 3806 + }, + { + "epoch": 0.2052512400258788, + "grad_norm": 0.7333529591560364, + "learning_rate": 9.93836208122227e-06, + "loss": 0.6912, + "step": 3807 + }, + { + "epoch": 0.20530515419452233, + "grad_norm": 0.8772805333137512, + "learning_rate": 9.938328890398167e-06, + "loss": 0.7828, + "step": 3808 + }, + { + "epoch": 0.20535906836316584, + "grad_norm": 0.7632616758346558, + "learning_rate": 9.938295690695614e-06, + "loss": 0.7235, + "step": 3809 + }, + { + "epoch": 0.20541298253180937, + "grad_norm": 0.7122440338134766, + "learning_rate": 9.93826248211467e-06, + "loss": 0.7734, + "step": 3810 + }, + { + "epoch": 0.20546689670045287, + "grad_norm": 0.7449793815612793, + "learning_rate": 9.938229264655399e-06, + 
"loss": 0.6826, + "step": 3811 + }, + { + "epoch": 0.2055208108690964, + "grad_norm": 0.7615137696266174, + "learning_rate": 9.938196038317856e-06, + "loss": 0.8139, + "step": 3812 + }, + { + "epoch": 0.2055747250377399, + "grad_norm": 0.7921400666236877, + "learning_rate": 9.938162803102102e-06, + "loss": 0.8424, + "step": 3813 + }, + { + "epoch": 0.20562863920638344, + "grad_norm": 0.8665443062782288, + "learning_rate": 9.938129559008198e-06, + "loss": 0.665, + "step": 3814 + }, + { + "epoch": 0.20568255337502694, + "grad_norm": 0.7882665991783142, + "learning_rate": 9.938096306036202e-06, + "loss": 0.8162, + "step": 3815 + }, + { + "epoch": 0.20573646754367048, + "grad_norm": 0.7418076395988464, + "learning_rate": 9.938063044186176e-06, + "loss": 0.7629, + "step": 3816 + }, + { + "epoch": 0.205790381712314, + "grad_norm": 0.8741267919540405, + "learning_rate": 9.93802977345818e-06, + "loss": 0.851, + "step": 3817 + }, + { + "epoch": 0.2058442958809575, + "grad_norm": 0.7862716913223267, + "learning_rate": 9.937996493852271e-06, + "loss": 0.7542, + "step": 3818 + }, + { + "epoch": 0.20589821004960104, + "grad_norm": 0.8344624042510986, + "learning_rate": 9.937963205368509e-06, + "loss": 0.7366, + "step": 3819 + }, + { + "epoch": 0.20595212421824455, + "grad_norm": 0.9976859092712402, + "learning_rate": 9.937929908006957e-06, + "loss": 0.9252, + "step": 3820 + }, + { + "epoch": 0.20600603838688808, + "grad_norm": 0.8346890807151794, + "learning_rate": 9.937896601767672e-06, + "loss": 0.8172, + "step": 3821 + }, + { + "epoch": 0.20605995255553158, + "grad_norm": 0.8109154105186462, + "learning_rate": 9.937863286650715e-06, + "loss": 0.8869, + "step": 3822 + }, + { + "epoch": 0.20611386672417512, + "grad_norm": 0.7664018869400024, + "learning_rate": 9.937829962656147e-06, + "loss": 0.7821, + "step": 3823 + }, + { + "epoch": 0.20616778089281862, + "grad_norm": 0.9373911619186401, + "learning_rate": 9.937796629784025e-06, + "loss": 0.7391, + "step": 3824 + }, + { + 
"epoch": 0.20622169506146215, + "grad_norm": 0.7312552332878113, + "learning_rate": 9.937763288034411e-06, + "loss": 0.7328, + "step": 3825 + }, + { + "epoch": 0.20627560923010568, + "grad_norm": 0.9266682863235474, + "learning_rate": 9.937729937407365e-06, + "loss": 0.8976, + "step": 3826 + }, + { + "epoch": 0.2063295233987492, + "grad_norm": 0.7579758763313293, + "learning_rate": 9.937696577902947e-06, + "loss": 0.766, + "step": 3827 + }, + { + "epoch": 0.20638343756739272, + "grad_norm": 0.8648816347122192, + "learning_rate": 9.937663209521216e-06, + "loss": 0.9122, + "step": 3828 + }, + { + "epoch": 0.20643735173603622, + "grad_norm": 0.8788310289382935, + "learning_rate": 9.937629832262231e-06, + "loss": 0.9152, + "step": 3829 + }, + { + "epoch": 0.20649126590467975, + "grad_norm": 0.8865007162094116, + "learning_rate": 9.937596446126057e-06, + "loss": 0.8767, + "step": 3830 + }, + { + "epoch": 0.20654518007332326, + "grad_norm": 0.7323981523513794, + "learning_rate": 9.937563051112748e-06, + "loss": 0.7733, + "step": 3831 + }, + { + "epoch": 0.2065990942419668, + "grad_norm": 0.8782559037208557, + "learning_rate": 9.937529647222368e-06, + "loss": 0.7694, + "step": 3832 + }, + { + "epoch": 0.2066530084106103, + "grad_norm": 0.8300665020942688, + "learning_rate": 9.937496234454974e-06, + "loss": 0.8386, + "step": 3833 + }, + { + "epoch": 0.20670692257925383, + "grad_norm": 0.8438191413879395, + "learning_rate": 9.937462812810628e-06, + "loss": 0.7394, + "step": 3834 + }, + { + "epoch": 0.20676083674789736, + "grad_norm": 0.7255253195762634, + "learning_rate": 9.937429382289391e-06, + "loss": 0.6973, + "step": 3835 + }, + { + "epoch": 0.20681475091654086, + "grad_norm": 0.8600755929946899, + "learning_rate": 9.93739594289132e-06, + "loss": 0.838, + "step": 3836 + }, + { + "epoch": 0.2068686650851844, + "grad_norm": 0.788693904876709, + "learning_rate": 9.937362494616479e-06, + "loss": 0.7911, + "step": 3837 + }, + { + "epoch": 0.2069225792538279, + "grad_norm": 
0.808438777923584, + "learning_rate": 9.937329037464924e-06, + "loss": 0.7802, + "step": 3838 + }, + { + "epoch": 0.20697649342247143, + "grad_norm": 0.9273937344551086, + "learning_rate": 9.937295571436719e-06, + "loss": 0.8589, + "step": 3839 + }, + { + "epoch": 0.20703040759111493, + "grad_norm": 0.7375195026397705, + "learning_rate": 9.937262096531922e-06, + "loss": 0.7017, + "step": 3840 + }, + { + "epoch": 0.20708432175975847, + "grad_norm": 0.7502869963645935, + "learning_rate": 9.937228612750594e-06, + "loss": 0.7577, + "step": 3841 + }, + { + "epoch": 0.20713823592840197, + "grad_norm": 0.8005609512329102, + "learning_rate": 9.937195120092794e-06, + "loss": 0.7411, + "step": 3842 + }, + { + "epoch": 0.2071921500970455, + "grad_norm": 0.8089357018470764, + "learning_rate": 9.937161618558583e-06, + "loss": 0.8149, + "step": 3843 + }, + { + "epoch": 0.20724606426568903, + "grad_norm": 0.946266233921051, + "learning_rate": 9.937128108148022e-06, + "loss": 0.8676, + "step": 3844 + }, + { + "epoch": 0.20729997843433254, + "grad_norm": 0.793250322341919, + "learning_rate": 9.937094588861171e-06, + "loss": 0.8402, + "step": 3845 + }, + { + "epoch": 0.20735389260297607, + "grad_norm": 0.9192420244216919, + "learning_rate": 9.937061060698088e-06, + "loss": 0.8381, + "step": 3846 + }, + { + "epoch": 0.20740780677161957, + "grad_norm": 0.7944622039794922, + "learning_rate": 9.937027523658838e-06, + "loss": 0.8281, + "step": 3847 + }, + { + "epoch": 0.2074617209402631, + "grad_norm": 0.8567733764648438, + "learning_rate": 9.936993977743476e-06, + "loss": 0.7528, + "step": 3848 + }, + { + "epoch": 0.2075156351089066, + "grad_norm": 0.9478929042816162, + "learning_rate": 9.936960422952064e-06, + "loss": 0.6957, + "step": 3849 + }, + { + "epoch": 0.20756954927755014, + "grad_norm": 0.8856588006019592, + "learning_rate": 9.936926859284665e-06, + "loss": 0.9112, + "step": 3850 + }, + { + "epoch": 0.20762346344619367, + "grad_norm": 0.8800935745239258, + "learning_rate": 
9.936893286741336e-06, + "loss": 0.7313, + "step": 3851 + }, + { + "epoch": 0.20767737761483718, + "grad_norm": 0.773314893245697, + "learning_rate": 9.936859705322139e-06, + "loss": 0.7953, + "step": 3852 + }, + { + "epoch": 0.2077312917834807, + "grad_norm": 0.8045309782028198, + "learning_rate": 9.936826115027136e-06, + "loss": 0.7789, + "step": 3853 + }, + { + "epoch": 0.2077852059521242, + "grad_norm": 0.7337809801101685, + "learning_rate": 9.936792515856383e-06, + "loss": 0.7471, + "step": 3854 + }, + { + "epoch": 0.20783912012076775, + "grad_norm": 0.7467783093452454, + "learning_rate": 9.936758907809944e-06, + "loss": 0.746, + "step": 3855 + }, + { + "epoch": 0.20789303428941125, + "grad_norm": 0.896782398223877, + "learning_rate": 9.936725290887878e-06, + "loss": 0.8753, + "step": 3856 + }, + { + "epoch": 0.20794694845805478, + "grad_norm": 0.7642794251441956, + "learning_rate": 9.936691665090246e-06, + "loss": 0.744, + "step": 3857 + }, + { + "epoch": 0.20800086262669829, + "grad_norm": 0.9514477849006653, + "learning_rate": 9.936658030417108e-06, + "loss": 0.9586, + "step": 3858 + }, + { + "epoch": 0.20805477679534182, + "grad_norm": 0.8868480324745178, + "learning_rate": 9.936624386868524e-06, + "loss": 0.7381, + "step": 3859 + }, + { + "epoch": 0.20810869096398535, + "grad_norm": 0.7855881452560425, + "learning_rate": 9.936590734444555e-06, + "loss": 0.7942, + "step": 3860 + }, + { + "epoch": 0.20816260513262885, + "grad_norm": 0.7549954056739807, + "learning_rate": 9.936557073145264e-06, + "loss": 0.8478, + "step": 3861 + }, + { + "epoch": 0.20821651930127238, + "grad_norm": 0.7425951361656189, + "learning_rate": 9.936523402970707e-06, + "loss": 0.7854, + "step": 3862 + }, + { + "epoch": 0.2082704334699159, + "grad_norm": 0.7873994708061218, + "learning_rate": 9.936489723920947e-06, + "loss": 0.6917, + "step": 3863 + }, + { + "epoch": 0.20832434763855942, + "grad_norm": 0.7681507468223572, + "learning_rate": 9.936456035996044e-06, + "loss": 0.7427, + 
"step": 3864 + }, + { + "epoch": 0.20837826180720292, + "grad_norm": 0.8043473362922668, + "learning_rate": 9.93642233919606e-06, + "loss": 0.7319, + "step": 3865 + }, + { + "epoch": 0.20843217597584646, + "grad_norm": 0.9194585084915161, + "learning_rate": 9.936388633521055e-06, + "loss": 0.755, + "step": 3866 + }, + { + "epoch": 0.20848609014448996, + "grad_norm": 0.7365962862968445, + "learning_rate": 9.936354918971087e-06, + "loss": 0.7855, + "step": 3867 + }, + { + "epoch": 0.2085400043131335, + "grad_norm": 0.8254776000976562, + "learning_rate": 9.936321195546218e-06, + "loss": 0.7854, + "step": 3868 + }, + { + "epoch": 0.20859391848177702, + "grad_norm": 0.8259122967720032, + "learning_rate": 9.936287463246513e-06, + "loss": 0.8759, + "step": 3869 + }, + { + "epoch": 0.20864783265042053, + "grad_norm": 0.731363832950592, + "learning_rate": 9.936253722072026e-06, + "loss": 0.76, + "step": 3870 + }, + { + "epoch": 0.20870174681906406, + "grad_norm": 0.908054530620575, + "learning_rate": 9.93621997202282e-06, + "loss": 0.8865, + "step": 3871 + }, + { + "epoch": 0.20875566098770756, + "grad_norm": 0.7576562166213989, + "learning_rate": 9.936186213098958e-06, + "loss": 0.8276, + "step": 3872 + }, + { + "epoch": 0.2088095751563511, + "grad_norm": 0.8297492861747742, + "learning_rate": 9.9361524453005e-06, + "loss": 0.8799, + "step": 3873 + }, + { + "epoch": 0.2088634893249946, + "grad_norm": 0.7945959568023682, + "learning_rate": 9.936118668627502e-06, + "loss": 0.8448, + "step": 3874 + }, + { + "epoch": 0.20891740349363813, + "grad_norm": 0.8161780834197998, + "learning_rate": 9.936084883080031e-06, + "loss": 0.8835, + "step": 3875 + }, + { + "epoch": 0.20897131766228164, + "grad_norm": 0.768398106098175, + "learning_rate": 9.936051088658145e-06, + "loss": 0.7984, + "step": 3876 + }, + { + "epoch": 0.20902523183092517, + "grad_norm": 0.8847882151603699, + "learning_rate": 9.936017285361903e-06, + "loss": 0.8757, + "step": 3877 + }, + { + "epoch": 
0.2090791459995687, + "grad_norm": 0.8796868324279785, + "learning_rate": 9.93598347319137e-06, + "loss": 0.8227, + "step": 3878 + }, + { + "epoch": 0.2091330601682122, + "grad_norm": 0.8362753987312317, + "learning_rate": 9.935949652146604e-06, + "loss": 0.7892, + "step": 3879 + }, + { + "epoch": 0.20918697433685574, + "grad_norm": 1.0995301008224487, + "learning_rate": 9.935915822227664e-06, + "loss": 0.7227, + "step": 3880 + }, + { + "epoch": 0.20924088850549924, + "grad_norm": 0.7771546244621277, + "learning_rate": 9.935881983434616e-06, + "loss": 0.8025, + "step": 3881 + }, + { + "epoch": 0.20929480267414277, + "grad_norm": 0.8586302995681763, + "learning_rate": 9.935848135767516e-06, + "loss": 0.7086, + "step": 3882 + }, + { + "epoch": 0.20934871684278628, + "grad_norm": 0.956278920173645, + "learning_rate": 9.935814279226428e-06, + "loss": 0.8625, + "step": 3883 + }, + { + "epoch": 0.2094026310114298, + "grad_norm": 0.8021535277366638, + "learning_rate": 9.935780413811412e-06, + "loss": 0.8392, + "step": 3884 + }, + { + "epoch": 0.2094565451800733, + "grad_norm": 0.7699674367904663, + "learning_rate": 9.935746539522526e-06, + "loss": 0.8322, + "step": 3885 + }, + { + "epoch": 0.20951045934871684, + "grad_norm": 0.7814954519271851, + "learning_rate": 9.935712656359835e-06, + "loss": 0.9123, + "step": 3886 + }, + { + "epoch": 0.20956437351736037, + "grad_norm": 0.7062190175056458, + "learning_rate": 9.935678764323397e-06, + "loss": 0.7398, + "step": 3887 + }, + { + "epoch": 0.20961828768600388, + "grad_norm": 0.8294083476066589, + "learning_rate": 9.935644863413276e-06, + "loss": 0.8381, + "step": 3888 + }, + { + "epoch": 0.2096722018546474, + "grad_norm": 0.779521107673645, + "learning_rate": 9.93561095362953e-06, + "loss": 0.7838, + "step": 3889 + }, + { + "epoch": 0.20972611602329091, + "grad_norm": 0.894511878490448, + "learning_rate": 9.935577034972224e-06, + "loss": 0.7278, + "step": 3890 + }, + { + "epoch": 0.20978003019193445, + "grad_norm": 
0.6891781091690063, + "learning_rate": 9.935543107441414e-06, + "loss": 0.6854, + "step": 3891 + }, + { + "epoch": 0.20983394436057795, + "grad_norm": 0.7697615623474121, + "learning_rate": 9.935509171037161e-06, + "loss": 0.7901, + "step": 3892 + }, + { + "epoch": 0.20988785852922148, + "grad_norm": 0.7699109315872192, + "learning_rate": 9.935475225759532e-06, + "loss": 0.7982, + "step": 3893 + }, + { + "epoch": 0.209941772697865, + "grad_norm": 0.7885197401046753, + "learning_rate": 9.93544127160858e-06, + "loss": 0.7911, + "step": 3894 + }, + { + "epoch": 0.20999568686650852, + "grad_norm": 0.7754570245742798, + "learning_rate": 9.935407308584374e-06, + "loss": 0.6886, + "step": 3895 + }, + { + "epoch": 0.21004960103515205, + "grad_norm": 0.8235013484954834, + "learning_rate": 9.935373336686971e-06, + "loss": 0.845, + "step": 3896 + }, + { + "epoch": 0.21010351520379555, + "grad_norm": 0.7366604208946228, + "learning_rate": 9.93533935591643e-06, + "loss": 0.7499, + "step": 3897 + }, + { + "epoch": 0.2101574293724391, + "grad_norm": 0.7987866401672363, + "learning_rate": 9.935305366272816e-06, + "loss": 0.7866, + "step": 3898 + }, + { + "epoch": 0.2102113435410826, + "grad_norm": 0.8240886926651001, + "learning_rate": 9.93527136775619e-06, + "loss": 0.8027, + "step": 3899 + }, + { + "epoch": 0.21026525770972612, + "grad_norm": 0.7460751533508301, + "learning_rate": 9.93523736036661e-06, + "loss": 0.7517, + "step": 3900 + }, + { + "epoch": 0.21031917187836963, + "grad_norm": 0.7845814228057861, + "learning_rate": 9.935203344104139e-06, + "loss": 0.7533, + "step": 3901 + }, + { + "epoch": 0.21037308604701316, + "grad_norm": 0.7805215120315552, + "learning_rate": 9.935169318968838e-06, + "loss": 0.7034, + "step": 3902 + }, + { + "epoch": 0.21042700021565666, + "grad_norm": 0.7909711003303528, + "learning_rate": 9.935135284960769e-06, + "loss": 0.8253, + "step": 3903 + }, + { + "epoch": 0.2104809143843002, + "grad_norm": 0.7670220136642456, + "learning_rate": 
9.93510124207999e-06, + "loss": 0.8114, + "step": 3904 + }, + { + "epoch": 0.21053482855294373, + "grad_norm": 0.7751194834709167, + "learning_rate": 9.935067190326566e-06, + "loss": 0.875, + "step": 3905 + }, + { + "epoch": 0.21058874272158723, + "grad_norm": 0.9303408265113831, + "learning_rate": 9.935033129700557e-06, + "loss": 0.9104, + "step": 3906 + }, + { + "epoch": 0.21064265689023076, + "grad_norm": 0.786558210849762, + "learning_rate": 9.934999060202024e-06, + "loss": 0.7453, + "step": 3907 + }, + { + "epoch": 0.21069657105887427, + "grad_norm": 0.8450469970703125, + "learning_rate": 9.934964981831028e-06, + "loss": 0.9733, + "step": 3908 + }, + { + "epoch": 0.2107504852275178, + "grad_norm": 0.8045774698257446, + "learning_rate": 9.93493089458763e-06, + "loss": 0.6763, + "step": 3909 + }, + { + "epoch": 0.2108043993961613, + "grad_norm": 0.7320234775543213, + "learning_rate": 9.934896798471894e-06, + "loss": 0.7668, + "step": 3910 + }, + { + "epoch": 0.21085831356480483, + "grad_norm": 0.8155072331428528, + "learning_rate": 9.934862693483878e-06, + "loss": 0.8186, + "step": 3911 + }, + { + "epoch": 0.21091222773344834, + "grad_norm": 0.7914832234382629, + "learning_rate": 9.934828579623643e-06, + "loss": 0.7977, + "step": 3912 + }, + { + "epoch": 0.21096614190209187, + "grad_norm": 0.7110108733177185, + "learning_rate": 9.934794456891254e-06, + "loss": 0.6576, + "step": 3913 + }, + { + "epoch": 0.2110200560707354, + "grad_norm": 1.0787992477416992, + "learning_rate": 9.934760325286768e-06, + "loss": 0.871, + "step": 3914 + }, + { + "epoch": 0.2110739702393789, + "grad_norm": 0.798880934715271, + "learning_rate": 9.93472618481025e-06, + "loss": 0.7115, + "step": 3915 + }, + { + "epoch": 0.21112788440802244, + "grad_norm": 0.945782482624054, + "learning_rate": 9.934692035461759e-06, + "loss": 0.7806, + "step": 3916 + }, + { + "epoch": 0.21118179857666594, + "grad_norm": 0.8860074877738953, + "learning_rate": 9.934657877241358e-06, + "loss": 0.735, + 
"step": 3917 + }, + { + "epoch": 0.21123571274530947, + "grad_norm": 0.7661596536636353, + "learning_rate": 9.934623710149107e-06, + "loss": 0.747, + "step": 3918 + }, + { + "epoch": 0.21128962691395298, + "grad_norm": 0.7670447826385498, + "learning_rate": 9.934589534185068e-06, + "loss": 0.7366, + "step": 3919 + }, + { + "epoch": 0.2113435410825965, + "grad_norm": 0.7264759540557861, + "learning_rate": 9.934555349349305e-06, + "loss": 0.7353, + "step": 3920 + }, + { + "epoch": 0.21139745525124, + "grad_norm": 0.7623618841171265, + "learning_rate": 9.934521155641874e-06, + "loss": 0.7758, + "step": 3921 + }, + { + "epoch": 0.21145136941988354, + "grad_norm": 0.6979674100875854, + "learning_rate": 9.93448695306284e-06, + "loss": 0.7376, + "step": 3922 + }, + { + "epoch": 0.21150528358852708, + "grad_norm": 0.7221145033836365, + "learning_rate": 9.934452741612265e-06, + "loss": 0.7918, + "step": 3923 + }, + { + "epoch": 0.21155919775717058, + "grad_norm": 0.7353740930557251, + "learning_rate": 9.934418521290209e-06, + "loss": 0.7487, + "step": 3924 + }, + { + "epoch": 0.2116131119258141, + "grad_norm": 0.8132720589637756, + "learning_rate": 9.934384292096734e-06, + "loss": 0.8121, + "step": 3925 + }, + { + "epoch": 0.21166702609445762, + "grad_norm": 0.8918466567993164, + "learning_rate": 9.9343500540319e-06, + "loss": 0.8911, + "step": 3926 + }, + { + "epoch": 0.21172094026310115, + "grad_norm": 0.7636724710464478, + "learning_rate": 9.934315807095774e-06, + "loss": 0.8012, + "step": 3927 + }, + { + "epoch": 0.21177485443174465, + "grad_norm": 0.889636754989624, + "learning_rate": 9.93428155128841e-06, + "loss": 0.7793, + "step": 3928 + }, + { + "epoch": 0.21182876860038818, + "grad_norm": 0.7906842827796936, + "learning_rate": 9.934247286609875e-06, + "loss": 0.7483, + "step": 3929 + }, + { + "epoch": 0.2118826827690317, + "grad_norm": 0.8311534523963928, + "learning_rate": 9.934213013060228e-06, + "loss": 0.8796, + "step": 3930 + }, + { + "epoch": 
0.21193659693767522, + "grad_norm": 0.7643389105796814, + "learning_rate": 9.934178730639531e-06, + "loss": 0.7587, + "step": 3931 + }, + { + "epoch": 0.21199051110631875, + "grad_norm": 0.8276751637458801, + "learning_rate": 9.934144439347849e-06, + "loss": 0.745, + "step": 3932 + }, + { + "epoch": 0.21204442527496226, + "grad_norm": 0.7427680492401123, + "learning_rate": 9.934110139185238e-06, + "loss": 0.7445, + "step": 3933 + }, + { + "epoch": 0.2120983394436058, + "grad_norm": 0.7343453168869019, + "learning_rate": 9.934075830151762e-06, + "loss": 0.7037, + "step": 3934 + }, + { + "epoch": 0.2121522536122493, + "grad_norm": 0.8002830743789673, + "learning_rate": 9.934041512247485e-06, + "loss": 0.8458, + "step": 3935 + }, + { + "epoch": 0.21220616778089282, + "grad_norm": 0.7045907974243164, + "learning_rate": 9.934007185472466e-06, + "loss": 0.7626, + "step": 3936 + }, + { + "epoch": 0.21226008194953633, + "grad_norm": 0.8169815540313721, + "learning_rate": 9.933972849826767e-06, + "loss": 0.8116, + "step": 3937 + }, + { + "epoch": 0.21231399611817986, + "grad_norm": 0.6935508847236633, + "learning_rate": 9.933938505310451e-06, + "loss": 0.7244, + "step": 3938 + }, + { + "epoch": 0.21236791028682336, + "grad_norm": 0.8311216235160828, + "learning_rate": 9.93390415192358e-06, + "loss": 0.8238, + "step": 3939 + }, + { + "epoch": 0.2124218244554669, + "grad_norm": 0.84473717212677, + "learning_rate": 9.933869789666213e-06, + "loss": 0.8334, + "step": 3940 + }, + { + "epoch": 0.21247573862411043, + "grad_norm": 0.7648805379867554, + "learning_rate": 9.933835418538414e-06, + "loss": 0.8705, + "step": 3941 + }, + { + "epoch": 0.21252965279275393, + "grad_norm": 0.752015233039856, + "learning_rate": 9.933801038540245e-06, + "loss": 0.7827, + "step": 3942 + }, + { + "epoch": 0.21258356696139746, + "grad_norm": 0.9639801383018494, + "learning_rate": 9.933766649671765e-06, + "loss": 0.7827, + "step": 3943 + }, + { + "epoch": 0.21263748113004097, + "grad_norm": 
0.7730019092559814, + "learning_rate": 9.933732251933042e-06, + "loss": 0.7868, + "step": 3944 + }, + { + "epoch": 0.2126913952986845, + "grad_norm": 0.8141674995422363, + "learning_rate": 9.93369784532413e-06, + "loss": 0.7699, + "step": 3945 + }, + { + "epoch": 0.212745309467328, + "grad_norm": 0.8050745725631714, + "learning_rate": 9.933663429845097e-06, + "loss": 0.706, + "step": 3946 + }, + { + "epoch": 0.21279922363597154, + "grad_norm": 0.8519124388694763, + "learning_rate": 9.933629005496002e-06, + "loss": 0.8638, + "step": 3947 + }, + { + "epoch": 0.21285313780461504, + "grad_norm": 0.7999953627586365, + "learning_rate": 9.933594572276907e-06, + "loss": 0.7263, + "step": 3948 + }, + { + "epoch": 0.21290705197325857, + "grad_norm": 0.8291010856628418, + "learning_rate": 9.933560130187875e-06, + "loss": 0.8241, + "step": 3949 + }, + { + "epoch": 0.2129609661419021, + "grad_norm": 0.8472279906272888, + "learning_rate": 9.933525679228965e-06, + "loss": 0.8031, + "step": 3950 + }, + { + "epoch": 0.2130148803105456, + "grad_norm": 0.8077083826065063, + "learning_rate": 9.933491219400244e-06, + "loss": 0.7499, + "step": 3951 + }, + { + "epoch": 0.21306879447918914, + "grad_norm": 0.7736468315124512, + "learning_rate": 9.933456750701771e-06, + "loss": 0.7767, + "step": 3952 + }, + { + "epoch": 0.21312270864783264, + "grad_norm": 0.7541413307189941, + "learning_rate": 9.933422273133606e-06, + "loss": 0.7952, + "step": 3953 + }, + { + "epoch": 0.21317662281647617, + "grad_norm": 0.8432198762893677, + "learning_rate": 9.933387786695816e-06, + "loss": 0.8618, + "step": 3954 + }, + { + "epoch": 0.21323053698511968, + "grad_norm": 0.9090738296508789, + "learning_rate": 9.933353291388458e-06, + "loss": 0.8484, + "step": 3955 + }, + { + "epoch": 0.2132844511537632, + "grad_norm": 0.7549050450325012, + "learning_rate": 9.933318787211597e-06, + "loss": 0.842, + "step": 3956 + }, + { + "epoch": 0.21333836532240674, + "grad_norm": 0.7340126633644104, + "learning_rate": 
9.933284274165293e-06, + "loss": 0.7253, + "step": 3957 + }, + { + "epoch": 0.21339227949105025, + "grad_norm": 0.7898053526878357, + "learning_rate": 9.933249752249609e-06, + "loss": 0.7364, + "step": 3958 + }, + { + "epoch": 0.21344619365969378, + "grad_norm": 0.7347330451011658, + "learning_rate": 9.933215221464609e-06, + "loss": 0.7613, + "step": 3959 + }, + { + "epoch": 0.21350010782833728, + "grad_norm": 0.7483309507369995, + "learning_rate": 9.933180681810354e-06, + "loss": 0.7351, + "step": 3960 + }, + { + "epoch": 0.21355402199698081, + "grad_norm": 0.8972424864768982, + "learning_rate": 9.933146133286905e-06, + "loss": 0.8067, + "step": 3961 + }, + { + "epoch": 0.21360793616562432, + "grad_norm": 0.9186527729034424, + "learning_rate": 9.933111575894323e-06, + "loss": 0.8375, + "step": 3962 + }, + { + "epoch": 0.21366185033426785, + "grad_norm": 0.7975471019744873, + "learning_rate": 9.933077009632672e-06, + "loss": 0.7288, + "step": 3963 + }, + { + "epoch": 0.21371576450291135, + "grad_norm": 0.8140373229980469, + "learning_rate": 9.933042434502014e-06, + "loss": 0.851, + "step": 3964 + }, + { + "epoch": 0.21376967867155489, + "grad_norm": 0.7657467126846313, + "learning_rate": 9.933007850502412e-06, + "loss": 0.7874, + "step": 3965 + }, + { + "epoch": 0.21382359284019842, + "grad_norm": 0.7267435193061829, + "learning_rate": 9.932973257633927e-06, + "loss": 0.7065, + "step": 3966 + }, + { + "epoch": 0.21387750700884192, + "grad_norm": 0.8350456357002258, + "learning_rate": 9.932938655896622e-06, + "loss": 0.949, + "step": 3967 + }, + { + "epoch": 0.21393142117748545, + "grad_norm": 0.7870462536811829, + "learning_rate": 9.932904045290557e-06, + "loss": 0.754, + "step": 3968 + }, + { + "epoch": 0.21398533534612896, + "grad_norm": 0.9062042236328125, + "learning_rate": 9.932869425815797e-06, + "loss": 0.8169, + "step": 3969 + }, + { + "epoch": 0.2140392495147725, + "grad_norm": 0.7563914656639099, + "learning_rate": 9.932834797472401e-06, + "loss": 0.7848, 
+ "step": 3970 + }, + { + "epoch": 0.214093163683416, + "grad_norm": 0.8287369012832642, + "learning_rate": 9.932800160260437e-06, + "loss": 0.7775, + "step": 3971 + }, + { + "epoch": 0.21414707785205953, + "grad_norm": 0.7961543202400208, + "learning_rate": 9.93276551417996e-06, + "loss": 0.785, + "step": 3972 + }, + { + "epoch": 0.21420099202070303, + "grad_norm": 1.1722525358200073, + "learning_rate": 9.932730859231038e-06, + "loss": 0.8139, + "step": 3973 + }, + { + "epoch": 0.21425490618934656, + "grad_norm": 0.7425355315208435, + "learning_rate": 9.93269619541373e-06, + "loss": 0.8, + "step": 3974 + }, + { + "epoch": 0.2143088203579901, + "grad_norm": 0.7701120376586914, + "learning_rate": 9.9326615227281e-06, + "loss": 0.7766, + "step": 3975 + }, + { + "epoch": 0.2143627345266336, + "grad_norm": 0.7475442886352539, + "learning_rate": 9.932626841174212e-06, + "loss": 0.731, + "step": 3976 + }, + { + "epoch": 0.21441664869527713, + "grad_norm": 0.7970359325408936, + "learning_rate": 9.932592150752122e-06, + "loss": 0.685, + "step": 3977 + }, + { + "epoch": 0.21447056286392063, + "grad_norm": 0.7397587299346924, + "learning_rate": 9.9325574514619e-06, + "loss": 0.7768, + "step": 3978 + }, + { + "epoch": 0.21452447703256416, + "grad_norm": 0.7406956553459167, + "learning_rate": 9.932522743303604e-06, + "loss": 0.7288, + "step": 3979 + }, + { + "epoch": 0.21457839120120767, + "grad_norm": 0.7971269488334656, + "learning_rate": 9.932488026277295e-06, + "loss": 0.8475, + "step": 3980 + }, + { + "epoch": 0.2146323053698512, + "grad_norm": 0.8104044198989868, + "learning_rate": 9.93245330038304e-06, + "loss": 0.8302, + "step": 3981 + }, + { + "epoch": 0.2146862195384947, + "grad_norm": 0.7473177313804626, + "learning_rate": 9.9324185656209e-06, + "loss": 0.7144, + "step": 3982 + }, + { + "epoch": 0.21474013370713824, + "grad_norm": 0.8730058670043945, + "learning_rate": 9.932383821990937e-06, + "loss": 0.7823, + "step": 3983 + }, + { + "epoch": 0.21479404787578177, + 
"grad_norm": 0.7489315271377563, + "learning_rate": 9.93234906949321e-06, + "loss": 0.791, + "step": 3984 + }, + { + "epoch": 0.21484796204442527, + "grad_norm": 0.811970055103302, + "learning_rate": 9.932314308127785e-06, + "loss": 0.7773, + "step": 3985 + }, + { + "epoch": 0.2149018762130688, + "grad_norm": 0.7983556985855103, + "learning_rate": 9.932279537894726e-06, + "loss": 0.8677, + "step": 3986 + }, + { + "epoch": 0.2149557903817123, + "grad_norm": 0.8278135657310486, + "learning_rate": 9.932244758794095e-06, + "loss": 0.8562, + "step": 3987 + }, + { + "epoch": 0.21500970455035584, + "grad_norm": 0.8001466989517212, + "learning_rate": 9.93220997082595e-06, + "loss": 0.7695, + "step": 3988 + }, + { + "epoch": 0.21506361871899934, + "grad_norm": 0.7240970730781555, + "learning_rate": 9.932175173990359e-06, + "loss": 0.7293, + "step": 3989 + }, + { + "epoch": 0.21511753288764288, + "grad_norm": 0.7863660454750061, + "learning_rate": 9.932140368287381e-06, + "loss": 0.8307, + "step": 3990 + }, + { + "epoch": 0.21517144705628638, + "grad_norm": 0.7192577719688416, + "learning_rate": 9.932105553717079e-06, + "loss": 0.7819, + "step": 3991 + }, + { + "epoch": 0.2152253612249299, + "grad_norm": 0.7139109969139099, + "learning_rate": 9.932070730279517e-06, + "loss": 0.7343, + "step": 3992 + }, + { + "epoch": 0.21527927539357344, + "grad_norm": 0.7812891006469727, + "learning_rate": 9.932035897974759e-06, + "loss": 0.8159, + "step": 3993 + }, + { + "epoch": 0.21533318956221695, + "grad_norm": 0.8222309947013855, + "learning_rate": 9.932001056802863e-06, + "loss": 0.7424, + "step": 3994 + }, + { + "epoch": 0.21538710373086048, + "grad_norm": 0.7709689140319824, + "learning_rate": 9.931966206763896e-06, + "loss": 0.7952, + "step": 3995 + }, + { + "epoch": 0.21544101789950398, + "grad_norm": 0.8006699681282043, + "learning_rate": 9.931931347857919e-06, + "loss": 0.8527, + "step": 3996 + }, + { + "epoch": 0.21549493206814752, + "grad_norm": 0.8302900195121765, + 
"learning_rate": 9.931896480084993e-06, + "loss": 0.8531, + "step": 3997 + }, + { + "epoch": 0.21554884623679102, + "grad_norm": 0.7552672028541565, + "learning_rate": 9.931861603445183e-06, + "loss": 0.7589, + "step": 3998 + }, + { + "epoch": 0.21560276040543455, + "grad_norm": 0.7574741244316101, + "learning_rate": 9.931826717938551e-06, + "loss": 0.7806, + "step": 3999 + }, + { + "epoch": 0.21565667457407806, + "grad_norm": 0.9765385389328003, + "learning_rate": 9.93179182356516e-06, + "loss": 0.8503, + "step": 4000 + }, + { + "epoch": 0.2157105887427216, + "grad_norm": 0.8695611953735352, + "learning_rate": 9.931756920325073e-06, + "loss": 0.8484, + "step": 4001 + }, + { + "epoch": 0.21576450291136512, + "grad_norm": 0.9320261478424072, + "learning_rate": 9.931722008218351e-06, + "loss": 0.8019, + "step": 4002 + }, + { + "epoch": 0.21581841708000862, + "grad_norm": 0.7879775762557983, + "learning_rate": 9.931687087245059e-06, + "loss": 0.789, + "step": 4003 + }, + { + "epoch": 0.21587233124865216, + "grad_norm": 0.8338239789009094, + "learning_rate": 9.931652157405258e-06, + "loss": 0.7903, + "step": 4004 + }, + { + "epoch": 0.21592624541729566, + "grad_norm": 0.7812073230743408, + "learning_rate": 9.931617218699011e-06, + "loss": 0.8457, + "step": 4005 + }, + { + "epoch": 0.2159801595859392, + "grad_norm": 0.8999424576759338, + "learning_rate": 9.931582271126384e-06, + "loss": 0.7719, + "step": 4006 + }, + { + "epoch": 0.2160340737545827, + "grad_norm": 0.7390351295471191, + "learning_rate": 9.931547314687434e-06, + "loss": 0.7393, + "step": 4007 + }, + { + "epoch": 0.21608798792322623, + "grad_norm": 0.8604621887207031, + "learning_rate": 9.931512349382228e-06, + "loss": 0.8218, + "step": 4008 + }, + { + "epoch": 0.21614190209186973, + "grad_norm": 0.7581399083137512, + "learning_rate": 9.93147737521083e-06, + "loss": 0.6874, + "step": 4009 + }, + { + "epoch": 0.21619581626051326, + "grad_norm": 0.7431824803352356, + "learning_rate": 9.931442392173298e-06, + 
"loss": 0.7587, + "step": 4010 + }, + { + "epoch": 0.2162497304291568, + "grad_norm": 0.858138382434845, + "learning_rate": 9.931407400269699e-06, + "loss": 0.8672, + "step": 4011 + }, + { + "epoch": 0.2163036445978003, + "grad_norm": 0.7675254940986633, + "learning_rate": 9.931372399500094e-06, + "loss": 0.7608, + "step": 4012 + }, + { + "epoch": 0.21635755876644383, + "grad_norm": 0.8220716714859009, + "learning_rate": 9.931337389864546e-06, + "loss": 0.7495, + "step": 4013 + }, + { + "epoch": 0.21641147293508733, + "grad_norm": 0.8696985244750977, + "learning_rate": 9.93130237136312e-06, + "loss": 0.8872, + "step": 4014 + }, + { + "epoch": 0.21646538710373087, + "grad_norm": 0.8657988905906677, + "learning_rate": 9.931267343995878e-06, + "loss": 0.7733, + "step": 4015 + }, + { + "epoch": 0.21651930127237437, + "grad_norm": 0.7498238682746887, + "learning_rate": 9.93123230776288e-06, + "loss": 0.8208, + "step": 4016 + }, + { + "epoch": 0.2165732154410179, + "grad_norm": 0.8726654648780823, + "learning_rate": 9.931197262664193e-06, + "loss": 0.7924, + "step": 4017 + }, + { + "epoch": 0.2166271296096614, + "grad_norm": 0.7092527747154236, + "learning_rate": 9.931162208699879e-06, + "loss": 0.7351, + "step": 4018 + }, + { + "epoch": 0.21668104377830494, + "grad_norm": 0.7181721329689026, + "learning_rate": 9.931127145869998e-06, + "loss": 0.719, + "step": 4019 + }, + { + "epoch": 0.21673495794694847, + "grad_norm": 0.7992464303970337, + "learning_rate": 9.931092074174618e-06, + "loss": 0.5935, + "step": 4020 + }, + { + "epoch": 0.21678887211559197, + "grad_norm": 0.8293359279632568, + "learning_rate": 9.931056993613796e-06, + "loss": 0.8331, + "step": 4021 + }, + { + "epoch": 0.2168427862842355, + "grad_norm": 1.215417742729187, + "learning_rate": 9.931021904187603e-06, + "loss": 0.8067, + "step": 4022 + }, + { + "epoch": 0.216896700452879, + "grad_norm": 0.8828169107437134, + "learning_rate": 9.930986805896095e-06, + "loss": 0.6962, + "step": 4023 + }, + { + 
"epoch": 0.21695061462152254, + "grad_norm": 0.8225864171981812, + "learning_rate": 9.930951698739338e-06, + "loss": 0.7497, + "step": 4024 + }, + { + "epoch": 0.21700452879016605, + "grad_norm": 0.825343906879425, + "learning_rate": 9.930916582717396e-06, + "loss": 0.8693, + "step": 4025 + }, + { + "epoch": 0.21705844295880958, + "grad_norm": 0.7945353984832764, + "learning_rate": 9.93088145783033e-06, + "loss": 0.8349, + "step": 4026 + }, + { + "epoch": 0.21711235712745308, + "grad_norm": 0.7948806285858154, + "learning_rate": 9.930846324078205e-06, + "loss": 0.7726, + "step": 4027 + }, + { + "epoch": 0.2171662712960966, + "grad_norm": 0.7694181203842163, + "learning_rate": 9.930811181461081e-06, + "loss": 0.7704, + "step": 4028 + }, + { + "epoch": 0.21722018546474015, + "grad_norm": 0.74179607629776, + "learning_rate": 9.930776029979026e-06, + "loss": 0.8566, + "step": 4029 + }, + { + "epoch": 0.21727409963338365, + "grad_norm": 0.7846640348434448, + "learning_rate": 9.9307408696321e-06, + "loss": 0.7063, + "step": 4030 + }, + { + "epoch": 0.21732801380202718, + "grad_norm": 0.865972638130188, + "learning_rate": 9.930705700420368e-06, + "loss": 0.7553, + "step": 4031 + }, + { + "epoch": 0.21738192797067069, + "grad_norm": 0.90953129529953, + "learning_rate": 9.930670522343891e-06, + "loss": 0.7857, + "step": 4032 + }, + { + "epoch": 0.21743584213931422, + "grad_norm": 0.743373692035675, + "learning_rate": 9.930635335402733e-06, + "loss": 0.6955, + "step": 4033 + }, + { + "epoch": 0.21748975630795772, + "grad_norm": 0.994404137134552, + "learning_rate": 9.930600139596958e-06, + "loss": 0.7886, + "step": 4034 + }, + { + "epoch": 0.21754367047660125, + "grad_norm": 0.7715345621109009, + "learning_rate": 9.93056493492663e-06, + "loss": 0.8261, + "step": 4035 + }, + { + "epoch": 0.21759758464524476, + "grad_norm": 0.8100937604904175, + "learning_rate": 9.93052972139181e-06, + "loss": 0.7828, + "step": 4036 + }, + { + "epoch": 0.2176514988138883, + "grad_norm": 
1.0633374452590942, + "learning_rate": 9.930494498992562e-06, + "loss": 0.7885, + "step": 4037 + }, + { + "epoch": 0.21770541298253182, + "grad_norm": 0.766617476940155, + "learning_rate": 9.930459267728951e-06, + "loss": 0.8267, + "step": 4038 + }, + { + "epoch": 0.21775932715117532, + "grad_norm": 0.7761416435241699, + "learning_rate": 9.93042402760104e-06, + "loss": 0.8079, + "step": 4039 + }, + { + "epoch": 0.21781324131981886, + "grad_norm": 0.8123136758804321, + "learning_rate": 9.93038877860889e-06, + "loss": 0.8228, + "step": 4040 + }, + { + "epoch": 0.21786715548846236, + "grad_norm": 0.8818230628967285, + "learning_rate": 9.930353520752567e-06, + "loss": 0.8171, + "step": 4041 + }, + { + "epoch": 0.2179210696571059, + "grad_norm": 1.0989209413528442, + "learning_rate": 9.930318254032131e-06, + "loss": 0.9083, + "step": 4042 + }, + { + "epoch": 0.2179749838257494, + "grad_norm": 0.8373724818229675, + "learning_rate": 9.930282978447649e-06, + "loss": 0.7842, + "step": 4043 + }, + { + "epoch": 0.21802889799439293, + "grad_norm": 0.7905243039131165, + "learning_rate": 9.930247693999185e-06, + "loss": 0.7842, + "step": 4044 + }, + { + "epoch": 0.21808281216303643, + "grad_norm": 0.8310670852661133, + "learning_rate": 9.9302124006868e-06, + "loss": 0.735, + "step": 4045 + }, + { + "epoch": 0.21813672633167996, + "grad_norm": 0.8986020684242249, + "learning_rate": 9.930177098510556e-06, + "loss": 0.9901, + "step": 4046 + }, + { + "epoch": 0.2181906405003235, + "grad_norm": 0.7886272668838501, + "learning_rate": 9.93014178747052e-06, + "loss": 0.8593, + "step": 4047 + }, + { + "epoch": 0.218244554668967, + "grad_norm": 0.8021159768104553, + "learning_rate": 9.930106467566754e-06, + "loss": 0.7114, + "step": 4048 + }, + { + "epoch": 0.21829846883761053, + "grad_norm": 0.7256723642349243, + "learning_rate": 9.930071138799322e-06, + "loss": 0.7537, + "step": 4049 + }, + { + "epoch": 0.21835238300625404, + "grad_norm": 0.8547120094299316, + "learning_rate": 
9.930035801168286e-06, + "loss": 0.771, + "step": 4050 + }, + { + "epoch": 0.21840629717489757, + "grad_norm": 0.7411953210830688, + "learning_rate": 9.930000454673711e-06, + "loss": 0.7756, + "step": 4051 + }, + { + "epoch": 0.21846021134354107, + "grad_norm": 0.8918336033821106, + "learning_rate": 9.929965099315659e-06, + "loss": 0.8297, + "step": 4052 + }, + { + "epoch": 0.2185141255121846, + "grad_norm": 0.7391760349273682, + "learning_rate": 9.929929735094196e-06, + "loss": 0.8109, + "step": 4053 + }, + { + "epoch": 0.2185680396808281, + "grad_norm": 0.7272089719772339, + "learning_rate": 9.929894362009384e-06, + "loss": 0.7727, + "step": 4054 + }, + { + "epoch": 0.21862195384947164, + "grad_norm": 0.6963438391685486, + "learning_rate": 9.929858980061287e-06, + "loss": 0.681, + "step": 4055 + }, + { + "epoch": 0.21867586801811517, + "grad_norm": 0.714117169380188, + "learning_rate": 9.929823589249968e-06, + "loss": 0.6973, + "step": 4056 + }, + { + "epoch": 0.21872978218675868, + "grad_norm": 0.8449671268463135, + "learning_rate": 9.92978818957549e-06, + "loss": 0.8321, + "step": 4057 + }, + { + "epoch": 0.2187836963554022, + "grad_norm": 0.8275889754295349, + "learning_rate": 9.92975278103792e-06, + "loss": 0.7589, + "step": 4058 + }, + { + "epoch": 0.2188376105240457, + "grad_norm": 0.8010358214378357, + "learning_rate": 9.929717363637318e-06, + "loss": 0.7673, + "step": 4059 + }, + { + "epoch": 0.21889152469268924, + "grad_norm": 0.8558088541030884, + "learning_rate": 9.92968193737375e-06, + "loss": 0.8374, + "step": 4060 + }, + { + "epoch": 0.21894543886133275, + "grad_norm": 0.8413086533546448, + "learning_rate": 9.929646502247278e-06, + "loss": 0.8522, + "step": 4061 + }, + { + "epoch": 0.21899935302997628, + "grad_norm": 0.7852063775062561, + "learning_rate": 9.929611058257966e-06, + "loss": 0.7475, + "step": 4062 + }, + { + "epoch": 0.2190532671986198, + "grad_norm": 0.752642810344696, + "learning_rate": 9.92957560540588e-06, + "loss": 0.7054, + 
"step": 4063 + }, + { + "epoch": 0.21910718136726332, + "grad_norm": 0.8099555969238281, + "learning_rate": 9.929540143691079e-06, + "loss": 0.8409, + "step": 4064 + }, + { + "epoch": 0.21916109553590685, + "grad_norm": 0.7962636947631836, + "learning_rate": 9.929504673113632e-06, + "loss": 0.8581, + "step": 4065 + }, + { + "epoch": 0.21921500970455035, + "grad_norm": 0.7996272444725037, + "learning_rate": 9.9294691936736e-06, + "loss": 0.7837, + "step": 4066 + }, + { + "epoch": 0.21926892387319388, + "grad_norm": 0.7685336470603943, + "learning_rate": 9.929433705371046e-06, + "loss": 0.7658, + "step": 4067 + }, + { + "epoch": 0.2193228380418374, + "grad_norm": 0.8068851232528687, + "learning_rate": 9.929398208206036e-06, + "loss": 0.8141, + "step": 4068 + }, + { + "epoch": 0.21937675221048092, + "grad_norm": 0.7585315108299255, + "learning_rate": 9.929362702178634e-06, + "loss": 0.7533, + "step": 4069 + }, + { + "epoch": 0.21943066637912442, + "grad_norm": 1.1367120742797852, + "learning_rate": 9.9293271872889e-06, + "loss": 0.925, + "step": 4070 + }, + { + "epoch": 0.21948458054776795, + "grad_norm": 0.8255071640014648, + "learning_rate": 9.929291663536902e-06, + "loss": 0.6905, + "step": 4071 + }, + { + "epoch": 0.2195384947164115, + "grad_norm": 0.805061936378479, + "learning_rate": 9.929256130922702e-06, + "loss": 0.7787, + "step": 4072 + }, + { + "epoch": 0.219592408885055, + "grad_norm": 0.7786453366279602, + "learning_rate": 9.929220589446365e-06, + "loss": 0.8182, + "step": 4073 + }, + { + "epoch": 0.21964632305369852, + "grad_norm": 0.925881028175354, + "learning_rate": 9.929185039107955e-06, + "loss": 0.8611, + "step": 4074 + }, + { + "epoch": 0.21970023722234203, + "grad_norm": 0.7396146059036255, + "learning_rate": 9.929149479907533e-06, + "loss": 0.8427, + "step": 4075 + }, + { + "epoch": 0.21975415139098556, + "grad_norm": 0.8113187551498413, + "learning_rate": 9.929113911845167e-06, + "loss": 0.7436, + "step": 4076 + }, + { + "epoch": 
0.21980806555962906, + "grad_norm": 0.8359308838844299, + "learning_rate": 9.929078334920918e-06, + "loss": 0.7606, + "step": 4077 + }, + { + "epoch": 0.2198619797282726, + "grad_norm": 0.9729122519493103, + "learning_rate": 9.92904274913485e-06, + "loss": 0.774, + "step": 4078 + }, + { + "epoch": 0.2199158938969161, + "grad_norm": 0.7794427871704102, + "learning_rate": 9.92900715448703e-06, + "loss": 0.8311, + "step": 4079 + }, + { + "epoch": 0.21996980806555963, + "grad_norm": 0.8245888352394104, + "learning_rate": 9.928971550977519e-06, + "loss": 0.8461, + "step": 4080 + }, + { + "epoch": 0.22002372223420316, + "grad_norm": 0.7551932334899902, + "learning_rate": 9.92893593860638e-06, + "loss": 0.7899, + "step": 4081 + }, + { + "epoch": 0.22007763640284667, + "grad_norm": 0.7409234642982483, + "learning_rate": 9.928900317373681e-06, + "loss": 0.7847, + "step": 4082 + }, + { + "epoch": 0.2201315505714902, + "grad_norm": 1.5267807245254517, + "learning_rate": 9.928864687279485e-06, + "loss": 1.0547, + "step": 4083 + }, + { + "epoch": 0.2201854647401337, + "grad_norm": 0.832936704158783, + "learning_rate": 9.928829048323853e-06, + "loss": 0.8919, + "step": 4084 + }, + { + "epoch": 0.22023937890877723, + "grad_norm": 0.7933560609817505, + "learning_rate": 9.928793400506852e-06, + "loss": 0.833, + "step": 4085 + }, + { + "epoch": 0.22029329307742074, + "grad_norm": 0.7095281481742859, + "learning_rate": 9.928757743828545e-06, + "loss": 0.7383, + "step": 4086 + }, + { + "epoch": 0.22034720724606427, + "grad_norm": 0.7681827545166016, + "learning_rate": 9.928722078288998e-06, + "loss": 0.7573, + "step": 4087 + }, + { + "epoch": 0.22040112141470777, + "grad_norm": 0.7923296689987183, + "learning_rate": 9.928686403888271e-06, + "loss": 0.7574, + "step": 4088 + }, + { + "epoch": 0.2204550355833513, + "grad_norm": 0.7329868674278259, + "learning_rate": 9.928650720626431e-06, + "loss": 0.7798, + "step": 4089 + }, + { + "epoch": 0.22050894975199484, + "grad_norm": 
0.6931655406951904, + "learning_rate": 9.928615028503542e-06, + "loss": 0.687, + "step": 4090 + }, + { + "epoch": 0.22056286392063834, + "grad_norm": 0.8253043293952942, + "learning_rate": 9.928579327519668e-06, + "loss": 0.7611, + "step": 4091 + }, + { + "epoch": 0.22061677808928187, + "grad_norm": 0.9808893799781799, + "learning_rate": 9.928543617674873e-06, + "loss": 0.8013, + "step": 4092 + }, + { + "epoch": 0.22067069225792538, + "grad_norm": 0.765825092792511, + "learning_rate": 9.928507898969222e-06, + "loss": 0.704, + "step": 4093 + }, + { + "epoch": 0.2207246064265689, + "grad_norm": 0.8836820721626282, + "learning_rate": 9.928472171402777e-06, + "loss": 0.7862, + "step": 4094 + }, + { + "epoch": 0.2207785205952124, + "grad_norm": 0.7684285640716553, + "learning_rate": 9.928436434975606e-06, + "loss": 0.6694, + "step": 4095 + }, + { + "epoch": 0.22083243476385594, + "grad_norm": 0.8041714429855347, + "learning_rate": 9.92840068968777e-06, + "loss": 0.7593, + "step": 4096 + }, + { + "epoch": 0.22088634893249945, + "grad_norm": 0.8422744274139404, + "learning_rate": 9.928364935539331e-06, + "loss": 0.7447, + "step": 4097 + }, + { + "epoch": 0.22094026310114298, + "grad_norm": 0.8337421417236328, + "learning_rate": 9.928329172530361e-06, + "loss": 0.8273, + "step": 4098 + }, + { + "epoch": 0.2209941772697865, + "grad_norm": 0.9864090085029602, + "learning_rate": 9.928293400660918e-06, + "loss": 0.9286, + "step": 4099 + }, + { + "epoch": 0.22104809143843002, + "grad_norm": 0.8052615523338318, + "learning_rate": 9.928257619931068e-06, + "loss": 0.78, + "step": 4100 + }, + { + "epoch": 0.22110200560707355, + "grad_norm": 0.8060072064399719, + "learning_rate": 9.928221830340876e-06, + "loss": 0.7759, + "step": 4101 + }, + { + "epoch": 0.22115591977571705, + "grad_norm": 0.8900836706161499, + "learning_rate": 9.928186031890405e-06, + "loss": 0.8144, + "step": 4102 + }, + { + "epoch": 0.22120983394436058, + "grad_norm": 0.7392085194587708, + "learning_rate": 
9.928150224579723e-06, + "loss": 0.7787, + "step": 4103 + }, + { + "epoch": 0.2212637481130041, + "grad_norm": 0.9728571772575378, + "learning_rate": 9.92811440840889e-06, + "loss": 0.8359, + "step": 4104 + }, + { + "epoch": 0.22131766228164762, + "grad_norm": 0.9601667523384094, + "learning_rate": 9.92807858337797e-06, + "loss": 0.7868, + "step": 4105 + }, + { + "epoch": 0.22137157645029112, + "grad_norm": 0.7148939371109009, + "learning_rate": 9.92804274948703e-06, + "loss": 0.7266, + "step": 4106 + }, + { + "epoch": 0.22142549061893466, + "grad_norm": 0.7482119798660278, + "learning_rate": 9.928006906736136e-06, + "loss": 0.7602, + "step": 4107 + }, + { + "epoch": 0.2214794047875782, + "grad_norm": 0.8613291382789612, + "learning_rate": 9.927971055125348e-06, + "loss": 0.7747, + "step": 4108 + }, + { + "epoch": 0.2215333189562217, + "grad_norm": 0.7668588757514954, + "learning_rate": 9.927935194654733e-06, + "loss": 0.7572, + "step": 4109 + }, + { + "epoch": 0.22158723312486522, + "grad_norm": 0.7911893725395203, + "learning_rate": 9.927899325324356e-06, + "loss": 0.8563, + "step": 4110 + }, + { + "epoch": 0.22164114729350873, + "grad_norm": 0.8059565424919128, + "learning_rate": 9.92786344713428e-06, + "loss": 0.893, + "step": 4111 + }, + { + "epoch": 0.22169506146215226, + "grad_norm": 0.8575117588043213, + "learning_rate": 9.92782756008457e-06, + "loss": 0.8475, + "step": 4112 + }, + { + "epoch": 0.22174897563079576, + "grad_norm": 0.7179403901100159, + "learning_rate": 9.927791664175292e-06, + "loss": 0.7914, + "step": 4113 + }, + { + "epoch": 0.2218028897994393, + "grad_norm": 0.8687799572944641, + "learning_rate": 9.927755759406508e-06, + "loss": 0.7447, + "step": 4114 + }, + { + "epoch": 0.2218568039680828, + "grad_norm": 0.7538093328475952, + "learning_rate": 9.927719845778283e-06, + "loss": 0.6988, + "step": 4115 + }, + { + "epoch": 0.22191071813672633, + "grad_norm": 0.7586212754249573, + "learning_rate": 9.927683923290685e-06, + "loss": 0.7743, + 
"step": 4116 + }, + { + "epoch": 0.22196463230536986, + "grad_norm": 0.797385573387146, + "learning_rate": 9.927647991943774e-06, + "loss": 0.7541, + "step": 4117 + }, + { + "epoch": 0.22201854647401337, + "grad_norm": 0.7193878293037415, + "learning_rate": 9.927612051737617e-06, + "loss": 0.758, + "step": 4118 + }, + { + "epoch": 0.2220724606426569, + "grad_norm": 0.7417513132095337, + "learning_rate": 9.927576102672276e-06, + "loss": 0.7902, + "step": 4119 + }, + { + "epoch": 0.2221263748113004, + "grad_norm": 0.8947266936302185, + "learning_rate": 9.927540144747821e-06, + "loss": 0.9153, + "step": 4120 + }, + { + "epoch": 0.22218028897994394, + "grad_norm": 0.7990988492965698, + "learning_rate": 9.927504177964311e-06, + "loss": 0.8487, + "step": 4121 + }, + { + "epoch": 0.22223420314858744, + "grad_norm": 0.801420271396637, + "learning_rate": 9.927468202321816e-06, + "loss": 0.7752, + "step": 4122 + }, + { + "epoch": 0.22228811731723097, + "grad_norm": 0.7953904271125793, + "learning_rate": 9.927432217820394e-06, + "loss": 0.7249, + "step": 4123 + }, + { + "epoch": 0.22234203148587448, + "grad_norm": 0.8257938027381897, + "learning_rate": 9.927396224460116e-06, + "loss": 0.8311, + "step": 4124 + }, + { + "epoch": 0.222395945654518, + "grad_norm": 0.7679301500320435, + "learning_rate": 9.927360222241042e-06, + "loss": 0.7155, + "step": 4125 + }, + { + "epoch": 0.22244985982316154, + "grad_norm": 0.7410153150558472, + "learning_rate": 9.92732421116324e-06, + "loss": 0.7007, + "step": 4126 + }, + { + "epoch": 0.22250377399180504, + "grad_norm": 0.8296052813529968, + "learning_rate": 9.927288191226774e-06, + "loss": 0.7546, + "step": 4127 + }, + { + "epoch": 0.22255768816044857, + "grad_norm": 1.051527500152588, + "learning_rate": 9.927252162431708e-06, + "loss": 0.7039, + "step": 4128 + }, + { + "epoch": 0.22261160232909208, + "grad_norm": 0.8625979423522949, + "learning_rate": 9.927216124778108e-06, + "loss": 0.7348, + "step": 4129 + }, + { + "epoch": 
0.2226655164977356, + "grad_norm": 0.8892311453819275, + "learning_rate": 9.927180078266038e-06, + "loss": 0.8221, + "step": 4130 + }, + { + "epoch": 0.22271943066637911, + "grad_norm": 0.8888135552406311, + "learning_rate": 9.927144022895562e-06, + "loss": 0.8953, + "step": 4131 + }, + { + "epoch": 0.22277334483502265, + "grad_norm": 0.8566902279853821, + "learning_rate": 9.927107958666746e-06, + "loss": 0.7894, + "step": 4132 + }, + { + "epoch": 0.22282725900366615, + "grad_norm": 0.821061909198761, + "learning_rate": 9.927071885579654e-06, + "loss": 0.8271, + "step": 4133 + }, + { + "epoch": 0.22288117317230968, + "grad_norm": 1.0494943857192993, + "learning_rate": 9.927035803634351e-06, + "loss": 0.79, + "step": 4134 + }, + { + "epoch": 0.22293508734095321, + "grad_norm": 1.730763554573059, + "learning_rate": 9.926999712830903e-06, + "loss": 0.7944, + "step": 4135 + }, + { + "epoch": 0.22298900150959672, + "grad_norm": 0.799264669418335, + "learning_rate": 9.926963613169372e-06, + "loss": 0.7922, + "step": 4136 + }, + { + "epoch": 0.22304291567824025, + "grad_norm": 0.7929497361183167, + "learning_rate": 9.926927504649826e-06, + "loss": 0.8809, + "step": 4137 + }, + { + "epoch": 0.22309682984688375, + "grad_norm": 0.8016352653503418, + "learning_rate": 9.92689138727233e-06, + "loss": 0.8839, + "step": 4138 + }, + { + "epoch": 0.22315074401552729, + "grad_norm": 0.7640015482902527, + "learning_rate": 9.926855261036947e-06, + "loss": 0.7351, + "step": 4139 + }, + { + "epoch": 0.2232046581841708, + "grad_norm": 0.7678577899932861, + "learning_rate": 9.926819125943743e-06, + "loss": 0.7249, + "step": 4140 + }, + { + "epoch": 0.22325857235281432, + "grad_norm": 0.9195266962051392, + "learning_rate": 9.926782981992782e-06, + "loss": 0.7459, + "step": 4141 + }, + { + "epoch": 0.22331248652145783, + "grad_norm": 0.9069259762763977, + "learning_rate": 9.92674682918413e-06, + "loss": 0.8569, + "step": 4142 + }, + { + "epoch": 0.22336640069010136, + "grad_norm": 
0.8251914978027344, + "learning_rate": 9.926710667517853e-06, + "loss": 0.7659, + "step": 4143 + }, + { + "epoch": 0.2234203148587449, + "grad_norm": 0.7647615671157837, + "learning_rate": 9.926674496994013e-06, + "loss": 0.7847, + "step": 4144 + }, + { + "epoch": 0.2234742290273884, + "grad_norm": 0.7971541285514832, + "learning_rate": 9.926638317612678e-06, + "loss": 0.7033, + "step": 4145 + }, + { + "epoch": 0.22352814319603193, + "grad_norm": 0.8472650051116943, + "learning_rate": 9.92660212937391e-06, + "loss": 0.7953, + "step": 4146 + }, + { + "epoch": 0.22358205736467543, + "grad_norm": 0.7527226805686951, + "learning_rate": 9.926565932277776e-06, + "loss": 0.7402, + "step": 4147 + }, + { + "epoch": 0.22363597153331896, + "grad_norm": 0.8266519904136658, + "learning_rate": 9.926529726324344e-06, + "loss": 0.8852, + "step": 4148 + }, + { + "epoch": 0.22368988570196247, + "grad_norm": 0.8195723295211792, + "learning_rate": 9.926493511513673e-06, + "loss": 0.8529, + "step": 4149 + }, + { + "epoch": 0.223743799870606, + "grad_norm": 0.821739912033081, + "learning_rate": 9.92645728784583e-06, + "loss": 0.8809, + "step": 4150 + }, + { + "epoch": 0.2237977140392495, + "grad_norm": 0.8063598275184631, + "learning_rate": 9.926421055320883e-06, + "loss": 0.8219, + "step": 4151 + }, + { + "epoch": 0.22385162820789303, + "grad_norm": 0.7054430246353149, + "learning_rate": 9.926384813938896e-06, + "loss": 0.6726, + "step": 4152 + }, + { + "epoch": 0.22390554237653656, + "grad_norm": 0.8751134872436523, + "learning_rate": 9.926348563699933e-06, + "loss": 0.8059, + "step": 4153 + }, + { + "epoch": 0.22395945654518007, + "grad_norm": 0.8193408846855164, + "learning_rate": 9.92631230460406e-06, + "loss": 0.7078, + "step": 4154 + }, + { + "epoch": 0.2240133707138236, + "grad_norm": 0.7827375531196594, + "learning_rate": 9.92627603665134e-06, + "loss": 0.7604, + "step": 4155 + }, + { + "epoch": 0.2240672848824671, + "grad_norm": 0.7906658053398132, + "learning_rate": 
9.926239759841842e-06, + "loss": 0.7428, + "step": 4156 + }, + { + "epoch": 0.22412119905111064, + "grad_norm": 0.8965858817100525, + "learning_rate": 9.92620347417563e-06, + "loss": 0.805, + "step": 4157 + }, + { + "epoch": 0.22417511321975414, + "grad_norm": 0.7383534908294678, + "learning_rate": 9.926167179652767e-06, + "loss": 0.8041, + "step": 4158 + }, + { + "epoch": 0.22422902738839767, + "grad_norm": 0.7922899127006531, + "learning_rate": 9.926130876273321e-06, + "loss": 0.8966, + "step": 4159 + }, + { + "epoch": 0.22428294155704118, + "grad_norm": 0.7780346870422363, + "learning_rate": 9.926094564037354e-06, + "loss": 0.787, + "step": 4160 + }, + { + "epoch": 0.2243368557256847, + "grad_norm": 0.8276410102844238, + "learning_rate": 9.926058242944936e-06, + "loss": 0.7222, + "step": 4161 + }, + { + "epoch": 0.22439076989432824, + "grad_norm": 0.8523558378219604, + "learning_rate": 9.926021912996128e-06, + "loss": 0.7784, + "step": 4162 + }, + { + "epoch": 0.22444468406297174, + "grad_norm": 1.0391061305999756, + "learning_rate": 9.925985574190997e-06, + "loss": 0.8078, + "step": 4163 + }, + { + "epoch": 0.22449859823161528, + "grad_norm": 1.09534752368927, + "learning_rate": 9.925949226529609e-06, + "loss": 0.8317, + "step": 4164 + }, + { + "epoch": 0.22455251240025878, + "grad_norm": 1.0554418563842773, + "learning_rate": 9.925912870012028e-06, + "loss": 0.8352, + "step": 4165 + }, + { + "epoch": 0.2246064265689023, + "grad_norm": 0.889376699924469, + "learning_rate": 9.92587650463832e-06, + "loss": 0.7787, + "step": 4166 + }, + { + "epoch": 0.22466034073754582, + "grad_norm": 0.8486199378967285, + "learning_rate": 9.92584013040855e-06, + "loss": 0.8005, + "step": 4167 + }, + { + "epoch": 0.22471425490618935, + "grad_norm": 0.7989416718482971, + "learning_rate": 9.925803747322786e-06, + "loss": 0.7258, + "step": 4168 + }, + { + "epoch": 0.22476816907483288, + "grad_norm": 0.8066874146461487, + "learning_rate": 9.925767355381088e-06, + "loss": 0.7334, + 
"step": 4169 + }, + { + "epoch": 0.22482208324347638, + "grad_norm": 0.7679908871650696, + "learning_rate": 9.925730954583529e-06, + "loss": 0.8172, + "step": 4170 + }, + { + "epoch": 0.22487599741211992, + "grad_norm": 0.8524256944656372, + "learning_rate": 9.925694544930165e-06, + "loss": 0.788, + "step": 4171 + }, + { + "epoch": 0.22492991158076342, + "grad_norm": 0.7501714825630188, + "learning_rate": 9.925658126421069e-06, + "loss": 0.7749, + "step": 4172 + }, + { + "epoch": 0.22498382574940695, + "grad_norm": 0.7706030607223511, + "learning_rate": 9.925621699056304e-06, + "loss": 0.7231, + "step": 4173 + }, + { + "epoch": 0.22503773991805046, + "grad_norm": 0.8854154348373413, + "learning_rate": 9.925585262835936e-06, + "loss": 0.8278, + "step": 4174 + }, + { + "epoch": 0.225091654086694, + "grad_norm": 0.7319517731666565, + "learning_rate": 9.925548817760029e-06, + "loss": 0.6935, + "step": 4175 + }, + { + "epoch": 0.2251455682553375, + "grad_norm": 0.7906307578086853, + "learning_rate": 9.925512363828652e-06, + "loss": 0.6917, + "step": 4176 + }, + { + "epoch": 0.22519948242398102, + "grad_norm": 0.7849681377410889, + "learning_rate": 9.925475901041865e-06, + "loss": 0.7164, + "step": 4177 + }, + { + "epoch": 0.22525339659262456, + "grad_norm": 0.7835176587104797, + "learning_rate": 9.925439429399737e-06, + "loss": 0.8398, + "step": 4178 + }, + { + "epoch": 0.22530731076126806, + "grad_norm": 0.7237651944160461, + "learning_rate": 9.925402948902334e-06, + "loss": 0.7466, + "step": 4179 + }, + { + "epoch": 0.2253612249299116, + "grad_norm": 0.7823938131332397, + "learning_rate": 9.925366459549721e-06, + "loss": 0.7348, + "step": 4180 + }, + { + "epoch": 0.2254151390985551, + "grad_norm": 0.8057203888893127, + "learning_rate": 9.925329961341964e-06, + "loss": 0.6959, + "step": 4181 + }, + { + "epoch": 0.22546905326719863, + "grad_norm": 0.7731473445892334, + "learning_rate": 9.925293454279125e-06, + "loss": 0.817, + "step": 4182 + }, + { + "epoch": 
0.22552296743584213, + "grad_norm": 0.7807347178459167, + "learning_rate": 9.925256938361276e-06, + "loss": 0.8092, + "step": 4183 + }, + { + "epoch": 0.22557688160448566, + "grad_norm": 0.9550508260726929, + "learning_rate": 9.925220413588478e-06, + "loss": 0.777, + "step": 4184 + }, + { + "epoch": 0.22563079577312917, + "grad_norm": 0.7147027254104614, + "learning_rate": 9.925183879960799e-06, + "loss": 0.7964, + "step": 4185 + }, + { + "epoch": 0.2256847099417727, + "grad_norm": 0.8344054222106934, + "learning_rate": 9.925147337478302e-06, + "loss": 0.8445, + "step": 4186 + }, + { + "epoch": 0.22573862411041623, + "grad_norm": 0.7597602605819702, + "learning_rate": 9.925110786141055e-06, + "loss": 0.7832, + "step": 4187 + }, + { + "epoch": 0.22579253827905973, + "grad_norm": 0.7721429467201233, + "learning_rate": 9.925074225949123e-06, + "loss": 0.7126, + "step": 4188 + }, + { + "epoch": 0.22584645244770327, + "grad_norm": 1.0660802125930786, + "learning_rate": 9.925037656902572e-06, + "loss": 0.7464, + "step": 4189 + }, + { + "epoch": 0.22590036661634677, + "grad_norm": 1.1455479860305786, + "learning_rate": 9.925001079001465e-06, + "loss": 0.7962, + "step": 4190 + }, + { + "epoch": 0.2259542807849903, + "grad_norm": 0.7436321377754211, + "learning_rate": 9.924964492245874e-06, + "loss": 0.7943, + "step": 4191 + }, + { + "epoch": 0.2260081949536338, + "grad_norm": 0.8470258712768555, + "learning_rate": 9.92492789663586e-06, + "loss": 0.8384, + "step": 4192 + }, + { + "epoch": 0.22606210912227734, + "grad_norm": 0.7316015958786011, + "learning_rate": 9.92489129217149e-06, + "loss": 0.6658, + "step": 4193 + }, + { + "epoch": 0.22611602329092084, + "grad_norm": 0.8184043765068054, + "learning_rate": 9.924854678852829e-06, + "loss": 0.77, + "step": 4194 + }, + { + "epoch": 0.22616993745956437, + "grad_norm": 0.8100526928901672, + "learning_rate": 9.924818056679943e-06, + "loss": 0.7668, + "step": 4195 + }, + { + "epoch": 0.2262238516282079, + "grad_norm": 
0.7480085492134094, + "learning_rate": 9.924781425652899e-06, + "loss": 0.7623, + "step": 4196 + }, + { + "epoch": 0.2262777657968514, + "grad_norm": 0.8250038623809814, + "learning_rate": 9.924744785771762e-06, + "loss": 0.8567, + "step": 4197 + }, + { + "epoch": 0.22633167996549494, + "grad_norm": 0.7686489224433899, + "learning_rate": 9.924708137036599e-06, + "loss": 0.7706, + "step": 4198 + }, + { + "epoch": 0.22638559413413845, + "grad_norm": 0.735899806022644, + "learning_rate": 9.924671479447474e-06, + "loss": 0.7753, + "step": 4199 + }, + { + "epoch": 0.22643950830278198, + "grad_norm": 0.9740009307861328, + "learning_rate": 9.924634813004455e-06, + "loss": 0.7569, + "step": 4200 + }, + { + "epoch": 0.22649342247142548, + "grad_norm": 1.0002168416976929, + "learning_rate": 9.924598137707606e-06, + "loss": 0.8258, + "step": 4201 + }, + { + "epoch": 0.226547336640069, + "grad_norm": 0.6893144845962524, + "learning_rate": 9.924561453556993e-06, + "loss": 0.7139, + "step": 4202 + }, + { + "epoch": 0.22660125080871252, + "grad_norm": 0.8272411227226257, + "learning_rate": 9.924524760552684e-06, + "loss": 0.8422, + "step": 4203 + }, + { + "epoch": 0.22665516497735605, + "grad_norm": 0.7915756702423096, + "learning_rate": 9.924488058694743e-06, + "loss": 0.797, + "step": 4204 + }, + { + "epoch": 0.22670907914599958, + "grad_norm": 0.9074721932411194, + "learning_rate": 9.924451347983238e-06, + "loss": 0.837, + "step": 4205 + }, + { + "epoch": 0.22676299331464309, + "grad_norm": 0.7446406483650208, + "learning_rate": 9.92441462841823e-06, + "loss": 0.7532, + "step": 4206 + }, + { + "epoch": 0.22681690748328662, + "grad_norm": 0.7998174428939819, + "learning_rate": 9.924377899999793e-06, + "loss": 0.7768, + "step": 4207 + }, + { + "epoch": 0.22687082165193012, + "grad_norm": 0.7808948755264282, + "learning_rate": 9.924341162727987e-06, + "loss": 0.8571, + "step": 4208 + }, + { + "epoch": 0.22692473582057365, + "grad_norm": 0.837177574634552, + "learning_rate": 
9.924304416602879e-06, + "loss": 0.7659, + "step": 4209 + }, + { + "epoch": 0.22697864998921716, + "grad_norm": 0.922913670539856, + "learning_rate": 9.924267661624536e-06, + "loss": 0.7124, + "step": 4210 + }, + { + "epoch": 0.2270325641578607, + "grad_norm": 0.7991519570350647, + "learning_rate": 9.924230897793024e-06, + "loss": 0.7212, + "step": 4211 + }, + { + "epoch": 0.2270864783265042, + "grad_norm": 0.7561559081077576, + "learning_rate": 9.924194125108409e-06, + "loss": 0.7116, + "step": 4212 + }, + { + "epoch": 0.22714039249514772, + "grad_norm": 0.8377161026000977, + "learning_rate": 9.924157343570758e-06, + "loss": 0.8286, + "step": 4213 + }, + { + "epoch": 0.22719430666379126, + "grad_norm": 0.7423402070999146, + "learning_rate": 9.924120553180135e-06, + "loss": 0.7146, + "step": 4214 + }, + { + "epoch": 0.22724822083243476, + "grad_norm": 0.867027223110199, + "learning_rate": 9.924083753936607e-06, + "loss": 0.9115, + "step": 4215 + }, + { + "epoch": 0.2273021350010783, + "grad_norm": 0.8492380976676941, + "learning_rate": 9.924046945840243e-06, + "loss": 0.8469, + "step": 4216 + }, + { + "epoch": 0.2273560491697218, + "grad_norm": 0.9068216681480408, + "learning_rate": 9.924010128891104e-06, + "loss": 0.8478, + "step": 4217 + }, + { + "epoch": 0.22740996333836533, + "grad_norm": 0.8054717779159546, + "learning_rate": 9.92397330308926e-06, + "loss": 0.8606, + "step": 4218 + }, + { + "epoch": 0.22746387750700883, + "grad_norm": 0.7788351774215698, + "learning_rate": 9.923936468434777e-06, + "loss": 0.7892, + "step": 4219 + }, + { + "epoch": 0.22751779167565236, + "grad_norm": 0.7584444284439087, + "learning_rate": 9.923899624927717e-06, + "loss": 0.7834, + "step": 4220 + }, + { + "epoch": 0.22757170584429587, + "grad_norm": 0.7948986291885376, + "learning_rate": 9.923862772568154e-06, + "loss": 0.8158, + "step": 4221 + }, + { + "epoch": 0.2276256200129394, + "grad_norm": 0.9347550868988037, + "learning_rate": 9.923825911356146e-06, + "loss": 0.8955, + 
"step": 4222 + }, + { + "epoch": 0.22767953418158293, + "grad_norm": 0.7694705724716187, + "learning_rate": 9.923789041291765e-06, + "loss": 0.7797, + "step": 4223 + }, + { + "epoch": 0.22773344835022644, + "grad_norm": 0.7127852439880371, + "learning_rate": 9.923752162375076e-06, + "loss": 0.7026, + "step": 4224 + }, + { + "epoch": 0.22778736251886997, + "grad_norm": 0.9811069369316101, + "learning_rate": 9.923715274606142e-06, + "loss": 0.7804, + "step": 4225 + }, + { + "epoch": 0.22784127668751347, + "grad_norm": 0.8820962309837341, + "learning_rate": 9.923678377985035e-06, + "loss": 0.7807, + "step": 4226 + }, + { + "epoch": 0.227895190856157, + "grad_norm": 0.9057408571243286, + "learning_rate": 9.923641472511819e-06, + "loss": 0.7855, + "step": 4227 + }, + { + "epoch": 0.2279491050248005, + "grad_norm": 0.8836835622787476, + "learning_rate": 9.923604558186557e-06, + "loss": 0.7507, + "step": 4228 + }, + { + "epoch": 0.22800301919344404, + "grad_norm": 0.7494282722473145, + "learning_rate": 9.923567635009319e-06, + "loss": 0.7939, + "step": 4229 + }, + { + "epoch": 0.22805693336208754, + "grad_norm": 0.9468182921409607, + "learning_rate": 9.92353070298017e-06, + "loss": 0.7739, + "step": 4230 + }, + { + "epoch": 0.22811084753073108, + "grad_norm": 0.8671477437019348, + "learning_rate": 9.923493762099177e-06, + "loss": 0.8455, + "step": 4231 + }, + { + "epoch": 0.2281647616993746, + "grad_norm": 0.7388983964920044, + "learning_rate": 9.923456812366405e-06, + "loss": 0.8193, + "step": 4232 + }, + { + "epoch": 0.2282186758680181, + "grad_norm": 0.8403687477111816, + "learning_rate": 9.923419853781924e-06, + "loss": 0.8591, + "step": 4233 + }, + { + "epoch": 0.22827259003666164, + "grad_norm": 0.8540427684783936, + "learning_rate": 9.923382886345797e-06, + "loss": 0.8384, + "step": 4234 + }, + { + "epoch": 0.22832650420530515, + "grad_norm": 0.8174583911895752, + "learning_rate": 9.923345910058092e-06, + "loss": 0.8088, + "step": 4235 + }, + { + "epoch": 
0.22838041837394868, + "grad_norm": 0.8237600326538086, + "learning_rate": 9.923308924918876e-06, + "loss": 0.7776, + "step": 4236 + }, + { + "epoch": 0.22843433254259218, + "grad_norm": 0.7644588947296143, + "learning_rate": 9.923271930928213e-06, + "loss": 0.7916, + "step": 4237 + }, + { + "epoch": 0.22848824671123572, + "grad_norm": 0.7141766548156738, + "learning_rate": 9.923234928086172e-06, + "loss": 0.7218, + "step": 4238 + }, + { + "epoch": 0.22854216087987922, + "grad_norm": 0.6722819805145264, + "learning_rate": 9.923197916392816e-06, + "loss": 0.7451, + "step": 4239 + }, + { + "epoch": 0.22859607504852275, + "grad_norm": 0.8109803199768066, + "learning_rate": 9.923160895848217e-06, + "loss": 0.7948, + "step": 4240 + }, + { + "epoch": 0.22864998921716628, + "grad_norm": 0.7268984913825989, + "learning_rate": 9.923123866452437e-06, + "loss": 0.7946, + "step": 4241 + }, + { + "epoch": 0.2287039033858098, + "grad_norm": 0.7497883439064026, + "learning_rate": 9.923086828205546e-06, + "loss": 0.7594, + "step": 4242 + }, + { + "epoch": 0.22875781755445332, + "grad_norm": 0.7800997495651245, + "learning_rate": 9.92304978110761e-06, + "loss": 0.7278, + "step": 4243 + }, + { + "epoch": 0.22881173172309682, + "grad_norm": 0.7802282571792603, + "learning_rate": 9.923012725158692e-06, + "loss": 0.8135, + "step": 4244 + }, + { + "epoch": 0.22886564589174035, + "grad_norm": 0.6718098521232605, + "learning_rate": 9.92297566035886e-06, + "loss": 0.7554, + "step": 4245 + }, + { + "epoch": 0.22891956006038386, + "grad_norm": 0.9285357594490051, + "learning_rate": 9.922938586708184e-06, + "loss": 0.8134, + "step": 4246 + }, + { + "epoch": 0.2289734742290274, + "grad_norm": 0.7069430947303772, + "learning_rate": 9.922901504206728e-06, + "loss": 0.7114, + "step": 4247 + }, + { + "epoch": 0.2290273883976709, + "grad_norm": 0.893153190612793, + "learning_rate": 9.922864412854558e-06, + "loss": 0.798, + "step": 4248 + }, + { + "epoch": 0.22908130256631443, + "grad_norm": 
0.9572556614875793, + "learning_rate": 9.922827312651744e-06, + "loss": 0.8467, + "step": 4249 + }, + { + "epoch": 0.22913521673495796, + "grad_norm": 0.8193963766098022, + "learning_rate": 9.922790203598349e-06, + "loss": 0.7466, + "step": 4250 + }, + { + "epoch": 0.22918913090360146, + "grad_norm": 0.8693044185638428, + "learning_rate": 9.922753085694441e-06, + "loss": 0.7253, + "step": 4251 + }, + { + "epoch": 0.229243045072245, + "grad_norm": 0.7820607423782349, + "learning_rate": 9.922715958940086e-06, + "loss": 0.8457, + "step": 4252 + }, + { + "epoch": 0.2292969592408885, + "grad_norm": 0.8323820233345032, + "learning_rate": 9.922678823335353e-06, + "loss": 0.8532, + "step": 4253 + }, + { + "epoch": 0.22935087340953203, + "grad_norm": 0.7978707551956177, + "learning_rate": 9.922641678880306e-06, + "loss": 0.7549, + "step": 4254 + }, + { + "epoch": 0.22940478757817553, + "grad_norm": 0.8820145726203918, + "learning_rate": 9.922604525575014e-06, + "loss": 0.746, + "step": 4255 + }, + { + "epoch": 0.22945870174681907, + "grad_norm": 0.7836315631866455, + "learning_rate": 9.922567363419544e-06, + "loss": 0.8095, + "step": 4256 + }, + { + "epoch": 0.22951261591546257, + "grad_norm": 0.7744200825691223, + "learning_rate": 9.922530192413962e-06, + "loss": 0.8599, + "step": 4257 + }, + { + "epoch": 0.2295665300841061, + "grad_norm": 0.861124575138092, + "learning_rate": 9.922493012558334e-06, + "loss": 0.8522, + "step": 4258 + }, + { + "epoch": 0.22962044425274963, + "grad_norm": 0.8234331607818604, + "learning_rate": 9.922455823852726e-06, + "loss": 0.8266, + "step": 4259 + }, + { + "epoch": 0.22967435842139314, + "grad_norm": 0.8142805099487305, + "learning_rate": 9.922418626297207e-06, + "loss": 0.7434, + "step": 4260 + }, + { + "epoch": 0.22972827259003667, + "grad_norm": 1.2082080841064453, + "learning_rate": 9.922381419891845e-06, + "loss": 0.7884, + "step": 4261 + }, + { + "epoch": 0.22978218675868017, + "grad_norm": 0.7151769399642944, + "learning_rate": 
9.922344204636702e-06, + "loss": 0.8028, + "step": 4262 + }, + { + "epoch": 0.2298361009273237, + "grad_norm": 0.9005017280578613, + "learning_rate": 9.922306980531851e-06, + "loss": 0.7569, + "step": 4263 + }, + { + "epoch": 0.2298900150959672, + "grad_norm": 0.8531069755554199, + "learning_rate": 9.922269747577354e-06, + "loss": 0.7255, + "step": 4264 + }, + { + "epoch": 0.22994392926461074, + "grad_norm": 0.7625791430473328, + "learning_rate": 9.922232505773279e-06, + "loss": 0.7316, + "step": 4265 + }, + { + "epoch": 0.22999784343325425, + "grad_norm": 0.8707940578460693, + "learning_rate": 9.922195255119696e-06, + "loss": 0.7281, + "step": 4266 + }, + { + "epoch": 0.23005175760189778, + "grad_norm": 0.8227086067199707, + "learning_rate": 9.922157995616669e-06, + "loss": 0.7815, + "step": 4267 + }, + { + "epoch": 0.2301056717705413, + "grad_norm": 0.7798532843589783, + "learning_rate": 9.922120727264266e-06, + "loss": 0.7347, + "step": 4268 + }, + { + "epoch": 0.2301595859391848, + "grad_norm": 0.9200069308280945, + "learning_rate": 9.922083450062554e-06, + "loss": 0.8466, + "step": 4269 + }, + { + "epoch": 0.23021350010782834, + "grad_norm": 0.7376945614814758, + "learning_rate": 9.922046164011598e-06, + "loss": 0.7834, + "step": 4270 + }, + { + "epoch": 0.23026741427647185, + "grad_norm": 0.7460160255432129, + "learning_rate": 9.922008869111469e-06, + "loss": 0.7622, + "step": 4271 + }, + { + "epoch": 0.23032132844511538, + "grad_norm": 1.0576467514038086, + "learning_rate": 9.921971565362232e-06, + "loss": 0.7648, + "step": 4272 + }, + { + "epoch": 0.23037524261375888, + "grad_norm": 0.8479774594306946, + "learning_rate": 9.921934252763953e-06, + "loss": 0.7834, + "step": 4273 + }, + { + "epoch": 0.23042915678240242, + "grad_norm": 0.7337886691093445, + "learning_rate": 9.9218969313167e-06, + "loss": 0.7125, + "step": 4274 + }, + { + "epoch": 0.23048307095104595, + "grad_norm": 0.7631418108940125, + "learning_rate": 9.92185960102054e-06, + "loss": 0.7741, + 
"step": 4275 + }, + { + "epoch": 0.23053698511968945, + "grad_norm": 1.0179954767227173, + "learning_rate": 9.92182226187554e-06, + "loss": 0.8724, + "step": 4276 + }, + { + "epoch": 0.23059089928833298, + "grad_norm": 0.768721342086792, + "learning_rate": 9.921784913881768e-06, + "loss": 0.8324, + "step": 4277 + }, + { + "epoch": 0.2306448134569765, + "grad_norm": 0.8202316761016846, + "learning_rate": 9.92174755703929e-06, + "loss": 0.8052, + "step": 4278 + }, + { + "epoch": 0.23069872762562002, + "grad_norm": 0.8934405446052551, + "learning_rate": 9.921710191348174e-06, + "loss": 0.8247, + "step": 4279 + }, + { + "epoch": 0.23075264179426352, + "grad_norm": 0.8000699281692505, + "learning_rate": 9.921672816808488e-06, + "loss": 0.728, + "step": 4280 + }, + { + "epoch": 0.23080655596290706, + "grad_norm": 0.76044100522995, + "learning_rate": 9.921635433420295e-06, + "loss": 0.7559, + "step": 4281 + }, + { + "epoch": 0.23086047013155056, + "grad_norm": 0.8910096883773804, + "learning_rate": 9.921598041183668e-06, + "loss": 0.7519, + "step": 4282 + }, + { + "epoch": 0.2309143843001941, + "grad_norm": 0.7211179733276367, + "learning_rate": 9.92156064009867e-06, + "loss": 0.7838, + "step": 4283 + }, + { + "epoch": 0.23096829846883762, + "grad_norm": 0.7589021921157837, + "learning_rate": 9.921523230165372e-06, + "loss": 0.8378, + "step": 4284 + }, + { + "epoch": 0.23102221263748113, + "grad_norm": 0.7287599444389343, + "learning_rate": 9.921485811383838e-06, + "loss": 0.7667, + "step": 4285 + }, + { + "epoch": 0.23107612680612466, + "grad_norm": 0.7445182204246521, + "learning_rate": 9.921448383754136e-06, + "loss": 0.7795, + "step": 4286 + }, + { + "epoch": 0.23113004097476816, + "grad_norm": 0.7145516872406006, + "learning_rate": 9.921410947276334e-06, + "loss": 0.7439, + "step": 4287 + }, + { + "epoch": 0.2311839551434117, + "grad_norm": 0.8579338192939758, + "learning_rate": 9.921373501950497e-06, + "loss": 0.7636, + "step": 4288 + }, + { + "epoch": 
0.2312378693120552, + "grad_norm": 0.7707585096359253, + "learning_rate": 9.921336047776695e-06, + "loss": 0.7422, + "step": 4289 + }, + { + "epoch": 0.23129178348069873, + "grad_norm": 1.0461829900741577, + "learning_rate": 9.921298584754994e-06, + "loss": 0.7768, + "step": 4290 + }, + { + "epoch": 0.23134569764934224, + "grad_norm": 0.9363743662834167, + "learning_rate": 9.921261112885464e-06, + "loss": 0.8283, + "step": 4291 + }, + { + "epoch": 0.23139961181798577, + "grad_norm": 0.7723295092582703, + "learning_rate": 9.921223632168168e-06, + "loss": 0.769, + "step": 4292 + }, + { + "epoch": 0.2314535259866293, + "grad_norm": 0.8078635931015015, + "learning_rate": 9.921186142603178e-06, + "loss": 0.7919, + "step": 4293 + }, + { + "epoch": 0.2315074401552728, + "grad_norm": 0.8620443940162659, + "learning_rate": 9.921148644190557e-06, + "loss": 0.805, + "step": 4294 + }, + { + "epoch": 0.23156135432391634, + "grad_norm": 0.8201389908790588, + "learning_rate": 9.921111136930376e-06, + "loss": 0.7982, + "step": 4295 + }, + { + "epoch": 0.23161526849255984, + "grad_norm": 0.7656800150871277, + "learning_rate": 9.9210736208227e-06, + "loss": 0.763, + "step": 4296 + }, + { + "epoch": 0.23166918266120337, + "grad_norm": 0.8814857602119446, + "learning_rate": 9.921036095867598e-06, + "loss": 0.8467, + "step": 4297 + }, + { + "epoch": 0.23172309682984688, + "grad_norm": 0.6766259074211121, + "learning_rate": 9.920998562065136e-06, + "loss": 0.6669, + "step": 4298 + }, + { + "epoch": 0.2317770109984904, + "grad_norm": 0.729774534702301, + "learning_rate": 9.920961019415383e-06, + "loss": 0.7208, + "step": 4299 + }, + { + "epoch": 0.2318309251671339, + "grad_norm": 0.8035505414009094, + "learning_rate": 9.920923467918405e-06, + "loss": 0.8282, + "step": 4300 + }, + { + "epoch": 0.23188483933577744, + "grad_norm": 0.7243404388427734, + "learning_rate": 9.920885907574269e-06, + "loss": 0.7861, + "step": 4301 + }, + { + "epoch": 0.23193875350442097, + "grad_norm": 
0.9405563473701477, + "learning_rate": 9.920848338383047e-06, + "loss": 0.7703, + "step": 4302 + }, + { + "epoch": 0.23199266767306448, + "grad_norm": 1.192933440208435, + "learning_rate": 9.920810760344801e-06, + "loss": 0.8294, + "step": 4303 + }, + { + "epoch": 0.232046581841708, + "grad_norm": 0.7398643493652344, + "learning_rate": 9.920773173459601e-06, + "loss": 0.7712, + "step": 4304 + }, + { + "epoch": 0.23210049601035151, + "grad_norm": 0.7436460852622986, + "learning_rate": 9.920735577727516e-06, + "loss": 0.7155, + "step": 4305 + }, + { + "epoch": 0.23215441017899505, + "grad_norm": 0.7705883383750916, + "learning_rate": 9.920697973148613e-06, + "loss": 0.6632, + "step": 4306 + }, + { + "epoch": 0.23220832434763855, + "grad_norm": 0.8805288076400757, + "learning_rate": 9.920660359722955e-06, + "loss": 0.7914, + "step": 4307 + }, + { + "epoch": 0.23226223851628208, + "grad_norm": 0.7252172231674194, + "learning_rate": 9.920622737450616e-06, + "loss": 0.688, + "step": 4308 + }, + { + "epoch": 0.2323161526849256, + "grad_norm": 0.8841788172721863, + "learning_rate": 9.92058510633166e-06, + "loss": 0.8714, + "step": 4309 + }, + { + "epoch": 0.23237006685356912, + "grad_norm": 0.9365109205245972, + "learning_rate": 9.920547466366156e-06, + "loss": 0.8198, + "step": 4310 + }, + { + "epoch": 0.23242398102221265, + "grad_norm": 0.7860931754112244, + "learning_rate": 9.920509817554172e-06, + "loss": 0.8343, + "step": 4311 + }, + { + "epoch": 0.23247789519085615, + "grad_norm": 0.7520400881767273, + "learning_rate": 9.920472159895773e-06, + "loss": 0.8114, + "step": 4312 + }, + { + "epoch": 0.23253180935949969, + "grad_norm": 0.9704170823097229, + "learning_rate": 9.920434493391029e-06, + "loss": 0.7866, + "step": 4313 + }, + { + "epoch": 0.2325857235281432, + "grad_norm": 0.7817257046699524, + "learning_rate": 9.920396818040009e-06, + "loss": 0.7344, + "step": 4314 + }, + { + "epoch": 0.23263963769678672, + "grad_norm": 0.8574941158294678, + "learning_rate": 
9.920359133842778e-06, + "loss": 0.8337, + "step": 4315 + }, + { + "epoch": 0.23269355186543023, + "grad_norm": 0.8683324456214905, + "learning_rate": 9.920321440799405e-06, + "loss": 0.6998, + "step": 4316 + }, + { + "epoch": 0.23274746603407376, + "grad_norm": 0.831664502620697, + "learning_rate": 9.920283738909958e-06, + "loss": 0.823, + "step": 4317 + }, + { + "epoch": 0.23280138020271726, + "grad_norm": 0.7768320441246033, + "learning_rate": 9.920246028174506e-06, + "loss": 0.8132, + "step": 4318 + }, + { + "epoch": 0.2328552943713608, + "grad_norm": 0.8081845045089722, + "learning_rate": 9.920208308593112e-06, + "loss": 0.8486, + "step": 4319 + }, + { + "epoch": 0.23290920854000433, + "grad_norm": 0.8148953914642334, + "learning_rate": 9.920170580165849e-06, + "loss": 0.7817, + "step": 4320 + }, + { + "epoch": 0.23296312270864783, + "grad_norm": 0.8448207378387451, + "learning_rate": 9.920132842892782e-06, + "loss": 0.81, + "step": 4321 + }, + { + "epoch": 0.23301703687729136, + "grad_norm": 0.9412322640419006, + "learning_rate": 9.92009509677398e-06, + "loss": 0.8847, + "step": 4322 + }, + { + "epoch": 0.23307095104593487, + "grad_norm": 0.745847761631012, + "learning_rate": 9.920057341809511e-06, + "loss": 0.6898, + "step": 4323 + }, + { + "epoch": 0.2331248652145784, + "grad_norm": 0.8120739459991455, + "learning_rate": 9.920019577999442e-06, + "loss": 0.7538, + "step": 4324 + }, + { + "epoch": 0.2331787793832219, + "grad_norm": 0.8183807730674744, + "learning_rate": 9.919981805343842e-06, + "loss": 0.7645, + "step": 4325 + }, + { + "epoch": 0.23323269355186543, + "grad_norm": 0.7442939281463623, + "learning_rate": 9.919944023842778e-06, + "loss": 0.7434, + "step": 4326 + }, + { + "epoch": 0.23328660772050894, + "grad_norm": 0.7586483359336853, + "learning_rate": 9.919906233496319e-06, + "loss": 0.767, + "step": 4327 + }, + { + "epoch": 0.23334052188915247, + "grad_norm": 0.8090452551841736, + "learning_rate": 9.919868434304531e-06, + "loss": 0.7217, + 
"step": 4328 + }, + { + "epoch": 0.233394436057796, + "grad_norm": 0.7344191670417786, + "learning_rate": 9.919830626267484e-06, + "loss": 0.8379, + "step": 4329 + }, + { + "epoch": 0.2334483502264395, + "grad_norm": 0.842797040939331, + "learning_rate": 9.919792809385244e-06, + "loss": 0.7191, + "step": 4330 + }, + { + "epoch": 0.23350226439508304, + "grad_norm": 0.9725179076194763, + "learning_rate": 9.91975498365788e-06, + "loss": 0.752, + "step": 4331 + }, + { + "epoch": 0.23355617856372654, + "grad_norm": 0.8612834811210632, + "learning_rate": 9.91971714908546e-06, + "loss": 0.6699, + "step": 4332 + }, + { + "epoch": 0.23361009273237007, + "grad_norm": 0.7784733772277832, + "learning_rate": 9.919679305668053e-06, + "loss": 0.6382, + "step": 4333 + }, + { + "epoch": 0.23366400690101358, + "grad_norm": 0.7414956092834473, + "learning_rate": 9.919641453405726e-06, + "loss": 0.7486, + "step": 4334 + }, + { + "epoch": 0.2337179210696571, + "grad_norm": 0.7242193818092346, + "learning_rate": 9.919603592298548e-06, + "loss": 0.7451, + "step": 4335 + }, + { + "epoch": 0.2337718352383006, + "grad_norm": 0.7716617584228516, + "learning_rate": 9.919565722346585e-06, + "loss": 0.8141, + "step": 4336 + }, + { + "epoch": 0.23382574940694414, + "grad_norm": 0.7281931042671204, + "learning_rate": 9.919527843549905e-06, + "loss": 0.7144, + "step": 4337 + }, + { + "epoch": 0.23387966357558768, + "grad_norm": 0.8105024695396423, + "learning_rate": 9.91948995590858e-06, + "loss": 0.7436, + "step": 4338 + }, + { + "epoch": 0.23393357774423118, + "grad_norm": 0.7437110543251038, + "learning_rate": 9.919452059422674e-06, + "loss": 0.7382, + "step": 4339 + }, + { + "epoch": 0.2339874919128747, + "grad_norm": 0.7429775595664978, + "learning_rate": 9.919414154092258e-06, + "loss": 0.6745, + "step": 4340 + }, + { + "epoch": 0.23404140608151822, + "grad_norm": 0.7441113591194153, + "learning_rate": 9.919376239917398e-06, + "loss": 0.6918, + "step": 4341 + }, + { + "epoch": 
0.23409532025016175, + "grad_norm": 0.7948750257492065, + "learning_rate": 9.919338316898162e-06, + "loss": 0.7844, + "step": 4342 + }, + { + "epoch": 0.23414923441880525, + "grad_norm": 0.8123278021812439, + "learning_rate": 9.91930038503462e-06, + "loss": 0.8009, + "step": 4343 + }, + { + "epoch": 0.23420314858744878, + "grad_norm": 0.7706881761550903, + "learning_rate": 9.919262444326841e-06, + "loss": 0.7557, + "step": 4344 + }, + { + "epoch": 0.2342570627560923, + "grad_norm": 0.7763088345527649, + "learning_rate": 9.91922449477489e-06, + "loss": 0.7718, + "step": 4345 + }, + { + "epoch": 0.23431097692473582, + "grad_norm": 0.8066530227661133, + "learning_rate": 9.919186536378836e-06, + "loss": 0.7332, + "step": 4346 + }, + { + "epoch": 0.23436489109337935, + "grad_norm": 0.7513235211372375, + "learning_rate": 9.91914856913875e-06, + "loss": 0.7551, + "step": 4347 + }, + { + "epoch": 0.23441880526202286, + "grad_norm": 0.7152560949325562, + "learning_rate": 9.919110593054697e-06, + "loss": 0.7086, + "step": 4348 + }, + { + "epoch": 0.2344727194306664, + "grad_norm": 0.8949812650680542, + "learning_rate": 9.919072608126747e-06, + "loss": 0.8965, + "step": 4349 + }, + { + "epoch": 0.2345266335993099, + "grad_norm": 0.7958235740661621, + "learning_rate": 9.919034614354968e-06, + "loss": 0.8007, + "step": 4350 + }, + { + "epoch": 0.23458054776795342, + "grad_norm": 0.7758817672729492, + "learning_rate": 9.91899661173943e-06, + "loss": 0.6801, + "step": 4351 + }, + { + "epoch": 0.23463446193659693, + "grad_norm": 0.6918591260910034, + "learning_rate": 9.918958600280196e-06, + "loss": 0.7202, + "step": 4352 + }, + { + "epoch": 0.23468837610524046, + "grad_norm": 0.7467452883720398, + "learning_rate": 9.918920579977339e-06, + "loss": 0.7289, + "step": 4353 + }, + { + "epoch": 0.23474229027388396, + "grad_norm": 0.8222523331642151, + "learning_rate": 9.918882550830926e-06, + "loss": 0.8121, + "step": 4354 + }, + { + "epoch": 0.2347962044425275, + "grad_norm": 
0.7198072671890259, + "learning_rate": 9.918844512841027e-06, + "loss": 0.7534, + "step": 4355 + }, + { + "epoch": 0.23485011861117103, + "grad_norm": 0.7741684317588806, + "learning_rate": 9.918806466007709e-06, + "loss": 0.7617, + "step": 4356 + }, + { + "epoch": 0.23490403277981453, + "grad_norm": 0.7739984393119812, + "learning_rate": 9.918768410331038e-06, + "loss": 0.668, + "step": 4357 + }, + { + "epoch": 0.23495794694845806, + "grad_norm": 0.7554827928543091, + "learning_rate": 9.918730345811088e-06, + "loss": 0.8149, + "step": 4358 + }, + { + "epoch": 0.23501186111710157, + "grad_norm": 0.687698483467102, + "learning_rate": 9.918692272447922e-06, + "loss": 0.7372, + "step": 4359 + }, + { + "epoch": 0.2350657752857451, + "grad_norm": 0.804979681968689, + "learning_rate": 9.91865419024161e-06, + "loss": 0.7604, + "step": 4360 + }, + { + "epoch": 0.2351196894543886, + "grad_norm": 0.839570164680481, + "learning_rate": 9.918616099192223e-06, + "loss": 0.7819, + "step": 4361 + }, + { + "epoch": 0.23517360362303213, + "grad_norm": 0.7619128823280334, + "learning_rate": 9.918577999299827e-06, + "loss": 0.7964, + "step": 4362 + }, + { + "epoch": 0.23522751779167564, + "grad_norm": 0.8392224311828613, + "learning_rate": 9.918539890564491e-06, + "loss": 0.778, + "step": 4363 + }, + { + "epoch": 0.23528143196031917, + "grad_norm": 0.7874334454536438, + "learning_rate": 9.918501772986284e-06, + "loss": 0.8403, + "step": 4364 + }, + { + "epoch": 0.2353353461289627, + "grad_norm": 0.7531299591064453, + "learning_rate": 9.918463646565276e-06, + "loss": 0.8639, + "step": 4365 + }, + { + "epoch": 0.2353892602976062, + "grad_norm": 0.7251406908035278, + "learning_rate": 9.91842551130153e-06, + "loss": 0.7858, + "step": 4366 + }, + { + "epoch": 0.23544317446624974, + "grad_norm": 0.8003079891204834, + "learning_rate": 9.918387367195121e-06, + "loss": 0.8117, + "step": 4367 + }, + { + "epoch": 0.23549708863489324, + "grad_norm": 0.7766731977462769, + "learning_rate": 
9.918349214246112e-06, + "loss": 0.7751, + "step": 4368 + }, + { + "epoch": 0.23555100280353677, + "grad_norm": 0.7517151236534119, + "learning_rate": 9.918311052454577e-06, + "loss": 0.7245, + "step": 4369 + }, + { + "epoch": 0.23560491697218028, + "grad_norm": 0.6932556629180908, + "learning_rate": 9.918272881820582e-06, + "loss": 0.7544, + "step": 4370 + }, + { + "epoch": 0.2356588311408238, + "grad_norm": 0.7345824837684631, + "learning_rate": 9.918234702344194e-06, + "loss": 0.7467, + "step": 4371 + }, + { + "epoch": 0.23571274530946731, + "grad_norm": 0.7525627017021179, + "learning_rate": 9.918196514025485e-06, + "loss": 0.8197, + "step": 4372 + }, + { + "epoch": 0.23576665947811085, + "grad_norm": 0.9494594931602478, + "learning_rate": 9.918158316864522e-06, + "loss": 0.7505, + "step": 4373 + }, + { + "epoch": 0.23582057364675438, + "grad_norm": 0.7376323342323303, + "learning_rate": 9.918120110861372e-06, + "loss": 0.7513, + "step": 4374 + }, + { + "epoch": 0.23587448781539788, + "grad_norm": 0.8581971526145935, + "learning_rate": 9.918081896016108e-06, + "loss": 0.8419, + "step": 4375 + }, + { + "epoch": 0.2359284019840414, + "grad_norm": 0.9238672256469727, + "learning_rate": 9.918043672328793e-06, + "loss": 0.9286, + "step": 4376 + }, + { + "epoch": 0.23598231615268492, + "grad_norm": 0.787239670753479, + "learning_rate": 9.9180054397995e-06, + "loss": 0.7917, + "step": 4377 + }, + { + "epoch": 0.23603623032132845, + "grad_norm": 0.862934947013855, + "learning_rate": 9.917967198428298e-06, + "loss": 0.9533, + "step": 4378 + }, + { + "epoch": 0.23609014448997195, + "grad_norm": 0.8004072308540344, + "learning_rate": 9.917928948215251e-06, + "loss": 0.8035, + "step": 4379 + }, + { + "epoch": 0.23614405865861549, + "grad_norm": 0.7238081097602844, + "learning_rate": 9.917890689160434e-06, + "loss": 0.7777, + "step": 4380 + }, + { + "epoch": 0.23619797282725902, + "grad_norm": 0.7420337200164795, + "learning_rate": 9.917852421263912e-06, + "loss": 0.7643, + 
"step": 4381 + }, + { + "epoch": 0.23625188699590252, + "grad_norm": 0.8613260984420776, + "learning_rate": 9.917814144525754e-06, + "loss": 0.7936, + "step": 4382 + }, + { + "epoch": 0.23630580116454605, + "grad_norm": 0.787196934223175, + "learning_rate": 9.91777585894603e-06, + "loss": 0.8281, + "step": 4383 + }, + { + "epoch": 0.23635971533318956, + "grad_norm": 0.8265708088874817, + "learning_rate": 9.917737564524807e-06, + "loss": 0.7518, + "step": 4384 + }, + { + "epoch": 0.2364136295018331, + "grad_norm": 0.7922816276550293, + "learning_rate": 9.917699261262156e-06, + "loss": 0.8803, + "step": 4385 + }, + { + "epoch": 0.2364675436704766, + "grad_norm": 0.8977661728858948, + "learning_rate": 9.917660949158147e-06, + "loss": 0.9311, + "step": 4386 + }, + { + "epoch": 0.23652145783912012, + "grad_norm": 0.7732436060905457, + "learning_rate": 9.917622628212846e-06, + "loss": 0.7885, + "step": 4387 + }, + { + "epoch": 0.23657537200776363, + "grad_norm": 0.7951593399047852, + "learning_rate": 9.917584298426322e-06, + "loss": 0.9044, + "step": 4388 + }, + { + "epoch": 0.23662928617640716, + "grad_norm": 0.7638776898384094, + "learning_rate": 9.917545959798643e-06, + "loss": 0.7276, + "step": 4389 + }, + { + "epoch": 0.2366832003450507, + "grad_norm": 0.8405231833457947, + "learning_rate": 9.917507612329882e-06, + "loss": 0.9712, + "step": 4390 + }, + { + "epoch": 0.2367371145136942, + "grad_norm": 0.7908889651298523, + "learning_rate": 9.917469256020104e-06, + "loss": 0.7017, + "step": 4391 + }, + { + "epoch": 0.23679102868233773, + "grad_norm": 0.7041110992431641, + "learning_rate": 9.917430890869379e-06, + "loss": 0.811, + "step": 4392 + }, + { + "epoch": 0.23684494285098123, + "grad_norm": 0.923809289932251, + "learning_rate": 9.917392516877779e-06, + "loss": 0.7363, + "step": 4393 + }, + { + "epoch": 0.23689885701962476, + "grad_norm": 0.7647616267204285, + "learning_rate": 9.91735413404537e-06, + "loss": 0.6428, + "step": 4394 + }, + { + "epoch": 
0.23695277118826827, + "grad_norm": 0.7839642763137817, + "learning_rate": 9.91731574237222e-06, + "loss": 0.7562, + "step": 4395 + }, + { + "epoch": 0.2370066853569118, + "grad_norm": 0.7928365468978882, + "learning_rate": 9.9172773418584e-06, + "loss": 0.8758, + "step": 4396 + }, + { + "epoch": 0.2370605995255553, + "grad_norm": 0.8615469336509705, + "learning_rate": 9.917238932503979e-06, + "loss": 0.8264, + "step": 4397 + }, + { + "epoch": 0.23711451369419884, + "grad_norm": 0.7869088649749756, + "learning_rate": 9.917200514309024e-06, + "loss": 0.8973, + "step": 4398 + }, + { + "epoch": 0.23716842786284237, + "grad_norm": 0.8070249557495117, + "learning_rate": 9.917162087273606e-06, + "loss": 0.786, + "step": 4399 + }, + { + "epoch": 0.23722234203148587, + "grad_norm": 0.7543795704841614, + "learning_rate": 9.917123651397796e-06, + "loss": 0.6012, + "step": 4400 + }, + { + "epoch": 0.2372762562001294, + "grad_norm": 1.0837504863739014, + "learning_rate": 9.91708520668166e-06, + "loss": 0.7304, + "step": 4401 + }, + { + "epoch": 0.2373301703687729, + "grad_norm": 0.8013801574707031, + "learning_rate": 9.917046753125265e-06, + "loss": 0.6564, + "step": 4402 + }, + { + "epoch": 0.23738408453741644, + "grad_norm": 0.8721063137054443, + "learning_rate": 9.917008290728687e-06, + "loss": 0.9042, + "step": 4403 + }, + { + "epoch": 0.23743799870605994, + "grad_norm": 0.9169342517852783, + "learning_rate": 9.91696981949199e-06, + "loss": 0.8766, + "step": 4404 + }, + { + "epoch": 0.23749191287470348, + "grad_norm": 0.7514129877090454, + "learning_rate": 9.916931339415243e-06, + "loss": 0.7818, + "step": 4405 + }, + { + "epoch": 0.23754582704334698, + "grad_norm": 0.747178316116333, + "learning_rate": 9.916892850498518e-06, + "loss": 0.7608, + "step": 4406 + }, + { + "epoch": 0.2375997412119905, + "grad_norm": 0.7261523008346558, + "learning_rate": 9.916854352741883e-06, + "loss": 0.6679, + "step": 4407 + }, + { + "epoch": 0.23765365538063404, + "grad_norm": 
0.7496599555015564, + "learning_rate": 9.916815846145407e-06, + "loss": 0.8072, + "step": 4408 + }, + { + "epoch": 0.23770756954927755, + "grad_norm": 0.8052302002906799, + "learning_rate": 9.916777330709159e-06, + "loss": 0.7882, + "step": 4409 + }, + { + "epoch": 0.23776148371792108, + "grad_norm": 0.8955451250076294, + "learning_rate": 9.916738806433208e-06, + "loss": 0.7566, + "step": 4410 + }, + { + "epoch": 0.23781539788656458, + "grad_norm": 0.7964259386062622, + "learning_rate": 9.916700273317623e-06, + "loss": 0.7503, + "step": 4411 + }, + { + "epoch": 0.23786931205520812, + "grad_norm": 0.904030978679657, + "learning_rate": 9.916661731362476e-06, + "loss": 0.8056, + "step": 4412 + }, + { + "epoch": 0.23792322622385162, + "grad_norm": 0.8031491637229919, + "learning_rate": 9.916623180567833e-06, + "loss": 0.7978, + "step": 4413 + }, + { + "epoch": 0.23797714039249515, + "grad_norm": 1.2857294082641602, + "learning_rate": 9.916584620933764e-06, + "loss": 0.7822, + "step": 4414 + }, + { + "epoch": 0.23803105456113866, + "grad_norm": 0.8789198994636536, + "learning_rate": 9.91654605246034e-06, + "loss": 0.8392, + "step": 4415 + }, + { + "epoch": 0.2380849687297822, + "grad_norm": 0.7934818267822266, + "learning_rate": 9.91650747514763e-06, + "loss": 0.8242, + "step": 4416 + }, + { + "epoch": 0.23813888289842572, + "grad_norm": 0.8770273923873901, + "learning_rate": 9.916468888995703e-06, + "loss": 0.7649, + "step": 4417 + }, + { + "epoch": 0.23819279706706922, + "grad_norm": 0.9187912940979004, + "learning_rate": 9.916430294004627e-06, + "loss": 0.7531, + "step": 4418 + }, + { + "epoch": 0.23824671123571275, + "grad_norm": 0.8346499800682068, + "learning_rate": 9.916391690174472e-06, + "loss": 0.7785, + "step": 4419 + }, + { + "epoch": 0.23830062540435626, + "grad_norm": 0.7771525382995605, + "learning_rate": 9.916353077505307e-06, + "loss": 0.8418, + "step": 4420 + }, + { + "epoch": 0.2383545395729998, + "grad_norm": 0.8043860197067261, + "learning_rate": 
9.916314455997204e-06, + "loss": 0.7878, + "step": 4421 + }, + { + "epoch": 0.2384084537416433, + "grad_norm": 0.8319140672683716, + "learning_rate": 9.916275825650231e-06, + "loss": 0.7751, + "step": 4422 + }, + { + "epoch": 0.23846236791028683, + "grad_norm": 0.7341157793998718, + "learning_rate": 9.916237186464455e-06, + "loss": 0.7486, + "step": 4423 + }, + { + "epoch": 0.23851628207893033, + "grad_norm": 0.8434766530990601, + "learning_rate": 9.91619853843995e-06, + "loss": 0.7906, + "step": 4424 + }, + { + "epoch": 0.23857019624757386, + "grad_norm": 0.8698723912239075, + "learning_rate": 9.916159881576782e-06, + "loss": 0.7577, + "step": 4425 + }, + { + "epoch": 0.2386241104162174, + "grad_norm": 0.6935116052627563, + "learning_rate": 9.91612121587502e-06, + "loss": 0.7538, + "step": 4426 + }, + { + "epoch": 0.2386780245848609, + "grad_norm": 0.7313439249992371, + "learning_rate": 9.916082541334737e-06, + "loss": 0.8306, + "step": 4427 + }, + { + "epoch": 0.23873193875350443, + "grad_norm": 0.7396842241287231, + "learning_rate": 9.916043857956e-06, + "loss": 0.8037, + "step": 4428 + }, + { + "epoch": 0.23878585292214793, + "grad_norm": 0.7954176664352417, + "learning_rate": 9.91600516573888e-06, + "loss": 0.7552, + "step": 4429 + }, + { + "epoch": 0.23883976709079147, + "grad_norm": 0.7113604545593262, + "learning_rate": 9.915966464683444e-06, + "loss": 0.722, + "step": 4430 + }, + { + "epoch": 0.23889368125943497, + "grad_norm": 0.7765493392944336, + "learning_rate": 9.915927754789765e-06, + "loss": 0.7143, + "step": 4431 + }, + { + "epoch": 0.2389475954280785, + "grad_norm": 0.8287819623947144, + "learning_rate": 9.91588903605791e-06, + "loss": 0.8256, + "step": 4432 + }, + { + "epoch": 0.239001509596722, + "grad_norm": 0.7855268120765686, + "learning_rate": 9.91585030848795e-06, + "loss": 0.8666, + "step": 4433 + }, + { + "epoch": 0.23905542376536554, + "grad_norm": 0.7613146901130676, + "learning_rate": 9.915811572079955e-06, + "loss": 0.7367, + "step": 
4434 + }, + { + "epoch": 0.23910933793400907, + "grad_norm": 0.7982416152954102, + "learning_rate": 9.91577282683399e-06, + "loss": 0.8782, + "step": 4435 + }, + { + "epoch": 0.23916325210265257, + "grad_norm": 0.8698425889015198, + "learning_rate": 9.915734072750132e-06, + "loss": 0.7962, + "step": 4436 + }, + { + "epoch": 0.2392171662712961, + "grad_norm": 0.7771449089050293, + "learning_rate": 9.915695309828449e-06, + "loss": 0.8175, + "step": 4437 + }, + { + "epoch": 0.2392710804399396, + "grad_norm": 0.7628130912780762, + "learning_rate": 9.915656538069005e-06, + "loss": 0.8522, + "step": 4438 + }, + { + "epoch": 0.23932499460858314, + "grad_norm": 0.7890259623527527, + "learning_rate": 9.915617757471873e-06, + "loss": 0.7256, + "step": 4439 + }, + { + "epoch": 0.23937890877722665, + "grad_norm": 0.8656981587409973, + "learning_rate": 9.915578968037127e-06, + "loss": 0.7982, + "step": 4440 + }, + { + "epoch": 0.23943282294587018, + "grad_norm": 0.7118672132492065, + "learning_rate": 9.91554016976483e-06, + "loss": 0.7917, + "step": 4441 + }, + { + "epoch": 0.23948673711451368, + "grad_norm": 0.8988688588142395, + "learning_rate": 9.915501362655055e-06, + "loss": 0.7884, + "step": 4442 + }, + { + "epoch": 0.2395406512831572, + "grad_norm": 0.7870175242424011, + "learning_rate": 9.915462546707873e-06, + "loss": 0.8008, + "step": 4443 + }, + { + "epoch": 0.23959456545180075, + "grad_norm": 0.8649255037307739, + "learning_rate": 9.915423721923351e-06, + "loss": 0.7897, + "step": 4444 + }, + { + "epoch": 0.23964847962044425, + "grad_norm": 0.8905230164527893, + "learning_rate": 9.915384888301561e-06, + "loss": 0.8611, + "step": 4445 + }, + { + "epoch": 0.23970239378908778, + "grad_norm": 0.7729083299636841, + "learning_rate": 9.91534604584257e-06, + "loss": 0.7539, + "step": 4446 + }, + { + "epoch": 0.23975630795773129, + "grad_norm": 0.9127714037895203, + "learning_rate": 9.915307194546452e-06, + "loss": 0.8286, + "step": 4447 + }, + { + "epoch": 
0.23981022212637482, + "grad_norm": 0.9115898013114929, + "learning_rate": 9.915268334413274e-06, + "loss": 0.8655, + "step": 4448 + }, + { + "epoch": 0.23986413629501832, + "grad_norm": 0.8105745315551758, + "learning_rate": 9.915229465443106e-06, + "loss": 0.7936, + "step": 4449 + }, + { + "epoch": 0.23991805046366185, + "grad_norm": 0.732665479183197, + "learning_rate": 9.91519058763602e-06, + "loss": 0.6538, + "step": 4450 + }, + { + "epoch": 0.23997196463230536, + "grad_norm": 0.7506905794143677, + "learning_rate": 9.91515170099208e-06, + "loss": 0.7461, + "step": 4451 + }, + { + "epoch": 0.2400258788009489, + "grad_norm": 1.0013810396194458, + "learning_rate": 9.915112805511364e-06, + "loss": 0.8622, + "step": 4452 + }, + { + "epoch": 0.24007979296959242, + "grad_norm": 0.8527307510375977, + "learning_rate": 9.915073901193937e-06, + "loss": 0.7516, + "step": 4453 + }, + { + "epoch": 0.24013370713823592, + "grad_norm": 0.756240963935852, + "learning_rate": 9.91503498803987e-06, + "loss": 0.7374, + "step": 4454 + }, + { + "epoch": 0.24018762130687946, + "grad_norm": 0.7914390563964844, + "learning_rate": 9.914996066049234e-06, + "loss": 0.7492, + "step": 4455 + }, + { + "epoch": 0.24024153547552296, + "grad_norm": 0.820505678653717, + "learning_rate": 9.914957135222096e-06, + "loss": 0.6724, + "step": 4456 + }, + { + "epoch": 0.2402954496441665, + "grad_norm": 0.9144145846366882, + "learning_rate": 9.91491819555853e-06, + "loss": 0.8507, + "step": 4457 + }, + { + "epoch": 0.24034936381281, + "grad_norm": 0.7114265561103821, + "learning_rate": 9.914879247058602e-06, + "loss": 0.7308, + "step": 4458 + }, + { + "epoch": 0.24040327798145353, + "grad_norm": 0.8527531027793884, + "learning_rate": 9.914840289722385e-06, + "loss": 0.8446, + "step": 4459 + }, + { + "epoch": 0.24045719215009703, + "grad_norm": 0.9392815232276917, + "learning_rate": 9.914801323549948e-06, + "loss": 0.8434, + "step": 4460 + }, + { + "epoch": 0.24051110631874056, + "grad_norm": 
0.8654825687408447, + "learning_rate": 9.91476234854136e-06, + "loss": 0.7719, + "step": 4461 + }, + { + "epoch": 0.2405650204873841, + "grad_norm": 0.8563691973686218, + "learning_rate": 9.914723364696693e-06, + "loss": 0.7922, + "step": 4462 + }, + { + "epoch": 0.2406189346560276, + "grad_norm": 0.7988063097000122, + "learning_rate": 9.914684372016016e-06, + "loss": 0.8222, + "step": 4463 + }, + { + "epoch": 0.24067284882467113, + "grad_norm": 0.8066624402999878, + "learning_rate": 9.9146453704994e-06, + "loss": 0.8205, + "step": 4464 + }, + { + "epoch": 0.24072676299331464, + "grad_norm": 0.9636842608451843, + "learning_rate": 9.914606360146915e-06, + "loss": 0.835, + "step": 4465 + }, + { + "epoch": 0.24078067716195817, + "grad_norm": 0.7767032980918884, + "learning_rate": 9.91456734095863e-06, + "loss": 0.7637, + "step": 4466 + }, + { + "epoch": 0.24083459133060167, + "grad_norm": 0.7343990802764893, + "learning_rate": 9.914528312934614e-06, + "loss": 0.752, + "step": 4467 + }, + { + "epoch": 0.2408885054992452, + "grad_norm": 0.8200786113739014, + "learning_rate": 9.91448927607494e-06, + "loss": 0.7468, + "step": 4468 + }, + { + "epoch": 0.2409424196678887, + "grad_norm": 0.810748279094696, + "learning_rate": 9.91445023037968e-06, + "loss": 0.8567, + "step": 4469 + }, + { + "epoch": 0.24099633383653224, + "grad_norm": 0.8314438462257385, + "learning_rate": 9.914411175848896e-06, + "loss": 0.8693, + "step": 4470 + }, + { + "epoch": 0.24105024800517577, + "grad_norm": 0.827609121799469, + "learning_rate": 9.914372112482668e-06, + "loss": 0.7171, + "step": 4471 + }, + { + "epoch": 0.24110416217381928, + "grad_norm": 0.7794898748397827, + "learning_rate": 9.91433304028106e-06, + "loss": 0.7279, + "step": 4472 + }, + { + "epoch": 0.2411580763424628, + "grad_norm": 0.7951536178588867, + "learning_rate": 9.914293959244145e-06, + "loss": 0.8438, + "step": 4473 + }, + { + "epoch": 0.2412119905111063, + "grad_norm": 0.8130155801773071, + "learning_rate": 
9.914254869371991e-06, + "loss": 0.7849, + "step": 4474 + }, + { + "epoch": 0.24126590467974984, + "grad_norm": 0.8347324728965759, + "learning_rate": 9.91421577066467e-06, + "loss": 0.851, + "step": 4475 + }, + { + "epoch": 0.24131981884839335, + "grad_norm": 0.8122373819351196, + "learning_rate": 9.914176663122252e-06, + "loss": 0.8293, + "step": 4476 + }, + { + "epoch": 0.24137373301703688, + "grad_norm": 0.728115975856781, + "learning_rate": 9.914137546744807e-06, + "loss": 0.7865, + "step": 4477 + }, + { + "epoch": 0.24142764718568038, + "grad_norm": 0.8177993893623352, + "learning_rate": 9.914098421532404e-06, + "loss": 0.7765, + "step": 4478 + }, + { + "epoch": 0.24148156135432391, + "grad_norm": 0.7987833619117737, + "learning_rate": 9.914059287485117e-06, + "loss": 0.7611, + "step": 4479 + }, + { + "epoch": 0.24153547552296745, + "grad_norm": 0.7656280994415283, + "learning_rate": 9.914020144603013e-06, + "loss": 0.7538, + "step": 4480 + }, + { + "epoch": 0.24158938969161095, + "grad_norm": 0.69268798828125, + "learning_rate": 9.913980992886163e-06, + "loss": 0.6306, + "step": 4481 + }, + { + "epoch": 0.24164330386025448, + "grad_norm": 0.7506656050682068, + "learning_rate": 9.91394183233464e-06, + "loss": 0.7629, + "step": 4482 + }, + { + "epoch": 0.241697218028898, + "grad_norm": 0.893014669418335, + "learning_rate": 9.91390266294851e-06, + "loss": 0.7696, + "step": 4483 + }, + { + "epoch": 0.24175113219754152, + "grad_norm": 0.8073716163635254, + "learning_rate": 9.913863484727847e-06, + "loss": 0.7901, + "step": 4484 + }, + { + "epoch": 0.24180504636618502, + "grad_norm": 0.7654293775558472, + "learning_rate": 9.913824297672721e-06, + "loss": 0.8004, + "step": 4485 + }, + { + "epoch": 0.24185896053482855, + "grad_norm": 0.7301006317138672, + "learning_rate": 9.9137851017832e-06, + "loss": 0.694, + "step": 4486 + }, + { + "epoch": 0.24191287470347209, + "grad_norm": 0.7901747822761536, + "learning_rate": 9.913745897059356e-06, + "loss": 0.741, + "step": 
4487 + }, + { + "epoch": 0.2419667888721156, + "grad_norm": 0.7572670578956604, + "learning_rate": 9.91370668350126e-06, + "loss": 0.8361, + "step": 4488 + }, + { + "epoch": 0.24202070304075912, + "grad_norm": 0.8322924971580505, + "learning_rate": 9.913667461108983e-06, + "loss": 0.8155, + "step": 4489 + }, + { + "epoch": 0.24207461720940263, + "grad_norm": 1.0176936388015747, + "learning_rate": 9.913628229882593e-06, + "loss": 0.8341, + "step": 4490 + }, + { + "epoch": 0.24212853137804616, + "grad_norm": 0.7386930584907532, + "learning_rate": 9.913588989822165e-06, + "loss": 0.838, + "step": 4491 + }, + { + "epoch": 0.24218244554668966, + "grad_norm": 0.874079167842865, + "learning_rate": 9.913549740927764e-06, + "loss": 0.7181, + "step": 4492 + }, + { + "epoch": 0.2422363597153332, + "grad_norm": 0.8320260643959045, + "learning_rate": 9.913510483199464e-06, + "loss": 0.8909, + "step": 4493 + }, + { + "epoch": 0.2422902738839767, + "grad_norm": 0.7491182088851929, + "learning_rate": 9.913471216637335e-06, + "loss": 0.8469, + "step": 4494 + }, + { + "epoch": 0.24234418805262023, + "grad_norm": 0.7132229804992676, + "learning_rate": 9.913431941241446e-06, + "loss": 0.7237, + "step": 4495 + }, + { + "epoch": 0.24239810222126376, + "grad_norm": 0.8269235491752625, + "learning_rate": 9.913392657011872e-06, + "loss": 0.7929, + "step": 4496 + }, + { + "epoch": 0.24245201638990727, + "grad_norm": 0.8247712254524231, + "learning_rate": 9.913353363948679e-06, + "loss": 0.7298, + "step": 4497 + }, + { + "epoch": 0.2425059305585508, + "grad_norm": 0.761820912361145, + "learning_rate": 9.91331406205194e-06, + "loss": 0.749, + "step": 4498 + }, + { + "epoch": 0.2425598447271943, + "grad_norm": 0.7263596653938293, + "learning_rate": 9.913274751321723e-06, + "loss": 0.8055, + "step": 4499 + }, + { + "epoch": 0.24261375889583783, + "grad_norm": 0.7232603430747986, + "learning_rate": 9.913235431758102e-06, + "loss": 0.8011, + "step": 4500 + }, + { + "epoch": 0.24266767306448134, + 
"grad_norm": 0.8140621781349182, + "learning_rate": 9.913196103361146e-06, + "loss": 0.8332, + "step": 4501 + }, + { + "epoch": 0.24272158723312487, + "grad_norm": 0.8474514484405518, + "learning_rate": 9.913156766130926e-06, + "loss": 0.8632, + "step": 4502 + }, + { + "epoch": 0.24277550140176837, + "grad_norm": 0.8690447211265564, + "learning_rate": 9.913117420067515e-06, + "loss": 0.8027, + "step": 4503 + }, + { + "epoch": 0.2428294155704119, + "grad_norm": 0.7381221652030945, + "learning_rate": 9.91307806517098e-06, + "loss": 0.7622, + "step": 4504 + }, + { + "epoch": 0.24288332973905544, + "grad_norm": 0.7889763712882996, + "learning_rate": 9.913038701441393e-06, + "loss": 0.7792, + "step": 4505 + }, + { + "epoch": 0.24293724390769894, + "grad_norm": 0.7800214886665344, + "learning_rate": 9.912999328878825e-06, + "loss": 0.7972, + "step": 4506 + }, + { + "epoch": 0.24299115807634247, + "grad_norm": 0.7379936575889587, + "learning_rate": 9.912959947483348e-06, + "loss": 0.7353, + "step": 4507 + }, + { + "epoch": 0.24304507224498598, + "grad_norm": 0.7070313692092896, + "learning_rate": 9.912920557255028e-06, + "loss": 0.7483, + "step": 4508 + }, + { + "epoch": 0.2430989864136295, + "grad_norm": 0.7230751514434814, + "learning_rate": 9.912881158193943e-06, + "loss": 0.7882, + "step": 4509 + }, + { + "epoch": 0.243152900582273, + "grad_norm": 0.8739690780639648, + "learning_rate": 9.91284175030016e-06, + "loss": 0.885, + "step": 4510 + }, + { + "epoch": 0.24320681475091654, + "grad_norm": 0.7954097986221313, + "learning_rate": 9.912802333573748e-06, + "loss": 0.8575, + "step": 4511 + }, + { + "epoch": 0.24326072891956005, + "grad_norm": 0.7602096796035767, + "learning_rate": 9.912762908014781e-06, + "loss": 0.7847, + "step": 4512 + }, + { + "epoch": 0.24331464308820358, + "grad_norm": 0.7269259691238403, + "learning_rate": 9.91272347362333e-06, + "loss": 0.7931, + "step": 4513 + }, + { + "epoch": 0.2433685572568471, + "grad_norm": 0.6849657297134399, + 
"learning_rate": 9.912684030399464e-06, + "loss": 0.7478, + "step": 4514 + }, + { + "epoch": 0.24342247142549062, + "grad_norm": 0.8350282907485962, + "learning_rate": 9.912644578343255e-06, + "loss": 0.829, + "step": 4515 + }, + { + "epoch": 0.24347638559413415, + "grad_norm": 0.7411940693855286, + "learning_rate": 9.912605117454772e-06, + "loss": 0.6513, + "step": 4516 + }, + { + "epoch": 0.24353029976277765, + "grad_norm": 0.73365718126297, + "learning_rate": 9.912565647734089e-06, + "loss": 0.7376, + "step": 4517 + }, + { + "epoch": 0.24358421393142118, + "grad_norm": 0.8144620060920715, + "learning_rate": 9.912526169181275e-06, + "loss": 0.745, + "step": 4518 + }, + { + "epoch": 0.2436381281000647, + "grad_norm": 0.7516615390777588, + "learning_rate": 9.912486681796403e-06, + "loss": 0.8864, + "step": 4519 + }, + { + "epoch": 0.24369204226870822, + "grad_norm": 0.8179273009300232, + "learning_rate": 9.91244718557954e-06, + "loss": 0.7763, + "step": 4520 + }, + { + "epoch": 0.24374595643735172, + "grad_norm": 0.7541390657424927, + "learning_rate": 9.912407680530762e-06, + "loss": 0.8565, + "step": 4521 + }, + { + "epoch": 0.24379987060599526, + "grad_norm": 0.7410699129104614, + "learning_rate": 9.912368166650137e-06, + "loss": 0.7938, + "step": 4522 + }, + { + "epoch": 0.2438537847746388, + "grad_norm": 0.840753972530365, + "learning_rate": 9.912328643937735e-06, + "loss": 0.8895, + "step": 4523 + }, + { + "epoch": 0.2439076989432823, + "grad_norm": 0.7780727744102478, + "learning_rate": 9.91228911239363e-06, + "loss": 0.8554, + "step": 4524 + }, + { + "epoch": 0.24396161311192582, + "grad_norm": 0.8156387805938721, + "learning_rate": 9.91224957201789e-06, + "loss": 0.7871, + "step": 4525 + }, + { + "epoch": 0.24401552728056933, + "grad_norm": 0.7830832004547119, + "learning_rate": 9.912210022810591e-06, + "loss": 0.729, + "step": 4526 + }, + { + "epoch": 0.24406944144921286, + "grad_norm": 0.9109267592430115, + "learning_rate": 9.912170464771799e-06, + 
"loss": 0.8281, + "step": 4527 + }, + { + "epoch": 0.24412335561785636, + "grad_norm": 0.7609542012214661, + "learning_rate": 9.912130897901587e-06, + "loss": 0.8907, + "step": 4528 + }, + { + "epoch": 0.2441772697864999, + "grad_norm": 0.8503179550170898, + "learning_rate": 9.912091322200025e-06, + "loss": 0.8337, + "step": 4529 + }, + { + "epoch": 0.2442311839551434, + "grad_norm": 0.8808969259262085, + "learning_rate": 9.912051737667188e-06, + "loss": 0.806, + "step": 4530 + }, + { + "epoch": 0.24428509812378693, + "grad_norm": 0.8438240885734558, + "learning_rate": 9.912012144303142e-06, + "loss": 0.8318, + "step": 4531 + }, + { + "epoch": 0.24433901229243046, + "grad_norm": 0.7944091558456421, + "learning_rate": 9.911972542107962e-06, + "loss": 0.764, + "step": 4532 + }, + { + "epoch": 0.24439292646107397, + "grad_norm": 0.7484297752380371, + "learning_rate": 9.911932931081718e-06, + "loss": 0.7677, + "step": 4533 + }, + { + "epoch": 0.2444468406297175, + "grad_norm": 0.9554882049560547, + "learning_rate": 9.911893311224479e-06, + "loss": 0.7226, + "step": 4534 + }, + { + "epoch": 0.244500754798361, + "grad_norm": 0.6818152070045471, + "learning_rate": 9.91185368253632e-06, + "loss": 0.6565, + "step": 4535 + }, + { + "epoch": 0.24455466896700453, + "grad_norm": 0.8960323333740234, + "learning_rate": 9.91181404501731e-06, + "loss": 0.6672, + "step": 4536 + }, + { + "epoch": 0.24460858313564804, + "grad_norm": 0.8440603017807007, + "learning_rate": 9.911774398667521e-06, + "loss": 0.8423, + "step": 4537 + }, + { + "epoch": 0.24466249730429157, + "grad_norm": 0.7892050743103027, + "learning_rate": 9.911734743487025e-06, + "loss": 0.8362, + "step": 4538 + }, + { + "epoch": 0.24471641147293507, + "grad_norm": 0.7640264630317688, + "learning_rate": 9.911695079475892e-06, + "loss": 0.7893, + "step": 4539 + }, + { + "epoch": 0.2447703256415786, + "grad_norm": 0.8862099051475525, + "learning_rate": 9.911655406634191e-06, + "loss": 0.8155, + "step": 4540 + }, + { + 
"epoch": 0.24482423981022214, + "grad_norm": 0.7623111009597778, + "learning_rate": 9.911615724961999e-06, + "loss": 0.8313, + "step": 4541 + }, + { + "epoch": 0.24487815397886564, + "grad_norm": 0.713832676410675, + "learning_rate": 9.911576034459385e-06, + "loss": 0.7635, + "step": 4542 + }, + { + "epoch": 0.24493206814750917, + "grad_norm": 0.8501582741737366, + "learning_rate": 9.911536335126417e-06, + "loss": 0.8201, + "step": 4543 + }, + { + "epoch": 0.24498598231615268, + "grad_norm": 0.7051424980163574, + "learning_rate": 9.911496626963171e-06, + "loss": 0.7257, + "step": 4544 + }, + { + "epoch": 0.2450398964847962, + "grad_norm": 0.8079765439033508, + "learning_rate": 9.911456909969716e-06, + "loss": 0.7397, + "step": 4545 + }, + { + "epoch": 0.24509381065343971, + "grad_norm": 0.9106319546699524, + "learning_rate": 9.911417184146124e-06, + "loss": 0.7966, + "step": 4546 + }, + { + "epoch": 0.24514772482208325, + "grad_norm": 0.9614812731742859, + "learning_rate": 9.911377449492465e-06, + "loss": 0.7727, + "step": 4547 + }, + { + "epoch": 0.24520163899072675, + "grad_norm": 0.8388345241546631, + "learning_rate": 9.911337706008813e-06, + "loss": 0.7328, + "step": 4548 + }, + { + "epoch": 0.24525555315937028, + "grad_norm": 0.782459020614624, + "learning_rate": 9.911297953695239e-06, + "loss": 0.7823, + "step": 4549 + }, + { + "epoch": 0.2453094673280138, + "grad_norm": 0.8531977534294128, + "learning_rate": 9.911258192551812e-06, + "loss": 0.7139, + "step": 4550 + }, + { + "epoch": 0.24536338149665732, + "grad_norm": 0.7864230871200562, + "learning_rate": 9.911218422578605e-06, + "loss": 0.8322, + "step": 4551 + }, + { + "epoch": 0.24541729566530085, + "grad_norm": 0.7742743492126465, + "learning_rate": 9.911178643775691e-06, + "loss": 0.6747, + "step": 4552 + }, + { + "epoch": 0.24547120983394435, + "grad_norm": 0.7385323643684387, + "learning_rate": 9.91113885614314e-06, + "loss": 0.7297, + "step": 4553 + }, + { + "epoch": 0.24552512400258789, + 
"grad_norm": 0.8086322546005249, + "learning_rate": 9.911099059681023e-06, + "loss": 0.8216, + "step": 4554 + }, + { + "epoch": 0.2455790381712314, + "grad_norm": 0.7630950808525085, + "learning_rate": 9.911059254389412e-06, + "loss": 0.7549, + "step": 4555 + }, + { + "epoch": 0.24563295233987492, + "grad_norm": 0.8294158577919006, + "learning_rate": 9.91101944026838e-06, + "loss": 0.7858, + "step": 4556 + }, + { + "epoch": 0.24568686650851843, + "grad_norm": 0.8100032210350037, + "learning_rate": 9.910979617317998e-06, + "loss": 0.8488, + "step": 4557 + }, + { + "epoch": 0.24574078067716196, + "grad_norm": 0.7359179258346558, + "learning_rate": 9.910939785538335e-06, + "loss": 0.8151, + "step": 4558 + }, + { + "epoch": 0.2457946948458055, + "grad_norm": 0.811253011226654, + "learning_rate": 9.910899944929465e-06, + "loss": 0.801, + "step": 4559 + }, + { + "epoch": 0.245848609014449, + "grad_norm": 0.7908209562301636, + "learning_rate": 9.91086009549146e-06, + "loss": 0.8618, + "step": 4560 + }, + { + "epoch": 0.24590252318309253, + "grad_norm": 0.7895631790161133, + "learning_rate": 9.91082023722439e-06, + "loss": 0.7601, + "step": 4561 + }, + { + "epoch": 0.24595643735173603, + "grad_norm": 0.7346864938735962, + "learning_rate": 9.910780370128328e-06, + "loss": 0.7725, + "step": 4562 + }, + { + "epoch": 0.24601035152037956, + "grad_norm": 0.6873648166656494, + "learning_rate": 9.910740494203346e-06, + "loss": 0.7597, + "step": 4563 + }, + { + "epoch": 0.24606426568902307, + "grad_norm": 0.8287232518196106, + "learning_rate": 9.910700609449514e-06, + "loss": 0.8514, + "step": 4564 + }, + { + "epoch": 0.2461181798576666, + "grad_norm": 0.9342181086540222, + "learning_rate": 9.910660715866904e-06, + "loss": 0.8839, + "step": 4565 + }, + { + "epoch": 0.2461720940263101, + "grad_norm": 0.7942633032798767, + "learning_rate": 9.91062081345559e-06, + "loss": 0.7975, + "step": 4566 + }, + { + "epoch": 0.24622600819495363, + "grad_norm": 0.8790503144264221, + 
"learning_rate": 9.910580902215641e-06, + "loss": 0.655, + "step": 4567 + }, + { + "epoch": 0.24627992236359716, + "grad_norm": 0.7399418354034424, + "learning_rate": 9.91054098214713e-06, + "loss": 0.7647, + "step": 4568 + }, + { + "epoch": 0.24633383653224067, + "grad_norm": 1.9217935800552368, + "learning_rate": 9.91050105325013e-06, + "loss": 0.8151, + "step": 4569 + }, + { + "epoch": 0.2463877507008842, + "grad_norm": 0.7717850804328918, + "learning_rate": 9.910461115524709e-06, + "loss": 0.7653, + "step": 4570 + }, + { + "epoch": 0.2464416648695277, + "grad_norm": 0.9564247131347656, + "learning_rate": 9.910421168970943e-06, + "loss": 0.8427, + "step": 4571 + }, + { + "epoch": 0.24649557903817124, + "grad_norm": 0.7386001348495483, + "learning_rate": 9.9103812135889e-06, + "loss": 0.7577, + "step": 4572 + }, + { + "epoch": 0.24654949320681474, + "grad_norm": 0.7440508008003235, + "learning_rate": 9.910341249378656e-06, + "loss": 0.7735, + "step": 4573 + }, + { + "epoch": 0.24660340737545827, + "grad_norm": 0.7204955220222473, + "learning_rate": 9.91030127634028e-06, + "loss": 0.7404, + "step": 4574 + }, + { + "epoch": 0.24665732154410178, + "grad_norm": 0.7932496666908264, + "learning_rate": 9.910261294473844e-06, + "loss": 0.7131, + "step": 4575 + }, + { + "epoch": 0.2467112357127453, + "grad_norm": 0.8415532112121582, + "learning_rate": 9.91022130377942e-06, + "loss": 0.889, + "step": 4576 + }, + { + "epoch": 0.24676514988138884, + "grad_norm": 0.7823799252510071, + "learning_rate": 9.91018130425708e-06, + "loss": 0.7589, + "step": 4577 + }, + { + "epoch": 0.24681906405003234, + "grad_norm": 0.6958774924278259, + "learning_rate": 9.910141295906898e-06, + "loss": 0.6957, + "step": 4578 + }, + { + "epoch": 0.24687297821867588, + "grad_norm": 0.7267159819602966, + "learning_rate": 9.910101278728944e-06, + "loss": 0.767, + "step": 4579 + }, + { + "epoch": 0.24692689238731938, + "grad_norm": 0.7345640659332275, + "learning_rate": 9.91006125272329e-06, + "loss": 
0.7756, + "step": 4580 + }, + { + "epoch": 0.2469808065559629, + "grad_norm": 0.8117407560348511, + "learning_rate": 9.910021217890007e-06, + "loss": 0.8028, + "step": 4581 + }, + { + "epoch": 0.24703472072460642, + "grad_norm": 0.7520045042037964, + "learning_rate": 9.90998117422917e-06, + "loss": 0.7801, + "step": 4582 + }, + { + "epoch": 0.24708863489324995, + "grad_norm": 0.791251003742218, + "learning_rate": 9.909941121740847e-06, + "loss": 0.8244, + "step": 4583 + }, + { + "epoch": 0.24714254906189345, + "grad_norm": 0.8434782028198242, + "learning_rate": 9.909901060425114e-06, + "loss": 0.8461, + "step": 4584 + }, + { + "epoch": 0.24719646323053698, + "grad_norm": 0.789013147354126, + "learning_rate": 9.909860990282038e-06, + "loss": 0.8655, + "step": 4585 + }, + { + "epoch": 0.24725037739918052, + "grad_norm": 0.7809332609176636, + "learning_rate": 9.909820911311697e-06, + "loss": 0.7963, + "step": 4586 + }, + { + "epoch": 0.24730429156782402, + "grad_norm": 0.7775362730026245, + "learning_rate": 9.909780823514159e-06, + "loss": 0.8098, + "step": 4587 + }, + { + "epoch": 0.24735820573646755, + "grad_norm": 0.7136217355728149, + "learning_rate": 9.909740726889498e-06, + "loss": 0.7454, + "step": 4588 + }, + { + "epoch": 0.24741211990511106, + "grad_norm": 0.7367640733718872, + "learning_rate": 9.909700621437786e-06, + "loss": 0.7732, + "step": 4589 + }, + { + "epoch": 0.2474660340737546, + "grad_norm": 0.8922567963600159, + "learning_rate": 9.909660507159093e-06, + "loss": 0.7173, + "step": 4590 + }, + { + "epoch": 0.2475199482423981, + "grad_norm": 0.7434333562850952, + "learning_rate": 9.909620384053494e-06, + "loss": 0.7255, + "step": 4591 + }, + { + "epoch": 0.24757386241104162, + "grad_norm": 0.7813223600387573, + "learning_rate": 9.909580252121057e-06, + "loss": 0.7583, + "step": 4592 + }, + { + "epoch": 0.24762777657968515, + "grad_norm": 0.699350118637085, + "learning_rate": 9.90954011136186e-06, + "loss": 0.7572, + "step": 4593 + }, + { + "epoch": 
0.24768169074832866, + "grad_norm": 0.8126040101051331, + "learning_rate": 9.90949996177597e-06, + "loss": 0.839, + "step": 4594 + }, + { + "epoch": 0.2477356049169722, + "grad_norm": 0.7475876808166504, + "learning_rate": 9.90945980336346e-06, + "loss": 0.7268, + "step": 4595 + }, + { + "epoch": 0.2477895190856157, + "grad_norm": 0.7833042740821838, + "learning_rate": 9.909419636124407e-06, + "loss": 0.825, + "step": 4596 + }, + { + "epoch": 0.24784343325425923, + "grad_norm": 0.7600408792495728, + "learning_rate": 9.909379460058877e-06, + "loss": 0.7598, + "step": 4597 + }, + { + "epoch": 0.24789734742290273, + "grad_norm": 0.7315041422843933, + "learning_rate": 9.909339275166946e-06, + "loss": 0.7671, + "step": 4598 + }, + { + "epoch": 0.24795126159154626, + "grad_norm": 0.8522780537605286, + "learning_rate": 9.909299081448685e-06, + "loss": 0.7847, + "step": 4599 + }, + { + "epoch": 0.24800517576018977, + "grad_norm": 0.8812578320503235, + "learning_rate": 9.909258878904166e-06, + "loss": 0.8141, + "step": 4600 + }, + { + "epoch": 0.2480590899288333, + "grad_norm": 0.7550300359725952, + "learning_rate": 9.909218667533463e-06, + "loss": 0.8522, + "step": 4601 + }, + { + "epoch": 0.24811300409747683, + "grad_norm": 0.7031952738761902, + "learning_rate": 9.909178447336644e-06, + "loss": 0.7793, + "step": 4602 + }, + { + "epoch": 0.24816691826612033, + "grad_norm": 0.7782654166221619, + "learning_rate": 9.909138218313788e-06, + "loss": 0.8185, + "step": 4603 + }, + { + "epoch": 0.24822083243476387, + "grad_norm": 0.7581482529640198, + "learning_rate": 9.909097980464961e-06, + "loss": 0.714, + "step": 4604 + }, + { + "epoch": 0.24827474660340737, + "grad_norm": 0.7732239365577698, + "learning_rate": 9.909057733790236e-06, + "loss": 0.7916, + "step": 4605 + }, + { + "epoch": 0.2483286607720509, + "grad_norm": 0.8440051674842834, + "learning_rate": 9.909017478289692e-06, + "loss": 0.6826, + "step": 4606 + }, + { + "epoch": 0.2483825749406944, + "grad_norm": 
0.8361368179321289, + "learning_rate": 9.908977213963394e-06, + "loss": 0.7922, + "step": 4607 + }, + { + "epoch": 0.24843648910933794, + "grad_norm": 0.7201125025749207, + "learning_rate": 9.908936940811418e-06, + "loss": 0.7285, + "step": 4608 + }, + { + "epoch": 0.24849040327798144, + "grad_norm": 0.7888527512550354, + "learning_rate": 9.908896658833836e-06, + "loss": 0.807, + "step": 4609 + }, + { + "epoch": 0.24854431744662497, + "grad_norm": 0.7935523390769958, + "learning_rate": 9.908856368030717e-06, + "loss": 0.7634, + "step": 4610 + }, + { + "epoch": 0.2485982316152685, + "grad_norm": 0.8482795357704163, + "learning_rate": 9.908816068402138e-06, + "loss": 0.7679, + "step": 4611 + }, + { + "epoch": 0.248652145783912, + "grad_norm": 0.8024162650108337, + "learning_rate": 9.908775759948171e-06, + "loss": 0.8348, + "step": 4612 + }, + { + "epoch": 0.24870605995255554, + "grad_norm": 1.1745551824569702, + "learning_rate": 9.908735442668886e-06, + "loss": 0.9002, + "step": 4613 + }, + { + "epoch": 0.24875997412119905, + "grad_norm": 0.7877936363220215, + "learning_rate": 9.908695116564356e-06, + "loss": 0.8618, + "step": 4614 + }, + { + "epoch": 0.24881388828984258, + "grad_norm": 0.7331380248069763, + "learning_rate": 9.908654781634656e-06, + "loss": 0.7798, + "step": 4615 + }, + { + "epoch": 0.24886780245848608, + "grad_norm": 0.7370942831039429, + "learning_rate": 9.908614437879856e-06, + "loss": 0.7355, + "step": 4616 + }, + { + "epoch": 0.2489217166271296, + "grad_norm": 0.7926658391952515, + "learning_rate": 9.908574085300029e-06, + "loss": 0.7758, + "step": 4617 + }, + { + "epoch": 0.24897563079577312, + "grad_norm": 0.7218267917633057, + "learning_rate": 9.908533723895247e-06, + "loss": 0.7218, + "step": 4618 + }, + { + "epoch": 0.24902954496441665, + "grad_norm": 0.7260599136352539, + "learning_rate": 9.908493353665584e-06, + "loss": 0.7298, + "step": 4619 + }, + { + "epoch": 0.24908345913306018, + "grad_norm": 0.7151805758476257, + "learning_rate": 
9.908452974611114e-06, + "loss": 0.8047, + "step": 4620 + }, + { + "epoch": 0.24913737330170369, + "grad_norm": 0.7485063076019287, + "learning_rate": 9.908412586731905e-06, + "loss": 0.8048, + "step": 4621 + }, + { + "epoch": 0.24919128747034722, + "grad_norm": 0.733971893787384, + "learning_rate": 9.908372190028033e-06, + "loss": 0.7345, + "step": 4622 + }, + { + "epoch": 0.24924520163899072, + "grad_norm": 0.7228642106056213, + "learning_rate": 9.90833178449957e-06, + "loss": 0.7076, + "step": 4623 + }, + { + "epoch": 0.24929911580763425, + "grad_norm": 0.7565811276435852, + "learning_rate": 9.908291370146588e-06, + "loss": 0.8207, + "step": 4624 + }, + { + "epoch": 0.24935302997627776, + "grad_norm": 0.7520995140075684, + "learning_rate": 9.90825094696916e-06, + "loss": 0.7815, + "step": 4625 + }, + { + "epoch": 0.2494069441449213, + "grad_norm": 0.8191807866096497, + "learning_rate": 9.908210514967358e-06, + "loss": 0.775, + "step": 4626 + }, + { + "epoch": 0.2494608583135648, + "grad_norm": 0.7196933031082153, + "learning_rate": 9.908170074141257e-06, + "loss": 0.8197, + "step": 4627 + }, + { + "epoch": 0.24951477248220832, + "grad_norm": 0.724298894405365, + "learning_rate": 9.908129624490928e-06, + "loss": 0.8882, + "step": 4628 + }, + { + "epoch": 0.24956868665085186, + "grad_norm": 0.7686057686805725, + "learning_rate": 9.908089166016444e-06, + "loss": 0.7896, + "step": 4629 + }, + { + "epoch": 0.24962260081949536, + "grad_norm": 0.7816513180732727, + "learning_rate": 9.908048698717877e-06, + "loss": 0.7487, + "step": 4630 + }, + { + "epoch": 0.2496765149881389, + "grad_norm": 0.7616474628448486, + "learning_rate": 9.9080082225953e-06, + "loss": 0.7335, + "step": 4631 + }, + { + "epoch": 0.2497304291567824, + "grad_norm": 0.923209011554718, + "learning_rate": 9.907967737648787e-06, + "loss": 0.7808, + "step": 4632 + }, + { + "epoch": 0.24978434332542593, + "grad_norm": 0.7830556035041809, + "learning_rate": 9.90792724387841e-06, + "loss": 0.6349, + 
"step": 4633 + }, + { + "epoch": 0.24983825749406943, + "grad_norm": 0.7756953835487366, + "learning_rate": 9.90788674128424e-06, + "loss": 0.7903, + "step": 4634 + }, + { + "epoch": 0.24989217166271296, + "grad_norm": 0.7644580006599426, + "learning_rate": 9.907846229866354e-06, + "loss": 0.8474, + "step": 4635 + }, + { + "epoch": 0.24994608583135647, + "grad_norm": 0.7665796875953674, + "learning_rate": 9.907805709624822e-06, + "loss": 0.8081, + "step": 4636 + }, + { + "epoch": 0.25, + "grad_norm": 0.823797881603241, + "learning_rate": 9.907765180559716e-06, + "loss": 0.734, + "step": 4637 + }, + { + "epoch": 0.25005391416864353, + "grad_norm": 0.7901148200035095, + "learning_rate": 9.907724642671111e-06, + "loss": 0.752, + "step": 4638 + }, + { + "epoch": 0.25010782833728706, + "grad_norm": 0.994473934173584, + "learning_rate": 9.90768409595908e-06, + "loss": 0.8103, + "step": 4639 + }, + { + "epoch": 0.25016174250593054, + "grad_norm": 0.7167239189147949, + "learning_rate": 9.907643540423692e-06, + "loss": 0.7288, + "step": 4640 + }, + { + "epoch": 0.25021565667457407, + "grad_norm": 0.8114840388298035, + "learning_rate": 9.907602976065025e-06, + "loss": 0.629, + "step": 4641 + }, + { + "epoch": 0.2502695708432176, + "grad_norm": 0.8481932282447815, + "learning_rate": 9.90756240288315e-06, + "loss": 0.8044, + "step": 4642 + }, + { + "epoch": 0.25032348501186114, + "grad_norm": 0.8757217526435852, + "learning_rate": 9.907521820878139e-06, + "loss": 0.8285, + "step": 4643 + }, + { + "epoch": 0.2503773991805046, + "grad_norm": 0.7892036437988281, + "learning_rate": 9.907481230050065e-06, + "loss": 0.7795, + "step": 4644 + }, + { + "epoch": 0.25043131334914814, + "grad_norm": 0.8281320333480835, + "learning_rate": 9.907440630399003e-06, + "loss": 0.8106, + "step": 4645 + }, + { + "epoch": 0.2504852275177917, + "grad_norm": 0.7743760943412781, + "learning_rate": 9.907400021925022e-06, + "loss": 0.8023, + "step": 4646 + }, + { + "epoch": 0.2505391416864352, + 
"grad_norm": 0.7882426977157593, + "learning_rate": 9.9073594046282e-06, + "loss": 0.7913, + "step": 4647 + }, + { + "epoch": 0.25059305585507874, + "grad_norm": 0.7276794910430908, + "learning_rate": 9.907318778508607e-06, + "loss": 0.7044, + "step": 4648 + }, + { + "epoch": 0.2506469700237222, + "grad_norm": 0.7869488596916199, + "learning_rate": 9.907278143566317e-06, + "loss": 0.8278, + "step": 4649 + }, + { + "epoch": 0.25070088419236575, + "grad_norm": 0.8069205284118652, + "learning_rate": 9.907237499801403e-06, + "loss": 0.7968, + "step": 4650 + }, + { + "epoch": 0.2507547983610093, + "grad_norm": 0.7453712224960327, + "learning_rate": 9.907196847213938e-06, + "loss": 0.7703, + "step": 4651 + }, + { + "epoch": 0.2508087125296528, + "grad_norm": 0.7574083209037781, + "learning_rate": 9.907156185803994e-06, + "loss": 0.7364, + "step": 4652 + }, + { + "epoch": 0.2508626266982963, + "grad_norm": 0.7393423318862915, + "learning_rate": 9.907115515571643e-06, + "loss": 0.7262, + "step": 4653 + }, + { + "epoch": 0.2509165408669398, + "grad_norm": 0.6861773133277893, + "learning_rate": 9.907074836516963e-06, + "loss": 0.6719, + "step": 4654 + }, + { + "epoch": 0.25097045503558335, + "grad_norm": 0.7770050764083862, + "learning_rate": 9.907034148640025e-06, + "loss": 0.833, + "step": 4655 + }, + { + "epoch": 0.2510243692042269, + "grad_norm": 0.8121877312660217, + "learning_rate": 9.9069934519409e-06, + "loss": 0.7668, + "step": 4656 + }, + { + "epoch": 0.2510782833728704, + "grad_norm": 0.7469497919082642, + "learning_rate": 9.906952746419662e-06, + "loss": 0.7414, + "step": 4657 + }, + { + "epoch": 0.2511321975415139, + "grad_norm": 0.7283838391304016, + "learning_rate": 9.906912032076385e-06, + "loss": 0.7748, + "step": 4658 + }, + { + "epoch": 0.2511861117101574, + "grad_norm": 0.7288998365402222, + "learning_rate": 9.906871308911143e-06, + "loss": 0.7462, + "step": 4659 + }, + { + "epoch": 0.25124002587880095, + "grad_norm": 0.7184773087501526, + 
"learning_rate": 9.906830576924007e-06, + "loss": 0.7055, + "step": 4660 + }, + { + "epoch": 0.2512939400474445, + "grad_norm": 0.7292659878730774, + "learning_rate": 9.906789836115051e-06, + "loss": 0.7817, + "step": 4661 + }, + { + "epoch": 0.25134785421608796, + "grad_norm": 0.8918725848197937, + "learning_rate": 9.906749086484351e-06, + "loss": 0.8216, + "step": 4662 + }, + { + "epoch": 0.2514017683847315, + "grad_norm": 0.8097497224807739, + "learning_rate": 9.906708328031977e-06, + "loss": 0.7242, + "step": 4663 + }, + { + "epoch": 0.251455682553375, + "grad_norm": 0.7008753418922424, + "learning_rate": 9.906667560758003e-06, + "loss": 0.6947, + "step": 4664 + }, + { + "epoch": 0.25150959672201856, + "grad_norm": 0.7514529228210449, + "learning_rate": 9.906626784662502e-06, + "loss": 0.7933, + "step": 4665 + }, + { + "epoch": 0.2515635108906621, + "grad_norm": 0.992230236530304, + "learning_rate": 9.906585999745547e-06, + "loss": 0.6778, + "step": 4666 + }, + { + "epoch": 0.25161742505930557, + "grad_norm": 0.6534571051597595, + "learning_rate": 9.906545206007214e-06, + "loss": 0.7024, + "step": 4667 + }, + { + "epoch": 0.2516713392279491, + "grad_norm": 0.7981176376342773, + "learning_rate": 9.906504403447573e-06, + "loss": 0.7208, + "step": 4668 + }, + { + "epoch": 0.25172525339659263, + "grad_norm": 0.7560659646987915, + "learning_rate": 9.906463592066699e-06, + "loss": 0.7456, + "step": 4669 + }, + { + "epoch": 0.25177916756523616, + "grad_norm": 1.1304062604904175, + "learning_rate": 9.906422771864666e-06, + "loss": 0.8277, + "step": 4670 + }, + { + "epoch": 0.25183308173387964, + "grad_norm": 0.7330453395843506, + "learning_rate": 9.906381942841546e-06, + "loss": 0.7689, + "step": 4671 + }, + { + "epoch": 0.25188699590252317, + "grad_norm": 0.7832298278808594, + "learning_rate": 9.906341104997412e-06, + "loss": 0.7738, + "step": 4672 + }, + { + "epoch": 0.2519409100711667, + "grad_norm": 0.8089982271194458, + "learning_rate": 9.90630025833234e-06, + 
"loss": 0.8649, + "step": 4673 + }, + { + "epoch": 0.25199482423981023, + "grad_norm": 0.7778360247612, + "learning_rate": 9.906259402846401e-06, + "loss": 0.8055, + "step": 4674 + }, + { + "epoch": 0.25204873840845377, + "grad_norm": 0.7771027684211731, + "learning_rate": 9.906218538539671e-06, + "loss": 0.6705, + "step": 4675 + }, + { + "epoch": 0.25210265257709724, + "grad_norm": 0.9576727151870728, + "learning_rate": 9.90617766541222e-06, + "loss": 0.8363, + "step": 4676 + }, + { + "epoch": 0.2521565667457408, + "grad_norm": 0.7581680417060852, + "learning_rate": 9.906136783464124e-06, + "loss": 0.778, + "step": 4677 + }, + { + "epoch": 0.2522104809143843, + "grad_norm": 0.8484781384468079, + "learning_rate": 9.906095892695455e-06, + "loss": 0.692, + "step": 4678 + }, + { + "epoch": 0.25226439508302784, + "grad_norm": 0.8313053846359253, + "learning_rate": 9.906054993106289e-06, + "loss": 0.8329, + "step": 4679 + }, + { + "epoch": 0.2523183092516713, + "grad_norm": 0.8454006314277649, + "learning_rate": 9.906014084696696e-06, + "loss": 0.8002, + "step": 4680 + }, + { + "epoch": 0.25237222342031485, + "grad_norm": 0.7415658235549927, + "learning_rate": 9.905973167466751e-06, + "loss": 0.7369, + "step": 4681 + }, + { + "epoch": 0.2524261375889584, + "grad_norm": 1.1871880292892456, + "learning_rate": 9.90593224141653e-06, + "loss": 0.8321, + "step": 4682 + }, + { + "epoch": 0.2524800517576019, + "grad_norm": 0.7169525623321533, + "learning_rate": 9.905891306546102e-06, + "loss": 0.7746, + "step": 4683 + }, + { + "epoch": 0.25253396592624544, + "grad_norm": 0.9533750414848328, + "learning_rate": 9.905850362855544e-06, + "loss": 0.7785, + "step": 4684 + }, + { + "epoch": 0.2525878800948889, + "grad_norm": 0.7524462342262268, + "learning_rate": 9.90580941034493e-06, + "loss": 0.7905, + "step": 4685 + }, + { + "epoch": 0.25264179426353245, + "grad_norm": 0.7788832187652588, + "learning_rate": 9.905768449014332e-06, + "loss": 0.8351, + "step": 4686 + }, + { + "epoch": 
0.252695708432176, + "grad_norm": 0.7439721822738647, + "learning_rate": 9.905727478863823e-06, + "loss": 0.8131, + "step": 4687 + }, + { + "epoch": 0.2527496226008195, + "grad_norm": 0.7753449082374573, + "learning_rate": 9.90568649989348e-06, + "loss": 0.8662, + "step": 4688 + }, + { + "epoch": 0.252803536769463, + "grad_norm": 0.7604972124099731, + "learning_rate": 9.90564551210337e-06, + "loss": 0.7539, + "step": 4689 + }, + { + "epoch": 0.2528574509381065, + "grad_norm": 0.7789442539215088, + "learning_rate": 9.905604515493574e-06, + "loss": 0.7488, + "step": 4690 + }, + { + "epoch": 0.25291136510675005, + "grad_norm": 0.7509225010871887, + "learning_rate": 9.905563510064162e-06, + "loss": 0.7889, + "step": 4691 + }, + { + "epoch": 0.2529652792753936, + "grad_norm": 0.7840915322303772, + "learning_rate": 9.905522495815208e-06, + "loss": 0.8808, + "step": 4692 + }, + { + "epoch": 0.2530191934440371, + "grad_norm": 0.7814779877662659, + "learning_rate": 9.905481472746787e-06, + "loss": 0.784, + "step": 4693 + }, + { + "epoch": 0.2530731076126806, + "grad_norm": 1.052604079246521, + "learning_rate": 9.905440440858973e-06, + "loss": 0.9627, + "step": 4694 + }, + { + "epoch": 0.2531270217813241, + "grad_norm": 0.8607435822486877, + "learning_rate": 9.905399400151836e-06, + "loss": 0.7937, + "step": 4695 + }, + { + "epoch": 0.25318093594996766, + "grad_norm": 0.7610926628112793, + "learning_rate": 9.905358350625453e-06, + "loss": 0.828, + "step": 4696 + }, + { + "epoch": 0.2532348501186112, + "grad_norm": 0.7309452295303345, + "learning_rate": 9.905317292279899e-06, + "loss": 0.7337, + "step": 4697 + }, + { + "epoch": 0.25328876428725466, + "grad_norm": 0.9021269083023071, + "learning_rate": 9.905276225115246e-06, + "loss": 0.7447, + "step": 4698 + }, + { + "epoch": 0.2533426784558982, + "grad_norm": 0.8152287006378174, + "learning_rate": 9.905235149131565e-06, + "loss": 0.7322, + "step": 4699 + }, + { + "epoch": 0.2533965926245417, + "grad_norm": 
0.8354026675224304, + "learning_rate": 9.905194064328935e-06, + "loss": 0.801, + "step": 4700 + }, + { + "epoch": 0.25345050679318526, + "grad_norm": 0.7649407982826233, + "learning_rate": 9.905152970707428e-06, + "loss": 0.8091, + "step": 4701 + }, + { + "epoch": 0.2535044209618288, + "grad_norm": 0.8044828176498413, + "learning_rate": 9.905111868267116e-06, + "loss": 0.8391, + "step": 4702 + }, + { + "epoch": 0.25355833513047227, + "grad_norm": 0.8590373992919922, + "learning_rate": 9.905070757008076e-06, + "loss": 0.8373, + "step": 4703 + }, + { + "epoch": 0.2536122492991158, + "grad_norm": 0.7771210670471191, + "learning_rate": 9.90502963693038e-06, + "loss": 0.7158, + "step": 4704 + }, + { + "epoch": 0.25366616346775933, + "grad_norm": 1.00150728225708, + "learning_rate": 9.904988508034102e-06, + "loss": 0.7809, + "step": 4705 + }, + { + "epoch": 0.25372007763640286, + "grad_norm": 0.7746372222900391, + "learning_rate": 9.904947370319316e-06, + "loss": 0.8337, + "step": 4706 + }, + { + "epoch": 0.25377399180504634, + "grad_norm": 0.796157717704773, + "learning_rate": 9.904906223786097e-06, + "loss": 0.7119, + "step": 4707 + }, + { + "epoch": 0.25382790597368987, + "grad_norm": 0.7384063601493835, + "learning_rate": 9.904865068434517e-06, + "loss": 0.7639, + "step": 4708 + }, + { + "epoch": 0.2538818201423334, + "grad_norm": 0.7987060546875, + "learning_rate": 9.904823904264651e-06, + "loss": 0.777, + "step": 4709 + }, + { + "epoch": 0.25393573431097693, + "grad_norm": 0.7243106365203857, + "learning_rate": 9.904782731276574e-06, + "loss": 0.796, + "step": 4710 + }, + { + "epoch": 0.25398964847962047, + "grad_norm": 0.9222633242607117, + "learning_rate": 9.904741549470358e-06, + "loss": 0.967, + "step": 4711 + }, + { + "epoch": 0.25404356264826394, + "grad_norm": 1.1910635232925415, + "learning_rate": 9.90470035884608e-06, + "loss": 0.8376, + "step": 4712 + }, + { + "epoch": 0.2540974768169075, + "grad_norm": 0.7156771421432495, + "learning_rate": 
9.904659159403811e-06, + "loss": 0.7112, + "step": 4713 + }, + { + "epoch": 0.254151390985551, + "grad_norm": 0.7093952894210815, + "learning_rate": 9.904617951143627e-06, + "loss": 0.7948, + "step": 4714 + }, + { + "epoch": 0.25420530515419454, + "grad_norm": 0.7801835536956787, + "learning_rate": 9.9045767340656e-06, + "loss": 0.6716, + "step": 4715 + }, + { + "epoch": 0.254259219322838, + "grad_norm": 0.7523871660232544, + "learning_rate": 9.904535508169807e-06, + "loss": 0.7212, + "step": 4716 + }, + { + "epoch": 0.25431313349148155, + "grad_norm": 0.7119418978691101, + "learning_rate": 9.90449427345632e-06, + "loss": 0.6655, + "step": 4717 + }, + { + "epoch": 0.2543670476601251, + "grad_norm": 0.7273330092430115, + "learning_rate": 9.904453029925214e-06, + "loss": 0.7697, + "step": 4718 + }, + { + "epoch": 0.2544209618287686, + "grad_norm": 0.9550130367279053, + "learning_rate": 9.904411777576564e-06, + "loss": 0.7441, + "step": 4719 + }, + { + "epoch": 0.25447487599741214, + "grad_norm": 0.7486676573753357, + "learning_rate": 9.90437051641044e-06, + "loss": 0.8171, + "step": 4720 + }, + { + "epoch": 0.2545287901660556, + "grad_norm": 0.8107298612594604, + "learning_rate": 9.904329246426923e-06, + "loss": 0.7311, + "step": 4721 + }, + { + "epoch": 0.25458270433469915, + "grad_norm": 0.679837167263031, + "learning_rate": 9.90428796762608e-06, + "loss": 0.7797, + "step": 4722 + }, + { + "epoch": 0.2546366185033427, + "grad_norm": 0.8209143877029419, + "learning_rate": 9.904246680007993e-06, + "loss": 0.8244, + "step": 4723 + }, + { + "epoch": 0.2546905326719862, + "grad_norm": 0.7561433911323547, + "learning_rate": 9.904205383572727e-06, + "loss": 0.86, + "step": 4724 + }, + { + "epoch": 0.2547444468406297, + "grad_norm": 0.8573929071426392, + "learning_rate": 9.904164078320363e-06, + "loss": 0.802, + "step": 4725 + }, + { + "epoch": 0.2547983610092732, + "grad_norm": 0.8191418051719666, + "learning_rate": 9.904122764250975e-06, + "loss": 0.8121, + "step": 4726 
+ }, + { + "epoch": 0.25485227517791675, + "grad_norm": 0.8703283667564392, + "learning_rate": 9.904081441364635e-06, + "loss": 0.7535, + "step": 4727 + }, + { + "epoch": 0.2549061893465603, + "grad_norm": 0.8311215043067932, + "learning_rate": 9.904040109661417e-06, + "loss": 0.8085, + "step": 4728 + }, + { + "epoch": 0.2549601035152038, + "grad_norm": 0.754145622253418, + "learning_rate": 9.903998769141397e-06, + "loss": 0.7151, + "step": 4729 + }, + { + "epoch": 0.2550140176838473, + "grad_norm": 0.6936500668525696, + "learning_rate": 9.903957419804648e-06, + "loss": 0.6923, + "step": 4730 + }, + { + "epoch": 0.2550679318524908, + "grad_norm": 0.7825912237167358, + "learning_rate": 9.903916061651245e-06, + "loss": 0.7544, + "step": 4731 + }, + { + "epoch": 0.25512184602113436, + "grad_norm": 0.8273274898529053, + "learning_rate": 9.903874694681264e-06, + "loss": 0.8099, + "step": 4732 + }, + { + "epoch": 0.2551757601897779, + "grad_norm": 0.7580922842025757, + "learning_rate": 9.903833318894776e-06, + "loss": 0.7341, + "step": 4733 + }, + { + "epoch": 0.25522967435842137, + "grad_norm": 0.78266441822052, + "learning_rate": 9.903791934291856e-06, + "loss": 0.8152, + "step": 4734 + }, + { + "epoch": 0.2552835885270649, + "grad_norm": 0.7813715934753418, + "learning_rate": 9.903750540872582e-06, + "loss": 0.7859, + "step": 4735 + }, + { + "epoch": 0.25533750269570843, + "grad_norm": 0.9143809080123901, + "learning_rate": 9.903709138637024e-06, + "loss": 0.8131, + "step": 4736 + }, + { + "epoch": 0.25539141686435196, + "grad_norm": 0.7642751932144165, + "learning_rate": 9.90366772758526e-06, + "loss": 0.6977, + "step": 4737 + }, + { + "epoch": 0.2554453310329955, + "grad_norm": 0.7250218391418457, + "learning_rate": 9.903626307717362e-06, + "loss": 0.7843, + "step": 4738 + }, + { + "epoch": 0.25549924520163897, + "grad_norm": 0.7458422780036926, + "learning_rate": 9.903584879033404e-06, + "loss": 0.7054, + "step": 4739 + }, + { + "epoch": 0.2555531593702825, + 
"grad_norm": 0.7256683707237244, + "learning_rate": 9.903543441533463e-06, + "loss": 0.7024, + "step": 4740 + }, + { + "epoch": 0.25560707353892603, + "grad_norm": 0.8649191856384277, + "learning_rate": 9.903501995217613e-06, + "loss": 0.7044, + "step": 4741 + }, + { + "epoch": 0.25566098770756956, + "grad_norm": 0.8472884297370911, + "learning_rate": 9.903460540085927e-06, + "loss": 0.722, + "step": 4742 + }, + { + "epoch": 0.2557149018762131, + "grad_norm": 0.7086893916130066, + "learning_rate": 9.90341907613848e-06, + "loss": 0.6744, + "step": 4743 + }, + { + "epoch": 0.2557688160448566, + "grad_norm": 0.9324516654014587, + "learning_rate": 9.903377603375346e-06, + "loss": 0.8765, + "step": 4744 + }, + { + "epoch": 0.2558227302135001, + "grad_norm": 0.8247219324111938, + "learning_rate": 9.903336121796601e-06, + "loss": 0.7104, + "step": 4745 + }, + { + "epoch": 0.25587664438214364, + "grad_norm": 0.7695756554603577, + "learning_rate": 9.90329463140232e-06, + "loss": 0.8399, + "step": 4746 + }, + { + "epoch": 0.25593055855078717, + "grad_norm": 0.8179047107696533, + "learning_rate": 9.903253132192577e-06, + "loss": 0.7531, + "step": 4747 + }, + { + "epoch": 0.25598447271943064, + "grad_norm": 0.7995123863220215, + "learning_rate": 9.903211624167444e-06, + "loss": 0.8248, + "step": 4748 + }, + { + "epoch": 0.2560383868880742, + "grad_norm": 0.7822200059890747, + "learning_rate": 9.903170107326997e-06, + "loss": 0.7224, + "step": 4749 + }, + { + "epoch": 0.2560923010567177, + "grad_norm": 0.9561625123023987, + "learning_rate": 9.903128581671315e-06, + "loss": 0.7307, + "step": 4750 + }, + { + "epoch": 0.25614621522536124, + "grad_norm": 0.8102663159370422, + "learning_rate": 9.903087047200468e-06, + "loss": 0.7958, + "step": 4751 + }, + { + "epoch": 0.25620012939400477, + "grad_norm": 0.8910477757453918, + "learning_rate": 9.90304550391453e-06, + "loss": 0.8577, + "step": 4752 + }, + { + "epoch": 0.25625404356264825, + "grad_norm": 0.9179983139038086, + 
"learning_rate": 9.903003951813579e-06, + "loss": 0.854, + "step": 4753 + }, + { + "epoch": 0.2563079577312918, + "grad_norm": 0.8993476629257202, + "learning_rate": 9.902962390897688e-06, + "loss": 0.7274, + "step": 4754 + }, + { + "epoch": 0.2563618718999353, + "grad_norm": 0.7873006463050842, + "learning_rate": 9.902920821166932e-06, + "loss": 0.7421, + "step": 4755 + }, + { + "epoch": 0.25641578606857884, + "grad_norm": 0.8410146236419678, + "learning_rate": 9.902879242621385e-06, + "loss": 0.852, + "step": 4756 + }, + { + "epoch": 0.2564697002372223, + "grad_norm": 0.7734405994415283, + "learning_rate": 9.902837655261123e-06, + "loss": 0.7485, + "step": 4757 + }, + { + "epoch": 0.25652361440586585, + "grad_norm": 0.7456048130989075, + "learning_rate": 9.90279605908622e-06, + "loss": 0.7223, + "step": 4758 + }, + { + "epoch": 0.2565775285745094, + "grad_norm": 0.8257940411567688, + "learning_rate": 9.90275445409675e-06, + "loss": 0.8324, + "step": 4759 + }, + { + "epoch": 0.2566314427431529, + "grad_norm": 0.9504823088645935, + "learning_rate": 9.90271284029279e-06, + "loss": 0.8872, + "step": 4760 + }, + { + "epoch": 0.25668535691179645, + "grad_norm": 0.7958370447158813, + "learning_rate": 9.902671217674413e-06, + "loss": 0.7605, + "step": 4761 + }, + { + "epoch": 0.2567392710804399, + "grad_norm": 0.7735753059387207, + "learning_rate": 9.902629586241694e-06, + "loss": 0.7682, + "step": 4762 + }, + { + "epoch": 0.25679318524908346, + "grad_norm": 0.8063069581985474, + "learning_rate": 9.902587945994709e-06, + "loss": 0.8126, + "step": 4763 + }, + { + "epoch": 0.256847099417727, + "grad_norm": 0.7964012026786804, + "learning_rate": 9.902546296933532e-06, + "loss": 0.7487, + "step": 4764 + }, + { + "epoch": 0.2569010135863705, + "grad_norm": 0.7407160997390747, + "learning_rate": 9.902504639058237e-06, + "loss": 0.8495, + "step": 4765 + }, + { + "epoch": 0.256954927755014, + "grad_norm": 0.7712891697883606, + "learning_rate": 9.9024629723689e-06, + "loss": 
0.71, + "step": 4766 + }, + { + "epoch": 0.2570088419236575, + "grad_norm": 0.708794355392456, + "learning_rate": 9.902421296865596e-06, + "loss": 0.8264, + "step": 4767 + }, + { + "epoch": 0.25706275609230106, + "grad_norm": 0.7903236746788025, + "learning_rate": 9.902379612548401e-06, + "loss": 0.7409, + "step": 4768 + }, + { + "epoch": 0.2571166702609446, + "grad_norm": 0.8012224435806274, + "learning_rate": 9.902337919417387e-06, + "loss": 0.8192, + "step": 4769 + }, + { + "epoch": 0.2571705844295881, + "grad_norm": 0.7412340044975281, + "learning_rate": 9.902296217472632e-06, + "loss": 0.7908, + "step": 4770 + }, + { + "epoch": 0.2572244985982316, + "grad_norm": 0.7860136032104492, + "learning_rate": 9.902254506714209e-06, + "loss": 0.8757, + "step": 4771 + }, + { + "epoch": 0.25727841276687513, + "grad_norm": 0.7804144620895386, + "learning_rate": 9.902212787142193e-06, + "loss": 0.7549, + "step": 4772 + }, + { + "epoch": 0.25733232693551866, + "grad_norm": 0.809959888458252, + "learning_rate": 9.90217105875666e-06, + "loss": 0.8345, + "step": 4773 + }, + { + "epoch": 0.2573862411041622, + "grad_norm": 0.7853354811668396, + "learning_rate": 9.902129321557685e-06, + "loss": 0.8185, + "step": 4774 + }, + { + "epoch": 0.25744015527280567, + "grad_norm": 0.7500307559967041, + "learning_rate": 9.902087575545341e-06, + "loss": 0.7888, + "step": 4775 + }, + { + "epoch": 0.2574940694414492, + "grad_norm": 0.7578644752502441, + "learning_rate": 9.902045820719705e-06, + "loss": 0.7489, + "step": 4776 + }, + { + "epoch": 0.25754798361009273, + "grad_norm": 0.8096863627433777, + "learning_rate": 9.902004057080854e-06, + "loss": 0.7409, + "step": 4777 + }, + { + "epoch": 0.25760189777873627, + "grad_norm": 0.837684154510498, + "learning_rate": 9.90196228462886e-06, + "loss": 0.8483, + "step": 4778 + }, + { + "epoch": 0.2576558119473798, + "grad_norm": 0.7905386686325073, + "learning_rate": 9.901920503363798e-06, + "loss": 0.7641, + "step": 4779 + }, + { + "epoch": 
0.2577097261160233, + "grad_norm": 0.750465452671051, + "learning_rate": 9.901878713285744e-06, + "loss": 0.7286, + "step": 4780 + }, + { + "epoch": 0.2577636402846668, + "grad_norm": 0.8911929726600647, + "learning_rate": 9.901836914394773e-06, + "loss": 0.9407, + "step": 4781 + }, + { + "epoch": 0.25781755445331034, + "grad_norm": 0.7831119894981384, + "learning_rate": 9.90179510669096e-06, + "loss": 0.7239, + "step": 4782 + }, + { + "epoch": 0.25787146862195387, + "grad_norm": 0.7694600820541382, + "learning_rate": 9.901753290174382e-06, + "loss": 0.8146, + "step": 4783 + }, + { + "epoch": 0.25792538279059735, + "grad_norm": 0.8094425797462463, + "learning_rate": 9.901711464845114e-06, + "loss": 0.8349, + "step": 4784 + }, + { + "epoch": 0.2579792969592409, + "grad_norm": 0.8766717314720154, + "learning_rate": 9.901669630703229e-06, + "loss": 0.8034, + "step": 4785 + }, + { + "epoch": 0.2580332111278844, + "grad_norm": 0.7051625847816467, + "learning_rate": 9.9016277877488e-06, + "loss": 0.7393, + "step": 4786 + }, + { + "epoch": 0.25808712529652794, + "grad_norm": 0.8611576557159424, + "learning_rate": 9.901585935981907e-06, + "loss": 0.8226, + "step": 4787 + }, + { + "epoch": 0.2581410394651715, + "grad_norm": 0.767514705657959, + "learning_rate": 9.901544075402624e-06, + "loss": 0.7877, + "step": 4788 + }, + { + "epoch": 0.25819495363381495, + "grad_norm": 0.7997928857803345, + "learning_rate": 9.901502206011027e-06, + "loss": 0.8712, + "step": 4789 + }, + { + "epoch": 0.2582488678024585, + "grad_norm": 0.9323418736457825, + "learning_rate": 9.901460327807189e-06, + "loss": 0.823, + "step": 4790 + }, + { + "epoch": 0.258302781971102, + "grad_norm": 0.8389249444007874, + "learning_rate": 9.901418440791186e-06, + "loss": 0.8592, + "step": 4791 + }, + { + "epoch": 0.25835669613974555, + "grad_norm": 0.6641879677772522, + "learning_rate": 9.901376544963094e-06, + "loss": 0.6147, + "step": 4792 + }, + { + "epoch": 0.258410610308389, + "grad_norm": 
0.8162431716918945, + "learning_rate": 9.901334640322989e-06, + "loss": 0.8057, + "step": 4793 + }, + { + "epoch": 0.25846452447703255, + "grad_norm": 0.7615718841552734, + "learning_rate": 9.901292726870943e-06, + "loss": 0.8446, + "step": 4794 + }, + { + "epoch": 0.2585184386456761, + "grad_norm": 0.764523983001709, + "learning_rate": 9.901250804607037e-06, + "loss": 0.8061, + "step": 4795 + }, + { + "epoch": 0.2585723528143196, + "grad_norm": 0.8213503360748291, + "learning_rate": 9.901208873531341e-06, + "loss": 0.7875, + "step": 4796 + }, + { + "epoch": 0.25862626698296315, + "grad_norm": 1.050784945487976, + "learning_rate": 9.901166933643933e-06, + "loss": 0.8412, + "step": 4797 + }, + { + "epoch": 0.2586801811516066, + "grad_norm": 0.7617695927619934, + "learning_rate": 9.901124984944886e-06, + "loss": 0.7946, + "step": 4798 + }, + { + "epoch": 0.25873409532025016, + "grad_norm": 0.8027677536010742, + "learning_rate": 9.90108302743428e-06, + "loss": 0.7967, + "step": 4799 + }, + { + "epoch": 0.2587880094888937, + "grad_norm": 0.7340978384017944, + "learning_rate": 9.901041061112186e-06, + "loss": 0.7771, + "step": 4800 + }, + { + "epoch": 0.2588419236575372, + "grad_norm": 0.7108075618743896, + "learning_rate": 9.900999085978682e-06, + "loss": 0.7182, + "step": 4801 + }, + { + "epoch": 0.2588958378261807, + "grad_norm": 0.8320378661155701, + "learning_rate": 9.90095710203384e-06, + "loss": 0.8903, + "step": 4802 + }, + { + "epoch": 0.25894975199482423, + "grad_norm": 0.7735534310340881, + "learning_rate": 9.900915109277743e-06, + "loss": 0.8604, + "step": 4803 + }, + { + "epoch": 0.25900366616346776, + "grad_norm": 0.9205079078674316, + "learning_rate": 9.900873107710458e-06, + "loss": 0.8451, + "step": 4804 + }, + { + "epoch": 0.2590575803321113, + "grad_norm": 0.8668771386146545, + "learning_rate": 9.900831097332066e-06, + "loss": 0.8566, + "step": 4805 + }, + { + "epoch": 0.2591114945007548, + "grad_norm": 0.8134620785713196, + "learning_rate": 
9.90078907814264e-06, + "loss": 0.7842, + "step": 4806 + }, + { + "epoch": 0.2591654086693983, + "grad_norm": 0.8436452746391296, + "learning_rate": 9.900747050142257e-06, + "loss": 0.8673, + "step": 4807 + }, + { + "epoch": 0.25921932283804183, + "grad_norm": 0.877737283706665, + "learning_rate": 9.90070501333099e-06, + "loss": 0.8668, + "step": 4808 + }, + { + "epoch": 0.25927323700668536, + "grad_norm": 0.7415887713432312, + "learning_rate": 9.900662967708917e-06, + "loss": 0.7148, + "step": 4809 + }, + { + "epoch": 0.2593271511753289, + "grad_norm": 0.6708645820617676, + "learning_rate": 9.900620913276114e-06, + "loss": 0.6428, + "step": 4810 + }, + { + "epoch": 0.2593810653439724, + "grad_norm": 0.7553024888038635, + "learning_rate": 9.900578850032655e-06, + "loss": 0.7812, + "step": 4811 + }, + { + "epoch": 0.2594349795126159, + "grad_norm": 0.7633180022239685, + "learning_rate": 9.900536777978615e-06, + "loss": 0.7481, + "step": 4812 + }, + { + "epoch": 0.25948889368125944, + "grad_norm": 0.8034750819206238, + "learning_rate": 9.900494697114072e-06, + "loss": 0.7981, + "step": 4813 + }, + { + "epoch": 0.25954280784990297, + "grad_norm": 0.7227773070335388, + "learning_rate": 9.9004526074391e-06, + "loss": 0.7233, + "step": 4814 + }, + { + "epoch": 0.2595967220185465, + "grad_norm": 0.8364164233207703, + "learning_rate": 9.900410508953775e-06, + "loss": 0.8677, + "step": 4815 + }, + { + "epoch": 0.25965063618719, + "grad_norm": 0.7321234941482544, + "learning_rate": 9.900368401658174e-06, + "loss": 0.6652, + "step": 4816 + }, + { + "epoch": 0.2597045503558335, + "grad_norm": 0.7887052893638611, + "learning_rate": 9.90032628555237e-06, + "loss": 0.8696, + "step": 4817 + }, + { + "epoch": 0.25975846452447704, + "grad_norm": 0.7807821035385132, + "learning_rate": 9.900284160636441e-06, + "loss": 0.8359, + "step": 4818 + }, + { + "epoch": 0.25981237869312057, + "grad_norm": 0.8123578429222107, + "learning_rate": 9.900242026910462e-06, + "loss": 0.8893, + "step": 
4819 + }, + { + "epoch": 0.25986629286176405, + "grad_norm": 0.7520090937614441, + "learning_rate": 9.900199884374508e-06, + "loss": 0.8037, + "step": 4820 + }, + { + "epoch": 0.2599202070304076, + "grad_norm": 0.8489886522293091, + "learning_rate": 9.900157733028656e-06, + "loss": 0.8827, + "step": 4821 + }, + { + "epoch": 0.2599741211990511, + "grad_norm": 0.8435912132263184, + "learning_rate": 9.900115572872981e-06, + "loss": 0.7468, + "step": 4822 + }, + { + "epoch": 0.26002803536769464, + "grad_norm": 0.7331469655036926, + "learning_rate": 9.90007340390756e-06, + "loss": 0.7421, + "step": 4823 + }, + { + "epoch": 0.2600819495363382, + "grad_norm": 0.8015231490135193, + "learning_rate": 9.900031226132469e-06, + "loss": 0.8709, + "step": 4824 + }, + { + "epoch": 0.26013586370498165, + "grad_norm": 0.8771700263023376, + "learning_rate": 9.89998903954778e-06, + "loss": 0.8313, + "step": 4825 + }, + { + "epoch": 0.2601897778736252, + "grad_norm": 0.752811074256897, + "learning_rate": 9.899946844153573e-06, + "loss": 0.7887, + "step": 4826 + }, + { + "epoch": 0.2602436920422687, + "grad_norm": 0.7526640295982361, + "learning_rate": 9.899904639949921e-06, + "loss": 0.8189, + "step": 4827 + }, + { + "epoch": 0.26029760621091225, + "grad_norm": 0.8185133337974548, + "learning_rate": 9.899862426936904e-06, + "loss": 0.8426, + "step": 4828 + }, + { + "epoch": 0.2603515203795557, + "grad_norm": 0.6737107038497925, + "learning_rate": 9.899820205114593e-06, + "loss": 0.7235, + "step": 4829 + }, + { + "epoch": 0.26040543454819925, + "grad_norm": 0.880402147769928, + "learning_rate": 9.899777974483068e-06, + "loss": 0.9, + "step": 4830 + }, + { + "epoch": 0.2604593487168428, + "grad_norm": 0.8077740669250488, + "learning_rate": 9.8997357350424e-06, + "loss": 0.7653, + "step": 4831 + }, + { + "epoch": 0.2605132628854863, + "grad_norm": 0.9043613076210022, + "learning_rate": 9.89969348679267e-06, + "loss": 0.7489, + "step": 4832 + }, + { + "epoch": 0.26056717705412985, + 
"grad_norm": 0.7480129599571228, + "learning_rate": 9.899651229733952e-06, + "loss": 0.804, + "step": 4833 + }, + { + "epoch": 0.2606210912227733, + "grad_norm": 0.8027556538581848, + "learning_rate": 9.899608963866322e-06, + "loss": 0.7484, + "step": 4834 + }, + { + "epoch": 0.26067500539141686, + "grad_norm": 0.7745609283447266, + "learning_rate": 9.899566689189855e-06, + "loss": 0.737, + "step": 4835 + }, + { + "epoch": 0.2607289195600604, + "grad_norm": 0.8240119218826294, + "learning_rate": 9.899524405704627e-06, + "loss": 0.861, + "step": 4836 + }, + { + "epoch": 0.2607828337287039, + "grad_norm": 0.7260393500328064, + "learning_rate": 9.899482113410718e-06, + "loss": 0.812, + "step": 4837 + }, + { + "epoch": 0.2608367478973474, + "grad_norm": 0.7049936652183533, + "learning_rate": 9.899439812308198e-06, + "loss": 0.7422, + "step": 4838 + }, + { + "epoch": 0.26089066206599093, + "grad_norm": 0.802170991897583, + "learning_rate": 9.899397502397148e-06, + "loss": 0.7852, + "step": 4839 + }, + { + "epoch": 0.26094457623463446, + "grad_norm": 0.7912299633026123, + "learning_rate": 9.899355183677642e-06, + "loss": 0.8151, + "step": 4840 + }, + { + "epoch": 0.260998490403278, + "grad_norm": 0.7643092274665833, + "learning_rate": 9.899312856149756e-06, + "loss": 0.7903, + "step": 4841 + }, + { + "epoch": 0.2610524045719215, + "grad_norm": 0.7583617568016052, + "learning_rate": 9.899270519813564e-06, + "loss": 0.8403, + "step": 4842 + }, + { + "epoch": 0.261106318740565, + "grad_norm": 0.8232578635215759, + "learning_rate": 9.899228174669146e-06, + "loss": 0.7994, + "step": 4843 + }, + { + "epoch": 0.26116023290920853, + "grad_norm": 0.7829787731170654, + "learning_rate": 9.899185820716576e-06, + "loss": 0.7586, + "step": 4844 + }, + { + "epoch": 0.26121414707785207, + "grad_norm": 0.8476693630218506, + "learning_rate": 9.899143457955933e-06, + "loss": 0.7687, + "step": 4845 + }, + { + "epoch": 0.2612680612464956, + "grad_norm": 0.7025540471076965, + "learning_rate": 
9.899101086387289e-06, + "loss": 0.7326, + "step": 4846 + }, + { + "epoch": 0.2613219754151391, + "grad_norm": 0.727745532989502, + "learning_rate": 9.899058706010723e-06, + "loss": 0.7813, + "step": 4847 + }, + { + "epoch": 0.2613758895837826, + "grad_norm": 0.7706053853034973, + "learning_rate": 9.89901631682631e-06, + "loss": 0.8251, + "step": 4848 + }, + { + "epoch": 0.26142980375242614, + "grad_norm": 0.8354002833366394, + "learning_rate": 9.898973918834123e-06, + "loss": 0.7891, + "step": 4849 + }, + { + "epoch": 0.26148371792106967, + "grad_norm": 0.970196545124054, + "learning_rate": 9.898931512034245e-06, + "loss": 0.812, + "step": 4850 + }, + { + "epoch": 0.2615376320897132, + "grad_norm": 0.7720034718513489, + "learning_rate": 9.898889096426748e-06, + "loss": 0.7794, + "step": 4851 + }, + { + "epoch": 0.2615915462583567, + "grad_norm": 1.2140640020370483, + "learning_rate": 9.89884667201171e-06, + "loss": 0.7144, + "step": 4852 + }, + { + "epoch": 0.2616454604270002, + "grad_norm": 0.8927225470542908, + "learning_rate": 9.898804238789206e-06, + "loss": 0.7906, + "step": 4853 + }, + { + "epoch": 0.26169937459564374, + "grad_norm": 0.886418342590332, + "learning_rate": 9.898761796759312e-06, + "loss": 0.7661, + "step": 4854 + }, + { + "epoch": 0.2617532887642873, + "grad_norm": 0.8143467903137207, + "learning_rate": 9.898719345922105e-06, + "loss": 0.8139, + "step": 4855 + }, + { + "epoch": 0.26180720293293075, + "grad_norm": 0.7952978014945984, + "learning_rate": 9.898676886277662e-06, + "loss": 0.7199, + "step": 4856 + }, + { + "epoch": 0.2618611171015743, + "grad_norm": 0.7782503962516785, + "learning_rate": 9.898634417826059e-06, + "loss": 0.7104, + "step": 4857 + }, + { + "epoch": 0.2619150312702178, + "grad_norm": 0.8419458866119385, + "learning_rate": 9.898591940567371e-06, + "loss": 0.6675, + "step": 4858 + }, + { + "epoch": 0.26196894543886134, + "grad_norm": 0.8036027550697327, + "learning_rate": 9.898549454501675e-06, + "loss": 0.8304, + "step": 
4859 + }, + { + "epoch": 0.2620228596075049, + "grad_norm": 0.8537300825119019, + "learning_rate": 9.898506959629049e-06, + "loss": 0.7559, + "step": 4860 + }, + { + "epoch": 0.26207677377614835, + "grad_norm": 0.8351823687553406, + "learning_rate": 9.898464455949565e-06, + "loss": 0.755, + "step": 4861 + }, + { + "epoch": 0.2621306879447919, + "grad_norm": 0.7771688103675842, + "learning_rate": 9.898421943463307e-06, + "loss": 0.7593, + "step": 4862 + }, + { + "epoch": 0.2621846021134354, + "grad_norm": 0.923363208770752, + "learning_rate": 9.898379422170344e-06, + "loss": 0.7514, + "step": 4863 + }, + { + "epoch": 0.26223851628207895, + "grad_norm": 0.695932924747467, + "learning_rate": 9.898336892070756e-06, + "loss": 0.691, + "step": 4864 + }, + { + "epoch": 0.2622924304507224, + "grad_norm": 0.8631780743598938, + "learning_rate": 9.89829435316462e-06, + "loss": 0.8582, + "step": 4865 + }, + { + "epoch": 0.26234634461936596, + "grad_norm": 0.7588357925415039, + "learning_rate": 9.89825180545201e-06, + "loss": 0.7677, + "step": 4866 + }, + { + "epoch": 0.2624002587880095, + "grad_norm": 0.858504056930542, + "learning_rate": 9.898209248933006e-06, + "loss": 0.8136, + "step": 4867 + }, + { + "epoch": 0.262454172956653, + "grad_norm": 0.7912299633026123, + "learning_rate": 9.898166683607683e-06, + "loss": 0.7603, + "step": 4868 + }, + { + "epoch": 0.26250808712529655, + "grad_norm": 0.7564625144004822, + "learning_rate": 9.898124109476113e-06, + "loss": 0.7402, + "step": 4869 + }, + { + "epoch": 0.26256200129394003, + "grad_norm": 0.7155072689056396, + "learning_rate": 9.89808152653838e-06, + "loss": 0.7288, + "step": 4870 + }, + { + "epoch": 0.26261591546258356, + "grad_norm": 0.7694748044013977, + "learning_rate": 9.898038934794554e-06, + "loss": 0.7637, + "step": 4871 + }, + { + "epoch": 0.2626698296312271, + "grad_norm": 0.7335909605026245, + "learning_rate": 9.897996334244717e-06, + "loss": 0.6838, + "step": 4872 + }, + { + "epoch": 0.2627237437998706, + 
"grad_norm": 0.651745080947876, + "learning_rate": 9.897953724888942e-06, + "loss": 0.6384, + "step": 4873 + }, + { + "epoch": 0.2627776579685141, + "grad_norm": 0.8076156377792358, + "learning_rate": 9.897911106727307e-06, + "loss": 0.783, + "step": 4874 + }, + { + "epoch": 0.26283157213715763, + "grad_norm": 0.74184650182724, + "learning_rate": 9.897868479759888e-06, + "loss": 0.787, + "step": 4875 + }, + { + "epoch": 0.26288548630580116, + "grad_norm": 0.7538748383522034, + "learning_rate": 9.897825843986763e-06, + "loss": 0.7606, + "step": 4876 + }, + { + "epoch": 0.2629394004744447, + "grad_norm": 0.7376627922058105, + "learning_rate": 9.897783199408006e-06, + "loss": 0.8512, + "step": 4877 + }, + { + "epoch": 0.2629933146430882, + "grad_norm": 0.7860908508300781, + "learning_rate": 9.897740546023697e-06, + "loss": 0.8811, + "step": 4878 + }, + { + "epoch": 0.2630472288117317, + "grad_norm": 0.8043631911277771, + "learning_rate": 9.897697883833912e-06, + "loss": 0.8369, + "step": 4879 + }, + { + "epoch": 0.26310114298037524, + "grad_norm": 0.8448672890663147, + "learning_rate": 9.897655212838724e-06, + "loss": 0.8011, + "step": 4880 + }, + { + "epoch": 0.26315505714901877, + "grad_norm": 0.7942283749580383, + "learning_rate": 9.897612533038214e-06, + "loss": 0.8357, + "step": 4881 + }, + { + "epoch": 0.2632089713176623, + "grad_norm": 0.8033713698387146, + "learning_rate": 9.897569844432458e-06, + "loss": 0.8054, + "step": 4882 + }, + { + "epoch": 0.2632628854863058, + "grad_norm": 0.842699408531189, + "learning_rate": 9.89752714702153e-06, + "loss": 0.8016, + "step": 4883 + }, + { + "epoch": 0.2633167996549493, + "grad_norm": 0.8190520405769348, + "learning_rate": 9.89748444080551e-06, + "loss": 0.7823, + "step": 4884 + }, + { + "epoch": 0.26337071382359284, + "grad_norm": 1.5263949632644653, + "learning_rate": 9.897441725784474e-06, + "loss": 0.8822, + "step": 4885 + }, + { + "epoch": 0.26342462799223637, + "grad_norm": 0.7523469924926758, + "learning_rate": 
9.897399001958496e-06, + "loss": 0.8486, + "step": 4886 + }, + { + "epoch": 0.2634785421608799, + "grad_norm": 0.8582022190093994, + "learning_rate": 9.897356269327659e-06, + "loss": 0.8655, + "step": 4887 + }, + { + "epoch": 0.2635324563295234, + "grad_norm": 0.9637673497200012, + "learning_rate": 9.897313527892032e-06, + "loss": 0.9027, + "step": 4888 + }, + { + "epoch": 0.2635863704981669, + "grad_norm": 0.7891300916671753, + "learning_rate": 9.897270777651698e-06, + "loss": 0.7856, + "step": 4889 + }, + { + "epoch": 0.26364028466681044, + "grad_norm": 0.7728479504585266, + "learning_rate": 9.897228018606731e-06, + "loss": 0.8606, + "step": 4890 + }, + { + "epoch": 0.263694198835454, + "grad_norm": 0.9174859523773193, + "learning_rate": 9.897185250757209e-06, + "loss": 0.7001, + "step": 4891 + }, + { + "epoch": 0.26374811300409745, + "grad_norm": 0.7392576932907104, + "learning_rate": 9.897142474103208e-06, + "loss": 0.7096, + "step": 4892 + }, + { + "epoch": 0.263802027172741, + "grad_norm": 0.7648600339889526, + "learning_rate": 9.897099688644804e-06, + "loss": 0.7899, + "step": 4893 + }, + { + "epoch": 0.2638559413413845, + "grad_norm": 0.7568668723106384, + "learning_rate": 9.897056894382077e-06, + "loss": 0.7595, + "step": 4894 + }, + { + "epoch": 0.26390985551002805, + "grad_norm": 0.800240695476532, + "learning_rate": 9.897014091315102e-06, + "loss": 0.8398, + "step": 4895 + }, + { + "epoch": 0.2639637696786716, + "grad_norm": 0.7847012281417847, + "learning_rate": 9.896971279443956e-06, + "loss": 0.7433, + "step": 4896 + }, + { + "epoch": 0.26401768384731505, + "grad_norm": 0.8086446523666382, + "learning_rate": 9.896928458768716e-06, + "loss": 0.7549, + "step": 4897 + }, + { + "epoch": 0.2640715980159586, + "grad_norm": 0.7179371118545532, + "learning_rate": 9.89688562928946e-06, + "loss": 0.6947, + "step": 4898 + }, + { + "epoch": 0.2641255121846021, + "grad_norm": 0.8114293217658997, + "learning_rate": 9.896842791006261e-06, + "loss": 0.7943, + 
"step": 4899 + }, + { + "epoch": 0.26417942635324565, + "grad_norm": 0.7791370749473572, + "learning_rate": 9.896799943919202e-06, + "loss": 0.6892, + "step": 4900 + }, + { + "epoch": 0.2642333405218891, + "grad_norm": 0.8667739629745483, + "learning_rate": 9.896757088028355e-06, + "loss": 0.8893, + "step": 4901 + }, + { + "epoch": 0.26428725469053266, + "grad_norm": 0.739639163017273, + "learning_rate": 9.8967142233338e-06, + "loss": 0.7566, + "step": 4902 + }, + { + "epoch": 0.2643411688591762, + "grad_norm": 0.7148702144622803, + "learning_rate": 9.896671349835616e-06, + "loss": 0.7915, + "step": 4903 + }, + { + "epoch": 0.2643950830278197, + "grad_norm": 0.7041117548942566, + "learning_rate": 9.896628467533875e-06, + "loss": 0.7123, + "step": 4904 + }, + { + "epoch": 0.26444899719646325, + "grad_norm": 0.7493545413017273, + "learning_rate": 9.896585576428655e-06, + "loss": 0.8255, + "step": 4905 + }, + { + "epoch": 0.26450291136510673, + "grad_norm": 0.802142322063446, + "learning_rate": 9.896542676520035e-06, + "loss": 0.8414, + "step": 4906 + }, + { + "epoch": 0.26455682553375026, + "grad_norm": 0.7283496260643005, + "learning_rate": 9.896499767808094e-06, + "loss": 0.745, + "step": 4907 + }, + { + "epoch": 0.2646107397023938, + "grad_norm": 0.7583940029144287, + "learning_rate": 9.896456850292907e-06, + "loss": 0.7771, + "step": 4908 + }, + { + "epoch": 0.2646646538710373, + "grad_norm": 0.7401677966117859, + "learning_rate": 9.896413923974548e-06, + "loss": 0.7648, + "step": 4909 + }, + { + "epoch": 0.2647185680396808, + "grad_norm": 0.7986511588096619, + "learning_rate": 9.896370988853099e-06, + "loss": 0.7145, + "step": 4910 + }, + { + "epoch": 0.26477248220832433, + "grad_norm": 0.6956211924552917, + "learning_rate": 9.896328044928634e-06, + "loss": 0.7786, + "step": 4911 + }, + { + "epoch": 0.26482639637696787, + "grad_norm": 0.8934255838394165, + "learning_rate": 9.896285092201231e-06, + "loss": 0.7156, + "step": 4912 + }, + { + "epoch": 
0.2648803105456114, + "grad_norm": 0.6990894079208374, + "learning_rate": 9.896242130670972e-06, + "loss": 0.753, + "step": 4913 + }, + { + "epoch": 0.26493422471425493, + "grad_norm": 0.79696124792099, + "learning_rate": 9.896199160337927e-06, + "loss": 0.8626, + "step": 4914 + }, + { + "epoch": 0.2649881388828984, + "grad_norm": 0.7954263091087341, + "learning_rate": 9.896156181202175e-06, + "loss": 0.7447, + "step": 4915 + }, + { + "epoch": 0.26504205305154194, + "grad_norm": 0.7960940003395081, + "learning_rate": 9.896113193263796e-06, + "loss": 0.7805, + "step": 4916 + }, + { + "epoch": 0.26509596722018547, + "grad_norm": 0.7872769236564636, + "learning_rate": 9.896070196522867e-06, + "loss": 0.8706, + "step": 4917 + }, + { + "epoch": 0.265149881388829, + "grad_norm": 0.8143740892410278, + "learning_rate": 9.896027190979462e-06, + "loss": 0.894, + "step": 4918 + }, + { + "epoch": 0.2652037955574725, + "grad_norm": 0.7195903062820435, + "learning_rate": 9.895984176633662e-06, + "loss": 0.7079, + "step": 4919 + }, + { + "epoch": 0.265257709726116, + "grad_norm": 1.2636377811431885, + "learning_rate": 9.895941153485541e-06, + "loss": 0.8019, + "step": 4920 + }, + { + "epoch": 0.26531162389475954, + "grad_norm": 0.9132199287414551, + "learning_rate": 9.895898121535182e-06, + "loss": 0.8137, + "step": 4921 + }, + { + "epoch": 0.2653655380634031, + "grad_norm": 0.7580793499946594, + "learning_rate": 9.895855080782655e-06, + "loss": 0.8015, + "step": 4922 + }, + { + "epoch": 0.2654194522320466, + "grad_norm": 0.764226496219635, + "learning_rate": 9.89581203122804e-06, + "loss": 0.7951, + "step": 4923 + }, + { + "epoch": 0.2654733664006901, + "grad_norm": 0.7804572582244873, + "learning_rate": 9.895768972871418e-06, + "loss": 0.8292, + "step": 4924 + }, + { + "epoch": 0.2655272805693336, + "grad_norm": 1.0945926904678345, + "learning_rate": 9.895725905712863e-06, + "loss": 0.916, + "step": 4925 + }, + { + "epoch": 0.26558119473797714, + "grad_norm": 
0.7809876203536987, + "learning_rate": 9.895682829752452e-06, + "loss": 0.8282, + "step": 4926 + }, + { + "epoch": 0.2656351089066207, + "grad_norm": 0.9589576721191406, + "learning_rate": 9.895639744990264e-06, + "loss": 0.7427, + "step": 4927 + }, + { + "epoch": 0.26568902307526415, + "grad_norm": 0.8494128584861755, + "learning_rate": 9.895596651426376e-06, + "loss": 0.8192, + "step": 4928 + }, + { + "epoch": 0.2657429372439077, + "grad_norm": 0.7642913460731506, + "learning_rate": 9.895553549060867e-06, + "loss": 0.7407, + "step": 4929 + }, + { + "epoch": 0.2657968514125512, + "grad_norm": 0.7758688926696777, + "learning_rate": 9.895510437893812e-06, + "loss": 0.8022, + "step": 4930 + }, + { + "epoch": 0.26585076558119475, + "grad_norm": 0.7677244544029236, + "learning_rate": 9.895467317925289e-06, + "loss": 0.7344, + "step": 4931 + }, + { + "epoch": 0.2659046797498383, + "grad_norm": 0.7520139217376709, + "learning_rate": 9.895424189155375e-06, + "loss": 0.8539, + "step": 4932 + }, + { + "epoch": 0.26595859391848176, + "grad_norm": 0.8028707504272461, + "learning_rate": 9.89538105158415e-06, + "loss": 0.7777, + "step": 4933 + }, + { + "epoch": 0.2660125080871253, + "grad_norm": 0.7818429470062256, + "learning_rate": 9.895337905211691e-06, + "loss": 0.7559, + "step": 4934 + }, + { + "epoch": 0.2660664222557688, + "grad_norm": 0.7150774002075195, + "learning_rate": 9.895294750038073e-06, + "loss": 0.7501, + "step": 4935 + }, + { + "epoch": 0.26612033642441235, + "grad_norm": 0.709414541721344, + "learning_rate": 9.895251586063376e-06, + "loss": 0.7232, + "step": 4936 + }, + { + "epoch": 0.26617425059305583, + "grad_norm": 0.8100318908691406, + "learning_rate": 9.895208413287677e-06, + "loss": 0.7702, + "step": 4937 + }, + { + "epoch": 0.26622816476169936, + "grad_norm": 0.6777253150939941, + "learning_rate": 9.895165231711052e-06, + "loss": 0.707, + "step": 4938 + }, + { + "epoch": 0.2662820789303429, + "grad_norm": 0.7034317851066589, + "learning_rate": 
9.895122041333583e-06, + "loss": 0.6021, + "step": 4939 + }, + { + "epoch": 0.2663359930989864, + "grad_norm": 0.8210963606834412, + "learning_rate": 9.895078842155343e-06, + "loss": 0.8198, + "step": 4940 + }, + { + "epoch": 0.26638990726762996, + "grad_norm": 0.7624147534370422, + "learning_rate": 9.89503563417641e-06, + "loss": 0.7677, + "step": 4941 + }, + { + "epoch": 0.26644382143627343, + "grad_norm": 0.735461413860321, + "learning_rate": 9.894992417396866e-06, + "loss": 0.839, + "step": 4942 + }, + { + "epoch": 0.26649773560491696, + "grad_norm": 0.7400258183479309, + "learning_rate": 9.894949191816786e-06, + "loss": 0.7904, + "step": 4943 + }, + { + "epoch": 0.2665516497735605, + "grad_norm": 0.7352719902992249, + "learning_rate": 9.894905957436244e-06, + "loss": 0.7283, + "step": 4944 + }, + { + "epoch": 0.266605563942204, + "grad_norm": 0.7771669626235962, + "learning_rate": 9.894862714255324e-06, + "loss": 0.863, + "step": 4945 + }, + { + "epoch": 0.2666594781108475, + "grad_norm": 0.8066530227661133, + "learning_rate": 9.8948194622741e-06, + "loss": 0.8065, + "step": 4946 + }, + { + "epoch": 0.26671339227949104, + "grad_norm": 0.7446811199188232, + "learning_rate": 9.894776201492651e-06, + "loss": 0.7539, + "step": 4947 + }, + { + "epoch": 0.26676730644813457, + "grad_norm": 0.787760317325592, + "learning_rate": 9.894732931911056e-06, + "loss": 0.8361, + "step": 4948 + }, + { + "epoch": 0.2668212206167781, + "grad_norm": 0.9865973591804504, + "learning_rate": 9.894689653529389e-06, + "loss": 0.8228, + "step": 4949 + }, + { + "epoch": 0.26687513478542163, + "grad_norm": 0.7901219129562378, + "learning_rate": 9.89464636634773e-06, + "loss": 0.8059, + "step": 4950 + }, + { + "epoch": 0.2669290489540651, + "grad_norm": 0.8485696911811829, + "learning_rate": 9.89460307036616e-06, + "loss": 0.781, + "step": 4951 + }, + { + "epoch": 0.26698296312270864, + "grad_norm": 0.8590619564056396, + "learning_rate": 9.89455976558475e-06, + "loss": 0.7428, + "step": 
4952 + }, + { + "epoch": 0.26703687729135217, + "grad_norm": 0.8802759051322937, + "learning_rate": 9.894516452003584e-06, + "loss": 0.7261, + "step": 4953 + }, + { + "epoch": 0.2670907914599957, + "grad_norm": 0.9600741267204285, + "learning_rate": 9.894473129622739e-06, + "loss": 0.8006, + "step": 4954 + }, + { + "epoch": 0.26714470562863923, + "grad_norm": 0.8588278889656067, + "learning_rate": 9.894429798442288e-06, + "loss": 0.7971, + "step": 4955 + }, + { + "epoch": 0.2671986197972827, + "grad_norm": 0.7204979658126831, + "learning_rate": 9.894386458462315e-06, + "loss": 0.7733, + "step": 4956 + }, + { + "epoch": 0.26725253396592624, + "grad_norm": 0.9327245354652405, + "learning_rate": 9.894343109682893e-06, + "loss": 0.7785, + "step": 4957 + }, + { + "epoch": 0.2673064481345698, + "grad_norm": 0.6946107745170593, + "learning_rate": 9.894299752104105e-06, + "loss": 0.7144, + "step": 4958 + }, + { + "epoch": 0.2673603623032133, + "grad_norm": 0.7115009427070618, + "learning_rate": 9.894256385726025e-06, + "loss": 0.7705, + "step": 4959 + }, + { + "epoch": 0.2674142764718568, + "grad_norm": 0.7661309242248535, + "learning_rate": 9.89421301054873e-06, + "loss": 0.6842, + "step": 4960 + }, + { + "epoch": 0.2674681906405003, + "grad_norm": 0.7183328866958618, + "learning_rate": 9.894169626572302e-06, + "loss": 0.8208, + "step": 4961 + }, + { + "epoch": 0.26752210480914385, + "grad_norm": 0.9643034338951111, + "learning_rate": 9.894126233796816e-06, + "loss": 0.8814, + "step": 4962 + }, + { + "epoch": 0.2675760189777874, + "grad_norm": 0.7522911429405212, + "learning_rate": 9.894082832222352e-06, + "loss": 0.7545, + "step": 4963 + }, + { + "epoch": 0.2676299331464309, + "grad_norm": 0.733444333076477, + "learning_rate": 9.894039421848988e-06, + "loss": 0.6791, + "step": 4964 + }, + { + "epoch": 0.2676838473150744, + "grad_norm": 0.7534430623054504, + "learning_rate": 9.8939960026768e-06, + "loss": 0.8344, + "step": 4965 + }, + { + "epoch": 0.2677377614837179, + 
"grad_norm": 0.7849922776222229, + "learning_rate": 9.893952574705867e-06, + "loss": 0.6955, + "step": 4966 + }, + { + "epoch": 0.26779167565236145, + "grad_norm": 0.7080478668212891, + "learning_rate": 9.893909137936268e-06, + "loss": 0.7518, + "step": 4967 + }, + { + "epoch": 0.267845589821005, + "grad_norm": 0.7007871270179749, + "learning_rate": 9.893865692368081e-06, + "loss": 0.7011, + "step": 4968 + }, + { + "epoch": 0.26789950398964846, + "grad_norm": 0.8561926484107971, + "learning_rate": 9.893822238001383e-06, + "loss": 0.7918, + "step": 4969 + }, + { + "epoch": 0.267953418158292, + "grad_norm": 0.9306691288948059, + "learning_rate": 9.893778774836251e-06, + "loss": 0.8572, + "step": 4970 + }, + { + "epoch": 0.2680073323269355, + "grad_norm": 0.8165447115898132, + "learning_rate": 9.893735302872767e-06, + "loss": 0.8634, + "step": 4971 + }, + { + "epoch": 0.26806124649557905, + "grad_norm": 0.7696943283081055, + "learning_rate": 9.893691822111005e-06, + "loss": 0.7597, + "step": 4972 + }, + { + "epoch": 0.2681151606642226, + "grad_norm": 0.821960985660553, + "learning_rate": 9.893648332551047e-06, + "loss": 0.8266, + "step": 4973 + }, + { + "epoch": 0.26816907483286606, + "grad_norm": 0.7997711300849915, + "learning_rate": 9.893604834192968e-06, + "loss": 0.738, + "step": 4974 + }, + { + "epoch": 0.2682229890015096, + "grad_norm": 0.7624261379241943, + "learning_rate": 9.893561327036847e-06, + "loss": 0.8676, + "step": 4975 + }, + { + "epoch": 0.2682769031701531, + "grad_norm": 0.8748223185539246, + "learning_rate": 9.893517811082764e-06, + "loss": 0.8396, + "step": 4976 + }, + { + "epoch": 0.26833081733879666, + "grad_norm": 0.9294693470001221, + "learning_rate": 9.893474286330797e-06, + "loss": 0.8869, + "step": 4977 + }, + { + "epoch": 0.26838473150744013, + "grad_norm": 0.7981976866722107, + "learning_rate": 9.893430752781021e-06, + "loss": 0.8176, + "step": 4978 + }, + { + "epoch": 0.26843864567608366, + "grad_norm": 0.8983638882637024, + 
"learning_rate": 9.893387210433518e-06, + "loss": 0.8181, + "step": 4979 + }, + { + "epoch": 0.2684925598447272, + "grad_norm": 0.7371122241020203, + "learning_rate": 9.893343659288364e-06, + "loss": 0.8004, + "step": 4980 + }, + { + "epoch": 0.26854647401337073, + "grad_norm": 0.8287851214408875, + "learning_rate": 9.893300099345639e-06, + "loss": 0.8249, + "step": 4981 + }, + { + "epoch": 0.26860038818201426, + "grad_norm": 0.7839323878288269, + "learning_rate": 9.89325653060542e-06, + "loss": 0.7561, + "step": 4982 + }, + { + "epoch": 0.26865430235065774, + "grad_norm": 0.7348718643188477, + "learning_rate": 9.893212953067784e-06, + "loss": 0.7693, + "step": 4983 + }, + { + "epoch": 0.26870821651930127, + "grad_norm": 0.7529023885726929, + "learning_rate": 9.893169366732814e-06, + "loss": 0.7874, + "step": 4984 + }, + { + "epoch": 0.2687621306879448, + "grad_norm": 0.8256911635398865, + "learning_rate": 9.893125771600583e-06, + "loss": 0.8646, + "step": 4985 + }, + { + "epoch": 0.26881604485658833, + "grad_norm": 0.8608624935150146, + "learning_rate": 9.893082167671172e-06, + "loss": 0.7953, + "step": 4986 + }, + { + "epoch": 0.2688699590252318, + "grad_norm": 0.7824952006340027, + "learning_rate": 9.893038554944661e-06, + "loss": 0.6885, + "step": 4987 + }, + { + "epoch": 0.26892387319387534, + "grad_norm": 0.8561933636665344, + "learning_rate": 9.892994933421125e-06, + "loss": 0.771, + "step": 4988 + }, + { + "epoch": 0.26897778736251887, + "grad_norm": 0.8238648176193237, + "learning_rate": 9.892951303100644e-06, + "loss": 0.7308, + "step": 4989 + }, + { + "epoch": 0.2690317015311624, + "grad_norm": 0.6714439392089844, + "learning_rate": 9.892907663983297e-06, + "loss": 0.6775, + "step": 4990 + }, + { + "epoch": 0.26908561569980594, + "grad_norm": 0.714019775390625, + "learning_rate": 9.892864016069162e-06, + "loss": 0.753, + "step": 4991 + }, + { + "epoch": 0.2691395298684494, + "grad_norm": 0.7529036402702332, + "learning_rate": 9.892820359358318e-06, + 
"loss": 0.8614, + "step": 4992 + }, + { + "epoch": 0.26919344403709294, + "grad_norm": 0.8602166771888733, + "learning_rate": 9.89277669385084e-06, + "loss": 0.8552, + "step": 4993 + }, + { + "epoch": 0.2692473582057365, + "grad_norm": 0.7607848048210144, + "learning_rate": 9.892733019546811e-06, + "loss": 0.7749, + "step": 4994 + }, + { + "epoch": 0.26930127237438, + "grad_norm": 0.664573609828949, + "learning_rate": 9.89268933644631e-06, + "loss": 0.7819, + "step": 4995 + }, + { + "epoch": 0.2693551865430235, + "grad_norm": 0.7218571901321411, + "learning_rate": 9.892645644549412e-06, + "loss": 0.7618, + "step": 4996 + }, + { + "epoch": 0.269409100711667, + "grad_norm": 0.7744899988174438, + "learning_rate": 9.892601943856198e-06, + "loss": 0.7899, + "step": 4997 + }, + { + "epoch": 0.26946301488031055, + "grad_norm": 0.866887629032135, + "learning_rate": 9.892558234366743e-06, + "loss": 0.7779, + "step": 4998 + }, + { + "epoch": 0.2695169290489541, + "grad_norm": 0.7656950354576111, + "learning_rate": 9.892514516081129e-06, + "loss": 0.8449, + "step": 4999 + }, + { + "epoch": 0.2695708432175976, + "grad_norm": 0.8089601397514343, + "learning_rate": 9.892470788999435e-06, + "loss": 0.7406, + "step": 5000 + }, + { + "epoch": 0.2696247573862411, + "grad_norm": 0.7319750189781189, + "learning_rate": 9.892427053121738e-06, + "loss": 0.7794, + "step": 5001 + }, + { + "epoch": 0.2696786715548846, + "grad_norm": 0.8019516468048096, + "learning_rate": 9.892383308448117e-06, + "loss": 0.7322, + "step": 5002 + }, + { + "epoch": 0.26973258572352815, + "grad_norm": 0.7320996522903442, + "learning_rate": 9.89233955497865e-06, + "loss": 0.7882, + "step": 5003 + }, + { + "epoch": 0.2697864998921717, + "grad_norm": 0.8075882792472839, + "learning_rate": 9.892295792713417e-06, + "loss": 0.8391, + "step": 5004 + }, + { + "epoch": 0.26984041406081516, + "grad_norm": 0.7340912222862244, + "learning_rate": 9.892252021652495e-06, + "loss": 0.7815, + "step": 5005 + }, + { + "epoch": 
0.2698943282294587, + "grad_norm": 0.8739588260650635, + "learning_rate": 9.892208241795965e-06, + "loss": 0.8287, + "step": 5006 + }, + { + "epoch": 0.2699482423981022, + "grad_norm": 0.7938231229782104, + "learning_rate": 9.892164453143904e-06, + "loss": 0.8494, + "step": 5007 + }, + { + "epoch": 0.27000215656674575, + "grad_norm": 0.7387966513633728, + "learning_rate": 9.892120655696391e-06, + "loss": 0.7465, + "step": 5008 + }, + { + "epoch": 0.2700560707353893, + "grad_norm": 0.7171775102615356, + "learning_rate": 9.892076849453504e-06, + "loss": 0.7227, + "step": 5009 + }, + { + "epoch": 0.27010998490403276, + "grad_norm": 0.7506486773490906, + "learning_rate": 9.892033034415324e-06, + "loss": 0.7606, + "step": 5010 + }, + { + "epoch": 0.2701638990726763, + "grad_norm": 0.833413302898407, + "learning_rate": 9.891989210581928e-06, + "loss": 0.7998, + "step": 5011 + }, + { + "epoch": 0.2702178132413198, + "grad_norm": 0.7675343155860901, + "learning_rate": 9.891945377953395e-06, + "loss": 0.7554, + "step": 5012 + }, + { + "epoch": 0.27027172740996336, + "grad_norm": 0.8682401180267334, + "learning_rate": 9.891901536529804e-06, + "loss": 0.8342, + "step": 5013 + }, + { + "epoch": 0.27032564157860683, + "grad_norm": 0.7674192190170288, + "learning_rate": 9.891857686311232e-06, + "loss": 0.7055, + "step": 5014 + }, + { + "epoch": 0.27037955574725037, + "grad_norm": 0.717960000038147, + "learning_rate": 9.891813827297762e-06, + "loss": 0.7939, + "step": 5015 + }, + { + "epoch": 0.2704334699158939, + "grad_norm": 0.8811343908309937, + "learning_rate": 9.89176995948947e-06, + "loss": 0.7987, + "step": 5016 + }, + { + "epoch": 0.27048738408453743, + "grad_norm": 0.9724238514900208, + "learning_rate": 9.891726082886436e-06, + "loss": 0.8342, + "step": 5017 + }, + { + "epoch": 0.27054129825318096, + "grad_norm": 0.7969245314598083, + "learning_rate": 9.891682197488737e-06, + "loss": 0.8937, + "step": 5018 + }, + { + "epoch": 0.27059521242182444, + "grad_norm": 
0.8564383387565613, + "learning_rate": 9.891638303296453e-06, + "loss": 0.7454, + "step": 5019 + }, + { + "epoch": 0.27064912659046797, + "grad_norm": 0.7879497408866882, + "learning_rate": 9.891594400309665e-06, + "loss": 0.7283, + "step": 5020 + }, + { + "epoch": 0.2707030407591115, + "grad_norm": 0.7248218059539795, + "learning_rate": 9.891550488528448e-06, + "loss": 0.7661, + "step": 5021 + }, + { + "epoch": 0.27075695492775503, + "grad_norm": 0.7548377513885498, + "learning_rate": 9.891506567952884e-06, + "loss": 0.8127, + "step": 5022 + }, + { + "epoch": 0.2708108690963985, + "grad_norm": 0.72477787733078, + "learning_rate": 9.891462638583051e-06, + "loss": 0.6732, + "step": 5023 + }, + { + "epoch": 0.27086478326504204, + "grad_norm": 0.7293525338172913, + "learning_rate": 9.891418700419026e-06, + "loss": 0.7547, + "step": 5024 + }, + { + "epoch": 0.2709186974336856, + "grad_norm": 0.6827152371406555, + "learning_rate": 9.891374753460893e-06, + "loss": 0.7069, + "step": 5025 + }, + { + "epoch": 0.2709726116023291, + "grad_norm": 0.8005618453025818, + "learning_rate": 9.891330797708726e-06, + "loss": 0.7789, + "step": 5026 + }, + { + "epoch": 0.27102652577097264, + "grad_norm": 0.8415570259094238, + "learning_rate": 9.891286833162606e-06, + "loss": 0.8397, + "step": 5027 + }, + { + "epoch": 0.2710804399396161, + "grad_norm": 0.7276983261108398, + "learning_rate": 9.891242859822612e-06, + "loss": 0.7051, + "step": 5028 + }, + { + "epoch": 0.27113435410825965, + "grad_norm": 0.7116531729698181, + "learning_rate": 9.891198877688824e-06, + "loss": 0.6909, + "step": 5029 + }, + { + "epoch": 0.2711882682769032, + "grad_norm": 0.7504072189331055, + "learning_rate": 9.891154886761319e-06, + "loss": 0.7552, + "step": 5030 + }, + { + "epoch": 0.2712421824455467, + "grad_norm": 0.7239630222320557, + "learning_rate": 9.891110887040177e-06, + "loss": 0.7546, + "step": 5031 + }, + { + "epoch": 0.2712960966141902, + "grad_norm": 0.7500813603401184, + "learning_rate": 
9.891066878525478e-06, + "loss": 0.7983, + "step": 5032 + }, + { + "epoch": 0.2713500107828337, + "grad_norm": 1.0069187879562378, + "learning_rate": 9.8910228612173e-06, + "loss": 0.8422, + "step": 5033 + }, + { + "epoch": 0.27140392495147725, + "grad_norm": 0.7656623721122742, + "learning_rate": 9.890978835115723e-06, + "loss": 0.7754, + "step": 5034 + }, + { + "epoch": 0.2714578391201208, + "grad_norm": 0.8915570974349976, + "learning_rate": 9.890934800220825e-06, + "loss": 0.8195, + "step": 5035 + }, + { + "epoch": 0.2715117532887643, + "grad_norm": 0.8333117961883545, + "learning_rate": 9.890890756532686e-06, + "loss": 0.8419, + "step": 5036 + }, + { + "epoch": 0.2715656674574078, + "grad_norm": 0.8374854922294617, + "learning_rate": 9.890846704051386e-06, + "loss": 0.7581, + "step": 5037 + }, + { + "epoch": 0.2716195816260513, + "grad_norm": 0.7093636989593506, + "learning_rate": 9.890802642777002e-06, + "loss": 0.6926, + "step": 5038 + }, + { + "epoch": 0.27167349579469485, + "grad_norm": 0.7575312852859497, + "learning_rate": 9.890758572709615e-06, + "loss": 0.802, + "step": 5039 + }, + { + "epoch": 0.2717274099633384, + "grad_norm": 0.902991771697998, + "learning_rate": 9.890714493849304e-06, + "loss": 0.9113, + "step": 5040 + }, + { + "epoch": 0.27178132413198186, + "grad_norm": 0.7198828458786011, + "learning_rate": 9.890670406196147e-06, + "loss": 0.7271, + "step": 5041 + }, + { + "epoch": 0.2718352383006254, + "grad_norm": 0.8525444269180298, + "learning_rate": 9.890626309750226e-06, + "loss": 0.7872, + "step": 5042 + }, + { + "epoch": 0.2718891524692689, + "grad_norm": 0.7253887057304382, + "learning_rate": 9.890582204511616e-06, + "loss": 0.7847, + "step": 5043 + }, + { + "epoch": 0.27194306663791246, + "grad_norm": 0.871543824672699, + "learning_rate": 9.890538090480402e-06, + "loss": 0.7855, + "step": 5044 + }, + { + "epoch": 0.271996980806556, + "grad_norm": 0.7563179731369019, + "learning_rate": 9.890493967656658e-06, + "loss": 0.765, + "step": 
5045 + }, + { + "epoch": 0.27205089497519946, + "grad_norm": 0.8132460713386536, + "learning_rate": 9.890449836040465e-06, + "loss": 0.815, + "step": 5046 + }, + { + "epoch": 0.272104809143843, + "grad_norm": 0.6690226197242737, + "learning_rate": 9.890405695631905e-06, + "loss": 0.679, + "step": 5047 + }, + { + "epoch": 0.27215872331248653, + "grad_norm": 0.7403889894485474, + "learning_rate": 9.890361546431052e-06, + "loss": 0.6578, + "step": 5048 + }, + { + "epoch": 0.27221263748113006, + "grad_norm": 0.7937926054000854, + "learning_rate": 9.89031738843799e-06, + "loss": 0.8178, + "step": 5049 + }, + { + "epoch": 0.27226655164977354, + "grad_norm": 0.7222248911857605, + "learning_rate": 9.890273221652798e-06, + "loss": 0.6765, + "step": 5050 + }, + { + "epoch": 0.27232046581841707, + "grad_norm": 0.7936972975730896, + "learning_rate": 9.890229046075553e-06, + "loss": 0.7552, + "step": 5051 + }, + { + "epoch": 0.2723743799870606, + "grad_norm": 0.7286278009414673, + "learning_rate": 9.890184861706336e-06, + "loss": 0.7409, + "step": 5052 + }, + { + "epoch": 0.27242829415570413, + "grad_norm": 0.7878450751304626, + "learning_rate": 9.890140668545226e-06, + "loss": 0.8493, + "step": 5053 + }, + { + "epoch": 0.27248220832434766, + "grad_norm": 0.7352455854415894, + "learning_rate": 9.890096466592303e-06, + "loss": 0.6574, + "step": 5054 + }, + { + "epoch": 0.27253612249299114, + "grad_norm": 0.7900424003601074, + "learning_rate": 9.890052255847646e-06, + "loss": 0.8187, + "step": 5055 + }, + { + "epoch": 0.27259003666163467, + "grad_norm": 0.8364367485046387, + "learning_rate": 9.890008036311334e-06, + "loss": 0.7423, + "step": 5056 + }, + { + "epoch": 0.2726439508302782, + "grad_norm": 0.7436595559120178, + "learning_rate": 9.889963807983447e-06, + "loss": 0.7412, + "step": 5057 + }, + { + "epoch": 0.27269786499892174, + "grad_norm": 0.7472354769706726, + "learning_rate": 9.889919570864066e-06, + "loss": 0.8264, + "step": 5058 + }, + { + "epoch": 
0.2727517791675652, + "grad_norm": 0.7758167386054993, + "learning_rate": 9.889875324953268e-06, + "loss": 0.7133, + "step": 5059 + }, + { + "epoch": 0.27280569333620874, + "grad_norm": 0.7223731875419617, + "learning_rate": 9.889831070251135e-06, + "loss": 0.7244, + "step": 5060 + }, + { + "epoch": 0.2728596075048523, + "grad_norm": 1.041771650314331, + "learning_rate": 9.889786806757743e-06, + "loss": 0.9429, + "step": 5061 + }, + { + "epoch": 0.2729135216734958, + "grad_norm": 0.8936665654182434, + "learning_rate": 9.889742534473174e-06, + "loss": 0.7424, + "step": 5062 + }, + { + "epoch": 0.27296743584213934, + "grad_norm": 0.8620690107345581, + "learning_rate": 9.88969825339751e-06, + "loss": 0.8211, + "step": 5063 + }, + { + "epoch": 0.2730213500107828, + "grad_norm": 0.8004252314567566, + "learning_rate": 9.889653963530826e-06, + "loss": 0.7296, + "step": 5064 + }, + { + "epoch": 0.27307526417942635, + "grad_norm": 0.7337127327919006, + "learning_rate": 9.889609664873203e-06, + "loss": 0.7898, + "step": 5065 + }, + { + "epoch": 0.2731291783480699, + "grad_norm": 1.7178047895431519, + "learning_rate": 9.889565357424722e-06, + "loss": 0.8032, + "step": 5066 + }, + { + "epoch": 0.2731830925167134, + "grad_norm": 0.871757984161377, + "learning_rate": 9.889521041185464e-06, + "loss": 0.8074, + "step": 5067 + }, + { + "epoch": 0.2732370066853569, + "grad_norm": 1.1161519289016724, + "learning_rate": 9.889476716155503e-06, + "loss": 0.8783, + "step": 5068 + }, + { + "epoch": 0.2732909208540004, + "grad_norm": 1.4781978130340576, + "learning_rate": 9.889432382334924e-06, + "loss": 0.8364, + "step": 5069 + }, + { + "epoch": 0.27334483502264395, + "grad_norm": 0.7921425700187683, + "learning_rate": 9.889388039723807e-06, + "loss": 0.7559, + "step": 5070 + }, + { + "epoch": 0.2733987491912875, + "grad_norm": 0.9014592170715332, + "learning_rate": 9.889343688322227e-06, + "loss": 0.8887, + "step": 5071 + }, + { + "epoch": 0.273452663359931, + "grad_norm": 
0.7558442950248718, + "learning_rate": 9.889299328130268e-06, + "loss": 0.7839, + "step": 5072 + }, + { + "epoch": 0.2735065775285745, + "grad_norm": 0.7945775985717773, + "learning_rate": 9.889254959148006e-06, + "loss": 0.8413, + "step": 5073 + }, + { + "epoch": 0.273560491697218, + "grad_norm": 0.8391217589378357, + "learning_rate": 9.889210581375526e-06, + "loss": 0.7617, + "step": 5074 + }, + { + "epoch": 0.27361440586586155, + "grad_norm": 0.8547251224517822, + "learning_rate": 9.889166194812903e-06, + "loss": 0.7955, + "step": 5075 + }, + { + "epoch": 0.2736683200345051, + "grad_norm": 0.8064761757850647, + "learning_rate": 9.88912179946022e-06, + "loss": 0.7557, + "step": 5076 + }, + { + "epoch": 0.27372223420314856, + "grad_norm": 0.7102752923965454, + "learning_rate": 9.889077395317553e-06, + "loss": 0.7526, + "step": 5077 + }, + { + "epoch": 0.2737761483717921, + "grad_norm": 0.8167790770530701, + "learning_rate": 9.889032982384986e-06, + "loss": 0.8245, + "step": 5078 + }, + { + "epoch": 0.2738300625404356, + "grad_norm": 0.7231212854385376, + "learning_rate": 9.888988560662597e-06, + "loss": 0.799, + "step": 5079 + }, + { + "epoch": 0.27388397670907916, + "grad_norm": 0.7393338084220886, + "learning_rate": 9.888944130150464e-06, + "loss": 0.8118, + "step": 5080 + }, + { + "epoch": 0.2739378908777227, + "grad_norm": 0.847621738910675, + "learning_rate": 9.888899690848673e-06, + "loss": 0.8174, + "step": 5081 + }, + { + "epoch": 0.27399180504636617, + "grad_norm": 0.9880374073982239, + "learning_rate": 9.888855242757296e-06, + "loss": 0.9501, + "step": 5082 + }, + { + "epoch": 0.2740457192150097, + "grad_norm": 0.7384204864501953, + "learning_rate": 9.888810785876416e-06, + "loss": 0.785, + "step": 5083 + }, + { + "epoch": 0.27409963338365323, + "grad_norm": 1.001950740814209, + "learning_rate": 9.888766320206118e-06, + "loss": 0.8439, + "step": 5084 + }, + { + "epoch": 0.27415354755229676, + "grad_norm": 0.8231346011161804, + "learning_rate": 
9.888721845746473e-06, + "loss": 0.8127, + "step": 5085 + }, + { + "epoch": 0.27420746172094024, + "grad_norm": 0.7128643989562988, + "learning_rate": 9.888677362497568e-06, + "loss": 0.6922, + "step": 5086 + }, + { + "epoch": 0.27426137588958377, + "grad_norm": 0.7206726670265198, + "learning_rate": 9.88863287045948e-06, + "loss": 0.7977, + "step": 5087 + }, + { + "epoch": 0.2743152900582273, + "grad_norm": 0.7943522334098816, + "learning_rate": 9.888588369632289e-06, + "loss": 0.7565, + "step": 5088 + }, + { + "epoch": 0.27436920422687083, + "grad_norm": 0.7610237002372742, + "learning_rate": 9.888543860016075e-06, + "loss": 0.7539, + "step": 5089 + }, + { + "epoch": 0.27442311839551436, + "grad_norm": 0.7111551761627197, + "learning_rate": 9.88849934161092e-06, + "loss": 0.6942, + "step": 5090 + }, + { + "epoch": 0.27447703256415784, + "grad_norm": 0.8590908050537109, + "learning_rate": 9.888454814416901e-06, + "loss": 0.8405, + "step": 5091 + }, + { + "epoch": 0.2745309467328014, + "grad_norm": 0.7120518684387207, + "learning_rate": 9.888410278434101e-06, + "loss": 0.7574, + "step": 5092 + }, + { + "epoch": 0.2745848609014449, + "grad_norm": 0.7736578583717346, + "learning_rate": 9.888365733662598e-06, + "loss": 0.7823, + "step": 5093 + }, + { + "epoch": 0.27463877507008844, + "grad_norm": 0.712278425693512, + "learning_rate": 9.888321180102472e-06, + "loss": 0.7657, + "step": 5094 + }, + { + "epoch": 0.2746926892387319, + "grad_norm": 0.7149209976196289, + "learning_rate": 9.888276617753804e-06, + "loss": 0.7515, + "step": 5095 + }, + { + "epoch": 0.27474660340737544, + "grad_norm": 0.8070907592773438, + "learning_rate": 9.888232046616676e-06, + "loss": 0.7541, + "step": 5096 + }, + { + "epoch": 0.274800517576019, + "grad_norm": 0.8107784390449524, + "learning_rate": 9.888187466691163e-06, + "loss": 0.896, + "step": 5097 + }, + { + "epoch": 0.2748544317446625, + "grad_norm": 0.7852044105529785, + "learning_rate": 9.888142877977349e-06, + "loss": 0.8934, + 
"step": 5098 + }, + { + "epoch": 0.27490834591330604, + "grad_norm": 0.8732671141624451, + "learning_rate": 9.888098280475315e-06, + "loss": 0.7711, + "step": 5099 + }, + { + "epoch": 0.2749622600819495, + "grad_norm": 0.8847461342811584, + "learning_rate": 9.888053674185138e-06, + "loss": 0.7291, + "step": 5100 + }, + { + "epoch": 0.27501617425059305, + "grad_norm": 0.8422223329544067, + "learning_rate": 9.8880090591069e-06, + "loss": 0.6604, + "step": 5101 + }, + { + "epoch": 0.2750700884192366, + "grad_norm": 0.6901240944862366, + "learning_rate": 9.887964435240681e-06, + "loss": 0.7411, + "step": 5102 + }, + { + "epoch": 0.2751240025878801, + "grad_norm": 0.7141496539115906, + "learning_rate": 9.887919802586561e-06, + "loss": 0.7647, + "step": 5103 + }, + { + "epoch": 0.2751779167565236, + "grad_norm": 0.7716993093490601, + "learning_rate": 9.88787516114462e-06, + "loss": 0.7541, + "step": 5104 + }, + { + "epoch": 0.2752318309251671, + "grad_norm": 0.7874771356582642, + "learning_rate": 9.88783051091494e-06, + "loss": 0.7992, + "step": 5105 + }, + { + "epoch": 0.27528574509381065, + "grad_norm": 0.7106810212135315, + "learning_rate": 9.8877858518976e-06, + "loss": 0.7383, + "step": 5106 + }, + { + "epoch": 0.2753396592624542, + "grad_norm": 0.7486706376075745, + "learning_rate": 9.88774118409268e-06, + "loss": 0.7741, + "step": 5107 + }, + { + "epoch": 0.2753935734310977, + "grad_norm": 0.8137489557266235, + "learning_rate": 9.887696507500259e-06, + "loss": 0.8238, + "step": 5108 + }, + { + "epoch": 0.2754474875997412, + "grad_norm": 0.8295445442199707, + "learning_rate": 9.88765182212042e-06, + "loss": 0.7718, + "step": 5109 + }, + { + "epoch": 0.2755014017683847, + "grad_norm": 0.8613603115081787, + "learning_rate": 9.887607127953243e-06, + "loss": 0.835, + "step": 5110 + }, + { + "epoch": 0.27555531593702826, + "grad_norm": 0.7091763019561768, + "learning_rate": 9.887562424998806e-06, + "loss": 0.7089, + "step": 5111 + }, + { + "epoch": 0.2756092301056718, + 
"grad_norm": 0.7690724730491638, + "learning_rate": 9.887517713257193e-06, + "loss": 0.7846, + "step": 5112 + }, + { + "epoch": 0.27566314427431526, + "grad_norm": 0.7905461192131042, + "learning_rate": 9.88747299272848e-06, + "loss": 0.7955, + "step": 5113 + }, + { + "epoch": 0.2757170584429588, + "grad_norm": 0.7611652612686157, + "learning_rate": 9.887428263412752e-06, + "loss": 0.7802, + "step": 5114 + }, + { + "epoch": 0.2757709726116023, + "grad_norm": 0.7323983311653137, + "learning_rate": 9.887383525310086e-06, + "loss": 0.8312, + "step": 5115 + }, + { + "epoch": 0.27582488678024586, + "grad_norm": 0.7839152216911316, + "learning_rate": 9.887338778420563e-06, + "loss": 0.7792, + "step": 5116 + }, + { + "epoch": 0.2758788009488894, + "grad_norm": 0.9436889886856079, + "learning_rate": 9.887294022744264e-06, + "loss": 0.8232, + "step": 5117 + }, + { + "epoch": 0.27593271511753287, + "grad_norm": 0.7726641893386841, + "learning_rate": 9.88724925828127e-06, + "loss": 0.7142, + "step": 5118 + }, + { + "epoch": 0.2759866292861764, + "grad_norm": 0.7798104286193848, + "learning_rate": 9.887204485031662e-06, + "loss": 0.7575, + "step": 5119 + }, + { + "epoch": 0.27604054345481993, + "grad_norm": 0.7332453727722168, + "learning_rate": 9.887159702995518e-06, + "loss": 0.7362, + "step": 5120 + }, + { + "epoch": 0.27609445762346346, + "grad_norm": 0.7793838381767273, + "learning_rate": 9.887114912172922e-06, + "loss": 0.8488, + "step": 5121 + }, + { + "epoch": 0.27614837179210694, + "grad_norm": 0.8711932301521301, + "learning_rate": 9.88707011256395e-06, + "loss": 0.714, + "step": 5122 + }, + { + "epoch": 0.27620228596075047, + "grad_norm": 0.747809886932373, + "learning_rate": 9.887025304168686e-06, + "loss": 0.7847, + "step": 5123 + }, + { + "epoch": 0.276256200129394, + "grad_norm": 0.7189614176750183, + "learning_rate": 9.88698048698721e-06, + "loss": 0.7773, + "step": 5124 + }, + { + "epoch": 0.27631011429803753, + "grad_norm": 0.745582640171051, + 
"learning_rate": 9.886935661019604e-06, + "loss": 0.7567, + "step": 5125 + }, + { + "epoch": 0.27636402846668107, + "grad_norm": 0.7648694515228271, + "learning_rate": 9.886890826265942e-06, + "loss": 0.7938, + "step": 5126 + }, + { + "epoch": 0.27641794263532454, + "grad_norm": 0.8848762512207031, + "learning_rate": 9.886845982726312e-06, + "loss": 0.7978, + "step": 5127 + }, + { + "epoch": 0.2764718568039681, + "grad_norm": 0.8495482206344604, + "learning_rate": 9.886801130400794e-06, + "loss": 0.8016, + "step": 5128 + }, + { + "epoch": 0.2765257709726116, + "grad_norm": 0.7696657180786133, + "learning_rate": 9.886756269289463e-06, + "loss": 0.8715, + "step": 5129 + }, + { + "epoch": 0.27657968514125514, + "grad_norm": 0.7655208110809326, + "learning_rate": 9.886711399392406e-06, + "loss": 0.7964, + "step": 5130 + }, + { + "epoch": 0.2766335993098986, + "grad_norm": 0.7606762051582336, + "learning_rate": 9.8866665207097e-06, + "loss": 0.7159, + "step": 5131 + }, + { + "epoch": 0.27668751347854215, + "grad_norm": 0.8046274781227112, + "learning_rate": 9.886621633241427e-06, + "loss": 0.8083, + "step": 5132 + }, + { + "epoch": 0.2767414276471857, + "grad_norm": 0.9933425784111023, + "learning_rate": 9.886576736987667e-06, + "loss": 0.8654, + "step": 5133 + }, + { + "epoch": 0.2767953418158292, + "grad_norm": 1.6594408750534058, + "learning_rate": 9.8865318319485e-06, + "loss": 0.9209, + "step": 5134 + }, + { + "epoch": 0.27684925598447274, + "grad_norm": 0.857893168926239, + "learning_rate": 9.88648691812401e-06, + "loss": 0.7785, + "step": 5135 + }, + { + "epoch": 0.2769031701531162, + "grad_norm": 0.8305732011795044, + "learning_rate": 9.886441995514275e-06, + "loss": 0.8565, + "step": 5136 + }, + { + "epoch": 0.27695708432175975, + "grad_norm": 0.7797301411628723, + "learning_rate": 9.886397064119375e-06, + "loss": 0.7577, + "step": 5137 + }, + { + "epoch": 0.2770109984904033, + "grad_norm": 0.8581737875938416, + "learning_rate": 9.886352123939393e-06, + "loss": 
0.8265, + "step": 5138 + }, + { + "epoch": 0.2770649126590468, + "grad_norm": 0.7265759110450745, + "learning_rate": 9.88630717497441e-06, + "loss": 0.7848, + "step": 5139 + }, + { + "epoch": 0.2771188268276903, + "grad_norm": 0.7873173952102661, + "learning_rate": 9.886262217224505e-06, + "loss": 0.8573, + "step": 5140 + }, + { + "epoch": 0.2771727409963338, + "grad_norm": 0.755599319934845, + "learning_rate": 9.886217250689758e-06, + "loss": 0.7217, + "step": 5141 + }, + { + "epoch": 0.27722665516497735, + "grad_norm": 0.8430512547492981, + "learning_rate": 9.886172275370254e-06, + "loss": 0.8689, + "step": 5142 + }, + { + "epoch": 0.2772805693336209, + "grad_norm": 0.8128552436828613, + "learning_rate": 9.88612729126607e-06, + "loss": 0.7929, + "step": 5143 + }, + { + "epoch": 0.2773344835022644, + "grad_norm": 0.7788698077201843, + "learning_rate": 9.886082298377287e-06, + "loss": 0.8285, + "step": 5144 + }, + { + "epoch": 0.2773883976709079, + "grad_norm": 0.8579205870628357, + "learning_rate": 9.886037296703987e-06, + "loss": 0.8288, + "step": 5145 + }, + { + "epoch": 0.2774423118395514, + "grad_norm": 0.767217755317688, + "learning_rate": 9.885992286246253e-06, + "loss": 0.7999, + "step": 5146 + }, + { + "epoch": 0.27749622600819496, + "grad_norm": 0.7575383186340332, + "learning_rate": 9.885947267004162e-06, + "loss": 0.7647, + "step": 5147 + }, + { + "epoch": 0.2775501401768385, + "grad_norm": 0.8674237132072449, + "learning_rate": 9.885902238977798e-06, + "loss": 0.6781, + "step": 5148 + }, + { + "epoch": 0.27760405434548197, + "grad_norm": 0.6494048833847046, + "learning_rate": 9.885857202167239e-06, + "loss": 0.6748, + "step": 5149 + }, + { + "epoch": 0.2776579685141255, + "grad_norm": 0.8333936333656311, + "learning_rate": 9.885812156572569e-06, + "loss": 0.8393, + "step": 5150 + }, + { + "epoch": 0.27771188268276903, + "grad_norm": 0.8702477812767029, + "learning_rate": 9.885767102193869e-06, + "loss": 0.6428, + "step": 5151 + }, + { + "epoch": 
0.27776579685141256, + "grad_norm": 0.8017061948776245, + "learning_rate": 9.885722039031217e-06, + "loss": 0.7933, + "step": 5152 + }, + { + "epoch": 0.2778197110200561, + "grad_norm": 0.7803055047988892, + "learning_rate": 9.885676967084696e-06, + "loss": 0.723, + "step": 5153 + }, + { + "epoch": 0.27787362518869957, + "grad_norm": 0.7224579453468323, + "learning_rate": 9.885631886354387e-06, + "loss": 0.74, + "step": 5154 + }, + { + "epoch": 0.2779275393573431, + "grad_norm": 1.1245145797729492, + "learning_rate": 9.885586796840369e-06, + "loss": 0.877, + "step": 5155 + }, + { + "epoch": 0.27798145352598663, + "grad_norm": 0.7135274410247803, + "learning_rate": 9.885541698542728e-06, + "loss": 0.775, + "step": 5156 + }, + { + "epoch": 0.27803536769463016, + "grad_norm": 0.7516048550605774, + "learning_rate": 9.885496591461541e-06, + "loss": 0.8302, + "step": 5157 + }, + { + "epoch": 0.27808928186327364, + "grad_norm": 0.8390230536460876, + "learning_rate": 9.885451475596887e-06, + "loss": 0.8098, + "step": 5158 + }, + { + "epoch": 0.2781431960319172, + "grad_norm": 0.7310529947280884, + "learning_rate": 9.885406350948854e-06, + "loss": 0.7605, + "step": 5159 + }, + { + "epoch": 0.2781971102005607, + "grad_norm": 0.7502579689025879, + "learning_rate": 9.885361217517517e-06, + "loss": 0.8413, + "step": 5160 + }, + { + "epoch": 0.27825102436920424, + "grad_norm": 0.7119940519332886, + "learning_rate": 9.885316075302963e-06, + "loss": 0.6954, + "step": 5161 + }, + { + "epoch": 0.27830493853784777, + "grad_norm": 0.7565783262252808, + "learning_rate": 9.885270924305266e-06, + "loss": 0.7479, + "step": 5162 + }, + { + "epoch": 0.27835885270649124, + "grad_norm": 0.7579078078269958, + "learning_rate": 9.885225764524511e-06, + "loss": 0.7976, + "step": 5163 + }, + { + "epoch": 0.2784127668751348, + "grad_norm": 0.7112993001937866, + "learning_rate": 9.885180595960779e-06, + "loss": 0.7153, + "step": 5164 + }, + { + "epoch": 0.2784666810437783, + "grad_norm": 
1.1651597023010254, + "learning_rate": 9.88513541861415e-06, + "loss": 0.7977, + "step": 5165 + }, + { + "epoch": 0.27852059521242184, + "grad_norm": 0.7818348407745361, + "learning_rate": 9.88509023248471e-06, + "loss": 0.7502, + "step": 5166 + }, + { + "epoch": 0.27857450938106537, + "grad_norm": 0.6622827053070068, + "learning_rate": 9.885045037572534e-06, + "loss": 0.6677, + "step": 5167 + }, + { + "epoch": 0.27862842354970885, + "grad_norm": 0.7490810751914978, + "learning_rate": 9.884999833877706e-06, + "loss": 0.8084, + "step": 5168 + }, + { + "epoch": 0.2786823377183524, + "grad_norm": 0.7105234861373901, + "learning_rate": 9.88495462140031e-06, + "loss": 0.7346, + "step": 5169 + }, + { + "epoch": 0.2787362518869959, + "grad_norm": 0.7885896563529968, + "learning_rate": 9.884909400140421e-06, + "loss": 0.7143, + "step": 5170 + }, + { + "epoch": 0.27879016605563944, + "grad_norm": 0.8720527291297913, + "learning_rate": 9.884864170098125e-06, + "loss": 0.7752, + "step": 5171 + }, + { + "epoch": 0.2788440802242829, + "grad_norm": 0.749433159828186, + "learning_rate": 9.884818931273501e-06, + "loss": 0.8073, + "step": 5172 + }, + { + "epoch": 0.27889799439292645, + "grad_norm": 0.784222424030304, + "learning_rate": 9.884773683666633e-06, + "loss": 0.8404, + "step": 5173 + }, + { + "epoch": 0.27895190856157, + "grad_norm": 0.7572906613349915, + "learning_rate": 9.8847284272776e-06, + "loss": 0.8205, + "step": 5174 + }, + { + "epoch": 0.2790058227302135, + "grad_norm": 0.793807327747345, + "learning_rate": 9.884683162106484e-06, + "loss": 0.7864, + "step": 5175 + }, + { + "epoch": 0.27905973689885705, + "grad_norm": 0.7129535675048828, + "learning_rate": 9.884637888153366e-06, + "loss": 0.8112, + "step": 5176 + }, + { + "epoch": 0.2791136510675005, + "grad_norm": 0.9506208896636963, + "learning_rate": 9.884592605418329e-06, + "loss": 0.8708, + "step": 5177 + }, + { + "epoch": 0.27916756523614406, + "grad_norm": 0.7119637727737427, + "learning_rate": 
9.884547313901452e-06, + "loss": 0.7684, + "step": 5178 + }, + { + "epoch": 0.2792214794047876, + "grad_norm": 0.7711455225944519, + "learning_rate": 9.88450201360282e-06, + "loss": 0.7419, + "step": 5179 + }, + { + "epoch": 0.2792753935734311, + "grad_norm": 0.7384727001190186, + "learning_rate": 9.88445670452251e-06, + "loss": 0.8021, + "step": 5180 + }, + { + "epoch": 0.2793293077420746, + "grad_norm": 0.8161928057670593, + "learning_rate": 9.884411386660606e-06, + "loss": 0.8036, + "step": 5181 + }, + { + "epoch": 0.2793832219107181, + "grad_norm": 0.7533312439918518, + "learning_rate": 9.88436606001719e-06, + "loss": 0.6964, + "step": 5182 + }, + { + "epoch": 0.27943713607936166, + "grad_norm": 0.7554582357406616, + "learning_rate": 9.884320724592342e-06, + "loss": 0.7167, + "step": 5183 + }, + { + "epoch": 0.2794910502480052, + "grad_norm": 0.8615080118179321, + "learning_rate": 9.884275380386143e-06, + "loss": 0.8091, + "step": 5184 + }, + { + "epoch": 0.2795449644166487, + "grad_norm": 0.7100309133529663, + "learning_rate": 9.884230027398676e-06, + "loss": 0.7496, + "step": 5185 + }, + { + "epoch": 0.2795988785852922, + "grad_norm": 0.7255486845970154, + "learning_rate": 9.884184665630024e-06, + "loss": 0.6666, + "step": 5186 + }, + { + "epoch": 0.27965279275393573, + "grad_norm": 0.8223450779914856, + "learning_rate": 9.884139295080264e-06, + "loss": 0.7526, + "step": 5187 + }, + { + "epoch": 0.27970670692257926, + "grad_norm": 0.7971575856208801, + "learning_rate": 9.884093915749483e-06, + "loss": 0.7606, + "step": 5188 + }, + { + "epoch": 0.2797606210912228, + "grad_norm": 0.749407947063446, + "learning_rate": 9.884048527637757e-06, + "loss": 0.7972, + "step": 5189 + }, + { + "epoch": 0.27981453525986627, + "grad_norm": 0.812382698059082, + "learning_rate": 9.884003130745172e-06, + "loss": 0.736, + "step": 5190 + }, + { + "epoch": 0.2798684494285098, + "grad_norm": 0.7022697925567627, + "learning_rate": 9.883957725071808e-06, + "loss": 0.7541, + "step": 
5191 + }, + { + "epoch": 0.27992236359715333, + "grad_norm": 0.8105473518371582, + "learning_rate": 9.883912310617747e-06, + "loss": 0.8424, + "step": 5192 + }, + { + "epoch": 0.27997627776579687, + "grad_norm": 0.7091902494430542, + "learning_rate": 9.883866887383072e-06, + "loss": 0.7669, + "step": 5193 + }, + { + "epoch": 0.2800301919344404, + "grad_norm": 1.0922960042953491, + "learning_rate": 9.88382145536786e-06, + "loss": 0.7833, + "step": 5194 + }, + { + "epoch": 0.2800841061030839, + "grad_norm": 0.6879577040672302, + "learning_rate": 9.883776014572197e-06, + "loss": 0.7359, + "step": 5195 + }, + { + "epoch": 0.2801380202717274, + "grad_norm": 0.7436100244522095, + "learning_rate": 9.883730564996164e-06, + "loss": 0.8516, + "step": 5196 + }, + { + "epoch": 0.28019193444037094, + "grad_norm": 0.6883706450462341, + "learning_rate": 9.88368510663984e-06, + "loss": 0.7517, + "step": 5197 + }, + { + "epoch": 0.28024584860901447, + "grad_norm": 0.7650019526481628, + "learning_rate": 9.88363963950331e-06, + "loss": 0.8363, + "step": 5198 + }, + { + "epoch": 0.28029976277765795, + "grad_norm": 0.7188605070114136, + "learning_rate": 9.883594163586657e-06, + "loss": 0.6951, + "step": 5199 + }, + { + "epoch": 0.2803536769463015, + "grad_norm": 0.8194975852966309, + "learning_rate": 9.883548678889956e-06, + "loss": 0.8482, + "step": 5200 + }, + { + "epoch": 0.280407591114945, + "grad_norm": 0.8289690017700195, + "learning_rate": 9.883503185413296e-06, + "loss": 0.779, + "step": 5201 + }, + { + "epoch": 0.28046150528358854, + "grad_norm": 0.7521582245826721, + "learning_rate": 9.883457683156755e-06, + "loss": 0.7972, + "step": 5202 + }, + { + "epoch": 0.2805154194522321, + "grad_norm": 0.788716733455658, + "learning_rate": 9.883412172120416e-06, + "loss": 0.7889, + "step": 5203 + }, + { + "epoch": 0.28056933362087555, + "grad_norm": 0.7104058265686035, + "learning_rate": 9.88336665230436e-06, + "loss": 0.8007, + "step": 5204 + }, + { + "epoch": 0.2806232477895191, + 
"grad_norm": 0.7565460801124573, + "learning_rate": 9.88332112370867e-06, + "loss": 0.7697, + "step": 5205 + }, + { + "epoch": 0.2806771619581626, + "grad_norm": 0.7092845439910889, + "learning_rate": 9.883275586333427e-06, + "loss": 0.8195, + "step": 5206 + }, + { + "epoch": 0.28073107612680614, + "grad_norm": 0.7442013621330261, + "learning_rate": 9.883230040178712e-06, + "loss": 0.7558, + "step": 5207 + }, + { + "epoch": 0.2807849902954496, + "grad_norm": 0.7580548524856567, + "learning_rate": 9.88318448524461e-06, + "loss": 0.8208, + "step": 5208 + }, + { + "epoch": 0.28083890446409315, + "grad_norm": 0.8129982948303223, + "learning_rate": 9.883138921531202e-06, + "loss": 0.7475, + "step": 5209 + }, + { + "epoch": 0.2808928186327367, + "grad_norm": 0.708234429359436, + "learning_rate": 9.883093349038567e-06, + "loss": 0.7157, + "step": 5210 + }, + { + "epoch": 0.2809467328013802, + "grad_norm": 0.8034481406211853, + "learning_rate": 9.883047767766786e-06, + "loss": 0.8209, + "step": 5211 + }, + { + "epoch": 0.28100064697002375, + "grad_norm": 0.7125903367996216, + "learning_rate": 9.883002177715948e-06, + "loss": 0.7216, + "step": 5212 + }, + { + "epoch": 0.2810545611386672, + "grad_norm": 0.7171017527580261, + "learning_rate": 9.882956578886128e-06, + "loss": 0.7364, + "step": 5213 + }, + { + "epoch": 0.28110847530731076, + "grad_norm": 0.8210889101028442, + "learning_rate": 9.882910971277413e-06, + "loss": 0.7802, + "step": 5214 + }, + { + "epoch": 0.2811623894759543, + "grad_norm": 0.8742465972900391, + "learning_rate": 9.88286535488988e-06, + "loss": 0.8323, + "step": 5215 + }, + { + "epoch": 0.2812163036445978, + "grad_norm": 0.7613638043403625, + "learning_rate": 9.882819729723616e-06, + "loss": 0.82, + "step": 5216 + }, + { + "epoch": 0.2812702178132413, + "grad_norm": 0.8171275854110718, + "learning_rate": 9.882774095778698e-06, + "loss": 0.7821, + "step": 5217 + }, + { + "epoch": 0.28132413198188483, + "grad_norm": 0.9041802287101746, + 
"learning_rate": 9.882728453055212e-06, + "loss": 0.7862, + "step": 5218 + }, + { + "epoch": 0.28137804615052836, + "grad_norm": 0.7604931592941284, + "learning_rate": 9.88268280155324e-06, + "loss": 0.7713, + "step": 5219 + }, + { + "epoch": 0.2814319603191719, + "grad_norm": 0.7445857524871826, + "learning_rate": 9.882637141272861e-06, + "loss": 0.7453, + "step": 5220 + }, + { + "epoch": 0.2814858744878154, + "grad_norm": 0.7988085150718689, + "learning_rate": 9.882591472214161e-06, + "loss": 0.7268, + "step": 5221 + }, + { + "epoch": 0.2815397886564589, + "grad_norm": 0.7210063934326172, + "learning_rate": 9.882545794377219e-06, + "loss": 0.7605, + "step": 5222 + }, + { + "epoch": 0.28159370282510243, + "grad_norm": 0.8361137509346008, + "learning_rate": 9.882500107762117e-06, + "loss": 0.8923, + "step": 5223 + }, + { + "epoch": 0.28164761699374596, + "grad_norm": 0.7608784437179565, + "learning_rate": 9.88245441236894e-06, + "loss": 0.7601, + "step": 5224 + }, + { + "epoch": 0.2817015311623895, + "grad_norm": 0.7668020725250244, + "learning_rate": 9.882408708197766e-06, + "loss": 0.7655, + "step": 5225 + }, + { + "epoch": 0.28175544533103297, + "grad_norm": 0.7516483068466187, + "learning_rate": 9.882362995248681e-06, + "loss": 0.7331, + "step": 5226 + }, + { + "epoch": 0.2818093594996765, + "grad_norm": 0.7915279865264893, + "learning_rate": 9.882317273521769e-06, + "loss": 0.8115, + "step": 5227 + }, + { + "epoch": 0.28186327366832004, + "grad_norm": 0.8899939656257629, + "learning_rate": 9.882271543017106e-06, + "loss": 0.7087, + "step": 5228 + }, + { + "epoch": 0.28191718783696357, + "grad_norm": 0.7095377445220947, + "learning_rate": 9.882225803734778e-06, + "loss": 0.6556, + "step": 5229 + }, + { + "epoch": 0.2819711020056071, + "grad_norm": 0.9443415403366089, + "learning_rate": 9.882180055674864e-06, + "loss": 0.7678, + "step": 5230 + }, + { + "epoch": 0.2820250161742506, + "grad_norm": 0.7427262663841248, + "learning_rate": 9.882134298837452e-06, + 
"loss": 0.7256, + "step": 5231 + }, + { + "epoch": 0.2820789303428941, + "grad_norm": 0.8854336142539978, + "learning_rate": 9.88208853322262e-06, + "loss": 0.7646, + "step": 5232 + }, + { + "epoch": 0.28213284451153764, + "grad_norm": 0.7270344495773315, + "learning_rate": 9.88204275883045e-06, + "loss": 0.7714, + "step": 5233 + }, + { + "epoch": 0.28218675868018117, + "grad_norm": 0.8252066969871521, + "learning_rate": 9.881996975661026e-06, + "loss": 0.835, + "step": 5234 + }, + { + "epoch": 0.28224067284882465, + "grad_norm": 0.803297221660614, + "learning_rate": 9.881951183714432e-06, + "loss": 0.758, + "step": 5235 + }, + { + "epoch": 0.2822945870174682, + "grad_norm": 0.7957141399383545, + "learning_rate": 9.881905382990746e-06, + "loss": 0.8464, + "step": 5236 + }, + { + "epoch": 0.2823485011861117, + "grad_norm": 0.7272878885269165, + "learning_rate": 9.881859573490054e-06, + "loss": 0.8062, + "step": 5237 + }, + { + "epoch": 0.28240241535475524, + "grad_norm": 0.7172011733055115, + "learning_rate": 9.881813755212434e-06, + "loss": 0.8349, + "step": 5238 + }, + { + "epoch": 0.2824563295233988, + "grad_norm": 0.9161372184753418, + "learning_rate": 9.881767928157976e-06, + "loss": 0.815, + "step": 5239 + }, + { + "epoch": 0.28251024369204225, + "grad_norm": 0.6740238666534424, + "learning_rate": 9.881722092326753e-06, + "loss": 0.7096, + "step": 5240 + }, + { + "epoch": 0.2825641578606858, + "grad_norm": 0.740080714225769, + "learning_rate": 9.881676247718855e-06, + "loss": 0.7848, + "step": 5241 + }, + { + "epoch": 0.2826180720293293, + "grad_norm": 0.7177533507347107, + "learning_rate": 9.88163039433436e-06, + "loss": 0.7406, + "step": 5242 + }, + { + "epoch": 0.28267198619797285, + "grad_norm": 0.6941720247268677, + "learning_rate": 9.881584532173352e-06, + "loss": 0.746, + "step": 5243 + }, + { + "epoch": 0.2827259003666163, + "grad_norm": 0.902332603931427, + "learning_rate": 9.881538661235914e-06, + "loss": 0.8925, + "step": 5244 + }, + { + "epoch": 
0.28277981453525985, + "grad_norm": 0.7620295882225037, + "learning_rate": 9.881492781522128e-06, + "loss": 0.7031, + "step": 5245 + }, + { + "epoch": 0.2828337287039034, + "grad_norm": 0.735544741153717, + "learning_rate": 9.881446893032077e-06, + "loss": 0.8363, + "step": 5246 + }, + { + "epoch": 0.2828876428725469, + "grad_norm": 0.7686198353767395, + "learning_rate": 9.881400995765843e-06, + "loss": 0.681, + "step": 5247 + }, + { + "epoch": 0.28294155704119045, + "grad_norm": 0.7868270874023438, + "learning_rate": 9.881355089723505e-06, + "loss": 0.7561, + "step": 5248 + }, + { + "epoch": 0.2829954712098339, + "grad_norm": 0.8680627942085266, + "learning_rate": 9.881309174905152e-06, + "loss": 0.7526, + "step": 5249 + }, + { + "epoch": 0.28304938537847746, + "grad_norm": 0.748607873916626, + "learning_rate": 9.881263251310862e-06, + "loss": 0.7898, + "step": 5250 + }, + { + "epoch": 0.283103299547121, + "grad_norm": 0.7534239292144775, + "learning_rate": 9.88121731894072e-06, + "loss": 0.7188, + "step": 5251 + }, + { + "epoch": 0.2831572137157645, + "grad_norm": 0.7027668952941895, + "learning_rate": 9.881171377794808e-06, + "loss": 0.8206, + "step": 5252 + }, + { + "epoch": 0.283211127884408, + "grad_norm": 0.7831504940986633, + "learning_rate": 9.881125427873206e-06, + "loss": 0.7974, + "step": 5253 + }, + { + "epoch": 0.28326504205305153, + "grad_norm": 0.657393753528595, + "learning_rate": 9.881079469176e-06, + "loss": 0.6525, + "step": 5254 + }, + { + "epoch": 0.28331895622169506, + "grad_norm": 0.7056339383125305, + "learning_rate": 9.881033501703272e-06, + "loss": 0.6854, + "step": 5255 + }, + { + "epoch": 0.2833728703903386, + "grad_norm": 0.7217456102371216, + "learning_rate": 9.880987525455105e-06, + "loss": 0.7541, + "step": 5256 + }, + { + "epoch": 0.2834267845589821, + "grad_norm": 0.7223145961761475, + "learning_rate": 9.880941540431579e-06, + "loss": 0.8176, + "step": 5257 + }, + { + "epoch": 0.2834806987276256, + "grad_norm": 0.6996636986732483, 
+ "learning_rate": 9.880895546632779e-06, + "loss": 0.7219, + "step": 5258 + }, + { + "epoch": 0.28353461289626913, + "grad_norm": 0.7340953946113586, + "learning_rate": 9.880849544058787e-06, + "loss": 0.8225, + "step": 5259 + }, + { + "epoch": 0.28358852706491267, + "grad_norm": 0.7698047757148743, + "learning_rate": 9.880803532709687e-06, + "loss": 0.8609, + "step": 5260 + }, + { + "epoch": 0.2836424412335562, + "grad_norm": 0.781949520111084, + "learning_rate": 9.880757512585558e-06, + "loss": 0.8634, + "step": 5261 + }, + { + "epoch": 0.2836963554021997, + "grad_norm": 0.681658923625946, + "learning_rate": 9.880711483686488e-06, + "loss": 0.6711, + "step": 5262 + }, + { + "epoch": 0.2837502695708432, + "grad_norm": 0.802488386631012, + "learning_rate": 9.880665446012553e-06, + "loss": 0.7737, + "step": 5263 + }, + { + "epoch": 0.28380418373948674, + "grad_norm": 0.8142992258071899, + "learning_rate": 9.880619399563844e-06, + "loss": 0.8235, + "step": 5264 + }, + { + "epoch": 0.28385809790813027, + "grad_norm": 0.7499324083328247, + "learning_rate": 9.880573344340438e-06, + "loss": 0.7042, + "step": 5265 + }, + { + "epoch": 0.2839120120767738, + "grad_norm": 0.768059253692627, + "learning_rate": 9.880527280342419e-06, + "loss": 0.7423, + "step": 5266 + }, + { + "epoch": 0.2839659262454173, + "grad_norm": 0.7405000329017639, + "learning_rate": 9.88048120756987e-06, + "loss": 0.6528, + "step": 5267 + }, + { + "epoch": 0.2840198404140608, + "grad_norm": 0.7251627445220947, + "learning_rate": 9.880435126022875e-06, + "loss": 0.7517, + "step": 5268 + }, + { + "epoch": 0.28407375458270434, + "grad_norm": 0.9095546007156372, + "learning_rate": 9.880389035701515e-06, + "loss": 0.8721, + "step": 5269 + }, + { + "epoch": 0.2841276687513479, + "grad_norm": 0.7784069776535034, + "learning_rate": 9.880342936605874e-06, + "loss": 0.8546, + "step": 5270 + }, + { + "epoch": 0.28418158291999135, + "grad_norm": 0.6824434995651245, + "learning_rate": 9.880296828736034e-06, + 
"loss": 0.7583, + "step": 5271 + }, + { + "epoch": 0.2842354970886349, + "grad_norm": 0.749785840511322, + "learning_rate": 9.88025071209208e-06, + "loss": 0.709, + "step": 5272 + }, + { + "epoch": 0.2842894112572784, + "grad_norm": 0.7068313360214233, + "learning_rate": 9.880204586674093e-06, + "loss": 0.699, + "step": 5273 + }, + { + "epoch": 0.28434332542592194, + "grad_norm": 0.7990247011184692, + "learning_rate": 9.880158452482155e-06, + "loss": 0.8137, + "step": 5274 + }, + { + "epoch": 0.2843972395945655, + "grad_norm": 0.821013867855072, + "learning_rate": 9.880112309516352e-06, + "loss": 0.7723, + "step": 5275 + }, + { + "epoch": 0.28445115376320895, + "grad_norm": 0.680288553237915, + "learning_rate": 9.880066157776764e-06, + "loss": 0.6754, + "step": 5276 + }, + { + "epoch": 0.2845050679318525, + "grad_norm": 0.7425721883773804, + "learning_rate": 9.880019997263477e-06, + "loss": 0.7894, + "step": 5277 + }, + { + "epoch": 0.284558982100496, + "grad_norm": 0.7550294995307922, + "learning_rate": 9.87997382797657e-06, + "loss": 0.732, + "step": 5278 + }, + { + "epoch": 0.28461289626913955, + "grad_norm": 0.8641289472579956, + "learning_rate": 9.87992764991613e-06, + "loss": 0.8209, + "step": 5279 + }, + { + "epoch": 0.284666810437783, + "grad_norm": 0.7044229507446289, + "learning_rate": 9.879881463082238e-06, + "loss": 0.7403, + "step": 5280 + }, + { + "epoch": 0.28472072460642656, + "grad_norm": 0.7343770861625671, + "learning_rate": 9.879835267474975e-06, + "loss": 0.7428, + "step": 5281 + }, + { + "epoch": 0.2847746387750701, + "grad_norm": 0.7690380215644836, + "learning_rate": 9.879789063094429e-06, + "loss": 0.8236, + "step": 5282 + }, + { + "epoch": 0.2848285529437136, + "grad_norm": 0.7682362198829651, + "learning_rate": 9.879742849940679e-06, + "loss": 0.7854, + "step": 5283 + }, + { + "epoch": 0.28488246711235715, + "grad_norm": 0.7253369688987732, + "learning_rate": 9.87969662801381e-06, + "loss": 0.8281, + "step": 5284 + }, + { + "epoch": 
0.28493638128100063, + "grad_norm": 0.7726433277130127, + "learning_rate": 9.879650397313905e-06, + "loss": 0.7586, + "step": 5285 + }, + { + "epoch": 0.28499029544964416, + "grad_norm": 0.804685115814209, + "learning_rate": 9.879604157841044e-06, + "loss": 0.8654, + "step": 5286 + }, + { + "epoch": 0.2850442096182877, + "grad_norm": 0.7872894406318665, + "learning_rate": 9.879557909595316e-06, + "loss": 0.7907, + "step": 5287 + }, + { + "epoch": 0.2850981237869312, + "grad_norm": 0.7489103078842163, + "learning_rate": 9.879511652576801e-06, + "loss": 0.7459, + "step": 5288 + }, + { + "epoch": 0.2851520379555747, + "grad_norm": 0.8003327250480652, + "learning_rate": 9.879465386785581e-06, + "loss": 0.7579, + "step": 5289 + }, + { + "epoch": 0.28520595212421823, + "grad_norm": 0.7461791634559631, + "learning_rate": 9.879419112221741e-06, + "loss": 0.7757, + "step": 5290 + }, + { + "epoch": 0.28525986629286176, + "grad_norm": 0.7338587641716003, + "learning_rate": 9.879372828885364e-06, + "loss": 0.7835, + "step": 5291 + }, + { + "epoch": 0.2853137804615053, + "grad_norm": 0.7397693395614624, + "learning_rate": 9.87932653677653e-06, + "loss": 0.7713, + "step": 5292 + }, + { + "epoch": 0.2853676946301488, + "grad_norm": 0.8379868865013123, + "learning_rate": 9.879280235895327e-06, + "loss": 0.8882, + "step": 5293 + }, + { + "epoch": 0.2854216087987923, + "grad_norm": 0.7283885478973389, + "learning_rate": 9.879233926241836e-06, + "loss": 0.7085, + "step": 5294 + }, + { + "epoch": 0.28547552296743584, + "grad_norm": 0.915597140789032, + "learning_rate": 9.879187607816141e-06, + "loss": 0.853, + "step": 5295 + }, + { + "epoch": 0.28552943713607937, + "grad_norm": 0.7851650714874268, + "learning_rate": 9.879141280618325e-06, + "loss": 0.8858, + "step": 5296 + }, + { + "epoch": 0.2855833513047229, + "grad_norm": 0.7895732522010803, + "learning_rate": 9.879094944648468e-06, + "loss": 0.8603, + "step": 5297 + }, + { + "epoch": 0.2856372654733664, + "grad_norm": 
0.9263603687286377, + "learning_rate": 9.87904859990666e-06, + "loss": 0.8225, + "step": 5298 + }, + { + "epoch": 0.2856911796420099, + "grad_norm": 0.8861474990844727, + "learning_rate": 9.879002246392979e-06, + "loss": 0.7079, + "step": 5299 + }, + { + "epoch": 0.28574509381065344, + "grad_norm": 0.7643340229988098, + "learning_rate": 9.87895588410751e-06, + "loss": 0.7841, + "step": 5300 + }, + { + "epoch": 0.28579900797929697, + "grad_norm": 0.746583878993988, + "learning_rate": 9.878909513050337e-06, + "loss": 0.8013, + "step": 5301 + }, + { + "epoch": 0.2858529221479405, + "grad_norm": 0.7135025262832642, + "learning_rate": 9.878863133221542e-06, + "loss": 0.7171, + "step": 5302 + }, + { + "epoch": 0.285906836316584, + "grad_norm": 0.7493758201599121, + "learning_rate": 9.878816744621209e-06, + "loss": 0.8217, + "step": 5303 + }, + { + "epoch": 0.2859607504852275, + "grad_norm": 0.8908335566520691, + "learning_rate": 9.878770347249423e-06, + "loss": 0.8303, + "step": 5304 + }, + { + "epoch": 0.28601466465387104, + "grad_norm": 0.7408186793327332, + "learning_rate": 9.878723941106263e-06, + "loss": 0.7275, + "step": 5305 + }, + { + "epoch": 0.2860685788225146, + "grad_norm": 0.8047646880149841, + "learning_rate": 9.878677526191818e-06, + "loss": 0.6659, + "step": 5306 + }, + { + "epoch": 0.28612249299115805, + "grad_norm": 0.7265205979347229, + "learning_rate": 9.878631102506168e-06, + "loss": 0.7725, + "step": 5307 + }, + { + "epoch": 0.2861764071598016, + "grad_norm": 0.98882657289505, + "learning_rate": 9.878584670049398e-06, + "loss": 0.8552, + "step": 5308 + }, + { + "epoch": 0.2862303213284451, + "grad_norm": 0.8431620001792908, + "learning_rate": 9.878538228821588e-06, + "loss": 0.8504, + "step": 5309 + }, + { + "epoch": 0.28628423549708865, + "grad_norm": 0.920662522315979, + "learning_rate": 9.878491778822828e-06, + "loss": 0.8216, + "step": 5310 + }, + { + "epoch": 0.2863381496657322, + "grad_norm": 0.7579310536384583, + "learning_rate": 
9.878445320053195e-06, + "loss": 0.7501, + "step": 5311 + }, + { + "epoch": 0.28639206383437565, + "grad_norm": 0.7596756219863892, + "learning_rate": 9.878398852512776e-06, + "loss": 0.815, + "step": 5312 + }, + { + "epoch": 0.2864459780030192, + "grad_norm": 0.8128134608268738, + "learning_rate": 9.878352376201654e-06, + "loss": 0.7782, + "step": 5313 + }, + { + "epoch": 0.2864998921716627, + "grad_norm": 0.7208645939826965, + "learning_rate": 9.878305891119913e-06, + "loss": 0.7444, + "step": 5314 + }, + { + "epoch": 0.28655380634030625, + "grad_norm": 0.8024547100067139, + "learning_rate": 9.878259397267635e-06, + "loss": 0.818, + "step": 5315 + }, + { + "epoch": 0.2866077205089497, + "grad_norm": 0.8033369183540344, + "learning_rate": 9.878212894644904e-06, + "loss": 0.777, + "step": 5316 + }, + { + "epoch": 0.28666163467759326, + "grad_norm": 0.7594527006149292, + "learning_rate": 9.878166383251805e-06, + "loss": 0.7681, + "step": 5317 + }, + { + "epoch": 0.2867155488462368, + "grad_norm": 0.6697728037834167, + "learning_rate": 9.878119863088421e-06, + "loss": 0.724, + "step": 5318 + }, + { + "epoch": 0.2867694630148803, + "grad_norm": 0.7886657118797302, + "learning_rate": 9.878073334154835e-06, + "loss": 0.8544, + "step": 5319 + }, + { + "epoch": 0.28682337718352385, + "grad_norm": 0.7841383218765259, + "learning_rate": 9.878026796451132e-06, + "loss": 0.7671, + "step": 5320 + }, + { + "epoch": 0.28687729135216733, + "grad_norm": 0.766963005065918, + "learning_rate": 9.877980249977393e-06, + "loss": 0.7516, + "step": 5321 + }, + { + "epoch": 0.28693120552081086, + "grad_norm": 0.7714352607727051, + "learning_rate": 9.877933694733705e-06, + "loss": 0.7246, + "step": 5322 + }, + { + "epoch": 0.2869851196894544, + "grad_norm": 0.7595851421356201, + "learning_rate": 9.87788713072015e-06, + "loss": 0.7312, + "step": 5323 + }, + { + "epoch": 0.2870390338580979, + "grad_norm": 0.8249819278717041, + "learning_rate": 9.877840557936811e-06, + "loss": 0.6534, + 
"step": 5324 + }, + { + "epoch": 0.2870929480267414, + "grad_norm": 0.8358021974563599, + "learning_rate": 9.877793976383772e-06, + "loss": 0.7759, + "step": 5325 + }, + { + "epoch": 0.28714686219538493, + "grad_norm": 0.7542338371276855, + "learning_rate": 9.877747386061118e-06, + "loss": 0.7497, + "step": 5326 + }, + { + "epoch": 0.28720077636402846, + "grad_norm": 0.6970787644386292, + "learning_rate": 9.877700786968932e-06, + "loss": 0.6836, + "step": 5327 + }, + { + "epoch": 0.287254690532672, + "grad_norm": 0.7709139585494995, + "learning_rate": 9.877654179107298e-06, + "loss": 0.7574, + "step": 5328 + }, + { + "epoch": 0.28730860470131553, + "grad_norm": 0.7152370810508728, + "learning_rate": 9.877607562476299e-06, + "loss": 0.7974, + "step": 5329 + }, + { + "epoch": 0.287362518869959, + "grad_norm": 1.1318089962005615, + "learning_rate": 9.877560937076021e-06, + "loss": 0.7187, + "step": 5330 + }, + { + "epoch": 0.28741643303860254, + "grad_norm": 0.66380774974823, + "learning_rate": 9.877514302906546e-06, + "loss": 0.6937, + "step": 5331 + }, + { + "epoch": 0.28747034720724607, + "grad_norm": 0.8609433770179749, + "learning_rate": 9.877467659967957e-06, + "loss": 0.8743, + "step": 5332 + }, + { + "epoch": 0.2875242613758896, + "grad_norm": 0.7391762733459473, + "learning_rate": 9.87742100826034e-06, + "loss": 0.6951, + "step": 5333 + }, + { + "epoch": 0.2875781755445331, + "grad_norm": 0.7332816123962402, + "learning_rate": 9.877374347783776e-06, + "loss": 0.7715, + "step": 5334 + }, + { + "epoch": 0.2876320897131766, + "grad_norm": 0.7669941782951355, + "learning_rate": 9.877327678538351e-06, + "loss": 0.7943, + "step": 5335 + }, + { + "epoch": 0.28768600388182014, + "grad_norm": 0.9585753679275513, + "learning_rate": 9.87728100052415e-06, + "loss": 0.8018, + "step": 5336 + }, + { + "epoch": 0.28773991805046367, + "grad_norm": 0.7633230686187744, + "learning_rate": 9.877234313741255e-06, + "loss": 0.7118, + "step": 5337 + }, + { + "epoch": 
0.2877938322191072, + "grad_norm": 0.7662307620048523, + "learning_rate": 9.877187618189751e-06, + "loss": 0.8159, + "step": 5338 + }, + { + "epoch": 0.2878477463877507, + "grad_norm": 0.8725135922431946, + "learning_rate": 9.877140913869722e-06, + "loss": 0.7314, + "step": 5339 + }, + { + "epoch": 0.2879016605563942, + "grad_norm": 0.7815779447555542, + "learning_rate": 9.87709420078125e-06, + "loss": 0.7562, + "step": 5340 + }, + { + "epoch": 0.28795557472503774, + "grad_norm": 0.7647536396980286, + "learning_rate": 9.877047478924421e-06, + "loss": 0.7642, + "step": 5341 + }, + { + "epoch": 0.2880094888936813, + "grad_norm": 0.7150182723999023, + "learning_rate": 9.87700074829932e-06, + "loss": 0.7773, + "step": 5342 + }, + { + "epoch": 0.28806340306232475, + "grad_norm": 0.7187753915786743, + "learning_rate": 9.876954008906026e-06, + "loss": 0.7776, + "step": 5343 + }, + { + "epoch": 0.2881173172309683, + "grad_norm": 0.7617197036743164, + "learning_rate": 9.876907260744628e-06, + "loss": 0.8818, + "step": 5344 + }, + { + "epoch": 0.2881712313996118, + "grad_norm": 0.7334546446800232, + "learning_rate": 9.876860503815208e-06, + "loss": 0.8162, + "step": 5345 + }, + { + "epoch": 0.28822514556825535, + "grad_norm": 0.8149188756942749, + "learning_rate": 9.876813738117852e-06, + "loss": 0.7801, + "step": 5346 + }, + { + "epoch": 0.2882790597368989, + "grad_norm": 0.8440023064613342, + "learning_rate": 9.876766963652642e-06, + "loss": 0.8394, + "step": 5347 + }, + { + "epoch": 0.28833297390554236, + "grad_norm": 0.7138864994049072, + "learning_rate": 9.876720180419664e-06, + "loss": 0.7316, + "step": 5348 + }, + { + "epoch": 0.2883868880741859, + "grad_norm": 0.7690035104751587, + "learning_rate": 9.876673388418999e-06, + "loss": 0.6458, + "step": 5349 + }, + { + "epoch": 0.2884408022428294, + "grad_norm": 0.844340980052948, + "learning_rate": 9.876626587650733e-06, + "loss": 0.8192, + "step": 5350 + }, + { + "epoch": 0.28849471641147295, + "grad_norm": 
0.7028863430023193, + "learning_rate": 9.87657977811495e-06, + "loss": 0.7469, + "step": 5351 + }, + { + "epoch": 0.2885486305801164, + "grad_norm": 0.6825146079063416, + "learning_rate": 9.876532959811735e-06, + "loss": 0.6672, + "step": 5352 + }, + { + "epoch": 0.28860254474875996, + "grad_norm": 0.928514838218689, + "learning_rate": 9.876486132741172e-06, + "loss": 0.9185, + "step": 5353 + }, + { + "epoch": 0.2886564589174035, + "grad_norm": 0.9195801615715027, + "learning_rate": 9.876439296903345e-06, + "loss": 0.887, + "step": 5354 + }, + { + "epoch": 0.288710373086047, + "grad_norm": 0.8025040030479431, + "learning_rate": 9.876392452298335e-06, + "loss": 0.7647, + "step": 5355 + }, + { + "epoch": 0.28876428725469055, + "grad_norm": 0.6811031699180603, + "learning_rate": 9.876345598926232e-06, + "loss": 0.7118, + "step": 5356 + }, + { + "epoch": 0.28881820142333403, + "grad_norm": 0.7687453031539917, + "learning_rate": 9.876298736787115e-06, + "loss": 0.8349, + "step": 5357 + }, + { + "epoch": 0.28887211559197756, + "grad_norm": 0.7131432890892029, + "learning_rate": 9.876251865881072e-06, + "loss": 0.7868, + "step": 5358 + }, + { + "epoch": 0.2889260297606211, + "grad_norm": 0.8985068202018738, + "learning_rate": 9.876204986208185e-06, + "loss": 0.8927, + "step": 5359 + }, + { + "epoch": 0.2889799439292646, + "grad_norm": 0.8284032344818115, + "learning_rate": 9.87615809776854e-06, + "loss": 0.9579, + "step": 5360 + }, + { + "epoch": 0.2890338580979081, + "grad_norm": 0.7818793058395386, + "learning_rate": 9.87611120056222e-06, + "loss": 0.8718, + "step": 5361 + }, + { + "epoch": 0.28908777226655163, + "grad_norm": 0.7686202526092529, + "learning_rate": 9.87606429458931e-06, + "loss": 0.7685, + "step": 5362 + }, + { + "epoch": 0.28914168643519517, + "grad_norm": 0.768067479133606, + "learning_rate": 9.876017379849892e-06, + "loss": 0.7785, + "step": 5363 + }, + { + "epoch": 0.2891956006038387, + "grad_norm": 0.846842885017395, + "learning_rate": 
9.875970456344055e-06, + "loss": 0.7418, + "step": 5364 + }, + { + "epoch": 0.28924951477248223, + "grad_norm": 0.800483226776123, + "learning_rate": 9.87592352407188e-06, + "loss": 0.8441, + "step": 5365 + }, + { + "epoch": 0.2893034289411257, + "grad_norm": 0.8230191469192505, + "learning_rate": 9.875876583033451e-06, + "loss": 0.8538, + "step": 5366 + }, + { + "epoch": 0.28935734310976924, + "grad_norm": 0.7700148224830627, + "learning_rate": 9.875829633228855e-06, + "loss": 0.7969, + "step": 5367 + }, + { + "epoch": 0.28941125727841277, + "grad_norm": 0.8188271522521973, + "learning_rate": 9.875782674658173e-06, + "loss": 0.8411, + "step": 5368 + }, + { + "epoch": 0.2894651714470563, + "grad_norm": 0.8774964809417725, + "learning_rate": 9.875735707321495e-06, + "loss": 0.9097, + "step": 5369 + }, + { + "epoch": 0.2895190856156998, + "grad_norm": 0.6922599077224731, + "learning_rate": 9.875688731218898e-06, + "loss": 0.7647, + "step": 5370 + }, + { + "epoch": 0.2895729997843433, + "grad_norm": 0.8296899795532227, + "learning_rate": 9.875641746350472e-06, + "loss": 0.8133, + "step": 5371 + }, + { + "epoch": 0.28962691395298684, + "grad_norm": 0.9972916841506958, + "learning_rate": 9.8755947527163e-06, + "loss": 0.9084, + "step": 5372 + }, + { + "epoch": 0.2896808281216304, + "grad_norm": 0.6791282892227173, + "learning_rate": 9.875547750316465e-06, + "loss": 0.6742, + "step": 5373 + }, + { + "epoch": 0.2897347422902739, + "grad_norm": 0.7278220057487488, + "learning_rate": 9.875500739151054e-06, + "loss": 0.7947, + "step": 5374 + }, + { + "epoch": 0.2897886564589174, + "grad_norm": 0.7634933590888977, + "learning_rate": 9.87545371922015e-06, + "loss": 0.8535, + "step": 5375 + }, + { + "epoch": 0.2898425706275609, + "grad_norm": 0.8038228750228882, + "learning_rate": 9.875406690523837e-06, + "loss": 0.8205, + "step": 5376 + }, + { + "epoch": 0.28989648479620445, + "grad_norm": 0.7821580767631531, + "learning_rate": 9.8753596530622e-06, + "loss": 0.7765, + "step": 
5377 + }, + { + "epoch": 0.289950398964848, + "grad_norm": 0.7491927742958069, + "learning_rate": 9.875312606835325e-06, + "loss": 0.7238, + "step": 5378 + }, + { + "epoch": 0.2900043131334915, + "grad_norm": 0.8357378840446472, + "learning_rate": 9.875265551843294e-06, + "loss": 0.8244, + "step": 5379 + }, + { + "epoch": 0.290058227302135, + "grad_norm": 0.792351484298706, + "learning_rate": 9.875218488086194e-06, + "loss": 0.7871, + "step": 5380 + }, + { + "epoch": 0.2901121414707785, + "grad_norm": 0.7484980225563049, + "learning_rate": 9.875171415564109e-06, + "loss": 0.7487, + "step": 5381 + }, + { + "epoch": 0.29016605563942205, + "grad_norm": 0.8140117526054382, + "learning_rate": 9.875124334277123e-06, + "loss": 0.7895, + "step": 5382 + }, + { + "epoch": 0.2902199698080656, + "grad_norm": 0.7369776964187622, + "learning_rate": 9.875077244225322e-06, + "loss": 0.7785, + "step": 5383 + }, + { + "epoch": 0.29027388397670906, + "grad_norm": 0.8499336242675781, + "learning_rate": 9.875030145408789e-06, + "loss": 0.8289, + "step": 5384 + }, + { + "epoch": 0.2903277981453526, + "grad_norm": 0.7209733724594116, + "learning_rate": 9.874983037827608e-06, + "loss": 0.6624, + "step": 5385 + }, + { + "epoch": 0.2903817123139961, + "grad_norm": 0.8489585518836975, + "learning_rate": 9.874935921481865e-06, + "loss": 0.8074, + "step": 5386 + }, + { + "epoch": 0.29043562648263965, + "grad_norm": 0.7765734195709229, + "learning_rate": 9.874888796371647e-06, + "loss": 0.7899, + "step": 5387 + }, + { + "epoch": 0.2904895406512832, + "grad_norm": 0.7301489114761353, + "learning_rate": 9.874841662497034e-06, + "loss": 0.6868, + "step": 5388 + }, + { + "epoch": 0.29054345481992666, + "grad_norm": 0.7872721552848816, + "learning_rate": 9.874794519858114e-06, + "loss": 0.8456, + "step": 5389 + }, + { + "epoch": 0.2905973689885702, + "grad_norm": 0.7796556949615479, + "learning_rate": 9.87474736845497e-06, + "loss": 0.7338, + "step": 5390 + }, + { + "epoch": 0.2906512831572137, + 
"grad_norm": 0.7958070635795593, + "learning_rate": 9.874700208287691e-06, + "loss": 0.773, + "step": 5391 + }, + { + "epoch": 0.29070519732585726, + "grad_norm": 0.8552476167678833, + "learning_rate": 9.874653039356356e-06, + "loss": 0.772, + "step": 5392 + }, + { + "epoch": 0.29075911149450073, + "grad_norm": 0.7346936464309692, + "learning_rate": 9.874605861661051e-06, + "loss": 0.7714, + "step": 5393 + }, + { + "epoch": 0.29081302566314426, + "grad_norm": 0.804050862789154, + "learning_rate": 9.874558675201864e-06, + "loss": 0.8539, + "step": 5394 + }, + { + "epoch": 0.2908669398317878, + "grad_norm": 0.7373083233833313, + "learning_rate": 9.874511479978879e-06, + "loss": 0.7483, + "step": 5395 + }, + { + "epoch": 0.29092085400043133, + "grad_norm": 0.8145542740821838, + "learning_rate": 9.874464275992177e-06, + "loss": 0.7697, + "step": 5396 + }, + { + "epoch": 0.29097476816907486, + "grad_norm": 0.6865667700767517, + "learning_rate": 9.874417063241848e-06, + "loss": 0.771, + "step": 5397 + }, + { + "epoch": 0.29102868233771834, + "grad_norm": 0.7204734086990356, + "learning_rate": 9.874369841727973e-06, + "loss": 0.7562, + "step": 5398 + }, + { + "epoch": 0.29108259650636187, + "grad_norm": 0.8261793851852417, + "learning_rate": 9.87432261145064e-06, + "loss": 0.8245, + "step": 5399 + }, + { + "epoch": 0.2911365106750054, + "grad_norm": 0.7563614845275879, + "learning_rate": 9.87427537240993e-06, + "loss": 0.8051, + "step": 5400 + }, + { + "epoch": 0.29119042484364893, + "grad_norm": 0.7967458367347717, + "learning_rate": 9.874228124605932e-06, + "loss": 0.8236, + "step": 5401 + }, + { + "epoch": 0.2912443390122924, + "grad_norm": 0.806373119354248, + "learning_rate": 9.874180868038729e-06, + "loss": 0.8202, + "step": 5402 + }, + { + "epoch": 0.29129825318093594, + "grad_norm": 0.6726234555244446, + "learning_rate": 9.874133602708406e-06, + "loss": 0.7128, + "step": 5403 + }, + { + "epoch": 0.29135216734957947, + "grad_norm": 0.7642708420753479, + 
"learning_rate": 9.874086328615047e-06, + "loss": 0.8134, + "step": 5404 + }, + { + "epoch": 0.291406081518223, + "grad_norm": 0.6992095708847046, + "learning_rate": 9.874039045758742e-06, + "loss": 0.6887, + "step": 5405 + }, + { + "epoch": 0.29145999568686654, + "grad_norm": 0.7869388461112976, + "learning_rate": 9.873991754139567e-06, + "loss": 0.8069, + "step": 5406 + }, + { + "epoch": 0.29151390985551, + "grad_norm": 0.7390547394752502, + "learning_rate": 9.873944453757616e-06, + "loss": 0.7591, + "step": 5407 + }, + { + "epoch": 0.29156782402415354, + "grad_norm": 0.6705611348152161, + "learning_rate": 9.873897144612968e-06, + "loss": 0.7474, + "step": 5408 + }, + { + "epoch": 0.2916217381927971, + "grad_norm": 0.7684745788574219, + "learning_rate": 9.873849826705711e-06, + "loss": 0.7477, + "step": 5409 + }, + { + "epoch": 0.2916756523614406, + "grad_norm": 0.7341989278793335, + "learning_rate": 9.87380250003593e-06, + "loss": 0.7634, + "step": 5410 + }, + { + "epoch": 0.2917295665300841, + "grad_norm": 0.7358923554420471, + "learning_rate": 9.873755164603708e-06, + "loss": 0.7, + "step": 5411 + }, + { + "epoch": 0.2917834806987276, + "grad_norm": 0.8319085836410522, + "learning_rate": 9.873707820409132e-06, + "loss": 0.859, + "step": 5412 + }, + { + "epoch": 0.29183739486737115, + "grad_norm": 0.8299946188926697, + "learning_rate": 9.873660467452288e-06, + "loss": 0.9912, + "step": 5413 + }, + { + "epoch": 0.2918913090360147, + "grad_norm": 0.7632084488868713, + "learning_rate": 9.87361310573326e-06, + "loss": 0.7579, + "step": 5414 + }, + { + "epoch": 0.2919452232046582, + "grad_norm": 0.8068237900733948, + "learning_rate": 9.873565735252131e-06, + "loss": 0.7249, + "step": 5415 + }, + { + "epoch": 0.2919991373733017, + "grad_norm": 0.8328914046287537, + "learning_rate": 9.873518356008988e-06, + "loss": 0.7903, + "step": 5416 + }, + { + "epoch": 0.2920530515419452, + "grad_norm": 0.7877300977706909, + "learning_rate": 9.873470968003917e-06, + "loss": 
0.8328, + "step": 5417 + }, + { + "epoch": 0.29210696571058875, + "grad_norm": 0.7755314111709595, + "learning_rate": 9.873423571237004e-06, + "loss": 0.6584, + "step": 5418 + }, + { + "epoch": 0.2921608798792323, + "grad_norm": 0.8157472014427185, + "learning_rate": 9.873376165708332e-06, + "loss": 0.6761, + "step": 5419 + }, + { + "epoch": 0.29221479404787576, + "grad_norm": 0.7559711933135986, + "learning_rate": 9.873328751417985e-06, + "loss": 0.8345, + "step": 5420 + }, + { + "epoch": 0.2922687082165193, + "grad_norm": 0.8466331958770752, + "learning_rate": 9.873281328366053e-06, + "loss": 0.7568, + "step": 5421 + }, + { + "epoch": 0.2923226223851628, + "grad_norm": 0.7468219995498657, + "learning_rate": 9.873233896552617e-06, + "loss": 0.7857, + "step": 5422 + }, + { + "epoch": 0.29237653655380635, + "grad_norm": 0.7857210040092468, + "learning_rate": 9.873186455977763e-06, + "loss": 0.7557, + "step": 5423 + }, + { + "epoch": 0.2924304507224499, + "grad_norm": 0.7680637240409851, + "learning_rate": 9.873139006641577e-06, + "loss": 0.7225, + "step": 5424 + }, + { + "epoch": 0.29248436489109336, + "grad_norm": 0.7393225431442261, + "learning_rate": 9.873091548544146e-06, + "loss": 0.7978, + "step": 5425 + }, + { + "epoch": 0.2925382790597369, + "grad_norm": 0.8140562176704407, + "learning_rate": 9.873044081685552e-06, + "loss": 0.8496, + "step": 5426 + }, + { + "epoch": 0.2925921932283804, + "grad_norm": 0.7890025973320007, + "learning_rate": 9.872996606065883e-06, + "loss": 0.7475, + "step": 5427 + }, + { + "epoch": 0.29264610739702396, + "grad_norm": 0.8253166079521179, + "learning_rate": 9.872949121685223e-06, + "loss": 0.8336, + "step": 5428 + }, + { + "epoch": 0.29270002156566743, + "grad_norm": 0.9723641276359558, + "learning_rate": 9.872901628543657e-06, + "loss": 0.83, + "step": 5429 + }, + { + "epoch": 0.29275393573431097, + "grad_norm": 0.884645938873291, + "learning_rate": 9.87285412664127e-06, + "loss": 0.8324, + "step": 5430 + }, + { + "epoch": 
0.2928078499029545, + "grad_norm": 0.7741670608520508, + "learning_rate": 9.872806615978152e-06, + "loss": 0.8724, + "step": 5431 + }, + { + "epoch": 0.29286176407159803, + "grad_norm": 0.6959695219993591, + "learning_rate": 9.872759096554383e-06, + "loss": 0.657, + "step": 5432 + }, + { + "epoch": 0.29291567824024156, + "grad_norm": 0.7823370695114136, + "learning_rate": 9.872711568370051e-06, + "loss": 0.7939, + "step": 5433 + }, + { + "epoch": 0.29296959240888504, + "grad_norm": 0.7705811858177185, + "learning_rate": 9.87266403142524e-06, + "loss": 0.7604, + "step": 5434 + }, + { + "epoch": 0.29302350657752857, + "grad_norm": 0.7560339570045471, + "learning_rate": 9.872616485720037e-06, + "loss": 0.7303, + "step": 5435 + }, + { + "epoch": 0.2930774207461721, + "grad_norm": 0.7380449771881104, + "learning_rate": 9.872568931254524e-06, + "loss": 0.6181, + "step": 5436 + }, + { + "epoch": 0.29313133491481563, + "grad_norm": 0.743810772895813, + "learning_rate": 9.872521368028794e-06, + "loss": 0.8403, + "step": 5437 + }, + { + "epoch": 0.2931852490834591, + "grad_norm": 0.7859793901443481, + "learning_rate": 9.872473796042924e-06, + "loss": 0.8448, + "step": 5438 + }, + { + "epoch": 0.29323916325210264, + "grad_norm": 0.7643007040023804, + "learning_rate": 9.872426215297003e-06, + "loss": 0.757, + "step": 5439 + }, + { + "epoch": 0.2932930774207462, + "grad_norm": 0.7227921485900879, + "learning_rate": 9.87237862579112e-06, + "loss": 0.7882, + "step": 5440 + }, + { + "epoch": 0.2933469915893897, + "grad_norm": 0.7416848540306091, + "learning_rate": 9.872331027525356e-06, + "loss": 0.7644, + "step": 5441 + }, + { + "epoch": 0.29340090575803324, + "grad_norm": 0.7258424758911133, + "learning_rate": 9.872283420499797e-06, + "loss": 0.6828, + "step": 5442 + }, + { + "epoch": 0.2934548199266767, + "grad_norm": 0.7854428291320801, + "learning_rate": 9.87223580471453e-06, + "loss": 0.795, + "step": 5443 + }, + { + "epoch": 0.29350873409532025, + "grad_norm": 
0.7590177655220032, + "learning_rate": 9.87218818016964e-06, + "loss": 0.7798, + "step": 5444 + }, + { + "epoch": 0.2935626482639638, + "grad_norm": 0.7291384339332581, + "learning_rate": 9.872140546865212e-06, + "loss": 0.7249, + "step": 5445 + }, + { + "epoch": 0.2936165624326073, + "grad_norm": 0.8444628119468689, + "learning_rate": 9.872092904801334e-06, + "loss": 0.824, + "step": 5446 + }, + { + "epoch": 0.2936704766012508, + "grad_norm": 0.7586516737937927, + "learning_rate": 9.87204525397809e-06, + "loss": 0.7716, + "step": 5447 + }, + { + "epoch": 0.2937243907698943, + "grad_norm": 0.7367489337921143, + "learning_rate": 9.871997594395565e-06, + "loss": 0.6108, + "step": 5448 + }, + { + "epoch": 0.29377830493853785, + "grad_norm": 0.8746148347854614, + "learning_rate": 9.871949926053845e-06, + "loss": 0.841, + "step": 5449 + }, + { + "epoch": 0.2938322191071814, + "grad_norm": 0.8738248944282532, + "learning_rate": 9.871902248953017e-06, + "loss": 0.7911, + "step": 5450 + }, + { + "epoch": 0.2938861332758249, + "grad_norm": 0.8541892766952515, + "learning_rate": 9.871854563093167e-06, + "loss": 0.8283, + "step": 5451 + }, + { + "epoch": 0.2939400474444684, + "grad_norm": 0.7325894832611084, + "learning_rate": 9.871806868474376e-06, + "loss": 0.6988, + "step": 5452 + }, + { + "epoch": 0.2939939616131119, + "grad_norm": 0.730920135974884, + "learning_rate": 9.871759165096735e-06, + "loss": 0.7696, + "step": 5453 + }, + { + "epoch": 0.29404787578175545, + "grad_norm": 0.8190314173698425, + "learning_rate": 9.871711452960329e-06, + "loss": 0.8021, + "step": 5454 + }, + { + "epoch": 0.294101789950399, + "grad_norm": 0.7794191241264343, + "learning_rate": 9.871663732065243e-06, + "loss": 0.7141, + "step": 5455 + }, + { + "epoch": 0.29415570411904246, + "grad_norm": 0.729831874370575, + "learning_rate": 9.871616002411561e-06, + "loss": 0.7142, + "step": 5456 + }, + { + "epoch": 0.294209618287686, + "grad_norm": 0.8393380641937256, + "learning_rate": 
9.871568263999371e-06, + "loss": 0.8494, + "step": 5457 + }, + { + "epoch": 0.2942635324563295, + "grad_norm": 0.7556251883506775, + "learning_rate": 9.87152051682876e-06, + "loss": 0.6954, + "step": 5458 + }, + { + "epoch": 0.29431744662497306, + "grad_norm": 0.7716967463493347, + "learning_rate": 9.87147276089981e-06, + "loss": 0.7139, + "step": 5459 + }, + { + "epoch": 0.2943713607936166, + "grad_norm": 0.7605961561203003, + "learning_rate": 9.871424996212611e-06, + "loss": 0.7788, + "step": 5460 + }, + { + "epoch": 0.29442527496226006, + "grad_norm": 0.7812150716781616, + "learning_rate": 9.871377222767245e-06, + "loss": 0.8462, + "step": 5461 + }, + { + "epoch": 0.2944791891309036, + "grad_norm": 0.7436057925224304, + "learning_rate": 9.8713294405638e-06, + "loss": 0.859, + "step": 5462 + }, + { + "epoch": 0.2945331032995471, + "grad_norm": 0.8104838132858276, + "learning_rate": 9.871281649602362e-06, + "loss": 0.8203, + "step": 5463 + }, + { + "epoch": 0.29458701746819066, + "grad_norm": 0.730912446975708, + "learning_rate": 9.871233849883018e-06, + "loss": 0.8419, + "step": 5464 + }, + { + "epoch": 0.29464093163683414, + "grad_norm": 0.7726290822029114, + "learning_rate": 9.871186041405852e-06, + "loss": 0.8276, + "step": 5465 + }, + { + "epoch": 0.29469484580547767, + "grad_norm": 0.7509479522705078, + "learning_rate": 9.871138224170949e-06, + "loss": 0.656, + "step": 5466 + }, + { + "epoch": 0.2947487599741212, + "grad_norm": 1.2936142683029175, + "learning_rate": 9.871090398178396e-06, + "loss": 0.7648, + "step": 5467 + }, + { + "epoch": 0.29480267414276473, + "grad_norm": 0.7731900215148926, + "learning_rate": 9.87104256342828e-06, + "loss": 0.8216, + "step": 5468 + }, + { + "epoch": 0.29485658831140826, + "grad_norm": 0.7106019258499146, + "learning_rate": 9.870994719920688e-06, + "loss": 0.6923, + "step": 5469 + }, + { + "epoch": 0.29491050248005174, + "grad_norm": 0.7590166926383972, + "learning_rate": 9.870946867655704e-06, + "loss": 0.7469, + 
"step": 5470 + }, + { + "epoch": 0.29496441664869527, + "grad_norm": 0.7591565847396851, + "learning_rate": 9.870899006633414e-06, + "loss": 0.8032, + "step": 5471 + }, + { + "epoch": 0.2950183308173388, + "grad_norm": 0.9401304125785828, + "learning_rate": 9.870851136853904e-06, + "loss": 0.7261, + "step": 5472 + }, + { + "epoch": 0.29507224498598233, + "grad_norm": 0.7991933822631836, + "learning_rate": 9.870803258317261e-06, + "loss": 0.7226, + "step": 5473 + }, + { + "epoch": 0.2951261591546258, + "grad_norm": 0.7324903011322021, + "learning_rate": 9.87075537102357e-06, + "loss": 0.7444, + "step": 5474 + }, + { + "epoch": 0.29518007332326934, + "grad_norm": 0.7185311317443848, + "learning_rate": 9.87070747497292e-06, + "loss": 0.7125, + "step": 5475 + }, + { + "epoch": 0.2952339874919129, + "grad_norm": 0.750343382358551, + "learning_rate": 9.870659570165393e-06, + "loss": 0.7458, + "step": 5476 + }, + { + "epoch": 0.2952879016605564, + "grad_norm": 0.8604345917701721, + "learning_rate": 9.870611656601077e-06, + "loss": 0.7445, + "step": 5477 + }, + { + "epoch": 0.29534181582919994, + "grad_norm": 0.7870331406593323, + "learning_rate": 9.870563734280059e-06, + "loss": 0.7116, + "step": 5478 + }, + { + "epoch": 0.2953957299978434, + "grad_norm": 0.6978838443756104, + "learning_rate": 9.870515803202424e-06, + "loss": 0.7563, + "step": 5479 + }, + { + "epoch": 0.29544964416648695, + "grad_norm": 1.5832971334457397, + "learning_rate": 9.870467863368258e-06, + "loss": 0.7095, + "step": 5480 + }, + { + "epoch": 0.2955035583351305, + "grad_norm": 0.7247046828269958, + "learning_rate": 9.870419914777646e-06, + "loss": 0.7586, + "step": 5481 + }, + { + "epoch": 0.295557472503774, + "grad_norm": 0.7100489735603333, + "learning_rate": 9.87037195743068e-06, + "loss": 0.804, + "step": 5482 + }, + { + "epoch": 0.2956113866724175, + "grad_norm": 0.78151935338974, + "learning_rate": 9.87032399132744e-06, + "loss": 0.7443, + "step": 5483 + }, + { + "epoch": 0.295665300841061, + 
"grad_norm": 0.7440445423126221, + "learning_rate": 9.870276016468013e-06, + "loss": 0.7476, + "step": 5484 + }, + { + "epoch": 0.29571921500970455, + "grad_norm": 0.7003461718559265, + "learning_rate": 9.870228032852489e-06, + "loss": 0.7401, + "step": 5485 + }, + { + "epoch": 0.2957731291783481, + "grad_norm": 0.7338505387306213, + "learning_rate": 9.87018004048095e-06, + "loss": 0.7757, + "step": 5486 + }, + { + "epoch": 0.2958270433469916, + "grad_norm": 0.7721376419067383, + "learning_rate": 9.870132039353484e-06, + "loss": 0.8646, + "step": 5487 + }, + { + "epoch": 0.2958809575156351, + "grad_norm": 0.7995434999465942, + "learning_rate": 9.870084029470179e-06, + "loss": 0.7917, + "step": 5488 + }, + { + "epoch": 0.2959348716842786, + "grad_norm": 0.8954901099205017, + "learning_rate": 9.87003601083112e-06, + "loss": 0.8308, + "step": 5489 + }, + { + "epoch": 0.29598878585292215, + "grad_norm": 0.7231770753860474, + "learning_rate": 9.86998798343639e-06, + "loss": 0.8126, + "step": 5490 + }, + { + "epoch": 0.2960427000215657, + "grad_norm": 0.8772289752960205, + "learning_rate": 9.869939947286081e-06, + "loss": 0.8513, + "step": 5491 + }, + { + "epoch": 0.29609661419020916, + "grad_norm": 0.726995050907135, + "learning_rate": 9.869891902380276e-06, + "loss": 0.6717, + "step": 5492 + }, + { + "epoch": 0.2961505283588527, + "grad_norm": 0.7519280910491943, + "learning_rate": 9.869843848719062e-06, + "loss": 0.7634, + "step": 5493 + }, + { + "epoch": 0.2962044425274962, + "grad_norm": 0.8302793502807617, + "learning_rate": 9.869795786302528e-06, + "loss": 0.845, + "step": 5494 + }, + { + "epoch": 0.29625835669613976, + "grad_norm": 0.9483422636985779, + "learning_rate": 9.869747715130756e-06, + "loss": 0.8187, + "step": 5495 + }, + { + "epoch": 0.2963122708647833, + "grad_norm": 0.808182418346405, + "learning_rate": 9.869699635203833e-06, + "loss": 0.8221, + "step": 5496 + }, + { + "epoch": 0.29636618503342677, + "grad_norm": 0.8152076601982117, + 
"learning_rate": 9.869651546521848e-06, + "loss": 0.8683, + "step": 5497 + }, + { + "epoch": 0.2964200992020703, + "grad_norm": 0.9072142243385315, + "learning_rate": 9.869603449084886e-06, + "loss": 0.8853, + "step": 5498 + }, + { + "epoch": 0.29647401337071383, + "grad_norm": 0.7798082828521729, + "learning_rate": 9.869555342893035e-06, + "loss": 0.84, + "step": 5499 + }, + { + "epoch": 0.29652792753935736, + "grad_norm": 0.7505926489830017, + "learning_rate": 9.869507227946378e-06, + "loss": 0.748, + "step": 5500 + }, + { + "epoch": 0.29658184170800084, + "grad_norm": 0.7643826007843018, + "learning_rate": 9.869459104245006e-06, + "loss": 0.7423, + "step": 5501 + }, + { + "epoch": 0.29663575587664437, + "grad_norm": 0.8993945717811584, + "learning_rate": 9.869410971789003e-06, + "loss": 0.7736, + "step": 5502 + }, + { + "epoch": 0.2966896700452879, + "grad_norm": 0.8132869005203247, + "learning_rate": 9.869362830578455e-06, + "loss": 0.8926, + "step": 5503 + }, + { + "epoch": 0.29674358421393143, + "grad_norm": 0.7741131782531738, + "learning_rate": 9.869314680613449e-06, + "loss": 0.7087, + "step": 5504 + }, + { + "epoch": 0.29679749838257496, + "grad_norm": 0.83815598487854, + "learning_rate": 9.869266521894073e-06, + "loss": 0.808, + "step": 5505 + }, + { + "epoch": 0.29685141255121844, + "grad_norm": 0.7051485180854797, + "learning_rate": 9.869218354420413e-06, + "loss": 0.695, + "step": 5506 + }, + { + "epoch": 0.296905326719862, + "grad_norm": 0.7514739036560059, + "learning_rate": 9.869170178192554e-06, + "loss": 0.7496, + "step": 5507 + }, + { + "epoch": 0.2969592408885055, + "grad_norm": 0.8005251288414001, + "learning_rate": 9.869121993210582e-06, + "loss": 0.8144, + "step": 5508 + }, + { + "epoch": 0.29701315505714904, + "grad_norm": 0.7894544005393982, + "learning_rate": 9.86907379947459e-06, + "loss": 0.7345, + "step": 5509 + }, + { + "epoch": 0.2970670692257925, + "grad_norm": 0.7498524785041809, + "learning_rate": 9.869025596984655e-06, + "loss": 
0.6906, + "step": 5510 + }, + { + "epoch": 0.29712098339443604, + "grad_norm": 0.7346488237380981, + "learning_rate": 9.868977385740873e-06, + "loss": 0.7512, + "step": 5511 + }, + { + "epoch": 0.2971748975630796, + "grad_norm": 0.8185198307037354, + "learning_rate": 9.868929165743323e-06, + "loss": 0.847, + "step": 5512 + }, + { + "epoch": 0.2972288117317231, + "grad_norm": 0.7798783183097839, + "learning_rate": 9.868880936992095e-06, + "loss": 0.8442, + "step": 5513 + }, + { + "epoch": 0.29728272590036664, + "grad_norm": 0.862074077129364, + "learning_rate": 9.868832699487279e-06, + "loss": 0.8357, + "step": 5514 + }, + { + "epoch": 0.2973366400690101, + "grad_norm": 0.7395896911621094, + "learning_rate": 9.868784453228957e-06, + "loss": 0.7449, + "step": 5515 + }, + { + "epoch": 0.29739055423765365, + "grad_norm": 0.7291044592857361, + "learning_rate": 9.868736198217215e-06, + "loss": 0.7686, + "step": 5516 + }, + { + "epoch": 0.2974444684062972, + "grad_norm": 1.070936918258667, + "learning_rate": 9.868687934452143e-06, + "loss": 0.8639, + "step": 5517 + }, + { + "epoch": 0.2974983825749407, + "grad_norm": 0.7176975607872009, + "learning_rate": 9.868639661933828e-06, + "loss": 0.7579, + "step": 5518 + }, + { + "epoch": 0.2975522967435842, + "grad_norm": 0.7830207943916321, + "learning_rate": 9.868591380662356e-06, + "loss": 0.7744, + "step": 5519 + }, + { + "epoch": 0.2976062109122277, + "grad_norm": 1.0292960405349731, + "learning_rate": 9.868543090637812e-06, + "loss": 0.6333, + "step": 5520 + }, + { + "epoch": 0.29766012508087125, + "grad_norm": 0.7741127014160156, + "learning_rate": 9.868494791860285e-06, + "loss": 0.7859, + "step": 5521 + }, + { + "epoch": 0.2977140392495148, + "grad_norm": 0.8201294541358948, + "learning_rate": 9.86844648432986e-06, + "loss": 0.8045, + "step": 5522 + }, + { + "epoch": 0.2977679534181583, + "grad_norm": 0.7732555866241455, + "learning_rate": 9.868398168046625e-06, + "loss": 0.7253, + "step": 5523 + }, + { + "epoch": 
0.2978218675868018, + "grad_norm": 0.727921724319458, + "learning_rate": 9.868349843010668e-06, + "loss": 0.7155, + "step": 5524 + }, + { + "epoch": 0.2978757817554453, + "grad_norm": 0.7359254360198975, + "learning_rate": 9.868301509222072e-06, + "loss": 0.894, + "step": 5525 + }, + { + "epoch": 0.29792969592408886, + "grad_norm": 0.8356531858444214, + "learning_rate": 9.868253166680927e-06, + "loss": 0.7506, + "step": 5526 + }, + { + "epoch": 0.2979836100927324, + "grad_norm": 0.8150777816772461, + "learning_rate": 9.868204815387321e-06, + "loss": 0.7737, + "step": 5527 + }, + { + "epoch": 0.29803752426137586, + "grad_norm": 0.7688710689544678, + "learning_rate": 9.86815645534134e-06, + "loss": 0.7656, + "step": 5528 + }, + { + "epoch": 0.2980914384300194, + "grad_norm": 0.7309591174125671, + "learning_rate": 9.868108086543069e-06, + "loss": 0.7655, + "step": 5529 + }, + { + "epoch": 0.2981453525986629, + "grad_norm": 0.9307131767272949, + "learning_rate": 9.868059708992595e-06, + "loss": 0.742, + "step": 5530 + }, + { + "epoch": 0.29819926676730646, + "grad_norm": 0.7241950631141663, + "learning_rate": 9.868011322690008e-06, + "loss": 0.7113, + "step": 5531 + }, + { + "epoch": 0.29825318093595, + "grad_norm": 0.8070489168167114, + "learning_rate": 9.867962927635393e-06, + "loss": 0.7835, + "step": 5532 + }, + { + "epoch": 0.29830709510459347, + "grad_norm": 0.6972863078117371, + "learning_rate": 9.867914523828836e-06, + "loss": 0.6914, + "step": 5533 + }, + { + "epoch": 0.298361009273237, + "grad_norm": 0.8001635670661926, + "learning_rate": 9.867866111270425e-06, + "loss": 0.82, + "step": 5534 + }, + { + "epoch": 0.29841492344188053, + "grad_norm": 0.7933236956596375, + "learning_rate": 9.867817689960249e-06, + "loss": 0.8148, + "step": 5535 + }, + { + "epoch": 0.29846883761052406, + "grad_norm": 0.7881083488464355, + "learning_rate": 9.867769259898393e-06, + "loss": 0.8342, + "step": 5536 + }, + { + "epoch": 0.29852275177916754, + "grad_norm": 
0.7492312788963318, + "learning_rate": 9.867720821084943e-06, + "loss": 0.7736, + "step": 5537 + }, + { + "epoch": 0.29857666594781107, + "grad_norm": 0.7429683804512024, + "learning_rate": 9.86767237351999e-06, + "loss": 0.7602, + "step": 5538 + }, + { + "epoch": 0.2986305801164546, + "grad_norm": 0.7982121109962463, + "learning_rate": 9.867623917203618e-06, + "loss": 0.9007, + "step": 5539 + }, + { + "epoch": 0.29868449428509813, + "grad_norm": 0.77519291639328, + "learning_rate": 9.867575452135911e-06, + "loss": 0.7136, + "step": 5540 + }, + { + "epoch": 0.29873840845374167, + "grad_norm": 0.8341544270515442, + "learning_rate": 9.867526978316963e-06, + "loss": 0.852, + "step": 5541 + }, + { + "epoch": 0.29879232262238514, + "grad_norm": 0.8006002306938171, + "learning_rate": 9.867478495746859e-06, + "loss": 0.7557, + "step": 5542 + }, + { + "epoch": 0.2988462367910287, + "grad_norm": 0.7797364592552185, + "learning_rate": 9.867430004425683e-06, + "loss": 0.7776, + "step": 5543 + }, + { + "epoch": 0.2989001509596722, + "grad_norm": 0.8187147378921509, + "learning_rate": 9.867381504353525e-06, + "loss": 0.7871, + "step": 5544 + }, + { + "epoch": 0.29895406512831574, + "grad_norm": 0.8447971343994141, + "learning_rate": 9.867332995530471e-06, + "loss": 0.7563, + "step": 5545 + }, + { + "epoch": 0.2990079792969592, + "grad_norm": 0.7229753136634827, + "learning_rate": 9.867284477956608e-06, + "loss": 0.7227, + "step": 5546 + }, + { + "epoch": 0.29906189346560275, + "grad_norm": 0.7300926446914673, + "learning_rate": 9.867235951632026e-06, + "loss": 0.834, + "step": 5547 + }, + { + "epoch": 0.2991158076342463, + "grad_norm": 0.873554527759552, + "learning_rate": 9.86718741655681e-06, + "loss": 0.8454, + "step": 5548 + }, + { + "epoch": 0.2991697218028898, + "grad_norm": 0.7391233444213867, + "learning_rate": 9.867138872731047e-06, + "loss": 0.7505, + "step": 5549 + }, + { + "epoch": 0.29922363597153334, + "grad_norm": 0.7330740690231323, + "learning_rate": 
9.867090320154824e-06, + "loss": 0.776, + "step": 5550 + }, + { + "epoch": 0.2992775501401768, + "grad_norm": 0.7050237655639648, + "learning_rate": 9.867041758828231e-06, + "loss": 0.7063, + "step": 5551 + }, + { + "epoch": 0.29933146430882035, + "grad_norm": 0.7757040858268738, + "learning_rate": 9.86699318875135e-06, + "loss": 0.757, + "step": 5552 + }, + { + "epoch": 0.2993853784774639, + "grad_norm": 0.7693188190460205, + "learning_rate": 9.866944609924274e-06, + "loss": 0.8058, + "step": 5553 + }, + { + "epoch": 0.2994392926461074, + "grad_norm": 0.8201676607131958, + "learning_rate": 9.866896022347088e-06, + "loss": 0.8317, + "step": 5554 + }, + { + "epoch": 0.2994932068147509, + "grad_norm": 0.768905758857727, + "learning_rate": 9.866847426019878e-06, + "loss": 0.705, + "step": 5555 + }, + { + "epoch": 0.2995471209833944, + "grad_norm": 0.7787859439849854, + "learning_rate": 9.866798820942735e-06, + "loss": 0.84, + "step": 5556 + }, + { + "epoch": 0.29960103515203795, + "grad_norm": 0.7377595901489258, + "learning_rate": 9.866750207115742e-06, + "loss": 0.7687, + "step": 5557 + }, + { + "epoch": 0.2996549493206815, + "grad_norm": 0.7098401784896851, + "learning_rate": 9.86670158453899e-06, + "loss": 0.6778, + "step": 5558 + }, + { + "epoch": 0.299708863489325, + "grad_norm": 0.7346776723861694, + "learning_rate": 9.866652953212563e-06, + "loss": 0.7201, + "step": 5559 + }, + { + "epoch": 0.2997627776579685, + "grad_norm": 0.9845607280731201, + "learning_rate": 9.866604313136551e-06, + "loss": 0.784, + "step": 5560 + }, + { + "epoch": 0.299816691826612, + "grad_norm": 0.7436274886131287, + "learning_rate": 9.866555664311042e-06, + "loss": 0.7904, + "step": 5561 + }, + { + "epoch": 0.29987060599525556, + "grad_norm": 0.712096095085144, + "learning_rate": 9.866507006736123e-06, + "loss": 0.6796, + "step": 5562 + }, + { + "epoch": 0.2999245201638991, + "grad_norm": 0.7324764132499695, + "learning_rate": 9.86645834041188e-06, + "loss": 0.7305, + "step": 5563 + 
}, + { + "epoch": 0.29997843433254257, + "grad_norm": 0.7124375104904175, + "learning_rate": 9.866409665338399e-06, + "loss": 0.7425, + "step": 5564 + }, + { + "epoch": 0.3000323485011861, + "grad_norm": 0.8470612168312073, + "learning_rate": 9.866360981515772e-06, + "loss": 0.7878, + "step": 5565 + } + ], + "logging_steps": 1, + "max_steps": 74192, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 1855, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.642262919892697e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-5565/training_args.bin b/checkpoint-5565/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..736549377f516c4bc25a43293c6f37ec549a9a60 --- /dev/null +++ b/checkpoint-5565/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb97268504007caea6a1175a54f08b974d7fa47a1a5fb4547021d5b9d223b4a4 +size 7928 diff --git a/checkpoint-5565/zero_to_fp32.py b/checkpoint-5565/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-5565/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . 
pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise 
FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # 
immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, 
zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + 
full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + 
wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # 
recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. 
+ + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. 
(one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-7420/config.json b/checkpoint-7420/config.json new file mode 100644 index 0000000000000000000000000000000000000000..7f34bbd5159c9a132258ecf79562e79459cb64d9 --- /dev/null +++ b/checkpoint-7420/config.json @@ -0,0 +1,36 @@ +{ + "_name_or_path": "./meta-llama_Llama-3.1-8B-Instruct/", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128001, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.46.1", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/checkpoint-7420/generation_config.json b/checkpoint-7420/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0484b997a9ea9b5b6d711db644716bfd32d5470e --- /dev/null +++ b/checkpoint-7420/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + 
"transformers_version": "4.46.1" +} diff --git a/checkpoint-7420/global_step7420/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-7420/global_step7420/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4f4fdd1bdd855e4a96a7df0b5c17bf45374cd84a --- /dev/null +++ b/checkpoint-7420/global_step7420/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5542a7137bb3603c114669c4eda2723e91752a4943724b1bec505e3c19efcb72 +size 12045398464 diff --git a/checkpoint-7420/global_step7420/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-7420/global_step7420/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7ffd4230ba9a48cb23a36d3286a0fadfad674da2 --- /dev/null +++ b/checkpoint-7420/global_step7420/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed069b1b57cf6def6ce12c058a1b9cf786d137fde527aaacca74e4a4401baf20 +size 12045399232 diff --git a/checkpoint-7420/global_step7420/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-7420/global_step7420/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..13e798bb74ecf7de2db542bd3a40b0c6c3f8b2df --- /dev/null +++ b/checkpoint-7420/global_step7420/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b872b4eb15cac0d115a1b4923463578815a72d04515828cd82b3fa3481a42ef0 +size 12045399488 diff --git a/checkpoint-7420/global_step7420/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-7420/global_step7420/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d09fbe4686eb29387559ed6546fe2f34f24fc09 --- /dev/null +++ 
b/checkpoint-7420/global_step7420/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:538232b751a369c009452140223886d8467d9554f1fd023ab1c44debc2e441c6 +size 12045399232 diff --git a/checkpoint-7420/global_step7420/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-7420/global_step7420/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..761c719a57c887bf7e7a551a8a767d27a313e59b --- /dev/null +++ b/checkpoint-7420/global_step7420/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c477324968214ad0419d8405ad09997e95b23df697376d818790f5bcb805d794 +size 12045399488 diff --git a/checkpoint-7420/global_step7420/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-7420/global_step7420/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1ec84309e4c9d014a09a4483a37cb9c5e340199a --- /dev/null +++ b/checkpoint-7420/global_step7420/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be046a605faf12c0802c7e1e809a529b5c2aed010e384954d1b9113200a37121 +size 12045399552 diff --git a/checkpoint-7420/global_step7420/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-7420/global_step7420/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..25e3d214f9c6944b8e375b3997004d9ad1a0e09b --- /dev/null +++ b/checkpoint-7420/global_step7420/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e13d2bec3bb67841b2b0cd52da21953c2056b31aad2535709770cab180f4bef +size 12045399232 diff --git a/checkpoint-7420/global_step7420/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt 
b/checkpoint-7420/global_step7420/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d9347b18e84713b58e550582874d4cea54d2822a --- /dev/null +++ b/checkpoint-7420/global_step7420/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7787c6d071f58ece68fd190aa160b2c2d243598ea7d276c4f0d09463951ff308 +size 12045398144 diff --git a/checkpoint-7420/global_step7420/mp_rank_00_model_states.pt b/checkpoint-7420/global_step7420/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d1a9cf6540eca0796c5968260e764a600f8bb9a --- /dev/null +++ b/checkpoint-7420/global_step7420/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3605aaba83eed7875ca8eaa7e2f6f054a997f62eeef57a97da3dfd2027eac57 +size 16060610552 diff --git a/checkpoint-7420/latest b/checkpoint-7420/latest new file mode 100644 index 0000000000000000000000000000000000000000..b81851db8a0473fde41f6d094c142b2ac8174e71 --- /dev/null +++ b/checkpoint-7420/latest @@ -0,0 +1 @@ +global_step7420 \ No newline at end of file diff --git a/checkpoint-7420/model-00001-of-00004.safetensors b/checkpoint-7420/model-00001-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1165c1c6b57d81bc7a4e68e141a4c57f3bbf2076 --- /dev/null +++ b/checkpoint-7420/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2617026719eaf7fa1937a546a7cef022c016a3295ab1cc54512df609ffb7d5c6 +size 4976698672 diff --git a/checkpoint-7420/model-00002-of-00004.safetensors b/checkpoint-7420/model-00002-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ea9ead094b7b06c52e4f27d6fdde62d479626389 --- /dev/null +++ b/checkpoint-7420/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:0004e971074fdb38f93456e01b6dbecaf52e1d0400c06c784d078757dcb72269 +size 4999802720 diff --git a/checkpoint-7420/model-00003-of-00004.safetensors b/checkpoint-7420/model-00003-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dbbf733a804a82167bc6b43c1f2cd38c001ae490 --- /dev/null +++ b/checkpoint-7420/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b4c0d7aeffd8166a64178b28756f53e8dfd7dc4d5c88b6251e84c4efbc775ce +size 4915916176 diff --git a/checkpoint-7420/model-00004-of-00004.safetensors b/checkpoint-7420/model-00004-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8cea1f4c8974a0309dd99657ba0043050b67e693 --- /dev/null +++ b/checkpoint-7420/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25b914162cbe49f9ecfea74f3e860e5b0f0d16636aaf42be8c4c94d9b56571f4 +size 1168138808 diff --git a/checkpoint-7420/model.safetensors.index.json b/checkpoint-7420/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0fd8120f1c6acddc268ebc2583058efaf699a771 --- /dev/null +++ b/checkpoint-7420/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 16060522496 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + 
"model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + 
"model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", 
+ "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", 
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00004-of-00004.safetensors" + } +} diff --git a/checkpoint-7420/rng_state_0.pth b/checkpoint-7420/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..b6473612e41c5cfd6973c2e71fa5f3ad2b2bcad1 --- /dev/null +++ b/checkpoint-7420/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:575119a228f98110923ffa2dedcb50e3317251b26054355d015e0b2240d566f2 +size 15984 diff --git a/checkpoint-7420/rng_state_1.pth b/checkpoint-7420/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..8506e00431b6ac7067699c0ea4f59adb6fa0ba20 --- /dev/null +++ b/checkpoint-7420/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0728b56dab7abb5ef8a0d4bae3519c5767c97467bdd886d26bf19cc8599d0312 +size 15984 diff --git a/checkpoint-7420/rng_state_2.pth b/checkpoint-7420/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..ea499e285c97cca07fedd34662c3d4ab44ff6f47 --- /dev/null +++ b/checkpoint-7420/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:f4e481d4ef1546694da7337f6bb6c658b866dcb79b85deeb477da0d27ebe851e +size 15984 diff --git a/checkpoint-7420/rng_state_3.pth b/checkpoint-7420/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..aeb38f92f106ac3f08bae4f82179a8a12243bccb --- /dev/null +++ b/checkpoint-7420/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:353c60be37ea56fc992fca446598ceca5d1fd002aa3bd6dbb9ad740e6f47ebb3 +size 15984 diff --git a/checkpoint-7420/rng_state_4.pth b/checkpoint-7420/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..9d5856cb7a3f15092fa5593507022316916f648e --- /dev/null +++ b/checkpoint-7420/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9107fe964ba7205e354084b85210e5a5ea1c98cfd4d38adb9cd3926945dcae4 +size 15984 diff --git a/checkpoint-7420/rng_state_5.pth b/checkpoint-7420/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b824ee24d256695aad4a69a62d8e7125f51a17f2 --- /dev/null +++ b/checkpoint-7420/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69d1bb1abee38b92e53f3f23549b642ce0f1edcdccf7b6129847ac61636e96d5 +size 15984 diff --git a/checkpoint-7420/rng_state_6.pth b/checkpoint-7420/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..a9fd0364bb8f1a8e91eca45be5e1b6672b4d9afd --- /dev/null +++ b/checkpoint-7420/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afd5516048e20f36959601574e29e40106085a7d3cdc7bf425ce5e84633490e6 +size 15984 diff --git a/checkpoint-7420/rng_state_7.pth b/checkpoint-7420/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..4e80125fd18efcb1097384319888b699f4dce7e7 --- /dev/null +++ b/checkpoint-7420/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8e2c46927fc06939b4c976a01e4b95dec1f8b98ceaea86d31a5d756fc30ff006 +size 15984 diff --git a/checkpoint-7420/scheduler.pt b/checkpoint-7420/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..eb71cc8628f55dc1fc583fb3eadb043b3c35515f --- /dev/null +++ b/checkpoint-7420/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:430dba78e01279bfa24f080ad3b7a54d6244e85f393a2c5aa9cd0884322609c3 +size 1064 diff --git a/checkpoint-7420/special_tokens_map.json b/checkpoint-7420/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..e5b39b6305d89284b04934011c68dbb26bf588ca --- /dev/null +++ b/checkpoint-7420/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-7420/tokenizer.json b/checkpoint-7420/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1a5a81eb733cae803b39ffc7644de0048c3a26c3 --- /dev/null +++ b/checkpoint-7420/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07d7990a7c3f12081b24b3d098ab366211161e43494d2368211815c164b5f2b7 +size 17209828 diff --git a/checkpoint-7420/tokenizer_config.json b/checkpoint-7420/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5cd68a680b8f949dba64516158c30db7ea52c3cd --- /dev/null +++ b/checkpoint-7420/tokenizer_config.json @@ -0,0 +1,2062 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, 
+ "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", 
+ "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|im_pseudo|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|end_pseudo|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|im_date|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|end_date|>", + "lstrip": false, + "normalized": false, + "rstrip": 
false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|begin_of_post|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|end_of_post|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": 
"<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": 
false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": 
"<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": 
"<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-7420/trainer_state.json b/checkpoint-7420/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b5c76eab5bc84cd746da6976a18497f8a92dcea5 --- /dev/null +++ b/checkpoint-7420/trainer_state.json @@ -0,0 +1,51973 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4000431313349148, + "eval_steps": 500, + "global_step": 7420, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 5.391416864351952e-05, + "grad_norm": 53.75010299682617, + "learning_rate": 1.0000000000000001e-07, + "loss": 2.5864, + "step": 1 + }, + { + "epoch": 0.00010782833728703904, + "grad_norm": 45.00067138671875, + 
"learning_rate": 2.0000000000000002e-07, + "loss": 2.3757, + "step": 2 + }, + { + "epoch": 0.00016174250593055855, + "grad_norm": 51.22366714477539, + "learning_rate": 3.0000000000000004e-07, + "loss": 2.4653, + "step": 3 + }, + { + "epoch": 0.00021565667457407807, + "grad_norm": 62.225242614746094, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.5819, + "step": 4 + }, + { + "epoch": 0.0002695708432175976, + "grad_norm": 54.67008590698242, + "learning_rate": 5.000000000000001e-07, + "loss": 2.6368, + "step": 5 + }, + { + "epoch": 0.0003234850118611171, + "grad_norm": 51.261009216308594, + "learning_rate": 6.000000000000001e-07, + "loss": 2.3245, + "step": 6 + }, + { + "epoch": 0.0003773991805046366, + "grad_norm": 53.58714294433594, + "learning_rate": 7.000000000000001e-07, + "loss": 2.7622, + "step": 7 + }, + { + "epoch": 0.00043131334914815614, + "grad_norm": 41.32997131347656, + "learning_rate": 8.000000000000001e-07, + "loss": 2.6444, + "step": 8 + }, + { + "epoch": 0.00048522751779167566, + "grad_norm": 33.232242584228516, + "learning_rate": 9.000000000000001e-07, + "loss": 2.1475, + "step": 9 + }, + { + "epoch": 0.0005391416864351952, + "grad_norm": 34.1890983581543, + "learning_rate": 1.0000000000000002e-06, + "loss": 2.7256, + "step": 10 + }, + { + "epoch": 0.0005930558550787146, + "grad_norm": 19.263437271118164, + "learning_rate": 1.1e-06, + "loss": 2.4132, + "step": 11 + }, + { + "epoch": 0.0006469700237222342, + "grad_norm": 15.612638473510742, + "learning_rate": 1.2000000000000002e-06, + "loss": 2.0422, + "step": 12 + }, + { + "epoch": 0.0007008841923657537, + "grad_norm": 13.81751537322998, + "learning_rate": 1.3e-06, + "loss": 1.9663, + "step": 13 + }, + { + "epoch": 0.0007547983610092732, + "grad_norm": 16.390897750854492, + "learning_rate": 1.4000000000000001e-06, + "loss": 2.1135, + "step": 14 + }, + { + "epoch": 0.0008087125296527927, + "grad_norm": 21.830646514892578, + "learning_rate": 1.5e-06, + "loss": 2.217, + "step": 15 + }, + { + 
"epoch": 0.0008626266982963123, + "grad_norm": 18.630046844482422, + "learning_rate": 1.6000000000000001e-06, + "loss": 2.1612, + "step": 16 + }, + { + "epoch": 0.0009165408669398317, + "grad_norm": 12.403571128845215, + "learning_rate": 1.7000000000000002e-06, + "loss": 1.9358, + "step": 17 + }, + { + "epoch": 0.0009704550355833513, + "grad_norm": 7.713366508483887, + "learning_rate": 1.8000000000000001e-06, + "loss": 1.8522, + "step": 18 + }, + { + "epoch": 0.001024369204226871, + "grad_norm": 7.731616973876953, + "learning_rate": 1.9000000000000002e-06, + "loss": 1.7984, + "step": 19 + }, + { + "epoch": 0.0010782833728703904, + "grad_norm": 7.5799174308776855, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.701, + "step": 20 + }, + { + "epoch": 0.0011321975415139098, + "grad_norm": 5.5428080558776855, + "learning_rate": 2.1000000000000002e-06, + "loss": 1.624, + "step": 21 + }, + { + "epoch": 0.0011861117101574293, + "grad_norm": 5.851474285125732, + "learning_rate": 2.2e-06, + "loss": 1.8064, + "step": 22 + }, + { + "epoch": 0.001240025878800949, + "grad_norm": 5.243111610412598, + "learning_rate": 2.3000000000000004e-06, + "loss": 1.7246, + "step": 23 + }, + { + "epoch": 0.0012939400474444684, + "grad_norm": 4.835971832275391, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.763, + "step": 24 + }, + { + "epoch": 0.0013478542160879879, + "grad_norm": 4.127845287322998, + "learning_rate": 2.5e-06, + "loss": 1.5869, + "step": 25 + }, + { + "epoch": 0.0014017683847315074, + "grad_norm": 3.7648322582244873, + "learning_rate": 2.6e-06, + "loss": 1.5599, + "step": 26 + }, + { + "epoch": 0.001455682553375027, + "grad_norm": 3.5424962043762207, + "learning_rate": 2.7000000000000004e-06, + "loss": 1.4703, + "step": 27 + }, + { + "epoch": 0.0015095967220185465, + "grad_norm": 3.3707985877990723, + "learning_rate": 2.8000000000000003e-06, + "loss": 1.354, + "step": 28 + }, + { + "epoch": 0.001563510890662066, + "grad_norm": 4.71254825592041, + 
"learning_rate": 2.9e-06, + "loss": 1.8162, + "step": 29 + }, + { + "epoch": 0.0016174250593055854, + "grad_norm": 3.7660300731658936, + "learning_rate": 3e-06, + "loss": 1.5951, + "step": 30 + }, + { + "epoch": 0.001671339227949105, + "grad_norm": 3.4810571670532227, + "learning_rate": 3.1000000000000004e-06, + "loss": 1.5183, + "step": 31 + }, + { + "epoch": 0.0017252533965926246, + "grad_norm": 3.672693967819214, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.6374, + "step": 32 + }, + { + "epoch": 0.001779167565236144, + "grad_norm": 3.3589682579040527, + "learning_rate": 3.3000000000000006e-06, + "loss": 1.4371, + "step": 33 + }, + { + "epoch": 0.0018330817338796635, + "grad_norm": 3.6365807056427, + "learning_rate": 3.4000000000000005e-06, + "loss": 1.595, + "step": 34 + }, + { + "epoch": 0.0018869959025231832, + "grad_norm": 3.6467039585113525, + "learning_rate": 3.5e-06, + "loss": 1.5714, + "step": 35 + }, + { + "epoch": 0.0019409100711667026, + "grad_norm": 3.4684648513793945, + "learning_rate": 3.6000000000000003e-06, + "loss": 1.4897, + "step": 36 + }, + { + "epoch": 0.001994824239810222, + "grad_norm": 3.70845627784729, + "learning_rate": 3.7e-06, + "loss": 1.5954, + "step": 37 + }, + { + "epoch": 0.002048738408453742, + "grad_norm": 3.1803395748138428, + "learning_rate": 3.8000000000000005e-06, + "loss": 1.3976, + "step": 38 + }, + { + "epoch": 0.002102652577097261, + "grad_norm": 2.851703405380249, + "learning_rate": 3.900000000000001e-06, + "loss": 1.1894, + "step": 39 + }, + { + "epoch": 0.0021565667457407807, + "grad_norm": 2.832003593444824, + "learning_rate": 4.000000000000001e-06, + "loss": 1.353, + "step": 40 + }, + { + "epoch": 0.0022104809143843004, + "grad_norm": 3.397498607635498, + "learning_rate": 4.1e-06, + "loss": 1.4541, + "step": 41 + }, + { + "epoch": 0.0022643950830278196, + "grad_norm": 3.4537954330444336, + "learning_rate": 4.2000000000000004e-06, + "loss": 1.4475, + "step": 42 + }, + { + "epoch": 0.0023183092516713393, + 
"grad_norm": 3.1131632328033447, + "learning_rate": 4.3e-06, + "loss": 1.2707, + "step": 43 + }, + { + "epoch": 0.0023722234203148586, + "grad_norm": 3.0421881675720215, + "learning_rate": 4.4e-06, + "loss": 1.3418, + "step": 44 + }, + { + "epoch": 0.0024261375889583782, + "grad_norm": 3.528514862060547, + "learning_rate": 4.5e-06, + "loss": 1.4432, + "step": 45 + }, + { + "epoch": 0.002480051757601898, + "grad_norm": 3.6783225536346436, + "learning_rate": 4.600000000000001e-06, + "loss": 1.4863, + "step": 46 + }, + { + "epoch": 0.002533965926245417, + "grad_norm": 2.9829189777374268, + "learning_rate": 4.7e-06, + "loss": 1.2856, + "step": 47 + }, + { + "epoch": 0.002587880094888937, + "grad_norm": 3.4480350017547607, + "learning_rate": 4.800000000000001e-06, + "loss": 1.4129, + "step": 48 + }, + { + "epoch": 0.0026417942635324565, + "grad_norm": 3.4247214794158936, + "learning_rate": 4.9000000000000005e-06, + "loss": 1.3467, + "step": 49 + }, + { + "epoch": 0.0026957084321759758, + "grad_norm": 3.5268948078155518, + "learning_rate": 5e-06, + "loss": 1.4795, + "step": 50 + }, + { + "epoch": 0.0027496226008194955, + "grad_norm": 3.3228304386138916, + "learning_rate": 5.1e-06, + "loss": 1.461, + "step": 51 + }, + { + "epoch": 0.0028035367694630147, + "grad_norm": 3.365630865097046, + "learning_rate": 5.2e-06, + "loss": 1.2947, + "step": 52 + }, + { + "epoch": 0.0028574509381065344, + "grad_norm": 3.4889328479766846, + "learning_rate": 5.300000000000001e-06, + "loss": 1.432, + "step": 53 + }, + { + "epoch": 0.002911365106750054, + "grad_norm": 3.5767273902893066, + "learning_rate": 5.400000000000001e-06, + "loss": 1.3773, + "step": 54 + }, + { + "epoch": 0.0029652792753935733, + "grad_norm": 3.499298095703125, + "learning_rate": 5.500000000000001e-06, + "loss": 1.4132, + "step": 55 + }, + { + "epoch": 0.003019193444037093, + "grad_norm": 3.6990244388580322, + "learning_rate": 5.600000000000001e-06, + "loss": 1.4595, + "step": 56 + }, + { + "epoch": 
0.0030731076126806127, + "grad_norm": 3.0908327102661133, + "learning_rate": 5.7e-06, + "loss": 1.1873, + "step": 57 + }, + { + "epoch": 0.003127021781324132, + "grad_norm": 3.149425745010376, + "learning_rate": 5.8e-06, + "loss": 1.3306, + "step": 58 + }, + { + "epoch": 0.0031809359499676516, + "grad_norm": 3.193023204803467, + "learning_rate": 5.9e-06, + "loss": 1.3326, + "step": 59 + }, + { + "epoch": 0.003234850118611171, + "grad_norm": 3.610344409942627, + "learning_rate": 6e-06, + "loss": 1.4527, + "step": 60 + }, + { + "epoch": 0.0032887642872546905, + "grad_norm": 2.9877095222473145, + "learning_rate": 6.1e-06, + "loss": 1.2029, + "step": 61 + }, + { + "epoch": 0.00334267845589821, + "grad_norm": 3.0241923332214355, + "learning_rate": 6.200000000000001e-06, + "loss": 1.3413, + "step": 62 + }, + { + "epoch": 0.0033965926245417295, + "grad_norm": 3.212700366973877, + "learning_rate": 6.300000000000001e-06, + "loss": 1.3471, + "step": 63 + }, + { + "epoch": 0.003450506793185249, + "grad_norm": 2.7138960361480713, + "learning_rate": 6.4000000000000006e-06, + "loss": 1.0885, + "step": 64 + }, + { + "epoch": 0.0035044209618287684, + "grad_norm": 2.5690340995788574, + "learning_rate": 6.5000000000000004e-06, + "loss": 1.1168, + "step": 65 + }, + { + "epoch": 0.003558335130472288, + "grad_norm": 3.0344784259796143, + "learning_rate": 6.600000000000001e-06, + "loss": 1.2828, + "step": 66 + }, + { + "epoch": 0.0036122492991158077, + "grad_norm": 3.0589816570281982, + "learning_rate": 6.700000000000001e-06, + "loss": 1.2604, + "step": 67 + }, + { + "epoch": 0.003666163467759327, + "grad_norm": 2.676417112350464, + "learning_rate": 6.800000000000001e-06, + "loss": 1.1679, + "step": 68 + }, + { + "epoch": 0.0037200776364028467, + "grad_norm": 2.6590960025787354, + "learning_rate": 6.9e-06, + "loss": 1.2283, + "step": 69 + }, + { + "epoch": 0.0037739918050463664, + "grad_norm": 2.6973354816436768, + "learning_rate": 7e-06, + "loss": 1.2028, + "step": 70 + }, + { + 
"epoch": 0.0038279059736898856, + "grad_norm": 2.7046608924865723, + "learning_rate": 7.100000000000001e-06, + "loss": 1.2629, + "step": 71 + }, + { + "epoch": 0.0038818201423334053, + "grad_norm": 2.2172696590423584, + "learning_rate": 7.2000000000000005e-06, + "loss": 1.1367, + "step": 72 + }, + { + "epoch": 0.0039357343109769245, + "grad_norm": 2.6138789653778076, + "learning_rate": 7.3e-06, + "loss": 1.3167, + "step": 73 + }, + { + "epoch": 0.003989648479620444, + "grad_norm": 2.2926838397979736, + "learning_rate": 7.4e-06, + "loss": 1.2909, + "step": 74 + }, + { + "epoch": 0.004043562648263964, + "grad_norm": 2.0647220611572266, + "learning_rate": 7.500000000000001e-06, + "loss": 1.2054, + "step": 75 + }, + { + "epoch": 0.004097476816907484, + "grad_norm": 2.1190452575683594, + "learning_rate": 7.600000000000001e-06, + "loss": 1.1497, + "step": 76 + }, + { + "epoch": 0.004151390985551002, + "grad_norm": 1.9973243474960327, + "learning_rate": 7.7e-06, + "loss": 1.1997, + "step": 77 + }, + { + "epoch": 0.004205305154194522, + "grad_norm": 2.11751651763916, + "learning_rate": 7.800000000000002e-06, + "loss": 1.2181, + "step": 78 + }, + { + "epoch": 0.004259219322838042, + "grad_norm": 1.8975950479507446, + "learning_rate": 7.9e-06, + "loss": 1.1582, + "step": 79 + }, + { + "epoch": 0.004313133491481561, + "grad_norm": 1.8368147611618042, + "learning_rate": 8.000000000000001e-06, + "loss": 1.1389, + "step": 80 + }, + { + "epoch": 0.004367047660125081, + "grad_norm": 1.7472988367080688, + "learning_rate": 8.1e-06, + "loss": 1.0959, + "step": 81 + }, + { + "epoch": 0.004420961828768601, + "grad_norm": 1.7325443029403687, + "learning_rate": 8.2e-06, + "loss": 1.1847, + "step": 82 + }, + { + "epoch": 0.00447487599741212, + "grad_norm": 1.6171561479568481, + "learning_rate": 8.3e-06, + "loss": 0.9834, + "step": 83 + }, + { + "epoch": 0.004528790166055639, + "grad_norm": 1.6583327054977417, + "learning_rate": 8.400000000000001e-06, + "loss": 1.0413, + "step": 84 + }, + 
{ + "epoch": 0.004582704334699159, + "grad_norm": 1.8914967775344849, + "learning_rate": 8.5e-06, + "loss": 1.2413, + "step": 85 + }, + { + "epoch": 0.004636618503342679, + "grad_norm": 1.6018317937850952, + "learning_rate": 8.6e-06, + "loss": 1.0577, + "step": 86 + }, + { + "epoch": 0.004690532671986198, + "grad_norm": 1.9170053005218506, + "learning_rate": 8.700000000000001e-06, + "loss": 1.2463, + "step": 87 + }, + { + "epoch": 0.004744446840629717, + "grad_norm": 1.666536569595337, + "learning_rate": 8.8e-06, + "loss": 1.0532, + "step": 88 + }, + { + "epoch": 0.004798361009273237, + "grad_norm": 1.660115361213684, + "learning_rate": 8.900000000000001e-06, + "loss": 1.0514, + "step": 89 + }, + { + "epoch": 0.0048522751779167565, + "grad_norm": 1.8667477369308472, + "learning_rate": 9e-06, + "loss": 1.2039, + "step": 90 + }, + { + "epoch": 0.004906189346560276, + "grad_norm": 1.9490039348602295, + "learning_rate": 9.100000000000001e-06, + "loss": 1.1804, + "step": 91 + }, + { + "epoch": 0.004960103515203796, + "grad_norm": 1.8415377140045166, + "learning_rate": 9.200000000000002e-06, + "loss": 1.1435, + "step": 92 + }, + { + "epoch": 0.005014017683847315, + "grad_norm": 1.8571438789367676, + "learning_rate": 9.3e-06, + "loss": 1.0974, + "step": 93 + }, + { + "epoch": 0.005067931852490834, + "grad_norm": 1.8480113744735718, + "learning_rate": 9.4e-06, + "loss": 1.149, + "step": 94 + }, + { + "epoch": 0.005121846021134354, + "grad_norm": 2.003490447998047, + "learning_rate": 9.5e-06, + "loss": 1.1954, + "step": 95 + }, + { + "epoch": 0.005175760189777874, + "grad_norm": 1.8002668619155884, + "learning_rate": 9.600000000000001e-06, + "loss": 0.9953, + "step": 96 + }, + { + "epoch": 0.005229674358421393, + "grad_norm": 1.9040817022323608, + "learning_rate": 9.7e-06, + "loss": 1.1195, + "step": 97 + }, + { + "epoch": 0.005283588527064913, + "grad_norm": 1.8311433792114258, + "learning_rate": 9.800000000000001e-06, + "loss": 1.083, + "step": 98 + }, + { + "epoch": 
0.005337502695708432, + "grad_norm": 1.9509624242782593, + "learning_rate": 9.9e-06, + "loss": 1.176, + "step": 99 + }, + { + "epoch": 0.0053914168643519516, + "grad_norm": 2.0624589920043945, + "learning_rate": 1e-05, + "loss": 1.119, + "step": 100 + }, + { + "epoch": 0.005445331032995471, + "grad_norm": 1.9618796110153198, + "learning_rate": 9.999999995505339e-06, + "loss": 1.1371, + "step": 101 + }, + { + "epoch": 0.005499245201638991, + "grad_norm": 1.946245551109314, + "learning_rate": 9.999999982021349e-06, + "loss": 0.9736, + "step": 102 + }, + { + "epoch": 0.005553159370282511, + "grad_norm": 1.9871301651000977, + "learning_rate": 9.999999959548035e-06, + "loss": 1.1077, + "step": 103 + }, + { + "epoch": 0.005607073538926029, + "grad_norm": 1.86216402053833, + "learning_rate": 9.999999928085396e-06, + "loss": 1.0882, + "step": 104 + }, + { + "epoch": 0.005660987707569549, + "grad_norm": 1.8447723388671875, + "learning_rate": 9.999999887633432e-06, + "loss": 1.0344, + "step": 105 + }, + { + "epoch": 0.005714901876213069, + "grad_norm": 1.8345638513565063, + "learning_rate": 9.99999983819214e-06, + "loss": 1.1077, + "step": 106 + }, + { + "epoch": 0.0057688160448565885, + "grad_norm": 1.8410178422927856, + "learning_rate": 9.999999779761524e-06, + "loss": 1.0824, + "step": 107 + }, + { + "epoch": 0.005822730213500108, + "grad_norm": 1.5881969928741455, + "learning_rate": 9.999999712341583e-06, + "loss": 0.9439, + "step": 108 + }, + { + "epoch": 0.005876644382143627, + "grad_norm": 1.6704047918319702, + "learning_rate": 9.999999635932316e-06, + "loss": 1.033, + "step": 109 + }, + { + "epoch": 0.005930558550787147, + "grad_norm": 1.792449712753296, + "learning_rate": 9.999999550533726e-06, + "loss": 1.0279, + "step": 110 + }, + { + "epoch": 0.005984472719430666, + "grad_norm": 1.6515668630599976, + "learning_rate": 9.999999456145809e-06, + "loss": 1.0301, + "step": 111 + }, + { + "epoch": 0.006038386888074186, + "grad_norm": 1.8541395664215088, + 
"learning_rate": 9.999999352768568e-06, + "loss": 1.1057, + "step": 112 + }, + { + "epoch": 0.006092301056717706, + "grad_norm": 1.6490236520767212, + "learning_rate": 9.999999240402002e-06, + "loss": 1.0523, + "step": 113 + }, + { + "epoch": 0.006146215225361225, + "grad_norm": 1.655333161354065, + "learning_rate": 9.999999119046113e-06, + "loss": 1.0448, + "step": 114 + }, + { + "epoch": 0.006200129394004744, + "grad_norm": 1.5721609592437744, + "learning_rate": 9.999998988700899e-06, + "loss": 0.9883, + "step": 115 + }, + { + "epoch": 0.006254043562648264, + "grad_norm": 1.6411349773406982, + "learning_rate": 9.99999884936636e-06, + "loss": 1.0255, + "step": 116 + }, + { + "epoch": 0.0063079577312917835, + "grad_norm": 1.6399502754211426, + "learning_rate": 9.999998701042501e-06, + "loss": 1.0146, + "step": 117 + }, + { + "epoch": 0.006361871899935303, + "grad_norm": 1.615026831626892, + "learning_rate": 9.999998543729316e-06, + "loss": 1.0022, + "step": 118 + }, + { + "epoch": 0.006415786068578823, + "grad_norm": 1.4867664575576782, + "learning_rate": 9.99999837742681e-06, + "loss": 1.0164, + "step": 119 + }, + { + "epoch": 0.006469700237222342, + "grad_norm": 1.540153980255127, + "learning_rate": 9.999998202134979e-06, + "loss": 0.989, + "step": 120 + }, + { + "epoch": 0.006523614405865861, + "grad_norm": 1.5535691976547241, + "learning_rate": 9.999998017853825e-06, + "loss": 0.9942, + "step": 121 + }, + { + "epoch": 0.006577528574509381, + "grad_norm": 1.4892929792404175, + "learning_rate": 9.999997824583351e-06, + "loss": 1.0537, + "step": 122 + }, + { + "epoch": 0.006631442743152901, + "grad_norm": 1.4674094915390015, + "learning_rate": 9.999997622323554e-06, + "loss": 1.0239, + "step": 123 + }, + { + "epoch": 0.00668535691179642, + "grad_norm": 1.394027590751648, + "learning_rate": 9.999997411074436e-06, + "loss": 0.9781, + "step": 124 + }, + { + "epoch": 0.006739271080439939, + "grad_norm": 1.372728705406189, + "learning_rate": 9.999997190835999e-06, + 
"loss": 1.0433, + "step": 125 + }, + { + "epoch": 0.006793185249083459, + "grad_norm": 1.2535908222198486, + "learning_rate": 9.999996961608238e-06, + "loss": 0.958, + "step": 126 + }, + { + "epoch": 0.006847099417726979, + "grad_norm": 1.337633490562439, + "learning_rate": 9.999996723391158e-06, + "loss": 1.0213, + "step": 127 + }, + { + "epoch": 0.006901013586370498, + "grad_norm": 1.3640319108963013, + "learning_rate": 9.999996476184759e-06, + "loss": 1.0432, + "step": 128 + }, + { + "epoch": 0.006954927755014018, + "grad_norm": 1.2663391828536987, + "learning_rate": 9.99999621998904e-06, + "loss": 1.0154, + "step": 129 + }, + { + "epoch": 0.007008841923657537, + "grad_norm": 1.450737476348877, + "learning_rate": 9.999995954804004e-06, + "loss": 1.0074, + "step": 130 + }, + { + "epoch": 0.0070627560923010565, + "grad_norm": 1.2757987976074219, + "learning_rate": 9.999995680629649e-06, + "loss": 0.9996, + "step": 131 + }, + { + "epoch": 0.007116670260944576, + "grad_norm": 1.3978132009506226, + "learning_rate": 9.999995397465974e-06, + "loss": 1.04, + "step": 132 + }, + { + "epoch": 0.007170584429588096, + "grad_norm": 1.3167297840118408, + "learning_rate": 9.999995105312982e-06, + "loss": 1.0069, + "step": 133 + }, + { + "epoch": 0.0072244985982316155, + "grad_norm": 1.1626744270324707, + "learning_rate": 9.999994804170674e-06, + "loss": 0.9722, + "step": 134 + }, + { + "epoch": 0.007278412766875135, + "grad_norm": 1.354797601699829, + "learning_rate": 9.99999449403905e-06, + "loss": 0.9019, + "step": 135 + }, + { + "epoch": 0.007332326935518654, + "grad_norm": 1.2605732679367065, + "learning_rate": 9.99999417491811e-06, + "loss": 1.0038, + "step": 136 + }, + { + "epoch": 0.007386241104162174, + "grad_norm": 1.3804657459259033, + "learning_rate": 9.999993846807855e-06, + "loss": 1.0139, + "step": 137 + }, + { + "epoch": 0.007440155272805693, + "grad_norm": 1.3001742362976074, + "learning_rate": 9.999993509708286e-06, + "loss": 1.1436, + "step": 138 + }, + { + 
"epoch": 0.007494069441449213, + "grad_norm": 1.2776422500610352, + "learning_rate": 9.999993163619401e-06, + "loss": 0.9792, + "step": 139 + }, + { + "epoch": 0.007547983610092733, + "grad_norm": 1.2149187326431274, + "learning_rate": 9.999992808541204e-06, + "loss": 0.963, + "step": 140 + }, + { + "epoch": 0.0076018977787362515, + "grad_norm": 1.341806173324585, + "learning_rate": 9.999992444473694e-06, + "loss": 0.9639, + "step": 141 + }, + { + "epoch": 0.007655811947379771, + "grad_norm": 1.2565757036209106, + "learning_rate": 9.999992071416874e-06, + "loss": 0.9193, + "step": 142 + }, + { + "epoch": 0.007709726116023291, + "grad_norm": 1.3059918880462646, + "learning_rate": 9.99999168937074e-06, + "loss": 0.9632, + "step": 143 + }, + { + "epoch": 0.0077636402846668106, + "grad_norm": 1.1719332933425903, + "learning_rate": 9.999991298335295e-06, + "loss": 0.9687, + "step": 144 + }, + { + "epoch": 0.00781755445331033, + "grad_norm": 1.125950813293457, + "learning_rate": 9.999990898310542e-06, + "loss": 0.968, + "step": 145 + }, + { + "epoch": 0.007871468621953849, + "grad_norm": 1.2400416135787964, + "learning_rate": 9.999990489296478e-06, + "loss": 0.972, + "step": 146 + }, + { + "epoch": 0.007925382790597369, + "grad_norm": 1.172117829322815, + "learning_rate": 9.999990071293106e-06, + "loss": 0.9243, + "step": 147 + }, + { + "epoch": 0.007979296959240888, + "grad_norm": 1.240317463874817, + "learning_rate": 9.999989644300427e-06, + "loss": 1.0655, + "step": 148 + }, + { + "epoch": 0.008033211127884408, + "grad_norm": 1.1535708904266357, + "learning_rate": 9.999989208318438e-06, + "loss": 0.9871, + "step": 149 + }, + { + "epoch": 0.008087125296527928, + "grad_norm": 1.2711198329925537, + "learning_rate": 9.999988763347145e-06, + "loss": 1.0307, + "step": 150 + }, + { + "epoch": 0.008141039465171447, + "grad_norm": 1.2345954179763794, + "learning_rate": 9.999988309386548e-06, + "loss": 1.1343, + "step": 151 + }, + { + "epoch": 0.008194953633814967, + 
"grad_norm": 1.2489601373672485, + "learning_rate": 9.999987846436645e-06, + "loss": 1.0303, + "step": 152 + }, + { + "epoch": 0.008248867802458487, + "grad_norm": 1.264240026473999, + "learning_rate": 9.999987374497439e-06, + "loss": 0.9562, + "step": 153 + }, + { + "epoch": 0.008302781971102005, + "grad_norm": 1.2613575458526611, + "learning_rate": 9.99998689356893e-06, + "loss": 0.954, + "step": 154 + }, + { + "epoch": 0.008356696139745524, + "grad_norm": 1.2091072797775269, + "learning_rate": 9.999986403651116e-06, + "loss": 1.0734, + "step": 155 + }, + { + "epoch": 0.008410610308389044, + "grad_norm": 1.18421471118927, + "learning_rate": 9.999985904744002e-06, + "loss": 0.9167, + "step": 156 + }, + { + "epoch": 0.008464524477032564, + "grad_norm": 1.0399659872055054, + "learning_rate": 9.99998539684759e-06, + "loss": 0.9068, + "step": 157 + }, + { + "epoch": 0.008518438645676083, + "grad_norm": 1.1292288303375244, + "learning_rate": 9.999984879961877e-06, + "loss": 1.0027, + "step": 158 + }, + { + "epoch": 0.008572352814319603, + "grad_norm": 1.2592105865478516, + "learning_rate": 9.999984354086867e-06, + "loss": 1.0794, + "step": 159 + }, + { + "epoch": 0.008626266982963123, + "grad_norm": 1.1646504402160645, + "learning_rate": 9.999983819222558e-06, + "loss": 1.0468, + "step": 160 + }, + { + "epoch": 0.008680181151606643, + "grad_norm": 1.156711220741272, + "learning_rate": 9.999983275368952e-06, + "loss": 0.9053, + "step": 161 + }, + { + "epoch": 0.008734095320250162, + "grad_norm": 1.1169341802597046, + "learning_rate": 9.999982722526051e-06, + "loss": 0.97, + "step": 162 + }, + { + "epoch": 0.008788009488893682, + "grad_norm": 1.3474149703979492, + "learning_rate": 9.999982160693856e-06, + "loss": 1.0221, + "step": 163 + }, + { + "epoch": 0.008841923657537202, + "grad_norm": 1.2021468877792358, + "learning_rate": 9.999981589872368e-06, + "loss": 0.9303, + "step": 164 + }, + { + "epoch": 0.00889583782618072, + "grad_norm": 1.0625534057617188, + 
"learning_rate": 9.999981010061586e-06, + "loss": 0.8765, + "step": 165 + }, + { + "epoch": 0.00894975199482424, + "grad_norm": 1.2688498497009277, + "learning_rate": 9.999980421261512e-06, + "loss": 1.0163, + "step": 166 + }, + { + "epoch": 0.009003666163467759, + "grad_norm": 1.122948408126831, + "learning_rate": 9.999979823472148e-06, + "loss": 0.9953, + "step": 167 + }, + { + "epoch": 0.009057580332111279, + "grad_norm": 1.1817872524261475, + "learning_rate": 9.999979216693495e-06, + "loss": 1.0774, + "step": 168 + }, + { + "epoch": 0.009111494500754798, + "grad_norm": 1.1483280658721924, + "learning_rate": 9.999978600925553e-06, + "loss": 1.0105, + "step": 169 + }, + { + "epoch": 0.009165408669398318, + "grad_norm": 1.4039335250854492, + "learning_rate": 9.999977976168325e-06, + "loss": 0.944, + "step": 170 + }, + { + "epoch": 0.009219322838041838, + "grad_norm": 1.1459723711013794, + "learning_rate": 9.999977342421812e-06, + "loss": 0.9208, + "step": 171 + }, + { + "epoch": 0.009273237006685357, + "grad_norm": 1.0897774696350098, + "learning_rate": 9.999976699686011e-06, + "loss": 0.8719, + "step": 172 + }, + { + "epoch": 0.009327151175328877, + "grad_norm": 1.206467866897583, + "learning_rate": 9.999976047960928e-06, + "loss": 1.0645, + "step": 173 + }, + { + "epoch": 0.009381065343972397, + "grad_norm": 1.004550814628601, + "learning_rate": 9.999975387246563e-06, + "loss": 0.9317, + "step": 174 + }, + { + "epoch": 0.009434979512615916, + "grad_norm": 1.2359992265701294, + "learning_rate": 9.999974717542916e-06, + "loss": 1.1136, + "step": 175 + }, + { + "epoch": 0.009488893681259434, + "grad_norm": 1.1922352313995361, + "learning_rate": 9.999974038849989e-06, + "loss": 1.0307, + "step": 176 + }, + { + "epoch": 0.009542807849902954, + "grad_norm": 1.1597613096237183, + "learning_rate": 9.999973351167782e-06, + "loss": 1.0275, + "step": 177 + }, + { + "epoch": 0.009596722018546474, + "grad_norm": 1.172133445739746, + "learning_rate": 9.999972654496298e-06, + 
"loss": 0.9269, + "step": 178 + }, + { + "epoch": 0.009650636187189993, + "grad_norm": 1.1879733800888062, + "learning_rate": 9.999971948835538e-06, + "loss": 0.9547, + "step": 179 + }, + { + "epoch": 0.009704550355833513, + "grad_norm": 1.0029833316802979, + "learning_rate": 9.999971234185502e-06, + "loss": 0.8994, + "step": 180 + }, + { + "epoch": 0.009758464524477033, + "grad_norm": 1.0769891738891602, + "learning_rate": 9.999970510546194e-06, + "loss": 0.9107, + "step": 181 + }, + { + "epoch": 0.009812378693120552, + "grad_norm": 1.3288064002990723, + "learning_rate": 9.99996977791761e-06, + "loss": 1.0116, + "step": 182 + }, + { + "epoch": 0.009866292861764072, + "grad_norm": 1.142452597618103, + "learning_rate": 9.999969036299757e-06, + "loss": 0.9367, + "step": 183 + }, + { + "epoch": 0.009920207030407592, + "grad_norm": 1.2458518743515015, + "learning_rate": 9.999968285692632e-06, + "loss": 1.1398, + "step": 184 + }, + { + "epoch": 0.009974121199051111, + "grad_norm": 1.3373422622680664, + "learning_rate": 9.99996752609624e-06, + "loss": 0.959, + "step": 185 + }, + { + "epoch": 0.01002803536769463, + "grad_norm": 1.2288920879364014, + "learning_rate": 9.99996675751058e-06, + "loss": 0.9908, + "step": 186 + }, + { + "epoch": 0.010081949536338149, + "grad_norm": 1.1954001188278198, + "learning_rate": 9.999965979935656e-06, + "loss": 0.9332, + "step": 187 + }, + { + "epoch": 0.010135863704981669, + "grad_norm": 1.171021819114685, + "learning_rate": 9.999965193371466e-06, + "loss": 0.9119, + "step": 188 + }, + { + "epoch": 0.010189777873625188, + "grad_norm": 1.025169014930725, + "learning_rate": 9.999964397818013e-06, + "loss": 0.784, + "step": 189 + }, + { + "epoch": 0.010243692042268708, + "grad_norm": 1.1340326070785522, + "learning_rate": 9.999963593275298e-06, + "loss": 1.0036, + "step": 190 + }, + { + "epoch": 0.010297606210912228, + "grad_norm": 1.0302847623825073, + "learning_rate": 9.999962779743324e-06, + "loss": 0.8293, + "step": 191 + }, + { + 
"epoch": 0.010351520379555747, + "grad_norm": 1.2410109043121338, + "learning_rate": 9.99996195722209e-06, + "loss": 0.9507, + "step": 192 + }, + { + "epoch": 0.010405434548199267, + "grad_norm": 1.2054308652877808, + "learning_rate": 9.9999611257116e-06, + "loss": 0.9356, + "step": 193 + }, + { + "epoch": 0.010459348716842787, + "grad_norm": 1.2046679258346558, + "learning_rate": 9.999960285211853e-06, + "loss": 1.0638, + "step": 194 + }, + { + "epoch": 0.010513262885486306, + "grad_norm": 1.4594306945800781, + "learning_rate": 9.999959435722852e-06, + "loss": 0.9624, + "step": 195 + }, + { + "epoch": 0.010567177054129826, + "grad_norm": 1.0909247398376465, + "learning_rate": 9.999958577244598e-06, + "loss": 0.9503, + "step": 196 + }, + { + "epoch": 0.010621091222773344, + "grad_norm": 1.1524754762649536, + "learning_rate": 9.999957709777094e-06, + "loss": 0.8954, + "step": 197 + }, + { + "epoch": 0.010675005391416864, + "grad_norm": 1.4128906726837158, + "learning_rate": 9.99995683332034e-06, + "loss": 0.8903, + "step": 198 + }, + { + "epoch": 0.010728919560060383, + "grad_norm": 1.1304652690887451, + "learning_rate": 9.999955947874338e-06, + "loss": 0.9247, + "step": 199 + }, + { + "epoch": 0.010782833728703903, + "grad_norm": 1.2978957891464233, + "learning_rate": 9.99995505343909e-06, + "loss": 0.9473, + "step": 200 + }, + { + "epoch": 0.010836747897347423, + "grad_norm": 1.0742554664611816, + "learning_rate": 9.999954150014595e-06, + "loss": 0.9626, + "step": 201 + }, + { + "epoch": 0.010890662065990942, + "grad_norm": 1.0707745552062988, + "learning_rate": 9.999953237600859e-06, + "loss": 0.8721, + "step": 202 + }, + { + "epoch": 0.010944576234634462, + "grad_norm": 1.17974853515625, + "learning_rate": 9.99995231619788e-06, + "loss": 1.0059, + "step": 203 + }, + { + "epoch": 0.010998490403277982, + "grad_norm": 1.0108370780944824, + "learning_rate": 9.999951385805662e-06, + "loss": 0.9527, + "step": 204 + }, + { + "epoch": 0.011052404571921502, + 
"grad_norm": 0.9983445405960083, + "learning_rate": 9.999950446424204e-06, + "loss": 0.7626, + "step": 205 + }, + { + "epoch": 0.011106318740565021, + "grad_norm": 1.0860002040863037, + "learning_rate": 9.99994949805351e-06, + "loss": 0.9591, + "step": 206 + }, + { + "epoch": 0.01116023290920854, + "grad_norm": 1.0447322130203247, + "learning_rate": 9.999948540693584e-06, + "loss": 0.9861, + "step": 207 + }, + { + "epoch": 0.011214147077852059, + "grad_norm": 1.2582998275756836, + "learning_rate": 9.999947574344423e-06, + "loss": 0.8949, + "step": 208 + }, + { + "epoch": 0.011268061246495579, + "grad_norm": 1.1507002115249634, + "learning_rate": 9.99994659900603e-06, + "loss": 0.918, + "step": 209 + }, + { + "epoch": 0.011321975415139098, + "grad_norm": 1.135169267654419, + "learning_rate": 9.999945614678408e-06, + "loss": 0.9891, + "step": 210 + }, + { + "epoch": 0.011375889583782618, + "grad_norm": 1.1746275424957275, + "learning_rate": 9.999944621361558e-06, + "loss": 1.0186, + "step": 211 + }, + { + "epoch": 0.011429803752426138, + "grad_norm": 1.1137248277664185, + "learning_rate": 9.999943619055483e-06, + "loss": 0.9584, + "step": 212 + }, + { + "epoch": 0.011483717921069657, + "grad_norm": 1.336651086807251, + "learning_rate": 9.999942607760182e-06, + "loss": 1.091, + "step": 213 + }, + { + "epoch": 0.011537632089713177, + "grad_norm": 1.1966856718063354, + "learning_rate": 9.999941587475658e-06, + "loss": 0.9761, + "step": 214 + }, + { + "epoch": 0.011591546258356697, + "grad_norm": 1.0843144655227661, + "learning_rate": 9.999940558201915e-06, + "loss": 0.8917, + "step": 215 + }, + { + "epoch": 0.011645460427000216, + "grad_norm": 1.2089293003082275, + "learning_rate": 9.999939519938953e-06, + "loss": 0.9704, + "step": 216 + }, + { + "epoch": 0.011699374595643736, + "grad_norm": 1.2409982681274414, + "learning_rate": 9.999938472686775e-06, + "loss": 0.9949, + "step": 217 + }, + { + "epoch": 0.011753288764287254, + "grad_norm": 1.1310094594955444, + 
"learning_rate": 9.99993741644538e-06, + "loss": 0.9666, + "step": 218 + }, + { + "epoch": 0.011807202932930774, + "grad_norm": 1.120510220527649, + "learning_rate": 9.999936351214772e-06, + "loss": 0.8844, + "step": 219 + }, + { + "epoch": 0.011861117101574293, + "grad_norm": 1.0931518077850342, + "learning_rate": 9.999935276994954e-06, + "loss": 0.9647, + "step": 220 + }, + { + "epoch": 0.011915031270217813, + "grad_norm": 1.2821122407913208, + "learning_rate": 9.999934193785926e-06, + "loss": 1.0533, + "step": 221 + }, + { + "epoch": 0.011968945438861333, + "grad_norm": 1.183580756187439, + "learning_rate": 9.999933101587691e-06, + "loss": 0.9196, + "step": 222 + }, + { + "epoch": 0.012022859607504852, + "grad_norm": 1.045825719833374, + "learning_rate": 9.99993200040025e-06, + "loss": 0.8953, + "step": 223 + }, + { + "epoch": 0.012076773776148372, + "grad_norm": 1.0963969230651855, + "learning_rate": 9.999930890223605e-06, + "loss": 0.9723, + "step": 224 + }, + { + "epoch": 0.012130687944791892, + "grad_norm": 1.0356731414794922, + "learning_rate": 9.999929771057761e-06, + "loss": 1.0215, + "step": 225 + }, + { + "epoch": 0.012184602113435411, + "grad_norm": 1.112277626991272, + "learning_rate": 9.999928642902717e-06, + "loss": 0.9886, + "step": 226 + }, + { + "epoch": 0.012238516282078931, + "grad_norm": 0.9969072937965393, + "learning_rate": 9.999927505758475e-06, + "loss": 0.8601, + "step": 227 + }, + { + "epoch": 0.01229243045072245, + "grad_norm": 1.123781442642212, + "learning_rate": 9.999926359625036e-06, + "loss": 0.9894, + "step": 228 + }, + { + "epoch": 0.012346344619365969, + "grad_norm": 1.2122100591659546, + "learning_rate": 9.999925204502406e-06, + "loss": 1.0783, + "step": 229 + }, + { + "epoch": 0.012400258788009488, + "grad_norm": 1.1256672143936157, + "learning_rate": 9.999924040390584e-06, + "loss": 0.9116, + "step": 230 + }, + { + "epoch": 0.012454172956653008, + "grad_norm": 1.0646952390670776, + "learning_rate": 9.999922867289573e-06, + 
"loss": 0.8993, + "step": 231 + }, + { + "epoch": 0.012508087125296528, + "grad_norm": 1.194676399230957, + "learning_rate": 9.999921685199376e-06, + "loss": 1.0377, + "step": 232 + }, + { + "epoch": 0.012562001293940047, + "grad_norm": 1.0519152879714966, + "learning_rate": 9.999920494119992e-06, + "loss": 0.8283, + "step": 233 + }, + { + "epoch": 0.012615915462583567, + "grad_norm": 1.243249773979187, + "learning_rate": 9.999919294051427e-06, + "loss": 0.9741, + "step": 234 + }, + { + "epoch": 0.012669829631227087, + "grad_norm": 1.1071687936782837, + "learning_rate": 9.999918084993681e-06, + "loss": 1.0402, + "step": 235 + }, + { + "epoch": 0.012723743799870606, + "grad_norm": 1.1224809885025024, + "learning_rate": 9.999916866946757e-06, + "loss": 0.8793, + "step": 236 + }, + { + "epoch": 0.012777657968514126, + "grad_norm": 1.0458532571792603, + "learning_rate": 9.999915639910656e-06, + "loss": 0.9855, + "step": 237 + }, + { + "epoch": 0.012831572137157646, + "grad_norm": 1.0610811710357666, + "learning_rate": 9.999914403885383e-06, + "loss": 0.8092, + "step": 238 + }, + { + "epoch": 0.012885486305801164, + "grad_norm": 1.2818992137908936, + "learning_rate": 9.999913158870936e-06, + "loss": 1.0101, + "step": 239 + }, + { + "epoch": 0.012939400474444683, + "grad_norm": 1.110400915145874, + "learning_rate": 9.999911904867319e-06, + "loss": 0.9782, + "step": 240 + }, + { + "epoch": 0.012993314643088203, + "grad_norm": 1.3290835618972778, + "learning_rate": 9.999910641874537e-06, + "loss": 1.0683, + "step": 241 + }, + { + "epoch": 0.013047228811731723, + "grad_norm": 1.1448980569839478, + "learning_rate": 9.999909369892588e-06, + "loss": 0.9223, + "step": 242 + }, + { + "epoch": 0.013101142980375242, + "grad_norm": 1.1710877418518066, + "learning_rate": 9.999908088921477e-06, + "loss": 0.8022, + "step": 243 + }, + { + "epoch": 0.013155057149018762, + "grad_norm": 1.1242793798446655, + "learning_rate": 9.999906798961207e-06, + "loss": 0.9238, + "step": 244 + }, + { 
+ "epoch": 0.013208971317662282, + "grad_norm": 1.0338802337646484, + "learning_rate": 9.999905500011778e-06, + "loss": 0.8386, + "step": 245 + }, + { + "epoch": 0.013262885486305801, + "grad_norm": 1.0910224914550781, + "learning_rate": 9.999904192073193e-06, + "loss": 0.937, + "step": 246 + }, + { + "epoch": 0.013316799654949321, + "grad_norm": 1.297788143157959, + "learning_rate": 9.999902875145453e-06, + "loss": 0.9054, + "step": 247 + }, + { + "epoch": 0.01337071382359284, + "grad_norm": 1.1317543983459473, + "learning_rate": 9.999901549228564e-06, + "loss": 0.9418, + "step": 248 + }, + { + "epoch": 0.01342462799223636, + "grad_norm": 1.0944132804870605, + "learning_rate": 9.999900214322526e-06, + "loss": 0.9445, + "step": 249 + }, + { + "epoch": 0.013478542160879878, + "grad_norm": 1.4942843914031982, + "learning_rate": 9.999898870427342e-06, + "loss": 0.8956, + "step": 250 + }, + { + "epoch": 0.013532456329523398, + "grad_norm": 1.0630019903182983, + "learning_rate": 9.999897517543013e-06, + "loss": 0.8381, + "step": 251 + }, + { + "epoch": 0.013586370498166918, + "grad_norm": 1.65073561668396, + "learning_rate": 9.999896155669544e-06, + "loss": 1.0148, + "step": 252 + }, + { + "epoch": 0.013640284666810438, + "grad_norm": 1.035731315612793, + "learning_rate": 9.999894784806936e-06, + "loss": 0.8092, + "step": 253 + }, + { + "epoch": 0.013694198835453957, + "grad_norm": 1.308863639831543, + "learning_rate": 9.99989340495519e-06, + "loss": 0.9742, + "step": 254 + }, + { + "epoch": 0.013748113004097477, + "grad_norm": 1.1512938737869263, + "learning_rate": 9.999892016114313e-06, + "loss": 0.8747, + "step": 255 + }, + { + "epoch": 0.013802027172740997, + "grad_norm": 0.9977009296417236, + "learning_rate": 9.9998906182843e-06, + "loss": 0.8183, + "step": 256 + }, + { + "epoch": 0.013855941341384516, + "grad_norm": 1.2228175401687622, + "learning_rate": 9.99988921146516e-06, + "loss": 0.9917, + "step": 257 + }, + { + "epoch": 0.013909855510028036, + "grad_norm": 
1.0753847360610962, + "learning_rate": 9.999887795656896e-06, + "loss": 1.0063, + "step": 258 + }, + { + "epoch": 0.013963769678671556, + "grad_norm": 1.0010429620742798, + "learning_rate": 9.999886370859506e-06, + "loss": 0.9315, + "step": 259 + }, + { + "epoch": 0.014017683847315074, + "grad_norm": 1.2038911581039429, + "learning_rate": 9.999884937072995e-06, + "loss": 0.8764, + "step": 260 + }, + { + "epoch": 0.014071598015958593, + "grad_norm": 1.1268917322158813, + "learning_rate": 9.999883494297365e-06, + "loss": 1.0059, + "step": 261 + }, + { + "epoch": 0.014125512184602113, + "grad_norm": 1.1053709983825684, + "learning_rate": 9.999882042532619e-06, + "loss": 0.8866, + "step": 262 + }, + { + "epoch": 0.014179426353245633, + "grad_norm": 1.091145396232605, + "learning_rate": 9.999880581778758e-06, + "loss": 1.0415, + "step": 263 + }, + { + "epoch": 0.014233340521889152, + "grad_norm": 1.0019958019256592, + "learning_rate": 9.999879112035786e-06, + "loss": 0.8177, + "step": 264 + }, + { + "epoch": 0.014287254690532672, + "grad_norm": 1.1044156551361084, + "learning_rate": 9.999877633303708e-06, + "loss": 0.9508, + "step": 265 + }, + { + "epoch": 0.014341168859176192, + "grad_norm": 0.9750218391418457, + "learning_rate": 9.999876145582524e-06, + "loss": 0.8501, + "step": 266 + }, + { + "epoch": 0.014395083027819711, + "grad_norm": 1.4015804529190063, + "learning_rate": 9.999874648872235e-06, + "loss": 0.9491, + "step": 267 + }, + { + "epoch": 0.014448997196463231, + "grad_norm": 1.066422939300537, + "learning_rate": 9.999873143172848e-06, + "loss": 1.0104, + "step": 268 + }, + { + "epoch": 0.01450291136510675, + "grad_norm": 1.1133167743682861, + "learning_rate": 9.99987162848436e-06, + "loss": 1.0142, + "step": 269 + }, + { + "epoch": 0.01455682553375027, + "grad_norm": 1.1259140968322754, + "learning_rate": 9.999870104806782e-06, + "loss": 0.9803, + "step": 270 + }, + { + "epoch": 0.014610739702393788, + "grad_norm": 1.0813393592834473, + "learning_rate": 
9.999868572140108e-06, + "loss": 0.8728, + "step": 271 + }, + { + "epoch": 0.014664653871037308, + "grad_norm": 0.9939939379692078, + "learning_rate": 9.999867030484347e-06, + "loss": 0.8826, + "step": 272 + }, + { + "epoch": 0.014718568039680828, + "grad_norm": 1.0081939697265625, + "learning_rate": 9.999865479839499e-06, + "loss": 0.8682, + "step": 273 + }, + { + "epoch": 0.014772482208324347, + "grad_norm": 1.0190658569335938, + "learning_rate": 9.999863920205567e-06, + "loss": 0.9094, + "step": 274 + }, + { + "epoch": 0.014826396376967867, + "grad_norm": 1.0702111721038818, + "learning_rate": 9.999862351582553e-06, + "loss": 0.9244, + "step": 275 + }, + { + "epoch": 0.014880310545611387, + "grad_norm": 1.0891972780227661, + "learning_rate": 9.999860773970461e-06, + "loss": 1.0318, + "step": 276 + }, + { + "epoch": 0.014934224714254906, + "grad_norm": 0.9788139462471008, + "learning_rate": 9.999859187369294e-06, + "loss": 0.8779, + "step": 277 + }, + { + "epoch": 0.014988138882898426, + "grad_norm": 1.0678125619888306, + "learning_rate": 9.999857591779055e-06, + "loss": 0.8962, + "step": 278 + }, + { + "epoch": 0.015042053051541946, + "grad_norm": 0.9882293343544006, + "learning_rate": 9.999855987199747e-06, + "loss": 0.9082, + "step": 279 + }, + { + "epoch": 0.015095967220185465, + "grad_norm": 0.9987571835517883, + "learning_rate": 9.999854373631371e-06, + "loss": 0.9708, + "step": 280 + }, + { + "epoch": 0.015149881388828985, + "grad_norm": 1.0238722562789917, + "learning_rate": 9.99985275107393e-06, + "loss": 0.9461, + "step": 281 + }, + { + "epoch": 0.015203795557472503, + "grad_norm": 0.9628013372421265, + "learning_rate": 9.999851119527431e-06, + "loss": 0.9412, + "step": 282 + }, + { + "epoch": 0.015257709726116023, + "grad_norm": 1.0021862983703613, + "learning_rate": 9.999849478991873e-06, + "loss": 0.8461, + "step": 283 + }, + { + "epoch": 0.015311623894759542, + "grad_norm": 0.9776142239570618, + "learning_rate": 9.99984782946726e-06, + "loss": 
0.962, + "step": 284 + }, + { + "epoch": 0.015365538063403062, + "grad_norm": 1.0114799737930298, + "learning_rate": 9.999846170953593e-06, + "loss": 0.8732, + "step": 285 + }, + { + "epoch": 0.015419452232046582, + "grad_norm": 0.9860401749610901, + "learning_rate": 9.999844503450879e-06, + "loss": 0.8204, + "step": 286 + }, + { + "epoch": 0.015473366400690101, + "grad_norm": 1.0743263959884644, + "learning_rate": 9.999842826959119e-06, + "loss": 0.9445, + "step": 287 + }, + { + "epoch": 0.015527280569333621, + "grad_norm": 1.0456606149673462, + "learning_rate": 9.999841141478315e-06, + "loss": 0.8869, + "step": 288 + }, + { + "epoch": 0.01558119473797714, + "grad_norm": 1.0299748182296753, + "learning_rate": 9.99983944700847e-06, + "loss": 0.9543, + "step": 289 + }, + { + "epoch": 0.01563510890662066, + "grad_norm": 1.0176036357879639, + "learning_rate": 9.99983774354959e-06, + "loss": 0.9672, + "step": 290 + }, + { + "epoch": 0.01568902307526418, + "grad_norm": 1.0023303031921387, + "learning_rate": 9.999836031101675e-06, + "loss": 0.9417, + "step": 291 + }, + { + "epoch": 0.015742937243907698, + "grad_norm": 0.9801005721092224, + "learning_rate": 9.99983430966473e-06, + "loss": 0.9376, + "step": 292 + }, + { + "epoch": 0.01579685141255122, + "grad_norm": 1.002906322479248, + "learning_rate": 9.999832579238756e-06, + "loss": 0.8973, + "step": 293 + }, + { + "epoch": 0.015850765581194737, + "grad_norm": 1.0014845132827759, + "learning_rate": 9.999830839823759e-06, + "loss": 0.9583, + "step": 294 + }, + { + "epoch": 0.01590467974983826, + "grad_norm": 1.0173449516296387, + "learning_rate": 9.999829091419739e-06, + "loss": 0.9006, + "step": 295 + }, + { + "epoch": 0.015958593918481777, + "grad_norm": 0.9779545664787292, + "learning_rate": 9.999827334026702e-06, + "loss": 0.9342, + "step": 296 + }, + { + "epoch": 0.016012508087125298, + "grad_norm": 0.9800315499305725, + "learning_rate": 9.999825567644648e-06, + "loss": 0.7948, + "step": 297 + }, + { + "epoch": 
0.016066422255768816, + "grad_norm": 0.9628249406814575, + "learning_rate": 9.999823792273583e-06, + "loss": 0.8415, + "step": 298 + }, + { + "epoch": 0.016120336424412334, + "grad_norm": 1.1227449178695679, + "learning_rate": 9.99982200791351e-06, + "loss": 0.9646, + "step": 299 + }, + { + "epoch": 0.016174250593055856, + "grad_norm": 1.1018567085266113, + "learning_rate": 9.99982021456443e-06, + "loss": 0.8647, + "step": 300 + }, + { + "epoch": 0.016228164761699373, + "grad_norm": 1.1017298698425293, + "learning_rate": 9.999818412226347e-06, + "loss": 0.8708, + "step": 301 + }, + { + "epoch": 0.016282078930342895, + "grad_norm": 1.084594488143921, + "learning_rate": 9.999816600899267e-06, + "loss": 0.9765, + "step": 302 + }, + { + "epoch": 0.016335993098986413, + "grad_norm": 1.3735941648483276, + "learning_rate": 9.99981478058319e-06, + "loss": 1.0253, + "step": 303 + }, + { + "epoch": 0.016389907267629934, + "grad_norm": 1.1644489765167236, + "learning_rate": 9.999812951278119e-06, + "loss": 0.8519, + "step": 304 + }, + { + "epoch": 0.016443821436273452, + "grad_norm": 1.0079474449157715, + "learning_rate": 9.99981111298406e-06, + "loss": 0.9422, + "step": 305 + }, + { + "epoch": 0.016497735604916974, + "grad_norm": 1.0046736001968384, + "learning_rate": 9.999809265701015e-06, + "loss": 0.7766, + "step": 306 + }, + { + "epoch": 0.01655164977356049, + "grad_norm": 1.0312374830245972, + "learning_rate": 9.999807409428987e-06, + "loss": 0.8844, + "step": 307 + }, + { + "epoch": 0.01660556394220401, + "grad_norm": 1.0419421195983887, + "learning_rate": 9.99980554416798e-06, + "loss": 0.8902, + "step": 308 + }, + { + "epoch": 0.01665947811084753, + "grad_norm": 1.2056832313537598, + "learning_rate": 9.999803669917996e-06, + "loss": 0.9842, + "step": 309 + }, + { + "epoch": 0.01671339227949105, + "grad_norm": 0.9645346403121948, + "learning_rate": 9.999801786679039e-06, + "loss": 0.7837, + "step": 310 + }, + { + "epoch": 0.01676730644813457, + "grad_norm": 
1.0259841680526733, + "learning_rate": 9.999799894451115e-06, + "loss": 0.8927, + "step": 311 + }, + { + "epoch": 0.016821220616778088, + "grad_norm": 0.9932212233543396, + "learning_rate": 9.999797993234224e-06, + "loss": 0.815, + "step": 312 + }, + { + "epoch": 0.01687513478542161, + "grad_norm": 1.0666078329086304, + "learning_rate": 9.99979608302837e-06, + "loss": 0.8245, + "step": 313 + }, + { + "epoch": 0.016929048954065128, + "grad_norm": 0.9566568732261658, + "learning_rate": 9.999794163833557e-06, + "loss": 0.851, + "step": 314 + }, + { + "epoch": 0.01698296312270865, + "grad_norm": 1.0056332349777222, + "learning_rate": 9.999792235649789e-06, + "loss": 0.8704, + "step": 315 + }, + { + "epoch": 0.017036877291352167, + "grad_norm": 1.036537528038025, + "learning_rate": 9.999790298477068e-06, + "loss": 0.9512, + "step": 316 + }, + { + "epoch": 0.01709079145999569, + "grad_norm": 1.1026023626327515, + "learning_rate": 9.9997883523154e-06, + "loss": 1.0007, + "step": 317 + }, + { + "epoch": 0.017144705628639206, + "grad_norm": 1.006659984588623, + "learning_rate": 9.999786397164786e-06, + "loss": 0.8992, + "step": 318 + }, + { + "epoch": 0.017198619797282724, + "grad_norm": 1.0100573301315308, + "learning_rate": 9.99978443302523e-06, + "loss": 0.9545, + "step": 319 + }, + { + "epoch": 0.017252533965926246, + "grad_norm": 1.000086784362793, + "learning_rate": 9.999782459896735e-06, + "loss": 0.8732, + "step": 320 + }, + { + "epoch": 0.017306448134569764, + "grad_norm": 1.2039650678634644, + "learning_rate": 9.999780477779306e-06, + "loss": 0.9881, + "step": 321 + }, + { + "epoch": 0.017360362303213285, + "grad_norm": 1.0316474437713623, + "learning_rate": 9.999778486672948e-06, + "loss": 0.8686, + "step": 322 + }, + { + "epoch": 0.017414276471856803, + "grad_norm": 1.1697666645050049, + "learning_rate": 9.999776486577661e-06, + "loss": 0.9185, + "step": 323 + }, + { + "epoch": 0.017468190640500324, + "grad_norm": 0.9523053169250488, + "learning_rate": 
9.999774477493451e-06, + "loss": 0.858, + "step": 324 + }, + { + "epoch": 0.017522104809143842, + "grad_norm": 0.9660015106201172, + "learning_rate": 9.999772459420319e-06, + "loss": 0.9964, + "step": 325 + }, + { + "epoch": 0.017576018977787364, + "grad_norm": 0.971128523349762, + "learning_rate": 9.999770432358271e-06, + "loss": 0.8999, + "step": 326 + }, + { + "epoch": 0.01762993314643088, + "grad_norm": 1.221969485282898, + "learning_rate": 9.999768396307312e-06, + "loss": 0.8628, + "step": 327 + }, + { + "epoch": 0.017683847315074403, + "grad_norm": 1.0868507623672485, + "learning_rate": 9.999766351267442e-06, + "loss": 1.0732, + "step": 328 + }, + { + "epoch": 0.01773776148371792, + "grad_norm": 0.9527992606163025, + "learning_rate": 9.999764297238666e-06, + "loss": 0.8221, + "step": 329 + }, + { + "epoch": 0.01779167565236144, + "grad_norm": 0.9969122409820557, + "learning_rate": 9.99976223422099e-06, + "loss": 0.9234, + "step": 330 + }, + { + "epoch": 0.01784558982100496, + "grad_norm": 0.9291784763336182, + "learning_rate": 9.999760162214415e-06, + "loss": 0.7839, + "step": 331 + }, + { + "epoch": 0.01789950398964848, + "grad_norm": 0.9766960144042969, + "learning_rate": 9.999758081218944e-06, + "loss": 0.7929, + "step": 332 + }, + { + "epoch": 0.017953418158292, + "grad_norm": 0.9536904692649841, + "learning_rate": 9.999755991234585e-06, + "loss": 0.9136, + "step": 333 + }, + { + "epoch": 0.018007332326935518, + "grad_norm": 1.0325372219085693, + "learning_rate": 9.999753892261337e-06, + "loss": 0.8367, + "step": 334 + }, + { + "epoch": 0.01806124649557904, + "grad_norm": 0.9486141800880432, + "learning_rate": 9.999751784299207e-06, + "loss": 0.8802, + "step": 335 + }, + { + "epoch": 0.018115160664222557, + "grad_norm": 0.9880577921867371, + "learning_rate": 9.999749667348198e-06, + "loss": 0.8597, + "step": 336 + }, + { + "epoch": 0.01816907483286608, + "grad_norm": 1.043199896812439, + "learning_rate": 9.999747541408312e-06, + "loss": 0.9142, + "step": 
337 + }, + { + "epoch": 0.018222989001509596, + "grad_norm": 1.0606465339660645, + "learning_rate": 9.999745406479554e-06, + "loss": 0.9876, + "step": 338 + }, + { + "epoch": 0.018276903170153118, + "grad_norm": 1.139449954032898, + "learning_rate": 9.999743262561929e-06, + "loss": 0.7773, + "step": 339 + }, + { + "epoch": 0.018330817338796636, + "grad_norm": 1.1416115760803223, + "learning_rate": 9.99974110965544e-06, + "loss": 0.9566, + "step": 340 + }, + { + "epoch": 0.018384731507440154, + "grad_norm": 1.0145153999328613, + "learning_rate": 9.99973894776009e-06, + "loss": 0.9543, + "step": 341 + }, + { + "epoch": 0.018438645676083675, + "grad_norm": 0.950528621673584, + "learning_rate": 9.999736776875885e-06, + "loss": 0.8007, + "step": 342 + }, + { + "epoch": 0.018492559844727193, + "grad_norm": 0.9080097079277039, + "learning_rate": 9.999734597002826e-06, + "loss": 0.8273, + "step": 343 + }, + { + "epoch": 0.018546474013370715, + "grad_norm": 1.0038888454437256, + "learning_rate": 9.99973240814092e-06, + "loss": 0.9394, + "step": 344 + }, + { + "epoch": 0.018600388182014232, + "grad_norm": 1.05253267288208, + "learning_rate": 9.999730210290168e-06, + "loss": 0.9485, + "step": 345 + }, + { + "epoch": 0.018654302350657754, + "grad_norm": 0.9396592974662781, + "learning_rate": 9.999728003450577e-06, + "loss": 0.8943, + "step": 346 + }, + { + "epoch": 0.018708216519301272, + "grad_norm": 1.149387240409851, + "learning_rate": 9.999725787622148e-06, + "loss": 0.8566, + "step": 347 + }, + { + "epoch": 0.018762130687944793, + "grad_norm": 1.1573290824890137, + "learning_rate": 9.999723562804887e-06, + "loss": 0.9641, + "step": 348 + }, + { + "epoch": 0.01881604485658831, + "grad_norm": 1.0217385292053223, + "learning_rate": 9.999721328998797e-06, + "loss": 0.9555, + "step": 349 + }, + { + "epoch": 0.018869959025231833, + "grad_norm": 1.034690499305725, + "learning_rate": 9.999719086203884e-06, + "loss": 0.9407, + "step": 350 + }, + { + "epoch": 0.01892387319387535, + 
"grad_norm": 0.9819002151489258, + "learning_rate": 9.999716834420148e-06, + "loss": 0.9104, + "step": 351 + }, + { + "epoch": 0.01897778736251887, + "grad_norm": 1.0459688901901245, + "learning_rate": 9.999714573647597e-06, + "loss": 0.9296, + "step": 352 + }, + { + "epoch": 0.01903170153116239, + "grad_norm": 0.9575183391571045, + "learning_rate": 9.999712303886232e-06, + "loss": 0.8517, + "step": 353 + }, + { + "epoch": 0.019085615699805908, + "grad_norm": 1.0018881559371948, + "learning_rate": 9.99971002513606e-06, + "loss": 0.9208, + "step": 354 + }, + { + "epoch": 0.01913952986844943, + "grad_norm": 1.0291972160339355, + "learning_rate": 9.999707737397085e-06, + "loss": 0.8765, + "step": 355 + }, + { + "epoch": 0.019193444037092947, + "grad_norm": 1.0081498622894287, + "learning_rate": 9.999705440669306e-06, + "loss": 0.9204, + "step": 356 + }, + { + "epoch": 0.01924735820573647, + "grad_norm": 0.956950843334198, + "learning_rate": 9.999703134952733e-06, + "loss": 0.8058, + "step": 357 + }, + { + "epoch": 0.019301272374379987, + "grad_norm": 1.1130229234695435, + "learning_rate": 9.999700820247369e-06, + "loss": 0.8202, + "step": 358 + }, + { + "epoch": 0.019355186543023508, + "grad_norm": 1.047211766242981, + "learning_rate": 9.999698496553216e-06, + "loss": 0.9357, + "step": 359 + }, + { + "epoch": 0.019409100711667026, + "grad_norm": 1.0225415229797363, + "learning_rate": 9.99969616387028e-06, + "loss": 0.8306, + "step": 360 + }, + { + "epoch": 0.019463014880310544, + "grad_norm": 1.060727596282959, + "learning_rate": 9.999693822198564e-06, + "loss": 0.9178, + "step": 361 + }, + { + "epoch": 0.019516929048954065, + "grad_norm": 1.0743412971496582, + "learning_rate": 9.999691471538074e-06, + "loss": 0.8761, + "step": 362 + }, + { + "epoch": 0.019570843217597583, + "grad_norm": 1.2229491472244263, + "learning_rate": 9.99968911188881e-06, + "loss": 1.0738, + "step": 363 + }, + { + "epoch": 0.019624757386241105, + "grad_norm": 0.9889073967933655, + 
"learning_rate": 9.999686743250783e-06, + "loss": 0.9458, + "step": 364 + }, + { + "epoch": 0.019678671554884623, + "grad_norm": 1.0398520231246948, + "learning_rate": 9.999684365623992e-06, + "loss": 0.9096, + "step": 365 + }, + { + "epoch": 0.019732585723528144, + "grad_norm": 1.0613081455230713, + "learning_rate": 9.999681979008442e-06, + "loss": 0.9312, + "step": 366 + }, + { + "epoch": 0.019786499892171662, + "grad_norm": 0.946211040019989, + "learning_rate": 9.99967958340414e-06, + "loss": 0.9208, + "step": 367 + }, + { + "epoch": 0.019840414060815183, + "grad_norm": 1.1298933029174805, + "learning_rate": 9.999677178811087e-06, + "loss": 0.9378, + "step": 368 + }, + { + "epoch": 0.0198943282294587, + "grad_norm": 1.1042351722717285, + "learning_rate": 9.999674765229288e-06, + "loss": 0.9487, + "step": 369 + }, + { + "epoch": 0.019948242398102223, + "grad_norm": 1.0717188119888306, + "learning_rate": 9.999672342658751e-06, + "loss": 0.939, + "step": 370 + }, + { + "epoch": 0.02000215656674574, + "grad_norm": 1.0936871767044067, + "learning_rate": 9.999669911099474e-06, + "loss": 1.1361, + "step": 371 + }, + { + "epoch": 0.02005607073538926, + "grad_norm": 1.0650005340576172, + "learning_rate": 9.999667470551466e-06, + "loss": 0.9709, + "step": 372 + }, + { + "epoch": 0.02010998490403278, + "grad_norm": 1.0154083967208862, + "learning_rate": 9.999665021014731e-06, + "loss": 0.9422, + "step": 373 + }, + { + "epoch": 0.020163899072676298, + "grad_norm": 1.1382607221603394, + "learning_rate": 9.999662562489272e-06, + "loss": 0.984, + "step": 374 + }, + { + "epoch": 0.02021781324131982, + "grad_norm": 0.9372896552085876, + "learning_rate": 9.999660094975095e-06, + "loss": 0.9857, + "step": 375 + }, + { + "epoch": 0.020271727409963337, + "grad_norm": 1.1777011156082153, + "learning_rate": 9.999657618472203e-06, + "loss": 0.9731, + "step": 376 + }, + { + "epoch": 0.02032564157860686, + "grad_norm": 0.9054237604141235, + "learning_rate": 9.9996551329806e-06, + "loss": 
0.9104, + "step": 377 + }, + { + "epoch": 0.020379555747250377, + "grad_norm": 0.9255661964416504, + "learning_rate": 9.999652638500292e-06, + "loss": 0.8632, + "step": 378 + }, + { + "epoch": 0.020433469915893898, + "grad_norm": 0.9440998435020447, + "learning_rate": 9.999650135031282e-06, + "loss": 0.8945, + "step": 379 + }, + { + "epoch": 0.020487384084537416, + "grad_norm": 0.9822732210159302, + "learning_rate": 9.999647622573577e-06, + "loss": 0.8874, + "step": 380 + }, + { + "epoch": 0.020541298253180938, + "grad_norm": 1.1294387578964233, + "learning_rate": 9.999645101127179e-06, + "loss": 0.9892, + "step": 381 + }, + { + "epoch": 0.020595212421824455, + "grad_norm": 1.0458290576934814, + "learning_rate": 9.999642570692094e-06, + "loss": 0.9163, + "step": 382 + }, + { + "epoch": 0.020649126590467973, + "grad_norm": 0.8124557733535767, + "learning_rate": 9.999640031268326e-06, + "loss": 0.6927, + "step": 383 + }, + { + "epoch": 0.020703040759111495, + "grad_norm": 1.1053259372711182, + "learning_rate": 9.999637482855878e-06, + "loss": 0.8651, + "step": 384 + }, + { + "epoch": 0.020756954927755013, + "grad_norm": 1.1280632019042969, + "learning_rate": 9.999634925454757e-06, + "loss": 0.9708, + "step": 385 + }, + { + "epoch": 0.020810869096398534, + "grad_norm": 0.9916180372238159, + "learning_rate": 9.999632359064965e-06, + "loss": 0.9081, + "step": 386 + }, + { + "epoch": 0.020864783265042052, + "grad_norm": 1.0430771112442017, + "learning_rate": 9.99962978368651e-06, + "loss": 0.9837, + "step": 387 + }, + { + "epoch": 0.020918697433685574, + "grad_norm": 1.031343698501587, + "learning_rate": 9.999627199319398e-06, + "loss": 0.9156, + "step": 388 + }, + { + "epoch": 0.02097261160232909, + "grad_norm": 1.0157191753387451, + "learning_rate": 9.999624605963627e-06, + "loss": 0.9379, + "step": 389 + }, + { + "epoch": 0.021026525770972613, + "grad_norm": 0.9524544477462769, + "learning_rate": 9.999622003619204e-06, + "loss": 0.8448, + "step": 390 + }, + { + 
"epoch": 0.02108043993961613, + "grad_norm": 1.091670036315918, + "learning_rate": 9.999619392286137e-06, + "loss": 0.9794, + "step": 391 + }, + { + "epoch": 0.021134354108259652, + "grad_norm": 1.0502233505249023, + "learning_rate": 9.999616771964429e-06, + "loss": 1.0047, + "step": 392 + }, + { + "epoch": 0.02118826827690317, + "grad_norm": 1.2087476253509521, + "learning_rate": 9.999614142654084e-06, + "loss": 0.8964, + "step": 393 + }, + { + "epoch": 0.021242182445546688, + "grad_norm": 1.0264590978622437, + "learning_rate": 9.999611504355106e-06, + "loss": 0.8608, + "step": 394 + }, + { + "epoch": 0.02129609661419021, + "grad_norm": 0.9883281588554382, + "learning_rate": 9.999608857067503e-06, + "loss": 0.9109, + "step": 395 + }, + { + "epoch": 0.021350010782833728, + "grad_norm": 0.9913623332977295, + "learning_rate": 9.999606200791276e-06, + "loss": 0.8993, + "step": 396 + }, + { + "epoch": 0.02140392495147725, + "grad_norm": 1.019178867340088, + "learning_rate": 9.999603535526432e-06, + "loss": 0.9115, + "step": 397 + }, + { + "epoch": 0.021457839120120767, + "grad_norm": 0.9756026864051819, + "learning_rate": 9.999600861272974e-06, + "loss": 0.834, + "step": 398 + }, + { + "epoch": 0.02151175328876429, + "grad_norm": 0.9956341981887817, + "learning_rate": 9.999598178030909e-06, + "loss": 0.8756, + "step": 399 + }, + { + "epoch": 0.021565667457407806, + "grad_norm": 1.0267717838287354, + "learning_rate": 9.999595485800239e-06, + "loss": 0.9427, + "step": 400 + }, + { + "epoch": 0.021619581626051328, + "grad_norm": 1.061139464378357, + "learning_rate": 9.999592784580974e-06, + "loss": 0.9835, + "step": 401 + }, + { + "epoch": 0.021673495794694846, + "grad_norm": 0.9970353245735168, + "learning_rate": 9.999590074373114e-06, + "loss": 0.8946, + "step": 402 + }, + { + "epoch": 0.021727409963338367, + "grad_norm": 1.056242823600769, + "learning_rate": 9.999587355176664e-06, + "loss": 0.9076, + "step": 403 + }, + { + "epoch": 0.021781324131981885, + "grad_norm": 
1.0285427570343018, + "learning_rate": 9.999584626991632e-06, + "loss": 0.8506, + "step": 404 + }, + { + "epoch": 0.021835238300625403, + "grad_norm": 1.0026901960372925, + "learning_rate": 9.99958188981802e-06, + "loss": 0.8457, + "step": 405 + }, + { + "epoch": 0.021889152469268924, + "grad_norm": 0.8921003341674805, + "learning_rate": 9.999579143655833e-06, + "loss": 0.8215, + "step": 406 + }, + { + "epoch": 0.021943066637912442, + "grad_norm": 1.2816855907440186, + "learning_rate": 9.99957638850508e-06, + "loss": 0.8779, + "step": 407 + }, + { + "epoch": 0.021996980806555964, + "grad_norm": 1.4713681936264038, + "learning_rate": 9.99957362436576e-06, + "loss": 0.8581, + "step": 408 + }, + { + "epoch": 0.02205089497519948, + "grad_norm": 1.0117568969726562, + "learning_rate": 9.999570851237883e-06, + "loss": 0.8865, + "step": 409 + }, + { + "epoch": 0.022104809143843003, + "grad_norm": 0.9530962705612183, + "learning_rate": 9.99956806912145e-06, + "loss": 0.8888, + "step": 410 + }, + { + "epoch": 0.02215872331248652, + "grad_norm": 0.865692675113678, + "learning_rate": 9.99956527801647e-06, + "loss": 0.8075, + "step": 411 + }, + { + "epoch": 0.022212637481130042, + "grad_norm": 0.9613220691680908, + "learning_rate": 9.999562477922944e-06, + "loss": 0.9289, + "step": 412 + }, + { + "epoch": 0.02226655164977356, + "grad_norm": 0.9419745802879333, + "learning_rate": 9.99955966884088e-06, + "loss": 0.8758, + "step": 413 + }, + { + "epoch": 0.02232046581841708, + "grad_norm": 1.0120573043823242, + "learning_rate": 9.999556850770282e-06, + "loss": 0.9014, + "step": 414 + }, + { + "epoch": 0.0223743799870606, + "grad_norm": 0.9833963513374329, + "learning_rate": 9.999554023711155e-06, + "loss": 0.9354, + "step": 415 + }, + { + "epoch": 0.022428294155704118, + "grad_norm": 0.9058681130409241, + "learning_rate": 9.999551187663505e-06, + "loss": 0.9201, + "step": 416 + }, + { + "epoch": 0.02248220832434764, + "grad_norm": 1.0103633403778076, + "learning_rate": 
9.999548342627334e-06, + "loss": 0.9023, + "step": 417 + }, + { + "epoch": 0.022536122492991157, + "grad_norm": 0.8671039342880249, + "learning_rate": 9.99954548860265e-06, + "loss": 0.7263, + "step": 418 + }, + { + "epoch": 0.02259003666163468, + "grad_norm": 1.0967090129852295, + "learning_rate": 9.999542625589461e-06, + "loss": 1.0616, + "step": 419 + }, + { + "epoch": 0.022643950830278196, + "grad_norm": 0.9032139778137207, + "learning_rate": 9.999539753587764e-06, + "loss": 0.782, + "step": 420 + }, + { + "epoch": 0.022697864998921718, + "grad_norm": 0.9532387256622314, + "learning_rate": 9.99953687259757e-06, + "loss": 0.9628, + "step": 421 + }, + { + "epoch": 0.022751779167565236, + "grad_norm": 0.9732246994972229, + "learning_rate": 9.999533982618885e-06, + "loss": 0.8682, + "step": 422 + }, + { + "epoch": 0.022805693336208757, + "grad_norm": 0.9160019159317017, + "learning_rate": 9.99953108365171e-06, + "loss": 0.9051, + "step": 423 + }, + { + "epoch": 0.022859607504852275, + "grad_norm": 1.0100488662719727, + "learning_rate": 9.999528175696054e-06, + "loss": 0.9836, + "step": 424 + }, + { + "epoch": 0.022913521673495793, + "grad_norm": 1.0130014419555664, + "learning_rate": 9.99952525875192e-06, + "loss": 0.8653, + "step": 425 + }, + { + "epoch": 0.022967435842139314, + "grad_norm": 0.9726247787475586, + "learning_rate": 9.999522332819313e-06, + "loss": 0.8761, + "step": 426 + }, + { + "epoch": 0.023021350010782832, + "grad_norm": 0.9457972049713135, + "learning_rate": 9.99951939789824e-06, + "loss": 0.8792, + "step": 427 + }, + { + "epoch": 0.023075264179426354, + "grad_norm": 1.083130121231079, + "learning_rate": 9.999516453988706e-06, + "loss": 0.9035, + "step": 428 + }, + { + "epoch": 0.023129178348069872, + "grad_norm": 0.9195771217346191, + "learning_rate": 9.999513501090714e-06, + "loss": 0.8586, + "step": 429 + }, + { + "epoch": 0.023183092516713393, + "grad_norm": 0.983346700668335, + "learning_rate": 9.999510539204273e-06, + "loss": 0.8335, + 
"step": 430 + }, + { + "epoch": 0.02323700668535691, + "grad_norm": 1.0524029731750488, + "learning_rate": 9.999507568329386e-06, + "loss": 0.838, + "step": 431 + }, + { + "epoch": 0.023290920854000433, + "grad_norm": 1.0267860889434814, + "learning_rate": 9.999504588466058e-06, + "loss": 0.9345, + "step": 432 + }, + { + "epoch": 0.02334483502264395, + "grad_norm": 1.025707483291626, + "learning_rate": 9.999501599614294e-06, + "loss": 0.9042, + "step": 433 + }, + { + "epoch": 0.023398749191287472, + "grad_norm": 0.9739174842834473, + "learning_rate": 9.999498601774101e-06, + "loss": 0.7433, + "step": 434 + }, + { + "epoch": 0.02345266335993099, + "grad_norm": 0.9468310475349426, + "learning_rate": 9.999495594945486e-06, + "loss": 0.8447, + "step": 435 + }, + { + "epoch": 0.023506577528574508, + "grad_norm": 0.9820529818534851, + "learning_rate": 9.99949257912845e-06, + "loss": 0.8842, + "step": 436 + }, + { + "epoch": 0.02356049169721803, + "grad_norm": 0.998515784740448, + "learning_rate": 9.999489554323e-06, + "loss": 0.9226, + "step": 437 + }, + { + "epoch": 0.023614405865861547, + "grad_norm": 0.9819791316986084, + "learning_rate": 9.999486520529144e-06, + "loss": 0.8559, + "step": 438 + }, + { + "epoch": 0.02366832003450507, + "grad_norm": 0.9468326568603516, + "learning_rate": 9.999483477746884e-06, + "loss": 0.8064, + "step": 439 + }, + { + "epoch": 0.023722234203148587, + "grad_norm": 1.0087614059448242, + "learning_rate": 9.999480425976229e-06, + "loss": 0.9232, + "step": 440 + }, + { + "epoch": 0.023776148371792108, + "grad_norm": 0.9446098208427429, + "learning_rate": 9.99947736521718e-06, + "loss": 0.8511, + "step": 441 + }, + { + "epoch": 0.023830062540435626, + "grad_norm": 1.0966850519180298, + "learning_rate": 9.999474295469746e-06, + "loss": 0.9929, + "step": 442 + }, + { + "epoch": 0.023883976709079147, + "grad_norm": 0.8858770728111267, + "learning_rate": 9.99947121673393e-06, + "loss": 0.8492, + "step": 443 + }, + { + "epoch": 
0.023937890877722665, + "grad_norm": 1.083717703819275, + "learning_rate": 9.999468129009742e-06, + "loss": 0.9948, + "step": 444 + }, + { + "epoch": 0.023991805046366187, + "grad_norm": 1.0251178741455078, + "learning_rate": 9.999465032297184e-06, + "loss": 0.8769, + "step": 445 + }, + { + "epoch": 0.024045719215009705, + "grad_norm": 0.9331875443458557, + "learning_rate": 9.999461926596261e-06, + "loss": 0.8663, + "step": 446 + }, + { + "epoch": 0.024099633383653223, + "grad_norm": 0.8941493034362793, + "learning_rate": 9.999458811906979e-06, + "loss": 0.8172, + "step": 447 + }, + { + "epoch": 0.024153547552296744, + "grad_norm": 0.9978699684143066, + "learning_rate": 9.999455688229347e-06, + "loss": 0.9303, + "step": 448 + }, + { + "epoch": 0.024207461720940262, + "grad_norm": 0.8835211992263794, + "learning_rate": 9.999452555563366e-06, + "loss": 0.8921, + "step": 449 + }, + { + "epoch": 0.024261375889583783, + "grad_norm": 0.9061810970306396, + "learning_rate": 9.999449413909043e-06, + "loss": 0.8201, + "step": 450 + }, + { + "epoch": 0.0243152900582273, + "grad_norm": 1.0061571598052979, + "learning_rate": 9.999446263266385e-06, + "loss": 0.8506, + "step": 451 + }, + { + "epoch": 0.024369204226870823, + "grad_norm": 0.9286402463912964, + "learning_rate": 9.999443103635398e-06, + "loss": 0.8532, + "step": 452 + }, + { + "epoch": 0.02442311839551434, + "grad_norm": 1.0919772386550903, + "learning_rate": 9.999439935016087e-06, + "loss": 0.9466, + "step": 453 + }, + { + "epoch": 0.024477032564157862, + "grad_norm": 1.0552513599395752, + "learning_rate": 9.999436757408453e-06, + "loss": 0.8406, + "step": 454 + }, + { + "epoch": 0.02453094673280138, + "grad_norm": 0.9604331851005554, + "learning_rate": 9.999433570812511e-06, + "loss": 0.8928, + "step": 455 + }, + { + "epoch": 0.0245848609014449, + "grad_norm": 1.0126323699951172, + "learning_rate": 9.999430375228259e-06, + "loss": 0.924, + "step": 456 + }, + { + "epoch": 0.02463877507008842, + "grad_norm": 
1.0540791749954224, + "learning_rate": 9.999427170655707e-06, + "loss": 0.9656, + "step": 457 + }, + { + "epoch": 0.024692689238731937, + "grad_norm": 0.8622417449951172, + "learning_rate": 9.999423957094857e-06, + "loss": 0.7428, + "step": 458 + }, + { + "epoch": 0.02474660340737546, + "grad_norm": 1.106581211090088, + "learning_rate": 9.999420734545719e-06, + "loss": 0.9258, + "step": 459 + }, + { + "epoch": 0.024800517576018977, + "grad_norm": 0.990807294845581, + "learning_rate": 9.999417503008296e-06, + "loss": 0.9083, + "step": 460 + }, + { + "epoch": 0.024854431744662498, + "grad_norm": 0.9302589893341064, + "learning_rate": 9.999414262482594e-06, + "loss": 0.8654, + "step": 461 + }, + { + "epoch": 0.024908345913306016, + "grad_norm": 1.0218255519866943, + "learning_rate": 9.999411012968621e-06, + "loss": 0.8996, + "step": 462 + }, + { + "epoch": 0.024962260081949537, + "grad_norm": 0.976108193397522, + "learning_rate": 9.99940775446638e-06, + "loss": 0.9423, + "step": 463 + }, + { + "epoch": 0.025016174250593055, + "grad_norm": 1.1027617454528809, + "learning_rate": 9.99940448697588e-06, + "loss": 1.0407, + "step": 464 + }, + { + "epoch": 0.025070088419236577, + "grad_norm": 1.0148764848709106, + "learning_rate": 9.999401210497122e-06, + "loss": 0.9418, + "step": 465 + }, + { + "epoch": 0.025124002587880095, + "grad_norm": 1.0120681524276733, + "learning_rate": 9.999397925030116e-06, + "loss": 0.92, + "step": 466 + }, + { + "epoch": 0.025177916756523613, + "grad_norm": 1.1855127811431885, + "learning_rate": 9.999394630574868e-06, + "loss": 0.9285, + "step": 467 + }, + { + "epoch": 0.025231830925167134, + "grad_norm": 1.8014320135116577, + "learning_rate": 9.999391327131383e-06, + "loss": 0.979, + "step": 468 + }, + { + "epoch": 0.025285745093810652, + "grad_norm": 1.1568403244018555, + "learning_rate": 9.999388014699664e-06, + "loss": 0.9574, + "step": 469 + }, + { + "epoch": 0.025339659262454173, + "grad_norm": 1.2544865608215332, + "learning_rate": 
9.99938469327972e-06, + "loss": 0.8356, + "step": 470 + }, + { + "epoch": 0.02539357343109769, + "grad_norm": 1.8647997379302979, + "learning_rate": 9.99938136287156e-06, + "loss": 0.9181, + "step": 471 + }, + { + "epoch": 0.025447487599741213, + "grad_norm": 0.9942222237586975, + "learning_rate": 9.999378023475184e-06, + "loss": 0.9297, + "step": 472 + }, + { + "epoch": 0.02550140176838473, + "grad_norm": 0.9839766621589661, + "learning_rate": 9.9993746750906e-06, + "loss": 0.9181, + "step": 473 + }, + { + "epoch": 0.025555315937028252, + "grad_norm": 0.9353258609771729, + "learning_rate": 9.999371317717817e-06, + "loss": 0.8789, + "step": 474 + }, + { + "epoch": 0.02560923010567177, + "grad_norm": 0.9256170988082886, + "learning_rate": 9.999367951356838e-06, + "loss": 0.8725, + "step": 475 + }, + { + "epoch": 0.02566314427431529, + "grad_norm": 1.1102124452590942, + "learning_rate": 9.999364576007669e-06, + "loss": 0.9818, + "step": 476 + }, + { + "epoch": 0.02571705844295881, + "grad_norm": 1.04171884059906, + "learning_rate": 9.999361191670316e-06, + "loss": 0.9275, + "step": 477 + }, + { + "epoch": 0.025770972611602327, + "grad_norm": 0.9670290350914001, + "learning_rate": 9.999357798344787e-06, + "loss": 0.8919, + "step": 478 + }, + { + "epoch": 0.02582488678024585, + "grad_norm": 1.0543723106384277, + "learning_rate": 9.999354396031085e-06, + "loss": 0.9356, + "step": 479 + }, + { + "epoch": 0.025878800948889367, + "grad_norm": 1.1368457078933716, + "learning_rate": 9.99935098472922e-06, + "loss": 0.9387, + "step": 480 + }, + { + "epoch": 0.025932715117532888, + "grad_norm": 1.0627872943878174, + "learning_rate": 9.999347564439196e-06, + "loss": 1.0047, + "step": 481 + }, + { + "epoch": 0.025986629286176406, + "grad_norm": 0.9553730487823486, + "learning_rate": 9.999344135161018e-06, + "loss": 0.8845, + "step": 482 + }, + { + "epoch": 0.026040543454819928, + "grad_norm": 0.9605830907821655, + "learning_rate": 9.999340696894694e-06, + "loss": 0.8816, + 
"step": 483 + }, + { + "epoch": 0.026094457623463446, + "grad_norm": 1.0464140176773071, + "learning_rate": 9.999337249640232e-06, + "loss": 0.9344, + "step": 484 + }, + { + "epoch": 0.026148371792106967, + "grad_norm": 1.0667988061904907, + "learning_rate": 9.999333793397635e-06, + "loss": 0.8834, + "step": 485 + }, + { + "epoch": 0.026202285960750485, + "grad_norm": 0.8996486663818359, + "learning_rate": 9.999330328166908e-06, + "loss": 0.8247, + "step": 486 + }, + { + "epoch": 0.026256200129394006, + "grad_norm": 1.0483838319778442, + "learning_rate": 9.99932685394806e-06, + "loss": 0.9414, + "step": 487 + }, + { + "epoch": 0.026310114298037524, + "grad_norm": 1.2089953422546387, + "learning_rate": 9.999323370741097e-06, + "loss": 1.0913, + "step": 488 + }, + { + "epoch": 0.026364028466681042, + "grad_norm": 1.074291467666626, + "learning_rate": 9.999319878546025e-06, + "loss": 0.8882, + "step": 489 + }, + { + "epoch": 0.026417942635324564, + "grad_norm": 1.0076494216918945, + "learning_rate": 9.99931637736285e-06, + "loss": 0.8393, + "step": 490 + }, + { + "epoch": 0.02647185680396808, + "grad_norm": 1.2263407707214355, + "learning_rate": 9.99931286719158e-06, + "loss": 0.955, + "step": 491 + }, + { + "epoch": 0.026525770972611603, + "grad_norm": 0.9093664884567261, + "learning_rate": 9.999309348032218e-06, + "loss": 0.8366, + "step": 492 + }, + { + "epoch": 0.02657968514125512, + "grad_norm": 1.0704407691955566, + "learning_rate": 9.999305819884772e-06, + "loss": 0.981, + "step": 493 + }, + { + "epoch": 0.026633599309898642, + "grad_norm": 1.2105270624160767, + "learning_rate": 9.999302282749249e-06, + "loss": 0.8896, + "step": 494 + }, + { + "epoch": 0.02668751347854216, + "grad_norm": 1.0142449140548706, + "learning_rate": 9.999298736625654e-06, + "loss": 0.8627, + "step": 495 + }, + { + "epoch": 0.02674142764718568, + "grad_norm": 1.0887057781219482, + "learning_rate": 9.999295181513994e-06, + "loss": 0.8884, + "step": 496 + }, + { + "epoch": 
0.0267953418158292, + "grad_norm": 0.9958952069282532, + "learning_rate": 9.999291617414277e-06, + "loss": 0.7768, + "step": 497 + }, + { + "epoch": 0.02684925598447272, + "grad_norm": 0.8576722741127014, + "learning_rate": 9.999288044326508e-06, + "loss": 0.715, + "step": 498 + }, + { + "epoch": 0.02690317015311624, + "grad_norm": 1.058148741722107, + "learning_rate": 9.999284462250691e-06, + "loss": 0.8693, + "step": 499 + }, + { + "epoch": 0.026957084321759757, + "grad_norm": 0.9429569244384766, + "learning_rate": 9.999280871186837e-06, + "loss": 0.8883, + "step": 500 + }, + { + "epoch": 0.02701099849040328, + "grad_norm": 0.9450993537902832, + "learning_rate": 9.999277271134948e-06, + "loss": 0.9376, + "step": 501 + }, + { + "epoch": 0.027064912659046796, + "grad_norm": 1.0307891368865967, + "learning_rate": 9.999273662095035e-06, + "loss": 0.9098, + "step": 502 + }, + { + "epoch": 0.027118826827690318, + "grad_norm": 0.9515891671180725, + "learning_rate": 9.999270044067101e-06, + "loss": 0.8854, + "step": 503 + }, + { + "epoch": 0.027172740996333836, + "grad_norm": 1.1173255443572998, + "learning_rate": 9.999266417051154e-06, + "loss": 0.7977, + "step": 504 + }, + { + "epoch": 0.027226655164977357, + "grad_norm": 1.028194785118103, + "learning_rate": 9.9992627810472e-06, + "loss": 0.9585, + "step": 505 + }, + { + "epoch": 0.027280569333620875, + "grad_norm": 1.0855528116226196, + "learning_rate": 9.999259136055245e-06, + "loss": 0.9807, + "step": 506 + }, + { + "epoch": 0.027334483502264396, + "grad_norm": 1.1148236989974976, + "learning_rate": 9.999255482075298e-06, + "loss": 0.9672, + "step": 507 + }, + { + "epoch": 0.027388397670907914, + "grad_norm": 0.9697713255882263, + "learning_rate": 9.999251819107364e-06, + "loss": 0.9073, + "step": 508 + }, + { + "epoch": 0.027442311839551436, + "grad_norm": 0.9802384972572327, + "learning_rate": 9.999248147151448e-06, + "loss": 0.8704, + "step": 509 + }, + { + "epoch": 0.027496226008194954, + "grad_norm": 
0.963330090045929, + "learning_rate": 9.999244466207559e-06, + "loss": 0.9312, + "step": 510 + }, + { + "epoch": 0.02755014017683847, + "grad_norm": 0.8776309490203857, + "learning_rate": 9.999240776275703e-06, + "loss": 0.8068, + "step": 511 + }, + { + "epoch": 0.027604054345481993, + "grad_norm": 1.1159353256225586, + "learning_rate": 9.999237077355886e-06, + "loss": 0.8164, + "step": 512 + }, + { + "epoch": 0.02765796851412551, + "grad_norm": 1.004232406616211, + "learning_rate": 9.999233369448115e-06, + "loss": 0.8666, + "step": 513 + }, + { + "epoch": 0.027711882682769032, + "grad_norm": 1.0300110578536987, + "learning_rate": 9.999229652552395e-06, + "loss": 0.8774, + "step": 514 + }, + { + "epoch": 0.02776579685141255, + "grad_norm": 0.8823155164718628, + "learning_rate": 9.999225926668736e-06, + "loss": 0.7579, + "step": 515 + }, + { + "epoch": 0.027819711020056072, + "grad_norm": 0.938956618309021, + "learning_rate": 9.999222191797144e-06, + "loss": 0.8749, + "step": 516 + }, + { + "epoch": 0.02787362518869959, + "grad_norm": 0.9111800789833069, + "learning_rate": 9.999218447937624e-06, + "loss": 0.8915, + "step": 517 + }, + { + "epoch": 0.02792753935734311, + "grad_norm": 0.971813440322876, + "learning_rate": 9.999214695090182e-06, + "loss": 0.9038, + "step": 518 + }, + { + "epoch": 0.02798145352598663, + "grad_norm": 0.9159868359565735, + "learning_rate": 9.999210933254828e-06, + "loss": 0.8726, + "step": 519 + }, + { + "epoch": 0.028035367694630147, + "grad_norm": 1.0223439931869507, + "learning_rate": 9.999207162431566e-06, + "loss": 0.8738, + "step": 520 + }, + { + "epoch": 0.02808928186327367, + "grad_norm": 0.9844004511833191, + "learning_rate": 9.999203382620404e-06, + "loss": 0.8815, + "step": 521 + }, + { + "epoch": 0.028143196031917186, + "grad_norm": 1.1636719703674316, + "learning_rate": 9.99919959382135e-06, + "loss": 0.8781, + "step": 522 + }, + { + "epoch": 0.028197110200560708, + "grad_norm": 0.9637702703475952, + "learning_rate": 
9.999195796034407e-06, + "loss": 0.8491, + "step": 523 + }, + { + "epoch": 0.028251024369204226, + "grad_norm": 0.975931704044342, + "learning_rate": 9.999191989259584e-06, + "loss": 0.9983, + "step": 524 + }, + { + "epoch": 0.028304938537847747, + "grad_norm": 0.9855527877807617, + "learning_rate": 9.999188173496889e-06, + "loss": 0.9587, + "step": 525 + }, + { + "epoch": 0.028358852706491265, + "grad_norm": 0.9925652742385864, + "learning_rate": 9.99918434874633e-06, + "loss": 0.8408, + "step": 526 + }, + { + "epoch": 0.028412766875134787, + "grad_norm": 0.9272180795669556, + "learning_rate": 9.999180515007908e-06, + "loss": 0.8267, + "step": 527 + }, + { + "epoch": 0.028466681043778305, + "grad_norm": 1.161076307296753, + "learning_rate": 9.999176672281636e-06, + "loss": 0.9282, + "step": 528 + }, + { + "epoch": 0.028520595212421826, + "grad_norm": 0.8953909277915955, + "learning_rate": 9.99917282056752e-06, + "loss": 0.8078, + "step": 529 + }, + { + "epoch": 0.028574509381065344, + "grad_norm": 0.9194382429122925, + "learning_rate": 9.999168959865562e-06, + "loss": 0.8385, + "step": 530 + }, + { + "epoch": 0.028628423549708862, + "grad_norm": 1.0351816415786743, + "learning_rate": 9.999165090175775e-06, + "loss": 0.8155, + "step": 531 + }, + { + "epoch": 0.028682337718352383, + "grad_norm": 0.9233224391937256, + "learning_rate": 9.999161211498163e-06, + "loss": 0.8825, + "step": 532 + }, + { + "epoch": 0.0287362518869959, + "grad_norm": 1.0415356159210205, + "learning_rate": 9.999157323832732e-06, + "loss": 0.7844, + "step": 533 + }, + { + "epoch": 0.028790166055639423, + "grad_norm": 1.0329923629760742, + "learning_rate": 9.999153427179492e-06, + "loss": 0.893, + "step": 534 + }, + { + "epoch": 0.02884408022428294, + "grad_norm": 1.237291932106018, + "learning_rate": 9.999149521538448e-06, + "loss": 0.9786, + "step": 535 + }, + { + "epoch": 0.028897994392926462, + "grad_norm": 0.9952654242515564, + "learning_rate": 9.999145606909607e-06, + "loss": 0.9262, + 
"step": 536 + }, + { + "epoch": 0.02895190856156998, + "grad_norm": 1.016533374786377, + "learning_rate": 9.999141683292977e-06, + "loss": 0.9854, + "step": 537 + }, + { + "epoch": 0.0290058227302135, + "grad_norm": 1.0334454774856567, + "learning_rate": 9.999137750688564e-06, + "loss": 0.8928, + "step": 538 + }, + { + "epoch": 0.02905973689885702, + "grad_norm": 0.941662609577179, + "learning_rate": 9.999133809096374e-06, + "loss": 0.8698, + "step": 539 + }, + { + "epoch": 0.02911365106750054, + "grad_norm": 0.9454428553581238, + "learning_rate": 9.999129858516418e-06, + "loss": 0.9261, + "step": 540 + }, + { + "epoch": 0.02916756523614406, + "grad_norm": 1.0921217203140259, + "learning_rate": 9.9991258989487e-06, + "loss": 0.9163, + "step": 541 + }, + { + "epoch": 0.029221479404787577, + "grad_norm": 0.8999170064926147, + "learning_rate": 9.999121930393227e-06, + "loss": 0.883, + "step": 542 + }, + { + "epoch": 0.029275393573431098, + "grad_norm": 0.9732702970504761, + "learning_rate": 9.999117952850009e-06, + "loss": 0.9168, + "step": 543 + }, + { + "epoch": 0.029329307742074616, + "grad_norm": 1.00196373462677, + "learning_rate": 9.99911396631905e-06, + "loss": 0.826, + "step": 544 + }, + { + "epoch": 0.029383221910718137, + "grad_norm": 0.9776156544685364, + "learning_rate": 9.999109970800358e-06, + "loss": 0.8176, + "step": 545 + }, + { + "epoch": 0.029437136079361655, + "grad_norm": 1.0503387451171875, + "learning_rate": 9.99910596629394e-06, + "loss": 0.8617, + "step": 546 + }, + { + "epoch": 0.029491050248005177, + "grad_norm": 0.9195687174797058, + "learning_rate": 9.999101952799805e-06, + "loss": 0.8224, + "step": 547 + }, + { + "epoch": 0.029544964416648695, + "grad_norm": 0.8746809959411621, + "learning_rate": 9.999097930317959e-06, + "loss": 0.8407, + "step": 548 + }, + { + "epoch": 0.029598878585292216, + "grad_norm": 0.9035898447036743, + "learning_rate": 9.999093898848407e-06, + "loss": 0.8344, + "step": 549 + }, + { + "epoch": 
0.029652792753935734, + "grad_norm": 0.8764795064926147, + "learning_rate": 9.99908985839116e-06, + "loss": 0.8323, + "step": 550 + }, + { + "epoch": 0.029706706922579255, + "grad_norm": 0.9654614329338074, + "learning_rate": 9.999085808946224e-06, + "loss": 0.8696, + "step": 551 + }, + { + "epoch": 0.029760621091222773, + "grad_norm": 1.1295796632766724, + "learning_rate": 9.999081750513606e-06, + "loss": 0.9608, + "step": 552 + }, + { + "epoch": 0.02981453525986629, + "grad_norm": 0.9591107368469238, + "learning_rate": 9.999077683093313e-06, + "loss": 0.8762, + "step": 553 + }, + { + "epoch": 0.029868449428509813, + "grad_norm": 0.8287899494171143, + "learning_rate": 9.999073606685353e-06, + "loss": 0.7265, + "step": 554 + }, + { + "epoch": 0.02992236359715333, + "grad_norm": 0.9429282546043396, + "learning_rate": 9.99906952128973e-06, + "loss": 0.8835, + "step": 555 + }, + { + "epoch": 0.029976277765796852, + "grad_norm": 0.9617370963096619, + "learning_rate": 9.999065426906459e-06, + "loss": 0.9138, + "step": 556 + }, + { + "epoch": 0.03003019193444037, + "grad_norm": 1.2346372604370117, + "learning_rate": 9.999061323535538e-06, + "loss": 0.831, + "step": 557 + }, + { + "epoch": 0.03008410610308389, + "grad_norm": 1.2413623332977295, + "learning_rate": 9.999057211176982e-06, + "loss": 1.0211, + "step": 558 + }, + { + "epoch": 0.03013802027172741, + "grad_norm": 0.98906010389328, + "learning_rate": 9.999053089830794e-06, + "loss": 0.7821, + "step": 559 + }, + { + "epoch": 0.03019193444037093, + "grad_norm": 0.96706622838974, + "learning_rate": 9.999048959496983e-06, + "loss": 0.8593, + "step": 560 + }, + { + "epoch": 0.03024584860901445, + "grad_norm": 0.9400071501731873, + "learning_rate": 9.999044820175556e-06, + "loss": 0.8731, + "step": 561 + }, + { + "epoch": 0.03029976277765797, + "grad_norm": 1.1276499032974243, + "learning_rate": 9.999040671866522e-06, + "loss": 0.86, + "step": 562 + }, + { + "epoch": 0.030353676946301488, + "grad_norm": 
0.8859087228775024, + "learning_rate": 9.999036514569885e-06, + "loss": 0.8274, + "step": 563 + }, + { + "epoch": 0.030407591114945006, + "grad_norm": 1.1617575883865356, + "learning_rate": 9.999032348285656e-06, + "loss": 1.0519, + "step": 564 + }, + { + "epoch": 0.030461505283588527, + "grad_norm": 0.9717594385147095, + "learning_rate": 9.99902817301384e-06, + "loss": 0.9276, + "step": 565 + }, + { + "epoch": 0.030515419452232045, + "grad_norm": 1.000722050666809, + "learning_rate": 9.999023988754446e-06, + "loss": 0.8714, + "step": 566 + }, + { + "epoch": 0.030569333620875567, + "grad_norm": 1.1744625568389893, + "learning_rate": 9.999019795507481e-06, + "loss": 1.0087, + "step": 567 + }, + { + "epoch": 0.030623247789519085, + "grad_norm": 1.0199978351593018, + "learning_rate": 9.999015593272953e-06, + "loss": 0.8537, + "step": 568 + }, + { + "epoch": 0.030677161958162606, + "grad_norm": 0.9232216477394104, + "learning_rate": 9.999011382050869e-06, + "loss": 0.8488, + "step": 569 + }, + { + "epoch": 0.030731076126806124, + "grad_norm": 0.9905959367752075, + "learning_rate": 9.99900716184124e-06, + "loss": 0.9048, + "step": 570 + }, + { + "epoch": 0.030784990295449646, + "grad_norm": 0.9921644330024719, + "learning_rate": 9.999002932644066e-06, + "loss": 0.9294, + "step": 571 + }, + { + "epoch": 0.030838904464093164, + "grad_norm": 1.1583740711212158, + "learning_rate": 9.99899869445936e-06, + "loss": 0.727, + "step": 572 + }, + { + "epoch": 0.03089281863273668, + "grad_norm": 0.906736433506012, + "learning_rate": 9.998994447287127e-06, + "loss": 0.7889, + "step": 573 + }, + { + "epoch": 0.030946732801380203, + "grad_norm": 0.9060770869255066, + "learning_rate": 9.998990191127379e-06, + "loss": 0.8493, + "step": 574 + }, + { + "epoch": 0.03100064697002372, + "grad_norm": 0.9094041585922241, + "learning_rate": 9.99898592598012e-06, + "loss": 0.8604, + "step": 575 + }, + { + "epoch": 0.031054561138667242, + "grad_norm": 1.0964977741241455, + "learning_rate": 
9.998981651845358e-06, + "loss": 0.8481, + "step": 576 + }, + { + "epoch": 0.03110847530731076, + "grad_norm": 0.9509627223014832, + "learning_rate": 9.998977368723102e-06, + "loss": 0.8601, + "step": 577 + }, + { + "epoch": 0.03116238947595428, + "grad_norm": 1.0108642578125, + "learning_rate": 9.998973076613359e-06, + "loss": 0.9076, + "step": 578 + }, + { + "epoch": 0.0312163036445978, + "grad_norm": 1.0268129110336304, + "learning_rate": 9.998968775516136e-06, + "loss": 0.8273, + "step": 579 + }, + { + "epoch": 0.03127021781324132, + "grad_norm": 0.968941867351532, + "learning_rate": 9.99896446543144e-06, + "loss": 0.8859, + "step": 580 + }, + { + "epoch": 0.03132413198188484, + "grad_norm": 0.936779260635376, + "learning_rate": 9.998960146359283e-06, + "loss": 0.8589, + "step": 581 + }, + { + "epoch": 0.03137804615052836, + "grad_norm": 0.9675167202949524, + "learning_rate": 9.998955818299667e-06, + "loss": 0.973, + "step": 582 + }, + { + "epoch": 0.03143196031917188, + "grad_norm": 0.9475553035736084, + "learning_rate": 9.998951481252604e-06, + "loss": 0.8936, + "step": 583 + }, + { + "epoch": 0.031485874487815396, + "grad_norm": 0.9130968451499939, + "learning_rate": 9.9989471352181e-06, + "loss": 0.7668, + "step": 584 + }, + { + "epoch": 0.031539788656458914, + "grad_norm": 0.8890071511268616, + "learning_rate": 9.998942780196164e-06, + "loss": 0.8971, + "step": 585 + }, + { + "epoch": 0.03159370282510244, + "grad_norm": 0.9298738837242126, + "learning_rate": 9.998938416186803e-06, + "loss": 0.9313, + "step": 586 + }, + { + "epoch": 0.03164761699374596, + "grad_norm": 1.0683361291885376, + "learning_rate": 9.998934043190025e-06, + "loss": 0.9018, + "step": 587 + }, + { + "epoch": 0.031701531162389475, + "grad_norm": 0.939253568649292, + "learning_rate": 9.99892966120584e-06, + "loss": 0.9119, + "step": 588 + }, + { + "epoch": 0.03175544533103299, + "grad_norm": 0.9245349764823914, + "learning_rate": 9.99892527023425e-06, + "loss": 0.9258, + "step": 589 + }, 
+ { + "epoch": 0.03180935949967652, + "grad_norm": 0.9318797588348389, + "learning_rate": 9.998920870275267e-06, + "loss": 0.9557, + "step": 590 + }, + { + "epoch": 0.031863273668320036, + "grad_norm": 0.8909592628479004, + "learning_rate": 9.998916461328899e-06, + "loss": 0.8122, + "step": 591 + }, + { + "epoch": 0.031917187836963554, + "grad_norm": 1.0637080669403076, + "learning_rate": 9.998912043395154e-06, + "loss": 0.9517, + "step": 592 + }, + { + "epoch": 0.03197110200560707, + "grad_norm": 0.881934642791748, + "learning_rate": 9.99890761647404e-06, + "loss": 0.8729, + "step": 593 + }, + { + "epoch": 0.032025016174250596, + "grad_norm": 0.8882094025611877, + "learning_rate": 9.998903180565562e-06, + "loss": 0.7943, + "step": 594 + }, + { + "epoch": 0.032078930342894114, + "grad_norm": 0.965085506439209, + "learning_rate": 9.99889873566973e-06, + "loss": 0.8894, + "step": 595 + }, + { + "epoch": 0.03213284451153763, + "grad_norm": 0.9679432511329651, + "learning_rate": 9.998894281786556e-06, + "loss": 0.854, + "step": 596 + }, + { + "epoch": 0.03218675868018115, + "grad_norm": 1.4454354047775269, + "learning_rate": 9.998889818916043e-06, + "loss": 0.9944, + "step": 597 + }, + { + "epoch": 0.03224067284882467, + "grad_norm": 0.9369311928749084, + "learning_rate": 9.998885347058198e-06, + "loss": 0.8699, + "step": 598 + }, + { + "epoch": 0.03229458701746819, + "grad_norm": 0.9014303088188171, + "learning_rate": 9.998880866213033e-06, + "loss": 0.8735, + "step": 599 + }, + { + "epoch": 0.03234850118611171, + "grad_norm": 0.989251971244812, + "learning_rate": 9.998876376380555e-06, + "loss": 0.8872, + "step": 600 + }, + { + "epoch": 0.03240241535475523, + "grad_norm": 1.0256885290145874, + "learning_rate": 9.99887187756077e-06, + "loss": 0.8787, + "step": 601 + }, + { + "epoch": 0.03245632952339875, + "grad_norm": 0.9560148119926453, + "learning_rate": 9.998867369753688e-06, + "loss": 0.8301, + "step": 602 + }, + { + "epoch": 0.03251024369204227, + "grad_norm": 
1.044754147529602, + "learning_rate": 9.998862852959316e-06, + "loss": 0.9286, + "step": 603 + }, + { + "epoch": 0.03256415786068579, + "grad_norm": 0.8769629597663879, + "learning_rate": 9.998858327177665e-06, + "loss": 0.7927, + "step": 604 + }, + { + "epoch": 0.03261807202932931, + "grad_norm": 0.9217430949211121, + "learning_rate": 9.99885379240874e-06, + "loss": 0.8327, + "step": 605 + }, + { + "epoch": 0.032671986197972826, + "grad_norm": 0.8202590942382812, + "learning_rate": 9.99884924865255e-06, + "loss": 0.7269, + "step": 606 + }, + { + "epoch": 0.032725900366616344, + "grad_norm": 0.9598796367645264, + "learning_rate": 9.998844695909102e-06, + "loss": 0.9329, + "step": 607 + }, + { + "epoch": 0.03277981453525987, + "grad_norm": 1.1016643047332764, + "learning_rate": 9.998840134178407e-06, + "loss": 0.9836, + "step": 608 + }, + { + "epoch": 0.032833728703903386, + "grad_norm": 0.9639281630516052, + "learning_rate": 9.998835563460471e-06, + "loss": 0.8475, + "step": 609 + }, + { + "epoch": 0.032887642872546904, + "grad_norm": 0.9266204833984375, + "learning_rate": 9.998830983755304e-06, + "loss": 0.7307, + "step": 610 + }, + { + "epoch": 0.03294155704119042, + "grad_norm": 0.9282877445220947, + "learning_rate": 9.99882639506291e-06, + "loss": 0.8163, + "step": 611 + }, + { + "epoch": 0.03299547120983395, + "grad_norm": 0.8939738869667053, + "learning_rate": 9.998821797383302e-06, + "loss": 0.6902, + "step": 612 + }, + { + "epoch": 0.033049385378477465, + "grad_norm": 0.9041041731834412, + "learning_rate": 9.998817190716488e-06, + "loss": 0.8735, + "step": 613 + }, + { + "epoch": 0.03310329954712098, + "grad_norm": 0.9973318576812744, + "learning_rate": 9.998812575062473e-06, + "loss": 0.9017, + "step": 614 + }, + { + "epoch": 0.0331572137157645, + "grad_norm": 1.0416412353515625, + "learning_rate": 9.998807950421268e-06, + "loss": 0.9293, + "step": 615 + }, + { + "epoch": 0.03321112788440802, + "grad_norm": 0.8686584234237671, + "learning_rate": 
9.998803316792882e-06, + "loss": 0.8585, + "step": 616 + }, + { + "epoch": 0.033265042053051544, + "grad_norm": 0.9907833337783813, + "learning_rate": 9.998798674177319e-06, + "loss": 0.9264, + "step": 617 + }, + { + "epoch": 0.03331895622169506, + "grad_norm": 0.9927001595497131, + "learning_rate": 9.998794022574592e-06, + "loss": 0.895, + "step": 618 + }, + { + "epoch": 0.03337287039033858, + "grad_norm": 0.9314623475074768, + "learning_rate": 9.998789361984707e-06, + "loss": 0.8353, + "step": 619 + }, + { + "epoch": 0.0334267845589821, + "grad_norm": 0.9768248796463013, + "learning_rate": 9.998784692407673e-06, + "loss": 0.8917, + "step": 620 + }, + { + "epoch": 0.03348069872762562, + "grad_norm": 0.9487942457199097, + "learning_rate": 9.998780013843498e-06, + "loss": 0.9022, + "step": 621 + }, + { + "epoch": 0.03353461289626914, + "grad_norm": 1.0376895666122437, + "learning_rate": 9.99877532629219e-06, + "loss": 0.7692, + "step": 622 + }, + { + "epoch": 0.03358852706491266, + "grad_norm": 1.021345853805542, + "learning_rate": 9.99877062975376e-06, + "loss": 1.0386, + "step": 623 + }, + { + "epoch": 0.033642441233556176, + "grad_norm": 0.9979421496391296, + "learning_rate": 9.998765924228214e-06, + "loss": 0.9209, + "step": 624 + }, + { + "epoch": 0.0336963554021997, + "grad_norm": 0.8552166819572449, + "learning_rate": 9.998761209715559e-06, + "loss": 0.8765, + "step": 625 + }, + { + "epoch": 0.03375026957084322, + "grad_norm": 0.9737898707389832, + "learning_rate": 9.998756486215809e-06, + "loss": 0.7459, + "step": 626 + }, + { + "epoch": 0.03380418373948674, + "grad_norm": 1.1067259311676025, + "learning_rate": 9.998751753728967e-06, + "loss": 0.8582, + "step": 627 + }, + { + "epoch": 0.033858097908130255, + "grad_norm": 1.0689613819122314, + "learning_rate": 9.998747012255044e-06, + "loss": 0.8523, + "step": 628 + }, + { + "epoch": 0.03391201207677377, + "grad_norm": 1.1880419254302979, + "learning_rate": 9.998742261794048e-06, + "loss": 0.9085, + "step": 
629 + }, + { + "epoch": 0.0339659262454173, + "grad_norm": 0.9569217562675476, + "learning_rate": 9.998737502345987e-06, + "loss": 0.9112, + "step": 630 + }, + { + "epoch": 0.034019840414060816, + "grad_norm": 0.9955928921699524, + "learning_rate": 9.99873273391087e-06, + "loss": 0.9166, + "step": 631 + }, + { + "epoch": 0.034073754582704334, + "grad_norm": 0.8906963467597961, + "learning_rate": 9.998727956488708e-06, + "loss": 0.882, + "step": 632 + }, + { + "epoch": 0.03412766875134785, + "grad_norm": 0.9241589307785034, + "learning_rate": 9.998723170079506e-06, + "loss": 0.8488, + "step": 633 + }, + { + "epoch": 0.03418158291999138, + "grad_norm": 0.9666005969047546, + "learning_rate": 9.998718374683271e-06, + "loss": 0.8432, + "step": 634 + }, + { + "epoch": 0.034235497088634895, + "grad_norm": 0.9036918878555298, + "learning_rate": 9.998713570300018e-06, + "loss": 0.7979, + "step": 635 + }, + { + "epoch": 0.03428941125727841, + "grad_norm": 0.8946508765220642, + "learning_rate": 9.998708756929751e-06, + "loss": 0.8854, + "step": 636 + }, + { + "epoch": 0.03434332542592193, + "grad_norm": 1.0300164222717285, + "learning_rate": 9.99870393457248e-06, + "loss": 0.9116, + "step": 637 + }, + { + "epoch": 0.03439723959456545, + "grad_norm": 1.0635035037994385, + "learning_rate": 9.998699103228214e-06, + "loss": 0.9138, + "step": 638 + }, + { + "epoch": 0.03445115376320897, + "grad_norm": 1.0362621545791626, + "learning_rate": 9.998694262896962e-06, + "loss": 1.0177, + "step": 639 + }, + { + "epoch": 0.03450506793185249, + "grad_norm": 0.9081454873085022, + "learning_rate": 9.99868941357873e-06, + "loss": 0.7802, + "step": 640 + }, + { + "epoch": 0.03455898210049601, + "grad_norm": 0.9943915605545044, + "learning_rate": 9.998684555273529e-06, + "loss": 0.9356, + "step": 641 + }, + { + "epoch": 0.03461289626913953, + "grad_norm": 0.9647786021232605, + "learning_rate": 9.998679687981367e-06, + "loss": 0.741, + "step": 642 + }, + { + "epoch": 0.03466681043778305, + 
"grad_norm": 0.9655315279960632, + "learning_rate": 9.998674811702255e-06, + "loss": 0.8644, + "step": 643 + }, + { + "epoch": 0.03472072460642657, + "grad_norm": 0.9162091612815857, + "learning_rate": 9.998669926436197e-06, + "loss": 0.8383, + "step": 644 + }, + { + "epoch": 0.03477463877507009, + "grad_norm": 0.9509754776954651, + "learning_rate": 9.998665032183207e-06, + "loss": 0.8066, + "step": 645 + }, + { + "epoch": 0.034828552943713606, + "grad_norm": 1.0545740127563477, + "learning_rate": 9.998660128943292e-06, + "loss": 0.8455, + "step": 646 + }, + { + "epoch": 0.03488246711235713, + "grad_norm": 1.0928760766983032, + "learning_rate": 9.998655216716458e-06, + "loss": 0.8708, + "step": 647 + }, + { + "epoch": 0.03493638128100065, + "grad_norm": 0.9743762016296387, + "learning_rate": 9.998650295502717e-06, + "loss": 0.878, + "step": 648 + }, + { + "epoch": 0.03499029544964417, + "grad_norm": 1.016741156578064, + "learning_rate": 9.998645365302077e-06, + "loss": 0.867, + "step": 649 + }, + { + "epoch": 0.035044209618287685, + "grad_norm": 1.125252366065979, + "learning_rate": 9.998640426114548e-06, + "loss": 0.9443, + "step": 650 + }, + { + "epoch": 0.0350981237869312, + "grad_norm": 0.9555762410163879, + "learning_rate": 9.998635477940135e-06, + "loss": 0.8353, + "step": 651 + }, + { + "epoch": 0.03515203795557473, + "grad_norm": 0.930173397064209, + "learning_rate": 9.998630520778851e-06, + "loss": 0.8383, + "step": 652 + }, + { + "epoch": 0.035205952124218245, + "grad_norm": 1.1592127084732056, + "learning_rate": 9.998625554630704e-06, + "loss": 0.9708, + "step": 653 + }, + { + "epoch": 0.03525986629286176, + "grad_norm": 0.9333894848823547, + "learning_rate": 9.998620579495701e-06, + "loss": 0.9055, + "step": 654 + }, + { + "epoch": 0.03531378046150528, + "grad_norm": 0.9495646357536316, + "learning_rate": 9.998615595373853e-06, + "loss": 0.7993, + "step": 655 + }, + { + "epoch": 0.035367694630148806, + "grad_norm": 1.0919233560562134, + "learning_rate": 
9.99861060226517e-06, + "loss": 0.8852, + "step": 656 + }, + { + "epoch": 0.035421608798792324, + "grad_norm": 0.907940685749054, + "learning_rate": 9.998605600169657e-06, + "loss": 0.8294, + "step": 657 + }, + { + "epoch": 0.03547552296743584, + "grad_norm": 1.0423756837844849, + "learning_rate": 9.998600589087328e-06, + "loss": 0.8758, + "step": 658 + }, + { + "epoch": 0.03552943713607936, + "grad_norm": 1.0387269258499146, + "learning_rate": 9.998595569018186e-06, + "loss": 0.9099, + "step": 659 + }, + { + "epoch": 0.03558335130472288, + "grad_norm": 0.9186104536056519, + "learning_rate": 9.998590539962245e-06, + "loss": 0.9025, + "step": 660 + }, + { + "epoch": 0.0356372654733664, + "grad_norm": 1.0173289775848389, + "learning_rate": 9.998585501919514e-06, + "loss": 0.8468, + "step": 661 + }, + { + "epoch": 0.03569117964200992, + "grad_norm": 0.9579570889472961, + "learning_rate": 9.998580454889996e-06, + "loss": 0.8542, + "step": 662 + }, + { + "epoch": 0.03574509381065344, + "grad_norm": 1.093515396118164, + "learning_rate": 9.99857539887371e-06, + "loss": 0.8932, + "step": 663 + }, + { + "epoch": 0.03579900797929696, + "grad_norm": 1.0651243925094604, + "learning_rate": 9.998570333870656e-06, + "loss": 0.8822, + "step": 664 + }, + { + "epoch": 0.03585292214794048, + "grad_norm": 0.973278284072876, + "learning_rate": 9.998565259880845e-06, + "loss": 0.8724, + "step": 665 + }, + { + "epoch": 0.035906836316584, + "grad_norm": 0.961321234703064, + "learning_rate": 9.998560176904291e-06, + "loss": 0.947, + "step": 666 + }, + { + "epoch": 0.03596075048522752, + "grad_norm": 1.0216654539108276, + "learning_rate": 9.998555084940999e-06, + "loss": 0.8528, + "step": 667 + }, + { + "epoch": 0.036014664653871035, + "grad_norm": 0.9917817711830139, + "learning_rate": 9.99854998399098e-06, + "loss": 0.8608, + "step": 668 + }, + { + "epoch": 0.03606857882251455, + "grad_norm": 1.0164326429367065, + "learning_rate": 9.998544874054243e-06, + "loss": 0.8752, + "step": 669 + 
}, + { + "epoch": 0.03612249299115808, + "grad_norm": 0.9181317687034607, + "learning_rate": 9.998539755130793e-06, + "loss": 0.8032, + "step": 670 + }, + { + "epoch": 0.036176407159801596, + "grad_norm": 1.0100011825561523, + "learning_rate": 9.998534627220646e-06, + "loss": 0.9205, + "step": 671 + }, + { + "epoch": 0.036230321328445114, + "grad_norm": 0.9306463599205017, + "learning_rate": 9.998529490323807e-06, + "loss": 0.8209, + "step": 672 + }, + { + "epoch": 0.03628423549708863, + "grad_norm": 1.8988754749298096, + "learning_rate": 9.998524344440286e-06, + "loss": 0.8455, + "step": 673 + }, + { + "epoch": 0.03633814966573216, + "grad_norm": 0.9742317795753479, + "learning_rate": 9.998519189570091e-06, + "loss": 0.8733, + "step": 674 + }, + { + "epoch": 0.036392063834375675, + "grad_norm": 0.9334224462509155, + "learning_rate": 9.998514025713234e-06, + "loss": 0.8761, + "step": 675 + }, + { + "epoch": 0.03644597800301919, + "grad_norm": 0.9729838371276855, + "learning_rate": 9.998508852869724e-06, + "loss": 0.8916, + "step": 676 + }, + { + "epoch": 0.03649989217166271, + "grad_norm": 0.9721505641937256, + "learning_rate": 9.998503671039568e-06, + "loss": 0.8735, + "step": 677 + }, + { + "epoch": 0.036553806340306236, + "grad_norm": 0.9600850939750671, + "learning_rate": 9.998498480222775e-06, + "loss": 0.9157, + "step": 678 + }, + { + "epoch": 0.036607720508949754, + "grad_norm": 0.9010732173919678, + "learning_rate": 9.998493280419358e-06, + "loss": 0.9215, + "step": 679 + }, + { + "epoch": 0.03666163467759327, + "grad_norm": 0.8708087801933289, + "learning_rate": 9.998488071629324e-06, + "loss": 0.7218, + "step": 680 + }, + { + "epoch": 0.03671554884623679, + "grad_norm": 0.9739180207252502, + "learning_rate": 9.998482853852682e-06, + "loss": 0.8845, + "step": 681 + }, + { + "epoch": 0.03676946301488031, + "grad_norm": 0.9823595881462097, + "learning_rate": 9.998477627089443e-06, + "loss": 0.896, + "step": 682 + }, + { + "epoch": 0.03682337718352383, + 
"grad_norm": 0.9629859328269958, + "learning_rate": 9.998472391339612e-06, + "loss": 0.8636, + "step": 683 + }, + { + "epoch": 0.03687729135216735, + "grad_norm": 0.8644251823425293, + "learning_rate": 9.998467146603206e-06, + "loss": 0.9124, + "step": 684 + }, + { + "epoch": 0.03693120552081087, + "grad_norm": 0.8987632989883423, + "learning_rate": 9.99846189288023e-06, + "loss": 0.801, + "step": 685 + }, + { + "epoch": 0.036985119689454386, + "grad_norm": 0.9017630219459534, + "learning_rate": 9.99845663017069e-06, + "loss": 0.8675, + "step": 686 + }, + { + "epoch": 0.03703903385809791, + "grad_norm": 0.8905850648880005, + "learning_rate": 9.998451358474603e-06, + "loss": 0.8512, + "step": 687 + }, + { + "epoch": 0.03709294802674143, + "grad_norm": 0.9807800650596619, + "learning_rate": 9.998446077791972e-06, + "loss": 0.9258, + "step": 688 + }, + { + "epoch": 0.03714686219538495, + "grad_norm": 0.8916336894035339, + "learning_rate": 9.99844078812281e-06, + "loss": 0.8236, + "step": 689 + }, + { + "epoch": 0.037200776364028465, + "grad_norm": 0.9330187439918518, + "learning_rate": 9.998435489467126e-06, + "loss": 0.7812, + "step": 690 + }, + { + "epoch": 0.03725469053267198, + "grad_norm": 0.9859142899513245, + "learning_rate": 9.99843018182493e-06, + "loss": 0.8699, + "step": 691 + }, + { + "epoch": 0.03730860470131551, + "grad_norm": 0.9277002215385437, + "learning_rate": 9.998424865196228e-06, + "loss": 0.9276, + "step": 692 + }, + { + "epoch": 0.037362518869959026, + "grad_norm": 0.9764281511306763, + "learning_rate": 9.998419539581034e-06, + "loss": 0.9482, + "step": 693 + }, + { + "epoch": 0.037416433038602544, + "grad_norm": 1.0108616352081299, + "learning_rate": 9.998414204979357e-06, + "loss": 0.8582, + "step": 694 + }, + { + "epoch": 0.03747034720724606, + "grad_norm": 1.2767362594604492, + "learning_rate": 9.998408861391202e-06, + "loss": 0.7833, + "step": 695 + }, + { + "epoch": 0.03752426137588959, + "grad_norm": 0.8874560594558716, + 
"learning_rate": 9.998403508816585e-06, + "loss": 0.8935, + "step": 696 + }, + { + "epoch": 0.037578175544533104, + "grad_norm": 0.8549458980560303, + "learning_rate": 9.998398147255511e-06, + "loss": 0.7747, + "step": 697 + }, + { + "epoch": 0.03763208971317662, + "grad_norm": 0.9971988201141357, + "learning_rate": 9.998392776707993e-06, + "loss": 0.753, + "step": 698 + }, + { + "epoch": 0.03768600388182014, + "grad_norm": 0.9822113513946533, + "learning_rate": 9.998387397174037e-06, + "loss": 0.9121, + "step": 699 + }, + { + "epoch": 0.037739918050463665, + "grad_norm": 0.996151864528656, + "learning_rate": 9.998382008653656e-06, + "loss": 0.9356, + "step": 700 + }, + { + "epoch": 0.03779383221910718, + "grad_norm": 1.7505156993865967, + "learning_rate": 9.998376611146857e-06, + "loss": 0.8351, + "step": 701 + }, + { + "epoch": 0.0378477463877507, + "grad_norm": 1.070356011390686, + "learning_rate": 9.998371204653651e-06, + "loss": 0.9153, + "step": 702 + }, + { + "epoch": 0.03790166055639422, + "grad_norm": 0.9383741617202759, + "learning_rate": 9.998365789174048e-06, + "loss": 0.8904, + "step": 703 + }, + { + "epoch": 0.03795557472503774, + "grad_norm": 0.8444882035255432, + "learning_rate": 9.998360364708058e-06, + "loss": 0.8243, + "step": 704 + }, + { + "epoch": 0.03800948889368126, + "grad_norm": 1.0012257099151611, + "learning_rate": 9.99835493125569e-06, + "loss": 0.9439, + "step": 705 + }, + { + "epoch": 0.03806340306232478, + "grad_norm": 0.9745193719863892, + "learning_rate": 9.998349488816954e-06, + "loss": 0.8667, + "step": 706 + }, + { + "epoch": 0.0381173172309683, + "grad_norm": 0.8363852500915527, + "learning_rate": 9.998344037391859e-06, + "loss": 0.8082, + "step": 707 + }, + { + "epoch": 0.038171231399611816, + "grad_norm": 0.9389918446540833, + "learning_rate": 9.998338576980417e-06, + "loss": 0.8113, + "step": 708 + }, + { + "epoch": 0.03822514556825534, + "grad_norm": 0.9216110110282898, + "learning_rate": 9.998333107582635e-06, + "loss": 
0.8179, + "step": 709 + }, + { + "epoch": 0.03827905973689886, + "grad_norm": 1.0292471647262573, + "learning_rate": 9.998327629198526e-06, + "loss": 0.8605, + "step": 710 + }, + { + "epoch": 0.03833297390554238, + "grad_norm": 0.9812708497047424, + "learning_rate": 9.998322141828097e-06, + "loss": 0.9279, + "step": 711 + }, + { + "epoch": 0.038386888074185894, + "grad_norm": 0.8186620473861694, + "learning_rate": 9.998316645471358e-06, + "loss": 0.7877, + "step": 712 + }, + { + "epoch": 0.03844080224282941, + "grad_norm": 1.034134864807129, + "learning_rate": 9.99831114012832e-06, + "loss": 0.9867, + "step": 713 + }, + { + "epoch": 0.03849471641147294, + "grad_norm": 1.1604938507080078, + "learning_rate": 9.998305625798993e-06, + "loss": 0.9134, + "step": 714 + }, + { + "epoch": 0.038548630580116455, + "grad_norm": 0.8452483415603638, + "learning_rate": 9.998300102483388e-06, + "loss": 0.8732, + "step": 715 + }, + { + "epoch": 0.03860254474875997, + "grad_norm": 0.8881269693374634, + "learning_rate": 9.998294570181512e-06, + "loss": 0.847, + "step": 716 + }, + { + "epoch": 0.03865645891740349, + "grad_norm": 0.8822013735771179, + "learning_rate": 9.998289028893375e-06, + "loss": 0.8404, + "step": 717 + }, + { + "epoch": 0.038710373086047016, + "grad_norm": 1.0011916160583496, + "learning_rate": 9.998283478618991e-06, + "loss": 0.8133, + "step": 718 + }, + { + "epoch": 0.038764287254690534, + "grad_norm": 1.0004018545150757, + "learning_rate": 9.998277919358367e-06, + "loss": 0.9556, + "step": 719 + }, + { + "epoch": 0.03881820142333405, + "grad_norm": 0.8176954984664917, + "learning_rate": 9.998272351111513e-06, + "loss": 0.7977, + "step": 720 + }, + { + "epoch": 0.03887211559197757, + "grad_norm": 0.9160690307617188, + "learning_rate": 9.99826677387844e-06, + "loss": 0.9239, + "step": 721 + }, + { + "epoch": 0.03892602976062109, + "grad_norm": 1.2158405780792236, + "learning_rate": 9.998261187659157e-06, + "loss": 0.9023, + "step": 722 + }, + { + "epoch": 
0.03897994392926461, + "grad_norm": 0.9564448595046997, + "learning_rate": 9.998255592453674e-06, + "loss": 0.8585, + "step": 723 + }, + { + "epoch": 0.03903385809790813, + "grad_norm": 0.8902252316474915, + "learning_rate": 9.998249988262002e-06, + "loss": 0.8388, + "step": 724 + }, + { + "epoch": 0.03908777226655165, + "grad_norm": 0.8738620281219482, + "learning_rate": 9.998244375084152e-06, + "loss": 0.9545, + "step": 725 + }, + { + "epoch": 0.03914168643519517, + "grad_norm": 0.9670735001564026, + "learning_rate": 9.99823875292013e-06, + "loss": 0.8335, + "step": 726 + }, + { + "epoch": 0.03919560060383869, + "grad_norm": 0.8719429969787598, + "learning_rate": 9.998233121769952e-06, + "loss": 0.8546, + "step": 727 + }, + { + "epoch": 0.03924951477248221, + "grad_norm": 1.318429708480835, + "learning_rate": 9.998227481633622e-06, + "loss": 1.0658, + "step": 728 + }, + { + "epoch": 0.03930342894112573, + "grad_norm": 0.962630569934845, + "learning_rate": 9.998221832511155e-06, + "loss": 0.9049, + "step": 729 + }, + { + "epoch": 0.039357343109769245, + "grad_norm": 0.9639857411384583, + "learning_rate": 9.998216174402558e-06, + "loss": 0.9114, + "step": 730 + }, + { + "epoch": 0.03941125727841277, + "grad_norm": 1.1621571779251099, + "learning_rate": 9.998210507307843e-06, + "loss": 0.8776, + "step": 731 + }, + { + "epoch": 0.03946517144705629, + "grad_norm": 1.170089840888977, + "learning_rate": 9.998204831227019e-06, + "loss": 0.9928, + "step": 732 + }, + { + "epoch": 0.039519085615699806, + "grad_norm": 0.8257297873497009, + "learning_rate": 9.998199146160098e-06, + "loss": 0.7885, + "step": 733 + }, + { + "epoch": 0.039572999784343324, + "grad_norm": 0.8887513279914856, + "learning_rate": 9.998193452107088e-06, + "loss": 0.8389, + "step": 734 + }, + { + "epoch": 0.03962691395298684, + "grad_norm": 0.9321185350418091, + "learning_rate": 9.998187749068001e-06, + "loss": 0.9083, + "step": 735 + }, + { + "epoch": 0.03968082812163037, + "grad_norm": 
0.9926772713661194, + "learning_rate": 9.998182037042847e-06, + "loss": 0.9102, + "step": 736 + }, + { + "epoch": 0.039734742290273885, + "grad_norm": 1.0760009288787842, + "learning_rate": 9.998176316031634e-06, + "loss": 0.7781, + "step": 737 + }, + { + "epoch": 0.0397886564589174, + "grad_norm": 1.0998133420944214, + "learning_rate": 9.998170586034376e-06, + "loss": 0.9725, + "step": 738 + }, + { + "epoch": 0.03984257062756092, + "grad_norm": 0.9367475509643555, + "learning_rate": 9.99816484705108e-06, + "loss": 0.8277, + "step": 739 + }, + { + "epoch": 0.039896484796204446, + "grad_norm": 0.942954957485199, + "learning_rate": 9.998159099081758e-06, + "loss": 0.8542, + "step": 740 + }, + { + "epoch": 0.039950398964847963, + "grad_norm": 0.9841166138648987, + "learning_rate": 9.998153342126421e-06, + "loss": 0.9179, + "step": 741 + }, + { + "epoch": 0.04000431313349148, + "grad_norm": 0.9215245246887207, + "learning_rate": 9.998147576185077e-06, + "loss": 0.8899, + "step": 742 + }, + { + "epoch": 0.040058227302135, + "grad_norm": 1.0368192195892334, + "learning_rate": 9.998141801257739e-06, + "loss": 0.9828, + "step": 743 + }, + { + "epoch": 0.04011214147077852, + "grad_norm": 0.9696660041809082, + "learning_rate": 9.998136017344416e-06, + "loss": 0.9431, + "step": 744 + }, + { + "epoch": 0.04016605563942204, + "grad_norm": 1.111257791519165, + "learning_rate": 9.998130224445117e-06, + "loss": 0.9666, + "step": 745 + }, + { + "epoch": 0.04021996980806556, + "grad_norm": 0.9260644316673279, + "learning_rate": 9.998124422559856e-06, + "loss": 0.8941, + "step": 746 + }, + { + "epoch": 0.04027388397670908, + "grad_norm": 0.8622020483016968, + "learning_rate": 9.99811861168864e-06, + "loss": 0.8148, + "step": 747 + }, + { + "epoch": 0.040327798145352596, + "grad_norm": 0.8767471313476562, + "learning_rate": 9.998112791831483e-06, + "loss": 0.7093, + "step": 748 + }, + { + "epoch": 0.04038171231399612, + "grad_norm": 0.902917206287384, + "learning_rate": 
9.998106962988391e-06, + "loss": 0.7677, + "step": 749 + }, + { + "epoch": 0.04043562648263964, + "grad_norm": 1.351694941520691, + "learning_rate": 9.998101125159377e-06, + "loss": 1.0382, + "step": 750 + }, + { + "epoch": 0.04048954065128316, + "grad_norm": 0.8547930121421814, + "learning_rate": 9.998095278344452e-06, + "loss": 0.7974, + "step": 751 + }, + { + "epoch": 0.040543454819926675, + "grad_norm": 0.941149115562439, + "learning_rate": 9.998089422543626e-06, + "loss": 0.8518, + "step": 752 + }, + { + "epoch": 0.0405973689885702, + "grad_norm": 0.8671521544456482, + "learning_rate": 9.998083557756908e-06, + "loss": 0.8049, + "step": 753 + }, + { + "epoch": 0.04065128315721372, + "grad_norm": 0.9877942800521851, + "learning_rate": 9.998077683984311e-06, + "loss": 0.8874, + "step": 754 + }, + { + "epoch": 0.040705197325857236, + "grad_norm": 1.2130393981933594, + "learning_rate": 9.998071801225843e-06, + "loss": 0.9794, + "step": 755 + }, + { + "epoch": 0.040759111494500753, + "grad_norm": 0.9422823786735535, + "learning_rate": 9.998065909481518e-06, + "loss": 0.899, + "step": 756 + }, + { + "epoch": 0.04081302566314427, + "grad_norm": 0.9770492911338806, + "learning_rate": 9.998060008751343e-06, + "loss": 0.8434, + "step": 757 + }, + { + "epoch": 0.040866939831787796, + "grad_norm": 0.9227531552314758, + "learning_rate": 9.998054099035332e-06, + "loss": 0.8797, + "step": 758 + }, + { + "epoch": 0.040920854000431314, + "grad_norm": 1.0452102422714233, + "learning_rate": 9.998048180333492e-06, + "loss": 0.8702, + "step": 759 + }, + { + "epoch": 0.04097476816907483, + "grad_norm": 1.034125566482544, + "learning_rate": 9.998042252645837e-06, + "loss": 0.9041, + "step": 760 + }, + { + "epoch": 0.04102868233771835, + "grad_norm": 0.886029064655304, + "learning_rate": 9.998036315972375e-06, + "loss": 0.7805, + "step": 761 + }, + { + "epoch": 0.041082596506361875, + "grad_norm": 0.9845888614654541, + "learning_rate": 9.998030370313116e-06, + "loss": 0.9836, + 
"step": 762 + }, + { + "epoch": 0.04113651067500539, + "grad_norm": 0.9223973155021667, + "learning_rate": 9.998024415668075e-06, + "loss": 0.768, + "step": 763 + }, + { + "epoch": 0.04119042484364891, + "grad_norm": 1.0607362985610962, + "learning_rate": 9.99801845203726e-06, + "loss": 0.865, + "step": 764 + }, + { + "epoch": 0.04124433901229243, + "grad_norm": 0.9620907306671143, + "learning_rate": 9.998012479420683e-06, + "loss": 0.7645, + "step": 765 + }, + { + "epoch": 0.04129825318093595, + "grad_norm": 0.9490310549736023, + "learning_rate": 9.99800649781835e-06, + "loss": 0.9124, + "step": 766 + }, + { + "epoch": 0.04135216734957947, + "grad_norm": 0.9684557914733887, + "learning_rate": 9.99800050723028e-06, + "loss": 0.876, + "step": 767 + }, + { + "epoch": 0.04140608151822299, + "grad_norm": 0.9633080959320068, + "learning_rate": 9.997994507656476e-06, + "loss": 0.8976, + "step": 768 + }, + { + "epoch": 0.04145999568686651, + "grad_norm": 0.9495208263397217, + "learning_rate": 9.997988499096953e-06, + "loss": 0.9049, + "step": 769 + }, + { + "epoch": 0.041513909855510026, + "grad_norm": 1.0614326000213623, + "learning_rate": 9.997982481551721e-06, + "loss": 0.905, + "step": 770 + }, + { + "epoch": 0.04156782402415355, + "grad_norm": 0.820672869682312, + "learning_rate": 9.99797645502079e-06, + "loss": 0.8306, + "step": 771 + }, + { + "epoch": 0.04162173819279707, + "grad_norm": 0.9719771146774292, + "learning_rate": 9.997970419504171e-06, + "loss": 0.828, + "step": 772 + }, + { + "epoch": 0.041675652361440586, + "grad_norm": 0.893326997756958, + "learning_rate": 9.997964375001875e-06, + "loss": 0.8416, + "step": 773 + }, + { + "epoch": 0.041729566530084104, + "grad_norm": 0.858121395111084, + "learning_rate": 9.997958321513915e-06, + "loss": 0.8779, + "step": 774 + }, + { + "epoch": 0.04178348069872762, + "grad_norm": 0.9703636765480042, + "learning_rate": 9.997952259040297e-06, + "loss": 0.8623, + "step": 775 + }, + { + "epoch": 0.04183739486737115, + 
"grad_norm": 0.9626398086547852, + "learning_rate": 9.997946187581039e-06, + "loss": 0.8309, + "step": 776 + }, + { + "epoch": 0.041891309036014665, + "grad_norm": 0.9132344722747803, + "learning_rate": 9.997940107136143e-06, + "loss": 0.8798, + "step": 777 + }, + { + "epoch": 0.04194522320465818, + "grad_norm": 0.9608821272850037, + "learning_rate": 9.997934017705629e-06, + "loss": 0.8764, + "step": 778 + }, + { + "epoch": 0.0419991373733017, + "grad_norm": 1.0852513313293457, + "learning_rate": 9.997927919289501e-06, + "loss": 0.8908, + "step": 779 + }, + { + "epoch": 0.042053051541945226, + "grad_norm": 0.9690573215484619, + "learning_rate": 9.997921811887774e-06, + "loss": 0.8556, + "step": 780 + }, + { + "epoch": 0.042106965710588744, + "grad_norm": 0.9107050895690918, + "learning_rate": 9.997915695500458e-06, + "loss": 0.9249, + "step": 781 + }, + { + "epoch": 0.04216087987923226, + "grad_norm": 1.029974102973938, + "learning_rate": 9.997909570127564e-06, + "loss": 0.8369, + "step": 782 + }, + { + "epoch": 0.04221479404787578, + "grad_norm": 0.8179258704185486, + "learning_rate": 9.997903435769101e-06, + "loss": 0.7729, + "step": 783 + }, + { + "epoch": 0.042268708216519305, + "grad_norm": 1.0664961338043213, + "learning_rate": 9.997897292425082e-06, + "loss": 0.8815, + "step": 784 + }, + { + "epoch": 0.04232262238516282, + "grad_norm": 0.9794465899467468, + "learning_rate": 9.997891140095519e-06, + "loss": 0.9244, + "step": 785 + }, + { + "epoch": 0.04237653655380634, + "grad_norm": 0.875953197479248, + "learning_rate": 9.99788497878042e-06, + "loss": 0.9191, + "step": 786 + }, + { + "epoch": 0.04243045072244986, + "grad_norm": 0.9880902767181396, + "learning_rate": 9.9978788084798e-06, + "loss": 0.8639, + "step": 787 + }, + { + "epoch": 0.042484364891093376, + "grad_norm": 1.0391566753387451, + "learning_rate": 9.997872629193666e-06, + "loss": 0.9943, + "step": 788 + }, + { + "epoch": 0.0425382790597369, + "grad_norm": 0.9321290850639343, + "learning_rate": 
9.997866440922033e-06, + "loss": 0.7809, + "step": 789 + }, + { + "epoch": 0.04259219322838042, + "grad_norm": 0.8898556232452393, + "learning_rate": 9.99786024366491e-06, + "loss": 0.9353, + "step": 790 + }, + { + "epoch": 0.04264610739702394, + "grad_norm": 1.1177983283996582, + "learning_rate": 9.997854037422306e-06, + "loss": 0.8157, + "step": 791 + }, + { + "epoch": 0.042700021565667455, + "grad_norm": 0.8821296691894531, + "learning_rate": 9.997847822194236e-06, + "loss": 0.8729, + "step": 792 + }, + { + "epoch": 0.04275393573431098, + "grad_norm": 0.8545325398445129, + "learning_rate": 9.997841597980709e-06, + "loss": 0.8415, + "step": 793 + }, + { + "epoch": 0.0428078499029545, + "grad_norm": 0.9313606023788452, + "learning_rate": 9.997835364781739e-06, + "loss": 0.8411, + "step": 794 + }, + { + "epoch": 0.042861764071598016, + "grad_norm": 0.9587781429290771, + "learning_rate": 9.997829122597332e-06, + "loss": 0.8086, + "step": 795 + }, + { + "epoch": 0.042915678240241534, + "grad_norm": 0.9708360433578491, + "learning_rate": 9.997822871427504e-06, + "loss": 0.8715, + "step": 796 + }, + { + "epoch": 0.04296959240888505, + "grad_norm": 0.8868080973625183, + "learning_rate": 9.997816611272265e-06, + "loss": 0.8549, + "step": 797 + }, + { + "epoch": 0.04302350657752858, + "grad_norm": 0.9147778153419495, + "learning_rate": 9.997810342131624e-06, + "loss": 0.7854, + "step": 798 + }, + { + "epoch": 0.043077420746172095, + "grad_norm": 0.9853960275650024, + "learning_rate": 9.997804064005596e-06, + "loss": 0.8243, + "step": 799 + }, + { + "epoch": 0.04313133491481561, + "grad_norm": 1.0076130628585815, + "learning_rate": 9.997797776894189e-06, + "loss": 0.9077, + "step": 800 + }, + { + "epoch": 0.04318524908345913, + "grad_norm": 0.9694076776504517, + "learning_rate": 9.997791480797417e-06, + "loss": 0.8767, + "step": 801 + }, + { + "epoch": 0.043239163252102655, + "grad_norm": 1.114001750946045, + "learning_rate": 9.99778517571529e-06, + "loss": 0.8211, + 
"step": 802 + }, + { + "epoch": 0.04329307742074617, + "grad_norm": 0.9701128005981445, + "learning_rate": 9.997778861647817e-06, + "loss": 0.9084, + "step": 803 + }, + { + "epoch": 0.04334699158938969, + "grad_norm": 0.868299126625061, + "learning_rate": 9.997772538595015e-06, + "loss": 0.7556, + "step": 804 + }, + { + "epoch": 0.04340090575803321, + "grad_norm": 0.9160446524620056, + "learning_rate": 9.997766206556888e-06, + "loss": 0.821, + "step": 805 + }, + { + "epoch": 0.043454819926676734, + "grad_norm": 0.934198260307312, + "learning_rate": 9.997759865533454e-06, + "loss": 0.9113, + "step": 806 + }, + { + "epoch": 0.04350873409532025, + "grad_norm": 0.8949079513549805, + "learning_rate": 9.997753515524722e-06, + "loss": 0.7821, + "step": 807 + }, + { + "epoch": 0.04356264826396377, + "grad_norm": 0.9035944938659668, + "learning_rate": 9.997747156530702e-06, + "loss": 0.8233, + "step": 808 + }, + { + "epoch": 0.04361656243260729, + "grad_norm": 0.9681552052497864, + "learning_rate": 9.99774078855141e-06, + "loss": 0.9241, + "step": 809 + }, + { + "epoch": 0.043670476601250806, + "grad_norm": 0.906092643737793, + "learning_rate": 9.99773441158685e-06, + "loss": 0.8948, + "step": 810 + }, + { + "epoch": 0.04372439076989433, + "grad_norm": 0.9229143261909485, + "learning_rate": 9.997728025637039e-06, + "loss": 0.8897, + "step": 811 + }, + { + "epoch": 0.04377830493853785, + "grad_norm": 0.9263061881065369, + "learning_rate": 9.997721630701986e-06, + "loss": 0.7923, + "step": 812 + }, + { + "epoch": 0.04383221910718137, + "grad_norm": 0.8474372029304504, + "learning_rate": 9.997715226781706e-06, + "loss": 0.796, + "step": 813 + }, + { + "epoch": 0.043886133275824885, + "grad_norm": 0.9960548877716064, + "learning_rate": 9.997708813876206e-06, + "loss": 0.9166, + "step": 814 + }, + { + "epoch": 0.04394004744446841, + "grad_norm": 0.9843032956123352, + "learning_rate": 9.997702391985499e-06, + "loss": 0.9354, + "step": 815 + }, + { + "epoch": 0.04399396161311193, 
+ "grad_norm": 0.9313154220581055, + "learning_rate": 9.997695961109599e-06, + "loss": 0.8972, + "step": 816 + }, + { + "epoch": 0.044047875781755445, + "grad_norm": 0.8846973180770874, + "learning_rate": 9.997689521248515e-06, + "loss": 0.8599, + "step": 817 + }, + { + "epoch": 0.04410178995039896, + "grad_norm": 0.8113641738891602, + "learning_rate": 9.99768307240226e-06, + "loss": 0.8509, + "step": 818 + }, + { + "epoch": 0.04415570411904248, + "grad_norm": 1.0659984350204468, + "learning_rate": 9.997676614570844e-06, + "loss": 0.938, + "step": 819 + }, + { + "epoch": 0.044209618287686006, + "grad_norm": 0.9183745384216309, + "learning_rate": 9.99767014775428e-06, + "loss": 0.8761, + "step": 820 + }, + { + "epoch": 0.044263532456329524, + "grad_norm": 0.87090003490448, + "learning_rate": 9.997663671952578e-06, + "loss": 0.8535, + "step": 821 + }, + { + "epoch": 0.04431744662497304, + "grad_norm": 0.9857214093208313, + "learning_rate": 9.997657187165753e-06, + "loss": 0.9434, + "step": 822 + }, + { + "epoch": 0.04437136079361656, + "grad_norm": 1.0443209409713745, + "learning_rate": 9.997650693393812e-06, + "loss": 0.8994, + "step": 823 + }, + { + "epoch": 0.044425274962260085, + "grad_norm": 0.8348391652107239, + "learning_rate": 9.99764419063677e-06, + "loss": 0.8383, + "step": 824 + }, + { + "epoch": 0.0444791891309036, + "grad_norm": 1.2708821296691895, + "learning_rate": 9.997637678894639e-06, + "loss": 0.8733, + "step": 825 + }, + { + "epoch": 0.04453310329954712, + "grad_norm": 0.9863126277923584, + "learning_rate": 9.997631158167428e-06, + "loss": 0.9364, + "step": 826 + }, + { + "epoch": 0.04458701746819064, + "grad_norm": 1.0223352909088135, + "learning_rate": 9.99762462845515e-06, + "loss": 0.9139, + "step": 827 + }, + { + "epoch": 0.04464093163683416, + "grad_norm": 0.8559738397598267, + "learning_rate": 9.997618089757818e-06, + "loss": 0.7461, + "step": 828 + }, + { + "epoch": 0.04469484580547768, + "grad_norm": 0.9347368478775024, + "learning_rate": 
9.997611542075442e-06, + "loss": 0.9275, + "step": 829 + }, + { + "epoch": 0.0447487599741212, + "grad_norm": 1.0208019018173218, + "learning_rate": 9.997604985408036e-06, + "loss": 0.8338, + "step": 830 + }, + { + "epoch": 0.04480267414276472, + "grad_norm": 0.9792174100875854, + "learning_rate": 9.997598419755607e-06, + "loss": 0.9437, + "step": 831 + }, + { + "epoch": 0.044856588311408235, + "grad_norm": 0.851665198802948, + "learning_rate": 9.997591845118173e-06, + "loss": 0.8008, + "step": 832 + }, + { + "epoch": 0.04491050248005176, + "grad_norm": 0.9315025806427002, + "learning_rate": 9.997585261495742e-06, + "loss": 0.8389, + "step": 833 + }, + { + "epoch": 0.04496441664869528, + "grad_norm": 0.9658921360969543, + "learning_rate": 9.997578668888326e-06, + "loss": 0.9252, + "step": 834 + }, + { + "epoch": 0.045018330817338796, + "grad_norm": 0.8989397287368774, + "learning_rate": 9.997572067295938e-06, + "loss": 0.8648, + "step": 835 + }, + { + "epoch": 0.045072244985982314, + "grad_norm": 0.8874988555908203, + "learning_rate": 9.99756545671859e-06, + "loss": 0.7801, + "step": 836 + }, + { + "epoch": 0.04512615915462584, + "grad_norm": 0.9186223745346069, + "learning_rate": 9.997558837156293e-06, + "loss": 0.767, + "step": 837 + }, + { + "epoch": 0.04518007332326936, + "grad_norm": 1.163044810295105, + "learning_rate": 9.997552208609059e-06, + "loss": 0.8938, + "step": 838 + }, + { + "epoch": 0.045233987491912875, + "grad_norm": 0.8315468430519104, + "learning_rate": 9.997545571076901e-06, + "loss": 0.725, + "step": 839 + }, + { + "epoch": 0.04528790166055639, + "grad_norm": 1.0088660717010498, + "learning_rate": 9.99753892455983e-06, + "loss": 0.8533, + "step": 840 + }, + { + "epoch": 0.04534181582919991, + "grad_norm": 0.9268692135810852, + "learning_rate": 9.997532269057857e-06, + "loss": 0.8739, + "step": 841 + }, + { + "epoch": 0.045395729997843436, + "grad_norm": 1.0793242454528809, + "learning_rate": 9.997525604570995e-06, + "loss": 0.9605, + "step": 
842 + }, + { + "epoch": 0.045449644166486954, + "grad_norm": 1.101798176765442, + "learning_rate": 9.997518931099258e-06, + "loss": 0.9525, + "step": 843 + }, + { + "epoch": 0.04550355833513047, + "grad_norm": 0.9046466946601868, + "learning_rate": 9.997512248642654e-06, + "loss": 0.8853, + "step": 844 + }, + { + "epoch": 0.04555747250377399, + "grad_norm": 0.9629097580909729, + "learning_rate": 9.997505557201198e-06, + "loss": 0.8882, + "step": 845 + }, + { + "epoch": 0.045611386672417514, + "grad_norm": 1.1880977153778076, + "learning_rate": 9.997498856774898e-06, + "loss": 0.8812, + "step": 846 + }, + { + "epoch": 0.04566530084106103, + "grad_norm": 0.8678451180458069, + "learning_rate": 9.997492147363772e-06, + "loss": 0.887, + "step": 847 + }, + { + "epoch": 0.04571921500970455, + "grad_norm": 1.3359739780426025, + "learning_rate": 9.99748542896783e-06, + "loss": 0.8141, + "step": 848 + }, + { + "epoch": 0.04577312917834807, + "grad_norm": 0.9263296127319336, + "learning_rate": 9.99747870158708e-06, + "loss": 0.9357, + "step": 849 + }, + { + "epoch": 0.045827043346991586, + "grad_norm": 0.9199776649475098, + "learning_rate": 9.997471965221541e-06, + "loss": 0.8352, + "step": 850 + }, + { + "epoch": 0.04588095751563511, + "grad_norm": 0.8880730867385864, + "learning_rate": 9.997465219871218e-06, + "loss": 0.7802, + "step": 851 + }, + { + "epoch": 0.04593487168427863, + "grad_norm": 0.8561250567436218, + "learning_rate": 9.99745846553613e-06, + "loss": 0.7987, + "step": 852 + }, + { + "epoch": 0.04598878585292215, + "grad_norm": 0.8975661396980286, + "learning_rate": 9.997451702216283e-06, + "loss": 0.8325, + "step": 853 + }, + { + "epoch": 0.046042700021565665, + "grad_norm": 0.9350215196609497, + "learning_rate": 9.997444929911693e-06, + "loss": 0.7708, + "step": 854 + }, + { + "epoch": 0.04609661419020919, + "grad_norm": 1.0229014158248901, + "learning_rate": 9.99743814862237e-06, + "loss": 0.9643, + "step": 855 + }, + { + "epoch": 0.04615052835885271, + 
"grad_norm": 0.9249217510223389, + "learning_rate": 9.997431358348329e-06, + "loss": 0.8411, + "step": 856 + }, + { + "epoch": 0.046204442527496226, + "grad_norm": 0.9823042154312134, + "learning_rate": 9.99742455908958e-06, + "loss": 0.9406, + "step": 857 + }, + { + "epoch": 0.046258356696139744, + "grad_norm": 1.2525794506072998, + "learning_rate": 9.997417750846134e-06, + "loss": 0.8507, + "step": 858 + }, + { + "epoch": 0.04631227086478327, + "grad_norm": 0.9583309888839722, + "learning_rate": 9.997410933618006e-06, + "loss": 0.8504, + "step": 859 + }, + { + "epoch": 0.046366185033426786, + "grad_norm": 0.9264401793479919, + "learning_rate": 9.997404107405207e-06, + "loss": 0.8595, + "step": 860 + }, + { + "epoch": 0.046420099202070304, + "grad_norm": 0.9833316206932068, + "learning_rate": 9.99739727220775e-06, + "loss": 0.9025, + "step": 861 + }, + { + "epoch": 0.04647401337071382, + "grad_norm": 1.0220664739608765, + "learning_rate": 9.997390428025645e-06, + "loss": 0.8671, + "step": 862 + }, + { + "epoch": 0.04652792753935734, + "grad_norm": 1.0774664878845215, + "learning_rate": 9.997383574858908e-06, + "loss": 0.8463, + "step": 863 + }, + { + "epoch": 0.046581841708000865, + "grad_norm": 0.8821879029273987, + "learning_rate": 9.997376712707547e-06, + "loss": 0.7565, + "step": 864 + }, + { + "epoch": 0.04663575587664438, + "grad_norm": 0.9233925938606262, + "learning_rate": 9.997369841571577e-06, + "loss": 0.9151, + "step": 865 + }, + { + "epoch": 0.0466896700452879, + "grad_norm": 1.0006109476089478, + "learning_rate": 9.997362961451015e-06, + "loss": 0.8339, + "step": 866 + }, + { + "epoch": 0.04674358421393142, + "grad_norm": 0.865035891532898, + "learning_rate": 9.997356072345863e-06, + "loss": 0.8997, + "step": 867 + }, + { + "epoch": 0.046797498382574944, + "grad_norm": 1.0450654029846191, + "learning_rate": 9.99734917425614e-06, + "loss": 0.7966, + "step": 868 + }, + { + "epoch": 0.04685141255121846, + "grad_norm": 0.8878824710845947, + 
"learning_rate": 9.997342267181857e-06, + "loss": 0.831, + "step": 869 + }, + { + "epoch": 0.04690532671986198, + "grad_norm": 1.0056546926498413, + "learning_rate": 9.997335351123028e-06, + "loss": 0.8178, + "step": 870 + }, + { + "epoch": 0.0469592408885055, + "grad_norm": 1.0531659126281738, + "learning_rate": 9.997328426079661e-06, + "loss": 0.7773, + "step": 871 + }, + { + "epoch": 0.047013155057149016, + "grad_norm": 0.911021888256073, + "learning_rate": 9.997321492051775e-06, + "loss": 0.9001, + "step": 872 + }, + { + "epoch": 0.04706706922579254, + "grad_norm": 0.920103132724762, + "learning_rate": 9.997314549039379e-06, + "loss": 0.7222, + "step": 873 + }, + { + "epoch": 0.04712098339443606, + "grad_norm": 0.9449265599250793, + "learning_rate": 9.997307597042483e-06, + "loss": 0.9197, + "step": 874 + }, + { + "epoch": 0.047174897563079576, + "grad_norm": 1.013066291809082, + "learning_rate": 9.997300636061103e-06, + "loss": 0.8854, + "step": 875 + }, + { + "epoch": 0.047228811731723094, + "grad_norm": 0.8990256786346436, + "learning_rate": 9.99729366609525e-06, + "loss": 0.81, + "step": 876 + }, + { + "epoch": 0.04728272590036662, + "grad_norm": 1.0211769342422485, + "learning_rate": 9.997286687144938e-06, + "loss": 0.8335, + "step": 877 + }, + { + "epoch": 0.04733664006901014, + "grad_norm": 1.14606773853302, + "learning_rate": 9.997279699210178e-06, + "loss": 1.0956, + "step": 878 + }, + { + "epoch": 0.047390554237653655, + "grad_norm": 0.982725977897644, + "learning_rate": 9.997272702290981e-06, + "loss": 0.8289, + "step": 879 + }, + { + "epoch": 0.04744446840629717, + "grad_norm": 0.8667361736297607, + "learning_rate": 9.997265696387364e-06, + "loss": 0.8056, + "step": 880 + }, + { + "epoch": 0.04749838257494069, + "grad_norm": 0.9029837250709534, + "learning_rate": 9.997258681499338e-06, + "loss": 0.8461, + "step": 881 + }, + { + "epoch": 0.047552296743584216, + "grad_norm": 0.8767060041427612, + "learning_rate": 9.997251657626915e-06, + "loss": 
0.8162, + "step": 882 + }, + { + "epoch": 0.047606210912227734, + "grad_norm": 1.4750713109970093, + "learning_rate": 9.997244624770104e-06, + "loss": 0.8677, + "step": 883 + }, + { + "epoch": 0.04766012508087125, + "grad_norm": 1.001286506652832, + "learning_rate": 9.997237582928924e-06, + "loss": 0.7673, + "step": 884 + }, + { + "epoch": 0.04771403924951477, + "grad_norm": 0.9560269713401794, + "learning_rate": 9.997230532103384e-06, + "loss": 0.8597, + "step": 885 + }, + { + "epoch": 0.047767953418158295, + "grad_norm": 0.834237277507782, + "learning_rate": 9.997223472293499e-06, + "loss": 0.7629, + "step": 886 + }, + { + "epoch": 0.04782186758680181, + "grad_norm": 0.9642406702041626, + "learning_rate": 9.997216403499278e-06, + "loss": 0.83, + "step": 887 + }, + { + "epoch": 0.04787578175544533, + "grad_norm": 1.2931480407714844, + "learning_rate": 9.997209325720736e-06, + "loss": 1.0333, + "step": 888 + }, + { + "epoch": 0.04792969592408885, + "grad_norm": 0.8024531602859497, + "learning_rate": 9.997202238957886e-06, + "loss": 0.7166, + "step": 889 + }, + { + "epoch": 0.04798361009273237, + "grad_norm": 0.9585899710655212, + "learning_rate": 9.997195143210741e-06, + "loss": 0.8099, + "step": 890 + }, + { + "epoch": 0.04803752426137589, + "grad_norm": 0.9917063117027283, + "learning_rate": 9.997188038479313e-06, + "loss": 0.8486, + "step": 891 + }, + { + "epoch": 0.04809143843001941, + "grad_norm": 1.6290080547332764, + "learning_rate": 9.997180924763616e-06, + "loss": 0.863, + "step": 892 + }, + { + "epoch": 0.04814535259866293, + "grad_norm": 0.9488585591316223, + "learning_rate": 9.99717380206366e-06, + "loss": 0.8277, + "step": 893 + }, + { + "epoch": 0.048199266767306445, + "grad_norm": 1.0710817575454712, + "learning_rate": 9.997166670379459e-06, + "loss": 0.8898, + "step": 894 + }, + { + "epoch": 0.04825318093594997, + "grad_norm": 0.9916248917579651, + "learning_rate": 9.997159529711026e-06, + "loss": 0.9144, + "step": 895 + }, + { + "epoch": 
0.04830709510459349, + "grad_norm": 1.0074565410614014, + "learning_rate": 9.997152380058378e-06, + "loss": 0.8391, + "step": 896 + }, + { + "epoch": 0.048361009273237006, + "grad_norm": 1.0258312225341797, + "learning_rate": 9.99714522142152e-06, + "loss": 0.973, + "step": 897 + }, + { + "epoch": 0.048414923441880524, + "grad_norm": 0.9497826099395752, + "learning_rate": 9.99713805380047e-06, + "loss": 0.9221, + "step": 898 + }, + { + "epoch": 0.04846883761052405, + "grad_norm": 0.9103115200996399, + "learning_rate": 9.99713087719524e-06, + "loss": 0.7942, + "step": 899 + }, + { + "epoch": 0.04852275177916757, + "grad_norm": 0.9810470938682556, + "learning_rate": 9.997123691605843e-06, + "loss": 0.8673, + "step": 900 + }, + { + "epoch": 0.048576665947811085, + "grad_norm": 1.0422937870025635, + "learning_rate": 9.997116497032291e-06, + "loss": 0.9263, + "step": 901 + }, + { + "epoch": 0.0486305801164546, + "grad_norm": 0.8522017002105713, + "learning_rate": 9.997109293474596e-06, + "loss": 0.8296, + "step": 902 + }, + { + "epoch": 0.04868449428509812, + "grad_norm": 0.818270742893219, + "learning_rate": 9.997102080932775e-06, + "loss": 0.7898, + "step": 903 + }, + { + "epoch": 0.048738408453741645, + "grad_norm": 0.9286766648292542, + "learning_rate": 9.997094859406838e-06, + "loss": 0.8751, + "step": 904 + }, + { + "epoch": 0.04879232262238516, + "grad_norm": 1.0779087543487549, + "learning_rate": 9.997087628896797e-06, + "loss": 0.8377, + "step": 905 + }, + { + "epoch": 0.04884623679102868, + "grad_norm": 0.8711867928504944, + "learning_rate": 9.997080389402667e-06, + "loss": 0.8547, + "step": 906 + }, + { + "epoch": 0.0489001509596722, + "grad_norm": 0.8919721245765686, + "learning_rate": 9.99707314092446e-06, + "loss": 0.8178, + "step": 907 + }, + { + "epoch": 0.048954065128315724, + "grad_norm": 0.9084917306900024, + "learning_rate": 9.997065883462192e-06, + "loss": 0.8618, + "step": 908 + }, + { + "epoch": 0.04900797929695924, + "grad_norm": 
0.869216799736023, + "learning_rate": 9.997058617015871e-06, + "loss": 0.8636, + "step": 909 + }, + { + "epoch": 0.04906189346560276, + "grad_norm": 0.9376553893089294, + "learning_rate": 9.997051341585513e-06, + "loss": 0.8986, + "step": 910 + }, + { + "epoch": 0.04911580763424628, + "grad_norm": 0.9041107892990112, + "learning_rate": 9.99704405717113e-06, + "loss": 0.817, + "step": 911 + }, + { + "epoch": 0.0491697218028898, + "grad_norm": 0.9530431628227234, + "learning_rate": 9.997036763772737e-06, + "loss": 0.9464, + "step": 912 + }, + { + "epoch": 0.04922363597153332, + "grad_norm": 0.9601117968559265, + "learning_rate": 9.997029461390344e-06, + "loss": 0.9014, + "step": 913 + }, + { + "epoch": 0.04927755014017684, + "grad_norm": 0.9162781834602356, + "learning_rate": 9.997022150023968e-06, + "loss": 0.8851, + "step": 914 + }, + { + "epoch": 0.04933146430882036, + "grad_norm": 0.9514605402946472, + "learning_rate": 9.99701482967362e-06, + "loss": 0.8975, + "step": 915 + }, + { + "epoch": 0.049385378477463875, + "grad_norm": 0.897203803062439, + "learning_rate": 9.997007500339313e-06, + "loss": 0.8371, + "step": 916 + }, + { + "epoch": 0.0494392926461074, + "grad_norm": 0.9372673630714417, + "learning_rate": 9.99700016202106e-06, + "loss": 0.9432, + "step": 917 + }, + { + "epoch": 0.04949320681475092, + "grad_norm": 0.8993443846702576, + "learning_rate": 9.996992814718875e-06, + "loss": 0.8528, + "step": 918 + }, + { + "epoch": 0.049547120983394435, + "grad_norm": 0.9300720691680908, + "learning_rate": 9.996985458432771e-06, + "loss": 0.873, + "step": 919 + }, + { + "epoch": 0.04960103515203795, + "grad_norm": 0.9311426281929016, + "learning_rate": 9.996978093162761e-06, + "loss": 0.9092, + "step": 920 + }, + { + "epoch": 0.04965494932068148, + "grad_norm": 0.9244507551193237, + "learning_rate": 9.996970718908859e-06, + "loss": 0.764, + "step": 921 + }, + { + "epoch": 0.049708863489324996, + "grad_norm": 0.915512204170227, + "learning_rate": 
9.996963335671074e-06, + "loss": 0.8328, + "step": 922 + }, + { + "epoch": 0.049762777657968514, + "grad_norm": 0.889994740486145, + "learning_rate": 9.996955943449426e-06, + "loss": 0.8491, + "step": 923 + }, + { + "epoch": 0.04981669182661203, + "grad_norm": 0.8676478266716003, + "learning_rate": 9.996948542243925e-06, + "loss": 0.7677, + "step": 924 + }, + { + "epoch": 0.04987060599525555, + "grad_norm": 0.9795013070106506, + "learning_rate": 9.996941132054586e-06, + "loss": 0.9279, + "step": 925 + }, + { + "epoch": 0.049924520163899075, + "grad_norm": 0.940078854560852, + "learning_rate": 9.996933712881419e-06, + "loss": 0.8685, + "step": 926 + }, + { + "epoch": 0.04997843433254259, + "grad_norm": 0.9440926313400269, + "learning_rate": 9.996926284724437e-06, + "loss": 0.9634, + "step": 927 + }, + { + "epoch": 0.05003234850118611, + "grad_norm": 0.9120537638664246, + "learning_rate": 9.99691884758366e-06, + "loss": 0.7656, + "step": 928 + }, + { + "epoch": 0.05008626266982963, + "grad_norm": 1.1514596939086914, + "learning_rate": 9.996911401459093e-06, + "loss": 0.864, + "step": 929 + }, + { + "epoch": 0.050140176838473154, + "grad_norm": 0.8924434185028076, + "learning_rate": 9.996903946350756e-06, + "loss": 0.877, + "step": 930 + }, + { + "epoch": 0.05019409100711667, + "grad_norm": 0.9884456992149353, + "learning_rate": 9.996896482258657e-06, + "loss": 0.94, + "step": 931 + }, + { + "epoch": 0.05024800517576019, + "grad_norm": 0.9282665252685547, + "learning_rate": 9.996889009182814e-06, + "loss": 0.8443, + "step": 932 + }, + { + "epoch": 0.05030191934440371, + "grad_norm": 1.1029064655303955, + "learning_rate": 9.996881527123237e-06, + "loss": 0.9168, + "step": 933 + }, + { + "epoch": 0.050355833513047225, + "grad_norm": 0.839625358581543, + "learning_rate": 9.996874036079942e-06, + "loss": 0.8261, + "step": 934 + }, + { + "epoch": 0.05040974768169075, + "grad_norm": 0.8612869381904602, + "learning_rate": 9.996866536052942e-06, + "loss": 0.8197, + "step": 
935 + }, + { + "epoch": 0.05046366185033427, + "grad_norm": 0.9483891129493713, + "learning_rate": 9.996859027042249e-06, + "loss": 0.8374, + "step": 936 + }, + { + "epoch": 0.050517576018977786, + "grad_norm": 0.9374566674232483, + "learning_rate": 9.996851509047877e-06, + "loss": 0.8884, + "step": 937 + }, + { + "epoch": 0.050571490187621304, + "grad_norm": 0.9164647459983826, + "learning_rate": 9.99684398206984e-06, + "loss": 0.8419, + "step": 938 + }, + { + "epoch": 0.05062540435626483, + "grad_norm": 1.0109184980392456, + "learning_rate": 9.996836446108153e-06, + "loss": 0.8912, + "step": 939 + }, + { + "epoch": 0.05067931852490835, + "grad_norm": 0.8549674153327942, + "learning_rate": 9.996828901162825e-06, + "loss": 0.8043, + "step": 940 + }, + { + "epoch": 0.050733232693551865, + "grad_norm": 0.9618684649467468, + "learning_rate": 9.996821347233875e-06, + "loss": 0.8246, + "step": 941 + }, + { + "epoch": 0.05078714686219538, + "grad_norm": 0.9777100682258606, + "learning_rate": 9.996813784321314e-06, + "loss": 0.887, + "step": 942 + }, + { + "epoch": 0.05084106103083891, + "grad_norm": 0.8675182461738586, + "learning_rate": 9.996806212425157e-06, + "loss": 0.7584, + "step": 943 + }, + { + "epoch": 0.050894975199482426, + "grad_norm": 0.9174523949623108, + "learning_rate": 9.996798631545414e-06, + "loss": 0.8911, + "step": 944 + }, + { + "epoch": 0.050948889368125944, + "grad_norm": 0.9269078373908997, + "learning_rate": 9.996791041682101e-06, + "loss": 0.8049, + "step": 945 + }, + { + "epoch": 0.05100280353676946, + "grad_norm": 0.8447721600532532, + "learning_rate": 9.996783442835233e-06, + "loss": 0.7781, + "step": 946 + }, + { + "epoch": 0.05105671770541298, + "grad_norm": 0.9178231954574585, + "learning_rate": 9.99677583500482e-06, + "loss": 0.8107, + "step": 947 + }, + { + "epoch": 0.051110631874056504, + "grad_norm": 0.8741039633750916, + "learning_rate": 9.996768218190879e-06, + "loss": 0.9278, + "step": 948 + }, + { + "epoch": 0.05116454604270002, + 
"grad_norm": 0.7997228503227234, + "learning_rate": 9.996760592393425e-06, + "loss": 0.7706, + "step": 949 + }, + { + "epoch": 0.05121846021134354, + "grad_norm": 1.003300428390503, + "learning_rate": 9.996752957612468e-06, + "loss": 0.8464, + "step": 950 + }, + { + "epoch": 0.05127237437998706, + "grad_norm": 0.9237748980522156, + "learning_rate": 9.996745313848021e-06, + "loss": 0.9088, + "step": 951 + }, + { + "epoch": 0.05132628854863058, + "grad_norm": 0.8565654754638672, + "learning_rate": 9.996737661100103e-06, + "loss": 0.8208, + "step": 952 + }, + { + "epoch": 0.0513802027172741, + "grad_norm": 1.0590770244598389, + "learning_rate": 9.996729999368722e-06, + "loss": 0.9272, + "step": 953 + }, + { + "epoch": 0.05143411688591762, + "grad_norm": 0.8888198733329773, + "learning_rate": 9.996722328653897e-06, + "loss": 0.8264, + "step": 954 + }, + { + "epoch": 0.05148803105456114, + "grad_norm": 0.9211130142211914, + "learning_rate": 9.996714648955636e-06, + "loss": 0.8807, + "step": 955 + }, + { + "epoch": 0.051541945223204655, + "grad_norm": 1.0241321325302124, + "learning_rate": 9.996706960273958e-06, + "loss": 0.7638, + "step": 956 + }, + { + "epoch": 0.05159585939184818, + "grad_norm": 0.903762698173523, + "learning_rate": 9.996699262608875e-06, + "loss": 0.8583, + "step": 957 + }, + { + "epoch": 0.0516497735604917, + "grad_norm": 0.9271189570426941, + "learning_rate": 9.9966915559604e-06, + "loss": 0.8341, + "step": 958 + }, + { + "epoch": 0.051703687729135216, + "grad_norm": 0.865260899066925, + "learning_rate": 9.996683840328546e-06, + "loss": 0.9136, + "step": 959 + }, + { + "epoch": 0.051757601897778734, + "grad_norm": 0.8903625011444092, + "learning_rate": 9.996676115713332e-06, + "loss": 0.8706, + "step": 960 + }, + { + "epoch": 0.05181151606642226, + "grad_norm": 0.9228227138519287, + "learning_rate": 9.996668382114765e-06, + "loss": 0.8825, + "step": 961 + }, + { + "epoch": 0.051865430235065776, + "grad_norm": 0.9146421551704407, + "learning_rate": 
9.996660639532863e-06, + "loss": 0.8347, + "step": 962 + }, + { + "epoch": 0.051919344403709294, + "grad_norm": 0.9010991454124451, + "learning_rate": 9.99665288796764e-06, + "loss": 0.8016, + "step": 963 + }, + { + "epoch": 0.05197325857235281, + "grad_norm": 0.8763105869293213, + "learning_rate": 9.996645127419107e-06, + "loss": 0.8651, + "step": 964 + }, + { + "epoch": 0.05202717274099634, + "grad_norm": 0.9506256580352783, + "learning_rate": 9.996637357887281e-06, + "loss": 0.9429, + "step": 965 + }, + { + "epoch": 0.052081086909639855, + "grad_norm": 0.9484269022941589, + "learning_rate": 9.996629579372175e-06, + "loss": 0.855, + "step": 966 + }, + { + "epoch": 0.05213500107828337, + "grad_norm": 0.8970646262168884, + "learning_rate": 9.996621791873804e-06, + "loss": 0.8611, + "step": 967 + }, + { + "epoch": 0.05218891524692689, + "grad_norm": 0.8925203680992126, + "learning_rate": 9.99661399539218e-06, + "loss": 0.8206, + "step": 968 + }, + { + "epoch": 0.05224282941557041, + "grad_norm": 1.069669246673584, + "learning_rate": 9.996606189927318e-06, + "loss": 0.876, + "step": 969 + }, + { + "epoch": 0.052296743584213934, + "grad_norm": 0.8456307649612427, + "learning_rate": 9.996598375479232e-06, + "loss": 0.7514, + "step": 970 + }, + { + "epoch": 0.05235065775285745, + "grad_norm": 0.9182801246643066, + "learning_rate": 9.996590552047936e-06, + "loss": 0.8915, + "step": 971 + }, + { + "epoch": 0.05240457192150097, + "grad_norm": 0.7616676688194275, + "learning_rate": 9.996582719633445e-06, + "loss": 0.7106, + "step": 972 + }, + { + "epoch": 0.05245848609014449, + "grad_norm": 0.8873127102851868, + "learning_rate": 9.99657487823577e-06, + "loss": 0.9171, + "step": 973 + }, + { + "epoch": 0.05251240025878801, + "grad_norm": 0.9724618792533875, + "learning_rate": 9.996567027854929e-06, + "loss": 0.9765, + "step": 974 + }, + { + "epoch": 0.05256631442743153, + "grad_norm": 0.9106513857841492, + "learning_rate": 9.996559168490933e-06, + "loss": 0.8332, + "step": 
975 + }, + { + "epoch": 0.05262022859607505, + "grad_norm": 0.8551159501075745, + "learning_rate": 9.996551300143798e-06, + "loss": 0.8128, + "step": 976 + }, + { + "epoch": 0.052674142764718566, + "grad_norm": 0.9829822182655334, + "learning_rate": 9.996543422813539e-06, + "loss": 0.9088, + "step": 977 + }, + { + "epoch": 0.052728056933362084, + "grad_norm": 0.8281888961791992, + "learning_rate": 9.996535536500166e-06, + "loss": 0.8338, + "step": 978 + }, + { + "epoch": 0.05278197110200561, + "grad_norm": 0.951319694519043, + "learning_rate": 9.9965276412037e-06, + "loss": 0.9359, + "step": 979 + }, + { + "epoch": 0.05283588527064913, + "grad_norm": 0.841390073299408, + "learning_rate": 9.996519736924148e-06, + "loss": 0.7952, + "step": 980 + }, + { + "epoch": 0.052889799439292645, + "grad_norm": 0.8847686648368835, + "learning_rate": 9.996511823661528e-06, + "loss": 0.8435, + "step": 981 + }, + { + "epoch": 0.05294371360793616, + "grad_norm": 0.9261316061019897, + "learning_rate": 9.996503901415855e-06, + "loss": 0.8646, + "step": 982 + }, + { + "epoch": 0.05299762777657969, + "grad_norm": 0.9366586804389954, + "learning_rate": 9.99649597018714e-06, + "loss": 0.8586, + "step": 983 + }, + { + "epoch": 0.053051541945223206, + "grad_norm": 0.8916764259338379, + "learning_rate": 9.9964880299754e-06, + "loss": 0.8215, + "step": 984 + }, + { + "epoch": 0.053105456113866724, + "grad_norm": 0.9496534466743469, + "learning_rate": 9.996480080780648e-06, + "loss": 0.7984, + "step": 985 + }, + { + "epoch": 0.05315937028251024, + "grad_norm": 0.9736526608467102, + "learning_rate": 9.9964721226029e-06, + "loss": 0.7881, + "step": 986 + }, + { + "epoch": 0.05321328445115376, + "grad_norm": 0.9533856511116028, + "learning_rate": 9.996464155442167e-06, + "loss": 0.9855, + "step": 987 + }, + { + "epoch": 0.053267198619797285, + "grad_norm": 0.9656437039375305, + "learning_rate": 9.996456179298467e-06, + "loss": 0.9571, + "step": 988 + }, + { + "epoch": 0.0533211127884408, + 
"grad_norm": 0.8887313008308411, + "learning_rate": 9.996448194171813e-06, + "loss": 0.9381, + "step": 989 + }, + { + "epoch": 0.05337502695708432, + "grad_norm": 1.0181535482406616, + "learning_rate": 9.996440200062217e-06, + "loss": 0.8834, + "step": 990 + }, + { + "epoch": 0.05342894112572784, + "grad_norm": 0.9083503484725952, + "learning_rate": 9.996432196969696e-06, + "loss": 0.9733, + "step": 991 + }, + { + "epoch": 0.05348285529437136, + "grad_norm": 0.9051093459129333, + "learning_rate": 9.996424184894264e-06, + "loss": 0.8531, + "step": 992 + }, + { + "epoch": 0.05353676946301488, + "grad_norm": 1.0264357328414917, + "learning_rate": 9.996416163835935e-06, + "loss": 0.9212, + "step": 993 + }, + { + "epoch": 0.0535906836316584, + "grad_norm": 1.0350812673568726, + "learning_rate": 9.996408133794726e-06, + "loss": 0.7843, + "step": 994 + }, + { + "epoch": 0.05364459780030192, + "grad_norm": 0.9610341787338257, + "learning_rate": 9.996400094770647e-06, + "loss": 0.8561, + "step": 995 + }, + { + "epoch": 0.05369851196894544, + "grad_norm": 0.8123961687088013, + "learning_rate": 9.996392046763714e-06, + "loss": 0.8296, + "step": 996 + }, + { + "epoch": 0.05375242613758896, + "grad_norm": 0.9337920546531677, + "learning_rate": 9.996383989773942e-06, + "loss": 0.8525, + "step": 997 + }, + { + "epoch": 0.05380634030623248, + "grad_norm": 1.1319444179534912, + "learning_rate": 9.996375923801347e-06, + "loss": 0.9127, + "step": 998 + }, + { + "epoch": 0.053860254474875996, + "grad_norm": 0.8506798148155212, + "learning_rate": 9.996367848845941e-06, + "loss": 0.884, + "step": 999 + }, + { + "epoch": 0.053914168643519514, + "grad_norm": 0.8248615860939026, + "learning_rate": 9.996359764907739e-06, + "loss": 0.7579, + "step": 1000 + }, + { + "epoch": 0.05396808281216304, + "grad_norm": 0.9258946180343628, + "learning_rate": 9.996351671986756e-06, + "loss": 0.8632, + "step": 1001 + }, + { + "epoch": 0.05402199698080656, + "grad_norm": 0.8891279101371765, + 
"learning_rate": 9.996343570083006e-06, + "loss": 0.8758, + "step": 1002 + }, + { + "epoch": 0.054075911149450075, + "grad_norm": 0.9592086672782898, + "learning_rate": 9.996335459196505e-06, + "loss": 0.8962, + "step": 1003 + }, + { + "epoch": 0.05412982531809359, + "grad_norm": 0.8937798738479614, + "learning_rate": 9.996327339327267e-06, + "loss": 0.8434, + "step": 1004 + }, + { + "epoch": 0.05418373948673712, + "grad_norm": 0.9602083563804626, + "learning_rate": 9.996319210475307e-06, + "loss": 0.9692, + "step": 1005 + }, + { + "epoch": 0.054237653655380635, + "grad_norm": 0.870637834072113, + "learning_rate": 9.996311072640637e-06, + "loss": 0.9146, + "step": 1006 + }, + { + "epoch": 0.05429156782402415, + "grad_norm": 0.9330273866653442, + "learning_rate": 9.996302925823276e-06, + "loss": 0.8584, + "step": 1007 + }, + { + "epoch": 0.05434548199266767, + "grad_norm": 0.8185963034629822, + "learning_rate": 9.996294770023234e-06, + "loss": 0.7854, + "step": 1008 + }, + { + "epoch": 0.05439939616131119, + "grad_norm": 0.8727489113807678, + "learning_rate": 9.996286605240528e-06, + "loss": 0.7388, + "step": 1009 + }, + { + "epoch": 0.054453310329954714, + "grad_norm": 1.0858477354049683, + "learning_rate": 9.996278431475172e-06, + "loss": 0.9201, + "step": 1010 + }, + { + "epoch": 0.05450722449859823, + "grad_norm": 0.9749255776405334, + "learning_rate": 9.996270248727184e-06, + "loss": 0.9041, + "step": 1011 + }, + { + "epoch": 0.05456113866724175, + "grad_norm": 0.9460576176643372, + "learning_rate": 9.996262056996575e-06, + "loss": 0.8553, + "step": 1012 + }, + { + "epoch": 0.05461505283588527, + "grad_norm": 0.9379808306694031, + "learning_rate": 9.99625385628336e-06, + "loss": 0.9253, + "step": 1013 + }, + { + "epoch": 0.05466896700452879, + "grad_norm": 0.8154170513153076, + "learning_rate": 9.996245646587553e-06, + "loss": 0.8703, + "step": 1014 + }, + { + "epoch": 0.05472288117317231, + "grad_norm": 0.9122161269187927, + "learning_rate": 
9.996237427909172e-06, + "loss": 0.7734, + "step": 1015 + }, + { + "epoch": 0.05477679534181583, + "grad_norm": 0.9049486517906189, + "learning_rate": 9.996229200248228e-06, + "loss": 0.8991, + "step": 1016 + }, + { + "epoch": 0.05483070951045935, + "grad_norm": 0.9244295358657837, + "learning_rate": 9.996220963604741e-06, + "loss": 0.8514, + "step": 1017 + }, + { + "epoch": 0.05488462367910287, + "grad_norm": 0.9817934036254883, + "learning_rate": 9.99621271797872e-06, + "loss": 0.8641, + "step": 1018 + }, + { + "epoch": 0.05493853784774639, + "grad_norm": 0.9253972768783569, + "learning_rate": 9.996204463370182e-06, + "loss": 0.9199, + "step": 1019 + }, + { + "epoch": 0.05499245201638991, + "grad_norm": 0.9114319682121277, + "learning_rate": 9.996196199779145e-06, + "loss": 0.8063, + "step": 1020 + }, + { + "epoch": 0.055046366185033425, + "grad_norm": 0.9643195867538452, + "learning_rate": 9.996187927205619e-06, + "loss": 0.9668, + "step": 1021 + }, + { + "epoch": 0.05510028035367694, + "grad_norm": 0.8127598166465759, + "learning_rate": 9.996179645649622e-06, + "loss": 0.764, + "step": 1022 + }, + { + "epoch": 0.05515419452232047, + "grad_norm": 0.8728108406066895, + "learning_rate": 9.996171355111167e-06, + "loss": 0.7703, + "step": 1023 + }, + { + "epoch": 0.055208108690963986, + "grad_norm": 0.8554317355155945, + "learning_rate": 9.996163055590269e-06, + "loss": 0.8266, + "step": 1024 + }, + { + "epoch": 0.055262022859607504, + "grad_norm": 0.7951076030731201, + "learning_rate": 9.996154747086946e-06, + "loss": 0.7601, + "step": 1025 + }, + { + "epoch": 0.05531593702825102, + "grad_norm": 0.8916927576065063, + "learning_rate": 9.996146429601208e-06, + "loss": 0.8936, + "step": 1026 + }, + { + "epoch": 0.05536985119689455, + "grad_norm": 1.0242576599121094, + "learning_rate": 9.996138103133075e-06, + "loss": 0.8868, + "step": 1027 + }, + { + "epoch": 0.055423765365538065, + "grad_norm": 0.9273019433021545, + "learning_rate": 9.996129767682557e-06, + "loss": 
0.8622, + "step": 1028 + }, + { + "epoch": 0.05547767953418158, + "grad_norm": 0.9547039866447449, + "learning_rate": 9.996121423249673e-06, + "loss": 0.7814, + "step": 1029 + }, + { + "epoch": 0.0555315937028251, + "grad_norm": 0.8750621676445007, + "learning_rate": 9.996113069834437e-06, + "loss": 0.7717, + "step": 1030 + }, + { + "epoch": 0.05558550787146862, + "grad_norm": 0.9547988176345825, + "learning_rate": 9.996104707436862e-06, + "loss": 0.8877, + "step": 1031 + }, + { + "epoch": 0.055639422040112144, + "grad_norm": 0.8856480717658997, + "learning_rate": 9.996096336056966e-06, + "loss": 0.7927, + "step": 1032 + }, + { + "epoch": 0.05569333620875566, + "grad_norm": 0.8311342000961304, + "learning_rate": 9.99608795569476e-06, + "loss": 0.7847, + "step": 1033 + }, + { + "epoch": 0.05574725037739918, + "grad_norm": 1.0720731019973755, + "learning_rate": 9.996079566350266e-06, + "loss": 0.9243, + "step": 1034 + }, + { + "epoch": 0.0558011645460427, + "grad_norm": 0.9498684406280518, + "learning_rate": 9.996071168023491e-06, + "loss": 0.8605, + "step": 1035 + }, + { + "epoch": 0.05585507871468622, + "grad_norm": 0.9043952822685242, + "learning_rate": 9.996062760714456e-06, + "loss": 0.8488, + "step": 1036 + }, + { + "epoch": 0.05590899288332974, + "grad_norm": 0.8051116466522217, + "learning_rate": 9.996054344423173e-06, + "loss": 0.8275, + "step": 1037 + }, + { + "epoch": 0.05596290705197326, + "grad_norm": 0.857120156288147, + "learning_rate": 9.996045919149658e-06, + "loss": 0.8837, + "step": 1038 + }, + { + "epoch": 0.056016821220616776, + "grad_norm": 0.8810911774635315, + "learning_rate": 9.996037484893926e-06, + "loss": 0.8179, + "step": 1039 + }, + { + "epoch": 0.056070735389260294, + "grad_norm": 0.8783093690872192, + "learning_rate": 9.996029041655994e-06, + "loss": 0.7734, + "step": 1040 + }, + { + "epoch": 0.05612464955790382, + "grad_norm": 0.9281952977180481, + "learning_rate": 9.996020589435874e-06, + "loss": 0.8747, + "step": 1041 + }, + { + 
"epoch": 0.05617856372654734, + "grad_norm": 0.8307299613952637, + "learning_rate": 9.996012128233583e-06, + "loss": 0.8055, + "step": 1042 + }, + { + "epoch": 0.056232477895190855, + "grad_norm": 0.9520873427391052, + "learning_rate": 9.996003658049136e-06, + "loss": 0.8181, + "step": 1043 + }, + { + "epoch": 0.05628639206383437, + "grad_norm": 0.8753806948661804, + "learning_rate": 9.995995178882549e-06, + "loss": 0.808, + "step": 1044 + }, + { + "epoch": 0.0563403062324779, + "grad_norm": 1.067691683769226, + "learning_rate": 9.995986690733836e-06, + "loss": 0.8048, + "step": 1045 + }, + { + "epoch": 0.056394220401121416, + "grad_norm": 0.8575261235237122, + "learning_rate": 9.995978193603013e-06, + "loss": 0.9231, + "step": 1046 + }, + { + "epoch": 0.056448134569764934, + "grad_norm": 0.9857104420661926, + "learning_rate": 9.995969687490096e-06, + "loss": 0.8883, + "step": 1047 + }, + { + "epoch": 0.05650204873840845, + "grad_norm": 0.9203484654426575, + "learning_rate": 9.995961172395098e-06, + "loss": 0.7634, + "step": 1048 + }, + { + "epoch": 0.056555962907051976, + "grad_norm": 0.8741904497146606, + "learning_rate": 9.995952648318036e-06, + "loss": 0.8061, + "step": 1049 + }, + { + "epoch": 0.056609877075695494, + "grad_norm": 0.9495588541030884, + "learning_rate": 9.995944115258925e-06, + "loss": 0.8922, + "step": 1050 + }, + { + "epoch": 0.05666379124433901, + "grad_norm": 0.9306020140647888, + "learning_rate": 9.99593557321778e-06, + "loss": 0.8454, + "step": 1051 + }, + { + "epoch": 0.05671770541298253, + "grad_norm": 0.9457784295082092, + "learning_rate": 9.995927022194615e-06, + "loss": 0.8701, + "step": 1052 + }, + { + "epoch": 0.05677161958162605, + "grad_norm": 0.88719242811203, + "learning_rate": 9.99591846218945e-06, + "loss": 0.8416, + "step": 1053 + }, + { + "epoch": 0.05682553375026957, + "grad_norm": 0.8740848302841187, + "learning_rate": 9.995909893202296e-06, + "loss": 0.7962, + "step": 1054 + }, + { + "epoch": 0.05687944791891309, + 
"grad_norm": 1.0149377584457397, + "learning_rate": 9.99590131523317e-06, + "loss": 0.8352, + "step": 1055 + }, + { + "epoch": 0.05693336208755661, + "grad_norm": 0.9014917016029358, + "learning_rate": 9.995892728282088e-06, + "loss": 0.9244, + "step": 1056 + }, + { + "epoch": 0.05698727625620013, + "grad_norm": 0.9351898431777954, + "learning_rate": 9.995884132349062e-06, + "loss": 0.865, + "step": 1057 + }, + { + "epoch": 0.05704119042484365, + "grad_norm": 0.8656749129295349, + "learning_rate": 9.995875527434113e-06, + "loss": 0.8836, + "step": 1058 + }, + { + "epoch": 0.05709510459348717, + "grad_norm": 0.9120789170265198, + "learning_rate": 9.995866913537254e-06, + "loss": 0.8772, + "step": 1059 + }, + { + "epoch": 0.05714901876213069, + "grad_norm": 1.0019149780273438, + "learning_rate": 9.995858290658497e-06, + "loss": 0.9338, + "step": 1060 + }, + { + "epoch": 0.057202932930774206, + "grad_norm": 0.8492977023124695, + "learning_rate": 9.995849658797863e-06, + "loss": 0.742, + "step": 1061 + }, + { + "epoch": 0.057256847099417724, + "grad_norm": 1.000607967376709, + "learning_rate": 9.995841017955363e-06, + "loss": 0.8498, + "step": 1062 + }, + { + "epoch": 0.05731076126806125, + "grad_norm": 1.0268487930297852, + "learning_rate": 9.995832368131016e-06, + "loss": 0.8937, + "step": 1063 + }, + { + "epoch": 0.057364675436704766, + "grad_norm": 0.9388830661773682, + "learning_rate": 9.995823709324836e-06, + "loss": 0.877, + "step": 1064 + }, + { + "epoch": 0.057418589605348284, + "grad_norm": 0.9747199416160583, + "learning_rate": 9.99581504153684e-06, + "loss": 0.8436, + "step": 1065 + }, + { + "epoch": 0.0574725037739918, + "grad_norm": 0.9125073552131653, + "learning_rate": 9.99580636476704e-06, + "loss": 0.8853, + "step": 1066 + }, + { + "epoch": 0.05752641794263533, + "grad_norm": 0.8910282254219055, + "learning_rate": 9.995797679015455e-06, + "loss": 0.8566, + "step": 1067 + }, + { + "epoch": 0.057580332111278845, + "grad_norm": 0.8546010255813599, + 
"learning_rate": 9.995788984282101e-06, + "loss": 0.8209, + "step": 1068 + }, + { + "epoch": 0.05763424627992236, + "grad_norm": 0.9205883145332336, + "learning_rate": 9.99578028056699e-06, + "loss": 0.7814, + "step": 1069 + }, + { + "epoch": 0.05768816044856588, + "grad_norm": 0.9627780914306641, + "learning_rate": 9.995771567870142e-06, + "loss": 0.8686, + "step": 1070 + }, + { + "epoch": 0.057742074617209406, + "grad_norm": 0.9917465448379517, + "learning_rate": 9.995762846191569e-06, + "loss": 0.9672, + "step": 1071 + }, + { + "epoch": 0.057795988785852924, + "grad_norm": 0.9396706223487854, + "learning_rate": 9.995754115531288e-06, + "loss": 0.8631, + "step": 1072 + }, + { + "epoch": 0.05784990295449644, + "grad_norm": 0.8310922980308533, + "learning_rate": 9.995745375889317e-06, + "loss": 0.8637, + "step": 1073 + }, + { + "epoch": 0.05790381712313996, + "grad_norm": 0.9085954427719116, + "learning_rate": 9.995736627265667e-06, + "loss": 0.8821, + "step": 1074 + }, + { + "epoch": 0.05795773129178348, + "grad_norm": 0.8529816269874573, + "learning_rate": 9.995727869660357e-06, + "loss": 0.8426, + "step": 1075 + }, + { + "epoch": 0.058011645460427, + "grad_norm": 0.8288499116897583, + "learning_rate": 9.995719103073403e-06, + "loss": 0.8415, + "step": 1076 + }, + { + "epoch": 0.05806555962907052, + "grad_norm": 0.9105609059333801, + "learning_rate": 9.995710327504819e-06, + "loss": 0.7683, + "step": 1077 + }, + { + "epoch": 0.05811947379771404, + "grad_norm": 0.9578274488449097, + "learning_rate": 9.995701542954622e-06, + "loss": 0.8796, + "step": 1078 + }, + { + "epoch": 0.058173387966357556, + "grad_norm": 0.8542460799217224, + "learning_rate": 9.995692749422827e-06, + "loss": 0.8363, + "step": 1079 + }, + { + "epoch": 0.05822730213500108, + "grad_norm": 0.8723183274269104, + "learning_rate": 9.99568394690945e-06, + "loss": 0.8434, + "step": 1080 + }, + { + "epoch": 0.0582812163036446, + "grad_norm": 0.9157887697219849, + "learning_rate": 
9.995675135414507e-06, + "loss": 0.6532, + "step": 1081 + }, + { + "epoch": 0.05833513047228812, + "grad_norm": 0.9055691361427307, + "learning_rate": 9.995666314938014e-06, + "loss": 0.8762, + "step": 1082 + }, + { + "epoch": 0.058389044640931635, + "grad_norm": 0.8224693536758423, + "learning_rate": 9.995657485479987e-06, + "loss": 0.7976, + "step": 1083 + }, + { + "epoch": 0.05844295880957515, + "grad_norm": 0.925414502620697, + "learning_rate": 9.995648647040441e-06, + "loss": 0.8673, + "step": 1084 + }, + { + "epoch": 0.05849687297821868, + "grad_norm": 0.9194141626358032, + "learning_rate": 9.995639799619395e-06, + "loss": 0.7916, + "step": 1085 + }, + { + "epoch": 0.058550787146862196, + "grad_norm": 1.08795166015625, + "learning_rate": 9.995630943216859e-06, + "loss": 0.9135, + "step": 1086 + }, + { + "epoch": 0.058604701315505714, + "grad_norm": 0.9648925065994263, + "learning_rate": 9.995622077832854e-06, + "loss": 0.8442, + "step": 1087 + }, + { + "epoch": 0.05865861548414923, + "grad_norm": 1.0012339353561401, + "learning_rate": 9.995613203467394e-06, + "loss": 0.9543, + "step": 1088 + }, + { + "epoch": 0.05871252965279276, + "grad_norm": 0.9333881735801697, + "learning_rate": 9.995604320120496e-06, + "loss": 0.9267, + "step": 1089 + }, + { + "epoch": 0.058766443821436275, + "grad_norm": 0.8566498160362244, + "learning_rate": 9.995595427792173e-06, + "loss": 0.8539, + "step": 1090 + }, + { + "epoch": 0.05882035799007979, + "grad_norm": 0.8766364455223083, + "learning_rate": 9.995586526482446e-06, + "loss": 0.9293, + "step": 1091 + }, + { + "epoch": 0.05887427215872331, + "grad_norm": 0.9181047677993774, + "learning_rate": 9.995577616191326e-06, + "loss": 0.8333, + "step": 1092 + }, + { + "epoch": 0.05892818632736683, + "grad_norm": 0.8831031918525696, + "learning_rate": 9.995568696918833e-06, + "loss": 0.8016, + "step": 1093 + }, + { + "epoch": 0.05898210049601035, + "grad_norm": 0.8618754148483276, + "learning_rate": 9.99555976866498e-06, + "loss": 
0.8988, + "step": 1094 + }, + { + "epoch": 0.05903601466465387, + "grad_norm": 0.9083183407783508, + "learning_rate": 9.995550831429785e-06, + "loss": 0.8626, + "step": 1095 + }, + { + "epoch": 0.05908992883329739, + "grad_norm": 0.8423884510993958, + "learning_rate": 9.995541885213262e-06, + "loss": 0.9121, + "step": 1096 + }, + { + "epoch": 0.05914384300194091, + "grad_norm": 0.7747607827186584, + "learning_rate": 9.99553293001543e-06, + "loss": 0.8087, + "step": 1097 + }, + { + "epoch": 0.05919775717058443, + "grad_norm": 0.8828368186950684, + "learning_rate": 9.995523965836302e-06, + "loss": 0.8284, + "step": 1098 + }, + { + "epoch": 0.05925167133922795, + "grad_norm": 0.9448524713516235, + "learning_rate": 9.995514992675896e-06, + "loss": 0.9565, + "step": 1099 + }, + { + "epoch": 0.05930558550787147, + "grad_norm": 0.8967006206512451, + "learning_rate": 9.99550601053423e-06, + "loss": 0.8412, + "step": 1100 + }, + { + "epoch": 0.059359499676514986, + "grad_norm": 0.9394551515579224, + "learning_rate": 9.995497019411315e-06, + "loss": 0.929, + "step": 1101 + }, + { + "epoch": 0.05941341384515851, + "grad_norm": 0.9002842903137207, + "learning_rate": 9.995488019307172e-06, + "loss": 0.734, + "step": 1102 + }, + { + "epoch": 0.05946732801380203, + "grad_norm": 1.3590562343597412, + "learning_rate": 9.995479010221816e-06, + "loss": 0.8843, + "step": 1103 + }, + { + "epoch": 0.05952124218244555, + "grad_norm": 1.041528582572937, + "learning_rate": 9.99546999215526e-06, + "loss": 0.9001, + "step": 1104 + }, + { + "epoch": 0.059575156351089065, + "grad_norm": 0.9846720099449158, + "learning_rate": 9.995460965107524e-06, + "loss": 0.8174, + "step": 1105 + }, + { + "epoch": 0.05962907051973258, + "grad_norm": 0.9171685576438904, + "learning_rate": 9.995451929078624e-06, + "loss": 0.8756, + "step": 1106 + }, + { + "epoch": 0.05968298468837611, + "grad_norm": 0.9155516028404236, + "learning_rate": 9.995442884068574e-06, + "loss": 0.7327, + "step": 1107 + }, + { + 
"epoch": 0.059736898857019625, + "grad_norm": 0.8734007477760315, + "learning_rate": 9.99543383007739e-06, + "loss": 0.8385, + "step": 1108 + }, + { + "epoch": 0.05979081302566314, + "grad_norm": 0.8580977320671082, + "learning_rate": 9.99542476710509e-06, + "loss": 0.885, + "step": 1109 + }, + { + "epoch": 0.05984472719430666, + "grad_norm": 0.8499299883842468, + "learning_rate": 9.995415695151692e-06, + "loss": 0.8323, + "step": 1110 + }, + { + "epoch": 0.059898641362950186, + "grad_norm": 0.8348694443702698, + "learning_rate": 9.99540661421721e-06, + "loss": 0.7947, + "step": 1111 + }, + { + "epoch": 0.059952555531593704, + "grad_norm": 0.8865199685096741, + "learning_rate": 9.99539752430166e-06, + "loss": 0.9363, + "step": 1112 + }, + { + "epoch": 0.06000646970023722, + "grad_norm": 0.9492315649986267, + "learning_rate": 9.995388425405059e-06, + "loss": 0.913, + "step": 1113 + }, + { + "epoch": 0.06006038386888074, + "grad_norm": 0.938252329826355, + "learning_rate": 9.995379317527422e-06, + "loss": 0.861, + "step": 1114 + }, + { + "epoch": 0.06011429803752426, + "grad_norm": 1.2601032257080078, + "learning_rate": 9.995370200668768e-06, + "loss": 0.9435, + "step": 1115 + }, + { + "epoch": 0.06016821220616778, + "grad_norm": 0.915830671787262, + "learning_rate": 9.995361074829112e-06, + "loss": 0.9372, + "step": 1116 + }, + { + "epoch": 0.0602221263748113, + "grad_norm": 1.4548465013504028, + "learning_rate": 9.995351940008473e-06, + "loss": 0.9055, + "step": 1117 + }, + { + "epoch": 0.06027604054345482, + "grad_norm": 0.9090906381607056, + "learning_rate": 9.995342796206861e-06, + "loss": 0.8849, + "step": 1118 + }, + { + "epoch": 0.06032995471209834, + "grad_norm": 0.9860616326332092, + "learning_rate": 9.995333643424298e-06, + "loss": 0.8304, + "step": 1119 + }, + { + "epoch": 0.06038386888074186, + "grad_norm": 0.8320879340171814, + "learning_rate": 9.9953244816608e-06, + "loss": 0.8432, + "step": 1120 + }, + { + "epoch": 0.06043778304938538, + "grad_norm": 
0.8633564114570618, + "learning_rate": 9.995315310916381e-06, + "loss": 0.7461, + "step": 1121 + }, + { + "epoch": 0.0604916972180289, + "grad_norm": 0.881287693977356, + "learning_rate": 9.995306131191059e-06, + "loss": 0.8512, + "step": 1122 + }, + { + "epoch": 0.060545611386672415, + "grad_norm": 0.8888201713562012, + "learning_rate": 9.99529694248485e-06, + "loss": 0.8416, + "step": 1123 + }, + { + "epoch": 0.06059952555531594, + "grad_norm": 0.8073605895042419, + "learning_rate": 9.99528774479777e-06, + "loss": 0.8369, + "step": 1124 + }, + { + "epoch": 0.06065343972395946, + "grad_norm": 0.9260549545288086, + "learning_rate": 9.995278538129837e-06, + "loss": 0.8548, + "step": 1125 + }, + { + "epoch": 0.060707353892602976, + "grad_norm": 0.9169156551361084, + "learning_rate": 9.99526932248107e-06, + "loss": 0.9149, + "step": 1126 + }, + { + "epoch": 0.060761268061246494, + "grad_norm": 0.8481706380844116, + "learning_rate": 9.995260097851478e-06, + "loss": 0.8591, + "step": 1127 + }, + { + "epoch": 0.06081518222989001, + "grad_norm": 0.8934486508369446, + "learning_rate": 9.995250864241085e-06, + "loss": 0.9322, + "step": 1128 + }, + { + "epoch": 0.06086909639853354, + "grad_norm": 0.947390615940094, + "learning_rate": 9.995241621649902e-06, + "loss": 1.0015, + "step": 1129 + }, + { + "epoch": 0.060923010567177055, + "grad_norm": 0.9185096025466919, + "learning_rate": 9.995232370077949e-06, + "loss": 0.9293, + "step": 1130 + }, + { + "epoch": 0.06097692473582057, + "grad_norm": 0.9517882466316223, + "learning_rate": 9.995223109525245e-06, + "loss": 0.8673, + "step": 1131 + }, + { + "epoch": 0.06103083890446409, + "grad_norm": 1.065699815750122, + "learning_rate": 9.9952138399918e-06, + "loss": 0.9144, + "step": 1132 + }, + { + "epoch": 0.061084753073107616, + "grad_norm": 0.9048404693603516, + "learning_rate": 9.995204561477635e-06, + "loss": 0.7773, + "step": 1133 + }, + { + "epoch": 0.061138667241751134, + "grad_norm": 1.104457139968872, + "learning_rate": 
9.995195273982768e-06, + "loss": 0.8847, + "step": 1134 + }, + { + "epoch": 0.06119258141039465, + "grad_norm": 0.9009587168693542, + "learning_rate": 9.995185977507212e-06, + "loss": 0.8118, + "step": 1135 + }, + { + "epoch": 0.06124649557903817, + "grad_norm": 1.0740209817886353, + "learning_rate": 9.995176672050983e-06, + "loss": 0.9173, + "step": 1136 + }, + { + "epoch": 0.06130040974768169, + "grad_norm": 0.9820743203163147, + "learning_rate": 9.995167357614104e-06, + "loss": 0.8555, + "step": 1137 + }, + { + "epoch": 0.06135432391632521, + "grad_norm": 0.9250825047492981, + "learning_rate": 9.995158034196586e-06, + "loss": 0.8771, + "step": 1138 + }, + { + "epoch": 0.06140823808496873, + "grad_norm": 0.8952597379684448, + "learning_rate": 9.995148701798447e-06, + "loss": 0.8598, + "step": 1139 + }, + { + "epoch": 0.06146215225361225, + "grad_norm": 0.8485212922096252, + "learning_rate": 9.995139360419706e-06, + "loss": 0.8557, + "step": 1140 + }, + { + "epoch": 0.061516066422255766, + "grad_norm": 0.9676715731620789, + "learning_rate": 9.995130010060377e-06, + "loss": 0.7748, + "step": 1141 + }, + { + "epoch": 0.06156998059089929, + "grad_norm": 0.7896347045898438, + "learning_rate": 9.995120650720478e-06, + "loss": 0.6183, + "step": 1142 + }, + { + "epoch": 0.06162389475954281, + "grad_norm": 0.8746615052223206, + "learning_rate": 9.995111282400024e-06, + "loss": 0.8321, + "step": 1143 + }, + { + "epoch": 0.06167780892818633, + "grad_norm": 0.9029875993728638, + "learning_rate": 9.995101905099036e-06, + "loss": 0.8686, + "step": 1144 + }, + { + "epoch": 0.061731723096829845, + "grad_norm": 0.9529547095298767, + "learning_rate": 9.995092518817528e-06, + "loss": 0.8878, + "step": 1145 + }, + { + "epoch": 0.06178563726547336, + "grad_norm": 0.8280455470085144, + "learning_rate": 9.995083123555517e-06, + "loss": 0.8232, + "step": 1146 + }, + { + "epoch": 0.06183955143411689, + "grad_norm": 0.908881664276123, + "learning_rate": 9.995073719313021e-06, + "loss": 
0.8387, + "step": 1147 + }, + { + "epoch": 0.061893465602760406, + "grad_norm": 0.9137653708457947, + "learning_rate": 9.995064306090055e-06, + "loss": 0.8943, + "step": 1148 + }, + { + "epoch": 0.061947379771403924, + "grad_norm": 0.863861620426178, + "learning_rate": 9.995054883886639e-06, + "loss": 0.7435, + "step": 1149 + }, + { + "epoch": 0.06200129394004744, + "grad_norm": 0.8534915447235107, + "learning_rate": 9.995045452702786e-06, + "loss": 0.941, + "step": 1150 + }, + { + "epoch": 0.06205520810869097, + "grad_norm": 0.9469791650772095, + "learning_rate": 9.995036012538515e-06, + "loss": 0.9137, + "step": 1151 + }, + { + "epoch": 0.062109122277334484, + "grad_norm": 0.9044890999794006, + "learning_rate": 9.995026563393844e-06, + "loss": 0.9117, + "step": 1152 + }, + { + "epoch": 0.062163036445978, + "grad_norm": 0.989772379398346, + "learning_rate": 9.995017105268789e-06, + "loss": 0.8306, + "step": 1153 + }, + { + "epoch": 0.06221695061462152, + "grad_norm": 0.8586496114730835, + "learning_rate": 9.995007638163365e-06, + "loss": 0.8012, + "step": 1154 + }, + { + "epoch": 0.062270864783265045, + "grad_norm": 0.9221116304397583, + "learning_rate": 9.994998162077594e-06, + "loss": 0.7935, + "step": 1155 + }, + { + "epoch": 0.06232477895190856, + "grad_norm": 0.9453061819076538, + "learning_rate": 9.994988677011489e-06, + "loss": 0.8257, + "step": 1156 + }, + { + "epoch": 0.06237869312055208, + "grad_norm": 0.8065335154533386, + "learning_rate": 9.994979182965065e-06, + "loss": 0.86, + "step": 1157 + }, + { + "epoch": 0.0624326072891956, + "grad_norm": 0.9597793817520142, + "learning_rate": 9.994969679938346e-06, + "loss": 0.862, + "step": 1158 + }, + { + "epoch": 0.06248652145783912, + "grad_norm": 0.9118353128433228, + "learning_rate": 9.994960167931342e-06, + "loss": 0.8925, + "step": 1159 + }, + { + "epoch": 0.06254043562648263, + "grad_norm": 1.0216273069381714, + "learning_rate": 9.994950646944077e-06, + "loss": 0.7078, + "step": 1160 + }, + { + 
"epoch": 0.06259434979512615, + "grad_norm": 0.960182785987854, + "learning_rate": 9.994941116976562e-06, + "loss": 0.8936, + "step": 1161 + }, + { + "epoch": 0.06264826396376968, + "grad_norm": 0.9551856517791748, + "learning_rate": 9.994931578028817e-06, + "loss": 0.8053, + "step": 1162 + }, + { + "epoch": 0.0627021781324132, + "grad_norm": 0.9419867992401123, + "learning_rate": 9.994922030100857e-06, + "loss": 0.8333, + "step": 1163 + }, + { + "epoch": 0.06275609230105672, + "grad_norm": 0.9780306816101074, + "learning_rate": 9.994912473192702e-06, + "loss": 0.88, + "step": 1164 + }, + { + "epoch": 0.06281000646970024, + "grad_norm": 0.9320577383041382, + "learning_rate": 9.99490290730437e-06, + "loss": 0.8859, + "step": 1165 + }, + { + "epoch": 0.06286392063834376, + "grad_norm": 0.7692422270774841, + "learning_rate": 9.994893332435874e-06, + "loss": 0.8093, + "step": 1166 + }, + { + "epoch": 0.06291783480698727, + "grad_norm": 1.0622048377990723, + "learning_rate": 9.994883748587234e-06, + "loss": 0.8959, + "step": 1167 + }, + { + "epoch": 0.06297174897563079, + "grad_norm": 0.9598555564880371, + "learning_rate": 9.994874155758467e-06, + "loss": 0.8153, + "step": 1168 + }, + { + "epoch": 0.06302566314427431, + "grad_norm": 0.9207014441490173, + "learning_rate": 9.994864553949591e-06, + "loss": 0.9383, + "step": 1169 + }, + { + "epoch": 0.06307957731291783, + "grad_norm": 1.0074093341827393, + "learning_rate": 9.99485494316062e-06, + "loss": 0.9999, + "step": 1170 + }, + { + "epoch": 0.06313349148156136, + "grad_norm": 0.8454248905181885, + "learning_rate": 9.994845323391575e-06, + "loss": 0.7946, + "step": 1171 + }, + { + "epoch": 0.06318740565020488, + "grad_norm": 0.847578763961792, + "learning_rate": 9.99483569464247e-06, + "loss": 0.7144, + "step": 1172 + }, + { + "epoch": 0.0632413198188484, + "grad_norm": 0.9083126187324524, + "learning_rate": 9.994826056913325e-06, + "loss": 0.774, + "step": 1173 + }, + { + "epoch": 0.06329523398749191, + "grad_norm": 
0.8995345830917358, + "learning_rate": 9.994816410204158e-06, + "loss": 0.8995, + "step": 1174 + }, + { + "epoch": 0.06334914815613543, + "grad_norm": 1.0547746419906616, + "learning_rate": 9.994806754514983e-06, + "loss": 0.8142, + "step": 1175 + }, + { + "epoch": 0.06340306232477895, + "grad_norm": 0.946854829788208, + "learning_rate": 9.99479708984582e-06, + "loss": 0.8639, + "step": 1176 + }, + { + "epoch": 0.06345697649342247, + "grad_norm": 0.8746247291564941, + "learning_rate": 9.994787416196683e-06, + "loss": 0.8601, + "step": 1177 + }, + { + "epoch": 0.06351089066206599, + "grad_norm": 0.9075024127960205, + "learning_rate": 9.994777733567595e-06, + "loss": 0.7969, + "step": 1178 + }, + { + "epoch": 0.0635648048307095, + "grad_norm": 0.9435486197471619, + "learning_rate": 9.994768041958569e-06, + "loss": 0.8199, + "step": 1179 + }, + { + "epoch": 0.06361871899935304, + "grad_norm": 0.8597564697265625, + "learning_rate": 9.994758341369624e-06, + "loss": 0.8791, + "step": 1180 + }, + { + "epoch": 0.06367263316799655, + "grad_norm": 0.7960480451583862, + "learning_rate": 9.994748631800777e-06, + "loss": 0.8035, + "step": 1181 + }, + { + "epoch": 0.06372654733664007, + "grad_norm": 1.1984984874725342, + "learning_rate": 9.994738913252045e-06, + "loss": 0.7372, + "step": 1182 + }, + { + "epoch": 0.06378046150528359, + "grad_norm": 0.8532997369766235, + "learning_rate": 9.994729185723446e-06, + "loss": 0.9094, + "step": 1183 + }, + { + "epoch": 0.06383437567392711, + "grad_norm": 0.8327267169952393, + "learning_rate": 9.994719449214999e-06, + "loss": 0.809, + "step": 1184 + }, + { + "epoch": 0.06388828984257063, + "grad_norm": 0.9086306691169739, + "learning_rate": 9.99470970372672e-06, + "loss": 0.8278, + "step": 1185 + }, + { + "epoch": 0.06394220401121414, + "grad_norm": 0.8422104716300964, + "learning_rate": 9.994699949258626e-06, + "loss": 0.7754, + "step": 1186 + }, + { + "epoch": 0.06399611817985766, + "grad_norm": 1.0434929132461548, + "learning_rate": 
9.994690185810733e-06, + "loss": 0.908, + "step": 1187 + }, + { + "epoch": 0.06405003234850119, + "grad_norm": 1.1625720262527466, + "learning_rate": 9.994680413383064e-06, + "loss": 0.8814, + "step": 1188 + }, + { + "epoch": 0.06410394651714471, + "grad_norm": 0.9940767288208008, + "learning_rate": 9.994670631975631e-06, + "loss": 0.7846, + "step": 1189 + }, + { + "epoch": 0.06415786068578823, + "grad_norm": 0.8356907963752747, + "learning_rate": 9.994660841588457e-06, + "loss": 0.798, + "step": 1190 + }, + { + "epoch": 0.06421177485443175, + "grad_norm": 0.830348014831543, + "learning_rate": 9.994651042221552e-06, + "loss": 0.7875, + "step": 1191 + }, + { + "epoch": 0.06426568902307526, + "grad_norm": 1.1060880422592163, + "learning_rate": 9.994641233874943e-06, + "loss": 0.8893, + "step": 1192 + }, + { + "epoch": 0.06431960319171878, + "grad_norm": 0.9319590926170349, + "learning_rate": 9.994631416548637e-06, + "loss": 0.791, + "step": 1193 + }, + { + "epoch": 0.0643735173603623, + "grad_norm": 0.8345780968666077, + "learning_rate": 9.994621590242661e-06, + "loss": 0.8213, + "step": 1194 + }, + { + "epoch": 0.06442743152900582, + "grad_norm": 0.9848359227180481, + "learning_rate": 9.99461175495703e-06, + "loss": 0.735, + "step": 1195 + }, + { + "epoch": 0.06448134569764934, + "grad_norm": 0.9134055972099304, + "learning_rate": 9.994601910691758e-06, + "loss": 0.8415, + "step": 1196 + }, + { + "epoch": 0.06453525986629287, + "grad_norm": 0.8084586262702942, + "learning_rate": 9.994592057446866e-06, + "loss": 0.8702, + "step": 1197 + }, + { + "epoch": 0.06458917403493639, + "grad_norm": 0.9168767333030701, + "learning_rate": 9.994582195222371e-06, + "loss": 0.8921, + "step": 1198 + }, + { + "epoch": 0.0646430882035799, + "grad_norm": 0.8380446434020996, + "learning_rate": 9.994572324018292e-06, + "loss": 0.7705, + "step": 1199 + }, + { + "epoch": 0.06469700237222342, + "grad_norm": 0.8120049238204956, + "learning_rate": 9.994562443834646e-06, + "loss": 0.7576, + 
"step": 1200 + }, + { + "epoch": 0.06475091654086694, + "grad_norm": 0.9559764266014099, + "learning_rate": 9.994552554671448e-06, + "loss": 0.8427, + "step": 1201 + }, + { + "epoch": 0.06480483070951046, + "grad_norm": 0.9473673105239868, + "learning_rate": 9.99454265652872e-06, + "loss": 0.9988, + "step": 1202 + }, + { + "epoch": 0.06485874487815398, + "grad_norm": 1.0704870223999023, + "learning_rate": 9.994532749406477e-06, + "loss": 0.9499, + "step": 1203 + }, + { + "epoch": 0.0649126590467975, + "grad_norm": 0.9905646443367004, + "learning_rate": 9.994522833304738e-06, + "loss": 0.8801, + "step": 1204 + }, + { + "epoch": 0.06496657321544101, + "grad_norm": 1.194190502166748, + "learning_rate": 9.99451290822352e-06, + "loss": 0.9051, + "step": 1205 + }, + { + "epoch": 0.06502048738408454, + "grad_norm": 0.8571314811706543, + "learning_rate": 9.994502974162843e-06, + "loss": 0.8131, + "step": 1206 + }, + { + "epoch": 0.06507440155272806, + "grad_norm": 0.9769417643547058, + "learning_rate": 9.994493031122721e-06, + "loss": 0.8524, + "step": 1207 + }, + { + "epoch": 0.06512831572137158, + "grad_norm": 0.8106759786605835, + "learning_rate": 9.994483079103176e-06, + "loss": 0.8142, + "step": 1208 + }, + { + "epoch": 0.0651822298900151, + "grad_norm": 0.8817846775054932, + "learning_rate": 9.994473118104223e-06, + "loss": 0.9076, + "step": 1209 + }, + { + "epoch": 0.06523614405865862, + "grad_norm": 0.8271930813789368, + "learning_rate": 9.994463148125882e-06, + "loss": 0.7914, + "step": 1210 + }, + { + "epoch": 0.06529005822730213, + "grad_norm": 0.9060614705085754, + "learning_rate": 9.994453169168169e-06, + "loss": 0.8375, + "step": 1211 + }, + { + "epoch": 0.06534397239594565, + "grad_norm": 0.880614697933197, + "learning_rate": 9.994443181231103e-06, + "loss": 0.7751, + "step": 1212 + }, + { + "epoch": 0.06539788656458917, + "grad_norm": 0.9420819282531738, + "learning_rate": 9.994433184314702e-06, + "loss": 0.8532, + "step": 1213 + }, + { + "epoch": 
0.06545180073323269, + "grad_norm": 0.8587054014205933, + "learning_rate": 9.994423178418984e-06, + "loss": 0.8804, + "step": 1214 + }, + { + "epoch": 0.06550571490187622, + "grad_norm": 0.9624550938606262, + "learning_rate": 9.994413163543965e-06, + "loss": 0.9782, + "step": 1215 + }, + { + "epoch": 0.06555962907051974, + "grad_norm": 0.9458224773406982, + "learning_rate": 9.994403139689665e-06, + "loss": 0.8274, + "step": 1216 + }, + { + "epoch": 0.06561354323916326, + "grad_norm": 1.0417940616607666, + "learning_rate": 9.994393106856104e-06, + "loss": 0.9065, + "step": 1217 + }, + { + "epoch": 0.06566745740780677, + "grad_norm": 1.0225417613983154, + "learning_rate": 9.994383065043296e-06, + "loss": 0.8642, + "step": 1218 + }, + { + "epoch": 0.06572137157645029, + "grad_norm": 0.9015594720840454, + "learning_rate": 9.994373014251261e-06, + "loss": 0.8775, + "step": 1219 + }, + { + "epoch": 0.06577528574509381, + "grad_norm": 0.8473883271217346, + "learning_rate": 9.994362954480018e-06, + "loss": 0.8566, + "step": 1220 + }, + { + "epoch": 0.06582919991373733, + "grad_norm": 0.8571242690086365, + "learning_rate": 9.994352885729584e-06, + "loss": 0.8502, + "step": 1221 + }, + { + "epoch": 0.06588311408238084, + "grad_norm": 0.8793268799781799, + "learning_rate": 9.994342807999977e-06, + "loss": 0.9062, + "step": 1222 + }, + { + "epoch": 0.06593702825102436, + "grad_norm": 0.8866230249404907, + "learning_rate": 9.994332721291214e-06, + "loss": 0.9026, + "step": 1223 + }, + { + "epoch": 0.0659909424196679, + "grad_norm": 0.9135996103286743, + "learning_rate": 9.994322625603314e-06, + "loss": 0.8558, + "step": 1224 + }, + { + "epoch": 0.06604485658831141, + "grad_norm": 0.9904530048370361, + "learning_rate": 9.994312520936297e-06, + "loss": 0.8823, + "step": 1225 + }, + { + "epoch": 0.06609877075695493, + "grad_norm": 0.8590260148048401, + "learning_rate": 9.99430240729018e-06, + "loss": 0.8344, + "step": 1226 + }, + { + "epoch": 0.06615268492559845, + "grad_norm": 
1.1669397354125977, + "learning_rate": 9.99429228466498e-06, + "loss": 0.9459, + "step": 1227 + }, + { + "epoch": 0.06620659909424197, + "grad_norm": 0.9290857315063477, + "learning_rate": 9.994282153060715e-06, + "loss": 0.8723, + "step": 1228 + }, + { + "epoch": 0.06626051326288548, + "grad_norm": 0.9619696140289307, + "learning_rate": 9.994272012477405e-06, + "loss": 0.8986, + "step": 1229 + }, + { + "epoch": 0.066314427431529, + "grad_norm": 0.8312071561813354, + "learning_rate": 9.994261862915068e-06, + "loss": 0.7291, + "step": 1230 + }, + { + "epoch": 0.06636834160017252, + "grad_norm": 1.0099300146102905, + "learning_rate": 9.994251704373721e-06, + "loss": 0.8725, + "step": 1231 + }, + { + "epoch": 0.06642225576881604, + "grad_norm": 0.8522336483001709, + "learning_rate": 9.994241536853384e-06, + "loss": 0.8656, + "step": 1232 + }, + { + "epoch": 0.06647616993745957, + "grad_norm": 0.919360339641571, + "learning_rate": 9.994231360354074e-06, + "loss": 0.8854, + "step": 1233 + }, + { + "epoch": 0.06653008410610309, + "grad_norm": 0.8002495169639587, + "learning_rate": 9.994221174875809e-06, + "loss": 0.7879, + "step": 1234 + }, + { + "epoch": 0.0665839982747466, + "grad_norm": 0.9539757370948792, + "learning_rate": 9.994210980418607e-06, + "loss": 0.9027, + "step": 1235 + }, + { + "epoch": 0.06663791244339012, + "grad_norm": 0.9222649335861206, + "learning_rate": 9.99420077698249e-06, + "loss": 0.7611, + "step": 1236 + }, + { + "epoch": 0.06669182661203364, + "grad_norm": 0.8629900813102722, + "learning_rate": 9.994190564567472e-06, + "loss": 0.8122, + "step": 1237 + }, + { + "epoch": 0.06674574078067716, + "grad_norm": 0.8339203000068665, + "learning_rate": 9.994180343173574e-06, + "loss": 0.7873, + "step": 1238 + }, + { + "epoch": 0.06679965494932068, + "grad_norm": 0.8844656348228455, + "learning_rate": 9.994170112800812e-06, + "loss": 0.8176, + "step": 1239 + }, + { + "epoch": 0.0668535691179642, + "grad_norm": 1.0024579763412476, + "learning_rate": 
9.994159873449206e-06, + "loss": 0.844, + "step": 1240 + }, + { + "epoch": 0.06690748328660773, + "grad_norm": 0.8317261338233948, + "learning_rate": 9.994149625118774e-06, + "loss": 0.9103, + "step": 1241 + }, + { + "epoch": 0.06696139745525125, + "grad_norm": 0.8915300965309143, + "learning_rate": 9.994139367809534e-06, + "loss": 0.9084, + "step": 1242 + }, + { + "epoch": 0.06701531162389476, + "grad_norm": 0.9270803332328796, + "learning_rate": 9.994129101521506e-06, + "loss": 0.7634, + "step": 1243 + }, + { + "epoch": 0.06706922579253828, + "grad_norm": 0.9891652464866638, + "learning_rate": 9.994118826254708e-06, + "loss": 0.9776, + "step": 1244 + }, + { + "epoch": 0.0671231399611818, + "grad_norm": 0.7778229713439941, + "learning_rate": 9.994108542009156e-06, + "loss": 0.7481, + "step": 1245 + }, + { + "epoch": 0.06717705412982532, + "grad_norm": 0.8451201319694519, + "learning_rate": 9.994098248784872e-06, + "loss": 0.8012, + "step": 1246 + }, + { + "epoch": 0.06723096829846884, + "grad_norm": 0.8115825057029724, + "learning_rate": 9.994087946581873e-06, + "loss": 0.874, + "step": 1247 + }, + { + "epoch": 0.06728488246711235, + "grad_norm": 0.815934419631958, + "learning_rate": 9.994077635400175e-06, + "loss": 0.8114, + "step": 1248 + }, + { + "epoch": 0.06733879663575587, + "grad_norm": 1.1179388761520386, + "learning_rate": 9.9940673152398e-06, + "loss": 0.9078, + "step": 1249 + }, + { + "epoch": 0.0673927108043994, + "grad_norm": 0.9235454201698303, + "learning_rate": 9.994056986100767e-06, + "loss": 0.7511, + "step": 1250 + }, + { + "epoch": 0.06744662497304292, + "grad_norm": 0.8568270206451416, + "learning_rate": 9.994046647983093e-06, + "loss": 0.7805, + "step": 1251 + }, + { + "epoch": 0.06750053914168644, + "grad_norm": 1.1337388753890991, + "learning_rate": 9.994036300886796e-06, + "loss": 0.8835, + "step": 1252 + }, + { + "epoch": 0.06755445331032996, + "grad_norm": 0.9154239892959595, + "learning_rate": 9.994025944811896e-06, + "loss": 0.8804, + 
"step": 1253 + }, + { + "epoch": 0.06760836747897347, + "grad_norm": 0.8301606774330139, + "learning_rate": 9.99401557975841e-06, + "loss": 0.7905, + "step": 1254 + }, + { + "epoch": 0.06766228164761699, + "grad_norm": 0.9907017350196838, + "learning_rate": 9.994005205726358e-06, + "loss": 0.9091, + "step": 1255 + }, + { + "epoch": 0.06771619581626051, + "grad_norm": 0.8883876204490662, + "learning_rate": 9.993994822715758e-06, + "loss": 0.8815, + "step": 1256 + }, + { + "epoch": 0.06777010998490403, + "grad_norm": 0.9746614098548889, + "learning_rate": 9.993984430726627e-06, + "loss": 0.7897, + "step": 1257 + }, + { + "epoch": 0.06782402415354755, + "grad_norm": 0.9773344993591309, + "learning_rate": 9.993974029758988e-06, + "loss": 0.8499, + "step": 1258 + }, + { + "epoch": 0.06787793832219108, + "grad_norm": 0.9552164077758789, + "learning_rate": 9.993963619812856e-06, + "loss": 0.711, + "step": 1259 + }, + { + "epoch": 0.0679318524908346, + "grad_norm": 0.9146968126296997, + "learning_rate": 9.993953200888252e-06, + "loss": 0.9016, + "step": 1260 + }, + { + "epoch": 0.06798576665947811, + "grad_norm": 0.924244225025177, + "learning_rate": 9.993942772985192e-06, + "loss": 0.7534, + "step": 1261 + }, + { + "epoch": 0.06803968082812163, + "grad_norm": 1.2963265180587769, + "learning_rate": 9.993932336103699e-06, + "loss": 0.9409, + "step": 1262 + }, + { + "epoch": 0.06809359499676515, + "grad_norm": 0.7954462766647339, + "learning_rate": 9.993921890243788e-06, + "loss": 0.7669, + "step": 1263 + }, + { + "epoch": 0.06814750916540867, + "grad_norm": 0.9115849137306213, + "learning_rate": 9.993911435405478e-06, + "loss": 0.7567, + "step": 1264 + }, + { + "epoch": 0.06820142333405219, + "grad_norm": 1.0030237436294556, + "learning_rate": 9.99390097158879e-06, + "loss": 0.8952, + "step": 1265 + }, + { + "epoch": 0.0682553375026957, + "grad_norm": 0.8897690773010254, + "learning_rate": 9.993890498793742e-06, + "loss": 0.7993, + "step": 1266 + }, + { + "epoch": 
0.06830925167133922, + "grad_norm": 0.9283807277679443, + "learning_rate": 9.993880017020349e-06, + "loss": 0.8808, + "step": 1267 + }, + { + "epoch": 0.06836316583998275, + "grad_norm": 0.848922848701477, + "learning_rate": 9.993869526268637e-06, + "loss": 0.7979, + "step": 1268 + }, + { + "epoch": 0.06841708000862627, + "grad_norm": 0.8896105289459229, + "learning_rate": 9.993859026538618e-06, + "loss": 0.8886, + "step": 1269 + }, + { + "epoch": 0.06847099417726979, + "grad_norm": 0.8602685928344727, + "learning_rate": 9.993848517830318e-06, + "loss": 0.8209, + "step": 1270 + }, + { + "epoch": 0.06852490834591331, + "grad_norm": 0.9300077557563782, + "learning_rate": 9.99383800014375e-06, + "loss": 0.9261, + "step": 1271 + }, + { + "epoch": 0.06857882251455683, + "grad_norm": 0.8691270351409912, + "learning_rate": 9.993827473478934e-06, + "loss": 0.9217, + "step": 1272 + }, + { + "epoch": 0.06863273668320034, + "grad_norm": 0.7943814992904663, + "learning_rate": 9.99381693783589e-06, + "loss": 0.8557, + "step": 1273 + }, + { + "epoch": 0.06868665085184386, + "grad_norm": 0.9060125946998596, + "learning_rate": 9.993806393214638e-06, + "loss": 0.8314, + "step": 1274 + }, + { + "epoch": 0.06874056502048738, + "grad_norm": 0.8014434576034546, + "learning_rate": 9.993795839615194e-06, + "loss": 0.8047, + "step": 1275 + }, + { + "epoch": 0.0687944791891309, + "grad_norm": 1.0498815774917603, + "learning_rate": 9.993785277037578e-06, + "loss": 0.7125, + "step": 1276 + }, + { + "epoch": 0.06884839335777443, + "grad_norm": 0.8868438005447388, + "learning_rate": 9.993774705481812e-06, + "loss": 0.8594, + "step": 1277 + }, + { + "epoch": 0.06890230752641795, + "grad_norm": 0.8213896155357361, + "learning_rate": 9.993764124947911e-06, + "loss": 0.7995, + "step": 1278 + }, + { + "epoch": 0.06895622169506146, + "grad_norm": 0.9007741212844849, + "learning_rate": 9.993753535435895e-06, + "loss": 0.8982, + "step": 1279 + }, + { + "epoch": 0.06901013586370498, + "grad_norm": 
0.8377478122711182, + "learning_rate": 9.993742936945785e-06, + "loss": 0.7387, + "step": 1280 + }, + { + "epoch": 0.0690640500323485, + "grad_norm": 0.8009492754936218, + "learning_rate": 9.993732329477598e-06, + "loss": 0.8079, + "step": 1281 + }, + { + "epoch": 0.06911796420099202, + "grad_norm": 0.8478789925575256, + "learning_rate": 9.993721713031354e-06, + "loss": 0.8682, + "step": 1282 + }, + { + "epoch": 0.06917187836963554, + "grad_norm": 0.7498561143875122, + "learning_rate": 9.993711087607072e-06, + "loss": 0.8107, + "step": 1283 + }, + { + "epoch": 0.06922579253827905, + "grad_norm": 0.8972634077072144, + "learning_rate": 9.99370045320477e-06, + "loss": 0.8494, + "step": 1284 + }, + { + "epoch": 0.06927970670692257, + "grad_norm": 0.942449152469635, + "learning_rate": 9.99368980982447e-06, + "loss": 0.8487, + "step": 1285 + }, + { + "epoch": 0.0693336208755661, + "grad_norm": 0.8752795457839966, + "learning_rate": 9.993679157466188e-06, + "loss": 0.8859, + "step": 1286 + }, + { + "epoch": 0.06938753504420962, + "grad_norm": 0.8289507031440735, + "learning_rate": 9.993668496129945e-06, + "loss": 0.8726, + "step": 1287 + }, + { + "epoch": 0.06944144921285314, + "grad_norm": 0.9452151656150818, + "learning_rate": 9.993657825815759e-06, + "loss": 0.9266, + "step": 1288 + }, + { + "epoch": 0.06949536338149666, + "grad_norm": 0.8697348237037659, + "learning_rate": 9.993647146523651e-06, + "loss": 0.8946, + "step": 1289 + }, + { + "epoch": 0.06954927755014018, + "grad_norm": 0.8712061643600464, + "learning_rate": 9.993636458253637e-06, + "loss": 0.8551, + "step": 1290 + }, + { + "epoch": 0.0696031917187837, + "grad_norm": 0.9295617938041687, + "learning_rate": 9.993625761005739e-06, + "loss": 0.8963, + "step": 1291 + }, + { + "epoch": 0.06965710588742721, + "grad_norm": 0.9441055059432983, + "learning_rate": 9.993615054779975e-06, + "loss": 0.9567, + "step": 1292 + }, + { + "epoch": 0.06971102005607073, + "grad_norm": 0.8742032051086426, + "learning_rate": 
9.993604339576365e-06, + "loss": 0.8341, + "step": 1293 + }, + { + "epoch": 0.06976493422471426, + "grad_norm": 0.8596220016479492, + "learning_rate": 9.993593615394928e-06, + "loss": 0.8576, + "step": 1294 + }, + { + "epoch": 0.06981884839335778, + "grad_norm": 0.8011770844459534, + "learning_rate": 9.993582882235682e-06, + "loss": 0.7317, + "step": 1295 + }, + { + "epoch": 0.0698727625620013, + "grad_norm": 0.8578245043754578, + "learning_rate": 9.993572140098648e-06, + "loss": 0.8853, + "step": 1296 + }, + { + "epoch": 0.06992667673064482, + "grad_norm": 1.1155178546905518, + "learning_rate": 9.993561388983845e-06, + "loss": 0.8199, + "step": 1297 + }, + { + "epoch": 0.06998059089928833, + "grad_norm": 1.035699486732483, + "learning_rate": 9.993550628891293e-06, + "loss": 0.9498, + "step": 1298 + }, + { + "epoch": 0.07003450506793185, + "grad_norm": 0.8635748028755188, + "learning_rate": 9.99353985982101e-06, + "loss": 0.8741, + "step": 1299 + }, + { + "epoch": 0.07008841923657537, + "grad_norm": 0.8650850653648376, + "learning_rate": 9.993529081773016e-06, + "loss": 0.7337, + "step": 1300 + }, + { + "epoch": 0.07014233340521889, + "grad_norm": 0.8334539532661438, + "learning_rate": 9.99351829474733e-06, + "loss": 0.8927, + "step": 1301 + }, + { + "epoch": 0.0701962475738624, + "grad_norm": 0.9150926470756531, + "learning_rate": 9.993507498743971e-06, + "loss": 0.8464, + "step": 1302 + }, + { + "epoch": 0.07025016174250594, + "grad_norm": 0.8916522860527039, + "learning_rate": 9.993496693762958e-06, + "loss": 0.7899, + "step": 1303 + }, + { + "epoch": 0.07030407591114946, + "grad_norm": 1.0224976539611816, + "learning_rate": 9.993485879804314e-06, + "loss": 0.8256, + "step": 1304 + }, + { + "epoch": 0.07035799007979297, + "grad_norm": 0.921816885471344, + "learning_rate": 9.993475056868054e-06, + "loss": 0.7944, + "step": 1305 + }, + { + "epoch": 0.07041190424843649, + "grad_norm": 0.8775705099105835, + "learning_rate": 9.9934642249542e-06, + "loss": 0.9098, + 
"step": 1306 + }, + { + "epoch": 0.07046581841708001, + "grad_norm": 0.9802567362785339, + "learning_rate": 9.99345338406277e-06, + "loss": 0.9756, + "step": 1307 + }, + { + "epoch": 0.07051973258572353, + "grad_norm": 0.9785491228103638, + "learning_rate": 9.993442534193786e-06, + "loss": 1.0017, + "step": 1308 + }, + { + "epoch": 0.07057364675436704, + "grad_norm": 0.8796840906143188, + "learning_rate": 9.993431675347265e-06, + "loss": 0.7202, + "step": 1309 + }, + { + "epoch": 0.07062756092301056, + "grad_norm": 0.878099799156189, + "learning_rate": 9.993420807523227e-06, + "loss": 0.8655, + "step": 1310 + }, + { + "epoch": 0.07068147509165408, + "grad_norm": 0.8361509442329407, + "learning_rate": 9.99340993072169e-06, + "loss": 0.8522, + "step": 1311 + }, + { + "epoch": 0.07073538926029761, + "grad_norm": 0.8556873798370361, + "learning_rate": 9.99339904494268e-06, + "loss": 0.8603, + "step": 1312 + }, + { + "epoch": 0.07078930342894113, + "grad_norm": 0.8434461355209351, + "learning_rate": 9.993388150186208e-06, + "loss": 0.8571, + "step": 1313 + }, + { + "epoch": 0.07084321759758465, + "grad_norm": 0.8545907139778137, + "learning_rate": 9.9933772464523e-06, + "loss": 0.8145, + "step": 1314 + }, + { + "epoch": 0.07089713176622817, + "grad_norm": 0.9502561092376709, + "learning_rate": 9.993366333740971e-06, + "loss": 0.8068, + "step": 1315 + }, + { + "epoch": 0.07095104593487168, + "grad_norm": 0.848628580570221, + "learning_rate": 9.993355412052244e-06, + "loss": 0.8793, + "step": 1316 + }, + { + "epoch": 0.0710049601035152, + "grad_norm": 0.9699797630310059, + "learning_rate": 9.993344481386137e-06, + "loss": 0.9904, + "step": 1317 + }, + { + "epoch": 0.07105887427215872, + "grad_norm": 0.8888396620750427, + "learning_rate": 9.993333541742671e-06, + "loss": 0.8363, + "step": 1318 + }, + { + "epoch": 0.07111278844080224, + "grad_norm": 0.8805423974990845, + "learning_rate": 9.993322593121863e-06, + "loss": 0.8905, + "step": 1319 + }, + { + "epoch": 
0.07116670260944576, + "grad_norm": 0.8875272274017334, + "learning_rate": 9.993311635523736e-06, + "loss": 0.7717, + "step": 1320 + }, + { + "epoch": 0.07122061677808929, + "grad_norm": 0.8853299617767334, + "learning_rate": 9.993300668948308e-06, + "loss": 0.9077, + "step": 1321 + }, + { + "epoch": 0.0712745309467328, + "grad_norm": 0.8847644329071045, + "learning_rate": 9.993289693395599e-06, + "loss": 0.8362, + "step": 1322 + }, + { + "epoch": 0.07132844511537632, + "grad_norm": 0.9531683325767517, + "learning_rate": 9.993278708865629e-06, + "loss": 0.8848, + "step": 1323 + }, + { + "epoch": 0.07138235928401984, + "grad_norm": 0.8573325276374817, + "learning_rate": 9.993267715358414e-06, + "loss": 0.8367, + "step": 1324 + }, + { + "epoch": 0.07143627345266336, + "grad_norm": 0.8920298218727112, + "learning_rate": 9.99325671287398e-06, + "loss": 0.8838, + "step": 1325 + }, + { + "epoch": 0.07149018762130688, + "grad_norm": 0.8472782969474792, + "learning_rate": 9.993245701412343e-06, + "loss": 0.8313, + "step": 1326 + }, + { + "epoch": 0.0715441017899504, + "grad_norm": 1.047664761543274, + "learning_rate": 9.993234680973525e-06, + "loss": 0.8663, + "step": 1327 + }, + { + "epoch": 0.07159801595859391, + "grad_norm": 0.9395570158958435, + "learning_rate": 9.993223651557542e-06, + "loss": 0.7703, + "step": 1328 + }, + { + "epoch": 0.07165193012723743, + "grad_norm": 0.9125472903251648, + "learning_rate": 9.993212613164419e-06, + "loss": 0.9335, + "step": 1329 + }, + { + "epoch": 0.07170584429588096, + "grad_norm": 0.9043323397636414, + "learning_rate": 9.993201565794172e-06, + "loss": 0.9185, + "step": 1330 + }, + { + "epoch": 0.07175975846452448, + "grad_norm": 0.8764339089393616, + "learning_rate": 9.993190509446821e-06, + "loss": 0.8807, + "step": 1331 + }, + { + "epoch": 0.071813672633168, + "grad_norm": 0.9123268723487854, + "learning_rate": 9.99317944412239e-06, + "loss": 0.8134, + "step": 1332 + }, + { + "epoch": 0.07186758680181152, + "grad_norm": 
0.9625567197799683, + "learning_rate": 9.993168369820892e-06, + "loss": 0.8132, + "step": 1333 + }, + { + "epoch": 0.07192150097045504, + "grad_norm": 0.880536675453186, + "learning_rate": 9.993157286542352e-06, + "loss": 0.8107, + "step": 1334 + }, + { + "epoch": 0.07197541513909855, + "grad_norm": 0.9165224432945251, + "learning_rate": 9.99314619428679e-06, + "loss": 0.8376, + "step": 1335 + }, + { + "epoch": 0.07202932930774207, + "grad_norm": 0.8278066515922546, + "learning_rate": 9.993135093054223e-06, + "loss": 0.8075, + "step": 1336 + }, + { + "epoch": 0.07208324347638559, + "grad_norm": 0.9237795472145081, + "learning_rate": 9.993123982844674e-06, + "loss": 0.7838, + "step": 1337 + }, + { + "epoch": 0.0721371576450291, + "grad_norm": 0.8200939297676086, + "learning_rate": 9.993112863658161e-06, + "loss": 0.8475, + "step": 1338 + }, + { + "epoch": 0.07219107181367264, + "grad_norm": 0.8505958318710327, + "learning_rate": 9.993101735494704e-06, + "loss": 0.7891, + "step": 1339 + }, + { + "epoch": 0.07224498598231616, + "grad_norm": 0.8407264351844788, + "learning_rate": 9.993090598354323e-06, + "loss": 0.8128, + "step": 1340 + }, + { + "epoch": 0.07229890015095967, + "grad_norm": 0.8039887547492981, + "learning_rate": 9.993079452237038e-06, + "loss": 0.8504, + "step": 1341 + }, + { + "epoch": 0.07235281431960319, + "grad_norm": 0.7590643167495728, + "learning_rate": 9.993068297142871e-06, + "loss": 0.7402, + "step": 1342 + }, + { + "epoch": 0.07240672848824671, + "grad_norm": 0.7866249680519104, + "learning_rate": 9.993057133071842e-06, + "loss": 0.7076, + "step": 1343 + }, + { + "epoch": 0.07246064265689023, + "grad_norm": 0.9846029281616211, + "learning_rate": 9.993045960023967e-06, + "loss": 0.9179, + "step": 1344 + }, + { + "epoch": 0.07251455682553375, + "grad_norm": 0.8918319940567017, + "learning_rate": 9.99303477799927e-06, + "loss": 0.8087, + "step": 1345 + }, + { + "epoch": 0.07256847099417726, + "grad_norm": 0.8407700061798096, + "learning_rate": 
9.99302358699777e-06, + "loss": 0.7272, + "step": 1346 + }, + { + "epoch": 0.0726223851628208, + "grad_norm": 0.9637326598167419, + "learning_rate": 9.993012387019486e-06, + "loss": 0.8613, + "step": 1347 + }, + { + "epoch": 0.07267629933146431, + "grad_norm": 0.8362317681312561, + "learning_rate": 9.99300117806444e-06, + "loss": 0.917, + "step": 1348 + }, + { + "epoch": 0.07273021350010783, + "grad_norm": 0.8584982752799988, + "learning_rate": 9.992989960132651e-06, + "loss": 0.8857, + "step": 1349 + }, + { + "epoch": 0.07278412766875135, + "grad_norm": 0.8341198563575745, + "learning_rate": 9.992978733224139e-06, + "loss": 0.802, + "step": 1350 + }, + { + "epoch": 0.07283804183739487, + "grad_norm": 1.6860167980194092, + "learning_rate": 9.992967497338926e-06, + "loss": 0.8789, + "step": 1351 + }, + { + "epoch": 0.07289195600603839, + "grad_norm": 0.8399189114570618, + "learning_rate": 9.99295625247703e-06, + "loss": 0.6338, + "step": 1352 + }, + { + "epoch": 0.0729458701746819, + "grad_norm": 0.9616976976394653, + "learning_rate": 9.992944998638473e-06, + "loss": 0.9735, + "step": 1353 + }, + { + "epoch": 0.07299978434332542, + "grad_norm": 0.8592861890792847, + "learning_rate": 9.992933735823272e-06, + "loss": 0.8159, + "step": 1354 + }, + { + "epoch": 0.07305369851196894, + "grad_norm": 0.8448725342750549, + "learning_rate": 9.992922464031451e-06, + "loss": 0.7942, + "step": 1355 + }, + { + "epoch": 0.07310761268061247, + "grad_norm": 0.8015927672386169, + "learning_rate": 9.99291118326303e-06, + "loss": 0.7429, + "step": 1356 + }, + { + "epoch": 0.07316152684925599, + "grad_norm": 0.8255912065505981, + "learning_rate": 9.992899893518025e-06, + "loss": 0.8532, + "step": 1357 + }, + { + "epoch": 0.07321544101789951, + "grad_norm": 0.8764085173606873, + "learning_rate": 9.992888594796462e-06, + "loss": 0.7989, + "step": 1358 + }, + { + "epoch": 0.07326935518654303, + "grad_norm": 0.8405522704124451, + "learning_rate": 9.992877287098357e-06, + "loss": 0.8709, + 
"step": 1359 + }, + { + "epoch": 0.07332326935518654, + "grad_norm": 0.8657836318016052, + "learning_rate": 9.992865970423733e-06, + "loss": 0.8236, + "step": 1360 + }, + { + "epoch": 0.07337718352383006, + "grad_norm": 0.8817959427833557, + "learning_rate": 9.992854644772609e-06, + "loss": 0.902, + "step": 1361 + }, + { + "epoch": 0.07343109769247358, + "grad_norm": 0.8290701508522034, + "learning_rate": 9.992843310145006e-06, + "loss": 0.8454, + "step": 1362 + }, + { + "epoch": 0.0734850118611171, + "grad_norm": 0.9637642502784729, + "learning_rate": 9.992831966540946e-06, + "loss": 0.9414, + "step": 1363 + }, + { + "epoch": 0.07353892602976062, + "grad_norm": 0.9220197200775146, + "learning_rate": 9.992820613960446e-06, + "loss": 0.9827, + "step": 1364 + }, + { + "epoch": 0.07359284019840415, + "grad_norm": 0.9008362889289856, + "learning_rate": 9.992809252403526e-06, + "loss": 0.8388, + "step": 1365 + }, + { + "epoch": 0.07364675436704766, + "grad_norm": 0.9517331123352051, + "learning_rate": 9.992797881870212e-06, + "loss": 0.8758, + "step": 1366 + }, + { + "epoch": 0.07370066853569118, + "grad_norm": 0.7811571359634399, + "learning_rate": 9.992786502360517e-06, + "loss": 0.6984, + "step": 1367 + }, + { + "epoch": 0.0737545827043347, + "grad_norm": 0.9887184500694275, + "learning_rate": 9.992775113874466e-06, + "loss": 0.7832, + "step": 1368 + }, + { + "epoch": 0.07380849687297822, + "grad_norm": 1.025869607925415, + "learning_rate": 9.99276371641208e-06, + "loss": 0.8417, + "step": 1369 + }, + { + "epoch": 0.07386241104162174, + "grad_norm": 0.8479165434837341, + "learning_rate": 9.99275230997338e-06, + "loss": 0.7862, + "step": 1370 + }, + { + "epoch": 0.07391632521026525, + "grad_norm": 0.9213555455207825, + "learning_rate": 9.992740894558381e-06, + "loss": 0.915, + "step": 1371 + }, + { + "epoch": 0.07397023937890877, + "grad_norm": 0.832306444644928, + "learning_rate": 9.992729470167109e-06, + "loss": 0.7566, + "step": 1372 + }, + { + "epoch": 
0.07402415354755229, + "grad_norm": 1.0360348224639893, + "learning_rate": 9.992718036799583e-06, + "loss": 0.9096, + "step": 1373 + }, + { + "epoch": 0.07407806771619582, + "grad_norm": 0.8898483514785767, + "learning_rate": 9.992706594455823e-06, + "loss": 0.8738, + "step": 1374 + }, + { + "epoch": 0.07413198188483934, + "grad_norm": 0.8813758492469788, + "learning_rate": 9.992695143135849e-06, + "loss": 0.8736, + "step": 1375 + }, + { + "epoch": 0.07418589605348286, + "grad_norm": 1.1480571031570435, + "learning_rate": 9.992683682839683e-06, + "loss": 0.915, + "step": 1376 + }, + { + "epoch": 0.07423981022212638, + "grad_norm": 0.8588376641273499, + "learning_rate": 9.992672213567345e-06, + "loss": 0.8295, + "step": 1377 + }, + { + "epoch": 0.0742937243907699, + "grad_norm": 0.8729918599128723, + "learning_rate": 9.992660735318858e-06, + "loss": 0.9058, + "step": 1378 + }, + { + "epoch": 0.07434763855941341, + "grad_norm": 0.7953224778175354, + "learning_rate": 9.992649248094236e-06, + "loss": 0.7857, + "step": 1379 + }, + { + "epoch": 0.07440155272805693, + "grad_norm": 0.8485717177391052, + "learning_rate": 9.992637751893508e-06, + "loss": 0.7641, + "step": 1380 + }, + { + "epoch": 0.07445546689670045, + "grad_norm": 0.8630878329277039, + "learning_rate": 9.99262624671669e-06, + "loss": 0.8624, + "step": 1381 + }, + { + "epoch": 0.07450938106534397, + "grad_norm": 0.8655185103416443, + "learning_rate": 9.992614732563802e-06, + "loss": 0.8428, + "step": 1382 + }, + { + "epoch": 0.0745632952339875, + "grad_norm": 0.7875732779502869, + "learning_rate": 9.992603209434868e-06, + "loss": 0.7272, + "step": 1383 + }, + { + "epoch": 0.07461720940263102, + "grad_norm": 0.875879168510437, + "learning_rate": 9.992591677329905e-06, + "loss": 0.8539, + "step": 1384 + }, + { + "epoch": 0.07467112357127453, + "grad_norm": 0.8618319034576416, + "learning_rate": 9.992580136248934e-06, + "loss": 0.879, + "step": 1385 + }, + { + "epoch": 0.07472503773991805, + "grad_norm": 
0.8695591688156128, + "learning_rate": 9.992568586191981e-06, + "loss": 0.8477, + "step": 1386 + }, + { + "epoch": 0.07477895190856157, + "grad_norm": 0.8539825677871704, + "learning_rate": 9.992557027159062e-06, + "loss": 0.7347, + "step": 1387 + }, + { + "epoch": 0.07483286607720509, + "grad_norm": 0.9625217914581299, + "learning_rate": 9.992545459150197e-06, + "loss": 0.8561, + "step": 1388 + }, + { + "epoch": 0.0748867802458486, + "grad_norm": 0.9862298369407654, + "learning_rate": 9.992533882165409e-06, + "loss": 0.9583, + "step": 1389 + }, + { + "epoch": 0.07494069441449212, + "grad_norm": 0.8217719793319702, + "learning_rate": 9.99252229620472e-06, + "loss": 0.7995, + "step": 1390 + }, + { + "epoch": 0.07499460858313564, + "grad_norm": 0.8668621182441711, + "learning_rate": 9.992510701268147e-06, + "loss": 0.8484, + "step": 1391 + }, + { + "epoch": 0.07504852275177917, + "grad_norm": 0.8549453616142273, + "learning_rate": 9.992499097355716e-06, + "loss": 0.8552, + "step": 1392 + }, + { + "epoch": 0.07510243692042269, + "grad_norm": 0.8262618184089661, + "learning_rate": 9.992487484467444e-06, + "loss": 0.7054, + "step": 1393 + }, + { + "epoch": 0.07515635108906621, + "grad_norm": 0.8524961471557617, + "learning_rate": 9.992475862603352e-06, + "loss": 0.8231, + "step": 1394 + }, + { + "epoch": 0.07521026525770973, + "grad_norm": 0.7805570363998413, + "learning_rate": 9.99246423176346e-06, + "loss": 0.7778, + "step": 1395 + }, + { + "epoch": 0.07526417942635324, + "grad_norm": 0.950484037399292, + "learning_rate": 9.992452591947794e-06, + "loss": 0.8662, + "step": 1396 + }, + { + "epoch": 0.07531809359499676, + "grad_norm": 0.8746458888053894, + "learning_rate": 9.99244094315637e-06, + "loss": 0.7854, + "step": 1397 + }, + { + "epoch": 0.07537200776364028, + "grad_norm": 0.9450538754463196, + "learning_rate": 9.992429285389212e-06, + "loss": 0.954, + "step": 1398 + }, + { + "epoch": 0.0754259219322838, + "grad_norm": 0.9048300385475159, + "learning_rate": 
9.992417618646337e-06, + "loss": 0.8915, + "step": 1399 + }, + { + "epoch": 0.07547983610092733, + "grad_norm": 0.8735381364822388, + "learning_rate": 9.99240594292777e-06, + "loss": 0.8391, + "step": 1400 + }, + { + "epoch": 0.07553375026957085, + "grad_norm": 1.0980675220489502, + "learning_rate": 9.99239425823353e-06, + "loss": 0.8892, + "step": 1401 + }, + { + "epoch": 0.07558766443821437, + "grad_norm": 0.9016425013542175, + "learning_rate": 9.992382564563638e-06, + "loss": 0.8192, + "step": 1402 + }, + { + "epoch": 0.07564157860685788, + "grad_norm": 0.801419198513031, + "learning_rate": 9.992370861918117e-06, + "loss": 0.7914, + "step": 1403 + }, + { + "epoch": 0.0756954927755014, + "grad_norm": 0.9043407440185547, + "learning_rate": 9.992359150296985e-06, + "loss": 0.8767, + "step": 1404 + }, + { + "epoch": 0.07574940694414492, + "grad_norm": 0.9703086018562317, + "learning_rate": 9.992347429700266e-06, + "loss": 0.9173, + "step": 1405 + }, + { + "epoch": 0.07580332111278844, + "grad_norm": 0.8154104351997375, + "learning_rate": 9.992335700127978e-06, + "loss": 0.8453, + "step": 1406 + }, + { + "epoch": 0.07585723528143196, + "grad_norm": 0.8551482558250427, + "learning_rate": 9.992323961580146e-06, + "loss": 0.9132, + "step": 1407 + }, + { + "epoch": 0.07591114945007547, + "grad_norm": 0.9425063729286194, + "learning_rate": 9.992312214056785e-06, + "loss": 0.8171, + "step": 1408 + }, + { + "epoch": 0.075965063618719, + "grad_norm": 0.8958794474601746, + "learning_rate": 9.992300457557922e-06, + "loss": 0.7983, + "step": 1409 + }, + { + "epoch": 0.07601897778736252, + "grad_norm": 0.873874843120575, + "learning_rate": 9.992288692083579e-06, + "loss": 0.798, + "step": 1410 + }, + { + "epoch": 0.07607289195600604, + "grad_norm": 0.7951189279556274, + "learning_rate": 9.99227691763377e-06, + "loss": 0.8671, + "step": 1411 + }, + { + "epoch": 0.07612680612464956, + "grad_norm": 0.8073802590370178, + "learning_rate": 9.992265134208522e-06, + "loss": 0.8214, + 
"step": 1412 + }, + { + "epoch": 0.07618072029329308, + "grad_norm": 0.918222188949585, + "learning_rate": 9.992253341807854e-06, + "loss": 0.807, + "step": 1413 + }, + { + "epoch": 0.0762346344619366, + "grad_norm": 0.834381103515625, + "learning_rate": 9.992241540431789e-06, + "loss": 0.8737, + "step": 1414 + }, + { + "epoch": 0.07628854863058011, + "grad_norm": 0.808437168598175, + "learning_rate": 9.992229730080347e-06, + "loss": 0.7982, + "step": 1415 + }, + { + "epoch": 0.07634246279922363, + "grad_norm": 0.7868708968162537, + "learning_rate": 9.992217910753547e-06, + "loss": 0.7071, + "step": 1416 + }, + { + "epoch": 0.07639637696786715, + "grad_norm": 0.8445919156074524, + "learning_rate": 9.992206082451416e-06, + "loss": 0.8353, + "step": 1417 + }, + { + "epoch": 0.07645029113651068, + "grad_norm": 0.8283419609069824, + "learning_rate": 9.992194245173969e-06, + "loss": 0.867, + "step": 1418 + }, + { + "epoch": 0.0765042053051542, + "grad_norm": 0.8390635251998901, + "learning_rate": 9.99218239892123e-06, + "loss": 0.822, + "step": 1419 + }, + { + "epoch": 0.07655811947379772, + "grad_norm": 0.9037001132965088, + "learning_rate": 9.992170543693222e-06, + "loss": 0.8759, + "step": 1420 + }, + { + "epoch": 0.07661203364244124, + "grad_norm": 0.9708169102668762, + "learning_rate": 9.992158679489965e-06, + "loss": 0.875, + "step": 1421 + }, + { + "epoch": 0.07666594781108475, + "grad_norm": 0.8712205290794373, + "learning_rate": 9.992146806311479e-06, + "loss": 0.8711, + "step": 1422 + }, + { + "epoch": 0.07671986197972827, + "grad_norm": 0.953936755657196, + "learning_rate": 9.992134924157786e-06, + "loss": 0.8117, + "step": 1423 + }, + { + "epoch": 0.07677377614837179, + "grad_norm": 1.3178669214248657, + "learning_rate": 9.992123033028908e-06, + "loss": 0.8932, + "step": 1424 + }, + { + "epoch": 0.0768276903170153, + "grad_norm": 0.8657799959182739, + "learning_rate": 9.992111132924867e-06, + "loss": 0.8429, + "step": 1425 + }, + { + "epoch": 
0.07688160448565882, + "grad_norm": 0.8979378938674927, + "learning_rate": 9.992099223845681e-06, + "loss": 0.9165, + "step": 1426 + }, + { + "epoch": 0.07693551865430236, + "grad_norm": 0.797493040561676, + "learning_rate": 9.992087305791376e-06, + "loss": 0.8139, + "step": 1427 + }, + { + "epoch": 0.07698943282294587, + "grad_norm": 0.9762497544288635, + "learning_rate": 9.99207537876197e-06, + "loss": 0.8006, + "step": 1428 + }, + { + "epoch": 0.07704334699158939, + "grad_norm": 0.9322238564491272, + "learning_rate": 9.992063442757487e-06, + "loss": 0.8708, + "step": 1429 + }, + { + "epoch": 0.07709726116023291, + "grad_norm": 0.9208402037620544, + "learning_rate": 9.992051497777947e-06, + "loss": 0.9137, + "step": 1430 + }, + { + "epoch": 0.07715117532887643, + "grad_norm": 0.9262849688529968, + "learning_rate": 9.99203954382337e-06, + "loss": 0.8043, + "step": 1431 + }, + { + "epoch": 0.07720508949751995, + "grad_norm": 1.0556507110595703, + "learning_rate": 9.992027580893781e-06, + "loss": 0.8321, + "step": 1432 + }, + { + "epoch": 0.07725900366616346, + "grad_norm": 1.0503417253494263, + "learning_rate": 9.9920156089892e-06, + "loss": 0.8875, + "step": 1433 + }, + { + "epoch": 0.07731291783480698, + "grad_norm": 0.8772387504577637, + "learning_rate": 9.992003628109647e-06, + "loss": 0.7407, + "step": 1434 + }, + { + "epoch": 0.0773668320034505, + "grad_norm": 0.942286491394043, + "learning_rate": 9.991991638255146e-06, + "loss": 0.8493, + "step": 1435 + }, + { + "epoch": 0.07742074617209403, + "grad_norm": 0.8584794998168945, + "learning_rate": 9.991979639425717e-06, + "loss": 0.8003, + "step": 1436 + }, + { + "epoch": 0.07747466034073755, + "grad_norm": 0.8247780203819275, + "learning_rate": 9.99196763162138e-06, + "loss": 0.9156, + "step": 1437 + }, + { + "epoch": 0.07752857450938107, + "grad_norm": 0.859018862247467, + "learning_rate": 9.99195561484216e-06, + "loss": 0.8255, + "step": 1438 + }, + { + "epoch": 0.07758248867802459, + "grad_norm": 
0.9073282480239868, + "learning_rate": 9.991943589088078e-06, + "loss": 0.903, + "step": 1439 + }, + { + "epoch": 0.0776364028466681, + "grad_norm": 0.9324385523796082, + "learning_rate": 9.991931554359154e-06, + "loss": 0.8618, + "step": 1440 + }, + { + "epoch": 0.07769031701531162, + "grad_norm": 0.8038938045501709, + "learning_rate": 9.991919510655409e-06, + "loss": 0.7545, + "step": 1441 + }, + { + "epoch": 0.07774423118395514, + "grad_norm": 0.7999526858329773, + "learning_rate": 9.991907457976866e-06, + "loss": 0.6804, + "step": 1442 + }, + { + "epoch": 0.07779814535259866, + "grad_norm": 1.0165048837661743, + "learning_rate": 9.991895396323548e-06, + "loss": 0.7664, + "step": 1443 + }, + { + "epoch": 0.07785205952124218, + "grad_norm": 0.9513073563575745, + "learning_rate": 9.991883325695475e-06, + "loss": 0.8115, + "step": 1444 + }, + { + "epoch": 0.07790597368988571, + "grad_norm": 1.0391769409179688, + "learning_rate": 9.991871246092669e-06, + "loss": 0.9197, + "step": 1445 + }, + { + "epoch": 0.07795988785852923, + "grad_norm": 0.8990768194198608, + "learning_rate": 9.991859157515151e-06, + "loss": 0.9507, + "step": 1446 + }, + { + "epoch": 0.07801380202717274, + "grad_norm": 0.9990912079811096, + "learning_rate": 9.991847059962945e-06, + "loss": 0.7951, + "step": 1447 + }, + { + "epoch": 0.07806771619581626, + "grad_norm": 1.0030032396316528, + "learning_rate": 9.99183495343607e-06, + "loss": 0.7237, + "step": 1448 + }, + { + "epoch": 0.07812163036445978, + "grad_norm": 0.889561116695404, + "learning_rate": 9.991822837934551e-06, + "loss": 0.9061, + "step": 1449 + }, + { + "epoch": 0.0781755445331033, + "grad_norm": 0.8766982555389404, + "learning_rate": 9.991810713458405e-06, + "loss": 0.7952, + "step": 1450 + }, + { + "epoch": 0.07822945870174682, + "grad_norm": 0.9144406914710999, + "learning_rate": 9.991798580007658e-06, + "loss": 0.9235, + "step": 1451 + }, + { + "epoch": 0.07828337287039033, + "grad_norm": 0.895516037940979, + "learning_rate": 
9.99178643758233e-06, + "loss": 0.9469, + "step": 1452 + }, + { + "epoch": 0.07833728703903386, + "grad_norm": 0.8802943229675293, + "learning_rate": 9.991774286182443e-06, + "loss": 0.8548, + "step": 1453 + }, + { + "epoch": 0.07839120120767738, + "grad_norm": 1.2773913145065308, + "learning_rate": 9.99176212580802e-06, + "loss": 0.794, + "step": 1454 + }, + { + "epoch": 0.0784451153763209, + "grad_norm": 0.9501168131828308, + "learning_rate": 9.99174995645908e-06, + "loss": 0.8711, + "step": 1455 + }, + { + "epoch": 0.07849902954496442, + "grad_norm": 0.9047390222549438, + "learning_rate": 9.991737778135649e-06, + "loss": 0.8419, + "step": 1456 + }, + { + "epoch": 0.07855294371360794, + "grad_norm": 0.9492837190628052, + "learning_rate": 9.991725590837747e-06, + "loss": 0.9832, + "step": 1457 + }, + { + "epoch": 0.07860685788225145, + "grad_norm": 0.9585106372833252, + "learning_rate": 9.991713394565394e-06, + "loss": 0.8393, + "step": 1458 + }, + { + "epoch": 0.07866077205089497, + "grad_norm": 0.9568297266960144, + "learning_rate": 9.991701189318615e-06, + "loss": 0.8711, + "step": 1459 + }, + { + "epoch": 0.07871468621953849, + "grad_norm": 0.9201347231864929, + "learning_rate": 9.991688975097429e-06, + "loss": 0.7947, + "step": 1460 + }, + { + "epoch": 0.07876860038818201, + "grad_norm": 0.8375768661499023, + "learning_rate": 9.99167675190186e-06, + "loss": 0.8051, + "step": 1461 + }, + { + "epoch": 0.07882251455682554, + "grad_norm": 0.8397765755653381, + "learning_rate": 9.99166451973193e-06, + "loss": 0.7727, + "step": 1462 + }, + { + "epoch": 0.07887642872546906, + "grad_norm": 0.8697947859764099, + "learning_rate": 9.99165227858766e-06, + "loss": 0.8171, + "step": 1463 + }, + { + "epoch": 0.07893034289411258, + "grad_norm": 0.8894750475883484, + "learning_rate": 9.991640028469073e-06, + "loss": 0.8773, + "step": 1464 + }, + { + "epoch": 0.0789842570627561, + "grad_norm": 0.8817871809005737, + "learning_rate": 9.991627769376189e-06, + "loss": 0.8983, + 
"step": 1465 + }, + { + "epoch": 0.07903817123139961, + "grad_norm": 0.9241123795509338, + "learning_rate": 9.99161550130903e-06, + "loss": 0.8967, + "step": 1466 + }, + { + "epoch": 0.07909208540004313, + "grad_norm": 0.852982223033905, + "learning_rate": 9.991603224267623e-06, + "loss": 0.9054, + "step": 1467 + }, + { + "epoch": 0.07914599956868665, + "grad_norm": 0.7719098925590515, + "learning_rate": 9.991590938251986e-06, + "loss": 0.7845, + "step": 1468 + }, + { + "epoch": 0.07919991373733017, + "grad_norm": 0.8700329661369324, + "learning_rate": 9.99157864326214e-06, + "loss": 0.9664, + "step": 1469 + }, + { + "epoch": 0.07925382790597368, + "grad_norm": 0.880553126335144, + "learning_rate": 9.991566339298112e-06, + "loss": 0.8803, + "step": 1470 + }, + { + "epoch": 0.07930774207461722, + "grad_norm": 0.9425762295722961, + "learning_rate": 9.991554026359918e-06, + "loss": 0.8259, + "step": 1471 + }, + { + "epoch": 0.07936165624326073, + "grad_norm": 0.8611294031143188, + "learning_rate": 9.991541704447585e-06, + "loss": 0.8693, + "step": 1472 + }, + { + "epoch": 0.07941557041190425, + "grad_norm": 0.856023907661438, + "learning_rate": 9.99152937356113e-06, + "loss": 0.7073, + "step": 1473 + }, + { + "epoch": 0.07946948458054777, + "grad_norm": 0.7763693332672119, + "learning_rate": 9.991517033700582e-06, + "loss": 0.6815, + "step": 1474 + }, + { + "epoch": 0.07952339874919129, + "grad_norm": 0.8417321443557739, + "learning_rate": 9.991504684865959e-06, + "loss": 0.8239, + "step": 1475 + }, + { + "epoch": 0.0795773129178348, + "grad_norm": 0.9151323437690735, + "learning_rate": 9.991492327057282e-06, + "loss": 0.8327, + "step": 1476 + }, + { + "epoch": 0.07963122708647832, + "grad_norm": 0.8285405039787292, + "learning_rate": 9.991479960274576e-06, + "loss": 0.8623, + "step": 1477 + }, + { + "epoch": 0.07968514125512184, + "grad_norm": 0.8204792141914368, + "learning_rate": 9.991467584517863e-06, + "loss": 0.8494, + "step": 1478 + }, + { + "epoch": 
0.07973905542376536, + "grad_norm": 0.8516230583190918, + "learning_rate": 9.991455199787164e-06, + "loss": 0.8219, + "step": 1479 + }, + { + "epoch": 0.07979296959240889, + "grad_norm": 0.9418333172798157, + "learning_rate": 9.991442806082501e-06, + "loss": 0.9293, + "step": 1480 + }, + { + "epoch": 0.07984688376105241, + "grad_norm": 0.8852763175964355, + "learning_rate": 9.991430403403898e-06, + "loss": 0.8124, + "step": 1481 + }, + { + "epoch": 0.07990079792969593, + "grad_norm": 0.8435791730880737, + "learning_rate": 9.991417991751376e-06, + "loss": 0.8634, + "step": 1482 + }, + { + "epoch": 0.07995471209833944, + "grad_norm": 0.7795083522796631, + "learning_rate": 9.991405571124957e-06, + "loss": 0.802, + "step": 1483 + }, + { + "epoch": 0.08000862626698296, + "grad_norm": 0.8102303743362427, + "learning_rate": 9.991393141524663e-06, + "loss": 0.7492, + "step": 1484 + }, + { + "epoch": 0.08006254043562648, + "grad_norm": 0.8433593511581421, + "learning_rate": 9.99138070295052e-06, + "loss": 0.7926, + "step": 1485 + }, + { + "epoch": 0.08011645460427, + "grad_norm": 0.8992267847061157, + "learning_rate": 9.991368255402546e-06, + "loss": 0.7859, + "step": 1486 + }, + { + "epoch": 0.08017036877291352, + "grad_norm": 0.8748059868812561, + "learning_rate": 9.991355798880765e-06, + "loss": 0.8245, + "step": 1487 + }, + { + "epoch": 0.08022428294155703, + "grad_norm": 0.8456832766532898, + "learning_rate": 9.9913433333852e-06, + "loss": 0.9009, + "step": 1488 + }, + { + "epoch": 0.08027819711020057, + "grad_norm": 0.8582474589347839, + "learning_rate": 9.991330858915873e-06, + "loss": 0.7607, + "step": 1489 + }, + { + "epoch": 0.08033211127884408, + "grad_norm": 0.8157060146331787, + "learning_rate": 9.991318375472807e-06, + "loss": 0.8426, + "step": 1490 + }, + { + "epoch": 0.0803860254474876, + "grad_norm": 0.7474784851074219, + "learning_rate": 9.991305883056021e-06, + "loss": 0.8014, + "step": 1491 + }, + { + "epoch": 0.08043993961613112, + "grad_norm": 
0.8432475924491882, + "learning_rate": 9.991293381665543e-06, + "loss": 0.8254, + "step": 1492 + }, + { + "epoch": 0.08049385378477464, + "grad_norm": 0.8733057379722595, + "learning_rate": 9.991280871301392e-06, + "loss": 0.8694, + "step": 1493 + }, + { + "epoch": 0.08054776795341816, + "grad_norm": 0.8694074153900146, + "learning_rate": 9.991268351963592e-06, + "loss": 0.7306, + "step": 1494 + }, + { + "epoch": 0.08060168212206167, + "grad_norm": 0.8981258869171143, + "learning_rate": 9.991255823652162e-06, + "loss": 0.7821, + "step": 1495 + }, + { + "epoch": 0.08065559629070519, + "grad_norm": 0.9740719795227051, + "learning_rate": 9.99124328636713e-06, + "loss": 0.7678, + "step": 1496 + }, + { + "epoch": 0.08070951045934871, + "grad_norm": 0.8847763538360596, + "learning_rate": 9.991230740108515e-06, + "loss": 0.73, + "step": 1497 + }, + { + "epoch": 0.08076342462799224, + "grad_norm": 0.8909339308738708, + "learning_rate": 9.99121818487634e-06, + "loss": 0.7713, + "step": 1498 + }, + { + "epoch": 0.08081733879663576, + "grad_norm": 0.8183975219726562, + "learning_rate": 9.991205620670626e-06, + "loss": 0.8234, + "step": 1499 + }, + { + "epoch": 0.08087125296527928, + "grad_norm": 1.241355299949646, + "learning_rate": 9.991193047491399e-06, + "loss": 0.8135, + "step": 1500 + }, + { + "epoch": 0.0809251671339228, + "grad_norm": 0.9039500951766968, + "learning_rate": 9.991180465338682e-06, + "loss": 0.8642, + "step": 1501 + }, + { + "epoch": 0.08097908130256631, + "grad_norm": 1.1762068271636963, + "learning_rate": 9.991167874212493e-06, + "loss": 0.7892, + "step": 1502 + }, + { + "epoch": 0.08103299547120983, + "grad_norm": 0.8402833938598633, + "learning_rate": 9.991155274112857e-06, + "loss": 0.9054, + "step": 1503 + }, + { + "epoch": 0.08108690963985335, + "grad_norm": 0.9271976351737976, + "learning_rate": 9.991142665039799e-06, + "loss": 0.8902, + "step": 1504 + }, + { + "epoch": 0.08114082380849687, + "grad_norm": 0.9105845093727112, + "learning_rate": 
9.991130046993337e-06, + "loss": 0.8522, + "step": 1505 + }, + { + "epoch": 0.0811947379771404, + "grad_norm": 0.8248290419578552, + "learning_rate": 9.991117419973499e-06, + "loss": 0.882, + "step": 1506 + }, + { + "epoch": 0.08124865214578392, + "grad_norm": 1.0726820230484009, + "learning_rate": 9.991104783980305e-06, + "loss": 0.8001, + "step": 1507 + }, + { + "epoch": 0.08130256631442744, + "grad_norm": 1.296281337738037, + "learning_rate": 9.991092139013776e-06, + "loss": 1.0022, + "step": 1508 + }, + { + "epoch": 0.08135648048307095, + "grad_norm": 1.7287628650665283, + "learning_rate": 9.991079485073938e-06, + "loss": 0.914, + "step": 1509 + }, + { + "epoch": 0.08141039465171447, + "grad_norm": 0.8731694221496582, + "learning_rate": 9.991066822160813e-06, + "loss": 0.8672, + "step": 1510 + }, + { + "epoch": 0.08146430882035799, + "grad_norm": 0.875747799873352, + "learning_rate": 9.99105415027442e-06, + "loss": 0.8044, + "step": 1511 + }, + { + "epoch": 0.08151822298900151, + "grad_norm": 0.9055120348930359, + "learning_rate": 9.991041469414787e-06, + "loss": 0.8312, + "step": 1512 + }, + { + "epoch": 0.08157213715764502, + "grad_norm": 0.8849499821662903, + "learning_rate": 9.991028779581935e-06, + "loss": 0.889, + "step": 1513 + }, + { + "epoch": 0.08162605132628854, + "grad_norm": 0.9549855589866638, + "learning_rate": 9.991016080775884e-06, + "loss": 0.8929, + "step": 1514 + }, + { + "epoch": 0.08167996549493207, + "grad_norm": 0.8395527005195618, + "learning_rate": 9.991003372996662e-06, + "loss": 0.6774, + "step": 1515 + }, + { + "epoch": 0.08173387966357559, + "grad_norm": 0.7791672945022583, + "learning_rate": 9.990990656244287e-06, + "loss": 0.7178, + "step": 1516 + }, + { + "epoch": 0.08178779383221911, + "grad_norm": 0.91841721534729, + "learning_rate": 9.990977930518785e-06, + "loss": 0.8372, + "step": 1517 + }, + { + "epoch": 0.08184170800086263, + "grad_norm": 0.923937976360321, + "learning_rate": 9.990965195820178e-06, + "loss": 0.8467, + 
"step": 1518 + }, + { + "epoch": 0.08189562216950615, + "grad_norm": 0.9804415106773376, + "learning_rate": 9.990952452148488e-06, + "loss": 0.9281, + "step": 1519 + }, + { + "epoch": 0.08194953633814966, + "grad_norm": 0.9396255016326904, + "learning_rate": 9.99093969950374e-06, + "loss": 0.8606, + "step": 1520 + }, + { + "epoch": 0.08200345050679318, + "grad_norm": 0.8492118120193481, + "learning_rate": 9.990926937885953e-06, + "loss": 0.8253, + "step": 1521 + }, + { + "epoch": 0.0820573646754367, + "grad_norm": 0.8482204079627991, + "learning_rate": 9.990914167295154e-06, + "loss": 0.7361, + "step": 1522 + }, + { + "epoch": 0.08211127884408022, + "grad_norm": 1.1302778720855713, + "learning_rate": 9.990901387731365e-06, + "loss": 0.7511, + "step": 1523 + }, + { + "epoch": 0.08216519301272375, + "grad_norm": 0.9285756945610046, + "learning_rate": 9.990888599194607e-06, + "loss": 0.8329, + "step": 1524 + }, + { + "epoch": 0.08221910718136727, + "grad_norm": 0.8932104110717773, + "learning_rate": 9.990875801684905e-06, + "loss": 0.8146, + "step": 1525 + }, + { + "epoch": 0.08227302135001079, + "grad_norm": 0.8232647180557251, + "learning_rate": 9.990862995202282e-06, + "loss": 0.763, + "step": 1526 + }, + { + "epoch": 0.0823269355186543, + "grad_norm": 0.8582163453102112, + "learning_rate": 9.990850179746759e-06, + "loss": 0.7675, + "step": 1527 + }, + { + "epoch": 0.08238084968729782, + "grad_norm": 0.9890977144241333, + "learning_rate": 9.990837355318362e-06, + "loss": 0.8438, + "step": 1528 + }, + { + "epoch": 0.08243476385594134, + "grad_norm": 0.9228235483169556, + "learning_rate": 9.990824521917113e-06, + "loss": 0.9324, + "step": 1529 + }, + { + "epoch": 0.08248867802458486, + "grad_norm": 0.8286252617835999, + "learning_rate": 9.990811679543033e-06, + "loss": 0.872, + "step": 1530 + }, + { + "epoch": 0.08254259219322838, + "grad_norm": 0.8546530604362488, + "learning_rate": 9.990798828196146e-06, + "loss": 0.7256, + "step": 1531 + }, + { + "epoch": 
0.0825965063618719, + "grad_norm": 0.8240640759468079, + "learning_rate": 9.990785967876478e-06, + "loss": 0.8083, + "step": 1532 + }, + { + "epoch": 0.08265042053051543, + "grad_norm": 0.8650565147399902, + "learning_rate": 9.99077309858405e-06, + "loss": 0.8274, + "step": 1533 + }, + { + "epoch": 0.08270433469915894, + "grad_norm": 0.7865849137306213, + "learning_rate": 9.990760220318884e-06, + "loss": 0.7978, + "step": 1534 + }, + { + "epoch": 0.08275824886780246, + "grad_norm": 0.8567995429039001, + "learning_rate": 9.990747333081005e-06, + "loss": 0.8172, + "step": 1535 + }, + { + "epoch": 0.08281216303644598, + "grad_norm": 0.8242521286010742, + "learning_rate": 9.990734436870435e-06, + "loss": 0.8045, + "step": 1536 + }, + { + "epoch": 0.0828660772050895, + "grad_norm": 0.801266074180603, + "learning_rate": 9.990721531687197e-06, + "loss": 0.8312, + "step": 1537 + }, + { + "epoch": 0.08291999137373302, + "grad_norm": 0.8027862906455994, + "learning_rate": 9.990708617531314e-06, + "loss": 0.7227, + "step": 1538 + }, + { + "epoch": 0.08297390554237653, + "grad_norm": 1.0332401990890503, + "learning_rate": 9.990695694402811e-06, + "loss": 0.9091, + "step": 1539 + }, + { + "epoch": 0.08302781971102005, + "grad_norm": 0.8537373542785645, + "learning_rate": 9.99068276230171e-06, + "loss": 0.7573, + "step": 1540 + }, + { + "epoch": 0.08308173387966357, + "grad_norm": 0.8734087944030762, + "learning_rate": 9.990669821228037e-06, + "loss": 0.901, + "step": 1541 + }, + { + "epoch": 0.0831356480483071, + "grad_norm": 0.8546577095985413, + "learning_rate": 9.99065687118181e-06, + "loss": 0.8294, + "step": 1542 + }, + { + "epoch": 0.08318956221695062, + "grad_norm": 0.9555438756942749, + "learning_rate": 9.990643912163055e-06, + "loss": 0.83, + "step": 1543 + }, + { + "epoch": 0.08324347638559414, + "grad_norm": 0.8778670430183411, + "learning_rate": 9.990630944171798e-06, + "loss": 0.8694, + "step": 1544 + }, + { + "epoch": 0.08329739055423765, + "grad_norm": 
0.973791241645813, + "learning_rate": 9.990617967208058e-06, + "loss": 0.8348, + "step": 1545 + }, + { + "epoch": 0.08335130472288117, + "grad_norm": 0.7933714389801025, + "learning_rate": 9.990604981271858e-06, + "loss": 0.8208, + "step": 1546 + }, + { + "epoch": 0.08340521889152469, + "grad_norm": 0.9328469634056091, + "learning_rate": 9.990591986363226e-06, + "loss": 0.8188, + "step": 1547 + }, + { + "epoch": 0.08345913306016821, + "grad_norm": 0.8217103481292725, + "learning_rate": 9.990578982482183e-06, + "loss": 0.7948, + "step": 1548 + }, + { + "epoch": 0.08351304722881173, + "grad_norm": 0.8556894659996033, + "learning_rate": 9.990565969628749e-06, + "loss": 0.8129, + "step": 1549 + }, + { + "epoch": 0.08356696139745524, + "grad_norm": 0.901633083820343, + "learning_rate": 9.990552947802954e-06, + "loss": 0.9025, + "step": 1550 + }, + { + "epoch": 0.08362087556609878, + "grad_norm": 0.9021494388580322, + "learning_rate": 9.990539917004815e-06, + "loss": 0.8882, + "step": 1551 + }, + { + "epoch": 0.0836747897347423, + "grad_norm": 0.8187722563743591, + "learning_rate": 9.990526877234359e-06, + "loss": 0.7385, + "step": 1552 + }, + { + "epoch": 0.08372870390338581, + "grad_norm": 0.9237630367279053, + "learning_rate": 9.990513828491609e-06, + "loss": 0.851, + "step": 1553 + }, + { + "epoch": 0.08378261807202933, + "grad_norm": 1.1868582963943481, + "learning_rate": 9.990500770776589e-06, + "loss": 0.7701, + "step": 1554 + }, + { + "epoch": 0.08383653224067285, + "grad_norm": 0.9831421971321106, + "learning_rate": 9.990487704089322e-06, + "loss": 0.836, + "step": 1555 + }, + { + "epoch": 0.08389044640931637, + "grad_norm": 0.9255663752555847, + "learning_rate": 9.99047462842983e-06, + "loss": 0.7916, + "step": 1556 + }, + { + "epoch": 0.08394436057795988, + "grad_norm": 1.0069084167480469, + "learning_rate": 9.990461543798137e-06, + "loss": 0.8652, + "step": 1557 + }, + { + "epoch": 0.0839982747466034, + "grad_norm": 0.943044900894165, + "learning_rate": 
9.990448450194267e-06, + "loss": 0.9511, + "step": 1558 + }, + { + "epoch": 0.08405218891524693, + "grad_norm": 0.9996150135993958, + "learning_rate": 9.990435347618246e-06, + "loss": 0.8751, + "step": 1559 + }, + { + "epoch": 0.08410610308389045, + "grad_norm": 0.9531681537628174, + "learning_rate": 9.990422236070094e-06, + "loss": 0.8988, + "step": 1560 + }, + { + "epoch": 0.08416001725253397, + "grad_norm": 0.9504678249359131, + "learning_rate": 9.990409115549837e-06, + "loss": 0.808, + "step": 1561 + }, + { + "epoch": 0.08421393142117749, + "grad_norm": 0.9796282052993774, + "learning_rate": 9.990395986057496e-06, + "loss": 0.778, + "step": 1562 + }, + { + "epoch": 0.084267845589821, + "grad_norm": 0.8871618509292603, + "learning_rate": 9.990382847593096e-06, + "loss": 0.8945, + "step": 1563 + }, + { + "epoch": 0.08432175975846452, + "grad_norm": 0.8253110647201538, + "learning_rate": 9.990369700156662e-06, + "loss": 0.8206, + "step": 1564 + }, + { + "epoch": 0.08437567392710804, + "grad_norm": 0.8799824118614197, + "learning_rate": 9.990356543748216e-06, + "loss": 0.7665, + "step": 1565 + }, + { + "epoch": 0.08442958809575156, + "grad_norm": 0.8275637626647949, + "learning_rate": 9.990343378367782e-06, + "loss": 0.8468, + "step": 1566 + }, + { + "epoch": 0.08448350226439508, + "grad_norm": 1.0431691408157349, + "learning_rate": 9.990330204015382e-06, + "loss": 0.8539, + "step": 1567 + }, + { + "epoch": 0.08453741643303861, + "grad_norm": 1.298999547958374, + "learning_rate": 9.990317020691043e-06, + "loss": 0.8989, + "step": 1568 + }, + { + "epoch": 0.08459133060168213, + "grad_norm": 0.865868866443634, + "learning_rate": 9.990303828394787e-06, + "loss": 0.8296, + "step": 1569 + }, + { + "epoch": 0.08464524477032564, + "grad_norm": 0.9162652492523193, + "learning_rate": 9.990290627126637e-06, + "loss": 0.8617, + "step": 1570 + }, + { + "epoch": 0.08469915893896916, + "grad_norm": 0.9753283858299255, + "learning_rate": 9.990277416886618e-06, + "loss": 0.8082, + 
"step": 1571 + }, + { + "epoch": 0.08475307310761268, + "grad_norm": 0.9561176300048828, + "learning_rate": 9.990264197674754e-06, + "loss": 0.8678, + "step": 1572 + }, + { + "epoch": 0.0848069872762562, + "grad_norm": 0.833341658115387, + "learning_rate": 9.990250969491067e-06, + "loss": 0.8164, + "step": 1573 + }, + { + "epoch": 0.08486090144489972, + "grad_norm": 0.9928603172302246, + "learning_rate": 9.990237732335581e-06, + "loss": 0.6889, + "step": 1574 + }, + { + "epoch": 0.08491481561354323, + "grad_norm": 1.0163367986679077, + "learning_rate": 9.990224486208322e-06, + "loss": 0.8278, + "step": 1575 + }, + { + "epoch": 0.08496872978218675, + "grad_norm": 0.9905970096588135, + "learning_rate": 9.990211231109312e-06, + "loss": 0.8094, + "step": 1576 + }, + { + "epoch": 0.08502264395083028, + "grad_norm": 0.9112648963928223, + "learning_rate": 9.990197967038574e-06, + "loss": 0.8782, + "step": 1577 + }, + { + "epoch": 0.0850765581194738, + "grad_norm": 1.1176974773406982, + "learning_rate": 9.990184693996136e-06, + "loss": 0.8826, + "step": 1578 + }, + { + "epoch": 0.08513047228811732, + "grad_norm": 0.7696222066879272, + "learning_rate": 9.990171411982016e-06, + "loss": 0.8025, + "step": 1579 + }, + { + "epoch": 0.08518438645676084, + "grad_norm": 0.9288634061813354, + "learning_rate": 9.990158120996242e-06, + "loss": 0.8777, + "step": 1580 + }, + { + "epoch": 0.08523830062540436, + "grad_norm": 0.9235022068023682, + "learning_rate": 9.990144821038839e-06, + "loss": 0.9339, + "step": 1581 + }, + { + "epoch": 0.08529221479404787, + "grad_norm": 0.9124205708503723, + "learning_rate": 9.990131512109826e-06, + "loss": 0.8368, + "step": 1582 + }, + { + "epoch": 0.08534612896269139, + "grad_norm": 0.8409048914909363, + "learning_rate": 9.990118194209229e-06, + "loss": 0.7772, + "step": 1583 + }, + { + "epoch": 0.08540004313133491, + "grad_norm": 0.8279136419296265, + "learning_rate": 9.990104867337074e-06, + "loss": 0.738, + "step": 1584 + }, + { + "epoch": 
0.08545395729997843, + "grad_norm": 0.8895745873451233, + "learning_rate": 9.990091531493382e-06, + "loss": 0.7669, + "step": 1585 + }, + { + "epoch": 0.08550787146862196, + "grad_norm": 0.9280734062194824, + "learning_rate": 9.99007818667818e-06, + "loss": 0.9052, + "step": 1586 + }, + { + "epoch": 0.08556178563726548, + "grad_norm": 0.7676610350608826, + "learning_rate": 9.990064832891491e-06, + "loss": 0.807, + "step": 1587 + }, + { + "epoch": 0.085615699805909, + "grad_norm": 0.9035676121711731, + "learning_rate": 9.990051470133337e-06, + "loss": 0.8848, + "step": 1588 + }, + { + "epoch": 0.08566961397455251, + "grad_norm": 1.0960334539413452, + "learning_rate": 9.990038098403742e-06, + "loss": 0.8279, + "step": 1589 + }, + { + "epoch": 0.08572352814319603, + "grad_norm": 0.87922203540802, + "learning_rate": 9.990024717702736e-06, + "loss": 0.8325, + "step": 1590 + }, + { + "epoch": 0.08577744231183955, + "grad_norm": 0.922815203666687, + "learning_rate": 9.990011328030335e-06, + "loss": 0.881, + "step": 1591 + }, + { + "epoch": 0.08583135648048307, + "grad_norm": 0.9880780577659607, + "learning_rate": 9.989997929386567e-06, + "loss": 0.7506, + "step": 1592 + }, + { + "epoch": 0.08588527064912659, + "grad_norm": 0.8827483057975769, + "learning_rate": 9.989984521771456e-06, + "loss": 0.8961, + "step": 1593 + }, + { + "epoch": 0.0859391848177701, + "grad_norm": 0.8395072817802429, + "learning_rate": 9.989971105185026e-06, + "loss": 0.8564, + "step": 1594 + }, + { + "epoch": 0.08599309898641364, + "grad_norm": 0.8731534481048584, + "learning_rate": 9.989957679627302e-06, + "loss": 0.8209, + "step": 1595 + }, + { + "epoch": 0.08604701315505715, + "grad_norm": 0.7969424724578857, + "learning_rate": 9.989944245098305e-06, + "loss": 0.8031, + "step": 1596 + }, + { + "epoch": 0.08610092732370067, + "grad_norm": 0.8420547246932983, + "learning_rate": 9.989930801598062e-06, + "loss": 0.8027, + "step": 1597 + }, + { + "epoch": 0.08615484149234419, + "grad_norm": 
0.7900253534317017, + "learning_rate": 9.989917349126597e-06, + "loss": 0.8246, + "step": 1598 + }, + { + "epoch": 0.08620875566098771, + "grad_norm": 0.8860716819763184, + "learning_rate": 9.989903887683934e-06, + "loss": 0.7846, + "step": 1599 + }, + { + "epoch": 0.08626266982963122, + "grad_norm": 0.907744288444519, + "learning_rate": 9.989890417270097e-06, + "loss": 0.7813, + "step": 1600 + }, + { + "epoch": 0.08631658399827474, + "grad_norm": 0.764076828956604, + "learning_rate": 9.989876937885108e-06, + "loss": 0.7953, + "step": 1601 + }, + { + "epoch": 0.08637049816691826, + "grad_norm": 1.0143790245056152, + "learning_rate": 9.989863449528994e-06, + "loss": 0.8854, + "step": 1602 + }, + { + "epoch": 0.08642441233556178, + "grad_norm": 0.8605815172195435, + "learning_rate": 9.989849952201779e-06, + "loss": 0.9289, + "step": 1603 + }, + { + "epoch": 0.08647832650420531, + "grad_norm": 0.8897641897201538, + "learning_rate": 9.989836445903487e-06, + "loss": 0.8659, + "step": 1604 + }, + { + "epoch": 0.08653224067284883, + "grad_norm": 0.8893518447875977, + "learning_rate": 9.989822930634141e-06, + "loss": 0.8724, + "step": 1605 + }, + { + "epoch": 0.08658615484149235, + "grad_norm": 0.8152129054069519, + "learning_rate": 9.989809406393767e-06, + "loss": 0.8321, + "step": 1606 + }, + { + "epoch": 0.08664006901013586, + "grad_norm": 0.8394732475280762, + "learning_rate": 9.98979587318239e-06, + "loss": 0.8074, + "step": 1607 + }, + { + "epoch": 0.08669398317877938, + "grad_norm": 0.8038346767425537, + "learning_rate": 9.989782331000031e-06, + "loss": 0.8132, + "step": 1608 + }, + { + "epoch": 0.0867478973474229, + "grad_norm": 0.8574134111404419, + "learning_rate": 9.989768779846717e-06, + "loss": 0.8191, + "step": 1609 + }, + { + "epoch": 0.08680181151606642, + "grad_norm": 1.0049889087677002, + "learning_rate": 9.989755219722472e-06, + "loss": 0.8771, + "step": 1610 + }, + { + "epoch": 0.08685572568470994, + "grad_norm": 0.9765112996101379, + "learning_rate": 
9.989741650627319e-06, + "loss": 0.839, + "step": 1611 + }, + { + "epoch": 0.08690963985335347, + "grad_norm": 0.9430082440376282, + "learning_rate": 9.989728072561284e-06, + "loss": 1.0316, + "step": 1612 + }, + { + "epoch": 0.08696355402199699, + "grad_norm": 0.841590404510498, + "learning_rate": 9.989714485524391e-06, + "loss": 0.8727, + "step": 1613 + }, + { + "epoch": 0.0870174681906405, + "grad_norm": 0.9475975632667542, + "learning_rate": 9.989700889516664e-06, + "loss": 0.8131, + "step": 1614 + }, + { + "epoch": 0.08707138235928402, + "grad_norm": 0.8059530258178711, + "learning_rate": 9.98968728453813e-06, + "loss": 0.8297, + "step": 1615 + }, + { + "epoch": 0.08712529652792754, + "grad_norm": 0.8513601422309875, + "learning_rate": 9.989673670588808e-06, + "loss": 0.8016, + "step": 1616 + }, + { + "epoch": 0.08717921069657106, + "grad_norm": 0.8434658646583557, + "learning_rate": 9.989660047668728e-06, + "loss": 0.866, + "step": 1617 + }, + { + "epoch": 0.08723312486521458, + "grad_norm": 0.9081484079360962, + "learning_rate": 9.989646415777912e-06, + "loss": 0.816, + "step": 1618 + }, + { + "epoch": 0.0872870390338581, + "grad_norm": 0.7941877841949463, + "learning_rate": 9.989632774916385e-06, + "loss": 0.7191, + "step": 1619 + }, + { + "epoch": 0.08734095320250161, + "grad_norm": 0.8800172209739685, + "learning_rate": 9.98961912508417e-06, + "loss": 0.8135, + "step": 1620 + }, + { + "epoch": 0.08739486737114514, + "grad_norm": 0.7940575480461121, + "learning_rate": 9.989605466281292e-06, + "loss": 0.8124, + "step": 1621 + }, + { + "epoch": 0.08744878153978866, + "grad_norm": 0.9570618271827698, + "learning_rate": 9.989591798507779e-06, + "loss": 0.9043, + "step": 1622 + }, + { + "epoch": 0.08750269570843218, + "grad_norm": 0.8635395169258118, + "learning_rate": 9.98957812176365e-06, + "loss": 0.835, + "step": 1623 + }, + { + "epoch": 0.0875566098770757, + "grad_norm": 0.8289955258369446, + "learning_rate": 9.989564436048932e-06, + "loss": 0.8265, + 
"step": 1624 + }, + { + "epoch": 0.08761052404571922, + "grad_norm": 0.9519028663635254, + "learning_rate": 9.989550741363654e-06, + "loss": 0.8127, + "step": 1625 + }, + { + "epoch": 0.08766443821436273, + "grad_norm": 0.9611422419548035, + "learning_rate": 9.989537037707834e-06, + "loss": 0.8422, + "step": 1626 + }, + { + "epoch": 0.08771835238300625, + "grad_norm": 0.8824746608734131, + "learning_rate": 9.9895233250815e-06, + "loss": 0.8669, + "step": 1627 + }, + { + "epoch": 0.08777226655164977, + "grad_norm": 0.8402838706970215, + "learning_rate": 9.989509603484676e-06, + "loss": 0.8072, + "step": 1628 + }, + { + "epoch": 0.08782618072029329, + "grad_norm": 0.7537099719047546, + "learning_rate": 9.989495872917386e-06, + "loss": 0.7127, + "step": 1629 + }, + { + "epoch": 0.08788009488893682, + "grad_norm": 0.78285151720047, + "learning_rate": 9.989482133379656e-06, + "loss": 0.819, + "step": 1630 + }, + { + "epoch": 0.08793400905758034, + "grad_norm": 0.9339445233345032, + "learning_rate": 9.98946838487151e-06, + "loss": 0.8694, + "step": 1631 + }, + { + "epoch": 0.08798792322622385, + "grad_norm": 0.8022040128707886, + "learning_rate": 9.989454627392973e-06, + "loss": 0.7601, + "step": 1632 + }, + { + "epoch": 0.08804183739486737, + "grad_norm": 0.8593827486038208, + "learning_rate": 9.98944086094407e-06, + "loss": 0.8536, + "step": 1633 + }, + { + "epoch": 0.08809575156351089, + "grad_norm": 0.8415039777755737, + "learning_rate": 9.989427085524824e-06, + "loss": 0.9027, + "step": 1634 + }, + { + "epoch": 0.08814966573215441, + "grad_norm": 0.9551103711128235, + "learning_rate": 9.989413301135263e-06, + "loss": 0.8063, + "step": 1635 + }, + { + "epoch": 0.08820357990079793, + "grad_norm": 0.8554351925849915, + "learning_rate": 9.989399507775407e-06, + "loss": 0.7694, + "step": 1636 + }, + { + "epoch": 0.08825749406944144, + "grad_norm": 0.8688547015190125, + "learning_rate": 9.989385705445285e-06, + "loss": 0.8862, + "step": 1637 + }, + { + "epoch": 
0.08831140823808496, + "grad_norm": 0.816558837890625, + "learning_rate": 9.98937189414492e-06, + "loss": 0.7302, + "step": 1638 + }, + { + "epoch": 0.0883653224067285, + "grad_norm": 0.8164445757865906, + "learning_rate": 9.989358073874337e-06, + "loss": 0.8724, + "step": 1639 + }, + { + "epoch": 0.08841923657537201, + "grad_norm": 0.8909460306167603, + "learning_rate": 9.989344244633564e-06, + "loss": 0.7618, + "step": 1640 + }, + { + "epoch": 0.08847315074401553, + "grad_norm": 1.0117470026016235, + "learning_rate": 9.98933040642262e-06, + "loss": 0.8191, + "step": 1641 + }, + { + "epoch": 0.08852706491265905, + "grad_norm": 0.8317937850952148, + "learning_rate": 9.989316559241533e-06, + "loss": 0.8339, + "step": 1642 + }, + { + "epoch": 0.08858097908130257, + "grad_norm": 0.7955135107040405, + "learning_rate": 9.98930270309033e-06, + "loss": 0.7799, + "step": 1643 + }, + { + "epoch": 0.08863489324994608, + "grad_norm": 0.996306300163269, + "learning_rate": 9.98928883796903e-06, + "loss": 0.8547, + "step": 1644 + }, + { + "epoch": 0.0886888074185896, + "grad_norm": 0.9679511189460754, + "learning_rate": 9.989274963877664e-06, + "loss": 1.0831, + "step": 1645 + }, + { + "epoch": 0.08874272158723312, + "grad_norm": 0.8471615314483643, + "learning_rate": 9.989261080816253e-06, + "loss": 0.7765, + "step": 1646 + }, + { + "epoch": 0.08879663575587664, + "grad_norm": 0.8662555813789368, + "learning_rate": 9.989247188784826e-06, + "loss": 0.8894, + "step": 1647 + }, + { + "epoch": 0.08885054992452017, + "grad_norm": 0.9549373388290405, + "learning_rate": 9.989233287783402e-06, + "loss": 0.8341, + "step": 1648 + }, + { + "epoch": 0.08890446409316369, + "grad_norm": 0.8179014325141907, + "learning_rate": 9.989219377812014e-06, + "loss": 0.8653, + "step": 1649 + }, + { + "epoch": 0.0889583782618072, + "grad_norm": 0.9237802624702454, + "learning_rate": 9.989205458870678e-06, + "loss": 0.8206, + "step": 1650 + }, + { + "epoch": 0.08901229243045072, + "grad_norm": 
0.940217137336731, + "learning_rate": 9.989191530959426e-06, + "loss": 0.8695, + "step": 1651 + }, + { + "epoch": 0.08906620659909424, + "grad_norm": 0.9200409054756165, + "learning_rate": 9.98917759407828e-06, + "loss": 0.7984, + "step": 1652 + }, + { + "epoch": 0.08912012076773776, + "grad_norm": 0.9270562529563904, + "learning_rate": 9.989163648227265e-06, + "loss": 0.8265, + "step": 1653 + }, + { + "epoch": 0.08917403493638128, + "grad_norm": 0.9945223331451416, + "learning_rate": 9.989149693406408e-06, + "loss": 0.84, + "step": 1654 + }, + { + "epoch": 0.0892279491050248, + "grad_norm": 0.826195478439331, + "learning_rate": 9.98913572961573e-06, + "loss": 0.7862, + "step": 1655 + }, + { + "epoch": 0.08928186327366831, + "grad_norm": 0.9132022857666016, + "learning_rate": 9.989121756855263e-06, + "loss": 0.826, + "step": 1656 + }, + { + "epoch": 0.08933577744231185, + "grad_norm": 0.8559401631355286, + "learning_rate": 9.989107775125023e-06, + "loss": 0.8007, + "step": 1657 + }, + { + "epoch": 0.08938969161095536, + "grad_norm": 0.8000867366790771, + "learning_rate": 9.989093784425044e-06, + "loss": 0.7547, + "step": 1658 + }, + { + "epoch": 0.08944360577959888, + "grad_norm": 0.7761433720588684, + "learning_rate": 9.989079784755346e-06, + "loss": 0.8083, + "step": 1659 + }, + { + "epoch": 0.0894975199482424, + "grad_norm": 0.8072230815887451, + "learning_rate": 9.989065776115956e-06, + "loss": 0.892, + "step": 1660 + }, + { + "epoch": 0.08955143411688592, + "grad_norm": 0.9021360874176025, + "learning_rate": 9.989051758506898e-06, + "loss": 0.8715, + "step": 1661 + }, + { + "epoch": 0.08960534828552943, + "grad_norm": 0.7585147023200989, + "learning_rate": 9.989037731928197e-06, + "loss": 0.7115, + "step": 1662 + }, + { + "epoch": 0.08965926245417295, + "grad_norm": 0.9388399124145508, + "learning_rate": 9.98902369637988e-06, + "loss": 0.8976, + "step": 1663 + }, + { + "epoch": 0.08971317662281647, + "grad_norm": 0.8454418778419495, + "learning_rate": 
9.989009651861972e-06, + "loss": 0.8063, + "step": 1664 + }, + { + "epoch": 0.08976709079146, + "grad_norm": 0.82308030128479, + "learning_rate": 9.988995598374496e-06, + "loss": 0.8044, + "step": 1665 + }, + { + "epoch": 0.08982100496010352, + "grad_norm": 1.006800651550293, + "learning_rate": 9.98898153591748e-06, + "loss": 0.8609, + "step": 1666 + }, + { + "epoch": 0.08987491912874704, + "grad_norm": 0.8325724601745605, + "learning_rate": 9.988967464490947e-06, + "loss": 0.8295, + "step": 1667 + }, + { + "epoch": 0.08992883329739056, + "grad_norm": 0.7575547695159912, + "learning_rate": 9.988953384094923e-06, + "loss": 0.8252, + "step": 1668 + }, + { + "epoch": 0.08998274746603407, + "grad_norm": 0.869877278804779, + "learning_rate": 9.988939294729436e-06, + "loss": 0.8304, + "step": 1669 + }, + { + "epoch": 0.09003666163467759, + "grad_norm": 0.7840037941932678, + "learning_rate": 9.988925196394508e-06, + "loss": 0.7742, + "step": 1670 + }, + { + "epoch": 0.09009057580332111, + "grad_norm": 0.8044409155845642, + "learning_rate": 9.988911089090163e-06, + "loss": 0.8371, + "step": 1671 + }, + { + "epoch": 0.09014448997196463, + "grad_norm": 0.8635613322257996, + "learning_rate": 9.988896972816431e-06, + "loss": 0.7693, + "step": 1672 + }, + { + "epoch": 0.09019840414060815, + "grad_norm": 0.7780656814575195, + "learning_rate": 9.988882847573335e-06, + "loss": 0.841, + "step": 1673 + }, + { + "epoch": 0.09025231830925168, + "grad_norm": 0.8938048481941223, + "learning_rate": 9.9888687133609e-06, + "loss": 0.8149, + "step": 1674 + }, + { + "epoch": 0.0903062324778952, + "grad_norm": 0.8432002663612366, + "learning_rate": 9.988854570179152e-06, + "loss": 0.853, + "step": 1675 + }, + { + "epoch": 0.09036014664653871, + "grad_norm": 0.8222450613975525, + "learning_rate": 9.988840418028118e-06, + "loss": 0.897, + "step": 1676 + }, + { + "epoch": 0.09041406081518223, + "grad_norm": 0.8370371460914612, + "learning_rate": 9.98882625690782e-06, + "loss": 0.8288, + "step": 
1677 + }, + { + "epoch": 0.09046797498382575, + "grad_norm": 0.8510713577270508, + "learning_rate": 9.988812086818285e-06, + "loss": 0.7637, + "step": 1678 + }, + { + "epoch": 0.09052188915246927, + "grad_norm": 0.8271141648292542, + "learning_rate": 9.98879790775954e-06, + "loss": 0.853, + "step": 1679 + }, + { + "epoch": 0.09057580332111279, + "grad_norm": 1.0627025365829468, + "learning_rate": 9.988783719731607e-06, + "loss": 0.7569, + "step": 1680 + }, + { + "epoch": 0.0906297174897563, + "grad_norm": 0.880283534526825, + "learning_rate": 9.988769522734517e-06, + "loss": 0.8362, + "step": 1681 + }, + { + "epoch": 0.09068363165839982, + "grad_norm": 0.8721734881401062, + "learning_rate": 9.988755316768288e-06, + "loss": 0.8585, + "step": 1682 + }, + { + "epoch": 0.09073754582704335, + "grad_norm": 0.8830682039260864, + "learning_rate": 9.988741101832952e-06, + "loss": 0.8853, + "step": 1683 + }, + { + "epoch": 0.09079145999568687, + "grad_norm": 0.7676220536231995, + "learning_rate": 9.988726877928534e-06, + "loss": 0.7832, + "step": 1684 + }, + { + "epoch": 0.09084537416433039, + "grad_norm": 0.866149365901947, + "learning_rate": 9.988712645055055e-06, + "loss": 0.8534, + "step": 1685 + }, + { + "epoch": 0.09089928833297391, + "grad_norm": 0.8467028141021729, + "learning_rate": 9.988698403212546e-06, + "loss": 0.8637, + "step": 1686 + }, + { + "epoch": 0.09095320250161743, + "grad_norm": 0.913436770439148, + "learning_rate": 9.988684152401028e-06, + "loss": 0.855, + "step": 1687 + }, + { + "epoch": 0.09100711667026094, + "grad_norm": 0.8307977914810181, + "learning_rate": 9.98866989262053e-06, + "loss": 0.8538, + "step": 1688 + }, + { + "epoch": 0.09106103083890446, + "grad_norm": 1.13442862033844, + "learning_rate": 9.988655623871075e-06, + "loss": 0.8129, + "step": 1689 + }, + { + "epoch": 0.09111494500754798, + "grad_norm": 0.8950080871582031, + "learning_rate": 9.988641346152692e-06, + "loss": 0.8674, + "step": 1690 + }, + { + "epoch": 0.0911688591761915, + 
"grad_norm": 0.9107043147087097, + "learning_rate": 9.988627059465403e-06, + "loss": 0.9507, + "step": 1691 + }, + { + "epoch": 0.09122277334483503, + "grad_norm": 0.8210874795913696, + "learning_rate": 9.988612763809237e-06, + "loss": 0.8913, + "step": 1692 + }, + { + "epoch": 0.09127668751347855, + "grad_norm": 1.0306476354599, + "learning_rate": 9.988598459184217e-06, + "loss": 0.8589, + "step": 1693 + }, + { + "epoch": 0.09133060168212206, + "grad_norm": 0.7582615613937378, + "learning_rate": 9.98858414559037e-06, + "loss": 0.7482, + "step": 1694 + }, + { + "epoch": 0.09138451585076558, + "grad_norm": 0.8572216629981995, + "learning_rate": 9.98856982302772e-06, + "loss": 0.822, + "step": 1695 + }, + { + "epoch": 0.0914384300194091, + "grad_norm": 0.9358139038085938, + "learning_rate": 9.988555491496297e-06, + "loss": 0.8298, + "step": 1696 + }, + { + "epoch": 0.09149234418805262, + "grad_norm": 0.8705672025680542, + "learning_rate": 9.988541150996123e-06, + "loss": 0.8818, + "step": 1697 + }, + { + "epoch": 0.09154625835669614, + "grad_norm": 0.9081273674964905, + "learning_rate": 9.988526801527224e-06, + "loss": 0.8994, + "step": 1698 + }, + { + "epoch": 0.09160017252533965, + "grad_norm": 0.7358905076980591, + "learning_rate": 9.988512443089627e-06, + "loss": 0.7752, + "step": 1699 + }, + { + "epoch": 0.09165408669398317, + "grad_norm": 0.8570963740348816, + "learning_rate": 9.988498075683357e-06, + "loss": 0.908, + "step": 1700 + }, + { + "epoch": 0.0917080008626267, + "grad_norm": 0.8998208045959473, + "learning_rate": 9.988483699308442e-06, + "loss": 0.8561, + "step": 1701 + }, + { + "epoch": 0.09176191503127022, + "grad_norm": 0.7481779456138611, + "learning_rate": 9.988469313964903e-06, + "loss": 0.7184, + "step": 1702 + }, + { + "epoch": 0.09181582919991374, + "grad_norm": 1.052809238433838, + "learning_rate": 9.988454919652772e-06, + "loss": 0.8579, + "step": 1703 + }, + { + "epoch": 0.09186974336855726, + "grad_norm": 0.8492130637168884, + 
"learning_rate": 9.988440516372071e-06, + "loss": 0.8796, + "step": 1704 + }, + { + "epoch": 0.09192365753720078, + "grad_norm": 0.884483277797699, + "learning_rate": 9.988426104122826e-06, + "loss": 0.8781, + "step": 1705 + }, + { + "epoch": 0.0919775717058443, + "grad_norm": 0.8844857811927795, + "learning_rate": 9.988411682905065e-06, + "loss": 0.8981, + "step": 1706 + }, + { + "epoch": 0.09203148587448781, + "grad_norm": 0.906216025352478, + "learning_rate": 9.988397252718811e-06, + "loss": 0.8741, + "step": 1707 + }, + { + "epoch": 0.09208540004313133, + "grad_norm": 0.8565787076950073, + "learning_rate": 9.988382813564092e-06, + "loss": 0.7358, + "step": 1708 + }, + { + "epoch": 0.09213931421177485, + "grad_norm": 0.8036391139030457, + "learning_rate": 9.988368365440935e-06, + "loss": 0.7966, + "step": 1709 + }, + { + "epoch": 0.09219322838041838, + "grad_norm": 1.1708556413650513, + "learning_rate": 9.988353908349361e-06, + "loss": 0.8385, + "step": 1710 + }, + { + "epoch": 0.0922471425490619, + "grad_norm": 0.8536746501922607, + "learning_rate": 9.988339442289403e-06, + "loss": 0.7387, + "step": 1711 + }, + { + "epoch": 0.09230105671770542, + "grad_norm": 0.8376518487930298, + "learning_rate": 9.988324967261083e-06, + "loss": 0.8537, + "step": 1712 + }, + { + "epoch": 0.09235497088634893, + "grad_norm": 0.8793227672576904, + "learning_rate": 9.988310483264426e-06, + "loss": 0.8028, + "step": 1713 + }, + { + "epoch": 0.09240888505499245, + "grad_norm": 0.8186830282211304, + "learning_rate": 9.98829599029946e-06, + "loss": 0.8478, + "step": 1714 + }, + { + "epoch": 0.09246279922363597, + "grad_norm": 0.8845428824424744, + "learning_rate": 9.98828148836621e-06, + "loss": 0.8524, + "step": 1715 + }, + { + "epoch": 0.09251671339227949, + "grad_norm": 1.0494492053985596, + "learning_rate": 9.988266977464704e-06, + "loss": 0.8542, + "step": 1716 + }, + { + "epoch": 0.092570627560923, + "grad_norm": 0.8876493573188782, + "learning_rate": 9.988252457594966e-06, + 
"loss": 0.8989, + "step": 1717 + }, + { + "epoch": 0.09262454172956654, + "grad_norm": 0.8787088394165039, + "learning_rate": 9.988237928757024e-06, + "loss": 0.8214, + "step": 1718 + }, + { + "epoch": 0.09267845589821005, + "grad_norm": 1.069684624671936, + "learning_rate": 9.988223390950901e-06, + "loss": 0.9714, + "step": 1719 + }, + { + "epoch": 0.09273237006685357, + "grad_norm": 0.7957501411437988, + "learning_rate": 9.988208844176626e-06, + "loss": 0.7562, + "step": 1720 + }, + { + "epoch": 0.09278628423549709, + "grad_norm": 0.8354908227920532, + "learning_rate": 9.988194288434225e-06, + "loss": 0.7494, + "step": 1721 + }, + { + "epoch": 0.09284019840414061, + "grad_norm": 0.8205936551094055, + "learning_rate": 9.988179723723722e-06, + "loss": 0.7727, + "step": 1722 + }, + { + "epoch": 0.09289411257278413, + "grad_norm": 0.8364951014518738, + "learning_rate": 9.988165150045146e-06, + "loss": 0.861, + "step": 1723 + }, + { + "epoch": 0.09294802674142764, + "grad_norm": 0.8664119243621826, + "learning_rate": 9.98815056739852e-06, + "loss": 0.8512, + "step": 1724 + }, + { + "epoch": 0.09300194091007116, + "grad_norm": 0.9565482139587402, + "learning_rate": 9.988135975783874e-06, + "loss": 0.8606, + "step": 1725 + }, + { + "epoch": 0.09305585507871468, + "grad_norm": 0.8696085214614868, + "learning_rate": 9.988121375201232e-06, + "loss": 0.8614, + "step": 1726 + }, + { + "epoch": 0.09310976924735821, + "grad_norm": 0.8623467683792114, + "learning_rate": 9.98810676565062e-06, + "loss": 0.8547, + "step": 1727 + }, + { + "epoch": 0.09316368341600173, + "grad_norm": 0.8284831047058105, + "learning_rate": 9.988092147132064e-06, + "loss": 0.8376, + "step": 1728 + }, + { + "epoch": 0.09321759758464525, + "grad_norm": 0.7768245339393616, + "learning_rate": 9.988077519645591e-06, + "loss": 0.7472, + "step": 1729 + }, + { + "epoch": 0.09327151175328877, + "grad_norm": 1.221225619316101, + "learning_rate": 9.988062883191228e-06, + "loss": 0.9052, + "step": 1730 + }, + { + 
"epoch": 0.09332542592193228, + "grad_norm": 1.0027954578399658, + "learning_rate": 9.988048237769002e-06, + "loss": 0.9411, + "step": 1731 + }, + { + "epoch": 0.0933793400905758, + "grad_norm": 0.8029824495315552, + "learning_rate": 9.988033583378937e-06, + "loss": 0.8141, + "step": 1732 + }, + { + "epoch": 0.09343325425921932, + "grad_norm": 0.8081389665603638, + "learning_rate": 9.98801892002106e-06, + "loss": 0.7977, + "step": 1733 + }, + { + "epoch": 0.09348716842786284, + "grad_norm": 0.887438952922821, + "learning_rate": 9.988004247695398e-06, + "loss": 0.8574, + "step": 1734 + }, + { + "epoch": 0.09354108259650636, + "grad_norm": 0.887238085269928, + "learning_rate": 9.987989566401977e-06, + "loss": 0.9041, + "step": 1735 + }, + { + "epoch": 0.09359499676514989, + "grad_norm": 0.9135997891426086, + "learning_rate": 9.987974876140822e-06, + "loss": 0.738, + "step": 1736 + }, + { + "epoch": 0.0936489109337934, + "grad_norm": 0.7749861478805542, + "learning_rate": 9.987960176911964e-06, + "loss": 0.773, + "step": 1737 + }, + { + "epoch": 0.09370282510243692, + "grad_norm": 0.7850096225738525, + "learning_rate": 9.987945468715425e-06, + "loss": 0.7924, + "step": 1738 + }, + { + "epoch": 0.09375673927108044, + "grad_norm": 0.8044145107269287, + "learning_rate": 9.987930751551231e-06, + "loss": 0.8196, + "step": 1739 + }, + { + "epoch": 0.09381065343972396, + "grad_norm": 0.8781464695930481, + "learning_rate": 9.987916025419413e-06, + "loss": 0.9337, + "step": 1740 + }, + { + "epoch": 0.09386456760836748, + "grad_norm": 1.0839952230453491, + "learning_rate": 9.987901290319993e-06, + "loss": 0.8092, + "step": 1741 + }, + { + "epoch": 0.093918481777011, + "grad_norm": 0.7910736203193665, + "learning_rate": 9.987886546253e-06, + "loss": 0.8775, + "step": 1742 + }, + { + "epoch": 0.09397239594565451, + "grad_norm": 0.887287974357605, + "learning_rate": 9.98787179321846e-06, + "loss": 0.8271, + "step": 1743 + }, + { + "epoch": 0.09402631011429803, + "grad_norm": 
1.1318427324295044, + "learning_rate": 9.987857031216397e-06, + "loss": 0.8328, + "step": 1744 + }, + { + "epoch": 0.09408022428294156, + "grad_norm": 0.8660401105880737, + "learning_rate": 9.987842260246842e-06, + "loss": 0.8647, + "step": 1745 + }, + { + "epoch": 0.09413413845158508, + "grad_norm": 0.9396790266036987, + "learning_rate": 9.98782748030982e-06, + "loss": 0.9373, + "step": 1746 + }, + { + "epoch": 0.0941880526202286, + "grad_norm": 0.8715323209762573, + "learning_rate": 9.987812691405353e-06, + "loss": 0.8621, + "step": 1747 + }, + { + "epoch": 0.09424196678887212, + "grad_norm": 0.7882347106933594, + "learning_rate": 9.987797893533475e-06, + "loss": 0.7283, + "step": 1748 + }, + { + "epoch": 0.09429588095751563, + "grad_norm": 0.9641733765602112, + "learning_rate": 9.987783086694208e-06, + "loss": 0.8038, + "step": 1749 + }, + { + "epoch": 0.09434979512615915, + "grad_norm": 0.8808518648147583, + "learning_rate": 9.98776827088758e-06, + "loss": 0.8072, + "step": 1750 + }, + { + "epoch": 0.09440370929480267, + "grad_norm": 0.7720713019371033, + "learning_rate": 9.987753446113618e-06, + "loss": 0.7786, + "step": 1751 + }, + { + "epoch": 0.09445762346344619, + "grad_norm": 1.0507936477661133, + "learning_rate": 9.987738612372346e-06, + "loss": 0.9302, + "step": 1752 + }, + { + "epoch": 0.0945115376320897, + "grad_norm": 0.7705017328262329, + "learning_rate": 9.987723769663795e-06, + "loss": 0.7366, + "step": 1753 + }, + { + "epoch": 0.09456545180073324, + "grad_norm": 0.82464200258255, + "learning_rate": 9.987708917987989e-06, + "loss": 0.8063, + "step": 1754 + }, + { + "epoch": 0.09461936596937676, + "grad_norm": 0.9387272000312805, + "learning_rate": 9.987694057344953e-06, + "loss": 0.8108, + "step": 1755 + }, + { + "epoch": 0.09467328013802027, + "grad_norm": 0.9161933064460754, + "learning_rate": 9.987679187734717e-06, + "loss": 0.8331, + "step": 1756 + }, + { + "epoch": 0.09472719430666379, + "grad_norm": 0.9379769563674927, + "learning_rate": 
9.987664309157306e-06, + "loss": 0.9064, + "step": 1757 + }, + { + "epoch": 0.09478110847530731, + "grad_norm": 0.9597976803779602, + "learning_rate": 9.987649421612748e-06, + "loss": 0.7785, + "step": 1758 + }, + { + "epoch": 0.09483502264395083, + "grad_norm": 0.8689720630645752, + "learning_rate": 9.98763452510107e-06, + "loss": 0.7828, + "step": 1759 + }, + { + "epoch": 0.09488893681259435, + "grad_norm": 0.9207726716995239, + "learning_rate": 9.987619619622296e-06, + "loss": 0.7853, + "step": 1760 + }, + { + "epoch": 0.09494285098123786, + "grad_norm": 0.8130320310592651, + "learning_rate": 9.987604705176455e-06, + "loss": 0.858, + "step": 1761 + }, + { + "epoch": 0.09499676514988138, + "grad_norm": 0.9004638195037842, + "learning_rate": 9.987589781763574e-06, + "loss": 0.8148, + "step": 1762 + }, + { + "epoch": 0.09505067931852491, + "grad_norm": 0.8554181456565857, + "learning_rate": 9.987574849383678e-06, + "loss": 0.8103, + "step": 1763 + }, + { + "epoch": 0.09510459348716843, + "grad_norm": 0.9148527979850769, + "learning_rate": 9.987559908036797e-06, + "loss": 0.9467, + "step": 1764 + }, + { + "epoch": 0.09515850765581195, + "grad_norm": 0.890083909034729, + "learning_rate": 9.987544957722956e-06, + "loss": 0.8338, + "step": 1765 + }, + { + "epoch": 0.09521242182445547, + "grad_norm": 0.8118012547492981, + "learning_rate": 9.98752999844218e-06, + "loss": 0.8355, + "step": 1766 + }, + { + "epoch": 0.09526633599309899, + "grad_norm": 0.8115151524543762, + "learning_rate": 9.987515030194498e-06, + "loss": 0.9172, + "step": 1767 + }, + { + "epoch": 0.0953202501617425, + "grad_norm": 0.8750082850456238, + "learning_rate": 9.987500052979938e-06, + "loss": 0.8301, + "step": 1768 + }, + { + "epoch": 0.09537416433038602, + "grad_norm": 0.9008756875991821, + "learning_rate": 9.987485066798525e-06, + "loss": 0.8642, + "step": 1769 + }, + { + "epoch": 0.09542807849902954, + "grad_norm": 0.8335922956466675, + "learning_rate": 9.987470071650287e-06, + "loss": 0.8466, 
+ "step": 1770 + }, + { + "epoch": 0.09548199266767307, + "grad_norm": 0.8604272603988647, + "learning_rate": 9.987455067535249e-06, + "loss": 0.8801, + "step": 1771 + }, + { + "epoch": 0.09553590683631659, + "grad_norm": 0.889854371547699, + "learning_rate": 9.98744005445344e-06, + "loss": 0.8804, + "step": 1772 + }, + { + "epoch": 0.09558982100496011, + "grad_norm": 0.8756876587867737, + "learning_rate": 9.987425032404887e-06, + "loss": 0.8367, + "step": 1773 + }, + { + "epoch": 0.09564373517360363, + "grad_norm": 0.9071298837661743, + "learning_rate": 9.987410001389616e-06, + "loss": 0.8875, + "step": 1774 + }, + { + "epoch": 0.09569764934224714, + "grad_norm": 0.8214284777641296, + "learning_rate": 9.987394961407654e-06, + "loss": 0.7859, + "step": 1775 + }, + { + "epoch": 0.09575156351089066, + "grad_norm": 0.940034806728363, + "learning_rate": 9.98737991245903e-06, + "loss": 0.8272, + "step": 1776 + }, + { + "epoch": 0.09580547767953418, + "grad_norm": 0.8156501054763794, + "learning_rate": 9.987364854543768e-06, + "loss": 0.7831, + "step": 1777 + }, + { + "epoch": 0.0958593918481777, + "grad_norm": 0.8450450301170349, + "learning_rate": 9.987349787661898e-06, + "loss": 0.7888, + "step": 1778 + }, + { + "epoch": 0.09591330601682121, + "grad_norm": 0.8143148422241211, + "learning_rate": 9.987334711813446e-06, + "loss": 0.7593, + "step": 1779 + }, + { + "epoch": 0.09596722018546475, + "grad_norm": 1.0489457845687866, + "learning_rate": 9.987319626998437e-06, + "loss": 0.8248, + "step": 1780 + }, + { + "epoch": 0.09602113435410826, + "grad_norm": 0.9584689140319824, + "learning_rate": 9.987304533216901e-06, + "loss": 0.9025, + "step": 1781 + }, + { + "epoch": 0.09607504852275178, + "grad_norm": 0.8366501331329346, + "learning_rate": 9.987289430468862e-06, + "loss": 0.7513, + "step": 1782 + }, + { + "epoch": 0.0961289626913953, + "grad_norm": 0.9896461963653564, + "learning_rate": 9.987274318754352e-06, + "loss": 0.8598, + "step": 1783 + }, + { + "epoch": 
0.09618287686003882, + "grad_norm": 1.1904568672180176, + "learning_rate": 9.987259198073396e-06, + "loss": 0.9143, + "step": 1784 + }, + { + "epoch": 0.09623679102868234, + "grad_norm": 0.8100086450576782, + "learning_rate": 9.987244068426019e-06, + "loss": 0.7733, + "step": 1785 + }, + { + "epoch": 0.09629070519732585, + "grad_norm": 0.7814387083053589, + "learning_rate": 9.987228929812249e-06, + "loss": 0.7735, + "step": 1786 + }, + { + "epoch": 0.09634461936596937, + "grad_norm": 0.8880924582481384, + "learning_rate": 9.987213782232115e-06, + "loss": 0.8377, + "step": 1787 + }, + { + "epoch": 0.09639853353461289, + "grad_norm": 0.8739203810691833, + "learning_rate": 9.987198625685643e-06, + "loss": 0.8851, + "step": 1788 + }, + { + "epoch": 0.09645244770325642, + "grad_norm": 0.8984062671661377, + "learning_rate": 9.987183460172861e-06, + "loss": 0.8773, + "step": 1789 + }, + { + "epoch": 0.09650636187189994, + "grad_norm": 1.2485296726226807, + "learning_rate": 9.987168285693795e-06, + "loss": 0.787, + "step": 1790 + }, + { + "epoch": 0.09656027604054346, + "grad_norm": 0.8414161205291748, + "learning_rate": 9.987153102248474e-06, + "loss": 0.7895, + "step": 1791 + }, + { + "epoch": 0.09661419020918698, + "grad_norm": 0.7895180583000183, + "learning_rate": 9.987137909836924e-06, + "loss": 0.7592, + "step": 1792 + }, + { + "epoch": 0.0966681043778305, + "grad_norm": 1.0752787590026855, + "learning_rate": 9.987122708459173e-06, + "loss": 0.8472, + "step": 1793 + }, + { + "epoch": 0.09672201854647401, + "grad_norm": 0.9069424271583557, + "learning_rate": 9.987107498115247e-06, + "loss": 0.8746, + "step": 1794 + }, + { + "epoch": 0.09677593271511753, + "grad_norm": 0.8566716909408569, + "learning_rate": 9.987092278805175e-06, + "loss": 0.7604, + "step": 1795 + }, + { + "epoch": 0.09682984688376105, + "grad_norm": 0.833852231502533, + "learning_rate": 9.987077050528983e-06, + "loss": 0.8645, + "step": 1796 + }, + { + "epoch": 0.09688376105240457, + "grad_norm": 
0.8439596891403198, + "learning_rate": 9.9870618132867e-06, + "loss": 0.7673, + "step": 1797 + }, + { + "epoch": 0.0969376752210481, + "grad_norm": 0.9743669629096985, + "learning_rate": 9.987046567078352e-06, + "loss": 0.7754, + "step": 1798 + }, + { + "epoch": 0.09699158938969162, + "grad_norm": 0.9291634559631348, + "learning_rate": 9.987031311903968e-06, + "loss": 0.8431, + "step": 1799 + }, + { + "epoch": 0.09704550355833513, + "grad_norm": 1.169450283050537, + "learning_rate": 9.987016047763571e-06, + "loss": 0.9321, + "step": 1800 + }, + { + "epoch": 0.09709941772697865, + "grad_norm": 0.7758163809776306, + "learning_rate": 9.987000774657195e-06, + "loss": 0.7832, + "step": 1801 + }, + { + "epoch": 0.09715333189562217, + "grad_norm": 0.9673672914505005, + "learning_rate": 9.986985492584863e-06, + "loss": 0.9822, + "step": 1802 + }, + { + "epoch": 0.09720724606426569, + "grad_norm": 1.1516417264938354, + "learning_rate": 9.986970201546605e-06, + "loss": 0.9956, + "step": 1803 + }, + { + "epoch": 0.0972611602329092, + "grad_norm": 0.9660587906837463, + "learning_rate": 9.986954901542445e-06, + "loss": 0.8248, + "step": 1804 + }, + { + "epoch": 0.09731507440155272, + "grad_norm": 0.9452739953994751, + "learning_rate": 9.986939592572413e-06, + "loss": 0.8805, + "step": 1805 + }, + { + "epoch": 0.09736898857019624, + "grad_norm": 0.9339364171028137, + "learning_rate": 9.986924274636538e-06, + "loss": 0.8819, + "step": 1806 + }, + { + "epoch": 0.09742290273883977, + "grad_norm": 0.9344542026519775, + "learning_rate": 9.986908947734844e-06, + "loss": 0.8531, + "step": 1807 + }, + { + "epoch": 0.09747681690748329, + "grad_norm": 0.8910528421401978, + "learning_rate": 9.986893611867362e-06, + "loss": 0.8949, + "step": 1808 + }, + { + "epoch": 0.09753073107612681, + "grad_norm": 0.8484895825386047, + "learning_rate": 9.986878267034115e-06, + "loss": 0.8028, + "step": 1809 + }, + { + "epoch": 0.09758464524477033, + "grad_norm": 1.0784810781478882, + "learning_rate": 
9.986862913235135e-06, + "loss": 0.9564, + "step": 1810 + }, + { + "epoch": 0.09763855941341384, + "grad_norm": 0.8350296020507812, + "learning_rate": 9.98684755047045e-06, + "loss": 0.8672, + "step": 1811 + }, + { + "epoch": 0.09769247358205736, + "grad_norm": 0.8558050990104675, + "learning_rate": 9.986832178740084e-06, + "loss": 0.8538, + "step": 1812 + }, + { + "epoch": 0.09774638775070088, + "grad_norm": 0.8633396029472351, + "learning_rate": 9.986816798044066e-06, + "loss": 0.8356, + "step": 1813 + }, + { + "epoch": 0.0978003019193444, + "grad_norm": 0.8256344199180603, + "learning_rate": 9.986801408382424e-06, + "loss": 0.7552, + "step": 1814 + }, + { + "epoch": 0.09785421608798792, + "grad_norm": 0.872844398021698, + "learning_rate": 9.986786009755186e-06, + "loss": 0.9153, + "step": 1815 + }, + { + "epoch": 0.09790813025663145, + "grad_norm": 0.842241108417511, + "learning_rate": 9.986770602162378e-06, + "loss": 0.7965, + "step": 1816 + }, + { + "epoch": 0.09796204442527497, + "grad_norm": 0.9673634171485901, + "learning_rate": 9.98675518560403e-06, + "loss": 0.8317, + "step": 1817 + }, + { + "epoch": 0.09801595859391848, + "grad_norm": 0.8744896650314331, + "learning_rate": 9.98673976008017e-06, + "loss": 0.7342, + "step": 1818 + }, + { + "epoch": 0.098069872762562, + "grad_norm": 0.7830422520637512, + "learning_rate": 9.986724325590825e-06, + "loss": 0.721, + "step": 1819 + }, + { + "epoch": 0.09812378693120552, + "grad_norm": 1.0335441827774048, + "learning_rate": 9.986708882136021e-06, + "loss": 0.8088, + "step": 1820 + }, + { + "epoch": 0.09817770109984904, + "grad_norm": 0.841342568397522, + "learning_rate": 9.986693429715785e-06, + "loss": 0.8847, + "step": 1821 + }, + { + "epoch": 0.09823161526849256, + "grad_norm": 0.9405834674835205, + "learning_rate": 9.98667796833015e-06, + "loss": 0.8878, + "step": 1822 + }, + { + "epoch": 0.09828552943713607, + "grad_norm": 0.8358225226402283, + "learning_rate": 9.986662497979138e-06, + "loss": 0.7377, + 
"step": 1823 + }, + { + "epoch": 0.0983394436057796, + "grad_norm": 0.8844004273414612, + "learning_rate": 9.98664701866278e-06, + "loss": 0.7236, + "step": 1824 + }, + { + "epoch": 0.09839335777442312, + "grad_norm": 0.8165417313575745, + "learning_rate": 9.986631530381105e-06, + "loss": 0.819, + "step": 1825 + }, + { + "epoch": 0.09844727194306664, + "grad_norm": 0.9569553732872009, + "learning_rate": 9.986616033134137e-06, + "loss": 0.9337, + "step": 1826 + }, + { + "epoch": 0.09850118611171016, + "grad_norm": 0.8311771750450134, + "learning_rate": 9.986600526921907e-06, + "loss": 0.8516, + "step": 1827 + }, + { + "epoch": 0.09855510028035368, + "grad_norm": 0.9444357752799988, + "learning_rate": 9.986585011744441e-06, + "loss": 0.805, + "step": 1828 + }, + { + "epoch": 0.0986090144489972, + "grad_norm": 1.0128875970840454, + "learning_rate": 9.986569487601769e-06, + "loss": 0.8514, + "step": 1829 + }, + { + "epoch": 0.09866292861764071, + "grad_norm": 0.8973994255065918, + "learning_rate": 9.986553954493917e-06, + "loss": 0.7938, + "step": 1830 + }, + { + "epoch": 0.09871684278628423, + "grad_norm": 0.8571779131889343, + "learning_rate": 9.986538412420912e-06, + "loss": 0.7506, + "step": 1831 + }, + { + "epoch": 0.09877075695492775, + "grad_norm": 0.9053436517715454, + "learning_rate": 9.986522861382785e-06, + "loss": 0.8551, + "step": 1832 + }, + { + "epoch": 0.09882467112357128, + "grad_norm": 0.9941746592521667, + "learning_rate": 9.986507301379562e-06, + "loss": 0.8828, + "step": 1833 + }, + { + "epoch": 0.0988785852922148, + "grad_norm": 0.9620066285133362, + "learning_rate": 9.986491732411272e-06, + "loss": 0.8982, + "step": 1834 + }, + { + "epoch": 0.09893249946085832, + "grad_norm": 0.9470074772834778, + "learning_rate": 9.986476154477941e-06, + "loss": 0.8295, + "step": 1835 + }, + { + "epoch": 0.09898641362950183, + "grad_norm": 0.9962137937545776, + "learning_rate": 9.986460567579599e-06, + "loss": 0.8714, + "step": 1836 + }, + { + "epoch": 
0.09904032779814535, + "grad_norm": 0.8492829203605652, + "learning_rate": 9.986444971716273e-06, + "loss": 0.8234, + "step": 1837 + }, + { + "epoch": 0.09909424196678887, + "grad_norm": 0.9463719725608826, + "learning_rate": 9.986429366887994e-06, + "loss": 0.7769, + "step": 1838 + }, + { + "epoch": 0.09914815613543239, + "grad_norm": 0.8588153123855591, + "learning_rate": 9.986413753094786e-06, + "loss": 0.8883, + "step": 1839 + }, + { + "epoch": 0.0992020703040759, + "grad_norm": 0.7692183256149292, + "learning_rate": 9.986398130336677e-06, + "loss": 0.7691, + "step": 1840 + }, + { + "epoch": 0.09925598447271942, + "grad_norm": 0.8377199172973633, + "learning_rate": 9.986382498613699e-06, + "loss": 0.789, + "step": 1841 + }, + { + "epoch": 0.09930989864136296, + "grad_norm": 0.9783869385719299, + "learning_rate": 9.986366857925876e-06, + "loss": 0.8517, + "step": 1842 + }, + { + "epoch": 0.09936381281000647, + "grad_norm": 0.8233169913291931, + "learning_rate": 9.986351208273239e-06, + "loss": 0.8701, + "step": 1843 + }, + { + "epoch": 0.09941772697864999, + "grad_norm": 0.9393780827522278, + "learning_rate": 9.986335549655814e-06, + "loss": 0.8837, + "step": 1844 + }, + { + "epoch": 0.09947164114729351, + "grad_norm": 0.8517693877220154, + "learning_rate": 9.986319882073631e-06, + "loss": 0.9043, + "step": 1845 + }, + { + "epoch": 0.09952555531593703, + "grad_norm": 0.8296724557876587, + "learning_rate": 9.986304205526718e-06, + "loss": 0.7406, + "step": 1846 + }, + { + "epoch": 0.09957946948458055, + "grad_norm": 0.8372161388397217, + "learning_rate": 9.986288520015102e-06, + "loss": 0.7763, + "step": 1847 + }, + { + "epoch": 0.09963338365322406, + "grad_norm": 0.8086470365524292, + "learning_rate": 9.986272825538812e-06, + "loss": 0.8786, + "step": 1848 + }, + { + "epoch": 0.09968729782186758, + "grad_norm": 0.8562842011451721, + "learning_rate": 9.986257122097875e-06, + "loss": 0.8391, + "step": 1849 + }, + { + "epoch": 0.0997412119905111, + "grad_norm": 
0.9052720665931702, + "learning_rate": 9.986241409692321e-06, + "loss": 0.948, + "step": 1850 + }, + { + "epoch": 0.09979512615915463, + "grad_norm": 0.8220609426498413, + "learning_rate": 9.986225688322178e-06, + "loss": 0.8039, + "step": 1851 + }, + { + "epoch": 0.09984904032779815, + "grad_norm": 0.8018030524253845, + "learning_rate": 9.98620995798747e-06, + "loss": 0.7748, + "step": 1852 + }, + { + "epoch": 0.09990295449644167, + "grad_norm": 0.8150879144668579, + "learning_rate": 9.986194218688235e-06, + "loss": 0.7304, + "step": 1853 + }, + { + "epoch": 0.09995686866508519, + "grad_norm": 0.8677535653114319, + "learning_rate": 9.98617847042449e-06, + "loss": 0.8756, + "step": 1854 + }, + { + "epoch": 0.1000107828337287, + "grad_norm": 0.8889294862747192, + "learning_rate": 9.986162713196272e-06, + "loss": 0.8926, + "step": 1855 + }, + { + "epoch": 0.10006469700237222, + "grad_norm": 0.7618375420570374, + "learning_rate": 9.986146947003603e-06, + "loss": 0.7317, + "step": 1856 + }, + { + "epoch": 0.10011861117101574, + "grad_norm": 0.8775038719177246, + "learning_rate": 9.986131171846518e-06, + "loss": 0.8318, + "step": 1857 + }, + { + "epoch": 0.10017252533965926, + "grad_norm": 0.9671807289123535, + "learning_rate": 9.986115387725039e-06, + "loss": 0.7412, + "step": 1858 + }, + { + "epoch": 0.10022643950830278, + "grad_norm": 0.8808870911598206, + "learning_rate": 9.986099594639197e-06, + "loss": 0.8213, + "step": 1859 + }, + { + "epoch": 0.10028035367694631, + "grad_norm": 0.8104208707809448, + "learning_rate": 9.986083792589021e-06, + "loss": 0.8108, + "step": 1860 + }, + { + "epoch": 0.10033426784558983, + "grad_norm": 0.839911937713623, + "learning_rate": 9.986067981574538e-06, + "loss": 0.8391, + "step": 1861 + }, + { + "epoch": 0.10038818201423334, + "grad_norm": 0.8402823805809021, + "learning_rate": 9.986052161595778e-06, + "loss": 0.7434, + "step": 1862 + }, + { + "epoch": 0.10044209618287686, + "grad_norm": 0.7591431140899658, + "learning_rate": 
9.986036332652768e-06, + "loss": 0.763, + "step": 1863 + }, + { + "epoch": 0.10049601035152038, + "grad_norm": 0.8613053560256958, + "learning_rate": 9.986020494745538e-06, + "loss": 0.8324, + "step": 1864 + }, + { + "epoch": 0.1005499245201639, + "grad_norm": 0.8467068076133728, + "learning_rate": 9.986004647874117e-06, + "loss": 0.882, + "step": 1865 + }, + { + "epoch": 0.10060383868880741, + "grad_norm": 1.0717257261276245, + "learning_rate": 9.98598879203853e-06, + "loss": 0.9305, + "step": 1866 + }, + { + "epoch": 0.10065775285745093, + "grad_norm": 0.8680382370948792, + "learning_rate": 9.985972927238808e-06, + "loss": 0.7521, + "step": 1867 + }, + { + "epoch": 0.10071166702609445, + "grad_norm": 0.8465799689292908, + "learning_rate": 9.98595705347498e-06, + "loss": 0.8562, + "step": 1868 + }, + { + "epoch": 0.10076558119473798, + "grad_norm": 0.938218355178833, + "learning_rate": 9.985941170747072e-06, + "loss": 0.7737, + "step": 1869 + }, + { + "epoch": 0.1008194953633815, + "grad_norm": 0.8189761638641357, + "learning_rate": 9.985925279055117e-06, + "loss": 0.8502, + "step": 1870 + }, + { + "epoch": 0.10087340953202502, + "grad_norm": 0.915703535079956, + "learning_rate": 9.985909378399138e-06, + "loss": 0.9576, + "step": 1871 + }, + { + "epoch": 0.10092732370066854, + "grad_norm": 0.7837297916412354, + "learning_rate": 9.985893468779168e-06, + "loss": 0.7091, + "step": 1872 + }, + { + "epoch": 0.10098123786931205, + "grad_norm": 0.7426577806472778, + "learning_rate": 9.985877550195234e-06, + "loss": 0.768, + "step": 1873 + }, + { + "epoch": 0.10103515203795557, + "grad_norm": 0.9437102675437927, + "learning_rate": 9.985861622647364e-06, + "loss": 0.8308, + "step": 1874 + }, + { + "epoch": 0.10108906620659909, + "grad_norm": 0.7381339073181152, + "learning_rate": 9.985845686135586e-06, + "loss": 0.7206, + "step": 1875 + }, + { + "epoch": 0.10114298037524261, + "grad_norm": 0.8478738069534302, + "learning_rate": 9.985829740659932e-06, + "loss": 0.7512, + 
"step": 1876 + }, + { + "epoch": 0.10119689454388614, + "grad_norm": 0.8331673741340637, + "learning_rate": 9.985813786220428e-06, + "loss": 0.8281, + "step": 1877 + }, + { + "epoch": 0.10125080871252966, + "grad_norm": 0.7703354954719543, + "learning_rate": 9.985797822817102e-06, + "loss": 0.7313, + "step": 1878 + }, + { + "epoch": 0.10130472288117318, + "grad_norm": 0.9182866811752319, + "learning_rate": 9.985781850449985e-06, + "loss": 0.8365, + "step": 1879 + }, + { + "epoch": 0.1013586370498167, + "grad_norm": 0.8285559415817261, + "learning_rate": 9.985765869119104e-06, + "loss": 0.8439, + "step": 1880 + }, + { + "epoch": 0.10141255121846021, + "grad_norm": 0.8400557041168213, + "learning_rate": 9.985749878824488e-06, + "loss": 0.8011, + "step": 1881 + }, + { + "epoch": 0.10146646538710373, + "grad_norm": 0.9225326776504517, + "learning_rate": 9.985733879566168e-06, + "loss": 0.8402, + "step": 1882 + }, + { + "epoch": 0.10152037955574725, + "grad_norm": 0.9194371700286865, + "learning_rate": 9.985717871344172e-06, + "loss": 0.8245, + "step": 1883 + }, + { + "epoch": 0.10157429372439077, + "grad_norm": 0.7443274259567261, + "learning_rate": 9.985701854158525e-06, + "loss": 0.7708, + "step": 1884 + }, + { + "epoch": 0.10162820789303428, + "grad_norm": 1.1139355897903442, + "learning_rate": 9.985685828009259e-06, + "loss": 0.8384, + "step": 1885 + }, + { + "epoch": 0.10168212206167782, + "grad_norm": 0.8835493326187134, + "learning_rate": 9.985669792896402e-06, + "loss": 0.8063, + "step": 1886 + }, + { + "epoch": 0.10173603623032133, + "grad_norm": 0.8012663125991821, + "learning_rate": 9.985653748819983e-06, + "loss": 0.8393, + "step": 1887 + }, + { + "epoch": 0.10178995039896485, + "grad_norm": 0.8092807531356812, + "learning_rate": 9.985637695780033e-06, + "loss": 0.7631, + "step": 1888 + }, + { + "epoch": 0.10184386456760837, + "grad_norm": 1.7357290983200073, + "learning_rate": 9.985621633776577e-06, + "loss": 0.8067, + "step": 1889 + }, + { + "epoch": 
0.10189777873625189, + "grad_norm": 0.8562015891075134, + "learning_rate": 9.985605562809646e-06, + "loss": 0.8543, + "step": 1890 + }, + { + "epoch": 0.1019516929048954, + "grad_norm": 0.9570844769477844, + "learning_rate": 9.98558948287927e-06, + "loss": 0.7778, + "step": 1891 + }, + { + "epoch": 0.10200560707353892, + "grad_norm": 0.748468279838562, + "learning_rate": 9.985573393985475e-06, + "loss": 0.6559, + "step": 1892 + }, + { + "epoch": 0.10205952124218244, + "grad_norm": 1.004490852355957, + "learning_rate": 9.98555729612829e-06, + "loss": 0.8453, + "step": 1893 + }, + { + "epoch": 0.10211343541082596, + "grad_norm": 0.9566166996955872, + "learning_rate": 9.985541189307749e-06, + "loss": 0.8984, + "step": 1894 + }, + { + "epoch": 0.10216734957946949, + "grad_norm": 0.8624017834663391, + "learning_rate": 9.985525073523874e-06, + "loss": 0.7442, + "step": 1895 + }, + { + "epoch": 0.10222126374811301, + "grad_norm": 1.0596553087234497, + "learning_rate": 9.9855089487767e-06, + "loss": 0.778, + "step": 1896 + }, + { + "epoch": 0.10227517791675653, + "grad_norm": 0.8003553152084351, + "learning_rate": 9.985492815066252e-06, + "loss": 0.7513, + "step": 1897 + }, + { + "epoch": 0.10232909208540004, + "grad_norm": 1.0067185163497925, + "learning_rate": 9.98547667239256e-06, + "loss": 0.8878, + "step": 1898 + }, + { + "epoch": 0.10238300625404356, + "grad_norm": 0.8398754596710205, + "learning_rate": 9.985460520755654e-06, + "loss": 0.8222, + "step": 1899 + }, + { + "epoch": 0.10243692042268708, + "grad_norm": 0.9688541293144226, + "learning_rate": 9.985444360155563e-06, + "loss": 0.8304, + "step": 1900 + }, + { + "epoch": 0.1024908345913306, + "grad_norm": 0.8848011493682861, + "learning_rate": 9.985428190592314e-06, + "loss": 0.7853, + "step": 1901 + }, + { + "epoch": 0.10254474875997412, + "grad_norm": 0.9240403771400452, + "learning_rate": 9.985412012065937e-06, + "loss": 0.9058, + "step": 1902 + }, + { + "epoch": 0.10259866292861763, + "grad_norm": 
0.814194917678833, + "learning_rate": 9.985395824576463e-06, + "loss": 0.7775, + "step": 1903 + }, + { + "epoch": 0.10265257709726117, + "grad_norm": 0.9210302233695984, + "learning_rate": 9.98537962812392e-06, + "loss": 0.9288, + "step": 1904 + }, + { + "epoch": 0.10270649126590468, + "grad_norm": 0.8850705027580261, + "learning_rate": 9.985363422708336e-06, + "loss": 0.9036, + "step": 1905 + }, + { + "epoch": 0.1027604054345482, + "grad_norm": 0.8312196731567383, + "learning_rate": 9.985347208329742e-06, + "loss": 0.811, + "step": 1906 + }, + { + "epoch": 0.10281431960319172, + "grad_norm": 1.1294670104980469, + "learning_rate": 9.985330984988164e-06, + "loss": 0.9775, + "step": 1907 + }, + { + "epoch": 0.10286823377183524, + "grad_norm": 0.7980399131774902, + "learning_rate": 9.985314752683635e-06, + "loss": 0.7786, + "step": 1908 + }, + { + "epoch": 0.10292214794047876, + "grad_norm": 0.8291264176368713, + "learning_rate": 9.985298511416181e-06, + "loss": 0.7028, + "step": 1909 + }, + { + "epoch": 0.10297606210912227, + "grad_norm": 0.8284684419631958, + "learning_rate": 9.985282261185833e-06, + "loss": 0.8043, + "step": 1910 + }, + { + "epoch": 0.10302997627776579, + "grad_norm": 0.8680904507637024, + "learning_rate": 9.985266001992622e-06, + "loss": 0.8274, + "step": 1911 + }, + { + "epoch": 0.10308389044640931, + "grad_norm": 0.7380900979042053, + "learning_rate": 9.985249733836573e-06, + "loss": 0.6991, + "step": 1912 + }, + { + "epoch": 0.10313780461505284, + "grad_norm": 0.8572129011154175, + "learning_rate": 9.985233456717718e-06, + "loss": 0.7751, + "step": 1913 + }, + { + "epoch": 0.10319171878369636, + "grad_norm": 0.8797627687454224, + "learning_rate": 9.985217170636085e-06, + "loss": 0.8681, + "step": 1914 + }, + { + "epoch": 0.10324563295233988, + "grad_norm": 0.9301999807357788, + "learning_rate": 9.985200875591704e-06, + "loss": 0.6208, + "step": 1915 + }, + { + "epoch": 0.1032995471209834, + "grad_norm": 0.8296228647232056, + "learning_rate": 
9.985184571584606e-06, + "loss": 0.8027, + "step": 1916 + }, + { + "epoch": 0.10335346128962691, + "grad_norm": 0.8241246342658997, + "learning_rate": 9.985168258614815e-06, + "loss": 0.8223, + "step": 1917 + }, + { + "epoch": 0.10340737545827043, + "grad_norm": 0.9633389115333557, + "learning_rate": 9.985151936682367e-06, + "loss": 0.9037, + "step": 1918 + }, + { + "epoch": 0.10346128962691395, + "grad_norm": 0.8903288245201111, + "learning_rate": 9.985135605787286e-06, + "loss": 0.8949, + "step": 1919 + }, + { + "epoch": 0.10351520379555747, + "grad_norm": 0.8670981526374817, + "learning_rate": 9.985119265929604e-06, + "loss": 0.7094, + "step": 1920 + }, + { + "epoch": 0.10356911796420099, + "grad_norm": 0.9681735038757324, + "learning_rate": 9.985102917109351e-06, + "loss": 0.9617, + "step": 1921 + }, + { + "epoch": 0.10362303213284452, + "grad_norm": 0.9229291081428528, + "learning_rate": 9.985086559326555e-06, + "loss": 0.9384, + "step": 1922 + }, + { + "epoch": 0.10367694630148803, + "grad_norm": 0.8501392602920532, + "learning_rate": 9.985070192581245e-06, + "loss": 0.8647, + "step": 1923 + }, + { + "epoch": 0.10373086047013155, + "grad_norm": 1.4047728776931763, + "learning_rate": 9.985053816873452e-06, + "loss": 0.7905, + "step": 1924 + }, + { + "epoch": 0.10378477463877507, + "grad_norm": 1.154661774635315, + "learning_rate": 9.985037432203204e-06, + "loss": 0.8666, + "step": 1925 + }, + { + "epoch": 0.10383868880741859, + "grad_norm": 1.042126178741455, + "learning_rate": 9.985021038570532e-06, + "loss": 0.7736, + "step": 1926 + }, + { + "epoch": 0.1038926029760621, + "grad_norm": 0.7904629111289978, + "learning_rate": 9.985004635975464e-06, + "loss": 0.7247, + "step": 1927 + }, + { + "epoch": 0.10394651714470562, + "grad_norm": 0.8718095421791077, + "learning_rate": 9.984988224418029e-06, + "loss": 0.7792, + "step": 1928 + }, + { + "epoch": 0.10400043131334914, + "grad_norm": 0.870330274105072, + "learning_rate": 9.984971803898258e-06, + "loss": 0.7992, 
+ "step": 1929 + }, + { + "epoch": 0.10405434548199267, + "grad_norm": 0.8473007678985596, + "learning_rate": 9.98495537441618e-06, + "loss": 0.883, + "step": 1930 + }, + { + "epoch": 0.10410825965063619, + "grad_norm": 1.0333232879638672, + "learning_rate": 9.984938935971824e-06, + "loss": 0.9228, + "step": 1931 + }, + { + "epoch": 0.10416217381927971, + "grad_norm": 0.9389268159866333, + "learning_rate": 9.984922488565221e-06, + "loss": 0.7792, + "step": 1932 + }, + { + "epoch": 0.10421608798792323, + "grad_norm": 0.9977405667304993, + "learning_rate": 9.9849060321964e-06, + "loss": 0.7971, + "step": 1933 + }, + { + "epoch": 0.10427000215656675, + "grad_norm": 0.7879780530929565, + "learning_rate": 9.98488956686539e-06, + "loss": 0.8149, + "step": 1934 + }, + { + "epoch": 0.10432391632521026, + "grad_norm": 0.8149437308311462, + "learning_rate": 9.98487309257222e-06, + "loss": 0.8391, + "step": 1935 + }, + { + "epoch": 0.10437783049385378, + "grad_norm": 0.9226745367050171, + "learning_rate": 9.984856609316921e-06, + "loss": 0.9581, + "step": 1936 + }, + { + "epoch": 0.1044317446624973, + "grad_norm": 0.9190924167633057, + "learning_rate": 9.984840117099524e-06, + "loss": 0.8859, + "step": 1937 + }, + { + "epoch": 0.10448565883114082, + "grad_norm": 0.7996852397918701, + "learning_rate": 9.984823615920054e-06, + "loss": 0.7377, + "step": 1938 + }, + { + "epoch": 0.10453957299978435, + "grad_norm": 1.0055615901947021, + "learning_rate": 9.984807105778544e-06, + "loss": 1.4365, + "step": 1939 + }, + { + "epoch": 0.10459348716842787, + "grad_norm": 0.8595201969146729, + "learning_rate": 9.984790586675023e-06, + "loss": 0.807, + "step": 1940 + }, + { + "epoch": 0.10464740133707139, + "grad_norm": 0.9500923156738281, + "learning_rate": 9.984774058609522e-06, + "loss": 0.9378, + "step": 1941 + }, + { + "epoch": 0.1047013155057149, + "grad_norm": 0.8677893877029419, + "learning_rate": 9.98475752158207e-06, + "loss": 0.8399, + "step": 1942 + }, + { + "epoch": 
0.10475522967435842, + "grad_norm": 0.8256751298904419, + "learning_rate": 9.984740975592695e-06, + "loss": 0.8552, + "step": 1943 + }, + { + "epoch": 0.10480914384300194, + "grad_norm": 0.8910439610481262, + "learning_rate": 9.984724420641427e-06, + "loss": 0.9704, + "step": 1944 + }, + { + "epoch": 0.10486305801164546, + "grad_norm": 0.8732389807701111, + "learning_rate": 9.9847078567283e-06, + "loss": 0.8448, + "step": 1945 + }, + { + "epoch": 0.10491697218028898, + "grad_norm": 0.856151282787323, + "learning_rate": 9.984691283853338e-06, + "loss": 0.7403, + "step": 1946 + }, + { + "epoch": 0.1049708863489325, + "grad_norm": 0.8741405010223389, + "learning_rate": 9.984674702016573e-06, + "loss": 0.8913, + "step": 1947 + }, + { + "epoch": 0.10502480051757603, + "grad_norm": 0.9214139580726624, + "learning_rate": 9.984658111218036e-06, + "loss": 0.8901, + "step": 1948 + }, + { + "epoch": 0.10507871468621954, + "grad_norm": 0.9773908853530884, + "learning_rate": 9.984641511457757e-06, + "loss": 0.7979, + "step": 1949 + }, + { + "epoch": 0.10513262885486306, + "grad_norm": 0.9136568903923035, + "learning_rate": 9.984624902735765e-06, + "loss": 0.9019, + "step": 1950 + }, + { + "epoch": 0.10518654302350658, + "grad_norm": 0.857468843460083, + "learning_rate": 9.984608285052087e-06, + "loss": 0.7663, + "step": 1951 + }, + { + "epoch": 0.1052404571921501, + "grad_norm": 0.8473180532455444, + "learning_rate": 9.984591658406756e-06, + "loss": 0.8137, + "step": 1952 + }, + { + "epoch": 0.10529437136079361, + "grad_norm": 0.8932186961174011, + "learning_rate": 9.984575022799805e-06, + "loss": 0.8859, + "step": 1953 + }, + { + "epoch": 0.10534828552943713, + "grad_norm": 0.8191091418266296, + "learning_rate": 9.984558378231257e-06, + "loss": 0.8111, + "step": 1954 + }, + { + "epoch": 0.10540219969808065, + "grad_norm": 0.8452546000480652, + "learning_rate": 9.984541724701147e-06, + "loss": 0.8563, + "step": 1955 + }, + { + "epoch": 0.10545611386672417, + "grad_norm": 
0.8053101897239685, + "learning_rate": 9.984525062209502e-06, + "loss": 0.8166, + "step": 1956 + }, + { + "epoch": 0.1055100280353677, + "grad_norm": 0.7936314344406128, + "learning_rate": 9.984508390756354e-06, + "loss": 0.8446, + "step": 1957 + }, + { + "epoch": 0.10556394220401122, + "grad_norm": 0.7867884635925293, + "learning_rate": 9.984491710341733e-06, + "loss": 0.7719, + "step": 1958 + }, + { + "epoch": 0.10561785637265474, + "grad_norm": 0.8387873768806458, + "learning_rate": 9.984475020965667e-06, + "loss": 0.842, + "step": 1959 + }, + { + "epoch": 0.10567177054129825, + "grad_norm": 0.8028631806373596, + "learning_rate": 9.984458322628188e-06, + "loss": 0.7673, + "step": 1960 + }, + { + "epoch": 0.10572568470994177, + "grad_norm": 0.765836238861084, + "learning_rate": 9.984441615329323e-06, + "loss": 0.7383, + "step": 1961 + }, + { + "epoch": 0.10577959887858529, + "grad_norm": 0.8619019389152527, + "learning_rate": 9.984424899069106e-06, + "loss": 0.8076, + "step": 1962 + }, + { + "epoch": 0.10583351304722881, + "grad_norm": 1.1085911989212036, + "learning_rate": 9.984408173847565e-06, + "loss": 0.9379, + "step": 1963 + }, + { + "epoch": 0.10588742721587233, + "grad_norm": 0.7861249446868896, + "learning_rate": 9.98439143966473e-06, + "loss": 0.7531, + "step": 1964 + }, + { + "epoch": 0.10594134138451584, + "grad_norm": 0.8964807391166687, + "learning_rate": 9.984374696520633e-06, + "loss": 0.7991, + "step": 1965 + }, + { + "epoch": 0.10599525555315938, + "grad_norm": 0.720808207988739, + "learning_rate": 9.984357944415302e-06, + "loss": 0.7171, + "step": 1966 + }, + { + "epoch": 0.1060491697218029, + "grad_norm": 0.9870907068252563, + "learning_rate": 9.984341183348766e-06, + "loss": 0.8168, + "step": 1967 + }, + { + "epoch": 0.10610308389044641, + "grad_norm": 0.7987208366394043, + "learning_rate": 9.984324413321057e-06, + "loss": 0.817, + "step": 1968 + }, + { + "epoch": 0.10615699805908993, + "grad_norm": 0.7737677097320557, + "learning_rate": 
9.984307634332206e-06, + "loss": 0.855, + "step": 1969 + }, + { + "epoch": 0.10621091222773345, + "grad_norm": 0.9125123620033264, + "learning_rate": 9.984290846382243e-06, + "loss": 0.8059, + "step": 1970 + }, + { + "epoch": 0.10626482639637697, + "grad_norm": 0.8460454344749451, + "learning_rate": 9.984274049471197e-06, + "loss": 0.7415, + "step": 1971 + }, + { + "epoch": 0.10631874056502048, + "grad_norm": 0.8322888016700745, + "learning_rate": 9.984257243599096e-06, + "loss": 0.793, + "step": 1972 + }, + { + "epoch": 0.106372654733664, + "grad_norm": 0.7797715067863464, + "learning_rate": 9.984240428765975e-06, + "loss": 0.7324, + "step": 1973 + }, + { + "epoch": 0.10642656890230752, + "grad_norm": 0.847457766532898, + "learning_rate": 9.98422360497186e-06, + "loss": 0.7949, + "step": 1974 + }, + { + "epoch": 0.10648048307095105, + "grad_norm": 0.8471247553825378, + "learning_rate": 9.984206772216785e-06, + "loss": 0.8368, + "step": 1975 + }, + { + "epoch": 0.10653439723959457, + "grad_norm": 0.879416823387146, + "learning_rate": 9.984189930500778e-06, + "loss": 0.7779, + "step": 1976 + }, + { + "epoch": 0.10658831140823809, + "grad_norm": 0.8355580568313599, + "learning_rate": 9.98417307982387e-06, + "loss": 0.7741, + "step": 1977 + }, + { + "epoch": 0.1066422255768816, + "grad_norm": 0.8388553857803345, + "learning_rate": 9.98415622018609e-06, + "loss": 0.7839, + "step": 1978 + }, + { + "epoch": 0.10669613974552512, + "grad_norm": 0.7899215221405029, + "learning_rate": 9.98413935158747e-06, + "loss": 0.7419, + "step": 1979 + }, + { + "epoch": 0.10675005391416864, + "grad_norm": 0.9422525763511658, + "learning_rate": 9.98412247402804e-06, + "loss": 0.7977, + "step": 1980 + }, + { + "epoch": 0.10680396808281216, + "grad_norm": 0.8084313869476318, + "learning_rate": 9.984105587507831e-06, + "loss": 0.6813, + "step": 1981 + }, + { + "epoch": 0.10685788225145568, + "grad_norm": 0.9860095977783203, + "learning_rate": 9.98408869202687e-06, + "loss": 0.8934, + 
"step": 1982 + }, + { + "epoch": 0.10691179642009921, + "grad_norm": 0.9511064887046814, + "learning_rate": 9.98407178758519e-06, + "loss": 0.8438, + "step": 1983 + }, + { + "epoch": 0.10696571058874273, + "grad_norm": 0.9021103978157043, + "learning_rate": 9.984054874182822e-06, + "loss": 0.854, + "step": 1984 + }, + { + "epoch": 0.10701962475738624, + "grad_norm": 0.8343318104743958, + "learning_rate": 9.984037951819796e-06, + "loss": 0.8075, + "step": 1985 + }, + { + "epoch": 0.10707353892602976, + "grad_norm": 0.8592053651809692, + "learning_rate": 9.984021020496141e-06, + "loss": 0.8431, + "step": 1986 + }, + { + "epoch": 0.10712745309467328, + "grad_norm": 0.8554633259773254, + "learning_rate": 9.98400408021189e-06, + "loss": 0.797, + "step": 1987 + }, + { + "epoch": 0.1071813672633168, + "grad_norm": 0.8476511240005493, + "learning_rate": 9.98398713096707e-06, + "loss": 0.834, + "step": 1988 + }, + { + "epoch": 0.10723528143196032, + "grad_norm": 0.8374871611595154, + "learning_rate": 9.983970172761715e-06, + "loss": 0.7934, + "step": 1989 + }, + { + "epoch": 0.10728919560060383, + "grad_norm": 0.8740583658218384, + "learning_rate": 9.983953205595853e-06, + "loss": 0.8945, + "step": 1990 + }, + { + "epoch": 0.10734310976924735, + "grad_norm": 0.8888646364212036, + "learning_rate": 9.983936229469514e-06, + "loss": 0.8582, + "step": 1991 + }, + { + "epoch": 0.10739702393789088, + "grad_norm": 0.7999173402786255, + "learning_rate": 9.983919244382732e-06, + "loss": 0.7906, + "step": 1992 + }, + { + "epoch": 0.1074509381065344, + "grad_norm": 0.8284609913825989, + "learning_rate": 9.983902250335532e-06, + "loss": 0.8282, + "step": 1993 + }, + { + "epoch": 0.10750485227517792, + "grad_norm": 0.8933084607124329, + "learning_rate": 9.98388524732795e-06, + "loss": 0.8332, + "step": 1994 + }, + { + "epoch": 0.10755876644382144, + "grad_norm": 1.1771386861801147, + "learning_rate": 9.983868235360017e-06, + "loss": 0.6624, + "step": 1995 + }, + { + "epoch": 
0.10761268061246496, + "grad_norm": 0.7977056503295898, + "learning_rate": 9.98385121443176e-06, + "loss": 0.7169, + "step": 1996 + }, + { + "epoch": 0.10766659478110847, + "grad_norm": 1.1132346391677856, + "learning_rate": 9.98383418454321e-06, + "loss": 0.8448, + "step": 1997 + }, + { + "epoch": 0.10772050894975199, + "grad_norm": 0.8148393034934998, + "learning_rate": 9.983817145694396e-06, + "loss": 0.7313, + "step": 1998 + }, + { + "epoch": 0.10777442311839551, + "grad_norm": 1.0594265460968018, + "learning_rate": 9.983800097885353e-06, + "loss": 0.9795, + "step": 1999 + }, + { + "epoch": 0.10782833728703903, + "grad_norm": 0.8699034452438354, + "learning_rate": 9.983783041116109e-06, + "loss": 0.8717, + "step": 2000 + }, + { + "epoch": 0.10788225145568256, + "grad_norm": 1.0455189943313599, + "learning_rate": 9.983765975386696e-06, + "loss": 0.898, + "step": 2001 + }, + { + "epoch": 0.10793616562432608, + "grad_norm": 1.0363630056381226, + "learning_rate": 9.983748900697143e-06, + "loss": 0.8404, + "step": 2002 + }, + { + "epoch": 0.1079900797929696, + "grad_norm": 0.7753402590751648, + "learning_rate": 9.983731817047482e-06, + "loss": 0.8416, + "step": 2003 + }, + { + "epoch": 0.10804399396161311, + "grad_norm": 0.7321370244026184, + "learning_rate": 9.983714724437744e-06, + "loss": 0.7051, + "step": 2004 + }, + { + "epoch": 0.10809790813025663, + "grad_norm": 0.8907992839813232, + "learning_rate": 9.983697622867959e-06, + "loss": 0.8347, + "step": 2005 + }, + { + "epoch": 0.10815182229890015, + "grad_norm": 0.8662189841270447, + "learning_rate": 9.983680512338157e-06, + "loss": 0.7704, + "step": 2006 + }, + { + "epoch": 0.10820573646754367, + "grad_norm": 0.9187548756599426, + "learning_rate": 9.983663392848371e-06, + "loss": 0.8926, + "step": 2007 + }, + { + "epoch": 0.10825965063618719, + "grad_norm": 1.0350191593170166, + "learning_rate": 9.983646264398629e-06, + "loss": 0.8253, + "step": 2008 + }, + { + "epoch": 0.1083135648048307, + "grad_norm": 
0.9566621780395508, + "learning_rate": 9.983629126988963e-06, + "loss": 0.8545, + "step": 2009 + }, + { + "epoch": 0.10836747897347423, + "grad_norm": 0.7644455432891846, + "learning_rate": 9.983611980619405e-06, + "loss": 0.707, + "step": 2010 + }, + { + "epoch": 0.10842139314211775, + "grad_norm": 0.7929621934890747, + "learning_rate": 9.983594825289983e-06, + "loss": 0.8123, + "step": 2011 + }, + { + "epoch": 0.10847530731076127, + "grad_norm": 0.8667447566986084, + "learning_rate": 9.983577661000732e-06, + "loss": 0.8371, + "step": 2012 + }, + { + "epoch": 0.10852922147940479, + "grad_norm": 0.9008684158325195, + "learning_rate": 9.98356048775168e-06, + "loss": 0.8088, + "step": 2013 + }, + { + "epoch": 0.1085831356480483, + "grad_norm": 0.8797710537910461, + "learning_rate": 9.983543305542858e-06, + "loss": 0.8315, + "step": 2014 + }, + { + "epoch": 0.10863704981669182, + "grad_norm": 1.0082249641418457, + "learning_rate": 9.983526114374296e-06, + "loss": 0.6944, + "step": 2015 + }, + { + "epoch": 0.10869096398533534, + "grad_norm": 0.8216932415962219, + "learning_rate": 9.983508914246027e-06, + "loss": 0.7704, + "step": 2016 + }, + { + "epoch": 0.10874487815397886, + "grad_norm": 0.7873802781105042, + "learning_rate": 9.983491705158082e-06, + "loss": 0.8269, + "step": 2017 + }, + { + "epoch": 0.10879879232262238, + "grad_norm": 0.9200018644332886, + "learning_rate": 9.983474487110492e-06, + "loss": 0.8736, + "step": 2018 + }, + { + "epoch": 0.10885270649126591, + "grad_norm": 0.8780434727668762, + "learning_rate": 9.983457260103284e-06, + "loss": 0.8959, + "step": 2019 + }, + { + "epoch": 0.10890662065990943, + "grad_norm": 0.8503702878952026, + "learning_rate": 9.983440024136493e-06, + "loss": 0.874, + "step": 2020 + }, + { + "epoch": 0.10896053482855295, + "grad_norm": 0.8003312349319458, + "learning_rate": 9.98342277921015e-06, + "loss": 0.8053, + "step": 2021 + }, + { + "epoch": 0.10901444899719646, + "grad_norm": 0.8508152961730957, + "learning_rate": 
9.983405525324284e-06, + "loss": 0.8349, + "step": 2022 + }, + { + "epoch": 0.10906836316583998, + "grad_norm": 0.7947866320610046, + "learning_rate": 9.983388262478928e-06, + "loss": 0.7969, + "step": 2023 + }, + { + "epoch": 0.1091222773344835, + "grad_norm": 0.7566391229629517, + "learning_rate": 9.98337099067411e-06, + "loss": 0.7485, + "step": 2024 + }, + { + "epoch": 0.10917619150312702, + "grad_norm": 0.7484708428382874, + "learning_rate": 9.983353709909865e-06, + "loss": 0.7223, + "step": 2025 + }, + { + "epoch": 0.10923010567177054, + "grad_norm": 0.7474842667579651, + "learning_rate": 9.983336420186223e-06, + "loss": 0.7643, + "step": 2026 + }, + { + "epoch": 0.10928401984041405, + "grad_norm": 0.9116804003715515, + "learning_rate": 9.983319121503212e-06, + "loss": 0.9259, + "step": 2027 + }, + { + "epoch": 0.10933793400905759, + "grad_norm": 0.7918151617050171, + "learning_rate": 9.983301813860866e-06, + "loss": 0.8006, + "step": 2028 + }, + { + "epoch": 0.1093918481777011, + "grad_norm": 0.8043256998062134, + "learning_rate": 9.983284497259216e-06, + "loss": 0.7776, + "step": 2029 + }, + { + "epoch": 0.10944576234634462, + "grad_norm": 0.7829573154449463, + "learning_rate": 9.983267171698292e-06, + "loss": 0.7518, + "step": 2030 + }, + { + "epoch": 0.10949967651498814, + "grad_norm": 0.9080957174301147, + "learning_rate": 9.983249837178126e-06, + "loss": 0.777, + "step": 2031 + }, + { + "epoch": 0.10955359068363166, + "grad_norm": 0.9077693223953247, + "learning_rate": 9.983232493698748e-06, + "loss": 0.7412, + "step": 2032 + }, + { + "epoch": 0.10960750485227518, + "grad_norm": 0.7891800403594971, + "learning_rate": 9.98321514126019e-06, + "loss": 0.8089, + "step": 2033 + }, + { + "epoch": 0.1096614190209187, + "grad_norm": 0.8350703716278076, + "learning_rate": 9.983197779862485e-06, + "loss": 0.8414, + "step": 2034 + }, + { + "epoch": 0.10971533318956221, + "grad_norm": 0.8714777231216431, + "learning_rate": 9.983180409505663e-06, + "loss": 0.7355, + 
"step": 2035 + }, + { + "epoch": 0.10976924735820574, + "grad_norm": 0.8524130582809448, + "learning_rate": 9.98316303018975e-06, + "loss": 0.8611, + "step": 2036 + }, + { + "epoch": 0.10982316152684926, + "grad_norm": 0.8570566177368164, + "learning_rate": 9.983145641914787e-06, + "loss": 0.799, + "step": 2037 + }, + { + "epoch": 0.10987707569549278, + "grad_norm": 0.8222963213920593, + "learning_rate": 9.983128244680797e-06, + "loss": 0.8302, + "step": 2038 + }, + { + "epoch": 0.1099309898641363, + "grad_norm": 0.7977816462516785, + "learning_rate": 9.983110838487818e-06, + "loss": 0.8475, + "step": 2039 + }, + { + "epoch": 0.10998490403277981, + "grad_norm": 0.7925818562507629, + "learning_rate": 9.983093423335875e-06, + "loss": 0.7176, + "step": 2040 + }, + { + "epoch": 0.11003881820142333, + "grad_norm": 0.8456152081489563, + "learning_rate": 9.983075999225002e-06, + "loss": 0.785, + "step": 2041 + }, + { + "epoch": 0.11009273237006685, + "grad_norm": 0.8691622018814087, + "learning_rate": 9.98305856615523e-06, + "loss": 0.8871, + "step": 2042 + }, + { + "epoch": 0.11014664653871037, + "grad_norm": 0.9402886629104614, + "learning_rate": 9.983041124126593e-06, + "loss": 0.8239, + "step": 2043 + }, + { + "epoch": 0.11020056070735389, + "grad_norm": 0.7975844144821167, + "learning_rate": 9.98302367313912e-06, + "loss": 0.7336, + "step": 2044 + }, + { + "epoch": 0.11025447487599742, + "grad_norm": 0.8384075164794922, + "learning_rate": 9.98300621319284e-06, + "loss": 0.9003, + "step": 2045 + }, + { + "epoch": 0.11030838904464094, + "grad_norm": 0.847994327545166, + "learning_rate": 9.98298874428779e-06, + "loss": 0.8611, + "step": 2046 + }, + { + "epoch": 0.11036230321328445, + "grad_norm": 0.801159143447876, + "learning_rate": 9.982971266423996e-06, + "loss": 0.7967, + "step": 2047 + }, + { + "epoch": 0.11041621738192797, + "grad_norm": 0.8316680192947388, + "learning_rate": 9.982953779601492e-06, + "loss": 0.8644, + "step": 2048 + }, + { + "epoch": 
0.11047013155057149, + "grad_norm": 0.9387392401695251, + "learning_rate": 9.982936283820311e-06, + "loss": 0.916, + "step": 2049 + }, + { + "epoch": 0.11052404571921501, + "grad_norm": 0.8682491779327393, + "learning_rate": 9.982918779080481e-06, + "loss": 0.8267, + "step": 2050 + }, + { + "epoch": 0.11057795988785853, + "grad_norm": 0.8443827629089355, + "learning_rate": 9.982901265382034e-06, + "loss": 0.8129, + "step": 2051 + }, + { + "epoch": 0.11063187405650204, + "grad_norm": 0.8612427115440369, + "learning_rate": 9.982883742725005e-06, + "loss": 0.9203, + "step": 2052 + }, + { + "epoch": 0.11068578822514556, + "grad_norm": 0.786834716796875, + "learning_rate": 9.98286621110942e-06, + "loss": 0.7731, + "step": 2053 + }, + { + "epoch": 0.1107397023937891, + "grad_norm": 0.8566606044769287, + "learning_rate": 9.982848670535316e-06, + "loss": 0.8111, + "step": 2054 + }, + { + "epoch": 0.11079361656243261, + "grad_norm": 0.7485222816467285, + "learning_rate": 9.982831121002722e-06, + "loss": 0.722, + "step": 2055 + }, + { + "epoch": 0.11084753073107613, + "grad_norm": 0.7441151738166809, + "learning_rate": 9.98281356251167e-06, + "loss": 0.7081, + "step": 2056 + }, + { + "epoch": 0.11090144489971965, + "grad_norm": 0.8212536573410034, + "learning_rate": 9.98279599506219e-06, + "loss": 0.8572, + "step": 2057 + }, + { + "epoch": 0.11095535906836317, + "grad_norm": 0.8686707019805908, + "learning_rate": 9.982778418654315e-06, + "loss": 0.8553, + "step": 2058 + }, + { + "epoch": 0.11100927323700668, + "grad_norm": 0.8908647298812866, + "learning_rate": 9.982760833288079e-06, + "loss": 0.9059, + "step": 2059 + }, + { + "epoch": 0.1110631874056502, + "grad_norm": 0.9393401741981506, + "learning_rate": 9.982743238963508e-06, + "loss": 0.8574, + "step": 2060 + }, + { + "epoch": 0.11111710157429372, + "grad_norm": 0.9027063250541687, + "learning_rate": 9.982725635680638e-06, + "loss": 0.7717, + "step": 2061 + }, + { + "epoch": 0.11117101574293724, + "grad_norm": 
0.7742587924003601, + "learning_rate": 9.982708023439498e-06, + "loss": 0.6618, + "step": 2062 + }, + { + "epoch": 0.11122492991158077, + "grad_norm": 0.8025707602500916, + "learning_rate": 9.982690402240124e-06, + "loss": 0.7263, + "step": 2063 + }, + { + "epoch": 0.11127884408022429, + "grad_norm": 0.8629397749900818, + "learning_rate": 9.982672772082541e-06, + "loss": 0.8222, + "step": 2064 + }, + { + "epoch": 0.1113327582488678, + "grad_norm": 0.8332691788673401, + "learning_rate": 9.982655132966785e-06, + "loss": 0.8302, + "step": 2065 + }, + { + "epoch": 0.11138667241751132, + "grad_norm": 0.8381907939910889, + "learning_rate": 9.982637484892889e-06, + "loss": 0.8638, + "step": 2066 + }, + { + "epoch": 0.11144058658615484, + "grad_norm": 1.0945167541503906, + "learning_rate": 9.982619827860882e-06, + "loss": 0.8866, + "step": 2067 + }, + { + "epoch": 0.11149450075479836, + "grad_norm": 0.8755025267601013, + "learning_rate": 9.982602161870795e-06, + "loss": 0.8587, + "step": 2068 + }, + { + "epoch": 0.11154841492344188, + "grad_norm": 0.8665636777877808, + "learning_rate": 9.982584486922664e-06, + "loss": 0.8309, + "step": 2069 + }, + { + "epoch": 0.1116023290920854, + "grad_norm": 0.8764104247093201, + "learning_rate": 9.982566803016516e-06, + "loss": 0.9003, + "step": 2070 + }, + { + "epoch": 0.11165624326072891, + "grad_norm": 1.1225675344467163, + "learning_rate": 9.982549110152387e-06, + "loss": 0.8897, + "step": 2071 + }, + { + "epoch": 0.11171015742937244, + "grad_norm": 0.7883412837982178, + "learning_rate": 9.982531408330304e-06, + "loss": 0.7104, + "step": 2072 + }, + { + "epoch": 0.11176407159801596, + "grad_norm": 0.8683668971061707, + "learning_rate": 9.982513697550303e-06, + "loss": 0.831, + "step": 2073 + }, + { + "epoch": 0.11181798576665948, + "grad_norm": 0.9139745831489563, + "learning_rate": 9.982495977812415e-06, + "loss": 0.7492, + "step": 2074 + }, + { + "epoch": 0.111871899935303, + "grad_norm": 0.8651925921440125, + "learning_rate": 
9.98247824911667e-06, + "loss": 0.8385, + "step": 2075 + }, + { + "epoch": 0.11192581410394652, + "grad_norm": 0.9110192656517029, + "learning_rate": 9.982460511463102e-06, + "loss": 0.8513, + "step": 2076 + }, + { + "epoch": 0.11197972827259003, + "grad_norm": 0.8511810302734375, + "learning_rate": 9.982442764851742e-06, + "loss": 0.8352, + "step": 2077 + }, + { + "epoch": 0.11203364244123355, + "grad_norm": 0.8981106877326965, + "learning_rate": 9.982425009282622e-06, + "loss": 0.7837, + "step": 2078 + }, + { + "epoch": 0.11208755660987707, + "grad_norm": 0.7660240530967712, + "learning_rate": 9.982407244755771e-06, + "loss": 0.6994, + "step": 2079 + }, + { + "epoch": 0.11214147077852059, + "grad_norm": 0.830569863319397, + "learning_rate": 9.982389471271228e-06, + "loss": 0.7756, + "step": 2080 + }, + { + "epoch": 0.11219538494716412, + "grad_norm": 0.8888838887214661, + "learning_rate": 9.982371688829018e-06, + "loss": 0.7302, + "step": 2081 + }, + { + "epoch": 0.11224929911580764, + "grad_norm": 0.823513388633728, + "learning_rate": 9.982353897429176e-06, + "loss": 0.8357, + "step": 2082 + }, + { + "epoch": 0.11230321328445116, + "grad_norm": 0.8353226780891418, + "learning_rate": 9.982336097071734e-06, + "loss": 0.7939, + "step": 2083 + }, + { + "epoch": 0.11235712745309467, + "grad_norm": 1.0246703624725342, + "learning_rate": 9.982318287756725e-06, + "loss": 0.9416, + "step": 2084 + }, + { + "epoch": 0.11241104162173819, + "grad_norm": 0.9405194520950317, + "learning_rate": 9.982300469484178e-06, + "loss": 0.8296, + "step": 2085 + }, + { + "epoch": 0.11246495579038171, + "grad_norm": 0.905885636806488, + "learning_rate": 9.982282642254126e-06, + "loss": 0.8181, + "step": 2086 + }, + { + "epoch": 0.11251886995902523, + "grad_norm": 0.8098746538162231, + "learning_rate": 9.982264806066604e-06, + "loss": 0.7372, + "step": 2087 + }, + { + "epoch": 0.11257278412766875, + "grad_norm": 1.2416350841522217, + "learning_rate": 9.98224696092164e-06, + "loss": 0.8984, 
+ "step": 2088 + }, + { + "epoch": 0.11262669829631228, + "grad_norm": 0.8675969839096069, + "learning_rate": 9.98222910681927e-06, + "loss": 0.8417, + "step": 2089 + }, + { + "epoch": 0.1126806124649558, + "grad_norm": 1.063124179840088, + "learning_rate": 9.982211243759522e-06, + "loss": 0.9227, + "step": 2090 + }, + { + "epoch": 0.11273452663359931, + "grad_norm": 0.9010531902313232, + "learning_rate": 9.98219337174243e-06, + "loss": 0.9547, + "step": 2091 + }, + { + "epoch": 0.11278844080224283, + "grad_norm": 0.7843347191810608, + "learning_rate": 9.982175490768027e-06, + "loss": 0.8607, + "step": 2092 + }, + { + "epoch": 0.11284235497088635, + "grad_norm": 0.8451966643333435, + "learning_rate": 9.982157600836344e-06, + "loss": 0.8788, + "step": 2093 + }, + { + "epoch": 0.11289626913952987, + "grad_norm": 0.7359250783920288, + "learning_rate": 9.982139701947415e-06, + "loss": 0.7916, + "step": 2094 + }, + { + "epoch": 0.11295018330817339, + "grad_norm": 0.8133944869041443, + "learning_rate": 9.98212179410127e-06, + "loss": 0.8327, + "step": 2095 + }, + { + "epoch": 0.1130040974768169, + "grad_norm": 0.8658613562583923, + "learning_rate": 9.982103877297941e-06, + "loss": 0.7648, + "step": 2096 + }, + { + "epoch": 0.11305801164546042, + "grad_norm": 0.8523211479187012, + "learning_rate": 9.982085951537463e-06, + "loss": 0.8618, + "step": 2097 + }, + { + "epoch": 0.11311192581410395, + "grad_norm": 0.9494971632957458, + "learning_rate": 9.982068016819867e-06, + "loss": 0.8116, + "step": 2098 + }, + { + "epoch": 0.11316583998274747, + "grad_norm": 0.797603964805603, + "learning_rate": 9.982050073145182e-06, + "loss": 0.7268, + "step": 2099 + }, + { + "epoch": 0.11321975415139099, + "grad_norm": 0.8662691712379456, + "learning_rate": 9.982032120513443e-06, + "loss": 0.8007, + "step": 2100 + }, + { + "epoch": 0.1132736683200345, + "grad_norm": 0.8377127051353455, + "learning_rate": 9.982014158924684e-06, + "loss": 0.813, + "step": 2101 + }, + { + "epoch": 
0.11332758248867802, + "grad_norm": 1.0051186084747314, + "learning_rate": 9.981996188378934e-06, + "loss": 0.921, + "step": 2102 + }, + { + "epoch": 0.11338149665732154, + "grad_norm": 0.7831799983978271, + "learning_rate": 9.981978208876228e-06, + "loss": 0.9197, + "step": 2103 + }, + { + "epoch": 0.11343541082596506, + "grad_norm": 1.0273268222808838, + "learning_rate": 9.981960220416595e-06, + "loss": 0.9144, + "step": 2104 + }, + { + "epoch": 0.11348932499460858, + "grad_norm": 0.8754317164421082, + "learning_rate": 9.981942223000072e-06, + "loss": 0.8359, + "step": 2105 + }, + { + "epoch": 0.1135432391632521, + "grad_norm": 0.7923420071601868, + "learning_rate": 9.981924216626686e-06, + "loss": 0.737, + "step": 2106 + }, + { + "epoch": 0.11359715333189563, + "grad_norm": 0.8651608824729919, + "learning_rate": 9.981906201296475e-06, + "loss": 0.7588, + "step": 2107 + }, + { + "epoch": 0.11365106750053915, + "grad_norm": 0.9219616651535034, + "learning_rate": 9.981888177009468e-06, + "loss": 0.8598, + "step": 2108 + }, + { + "epoch": 0.11370498166918266, + "grad_norm": 0.8936532139778137, + "learning_rate": 9.981870143765697e-06, + "loss": 0.7718, + "step": 2109 + }, + { + "epoch": 0.11375889583782618, + "grad_norm": 0.8959317803382874, + "learning_rate": 9.981852101565195e-06, + "loss": 0.794, + "step": 2110 + }, + { + "epoch": 0.1138128100064697, + "grad_norm": 0.8781943917274475, + "learning_rate": 9.981834050407997e-06, + "loss": 0.8045, + "step": 2111 + }, + { + "epoch": 0.11386672417511322, + "grad_norm": 0.8148792386054993, + "learning_rate": 9.981815990294131e-06, + "loss": 0.7398, + "step": 2112 + }, + { + "epoch": 0.11392063834375674, + "grad_norm": 0.8491646647453308, + "learning_rate": 9.981797921223633e-06, + "loss": 0.878, + "step": 2113 + }, + { + "epoch": 0.11397455251240025, + "grad_norm": 0.8166778087615967, + "learning_rate": 9.981779843196533e-06, + "loss": 0.918, + "step": 2114 + }, + { + "epoch": 0.11402846668104377, + "grad_norm": 
0.8016941547393799, + "learning_rate": 9.981761756212867e-06, + "loss": 0.7958, + "step": 2115 + }, + { + "epoch": 0.1140823808496873, + "grad_norm": 0.9108608961105347, + "learning_rate": 9.981743660272663e-06, + "loss": 0.8645, + "step": 2116 + }, + { + "epoch": 0.11413629501833082, + "grad_norm": 0.8930072784423828, + "learning_rate": 9.981725555375956e-06, + "loss": 0.842, + "step": 2117 + }, + { + "epoch": 0.11419020918697434, + "grad_norm": 0.75871342420578, + "learning_rate": 9.981707441522778e-06, + "loss": 0.7513, + "step": 2118 + }, + { + "epoch": 0.11424412335561786, + "grad_norm": 0.9924628734588623, + "learning_rate": 9.981689318713163e-06, + "loss": 0.8248, + "step": 2119 + }, + { + "epoch": 0.11429803752426138, + "grad_norm": 0.9345909953117371, + "learning_rate": 9.981671186947145e-06, + "loss": 0.7963, + "step": 2120 + }, + { + "epoch": 0.1143519516929049, + "grad_norm": 0.8094825148582458, + "learning_rate": 9.98165304622475e-06, + "loss": 0.8189, + "step": 2121 + }, + { + "epoch": 0.11440586586154841, + "grad_norm": 0.789262056350708, + "learning_rate": 9.981634896546017e-06, + "loss": 0.721, + "step": 2122 + }, + { + "epoch": 0.11445978003019193, + "grad_norm": 0.9279952645301819, + "learning_rate": 9.981616737910975e-06, + "loss": 0.8499, + "step": 2123 + }, + { + "epoch": 0.11451369419883545, + "grad_norm": 0.8332392573356628, + "learning_rate": 9.981598570319657e-06, + "loss": 0.8296, + "step": 2124 + }, + { + "epoch": 0.11456760836747898, + "grad_norm": 0.7957965731620789, + "learning_rate": 9.981580393772098e-06, + "loss": 0.7872, + "step": 2125 + }, + { + "epoch": 0.1146215225361225, + "grad_norm": 0.7587382197380066, + "learning_rate": 9.981562208268331e-06, + "loss": 0.721, + "step": 2126 + }, + { + "epoch": 0.11467543670476602, + "grad_norm": 0.7246111631393433, + "learning_rate": 9.981544013808385e-06, + "loss": 0.7965, + "step": 2127 + }, + { + "epoch": 0.11472935087340953, + "grad_norm": 0.9953028559684753, + "learning_rate": 
9.981525810392295e-06, + "loss": 0.7129, + "step": 2128 + }, + { + "epoch": 0.11478326504205305, + "grad_norm": 1.0731823444366455, + "learning_rate": 9.981507598020094e-06, + "loss": 0.8532, + "step": 2129 + }, + { + "epoch": 0.11483717921069657, + "grad_norm": 0.8425208926200867, + "learning_rate": 9.981489376691814e-06, + "loss": 0.8191, + "step": 2130 + }, + { + "epoch": 0.11489109337934009, + "grad_norm": 0.7841627597808838, + "learning_rate": 9.981471146407487e-06, + "loss": 0.7946, + "step": 2131 + }, + { + "epoch": 0.1149450075479836, + "grad_norm": 0.8923974633216858, + "learning_rate": 9.981452907167148e-06, + "loss": 0.8445, + "step": 2132 + }, + { + "epoch": 0.11499892171662712, + "grad_norm": 0.7729552984237671, + "learning_rate": 9.981434658970828e-06, + "loss": 0.7566, + "step": 2133 + }, + { + "epoch": 0.11505283588527065, + "grad_norm": 0.910899817943573, + "learning_rate": 9.98141640181856e-06, + "loss": 0.8236, + "step": 2134 + }, + { + "epoch": 0.11510675005391417, + "grad_norm": 0.8768936395645142, + "learning_rate": 9.981398135710377e-06, + "loss": 0.8929, + "step": 2135 + }, + { + "epoch": 0.11516066422255769, + "grad_norm": 0.9078627824783325, + "learning_rate": 9.981379860646313e-06, + "loss": 0.745, + "step": 2136 + }, + { + "epoch": 0.11521457839120121, + "grad_norm": 0.8225182890892029, + "learning_rate": 9.981361576626399e-06, + "loss": 0.8349, + "step": 2137 + }, + { + "epoch": 0.11526849255984473, + "grad_norm": 0.8092076778411865, + "learning_rate": 9.981343283650668e-06, + "loss": 0.8157, + "step": 2138 + }, + { + "epoch": 0.11532240672848824, + "grad_norm": 0.8253282308578491, + "learning_rate": 9.981324981719156e-06, + "loss": 0.7412, + "step": 2139 + }, + { + "epoch": 0.11537632089713176, + "grad_norm": 0.9668901562690735, + "learning_rate": 9.981306670831892e-06, + "loss": 0.7868, + "step": 2140 + }, + { + "epoch": 0.11543023506577528, + "grad_norm": 0.7919616103172302, + "learning_rate": 9.981288350988911e-06, + "loss": 0.7384, 
+ "step": 2141 + }, + { + "epoch": 0.11548414923441881, + "grad_norm": 0.8589178919792175, + "learning_rate": 9.981270022190244e-06, + "loss": 0.8352, + "step": 2142 + }, + { + "epoch": 0.11553806340306233, + "grad_norm": 0.8211520910263062, + "learning_rate": 9.981251684435926e-06, + "loss": 0.8124, + "step": 2143 + }, + { + "epoch": 0.11559197757170585, + "grad_norm": 0.911702573299408, + "learning_rate": 9.98123333772599e-06, + "loss": 0.8468, + "step": 2144 + }, + { + "epoch": 0.11564589174034937, + "grad_norm": 0.7934874892234802, + "learning_rate": 9.981214982060469e-06, + "loss": 0.8091, + "step": 2145 + }, + { + "epoch": 0.11569980590899288, + "grad_norm": 0.7407031655311584, + "learning_rate": 9.981196617439394e-06, + "loss": 0.7755, + "step": 2146 + }, + { + "epoch": 0.1157537200776364, + "grad_norm": 0.757688581943512, + "learning_rate": 9.9811782438628e-06, + "loss": 0.7468, + "step": 2147 + }, + { + "epoch": 0.11580763424627992, + "grad_norm": 1.0007857084274292, + "learning_rate": 9.981159861330717e-06, + "loss": 0.9108, + "step": 2148 + }, + { + "epoch": 0.11586154841492344, + "grad_norm": 1.300113558769226, + "learning_rate": 9.981141469843183e-06, + "loss": 0.8099, + "step": 2149 + }, + { + "epoch": 0.11591546258356696, + "grad_norm": 1.0352274179458618, + "learning_rate": 9.981123069400226e-06, + "loss": 0.801, + "step": 2150 + }, + { + "epoch": 0.11596937675221049, + "grad_norm": 0.9033756256103516, + "learning_rate": 9.981104660001885e-06, + "loss": 0.8789, + "step": 2151 + }, + { + "epoch": 0.116023290920854, + "grad_norm": 0.9051264524459839, + "learning_rate": 9.981086241648188e-06, + "loss": 0.8737, + "step": 2152 + }, + { + "epoch": 0.11607720508949752, + "grad_norm": 0.7855859398841858, + "learning_rate": 9.98106781433917e-06, + "loss": 0.7508, + "step": 2153 + }, + { + "epoch": 0.11613111925814104, + "grad_norm": 0.9001717567443848, + "learning_rate": 9.981049378074862e-06, + "loss": 0.6852, + "step": 2154 + }, + { + "epoch": 
0.11618503342678456, + "grad_norm": 0.8165149092674255, + "learning_rate": 9.9810309328553e-06, + "loss": 0.8755, + "step": 2155 + }, + { + "epoch": 0.11623894759542808, + "grad_norm": 0.8920814990997314, + "learning_rate": 9.981012478680517e-06, + "loss": 0.753, + "step": 2156 + }, + { + "epoch": 0.1162928617640716, + "grad_norm": 0.8186051249504089, + "learning_rate": 9.980994015550544e-06, + "loss": 0.8341, + "step": 2157 + }, + { + "epoch": 0.11634677593271511, + "grad_norm": 0.8103832602500916, + "learning_rate": 9.980975543465417e-06, + "loss": 0.8276, + "step": 2158 + }, + { + "epoch": 0.11640069010135863, + "grad_norm": 0.8752830028533936, + "learning_rate": 9.980957062425167e-06, + "loss": 0.8449, + "step": 2159 + }, + { + "epoch": 0.11645460427000216, + "grad_norm": 0.9748302698135376, + "learning_rate": 9.98093857242983e-06, + "loss": 0.8323, + "step": 2160 + }, + { + "epoch": 0.11650851843864568, + "grad_norm": 0.8948556184768677, + "learning_rate": 9.980920073479435e-06, + "loss": 0.7836, + "step": 2161 + }, + { + "epoch": 0.1165624326072892, + "grad_norm": 0.8715651035308838, + "learning_rate": 9.980901565574017e-06, + "loss": 0.7942, + "step": 2162 + }, + { + "epoch": 0.11661634677593272, + "grad_norm": 0.7667563557624817, + "learning_rate": 9.980883048713612e-06, + "loss": 0.7517, + "step": 2163 + }, + { + "epoch": 0.11667026094457623, + "grad_norm": 0.8058063387870789, + "learning_rate": 9.980864522898247e-06, + "loss": 0.7997, + "step": 2164 + }, + { + "epoch": 0.11672417511321975, + "grad_norm": 0.9300008416175842, + "learning_rate": 9.980845988127963e-06, + "loss": 0.856, + "step": 2165 + }, + { + "epoch": 0.11677808928186327, + "grad_norm": 0.8321848511695862, + "learning_rate": 9.98082744440279e-06, + "loss": 0.7483, + "step": 2166 + }, + { + "epoch": 0.11683200345050679, + "grad_norm": 0.9346274137496948, + "learning_rate": 9.98080889172276e-06, + "loss": 0.8149, + "step": 2167 + }, + { + "epoch": 0.1168859176191503, + "grad_norm": 
0.9119831919670105, + "learning_rate": 9.980790330087906e-06, + "loss": 0.8384, + "step": 2168 + }, + { + "epoch": 0.11693983178779384, + "grad_norm": 0.8416613936424255, + "learning_rate": 9.980771759498264e-06, + "loss": 0.776, + "step": 2169 + }, + { + "epoch": 0.11699374595643736, + "grad_norm": 0.765889048576355, + "learning_rate": 9.980753179953867e-06, + "loss": 0.7413, + "step": 2170 + }, + { + "epoch": 0.11704766012508087, + "grad_norm": 1.3491352796554565, + "learning_rate": 9.980734591454746e-06, + "loss": 0.7444, + "step": 2171 + }, + { + "epoch": 0.11710157429372439, + "grad_norm": 0.926618218421936, + "learning_rate": 9.980715994000936e-06, + "loss": 0.8495, + "step": 2172 + }, + { + "epoch": 0.11715548846236791, + "grad_norm": 0.7720175981521606, + "learning_rate": 9.98069738759247e-06, + "loss": 0.8238, + "step": 2173 + }, + { + "epoch": 0.11720940263101143, + "grad_norm": 0.9114102125167847, + "learning_rate": 9.980678772229385e-06, + "loss": 0.7805, + "step": 2174 + }, + { + "epoch": 0.11726331679965495, + "grad_norm": 0.778404712677002, + "learning_rate": 9.980660147911709e-06, + "loss": 0.7705, + "step": 2175 + }, + { + "epoch": 0.11731723096829846, + "grad_norm": 0.7945864200592041, + "learning_rate": 9.980641514639478e-06, + "loss": 0.7052, + "step": 2176 + }, + { + "epoch": 0.11737114513694198, + "grad_norm": 0.8246831297874451, + "learning_rate": 9.980622872412723e-06, + "loss": 0.8514, + "step": 2177 + }, + { + "epoch": 0.11742505930558551, + "grad_norm": 0.899563193321228, + "learning_rate": 9.980604221231482e-06, + "loss": 0.761, + "step": 2178 + }, + { + "epoch": 0.11747897347422903, + "grad_norm": 0.7277782559394836, + "learning_rate": 9.980585561095788e-06, + "loss": 0.6671, + "step": 2179 + }, + { + "epoch": 0.11753288764287255, + "grad_norm": 0.7977896928787231, + "learning_rate": 9.98056689200567e-06, + "loss": 0.8045, + "step": 2180 + }, + { + "epoch": 0.11758680181151607, + "grad_norm": 0.8606321811676025, + "learning_rate": 
9.980548213961165e-06, + "loss": 0.8232, + "step": 2181 + }, + { + "epoch": 0.11764071598015959, + "grad_norm": 0.769458532333374, + "learning_rate": 9.980529526962308e-06, + "loss": 0.729, + "step": 2182 + }, + { + "epoch": 0.1176946301488031, + "grad_norm": 1.1045739650726318, + "learning_rate": 9.98051083100913e-06, + "loss": 0.802, + "step": 2183 + }, + { + "epoch": 0.11774854431744662, + "grad_norm": 0.7568592429161072, + "learning_rate": 9.980492126101664e-06, + "loss": 0.7427, + "step": 2184 + }, + { + "epoch": 0.11780245848609014, + "grad_norm": 0.7503477931022644, + "learning_rate": 9.980473412239946e-06, + "loss": 0.7857, + "step": 2185 + }, + { + "epoch": 0.11785637265473366, + "grad_norm": 0.8330819606781006, + "learning_rate": 9.980454689424007e-06, + "loss": 0.7561, + "step": 2186 + }, + { + "epoch": 0.11791028682337719, + "grad_norm": 0.792736291885376, + "learning_rate": 9.980435957653884e-06, + "loss": 0.837, + "step": 2187 + }, + { + "epoch": 0.1179642009920207, + "grad_norm": 0.8983330130577087, + "learning_rate": 9.980417216929608e-06, + "loss": 0.8499, + "step": 2188 + }, + { + "epoch": 0.11801811516066422, + "grad_norm": 0.8700925707817078, + "learning_rate": 9.980398467251214e-06, + "loss": 0.9048, + "step": 2189 + }, + { + "epoch": 0.11807202932930774, + "grad_norm": 0.8873588442802429, + "learning_rate": 9.980379708618734e-06, + "loss": 0.7617, + "step": 2190 + }, + { + "epoch": 0.11812594349795126, + "grad_norm": 0.7786865234375, + "learning_rate": 9.980360941032204e-06, + "loss": 0.7828, + "step": 2191 + }, + { + "epoch": 0.11817985766659478, + "grad_norm": 0.796852171421051, + "learning_rate": 9.980342164491657e-06, + "loss": 0.7739, + "step": 2192 + }, + { + "epoch": 0.1182337718352383, + "grad_norm": 0.7752018570899963, + "learning_rate": 9.980323378997126e-06, + "loss": 0.6969, + "step": 2193 + }, + { + "epoch": 0.11828768600388181, + "grad_norm": 0.8607134819030762, + "learning_rate": 9.980304584548644e-06, + "loss": 0.8623, + 
"step": 2194 + }, + { + "epoch": 0.11834160017252535, + "grad_norm": 0.8624950051307678, + "learning_rate": 9.980285781146248e-06, + "loss": 0.8124, + "step": 2195 + }, + { + "epoch": 0.11839551434116886, + "grad_norm": 0.8951582908630371, + "learning_rate": 9.98026696878997e-06, + "loss": 0.8491, + "step": 2196 + }, + { + "epoch": 0.11844942850981238, + "grad_norm": 0.8373478055000305, + "learning_rate": 9.980248147479843e-06, + "loss": 0.7166, + "step": 2197 + }, + { + "epoch": 0.1185033426784559, + "grad_norm": 0.8007619976997375, + "learning_rate": 9.980229317215901e-06, + "loss": 0.8137, + "step": 2198 + }, + { + "epoch": 0.11855725684709942, + "grad_norm": 0.8464154601097107, + "learning_rate": 9.980210477998177e-06, + "loss": 0.7803, + "step": 2199 + }, + { + "epoch": 0.11861117101574294, + "grad_norm": 0.8384450078010559, + "learning_rate": 9.98019162982671e-06, + "loss": 0.8511, + "step": 2200 + }, + { + "epoch": 0.11866508518438645, + "grad_norm": 0.9059091210365295, + "learning_rate": 9.980172772701527e-06, + "loss": 0.8538, + "step": 2201 + }, + { + "epoch": 0.11871899935302997, + "grad_norm": 1.1080526113510132, + "learning_rate": 9.980153906622667e-06, + "loss": 1.0067, + "step": 2202 + }, + { + "epoch": 0.11877291352167349, + "grad_norm": 0.8379873633384705, + "learning_rate": 9.980135031590162e-06, + "loss": 0.8285, + "step": 2203 + }, + { + "epoch": 0.11882682769031702, + "grad_norm": 0.9143814444541931, + "learning_rate": 9.980116147604044e-06, + "loss": 0.8286, + "step": 2204 + }, + { + "epoch": 0.11888074185896054, + "grad_norm": 0.8619917631149292, + "learning_rate": 9.98009725466435e-06, + "loss": 0.8304, + "step": 2205 + }, + { + "epoch": 0.11893465602760406, + "grad_norm": 0.8470893502235413, + "learning_rate": 9.980078352771112e-06, + "loss": 0.8245, + "step": 2206 + }, + { + "epoch": 0.11898857019624758, + "grad_norm": 0.9560073614120483, + "learning_rate": 9.980059441924365e-06, + "loss": 0.8821, + "step": 2207 + }, + { + "epoch": 
0.1190424843648911, + "grad_norm": 0.8186134696006775, + "learning_rate": 9.980040522124143e-06, + "loss": 0.7166, + "step": 2208 + }, + { + "epoch": 0.11909639853353461, + "grad_norm": 0.8410859704017639, + "learning_rate": 9.980021593370481e-06, + "loss": 0.7465, + "step": 2209 + }, + { + "epoch": 0.11915031270217813, + "grad_norm": 0.9180718660354614, + "learning_rate": 9.980002655663412e-06, + "loss": 0.8508, + "step": 2210 + }, + { + "epoch": 0.11920422687082165, + "grad_norm": 0.8384451270103455, + "learning_rate": 9.979983709002967e-06, + "loss": 0.7723, + "step": 2211 + }, + { + "epoch": 0.11925814103946517, + "grad_norm": 0.815075159072876, + "learning_rate": 9.979964753389187e-06, + "loss": 0.7769, + "step": 2212 + }, + { + "epoch": 0.1193120552081087, + "grad_norm": 0.9130523800849915, + "learning_rate": 9.9799457888221e-06, + "loss": 0.8616, + "step": 2213 + }, + { + "epoch": 0.11936596937675222, + "grad_norm": 0.8262661099433899, + "learning_rate": 9.97992681530174e-06, + "loss": 0.7507, + "step": 2214 + }, + { + "epoch": 0.11941988354539573, + "grad_norm": 0.8962772488594055, + "learning_rate": 9.979907832828145e-06, + "loss": 0.8387, + "step": 2215 + }, + { + "epoch": 0.11947379771403925, + "grad_norm": 0.8966812491416931, + "learning_rate": 9.979888841401348e-06, + "loss": 0.8095, + "step": 2216 + }, + { + "epoch": 0.11952771188268277, + "grad_norm": 0.8484013676643372, + "learning_rate": 9.979869841021381e-06, + "loss": 0.8475, + "step": 2217 + }, + { + "epoch": 0.11958162605132629, + "grad_norm": 0.8858511447906494, + "learning_rate": 9.979850831688282e-06, + "loss": 0.8576, + "step": 2218 + }, + { + "epoch": 0.1196355402199698, + "grad_norm": 0.8044704794883728, + "learning_rate": 9.97983181340208e-06, + "loss": 0.8195, + "step": 2219 + }, + { + "epoch": 0.11968945438861332, + "grad_norm": 0.8463665246963501, + "learning_rate": 9.979812786162815e-06, + "loss": 0.8177, + "step": 2220 + }, + { + "epoch": 0.11974336855725684, + "grad_norm": 
0.8145734071731567, + "learning_rate": 9.979793749970517e-06, + "loss": 0.8307, + "step": 2221 + }, + { + "epoch": 0.11979728272590037, + "grad_norm": 0.7789961695671082, + "learning_rate": 9.97977470482522e-06, + "loss": 0.7854, + "step": 2222 + }, + { + "epoch": 0.11985119689454389, + "grad_norm": 0.858213484287262, + "learning_rate": 9.97975565072696e-06, + "loss": 0.8914, + "step": 2223 + }, + { + "epoch": 0.11990511106318741, + "grad_norm": 0.8503074645996094, + "learning_rate": 9.979736587675772e-06, + "loss": 0.8731, + "step": 2224 + }, + { + "epoch": 0.11995902523183093, + "grad_norm": 0.9815833568572998, + "learning_rate": 9.97971751567169e-06, + "loss": 0.8769, + "step": 2225 + }, + { + "epoch": 0.12001293940047444, + "grad_norm": 0.7897947430610657, + "learning_rate": 9.979698434714747e-06, + "loss": 0.8308, + "step": 2226 + }, + { + "epoch": 0.12006685356911796, + "grad_norm": 0.9122232794761658, + "learning_rate": 9.979679344804976e-06, + "loss": 0.8934, + "step": 2227 + }, + { + "epoch": 0.12012076773776148, + "grad_norm": 0.7640379071235657, + "learning_rate": 9.979660245942416e-06, + "loss": 0.8205, + "step": 2228 + }, + { + "epoch": 0.120174681906405, + "grad_norm": 0.8736944198608398, + "learning_rate": 9.979641138127097e-06, + "loss": 0.8522, + "step": 2229 + }, + { + "epoch": 0.12022859607504852, + "grad_norm": 0.8782697916030884, + "learning_rate": 9.979622021359054e-06, + "loss": 0.812, + "step": 2230 + }, + { + "epoch": 0.12028251024369205, + "grad_norm": 0.8260065317153931, + "learning_rate": 9.979602895638322e-06, + "loss": 0.768, + "step": 2231 + }, + { + "epoch": 0.12033642441233557, + "grad_norm": 0.8338255286216736, + "learning_rate": 9.979583760964939e-06, + "loss": 0.7747, + "step": 2232 + }, + { + "epoch": 0.12039033858097908, + "grad_norm": 0.8310086131095886, + "learning_rate": 9.979564617338933e-06, + "loss": 0.8206, + "step": 2233 + }, + { + "epoch": 0.1204442527496226, + "grad_norm": 0.8234529495239258, + "learning_rate": 
9.979545464760342e-06, + "loss": 0.847, + "step": 2234 + }, + { + "epoch": 0.12049816691826612, + "grad_norm": 0.9490135908126831, + "learning_rate": 9.9795263032292e-06, + "loss": 0.7277, + "step": 2235 + }, + { + "epoch": 0.12055208108690964, + "grad_norm": 0.8937979340553284, + "learning_rate": 9.97950713274554e-06, + "loss": 0.8714, + "step": 2236 + }, + { + "epoch": 0.12060599525555316, + "grad_norm": 0.7739347219467163, + "learning_rate": 9.9794879533094e-06, + "loss": 0.8009, + "step": 2237 + }, + { + "epoch": 0.12065990942419667, + "grad_norm": 0.8843472003936768, + "learning_rate": 9.979468764920812e-06, + "loss": 0.7748, + "step": 2238 + }, + { + "epoch": 0.12071382359284019, + "grad_norm": 0.815528154373169, + "learning_rate": 9.979449567579809e-06, + "loss": 0.7896, + "step": 2239 + }, + { + "epoch": 0.12076773776148372, + "grad_norm": 0.8802885413169861, + "learning_rate": 9.979430361286428e-06, + "loss": 0.8468, + "step": 2240 + }, + { + "epoch": 0.12082165193012724, + "grad_norm": 0.7907035946846008, + "learning_rate": 9.979411146040703e-06, + "loss": 0.7742, + "step": 2241 + }, + { + "epoch": 0.12087556609877076, + "grad_norm": 0.8344926238059998, + "learning_rate": 9.979391921842669e-06, + "loss": 0.8242, + "step": 2242 + }, + { + "epoch": 0.12092948026741428, + "grad_norm": 0.8011842370033264, + "learning_rate": 9.979372688692359e-06, + "loss": 0.7697, + "step": 2243 + }, + { + "epoch": 0.1209833944360578, + "grad_norm": 0.9063104391098022, + "learning_rate": 9.97935344658981e-06, + "loss": 0.8487, + "step": 2244 + }, + { + "epoch": 0.12103730860470131, + "grad_norm": 0.8313894867897034, + "learning_rate": 9.979334195535053e-06, + "loss": 0.8601, + "step": 2245 + }, + { + "epoch": 0.12109122277334483, + "grad_norm": 0.7892987728118896, + "learning_rate": 9.979314935528125e-06, + "loss": 0.7539, + "step": 2246 + }, + { + "epoch": 0.12114513694198835, + "grad_norm": 0.8141210079193115, + "learning_rate": 9.979295666569062e-06, + "loss": 0.8749, + 
"step": 2247 + }, + { + "epoch": 0.12119905111063188, + "grad_norm": 0.8218675851821899, + "learning_rate": 9.979276388657895e-06, + "loss": 0.743, + "step": 2248 + }, + { + "epoch": 0.1212529652792754, + "grad_norm": 0.8640784025192261, + "learning_rate": 9.979257101794661e-06, + "loss": 0.8876, + "step": 2249 + }, + { + "epoch": 0.12130687944791892, + "grad_norm": 0.8411698341369629, + "learning_rate": 9.979237805979395e-06, + "loss": 0.8692, + "step": 2250 + }, + { + "epoch": 0.12136079361656243, + "grad_norm": 0.9402859210968018, + "learning_rate": 9.97921850121213e-06, + "loss": 0.9362, + "step": 2251 + }, + { + "epoch": 0.12141470778520595, + "grad_norm": 0.8132252097129822, + "learning_rate": 9.979199187492903e-06, + "loss": 0.8119, + "step": 2252 + }, + { + "epoch": 0.12146862195384947, + "grad_norm": 0.9142205119132996, + "learning_rate": 9.979179864821747e-06, + "loss": 0.8219, + "step": 2253 + }, + { + "epoch": 0.12152253612249299, + "grad_norm": 0.9614750742912292, + "learning_rate": 9.979160533198697e-06, + "loss": 0.8342, + "step": 2254 + }, + { + "epoch": 0.1215764502911365, + "grad_norm": 0.7893047332763672, + "learning_rate": 9.979141192623787e-06, + "loss": 0.7111, + "step": 2255 + }, + { + "epoch": 0.12163036445978002, + "grad_norm": 0.8807032704353333, + "learning_rate": 9.979121843097053e-06, + "loss": 0.7677, + "step": 2256 + }, + { + "epoch": 0.12168427862842356, + "grad_norm": 1.1099025011062622, + "learning_rate": 9.97910248461853e-06, + "loss": 0.9548, + "step": 2257 + }, + { + "epoch": 0.12173819279706707, + "grad_norm": 0.9182586669921875, + "learning_rate": 9.979083117188253e-06, + "loss": 0.8734, + "step": 2258 + }, + { + "epoch": 0.12179210696571059, + "grad_norm": 0.9201869964599609, + "learning_rate": 9.979063740806253e-06, + "loss": 0.823, + "step": 2259 + }, + { + "epoch": 0.12184602113435411, + "grad_norm": 1.0309760570526123, + "learning_rate": 9.979044355472571e-06, + "loss": 0.7175, + "step": 2260 + }, + { + "epoch": 
0.12189993530299763, + "grad_norm": 0.8577457070350647, + "learning_rate": 9.979024961187238e-06, + "loss": 0.8963, + "step": 2261 + }, + { + "epoch": 0.12195384947164115, + "grad_norm": 0.8203986883163452, + "learning_rate": 9.97900555795029e-06, + "loss": 0.736, + "step": 2262 + }, + { + "epoch": 0.12200776364028466, + "grad_norm": 0.8232439160346985, + "learning_rate": 9.97898614576176e-06, + "loss": 0.8104, + "step": 2263 + }, + { + "epoch": 0.12206167780892818, + "grad_norm": 1.276479959487915, + "learning_rate": 9.978966724621686e-06, + "loss": 0.7975, + "step": 2264 + }, + { + "epoch": 0.1221155919775717, + "grad_norm": 1.0115424394607544, + "learning_rate": 9.978947294530102e-06, + "loss": 1.0566, + "step": 2265 + }, + { + "epoch": 0.12216950614621523, + "grad_norm": 0.8645843863487244, + "learning_rate": 9.97892785548704e-06, + "loss": 0.8772, + "step": 2266 + }, + { + "epoch": 0.12222342031485875, + "grad_norm": 0.8335905075073242, + "learning_rate": 9.978908407492539e-06, + "loss": 0.7735, + "step": 2267 + }, + { + "epoch": 0.12227733448350227, + "grad_norm": 0.7752977013587952, + "learning_rate": 9.978888950546632e-06, + "loss": 0.725, + "step": 2268 + }, + { + "epoch": 0.12233124865214579, + "grad_norm": 0.9533143639564514, + "learning_rate": 9.978869484649354e-06, + "loss": 0.7845, + "step": 2269 + }, + { + "epoch": 0.1223851628207893, + "grad_norm": 1.2071044445037842, + "learning_rate": 9.978850009800739e-06, + "loss": 0.8394, + "step": 2270 + }, + { + "epoch": 0.12243907698943282, + "grad_norm": 0.8296889662742615, + "learning_rate": 9.978830526000825e-06, + "loss": 0.8088, + "step": 2271 + }, + { + "epoch": 0.12249299115807634, + "grad_norm": 0.7804126739501953, + "learning_rate": 9.978811033249643e-06, + "loss": 0.8174, + "step": 2272 + }, + { + "epoch": 0.12254690532671986, + "grad_norm": 0.9114241600036621, + "learning_rate": 9.978791531547232e-06, + "loss": 0.8601, + "step": 2273 + }, + { + "epoch": 0.12260081949536338, + "grad_norm": 
0.9482108354568481, + "learning_rate": 9.978772020893626e-06, + "loss": 0.8063, + "step": 2274 + }, + { + "epoch": 0.1226547336640069, + "grad_norm": 0.7750483751296997, + "learning_rate": 9.978752501288857e-06, + "loss": 0.7875, + "step": 2275 + }, + { + "epoch": 0.12270864783265042, + "grad_norm": 0.838796854019165, + "learning_rate": 9.978732972732964e-06, + "loss": 0.7617, + "step": 2276 + }, + { + "epoch": 0.12276256200129394, + "grad_norm": 0.8419491052627563, + "learning_rate": 9.97871343522598e-06, + "loss": 0.8438, + "step": 2277 + }, + { + "epoch": 0.12281647616993746, + "grad_norm": 0.8125029802322388, + "learning_rate": 9.97869388876794e-06, + "loss": 0.8376, + "step": 2278 + }, + { + "epoch": 0.12287039033858098, + "grad_norm": 0.8310109972953796, + "learning_rate": 9.978674333358882e-06, + "loss": 0.8159, + "step": 2279 + }, + { + "epoch": 0.1229243045072245, + "grad_norm": 0.9533166289329529, + "learning_rate": 9.978654768998838e-06, + "loss": 0.8911, + "step": 2280 + }, + { + "epoch": 0.12297821867586801, + "grad_norm": 0.7564504742622375, + "learning_rate": 9.978635195687845e-06, + "loss": 0.7685, + "step": 2281 + }, + { + "epoch": 0.12303213284451153, + "grad_norm": 0.7912551760673523, + "learning_rate": 9.978615613425937e-06, + "loss": 0.7392, + "step": 2282 + }, + { + "epoch": 0.12308604701315505, + "grad_norm": 0.8196814656257629, + "learning_rate": 9.978596022213148e-06, + "loss": 0.8619, + "step": 2283 + }, + { + "epoch": 0.12313996118179858, + "grad_norm": 0.9053134918212891, + "learning_rate": 9.978576422049515e-06, + "loss": 0.8822, + "step": 2284 + }, + { + "epoch": 0.1231938753504421, + "grad_norm": 0.7988365292549133, + "learning_rate": 9.978556812935074e-06, + "loss": 0.7993, + "step": 2285 + }, + { + "epoch": 0.12324778951908562, + "grad_norm": 0.7595045566558838, + "learning_rate": 9.978537194869859e-06, + "loss": 0.7589, + "step": 2286 + }, + { + "epoch": 0.12330170368772914, + "grad_norm": 0.872302234172821, + "learning_rate": 
9.978517567853908e-06, + "loss": 0.8315, + "step": 2287 + }, + { + "epoch": 0.12335561785637265, + "grad_norm": 0.8375674486160278, + "learning_rate": 9.97849793188725e-06, + "loss": 0.8348, + "step": 2288 + }, + { + "epoch": 0.12340953202501617, + "grad_norm": 0.8239575624465942, + "learning_rate": 9.978478286969927e-06, + "loss": 0.7636, + "step": 2289 + }, + { + "epoch": 0.12346344619365969, + "grad_norm": 0.8614348769187927, + "learning_rate": 9.97845863310197e-06, + "loss": 0.8162, + "step": 2290 + }, + { + "epoch": 0.12351736036230321, + "grad_norm": 0.8609321713447571, + "learning_rate": 9.978438970283417e-06, + "loss": 0.7776, + "step": 2291 + }, + { + "epoch": 0.12357127453094673, + "grad_norm": 0.9590173959732056, + "learning_rate": 9.978419298514302e-06, + "loss": 0.8761, + "step": 2292 + }, + { + "epoch": 0.12362518869959026, + "grad_norm": 0.8345216512680054, + "learning_rate": 9.978399617794659e-06, + "loss": 0.8353, + "step": 2293 + }, + { + "epoch": 0.12367910286823378, + "grad_norm": 0.8771556615829468, + "learning_rate": 9.978379928124526e-06, + "loss": 0.773, + "step": 2294 + }, + { + "epoch": 0.1237330170368773, + "grad_norm": 0.8305835127830505, + "learning_rate": 9.978360229503936e-06, + "loss": 0.7898, + "step": 2295 + }, + { + "epoch": 0.12378693120552081, + "grad_norm": 0.8536269664764404, + "learning_rate": 9.978340521932927e-06, + "loss": 0.8261, + "step": 2296 + }, + { + "epoch": 0.12384084537416433, + "grad_norm": 0.9008522629737854, + "learning_rate": 9.978320805411534e-06, + "loss": 0.7114, + "step": 2297 + }, + { + "epoch": 0.12389475954280785, + "grad_norm": 0.7834939956665039, + "learning_rate": 9.97830107993979e-06, + "loss": 0.7338, + "step": 2298 + }, + { + "epoch": 0.12394867371145137, + "grad_norm": 0.8269515037536621, + "learning_rate": 9.978281345517733e-06, + "loss": 0.7676, + "step": 2299 + }, + { + "epoch": 0.12400258788009488, + "grad_norm": 0.8482736945152283, + "learning_rate": 9.978261602145398e-06, + "loss": 0.8185, 
+ "step": 2300 + }, + { + "epoch": 0.12405650204873842, + "grad_norm": 0.8833953142166138, + "learning_rate": 9.978241849822819e-06, + "loss": 0.7776, + "step": 2301 + }, + { + "epoch": 0.12411041621738193, + "grad_norm": 0.8089832067489624, + "learning_rate": 9.978222088550033e-06, + "loss": 0.7697, + "step": 2302 + }, + { + "epoch": 0.12416433038602545, + "grad_norm": 0.8204466104507446, + "learning_rate": 9.978202318327075e-06, + "loss": 0.839, + "step": 2303 + }, + { + "epoch": 0.12421824455466897, + "grad_norm": 0.8547719120979309, + "learning_rate": 9.97818253915398e-06, + "loss": 0.9022, + "step": 2304 + }, + { + "epoch": 0.12427215872331249, + "grad_norm": 1.090289831161499, + "learning_rate": 9.978162751030787e-06, + "loss": 0.7154, + "step": 2305 + }, + { + "epoch": 0.124326072891956, + "grad_norm": 0.88922518491745, + "learning_rate": 9.978142953957526e-06, + "loss": 0.8962, + "step": 2306 + }, + { + "epoch": 0.12437998706059952, + "grad_norm": 0.8741730451583862, + "learning_rate": 9.978123147934236e-06, + "loss": 0.7742, + "step": 2307 + }, + { + "epoch": 0.12443390122924304, + "grad_norm": 1.2885240316390991, + "learning_rate": 9.97810333296095e-06, + "loss": 0.7256, + "step": 2308 + }, + { + "epoch": 0.12448781539788656, + "grad_norm": 0.7973229885101318, + "learning_rate": 9.978083509037711e-06, + "loss": 0.8433, + "step": 2309 + }, + { + "epoch": 0.12454172956653009, + "grad_norm": 0.8328043222427368, + "learning_rate": 9.978063676164544e-06, + "loss": 0.8617, + "step": 2310 + }, + { + "epoch": 0.12459564373517361, + "grad_norm": 0.8093283176422119, + "learning_rate": 9.978043834341493e-06, + "loss": 0.8407, + "step": 2311 + }, + { + "epoch": 0.12464955790381713, + "grad_norm": 0.7566602826118469, + "learning_rate": 9.978023983568588e-06, + "loss": 0.7602, + "step": 2312 + }, + { + "epoch": 0.12470347207246064, + "grad_norm": 0.7731996178627014, + "learning_rate": 9.97800412384587e-06, + "loss": 0.8323, + "step": 2313 + }, + { + "epoch": 
0.12475738624110416, + "grad_norm": 0.9148348569869995, + "learning_rate": 9.97798425517337e-06, + "loss": 0.7886, + "step": 2314 + }, + { + "epoch": 0.12481130040974768, + "grad_norm": 0.8546224236488342, + "learning_rate": 9.977964377551126e-06, + "loss": 0.8116, + "step": 2315 + }, + { + "epoch": 0.1248652145783912, + "grad_norm": 1.0733944177627563, + "learning_rate": 9.977944490979175e-06, + "loss": 0.8255, + "step": 2316 + }, + { + "epoch": 0.12491912874703472, + "grad_norm": 0.8404545783996582, + "learning_rate": 9.977924595457549e-06, + "loss": 0.8542, + "step": 2317 + }, + { + "epoch": 0.12497304291567823, + "grad_norm": 0.8276603817939758, + "learning_rate": 9.977904690986286e-06, + "loss": 0.8242, + "step": 2318 + }, + { + "epoch": 0.12502695708432177, + "grad_norm": 0.8703106641769409, + "learning_rate": 9.977884777565423e-06, + "loss": 0.8525, + "step": 2319 + }, + { + "epoch": 0.12508087125296527, + "grad_norm": 0.8353367447853088, + "learning_rate": 9.977864855194994e-06, + "loss": 0.7921, + "step": 2320 + }, + { + "epoch": 0.1251347854216088, + "grad_norm": 0.8283559083938599, + "learning_rate": 9.977844923875036e-06, + "loss": 0.8262, + "step": 2321 + }, + { + "epoch": 0.1251886995902523, + "grad_norm": 0.8737161755561829, + "learning_rate": 9.977824983605584e-06, + "loss": 0.9117, + "step": 2322 + }, + { + "epoch": 0.12524261375889584, + "grad_norm": 0.8616884350776672, + "learning_rate": 9.977805034386675e-06, + "loss": 0.8178, + "step": 2323 + }, + { + "epoch": 0.12529652792753937, + "grad_norm": 0.9863162636756897, + "learning_rate": 9.977785076218342e-06, + "loss": 0.8671, + "step": 2324 + }, + { + "epoch": 0.12535044209618287, + "grad_norm": 0.9636940360069275, + "learning_rate": 9.977765109100624e-06, + "loss": 0.894, + "step": 2325 + }, + { + "epoch": 0.1254043562648264, + "grad_norm": 0.741320013999939, + "learning_rate": 9.977745133033554e-06, + "loss": 0.7474, + "step": 2326 + }, + { + "epoch": 0.1254582704334699, + "grad_norm": 
0.7776119709014893, + "learning_rate": 9.97772514801717e-06, + "loss": 0.7867, + "step": 2327 + }, + { + "epoch": 0.12551218460211344, + "grad_norm": 0.8219690918922424, + "learning_rate": 9.97770515405151e-06, + "loss": 0.8443, + "step": 2328 + }, + { + "epoch": 0.12556609877075695, + "grad_norm": 0.8977565765380859, + "learning_rate": 9.977685151136605e-06, + "loss": 0.7831, + "step": 2329 + }, + { + "epoch": 0.12562001293940048, + "grad_norm": 0.8503162264823914, + "learning_rate": 9.977665139272495e-06, + "loss": 0.8733, + "step": 2330 + }, + { + "epoch": 0.12567392710804398, + "grad_norm": 0.7666327953338623, + "learning_rate": 9.977645118459213e-06, + "loss": 0.7165, + "step": 2331 + }, + { + "epoch": 0.1257278412766875, + "grad_norm": 0.8265602588653564, + "learning_rate": 9.977625088696797e-06, + "loss": 0.8894, + "step": 2332 + }, + { + "epoch": 0.12578175544533104, + "grad_norm": 0.9852930307388306, + "learning_rate": 9.977605049985282e-06, + "loss": 0.9223, + "step": 2333 + }, + { + "epoch": 0.12583566961397455, + "grad_norm": 0.9563886523246765, + "learning_rate": 9.977585002324705e-06, + "loss": 0.8275, + "step": 2334 + }, + { + "epoch": 0.12588958378261808, + "grad_norm": 0.8098574876785278, + "learning_rate": 9.977564945715102e-06, + "loss": 0.8831, + "step": 2335 + }, + { + "epoch": 0.12594349795126158, + "grad_norm": 0.8795431852340698, + "learning_rate": 9.977544880156507e-06, + "loss": 0.8079, + "step": 2336 + }, + { + "epoch": 0.12599741211990512, + "grad_norm": 0.7483893036842346, + "learning_rate": 9.97752480564896e-06, + "loss": 0.7734, + "step": 2337 + }, + { + "epoch": 0.12605132628854862, + "grad_norm": 0.7988960146903992, + "learning_rate": 9.977504722192493e-06, + "loss": 0.6936, + "step": 2338 + }, + { + "epoch": 0.12610524045719215, + "grad_norm": 0.7945669293403625, + "learning_rate": 9.977484629787143e-06, + "loss": 0.8608, + "step": 2339 + }, + { + "epoch": 0.12615915462583566, + "grad_norm": 0.8720629215240479, + "learning_rate": 
9.977464528432948e-06, + "loss": 0.8656, + "step": 2340 + }, + { + "epoch": 0.1262130687944792, + "grad_norm": 0.8935837745666504, + "learning_rate": 9.977444418129943e-06, + "loss": 0.8854, + "step": 2341 + }, + { + "epoch": 0.12626698296312272, + "grad_norm": 0.8034403324127197, + "learning_rate": 9.977424298878165e-06, + "loss": 0.8422, + "step": 2342 + }, + { + "epoch": 0.12632089713176622, + "grad_norm": 1.0071096420288086, + "learning_rate": 9.977404170677648e-06, + "loss": 0.9105, + "step": 2343 + }, + { + "epoch": 0.12637481130040976, + "grad_norm": 1.0757510662078857, + "learning_rate": 9.97738403352843e-06, + "loss": 0.7454, + "step": 2344 + }, + { + "epoch": 0.12642872546905326, + "grad_norm": 0.7133142352104187, + "learning_rate": 9.977363887430548e-06, + "loss": 0.6814, + "step": 2345 + }, + { + "epoch": 0.1264826396376968, + "grad_norm": 0.769752025604248, + "learning_rate": 9.977343732384035e-06, + "loss": 0.7209, + "step": 2346 + }, + { + "epoch": 0.1265365538063403, + "grad_norm": 0.8043524622917175, + "learning_rate": 9.977323568388933e-06, + "loss": 0.8379, + "step": 2347 + }, + { + "epoch": 0.12659046797498383, + "grad_norm": 0.9236345887184143, + "learning_rate": 9.97730339544527e-06, + "loss": 0.8091, + "step": 2348 + }, + { + "epoch": 0.12664438214362733, + "grad_norm": 0.8852472305297852, + "learning_rate": 9.97728321355309e-06, + "loss": 0.8527, + "step": 2349 + }, + { + "epoch": 0.12669829631227086, + "grad_norm": 0.8866454362869263, + "learning_rate": 9.977263022712425e-06, + "loss": 0.7412, + "step": 2350 + }, + { + "epoch": 0.1267522104809144, + "grad_norm": 0.7950204014778137, + "learning_rate": 9.977242822923311e-06, + "loss": 0.7778, + "step": 2351 + }, + { + "epoch": 0.1268061246495579, + "grad_norm": 0.8775694966316223, + "learning_rate": 9.977222614185787e-06, + "loss": 0.7437, + "step": 2352 + }, + { + "epoch": 0.12686003881820143, + "grad_norm": 0.8059643507003784, + "learning_rate": 9.977202396499889e-06, + "loss": 0.7935, + 
"step": 2353 + }, + { + "epoch": 0.12691395298684494, + "grad_norm": 0.8250171542167664, + "learning_rate": 9.977182169865652e-06, + "loss": 0.7936, + "step": 2354 + }, + { + "epoch": 0.12696786715548847, + "grad_norm": 0.8618381023406982, + "learning_rate": 9.97716193428311e-06, + "loss": 0.7884, + "step": 2355 + }, + { + "epoch": 0.12702178132413197, + "grad_norm": 0.8977087140083313, + "learning_rate": 9.977141689752306e-06, + "loss": 0.7764, + "step": 2356 + }, + { + "epoch": 0.1270756954927755, + "grad_norm": 0.7616862058639526, + "learning_rate": 9.97712143627327e-06, + "loss": 0.7222, + "step": 2357 + }, + { + "epoch": 0.127129609661419, + "grad_norm": 0.8255194425582886, + "learning_rate": 9.977101173846042e-06, + "loss": 0.8015, + "step": 2358 + }, + { + "epoch": 0.12718352383006254, + "grad_norm": 0.7783398628234863, + "learning_rate": 9.977080902470657e-06, + "loss": 0.7403, + "step": 2359 + }, + { + "epoch": 0.12723743799870607, + "grad_norm": 1.201339840888977, + "learning_rate": 9.977060622147152e-06, + "loss": 0.8994, + "step": 2360 + }, + { + "epoch": 0.12729135216734958, + "grad_norm": 0.906428337097168, + "learning_rate": 9.977040332875563e-06, + "loss": 0.7791, + "step": 2361 + }, + { + "epoch": 0.1273452663359931, + "grad_norm": 0.8238182663917542, + "learning_rate": 9.977020034655927e-06, + "loss": 0.728, + "step": 2362 + }, + { + "epoch": 0.1273991805046366, + "grad_norm": 0.9390681385993958, + "learning_rate": 9.976999727488279e-06, + "loss": 0.8697, + "step": 2363 + }, + { + "epoch": 0.12745309467328014, + "grad_norm": 0.8595122694969177, + "learning_rate": 9.976979411372658e-06, + "loss": 0.8481, + "step": 2364 + }, + { + "epoch": 0.12750700884192365, + "grad_norm": 0.8220391273498535, + "learning_rate": 9.976959086309099e-06, + "loss": 0.709, + "step": 2365 + }, + { + "epoch": 0.12756092301056718, + "grad_norm": 0.9712308645248413, + "learning_rate": 9.976938752297638e-06, + "loss": 0.8898, + "step": 2366 + }, + { + "epoch": 
0.12761483717921068, + "grad_norm": 0.8864933848381042, + "learning_rate": 9.976918409338315e-06, + "loss": 0.8798, + "step": 2367 + }, + { + "epoch": 0.12766875134785421, + "grad_norm": 0.7780918478965759, + "learning_rate": 9.976898057431162e-06, + "loss": 0.8123, + "step": 2368 + }, + { + "epoch": 0.12772266551649775, + "grad_norm": 0.8338439464569092, + "learning_rate": 9.976877696576218e-06, + "loss": 0.8177, + "step": 2369 + }, + { + "epoch": 0.12777657968514125, + "grad_norm": 0.9967712759971619, + "learning_rate": 9.976857326773517e-06, + "loss": 0.8613, + "step": 2370 + }, + { + "epoch": 0.12783049385378478, + "grad_norm": 0.7666492462158203, + "learning_rate": 9.976836948023099e-06, + "loss": 0.7226, + "step": 2371 + }, + { + "epoch": 0.1278844080224283, + "grad_norm": 0.9783684611320496, + "learning_rate": 9.976816560325e-06, + "loss": 0.8616, + "step": 2372 + }, + { + "epoch": 0.12793832219107182, + "grad_norm": 1.0170663595199585, + "learning_rate": 9.976796163679256e-06, + "loss": 0.8211, + "step": 2373 + }, + { + "epoch": 0.12799223635971532, + "grad_norm": 0.8657981157302856, + "learning_rate": 9.976775758085903e-06, + "loss": 0.867, + "step": 2374 + }, + { + "epoch": 0.12804615052835885, + "grad_norm": 0.8487955927848816, + "learning_rate": 9.976755343544979e-06, + "loss": 0.8056, + "step": 2375 + }, + { + "epoch": 0.12810006469700239, + "grad_norm": 0.90731281042099, + "learning_rate": 9.976734920056522e-06, + "loss": 0.8492, + "step": 2376 + }, + { + "epoch": 0.1281539788656459, + "grad_norm": 0.9684501886367798, + "learning_rate": 9.976714487620565e-06, + "loss": 0.8023, + "step": 2377 + }, + { + "epoch": 0.12820789303428942, + "grad_norm": 0.8361303806304932, + "learning_rate": 9.976694046237146e-06, + "loss": 0.8132, + "step": 2378 + }, + { + "epoch": 0.12826180720293293, + "grad_norm": 0.9570466876029968, + "learning_rate": 9.976673595906303e-06, + "loss": 0.8991, + "step": 2379 + }, + { + "epoch": 0.12831572137157646, + "grad_norm": 
0.8944576978683472, + "learning_rate": 9.976653136628071e-06, + "loss": 0.8163, + "step": 2380 + }, + { + "epoch": 0.12836963554021996, + "grad_norm": 0.7991742491722107, + "learning_rate": 9.976632668402489e-06, + "loss": 0.7962, + "step": 2381 + }, + { + "epoch": 0.1284235497088635, + "grad_norm": 0.9284802079200745, + "learning_rate": 9.976612191229594e-06, + "loss": 1.0115, + "step": 2382 + }, + { + "epoch": 0.128477463877507, + "grad_norm": 0.8092453479766846, + "learning_rate": 9.97659170510942e-06, + "loss": 0.705, + "step": 2383 + }, + { + "epoch": 0.12853137804615053, + "grad_norm": 0.8068677186965942, + "learning_rate": 9.976571210042005e-06, + "loss": 0.8283, + "step": 2384 + }, + { + "epoch": 0.12858529221479406, + "grad_norm": 0.8636525869369507, + "learning_rate": 9.976550706027386e-06, + "loss": 0.7824, + "step": 2385 + }, + { + "epoch": 0.12863920638343757, + "grad_norm": 0.9768033027648926, + "learning_rate": 9.9765301930656e-06, + "loss": 0.8317, + "step": 2386 + }, + { + "epoch": 0.1286931205520811, + "grad_norm": 0.8494508862495422, + "learning_rate": 9.976509671156684e-06, + "loss": 0.9464, + "step": 2387 + }, + { + "epoch": 0.1287470347207246, + "grad_norm": 0.8336171507835388, + "learning_rate": 9.976489140300676e-06, + "loss": 0.8003, + "step": 2388 + }, + { + "epoch": 0.12880094888936813, + "grad_norm": 0.819869339466095, + "learning_rate": 9.97646860049761e-06, + "loss": 0.6779, + "step": 2389 + }, + { + "epoch": 0.12885486305801164, + "grad_norm": 1.179028868675232, + "learning_rate": 9.976448051747526e-06, + "loss": 0.8183, + "step": 2390 + }, + { + "epoch": 0.12890877722665517, + "grad_norm": 0.8214680552482605, + "learning_rate": 9.97642749405046e-06, + "loss": 0.7659, + "step": 2391 + }, + { + "epoch": 0.12896269139529867, + "grad_norm": 0.8303862810134888, + "learning_rate": 9.976406927406446e-06, + "loss": 0.8993, + "step": 2392 + }, + { + "epoch": 0.1290166055639422, + "grad_norm": 0.8043105006217957, + "learning_rate": 
9.976386351815526e-06, + "loss": 0.7948, + "step": 2393 + }, + { + "epoch": 0.12907051973258574, + "grad_norm": 0.7988419532775879, + "learning_rate": 9.976365767277734e-06, + "loss": 0.8042, + "step": 2394 + }, + { + "epoch": 0.12912443390122924, + "grad_norm": 0.8145790696144104, + "learning_rate": 9.976345173793107e-06, + "loss": 0.7214, + "step": 2395 + }, + { + "epoch": 0.12917834806987277, + "grad_norm": 0.8323239088058472, + "learning_rate": 9.976324571361682e-06, + "loss": 0.8692, + "step": 2396 + }, + { + "epoch": 0.12923226223851628, + "grad_norm": 1.5968064069747925, + "learning_rate": 9.976303959983498e-06, + "loss": 0.8573, + "step": 2397 + }, + { + "epoch": 0.1292861764071598, + "grad_norm": 0.8523521423339844, + "learning_rate": 9.976283339658589e-06, + "loss": 0.8856, + "step": 2398 + }, + { + "epoch": 0.1293400905758033, + "grad_norm": 1.3875633478164673, + "learning_rate": 9.976262710386994e-06, + "loss": 0.829, + "step": 2399 + }, + { + "epoch": 0.12939400474444684, + "grad_norm": 0.8131827712059021, + "learning_rate": 9.976242072168751e-06, + "loss": 0.7787, + "step": 2400 + }, + { + "epoch": 0.12944791891309035, + "grad_norm": 0.8347164392471313, + "learning_rate": 9.976221425003896e-06, + "loss": 0.9119, + "step": 2401 + }, + { + "epoch": 0.12950183308173388, + "grad_norm": 0.791674792766571, + "learning_rate": 9.976200768892465e-06, + "loss": 0.8483, + "step": 2402 + }, + { + "epoch": 0.1295557472503774, + "grad_norm": 0.8207666277885437, + "learning_rate": 9.976180103834496e-06, + "loss": 0.7688, + "step": 2403 + }, + { + "epoch": 0.12960966141902092, + "grad_norm": 0.8335880041122437, + "learning_rate": 9.976159429830027e-06, + "loss": 0.8943, + "step": 2404 + }, + { + "epoch": 0.12966357558766445, + "grad_norm": 0.8273102045059204, + "learning_rate": 9.976138746879094e-06, + "loss": 0.7847, + "step": 2405 + }, + { + "epoch": 0.12971748975630795, + "grad_norm": 0.9029181003570557, + "learning_rate": 9.976118054981735e-06, + "loss": 0.9779, 
+ "step": 2406 + }, + { + "epoch": 0.12977140392495148, + "grad_norm": 1.0253269672393799, + "learning_rate": 9.976097354137986e-06, + "loss": 0.8301, + "step": 2407 + }, + { + "epoch": 0.129825318093595, + "grad_norm": 0.859992265701294, + "learning_rate": 9.976076644347887e-06, + "loss": 0.7809, + "step": 2408 + }, + { + "epoch": 0.12987923226223852, + "grad_norm": 0.8313273787498474, + "learning_rate": 9.976055925611472e-06, + "loss": 0.8435, + "step": 2409 + }, + { + "epoch": 0.12993314643088202, + "grad_norm": 0.8921852707862854, + "learning_rate": 9.976035197928779e-06, + "loss": 0.8407, + "step": 2410 + }, + { + "epoch": 0.12998706059952556, + "grad_norm": 0.9168267846107483, + "learning_rate": 9.976014461299848e-06, + "loss": 0.8428, + "step": 2411 + }, + { + "epoch": 0.1300409747681691, + "grad_norm": 0.8943728804588318, + "learning_rate": 9.975993715724712e-06, + "loss": 0.8953, + "step": 2412 + }, + { + "epoch": 0.1300948889368126, + "grad_norm": 0.8288392424583435, + "learning_rate": 9.975972961203411e-06, + "loss": 0.8008, + "step": 2413 + }, + { + "epoch": 0.13014880310545612, + "grad_norm": 0.8432718515396118, + "learning_rate": 9.975952197735982e-06, + "loss": 0.775, + "step": 2414 + }, + { + "epoch": 0.13020271727409963, + "grad_norm": 1.029341220855713, + "learning_rate": 9.975931425322462e-06, + "loss": 0.9086, + "step": 2415 + }, + { + "epoch": 0.13025663144274316, + "grad_norm": 0.8342422842979431, + "learning_rate": 9.975910643962888e-06, + "loss": 0.8867, + "step": 2416 + }, + { + "epoch": 0.13031054561138666, + "grad_norm": 0.7766898274421692, + "learning_rate": 9.975889853657298e-06, + "loss": 0.7597, + "step": 2417 + }, + { + "epoch": 0.1303644597800302, + "grad_norm": 0.865112841129303, + "learning_rate": 9.97586905440573e-06, + "loss": 0.8164, + "step": 2418 + }, + { + "epoch": 0.1304183739486737, + "grad_norm": 0.7938675880432129, + "learning_rate": 9.97584824620822e-06, + "loss": 0.8053, + "step": 2419 + }, + { + "epoch": 
0.13047228811731723, + "grad_norm": 0.8813329339027405, + "learning_rate": 9.975827429064805e-06, + "loss": 0.8662, + "step": 2420 + }, + { + "epoch": 0.13052620228596076, + "grad_norm": 0.8217114210128784, + "learning_rate": 9.975806602975525e-06, + "loss": 0.8647, + "step": 2421 + }, + { + "epoch": 0.13058011645460427, + "grad_norm": 1.0177736282348633, + "learning_rate": 9.975785767940413e-06, + "loss": 0.813, + "step": 2422 + }, + { + "epoch": 0.1306340306232478, + "grad_norm": 0.7887234687805176, + "learning_rate": 9.975764923959512e-06, + "loss": 0.7759, + "step": 2423 + }, + { + "epoch": 0.1306879447918913, + "grad_norm": 0.7670013904571533, + "learning_rate": 9.975744071032856e-06, + "loss": 0.7534, + "step": 2424 + }, + { + "epoch": 0.13074185896053483, + "grad_norm": 0.7348708510398865, + "learning_rate": 9.975723209160483e-06, + "loss": 0.7955, + "step": 2425 + }, + { + "epoch": 0.13079577312917834, + "grad_norm": 0.8183468580245972, + "learning_rate": 9.97570233834243e-06, + "loss": 0.8664, + "step": 2426 + }, + { + "epoch": 0.13084968729782187, + "grad_norm": 0.8783697485923767, + "learning_rate": 9.975681458578736e-06, + "loss": 0.8399, + "step": 2427 + }, + { + "epoch": 0.13090360146646537, + "grad_norm": 0.7653324007987976, + "learning_rate": 9.975660569869439e-06, + "loss": 0.7723, + "step": 2428 + }, + { + "epoch": 0.1309575156351089, + "grad_norm": 0.9938413500785828, + "learning_rate": 9.975639672214574e-06, + "loss": 0.7439, + "step": 2429 + }, + { + "epoch": 0.13101142980375244, + "grad_norm": 0.7844074368476868, + "learning_rate": 9.975618765614181e-06, + "loss": 0.8234, + "step": 2430 + }, + { + "epoch": 0.13106534397239594, + "grad_norm": 0.8992919325828552, + "learning_rate": 9.975597850068295e-06, + "loss": 0.7485, + "step": 2431 + }, + { + "epoch": 0.13111925814103947, + "grad_norm": 0.8023738265037537, + "learning_rate": 9.975576925576956e-06, + "loss": 0.7986, + "step": 2432 + }, + { + "epoch": 0.13117317230968298, + "grad_norm": 
0.8369026184082031, + "learning_rate": 9.9755559921402e-06, + "loss": 0.8695, + "step": 2433 + }, + { + "epoch": 0.1312270864783265, + "grad_norm": 0.812224805355072, + "learning_rate": 9.975535049758067e-06, + "loss": 0.834, + "step": 2434 + }, + { + "epoch": 0.13128100064697001, + "grad_norm": 0.7718735337257385, + "learning_rate": 9.975514098430591e-06, + "loss": 0.8055, + "step": 2435 + }, + { + "epoch": 0.13133491481561355, + "grad_norm": 0.8709392547607422, + "learning_rate": 9.975493138157813e-06, + "loss": 0.899, + "step": 2436 + }, + { + "epoch": 0.13138882898425705, + "grad_norm": 0.8817125558853149, + "learning_rate": 9.97547216893977e-06, + "loss": 0.7908, + "step": 2437 + }, + { + "epoch": 0.13144274315290058, + "grad_norm": 0.9631084203720093, + "learning_rate": 9.975451190776498e-06, + "loss": 0.9153, + "step": 2438 + }, + { + "epoch": 0.1314966573215441, + "grad_norm": 0.998906672000885, + "learning_rate": 9.975430203668037e-06, + "loss": 0.971, + "step": 2439 + }, + { + "epoch": 0.13155057149018762, + "grad_norm": 0.9689096212387085, + "learning_rate": 9.975409207614422e-06, + "loss": 0.8316, + "step": 2440 + }, + { + "epoch": 0.13160448565883115, + "grad_norm": 0.7694187760353088, + "learning_rate": 9.975388202615692e-06, + "loss": 0.757, + "step": 2441 + }, + { + "epoch": 0.13165839982747465, + "grad_norm": 0.8082549571990967, + "learning_rate": 9.975367188671885e-06, + "loss": 0.8704, + "step": 2442 + }, + { + "epoch": 0.13171231399611819, + "grad_norm": 0.8493963479995728, + "learning_rate": 9.97534616578304e-06, + "loss": 0.8171, + "step": 2443 + }, + { + "epoch": 0.1317662281647617, + "grad_norm": 0.972273588180542, + "learning_rate": 9.975325133949195e-06, + "loss": 0.9834, + "step": 2444 + }, + { + "epoch": 0.13182014233340522, + "grad_norm": 0.8235988616943359, + "learning_rate": 9.975304093170384e-06, + "loss": 0.8896, + "step": 2445 + }, + { + "epoch": 0.13187405650204873, + "grad_norm": 0.8405951261520386, + "learning_rate": 
9.975283043446649e-06, + "loss": 0.8362, + "step": 2446 + }, + { + "epoch": 0.13192797067069226, + "grad_norm": 0.765640377998352, + "learning_rate": 9.975261984778024e-06, + "loss": 0.7543, + "step": 2447 + }, + { + "epoch": 0.1319818848393358, + "grad_norm": 0.9431920051574707, + "learning_rate": 9.97524091716455e-06, + "loss": 0.8322, + "step": 2448 + }, + { + "epoch": 0.1320357990079793, + "grad_norm": 0.8060823082923889, + "learning_rate": 9.975219840606265e-06, + "loss": 0.8153, + "step": 2449 + }, + { + "epoch": 0.13208971317662282, + "grad_norm": 1.1293737888336182, + "learning_rate": 9.975198755103203e-06, + "loss": 0.8969, + "step": 2450 + }, + { + "epoch": 0.13214362734526633, + "grad_norm": 0.8462950587272644, + "learning_rate": 9.975177660655407e-06, + "loss": 0.7758, + "step": 2451 + }, + { + "epoch": 0.13219754151390986, + "grad_norm": 0.8241791725158691, + "learning_rate": 9.975156557262914e-06, + "loss": 0.8046, + "step": 2452 + }, + { + "epoch": 0.13225145568255336, + "grad_norm": 0.8260864615440369, + "learning_rate": 9.975135444925756e-06, + "loss": 0.7559, + "step": 2453 + }, + { + "epoch": 0.1323053698511969, + "grad_norm": 0.8952769637107849, + "learning_rate": 9.975114323643978e-06, + "loss": 0.8292, + "step": 2454 + }, + { + "epoch": 0.1323592840198404, + "grad_norm": 0.8182158470153809, + "learning_rate": 9.975093193417615e-06, + "loss": 0.7137, + "step": 2455 + }, + { + "epoch": 0.13241319818848393, + "grad_norm": 0.9926600456237793, + "learning_rate": 9.975072054246706e-06, + "loss": 0.7935, + "step": 2456 + }, + { + "epoch": 0.13246711235712746, + "grad_norm": 0.872171938419342, + "learning_rate": 9.97505090613129e-06, + "loss": 0.882, + "step": 2457 + }, + { + "epoch": 0.13252102652577097, + "grad_norm": 0.8218923807144165, + "learning_rate": 9.975029749071401e-06, + "loss": 0.7675, + "step": 2458 + }, + { + "epoch": 0.1325749406944145, + "grad_norm": 0.8250816464424133, + "learning_rate": 9.97500858306708e-06, + "loss": 0.8404, + 
"step": 2459 + }, + { + "epoch": 0.132628854863058, + "grad_norm": 0.8135029673576355, + "learning_rate": 9.974987408118365e-06, + "loss": 0.8387, + "step": 2460 + }, + { + "epoch": 0.13268276903170154, + "grad_norm": 1.3989582061767578, + "learning_rate": 9.974966224225293e-06, + "loss": 0.817, + "step": 2461 + }, + { + "epoch": 0.13273668320034504, + "grad_norm": 0.8212644457817078, + "learning_rate": 9.974945031387902e-06, + "loss": 0.8377, + "step": 2462 + }, + { + "epoch": 0.13279059736898857, + "grad_norm": 1.5513782501220703, + "learning_rate": 9.974923829606232e-06, + "loss": 0.7645, + "step": 2463 + }, + { + "epoch": 0.13284451153763208, + "grad_norm": 0.9355224370956421, + "learning_rate": 9.97490261888032e-06, + "loss": 0.7943, + "step": 2464 + }, + { + "epoch": 0.1328984257062756, + "grad_norm": 0.8264141082763672, + "learning_rate": 9.974881399210204e-06, + "loss": 0.7868, + "step": 2465 + }, + { + "epoch": 0.13295233987491914, + "grad_norm": 0.8267685770988464, + "learning_rate": 9.974860170595921e-06, + "loss": 0.8482, + "step": 2466 + }, + { + "epoch": 0.13300625404356264, + "grad_norm": 0.7816182374954224, + "learning_rate": 9.974838933037512e-06, + "loss": 0.6735, + "step": 2467 + }, + { + "epoch": 0.13306016821220618, + "grad_norm": 0.8686188459396362, + "learning_rate": 9.974817686535013e-06, + "loss": 0.7639, + "step": 2468 + }, + { + "epoch": 0.13311408238084968, + "grad_norm": 0.8006383776664734, + "learning_rate": 9.974796431088462e-06, + "loss": 0.9035, + "step": 2469 + }, + { + "epoch": 0.1331679965494932, + "grad_norm": 0.829788327217102, + "learning_rate": 9.974775166697898e-06, + "loss": 0.7724, + "step": 2470 + }, + { + "epoch": 0.13322191071813672, + "grad_norm": 0.7149111032485962, + "learning_rate": 9.97475389336336e-06, + "loss": 0.7543, + "step": 2471 + }, + { + "epoch": 0.13327582488678025, + "grad_norm": 0.8626448512077332, + "learning_rate": 9.974732611084886e-06, + "loss": 0.8903, + "step": 2472 + }, + { + "epoch": 
0.13332973905542375, + "grad_norm": 0.818778395652771, + "learning_rate": 9.974711319862514e-06, + "loss": 0.7862, + "step": 2473 + }, + { + "epoch": 0.13338365322406728, + "grad_norm": 0.8285005688667297, + "learning_rate": 9.97469001969628e-06, + "loss": 0.8186, + "step": 2474 + }, + { + "epoch": 0.13343756739271082, + "grad_norm": 0.9331484436988831, + "learning_rate": 9.974668710586226e-06, + "loss": 0.7278, + "step": 2475 + }, + { + "epoch": 0.13349148156135432, + "grad_norm": 0.7760492563247681, + "learning_rate": 9.974647392532387e-06, + "loss": 0.82, + "step": 2476 + }, + { + "epoch": 0.13354539572999785, + "grad_norm": 0.9858410358428955, + "learning_rate": 9.974626065534804e-06, + "loss": 0.9733, + "step": 2477 + }, + { + "epoch": 0.13359930989864136, + "grad_norm": 0.774960458278656, + "learning_rate": 9.974604729593513e-06, + "loss": 0.7899, + "step": 2478 + }, + { + "epoch": 0.1336532240672849, + "grad_norm": 0.7779082655906677, + "learning_rate": 9.974583384708556e-06, + "loss": 0.7727, + "step": 2479 + }, + { + "epoch": 0.1337071382359284, + "grad_norm": 0.8611405491828918, + "learning_rate": 9.974562030879967e-06, + "loss": 0.8341, + "step": 2480 + }, + { + "epoch": 0.13376105240457192, + "grad_norm": 0.9042904376983643, + "learning_rate": 9.974540668107788e-06, + "loss": 0.8015, + "step": 2481 + }, + { + "epoch": 0.13381496657321545, + "grad_norm": 1.067806601524353, + "learning_rate": 9.974519296392054e-06, + "loss": 0.8583, + "step": 2482 + }, + { + "epoch": 0.13386888074185896, + "grad_norm": 0.8079432845115662, + "learning_rate": 9.974497915732806e-06, + "loss": 0.7246, + "step": 2483 + }, + { + "epoch": 0.1339227949105025, + "grad_norm": 0.7360541224479675, + "learning_rate": 9.974476526130082e-06, + "loss": 0.7228, + "step": 2484 + }, + { + "epoch": 0.133976709079146, + "grad_norm": 0.7532739639282227, + "learning_rate": 9.97445512758392e-06, + "loss": 0.7472, + "step": 2485 + }, + { + "epoch": 0.13403062324778953, + "grad_norm": 
0.794747531414032, + "learning_rate": 9.974433720094358e-06, + "loss": 0.8288, + "step": 2486 + }, + { + "epoch": 0.13408453741643303, + "grad_norm": 0.9305081367492676, + "learning_rate": 9.974412303661435e-06, + "loss": 0.9414, + "step": 2487 + }, + { + "epoch": 0.13413845158507656, + "grad_norm": 0.9857872128486633, + "learning_rate": 9.97439087828519e-06, + "loss": 0.9123, + "step": 2488 + }, + { + "epoch": 0.13419236575372007, + "grad_norm": 0.9159066081047058, + "learning_rate": 9.97436944396566e-06, + "loss": 0.815, + "step": 2489 + }, + { + "epoch": 0.1342462799223636, + "grad_norm": 0.920803427696228, + "learning_rate": 9.974348000702887e-06, + "loss": 0.855, + "step": 2490 + }, + { + "epoch": 0.13430019409100713, + "grad_norm": 0.8599058389663696, + "learning_rate": 9.974326548496906e-06, + "loss": 0.8944, + "step": 2491 + }, + { + "epoch": 0.13435410825965063, + "grad_norm": 0.7708035111427307, + "learning_rate": 9.974305087347758e-06, + "loss": 0.7733, + "step": 2492 + }, + { + "epoch": 0.13440802242829417, + "grad_norm": 0.771906852722168, + "learning_rate": 9.974283617255478e-06, + "loss": 0.8555, + "step": 2493 + }, + { + "epoch": 0.13446193659693767, + "grad_norm": 0.7494363188743591, + "learning_rate": 9.974262138220108e-06, + "loss": 0.7575, + "step": 2494 + }, + { + "epoch": 0.1345158507655812, + "grad_norm": 0.8488510251045227, + "learning_rate": 9.974240650241687e-06, + "loss": 0.8423, + "step": 2495 + }, + { + "epoch": 0.1345697649342247, + "grad_norm": 0.7665607929229736, + "learning_rate": 9.97421915332025e-06, + "loss": 0.8221, + "step": 2496 + }, + { + "epoch": 0.13462367910286824, + "grad_norm": 0.83452969789505, + "learning_rate": 9.974197647455839e-06, + "loss": 0.8192, + "step": 2497 + }, + { + "epoch": 0.13467759327151174, + "grad_norm": 0.8927843570709229, + "learning_rate": 9.97417613264849e-06, + "loss": 0.8041, + "step": 2498 + }, + { + "epoch": 0.13473150744015527, + "grad_norm": 0.8050754070281982, + "learning_rate": 
9.974154608898246e-06, + "loss": 0.7374, + "step": 2499 + }, + { + "epoch": 0.1347854216087988, + "grad_norm": 0.8286676406860352, + "learning_rate": 9.97413307620514e-06, + "loss": 0.7603, + "step": 2500 + }, + { + "epoch": 0.1348393357774423, + "grad_norm": 0.8953397870063782, + "learning_rate": 9.974111534569215e-06, + "loss": 0.8419, + "step": 2501 + }, + { + "epoch": 0.13489324994608584, + "grad_norm": 0.8619454503059387, + "learning_rate": 9.974089983990507e-06, + "loss": 0.7231, + "step": 2502 + }, + { + "epoch": 0.13494716411472935, + "grad_norm": 0.8102728724479675, + "learning_rate": 9.974068424469058e-06, + "loss": 0.8701, + "step": 2503 + }, + { + "epoch": 0.13500107828337288, + "grad_norm": 0.7568274736404419, + "learning_rate": 9.974046856004904e-06, + "loss": 0.7864, + "step": 2504 + }, + { + "epoch": 0.13505499245201638, + "grad_norm": 0.7835590839385986, + "learning_rate": 9.974025278598086e-06, + "loss": 0.8595, + "step": 2505 + }, + { + "epoch": 0.1351089066206599, + "grad_norm": 0.854015052318573, + "learning_rate": 9.974003692248638e-06, + "loss": 0.7683, + "step": 2506 + }, + { + "epoch": 0.13516282078930342, + "grad_norm": 0.7973034977912903, + "learning_rate": 9.973982096956604e-06, + "loss": 0.7332, + "step": 2507 + }, + { + "epoch": 0.13521673495794695, + "grad_norm": 0.8860466480255127, + "learning_rate": 9.973960492722022e-06, + "loss": 0.8312, + "step": 2508 + }, + { + "epoch": 0.13527064912659048, + "grad_norm": 0.8370612263679504, + "learning_rate": 9.973938879544928e-06, + "loss": 0.8307, + "step": 2509 + }, + { + "epoch": 0.13532456329523398, + "grad_norm": 0.9102504253387451, + "learning_rate": 9.973917257425365e-06, + "loss": 0.8276, + "step": 2510 + }, + { + "epoch": 0.13537847746387752, + "grad_norm": 0.9040873646736145, + "learning_rate": 9.973895626363367e-06, + "loss": 0.7717, + "step": 2511 + }, + { + "epoch": 0.13543239163252102, + "grad_norm": 0.7447285056114197, + "learning_rate": 9.973873986358977e-06, + "loss": 0.7836, 
+ "step": 2512 + }, + { + "epoch": 0.13548630580116455, + "grad_norm": 0.7533379197120667, + "learning_rate": 9.973852337412234e-06, + "loss": 0.8308, + "step": 2513 + }, + { + "epoch": 0.13554021996980806, + "grad_norm": 0.7503568530082703, + "learning_rate": 9.973830679523173e-06, + "loss": 0.7893, + "step": 2514 + }, + { + "epoch": 0.1355941341384516, + "grad_norm": 0.786011815071106, + "learning_rate": 9.973809012691836e-06, + "loss": 0.7562, + "step": 2515 + }, + { + "epoch": 0.1356480483070951, + "grad_norm": 0.9311261773109436, + "learning_rate": 9.973787336918262e-06, + "loss": 0.7295, + "step": 2516 + }, + { + "epoch": 0.13570196247573862, + "grad_norm": 0.8217887878417969, + "learning_rate": 9.973765652202488e-06, + "loss": 0.8399, + "step": 2517 + }, + { + "epoch": 0.13575587664438216, + "grad_norm": 0.8265646696090698, + "learning_rate": 9.973743958544554e-06, + "loss": 0.8146, + "step": 2518 + }, + { + "epoch": 0.13580979081302566, + "grad_norm": 0.9443806409835815, + "learning_rate": 9.9737222559445e-06, + "loss": 0.9217, + "step": 2519 + }, + { + "epoch": 0.1358637049816692, + "grad_norm": 0.807623028755188, + "learning_rate": 9.973700544402362e-06, + "loss": 0.8266, + "step": 2520 + }, + { + "epoch": 0.1359176191503127, + "grad_norm": 0.819793164730072, + "learning_rate": 9.973678823918184e-06, + "loss": 0.755, + "step": 2521 + }, + { + "epoch": 0.13597153331895623, + "grad_norm": 0.7608258724212646, + "learning_rate": 9.973657094492002e-06, + "loss": 0.7707, + "step": 2522 + }, + { + "epoch": 0.13602544748759973, + "grad_norm": 0.795218825340271, + "learning_rate": 9.973635356123854e-06, + "loss": 0.7235, + "step": 2523 + }, + { + "epoch": 0.13607936165624326, + "grad_norm": 0.7893292307853699, + "learning_rate": 9.973613608813782e-06, + "loss": 0.8698, + "step": 2524 + }, + { + "epoch": 0.13613327582488677, + "grad_norm": 0.8091539144515991, + "learning_rate": 9.973591852561822e-06, + "loss": 0.8492, + "step": 2525 + }, + { + "epoch": 
0.1361871899935303, + "grad_norm": 0.9144110679626465, + "learning_rate": 9.973570087368015e-06, + "loss": 0.7952, + "step": 2526 + }, + { + "epoch": 0.13624110416217383, + "grad_norm": 0.761695921421051, + "learning_rate": 9.9735483132324e-06, + "loss": 0.7841, + "step": 2527 + }, + { + "epoch": 0.13629501833081734, + "grad_norm": 0.887026846408844, + "learning_rate": 9.973526530155016e-06, + "loss": 0.8855, + "step": 2528 + }, + { + "epoch": 0.13634893249946087, + "grad_norm": 0.8282152414321899, + "learning_rate": 9.973504738135903e-06, + "loss": 0.8857, + "step": 2529 + }, + { + "epoch": 0.13640284666810437, + "grad_norm": 0.7782665491104126, + "learning_rate": 9.973482937175098e-06, + "loss": 0.8076, + "step": 2530 + }, + { + "epoch": 0.1364567608367479, + "grad_norm": 0.8865575194358826, + "learning_rate": 9.973461127272642e-06, + "loss": 0.8596, + "step": 2531 + }, + { + "epoch": 0.1365106750053914, + "grad_norm": 0.7215422987937927, + "learning_rate": 9.973439308428572e-06, + "loss": 0.7437, + "step": 2532 + }, + { + "epoch": 0.13656458917403494, + "grad_norm": 0.7932387590408325, + "learning_rate": 9.97341748064293e-06, + "loss": 0.8439, + "step": 2533 + }, + { + "epoch": 0.13661850334267844, + "grad_norm": 0.8260403871536255, + "learning_rate": 9.973395643915756e-06, + "loss": 0.7956, + "step": 2534 + }, + { + "epoch": 0.13667241751132198, + "grad_norm": 0.7879858016967773, + "learning_rate": 9.973373798247085e-06, + "loss": 0.8501, + "step": 2535 + }, + { + "epoch": 0.1367263316799655, + "grad_norm": 0.7268496751785278, + "learning_rate": 9.97335194363696e-06, + "loss": 0.78, + "step": 2536 + }, + { + "epoch": 0.136780245848609, + "grad_norm": 0.8170067071914673, + "learning_rate": 9.973330080085417e-06, + "loss": 0.829, + "step": 2537 + }, + { + "epoch": 0.13683416001725254, + "grad_norm": 0.8400061726570129, + "learning_rate": 9.973308207592498e-06, + "loss": 0.8576, + "step": 2538 + }, + { + "epoch": 0.13688807418589605, + "grad_norm": 
0.9156914353370667, + "learning_rate": 9.973286326158244e-06, + "loss": 0.8633, + "step": 2539 + }, + { + "epoch": 0.13694198835453958, + "grad_norm": 0.7413343191146851, + "learning_rate": 9.97326443578269e-06, + "loss": 0.8128, + "step": 2540 + }, + { + "epoch": 0.13699590252318308, + "grad_norm": 0.8003092408180237, + "learning_rate": 9.973242536465877e-06, + "loss": 0.7743, + "step": 2541 + }, + { + "epoch": 0.13704981669182661, + "grad_norm": 0.8532862067222595, + "learning_rate": 9.973220628207844e-06, + "loss": 0.8526, + "step": 2542 + }, + { + "epoch": 0.13710373086047012, + "grad_norm": 0.7677969336509705, + "learning_rate": 9.973198711008634e-06, + "loss": 0.8493, + "step": 2543 + }, + { + "epoch": 0.13715764502911365, + "grad_norm": 0.8414867520332336, + "learning_rate": 9.973176784868282e-06, + "loss": 0.7674, + "step": 2544 + }, + { + "epoch": 0.13721155919775718, + "grad_norm": 0.825450599193573, + "learning_rate": 9.973154849786828e-06, + "loss": 0.8328, + "step": 2545 + }, + { + "epoch": 0.1372654733664007, + "grad_norm": 0.8429614305496216, + "learning_rate": 9.973132905764313e-06, + "loss": 0.787, + "step": 2546 + }, + { + "epoch": 0.13731938753504422, + "grad_norm": 0.9791093468666077, + "learning_rate": 9.973110952800776e-06, + "loss": 0.7836, + "step": 2547 + }, + { + "epoch": 0.13737330170368772, + "grad_norm": 0.8728508353233337, + "learning_rate": 9.973088990896255e-06, + "loss": 0.8897, + "step": 2548 + }, + { + "epoch": 0.13742721587233125, + "grad_norm": 0.9933381080627441, + "learning_rate": 9.973067020050792e-06, + "loss": 0.8679, + "step": 2549 + }, + { + "epoch": 0.13748113004097476, + "grad_norm": 0.8786694407463074, + "learning_rate": 9.973045040264423e-06, + "loss": 0.8599, + "step": 2550 + }, + { + "epoch": 0.1375350442096183, + "grad_norm": 0.7714465260505676, + "learning_rate": 9.973023051537193e-06, + "loss": 0.6355, + "step": 2551 + }, + { + "epoch": 0.1375889583782618, + "grad_norm": 0.9043986201286316, + "learning_rate": 
9.973001053869138e-06, + "loss": 0.7445, + "step": 2552 + }, + { + "epoch": 0.13764287254690533, + "grad_norm": 0.879623532295227, + "learning_rate": 9.972979047260297e-06, + "loss": 0.8086, + "step": 2553 + }, + { + "epoch": 0.13769678671554886, + "grad_norm": 0.8384745121002197, + "learning_rate": 9.972957031710708e-06, + "loss": 0.6832, + "step": 2554 + }, + { + "epoch": 0.13775070088419236, + "grad_norm": 0.8574655055999756, + "learning_rate": 9.972935007220415e-06, + "loss": 0.8326, + "step": 2555 + }, + { + "epoch": 0.1378046150528359, + "grad_norm": 0.8241353034973145, + "learning_rate": 9.972912973789458e-06, + "loss": 0.7526, + "step": 2556 + }, + { + "epoch": 0.1378585292214794, + "grad_norm": 0.8306788802146912, + "learning_rate": 9.97289093141787e-06, + "loss": 0.9423, + "step": 2557 + }, + { + "epoch": 0.13791244339012293, + "grad_norm": 0.7930428385734558, + "learning_rate": 9.972868880105696e-06, + "loss": 0.8635, + "step": 2558 + }, + { + "epoch": 0.13796635755876643, + "grad_norm": 0.856482207775116, + "learning_rate": 9.972846819852974e-06, + "loss": 0.7902, + "step": 2559 + }, + { + "epoch": 0.13802027172740997, + "grad_norm": 0.8513977527618408, + "learning_rate": 9.972824750659747e-06, + "loss": 0.8485, + "step": 2560 + }, + { + "epoch": 0.13807418589605347, + "grad_norm": 0.7595572471618652, + "learning_rate": 9.97280267252605e-06, + "loss": 0.7294, + "step": 2561 + }, + { + "epoch": 0.138128100064697, + "grad_norm": 0.9774705767631531, + "learning_rate": 9.972780585451923e-06, + "loss": 0.8758, + "step": 2562 + }, + { + "epoch": 0.13818201423334053, + "grad_norm": 0.8011289834976196, + "learning_rate": 9.972758489437408e-06, + "loss": 0.7649, + "step": 2563 + }, + { + "epoch": 0.13823592840198404, + "grad_norm": 0.8921117186546326, + "learning_rate": 9.972736384482545e-06, + "loss": 0.8745, + "step": 2564 + }, + { + "epoch": 0.13828984257062757, + "grad_norm": 0.8739173412322998, + "learning_rate": 9.972714270587372e-06, + "loss": 0.841, + 
"step": 2565 + }, + { + "epoch": 0.13834375673927107, + "grad_norm": 0.7379958033561707, + "learning_rate": 9.97269214775193e-06, + "loss": 0.813, + "step": 2566 + }, + { + "epoch": 0.1383976709079146, + "grad_norm": 0.8068973422050476, + "learning_rate": 9.972670015976258e-06, + "loss": 0.8319, + "step": 2567 + }, + { + "epoch": 0.1384515850765581, + "grad_norm": 0.7312106490135193, + "learning_rate": 9.972647875260395e-06, + "loss": 0.7494, + "step": 2568 + }, + { + "epoch": 0.13850549924520164, + "grad_norm": 0.8182246088981628, + "learning_rate": 9.972625725604383e-06, + "loss": 0.9543, + "step": 2569 + }, + { + "epoch": 0.13855941341384514, + "grad_norm": 0.8153319358825684, + "learning_rate": 9.97260356700826e-06, + "loss": 0.8411, + "step": 2570 + }, + { + "epoch": 0.13861332758248868, + "grad_norm": 0.7589008212089539, + "learning_rate": 9.972581399472066e-06, + "loss": 0.7576, + "step": 2571 + }, + { + "epoch": 0.1386672417511322, + "grad_norm": 0.8160014748573303, + "learning_rate": 9.972559222995841e-06, + "loss": 0.8801, + "step": 2572 + }, + { + "epoch": 0.1387211559197757, + "grad_norm": 0.752868115901947, + "learning_rate": 9.972537037579626e-06, + "loss": 0.7504, + "step": 2573 + }, + { + "epoch": 0.13877507008841924, + "grad_norm": 0.8015901446342468, + "learning_rate": 9.97251484322346e-06, + "loss": 0.7468, + "step": 2574 + }, + { + "epoch": 0.13882898425706275, + "grad_norm": 0.815352737903595, + "learning_rate": 9.972492639927384e-06, + "loss": 0.8526, + "step": 2575 + }, + { + "epoch": 0.13888289842570628, + "grad_norm": 0.7475571036338806, + "learning_rate": 9.972470427691436e-06, + "loss": 0.7653, + "step": 2576 + }, + { + "epoch": 0.13893681259434978, + "grad_norm": 1.1950535774230957, + "learning_rate": 9.972448206515656e-06, + "loss": 0.9106, + "step": 2577 + }, + { + "epoch": 0.13899072676299332, + "grad_norm": 0.843235194683075, + "learning_rate": 9.972425976400086e-06, + "loss": 0.8922, + "step": 2578 + }, + { + "epoch": 
0.13904464093163682, + "grad_norm": 0.8039982914924622, + "learning_rate": 9.972403737344763e-06, + "loss": 0.6855, + "step": 2579 + }, + { + "epoch": 0.13909855510028035, + "grad_norm": 0.7598289251327515, + "learning_rate": 9.97238148934973e-06, + "loss": 0.832, + "step": 2580 + }, + { + "epoch": 0.13915246926892388, + "grad_norm": 0.7986323237419128, + "learning_rate": 9.972359232415025e-06, + "loss": 0.7886, + "step": 2581 + }, + { + "epoch": 0.1392063834375674, + "grad_norm": 0.7465773820877075, + "learning_rate": 9.97233696654069e-06, + "loss": 0.7875, + "step": 2582 + }, + { + "epoch": 0.13926029760621092, + "grad_norm": 0.8853508830070496, + "learning_rate": 9.972314691726764e-06, + "loss": 0.9263, + "step": 2583 + }, + { + "epoch": 0.13931421177485442, + "grad_norm": 0.7267711162567139, + "learning_rate": 9.972292407973286e-06, + "loss": 0.78, + "step": 2584 + }, + { + "epoch": 0.13936812594349796, + "grad_norm": 0.7631322145462036, + "learning_rate": 9.972270115280295e-06, + "loss": 0.7726, + "step": 2585 + }, + { + "epoch": 0.13942204011214146, + "grad_norm": 0.8661205768585205, + "learning_rate": 9.972247813647836e-06, + "loss": 0.977, + "step": 2586 + }, + { + "epoch": 0.139475954280785, + "grad_norm": 0.7955568432807922, + "learning_rate": 9.972225503075943e-06, + "loss": 0.8481, + "step": 2587 + }, + { + "epoch": 0.13952986844942852, + "grad_norm": 0.8810243606567383, + "learning_rate": 9.972203183564661e-06, + "loss": 0.8938, + "step": 2588 + }, + { + "epoch": 0.13958378261807203, + "grad_norm": 0.783968985080719, + "learning_rate": 9.972180855114029e-06, + "loss": 0.7565, + "step": 2589 + }, + { + "epoch": 0.13963769678671556, + "grad_norm": 0.749191164970398, + "learning_rate": 9.972158517724084e-06, + "loss": 0.7283, + "step": 2590 + }, + { + "epoch": 0.13969161095535906, + "grad_norm": 0.7926847338676453, + "learning_rate": 9.972136171394871e-06, + "loss": 0.9073, + "step": 2591 + }, + { + "epoch": 0.1397455251240026, + "grad_norm": 
0.7621777653694153, + "learning_rate": 9.972113816126427e-06, + "loss": 0.7176, + "step": 2592 + }, + { + "epoch": 0.1397994392926461, + "grad_norm": 0.8856351375579834, + "learning_rate": 9.972091451918792e-06, + "loss": 0.7428, + "step": 2593 + }, + { + "epoch": 0.13985335346128963, + "grad_norm": 0.8027200698852539, + "learning_rate": 9.972069078772008e-06, + "loss": 0.7794, + "step": 2594 + }, + { + "epoch": 0.13990726762993314, + "grad_norm": 0.8776759505271912, + "learning_rate": 9.972046696686115e-06, + "loss": 0.9087, + "step": 2595 + }, + { + "epoch": 0.13996118179857667, + "grad_norm": 0.8979713320732117, + "learning_rate": 9.972024305661152e-06, + "loss": 0.8031, + "step": 2596 + }, + { + "epoch": 0.1400150959672202, + "grad_norm": 0.8233299851417542, + "learning_rate": 9.97200190569716e-06, + "loss": 0.8462, + "step": 2597 + }, + { + "epoch": 0.1400690101358637, + "grad_norm": 0.8777962327003479, + "learning_rate": 9.971979496794178e-06, + "loss": 0.8464, + "step": 2598 + }, + { + "epoch": 0.14012292430450723, + "grad_norm": 0.7185937166213989, + "learning_rate": 9.971957078952249e-06, + "loss": 0.7423, + "step": 2599 + }, + { + "epoch": 0.14017683847315074, + "grad_norm": 0.8226794600486755, + "learning_rate": 9.971934652171412e-06, + "loss": 0.8017, + "step": 2600 + }, + { + "epoch": 0.14023075264179427, + "grad_norm": 0.8021965622901917, + "learning_rate": 9.971912216451705e-06, + "loss": 0.8018, + "step": 2601 + }, + { + "epoch": 0.14028466681043777, + "grad_norm": 1.0516051054000854, + "learning_rate": 9.971889771793172e-06, + "loss": 0.8894, + "step": 2602 + }, + { + "epoch": 0.1403385809790813, + "grad_norm": 0.8212647438049316, + "learning_rate": 9.971867318195851e-06, + "loss": 0.826, + "step": 2603 + }, + { + "epoch": 0.1403924951477248, + "grad_norm": 0.8427513241767883, + "learning_rate": 9.971844855659783e-06, + "loss": 0.815, + "step": 2604 + }, + { + "epoch": 0.14044640931636834, + "grad_norm": 0.779569149017334, + "learning_rate": 
9.97182238418501e-06, + "loss": 0.797, + "step": 2605 + }, + { + "epoch": 0.14050032348501187, + "grad_norm": 0.7430607080459595, + "learning_rate": 9.97179990377157e-06, + "loss": 0.7925, + "step": 2606 + }, + { + "epoch": 0.14055423765365538, + "grad_norm": 0.8079801797866821, + "learning_rate": 9.971777414419503e-06, + "loss": 0.8259, + "step": 2607 + }, + { + "epoch": 0.1406081518222989, + "grad_norm": 0.794086754322052, + "learning_rate": 9.971754916128853e-06, + "loss": 0.833, + "step": 2608 + }, + { + "epoch": 0.14066206599094241, + "grad_norm": 0.8177362680435181, + "learning_rate": 9.971732408899657e-06, + "loss": 0.8543, + "step": 2609 + }, + { + "epoch": 0.14071598015958595, + "grad_norm": 0.8591805100440979, + "learning_rate": 9.971709892731956e-06, + "loss": 0.9323, + "step": 2610 + }, + { + "epoch": 0.14076989432822945, + "grad_norm": 0.8102341890335083, + "learning_rate": 9.971687367625793e-06, + "loss": 0.7679, + "step": 2611 + }, + { + "epoch": 0.14082380849687298, + "grad_norm": 0.8556869626045227, + "learning_rate": 9.971664833581205e-06, + "loss": 0.8458, + "step": 2612 + }, + { + "epoch": 0.14087772266551649, + "grad_norm": 0.7998070120811462, + "learning_rate": 9.971642290598235e-06, + "loss": 0.7663, + "step": 2613 + }, + { + "epoch": 0.14093163683416002, + "grad_norm": 0.8800550103187561, + "learning_rate": 9.971619738676923e-06, + "loss": 0.8653, + "step": 2614 + }, + { + "epoch": 0.14098555100280355, + "grad_norm": 0.8199629187583923, + "learning_rate": 9.971597177817308e-06, + "loss": 0.8804, + "step": 2615 + }, + { + "epoch": 0.14103946517144705, + "grad_norm": 0.8774363398551941, + "learning_rate": 9.971574608019432e-06, + "loss": 0.8468, + "step": 2616 + }, + { + "epoch": 0.14109337934009059, + "grad_norm": 0.7911790013313293, + "learning_rate": 9.971552029283335e-06, + "loss": 0.7841, + "step": 2617 + }, + { + "epoch": 0.1411472935087341, + "grad_norm": 0.8152750134468079, + "learning_rate": 9.97152944160906e-06, + "loss": 0.7753, + 
"step": 2618 + }, + { + "epoch": 0.14120120767737762, + "grad_norm": 0.8709943890571594, + "learning_rate": 9.971506844996645e-06, + "loss": 0.7259, + "step": 2619 + }, + { + "epoch": 0.14125512184602113, + "grad_norm": 1.1131712198257446, + "learning_rate": 9.97148423944613e-06, + "loss": 0.9422, + "step": 2620 + }, + { + "epoch": 0.14130903601466466, + "grad_norm": 0.8992665410041809, + "learning_rate": 9.971461624957557e-06, + "loss": 0.733, + "step": 2621 + }, + { + "epoch": 0.14136295018330816, + "grad_norm": 0.7548032402992249, + "learning_rate": 9.971439001530967e-06, + "loss": 0.7733, + "step": 2622 + }, + { + "epoch": 0.1414168643519517, + "grad_norm": 0.7988988161087036, + "learning_rate": 9.9714163691664e-06, + "loss": 0.8218, + "step": 2623 + }, + { + "epoch": 0.14147077852059523, + "grad_norm": 0.7697865962982178, + "learning_rate": 9.971393727863899e-06, + "loss": 0.7882, + "step": 2624 + }, + { + "epoch": 0.14152469268923873, + "grad_norm": 0.993664026260376, + "learning_rate": 9.9713710776235e-06, + "loss": 0.8331, + "step": 2625 + }, + { + "epoch": 0.14157860685788226, + "grad_norm": 1.0097055435180664, + "learning_rate": 9.971348418445245e-06, + "loss": 0.8959, + "step": 2626 + }, + { + "epoch": 0.14163252102652577, + "grad_norm": 0.7682481408119202, + "learning_rate": 9.97132575032918e-06, + "loss": 0.7425, + "step": 2627 + }, + { + "epoch": 0.1416864351951693, + "grad_norm": 0.790695309638977, + "learning_rate": 9.971303073275338e-06, + "loss": 0.6887, + "step": 2628 + }, + { + "epoch": 0.1417403493638128, + "grad_norm": 0.9672498106956482, + "learning_rate": 9.971280387283766e-06, + "loss": 0.8617, + "step": 2629 + }, + { + "epoch": 0.14179426353245633, + "grad_norm": 0.8538743853569031, + "learning_rate": 9.971257692354502e-06, + "loss": 0.7826, + "step": 2630 + }, + { + "epoch": 0.14184817770109984, + "grad_norm": 0.7527078986167908, + "learning_rate": 9.971234988487587e-06, + "loss": 0.7542, + "step": 2631 + }, + { + "epoch": 
0.14190209186974337, + "grad_norm": 0.9390487670898438, + "learning_rate": 9.97121227568306e-06, + "loss": 0.8415, + "step": 2632 + }, + { + "epoch": 0.1419560060383869, + "grad_norm": 0.8717443346977234, + "learning_rate": 9.971189553940966e-06, + "loss": 0.7969, + "step": 2633 + }, + { + "epoch": 0.1420099202070304, + "grad_norm": 0.7848197817802429, + "learning_rate": 9.971166823261343e-06, + "loss": 0.8049, + "step": 2634 + }, + { + "epoch": 0.14206383437567394, + "grad_norm": 0.8002238273620605, + "learning_rate": 9.971144083644233e-06, + "loss": 0.8681, + "step": 2635 + }, + { + "epoch": 0.14211774854431744, + "grad_norm": 0.7699506282806396, + "learning_rate": 9.971121335089676e-06, + "loss": 0.7815, + "step": 2636 + }, + { + "epoch": 0.14217166271296097, + "grad_norm": 0.9187048673629761, + "learning_rate": 9.971098577597713e-06, + "loss": 0.8611, + "step": 2637 + }, + { + "epoch": 0.14222557688160448, + "grad_norm": 0.802859365940094, + "learning_rate": 9.971075811168385e-06, + "loss": 0.7991, + "step": 2638 + }, + { + "epoch": 0.142279491050248, + "grad_norm": 1.0536410808563232, + "learning_rate": 9.971053035801735e-06, + "loss": 0.9726, + "step": 2639 + }, + { + "epoch": 0.1423334052188915, + "grad_norm": 0.8278898000717163, + "learning_rate": 9.9710302514978e-06, + "loss": 0.8636, + "step": 2640 + }, + { + "epoch": 0.14238731938753504, + "grad_norm": 0.7639529705047607, + "learning_rate": 9.971007458256623e-06, + "loss": 0.7849, + "step": 2641 + }, + { + "epoch": 0.14244123355617858, + "grad_norm": 0.9108867049217224, + "learning_rate": 9.970984656078246e-06, + "loss": 0.891, + "step": 2642 + }, + { + "epoch": 0.14249514772482208, + "grad_norm": 0.8182162046432495, + "learning_rate": 9.97096184496271e-06, + "loss": 0.7975, + "step": 2643 + }, + { + "epoch": 0.1425490618934656, + "grad_norm": 0.848781168460846, + "learning_rate": 9.970939024910053e-06, + "loss": 0.8677, + "step": 2644 + }, + { + "epoch": 0.14260297606210912, + "grad_norm": 
0.8322750926017761, + "learning_rate": 9.97091619592032e-06, + "loss": 0.776, + "step": 2645 + }, + { + "epoch": 0.14265689023075265, + "grad_norm": 0.8054049611091614, + "learning_rate": 9.970893357993548e-06, + "loss": 0.804, + "step": 2646 + }, + { + "epoch": 0.14271080439939615, + "grad_norm": 0.8162119388580322, + "learning_rate": 9.970870511129782e-06, + "loss": 0.7856, + "step": 2647 + }, + { + "epoch": 0.14276471856803968, + "grad_norm": 0.73929363489151, + "learning_rate": 9.97084765532906e-06, + "loss": 0.7687, + "step": 2648 + }, + { + "epoch": 0.1428186327366832, + "grad_norm": 0.866688072681427, + "learning_rate": 9.970824790591425e-06, + "loss": 0.8751, + "step": 2649 + }, + { + "epoch": 0.14287254690532672, + "grad_norm": 0.7772359251976013, + "learning_rate": 9.970801916916917e-06, + "loss": 0.7232, + "step": 2650 + }, + { + "epoch": 0.14292646107397025, + "grad_norm": 0.8912346363067627, + "learning_rate": 9.970779034305578e-06, + "loss": 0.8393, + "step": 2651 + }, + { + "epoch": 0.14298037524261376, + "grad_norm": 0.7827256917953491, + "learning_rate": 9.970756142757448e-06, + "loss": 0.7924, + "step": 2652 + }, + { + "epoch": 0.1430342894112573, + "grad_norm": 0.7557843923568726, + "learning_rate": 9.97073324227257e-06, + "loss": 0.8032, + "step": 2653 + }, + { + "epoch": 0.1430882035799008, + "grad_norm": 0.7939576506614685, + "learning_rate": 9.970710332850983e-06, + "loss": 0.7251, + "step": 2654 + }, + { + "epoch": 0.14314211774854432, + "grad_norm": 0.8175502419471741, + "learning_rate": 9.97068741449273e-06, + "loss": 0.7685, + "step": 2655 + }, + { + "epoch": 0.14319603191718783, + "grad_norm": 0.7537406086921692, + "learning_rate": 9.970664487197851e-06, + "loss": 0.7354, + "step": 2656 + }, + { + "epoch": 0.14324994608583136, + "grad_norm": 0.8045641779899597, + "learning_rate": 9.970641550966388e-06, + "loss": 0.7581, + "step": 2657 + }, + { + "epoch": 0.14330386025447486, + "grad_norm": 0.69786137342453, + "learning_rate": 
9.97061860579838e-06, + "loss": 0.6923, + "step": 2658 + }, + { + "epoch": 0.1433577744231184, + "grad_norm": 0.7913051843643188, + "learning_rate": 9.970595651693874e-06, + "loss": 0.7579, + "step": 2659 + }, + { + "epoch": 0.14341168859176193, + "grad_norm": 0.7890749573707581, + "learning_rate": 9.970572688652905e-06, + "loss": 0.7843, + "step": 2660 + }, + { + "epoch": 0.14346560276040543, + "grad_norm": 0.913074791431427, + "learning_rate": 9.970549716675516e-06, + "loss": 0.8318, + "step": 2661 + }, + { + "epoch": 0.14351951692904896, + "grad_norm": 0.757522463798523, + "learning_rate": 9.97052673576175e-06, + "loss": 0.6803, + "step": 2662 + }, + { + "epoch": 0.14357343109769247, + "grad_norm": 0.9279198050498962, + "learning_rate": 9.970503745911645e-06, + "loss": 0.8591, + "step": 2663 + }, + { + "epoch": 0.143627345266336, + "grad_norm": 0.8218236565589905, + "learning_rate": 9.97048074712525e-06, + "loss": 0.8253, + "step": 2664 + }, + { + "epoch": 0.1436812594349795, + "grad_norm": 0.7562058568000793, + "learning_rate": 9.970457739402596e-06, + "loss": 0.8114, + "step": 2665 + }, + { + "epoch": 0.14373517360362303, + "grad_norm": 0.7626449465751648, + "learning_rate": 9.970434722743732e-06, + "loss": 0.7932, + "step": 2666 + }, + { + "epoch": 0.14378908777226654, + "grad_norm": 0.8287700414657593, + "learning_rate": 9.970411697148696e-06, + "loss": 0.754, + "step": 2667 + }, + { + "epoch": 0.14384300194091007, + "grad_norm": 1.0403661727905273, + "learning_rate": 9.97038866261753e-06, + "loss": 0.9062, + "step": 2668 + }, + { + "epoch": 0.1438969161095536, + "grad_norm": 0.8278779983520508, + "learning_rate": 9.970365619150276e-06, + "loss": 0.9181, + "step": 2669 + }, + { + "epoch": 0.1439508302781971, + "grad_norm": 0.950964629650116, + "learning_rate": 9.970342566746973e-06, + "loss": 0.9235, + "step": 2670 + }, + { + "epoch": 0.14400474444684064, + "grad_norm": 0.9529917240142822, + "learning_rate": 9.970319505407667e-06, + "loss": 0.7929, + "step": 
2671 + }, + { + "epoch": 0.14405865861548414, + "grad_norm": 0.7601970434188843, + "learning_rate": 9.970296435132395e-06, + "loss": 0.7133, + "step": 2672 + }, + { + "epoch": 0.14411257278412767, + "grad_norm": 0.8906385898590088, + "learning_rate": 9.970273355921201e-06, + "loss": 0.8679, + "step": 2673 + }, + { + "epoch": 0.14416648695277118, + "grad_norm": 0.8250144720077515, + "learning_rate": 9.970250267774126e-06, + "loss": 0.7871, + "step": 2674 + }, + { + "epoch": 0.1442204011214147, + "grad_norm": 0.8182716965675354, + "learning_rate": 9.970227170691212e-06, + "loss": 0.7391, + "step": 2675 + }, + { + "epoch": 0.1442743152900582, + "grad_norm": 0.8261950016021729, + "learning_rate": 9.970204064672498e-06, + "loss": 0.8914, + "step": 2676 + }, + { + "epoch": 0.14432822945870175, + "grad_norm": 1.248270869255066, + "learning_rate": 9.97018094971803e-06, + "loss": 0.7834, + "step": 2677 + }, + { + "epoch": 0.14438214362734528, + "grad_norm": 0.7821226119995117, + "learning_rate": 9.970157825827844e-06, + "loss": 0.7436, + "step": 2678 + }, + { + "epoch": 0.14443605779598878, + "grad_norm": 0.9708791375160217, + "learning_rate": 9.970134693001987e-06, + "loss": 0.9038, + "step": 2679 + }, + { + "epoch": 0.1444899719646323, + "grad_norm": 0.8178976774215698, + "learning_rate": 9.970111551240499e-06, + "loss": 0.8748, + "step": 2680 + }, + { + "epoch": 0.14454388613327582, + "grad_norm": 0.8477594256401062, + "learning_rate": 9.970088400543417e-06, + "loss": 0.8169, + "step": 2681 + }, + { + "epoch": 0.14459780030191935, + "grad_norm": 0.9478195309638977, + "learning_rate": 9.970065240910789e-06, + "loss": 0.789, + "step": 2682 + }, + { + "epoch": 0.14465171447056285, + "grad_norm": 0.9151026010513306, + "learning_rate": 9.970042072342652e-06, + "loss": 0.8804, + "step": 2683 + }, + { + "epoch": 0.14470562863920639, + "grad_norm": 0.8062365651130676, + "learning_rate": 9.970018894839052e-06, + "loss": 0.8329, + "step": 2684 + }, + { + "epoch": 
0.1447595428078499, + "grad_norm": 0.8029241561889648, + "learning_rate": 9.969995708400028e-06, + "loss": 0.7053, + "step": 2685 + }, + { + "epoch": 0.14481345697649342, + "grad_norm": 0.8023892641067505, + "learning_rate": 9.969972513025621e-06, + "loss": 0.7921, + "step": 2686 + }, + { + "epoch": 0.14486737114513695, + "grad_norm": 0.9224045276641846, + "learning_rate": 9.969949308715874e-06, + "loss": 0.7416, + "step": 2687 + }, + { + "epoch": 0.14492128531378046, + "grad_norm": 0.7767837047576904, + "learning_rate": 9.969926095470829e-06, + "loss": 0.7844, + "step": 2688 + }, + { + "epoch": 0.144975199482424, + "grad_norm": 0.7804312109947205, + "learning_rate": 9.969902873290526e-06, + "loss": 0.712, + "step": 2689 + }, + { + "epoch": 0.1450291136510675, + "grad_norm": 0.9595988988876343, + "learning_rate": 9.969879642175009e-06, + "loss": 0.7686, + "step": 2690 + }, + { + "epoch": 0.14508302781971102, + "grad_norm": 1.0414133071899414, + "learning_rate": 9.969856402124318e-06, + "loss": 0.8833, + "step": 2691 + }, + { + "epoch": 0.14513694198835453, + "grad_norm": 0.9321674108505249, + "learning_rate": 9.969833153138498e-06, + "loss": 0.7576, + "step": 2692 + }, + { + "epoch": 0.14519085615699806, + "grad_norm": 0.7715985774993896, + "learning_rate": 9.969809895217586e-06, + "loss": 0.7371, + "step": 2693 + }, + { + "epoch": 0.1452447703256416, + "grad_norm": 1.0257316827774048, + "learning_rate": 9.969786628361625e-06, + "loss": 0.8394, + "step": 2694 + }, + { + "epoch": 0.1452986844942851, + "grad_norm": 0.7823453545570374, + "learning_rate": 9.969763352570659e-06, + "loss": 0.7974, + "step": 2695 + }, + { + "epoch": 0.14535259866292863, + "grad_norm": 0.8257505893707275, + "learning_rate": 9.969740067844728e-06, + "loss": 0.7948, + "step": 2696 + }, + { + "epoch": 0.14540651283157213, + "grad_norm": 0.6493780016899109, + "learning_rate": 9.969716774183878e-06, + "loss": 0.6531, + "step": 2697 + }, + { + "epoch": 0.14546042700021566, + "grad_norm": 
0.8953896760940552, + "learning_rate": 9.969693471588144e-06, + "loss": 0.7414, + "step": 2698 + }, + { + "epoch": 0.14551434116885917, + "grad_norm": 0.7177074551582336, + "learning_rate": 9.969670160057572e-06, + "loss": 0.65, + "step": 2699 + }, + { + "epoch": 0.1455682553375027, + "grad_norm": 0.8214414715766907, + "learning_rate": 9.969646839592204e-06, + "loss": 0.7605, + "step": 2700 + }, + { + "epoch": 0.1456221695061462, + "grad_norm": 0.8062289953231812, + "learning_rate": 9.969623510192081e-06, + "loss": 0.8275, + "step": 2701 + }, + { + "epoch": 0.14567608367478974, + "grad_norm": 0.9606921076774597, + "learning_rate": 9.969600171857246e-06, + "loss": 0.8472, + "step": 2702 + }, + { + "epoch": 0.14572999784343327, + "grad_norm": 1.0146433115005493, + "learning_rate": 9.96957682458774e-06, + "loss": 0.8398, + "step": 2703 + }, + { + "epoch": 0.14578391201207677, + "grad_norm": 0.8463965058326721, + "learning_rate": 9.969553468383604e-06, + "loss": 0.7563, + "step": 2704 + }, + { + "epoch": 0.1458378261807203, + "grad_norm": 0.8125115633010864, + "learning_rate": 9.96953010324488e-06, + "loss": 0.8042, + "step": 2705 + }, + { + "epoch": 0.1458917403493638, + "grad_norm": 0.9350455403327942, + "learning_rate": 9.969506729171612e-06, + "loss": 0.9067, + "step": 2706 + }, + { + "epoch": 0.14594565451800734, + "grad_norm": 0.9979991316795349, + "learning_rate": 9.969483346163843e-06, + "loss": 0.778, + "step": 2707 + }, + { + "epoch": 0.14599956868665084, + "grad_norm": 0.8236498236656189, + "learning_rate": 9.969459954221612e-06, + "loss": 0.9011, + "step": 2708 + }, + { + "epoch": 0.14605348285529438, + "grad_norm": 0.6965605616569519, + "learning_rate": 9.969436553344962e-06, + "loss": 0.6657, + "step": 2709 + }, + { + "epoch": 0.14610739702393788, + "grad_norm": 0.810246467590332, + "learning_rate": 9.969413143533936e-06, + "loss": 0.8099, + "step": 2710 + }, + { + "epoch": 0.1461613111925814, + "grad_norm": 1.1437804698944092, + "learning_rate": 
9.969389724788574e-06, + "loss": 0.7457, + "step": 2711 + }, + { + "epoch": 0.14621522536122494, + "grad_norm": 0.8632565140724182, + "learning_rate": 9.96936629710892e-06, + "loss": 0.8549, + "step": 2712 + }, + { + "epoch": 0.14626913952986845, + "grad_norm": 0.9616119265556335, + "learning_rate": 9.969342860495018e-06, + "loss": 0.6219, + "step": 2713 + }, + { + "epoch": 0.14632305369851198, + "grad_norm": 0.9943077564239502, + "learning_rate": 9.969319414946906e-06, + "loss": 0.8676, + "step": 2714 + }, + { + "epoch": 0.14637696786715548, + "grad_norm": 0.861070454120636, + "learning_rate": 9.969295960464627e-06, + "loss": 0.7235, + "step": 2715 + }, + { + "epoch": 0.14643088203579901, + "grad_norm": 0.9375396370887756, + "learning_rate": 9.969272497048225e-06, + "loss": 0.9169, + "step": 2716 + }, + { + "epoch": 0.14648479620444252, + "grad_norm": 0.8180664777755737, + "learning_rate": 9.969249024697741e-06, + "loss": 0.8109, + "step": 2717 + }, + { + "epoch": 0.14653871037308605, + "grad_norm": 0.8574398159980774, + "learning_rate": 9.969225543413218e-06, + "loss": 0.767, + "step": 2718 + }, + { + "epoch": 0.14659262454172955, + "grad_norm": 1.0249319076538086, + "learning_rate": 9.969202053194697e-06, + "loss": 0.902, + "step": 2719 + }, + { + "epoch": 0.1466465387103731, + "grad_norm": 0.8045467734336853, + "learning_rate": 9.96917855404222e-06, + "loss": 0.7797, + "step": 2720 + }, + { + "epoch": 0.14670045287901662, + "grad_norm": 0.880533754825592, + "learning_rate": 9.969155045955831e-06, + "loss": 0.8071, + "step": 2721 + }, + { + "epoch": 0.14675436704766012, + "grad_norm": 0.8733983635902405, + "learning_rate": 9.969131528935572e-06, + "loss": 0.8309, + "step": 2722 + }, + { + "epoch": 0.14680828121630365, + "grad_norm": 0.8205264210700989, + "learning_rate": 9.969108002981484e-06, + "loss": 0.8126, + "step": 2723 + }, + { + "epoch": 0.14686219538494716, + "grad_norm": 0.8250916600227356, + "learning_rate": 9.96908446809361e-06, + "loss": 0.7488, + 
"step": 2724 + }, + { + "epoch": 0.1469161095535907, + "grad_norm": 0.8082099556922913, + "learning_rate": 9.969060924271994e-06, + "loss": 0.8039, + "step": 2725 + }, + { + "epoch": 0.1469700237222342, + "grad_norm": 0.8376840353012085, + "learning_rate": 9.969037371516674e-06, + "loss": 0.7603, + "step": 2726 + }, + { + "epoch": 0.14702393789087773, + "grad_norm": 1.2106066942214966, + "learning_rate": 9.969013809827697e-06, + "loss": 0.8187, + "step": 2727 + }, + { + "epoch": 0.14707785205952123, + "grad_norm": 0.8828561305999756, + "learning_rate": 9.968990239205103e-06, + "loss": 0.7249, + "step": 2728 + }, + { + "epoch": 0.14713176622816476, + "grad_norm": 0.8182427883148193, + "learning_rate": 9.968966659648935e-06, + "loss": 0.8353, + "step": 2729 + }, + { + "epoch": 0.1471856803968083, + "grad_norm": 0.8091077208518982, + "learning_rate": 9.968943071159234e-06, + "loss": 0.8261, + "step": 2730 + }, + { + "epoch": 0.1472395945654518, + "grad_norm": 0.9515360593795776, + "learning_rate": 9.968919473736043e-06, + "loss": 0.9099, + "step": 2731 + }, + { + "epoch": 0.14729350873409533, + "grad_norm": 0.7404700517654419, + "learning_rate": 9.968895867379407e-06, + "loss": 0.7793, + "step": 2732 + }, + { + "epoch": 0.14734742290273883, + "grad_norm": 0.7887243032455444, + "learning_rate": 9.968872252089365e-06, + "loss": 0.8749, + "step": 2733 + }, + { + "epoch": 0.14740133707138237, + "grad_norm": 1.1335293054580688, + "learning_rate": 9.968848627865962e-06, + "loss": 0.8428, + "step": 2734 + }, + { + "epoch": 0.14745525124002587, + "grad_norm": 0.787325382232666, + "learning_rate": 9.968824994709238e-06, + "loss": 0.8026, + "step": 2735 + }, + { + "epoch": 0.1475091654086694, + "grad_norm": 0.8006013035774231, + "learning_rate": 9.968801352619238e-06, + "loss": 0.9083, + "step": 2736 + }, + { + "epoch": 0.1475630795773129, + "grad_norm": 0.8923180103302002, + "learning_rate": 9.968777701596002e-06, + "loss": 0.8628, + "step": 2737 + }, + { + "epoch": 
0.14761699374595644, + "grad_norm": 0.798041582107544, + "learning_rate": 9.968754041639573e-06, + "loss": 0.7519, + "step": 2738 + }, + { + "epoch": 0.14767090791459997, + "grad_norm": 0.8984145522117615, + "learning_rate": 9.968730372749996e-06, + "loss": 0.7624, + "step": 2739 + }, + { + "epoch": 0.14772482208324347, + "grad_norm": 0.8182528018951416, + "learning_rate": 9.968706694927312e-06, + "loss": 0.8442, + "step": 2740 + }, + { + "epoch": 0.147778736251887, + "grad_norm": 0.8047756552696228, + "learning_rate": 9.968683008171562e-06, + "loss": 0.847, + "step": 2741 + }, + { + "epoch": 0.1478326504205305, + "grad_norm": 0.7935258150100708, + "learning_rate": 9.968659312482792e-06, + "loss": 0.8072, + "step": 2742 + }, + { + "epoch": 0.14788656458917404, + "grad_norm": 0.8043146729469299, + "learning_rate": 9.968635607861042e-06, + "loss": 0.7769, + "step": 2743 + }, + { + "epoch": 0.14794047875781755, + "grad_norm": 0.7826459407806396, + "learning_rate": 9.968611894306356e-06, + "loss": 0.8418, + "step": 2744 + }, + { + "epoch": 0.14799439292646108, + "grad_norm": 0.9293491244316101, + "learning_rate": 9.968588171818775e-06, + "loss": 0.8704, + "step": 2745 + }, + { + "epoch": 0.14804830709510458, + "grad_norm": 0.8281397223472595, + "learning_rate": 9.968564440398343e-06, + "loss": 0.9288, + "step": 2746 + }, + { + "epoch": 0.1481022212637481, + "grad_norm": 0.8558036684989929, + "learning_rate": 9.968540700045101e-06, + "loss": 0.8406, + "step": 2747 + }, + { + "epoch": 0.14815613543239164, + "grad_norm": 0.8167025446891785, + "learning_rate": 9.968516950759096e-06, + "loss": 0.8268, + "step": 2748 + }, + { + "epoch": 0.14821004960103515, + "grad_norm": 0.8612670302391052, + "learning_rate": 9.968493192540364e-06, + "loss": 0.8265, + "step": 2749 + }, + { + "epoch": 0.14826396376967868, + "grad_norm": 0.9208493232727051, + "learning_rate": 9.968469425388953e-06, + "loss": 0.8555, + "step": 2750 + }, + { + "epoch": 0.14831787793832218, + "grad_norm": 
0.756591260433197, + "learning_rate": 9.968445649304904e-06, + "loss": 0.7655, + "step": 2751 + }, + { + "epoch": 0.14837179210696572, + "grad_norm": 0.8566586375236511, + "learning_rate": 9.96842186428826e-06, + "loss": 0.8125, + "step": 2752 + }, + { + "epoch": 0.14842570627560922, + "grad_norm": 0.7984357476234436, + "learning_rate": 9.968398070339063e-06, + "loss": 0.7307, + "step": 2753 + }, + { + "epoch": 0.14847962044425275, + "grad_norm": 0.8943261504173279, + "learning_rate": 9.968374267457356e-06, + "loss": 0.757, + "step": 2754 + }, + { + "epoch": 0.14853353461289626, + "grad_norm": 0.9466004967689514, + "learning_rate": 9.968350455643184e-06, + "loss": 0.8271, + "step": 2755 + }, + { + "epoch": 0.1485874487815398, + "grad_norm": 0.7604812383651733, + "learning_rate": 9.968326634896585e-06, + "loss": 0.7654, + "step": 2756 + }, + { + "epoch": 0.14864136295018332, + "grad_norm": 0.7803215384483337, + "learning_rate": 9.968302805217609e-06, + "loss": 0.7691, + "step": 2757 + }, + { + "epoch": 0.14869527711882682, + "grad_norm": 0.8579596281051636, + "learning_rate": 9.96827896660629e-06, + "loss": 0.859, + "step": 2758 + }, + { + "epoch": 0.14874919128747036, + "grad_norm": 0.8205640316009521, + "learning_rate": 9.968255119062679e-06, + "loss": 0.8588, + "step": 2759 + }, + { + "epoch": 0.14880310545611386, + "grad_norm": 0.8601415753364563, + "learning_rate": 9.968231262586814e-06, + "loss": 0.8399, + "step": 2760 + }, + { + "epoch": 0.1488570196247574, + "grad_norm": 0.8827456831932068, + "learning_rate": 9.96820739717874e-06, + "loss": 0.8413, + "step": 2761 + }, + { + "epoch": 0.1489109337934009, + "grad_norm": 0.7422264218330383, + "learning_rate": 9.968183522838499e-06, + "loss": 0.7451, + "step": 2762 + }, + { + "epoch": 0.14896484796204443, + "grad_norm": 0.9764127135276794, + "learning_rate": 9.968159639566133e-06, + "loss": 0.8436, + "step": 2763 + }, + { + "epoch": 0.14901876213068793, + "grad_norm": 0.7435232400894165, + "learning_rate": 
9.968135747361687e-06, + "loss": 0.7553, + "step": 2764 + }, + { + "epoch": 0.14907267629933146, + "grad_norm": 0.7399751543998718, + "learning_rate": 9.968111846225202e-06, + "loss": 0.7695, + "step": 2765 + }, + { + "epoch": 0.149126590467975, + "grad_norm": 0.882901668548584, + "learning_rate": 9.968087936156722e-06, + "loss": 0.8418, + "step": 2766 + }, + { + "epoch": 0.1491805046366185, + "grad_norm": 0.840501606464386, + "learning_rate": 9.968064017156292e-06, + "loss": 0.83, + "step": 2767 + }, + { + "epoch": 0.14923441880526203, + "grad_norm": 0.9809413552284241, + "learning_rate": 9.96804008922395e-06, + "loss": 0.8029, + "step": 2768 + }, + { + "epoch": 0.14928833297390554, + "grad_norm": 0.7534085512161255, + "learning_rate": 9.968016152359744e-06, + "loss": 0.7201, + "step": 2769 + }, + { + "epoch": 0.14934224714254907, + "grad_norm": 0.813582718372345, + "learning_rate": 9.967992206563714e-06, + "loss": 0.8533, + "step": 2770 + }, + { + "epoch": 0.14939616131119257, + "grad_norm": 0.9827276468276978, + "learning_rate": 9.967968251835905e-06, + "loss": 0.8097, + "step": 2771 + }, + { + "epoch": 0.1494500754798361, + "grad_norm": 0.828959047794342, + "learning_rate": 9.967944288176359e-06, + "loss": 0.859, + "step": 2772 + }, + { + "epoch": 0.1495039896484796, + "grad_norm": 0.8123818039894104, + "learning_rate": 9.967920315585118e-06, + "loss": 0.7044, + "step": 2773 + }, + { + "epoch": 0.14955790381712314, + "grad_norm": 0.7503589987754822, + "learning_rate": 9.967896334062228e-06, + "loss": 0.7255, + "step": 2774 + }, + { + "epoch": 0.14961181798576667, + "grad_norm": 0.7414034605026245, + "learning_rate": 9.96787234360773e-06, + "loss": 0.7599, + "step": 2775 + }, + { + "epoch": 0.14966573215441017, + "grad_norm": 0.7467254400253296, + "learning_rate": 9.967848344221667e-06, + "loss": 0.6835, + "step": 2776 + }, + { + "epoch": 0.1497196463230537, + "grad_norm": 0.8653414249420166, + "learning_rate": 9.967824335904082e-06, + "loss": 0.8205, + "step": 
2777 + }, + { + "epoch": 0.1497735604916972, + "grad_norm": 0.9113380312919617, + "learning_rate": 9.96780031865502e-06, + "loss": 0.8758, + "step": 2778 + }, + { + "epoch": 0.14982747466034074, + "grad_norm": 0.8330965042114258, + "learning_rate": 9.967776292474523e-06, + "loss": 0.8696, + "step": 2779 + }, + { + "epoch": 0.14988138882898425, + "grad_norm": 0.9087555408477783, + "learning_rate": 9.967752257362633e-06, + "loss": 0.8381, + "step": 2780 + }, + { + "epoch": 0.14993530299762778, + "grad_norm": 0.856777548789978, + "learning_rate": 9.967728213319394e-06, + "loss": 0.8365, + "step": 2781 + }, + { + "epoch": 0.14998921716627128, + "grad_norm": 0.8314496874809265, + "learning_rate": 9.967704160344852e-06, + "loss": 0.7403, + "step": 2782 + }, + { + "epoch": 0.15004313133491481, + "grad_norm": 0.8357448577880859, + "learning_rate": 9.967680098439047e-06, + "loss": 0.8256, + "step": 2783 + }, + { + "epoch": 0.15009704550355835, + "grad_norm": 0.8366092443466187, + "learning_rate": 9.967656027602023e-06, + "loss": 0.8221, + "step": 2784 + }, + { + "epoch": 0.15015095967220185, + "grad_norm": 0.7944943904876709, + "learning_rate": 9.967631947833823e-06, + "loss": 0.813, + "step": 2785 + }, + { + "epoch": 0.15020487384084538, + "grad_norm": 0.8407523036003113, + "learning_rate": 9.967607859134492e-06, + "loss": 0.8237, + "step": 2786 + }, + { + "epoch": 0.1502587880094889, + "grad_norm": 0.7879778146743774, + "learning_rate": 9.967583761504071e-06, + "loss": 0.777, + "step": 2787 + }, + { + "epoch": 0.15031270217813242, + "grad_norm": 0.8307899832725525, + "learning_rate": 9.967559654942604e-06, + "loss": 0.8394, + "step": 2788 + }, + { + "epoch": 0.15036661634677592, + "grad_norm": 0.8068673610687256, + "learning_rate": 9.967535539450135e-06, + "loss": 0.8435, + "step": 2789 + }, + { + "epoch": 0.15042053051541945, + "grad_norm": 0.8473932147026062, + "learning_rate": 9.967511415026709e-06, + "loss": 0.8698, + "step": 2790 + }, + { + "epoch": 
0.15047444468406296, + "grad_norm": 0.8352688550949097, + "learning_rate": 9.967487281672365e-06, + "loss": 0.8617, + "step": 2791 + }, + { + "epoch": 0.1505283588527065, + "grad_norm": 0.7729620337486267, + "learning_rate": 9.96746313938715e-06, + "loss": 0.779, + "step": 2792 + }, + { + "epoch": 0.15058227302135002, + "grad_norm": 0.8704085946083069, + "learning_rate": 9.967438988171106e-06, + "loss": 0.833, + "step": 2793 + }, + { + "epoch": 0.15063618718999353, + "grad_norm": 0.7538182735443115, + "learning_rate": 9.967414828024276e-06, + "loss": 0.7479, + "step": 2794 + }, + { + "epoch": 0.15069010135863706, + "grad_norm": 0.7672195434570312, + "learning_rate": 9.967390658946704e-06, + "loss": 0.7778, + "step": 2795 + }, + { + "epoch": 0.15074401552728056, + "grad_norm": 0.8245819211006165, + "learning_rate": 9.967366480938435e-06, + "loss": 0.6898, + "step": 2796 + }, + { + "epoch": 0.1507979296959241, + "grad_norm": 0.8197571635246277, + "learning_rate": 9.967342293999512e-06, + "loss": 0.8714, + "step": 2797 + }, + { + "epoch": 0.1508518438645676, + "grad_norm": 0.8135389685630798, + "learning_rate": 9.967318098129974e-06, + "loss": 0.8906, + "step": 2798 + }, + { + "epoch": 0.15090575803321113, + "grad_norm": 0.7287562489509583, + "learning_rate": 9.96729389332987e-06, + "loss": 0.7834, + "step": 2799 + }, + { + "epoch": 0.15095967220185466, + "grad_norm": 0.8642309904098511, + "learning_rate": 9.967269679599242e-06, + "loss": 0.7912, + "step": 2800 + }, + { + "epoch": 0.15101358637049817, + "grad_norm": 0.886060893535614, + "learning_rate": 9.967245456938132e-06, + "loss": 0.8614, + "step": 2801 + }, + { + "epoch": 0.1510675005391417, + "grad_norm": 0.8505488038063049, + "learning_rate": 9.967221225346584e-06, + "loss": 0.8323, + "step": 2802 + }, + { + "epoch": 0.1511214147077852, + "grad_norm": 0.8862965703010559, + "learning_rate": 9.967196984824644e-06, + "loss": 0.8292, + "step": 2803 + }, + { + "epoch": 0.15117532887642873, + "grad_norm": 
0.8016111254692078, + "learning_rate": 9.967172735372353e-06, + "loss": 0.643, + "step": 2804 + }, + { + "epoch": 0.15122924304507224, + "grad_norm": 0.7599527835845947, + "learning_rate": 9.967148476989755e-06, + "loss": 0.8166, + "step": 2805 + }, + { + "epoch": 0.15128315721371577, + "grad_norm": 0.9574166536331177, + "learning_rate": 9.967124209676894e-06, + "loss": 0.8867, + "step": 2806 + }, + { + "epoch": 0.15133707138235927, + "grad_norm": 0.8384936451911926, + "learning_rate": 9.967099933433815e-06, + "loss": 0.9021, + "step": 2807 + }, + { + "epoch": 0.1513909855510028, + "grad_norm": 0.7779715061187744, + "learning_rate": 9.967075648260559e-06, + "loss": 0.7672, + "step": 2808 + }, + { + "epoch": 0.15144489971964634, + "grad_norm": 0.7783359885215759, + "learning_rate": 9.96705135415717e-06, + "loss": 0.8012, + "step": 2809 + }, + { + "epoch": 0.15149881388828984, + "grad_norm": 0.9124150276184082, + "learning_rate": 9.967027051123695e-06, + "loss": 0.8803, + "step": 2810 + }, + { + "epoch": 0.15155272805693337, + "grad_norm": 0.8135334849357605, + "learning_rate": 9.967002739160173e-06, + "loss": 0.7764, + "step": 2811 + }, + { + "epoch": 0.15160664222557688, + "grad_norm": 0.8082837462425232, + "learning_rate": 9.966978418266651e-06, + "loss": 0.8552, + "step": 2812 + }, + { + "epoch": 0.1516605563942204, + "grad_norm": 0.7978013753890991, + "learning_rate": 9.966954088443171e-06, + "loss": 0.7321, + "step": 2813 + }, + { + "epoch": 0.1517144705628639, + "grad_norm": 0.7845378518104553, + "learning_rate": 9.966929749689778e-06, + "loss": 0.7694, + "step": 2814 + }, + { + "epoch": 0.15176838473150744, + "grad_norm": 0.8671941161155701, + "learning_rate": 9.966905402006516e-06, + "loss": 0.886, + "step": 2815 + }, + { + "epoch": 0.15182229890015095, + "grad_norm": 0.8316017389297485, + "learning_rate": 9.966881045393426e-06, + "loss": 0.8844, + "step": 2816 + }, + { + "epoch": 0.15187621306879448, + "grad_norm": 0.7372319102287292, + "learning_rate": 
9.966856679850554e-06, + "loss": 0.739, + "step": 2817 + }, + { + "epoch": 0.151930127237438, + "grad_norm": 0.7547122240066528, + "learning_rate": 9.966832305377944e-06, + "loss": 0.7518, + "step": 2818 + }, + { + "epoch": 0.15198404140608152, + "grad_norm": 0.8701632022857666, + "learning_rate": 9.96680792197564e-06, + "loss": 0.8632, + "step": 2819 + }, + { + "epoch": 0.15203795557472505, + "grad_norm": 0.7842714786529541, + "learning_rate": 9.966783529643686e-06, + "loss": 0.8161, + "step": 2820 + }, + { + "epoch": 0.15209186974336855, + "grad_norm": 0.858406126499176, + "learning_rate": 9.966759128382125e-06, + "loss": 0.7742, + "step": 2821 + }, + { + "epoch": 0.15214578391201208, + "grad_norm": 1.02357816696167, + "learning_rate": 9.966734718190998e-06, + "loss": 0.9142, + "step": 2822 + }, + { + "epoch": 0.1521996980806556, + "grad_norm": 0.81562739610672, + "learning_rate": 9.966710299070355e-06, + "loss": 0.8426, + "step": 2823 + }, + { + "epoch": 0.15225361224929912, + "grad_norm": 0.8576202988624573, + "learning_rate": 9.966685871020236e-06, + "loss": 0.7546, + "step": 2824 + }, + { + "epoch": 0.15230752641794262, + "grad_norm": 0.8974374532699585, + "learning_rate": 9.966661434040684e-06, + "loss": 0.7236, + "step": 2825 + }, + { + "epoch": 0.15236144058658616, + "grad_norm": 0.7306199073791504, + "learning_rate": 9.966636988131745e-06, + "loss": 0.7581, + "step": 2826 + }, + { + "epoch": 0.1524153547552297, + "grad_norm": 0.9296971559524536, + "learning_rate": 9.966612533293465e-06, + "loss": 0.9214, + "step": 2827 + }, + { + "epoch": 0.1524692689238732, + "grad_norm": 1.029969573020935, + "learning_rate": 9.966588069525885e-06, + "loss": 0.8371, + "step": 2828 + }, + { + "epoch": 0.15252318309251672, + "grad_norm": 0.869320809841156, + "learning_rate": 9.966563596829046e-06, + "loss": 0.6396, + "step": 2829 + }, + { + "epoch": 0.15257709726116023, + "grad_norm": 0.8893983960151672, + "learning_rate": 9.966539115202998e-06, + "loss": 0.8423, + "step": 
2830 + }, + { + "epoch": 0.15263101142980376, + "grad_norm": 0.823639452457428, + "learning_rate": 9.966514624647783e-06, + "loss": 0.7924, + "step": 2831 + }, + { + "epoch": 0.15268492559844726, + "grad_norm": 0.805551290512085, + "learning_rate": 9.966490125163444e-06, + "loss": 0.8091, + "step": 2832 + }, + { + "epoch": 0.1527388397670908, + "grad_norm": 0.9040341377258301, + "learning_rate": 9.966465616750025e-06, + "loss": 0.8924, + "step": 2833 + }, + { + "epoch": 0.1527927539357343, + "grad_norm": 0.8297836780548096, + "learning_rate": 9.966441099407572e-06, + "loss": 0.7538, + "step": 2834 + }, + { + "epoch": 0.15284666810437783, + "grad_norm": 0.8824244141578674, + "learning_rate": 9.966416573136127e-06, + "loss": 0.8892, + "step": 2835 + }, + { + "epoch": 0.15290058227302136, + "grad_norm": 1.0663546323776245, + "learning_rate": 9.966392037935734e-06, + "loss": 0.7809, + "step": 2836 + }, + { + "epoch": 0.15295449644166487, + "grad_norm": 0.8324514627456665, + "learning_rate": 9.966367493806439e-06, + "loss": 0.8308, + "step": 2837 + }, + { + "epoch": 0.1530084106103084, + "grad_norm": 0.7742459177970886, + "learning_rate": 9.966342940748286e-06, + "loss": 0.8269, + "step": 2838 + }, + { + "epoch": 0.1530623247789519, + "grad_norm": 0.9513984322547913, + "learning_rate": 9.966318378761317e-06, + "loss": 0.8538, + "step": 2839 + }, + { + "epoch": 0.15311623894759543, + "grad_norm": 0.8030692934989929, + "learning_rate": 9.966293807845577e-06, + "loss": 0.7752, + "step": 2840 + }, + { + "epoch": 0.15317015311623894, + "grad_norm": 0.8903285264968872, + "learning_rate": 9.966269228001112e-06, + "loss": 0.8556, + "step": 2841 + }, + { + "epoch": 0.15322406728488247, + "grad_norm": 0.8221173286437988, + "learning_rate": 9.966244639227962e-06, + "loss": 0.7249, + "step": 2842 + }, + { + "epoch": 0.15327798145352597, + "grad_norm": 0.9883365035057068, + "learning_rate": 9.966220041526176e-06, + "loss": 0.961, + "step": 2843 + }, + { + "epoch": 
0.1533318956221695, + "grad_norm": 0.8654862642288208, + "learning_rate": 9.966195434895796e-06, + "loss": 0.7779, + "step": 2844 + }, + { + "epoch": 0.15338580979081304, + "grad_norm": 0.7924084663391113, + "learning_rate": 9.966170819336866e-06, + "loss": 0.7706, + "step": 2845 + }, + { + "epoch": 0.15343972395945654, + "grad_norm": 0.8227209448814392, + "learning_rate": 9.96614619484943e-06, + "loss": 0.8659, + "step": 2846 + }, + { + "epoch": 0.15349363812810007, + "grad_norm": 0.9436708688735962, + "learning_rate": 9.966121561433534e-06, + "loss": 0.87, + "step": 2847 + }, + { + "epoch": 0.15354755229674358, + "grad_norm": 1.137171983718872, + "learning_rate": 9.96609691908922e-06, + "loss": 0.7883, + "step": 2848 + }, + { + "epoch": 0.1536014664653871, + "grad_norm": 0.8868550658226013, + "learning_rate": 9.966072267816535e-06, + "loss": 0.8309, + "step": 2849 + }, + { + "epoch": 0.1536553806340306, + "grad_norm": 0.7190971970558167, + "learning_rate": 9.966047607615521e-06, + "loss": 0.6938, + "step": 2850 + }, + { + "epoch": 0.15370929480267415, + "grad_norm": 0.883866548538208, + "learning_rate": 9.966022938486223e-06, + "loss": 0.8368, + "step": 2851 + }, + { + "epoch": 0.15376320897131765, + "grad_norm": 0.9433422684669495, + "learning_rate": 9.965998260428686e-06, + "loss": 0.7739, + "step": 2852 + }, + { + "epoch": 0.15381712313996118, + "grad_norm": 0.9166012406349182, + "learning_rate": 9.965973573442956e-06, + "loss": 0.8308, + "step": 2853 + }, + { + "epoch": 0.1538710373086047, + "grad_norm": 0.8955514430999756, + "learning_rate": 9.965948877529071e-06, + "loss": 0.8403, + "step": 2854 + }, + { + "epoch": 0.15392495147724822, + "grad_norm": 0.8281451463699341, + "learning_rate": 9.965924172687083e-06, + "loss": 0.8127, + "step": 2855 + }, + { + "epoch": 0.15397886564589175, + "grad_norm": 0.8765435218811035, + "learning_rate": 9.965899458917031e-06, + "loss": 0.87, + "step": 2856 + }, + { + "epoch": 0.15403277981453525, + "grad_norm": 
0.9525101780891418, + "learning_rate": 9.965874736218964e-06, + "loss": 0.8665, + "step": 2857 + }, + { + "epoch": 0.15408669398317879, + "grad_norm": 0.7836191654205322, + "learning_rate": 9.965850004592921e-06, + "loss": 0.8261, + "step": 2858 + }, + { + "epoch": 0.1541406081518223, + "grad_norm": 0.7918692827224731, + "learning_rate": 9.96582526403895e-06, + "loss": 0.8422, + "step": 2859 + }, + { + "epoch": 0.15419452232046582, + "grad_norm": 0.8489586710929871, + "learning_rate": 9.965800514557096e-06, + "loss": 0.8871, + "step": 2860 + }, + { + "epoch": 0.15424843648910933, + "grad_norm": 0.9581596255302429, + "learning_rate": 9.965775756147402e-06, + "loss": 0.9346, + "step": 2861 + }, + { + "epoch": 0.15430235065775286, + "grad_norm": 1.0253969430923462, + "learning_rate": 9.965750988809913e-06, + "loss": 0.8381, + "step": 2862 + }, + { + "epoch": 0.1543562648263964, + "grad_norm": 0.8403491377830505, + "learning_rate": 9.965726212544674e-06, + "loss": 0.8307, + "step": 2863 + }, + { + "epoch": 0.1544101789950399, + "grad_norm": 0.729560375213623, + "learning_rate": 9.965701427351728e-06, + "loss": 0.8021, + "step": 2864 + }, + { + "epoch": 0.15446409316368342, + "grad_norm": 0.7576143741607666, + "learning_rate": 9.965676633231121e-06, + "loss": 0.7896, + "step": 2865 + }, + { + "epoch": 0.15451800733232693, + "grad_norm": 1.100948452949524, + "learning_rate": 9.965651830182898e-06, + "loss": 0.797, + "step": 2866 + }, + { + "epoch": 0.15457192150097046, + "grad_norm": 1.0760526657104492, + "learning_rate": 9.965627018207102e-06, + "loss": 0.7875, + "step": 2867 + }, + { + "epoch": 0.15462583566961396, + "grad_norm": 0.8553655743598938, + "learning_rate": 9.96560219730378e-06, + "loss": 0.872, + "step": 2868 + }, + { + "epoch": 0.1546797498382575, + "grad_norm": 1.1357450485229492, + "learning_rate": 9.965577367472971e-06, + "loss": 0.7306, + "step": 2869 + }, + { + "epoch": 0.154733664006901, + "grad_norm": 0.8308514952659607, + "learning_rate": 
9.965552528714725e-06, + "loss": 0.8106, + "step": 2870 + }, + { + "epoch": 0.15478757817554453, + "grad_norm": 0.8406074047088623, + "learning_rate": 9.965527681029088e-06, + "loss": 0.9085, + "step": 2871 + }, + { + "epoch": 0.15484149234418806, + "grad_norm": 0.8215218186378479, + "learning_rate": 9.9655028244161e-06, + "loss": 0.733, + "step": 2872 + }, + { + "epoch": 0.15489540651283157, + "grad_norm": 1.0004653930664062, + "learning_rate": 9.965477958875806e-06, + "loss": 0.8625, + "step": 2873 + }, + { + "epoch": 0.1549493206814751, + "grad_norm": 0.8359742760658264, + "learning_rate": 9.965453084408256e-06, + "loss": 0.7847, + "step": 2874 + }, + { + "epoch": 0.1550032348501186, + "grad_norm": 1.0257774591445923, + "learning_rate": 9.965428201013488e-06, + "loss": 0.8654, + "step": 2875 + }, + { + "epoch": 0.15505714901876214, + "grad_norm": 0.7931713461875916, + "learning_rate": 9.96540330869155e-06, + "loss": 0.7498, + "step": 2876 + }, + { + "epoch": 0.15511106318740564, + "grad_norm": 0.7873162031173706, + "learning_rate": 9.965378407442488e-06, + "loss": 0.7617, + "step": 2877 + }, + { + "epoch": 0.15516497735604917, + "grad_norm": 0.8008442521095276, + "learning_rate": 9.965353497266346e-06, + "loss": 0.8464, + "step": 2878 + }, + { + "epoch": 0.15521889152469268, + "grad_norm": 0.798004686832428, + "learning_rate": 9.965328578163166e-06, + "loss": 0.8519, + "step": 2879 + }, + { + "epoch": 0.1552728056933362, + "grad_norm": 0.8730151057243347, + "learning_rate": 9.965303650132996e-06, + "loss": 0.8257, + "step": 2880 + }, + { + "epoch": 0.15532671986197974, + "grad_norm": 0.7465460896492004, + "learning_rate": 9.965278713175879e-06, + "loss": 0.7786, + "step": 2881 + }, + { + "epoch": 0.15538063403062324, + "grad_norm": 0.9565917253494263, + "learning_rate": 9.96525376729186e-06, + "loss": 0.8694, + "step": 2882 + }, + { + "epoch": 0.15543454819926678, + "grad_norm": 0.880181074142456, + "learning_rate": 9.965228812480987e-06, + "loss": 0.813, + 
"step": 2883 + }, + { + "epoch": 0.15548846236791028, + "grad_norm": 0.7912368774414062, + "learning_rate": 9.965203848743299e-06, + "loss": 0.7764, + "step": 2884 + }, + { + "epoch": 0.1555423765365538, + "grad_norm": 0.8370791077613831, + "learning_rate": 9.965178876078846e-06, + "loss": 0.8591, + "step": 2885 + }, + { + "epoch": 0.15559629070519732, + "grad_norm": 0.8508057594299316, + "learning_rate": 9.965153894487672e-06, + "loss": 0.8535, + "step": 2886 + }, + { + "epoch": 0.15565020487384085, + "grad_norm": 1.0393366813659668, + "learning_rate": 9.965128903969818e-06, + "loss": 0.8032, + "step": 2887 + }, + { + "epoch": 0.15570411904248435, + "grad_norm": 0.7545601725578308, + "learning_rate": 9.965103904525334e-06, + "loss": 0.7024, + "step": 2888 + }, + { + "epoch": 0.15575803321112788, + "grad_norm": 0.7933251261711121, + "learning_rate": 9.965078896154262e-06, + "loss": 0.8325, + "step": 2889 + }, + { + "epoch": 0.15581194737977141, + "grad_norm": 0.8319270610809326, + "learning_rate": 9.965053878856648e-06, + "loss": 0.7781, + "step": 2890 + }, + { + "epoch": 0.15586586154841492, + "grad_norm": 1.0789637565612793, + "learning_rate": 9.965028852632537e-06, + "loss": 0.7931, + "step": 2891 + }, + { + "epoch": 0.15591977571705845, + "grad_norm": 0.9561448097229004, + "learning_rate": 9.965003817481974e-06, + "loss": 0.7472, + "step": 2892 + }, + { + "epoch": 0.15597368988570195, + "grad_norm": 0.9099969267845154, + "learning_rate": 9.964978773405003e-06, + "loss": 0.9154, + "step": 2893 + }, + { + "epoch": 0.1560276040543455, + "grad_norm": 0.9164708852767944, + "learning_rate": 9.96495372040167e-06, + "loss": 0.8552, + "step": 2894 + }, + { + "epoch": 0.156081518222989, + "grad_norm": 0.9367608428001404, + "learning_rate": 9.96492865847202e-06, + "loss": 0.7926, + "step": 2895 + }, + { + "epoch": 0.15613543239163252, + "grad_norm": 0.8970937728881836, + "learning_rate": 9.9649035876161e-06, + "loss": 0.8798, + "step": 2896 + }, + { + "epoch": 
0.15618934656027603, + "grad_norm": 0.8037889003753662, + "learning_rate": 9.96487850783395e-06, + "loss": 0.8157, + "step": 2897 + }, + { + "epoch": 0.15624326072891956, + "grad_norm": 0.906944215297699, + "learning_rate": 9.964853419125619e-06, + "loss": 0.8191, + "step": 2898 + }, + { + "epoch": 0.1562971748975631, + "grad_norm": 0.8197054266929626, + "learning_rate": 9.964828321491152e-06, + "loss": 0.7899, + "step": 2899 + }, + { + "epoch": 0.1563510890662066, + "grad_norm": 0.7816088795661926, + "learning_rate": 9.96480321493059e-06, + "loss": 0.8113, + "step": 2900 + }, + { + "epoch": 0.15640500323485013, + "grad_norm": 0.8319717645645142, + "learning_rate": 9.964778099443985e-06, + "loss": 0.7835, + "step": 2901 + }, + { + "epoch": 0.15645891740349363, + "grad_norm": 0.7739672660827637, + "learning_rate": 9.964752975031378e-06, + "loss": 0.7813, + "step": 2902 + }, + { + "epoch": 0.15651283157213716, + "grad_norm": 0.8002716898918152, + "learning_rate": 9.964727841692815e-06, + "loss": 0.7971, + "step": 2903 + }, + { + "epoch": 0.15656674574078067, + "grad_norm": 0.8796008229255676, + "learning_rate": 9.964702699428339e-06, + "loss": 0.7462, + "step": 2904 + }, + { + "epoch": 0.1566206599094242, + "grad_norm": 0.837027907371521, + "learning_rate": 9.964677548237998e-06, + "loss": 0.864, + "step": 2905 + }, + { + "epoch": 0.15667457407806773, + "grad_norm": 0.9098290205001831, + "learning_rate": 9.964652388121837e-06, + "loss": 0.9079, + "step": 2906 + }, + { + "epoch": 0.15672848824671123, + "grad_norm": 0.7707619071006775, + "learning_rate": 9.964627219079898e-06, + "loss": 0.7472, + "step": 2907 + }, + { + "epoch": 0.15678240241535477, + "grad_norm": 1.0109550952911377, + "learning_rate": 9.964602041112233e-06, + "loss": 0.8981, + "step": 2908 + }, + { + "epoch": 0.15683631658399827, + "grad_norm": 0.8410045504570007, + "learning_rate": 9.964576854218882e-06, + "loss": 0.8488, + "step": 2909 + }, + { + "epoch": 0.1568902307526418, + "grad_norm": 
0.8624899983406067, + "learning_rate": 9.96455165839989e-06, + "loss": 0.817, + "step": 2910 + }, + { + "epoch": 0.1569441449212853, + "grad_norm": 0.9060286283493042, + "learning_rate": 9.964526453655304e-06, + "loss": 0.8171, + "step": 2911 + }, + { + "epoch": 0.15699805908992884, + "grad_norm": 0.7718086838722229, + "learning_rate": 9.96450123998517e-06, + "loss": 0.7158, + "step": 2912 + }, + { + "epoch": 0.15705197325857234, + "grad_norm": 0.8690425157546997, + "learning_rate": 9.96447601738953e-06, + "loss": 0.8347, + "step": 2913 + }, + { + "epoch": 0.15710588742721587, + "grad_norm": 0.782656192779541, + "learning_rate": 9.964450785868433e-06, + "loss": 0.7581, + "step": 2914 + }, + { + "epoch": 0.1571598015958594, + "grad_norm": 1.0090769529342651, + "learning_rate": 9.964425545421924e-06, + "loss": 0.8179, + "step": 2915 + }, + { + "epoch": 0.1572137157645029, + "grad_norm": 0.8786135911941528, + "learning_rate": 9.964400296050047e-06, + "loss": 0.8733, + "step": 2916 + }, + { + "epoch": 0.15726762993314644, + "grad_norm": 0.8163133859634399, + "learning_rate": 9.964375037752847e-06, + "loss": 0.8091, + "step": 2917 + }, + { + "epoch": 0.15732154410178995, + "grad_norm": 0.8213543891906738, + "learning_rate": 9.964349770530371e-06, + "loss": 0.7978, + "step": 2918 + }, + { + "epoch": 0.15737545827043348, + "grad_norm": 0.849274218082428, + "learning_rate": 9.964324494382663e-06, + "loss": 0.8168, + "step": 2919 + }, + { + "epoch": 0.15742937243907698, + "grad_norm": 0.8099618554115295, + "learning_rate": 9.964299209309769e-06, + "loss": 0.8372, + "step": 2920 + }, + { + "epoch": 0.1574832866077205, + "grad_norm": 0.9064434766769409, + "learning_rate": 9.964273915311734e-06, + "loss": 0.8681, + "step": 2921 + }, + { + "epoch": 0.15753720077636402, + "grad_norm": 0.7269558310508728, + "learning_rate": 9.964248612388607e-06, + "loss": 0.7179, + "step": 2922 + }, + { + "epoch": 0.15759111494500755, + "grad_norm": 0.8115706443786621, + "learning_rate": 
9.964223300540427e-06, + "loss": 0.8572, + "step": 2923 + }, + { + "epoch": 0.15764502911365108, + "grad_norm": 0.8180872797966003, + "learning_rate": 9.964197979767246e-06, + "loss": 0.7463, + "step": 2924 + }, + { + "epoch": 0.15769894328229458, + "grad_norm": 0.741603434085846, + "learning_rate": 9.964172650069105e-06, + "loss": 0.7646, + "step": 2925 + }, + { + "epoch": 0.15775285745093812, + "grad_norm": 0.7558543682098389, + "learning_rate": 9.964147311446051e-06, + "loss": 0.7363, + "step": 2926 + }, + { + "epoch": 0.15780677161958162, + "grad_norm": 0.8128615617752075, + "learning_rate": 9.96412196389813e-06, + "loss": 0.8515, + "step": 2927 + }, + { + "epoch": 0.15786068578822515, + "grad_norm": 0.9731131196022034, + "learning_rate": 9.964096607425388e-06, + "loss": 0.8847, + "step": 2928 + }, + { + "epoch": 0.15791459995686866, + "grad_norm": 1.136883020401001, + "learning_rate": 9.964071242027868e-06, + "loss": 0.8457, + "step": 2929 + }, + { + "epoch": 0.1579685141255122, + "grad_norm": 0.7780461311340332, + "learning_rate": 9.964045867705618e-06, + "loss": 0.737, + "step": 2930 + }, + { + "epoch": 0.1580224282941557, + "grad_norm": 0.801013708114624, + "learning_rate": 9.964020484458684e-06, + "loss": 0.8164, + "step": 2931 + }, + { + "epoch": 0.15807634246279922, + "grad_norm": 0.8851730823516846, + "learning_rate": 9.96399509228711e-06, + "loss": 0.8762, + "step": 2932 + }, + { + "epoch": 0.15813025663144276, + "grad_norm": 0.9501338005065918, + "learning_rate": 9.963969691190942e-06, + "loss": 0.7788, + "step": 2933 + }, + { + "epoch": 0.15818417080008626, + "grad_norm": 0.9714099168777466, + "learning_rate": 9.963944281170227e-06, + "loss": 0.9207, + "step": 2934 + }, + { + "epoch": 0.1582380849687298, + "grad_norm": 0.764689564704895, + "learning_rate": 9.963918862225009e-06, + "loss": 0.737, + "step": 2935 + }, + { + "epoch": 0.1582919991373733, + "grad_norm": 1.1618343591690063, + "learning_rate": 9.963893434355335e-06, + "loss": 0.8055, + 
"step": 2936 + }, + { + "epoch": 0.15834591330601683, + "grad_norm": 0.8724596500396729, + "learning_rate": 9.96386799756125e-06, + "loss": 0.8449, + "step": 2937 + }, + { + "epoch": 0.15839982747466033, + "grad_norm": 0.7769358158111572, + "learning_rate": 9.963842551842798e-06, + "loss": 0.8155, + "step": 2938 + }, + { + "epoch": 0.15845374164330386, + "grad_norm": 0.8337542414665222, + "learning_rate": 9.963817097200028e-06, + "loss": 0.7331, + "step": 2939 + }, + { + "epoch": 0.15850765581194737, + "grad_norm": 0.8240610957145691, + "learning_rate": 9.963791633632984e-06, + "loss": 0.8076, + "step": 2940 + }, + { + "epoch": 0.1585615699805909, + "grad_norm": 0.7781216502189636, + "learning_rate": 9.963766161141713e-06, + "loss": 0.7274, + "step": 2941 + }, + { + "epoch": 0.15861548414923443, + "grad_norm": 0.8469343781471252, + "learning_rate": 9.96374067972626e-06, + "loss": 0.8364, + "step": 2942 + }, + { + "epoch": 0.15866939831787794, + "grad_norm": 0.7859261631965637, + "learning_rate": 9.963715189386669e-06, + "loss": 0.8006, + "step": 2943 + }, + { + "epoch": 0.15872331248652147, + "grad_norm": 0.8646130561828613, + "learning_rate": 9.963689690122988e-06, + "loss": 0.808, + "step": 2944 + }, + { + "epoch": 0.15877722665516497, + "grad_norm": 0.8905766010284424, + "learning_rate": 9.963664181935263e-06, + "loss": 0.8406, + "step": 2945 + }, + { + "epoch": 0.1588311408238085, + "grad_norm": 0.8756605982780457, + "learning_rate": 9.963638664823539e-06, + "loss": 0.8643, + "step": 2946 + }, + { + "epoch": 0.158885054992452, + "grad_norm": 0.899135410785675, + "learning_rate": 9.963613138787862e-06, + "loss": 0.9063, + "step": 2947 + }, + { + "epoch": 0.15893896916109554, + "grad_norm": 0.8382771015167236, + "learning_rate": 9.96358760382828e-06, + "loss": 0.8004, + "step": 2948 + }, + { + "epoch": 0.15899288332973904, + "grad_norm": 0.7687328457832336, + "learning_rate": 9.963562059944833e-06, + "loss": 0.7695, + "step": 2949 + }, + { + "epoch": 
0.15904679749838257, + "grad_norm": 0.807344913482666, + "learning_rate": 9.963536507137574e-06, + "loss": 0.7514, + "step": 2950 + }, + { + "epoch": 0.1591007116670261, + "grad_norm": 0.7882648706436157, + "learning_rate": 9.963510945406545e-06, + "loss": 0.7537, + "step": 2951 + }, + { + "epoch": 0.1591546258356696, + "grad_norm": 0.8422887921333313, + "learning_rate": 9.963485374751793e-06, + "loss": 0.7937, + "step": 2952 + }, + { + "epoch": 0.15920854000431314, + "grad_norm": 0.7578607797622681, + "learning_rate": 9.963459795173362e-06, + "loss": 0.8071, + "step": 2953 + }, + { + "epoch": 0.15926245417295665, + "grad_norm": 0.8854062557220459, + "learning_rate": 9.963434206671302e-06, + "loss": 0.9078, + "step": 2954 + }, + { + "epoch": 0.15931636834160018, + "grad_norm": 0.8705536723136902, + "learning_rate": 9.963408609245654e-06, + "loss": 0.7971, + "step": 2955 + }, + { + "epoch": 0.15937028251024368, + "grad_norm": 0.8247761726379395, + "learning_rate": 9.96338300289647e-06, + "loss": 0.7889, + "step": 2956 + }, + { + "epoch": 0.15942419667888721, + "grad_norm": 0.8216410279273987, + "learning_rate": 9.96335738762379e-06, + "loss": 0.9097, + "step": 2957 + }, + { + "epoch": 0.15947811084753072, + "grad_norm": 0.9624109268188477, + "learning_rate": 9.963331763427666e-06, + "loss": 0.8562, + "step": 2958 + }, + { + "epoch": 0.15953202501617425, + "grad_norm": 0.8426920175552368, + "learning_rate": 9.96330613030814e-06, + "loss": 0.8011, + "step": 2959 + }, + { + "epoch": 0.15958593918481778, + "grad_norm": 0.8987439870834351, + "learning_rate": 9.963280488265256e-06, + "loss": 0.7965, + "step": 2960 + }, + { + "epoch": 0.1596398533534613, + "grad_norm": 0.8105943202972412, + "learning_rate": 9.963254837299066e-06, + "loss": 0.8178, + "step": 2961 + }, + { + "epoch": 0.15969376752210482, + "grad_norm": 0.928841769695282, + "learning_rate": 9.963229177409612e-06, + "loss": 0.8106, + "step": 2962 + }, + { + "epoch": 0.15974768169074832, + "grad_norm": 
0.7369773983955383, + "learning_rate": 9.963203508596942e-06, + "loss": 0.7401, + "step": 2963 + }, + { + "epoch": 0.15980159585939185, + "grad_norm": 0.7476964592933655, + "learning_rate": 9.9631778308611e-06, + "loss": 0.8112, + "step": 2964 + }, + { + "epoch": 0.15985551002803536, + "grad_norm": 0.8257710337638855, + "learning_rate": 9.963152144202135e-06, + "loss": 0.8489, + "step": 2965 + }, + { + "epoch": 0.1599094241966789, + "grad_norm": 0.8324301242828369, + "learning_rate": 9.963126448620091e-06, + "loss": 0.8511, + "step": 2966 + }, + { + "epoch": 0.1599633383653224, + "grad_norm": 0.8221176266670227, + "learning_rate": 9.963100744115017e-06, + "loss": 0.7924, + "step": 2967 + }, + { + "epoch": 0.16001725253396593, + "grad_norm": 0.7942221164703369, + "learning_rate": 9.963075030686955e-06, + "loss": 0.7936, + "step": 2968 + }, + { + "epoch": 0.16007116670260946, + "grad_norm": 0.7341020107269287, + "learning_rate": 9.963049308335954e-06, + "loss": 0.7381, + "step": 2969 + }, + { + "epoch": 0.16012508087125296, + "grad_norm": 0.8118404746055603, + "learning_rate": 9.963023577062062e-06, + "loss": 0.756, + "step": 2970 + }, + { + "epoch": 0.1601789950398965, + "grad_norm": 0.7517318725585938, + "learning_rate": 9.96299783686532e-06, + "loss": 0.7051, + "step": 2971 + }, + { + "epoch": 0.16023290920854, + "grad_norm": 0.7982935905456543, + "learning_rate": 9.962972087745777e-06, + "loss": 0.8412, + "step": 2972 + }, + { + "epoch": 0.16028682337718353, + "grad_norm": 0.8397754430770874, + "learning_rate": 9.962946329703482e-06, + "loss": 0.8314, + "step": 2973 + }, + { + "epoch": 0.16034073754582703, + "grad_norm": 0.8342095613479614, + "learning_rate": 9.962920562738477e-06, + "loss": 0.7649, + "step": 2974 + }, + { + "epoch": 0.16039465171447057, + "grad_norm": 0.8053215742111206, + "learning_rate": 9.96289478685081e-06, + "loss": 0.7315, + "step": 2975 + }, + { + "epoch": 0.16044856588311407, + "grad_norm": 0.8931438326835632, + "learning_rate": 
9.962869002040529e-06, + "loss": 0.9241, + "step": 2976 + }, + { + "epoch": 0.1605024800517576, + "grad_norm": 0.8217912316322327, + "learning_rate": 9.962843208307677e-06, + "loss": 0.7551, + "step": 2977 + }, + { + "epoch": 0.16055639422040113, + "grad_norm": 0.7592090964317322, + "learning_rate": 9.962817405652305e-06, + "loss": 0.7243, + "step": 2978 + }, + { + "epoch": 0.16061030838904464, + "grad_norm": 0.8466029167175293, + "learning_rate": 9.962791594074455e-06, + "loss": 0.785, + "step": 2979 + }, + { + "epoch": 0.16066422255768817, + "grad_norm": 0.859207272529602, + "learning_rate": 9.962765773574174e-06, + "loss": 0.8344, + "step": 2980 + }, + { + "epoch": 0.16071813672633167, + "grad_norm": 0.8134403824806213, + "learning_rate": 9.962739944151511e-06, + "loss": 0.7595, + "step": 2981 + }, + { + "epoch": 0.1607720508949752, + "grad_norm": 0.7411110401153564, + "learning_rate": 9.962714105806511e-06, + "loss": 0.7751, + "step": 2982 + }, + { + "epoch": 0.1608259650636187, + "grad_norm": 0.7976831793785095, + "learning_rate": 9.962688258539219e-06, + "loss": 0.7353, + "step": 2983 + }, + { + "epoch": 0.16087987923226224, + "grad_norm": 0.8306836485862732, + "learning_rate": 9.962662402349684e-06, + "loss": 0.7903, + "step": 2984 + }, + { + "epoch": 0.16093379340090574, + "grad_norm": 0.794691264629364, + "learning_rate": 9.96263653723795e-06, + "loss": 0.7972, + "step": 2985 + }, + { + "epoch": 0.16098770756954928, + "grad_norm": 0.7471837401390076, + "learning_rate": 9.962610663204066e-06, + "loss": 0.7994, + "step": 2986 + }, + { + "epoch": 0.1610416217381928, + "grad_norm": 0.8046342134475708, + "learning_rate": 9.962584780248079e-06, + "loss": 0.7912, + "step": 2987 + }, + { + "epoch": 0.1610955359068363, + "grad_norm": 0.7935966849327087, + "learning_rate": 9.96255888837003e-06, + "loss": 0.8053, + "step": 2988 + }, + { + "epoch": 0.16114945007547984, + "grad_norm": 0.7403679490089417, + "learning_rate": 9.962532987569973e-06, + "loss": 0.6707, + 
"step": 2989 + }, + { + "epoch": 0.16120336424412335, + "grad_norm": 0.8277058005332947, + "learning_rate": 9.96250707784795e-06, + "loss": 0.8074, + "step": 2990 + }, + { + "epoch": 0.16125727841276688, + "grad_norm": 1.0225850343704224, + "learning_rate": 9.962481159204008e-06, + "loss": 0.8475, + "step": 2991 + }, + { + "epoch": 0.16131119258141038, + "grad_norm": 0.8091806769371033, + "learning_rate": 9.962455231638193e-06, + "loss": 0.7714, + "step": 2992 + }, + { + "epoch": 0.16136510675005392, + "grad_norm": 0.7496880292892456, + "learning_rate": 9.962429295150554e-06, + "loss": 0.7449, + "step": 2993 + }, + { + "epoch": 0.16141902091869742, + "grad_norm": 0.7799220085144043, + "learning_rate": 9.962403349741137e-06, + "loss": 0.7241, + "step": 2994 + }, + { + "epoch": 0.16147293508734095, + "grad_norm": 0.92058926820755, + "learning_rate": 9.962377395409986e-06, + "loss": 0.8374, + "step": 2995 + }, + { + "epoch": 0.16152684925598448, + "grad_norm": 0.7713897228240967, + "learning_rate": 9.96235143215715e-06, + "loss": 0.7571, + "step": 2996 + }, + { + "epoch": 0.161580763424628, + "grad_norm": 0.779852032661438, + "learning_rate": 9.962325459982678e-06, + "loss": 0.796, + "step": 2997 + }, + { + "epoch": 0.16163467759327152, + "grad_norm": 0.8362038731575012, + "learning_rate": 9.962299478886613e-06, + "loss": 0.8645, + "step": 2998 + }, + { + "epoch": 0.16168859176191502, + "grad_norm": 0.8759078979492188, + "learning_rate": 9.962273488869003e-06, + "loss": 0.8192, + "step": 2999 + }, + { + "epoch": 0.16174250593055856, + "grad_norm": 0.7853894233703613, + "learning_rate": 9.962247489929892e-06, + "loss": 0.81, + "step": 3000 + }, + { + "epoch": 0.16179642009920206, + "grad_norm": 0.8752580881118774, + "learning_rate": 9.962221482069332e-06, + "loss": 0.8172, + "step": 3001 + }, + { + "epoch": 0.1618503342678456, + "grad_norm": 0.8129578828811646, + "learning_rate": 9.962195465287367e-06, + "loss": 0.698, + "step": 3002 + }, + { + "epoch": 
0.1619042484364891, + "grad_norm": 0.7905570268630981, + "learning_rate": 9.962169439584043e-06, + "loss": 0.7755, + "step": 3003 + }, + { + "epoch": 0.16195816260513263, + "grad_norm": 1.1296168565750122, + "learning_rate": 9.962143404959408e-06, + "loss": 0.829, + "step": 3004 + }, + { + "epoch": 0.16201207677377616, + "grad_norm": 0.8880928158760071, + "learning_rate": 9.962117361413508e-06, + "loss": 0.8542, + "step": 3005 + }, + { + "epoch": 0.16206599094241966, + "grad_norm": 0.7933239936828613, + "learning_rate": 9.96209130894639e-06, + "loss": 0.714, + "step": 3006 + }, + { + "epoch": 0.1621199051110632, + "grad_norm": 0.8112434148788452, + "learning_rate": 9.962065247558101e-06, + "loss": 0.7967, + "step": 3007 + }, + { + "epoch": 0.1621738192797067, + "grad_norm": 0.7101603150367737, + "learning_rate": 9.962039177248689e-06, + "loss": 0.7054, + "step": 3008 + }, + { + "epoch": 0.16222773344835023, + "grad_norm": 0.9327304363250732, + "learning_rate": 9.962013098018198e-06, + "loss": 0.7683, + "step": 3009 + }, + { + "epoch": 0.16228164761699373, + "grad_norm": 0.8223574161529541, + "learning_rate": 9.961987009866678e-06, + "loss": 0.7174, + "step": 3010 + }, + { + "epoch": 0.16233556178563727, + "grad_norm": 0.889711856842041, + "learning_rate": 9.961960912794176e-06, + "loss": 0.8562, + "step": 3011 + }, + { + "epoch": 0.1623894759542808, + "grad_norm": 0.9297184348106384, + "learning_rate": 9.961934806800736e-06, + "loss": 0.8887, + "step": 3012 + }, + { + "epoch": 0.1624433901229243, + "grad_norm": 0.8206717371940613, + "learning_rate": 9.961908691886404e-06, + "loss": 0.8272, + "step": 3013 + }, + { + "epoch": 0.16249730429156783, + "grad_norm": 0.7833002805709839, + "learning_rate": 9.961882568051233e-06, + "loss": 0.848, + "step": 3014 + }, + { + "epoch": 0.16255121846021134, + "grad_norm": 0.8386265635490417, + "learning_rate": 9.961856435295265e-06, + "loss": 0.7528, + "step": 3015 + }, + { + "epoch": 0.16260513262885487, + "grad_norm": 
0.8227097392082214, + "learning_rate": 9.961830293618547e-06, + "loss": 0.8181, + "step": 3016 + }, + { + "epoch": 0.16265904679749837, + "grad_norm": 0.7938892245292664, + "learning_rate": 9.96180414302113e-06, + "loss": 0.8293, + "step": 3017 + }, + { + "epoch": 0.1627129609661419, + "grad_norm": 1.1556557416915894, + "learning_rate": 9.961777983503056e-06, + "loss": 0.9544, + "step": 3018 + }, + { + "epoch": 0.1627668751347854, + "grad_norm": 0.8379788994789124, + "learning_rate": 9.961751815064375e-06, + "loss": 0.7168, + "step": 3019 + }, + { + "epoch": 0.16282078930342894, + "grad_norm": 0.9397227764129639, + "learning_rate": 9.961725637705134e-06, + "loss": 0.8804, + "step": 3020 + }, + { + "epoch": 0.16287470347207247, + "grad_norm": 0.8950162529945374, + "learning_rate": 9.96169945142538e-06, + "loss": 0.8652, + "step": 3021 + }, + { + "epoch": 0.16292861764071598, + "grad_norm": 0.8643755912780762, + "learning_rate": 9.961673256225159e-06, + "loss": 0.9041, + "step": 3022 + }, + { + "epoch": 0.1629825318093595, + "grad_norm": 0.8658211827278137, + "learning_rate": 9.961647052104517e-06, + "loss": 0.8721, + "step": 3023 + }, + { + "epoch": 0.16303644597800301, + "grad_norm": 0.812038242816925, + "learning_rate": 9.961620839063507e-06, + "loss": 0.8715, + "step": 3024 + }, + { + "epoch": 0.16309036014664655, + "grad_norm": 0.7646269798278809, + "learning_rate": 9.961594617102169e-06, + "loss": 0.7805, + "step": 3025 + }, + { + "epoch": 0.16314427431529005, + "grad_norm": 0.7684099674224854, + "learning_rate": 9.961568386220553e-06, + "loss": 0.8214, + "step": 3026 + }, + { + "epoch": 0.16319818848393358, + "grad_norm": 0.888566255569458, + "learning_rate": 9.961542146418706e-06, + "loss": 0.8972, + "step": 3027 + }, + { + "epoch": 0.16325210265257709, + "grad_norm": 0.8100109100341797, + "learning_rate": 9.961515897696675e-06, + "loss": 0.7337, + "step": 3028 + }, + { + "epoch": 0.16330601682122062, + "grad_norm": 0.8838690519332886, + "learning_rate": 
9.96148964005451e-06, + "loss": 0.7148, + "step": 3029 + }, + { + "epoch": 0.16335993098986415, + "grad_norm": 0.7518458962440491, + "learning_rate": 9.961463373492253e-06, + "loss": 0.7127, + "step": 3030 + }, + { + "epoch": 0.16341384515850765, + "grad_norm": 0.8280466198921204, + "learning_rate": 9.961437098009956e-06, + "loss": 0.7569, + "step": 3031 + }, + { + "epoch": 0.16346775932715119, + "grad_norm": 0.7333472371101379, + "learning_rate": 9.961410813607663e-06, + "loss": 0.7984, + "step": 3032 + }, + { + "epoch": 0.1635216734957947, + "grad_norm": 0.8064109086990356, + "learning_rate": 9.961384520285423e-06, + "loss": 0.8255, + "step": 3033 + }, + { + "epoch": 0.16357558766443822, + "grad_norm": 0.8310550451278687, + "learning_rate": 9.961358218043282e-06, + "loss": 0.828, + "step": 3034 + }, + { + "epoch": 0.16362950183308173, + "grad_norm": 0.8141489028930664, + "learning_rate": 9.961331906881289e-06, + "loss": 0.8121, + "step": 3035 + }, + { + "epoch": 0.16368341600172526, + "grad_norm": 0.9229308366775513, + "learning_rate": 9.96130558679949e-06, + "loss": 0.9288, + "step": 3036 + }, + { + "epoch": 0.16373733017036876, + "grad_norm": 0.9087804555892944, + "learning_rate": 9.961279257797933e-06, + "loss": 0.8725, + "step": 3037 + }, + { + "epoch": 0.1637912443390123, + "grad_norm": 0.8357719779014587, + "learning_rate": 9.961252919876665e-06, + "loss": 0.8413, + "step": 3038 + }, + { + "epoch": 0.16384515850765582, + "grad_norm": 0.8311809301376343, + "learning_rate": 9.961226573035734e-06, + "loss": 0.885, + "step": 3039 + }, + { + "epoch": 0.16389907267629933, + "grad_norm": 0.7797298431396484, + "learning_rate": 9.961200217275185e-06, + "loss": 0.8767, + "step": 3040 + }, + { + "epoch": 0.16395298684494286, + "grad_norm": 0.8659999370574951, + "learning_rate": 9.961173852595069e-06, + "loss": 0.7852, + "step": 3041 + }, + { + "epoch": 0.16400690101358636, + "grad_norm": 0.8036298155784607, + "learning_rate": 9.96114747899543e-06, + "loss": 0.8122, + 
"step": 3042 + }, + { + "epoch": 0.1640608151822299, + "grad_norm": 0.8683627843856812, + "learning_rate": 9.961121096476318e-06, + "loss": 0.8197, + "step": 3043 + }, + { + "epoch": 0.1641147293508734, + "grad_norm": 0.8885881900787354, + "learning_rate": 9.96109470503778e-06, + "loss": 0.7302, + "step": 3044 + }, + { + "epoch": 0.16416864351951693, + "grad_norm": 0.7480132579803467, + "learning_rate": 9.961068304679861e-06, + "loss": 0.7938, + "step": 3045 + }, + { + "epoch": 0.16422255768816044, + "grad_norm": 0.680261492729187, + "learning_rate": 9.96104189540261e-06, + "loss": 0.7016, + "step": 3046 + }, + { + "epoch": 0.16427647185680397, + "grad_norm": 0.8690764904022217, + "learning_rate": 9.961015477206078e-06, + "loss": 0.7716, + "step": 3047 + }, + { + "epoch": 0.1643303860254475, + "grad_norm": 0.8533129692077637, + "learning_rate": 9.960989050090306e-06, + "loss": 0.8561, + "step": 3048 + }, + { + "epoch": 0.164384300194091, + "grad_norm": 0.6941283345222473, + "learning_rate": 9.960962614055345e-06, + "loss": 0.6501, + "step": 3049 + }, + { + "epoch": 0.16443821436273454, + "grad_norm": 0.9178086519241333, + "learning_rate": 9.960936169101244e-06, + "loss": 0.8511, + "step": 3050 + }, + { + "epoch": 0.16449212853137804, + "grad_norm": 0.7419497966766357, + "learning_rate": 9.960909715228049e-06, + "loss": 0.7331, + "step": 3051 + }, + { + "epoch": 0.16454604270002157, + "grad_norm": 0.879289984703064, + "learning_rate": 9.960883252435807e-06, + "loss": 0.8969, + "step": 3052 + }, + { + "epoch": 0.16459995686866508, + "grad_norm": 0.7679347991943359, + "learning_rate": 9.960856780724563e-06, + "loss": 0.7467, + "step": 3053 + }, + { + "epoch": 0.1646538710373086, + "grad_norm": 0.7927586436271667, + "learning_rate": 9.960830300094371e-06, + "loss": 0.7479, + "step": 3054 + }, + { + "epoch": 0.1647077852059521, + "grad_norm": 0.7693600058555603, + "learning_rate": 9.960803810545275e-06, + "loss": 0.8421, + "step": 3055 + }, + { + "epoch": 
0.16476169937459564, + "grad_norm": 0.8548445105552673, + "learning_rate": 9.96077731207732e-06, + "loss": 0.8104, + "step": 3056 + }, + { + "epoch": 0.16481561354323918, + "grad_norm": 0.8420791029930115, + "learning_rate": 9.960750804690559e-06, + "loss": 0.6974, + "step": 3057 + }, + { + "epoch": 0.16486952771188268, + "grad_norm": 0.7880173921585083, + "learning_rate": 9.960724288385037e-06, + "loss": 0.7723, + "step": 3058 + }, + { + "epoch": 0.1649234418805262, + "grad_norm": 0.8810162544250488, + "learning_rate": 9.960697763160803e-06, + "loss": 0.7488, + "step": 3059 + }, + { + "epoch": 0.16497735604916972, + "grad_norm": 0.9951279759407043, + "learning_rate": 9.9606712290179e-06, + "loss": 0.8119, + "step": 3060 + }, + { + "epoch": 0.16503127021781325, + "grad_norm": 0.755189836025238, + "learning_rate": 9.960644685956383e-06, + "loss": 0.7568, + "step": 3061 + }, + { + "epoch": 0.16508518438645675, + "grad_norm": 0.99064040184021, + "learning_rate": 9.960618133976292e-06, + "loss": 0.8493, + "step": 3062 + }, + { + "epoch": 0.16513909855510028, + "grad_norm": 0.8672367334365845, + "learning_rate": 9.960591573077682e-06, + "loss": 0.7961, + "step": 3063 + }, + { + "epoch": 0.1651930127237438, + "grad_norm": 0.9614015817642212, + "learning_rate": 9.960565003260596e-06, + "loss": 0.8894, + "step": 3064 + }, + { + "epoch": 0.16524692689238732, + "grad_norm": 0.7433729767799377, + "learning_rate": 9.960538424525083e-06, + "loss": 0.7586, + "step": 3065 + }, + { + "epoch": 0.16530084106103085, + "grad_norm": 0.8151267766952515, + "learning_rate": 9.96051183687119e-06, + "loss": 0.8311, + "step": 3066 + }, + { + "epoch": 0.16535475522967436, + "grad_norm": 0.9241605401039124, + "learning_rate": 9.960485240298967e-06, + "loss": 0.8526, + "step": 3067 + }, + { + "epoch": 0.1654086693983179, + "grad_norm": 0.8612751364707947, + "learning_rate": 9.96045863480846e-06, + "loss": 0.7672, + "step": 3068 + }, + { + "epoch": 0.1654625835669614, + "grad_norm": 
0.8707523345947266, + "learning_rate": 9.960432020399719e-06, + "loss": 0.7862, + "step": 3069 + }, + { + "epoch": 0.16551649773560492, + "grad_norm": 0.8456318378448486, + "learning_rate": 9.960405397072788e-06, + "loss": 0.8221, + "step": 3070 + }, + { + "epoch": 0.16557041190424843, + "grad_norm": 0.7929409742355347, + "learning_rate": 9.960378764827719e-06, + "loss": 0.8438, + "step": 3071 + }, + { + "epoch": 0.16562432607289196, + "grad_norm": 0.8241098523139954, + "learning_rate": 9.960352123664556e-06, + "loss": 0.7769, + "step": 3072 + }, + { + "epoch": 0.16567824024153546, + "grad_norm": 0.9634597301483154, + "learning_rate": 9.96032547358335e-06, + "loss": 0.8323, + "step": 3073 + }, + { + "epoch": 0.165732154410179, + "grad_norm": 0.6783578395843506, + "learning_rate": 9.960298814584148e-06, + "loss": 0.6585, + "step": 3074 + }, + { + "epoch": 0.16578606857882253, + "grad_norm": 0.756289005279541, + "learning_rate": 9.960272146666997e-06, + "loss": 0.7109, + "step": 3075 + }, + { + "epoch": 0.16583998274746603, + "grad_norm": 0.8414442539215088, + "learning_rate": 9.960245469831947e-06, + "loss": 0.7543, + "step": 3076 + }, + { + "epoch": 0.16589389691610956, + "grad_norm": 0.7551240921020508, + "learning_rate": 9.960218784079044e-06, + "loss": 0.7131, + "step": 3077 + }, + { + "epoch": 0.16594781108475307, + "grad_norm": 0.8211004137992859, + "learning_rate": 9.960192089408335e-06, + "loss": 0.8335, + "step": 3078 + }, + { + "epoch": 0.1660017252533966, + "grad_norm": 0.7540998458862305, + "learning_rate": 9.960165385819873e-06, + "loss": 0.7557, + "step": 3079 + }, + { + "epoch": 0.1660556394220401, + "grad_norm": 0.7917600274085999, + "learning_rate": 9.9601386733137e-06, + "loss": 0.7522, + "step": 3080 + }, + { + "epoch": 0.16610955359068363, + "grad_norm": 0.9180947542190552, + "learning_rate": 9.960111951889868e-06, + "loss": 0.7943, + "step": 3081 + }, + { + "epoch": 0.16616346775932714, + "grad_norm": 0.8169807195663452, + "learning_rate": 
9.960085221548422e-06, + "loss": 0.8633, + "step": 3082 + }, + { + "epoch": 0.16621738192797067, + "grad_norm": 0.8790155649185181, + "learning_rate": 9.960058482289413e-06, + "loss": 0.8265, + "step": 3083 + }, + { + "epoch": 0.1662712960966142, + "grad_norm": 0.8958606123924255, + "learning_rate": 9.960031734112887e-06, + "loss": 0.8601, + "step": 3084 + }, + { + "epoch": 0.1663252102652577, + "grad_norm": 0.8116661906242371, + "learning_rate": 9.960004977018893e-06, + "loss": 0.8203, + "step": 3085 + }, + { + "epoch": 0.16637912443390124, + "grad_norm": 0.771135687828064, + "learning_rate": 9.95997821100748e-06, + "loss": 0.7258, + "step": 3086 + }, + { + "epoch": 0.16643303860254474, + "grad_norm": 0.9094653725624084, + "learning_rate": 9.959951436078696e-06, + "loss": 0.9094, + "step": 3087 + }, + { + "epoch": 0.16648695277118827, + "grad_norm": 0.9042958617210388, + "learning_rate": 9.959924652232586e-06, + "loss": 0.7434, + "step": 3088 + }, + { + "epoch": 0.16654086693983178, + "grad_norm": 0.7170906662940979, + "learning_rate": 9.959897859469201e-06, + "loss": 0.7134, + "step": 3089 + }, + { + "epoch": 0.1665947811084753, + "grad_norm": 0.7896520495414734, + "learning_rate": 9.959871057788589e-06, + "loss": 0.7727, + "step": 3090 + }, + { + "epoch": 0.1666486952771188, + "grad_norm": 0.9295204281806946, + "learning_rate": 9.959844247190797e-06, + "loss": 0.8928, + "step": 3091 + }, + { + "epoch": 0.16670260944576235, + "grad_norm": 0.8025391101837158, + "learning_rate": 9.959817427675875e-06, + "loss": 0.7808, + "step": 3092 + }, + { + "epoch": 0.16675652361440588, + "grad_norm": 0.9727420210838318, + "learning_rate": 9.95979059924387e-06, + "loss": 0.9677, + "step": 3093 + }, + { + "epoch": 0.16681043778304938, + "grad_norm": 0.8534692525863647, + "learning_rate": 9.95976376189483e-06, + "loss": 0.8642, + "step": 3094 + }, + { + "epoch": 0.1668643519516929, + "grad_norm": 0.8361443877220154, + "learning_rate": 9.959736915628803e-06, + "loss": 0.8746, + 
"step": 3095 + }, + { + "epoch": 0.16691826612033642, + "grad_norm": 0.8551936745643616, + "learning_rate": 9.95971006044584e-06, + "loss": 0.7973, + "step": 3096 + }, + { + "epoch": 0.16697218028897995, + "grad_norm": 0.6986585259437561, + "learning_rate": 9.959683196345987e-06, + "loss": 0.6689, + "step": 3097 + }, + { + "epoch": 0.16702609445762345, + "grad_norm": 0.9048603773117065, + "learning_rate": 9.959656323329291e-06, + "loss": 0.7924, + "step": 3098 + }, + { + "epoch": 0.16708000862626698, + "grad_norm": 0.8295788764953613, + "learning_rate": 9.959629441395802e-06, + "loss": 0.843, + "step": 3099 + }, + { + "epoch": 0.1671339227949105, + "grad_norm": 0.838590681552887, + "learning_rate": 9.959602550545568e-06, + "loss": 0.7615, + "step": 3100 + }, + { + "epoch": 0.16718783696355402, + "grad_norm": 0.8323560357093811, + "learning_rate": 9.959575650778639e-06, + "loss": 0.8375, + "step": 3101 + }, + { + "epoch": 0.16724175113219755, + "grad_norm": 0.8825474381446838, + "learning_rate": 9.959548742095062e-06, + "loss": 0.7701, + "step": 3102 + }, + { + "epoch": 0.16729566530084106, + "grad_norm": 0.8911004662513733, + "learning_rate": 9.959521824494884e-06, + "loss": 0.8, + "step": 3103 + }, + { + "epoch": 0.1673495794694846, + "grad_norm": 0.76695317029953, + "learning_rate": 9.959494897978154e-06, + "loss": 0.7177, + "step": 3104 + }, + { + "epoch": 0.1674034936381281, + "grad_norm": 0.9462987184524536, + "learning_rate": 9.959467962544922e-06, + "loss": 0.8479, + "step": 3105 + }, + { + "epoch": 0.16745740780677162, + "grad_norm": 0.7185036540031433, + "learning_rate": 9.959441018195235e-06, + "loss": 0.6444, + "step": 3106 + }, + { + "epoch": 0.16751132197541513, + "grad_norm": 0.9797527194023132, + "learning_rate": 9.959414064929143e-06, + "loss": 0.916, + "step": 3107 + }, + { + "epoch": 0.16756523614405866, + "grad_norm": 0.7815739512443542, + "learning_rate": 9.959387102746693e-06, + "loss": 0.7315, + "step": 3108 + }, + { + "epoch": 
0.1676191503127022, + "grad_norm": 0.9536890387535095, + "learning_rate": 9.959360131647933e-06, + "loss": 0.7795, + "step": 3109 + }, + { + "epoch": 0.1676730644813457, + "grad_norm": 0.7770065069198608, + "learning_rate": 9.959333151632913e-06, + "loss": 0.8203, + "step": 3110 + }, + { + "epoch": 0.16772697864998923, + "grad_norm": 0.8031367659568787, + "learning_rate": 9.959306162701681e-06, + "loss": 0.8362, + "step": 3111 + }, + { + "epoch": 0.16778089281863273, + "grad_norm": 0.8009032011032104, + "learning_rate": 9.959279164854286e-06, + "loss": 0.8113, + "step": 3112 + }, + { + "epoch": 0.16783480698727626, + "grad_norm": 0.8091812133789062, + "learning_rate": 9.959252158090775e-06, + "loss": 0.84, + "step": 3113 + }, + { + "epoch": 0.16788872115591977, + "grad_norm": 0.7102682590484619, + "learning_rate": 9.959225142411197e-06, + "loss": 0.7378, + "step": 3114 + }, + { + "epoch": 0.1679426353245633, + "grad_norm": 0.8190940618515015, + "learning_rate": 9.959198117815602e-06, + "loss": 0.8478, + "step": 3115 + }, + { + "epoch": 0.1679965494932068, + "grad_norm": 0.7320457696914673, + "learning_rate": 9.959171084304037e-06, + "loss": 0.8358, + "step": 3116 + }, + { + "epoch": 0.16805046366185034, + "grad_norm": 0.8222710490226746, + "learning_rate": 9.959144041876551e-06, + "loss": 0.809, + "step": 3117 + }, + { + "epoch": 0.16810437783049387, + "grad_norm": 0.7939282059669495, + "learning_rate": 9.959116990533195e-06, + "loss": 0.8562, + "step": 3118 + }, + { + "epoch": 0.16815829199913737, + "grad_norm": 0.7231613993644714, + "learning_rate": 9.959089930274013e-06, + "loss": 0.7656, + "step": 3119 + }, + { + "epoch": 0.1682122061677809, + "grad_norm": 0.8997424840927124, + "learning_rate": 9.959062861099058e-06, + "loss": 0.8831, + "step": 3120 + }, + { + "epoch": 0.1682661203364244, + "grad_norm": 0.80366450548172, + "learning_rate": 9.959035783008374e-06, + "loss": 0.8044, + "step": 3121 + }, + { + "epoch": 0.16832003450506794, + "grad_norm": 
0.8153119683265686, + "learning_rate": 9.959008696002015e-06, + "loss": 0.8325, + "step": 3122 + }, + { + "epoch": 0.16837394867371144, + "grad_norm": 0.8638020157814026, + "learning_rate": 9.958981600080026e-06, + "loss": 0.8197, + "step": 3123 + }, + { + "epoch": 0.16842786284235498, + "grad_norm": 0.8430980443954468, + "learning_rate": 9.95895449524246e-06, + "loss": 0.8212, + "step": 3124 + }, + { + "epoch": 0.16848177701099848, + "grad_norm": 0.9273066520690918, + "learning_rate": 9.958927381489358e-06, + "loss": 0.8145, + "step": 3125 + }, + { + "epoch": 0.168535691179642, + "grad_norm": 0.8697495460510254, + "learning_rate": 9.958900258820777e-06, + "loss": 0.8519, + "step": 3126 + }, + { + "epoch": 0.16858960534828554, + "grad_norm": 0.7957634925842285, + "learning_rate": 9.95887312723676e-06, + "loss": 0.8065, + "step": 3127 + }, + { + "epoch": 0.16864351951692905, + "grad_norm": 0.8890637755393982, + "learning_rate": 9.958845986737357e-06, + "loss": 0.822, + "step": 3128 + }, + { + "epoch": 0.16869743368557258, + "grad_norm": 0.7979970574378967, + "learning_rate": 9.95881883732262e-06, + "loss": 0.8346, + "step": 3129 + }, + { + "epoch": 0.16875134785421608, + "grad_norm": 0.8589211106300354, + "learning_rate": 9.958791678992594e-06, + "loss": 0.7498, + "step": 3130 + }, + { + "epoch": 0.16880526202285961, + "grad_norm": 0.7819254398345947, + "learning_rate": 9.95876451174733e-06, + "loss": 0.7515, + "step": 3131 + }, + { + "epoch": 0.16885917619150312, + "grad_norm": 0.9037144184112549, + "learning_rate": 9.958737335586877e-06, + "loss": 0.7684, + "step": 3132 + }, + { + "epoch": 0.16891309036014665, + "grad_norm": 0.9139670133590698, + "learning_rate": 9.958710150511282e-06, + "loss": 0.7848, + "step": 3133 + }, + { + "epoch": 0.16896700452879015, + "grad_norm": 0.8177505135536194, + "learning_rate": 9.958682956520596e-06, + "loss": 0.8656, + "step": 3134 + }, + { + "epoch": 0.1690209186974337, + "grad_norm": 0.7351679801940918, + "learning_rate": 
9.958655753614865e-06, + "loss": 0.769, + "step": 3135 + }, + { + "epoch": 0.16907483286607722, + "grad_norm": 0.8661699891090393, + "learning_rate": 9.958628541794142e-06, + "loss": 0.8523, + "step": 3136 + }, + { + "epoch": 0.16912874703472072, + "grad_norm": 0.7755950689315796, + "learning_rate": 9.958601321058471e-06, + "loss": 0.7737, + "step": 3137 + }, + { + "epoch": 0.16918266120336425, + "grad_norm": 0.8523197174072266, + "learning_rate": 9.958574091407906e-06, + "loss": 0.8508, + "step": 3138 + }, + { + "epoch": 0.16923657537200776, + "grad_norm": 0.7154935598373413, + "learning_rate": 9.958546852842493e-06, + "loss": 0.6725, + "step": 3139 + }, + { + "epoch": 0.1692904895406513, + "grad_norm": 0.8140445947647095, + "learning_rate": 9.95851960536228e-06, + "loss": 0.92, + "step": 3140 + }, + { + "epoch": 0.1693444037092948, + "grad_norm": 0.7320675849914551, + "learning_rate": 9.95849234896732e-06, + "loss": 0.8091, + "step": 3141 + }, + { + "epoch": 0.16939831787793833, + "grad_norm": 0.7761030197143555, + "learning_rate": 9.958465083657659e-06, + "loss": 0.7444, + "step": 3142 + }, + { + "epoch": 0.16945223204658183, + "grad_norm": 0.8432923555374146, + "learning_rate": 9.958437809433345e-06, + "loss": 0.8112, + "step": 3143 + }, + { + "epoch": 0.16950614621522536, + "grad_norm": 0.8015188574790955, + "learning_rate": 9.958410526294428e-06, + "loss": 0.8383, + "step": 3144 + }, + { + "epoch": 0.1695600603838689, + "grad_norm": 0.7635226845741272, + "learning_rate": 9.95838323424096e-06, + "loss": 0.7942, + "step": 3145 + }, + { + "epoch": 0.1696139745525124, + "grad_norm": 0.942131757736206, + "learning_rate": 9.958355933272986e-06, + "loss": 0.8877, + "step": 3146 + }, + { + "epoch": 0.16966788872115593, + "grad_norm": 1.1072907447814941, + "learning_rate": 9.958328623390558e-06, + "loss": 0.7369, + "step": 3147 + }, + { + "epoch": 0.16972180288979943, + "grad_norm": 0.8342657685279846, + "learning_rate": 9.958301304593722e-06, + "loss": 0.7946, + 
"step": 3148 + }, + { + "epoch": 0.16977571705844297, + "grad_norm": 0.7320284843444824, + "learning_rate": 9.958273976882531e-06, + "loss": 0.754, + "step": 3149 + }, + { + "epoch": 0.16982963122708647, + "grad_norm": 0.7840715646743774, + "learning_rate": 9.958246640257031e-06, + "loss": 0.7897, + "step": 3150 + }, + { + "epoch": 0.16988354539573, + "grad_norm": 0.7383304834365845, + "learning_rate": 9.958219294717273e-06, + "loss": 0.8205, + "step": 3151 + }, + { + "epoch": 0.1699374595643735, + "grad_norm": 0.7597193121910095, + "learning_rate": 9.958191940263305e-06, + "loss": 0.8016, + "step": 3152 + }, + { + "epoch": 0.16999137373301704, + "grad_norm": 0.7770809531211853, + "learning_rate": 9.958164576895176e-06, + "loss": 0.7228, + "step": 3153 + }, + { + "epoch": 0.17004528790166057, + "grad_norm": 0.891514241695404, + "learning_rate": 9.958137204612936e-06, + "loss": 0.8598, + "step": 3154 + }, + { + "epoch": 0.17009920207030407, + "grad_norm": 0.8025946021080017, + "learning_rate": 9.958109823416635e-06, + "loss": 0.8979, + "step": 3155 + }, + { + "epoch": 0.1701531162389476, + "grad_norm": 0.7912386059761047, + "learning_rate": 9.95808243330632e-06, + "loss": 0.7562, + "step": 3156 + }, + { + "epoch": 0.1702070304075911, + "grad_norm": 0.8642987608909607, + "learning_rate": 9.958055034282043e-06, + "loss": 0.7916, + "step": 3157 + }, + { + "epoch": 0.17026094457623464, + "grad_norm": 0.8047364950180054, + "learning_rate": 9.958027626343852e-06, + "loss": 0.7598, + "step": 3158 + }, + { + "epoch": 0.17031485874487814, + "grad_norm": 0.8402281999588013, + "learning_rate": 9.958000209491794e-06, + "loss": 0.8572, + "step": 3159 + }, + { + "epoch": 0.17036877291352168, + "grad_norm": 0.7486295700073242, + "learning_rate": 9.95797278372592e-06, + "loss": 0.7221, + "step": 3160 + }, + { + "epoch": 0.17042268708216518, + "grad_norm": 0.7889320254325867, + "learning_rate": 9.95794534904628e-06, + "loss": 0.7734, + "step": 3161 + }, + { + "epoch": 
0.1704766012508087, + "grad_norm": 0.7864039540290833, + "learning_rate": 9.957917905452925e-06, + "loss": 0.7763, + "step": 3162 + }, + { + "epoch": 0.17053051541945224, + "grad_norm": 0.8366582989692688, + "learning_rate": 9.957890452945903e-06, + "loss": 0.8594, + "step": 3163 + }, + { + "epoch": 0.17058442958809575, + "grad_norm": 0.8014213442802429, + "learning_rate": 9.95786299152526e-06, + "loss": 0.7802, + "step": 3164 + }, + { + "epoch": 0.17063834375673928, + "grad_norm": 0.8158774375915527, + "learning_rate": 9.957835521191048e-06, + "loss": 0.7693, + "step": 3165 + }, + { + "epoch": 0.17069225792538278, + "grad_norm": 1.0622320175170898, + "learning_rate": 9.957808041943316e-06, + "loss": 0.8949, + "step": 3166 + }, + { + "epoch": 0.17074617209402632, + "grad_norm": 0.7825013399124146, + "learning_rate": 9.957780553782114e-06, + "loss": 0.7681, + "step": 3167 + }, + { + "epoch": 0.17080008626266982, + "grad_norm": 1.0727826356887817, + "learning_rate": 9.957753056707493e-06, + "loss": 0.876, + "step": 3168 + }, + { + "epoch": 0.17085400043131335, + "grad_norm": 0.7952837944030762, + "learning_rate": 9.9577255507195e-06, + "loss": 0.7671, + "step": 3169 + }, + { + "epoch": 0.17090791459995686, + "grad_norm": 0.7251336574554443, + "learning_rate": 9.957698035818185e-06, + "loss": 0.7938, + "step": 3170 + }, + { + "epoch": 0.1709618287686004, + "grad_norm": 0.8674930930137634, + "learning_rate": 9.957670512003598e-06, + "loss": 0.9387, + "step": 3171 + }, + { + "epoch": 0.17101574293724392, + "grad_norm": 0.7578595876693726, + "learning_rate": 9.957642979275787e-06, + "loss": 0.8295, + "step": 3172 + }, + { + "epoch": 0.17106965710588742, + "grad_norm": 0.8236204385757446, + "learning_rate": 9.957615437634802e-06, + "loss": 0.871, + "step": 3173 + }, + { + "epoch": 0.17112357127453096, + "grad_norm": 0.7528506517410278, + "learning_rate": 9.957587887080696e-06, + "loss": 0.7034, + "step": 3174 + }, + { + "epoch": 0.17117748544317446, + "grad_norm": 
0.8170275092124939, + "learning_rate": 9.957560327613514e-06, + "loss": 0.7412, + "step": 3175 + }, + { + "epoch": 0.171231399611818, + "grad_norm": 0.91305011510849, + "learning_rate": 9.957532759233307e-06, + "loss": 0.8861, + "step": 3176 + }, + { + "epoch": 0.1712853137804615, + "grad_norm": 0.7793359756469727, + "learning_rate": 9.957505181940124e-06, + "loss": 0.8106, + "step": 3177 + }, + { + "epoch": 0.17133922794910503, + "grad_norm": 0.9424631595611572, + "learning_rate": 9.957477595734016e-06, + "loss": 0.8271, + "step": 3178 + }, + { + "epoch": 0.17139314211774853, + "grad_norm": 0.8909611701965332, + "learning_rate": 9.957450000615031e-06, + "loss": 0.8711, + "step": 3179 + }, + { + "epoch": 0.17144705628639206, + "grad_norm": 0.703960657119751, + "learning_rate": 9.95742239658322e-06, + "loss": 0.6693, + "step": 3180 + }, + { + "epoch": 0.1715009704550356, + "grad_norm": 0.8511449098587036, + "learning_rate": 9.957394783638632e-06, + "loss": 0.8075, + "step": 3181 + }, + { + "epoch": 0.1715548846236791, + "grad_norm": 0.93243008852005, + "learning_rate": 9.957367161781318e-06, + "loss": 0.8663, + "step": 3182 + }, + { + "epoch": 0.17160879879232263, + "grad_norm": 0.926092803478241, + "learning_rate": 9.957339531011325e-06, + "loss": 0.8973, + "step": 3183 + }, + { + "epoch": 0.17166271296096614, + "grad_norm": 0.8564586043357849, + "learning_rate": 9.957311891328705e-06, + "loss": 0.7561, + "step": 3184 + }, + { + "epoch": 0.17171662712960967, + "grad_norm": 0.8317960500717163, + "learning_rate": 9.957284242733507e-06, + "loss": 0.817, + "step": 3185 + }, + { + "epoch": 0.17177054129825317, + "grad_norm": 0.7291557788848877, + "learning_rate": 9.95725658522578e-06, + "loss": 0.6963, + "step": 3186 + }, + { + "epoch": 0.1718244554668967, + "grad_norm": 0.8154743313789368, + "learning_rate": 9.957228918805574e-06, + "loss": 0.8005, + "step": 3187 + }, + { + "epoch": 0.1718783696355402, + "grad_norm": 0.7985217571258545, + "learning_rate": 
9.95720124347294e-06, + "loss": 0.8471, + "step": 3188 + }, + { + "epoch": 0.17193228380418374, + "grad_norm": 0.7928630709648132, + "learning_rate": 9.957173559227926e-06, + "loss": 0.8809, + "step": 3189 + }, + { + "epoch": 0.17198619797282727, + "grad_norm": 0.800392210483551, + "learning_rate": 9.957145866070583e-06, + "loss": 0.8031, + "step": 3190 + }, + { + "epoch": 0.17204011214147077, + "grad_norm": 0.8904628157615662, + "learning_rate": 9.95711816400096e-06, + "loss": 0.7583, + "step": 3191 + }, + { + "epoch": 0.1720940263101143, + "grad_norm": 0.7246114611625671, + "learning_rate": 9.957090453019106e-06, + "loss": 0.7365, + "step": 3192 + }, + { + "epoch": 0.1721479404787578, + "grad_norm": 0.8280320763587952, + "learning_rate": 9.957062733125074e-06, + "loss": 0.7723, + "step": 3193 + }, + { + "epoch": 0.17220185464740134, + "grad_norm": 0.929804265499115, + "learning_rate": 9.957035004318911e-06, + "loss": 0.8412, + "step": 3194 + }, + { + "epoch": 0.17225576881604485, + "grad_norm": 0.815108060836792, + "learning_rate": 9.957007266600666e-06, + "loss": 0.8076, + "step": 3195 + }, + { + "epoch": 0.17230968298468838, + "grad_norm": 0.7849567532539368, + "learning_rate": 9.956979519970393e-06, + "loss": 0.8245, + "step": 3196 + }, + { + "epoch": 0.17236359715333188, + "grad_norm": 1.458945393562317, + "learning_rate": 9.956951764428138e-06, + "loss": 0.7647, + "step": 3197 + }, + { + "epoch": 0.17241751132197541, + "grad_norm": 0.8327317833900452, + "learning_rate": 9.956923999973954e-06, + "loss": 0.8824, + "step": 3198 + }, + { + "epoch": 0.17247142549061895, + "grad_norm": 0.7398284077644348, + "learning_rate": 9.956896226607887e-06, + "loss": 0.7907, + "step": 3199 + }, + { + "epoch": 0.17252533965926245, + "grad_norm": 0.8546818494796753, + "learning_rate": 9.95686844432999e-06, + "loss": 0.8723, + "step": 3200 + }, + { + "epoch": 0.17257925382790598, + "grad_norm": 0.7967200875282288, + "learning_rate": 9.956840653140311e-06, + "loss": 0.8156, + 
"step": 3201 + }, + { + "epoch": 0.17263316799654949, + "grad_norm": 0.9093504548072815, + "learning_rate": 9.956812853038903e-06, + "loss": 0.8002, + "step": 3202 + }, + { + "epoch": 0.17268708216519302, + "grad_norm": 0.7995857000350952, + "learning_rate": 9.956785044025811e-06, + "loss": 0.8413, + "step": 3203 + }, + { + "epoch": 0.17274099633383652, + "grad_norm": 0.828748881816864, + "learning_rate": 9.95675722610109e-06, + "loss": 0.7162, + "step": 3204 + }, + { + "epoch": 0.17279491050248005, + "grad_norm": 0.7679111361503601, + "learning_rate": 9.956729399264789e-06, + "loss": 0.7909, + "step": 3205 + }, + { + "epoch": 0.17284882467112356, + "grad_norm": 0.9187313318252563, + "learning_rate": 9.956701563516956e-06, + "loss": 0.8537, + "step": 3206 + }, + { + "epoch": 0.1729027388397671, + "grad_norm": 0.7859029173851013, + "learning_rate": 9.956673718857642e-06, + "loss": 0.7392, + "step": 3207 + }, + { + "epoch": 0.17295665300841062, + "grad_norm": 0.8365893363952637, + "learning_rate": 9.956645865286897e-06, + "loss": 0.7921, + "step": 3208 + }, + { + "epoch": 0.17301056717705413, + "grad_norm": 0.912382960319519, + "learning_rate": 9.956618002804771e-06, + "loss": 0.8651, + "step": 3209 + }, + { + "epoch": 0.17306448134569766, + "grad_norm": 0.7380210757255554, + "learning_rate": 9.956590131411314e-06, + "loss": 0.7031, + "step": 3210 + }, + { + "epoch": 0.17311839551434116, + "grad_norm": 0.7943229675292969, + "learning_rate": 9.956562251106578e-06, + "loss": 0.7725, + "step": 3211 + }, + { + "epoch": 0.1731723096829847, + "grad_norm": 0.8835777640342712, + "learning_rate": 9.95653436189061e-06, + "loss": 0.8633, + "step": 3212 + }, + { + "epoch": 0.1732262238516282, + "grad_norm": 0.8082174062728882, + "learning_rate": 9.956506463763464e-06, + "loss": 0.8833, + "step": 3213 + }, + { + "epoch": 0.17328013802027173, + "grad_norm": 0.8236085772514343, + "learning_rate": 9.956478556725186e-06, + "loss": 0.8517, + "step": 3214 + }, + { + "epoch": 
0.17333405218891526, + "grad_norm": 0.8428922891616821, + "learning_rate": 9.956450640775829e-06, + "loss": 0.8659, + "step": 3215 + }, + { + "epoch": 0.17338796635755876, + "grad_norm": 0.8443105220794678, + "learning_rate": 9.95642271591544e-06, + "loss": 0.9589, + "step": 3216 + }, + { + "epoch": 0.1734418805262023, + "grad_norm": 0.7856699228286743, + "learning_rate": 9.956394782144074e-06, + "loss": 0.787, + "step": 3217 + }, + { + "epoch": 0.1734957946948458, + "grad_norm": 0.8537113666534424, + "learning_rate": 9.95636683946178e-06, + "loss": 0.9339, + "step": 3218 + }, + { + "epoch": 0.17354970886348933, + "grad_norm": 0.8206045627593994, + "learning_rate": 9.956338887868603e-06, + "loss": 0.832, + "step": 3219 + }, + { + "epoch": 0.17360362303213284, + "grad_norm": 0.7913991808891296, + "learning_rate": 9.956310927364599e-06, + "loss": 0.7647, + "step": 3220 + }, + { + "epoch": 0.17365753720077637, + "grad_norm": 0.9481332302093506, + "learning_rate": 9.956282957949817e-06, + "loss": 0.7113, + "step": 3221 + }, + { + "epoch": 0.17371145136941987, + "grad_norm": 0.9326061606407166, + "learning_rate": 9.956254979624304e-06, + "loss": 0.8324, + "step": 3222 + }, + { + "epoch": 0.1737653655380634, + "grad_norm": 1.0496339797973633, + "learning_rate": 9.956226992388117e-06, + "loss": 0.7959, + "step": 3223 + }, + { + "epoch": 0.17381927970670694, + "grad_norm": 0.8025851249694824, + "learning_rate": 9.9561989962413e-06, + "loss": 0.811, + "step": 3224 + }, + { + "epoch": 0.17387319387535044, + "grad_norm": 0.9083681106567383, + "learning_rate": 9.956170991183905e-06, + "loss": 0.7957, + "step": 3225 + }, + { + "epoch": 0.17392710804399397, + "grad_norm": 0.8242226243019104, + "learning_rate": 9.956142977215983e-06, + "loss": 0.8224, + "step": 3226 + }, + { + "epoch": 0.17398102221263748, + "grad_norm": 0.8805774450302124, + "learning_rate": 9.956114954337586e-06, + "loss": 0.8847, + "step": 3227 + }, + { + "epoch": 0.174034936381281, + "grad_norm": 
0.748651921749115, + "learning_rate": 9.956086922548761e-06, + "loss": 0.7719, + "step": 3228 + }, + { + "epoch": 0.1740888505499245, + "grad_norm": 0.7385552525520325, + "learning_rate": 9.956058881849562e-06, + "loss": 0.7591, + "step": 3229 + }, + { + "epoch": 0.17414276471856804, + "grad_norm": 0.7795779705047607, + "learning_rate": 9.956030832240037e-06, + "loss": 0.8071, + "step": 3230 + }, + { + "epoch": 0.17419667888721155, + "grad_norm": 9.106490135192871, + "learning_rate": 9.956002773720236e-06, + "loss": 0.7915, + "step": 3231 + }, + { + "epoch": 0.17425059305585508, + "grad_norm": 0.861794650554657, + "learning_rate": 9.955974706290212e-06, + "loss": 0.8293, + "step": 3232 + }, + { + "epoch": 0.1743045072244986, + "grad_norm": 0.8002027869224548, + "learning_rate": 9.955946629950012e-06, + "loss": 0.8404, + "step": 3233 + }, + { + "epoch": 0.17435842139314212, + "grad_norm": 0.8162701725959778, + "learning_rate": 9.95591854469969e-06, + "loss": 0.8362, + "step": 3234 + }, + { + "epoch": 0.17441233556178565, + "grad_norm": 0.7436956763267517, + "learning_rate": 9.955890450539295e-06, + "loss": 0.8339, + "step": 3235 + }, + { + "epoch": 0.17446624973042915, + "grad_norm": 0.8074719309806824, + "learning_rate": 9.955862347468875e-06, + "loss": 0.8403, + "step": 3236 + }, + { + "epoch": 0.17452016389907268, + "grad_norm": 0.8527933955192566, + "learning_rate": 9.955834235488485e-06, + "loss": 0.8201, + "step": 3237 + }, + { + "epoch": 0.1745740780677162, + "grad_norm": 0.792177140712738, + "learning_rate": 9.955806114598173e-06, + "loss": 0.8304, + "step": 3238 + }, + { + "epoch": 0.17462799223635972, + "grad_norm": 0.8211845755577087, + "learning_rate": 9.95577798479799e-06, + "loss": 0.8013, + "step": 3239 + }, + { + "epoch": 0.17468190640500322, + "grad_norm": 0.906973659992218, + "learning_rate": 9.955749846087986e-06, + "loss": 0.823, + "step": 3240 + }, + { + "epoch": 0.17473582057364676, + "grad_norm": 0.904077410697937, + "learning_rate": 
9.955721698468213e-06, + "loss": 0.7651, + "step": 3241 + }, + { + "epoch": 0.1747897347422903, + "grad_norm": 0.8147358298301697, + "learning_rate": 9.95569354193872e-06, + "loss": 0.9268, + "step": 3242 + }, + { + "epoch": 0.1748436489109338, + "grad_norm": 0.8664659857749939, + "learning_rate": 9.95566537649956e-06, + "loss": 0.8366, + "step": 3243 + }, + { + "epoch": 0.17489756307957732, + "grad_norm": 0.6882225871086121, + "learning_rate": 9.95563720215078e-06, + "loss": 0.7152, + "step": 3244 + }, + { + "epoch": 0.17495147724822083, + "grad_norm": 0.7605637907981873, + "learning_rate": 9.955609018892434e-06, + "loss": 0.7864, + "step": 3245 + }, + { + "epoch": 0.17500539141686436, + "grad_norm": 0.7316586375236511, + "learning_rate": 9.95558082672457e-06, + "loss": 0.7175, + "step": 3246 + }, + { + "epoch": 0.17505930558550786, + "grad_norm": 0.8258477449417114, + "learning_rate": 9.955552625647241e-06, + "loss": 0.8463, + "step": 3247 + }, + { + "epoch": 0.1751132197541514, + "grad_norm": 0.7658422589302063, + "learning_rate": 9.955524415660498e-06, + "loss": 0.9477, + "step": 3248 + }, + { + "epoch": 0.1751671339227949, + "grad_norm": 0.9374455809593201, + "learning_rate": 9.955496196764387e-06, + "loss": 0.8725, + "step": 3249 + }, + { + "epoch": 0.17522104809143843, + "grad_norm": 0.7676389813423157, + "learning_rate": 9.955467968958965e-06, + "loss": 0.7868, + "step": 3250 + }, + { + "epoch": 0.17527496226008196, + "grad_norm": 0.9800841808319092, + "learning_rate": 9.955439732244279e-06, + "loss": 0.7787, + "step": 3251 + }, + { + "epoch": 0.17532887642872547, + "grad_norm": 0.7501618266105652, + "learning_rate": 9.95541148662038e-06, + "loss": 0.7703, + "step": 3252 + }, + { + "epoch": 0.175382790597369, + "grad_norm": 0.8019260168075562, + "learning_rate": 9.95538323208732e-06, + "loss": 0.7635, + "step": 3253 + }, + { + "epoch": 0.1754367047660125, + "grad_norm": 0.7791414260864258, + "learning_rate": 9.95535496864515e-06, + "loss": 0.7372, + "step": 
3254 + }, + { + "epoch": 0.17549061893465603, + "grad_norm": 0.7667005658149719, + "learning_rate": 9.955326696293921e-06, + "loss": 0.8481, + "step": 3255 + }, + { + "epoch": 0.17554453310329954, + "grad_norm": 0.7585765719413757, + "learning_rate": 9.955298415033681e-06, + "loss": 0.7933, + "step": 3256 + }, + { + "epoch": 0.17559844727194307, + "grad_norm": 0.8037384152412415, + "learning_rate": 9.955270124864485e-06, + "loss": 0.8716, + "step": 3257 + }, + { + "epoch": 0.17565236144058657, + "grad_norm": 0.7610961198806763, + "learning_rate": 9.955241825786379e-06, + "loss": 0.7647, + "step": 3258 + }, + { + "epoch": 0.1757062756092301, + "grad_norm": 0.7867752909660339, + "learning_rate": 9.955213517799418e-06, + "loss": 0.7685, + "step": 3259 + }, + { + "epoch": 0.17576018977787364, + "grad_norm": 1.1530165672302246, + "learning_rate": 9.955185200903652e-06, + "loss": 0.9032, + "step": 3260 + }, + { + "epoch": 0.17581410394651714, + "grad_norm": 0.7161276936531067, + "learning_rate": 9.955156875099129e-06, + "loss": 0.7367, + "step": 3261 + }, + { + "epoch": 0.17586801811516067, + "grad_norm": 0.7634873390197754, + "learning_rate": 9.955128540385903e-06, + "loss": 0.6914, + "step": 3262 + }, + { + "epoch": 0.17592193228380418, + "grad_norm": 0.8375166654586792, + "learning_rate": 9.955100196764025e-06, + "loss": 0.965, + "step": 3263 + }, + { + "epoch": 0.1759758464524477, + "grad_norm": 0.784824788570404, + "learning_rate": 9.955071844233545e-06, + "loss": 0.7825, + "step": 3264 + }, + { + "epoch": 0.1760297606210912, + "grad_norm": 0.7765333652496338, + "learning_rate": 9.955043482794514e-06, + "loss": 0.9057, + "step": 3265 + }, + { + "epoch": 0.17608367478973475, + "grad_norm": 0.9159989356994629, + "learning_rate": 9.955015112446985e-06, + "loss": 0.8055, + "step": 3266 + }, + { + "epoch": 0.17613758895837825, + "grad_norm": 0.8813021183013916, + "learning_rate": 9.954986733191003e-06, + "loss": 0.8811, + "step": 3267 + }, + { + "epoch": 
0.17619150312702178, + "grad_norm": 0.7664482593536377, + "learning_rate": 9.954958345026627e-06, + "loss": 0.7138, + "step": 3268 + }, + { + "epoch": 0.1762454172956653, + "grad_norm": 0.8903096914291382, + "learning_rate": 9.954929947953902e-06, + "loss": 0.8884, + "step": 3269 + }, + { + "epoch": 0.17629933146430882, + "grad_norm": 0.750549852848053, + "learning_rate": 9.95490154197288e-06, + "loss": 0.7948, + "step": 3270 + }, + { + "epoch": 0.17635324563295235, + "grad_norm": 0.8723561763763428, + "learning_rate": 9.954873127083615e-06, + "loss": 0.8896, + "step": 3271 + }, + { + "epoch": 0.17640715980159585, + "grad_norm": 0.8852900862693787, + "learning_rate": 9.954844703286157e-06, + "loss": 0.8504, + "step": 3272 + }, + { + "epoch": 0.17646107397023938, + "grad_norm": 0.8535251021385193, + "learning_rate": 9.954816270580555e-06, + "loss": 0.7198, + "step": 3273 + }, + { + "epoch": 0.1765149881388829, + "grad_norm": 0.8378668427467346, + "learning_rate": 9.954787828966864e-06, + "loss": 0.8361, + "step": 3274 + }, + { + "epoch": 0.17656890230752642, + "grad_norm": 0.7617664337158203, + "learning_rate": 9.954759378445132e-06, + "loss": 0.8147, + "step": 3275 + }, + { + "epoch": 0.17662281647616992, + "grad_norm": 0.8433284163475037, + "learning_rate": 9.95473091901541e-06, + "loss": 0.9083, + "step": 3276 + }, + { + "epoch": 0.17667673064481346, + "grad_norm": 0.82453453540802, + "learning_rate": 9.954702450677749e-06, + "loss": 0.8646, + "step": 3277 + }, + { + "epoch": 0.176730644813457, + "grad_norm": 0.8066715598106384, + "learning_rate": 9.954673973432202e-06, + "loss": 0.7837, + "step": 3278 + }, + { + "epoch": 0.1767845589821005, + "grad_norm": 0.7899057865142822, + "learning_rate": 9.95464548727882e-06, + "loss": 0.8418, + "step": 3279 + }, + { + "epoch": 0.17683847315074402, + "grad_norm": 0.7744193077087402, + "learning_rate": 9.954616992217654e-06, + "loss": 0.7316, + "step": 3280 + }, + { + "epoch": 0.17689238731938753, + "grad_norm": 
0.9195299744606018, + "learning_rate": 9.954588488248756e-06, + "loss": 0.9387, + "step": 3281 + }, + { + "epoch": 0.17694630148803106, + "grad_norm": 0.9263700246810913, + "learning_rate": 9.954559975372173e-06, + "loss": 0.7165, + "step": 3282 + }, + { + "epoch": 0.17700021565667456, + "grad_norm": 0.7949888706207275, + "learning_rate": 9.954531453587962e-06, + "loss": 0.7981, + "step": 3283 + }, + { + "epoch": 0.1770541298253181, + "grad_norm": 0.9938671588897705, + "learning_rate": 9.95450292289617e-06, + "loss": 0.754, + "step": 3284 + }, + { + "epoch": 0.1771080439939616, + "grad_norm": 0.7466611862182617, + "learning_rate": 9.95447438329685e-06, + "loss": 0.8182, + "step": 3285 + }, + { + "epoch": 0.17716195816260513, + "grad_norm": 0.7918881177902222, + "learning_rate": 9.954445834790054e-06, + "loss": 0.6938, + "step": 3286 + }, + { + "epoch": 0.17721587233124866, + "grad_norm": 0.7867146730422974, + "learning_rate": 9.954417277375832e-06, + "loss": 0.7999, + "step": 3287 + }, + { + "epoch": 0.17726978649989217, + "grad_norm": 0.7873522043228149, + "learning_rate": 9.954388711054237e-06, + "loss": 0.7822, + "step": 3288 + }, + { + "epoch": 0.1773237006685357, + "grad_norm": 0.7909482717514038, + "learning_rate": 9.954360135825319e-06, + "loss": 0.724, + "step": 3289 + }, + { + "epoch": 0.1773776148371792, + "grad_norm": 0.7893263697624207, + "learning_rate": 9.954331551689129e-06, + "loss": 0.8892, + "step": 3290 + }, + { + "epoch": 0.17743152900582274, + "grad_norm": 0.813908040523529, + "learning_rate": 9.954302958645719e-06, + "loss": 0.8261, + "step": 3291 + }, + { + "epoch": 0.17748544317446624, + "grad_norm": 1.0279232263565063, + "learning_rate": 9.95427435669514e-06, + "loss": 0.8383, + "step": 3292 + }, + { + "epoch": 0.17753935734310977, + "grad_norm": 0.7427249550819397, + "learning_rate": 9.954245745837445e-06, + "loss": 0.7883, + "step": 3293 + }, + { + "epoch": 0.17759327151175328, + "grad_norm": 0.7699581980705261, + "learning_rate": 
9.954217126072686e-06, + "loss": 0.749, + "step": 3294 + }, + { + "epoch": 0.1776471856803968, + "grad_norm": 0.8005263209342957, + "learning_rate": 9.954188497400909e-06, + "loss": 0.7886, + "step": 3295 + }, + { + "epoch": 0.17770109984904034, + "grad_norm": 0.8718039393424988, + "learning_rate": 9.95415985982217e-06, + "loss": 0.8397, + "step": 3296 + }, + { + "epoch": 0.17775501401768384, + "grad_norm": 0.7747098207473755, + "learning_rate": 9.954131213336522e-06, + "loss": 0.7193, + "step": 3297 + }, + { + "epoch": 0.17780892818632738, + "grad_norm": 0.8327599167823792, + "learning_rate": 9.954102557944013e-06, + "loss": 0.8484, + "step": 3298 + }, + { + "epoch": 0.17786284235497088, + "grad_norm": 0.7737470269203186, + "learning_rate": 9.954073893644696e-06, + "loss": 0.7638, + "step": 3299 + }, + { + "epoch": 0.1779167565236144, + "grad_norm": 0.8054937124252319, + "learning_rate": 9.954045220438622e-06, + "loss": 0.7772, + "step": 3300 + }, + { + "epoch": 0.17797067069225792, + "grad_norm": 0.7954006195068359, + "learning_rate": 9.954016538325844e-06, + "loss": 0.7746, + "step": 3301 + }, + { + "epoch": 0.17802458486090145, + "grad_norm": 0.8075349926948547, + "learning_rate": 9.95398784730641e-06, + "loss": 0.794, + "step": 3302 + }, + { + "epoch": 0.17807849902954495, + "grad_norm": 0.8701021075248718, + "learning_rate": 9.953959147380376e-06, + "loss": 0.8493, + "step": 3303 + }, + { + "epoch": 0.17813241319818848, + "grad_norm": 0.9046748876571655, + "learning_rate": 9.953930438547792e-06, + "loss": 0.8491, + "step": 3304 + }, + { + "epoch": 0.17818632736683201, + "grad_norm": 0.8041692972183228, + "learning_rate": 9.953901720808708e-06, + "loss": 0.7422, + "step": 3305 + }, + { + "epoch": 0.17824024153547552, + "grad_norm": 0.8486021757125854, + "learning_rate": 9.953872994163176e-06, + "loss": 0.7876, + "step": 3306 + }, + { + "epoch": 0.17829415570411905, + "grad_norm": 0.7282015085220337, + "learning_rate": 9.95384425861125e-06, + "loss": 0.7729, + 
"step": 3307 + }, + { + "epoch": 0.17834806987276255, + "grad_norm": 0.8199304342269897, + "learning_rate": 9.953815514152979e-06, + "loss": 0.8046, + "step": 3308 + }, + { + "epoch": 0.1784019840414061, + "grad_norm": 0.9033650755882263, + "learning_rate": 9.953786760788416e-06, + "loss": 0.735, + "step": 3309 + }, + { + "epoch": 0.1784558982100496, + "grad_norm": 1.1363990306854248, + "learning_rate": 9.953757998517614e-06, + "loss": 0.8351, + "step": 3310 + }, + { + "epoch": 0.17850981237869312, + "grad_norm": 0.747763454914093, + "learning_rate": 9.953729227340621e-06, + "loss": 0.7603, + "step": 3311 + }, + { + "epoch": 0.17856372654733663, + "grad_norm": 0.8733643293380737, + "learning_rate": 9.953700447257493e-06, + "loss": 0.8538, + "step": 3312 + }, + { + "epoch": 0.17861764071598016, + "grad_norm": 0.8054553270339966, + "learning_rate": 9.953671658268279e-06, + "loss": 0.6782, + "step": 3313 + }, + { + "epoch": 0.1786715548846237, + "grad_norm": 0.8797160387039185, + "learning_rate": 9.953642860373032e-06, + "loss": 0.613, + "step": 3314 + }, + { + "epoch": 0.1787254690532672, + "grad_norm": 0.7065737843513489, + "learning_rate": 9.953614053571802e-06, + "loss": 0.7912, + "step": 3315 + }, + { + "epoch": 0.17877938322191073, + "grad_norm": 0.8206682205200195, + "learning_rate": 9.953585237864642e-06, + "loss": 0.8505, + "step": 3316 + }, + { + "epoch": 0.17883329739055423, + "grad_norm": 0.7129380702972412, + "learning_rate": 9.953556413251605e-06, + "loss": 0.7242, + "step": 3317 + }, + { + "epoch": 0.17888721155919776, + "grad_norm": 0.8084376454353333, + "learning_rate": 9.953527579732742e-06, + "loss": 0.7626, + "step": 3318 + }, + { + "epoch": 0.17894112572784127, + "grad_norm": 0.8610605001449585, + "learning_rate": 9.953498737308103e-06, + "loss": 0.8255, + "step": 3319 + }, + { + "epoch": 0.1789950398964848, + "grad_norm": 0.7437496185302734, + "learning_rate": 9.953469885977742e-06, + "loss": 0.677, + "step": 3320 + }, + { + "epoch": 
0.17904895406512833, + "grad_norm": 0.7540122270584106, + "learning_rate": 9.95344102574171e-06, + "loss": 0.7094, + "step": 3321 + }, + { + "epoch": 0.17910286823377183, + "grad_norm": 0.8017913699150085, + "learning_rate": 9.95341215660006e-06, + "loss": 0.8882, + "step": 3322 + }, + { + "epoch": 0.17915678240241537, + "grad_norm": 1.0244393348693848, + "learning_rate": 9.953383278552841e-06, + "loss": 0.7879, + "step": 3323 + }, + { + "epoch": 0.17921069657105887, + "grad_norm": 0.7007571458816528, + "learning_rate": 9.953354391600109e-06, + "loss": 0.757, + "step": 3324 + }, + { + "epoch": 0.1792646107397024, + "grad_norm": 0.8408647775650024, + "learning_rate": 9.953325495741913e-06, + "loss": 0.7772, + "step": 3325 + }, + { + "epoch": 0.1793185249083459, + "grad_norm": 0.718988299369812, + "learning_rate": 9.953296590978305e-06, + "loss": 0.7885, + "step": 3326 + }, + { + "epoch": 0.17937243907698944, + "grad_norm": 0.7917525768280029, + "learning_rate": 9.95326767730934e-06, + "loss": 0.8321, + "step": 3327 + }, + { + "epoch": 0.17942635324563294, + "grad_norm": 0.9516105055809021, + "learning_rate": 9.953238754735066e-06, + "loss": 0.8124, + "step": 3328 + }, + { + "epoch": 0.17948026741427647, + "grad_norm": 0.8829317688941956, + "learning_rate": 9.953209823255536e-06, + "loss": 0.7426, + "step": 3329 + }, + { + "epoch": 0.17953418158292, + "grad_norm": 0.83402019739151, + "learning_rate": 9.953180882870805e-06, + "loss": 0.7358, + "step": 3330 + }, + { + "epoch": 0.1795880957515635, + "grad_norm": 0.819425106048584, + "learning_rate": 9.953151933580923e-06, + "loss": 0.8002, + "step": 3331 + }, + { + "epoch": 0.17964200992020704, + "grad_norm": 0.8458916544914246, + "learning_rate": 9.95312297538594e-06, + "loss": 0.8305, + "step": 3332 + }, + { + "epoch": 0.17969592408885054, + "grad_norm": 0.8235782980918884, + "learning_rate": 9.95309400828591e-06, + "loss": 0.8228, + "step": 3333 + }, + { + "epoch": 0.17974983825749408, + "grad_norm": 
0.7924965023994446, + "learning_rate": 9.953065032280885e-06, + "loss": 0.7369, + "step": 3334 + }, + { + "epoch": 0.17980375242613758, + "grad_norm": 0.7931050658226013, + "learning_rate": 9.953036047370919e-06, + "loss": 0.8337, + "step": 3335 + }, + { + "epoch": 0.1798576665947811, + "grad_norm": 0.7998207211494446, + "learning_rate": 9.95300705355606e-06, + "loss": 0.7341, + "step": 3336 + }, + { + "epoch": 0.17991158076342462, + "grad_norm": 0.713846743106842, + "learning_rate": 9.952978050836364e-06, + "loss": 0.6958, + "step": 3337 + }, + { + "epoch": 0.17996549493206815, + "grad_norm": 0.807744026184082, + "learning_rate": 9.95294903921188e-06, + "loss": 0.7723, + "step": 3338 + }, + { + "epoch": 0.18001940910071168, + "grad_norm": 0.865696370601654, + "learning_rate": 9.95292001868266e-06, + "loss": 0.8957, + "step": 3339 + }, + { + "epoch": 0.18007332326935518, + "grad_norm": 0.7955803871154785, + "learning_rate": 9.952890989248763e-06, + "loss": 0.7632, + "step": 3340 + }, + { + "epoch": 0.18012723743799872, + "grad_norm": 0.8028436303138733, + "learning_rate": 9.952861950910233e-06, + "loss": 0.8642, + "step": 3341 + }, + { + "epoch": 0.18018115160664222, + "grad_norm": 0.8755636215209961, + "learning_rate": 9.952832903667125e-06, + "loss": 0.8521, + "step": 3342 + }, + { + "epoch": 0.18023506577528575, + "grad_norm": 0.8018125891685486, + "learning_rate": 9.952803847519492e-06, + "loss": 0.8719, + "step": 3343 + }, + { + "epoch": 0.18028897994392926, + "grad_norm": 0.6923267245292664, + "learning_rate": 9.952774782467384e-06, + "loss": 0.718, + "step": 3344 + }, + { + "epoch": 0.1803428941125728, + "grad_norm": 0.7926875948905945, + "learning_rate": 9.952745708510856e-06, + "loss": 0.8657, + "step": 3345 + }, + { + "epoch": 0.1803968082812163, + "grad_norm": 0.8815774917602539, + "learning_rate": 9.95271662564996e-06, + "loss": 0.8196, + "step": 3346 + }, + { + "epoch": 0.18045072244985982, + "grad_norm": 0.8497309684753418, + "learning_rate": 
9.952687533884748e-06, + "loss": 0.7563, + "step": 3347 + }, + { + "epoch": 0.18050463661850336, + "grad_norm": 0.7040117979049683, + "learning_rate": 9.952658433215269e-06, + "loss": 0.687, + "step": 3348 + }, + { + "epoch": 0.18055855078714686, + "grad_norm": 0.8446635007858276, + "learning_rate": 9.95262932364158e-06, + "loss": 0.895, + "step": 3349 + }, + { + "epoch": 0.1806124649557904, + "grad_norm": 0.821702778339386, + "learning_rate": 9.952600205163733e-06, + "loss": 0.8387, + "step": 3350 + }, + { + "epoch": 0.1806663791244339, + "grad_norm": 0.9755251407623291, + "learning_rate": 9.952571077781776e-06, + "loss": 0.9119, + "step": 3351 + }, + { + "epoch": 0.18072029329307743, + "grad_norm": 0.8260585069656372, + "learning_rate": 9.952541941495766e-06, + "loss": 0.7827, + "step": 3352 + }, + { + "epoch": 0.18077420746172093, + "grad_norm": 0.7443965673446655, + "learning_rate": 9.952512796305753e-06, + "loss": 0.7331, + "step": 3353 + }, + { + "epoch": 0.18082812163036446, + "grad_norm": 0.8674094676971436, + "learning_rate": 9.95248364221179e-06, + "loss": 0.8789, + "step": 3354 + }, + { + "epoch": 0.18088203579900797, + "grad_norm": 0.7950018644332886, + "learning_rate": 9.952454479213929e-06, + "loss": 0.7802, + "step": 3355 + }, + { + "epoch": 0.1809359499676515, + "grad_norm": 0.8740068078041077, + "learning_rate": 9.952425307312223e-06, + "loss": 0.9354, + "step": 3356 + }, + { + "epoch": 0.18098986413629503, + "grad_norm": 0.8254936933517456, + "learning_rate": 9.952396126506724e-06, + "loss": 0.8903, + "step": 3357 + }, + { + "epoch": 0.18104377830493854, + "grad_norm": 0.7814514636993408, + "learning_rate": 9.952366936797484e-06, + "loss": 0.7214, + "step": 3358 + }, + { + "epoch": 0.18109769247358207, + "grad_norm": 0.7647988796234131, + "learning_rate": 9.952337738184557e-06, + "loss": 0.7591, + "step": 3359 + }, + { + "epoch": 0.18115160664222557, + "grad_norm": 0.8247759938240051, + "learning_rate": 9.952308530667996e-06, + "loss": 0.7825, + 
"step": 3360 + }, + { + "epoch": 0.1812055208108691, + "grad_norm": 0.724585771560669, + "learning_rate": 9.95227931424785e-06, + "loss": 0.7828, + "step": 3361 + }, + { + "epoch": 0.1812594349795126, + "grad_norm": 0.8304919004440308, + "learning_rate": 9.952250088924175e-06, + "loss": 0.8071, + "step": 3362 + }, + { + "epoch": 0.18131334914815614, + "grad_norm": 0.8318499326705933, + "learning_rate": 9.95222085469702e-06, + "loss": 0.7571, + "step": 3363 + }, + { + "epoch": 0.18136726331679964, + "grad_norm": 0.7315414547920227, + "learning_rate": 9.952191611566443e-06, + "loss": 0.7644, + "step": 3364 + }, + { + "epoch": 0.18142117748544317, + "grad_norm": 0.853285551071167, + "learning_rate": 9.952162359532493e-06, + "loss": 0.8946, + "step": 3365 + }, + { + "epoch": 0.1814750916540867, + "grad_norm": 0.8418978452682495, + "learning_rate": 9.95213309859522e-06, + "loss": 0.7892, + "step": 3366 + }, + { + "epoch": 0.1815290058227302, + "grad_norm": 0.7926337122917175, + "learning_rate": 9.952103828754682e-06, + "loss": 0.7182, + "step": 3367 + }, + { + "epoch": 0.18158291999137374, + "grad_norm": 0.9103478193283081, + "learning_rate": 9.95207455001093e-06, + "loss": 0.8474, + "step": 3368 + }, + { + "epoch": 0.18163683416001725, + "grad_norm": 0.8050599098205566, + "learning_rate": 9.952045262364014e-06, + "loss": 0.7581, + "step": 3369 + }, + { + "epoch": 0.18169074832866078, + "grad_norm": 0.7441660165786743, + "learning_rate": 9.952015965813988e-06, + "loss": 0.7713, + "step": 3370 + }, + { + "epoch": 0.18174466249730428, + "grad_norm": 0.7210862636566162, + "learning_rate": 9.951986660360906e-06, + "loss": 0.7732, + "step": 3371 + }, + { + "epoch": 0.18179857666594781, + "grad_norm": 0.8199747204780579, + "learning_rate": 9.951957346004822e-06, + "loss": 0.8697, + "step": 3372 + }, + { + "epoch": 0.18185249083459132, + "grad_norm": 0.7781465649604797, + "learning_rate": 9.951928022745784e-06, + "loss": 0.8011, + "step": 3373 + }, + { + "epoch": 
0.18190640500323485, + "grad_norm": 0.8713019490242004, + "learning_rate": 9.951898690583848e-06, + "loss": 0.8328, + "step": 3374 + }, + { + "epoch": 0.18196031917187838, + "grad_norm": 0.7194361686706543, + "learning_rate": 9.951869349519066e-06, + "loss": 0.7291, + "step": 3375 + }, + { + "epoch": 0.18201423334052189, + "grad_norm": 0.7940298914909363, + "learning_rate": 9.95183999955149e-06, + "loss": 0.8128, + "step": 3376 + }, + { + "epoch": 0.18206814750916542, + "grad_norm": 0.8048009872436523, + "learning_rate": 9.951810640681175e-06, + "loss": 0.7627, + "step": 3377 + }, + { + "epoch": 0.18212206167780892, + "grad_norm": 0.8479227423667908, + "learning_rate": 9.951781272908173e-06, + "loss": 0.7587, + "step": 3378 + }, + { + "epoch": 0.18217597584645245, + "grad_norm": 0.8620457053184509, + "learning_rate": 9.951751896232534e-06, + "loss": 0.7409, + "step": 3379 + }, + { + "epoch": 0.18222989001509596, + "grad_norm": 0.8283497095108032, + "learning_rate": 9.951722510654314e-06, + "loss": 0.7953, + "step": 3380 + }, + { + "epoch": 0.1822838041837395, + "grad_norm": 0.9071113467216492, + "learning_rate": 9.951693116173565e-06, + "loss": 0.8476, + "step": 3381 + }, + { + "epoch": 0.182337718352383, + "grad_norm": 0.8383519053459167, + "learning_rate": 9.951663712790338e-06, + "loss": 0.8388, + "step": 3382 + }, + { + "epoch": 0.18239163252102653, + "grad_norm": 0.8026612997055054, + "learning_rate": 9.951634300504689e-06, + "loss": 0.8848, + "step": 3383 + }, + { + "epoch": 0.18244554668967006, + "grad_norm": 0.8395872116088867, + "learning_rate": 9.951604879316667e-06, + "loss": 0.7759, + "step": 3384 + }, + { + "epoch": 0.18249946085831356, + "grad_norm": 1.1459238529205322, + "learning_rate": 9.95157544922633e-06, + "loss": 0.8005, + "step": 3385 + }, + { + "epoch": 0.1825533750269571, + "grad_norm": 0.8083657026290894, + "learning_rate": 9.951546010233729e-06, + "loss": 0.8298, + "step": 3386 + }, + { + "epoch": 0.1826072891956006, + "grad_norm": 
0.8329801559448242, + "learning_rate": 9.951516562338912e-06, + "loss": 0.7743, + "step": 3387 + }, + { + "epoch": 0.18266120336424413, + "grad_norm": 0.7916942834854126, + "learning_rate": 9.951487105541939e-06, + "loss": 0.7934, + "step": 3388 + }, + { + "epoch": 0.18271511753288763, + "grad_norm": 0.8752714395523071, + "learning_rate": 9.951457639842861e-06, + "loss": 0.8031, + "step": 3389 + }, + { + "epoch": 0.18276903170153116, + "grad_norm": 0.7645601630210876, + "learning_rate": 9.951428165241728e-06, + "loss": 0.6987, + "step": 3390 + }, + { + "epoch": 0.18282294587017467, + "grad_norm": 0.9860275983810425, + "learning_rate": 9.951398681738595e-06, + "loss": 0.8027, + "step": 3391 + }, + { + "epoch": 0.1828768600388182, + "grad_norm": 0.8548283576965332, + "learning_rate": 9.951369189333515e-06, + "loss": 0.8595, + "step": 3392 + }, + { + "epoch": 0.18293077420746173, + "grad_norm": 0.843217670917511, + "learning_rate": 9.95133968802654e-06, + "loss": 0.8437, + "step": 3393 + }, + { + "epoch": 0.18298468837610524, + "grad_norm": 0.7996432781219482, + "learning_rate": 9.951310177817726e-06, + "loss": 0.7229, + "step": 3394 + }, + { + "epoch": 0.18303860254474877, + "grad_norm": 0.8908971548080444, + "learning_rate": 9.951280658707124e-06, + "loss": 0.8639, + "step": 3395 + }, + { + "epoch": 0.18309251671339227, + "grad_norm": 0.9041224718093872, + "learning_rate": 9.951251130694787e-06, + "loss": 0.8026, + "step": 3396 + }, + { + "epoch": 0.1831464308820358, + "grad_norm": 0.7458503842353821, + "learning_rate": 9.951221593780768e-06, + "loss": 0.8228, + "step": 3397 + }, + { + "epoch": 0.1832003450506793, + "grad_norm": 0.8241537809371948, + "learning_rate": 9.95119204796512e-06, + "loss": 0.7937, + "step": 3398 + }, + { + "epoch": 0.18325425921932284, + "grad_norm": 0.8728781342506409, + "learning_rate": 9.951162493247897e-06, + "loss": 0.8829, + "step": 3399 + }, + { + "epoch": 0.18330817338796634, + "grad_norm": 0.843101978302002, + "learning_rate": 
9.95113292962915e-06, + "loss": 0.9562, + "step": 3400 + }, + { + "epoch": 0.18336208755660988, + "grad_norm": 1.031156301498413, + "learning_rate": 9.951103357108935e-06, + "loss": 0.6757, + "step": 3401 + }, + { + "epoch": 0.1834160017252534, + "grad_norm": 0.9858013391494751, + "learning_rate": 9.951073775687304e-06, + "loss": 0.7922, + "step": 3402 + }, + { + "epoch": 0.1834699158938969, + "grad_norm": 0.9532352685928345, + "learning_rate": 9.95104418536431e-06, + "loss": 0.8979, + "step": 3403 + }, + { + "epoch": 0.18352383006254044, + "grad_norm": 0.9552246332168579, + "learning_rate": 9.951014586140006e-06, + "loss": 0.8682, + "step": 3404 + }, + { + "epoch": 0.18357774423118395, + "grad_norm": 0.8952224850654602, + "learning_rate": 9.950984978014446e-06, + "loss": 0.9064, + "step": 3405 + }, + { + "epoch": 0.18363165839982748, + "grad_norm": 0.8228804469108582, + "learning_rate": 9.950955360987684e-06, + "loss": 0.8337, + "step": 3406 + }, + { + "epoch": 0.18368557256847098, + "grad_norm": 0.8621776103973389, + "learning_rate": 9.95092573505977e-06, + "loss": 0.8418, + "step": 3407 + }, + { + "epoch": 0.18373948673711452, + "grad_norm": 0.8312029242515564, + "learning_rate": 9.95089610023076e-06, + "loss": 0.8453, + "step": 3408 + }, + { + "epoch": 0.18379340090575802, + "grad_norm": 0.8212811350822449, + "learning_rate": 9.950866456500706e-06, + "loss": 0.7226, + "step": 3409 + }, + { + "epoch": 0.18384731507440155, + "grad_norm": 0.7918773293495178, + "learning_rate": 9.950836803869663e-06, + "loss": 0.7546, + "step": 3410 + }, + { + "epoch": 0.18390122924304508, + "grad_norm": 0.8544521331787109, + "learning_rate": 9.950807142337682e-06, + "loss": 0.8975, + "step": 3411 + }, + { + "epoch": 0.1839551434116886, + "grad_norm": 0.7909727692604065, + "learning_rate": 9.950777471904818e-06, + "loss": 0.8266, + "step": 3412 + }, + { + "epoch": 0.18400905758033212, + "grad_norm": 0.7834721207618713, + "learning_rate": 9.950747792571122e-06, + "loss": 0.7647, + 
"step": 3413 + }, + { + "epoch": 0.18406297174897562, + "grad_norm": 1.0084491968154907, + "learning_rate": 9.950718104336651e-06, + "loss": 0.8954, + "step": 3414 + }, + { + "epoch": 0.18411688591761916, + "grad_norm": 0.9300922155380249, + "learning_rate": 9.950688407201457e-06, + "loss": 0.8106, + "step": 3415 + }, + { + "epoch": 0.18417080008626266, + "grad_norm": 0.7957245111465454, + "learning_rate": 9.950658701165593e-06, + "loss": 0.7556, + "step": 3416 + }, + { + "epoch": 0.1842247142549062, + "grad_norm": 0.7386512160301208, + "learning_rate": 9.950628986229111e-06, + "loss": 0.7384, + "step": 3417 + }, + { + "epoch": 0.1842786284235497, + "grad_norm": 0.8791146874427795, + "learning_rate": 9.950599262392067e-06, + "loss": 0.7681, + "step": 3418 + }, + { + "epoch": 0.18433254259219323, + "grad_norm": 0.78180330991745, + "learning_rate": 9.950569529654512e-06, + "loss": 0.7641, + "step": 3419 + }, + { + "epoch": 0.18438645676083676, + "grad_norm": 0.7648051977157593, + "learning_rate": 9.950539788016502e-06, + "loss": 0.7782, + "step": 3420 + }, + { + "epoch": 0.18444037092948026, + "grad_norm": 0.8135426640510559, + "learning_rate": 9.950510037478089e-06, + "loss": 0.8313, + "step": 3421 + }, + { + "epoch": 0.1844942850981238, + "grad_norm": 0.8623054623603821, + "learning_rate": 9.950480278039325e-06, + "loss": 0.8142, + "step": 3422 + }, + { + "epoch": 0.1845481992667673, + "grad_norm": 0.774558424949646, + "learning_rate": 9.950450509700267e-06, + "loss": 0.7747, + "step": 3423 + }, + { + "epoch": 0.18460211343541083, + "grad_norm": 0.7947419285774231, + "learning_rate": 9.950420732460965e-06, + "loss": 0.8757, + "step": 3424 + }, + { + "epoch": 0.18465602760405433, + "grad_norm": 0.8677110075950623, + "learning_rate": 9.950390946321475e-06, + "loss": 0.8527, + "step": 3425 + }, + { + "epoch": 0.18470994177269787, + "grad_norm": 0.8350674510002136, + "learning_rate": 9.950361151281852e-06, + "loss": 0.7209, + "step": 3426 + }, + { + "epoch": 
0.1847638559413414, + "grad_norm": 0.7326707243919373, + "learning_rate": 9.950331347342143e-06, + "loss": 0.749, + "step": 3427 + }, + { + "epoch": 0.1848177701099849, + "grad_norm": 0.8775684237480164, + "learning_rate": 9.95030153450241e-06, + "loss": 0.762, + "step": 3428 + }, + { + "epoch": 0.18487168427862843, + "grad_norm": 0.8116014003753662, + "learning_rate": 9.9502717127627e-06, + "loss": 0.7592, + "step": 3429 + }, + { + "epoch": 0.18492559844727194, + "grad_norm": 0.7852542996406555, + "learning_rate": 9.950241882123068e-06, + "loss": 0.8254, + "step": 3430 + }, + { + "epoch": 0.18497951261591547, + "grad_norm": 0.761076807975769, + "learning_rate": 9.950212042583571e-06, + "loss": 0.7444, + "step": 3431 + }, + { + "epoch": 0.18503342678455897, + "grad_norm": 0.914729118347168, + "learning_rate": 9.95018219414426e-06, + "loss": 0.8847, + "step": 3432 + }, + { + "epoch": 0.1850873409532025, + "grad_norm": 0.7256419062614441, + "learning_rate": 9.950152336805188e-06, + "loss": 0.7069, + "step": 3433 + }, + { + "epoch": 0.185141255121846, + "grad_norm": 0.7481849193572998, + "learning_rate": 9.950122470566411e-06, + "loss": 0.7921, + "step": 3434 + }, + { + "epoch": 0.18519516929048954, + "grad_norm": 0.7878799438476562, + "learning_rate": 9.95009259542798e-06, + "loss": 0.7422, + "step": 3435 + }, + { + "epoch": 0.18524908345913307, + "grad_norm": 0.8083212375640869, + "learning_rate": 9.950062711389953e-06, + "loss": 0.8445, + "step": 3436 + }, + { + "epoch": 0.18530299762777658, + "grad_norm": 0.9458408355712891, + "learning_rate": 9.950032818452377e-06, + "loss": 0.771, + "step": 3437 + }, + { + "epoch": 0.1853569117964201, + "grad_norm": 0.7575398087501526, + "learning_rate": 9.950002916615311e-06, + "loss": 0.765, + "step": 3438 + }, + { + "epoch": 0.1854108259650636, + "grad_norm": 0.8672422766685486, + "learning_rate": 9.94997300587881e-06, + "loss": 0.8499, + "step": 3439 + }, + { + "epoch": 0.18546474013370715, + "grad_norm": 0.7971605658531189, 
+ "learning_rate": 9.949943086242923e-06, + "loss": 0.8617, + "step": 3440 + }, + { + "epoch": 0.18551865430235065, + "grad_norm": 1.0215446949005127, + "learning_rate": 9.949913157707704e-06, + "loss": 0.8224, + "step": 3441 + }, + { + "epoch": 0.18557256847099418, + "grad_norm": 0.7983795404434204, + "learning_rate": 9.949883220273211e-06, + "loss": 0.7497, + "step": 3442 + }, + { + "epoch": 0.18562648263963769, + "grad_norm": 0.8548665642738342, + "learning_rate": 9.949853273939496e-06, + "loss": 0.856, + "step": 3443 + }, + { + "epoch": 0.18568039680828122, + "grad_norm": 0.7996117472648621, + "learning_rate": 9.949823318706611e-06, + "loss": 0.7344, + "step": 3444 + }, + { + "epoch": 0.18573431097692475, + "grad_norm": 0.9108440279960632, + "learning_rate": 9.949793354574612e-06, + "loss": 0.8229, + "step": 3445 + }, + { + "epoch": 0.18578822514556825, + "grad_norm": 0.8484078049659729, + "learning_rate": 9.949763381543553e-06, + "loss": 0.7366, + "step": 3446 + }, + { + "epoch": 0.18584213931421179, + "grad_norm": 0.7617974877357483, + "learning_rate": 9.949733399613486e-06, + "loss": 0.777, + "step": 3447 + }, + { + "epoch": 0.1858960534828553, + "grad_norm": 1.0613569021224976, + "learning_rate": 9.949703408784465e-06, + "loss": 0.9028, + "step": 3448 + }, + { + "epoch": 0.18594996765149882, + "grad_norm": 0.7503539323806763, + "learning_rate": 9.949673409056546e-06, + "loss": 0.797, + "step": 3449 + }, + { + "epoch": 0.18600388182014232, + "grad_norm": 0.8162353038787842, + "learning_rate": 9.949643400429782e-06, + "loss": 0.8698, + "step": 3450 + }, + { + "epoch": 0.18605779598878586, + "grad_norm": 0.8876883387565613, + "learning_rate": 9.949613382904226e-06, + "loss": 0.8422, + "step": 3451 + }, + { + "epoch": 0.18611171015742936, + "grad_norm": 0.7412144541740417, + "learning_rate": 9.949583356479934e-06, + "loss": 0.7977, + "step": 3452 + }, + { + "epoch": 0.1861656243260729, + "grad_norm": 0.7515407204627991, + "learning_rate": 9.949553321156957e-06, 
+ "loss": 0.8046, + "step": 3453 + }, + { + "epoch": 0.18621953849471642, + "grad_norm": 0.8171376585960388, + "learning_rate": 9.949523276935352e-06, + "loss": 0.7121, + "step": 3454 + }, + { + "epoch": 0.18627345266335993, + "grad_norm": 0.838368833065033, + "learning_rate": 9.94949322381517e-06, + "loss": 0.833, + "step": 3455 + }, + { + "epoch": 0.18632736683200346, + "grad_norm": 1.0004788637161255, + "learning_rate": 9.949463161796468e-06, + "loss": 0.7967, + "step": 3456 + }, + { + "epoch": 0.18638128100064696, + "grad_norm": 0.8949950337409973, + "learning_rate": 9.949433090879298e-06, + "loss": 0.815, + "step": 3457 + }, + { + "epoch": 0.1864351951692905, + "grad_norm": 0.8611262440681458, + "learning_rate": 9.949403011063716e-06, + "loss": 0.8998, + "step": 3458 + }, + { + "epoch": 0.186489109337934, + "grad_norm": 0.7873225212097168, + "learning_rate": 9.949372922349775e-06, + "loss": 0.8011, + "step": 3459 + }, + { + "epoch": 0.18654302350657753, + "grad_norm": 0.7770752310752869, + "learning_rate": 9.949342824737529e-06, + "loss": 0.7687, + "step": 3460 + }, + { + "epoch": 0.18659693767522104, + "grad_norm": 0.7723278403282166, + "learning_rate": 9.949312718227031e-06, + "loss": 0.8047, + "step": 3461 + }, + { + "epoch": 0.18665085184386457, + "grad_norm": 0.8038878440856934, + "learning_rate": 9.949282602818335e-06, + "loss": 0.6522, + "step": 3462 + }, + { + "epoch": 0.1867047660125081, + "grad_norm": 0.8243177533149719, + "learning_rate": 9.949252478511499e-06, + "loss": 0.7859, + "step": 3463 + }, + { + "epoch": 0.1867586801811516, + "grad_norm": 0.8061205744743347, + "learning_rate": 9.949222345306574e-06, + "loss": 0.8, + "step": 3464 + }, + { + "epoch": 0.18681259434979514, + "grad_norm": 0.8916036486625671, + "learning_rate": 9.949192203203615e-06, + "loss": 0.7831, + "step": 3465 + }, + { + "epoch": 0.18686650851843864, + "grad_norm": 0.7694443464279175, + "learning_rate": 9.949162052202675e-06, + "loss": 0.753, + "step": 3466 + }, + { + 
"epoch": 0.18692042268708217, + "grad_norm": 0.8028594255447388, + "learning_rate": 9.94913189230381e-06, + "loss": 0.7834, + "step": 3467 + }, + { + "epoch": 0.18697433685572568, + "grad_norm": 0.8558024764060974, + "learning_rate": 9.94910172350707e-06, + "loss": 0.8479, + "step": 3468 + }, + { + "epoch": 0.1870282510243692, + "grad_norm": 0.8418707251548767, + "learning_rate": 9.949071545812517e-06, + "loss": 0.7841, + "step": 3469 + }, + { + "epoch": 0.1870821651930127, + "grad_norm": 0.9143140316009521, + "learning_rate": 9.9490413592202e-06, + "loss": 0.7803, + "step": 3470 + }, + { + "epoch": 0.18713607936165624, + "grad_norm": 0.927670419216156, + "learning_rate": 9.949011163730172e-06, + "loss": 0.7969, + "step": 3471 + }, + { + "epoch": 0.18718999353029978, + "grad_norm": 0.7614530324935913, + "learning_rate": 9.948980959342492e-06, + "loss": 0.7541, + "step": 3472 + }, + { + "epoch": 0.18724390769894328, + "grad_norm": 0.7719544172286987, + "learning_rate": 9.948950746057208e-06, + "loss": 0.6996, + "step": 3473 + }, + { + "epoch": 0.1872978218675868, + "grad_norm": 0.8512967824935913, + "learning_rate": 9.94892052387438e-06, + "loss": 0.8749, + "step": 3474 + }, + { + "epoch": 0.18735173603623032, + "grad_norm": 0.7408632636070251, + "learning_rate": 9.948890292794062e-06, + "loss": 0.7646, + "step": 3475 + }, + { + "epoch": 0.18740565020487385, + "grad_norm": 0.7667837142944336, + "learning_rate": 9.948860052816305e-06, + "loss": 0.7721, + "step": 3476 + }, + { + "epoch": 0.18745956437351735, + "grad_norm": 0.8099546432495117, + "learning_rate": 9.948829803941167e-06, + "loss": 0.8604, + "step": 3477 + }, + { + "epoch": 0.18751347854216088, + "grad_norm": 0.7130147814750671, + "learning_rate": 9.948799546168699e-06, + "loss": 0.7215, + "step": 3478 + }, + { + "epoch": 0.1875673927108044, + "grad_norm": 0.7442251443862915, + "learning_rate": 9.948769279498955e-06, + "loss": 0.7691, + "step": 3479 + }, + { + "epoch": 0.18762130687944792, + "grad_norm": 
0.8528403043746948, + "learning_rate": 9.948739003931995e-06, + "loss": 0.8738, + "step": 3480 + }, + { + "epoch": 0.18767522104809145, + "grad_norm": 0.7217040061950684, + "learning_rate": 9.948708719467868e-06, + "loss": 0.6989, + "step": 3481 + }, + { + "epoch": 0.18772913521673495, + "grad_norm": 1.0738893747329712, + "learning_rate": 9.94867842610663e-06, + "loss": 0.7464, + "step": 3482 + }, + { + "epoch": 0.1877830493853785, + "grad_norm": 0.7653424739837646, + "learning_rate": 9.948648123848334e-06, + "loss": 0.8552, + "step": 3483 + }, + { + "epoch": 0.187836963554022, + "grad_norm": 0.791019856929779, + "learning_rate": 9.948617812693037e-06, + "loss": 0.8548, + "step": 3484 + }, + { + "epoch": 0.18789087772266552, + "grad_norm": 0.8527680039405823, + "learning_rate": 9.948587492640796e-06, + "loss": 0.7717, + "step": 3485 + }, + { + "epoch": 0.18794479189130903, + "grad_norm": 1.0001403093338013, + "learning_rate": 9.948557163691659e-06, + "loss": 0.8061, + "step": 3486 + }, + { + "epoch": 0.18799870605995256, + "grad_norm": 0.7622776627540588, + "learning_rate": 9.948526825845683e-06, + "loss": 0.7082, + "step": 3487 + }, + { + "epoch": 0.18805262022859606, + "grad_norm": 0.7377861142158508, + "learning_rate": 9.948496479102925e-06, + "loss": 0.7776, + "step": 3488 + }, + { + "epoch": 0.1881065343972396, + "grad_norm": 0.9017737507820129, + "learning_rate": 9.948466123463436e-06, + "loss": 0.7676, + "step": 3489 + }, + { + "epoch": 0.18816044856588313, + "grad_norm": 0.7733216881752014, + "learning_rate": 9.948435758927274e-06, + "loss": 0.7503, + "step": 3490 + }, + { + "epoch": 0.18821436273452663, + "grad_norm": 0.9103933572769165, + "learning_rate": 9.948405385494491e-06, + "loss": 0.8696, + "step": 3491 + }, + { + "epoch": 0.18826827690317016, + "grad_norm": 0.7228747010231018, + "learning_rate": 9.948375003165143e-06, + "loss": 0.8396, + "step": 3492 + }, + { + "epoch": 0.18832219107181367, + "grad_norm": 0.9336891174316406, + "learning_rate": 
9.948344611939283e-06, + "loss": 0.7994, + "step": 3493 + }, + { + "epoch": 0.1883761052404572, + "grad_norm": 0.8534504175186157, + "learning_rate": 9.948314211816968e-06, + "loss": 0.7627, + "step": 3494 + }, + { + "epoch": 0.1884300194091007, + "grad_norm": 0.867060661315918, + "learning_rate": 9.94828380279825e-06, + "loss": 0.8503, + "step": 3495 + }, + { + "epoch": 0.18848393357774423, + "grad_norm": 0.7721019983291626, + "learning_rate": 9.948253384883188e-06, + "loss": 0.7409, + "step": 3496 + }, + { + "epoch": 0.18853784774638774, + "grad_norm": 0.7308738827705383, + "learning_rate": 9.948222958071832e-06, + "loss": 0.7579, + "step": 3497 + }, + { + "epoch": 0.18859176191503127, + "grad_norm": 1.1277705430984497, + "learning_rate": 9.948192522364237e-06, + "loss": 0.8288, + "step": 3498 + }, + { + "epoch": 0.1886456760836748, + "grad_norm": 0.8183790445327759, + "learning_rate": 9.948162077760462e-06, + "loss": 0.7819, + "step": 3499 + }, + { + "epoch": 0.1886995902523183, + "grad_norm": 0.7458687424659729, + "learning_rate": 9.948131624260557e-06, + "loss": 0.7482, + "step": 3500 + }, + { + "epoch": 0.18875350442096184, + "grad_norm": 0.9347942471504211, + "learning_rate": 9.94810116186458e-06, + "loss": 0.8208, + "step": 3501 + }, + { + "epoch": 0.18880741858960534, + "grad_norm": 0.7442129254341125, + "learning_rate": 9.948070690572582e-06, + "loss": 0.7843, + "step": 3502 + }, + { + "epoch": 0.18886133275824887, + "grad_norm": 0.8121855854988098, + "learning_rate": 9.948040210384622e-06, + "loss": 0.738, + "step": 3503 + }, + { + "epoch": 0.18891524692689238, + "grad_norm": 0.8118747472763062, + "learning_rate": 9.948009721300754e-06, + "loss": 0.8792, + "step": 3504 + }, + { + "epoch": 0.1889691610955359, + "grad_norm": 0.8263816833496094, + "learning_rate": 9.94797922332103e-06, + "loss": 0.7759, + "step": 3505 + }, + { + "epoch": 0.1890230752641794, + "grad_norm": 0.7452372908592224, + "learning_rate": 9.947948716445508e-06, + "loss": 0.7588, + 
"step": 3506 + }, + { + "epoch": 0.18907698943282295, + "grad_norm": 0.7385339736938477, + "learning_rate": 9.94791820067424e-06, + "loss": 0.8412, + "step": 3507 + }, + { + "epoch": 0.18913090360146648, + "grad_norm": 0.7456401586532593, + "learning_rate": 9.947887676007284e-06, + "loss": 0.7539, + "step": 3508 + }, + { + "epoch": 0.18918481777010998, + "grad_norm": 0.8101776242256165, + "learning_rate": 9.947857142444693e-06, + "loss": 0.8006, + "step": 3509 + }, + { + "epoch": 0.1892387319387535, + "grad_norm": 0.7587085962295532, + "learning_rate": 9.947826599986523e-06, + "loss": 0.7958, + "step": 3510 + }, + { + "epoch": 0.18929264610739702, + "grad_norm": 0.7974298596382141, + "learning_rate": 9.947796048632826e-06, + "loss": 0.7954, + "step": 3511 + }, + { + "epoch": 0.18934656027604055, + "grad_norm": 0.8407479524612427, + "learning_rate": 9.94776548838366e-06, + "loss": 0.825, + "step": 3512 + }, + { + "epoch": 0.18940047444468405, + "grad_norm": 0.7465969324111938, + "learning_rate": 9.94773491923908e-06, + "loss": 0.7725, + "step": 3513 + }, + { + "epoch": 0.18945438861332758, + "grad_norm": 0.9324356913566589, + "learning_rate": 9.947704341199137e-06, + "loss": 0.755, + "step": 3514 + }, + { + "epoch": 0.1895083027819711, + "grad_norm": 0.8157918453216553, + "learning_rate": 9.94767375426389e-06, + "loss": 0.8678, + "step": 3515 + }, + { + "epoch": 0.18956221695061462, + "grad_norm": 0.8501976132392883, + "learning_rate": 9.947643158433395e-06, + "loss": 0.8431, + "step": 3516 + }, + { + "epoch": 0.18961613111925815, + "grad_norm": 0.7773411273956299, + "learning_rate": 9.947612553707703e-06, + "loss": 0.748, + "step": 3517 + }, + { + "epoch": 0.18967004528790166, + "grad_norm": 0.7716071605682373, + "learning_rate": 9.947581940086873e-06, + "loss": 0.7563, + "step": 3518 + }, + { + "epoch": 0.1897239594565452, + "grad_norm": 0.9465253353118896, + "learning_rate": 9.947551317570957e-06, + "loss": 0.9289, + "step": 3519 + }, + { + "epoch": 
0.1897778736251887, + "grad_norm": 0.7123626470565796, + "learning_rate": 9.94752068616001e-06, + "loss": 0.7012, + "step": 3520 + }, + { + "epoch": 0.18983178779383222, + "grad_norm": 0.7318246960639954, + "learning_rate": 9.94749004585409e-06, + "loss": 0.8247, + "step": 3521 + }, + { + "epoch": 0.18988570196247573, + "grad_norm": 0.8028656244277954, + "learning_rate": 9.947459396653248e-06, + "loss": 0.8606, + "step": 3522 + }, + { + "epoch": 0.18993961613111926, + "grad_norm": 0.7580826282501221, + "learning_rate": 9.947428738557541e-06, + "loss": 0.7801, + "step": 3523 + }, + { + "epoch": 0.18999353029976276, + "grad_norm": 0.7612492442131042, + "learning_rate": 9.947398071567025e-06, + "loss": 0.8298, + "step": 3524 + }, + { + "epoch": 0.1900474444684063, + "grad_norm": 0.7892666459083557, + "learning_rate": 9.947367395681755e-06, + "loss": 0.739, + "step": 3525 + }, + { + "epoch": 0.19010135863704983, + "grad_norm": 0.7531749606132507, + "learning_rate": 9.947336710901785e-06, + "loss": 0.7804, + "step": 3526 + }, + { + "epoch": 0.19015527280569333, + "grad_norm": 0.7833613753318787, + "learning_rate": 9.947306017227171e-06, + "loss": 0.6541, + "step": 3527 + }, + { + "epoch": 0.19020918697433686, + "grad_norm": 0.749286413192749, + "learning_rate": 9.94727531465797e-06, + "loss": 0.6982, + "step": 3528 + }, + { + "epoch": 0.19026310114298037, + "grad_norm": 0.9150011539459229, + "learning_rate": 9.947244603194233e-06, + "loss": 0.8681, + "step": 3529 + }, + { + "epoch": 0.1903170153116239, + "grad_norm": 0.8265007138252258, + "learning_rate": 9.947213882836018e-06, + "loss": 0.9088, + "step": 3530 + }, + { + "epoch": 0.1903709294802674, + "grad_norm": 0.7807170152664185, + "learning_rate": 9.947183153583379e-06, + "loss": 0.7875, + "step": 3531 + }, + { + "epoch": 0.19042484364891094, + "grad_norm": 1.0078792572021484, + "learning_rate": 9.947152415436375e-06, + "loss": 1.2045, + "step": 3532 + }, + { + "epoch": 0.19047875781755447, + "grad_norm": 
0.7661539912223816, + "learning_rate": 9.947121668395055e-06, + "loss": 0.8202, + "step": 3533 + }, + { + "epoch": 0.19053267198619797, + "grad_norm": 0.7419549226760864, + "learning_rate": 9.947090912459479e-06, + "loss": 0.7775, + "step": 3534 + }, + { + "epoch": 0.1905865861548415, + "grad_norm": 0.9671319723129272, + "learning_rate": 9.947060147629698e-06, + "loss": 0.8328, + "step": 3535 + }, + { + "epoch": 0.190640500323485, + "grad_norm": 0.9418153762817383, + "learning_rate": 9.947029373905773e-06, + "loss": 0.8476, + "step": 3536 + }, + { + "epoch": 0.19069441449212854, + "grad_norm": 0.8007176518440247, + "learning_rate": 9.946998591287755e-06, + "loss": 0.8379, + "step": 3537 + }, + { + "epoch": 0.19074832866077204, + "grad_norm": 1.0271466970443726, + "learning_rate": 9.946967799775701e-06, + "loss": 0.7789, + "step": 3538 + }, + { + "epoch": 0.19080224282941557, + "grad_norm": 0.7577568888664246, + "learning_rate": 9.946936999369668e-06, + "loss": 0.7749, + "step": 3539 + }, + { + "epoch": 0.19085615699805908, + "grad_norm": 0.7766523361206055, + "learning_rate": 9.946906190069707e-06, + "loss": 0.7143, + "step": 3540 + }, + { + "epoch": 0.1909100711667026, + "grad_norm": 0.798589825630188, + "learning_rate": 9.946875371875876e-06, + "loss": 0.8481, + "step": 3541 + }, + { + "epoch": 0.19096398533534614, + "grad_norm": 0.8279602527618408, + "learning_rate": 9.946844544788232e-06, + "loss": 0.8369, + "step": 3542 + }, + { + "epoch": 0.19101789950398965, + "grad_norm": 0.7607479691505432, + "learning_rate": 9.946813708806828e-06, + "loss": 0.8088, + "step": 3543 + }, + { + "epoch": 0.19107181367263318, + "grad_norm": 0.7722266912460327, + "learning_rate": 9.946782863931719e-06, + "loss": 0.704, + "step": 3544 + }, + { + "epoch": 0.19112572784127668, + "grad_norm": 0.8101015686988831, + "learning_rate": 9.946752010162964e-06, + "loss": 0.7828, + "step": 3545 + }, + { + "epoch": 0.19117964200992021, + "grad_norm": 0.8161671161651611, + "learning_rate": 
9.946721147500613e-06, + "loss": 0.8875, + "step": 3546 + }, + { + "epoch": 0.19123355617856372, + "grad_norm": 0.9234161972999573, + "learning_rate": 9.946690275944727e-06, + "loss": 0.8846, + "step": 3547 + }, + { + "epoch": 0.19128747034720725, + "grad_norm": 0.7948644757270813, + "learning_rate": 9.946659395495357e-06, + "loss": 0.8331, + "step": 3548 + }, + { + "epoch": 0.19134138451585075, + "grad_norm": 0.9087135791778564, + "learning_rate": 9.946628506152563e-06, + "loss": 0.7462, + "step": 3549 + }, + { + "epoch": 0.19139529868449429, + "grad_norm": 0.7624903917312622, + "learning_rate": 9.946597607916396e-06, + "loss": 0.6431, + "step": 3550 + }, + { + "epoch": 0.19144921285313782, + "grad_norm": 0.9236660003662109, + "learning_rate": 9.946566700786914e-06, + "loss": 0.921, + "step": 3551 + }, + { + "epoch": 0.19150312702178132, + "grad_norm": 0.8824177980422974, + "learning_rate": 9.946535784764173e-06, + "loss": 0.805, + "step": 3552 + }, + { + "epoch": 0.19155704119042485, + "grad_norm": 0.7843056917190552, + "learning_rate": 9.946504859848227e-06, + "loss": 0.8528, + "step": 3553 + }, + { + "epoch": 0.19161095535906836, + "grad_norm": 1.2314038276672363, + "learning_rate": 9.946473926039134e-06, + "loss": 0.8141, + "step": 3554 + }, + { + "epoch": 0.1916648695277119, + "grad_norm": 0.7956500053405762, + "learning_rate": 9.946442983336945e-06, + "loss": 0.7946, + "step": 3555 + }, + { + "epoch": 0.1917187836963554, + "grad_norm": 0.850674033164978, + "learning_rate": 9.94641203174172e-06, + "loss": 0.8965, + "step": 3556 + }, + { + "epoch": 0.19177269786499893, + "grad_norm": 0.8371244668960571, + "learning_rate": 9.946381071253514e-06, + "loss": 0.7859, + "step": 3557 + }, + { + "epoch": 0.19182661203364243, + "grad_norm": 0.7423365712165833, + "learning_rate": 9.946350101872382e-06, + "loss": 0.8012, + "step": 3558 + }, + { + "epoch": 0.19188052620228596, + "grad_norm": 0.8446981310844421, + "learning_rate": 9.946319123598379e-06, + "loss": 0.9037, + 
"step": 3559 + }, + { + "epoch": 0.1919344403709295, + "grad_norm": 0.8565588593482971, + "learning_rate": 9.946288136431562e-06, + "loss": 0.7398, + "step": 3560 + }, + { + "epoch": 0.191988354539573, + "grad_norm": 0.8087875843048096, + "learning_rate": 9.946257140371985e-06, + "loss": 0.7214, + "step": 3561 + }, + { + "epoch": 0.19204226870821653, + "grad_norm": 0.7951125502586365, + "learning_rate": 9.946226135419705e-06, + "loss": 0.7988, + "step": 3562 + }, + { + "epoch": 0.19209618287686003, + "grad_norm": 0.8709264397621155, + "learning_rate": 9.946195121574779e-06, + "loss": 0.8563, + "step": 3563 + }, + { + "epoch": 0.19215009704550357, + "grad_norm": 0.7908393740653992, + "learning_rate": 9.94616409883726e-06, + "loss": 0.7874, + "step": 3564 + }, + { + "epoch": 0.19220401121414707, + "grad_norm": 1.0512382984161377, + "learning_rate": 9.946133067207204e-06, + "loss": 0.9174, + "step": 3565 + }, + { + "epoch": 0.1922579253827906, + "grad_norm": 0.7937822937965393, + "learning_rate": 9.94610202668467e-06, + "loss": 0.6863, + "step": 3566 + }, + { + "epoch": 0.1923118395514341, + "grad_norm": 0.9130533337593079, + "learning_rate": 9.94607097726971e-06, + "loss": 0.8287, + "step": 3567 + }, + { + "epoch": 0.19236575372007764, + "grad_norm": 1.1604489088058472, + "learning_rate": 9.946039918962383e-06, + "loss": 0.6922, + "step": 3568 + }, + { + "epoch": 0.19241966788872117, + "grad_norm": 1.0400906801223755, + "learning_rate": 9.946008851762743e-06, + "loss": 0.7978, + "step": 3569 + }, + { + "epoch": 0.19247358205736467, + "grad_norm": 0.8068282008171082, + "learning_rate": 9.945977775670845e-06, + "loss": 0.7365, + "step": 3570 + }, + { + "epoch": 0.1925274962260082, + "grad_norm": 0.8328807353973389, + "learning_rate": 9.945946690686747e-06, + "loss": 0.7308, + "step": 3571 + }, + { + "epoch": 0.1925814103946517, + "grad_norm": 0.946949303150177, + "learning_rate": 9.945915596810502e-06, + "loss": 0.9117, + "step": 3572 + }, + { + "epoch": 
0.19263532456329524, + "grad_norm": 0.8421696424484253, + "learning_rate": 9.94588449404217e-06, + "loss": 0.7132, + "step": 3573 + }, + { + "epoch": 0.19268923873193874, + "grad_norm": 0.7321984171867371, + "learning_rate": 9.945853382381805e-06, + "loss": 0.752, + "step": 3574 + }, + { + "epoch": 0.19274315290058228, + "grad_norm": 0.8039024472236633, + "learning_rate": 9.94582226182946e-06, + "loss": 0.7952, + "step": 3575 + }, + { + "epoch": 0.19279706706922578, + "grad_norm": 0.8612285256385803, + "learning_rate": 9.945791132385196e-06, + "loss": 0.7944, + "step": 3576 + }, + { + "epoch": 0.1928509812378693, + "grad_norm": 1.0525864362716675, + "learning_rate": 9.945759994049066e-06, + "loss": 0.8078, + "step": 3577 + }, + { + "epoch": 0.19290489540651284, + "grad_norm": 0.8032466769218445, + "learning_rate": 9.945728846821128e-06, + "loss": 0.8522, + "step": 3578 + }, + { + "epoch": 0.19295880957515635, + "grad_norm": 1.324041485786438, + "learning_rate": 9.945697690701435e-06, + "loss": 0.7705, + "step": 3579 + }, + { + "epoch": 0.19301272374379988, + "grad_norm": 0.8733030557632446, + "learning_rate": 9.945666525690044e-06, + "loss": 0.8115, + "step": 3580 + }, + { + "epoch": 0.19306663791244338, + "grad_norm": 0.8208357095718384, + "learning_rate": 9.945635351787012e-06, + "loss": 0.7975, + "step": 3581 + }, + { + "epoch": 0.19312055208108692, + "grad_norm": 0.744498074054718, + "learning_rate": 9.945604168992395e-06, + "loss": 0.8088, + "step": 3582 + }, + { + "epoch": 0.19317446624973042, + "grad_norm": 0.9391197562217712, + "learning_rate": 9.945572977306249e-06, + "loss": 0.8403, + "step": 3583 + }, + { + "epoch": 0.19322838041837395, + "grad_norm": 0.8050488829612732, + "learning_rate": 9.945541776728629e-06, + "loss": 0.769, + "step": 3584 + }, + { + "epoch": 0.19328229458701746, + "grad_norm": 0.8373685479164124, + "learning_rate": 9.945510567259592e-06, + "loss": 0.7803, + "step": 3585 + }, + { + "epoch": 0.193336208755661, + "grad_norm": 
0.8766368627548218, + "learning_rate": 9.945479348899194e-06, + "loss": 0.8325, + "step": 3586 + }, + { + "epoch": 0.19339012292430452, + "grad_norm": 0.8029547333717346, + "learning_rate": 9.945448121647492e-06, + "loss": 0.6647, + "step": 3587 + }, + { + "epoch": 0.19344403709294802, + "grad_norm": 0.7231468558311462, + "learning_rate": 9.94541688550454e-06, + "loss": 0.6939, + "step": 3588 + }, + { + "epoch": 0.19349795126159156, + "grad_norm": 0.8487125039100647, + "learning_rate": 9.945385640470397e-06, + "loss": 0.8097, + "step": 3589 + }, + { + "epoch": 0.19355186543023506, + "grad_norm": 0.7813920378684998, + "learning_rate": 9.945354386545116e-06, + "loss": 0.8023, + "step": 3590 + }, + { + "epoch": 0.1936057795988786, + "grad_norm": 0.8754404783248901, + "learning_rate": 9.945323123728756e-06, + "loss": 0.8401, + "step": 3591 + }, + { + "epoch": 0.1936596937675221, + "grad_norm": 0.8191613554954529, + "learning_rate": 9.945291852021371e-06, + "loss": 0.8151, + "step": 3592 + }, + { + "epoch": 0.19371360793616563, + "grad_norm": 0.7882266044616699, + "learning_rate": 9.945260571423019e-06, + "loss": 0.77, + "step": 3593 + }, + { + "epoch": 0.19376752210480913, + "grad_norm": 0.816411554813385, + "learning_rate": 9.945229281933756e-06, + "loss": 0.7378, + "step": 3594 + }, + { + "epoch": 0.19382143627345266, + "grad_norm": 0.8545891046524048, + "learning_rate": 9.945197983553636e-06, + "loss": 0.7563, + "step": 3595 + }, + { + "epoch": 0.1938753504420962, + "grad_norm": 0.8293501138687134, + "learning_rate": 9.945166676282717e-06, + "loss": 0.893, + "step": 3596 + }, + { + "epoch": 0.1939292646107397, + "grad_norm": 0.7536304593086243, + "learning_rate": 9.945135360121058e-06, + "loss": 0.7101, + "step": 3597 + }, + { + "epoch": 0.19398317877938323, + "grad_norm": 0.96649569272995, + "learning_rate": 9.94510403506871e-06, + "loss": 0.8027, + "step": 3598 + }, + { + "epoch": 0.19403709294802673, + "grad_norm": 0.7543211579322815, + "learning_rate": 
9.945072701125733e-06, + "loss": 0.8144, + "step": 3599 + }, + { + "epoch": 0.19409100711667027, + "grad_norm": 0.7223193049430847, + "learning_rate": 9.945041358292183e-06, + "loss": 0.7585, + "step": 3600 + }, + { + "epoch": 0.19414492128531377, + "grad_norm": 0.8515756726264954, + "learning_rate": 9.945010006568115e-06, + "loss": 0.9114, + "step": 3601 + }, + { + "epoch": 0.1941988354539573, + "grad_norm": 0.7318340539932251, + "learning_rate": 9.944978645953585e-06, + "loss": 0.7554, + "step": 3602 + }, + { + "epoch": 0.1942527496226008, + "grad_norm": 0.8565723299980164, + "learning_rate": 9.944947276448649e-06, + "loss": 0.8918, + "step": 3603 + }, + { + "epoch": 0.19430666379124434, + "grad_norm": 0.8536270260810852, + "learning_rate": 9.944915898053367e-06, + "loss": 0.8184, + "step": 3604 + }, + { + "epoch": 0.19436057795988787, + "grad_norm": 0.7093652486801147, + "learning_rate": 9.944884510767792e-06, + "loss": 0.8031, + "step": 3605 + }, + { + "epoch": 0.19441449212853137, + "grad_norm": 0.7644805312156677, + "learning_rate": 9.944853114591984e-06, + "loss": 0.8546, + "step": 3606 + }, + { + "epoch": 0.1944684062971749, + "grad_norm": 0.6533430218696594, + "learning_rate": 9.944821709525994e-06, + "loss": 0.6453, + "step": 3607 + }, + { + "epoch": 0.1945223204658184, + "grad_norm": 0.8608343005180359, + "learning_rate": 9.944790295569883e-06, + "loss": 0.8539, + "step": 3608 + }, + { + "epoch": 0.19457623463446194, + "grad_norm": 0.777740478515625, + "learning_rate": 9.944758872723706e-06, + "loss": 0.7414, + "step": 3609 + }, + { + "epoch": 0.19463014880310545, + "grad_norm": 0.7757480144500732, + "learning_rate": 9.944727440987518e-06, + "loss": 0.7394, + "step": 3610 + }, + { + "epoch": 0.19468406297174898, + "grad_norm": 0.7862492203712463, + "learning_rate": 9.944696000361379e-06, + "loss": 0.8264, + "step": 3611 + }, + { + "epoch": 0.19473797714039248, + "grad_norm": 0.72691410779953, + "learning_rate": 9.944664550845342e-06, + "loss": 0.6876, + 
"step": 3612 + }, + { + "epoch": 0.194791891309036, + "grad_norm": 0.8702194094657898, + "learning_rate": 9.944633092439467e-06, + "loss": 0.7286, + "step": 3613 + }, + { + "epoch": 0.19484580547767955, + "grad_norm": 1.1160287857055664, + "learning_rate": 9.944601625143806e-06, + "loss": 0.8619, + "step": 3614 + }, + { + "epoch": 0.19489971964632305, + "grad_norm": 0.8278397917747498, + "learning_rate": 9.944570148958419e-06, + "loss": 0.7458, + "step": 3615 + }, + { + "epoch": 0.19495363381496658, + "grad_norm": 0.8430503606796265, + "learning_rate": 9.944538663883362e-06, + "loss": 0.7681, + "step": 3616 + }, + { + "epoch": 0.19500754798361009, + "grad_norm": 0.8198543190956116, + "learning_rate": 9.94450716991869e-06, + "loss": 0.6681, + "step": 3617 + }, + { + "epoch": 0.19506146215225362, + "grad_norm": 0.7874541282653809, + "learning_rate": 9.944475667064464e-06, + "loss": 0.813, + "step": 3618 + }, + { + "epoch": 0.19511537632089712, + "grad_norm": 0.76181960105896, + "learning_rate": 9.944444155320736e-06, + "loss": 0.7443, + "step": 3619 + }, + { + "epoch": 0.19516929048954065, + "grad_norm": 0.7647060751914978, + "learning_rate": 9.944412634687563e-06, + "loss": 0.8232, + "step": 3620 + }, + { + "epoch": 0.19522320465818416, + "grad_norm": 0.7609487771987915, + "learning_rate": 9.944381105165006e-06, + "loss": 0.8134, + "step": 3621 + }, + { + "epoch": 0.1952771188268277, + "grad_norm": 0.8139258027076721, + "learning_rate": 9.944349566753116e-06, + "loss": 0.8053, + "step": 3622 + }, + { + "epoch": 0.19533103299547122, + "grad_norm": 0.7404879927635193, + "learning_rate": 9.944318019451952e-06, + "loss": 0.7774, + "step": 3623 + }, + { + "epoch": 0.19538494716411473, + "grad_norm": 0.863972008228302, + "learning_rate": 9.944286463261573e-06, + "loss": 0.8824, + "step": 3624 + }, + { + "epoch": 0.19543886133275826, + "grad_norm": 0.907744824886322, + "learning_rate": 9.944254898182033e-06, + "loss": 0.7537, + "step": 3625 + }, + { + "epoch": 
0.19549277550140176, + "grad_norm": 0.8722240328788757, + "learning_rate": 9.944223324213389e-06, + "loss": 0.8688, + "step": 3626 + }, + { + "epoch": 0.1955466896700453, + "grad_norm": 0.7386543154716492, + "learning_rate": 9.9441917413557e-06, + "loss": 0.6962, + "step": 3627 + }, + { + "epoch": 0.1956006038386888, + "grad_norm": 0.7577354907989502, + "learning_rate": 9.944160149609018e-06, + "loss": 0.7261, + "step": 3628 + }, + { + "epoch": 0.19565451800733233, + "grad_norm": 0.8413889408111572, + "learning_rate": 9.944128548973407e-06, + "loss": 0.8369, + "step": 3629 + }, + { + "epoch": 0.19570843217597583, + "grad_norm": 0.8649793863296509, + "learning_rate": 9.944096939448917e-06, + "loss": 0.8363, + "step": 3630 + }, + { + "epoch": 0.19576234634461936, + "grad_norm": 0.7515233755111694, + "learning_rate": 9.944065321035607e-06, + "loss": 0.7634, + "step": 3631 + }, + { + "epoch": 0.1958162605132629, + "grad_norm": 0.9059920310974121, + "learning_rate": 9.944033693733535e-06, + "loss": 0.9312, + "step": 3632 + }, + { + "epoch": 0.1958701746819064, + "grad_norm": 0.780707597732544, + "learning_rate": 9.944002057542757e-06, + "loss": 0.7545, + "step": 3633 + }, + { + "epoch": 0.19592408885054993, + "grad_norm": 0.7543255686759949, + "learning_rate": 9.94397041246333e-06, + "loss": 0.7496, + "step": 3634 + }, + { + "epoch": 0.19597800301919344, + "grad_norm": 0.7795106172561646, + "learning_rate": 9.943938758495313e-06, + "loss": 0.6734, + "step": 3635 + }, + { + "epoch": 0.19603191718783697, + "grad_norm": 0.9682700037956238, + "learning_rate": 9.943907095638758e-06, + "loss": 0.8928, + "step": 3636 + }, + { + "epoch": 0.19608583135648047, + "grad_norm": 0.7332949638366699, + "learning_rate": 9.943875423893727e-06, + "loss": 0.7507, + "step": 3637 + }, + { + "epoch": 0.196139745525124, + "grad_norm": 0.8316323161125183, + "learning_rate": 9.943843743260275e-06, + "loss": 0.7492, + "step": 3638 + }, + { + "epoch": 0.19619365969376754, + "grad_norm": 
0.7973113059997559, + "learning_rate": 9.943812053738458e-06, + "loss": 0.8381, + "step": 3639 + }, + { + "epoch": 0.19624757386241104, + "grad_norm": 0.7654823064804077, + "learning_rate": 9.943780355328332e-06, + "loss": 0.8497, + "step": 3640 + }, + { + "epoch": 0.19630148803105457, + "grad_norm": 0.7055602073669434, + "learning_rate": 9.943748648029958e-06, + "loss": 0.7949, + "step": 3641 + }, + { + "epoch": 0.19635540219969808, + "grad_norm": 0.9971569180488586, + "learning_rate": 9.94371693184339e-06, + "loss": 0.8311, + "step": 3642 + }, + { + "epoch": 0.1964093163683416, + "grad_norm": 0.7608943581581116, + "learning_rate": 9.943685206768686e-06, + "loss": 0.8303, + "step": 3643 + }, + { + "epoch": 0.1964632305369851, + "grad_norm": 0.9169919490814209, + "learning_rate": 9.943653472805901e-06, + "loss": 0.8314, + "step": 3644 + }, + { + "epoch": 0.19651714470562864, + "grad_norm": 0.8501203656196594, + "learning_rate": 9.943621729955096e-06, + "loss": 0.8765, + "step": 3645 + }, + { + "epoch": 0.19657105887427215, + "grad_norm": 0.7438945770263672, + "learning_rate": 9.943589978216325e-06, + "loss": 0.7323, + "step": 3646 + }, + { + "epoch": 0.19662497304291568, + "grad_norm": 0.8795550465583801, + "learning_rate": 9.943558217589646e-06, + "loss": 0.7916, + "step": 3647 + }, + { + "epoch": 0.1966788872115592, + "grad_norm": 0.7928707003593445, + "learning_rate": 9.943526448075117e-06, + "loss": 0.8621, + "step": 3648 + }, + { + "epoch": 0.19673280138020272, + "grad_norm": 0.8225892782211304, + "learning_rate": 9.943494669672792e-06, + "loss": 0.8718, + "step": 3649 + }, + { + "epoch": 0.19678671554884625, + "grad_norm": 0.8227444291114807, + "learning_rate": 9.943462882382732e-06, + "loss": 0.8374, + "step": 3650 + }, + { + "epoch": 0.19684062971748975, + "grad_norm": 0.7860620021820068, + "learning_rate": 9.943431086204991e-06, + "loss": 0.8919, + "step": 3651 + }, + { + "epoch": 0.19689454388613328, + "grad_norm": 0.8000875115394592, + "learning_rate": 
9.94339928113963e-06, + "loss": 0.7822, + "step": 3652 + }, + { + "epoch": 0.1969484580547768, + "grad_norm": 0.796389639377594, + "learning_rate": 9.943367467186702e-06, + "loss": 0.7149, + "step": 3653 + }, + { + "epoch": 0.19700237222342032, + "grad_norm": 0.8032622337341309, + "learning_rate": 9.943335644346267e-06, + "loss": 0.8442, + "step": 3654 + }, + { + "epoch": 0.19705628639206382, + "grad_norm": 0.8624833226203918, + "learning_rate": 9.94330381261838e-06, + "loss": 0.8681, + "step": 3655 + }, + { + "epoch": 0.19711020056070735, + "grad_norm": 0.9663752317428589, + "learning_rate": 9.9432719720031e-06, + "loss": 0.8749, + "step": 3656 + }, + { + "epoch": 0.1971641147293509, + "grad_norm": 0.6869292259216309, + "learning_rate": 9.943240122500484e-06, + "loss": 0.7288, + "step": 3657 + }, + { + "epoch": 0.1972180288979944, + "grad_norm": 0.7496824264526367, + "learning_rate": 9.943208264110589e-06, + "loss": 0.7191, + "step": 3658 + }, + { + "epoch": 0.19727194306663792, + "grad_norm": 0.7637088894844055, + "learning_rate": 9.943176396833471e-06, + "loss": 0.7602, + "step": 3659 + }, + { + "epoch": 0.19732585723528143, + "grad_norm": 0.7049651741981506, + "learning_rate": 9.94314452066919e-06, + "loss": 0.7097, + "step": 3660 + }, + { + "epoch": 0.19737977140392496, + "grad_norm": 0.8979986310005188, + "learning_rate": 9.943112635617802e-06, + "loss": 0.7953, + "step": 3661 + }, + { + "epoch": 0.19743368557256846, + "grad_norm": 0.7865282893180847, + "learning_rate": 9.943080741679364e-06, + "loss": 0.7394, + "step": 3662 + }, + { + "epoch": 0.197487599741212, + "grad_norm": 0.7790982723236084, + "learning_rate": 9.943048838853932e-06, + "loss": 0.8587, + "step": 3663 + }, + { + "epoch": 0.1975415139098555, + "grad_norm": 0.8486214876174927, + "learning_rate": 9.943016927141566e-06, + "loss": 0.9232, + "step": 3664 + }, + { + "epoch": 0.19759542807849903, + "grad_norm": 0.7729238867759705, + "learning_rate": 9.942985006542322e-06, + "loss": 0.7704, + 
"step": 3665 + }, + { + "epoch": 0.19764934224714256, + "grad_norm": 0.7827340960502625, + "learning_rate": 9.942953077056259e-06, + "loss": 0.7834, + "step": 3666 + }, + { + "epoch": 0.19770325641578607, + "grad_norm": 0.8735725283622742, + "learning_rate": 9.94292113868343e-06, + "loss": 0.7521, + "step": 3667 + }, + { + "epoch": 0.1977571705844296, + "grad_norm": 0.803302526473999, + "learning_rate": 9.942889191423897e-06, + "loss": 0.7475, + "step": 3668 + }, + { + "epoch": 0.1978110847530731, + "grad_norm": 0.7523918747901917, + "learning_rate": 9.942857235277716e-06, + "loss": 0.7882, + "step": 3669 + }, + { + "epoch": 0.19786499892171663, + "grad_norm": 0.891010582447052, + "learning_rate": 9.942825270244944e-06, + "loss": 0.6855, + "step": 3670 + }, + { + "epoch": 0.19791891309036014, + "grad_norm": 0.8103521466255188, + "learning_rate": 9.94279329632564e-06, + "loss": 0.7604, + "step": 3671 + }, + { + "epoch": 0.19797282725900367, + "grad_norm": 0.7801117897033691, + "learning_rate": 9.94276131351986e-06, + "loss": 0.757, + "step": 3672 + }, + { + "epoch": 0.19802674142764717, + "grad_norm": 0.8760844469070435, + "learning_rate": 9.942729321827661e-06, + "loss": 0.9507, + "step": 3673 + }, + { + "epoch": 0.1980806555962907, + "grad_norm": 0.7129818201065063, + "learning_rate": 9.942697321249101e-06, + "loss": 0.7118, + "step": 3674 + }, + { + "epoch": 0.19813456976493424, + "grad_norm": 0.7223137021064758, + "learning_rate": 9.942665311784239e-06, + "loss": 0.6911, + "step": 3675 + }, + { + "epoch": 0.19818848393357774, + "grad_norm": 0.7100752592086792, + "learning_rate": 9.94263329343313e-06, + "loss": 0.7569, + "step": 3676 + }, + { + "epoch": 0.19824239810222127, + "grad_norm": 0.955298662185669, + "learning_rate": 9.942601266195834e-06, + "loss": 0.8562, + "step": 3677 + }, + { + "epoch": 0.19829631227086478, + "grad_norm": 0.7367860078811646, + "learning_rate": 9.942569230072408e-06, + "loss": 0.7184, + "step": 3678 + }, + { + "epoch": 
0.1983502264395083, + "grad_norm": 0.7822328805923462, + "learning_rate": 9.942537185062909e-06, + "loss": 0.7111, + "step": 3679 + }, + { + "epoch": 0.1984041406081518, + "grad_norm": 0.8836474418640137, + "learning_rate": 9.942505131167394e-06, + "loss": 0.731, + "step": 3680 + }, + { + "epoch": 0.19845805477679535, + "grad_norm": 0.7033706903457642, + "learning_rate": 9.942473068385921e-06, + "loss": 0.7228, + "step": 3681 + }, + { + "epoch": 0.19851196894543885, + "grad_norm": 0.7241103649139404, + "learning_rate": 9.942440996718549e-06, + "loss": 0.7045, + "step": 3682 + }, + { + "epoch": 0.19856588311408238, + "grad_norm": 0.8266516923904419, + "learning_rate": 9.942408916165334e-06, + "loss": 0.781, + "step": 3683 + }, + { + "epoch": 0.1986197972827259, + "grad_norm": 0.9639707207679749, + "learning_rate": 9.942376826726334e-06, + "loss": 0.8136, + "step": 3684 + }, + { + "epoch": 0.19867371145136942, + "grad_norm": 0.874279797077179, + "learning_rate": 9.942344728401609e-06, + "loss": 0.8147, + "step": 3685 + }, + { + "epoch": 0.19872762562001295, + "grad_norm": 0.7670862674713135, + "learning_rate": 9.942312621191213e-06, + "loss": 0.8134, + "step": 3686 + }, + { + "epoch": 0.19878153978865645, + "grad_norm": 0.8974711894989014, + "learning_rate": 9.942280505095206e-06, + "loss": 0.8211, + "step": 3687 + }, + { + "epoch": 0.19883545395729998, + "grad_norm": 0.8174877762794495, + "learning_rate": 9.942248380113646e-06, + "loss": 0.8641, + "step": 3688 + }, + { + "epoch": 0.1988893681259435, + "grad_norm": 0.7798371315002441, + "learning_rate": 9.942216246246588e-06, + "loss": 0.7226, + "step": 3689 + }, + { + "epoch": 0.19894328229458702, + "grad_norm": 0.8269854784011841, + "learning_rate": 9.942184103494093e-06, + "loss": 0.8789, + "step": 3690 + }, + { + "epoch": 0.19899719646323052, + "grad_norm": 0.8148782253265381, + "learning_rate": 9.942151951856217e-06, + "loss": 0.8436, + "step": 3691 + }, + { + "epoch": 0.19905111063187406, + "grad_norm": 
0.823692262172699, + "learning_rate": 9.942119791333017e-06, + "loss": 0.6935, + "step": 3692 + }, + { + "epoch": 0.1991050248005176, + "grad_norm": 0.8396292924880981, + "learning_rate": 9.942087621924555e-06, + "loss": 0.8814, + "step": 3693 + }, + { + "epoch": 0.1991589389691611, + "grad_norm": 0.7293786406517029, + "learning_rate": 9.942055443630885e-06, + "loss": 0.7735, + "step": 3694 + }, + { + "epoch": 0.19921285313780462, + "grad_norm": 0.7367222905158997, + "learning_rate": 9.942023256452066e-06, + "loss": 0.7797, + "step": 3695 + }, + { + "epoch": 0.19926676730644813, + "grad_norm": 0.7078450322151184, + "learning_rate": 9.941991060388155e-06, + "loss": 0.7192, + "step": 3696 + }, + { + "epoch": 0.19932068147509166, + "grad_norm": 0.7927302718162537, + "learning_rate": 9.941958855439211e-06, + "loss": 0.8249, + "step": 3697 + }, + { + "epoch": 0.19937459564373516, + "grad_norm": 0.806266725063324, + "learning_rate": 9.941926641605292e-06, + "loss": 0.7829, + "step": 3698 + }, + { + "epoch": 0.1994285098123787, + "grad_norm": 0.8022493720054626, + "learning_rate": 9.941894418886455e-06, + "loss": 0.7843, + "step": 3699 + }, + { + "epoch": 0.1994824239810222, + "grad_norm": 0.8877873420715332, + "learning_rate": 9.941862187282759e-06, + "loss": 0.7266, + "step": 3700 + }, + { + "epoch": 0.19953633814966573, + "grad_norm": 0.7944962382316589, + "learning_rate": 9.94182994679426e-06, + "loss": 0.8078, + "step": 3701 + }, + { + "epoch": 0.19959025231830926, + "grad_norm": 0.8684442639350891, + "learning_rate": 9.941797697421017e-06, + "loss": 0.7445, + "step": 3702 + }, + { + "epoch": 0.19964416648695277, + "grad_norm": 0.7841063141822815, + "learning_rate": 9.94176543916309e-06, + "loss": 0.7231, + "step": 3703 + }, + { + "epoch": 0.1996980806555963, + "grad_norm": 0.7657507658004761, + "learning_rate": 9.941733172020533e-06, + "loss": 0.7018, + "step": 3704 + }, + { + "epoch": 0.1997519948242398, + "grad_norm": 1.086627721786499, + "learning_rate": 
9.94170089599341e-06, + "loss": 0.7914, + "step": 3705 + }, + { + "epoch": 0.19980590899288334, + "grad_norm": 0.7400459051132202, + "learning_rate": 9.941668611081771e-06, + "loss": 0.7841, + "step": 3706 + }, + { + "epoch": 0.19985982316152684, + "grad_norm": 1.0587258338928223, + "learning_rate": 9.94163631728568e-06, + "loss": 0.923, + "step": 3707 + }, + { + "epoch": 0.19991373733017037, + "grad_norm": 0.8322579264640808, + "learning_rate": 9.941604014605193e-06, + "loss": 0.8095, + "step": 3708 + }, + { + "epoch": 0.19996765149881388, + "grad_norm": 0.6660327911376953, + "learning_rate": 9.94157170304037e-06, + "loss": 0.6977, + "step": 3709 + }, + { + "epoch": 0.2000215656674574, + "grad_norm": 0.8063632249832153, + "learning_rate": 9.941539382591267e-06, + "loss": 0.7693, + "step": 3710 + }, + { + "epoch": 0.20007547983610094, + "grad_norm": 0.7367355227470398, + "learning_rate": 9.941507053257942e-06, + "loss": 0.7312, + "step": 3711 + }, + { + "epoch": 0.20012939400474444, + "grad_norm": 0.7430408596992493, + "learning_rate": 9.941474715040454e-06, + "loss": 0.8077, + "step": 3712 + }, + { + "epoch": 0.20018330817338797, + "grad_norm": 0.8141972422599792, + "learning_rate": 9.94144236793886e-06, + "loss": 0.8017, + "step": 3713 + }, + { + "epoch": 0.20023722234203148, + "grad_norm": 0.7599862217903137, + "learning_rate": 9.94141001195322e-06, + "loss": 0.8644, + "step": 3714 + }, + { + "epoch": 0.200291136510675, + "grad_norm": 0.8302745819091797, + "learning_rate": 9.941377647083591e-06, + "loss": 0.8996, + "step": 3715 + }, + { + "epoch": 0.20034505067931851, + "grad_norm": 0.8288695812225342, + "learning_rate": 9.941345273330031e-06, + "loss": 0.7727, + "step": 3716 + }, + { + "epoch": 0.20039896484796205, + "grad_norm": 0.7157832980155945, + "learning_rate": 9.9413128906926e-06, + "loss": 0.7619, + "step": 3717 + }, + { + "epoch": 0.20045287901660555, + "grad_norm": 0.7811874151229858, + "learning_rate": 9.941280499171355e-06, + "loss": 0.7905, + 
"step": 3718 + }, + { + "epoch": 0.20050679318524908, + "grad_norm": 0.7507179975509644, + "learning_rate": 9.941248098766354e-06, + "loss": 0.7023, + "step": 3719 + }, + { + "epoch": 0.20056070735389261, + "grad_norm": 0.7824770212173462, + "learning_rate": 9.941215689477655e-06, + "loss": 0.8233, + "step": 3720 + }, + { + "epoch": 0.20061462152253612, + "grad_norm": 0.7690337896347046, + "learning_rate": 9.941183271305314e-06, + "loss": 0.7162, + "step": 3721 + }, + { + "epoch": 0.20066853569117965, + "grad_norm": 0.8605464696884155, + "learning_rate": 9.941150844249396e-06, + "loss": 0.8073, + "step": 3722 + }, + { + "epoch": 0.20072244985982315, + "grad_norm": 0.8741899132728577, + "learning_rate": 9.941118408309953e-06, + "loss": 0.8131, + "step": 3723 + }, + { + "epoch": 0.2007763640284667, + "grad_norm": 0.8655528426170349, + "learning_rate": 9.941085963487044e-06, + "loss": 0.8162, + "step": 3724 + }, + { + "epoch": 0.2008302781971102, + "grad_norm": 0.7617276310920715, + "learning_rate": 9.941053509780732e-06, + "loss": 0.8257, + "step": 3725 + }, + { + "epoch": 0.20088419236575372, + "grad_norm": 0.7816554307937622, + "learning_rate": 9.941021047191071e-06, + "loss": 0.7722, + "step": 3726 + }, + { + "epoch": 0.20093810653439723, + "grad_norm": 0.7922171354293823, + "learning_rate": 9.94098857571812e-06, + "loss": 0.8267, + "step": 3727 + }, + { + "epoch": 0.20099202070304076, + "grad_norm": 0.7950446009635925, + "learning_rate": 9.940956095361939e-06, + "loss": 0.7743, + "step": 3728 + }, + { + "epoch": 0.2010459348716843, + "grad_norm": 1.154969573020935, + "learning_rate": 9.940923606122584e-06, + "loss": 0.7542, + "step": 3729 + }, + { + "epoch": 0.2010998490403278, + "grad_norm": 0.9842036962509155, + "learning_rate": 9.940891108000116e-06, + "loss": 0.8469, + "step": 3730 + }, + { + "epoch": 0.20115376320897133, + "grad_norm": 0.7800561785697937, + "learning_rate": 9.940858600994593e-06, + "loss": 0.7894, + "step": 3731 + }, + { + "epoch": 
0.20120767737761483, + "grad_norm": 0.8366021513938904, + "learning_rate": 9.94082608510607e-06, + "loss": 0.8298, + "step": 3732 + }, + { + "epoch": 0.20126159154625836, + "grad_norm": 0.8020085692405701, + "learning_rate": 9.940793560334608e-06, + "loss": 0.8874, + "step": 3733 + }, + { + "epoch": 0.20131550571490187, + "grad_norm": 0.7151523232460022, + "learning_rate": 9.940761026680269e-06, + "loss": 0.697, + "step": 3734 + }, + { + "epoch": 0.2013694198835454, + "grad_norm": 0.8671187162399292, + "learning_rate": 9.940728484143105e-06, + "loss": 0.9408, + "step": 3735 + }, + { + "epoch": 0.2014233340521889, + "grad_norm": 0.8134783506393433, + "learning_rate": 9.940695932723179e-06, + "loss": 0.7751, + "step": 3736 + }, + { + "epoch": 0.20147724822083243, + "grad_norm": 0.8050068616867065, + "learning_rate": 9.940663372420546e-06, + "loss": 0.8676, + "step": 3737 + }, + { + "epoch": 0.20153116238947597, + "grad_norm": 0.9040514230728149, + "learning_rate": 9.940630803235269e-06, + "loss": 0.8499, + "step": 3738 + }, + { + "epoch": 0.20158507655811947, + "grad_norm": 0.8492094874382019, + "learning_rate": 9.9405982251674e-06, + "loss": 0.7006, + "step": 3739 + }, + { + "epoch": 0.201638990726763, + "grad_norm": 0.6991918683052063, + "learning_rate": 9.940565638217008e-06, + "loss": 0.73, + "step": 3740 + }, + { + "epoch": 0.2016929048954065, + "grad_norm": 0.8373433947563171, + "learning_rate": 9.940533042384142e-06, + "loss": 0.8514, + "step": 3741 + }, + { + "epoch": 0.20174681906405004, + "grad_norm": 0.8045080304145813, + "learning_rate": 9.940500437668864e-06, + "loss": 0.7678, + "step": 3742 + }, + { + "epoch": 0.20180073323269354, + "grad_norm": 0.8632493019104004, + "learning_rate": 9.940467824071233e-06, + "loss": 0.8541, + "step": 3743 + }, + { + "epoch": 0.20185464740133707, + "grad_norm": 0.8510474562644958, + "learning_rate": 9.940435201591307e-06, + "loss": 0.8124, + "step": 3744 + }, + { + "epoch": 0.2019085615699806, + "grad_norm": 
0.8647206425666809, + "learning_rate": 9.940402570229144e-06, + "loss": 0.8553, + "step": 3745 + }, + { + "epoch": 0.2019624757386241, + "grad_norm": 0.8359355330467224, + "learning_rate": 9.940369929984804e-06, + "loss": 0.7459, + "step": 3746 + }, + { + "epoch": 0.20201638990726764, + "grad_norm": 0.7150790691375732, + "learning_rate": 9.940337280858346e-06, + "loss": 0.7155, + "step": 3747 + }, + { + "epoch": 0.20207030407591114, + "grad_norm": 0.8442468047142029, + "learning_rate": 9.940304622849826e-06, + "loss": 0.8139, + "step": 3748 + }, + { + "epoch": 0.20212421824455468, + "grad_norm": 0.8318220973014832, + "learning_rate": 9.940271955959307e-06, + "loss": 0.7255, + "step": 3749 + }, + { + "epoch": 0.20217813241319818, + "grad_norm": 0.802943229675293, + "learning_rate": 9.940239280186842e-06, + "loss": 0.7781, + "step": 3750 + }, + { + "epoch": 0.2022320465818417, + "grad_norm": 0.7529780268669128, + "learning_rate": 9.940206595532497e-06, + "loss": 0.7723, + "step": 3751 + }, + { + "epoch": 0.20228596075048522, + "grad_norm": 0.748574435710907, + "learning_rate": 9.940173901996325e-06, + "loss": 0.7911, + "step": 3752 + }, + { + "epoch": 0.20233987491912875, + "grad_norm": 0.800564706325531, + "learning_rate": 9.940141199578386e-06, + "loss": 0.7973, + "step": 3753 + }, + { + "epoch": 0.20239378908777228, + "grad_norm": 0.7890446186065674, + "learning_rate": 9.940108488278741e-06, + "loss": 0.8618, + "step": 3754 + }, + { + "epoch": 0.20244770325641578, + "grad_norm": 0.8168792128562927, + "learning_rate": 9.940075768097445e-06, + "loss": 0.7948, + "step": 3755 + }, + { + "epoch": 0.20250161742505932, + "grad_norm": 0.7742816209793091, + "learning_rate": 9.940043039034562e-06, + "loss": 0.8215, + "step": 3756 + }, + { + "epoch": 0.20255553159370282, + "grad_norm": 0.7921069860458374, + "learning_rate": 9.940010301090147e-06, + "loss": 0.7379, + "step": 3757 + }, + { + "epoch": 0.20260944576234635, + "grad_norm": 0.7375590205192566, + "learning_rate": 
9.939977554264258e-06, + "loss": 0.7829, + "step": 3758 + }, + { + "epoch": 0.20266335993098986, + "grad_norm": 0.8653424382209778, + "learning_rate": 9.939944798556955e-06, + "loss": 0.8414, + "step": 3759 + }, + { + "epoch": 0.2027172740996334, + "grad_norm": 0.862486720085144, + "learning_rate": 9.9399120339683e-06, + "loss": 0.8531, + "step": 3760 + }, + { + "epoch": 0.2027711882682769, + "grad_norm": 0.737153947353363, + "learning_rate": 9.93987926049835e-06, + "loss": 0.8192, + "step": 3761 + }, + { + "epoch": 0.20282510243692042, + "grad_norm": 0.8391088843345642, + "learning_rate": 9.93984647814716e-06, + "loss": 0.7868, + "step": 3762 + }, + { + "epoch": 0.20287901660556396, + "grad_norm": 0.7767393589019775, + "learning_rate": 9.939813686914794e-06, + "loss": 0.7491, + "step": 3763 + }, + { + "epoch": 0.20293293077420746, + "grad_norm": 0.8916594982147217, + "learning_rate": 9.93978088680131e-06, + "loss": 0.7636, + "step": 3764 + }, + { + "epoch": 0.202986844942851, + "grad_norm": 0.8313565254211426, + "learning_rate": 9.939748077806766e-06, + "loss": 0.777, + "step": 3765 + }, + { + "epoch": 0.2030407591114945, + "grad_norm": 1.0501350164413452, + "learning_rate": 9.93971525993122e-06, + "loss": 0.8355, + "step": 3766 + }, + { + "epoch": 0.20309467328013803, + "grad_norm": 0.8451823592185974, + "learning_rate": 9.939682433174733e-06, + "loss": 0.7977, + "step": 3767 + }, + { + "epoch": 0.20314858744878153, + "grad_norm": 0.7655192613601685, + "learning_rate": 9.939649597537363e-06, + "loss": 0.803, + "step": 3768 + }, + { + "epoch": 0.20320250161742506, + "grad_norm": 0.885886549949646, + "learning_rate": 9.939616753019169e-06, + "loss": 0.7001, + "step": 3769 + }, + { + "epoch": 0.20325641578606857, + "grad_norm": 0.7583027482032776, + "learning_rate": 9.939583899620211e-06, + "loss": 0.7477, + "step": 3770 + }, + { + "epoch": 0.2033103299547121, + "grad_norm": 0.7712547779083252, + "learning_rate": 9.939551037340546e-06, + "loss": 0.7587, + "step": 
3771 + }, + { + "epoch": 0.20336424412335563, + "grad_norm": 0.8146941065788269, + "learning_rate": 9.939518166180235e-06, + "loss": 0.8707, + "step": 3772 + }, + { + "epoch": 0.20341815829199913, + "grad_norm": 0.813261866569519, + "learning_rate": 9.939485286139338e-06, + "loss": 0.793, + "step": 3773 + }, + { + "epoch": 0.20347207246064267, + "grad_norm": 0.8719590306282043, + "learning_rate": 9.93945239721791e-06, + "loss": 0.7468, + "step": 3774 + }, + { + "epoch": 0.20352598662928617, + "grad_norm": 0.7224612236022949, + "learning_rate": 9.939419499416015e-06, + "loss": 0.7042, + "step": 3775 + }, + { + "epoch": 0.2035799007979297, + "grad_norm": 0.9211709499359131, + "learning_rate": 9.939386592733709e-06, + "loss": 0.859, + "step": 3776 + }, + { + "epoch": 0.2036338149665732, + "grad_norm": 0.7238151431083679, + "learning_rate": 9.939353677171054e-06, + "loss": 0.7656, + "step": 3777 + }, + { + "epoch": 0.20368772913521674, + "grad_norm": 0.7677724957466125, + "learning_rate": 9.939320752728105e-06, + "loss": 0.7827, + "step": 3778 + }, + { + "epoch": 0.20374164330386024, + "grad_norm": 0.8721383213996887, + "learning_rate": 9.939287819404924e-06, + "loss": 0.8731, + "step": 3779 + }, + { + "epoch": 0.20379555747250377, + "grad_norm": 0.815819501876831, + "learning_rate": 9.93925487720157e-06, + "loss": 0.8022, + "step": 3780 + }, + { + "epoch": 0.2038494716411473, + "grad_norm": 0.7322037816047668, + "learning_rate": 9.939221926118102e-06, + "loss": 0.7593, + "step": 3781 + }, + { + "epoch": 0.2039033858097908, + "grad_norm": 0.787909984588623, + "learning_rate": 9.939188966154577e-06, + "loss": 0.6648, + "step": 3782 + }, + { + "epoch": 0.20395729997843434, + "grad_norm": 0.8070237636566162, + "learning_rate": 9.93915599731106e-06, + "loss": 0.8062, + "step": 3783 + }, + { + "epoch": 0.20401121414707785, + "grad_norm": 0.8590712547302246, + "learning_rate": 9.939123019587604e-06, + "loss": 0.7494, + "step": 3784 + }, + { + "epoch": 0.20406512831572138, + 
"grad_norm": 0.7172074317932129, + "learning_rate": 9.939090032984271e-06, + "loss": 0.7305, + "step": 3785 + }, + { + "epoch": 0.20411904248436488, + "grad_norm": 0.7950757145881653, + "learning_rate": 9.93905703750112e-06, + "loss": 0.8096, + "step": 3786 + }, + { + "epoch": 0.20417295665300841, + "grad_norm": 0.76169353723526, + "learning_rate": 9.939024033138212e-06, + "loss": 0.856, + "step": 3787 + }, + { + "epoch": 0.20422687082165192, + "grad_norm": 0.7239205241203308, + "learning_rate": 9.938991019895606e-06, + "loss": 0.7194, + "step": 3788 + }, + { + "epoch": 0.20428078499029545, + "grad_norm": 0.9215821027755737, + "learning_rate": 9.938957997773358e-06, + "loss": 0.9972, + "step": 3789 + }, + { + "epoch": 0.20433469915893898, + "grad_norm": 0.6955212950706482, + "learning_rate": 9.93892496677153e-06, + "loss": 0.7032, + "step": 3790 + }, + { + "epoch": 0.20438861332758249, + "grad_norm": 0.7507944107055664, + "learning_rate": 9.938891926890181e-06, + "loss": 0.6664, + "step": 3791 + }, + { + "epoch": 0.20444252749622602, + "grad_norm": 0.8046016097068787, + "learning_rate": 9.938858878129372e-06, + "loss": 0.8034, + "step": 3792 + }, + { + "epoch": 0.20449644166486952, + "grad_norm": 0.7906206250190735, + "learning_rate": 9.938825820489158e-06, + "loss": 0.745, + "step": 3793 + }, + { + "epoch": 0.20455035583351305, + "grad_norm": 0.820650577545166, + "learning_rate": 9.938792753969604e-06, + "loss": 0.8695, + "step": 3794 + }, + { + "epoch": 0.20460427000215656, + "grad_norm": 0.7975518107414246, + "learning_rate": 9.938759678570766e-06, + "loss": 0.7835, + "step": 3795 + }, + { + "epoch": 0.2046581841708001, + "grad_norm": 0.7118270993232727, + "learning_rate": 9.938726594292703e-06, + "loss": 0.6961, + "step": 3796 + }, + { + "epoch": 0.2047120983394436, + "grad_norm": 0.7525848746299744, + "learning_rate": 9.938693501135477e-06, + "loss": 0.7714, + "step": 3797 + }, + { + "epoch": 0.20476601250808713, + "grad_norm": 0.7751832604408264, + 
"learning_rate": 9.938660399099145e-06, + "loss": 0.8213, + "step": 3798 + }, + { + "epoch": 0.20481992667673066, + "grad_norm": 0.7307599186897278, + "learning_rate": 9.938627288183769e-06, + "loss": 0.7164, + "step": 3799 + }, + { + "epoch": 0.20487384084537416, + "grad_norm": 0.7432039380073547, + "learning_rate": 9.938594168389406e-06, + "loss": 0.8215, + "step": 3800 + }, + { + "epoch": 0.2049277550140177, + "grad_norm": 0.8611830472946167, + "learning_rate": 9.938561039716116e-06, + "loss": 0.829, + "step": 3801 + }, + { + "epoch": 0.2049816691826612, + "grad_norm": 0.8893013596534729, + "learning_rate": 9.93852790216396e-06, + "loss": 0.7413, + "step": 3802 + }, + { + "epoch": 0.20503558335130473, + "grad_norm": 0.7722970843315125, + "learning_rate": 9.938494755732999e-06, + "loss": 0.7398, + "step": 3803 + }, + { + "epoch": 0.20508949751994823, + "grad_norm": 0.762994110584259, + "learning_rate": 9.938461600423289e-06, + "loss": 0.8195, + "step": 3804 + }, + { + "epoch": 0.20514341168859176, + "grad_norm": 0.7434782981872559, + "learning_rate": 9.938428436234891e-06, + "loss": 0.7917, + "step": 3805 + }, + { + "epoch": 0.20519732585723527, + "grad_norm": 0.7441586852073669, + "learning_rate": 9.938395263167866e-06, + "loss": 0.7852, + "step": 3806 + }, + { + "epoch": 0.2052512400258788, + "grad_norm": 0.7333529591560364, + "learning_rate": 9.93836208122227e-06, + "loss": 0.6912, + "step": 3807 + }, + { + "epoch": 0.20530515419452233, + "grad_norm": 0.8772805333137512, + "learning_rate": 9.938328890398167e-06, + "loss": 0.7828, + "step": 3808 + }, + { + "epoch": 0.20535906836316584, + "grad_norm": 0.7632616758346558, + "learning_rate": 9.938295690695614e-06, + "loss": 0.7235, + "step": 3809 + }, + { + "epoch": 0.20541298253180937, + "grad_norm": 0.7122440338134766, + "learning_rate": 9.93826248211467e-06, + "loss": 0.7734, + "step": 3810 + }, + { + "epoch": 0.20546689670045287, + "grad_norm": 0.7449793815612793, + "learning_rate": 9.938229264655399e-06, + 
"loss": 0.6826, + "step": 3811 + }, + { + "epoch": 0.2055208108690964, + "grad_norm": 0.7615137696266174, + "learning_rate": 9.938196038317856e-06, + "loss": 0.8139, + "step": 3812 + }, + { + "epoch": 0.2055747250377399, + "grad_norm": 0.7921400666236877, + "learning_rate": 9.938162803102102e-06, + "loss": 0.8424, + "step": 3813 + }, + { + "epoch": 0.20562863920638344, + "grad_norm": 0.8665443062782288, + "learning_rate": 9.938129559008198e-06, + "loss": 0.665, + "step": 3814 + }, + { + "epoch": 0.20568255337502694, + "grad_norm": 0.7882665991783142, + "learning_rate": 9.938096306036202e-06, + "loss": 0.8162, + "step": 3815 + }, + { + "epoch": 0.20573646754367048, + "grad_norm": 0.7418076395988464, + "learning_rate": 9.938063044186176e-06, + "loss": 0.7629, + "step": 3816 + }, + { + "epoch": 0.205790381712314, + "grad_norm": 0.8741267919540405, + "learning_rate": 9.93802977345818e-06, + "loss": 0.851, + "step": 3817 + }, + { + "epoch": 0.2058442958809575, + "grad_norm": 0.7862716913223267, + "learning_rate": 9.937996493852271e-06, + "loss": 0.7542, + "step": 3818 + }, + { + "epoch": 0.20589821004960104, + "grad_norm": 0.8344624042510986, + "learning_rate": 9.937963205368509e-06, + "loss": 0.7366, + "step": 3819 + }, + { + "epoch": 0.20595212421824455, + "grad_norm": 0.9976859092712402, + "learning_rate": 9.937929908006957e-06, + "loss": 0.9252, + "step": 3820 + }, + { + "epoch": 0.20600603838688808, + "grad_norm": 0.8346890807151794, + "learning_rate": 9.937896601767672e-06, + "loss": 0.8172, + "step": 3821 + }, + { + "epoch": 0.20605995255553158, + "grad_norm": 0.8109154105186462, + "learning_rate": 9.937863286650715e-06, + "loss": 0.8869, + "step": 3822 + }, + { + "epoch": 0.20611386672417512, + "grad_norm": 0.7664018869400024, + "learning_rate": 9.937829962656147e-06, + "loss": 0.7821, + "step": 3823 + }, + { + "epoch": 0.20616778089281862, + "grad_norm": 0.9373911619186401, + "learning_rate": 9.937796629784025e-06, + "loss": 0.7391, + "step": 3824 + }, + { + 
"epoch": 0.20622169506146215, + "grad_norm": 0.7312552332878113, + "learning_rate": 9.937763288034411e-06, + "loss": 0.7328, + "step": 3825 + }, + { + "epoch": 0.20627560923010568, + "grad_norm": 0.9266682863235474, + "learning_rate": 9.937729937407365e-06, + "loss": 0.8976, + "step": 3826 + }, + { + "epoch": 0.2063295233987492, + "grad_norm": 0.7579758763313293, + "learning_rate": 9.937696577902947e-06, + "loss": 0.766, + "step": 3827 + }, + { + "epoch": 0.20638343756739272, + "grad_norm": 0.8648816347122192, + "learning_rate": 9.937663209521216e-06, + "loss": 0.9122, + "step": 3828 + }, + { + "epoch": 0.20643735173603622, + "grad_norm": 0.8788310289382935, + "learning_rate": 9.937629832262231e-06, + "loss": 0.9152, + "step": 3829 + }, + { + "epoch": 0.20649126590467975, + "grad_norm": 0.8865007162094116, + "learning_rate": 9.937596446126057e-06, + "loss": 0.8767, + "step": 3830 + }, + { + "epoch": 0.20654518007332326, + "grad_norm": 0.7323981523513794, + "learning_rate": 9.937563051112748e-06, + "loss": 0.7733, + "step": 3831 + }, + { + "epoch": 0.2065990942419668, + "grad_norm": 0.8782559037208557, + "learning_rate": 9.937529647222368e-06, + "loss": 0.7694, + "step": 3832 + }, + { + "epoch": 0.2066530084106103, + "grad_norm": 0.8300665020942688, + "learning_rate": 9.937496234454974e-06, + "loss": 0.8386, + "step": 3833 + }, + { + "epoch": 0.20670692257925383, + "grad_norm": 0.8438191413879395, + "learning_rate": 9.937462812810628e-06, + "loss": 0.7394, + "step": 3834 + }, + { + "epoch": 0.20676083674789736, + "grad_norm": 0.7255253195762634, + "learning_rate": 9.937429382289391e-06, + "loss": 0.6973, + "step": 3835 + }, + { + "epoch": 0.20681475091654086, + "grad_norm": 0.8600755929946899, + "learning_rate": 9.93739594289132e-06, + "loss": 0.838, + "step": 3836 + }, + { + "epoch": 0.2068686650851844, + "grad_norm": 0.788693904876709, + "learning_rate": 9.937362494616479e-06, + "loss": 0.7911, + "step": 3837 + }, + { + "epoch": 0.2069225792538279, + "grad_norm": 
0.808438777923584, + "learning_rate": 9.937329037464924e-06, + "loss": 0.7802, + "step": 3838 + }, + { + "epoch": 0.20697649342247143, + "grad_norm": 0.9273937344551086, + "learning_rate": 9.937295571436719e-06, + "loss": 0.8589, + "step": 3839 + }, + { + "epoch": 0.20703040759111493, + "grad_norm": 0.7375195026397705, + "learning_rate": 9.937262096531922e-06, + "loss": 0.7017, + "step": 3840 + }, + { + "epoch": 0.20708432175975847, + "grad_norm": 0.7502869963645935, + "learning_rate": 9.937228612750594e-06, + "loss": 0.7577, + "step": 3841 + }, + { + "epoch": 0.20713823592840197, + "grad_norm": 0.8005609512329102, + "learning_rate": 9.937195120092794e-06, + "loss": 0.7411, + "step": 3842 + }, + { + "epoch": 0.2071921500970455, + "grad_norm": 0.8089357018470764, + "learning_rate": 9.937161618558583e-06, + "loss": 0.8149, + "step": 3843 + }, + { + "epoch": 0.20724606426568903, + "grad_norm": 0.946266233921051, + "learning_rate": 9.937128108148022e-06, + "loss": 0.8676, + "step": 3844 + }, + { + "epoch": 0.20729997843433254, + "grad_norm": 0.793250322341919, + "learning_rate": 9.937094588861171e-06, + "loss": 0.8402, + "step": 3845 + }, + { + "epoch": 0.20735389260297607, + "grad_norm": 0.9192420244216919, + "learning_rate": 9.937061060698088e-06, + "loss": 0.8381, + "step": 3846 + }, + { + "epoch": 0.20740780677161957, + "grad_norm": 0.7944622039794922, + "learning_rate": 9.937027523658838e-06, + "loss": 0.8281, + "step": 3847 + }, + { + "epoch": 0.2074617209402631, + "grad_norm": 0.8567733764648438, + "learning_rate": 9.936993977743476e-06, + "loss": 0.7528, + "step": 3848 + }, + { + "epoch": 0.2075156351089066, + "grad_norm": 0.9478929042816162, + "learning_rate": 9.936960422952064e-06, + "loss": 0.6957, + "step": 3849 + }, + { + "epoch": 0.20756954927755014, + "grad_norm": 0.8856588006019592, + "learning_rate": 9.936926859284665e-06, + "loss": 0.9112, + "step": 3850 + }, + { + "epoch": 0.20762346344619367, + "grad_norm": 0.8800935745239258, + "learning_rate": 
9.936893286741336e-06, + "loss": 0.7313, + "step": 3851 + }, + { + "epoch": 0.20767737761483718, + "grad_norm": 0.773314893245697, + "learning_rate": 9.936859705322139e-06, + "loss": 0.7953, + "step": 3852 + }, + { + "epoch": 0.2077312917834807, + "grad_norm": 0.8045309782028198, + "learning_rate": 9.936826115027136e-06, + "loss": 0.7789, + "step": 3853 + }, + { + "epoch": 0.2077852059521242, + "grad_norm": 0.7337809801101685, + "learning_rate": 9.936792515856383e-06, + "loss": 0.7471, + "step": 3854 + }, + { + "epoch": 0.20783912012076775, + "grad_norm": 0.7467783093452454, + "learning_rate": 9.936758907809944e-06, + "loss": 0.746, + "step": 3855 + }, + { + "epoch": 0.20789303428941125, + "grad_norm": 0.896782398223877, + "learning_rate": 9.936725290887878e-06, + "loss": 0.8753, + "step": 3856 + }, + { + "epoch": 0.20794694845805478, + "grad_norm": 0.7642794251441956, + "learning_rate": 9.936691665090246e-06, + "loss": 0.744, + "step": 3857 + }, + { + "epoch": 0.20800086262669829, + "grad_norm": 0.9514477849006653, + "learning_rate": 9.936658030417108e-06, + "loss": 0.9586, + "step": 3858 + }, + { + "epoch": 0.20805477679534182, + "grad_norm": 0.8868480324745178, + "learning_rate": 9.936624386868524e-06, + "loss": 0.7381, + "step": 3859 + }, + { + "epoch": 0.20810869096398535, + "grad_norm": 0.7855881452560425, + "learning_rate": 9.936590734444555e-06, + "loss": 0.7942, + "step": 3860 + }, + { + "epoch": 0.20816260513262885, + "grad_norm": 0.7549954056739807, + "learning_rate": 9.936557073145264e-06, + "loss": 0.8478, + "step": 3861 + }, + { + "epoch": 0.20821651930127238, + "grad_norm": 0.7425951361656189, + "learning_rate": 9.936523402970707e-06, + "loss": 0.7854, + "step": 3862 + }, + { + "epoch": 0.2082704334699159, + "grad_norm": 0.7873994708061218, + "learning_rate": 9.936489723920947e-06, + "loss": 0.6917, + "step": 3863 + }, + { + "epoch": 0.20832434763855942, + "grad_norm": 0.7681507468223572, + "learning_rate": 9.936456035996044e-06, + "loss": 0.7427, + 
"step": 3864 + }, + { + "epoch": 0.20837826180720292, + "grad_norm": 0.8043473362922668, + "learning_rate": 9.93642233919606e-06, + "loss": 0.7319, + "step": 3865 + }, + { + "epoch": 0.20843217597584646, + "grad_norm": 0.9194585084915161, + "learning_rate": 9.936388633521055e-06, + "loss": 0.755, + "step": 3866 + }, + { + "epoch": 0.20848609014448996, + "grad_norm": 0.7365962862968445, + "learning_rate": 9.936354918971087e-06, + "loss": 0.7855, + "step": 3867 + }, + { + "epoch": 0.2085400043131335, + "grad_norm": 0.8254776000976562, + "learning_rate": 9.936321195546218e-06, + "loss": 0.7854, + "step": 3868 + }, + { + "epoch": 0.20859391848177702, + "grad_norm": 0.8259122967720032, + "learning_rate": 9.936287463246513e-06, + "loss": 0.8759, + "step": 3869 + }, + { + "epoch": 0.20864783265042053, + "grad_norm": 0.731363832950592, + "learning_rate": 9.936253722072026e-06, + "loss": 0.76, + "step": 3870 + }, + { + "epoch": 0.20870174681906406, + "grad_norm": 0.908054530620575, + "learning_rate": 9.93621997202282e-06, + "loss": 0.8865, + "step": 3871 + }, + { + "epoch": 0.20875566098770756, + "grad_norm": 0.7576562166213989, + "learning_rate": 9.936186213098958e-06, + "loss": 0.8276, + "step": 3872 + }, + { + "epoch": 0.2088095751563511, + "grad_norm": 0.8297492861747742, + "learning_rate": 9.9361524453005e-06, + "loss": 0.8799, + "step": 3873 + }, + { + "epoch": 0.2088634893249946, + "grad_norm": 0.7945959568023682, + "learning_rate": 9.936118668627502e-06, + "loss": 0.8448, + "step": 3874 + }, + { + "epoch": 0.20891740349363813, + "grad_norm": 0.8161780834197998, + "learning_rate": 9.936084883080031e-06, + "loss": 0.8835, + "step": 3875 + }, + { + "epoch": 0.20897131766228164, + "grad_norm": 0.768398106098175, + "learning_rate": 9.936051088658145e-06, + "loss": 0.7984, + "step": 3876 + }, + { + "epoch": 0.20902523183092517, + "grad_norm": 0.8847882151603699, + "learning_rate": 9.936017285361903e-06, + "loss": 0.8757, + "step": 3877 + }, + { + "epoch": 
0.2090791459995687, + "grad_norm": 0.8796868324279785, + "learning_rate": 9.93598347319137e-06, + "loss": 0.8227, + "step": 3878 + }, + { + "epoch": 0.2091330601682122, + "grad_norm": 0.8362753987312317, + "learning_rate": 9.935949652146604e-06, + "loss": 0.7892, + "step": 3879 + }, + { + "epoch": 0.20918697433685574, + "grad_norm": 1.0995301008224487, + "learning_rate": 9.935915822227664e-06, + "loss": 0.7227, + "step": 3880 + }, + { + "epoch": 0.20924088850549924, + "grad_norm": 0.7771546244621277, + "learning_rate": 9.935881983434616e-06, + "loss": 0.8025, + "step": 3881 + }, + { + "epoch": 0.20929480267414277, + "grad_norm": 0.8586302995681763, + "learning_rate": 9.935848135767516e-06, + "loss": 0.7086, + "step": 3882 + }, + { + "epoch": 0.20934871684278628, + "grad_norm": 0.956278920173645, + "learning_rate": 9.935814279226428e-06, + "loss": 0.8625, + "step": 3883 + }, + { + "epoch": 0.2094026310114298, + "grad_norm": 0.8021535277366638, + "learning_rate": 9.935780413811412e-06, + "loss": 0.8392, + "step": 3884 + }, + { + "epoch": 0.2094565451800733, + "grad_norm": 0.7699674367904663, + "learning_rate": 9.935746539522526e-06, + "loss": 0.8322, + "step": 3885 + }, + { + "epoch": 0.20951045934871684, + "grad_norm": 0.7814954519271851, + "learning_rate": 9.935712656359835e-06, + "loss": 0.9123, + "step": 3886 + }, + { + "epoch": 0.20956437351736037, + "grad_norm": 0.7062190175056458, + "learning_rate": 9.935678764323397e-06, + "loss": 0.7398, + "step": 3887 + }, + { + "epoch": 0.20961828768600388, + "grad_norm": 0.8294083476066589, + "learning_rate": 9.935644863413276e-06, + "loss": 0.8381, + "step": 3888 + }, + { + "epoch": 0.2096722018546474, + "grad_norm": 0.779521107673645, + "learning_rate": 9.93561095362953e-06, + "loss": 0.7838, + "step": 3889 + }, + { + "epoch": 0.20972611602329091, + "grad_norm": 0.894511878490448, + "learning_rate": 9.935577034972224e-06, + "loss": 0.7278, + "step": 3890 + }, + { + "epoch": 0.20978003019193445, + "grad_norm": 
0.6891781091690063, + "learning_rate": 9.935543107441414e-06, + "loss": 0.6854, + "step": 3891 + }, + { + "epoch": 0.20983394436057795, + "grad_norm": 0.7697615623474121, + "learning_rate": 9.935509171037161e-06, + "loss": 0.7901, + "step": 3892 + }, + { + "epoch": 0.20988785852922148, + "grad_norm": 0.7699109315872192, + "learning_rate": 9.935475225759532e-06, + "loss": 0.7982, + "step": 3893 + }, + { + "epoch": 0.209941772697865, + "grad_norm": 0.7885197401046753, + "learning_rate": 9.93544127160858e-06, + "loss": 0.7911, + "step": 3894 + }, + { + "epoch": 0.20999568686650852, + "grad_norm": 0.7754570245742798, + "learning_rate": 9.935407308584374e-06, + "loss": 0.6886, + "step": 3895 + }, + { + "epoch": 0.21004960103515205, + "grad_norm": 0.8235013484954834, + "learning_rate": 9.935373336686971e-06, + "loss": 0.845, + "step": 3896 + }, + { + "epoch": 0.21010351520379555, + "grad_norm": 0.7366604208946228, + "learning_rate": 9.93533935591643e-06, + "loss": 0.7499, + "step": 3897 + }, + { + "epoch": 0.2101574293724391, + "grad_norm": 0.7987866401672363, + "learning_rate": 9.935305366272816e-06, + "loss": 0.7866, + "step": 3898 + }, + { + "epoch": 0.2102113435410826, + "grad_norm": 0.8240886926651001, + "learning_rate": 9.93527136775619e-06, + "loss": 0.8027, + "step": 3899 + }, + { + "epoch": 0.21026525770972612, + "grad_norm": 0.7460751533508301, + "learning_rate": 9.93523736036661e-06, + "loss": 0.7517, + "step": 3900 + }, + { + "epoch": 0.21031917187836963, + "grad_norm": 0.7845814228057861, + "learning_rate": 9.935203344104139e-06, + "loss": 0.7533, + "step": 3901 + }, + { + "epoch": 0.21037308604701316, + "grad_norm": 0.7805215120315552, + "learning_rate": 9.935169318968838e-06, + "loss": 0.7034, + "step": 3902 + }, + { + "epoch": 0.21042700021565666, + "grad_norm": 0.7909711003303528, + "learning_rate": 9.935135284960769e-06, + "loss": 0.8253, + "step": 3903 + }, + { + "epoch": 0.2104809143843002, + "grad_norm": 0.7670220136642456, + "learning_rate": 
9.93510124207999e-06, + "loss": 0.8114, + "step": 3904 + }, + { + "epoch": 0.21053482855294373, + "grad_norm": 0.7751194834709167, + "learning_rate": 9.935067190326566e-06, + "loss": 0.875, + "step": 3905 + }, + { + "epoch": 0.21058874272158723, + "grad_norm": 0.9303408265113831, + "learning_rate": 9.935033129700557e-06, + "loss": 0.9104, + "step": 3906 + }, + { + "epoch": 0.21064265689023076, + "grad_norm": 0.786558210849762, + "learning_rate": 9.934999060202024e-06, + "loss": 0.7453, + "step": 3907 + }, + { + "epoch": 0.21069657105887427, + "grad_norm": 0.8450469970703125, + "learning_rate": 9.934964981831028e-06, + "loss": 0.9733, + "step": 3908 + }, + { + "epoch": 0.2107504852275178, + "grad_norm": 0.8045774698257446, + "learning_rate": 9.93493089458763e-06, + "loss": 0.6763, + "step": 3909 + }, + { + "epoch": 0.2108043993961613, + "grad_norm": 0.7320234775543213, + "learning_rate": 9.934896798471894e-06, + "loss": 0.7668, + "step": 3910 + }, + { + "epoch": 0.21085831356480483, + "grad_norm": 0.8155072331428528, + "learning_rate": 9.934862693483878e-06, + "loss": 0.8186, + "step": 3911 + }, + { + "epoch": 0.21091222773344834, + "grad_norm": 0.7914832234382629, + "learning_rate": 9.934828579623643e-06, + "loss": 0.7977, + "step": 3912 + }, + { + "epoch": 0.21096614190209187, + "grad_norm": 0.7110108733177185, + "learning_rate": 9.934794456891254e-06, + "loss": 0.6576, + "step": 3913 + }, + { + "epoch": 0.2110200560707354, + "grad_norm": 1.0787992477416992, + "learning_rate": 9.934760325286768e-06, + "loss": 0.871, + "step": 3914 + }, + { + "epoch": 0.2110739702393789, + "grad_norm": 0.798880934715271, + "learning_rate": 9.93472618481025e-06, + "loss": 0.7115, + "step": 3915 + }, + { + "epoch": 0.21112788440802244, + "grad_norm": 0.945782482624054, + "learning_rate": 9.934692035461759e-06, + "loss": 0.7806, + "step": 3916 + }, + { + "epoch": 0.21118179857666594, + "grad_norm": 0.8860074877738953, + "learning_rate": 9.934657877241358e-06, + "loss": 0.735, + 
"step": 3917 + }, + { + "epoch": 0.21123571274530947, + "grad_norm": 0.7661596536636353, + "learning_rate": 9.934623710149107e-06, + "loss": 0.747, + "step": 3918 + }, + { + "epoch": 0.21128962691395298, + "grad_norm": 0.7670447826385498, + "learning_rate": 9.934589534185068e-06, + "loss": 0.7366, + "step": 3919 + }, + { + "epoch": 0.2113435410825965, + "grad_norm": 0.7264759540557861, + "learning_rate": 9.934555349349305e-06, + "loss": 0.7353, + "step": 3920 + }, + { + "epoch": 0.21139745525124, + "grad_norm": 0.7623618841171265, + "learning_rate": 9.934521155641874e-06, + "loss": 0.7758, + "step": 3921 + }, + { + "epoch": 0.21145136941988354, + "grad_norm": 0.6979674100875854, + "learning_rate": 9.93448695306284e-06, + "loss": 0.7376, + "step": 3922 + }, + { + "epoch": 0.21150528358852708, + "grad_norm": 0.7221145033836365, + "learning_rate": 9.934452741612265e-06, + "loss": 0.7918, + "step": 3923 + }, + { + "epoch": 0.21155919775717058, + "grad_norm": 0.7353740930557251, + "learning_rate": 9.934418521290209e-06, + "loss": 0.7487, + "step": 3924 + }, + { + "epoch": 0.2116131119258141, + "grad_norm": 0.8132720589637756, + "learning_rate": 9.934384292096734e-06, + "loss": 0.8121, + "step": 3925 + }, + { + "epoch": 0.21166702609445762, + "grad_norm": 0.8918466567993164, + "learning_rate": 9.9343500540319e-06, + "loss": 0.8911, + "step": 3926 + }, + { + "epoch": 0.21172094026310115, + "grad_norm": 0.7636724710464478, + "learning_rate": 9.934315807095774e-06, + "loss": 0.8012, + "step": 3927 + }, + { + "epoch": 0.21177485443174465, + "grad_norm": 0.889636754989624, + "learning_rate": 9.93428155128841e-06, + "loss": 0.7793, + "step": 3928 + }, + { + "epoch": 0.21182876860038818, + "grad_norm": 0.7906842827796936, + "learning_rate": 9.934247286609875e-06, + "loss": 0.7483, + "step": 3929 + }, + { + "epoch": 0.2118826827690317, + "grad_norm": 0.8311534523963928, + "learning_rate": 9.934213013060228e-06, + "loss": 0.8796, + "step": 3930 + }, + { + "epoch": 
0.21193659693767522, + "grad_norm": 0.7643389105796814, + "learning_rate": 9.934178730639531e-06, + "loss": 0.7587, + "step": 3931 + }, + { + "epoch": 0.21199051110631875, + "grad_norm": 0.8276751637458801, + "learning_rate": 9.934144439347849e-06, + "loss": 0.745, + "step": 3932 + }, + { + "epoch": 0.21204442527496226, + "grad_norm": 0.7427680492401123, + "learning_rate": 9.934110139185238e-06, + "loss": 0.7445, + "step": 3933 + }, + { + "epoch": 0.2120983394436058, + "grad_norm": 0.7343453168869019, + "learning_rate": 9.934075830151762e-06, + "loss": 0.7037, + "step": 3934 + }, + { + "epoch": 0.2121522536122493, + "grad_norm": 0.8002830743789673, + "learning_rate": 9.934041512247485e-06, + "loss": 0.8458, + "step": 3935 + }, + { + "epoch": 0.21220616778089282, + "grad_norm": 0.7045907974243164, + "learning_rate": 9.934007185472466e-06, + "loss": 0.7626, + "step": 3936 + }, + { + "epoch": 0.21226008194953633, + "grad_norm": 0.8169815540313721, + "learning_rate": 9.933972849826767e-06, + "loss": 0.8116, + "step": 3937 + }, + { + "epoch": 0.21231399611817986, + "grad_norm": 0.6935508847236633, + "learning_rate": 9.933938505310451e-06, + "loss": 0.7244, + "step": 3938 + }, + { + "epoch": 0.21236791028682336, + "grad_norm": 0.8311216235160828, + "learning_rate": 9.93390415192358e-06, + "loss": 0.8238, + "step": 3939 + }, + { + "epoch": 0.2124218244554669, + "grad_norm": 0.84473717212677, + "learning_rate": 9.933869789666213e-06, + "loss": 0.8334, + "step": 3940 + }, + { + "epoch": 0.21247573862411043, + "grad_norm": 0.7648805379867554, + "learning_rate": 9.933835418538414e-06, + "loss": 0.8705, + "step": 3941 + }, + { + "epoch": 0.21252965279275393, + "grad_norm": 0.752015233039856, + "learning_rate": 9.933801038540245e-06, + "loss": 0.7827, + "step": 3942 + }, + { + "epoch": 0.21258356696139746, + "grad_norm": 0.9639801383018494, + "learning_rate": 9.933766649671765e-06, + "loss": 0.7827, + "step": 3943 + }, + { + "epoch": 0.21263748113004097, + "grad_norm": 
0.7730019092559814, + "learning_rate": 9.933732251933042e-06, + "loss": 0.7868, + "step": 3944 + }, + { + "epoch": 0.2126913952986845, + "grad_norm": 0.8141674995422363, + "learning_rate": 9.93369784532413e-06, + "loss": 0.7699, + "step": 3945 + }, + { + "epoch": 0.212745309467328, + "grad_norm": 0.8050745725631714, + "learning_rate": 9.933663429845097e-06, + "loss": 0.706, + "step": 3946 + }, + { + "epoch": 0.21279922363597154, + "grad_norm": 0.8519124388694763, + "learning_rate": 9.933629005496002e-06, + "loss": 0.8638, + "step": 3947 + }, + { + "epoch": 0.21285313780461504, + "grad_norm": 0.7999953627586365, + "learning_rate": 9.933594572276907e-06, + "loss": 0.7263, + "step": 3948 + }, + { + "epoch": 0.21290705197325857, + "grad_norm": 0.8291010856628418, + "learning_rate": 9.933560130187875e-06, + "loss": 0.8241, + "step": 3949 + }, + { + "epoch": 0.2129609661419021, + "grad_norm": 0.8472279906272888, + "learning_rate": 9.933525679228965e-06, + "loss": 0.8031, + "step": 3950 + }, + { + "epoch": 0.2130148803105456, + "grad_norm": 0.8077083826065063, + "learning_rate": 9.933491219400244e-06, + "loss": 0.7499, + "step": 3951 + }, + { + "epoch": 0.21306879447918914, + "grad_norm": 0.7736468315124512, + "learning_rate": 9.933456750701771e-06, + "loss": 0.7767, + "step": 3952 + }, + { + "epoch": 0.21312270864783264, + "grad_norm": 0.7541413307189941, + "learning_rate": 9.933422273133606e-06, + "loss": 0.7952, + "step": 3953 + }, + { + "epoch": 0.21317662281647617, + "grad_norm": 0.8432198762893677, + "learning_rate": 9.933387786695816e-06, + "loss": 0.8618, + "step": 3954 + }, + { + "epoch": 0.21323053698511968, + "grad_norm": 0.9090738296508789, + "learning_rate": 9.933353291388458e-06, + "loss": 0.8484, + "step": 3955 + }, + { + "epoch": 0.2132844511537632, + "grad_norm": 0.7549050450325012, + "learning_rate": 9.933318787211597e-06, + "loss": 0.842, + "step": 3956 + }, + { + "epoch": 0.21333836532240674, + "grad_norm": 0.7340126633644104, + "learning_rate": 
9.933284274165293e-06, + "loss": 0.7253, + "step": 3957 + }, + { + "epoch": 0.21339227949105025, + "grad_norm": 0.7898053526878357, + "learning_rate": 9.933249752249609e-06, + "loss": 0.7364, + "step": 3958 + }, + { + "epoch": 0.21344619365969378, + "grad_norm": 0.7347330451011658, + "learning_rate": 9.933215221464609e-06, + "loss": 0.7613, + "step": 3959 + }, + { + "epoch": 0.21350010782833728, + "grad_norm": 0.7483309507369995, + "learning_rate": 9.933180681810354e-06, + "loss": 0.7351, + "step": 3960 + }, + { + "epoch": 0.21355402199698081, + "grad_norm": 0.8972424864768982, + "learning_rate": 9.933146133286905e-06, + "loss": 0.8067, + "step": 3961 + }, + { + "epoch": 0.21360793616562432, + "grad_norm": 0.9186527729034424, + "learning_rate": 9.933111575894323e-06, + "loss": 0.8375, + "step": 3962 + }, + { + "epoch": 0.21366185033426785, + "grad_norm": 0.7975471019744873, + "learning_rate": 9.933077009632672e-06, + "loss": 0.7288, + "step": 3963 + }, + { + "epoch": 0.21371576450291135, + "grad_norm": 0.8140373229980469, + "learning_rate": 9.933042434502014e-06, + "loss": 0.851, + "step": 3964 + }, + { + "epoch": 0.21376967867155489, + "grad_norm": 0.7657467126846313, + "learning_rate": 9.933007850502412e-06, + "loss": 0.7874, + "step": 3965 + }, + { + "epoch": 0.21382359284019842, + "grad_norm": 0.7267435193061829, + "learning_rate": 9.932973257633927e-06, + "loss": 0.7065, + "step": 3966 + }, + { + "epoch": 0.21387750700884192, + "grad_norm": 0.8350456357002258, + "learning_rate": 9.932938655896622e-06, + "loss": 0.949, + "step": 3967 + }, + { + "epoch": 0.21393142117748545, + "grad_norm": 0.7870462536811829, + "learning_rate": 9.932904045290557e-06, + "loss": 0.754, + "step": 3968 + }, + { + "epoch": 0.21398533534612896, + "grad_norm": 0.9062042236328125, + "learning_rate": 9.932869425815797e-06, + "loss": 0.8169, + "step": 3969 + }, + { + "epoch": 0.2140392495147725, + "grad_norm": 0.7563914656639099, + "learning_rate": 9.932834797472401e-06, + "loss": 0.7848, 
+ "step": 3970 + }, + { + "epoch": 0.214093163683416, + "grad_norm": 0.8287369012832642, + "learning_rate": 9.932800160260437e-06, + "loss": 0.7775, + "step": 3971 + }, + { + "epoch": 0.21414707785205953, + "grad_norm": 0.7961543202400208, + "learning_rate": 9.93276551417996e-06, + "loss": 0.785, + "step": 3972 + }, + { + "epoch": 0.21420099202070303, + "grad_norm": 1.1722525358200073, + "learning_rate": 9.932730859231038e-06, + "loss": 0.8139, + "step": 3973 + }, + { + "epoch": 0.21425490618934656, + "grad_norm": 0.7425355315208435, + "learning_rate": 9.93269619541373e-06, + "loss": 0.8, + "step": 3974 + }, + { + "epoch": 0.2143088203579901, + "grad_norm": 0.7701120376586914, + "learning_rate": 9.9326615227281e-06, + "loss": 0.7766, + "step": 3975 + }, + { + "epoch": 0.2143627345266336, + "grad_norm": 0.7475442886352539, + "learning_rate": 9.932626841174212e-06, + "loss": 0.731, + "step": 3976 + }, + { + "epoch": 0.21441664869527713, + "grad_norm": 0.7970359325408936, + "learning_rate": 9.932592150752122e-06, + "loss": 0.685, + "step": 3977 + }, + { + "epoch": 0.21447056286392063, + "grad_norm": 0.7397587299346924, + "learning_rate": 9.9325574514619e-06, + "loss": 0.7768, + "step": 3978 + }, + { + "epoch": 0.21452447703256416, + "grad_norm": 0.7406956553459167, + "learning_rate": 9.932522743303604e-06, + "loss": 0.7288, + "step": 3979 + }, + { + "epoch": 0.21457839120120767, + "grad_norm": 0.7971269488334656, + "learning_rate": 9.932488026277295e-06, + "loss": 0.8475, + "step": 3980 + }, + { + "epoch": 0.2146323053698512, + "grad_norm": 0.8104044198989868, + "learning_rate": 9.93245330038304e-06, + "loss": 0.8302, + "step": 3981 + }, + { + "epoch": 0.2146862195384947, + "grad_norm": 0.7473177313804626, + "learning_rate": 9.9324185656209e-06, + "loss": 0.7144, + "step": 3982 + }, + { + "epoch": 0.21474013370713824, + "grad_norm": 0.8730058670043945, + "learning_rate": 9.932383821990937e-06, + "loss": 0.7823, + "step": 3983 + }, + { + "epoch": 0.21479404787578177, + 
"grad_norm": 0.7489315271377563, + "learning_rate": 9.93234906949321e-06, + "loss": 0.791, + "step": 3984 + }, + { + "epoch": 0.21484796204442527, + "grad_norm": 0.811970055103302, + "learning_rate": 9.932314308127785e-06, + "loss": 0.7773, + "step": 3985 + }, + { + "epoch": 0.2149018762130688, + "grad_norm": 0.7983556985855103, + "learning_rate": 9.932279537894726e-06, + "loss": 0.8677, + "step": 3986 + }, + { + "epoch": 0.2149557903817123, + "grad_norm": 0.8278135657310486, + "learning_rate": 9.932244758794095e-06, + "loss": 0.8562, + "step": 3987 + }, + { + "epoch": 0.21500970455035584, + "grad_norm": 0.8001466989517212, + "learning_rate": 9.93220997082595e-06, + "loss": 0.7695, + "step": 3988 + }, + { + "epoch": 0.21506361871899934, + "grad_norm": 0.7240970730781555, + "learning_rate": 9.932175173990359e-06, + "loss": 0.7293, + "step": 3989 + }, + { + "epoch": 0.21511753288764288, + "grad_norm": 0.7863660454750061, + "learning_rate": 9.932140368287381e-06, + "loss": 0.8307, + "step": 3990 + }, + { + "epoch": 0.21517144705628638, + "grad_norm": 0.7192577719688416, + "learning_rate": 9.932105553717079e-06, + "loss": 0.7819, + "step": 3991 + }, + { + "epoch": 0.2152253612249299, + "grad_norm": 0.7139109969139099, + "learning_rate": 9.932070730279517e-06, + "loss": 0.7343, + "step": 3992 + }, + { + "epoch": 0.21527927539357344, + "grad_norm": 0.7812891006469727, + "learning_rate": 9.932035897974759e-06, + "loss": 0.8159, + "step": 3993 + }, + { + "epoch": 0.21533318956221695, + "grad_norm": 0.8222309947013855, + "learning_rate": 9.932001056802863e-06, + "loss": 0.7424, + "step": 3994 + }, + { + "epoch": 0.21538710373086048, + "grad_norm": 0.7709689140319824, + "learning_rate": 9.931966206763896e-06, + "loss": 0.7952, + "step": 3995 + }, + { + "epoch": 0.21544101789950398, + "grad_norm": 0.8006699681282043, + "learning_rate": 9.931931347857919e-06, + "loss": 0.8527, + "step": 3996 + }, + { + "epoch": 0.21549493206814752, + "grad_norm": 0.8302900195121765, + 
"learning_rate": 9.931896480084993e-06, + "loss": 0.8531, + "step": 3997 + }, + { + "epoch": 0.21554884623679102, + "grad_norm": 0.7552672028541565, + "learning_rate": 9.931861603445183e-06, + "loss": 0.7589, + "step": 3998 + }, + { + "epoch": 0.21560276040543455, + "grad_norm": 0.7574741244316101, + "learning_rate": 9.931826717938551e-06, + "loss": 0.7806, + "step": 3999 + }, + { + "epoch": 0.21565667457407806, + "grad_norm": 0.9765385389328003, + "learning_rate": 9.93179182356516e-06, + "loss": 0.8503, + "step": 4000 + }, + { + "epoch": 0.2157105887427216, + "grad_norm": 0.8695611953735352, + "learning_rate": 9.931756920325073e-06, + "loss": 0.8484, + "step": 4001 + }, + { + "epoch": 0.21576450291136512, + "grad_norm": 0.9320261478424072, + "learning_rate": 9.931722008218351e-06, + "loss": 0.8019, + "step": 4002 + }, + { + "epoch": 0.21581841708000862, + "grad_norm": 0.7879775762557983, + "learning_rate": 9.931687087245059e-06, + "loss": 0.789, + "step": 4003 + }, + { + "epoch": 0.21587233124865216, + "grad_norm": 0.8338239789009094, + "learning_rate": 9.931652157405258e-06, + "loss": 0.7903, + "step": 4004 + }, + { + "epoch": 0.21592624541729566, + "grad_norm": 0.7812073230743408, + "learning_rate": 9.931617218699011e-06, + "loss": 0.8457, + "step": 4005 + }, + { + "epoch": 0.2159801595859392, + "grad_norm": 0.8999424576759338, + "learning_rate": 9.931582271126384e-06, + "loss": 0.7719, + "step": 4006 + }, + { + "epoch": 0.2160340737545827, + "grad_norm": 0.7390351295471191, + "learning_rate": 9.931547314687434e-06, + "loss": 0.7393, + "step": 4007 + }, + { + "epoch": 0.21608798792322623, + "grad_norm": 0.8604621887207031, + "learning_rate": 9.931512349382228e-06, + "loss": 0.8218, + "step": 4008 + }, + { + "epoch": 0.21614190209186973, + "grad_norm": 0.7581399083137512, + "learning_rate": 9.93147737521083e-06, + "loss": 0.6874, + "step": 4009 + }, + { + "epoch": 0.21619581626051326, + "grad_norm": 0.7431824803352356, + "learning_rate": 9.931442392173298e-06, + 
"loss": 0.7587, + "step": 4010 + }, + { + "epoch": 0.2162497304291568, + "grad_norm": 0.858138382434845, + "learning_rate": 9.931407400269699e-06, + "loss": 0.8672, + "step": 4011 + }, + { + "epoch": 0.2163036445978003, + "grad_norm": 0.7675254940986633, + "learning_rate": 9.931372399500094e-06, + "loss": 0.7608, + "step": 4012 + }, + { + "epoch": 0.21635755876644383, + "grad_norm": 0.8220716714859009, + "learning_rate": 9.931337389864546e-06, + "loss": 0.7495, + "step": 4013 + }, + { + "epoch": 0.21641147293508733, + "grad_norm": 0.8696985244750977, + "learning_rate": 9.93130237136312e-06, + "loss": 0.8872, + "step": 4014 + }, + { + "epoch": 0.21646538710373087, + "grad_norm": 0.8657988905906677, + "learning_rate": 9.931267343995878e-06, + "loss": 0.7733, + "step": 4015 + }, + { + "epoch": 0.21651930127237437, + "grad_norm": 0.7498238682746887, + "learning_rate": 9.93123230776288e-06, + "loss": 0.8208, + "step": 4016 + }, + { + "epoch": 0.2165732154410179, + "grad_norm": 0.8726654648780823, + "learning_rate": 9.931197262664193e-06, + "loss": 0.7924, + "step": 4017 + }, + { + "epoch": 0.2166271296096614, + "grad_norm": 0.7092527747154236, + "learning_rate": 9.931162208699879e-06, + "loss": 0.7351, + "step": 4018 + }, + { + "epoch": 0.21668104377830494, + "grad_norm": 0.7181721329689026, + "learning_rate": 9.931127145869998e-06, + "loss": 0.719, + "step": 4019 + }, + { + "epoch": 0.21673495794694847, + "grad_norm": 0.7992464303970337, + "learning_rate": 9.931092074174618e-06, + "loss": 0.5935, + "step": 4020 + }, + { + "epoch": 0.21678887211559197, + "grad_norm": 0.8293359279632568, + "learning_rate": 9.931056993613796e-06, + "loss": 0.8331, + "step": 4021 + }, + { + "epoch": 0.2168427862842355, + "grad_norm": 1.215417742729187, + "learning_rate": 9.931021904187603e-06, + "loss": 0.8067, + "step": 4022 + }, + { + "epoch": 0.216896700452879, + "grad_norm": 0.8828169107437134, + "learning_rate": 9.930986805896095e-06, + "loss": 0.6962, + "step": 4023 + }, + { + 
"epoch": 0.21695061462152254, + "grad_norm": 0.8225864171981812, + "learning_rate": 9.930951698739338e-06, + "loss": 0.7497, + "step": 4024 + }, + { + "epoch": 0.21700452879016605, + "grad_norm": 0.825343906879425, + "learning_rate": 9.930916582717396e-06, + "loss": 0.8693, + "step": 4025 + }, + { + "epoch": 0.21705844295880958, + "grad_norm": 0.7945353984832764, + "learning_rate": 9.93088145783033e-06, + "loss": 0.8349, + "step": 4026 + }, + { + "epoch": 0.21711235712745308, + "grad_norm": 0.7948806285858154, + "learning_rate": 9.930846324078205e-06, + "loss": 0.7726, + "step": 4027 + }, + { + "epoch": 0.2171662712960966, + "grad_norm": 0.7694181203842163, + "learning_rate": 9.930811181461081e-06, + "loss": 0.7704, + "step": 4028 + }, + { + "epoch": 0.21722018546474015, + "grad_norm": 0.74179607629776, + "learning_rate": 9.930776029979026e-06, + "loss": 0.8566, + "step": 4029 + }, + { + "epoch": 0.21727409963338365, + "grad_norm": 0.7846640348434448, + "learning_rate": 9.9307408696321e-06, + "loss": 0.7063, + "step": 4030 + }, + { + "epoch": 0.21732801380202718, + "grad_norm": 0.865972638130188, + "learning_rate": 9.930705700420368e-06, + "loss": 0.7553, + "step": 4031 + }, + { + "epoch": 0.21738192797067069, + "grad_norm": 0.90953129529953, + "learning_rate": 9.930670522343891e-06, + "loss": 0.7857, + "step": 4032 + }, + { + "epoch": 0.21743584213931422, + "grad_norm": 0.743373692035675, + "learning_rate": 9.930635335402733e-06, + "loss": 0.6955, + "step": 4033 + }, + { + "epoch": 0.21748975630795772, + "grad_norm": 0.994404137134552, + "learning_rate": 9.930600139596958e-06, + "loss": 0.7886, + "step": 4034 + }, + { + "epoch": 0.21754367047660125, + "grad_norm": 0.7715345621109009, + "learning_rate": 9.93056493492663e-06, + "loss": 0.8261, + "step": 4035 + }, + { + "epoch": 0.21759758464524476, + "grad_norm": 0.8100937604904175, + "learning_rate": 9.93052972139181e-06, + "loss": 0.7828, + "step": 4036 + }, + { + "epoch": 0.2176514988138883, + "grad_norm": 
1.0633374452590942, + "learning_rate": 9.930494498992562e-06, + "loss": 0.7885, + "step": 4037 + }, + { + "epoch": 0.21770541298253182, + "grad_norm": 0.766617476940155, + "learning_rate": 9.930459267728951e-06, + "loss": 0.8267, + "step": 4038 + }, + { + "epoch": 0.21775932715117532, + "grad_norm": 0.7761416435241699, + "learning_rate": 9.93042402760104e-06, + "loss": 0.8079, + "step": 4039 + }, + { + "epoch": 0.21781324131981886, + "grad_norm": 0.8123136758804321, + "learning_rate": 9.93038877860889e-06, + "loss": 0.8228, + "step": 4040 + }, + { + "epoch": 0.21786715548846236, + "grad_norm": 0.8818230628967285, + "learning_rate": 9.930353520752567e-06, + "loss": 0.8171, + "step": 4041 + }, + { + "epoch": 0.2179210696571059, + "grad_norm": 1.0989209413528442, + "learning_rate": 9.930318254032131e-06, + "loss": 0.9083, + "step": 4042 + }, + { + "epoch": 0.2179749838257494, + "grad_norm": 0.8373724818229675, + "learning_rate": 9.930282978447649e-06, + "loss": 0.7842, + "step": 4043 + }, + { + "epoch": 0.21802889799439293, + "grad_norm": 0.7905243039131165, + "learning_rate": 9.930247693999185e-06, + "loss": 0.7842, + "step": 4044 + }, + { + "epoch": 0.21808281216303643, + "grad_norm": 0.8310670852661133, + "learning_rate": 9.9302124006868e-06, + "loss": 0.735, + "step": 4045 + }, + { + "epoch": 0.21813672633167996, + "grad_norm": 0.8986020684242249, + "learning_rate": 9.930177098510556e-06, + "loss": 0.9901, + "step": 4046 + }, + { + "epoch": 0.2181906405003235, + "grad_norm": 0.7886272668838501, + "learning_rate": 9.93014178747052e-06, + "loss": 0.8593, + "step": 4047 + }, + { + "epoch": 0.218244554668967, + "grad_norm": 0.8021159768104553, + "learning_rate": 9.930106467566754e-06, + "loss": 0.7114, + "step": 4048 + }, + { + "epoch": 0.21829846883761053, + "grad_norm": 0.7256723642349243, + "learning_rate": 9.930071138799322e-06, + "loss": 0.7537, + "step": 4049 + }, + { + "epoch": 0.21835238300625404, + "grad_norm": 0.8547120094299316, + "learning_rate": 
9.930035801168286e-06, + "loss": 0.771, + "step": 4050 + }, + { + "epoch": 0.21840629717489757, + "grad_norm": 0.7411953210830688, + "learning_rate": 9.930000454673711e-06, + "loss": 0.7756, + "step": 4051 + }, + { + "epoch": 0.21846021134354107, + "grad_norm": 0.8918336033821106, + "learning_rate": 9.929965099315659e-06, + "loss": 0.8297, + "step": 4052 + }, + { + "epoch": 0.2185141255121846, + "grad_norm": 0.7391760349273682, + "learning_rate": 9.929929735094196e-06, + "loss": 0.8109, + "step": 4053 + }, + { + "epoch": 0.2185680396808281, + "grad_norm": 0.7272089719772339, + "learning_rate": 9.929894362009384e-06, + "loss": 0.7727, + "step": 4054 + }, + { + "epoch": 0.21862195384947164, + "grad_norm": 0.6963438391685486, + "learning_rate": 9.929858980061287e-06, + "loss": 0.681, + "step": 4055 + }, + { + "epoch": 0.21867586801811517, + "grad_norm": 0.714117169380188, + "learning_rate": 9.929823589249968e-06, + "loss": 0.6973, + "step": 4056 + }, + { + "epoch": 0.21872978218675868, + "grad_norm": 0.8449671268463135, + "learning_rate": 9.92978818957549e-06, + "loss": 0.8321, + "step": 4057 + }, + { + "epoch": 0.2187836963554022, + "grad_norm": 0.8275889754295349, + "learning_rate": 9.92975278103792e-06, + "loss": 0.7589, + "step": 4058 + }, + { + "epoch": 0.2188376105240457, + "grad_norm": 0.8010358214378357, + "learning_rate": 9.929717363637318e-06, + "loss": 0.7673, + "step": 4059 + }, + { + "epoch": 0.21889152469268924, + "grad_norm": 0.8558088541030884, + "learning_rate": 9.92968193737375e-06, + "loss": 0.8374, + "step": 4060 + }, + { + "epoch": 0.21894543886133275, + "grad_norm": 0.8413086533546448, + "learning_rate": 9.929646502247278e-06, + "loss": 0.8522, + "step": 4061 + }, + { + "epoch": 0.21899935302997628, + "grad_norm": 0.7852063775062561, + "learning_rate": 9.929611058257966e-06, + "loss": 0.7475, + "step": 4062 + }, + { + "epoch": 0.2190532671986198, + "grad_norm": 0.752642810344696, + "learning_rate": 9.92957560540588e-06, + "loss": 0.7054, + 
"step": 4063 + }, + { + "epoch": 0.21910718136726332, + "grad_norm": 0.8099555969238281, + "learning_rate": 9.929540143691079e-06, + "loss": 0.8409, + "step": 4064 + }, + { + "epoch": 0.21916109553590685, + "grad_norm": 0.7962636947631836, + "learning_rate": 9.929504673113632e-06, + "loss": 0.8581, + "step": 4065 + }, + { + "epoch": 0.21921500970455035, + "grad_norm": 0.7996272444725037, + "learning_rate": 9.9294691936736e-06, + "loss": 0.7837, + "step": 4066 + }, + { + "epoch": 0.21926892387319388, + "grad_norm": 0.7685336470603943, + "learning_rate": 9.929433705371046e-06, + "loss": 0.7658, + "step": 4067 + }, + { + "epoch": 0.2193228380418374, + "grad_norm": 0.8068851232528687, + "learning_rate": 9.929398208206036e-06, + "loss": 0.8141, + "step": 4068 + }, + { + "epoch": 0.21937675221048092, + "grad_norm": 0.7585315108299255, + "learning_rate": 9.929362702178634e-06, + "loss": 0.7533, + "step": 4069 + }, + { + "epoch": 0.21943066637912442, + "grad_norm": 1.1367120742797852, + "learning_rate": 9.9293271872889e-06, + "loss": 0.925, + "step": 4070 + }, + { + "epoch": 0.21948458054776795, + "grad_norm": 0.8255071640014648, + "learning_rate": 9.929291663536902e-06, + "loss": 0.6905, + "step": 4071 + }, + { + "epoch": 0.2195384947164115, + "grad_norm": 0.805061936378479, + "learning_rate": 9.929256130922702e-06, + "loss": 0.7787, + "step": 4072 + }, + { + "epoch": 0.219592408885055, + "grad_norm": 0.7786453366279602, + "learning_rate": 9.929220589446365e-06, + "loss": 0.8182, + "step": 4073 + }, + { + "epoch": 0.21964632305369852, + "grad_norm": 0.925881028175354, + "learning_rate": 9.929185039107955e-06, + "loss": 0.8611, + "step": 4074 + }, + { + "epoch": 0.21970023722234203, + "grad_norm": 0.7396146059036255, + "learning_rate": 9.929149479907533e-06, + "loss": 0.8427, + "step": 4075 + }, + { + "epoch": 0.21975415139098556, + "grad_norm": 0.8113187551498413, + "learning_rate": 9.929113911845167e-06, + "loss": 0.7436, + "step": 4076 + }, + { + "epoch": 
0.21980806555962906, + "grad_norm": 0.8359308838844299, + "learning_rate": 9.929078334920918e-06, + "loss": 0.7606, + "step": 4077 + }, + { + "epoch": 0.2198619797282726, + "grad_norm": 0.9729122519493103, + "learning_rate": 9.92904274913485e-06, + "loss": 0.774, + "step": 4078 + }, + { + "epoch": 0.2199158938969161, + "grad_norm": 0.7794427871704102, + "learning_rate": 9.92900715448703e-06, + "loss": 0.8311, + "step": 4079 + }, + { + "epoch": 0.21996980806555963, + "grad_norm": 0.8245888352394104, + "learning_rate": 9.928971550977519e-06, + "loss": 0.8461, + "step": 4080 + }, + { + "epoch": 0.22002372223420316, + "grad_norm": 0.7551932334899902, + "learning_rate": 9.92893593860638e-06, + "loss": 0.7899, + "step": 4081 + }, + { + "epoch": 0.22007763640284667, + "grad_norm": 0.7409234642982483, + "learning_rate": 9.928900317373681e-06, + "loss": 0.7847, + "step": 4082 + }, + { + "epoch": 0.2201315505714902, + "grad_norm": 1.5267807245254517, + "learning_rate": 9.928864687279485e-06, + "loss": 1.0547, + "step": 4083 + }, + { + "epoch": 0.2201854647401337, + "grad_norm": 0.832936704158783, + "learning_rate": 9.928829048323853e-06, + "loss": 0.8919, + "step": 4084 + }, + { + "epoch": 0.22023937890877723, + "grad_norm": 0.7933560609817505, + "learning_rate": 9.928793400506852e-06, + "loss": 0.833, + "step": 4085 + }, + { + "epoch": 0.22029329307742074, + "grad_norm": 0.7095281481742859, + "learning_rate": 9.928757743828545e-06, + "loss": 0.7383, + "step": 4086 + }, + { + "epoch": 0.22034720724606427, + "grad_norm": 0.7681827545166016, + "learning_rate": 9.928722078288998e-06, + "loss": 0.7573, + "step": 4087 + }, + { + "epoch": 0.22040112141470777, + "grad_norm": 0.7923296689987183, + "learning_rate": 9.928686403888271e-06, + "loss": 0.7574, + "step": 4088 + }, + { + "epoch": 0.2204550355833513, + "grad_norm": 0.7329868674278259, + "learning_rate": 9.928650720626431e-06, + "loss": 0.7798, + "step": 4089 + }, + { + "epoch": 0.22050894975199484, + "grad_norm": 
0.6931655406951904, + "learning_rate": 9.928615028503542e-06, + "loss": 0.687, + "step": 4090 + }, + { + "epoch": 0.22056286392063834, + "grad_norm": 0.8253043293952942, + "learning_rate": 9.928579327519668e-06, + "loss": 0.7611, + "step": 4091 + }, + { + "epoch": 0.22061677808928187, + "grad_norm": 0.9808893799781799, + "learning_rate": 9.928543617674873e-06, + "loss": 0.8013, + "step": 4092 + }, + { + "epoch": 0.22067069225792538, + "grad_norm": 0.765825092792511, + "learning_rate": 9.928507898969222e-06, + "loss": 0.704, + "step": 4093 + }, + { + "epoch": 0.2207246064265689, + "grad_norm": 0.8836820721626282, + "learning_rate": 9.928472171402777e-06, + "loss": 0.7862, + "step": 4094 + }, + { + "epoch": 0.2207785205952124, + "grad_norm": 0.7684285640716553, + "learning_rate": 9.928436434975606e-06, + "loss": 0.6694, + "step": 4095 + }, + { + "epoch": 0.22083243476385594, + "grad_norm": 0.8041714429855347, + "learning_rate": 9.92840068968777e-06, + "loss": 0.7593, + "step": 4096 + }, + { + "epoch": 0.22088634893249945, + "grad_norm": 0.8422744274139404, + "learning_rate": 9.928364935539331e-06, + "loss": 0.7447, + "step": 4097 + }, + { + "epoch": 0.22094026310114298, + "grad_norm": 0.8337421417236328, + "learning_rate": 9.928329172530361e-06, + "loss": 0.8273, + "step": 4098 + }, + { + "epoch": 0.2209941772697865, + "grad_norm": 0.9864090085029602, + "learning_rate": 9.928293400660918e-06, + "loss": 0.9286, + "step": 4099 + }, + { + "epoch": 0.22104809143843002, + "grad_norm": 0.8052615523338318, + "learning_rate": 9.928257619931068e-06, + "loss": 0.78, + "step": 4100 + }, + { + "epoch": 0.22110200560707355, + "grad_norm": 0.8060072064399719, + "learning_rate": 9.928221830340876e-06, + "loss": 0.7759, + "step": 4101 + }, + { + "epoch": 0.22115591977571705, + "grad_norm": 0.8900836706161499, + "learning_rate": 9.928186031890405e-06, + "loss": 0.8144, + "step": 4102 + }, + { + "epoch": 0.22120983394436058, + "grad_norm": 0.7392085194587708, + "learning_rate": 
9.928150224579723e-06, + "loss": 0.7787, + "step": 4103 + }, + { + "epoch": 0.2212637481130041, + "grad_norm": 0.9728571772575378, + "learning_rate": 9.92811440840889e-06, + "loss": 0.8359, + "step": 4104 + }, + { + "epoch": 0.22131766228164762, + "grad_norm": 0.9601667523384094, + "learning_rate": 9.92807858337797e-06, + "loss": 0.7868, + "step": 4105 + }, + { + "epoch": 0.22137157645029112, + "grad_norm": 0.7148939371109009, + "learning_rate": 9.92804274948703e-06, + "loss": 0.7266, + "step": 4106 + }, + { + "epoch": 0.22142549061893466, + "grad_norm": 0.7482119798660278, + "learning_rate": 9.928006906736136e-06, + "loss": 0.7602, + "step": 4107 + }, + { + "epoch": 0.2214794047875782, + "grad_norm": 0.8613291382789612, + "learning_rate": 9.927971055125348e-06, + "loss": 0.7747, + "step": 4108 + }, + { + "epoch": 0.2215333189562217, + "grad_norm": 0.7668588757514954, + "learning_rate": 9.927935194654733e-06, + "loss": 0.7572, + "step": 4109 + }, + { + "epoch": 0.22158723312486522, + "grad_norm": 0.7911893725395203, + "learning_rate": 9.927899325324356e-06, + "loss": 0.8563, + "step": 4110 + }, + { + "epoch": 0.22164114729350873, + "grad_norm": 0.8059565424919128, + "learning_rate": 9.92786344713428e-06, + "loss": 0.893, + "step": 4111 + }, + { + "epoch": 0.22169506146215226, + "grad_norm": 0.8575117588043213, + "learning_rate": 9.92782756008457e-06, + "loss": 0.8475, + "step": 4112 + }, + { + "epoch": 0.22174897563079576, + "grad_norm": 0.7179403901100159, + "learning_rate": 9.927791664175292e-06, + "loss": 0.7914, + "step": 4113 + }, + { + "epoch": 0.2218028897994393, + "grad_norm": 0.8687799572944641, + "learning_rate": 9.927755759406508e-06, + "loss": 0.7447, + "step": 4114 + }, + { + "epoch": 0.2218568039680828, + "grad_norm": 0.7538093328475952, + "learning_rate": 9.927719845778283e-06, + "loss": 0.6988, + "step": 4115 + }, + { + "epoch": 0.22191071813672633, + "grad_norm": 0.7586212754249573, + "learning_rate": 9.927683923290685e-06, + "loss": 0.7743, + 
"step": 4116 + }, + { + "epoch": 0.22196463230536986, + "grad_norm": 0.797385573387146, + "learning_rate": 9.927647991943774e-06, + "loss": 0.7541, + "step": 4117 + }, + { + "epoch": 0.22201854647401337, + "grad_norm": 0.7193878293037415, + "learning_rate": 9.927612051737617e-06, + "loss": 0.758, + "step": 4118 + }, + { + "epoch": 0.2220724606426569, + "grad_norm": 0.7417513132095337, + "learning_rate": 9.927576102672276e-06, + "loss": 0.7902, + "step": 4119 + }, + { + "epoch": 0.2221263748113004, + "grad_norm": 0.8947266936302185, + "learning_rate": 9.927540144747821e-06, + "loss": 0.9153, + "step": 4120 + }, + { + "epoch": 0.22218028897994394, + "grad_norm": 0.7990988492965698, + "learning_rate": 9.927504177964311e-06, + "loss": 0.8487, + "step": 4121 + }, + { + "epoch": 0.22223420314858744, + "grad_norm": 0.801420271396637, + "learning_rate": 9.927468202321816e-06, + "loss": 0.7752, + "step": 4122 + }, + { + "epoch": 0.22228811731723097, + "grad_norm": 0.7953904271125793, + "learning_rate": 9.927432217820394e-06, + "loss": 0.7249, + "step": 4123 + }, + { + "epoch": 0.22234203148587448, + "grad_norm": 0.8257938027381897, + "learning_rate": 9.927396224460116e-06, + "loss": 0.8311, + "step": 4124 + }, + { + "epoch": 0.222395945654518, + "grad_norm": 0.7679301500320435, + "learning_rate": 9.927360222241042e-06, + "loss": 0.7155, + "step": 4125 + }, + { + "epoch": 0.22244985982316154, + "grad_norm": 0.7410153150558472, + "learning_rate": 9.92732421116324e-06, + "loss": 0.7007, + "step": 4126 + }, + { + "epoch": 0.22250377399180504, + "grad_norm": 0.8296052813529968, + "learning_rate": 9.927288191226774e-06, + "loss": 0.7546, + "step": 4127 + }, + { + "epoch": 0.22255768816044857, + "grad_norm": 1.051527500152588, + "learning_rate": 9.927252162431708e-06, + "loss": 0.7039, + "step": 4128 + }, + { + "epoch": 0.22261160232909208, + "grad_norm": 0.8625979423522949, + "learning_rate": 9.927216124778108e-06, + "loss": 0.7348, + "step": 4129 + }, + { + "epoch": 
0.2226655164977356, + "grad_norm": 0.8892311453819275, + "learning_rate": 9.927180078266038e-06, + "loss": 0.8221, + "step": 4130 + }, + { + "epoch": 0.22271943066637911, + "grad_norm": 0.8888135552406311, + "learning_rate": 9.927144022895562e-06, + "loss": 0.8953, + "step": 4131 + }, + { + "epoch": 0.22277334483502265, + "grad_norm": 0.8566902279853821, + "learning_rate": 9.927107958666746e-06, + "loss": 0.7894, + "step": 4132 + }, + { + "epoch": 0.22282725900366615, + "grad_norm": 0.821061909198761, + "learning_rate": 9.927071885579654e-06, + "loss": 0.8271, + "step": 4133 + }, + { + "epoch": 0.22288117317230968, + "grad_norm": 1.0494943857192993, + "learning_rate": 9.927035803634351e-06, + "loss": 0.79, + "step": 4134 + }, + { + "epoch": 0.22293508734095321, + "grad_norm": 1.730763554573059, + "learning_rate": 9.926999712830903e-06, + "loss": 0.7944, + "step": 4135 + }, + { + "epoch": 0.22298900150959672, + "grad_norm": 0.799264669418335, + "learning_rate": 9.926963613169372e-06, + "loss": 0.7922, + "step": 4136 + }, + { + "epoch": 0.22304291567824025, + "grad_norm": 0.7929497361183167, + "learning_rate": 9.926927504649826e-06, + "loss": 0.8809, + "step": 4137 + }, + { + "epoch": 0.22309682984688375, + "grad_norm": 0.8016352653503418, + "learning_rate": 9.92689138727233e-06, + "loss": 0.8839, + "step": 4138 + }, + { + "epoch": 0.22315074401552729, + "grad_norm": 0.7640015482902527, + "learning_rate": 9.926855261036947e-06, + "loss": 0.7351, + "step": 4139 + }, + { + "epoch": 0.2232046581841708, + "grad_norm": 0.7678577899932861, + "learning_rate": 9.926819125943743e-06, + "loss": 0.7249, + "step": 4140 + }, + { + "epoch": 0.22325857235281432, + "grad_norm": 0.9195266962051392, + "learning_rate": 9.926782981992782e-06, + "loss": 0.7459, + "step": 4141 + }, + { + "epoch": 0.22331248652145783, + "grad_norm": 0.9069259762763977, + "learning_rate": 9.92674682918413e-06, + "loss": 0.8569, + "step": 4142 + }, + { + "epoch": 0.22336640069010136, + "grad_norm": 
0.8251914978027344, + "learning_rate": 9.926710667517853e-06, + "loss": 0.7659, + "step": 4143 + }, + { + "epoch": 0.2234203148587449, + "grad_norm": 0.7647615671157837, + "learning_rate": 9.926674496994013e-06, + "loss": 0.7847, + "step": 4144 + }, + { + "epoch": 0.2234742290273884, + "grad_norm": 0.7971541285514832, + "learning_rate": 9.926638317612678e-06, + "loss": 0.7033, + "step": 4145 + }, + { + "epoch": 0.22352814319603193, + "grad_norm": 0.8472650051116943, + "learning_rate": 9.92660212937391e-06, + "loss": 0.7953, + "step": 4146 + }, + { + "epoch": 0.22358205736467543, + "grad_norm": 0.7527226805686951, + "learning_rate": 9.926565932277776e-06, + "loss": 0.7402, + "step": 4147 + }, + { + "epoch": 0.22363597153331896, + "grad_norm": 0.8266519904136658, + "learning_rate": 9.926529726324344e-06, + "loss": 0.8852, + "step": 4148 + }, + { + "epoch": 0.22368988570196247, + "grad_norm": 0.8195723295211792, + "learning_rate": 9.926493511513673e-06, + "loss": 0.8529, + "step": 4149 + }, + { + "epoch": 0.223743799870606, + "grad_norm": 0.821739912033081, + "learning_rate": 9.92645728784583e-06, + "loss": 0.8809, + "step": 4150 + }, + { + "epoch": 0.2237977140392495, + "grad_norm": 0.8063598275184631, + "learning_rate": 9.926421055320883e-06, + "loss": 0.8219, + "step": 4151 + }, + { + "epoch": 0.22385162820789303, + "grad_norm": 0.7054430246353149, + "learning_rate": 9.926384813938896e-06, + "loss": 0.6726, + "step": 4152 + }, + { + "epoch": 0.22390554237653656, + "grad_norm": 0.8751134872436523, + "learning_rate": 9.926348563699933e-06, + "loss": 0.8059, + "step": 4153 + }, + { + "epoch": 0.22395945654518007, + "grad_norm": 0.8193408846855164, + "learning_rate": 9.92631230460406e-06, + "loss": 0.7078, + "step": 4154 + }, + { + "epoch": 0.2240133707138236, + "grad_norm": 0.7827375531196594, + "learning_rate": 9.92627603665134e-06, + "loss": 0.7604, + "step": 4155 + }, + { + "epoch": 0.2240672848824671, + "grad_norm": 0.7906658053398132, + "learning_rate": 
9.926239759841842e-06, + "loss": 0.7428, + "step": 4156 + }, + { + "epoch": 0.22412119905111064, + "grad_norm": 0.8965858817100525, + "learning_rate": 9.92620347417563e-06, + "loss": 0.805, + "step": 4157 + }, + { + "epoch": 0.22417511321975414, + "grad_norm": 0.7383534908294678, + "learning_rate": 9.926167179652767e-06, + "loss": 0.8041, + "step": 4158 + }, + { + "epoch": 0.22422902738839767, + "grad_norm": 0.7922899127006531, + "learning_rate": 9.926130876273321e-06, + "loss": 0.8966, + "step": 4159 + }, + { + "epoch": 0.22428294155704118, + "grad_norm": 0.7780346870422363, + "learning_rate": 9.926094564037354e-06, + "loss": 0.787, + "step": 4160 + }, + { + "epoch": 0.2243368557256847, + "grad_norm": 0.8276410102844238, + "learning_rate": 9.926058242944936e-06, + "loss": 0.7222, + "step": 4161 + }, + { + "epoch": 0.22439076989432824, + "grad_norm": 0.8523558378219604, + "learning_rate": 9.926021912996128e-06, + "loss": 0.7784, + "step": 4162 + }, + { + "epoch": 0.22444468406297174, + "grad_norm": 1.0391061305999756, + "learning_rate": 9.925985574190997e-06, + "loss": 0.8078, + "step": 4163 + }, + { + "epoch": 0.22449859823161528, + "grad_norm": 1.09534752368927, + "learning_rate": 9.925949226529609e-06, + "loss": 0.8317, + "step": 4164 + }, + { + "epoch": 0.22455251240025878, + "grad_norm": 1.0554418563842773, + "learning_rate": 9.925912870012028e-06, + "loss": 0.8352, + "step": 4165 + }, + { + "epoch": 0.2246064265689023, + "grad_norm": 0.889376699924469, + "learning_rate": 9.92587650463832e-06, + "loss": 0.7787, + "step": 4166 + }, + { + "epoch": 0.22466034073754582, + "grad_norm": 0.8486199378967285, + "learning_rate": 9.92584013040855e-06, + "loss": 0.8005, + "step": 4167 + }, + { + "epoch": 0.22471425490618935, + "grad_norm": 0.7989416718482971, + "learning_rate": 9.925803747322786e-06, + "loss": 0.7258, + "step": 4168 + }, + { + "epoch": 0.22476816907483288, + "grad_norm": 0.8066874146461487, + "learning_rate": 9.925767355381088e-06, + "loss": 0.7334, + 
"step": 4169 + }, + { + "epoch": 0.22482208324347638, + "grad_norm": 0.7679908871650696, + "learning_rate": 9.925730954583529e-06, + "loss": 0.8172, + "step": 4170 + }, + { + "epoch": 0.22487599741211992, + "grad_norm": 0.8524256944656372, + "learning_rate": 9.925694544930165e-06, + "loss": 0.788, + "step": 4171 + }, + { + "epoch": 0.22492991158076342, + "grad_norm": 0.7501714825630188, + "learning_rate": 9.925658126421069e-06, + "loss": 0.7749, + "step": 4172 + }, + { + "epoch": 0.22498382574940695, + "grad_norm": 0.7706030607223511, + "learning_rate": 9.925621699056304e-06, + "loss": 0.7231, + "step": 4173 + }, + { + "epoch": 0.22503773991805046, + "grad_norm": 0.8854154348373413, + "learning_rate": 9.925585262835936e-06, + "loss": 0.8278, + "step": 4174 + }, + { + "epoch": 0.225091654086694, + "grad_norm": 0.7319517731666565, + "learning_rate": 9.925548817760029e-06, + "loss": 0.6935, + "step": 4175 + }, + { + "epoch": 0.2251455682553375, + "grad_norm": 0.7906307578086853, + "learning_rate": 9.925512363828652e-06, + "loss": 0.6917, + "step": 4176 + }, + { + "epoch": 0.22519948242398102, + "grad_norm": 0.7849681377410889, + "learning_rate": 9.925475901041865e-06, + "loss": 0.7164, + "step": 4177 + }, + { + "epoch": 0.22525339659262456, + "grad_norm": 0.7835176587104797, + "learning_rate": 9.925439429399737e-06, + "loss": 0.8398, + "step": 4178 + }, + { + "epoch": 0.22530731076126806, + "grad_norm": 0.7237651944160461, + "learning_rate": 9.925402948902334e-06, + "loss": 0.7466, + "step": 4179 + }, + { + "epoch": 0.2253612249299116, + "grad_norm": 0.7823938131332397, + "learning_rate": 9.925366459549721e-06, + "loss": 0.7348, + "step": 4180 + }, + { + "epoch": 0.2254151390985551, + "grad_norm": 0.8057203888893127, + "learning_rate": 9.925329961341964e-06, + "loss": 0.6959, + "step": 4181 + }, + { + "epoch": 0.22546905326719863, + "grad_norm": 0.7731473445892334, + "learning_rate": 9.925293454279125e-06, + "loss": 0.817, + "step": 4182 + }, + { + "epoch": 
0.22552296743584213, + "grad_norm": 0.7807347178459167, + "learning_rate": 9.925256938361276e-06, + "loss": 0.8092, + "step": 4183 + }, + { + "epoch": 0.22557688160448566, + "grad_norm": 0.9550508260726929, + "learning_rate": 9.925220413588478e-06, + "loss": 0.777, + "step": 4184 + }, + { + "epoch": 0.22563079577312917, + "grad_norm": 0.7147027254104614, + "learning_rate": 9.925183879960799e-06, + "loss": 0.7964, + "step": 4185 + }, + { + "epoch": 0.2256847099417727, + "grad_norm": 0.8344054222106934, + "learning_rate": 9.925147337478302e-06, + "loss": 0.8445, + "step": 4186 + }, + { + "epoch": 0.22573862411041623, + "grad_norm": 0.7597602605819702, + "learning_rate": 9.925110786141055e-06, + "loss": 0.7832, + "step": 4187 + }, + { + "epoch": 0.22579253827905973, + "grad_norm": 0.7721429467201233, + "learning_rate": 9.925074225949123e-06, + "loss": 0.7126, + "step": 4188 + }, + { + "epoch": 0.22584645244770327, + "grad_norm": 1.0660802125930786, + "learning_rate": 9.925037656902572e-06, + "loss": 0.7464, + "step": 4189 + }, + { + "epoch": 0.22590036661634677, + "grad_norm": 1.1455479860305786, + "learning_rate": 9.925001079001465e-06, + "loss": 0.7962, + "step": 4190 + }, + { + "epoch": 0.2259542807849903, + "grad_norm": 0.7436321377754211, + "learning_rate": 9.924964492245874e-06, + "loss": 0.7943, + "step": 4191 + }, + { + "epoch": 0.2260081949536338, + "grad_norm": 0.8470258712768555, + "learning_rate": 9.92492789663586e-06, + "loss": 0.8384, + "step": 4192 + }, + { + "epoch": 0.22606210912227734, + "grad_norm": 0.7316015958786011, + "learning_rate": 9.92489129217149e-06, + "loss": 0.6658, + "step": 4193 + }, + { + "epoch": 0.22611602329092084, + "grad_norm": 0.8184043765068054, + "learning_rate": 9.924854678852829e-06, + "loss": 0.77, + "step": 4194 + }, + { + "epoch": 0.22616993745956437, + "grad_norm": 0.8100526928901672, + "learning_rate": 9.924818056679943e-06, + "loss": 0.7668, + "step": 4195 + }, + { + "epoch": 0.2262238516282079, + "grad_norm": 
0.7480085492134094, + "learning_rate": 9.924781425652899e-06, + "loss": 0.7623, + "step": 4196 + }, + { + "epoch": 0.2262777657968514, + "grad_norm": 0.8250038623809814, + "learning_rate": 9.924744785771762e-06, + "loss": 0.8567, + "step": 4197 + }, + { + "epoch": 0.22633167996549494, + "grad_norm": 0.7686489224433899, + "learning_rate": 9.924708137036599e-06, + "loss": 0.7706, + "step": 4198 + }, + { + "epoch": 0.22638559413413845, + "grad_norm": 0.735899806022644, + "learning_rate": 9.924671479447474e-06, + "loss": 0.7753, + "step": 4199 + }, + { + "epoch": 0.22643950830278198, + "grad_norm": 0.9740009307861328, + "learning_rate": 9.924634813004455e-06, + "loss": 0.7569, + "step": 4200 + }, + { + "epoch": 0.22649342247142548, + "grad_norm": 1.0002168416976929, + "learning_rate": 9.924598137707606e-06, + "loss": 0.8258, + "step": 4201 + }, + { + "epoch": 0.226547336640069, + "grad_norm": 0.6893144845962524, + "learning_rate": 9.924561453556993e-06, + "loss": 0.7139, + "step": 4202 + }, + { + "epoch": 0.22660125080871252, + "grad_norm": 0.8272411227226257, + "learning_rate": 9.924524760552684e-06, + "loss": 0.8422, + "step": 4203 + }, + { + "epoch": 0.22665516497735605, + "grad_norm": 0.7915756702423096, + "learning_rate": 9.924488058694743e-06, + "loss": 0.797, + "step": 4204 + }, + { + "epoch": 0.22670907914599958, + "grad_norm": 0.9074721932411194, + "learning_rate": 9.924451347983238e-06, + "loss": 0.837, + "step": 4205 + }, + { + "epoch": 0.22676299331464309, + "grad_norm": 0.7446406483650208, + "learning_rate": 9.92441462841823e-06, + "loss": 0.7532, + "step": 4206 + }, + { + "epoch": 0.22681690748328662, + "grad_norm": 0.7998174428939819, + "learning_rate": 9.924377899999793e-06, + "loss": 0.7768, + "step": 4207 + }, + { + "epoch": 0.22687082165193012, + "grad_norm": 0.7808948755264282, + "learning_rate": 9.924341162727987e-06, + "loss": 0.8571, + "step": 4208 + }, + { + "epoch": 0.22692473582057365, + "grad_norm": 0.837177574634552, + "learning_rate": 
9.924304416602879e-06, + "loss": 0.7659, + "step": 4209 + }, + { + "epoch": 0.22697864998921716, + "grad_norm": 0.922913670539856, + "learning_rate": 9.924267661624536e-06, + "loss": 0.7124, + "step": 4210 + }, + { + "epoch": 0.2270325641578607, + "grad_norm": 0.7991519570350647, + "learning_rate": 9.924230897793024e-06, + "loss": 0.7212, + "step": 4211 + }, + { + "epoch": 0.2270864783265042, + "grad_norm": 0.7561559081077576, + "learning_rate": 9.924194125108409e-06, + "loss": 0.7116, + "step": 4212 + }, + { + "epoch": 0.22714039249514772, + "grad_norm": 0.8377161026000977, + "learning_rate": 9.924157343570758e-06, + "loss": 0.8286, + "step": 4213 + }, + { + "epoch": 0.22719430666379126, + "grad_norm": 0.7423402070999146, + "learning_rate": 9.924120553180135e-06, + "loss": 0.7146, + "step": 4214 + }, + { + "epoch": 0.22724822083243476, + "grad_norm": 0.867027223110199, + "learning_rate": 9.924083753936607e-06, + "loss": 0.9115, + "step": 4215 + }, + { + "epoch": 0.2273021350010783, + "grad_norm": 0.8492380976676941, + "learning_rate": 9.924046945840243e-06, + "loss": 0.8469, + "step": 4216 + }, + { + "epoch": 0.2273560491697218, + "grad_norm": 0.9068216681480408, + "learning_rate": 9.924010128891104e-06, + "loss": 0.8478, + "step": 4217 + }, + { + "epoch": 0.22740996333836533, + "grad_norm": 0.8054717779159546, + "learning_rate": 9.92397330308926e-06, + "loss": 0.8606, + "step": 4218 + }, + { + "epoch": 0.22746387750700883, + "grad_norm": 0.7788351774215698, + "learning_rate": 9.923936468434777e-06, + "loss": 0.7892, + "step": 4219 + }, + { + "epoch": 0.22751779167565236, + "grad_norm": 0.7584444284439087, + "learning_rate": 9.923899624927717e-06, + "loss": 0.7834, + "step": 4220 + }, + { + "epoch": 0.22757170584429587, + "grad_norm": 0.7948986291885376, + "learning_rate": 9.923862772568154e-06, + "loss": 0.8158, + "step": 4221 + }, + { + "epoch": 0.2276256200129394, + "grad_norm": 0.9347550868988037, + "learning_rate": 9.923825911356146e-06, + "loss": 0.8955, + 
"step": 4222 + }, + { + "epoch": 0.22767953418158293, + "grad_norm": 0.7694705724716187, + "learning_rate": 9.923789041291765e-06, + "loss": 0.7797, + "step": 4223 + }, + { + "epoch": 0.22773344835022644, + "grad_norm": 0.7127852439880371, + "learning_rate": 9.923752162375076e-06, + "loss": 0.7026, + "step": 4224 + }, + { + "epoch": 0.22778736251886997, + "grad_norm": 0.9811069369316101, + "learning_rate": 9.923715274606142e-06, + "loss": 0.7804, + "step": 4225 + }, + { + "epoch": 0.22784127668751347, + "grad_norm": 0.8820962309837341, + "learning_rate": 9.923678377985035e-06, + "loss": 0.7807, + "step": 4226 + }, + { + "epoch": 0.227895190856157, + "grad_norm": 0.9057408571243286, + "learning_rate": 9.923641472511819e-06, + "loss": 0.7855, + "step": 4227 + }, + { + "epoch": 0.2279491050248005, + "grad_norm": 0.8836835622787476, + "learning_rate": 9.923604558186557e-06, + "loss": 0.7507, + "step": 4228 + }, + { + "epoch": 0.22800301919344404, + "grad_norm": 0.7494282722473145, + "learning_rate": 9.923567635009319e-06, + "loss": 0.7939, + "step": 4229 + }, + { + "epoch": 0.22805693336208754, + "grad_norm": 0.9468182921409607, + "learning_rate": 9.92353070298017e-06, + "loss": 0.7739, + "step": 4230 + }, + { + "epoch": 0.22811084753073108, + "grad_norm": 0.8671477437019348, + "learning_rate": 9.923493762099177e-06, + "loss": 0.8455, + "step": 4231 + }, + { + "epoch": 0.2281647616993746, + "grad_norm": 0.7388983964920044, + "learning_rate": 9.923456812366405e-06, + "loss": 0.8193, + "step": 4232 + }, + { + "epoch": 0.2282186758680181, + "grad_norm": 0.8403687477111816, + "learning_rate": 9.923419853781924e-06, + "loss": 0.8591, + "step": 4233 + }, + { + "epoch": 0.22827259003666164, + "grad_norm": 0.8540427684783936, + "learning_rate": 9.923382886345797e-06, + "loss": 0.8384, + "step": 4234 + }, + { + "epoch": 0.22832650420530515, + "grad_norm": 0.8174583911895752, + "learning_rate": 9.923345910058092e-06, + "loss": 0.8088, + "step": 4235 + }, + { + "epoch": 
0.22838041837394868, + "grad_norm": 0.8237600326538086, + "learning_rate": 9.923308924918876e-06, + "loss": 0.7776, + "step": 4236 + }, + { + "epoch": 0.22843433254259218, + "grad_norm": 0.7644588947296143, + "learning_rate": 9.923271930928213e-06, + "loss": 0.7916, + "step": 4237 + }, + { + "epoch": 0.22848824671123572, + "grad_norm": 0.7141766548156738, + "learning_rate": 9.923234928086172e-06, + "loss": 0.7218, + "step": 4238 + }, + { + "epoch": 0.22854216087987922, + "grad_norm": 0.6722819805145264, + "learning_rate": 9.923197916392816e-06, + "loss": 0.7451, + "step": 4239 + }, + { + "epoch": 0.22859607504852275, + "grad_norm": 0.8109803199768066, + "learning_rate": 9.923160895848217e-06, + "loss": 0.7948, + "step": 4240 + }, + { + "epoch": 0.22864998921716628, + "grad_norm": 0.7268984913825989, + "learning_rate": 9.923123866452437e-06, + "loss": 0.7946, + "step": 4241 + }, + { + "epoch": 0.2287039033858098, + "grad_norm": 0.7497883439064026, + "learning_rate": 9.923086828205546e-06, + "loss": 0.7594, + "step": 4242 + }, + { + "epoch": 0.22875781755445332, + "grad_norm": 0.7800997495651245, + "learning_rate": 9.92304978110761e-06, + "loss": 0.7278, + "step": 4243 + }, + { + "epoch": 0.22881173172309682, + "grad_norm": 0.7802282571792603, + "learning_rate": 9.923012725158692e-06, + "loss": 0.8135, + "step": 4244 + }, + { + "epoch": 0.22886564589174035, + "grad_norm": 0.6718098521232605, + "learning_rate": 9.92297566035886e-06, + "loss": 0.7554, + "step": 4245 + }, + { + "epoch": 0.22891956006038386, + "grad_norm": 0.9285357594490051, + "learning_rate": 9.922938586708184e-06, + "loss": 0.8134, + "step": 4246 + }, + { + "epoch": 0.2289734742290274, + "grad_norm": 0.7069430947303772, + "learning_rate": 9.922901504206728e-06, + "loss": 0.7114, + "step": 4247 + }, + { + "epoch": 0.2290273883976709, + "grad_norm": 0.893153190612793, + "learning_rate": 9.922864412854558e-06, + "loss": 0.798, + "step": 4248 + }, + { + "epoch": 0.22908130256631443, + "grad_norm": 
0.9572556614875793, + "learning_rate": 9.922827312651744e-06, + "loss": 0.8467, + "step": 4249 + }, + { + "epoch": 0.22913521673495796, + "grad_norm": 0.8193963766098022, + "learning_rate": 9.922790203598349e-06, + "loss": 0.7466, + "step": 4250 + }, + { + "epoch": 0.22918913090360146, + "grad_norm": 0.8693044185638428, + "learning_rate": 9.922753085694441e-06, + "loss": 0.7253, + "step": 4251 + }, + { + "epoch": 0.229243045072245, + "grad_norm": 0.7820607423782349, + "learning_rate": 9.922715958940086e-06, + "loss": 0.8457, + "step": 4252 + }, + { + "epoch": 0.2292969592408885, + "grad_norm": 0.8323820233345032, + "learning_rate": 9.922678823335353e-06, + "loss": 0.8532, + "step": 4253 + }, + { + "epoch": 0.22935087340953203, + "grad_norm": 0.7978707551956177, + "learning_rate": 9.922641678880306e-06, + "loss": 0.7549, + "step": 4254 + }, + { + "epoch": 0.22940478757817553, + "grad_norm": 0.8820145726203918, + "learning_rate": 9.922604525575014e-06, + "loss": 0.746, + "step": 4255 + }, + { + "epoch": 0.22945870174681907, + "grad_norm": 0.7836315631866455, + "learning_rate": 9.922567363419544e-06, + "loss": 0.8095, + "step": 4256 + }, + { + "epoch": 0.22951261591546257, + "grad_norm": 0.7744200825691223, + "learning_rate": 9.922530192413962e-06, + "loss": 0.8599, + "step": 4257 + }, + { + "epoch": 0.2295665300841061, + "grad_norm": 0.861124575138092, + "learning_rate": 9.922493012558334e-06, + "loss": 0.8522, + "step": 4258 + }, + { + "epoch": 0.22962044425274963, + "grad_norm": 0.8234331607818604, + "learning_rate": 9.922455823852726e-06, + "loss": 0.8266, + "step": 4259 + }, + { + "epoch": 0.22967435842139314, + "grad_norm": 0.8142805099487305, + "learning_rate": 9.922418626297207e-06, + "loss": 0.7434, + "step": 4260 + }, + { + "epoch": 0.22972827259003667, + "grad_norm": 1.2082080841064453, + "learning_rate": 9.922381419891845e-06, + "loss": 0.7884, + "step": 4261 + }, + { + "epoch": 0.22978218675868017, + "grad_norm": 0.7151769399642944, + "learning_rate": 
9.922344204636702e-06, + "loss": 0.8028, + "step": 4262 + }, + { + "epoch": 0.2298361009273237, + "grad_norm": 0.9005017280578613, + "learning_rate": 9.922306980531851e-06, + "loss": 0.7569, + "step": 4263 + }, + { + "epoch": 0.2298900150959672, + "grad_norm": 0.8531069755554199, + "learning_rate": 9.922269747577354e-06, + "loss": 0.7255, + "step": 4264 + }, + { + "epoch": 0.22994392926461074, + "grad_norm": 0.7625791430473328, + "learning_rate": 9.922232505773279e-06, + "loss": 0.7316, + "step": 4265 + }, + { + "epoch": 0.22999784343325425, + "grad_norm": 0.8707940578460693, + "learning_rate": 9.922195255119696e-06, + "loss": 0.7281, + "step": 4266 + }, + { + "epoch": 0.23005175760189778, + "grad_norm": 0.8227086067199707, + "learning_rate": 9.922157995616669e-06, + "loss": 0.7815, + "step": 4267 + }, + { + "epoch": 0.2301056717705413, + "grad_norm": 0.7798532843589783, + "learning_rate": 9.922120727264266e-06, + "loss": 0.7347, + "step": 4268 + }, + { + "epoch": 0.2301595859391848, + "grad_norm": 0.9200069308280945, + "learning_rate": 9.922083450062554e-06, + "loss": 0.8466, + "step": 4269 + }, + { + "epoch": 0.23021350010782834, + "grad_norm": 0.7376945614814758, + "learning_rate": 9.922046164011598e-06, + "loss": 0.7834, + "step": 4270 + }, + { + "epoch": 0.23026741427647185, + "grad_norm": 0.7460160255432129, + "learning_rate": 9.922008869111469e-06, + "loss": 0.7622, + "step": 4271 + }, + { + "epoch": 0.23032132844511538, + "grad_norm": 1.0576467514038086, + "learning_rate": 9.921971565362232e-06, + "loss": 0.7648, + "step": 4272 + }, + { + "epoch": 0.23037524261375888, + "grad_norm": 0.8479774594306946, + "learning_rate": 9.921934252763953e-06, + "loss": 0.7834, + "step": 4273 + }, + { + "epoch": 0.23042915678240242, + "grad_norm": 0.7337886691093445, + "learning_rate": 9.9218969313167e-06, + "loss": 0.7125, + "step": 4274 + }, + { + "epoch": 0.23048307095104595, + "grad_norm": 0.7631418108940125, + "learning_rate": 9.92185960102054e-06, + "loss": 0.7741, + 
"step": 4275 + }, + { + "epoch": 0.23053698511968945, + "grad_norm": 1.0179954767227173, + "learning_rate": 9.92182226187554e-06, + "loss": 0.8724, + "step": 4276 + }, + { + "epoch": 0.23059089928833298, + "grad_norm": 0.768721342086792, + "learning_rate": 9.921784913881768e-06, + "loss": 0.8324, + "step": 4277 + }, + { + "epoch": 0.2306448134569765, + "grad_norm": 0.8202316761016846, + "learning_rate": 9.92174755703929e-06, + "loss": 0.8052, + "step": 4278 + }, + { + "epoch": 0.23069872762562002, + "grad_norm": 0.8934405446052551, + "learning_rate": 9.921710191348174e-06, + "loss": 0.8247, + "step": 4279 + }, + { + "epoch": 0.23075264179426352, + "grad_norm": 0.8000699281692505, + "learning_rate": 9.921672816808488e-06, + "loss": 0.728, + "step": 4280 + }, + { + "epoch": 0.23080655596290706, + "grad_norm": 0.76044100522995, + "learning_rate": 9.921635433420295e-06, + "loss": 0.7559, + "step": 4281 + }, + { + "epoch": 0.23086047013155056, + "grad_norm": 0.8910096883773804, + "learning_rate": 9.921598041183668e-06, + "loss": 0.7519, + "step": 4282 + }, + { + "epoch": 0.2309143843001941, + "grad_norm": 0.7211179733276367, + "learning_rate": 9.92156064009867e-06, + "loss": 0.7838, + "step": 4283 + }, + { + "epoch": 0.23096829846883762, + "grad_norm": 0.7589021921157837, + "learning_rate": 9.921523230165372e-06, + "loss": 0.8378, + "step": 4284 + }, + { + "epoch": 0.23102221263748113, + "grad_norm": 0.7287599444389343, + "learning_rate": 9.921485811383838e-06, + "loss": 0.7667, + "step": 4285 + }, + { + "epoch": 0.23107612680612466, + "grad_norm": 0.7445182204246521, + "learning_rate": 9.921448383754136e-06, + "loss": 0.7795, + "step": 4286 + }, + { + "epoch": 0.23113004097476816, + "grad_norm": 0.7145516872406006, + "learning_rate": 9.921410947276334e-06, + "loss": 0.7439, + "step": 4287 + }, + { + "epoch": 0.2311839551434117, + "grad_norm": 0.8579338192939758, + "learning_rate": 9.921373501950497e-06, + "loss": 0.7636, + "step": 4288 + }, + { + "epoch": 
0.2312378693120552, + "grad_norm": 0.7707585096359253, + "learning_rate": 9.921336047776695e-06, + "loss": 0.7422, + "step": 4289 + }, + { + "epoch": 0.23129178348069873, + "grad_norm": 1.0461829900741577, + "learning_rate": 9.921298584754994e-06, + "loss": 0.7768, + "step": 4290 + }, + { + "epoch": 0.23134569764934224, + "grad_norm": 0.9363743662834167, + "learning_rate": 9.921261112885464e-06, + "loss": 0.8283, + "step": 4291 + }, + { + "epoch": 0.23139961181798577, + "grad_norm": 0.7723295092582703, + "learning_rate": 9.921223632168168e-06, + "loss": 0.769, + "step": 4292 + }, + { + "epoch": 0.2314535259866293, + "grad_norm": 0.8078635931015015, + "learning_rate": 9.921186142603178e-06, + "loss": 0.7919, + "step": 4293 + }, + { + "epoch": 0.2315074401552728, + "grad_norm": 0.8620443940162659, + "learning_rate": 9.921148644190557e-06, + "loss": 0.805, + "step": 4294 + }, + { + "epoch": 0.23156135432391634, + "grad_norm": 0.8201389908790588, + "learning_rate": 9.921111136930376e-06, + "loss": 0.7982, + "step": 4295 + }, + { + "epoch": 0.23161526849255984, + "grad_norm": 0.7656800150871277, + "learning_rate": 9.9210736208227e-06, + "loss": 0.763, + "step": 4296 + }, + { + "epoch": 0.23166918266120337, + "grad_norm": 0.8814857602119446, + "learning_rate": 9.921036095867598e-06, + "loss": 0.8467, + "step": 4297 + }, + { + "epoch": 0.23172309682984688, + "grad_norm": 0.6766259074211121, + "learning_rate": 9.920998562065136e-06, + "loss": 0.6669, + "step": 4298 + }, + { + "epoch": 0.2317770109984904, + "grad_norm": 0.729774534702301, + "learning_rate": 9.920961019415383e-06, + "loss": 0.7208, + "step": 4299 + }, + { + "epoch": 0.2318309251671339, + "grad_norm": 0.8035505414009094, + "learning_rate": 9.920923467918405e-06, + "loss": 0.8282, + "step": 4300 + }, + { + "epoch": 0.23188483933577744, + "grad_norm": 0.7243404388427734, + "learning_rate": 9.920885907574269e-06, + "loss": 0.7861, + "step": 4301 + }, + { + "epoch": 0.23193875350442097, + "grad_norm": 
0.9405563473701477, + "learning_rate": 9.920848338383047e-06, + "loss": 0.7703, + "step": 4302 + }, + { + "epoch": 0.23199266767306448, + "grad_norm": 1.192933440208435, + "learning_rate": 9.920810760344801e-06, + "loss": 0.8294, + "step": 4303 + }, + { + "epoch": 0.232046581841708, + "grad_norm": 0.7398643493652344, + "learning_rate": 9.920773173459601e-06, + "loss": 0.7712, + "step": 4304 + }, + { + "epoch": 0.23210049601035151, + "grad_norm": 0.7436460852622986, + "learning_rate": 9.920735577727516e-06, + "loss": 0.7155, + "step": 4305 + }, + { + "epoch": 0.23215441017899505, + "grad_norm": 0.7705883383750916, + "learning_rate": 9.920697973148613e-06, + "loss": 0.6632, + "step": 4306 + }, + { + "epoch": 0.23220832434763855, + "grad_norm": 0.8805288076400757, + "learning_rate": 9.920660359722955e-06, + "loss": 0.7914, + "step": 4307 + }, + { + "epoch": 0.23226223851628208, + "grad_norm": 0.7252172231674194, + "learning_rate": 9.920622737450616e-06, + "loss": 0.688, + "step": 4308 + }, + { + "epoch": 0.2323161526849256, + "grad_norm": 0.8841788172721863, + "learning_rate": 9.92058510633166e-06, + "loss": 0.8714, + "step": 4309 + }, + { + "epoch": 0.23237006685356912, + "grad_norm": 0.9365109205245972, + "learning_rate": 9.920547466366156e-06, + "loss": 0.8198, + "step": 4310 + }, + { + "epoch": 0.23242398102221265, + "grad_norm": 0.7860931754112244, + "learning_rate": 9.920509817554172e-06, + "loss": 0.8343, + "step": 4311 + }, + { + "epoch": 0.23247789519085615, + "grad_norm": 0.7520400881767273, + "learning_rate": 9.920472159895773e-06, + "loss": 0.8114, + "step": 4312 + }, + { + "epoch": 0.23253180935949969, + "grad_norm": 0.9704170823097229, + "learning_rate": 9.920434493391029e-06, + "loss": 0.7866, + "step": 4313 + }, + { + "epoch": 0.2325857235281432, + "grad_norm": 0.7817257046699524, + "learning_rate": 9.920396818040009e-06, + "loss": 0.7344, + "step": 4314 + }, + { + "epoch": 0.23263963769678672, + "grad_norm": 0.8574941158294678, + "learning_rate": 
9.920359133842778e-06, + "loss": 0.8337, + "step": 4315 + }, + { + "epoch": 0.23269355186543023, + "grad_norm": 0.8683324456214905, + "learning_rate": 9.920321440799405e-06, + "loss": 0.6998, + "step": 4316 + }, + { + "epoch": 0.23274746603407376, + "grad_norm": 0.831664502620697, + "learning_rate": 9.920283738909958e-06, + "loss": 0.823, + "step": 4317 + }, + { + "epoch": 0.23280138020271726, + "grad_norm": 0.7768320441246033, + "learning_rate": 9.920246028174506e-06, + "loss": 0.8132, + "step": 4318 + }, + { + "epoch": 0.2328552943713608, + "grad_norm": 0.8081845045089722, + "learning_rate": 9.920208308593112e-06, + "loss": 0.8486, + "step": 4319 + }, + { + "epoch": 0.23290920854000433, + "grad_norm": 0.8148953914642334, + "learning_rate": 9.920170580165849e-06, + "loss": 0.7817, + "step": 4320 + }, + { + "epoch": 0.23296312270864783, + "grad_norm": 0.8448207378387451, + "learning_rate": 9.920132842892782e-06, + "loss": 0.81, + "step": 4321 + }, + { + "epoch": 0.23301703687729136, + "grad_norm": 0.9412322640419006, + "learning_rate": 9.92009509677398e-06, + "loss": 0.8847, + "step": 4322 + }, + { + "epoch": 0.23307095104593487, + "grad_norm": 0.745847761631012, + "learning_rate": 9.920057341809511e-06, + "loss": 0.6898, + "step": 4323 + }, + { + "epoch": 0.2331248652145784, + "grad_norm": 0.8120739459991455, + "learning_rate": 9.920019577999442e-06, + "loss": 0.7538, + "step": 4324 + }, + { + "epoch": 0.2331787793832219, + "grad_norm": 0.8183807730674744, + "learning_rate": 9.919981805343842e-06, + "loss": 0.7645, + "step": 4325 + }, + { + "epoch": 0.23323269355186543, + "grad_norm": 0.7442939281463623, + "learning_rate": 9.919944023842778e-06, + "loss": 0.7434, + "step": 4326 + }, + { + "epoch": 0.23328660772050894, + "grad_norm": 0.7586483359336853, + "learning_rate": 9.919906233496319e-06, + "loss": 0.767, + "step": 4327 + }, + { + "epoch": 0.23334052188915247, + "grad_norm": 0.8090452551841736, + "learning_rate": 9.919868434304531e-06, + "loss": 0.7217, + 
"step": 4328 + }, + { + "epoch": 0.233394436057796, + "grad_norm": 0.7344191670417786, + "learning_rate": 9.919830626267484e-06, + "loss": 0.8379, + "step": 4329 + }, + { + "epoch": 0.2334483502264395, + "grad_norm": 0.842797040939331, + "learning_rate": 9.919792809385244e-06, + "loss": 0.7191, + "step": 4330 + }, + { + "epoch": 0.23350226439508304, + "grad_norm": 0.9725179076194763, + "learning_rate": 9.91975498365788e-06, + "loss": 0.752, + "step": 4331 + }, + { + "epoch": 0.23355617856372654, + "grad_norm": 0.8612834811210632, + "learning_rate": 9.91971714908546e-06, + "loss": 0.6699, + "step": 4332 + }, + { + "epoch": 0.23361009273237007, + "grad_norm": 0.7784733772277832, + "learning_rate": 9.919679305668053e-06, + "loss": 0.6382, + "step": 4333 + }, + { + "epoch": 0.23366400690101358, + "grad_norm": 0.7414956092834473, + "learning_rate": 9.919641453405726e-06, + "loss": 0.7486, + "step": 4334 + }, + { + "epoch": 0.2337179210696571, + "grad_norm": 0.7242193818092346, + "learning_rate": 9.919603592298548e-06, + "loss": 0.7451, + "step": 4335 + }, + { + "epoch": 0.2337718352383006, + "grad_norm": 0.7716617584228516, + "learning_rate": 9.919565722346585e-06, + "loss": 0.8141, + "step": 4336 + }, + { + "epoch": 0.23382574940694414, + "grad_norm": 0.7281931042671204, + "learning_rate": 9.919527843549905e-06, + "loss": 0.7144, + "step": 4337 + }, + { + "epoch": 0.23387966357558768, + "grad_norm": 0.8105024695396423, + "learning_rate": 9.91948995590858e-06, + "loss": 0.7436, + "step": 4338 + }, + { + "epoch": 0.23393357774423118, + "grad_norm": 0.7437110543251038, + "learning_rate": 9.919452059422674e-06, + "loss": 0.7382, + "step": 4339 + }, + { + "epoch": 0.2339874919128747, + "grad_norm": 0.7429775595664978, + "learning_rate": 9.919414154092258e-06, + "loss": 0.6745, + "step": 4340 + }, + { + "epoch": 0.23404140608151822, + "grad_norm": 0.7441113591194153, + "learning_rate": 9.919376239917398e-06, + "loss": 0.6918, + "step": 4341 + }, + { + "epoch": 
0.23409532025016175, + "grad_norm": 0.7948750257492065, + "learning_rate": 9.919338316898162e-06, + "loss": 0.7844, + "step": 4342 + }, + { + "epoch": 0.23414923441880525, + "grad_norm": 0.8123278021812439, + "learning_rate": 9.91930038503462e-06, + "loss": 0.8009, + "step": 4343 + }, + { + "epoch": 0.23420314858744878, + "grad_norm": 0.7706881761550903, + "learning_rate": 9.919262444326841e-06, + "loss": 0.7557, + "step": 4344 + }, + { + "epoch": 0.2342570627560923, + "grad_norm": 0.7763088345527649, + "learning_rate": 9.91922449477489e-06, + "loss": 0.7718, + "step": 4345 + }, + { + "epoch": 0.23431097692473582, + "grad_norm": 0.8066530227661133, + "learning_rate": 9.919186536378836e-06, + "loss": 0.7332, + "step": 4346 + }, + { + "epoch": 0.23436489109337935, + "grad_norm": 0.7513235211372375, + "learning_rate": 9.91914856913875e-06, + "loss": 0.7551, + "step": 4347 + }, + { + "epoch": 0.23441880526202286, + "grad_norm": 0.7152560949325562, + "learning_rate": 9.919110593054697e-06, + "loss": 0.7086, + "step": 4348 + }, + { + "epoch": 0.2344727194306664, + "grad_norm": 0.8949812650680542, + "learning_rate": 9.919072608126747e-06, + "loss": 0.8965, + "step": 4349 + }, + { + "epoch": 0.2345266335993099, + "grad_norm": 0.7958235740661621, + "learning_rate": 9.919034614354968e-06, + "loss": 0.8007, + "step": 4350 + }, + { + "epoch": 0.23458054776795342, + "grad_norm": 0.7758817672729492, + "learning_rate": 9.91899661173943e-06, + "loss": 0.6801, + "step": 4351 + }, + { + "epoch": 0.23463446193659693, + "grad_norm": 0.6918591260910034, + "learning_rate": 9.918958600280196e-06, + "loss": 0.7202, + "step": 4352 + }, + { + "epoch": 0.23468837610524046, + "grad_norm": 0.7467452883720398, + "learning_rate": 9.918920579977339e-06, + "loss": 0.7289, + "step": 4353 + }, + { + "epoch": 0.23474229027388396, + "grad_norm": 0.8222523331642151, + "learning_rate": 9.918882550830926e-06, + "loss": 0.8121, + "step": 4354 + }, + { + "epoch": 0.2347962044425275, + "grad_norm": 
0.7198072671890259, + "learning_rate": 9.918844512841027e-06, + "loss": 0.7534, + "step": 4355 + }, + { + "epoch": 0.23485011861117103, + "grad_norm": 0.7741684317588806, + "learning_rate": 9.918806466007709e-06, + "loss": 0.7617, + "step": 4356 + }, + { + "epoch": 0.23490403277981453, + "grad_norm": 0.7739984393119812, + "learning_rate": 9.918768410331038e-06, + "loss": 0.668, + "step": 4357 + }, + { + "epoch": 0.23495794694845806, + "grad_norm": 0.7554827928543091, + "learning_rate": 9.918730345811088e-06, + "loss": 0.8149, + "step": 4358 + }, + { + "epoch": 0.23501186111710157, + "grad_norm": 0.687698483467102, + "learning_rate": 9.918692272447922e-06, + "loss": 0.7372, + "step": 4359 + }, + { + "epoch": 0.2350657752857451, + "grad_norm": 0.804979681968689, + "learning_rate": 9.91865419024161e-06, + "loss": 0.7604, + "step": 4360 + }, + { + "epoch": 0.2351196894543886, + "grad_norm": 0.839570164680481, + "learning_rate": 9.918616099192223e-06, + "loss": 0.7819, + "step": 4361 + }, + { + "epoch": 0.23517360362303213, + "grad_norm": 0.7619128823280334, + "learning_rate": 9.918577999299827e-06, + "loss": 0.7964, + "step": 4362 + }, + { + "epoch": 0.23522751779167564, + "grad_norm": 0.8392224311828613, + "learning_rate": 9.918539890564491e-06, + "loss": 0.778, + "step": 4363 + }, + { + "epoch": 0.23528143196031917, + "grad_norm": 0.7874334454536438, + "learning_rate": 9.918501772986284e-06, + "loss": 0.8403, + "step": 4364 + }, + { + "epoch": 0.2353353461289627, + "grad_norm": 0.7531299591064453, + "learning_rate": 9.918463646565276e-06, + "loss": 0.8639, + "step": 4365 + }, + { + "epoch": 0.2353892602976062, + "grad_norm": 0.7251406908035278, + "learning_rate": 9.91842551130153e-06, + "loss": 0.7858, + "step": 4366 + }, + { + "epoch": 0.23544317446624974, + "grad_norm": 0.8003079891204834, + "learning_rate": 9.918387367195121e-06, + "loss": 0.8117, + "step": 4367 + }, + { + "epoch": 0.23549708863489324, + "grad_norm": 0.7766731977462769, + "learning_rate": 
9.918349214246112e-06, + "loss": 0.7751, + "step": 4368 + }, + { + "epoch": 0.23555100280353677, + "grad_norm": 0.7517151236534119, + "learning_rate": 9.918311052454577e-06, + "loss": 0.7245, + "step": 4369 + }, + { + "epoch": 0.23560491697218028, + "grad_norm": 0.6932556629180908, + "learning_rate": 9.918272881820582e-06, + "loss": 0.7544, + "step": 4370 + }, + { + "epoch": 0.2356588311408238, + "grad_norm": 0.7345824837684631, + "learning_rate": 9.918234702344194e-06, + "loss": 0.7467, + "step": 4371 + }, + { + "epoch": 0.23571274530946731, + "grad_norm": 0.7525627017021179, + "learning_rate": 9.918196514025485e-06, + "loss": 0.8197, + "step": 4372 + }, + { + "epoch": 0.23576665947811085, + "grad_norm": 0.9494594931602478, + "learning_rate": 9.918158316864522e-06, + "loss": 0.7505, + "step": 4373 + }, + { + "epoch": 0.23582057364675438, + "grad_norm": 0.7376323342323303, + "learning_rate": 9.918120110861372e-06, + "loss": 0.7513, + "step": 4374 + }, + { + "epoch": 0.23587448781539788, + "grad_norm": 0.8581971526145935, + "learning_rate": 9.918081896016108e-06, + "loss": 0.8419, + "step": 4375 + }, + { + "epoch": 0.2359284019840414, + "grad_norm": 0.9238672256469727, + "learning_rate": 9.918043672328793e-06, + "loss": 0.9286, + "step": 4376 + }, + { + "epoch": 0.23598231615268492, + "grad_norm": 0.787239670753479, + "learning_rate": 9.9180054397995e-06, + "loss": 0.7917, + "step": 4377 + }, + { + "epoch": 0.23603623032132845, + "grad_norm": 0.862934947013855, + "learning_rate": 9.917967198428298e-06, + "loss": 0.9533, + "step": 4378 + }, + { + "epoch": 0.23609014448997195, + "grad_norm": 0.8004072308540344, + "learning_rate": 9.917928948215251e-06, + "loss": 0.8035, + "step": 4379 + }, + { + "epoch": 0.23614405865861549, + "grad_norm": 0.7238081097602844, + "learning_rate": 9.917890689160434e-06, + "loss": 0.7777, + "step": 4380 + }, + { + "epoch": 0.23619797282725902, + "grad_norm": 0.7420337200164795, + "learning_rate": 9.917852421263912e-06, + "loss": 0.7643, + 
"step": 4381 + }, + { + "epoch": 0.23625188699590252, + "grad_norm": 0.8613260984420776, + "learning_rate": 9.917814144525754e-06, + "loss": 0.7936, + "step": 4382 + }, + { + "epoch": 0.23630580116454605, + "grad_norm": 0.787196934223175, + "learning_rate": 9.91777585894603e-06, + "loss": 0.8281, + "step": 4383 + }, + { + "epoch": 0.23635971533318956, + "grad_norm": 0.8265708088874817, + "learning_rate": 9.917737564524807e-06, + "loss": 0.7518, + "step": 4384 + }, + { + "epoch": 0.2364136295018331, + "grad_norm": 0.7922816276550293, + "learning_rate": 9.917699261262156e-06, + "loss": 0.8803, + "step": 4385 + }, + { + "epoch": 0.2364675436704766, + "grad_norm": 0.8977661728858948, + "learning_rate": 9.917660949158147e-06, + "loss": 0.9311, + "step": 4386 + }, + { + "epoch": 0.23652145783912012, + "grad_norm": 0.7732436060905457, + "learning_rate": 9.917622628212846e-06, + "loss": 0.7885, + "step": 4387 + }, + { + "epoch": 0.23657537200776363, + "grad_norm": 0.7951593399047852, + "learning_rate": 9.917584298426322e-06, + "loss": 0.9044, + "step": 4388 + }, + { + "epoch": 0.23662928617640716, + "grad_norm": 0.7638776898384094, + "learning_rate": 9.917545959798643e-06, + "loss": 0.7276, + "step": 4389 + }, + { + "epoch": 0.2366832003450507, + "grad_norm": 0.8405231833457947, + "learning_rate": 9.917507612329882e-06, + "loss": 0.9712, + "step": 4390 + }, + { + "epoch": 0.2367371145136942, + "grad_norm": 0.7908889651298523, + "learning_rate": 9.917469256020104e-06, + "loss": 0.7017, + "step": 4391 + }, + { + "epoch": 0.23679102868233773, + "grad_norm": 0.7041110992431641, + "learning_rate": 9.917430890869379e-06, + "loss": 0.811, + "step": 4392 + }, + { + "epoch": 0.23684494285098123, + "grad_norm": 0.923809289932251, + "learning_rate": 9.917392516877779e-06, + "loss": 0.7363, + "step": 4393 + }, + { + "epoch": 0.23689885701962476, + "grad_norm": 0.7647616267204285, + "learning_rate": 9.91735413404537e-06, + "loss": 0.6428, + "step": 4394 + }, + { + "epoch": 
0.23695277118826827, + "grad_norm": 0.7839642763137817, + "learning_rate": 9.91731574237222e-06, + "loss": 0.7562, + "step": 4395 + }, + { + "epoch": 0.2370066853569118, + "grad_norm": 0.7928365468978882, + "learning_rate": 9.9172773418584e-06, + "loss": 0.8758, + "step": 4396 + }, + { + "epoch": 0.2370605995255553, + "grad_norm": 0.8615469336509705, + "learning_rate": 9.917238932503979e-06, + "loss": 0.8264, + "step": 4397 + }, + { + "epoch": 0.23711451369419884, + "grad_norm": 0.7869088649749756, + "learning_rate": 9.917200514309024e-06, + "loss": 0.8973, + "step": 4398 + }, + { + "epoch": 0.23716842786284237, + "grad_norm": 0.8070249557495117, + "learning_rate": 9.917162087273606e-06, + "loss": 0.786, + "step": 4399 + }, + { + "epoch": 0.23722234203148587, + "grad_norm": 0.7543795704841614, + "learning_rate": 9.917123651397796e-06, + "loss": 0.6012, + "step": 4400 + }, + { + "epoch": 0.2372762562001294, + "grad_norm": 1.0837504863739014, + "learning_rate": 9.91708520668166e-06, + "loss": 0.7304, + "step": 4401 + }, + { + "epoch": 0.2373301703687729, + "grad_norm": 0.8013801574707031, + "learning_rate": 9.917046753125265e-06, + "loss": 0.6564, + "step": 4402 + }, + { + "epoch": 0.23738408453741644, + "grad_norm": 0.8721063137054443, + "learning_rate": 9.917008290728687e-06, + "loss": 0.9042, + "step": 4403 + }, + { + "epoch": 0.23743799870605994, + "grad_norm": 0.9169342517852783, + "learning_rate": 9.91696981949199e-06, + "loss": 0.8766, + "step": 4404 + }, + { + "epoch": 0.23749191287470348, + "grad_norm": 0.7514129877090454, + "learning_rate": 9.916931339415243e-06, + "loss": 0.7818, + "step": 4405 + }, + { + "epoch": 0.23754582704334698, + "grad_norm": 0.747178316116333, + "learning_rate": 9.916892850498518e-06, + "loss": 0.7608, + "step": 4406 + }, + { + "epoch": 0.2375997412119905, + "grad_norm": 0.7261523008346558, + "learning_rate": 9.916854352741883e-06, + "loss": 0.6679, + "step": 4407 + }, + { + "epoch": 0.23765365538063404, + "grad_norm": 
0.7496599555015564, + "learning_rate": 9.916815846145407e-06, + "loss": 0.8072, + "step": 4408 + }, + { + "epoch": 0.23770756954927755, + "grad_norm": 0.8052302002906799, + "learning_rate": 9.916777330709159e-06, + "loss": 0.7882, + "step": 4409 + }, + { + "epoch": 0.23776148371792108, + "grad_norm": 0.8955451250076294, + "learning_rate": 9.916738806433208e-06, + "loss": 0.7566, + "step": 4410 + }, + { + "epoch": 0.23781539788656458, + "grad_norm": 0.7964259386062622, + "learning_rate": 9.916700273317623e-06, + "loss": 0.7503, + "step": 4411 + }, + { + "epoch": 0.23786931205520812, + "grad_norm": 0.904030978679657, + "learning_rate": 9.916661731362476e-06, + "loss": 0.8056, + "step": 4412 + }, + { + "epoch": 0.23792322622385162, + "grad_norm": 0.8031491637229919, + "learning_rate": 9.916623180567833e-06, + "loss": 0.7978, + "step": 4413 + }, + { + "epoch": 0.23797714039249515, + "grad_norm": 1.2857294082641602, + "learning_rate": 9.916584620933764e-06, + "loss": 0.7822, + "step": 4414 + }, + { + "epoch": 0.23803105456113866, + "grad_norm": 0.8789198994636536, + "learning_rate": 9.91654605246034e-06, + "loss": 0.8392, + "step": 4415 + }, + { + "epoch": 0.2380849687297822, + "grad_norm": 0.7934818267822266, + "learning_rate": 9.91650747514763e-06, + "loss": 0.8242, + "step": 4416 + }, + { + "epoch": 0.23813888289842572, + "grad_norm": 0.8770273923873901, + "learning_rate": 9.916468888995703e-06, + "loss": 0.7649, + "step": 4417 + }, + { + "epoch": 0.23819279706706922, + "grad_norm": 0.9187912940979004, + "learning_rate": 9.916430294004627e-06, + "loss": 0.7531, + "step": 4418 + }, + { + "epoch": 0.23824671123571275, + "grad_norm": 0.8346499800682068, + "learning_rate": 9.916391690174472e-06, + "loss": 0.7785, + "step": 4419 + }, + { + "epoch": 0.23830062540435626, + "grad_norm": 0.7771525382995605, + "learning_rate": 9.916353077505307e-06, + "loss": 0.8418, + "step": 4420 + }, + { + "epoch": 0.2383545395729998, + "grad_norm": 0.8043860197067261, + "learning_rate": 
9.916314455997204e-06, + "loss": 0.7878, + "step": 4421 + }, + { + "epoch": 0.2384084537416433, + "grad_norm": 0.8319140672683716, + "learning_rate": 9.916275825650231e-06, + "loss": 0.7751, + "step": 4422 + }, + { + "epoch": 0.23846236791028683, + "grad_norm": 0.7341157793998718, + "learning_rate": 9.916237186464455e-06, + "loss": 0.7486, + "step": 4423 + }, + { + "epoch": 0.23851628207893033, + "grad_norm": 0.8434766530990601, + "learning_rate": 9.91619853843995e-06, + "loss": 0.7906, + "step": 4424 + }, + { + "epoch": 0.23857019624757386, + "grad_norm": 0.8698723912239075, + "learning_rate": 9.916159881576782e-06, + "loss": 0.7577, + "step": 4425 + }, + { + "epoch": 0.2386241104162174, + "grad_norm": 0.6935116052627563, + "learning_rate": 9.91612121587502e-06, + "loss": 0.7538, + "step": 4426 + }, + { + "epoch": 0.2386780245848609, + "grad_norm": 0.7313439249992371, + "learning_rate": 9.916082541334737e-06, + "loss": 0.8306, + "step": 4427 + }, + { + "epoch": 0.23873193875350443, + "grad_norm": 0.7396842241287231, + "learning_rate": 9.916043857956e-06, + "loss": 0.8037, + "step": 4428 + }, + { + "epoch": 0.23878585292214793, + "grad_norm": 0.7954176664352417, + "learning_rate": 9.91600516573888e-06, + "loss": 0.7552, + "step": 4429 + }, + { + "epoch": 0.23883976709079147, + "grad_norm": 0.7113604545593262, + "learning_rate": 9.915966464683444e-06, + "loss": 0.722, + "step": 4430 + }, + { + "epoch": 0.23889368125943497, + "grad_norm": 0.7765493392944336, + "learning_rate": 9.915927754789765e-06, + "loss": 0.7143, + "step": 4431 + }, + { + "epoch": 0.2389475954280785, + "grad_norm": 0.8287819623947144, + "learning_rate": 9.91588903605791e-06, + "loss": 0.8256, + "step": 4432 + }, + { + "epoch": 0.239001509596722, + "grad_norm": 0.7855268120765686, + "learning_rate": 9.91585030848795e-06, + "loss": 0.8666, + "step": 4433 + }, + { + "epoch": 0.23905542376536554, + "grad_norm": 0.7613146901130676, + "learning_rate": 9.915811572079955e-06, + "loss": 0.7367, + "step": 
4434 + }, + { + "epoch": 0.23910933793400907, + "grad_norm": 0.7982416152954102, + "learning_rate": 9.91577282683399e-06, + "loss": 0.8782, + "step": 4435 + }, + { + "epoch": 0.23916325210265257, + "grad_norm": 0.8698425889015198, + "learning_rate": 9.915734072750132e-06, + "loss": 0.7962, + "step": 4436 + }, + { + "epoch": 0.2392171662712961, + "grad_norm": 0.7771449089050293, + "learning_rate": 9.915695309828449e-06, + "loss": 0.8175, + "step": 4437 + }, + { + "epoch": 0.2392710804399396, + "grad_norm": 0.7628130912780762, + "learning_rate": 9.915656538069005e-06, + "loss": 0.8522, + "step": 4438 + }, + { + "epoch": 0.23932499460858314, + "grad_norm": 0.7890259623527527, + "learning_rate": 9.915617757471873e-06, + "loss": 0.7256, + "step": 4439 + }, + { + "epoch": 0.23937890877722665, + "grad_norm": 0.8656981587409973, + "learning_rate": 9.915578968037127e-06, + "loss": 0.7982, + "step": 4440 + }, + { + "epoch": 0.23943282294587018, + "grad_norm": 0.7118672132492065, + "learning_rate": 9.91554016976483e-06, + "loss": 0.7917, + "step": 4441 + }, + { + "epoch": 0.23948673711451368, + "grad_norm": 0.8988688588142395, + "learning_rate": 9.915501362655055e-06, + "loss": 0.7884, + "step": 4442 + }, + { + "epoch": 0.2395406512831572, + "grad_norm": 0.7870175242424011, + "learning_rate": 9.915462546707873e-06, + "loss": 0.8008, + "step": 4443 + }, + { + "epoch": 0.23959456545180075, + "grad_norm": 0.8649255037307739, + "learning_rate": 9.915423721923351e-06, + "loss": 0.7897, + "step": 4444 + }, + { + "epoch": 0.23964847962044425, + "grad_norm": 0.8905230164527893, + "learning_rate": 9.915384888301561e-06, + "loss": 0.8611, + "step": 4445 + }, + { + "epoch": 0.23970239378908778, + "grad_norm": 0.7729083299636841, + "learning_rate": 9.91534604584257e-06, + "loss": 0.7539, + "step": 4446 + }, + { + "epoch": 0.23975630795773129, + "grad_norm": 0.9127714037895203, + "learning_rate": 9.915307194546452e-06, + "loss": 0.8286, + "step": 4447 + }, + { + "epoch": 
0.23981022212637482, + "grad_norm": 0.9115898013114929, + "learning_rate": 9.915268334413274e-06, + "loss": 0.8655, + "step": 4448 + }, + { + "epoch": 0.23986413629501832, + "grad_norm": 0.8105745315551758, + "learning_rate": 9.915229465443106e-06, + "loss": 0.7936, + "step": 4449 + }, + { + "epoch": 0.23991805046366185, + "grad_norm": 0.732665479183197, + "learning_rate": 9.91519058763602e-06, + "loss": 0.6538, + "step": 4450 + }, + { + "epoch": 0.23997196463230536, + "grad_norm": 0.7506905794143677, + "learning_rate": 9.91515170099208e-06, + "loss": 0.7461, + "step": 4451 + }, + { + "epoch": 0.2400258788009489, + "grad_norm": 1.0013810396194458, + "learning_rate": 9.915112805511364e-06, + "loss": 0.8622, + "step": 4452 + }, + { + "epoch": 0.24007979296959242, + "grad_norm": 0.8527307510375977, + "learning_rate": 9.915073901193937e-06, + "loss": 0.7516, + "step": 4453 + }, + { + "epoch": 0.24013370713823592, + "grad_norm": 0.756240963935852, + "learning_rate": 9.91503498803987e-06, + "loss": 0.7374, + "step": 4454 + }, + { + "epoch": 0.24018762130687946, + "grad_norm": 0.7914390563964844, + "learning_rate": 9.914996066049234e-06, + "loss": 0.7492, + "step": 4455 + }, + { + "epoch": 0.24024153547552296, + "grad_norm": 0.820505678653717, + "learning_rate": 9.914957135222096e-06, + "loss": 0.6724, + "step": 4456 + }, + { + "epoch": 0.2402954496441665, + "grad_norm": 0.9144145846366882, + "learning_rate": 9.91491819555853e-06, + "loss": 0.8507, + "step": 4457 + }, + { + "epoch": 0.24034936381281, + "grad_norm": 0.7114265561103821, + "learning_rate": 9.914879247058602e-06, + "loss": 0.7308, + "step": 4458 + }, + { + "epoch": 0.24040327798145353, + "grad_norm": 0.8527531027793884, + "learning_rate": 9.914840289722385e-06, + "loss": 0.8446, + "step": 4459 + }, + { + "epoch": 0.24045719215009703, + "grad_norm": 0.9392815232276917, + "learning_rate": 9.914801323549948e-06, + "loss": 0.8434, + "step": 4460 + }, + { + "epoch": 0.24051110631874056, + "grad_norm": 
0.8654825687408447, + "learning_rate": 9.91476234854136e-06, + "loss": 0.7719, + "step": 4461 + }, + { + "epoch": 0.2405650204873841, + "grad_norm": 0.8563691973686218, + "learning_rate": 9.914723364696693e-06, + "loss": 0.7922, + "step": 4462 + }, + { + "epoch": 0.2406189346560276, + "grad_norm": 0.7988063097000122, + "learning_rate": 9.914684372016016e-06, + "loss": 0.8222, + "step": 4463 + }, + { + "epoch": 0.24067284882467113, + "grad_norm": 0.8066624402999878, + "learning_rate": 9.9146453704994e-06, + "loss": 0.8205, + "step": 4464 + }, + { + "epoch": 0.24072676299331464, + "grad_norm": 0.9636842608451843, + "learning_rate": 9.914606360146915e-06, + "loss": 0.835, + "step": 4465 + }, + { + "epoch": 0.24078067716195817, + "grad_norm": 0.7767032980918884, + "learning_rate": 9.91456734095863e-06, + "loss": 0.7637, + "step": 4466 + }, + { + "epoch": 0.24083459133060167, + "grad_norm": 0.7343990802764893, + "learning_rate": 9.914528312934614e-06, + "loss": 0.752, + "step": 4467 + }, + { + "epoch": 0.2408885054992452, + "grad_norm": 0.8200786113739014, + "learning_rate": 9.91448927607494e-06, + "loss": 0.7468, + "step": 4468 + }, + { + "epoch": 0.2409424196678887, + "grad_norm": 0.810748279094696, + "learning_rate": 9.91445023037968e-06, + "loss": 0.8567, + "step": 4469 + }, + { + "epoch": 0.24099633383653224, + "grad_norm": 0.8314438462257385, + "learning_rate": 9.914411175848896e-06, + "loss": 0.8693, + "step": 4470 + }, + { + "epoch": 0.24105024800517577, + "grad_norm": 0.827609121799469, + "learning_rate": 9.914372112482668e-06, + "loss": 0.7171, + "step": 4471 + }, + { + "epoch": 0.24110416217381928, + "grad_norm": 0.7794898748397827, + "learning_rate": 9.91433304028106e-06, + "loss": 0.7279, + "step": 4472 + }, + { + "epoch": 0.2411580763424628, + "grad_norm": 0.7951536178588867, + "learning_rate": 9.914293959244145e-06, + "loss": 0.8438, + "step": 4473 + }, + { + "epoch": 0.2412119905111063, + "grad_norm": 0.8130155801773071, + "learning_rate": 
9.914254869371991e-06, + "loss": 0.7849, + "step": 4474 + }, + { + "epoch": 0.24126590467974984, + "grad_norm": 0.8347324728965759, + "learning_rate": 9.91421577066467e-06, + "loss": 0.851, + "step": 4475 + }, + { + "epoch": 0.24131981884839335, + "grad_norm": 0.8122373819351196, + "learning_rate": 9.914176663122252e-06, + "loss": 0.8293, + "step": 4476 + }, + { + "epoch": 0.24137373301703688, + "grad_norm": 0.728115975856781, + "learning_rate": 9.914137546744807e-06, + "loss": 0.7865, + "step": 4477 + }, + { + "epoch": 0.24142764718568038, + "grad_norm": 0.8177993893623352, + "learning_rate": 9.914098421532404e-06, + "loss": 0.7765, + "step": 4478 + }, + { + "epoch": 0.24148156135432391, + "grad_norm": 0.7987833619117737, + "learning_rate": 9.914059287485117e-06, + "loss": 0.7611, + "step": 4479 + }, + { + "epoch": 0.24153547552296745, + "grad_norm": 0.7656280994415283, + "learning_rate": 9.914020144603013e-06, + "loss": 0.7538, + "step": 4480 + }, + { + "epoch": 0.24158938969161095, + "grad_norm": 0.69268798828125, + "learning_rate": 9.913980992886163e-06, + "loss": 0.6306, + "step": 4481 + }, + { + "epoch": 0.24164330386025448, + "grad_norm": 0.7506656050682068, + "learning_rate": 9.91394183233464e-06, + "loss": 0.7629, + "step": 4482 + }, + { + "epoch": 0.241697218028898, + "grad_norm": 0.893014669418335, + "learning_rate": 9.91390266294851e-06, + "loss": 0.7696, + "step": 4483 + }, + { + "epoch": 0.24175113219754152, + "grad_norm": 0.8073716163635254, + "learning_rate": 9.913863484727847e-06, + "loss": 0.7901, + "step": 4484 + }, + { + "epoch": 0.24180504636618502, + "grad_norm": 0.7654293775558472, + "learning_rate": 9.913824297672721e-06, + "loss": 0.8004, + "step": 4485 + }, + { + "epoch": 0.24185896053482855, + "grad_norm": 0.7301006317138672, + "learning_rate": 9.9137851017832e-06, + "loss": 0.694, + "step": 4486 + }, + { + "epoch": 0.24191287470347209, + "grad_norm": 0.7901747822761536, + "learning_rate": 9.913745897059356e-06, + "loss": 0.741, + "step": 
4487 + }, + { + "epoch": 0.2419667888721156, + "grad_norm": 0.7572670578956604, + "learning_rate": 9.91370668350126e-06, + "loss": 0.8361, + "step": 4488 + }, + { + "epoch": 0.24202070304075912, + "grad_norm": 0.8322924971580505, + "learning_rate": 9.913667461108983e-06, + "loss": 0.8155, + "step": 4489 + }, + { + "epoch": 0.24207461720940263, + "grad_norm": 1.0176936388015747, + "learning_rate": 9.913628229882593e-06, + "loss": 0.8341, + "step": 4490 + }, + { + "epoch": 0.24212853137804616, + "grad_norm": 0.7386930584907532, + "learning_rate": 9.913588989822165e-06, + "loss": 0.838, + "step": 4491 + }, + { + "epoch": 0.24218244554668966, + "grad_norm": 0.874079167842865, + "learning_rate": 9.913549740927764e-06, + "loss": 0.7181, + "step": 4492 + }, + { + "epoch": 0.2422363597153332, + "grad_norm": 0.8320260643959045, + "learning_rate": 9.913510483199464e-06, + "loss": 0.8909, + "step": 4493 + }, + { + "epoch": 0.2422902738839767, + "grad_norm": 0.7491182088851929, + "learning_rate": 9.913471216637335e-06, + "loss": 0.8469, + "step": 4494 + }, + { + "epoch": 0.24234418805262023, + "grad_norm": 0.7132229804992676, + "learning_rate": 9.913431941241446e-06, + "loss": 0.7237, + "step": 4495 + }, + { + "epoch": 0.24239810222126376, + "grad_norm": 0.8269235491752625, + "learning_rate": 9.913392657011872e-06, + "loss": 0.7929, + "step": 4496 + }, + { + "epoch": 0.24245201638990727, + "grad_norm": 0.8247712254524231, + "learning_rate": 9.913353363948679e-06, + "loss": 0.7298, + "step": 4497 + }, + { + "epoch": 0.2425059305585508, + "grad_norm": 0.761820912361145, + "learning_rate": 9.91331406205194e-06, + "loss": 0.749, + "step": 4498 + }, + { + "epoch": 0.2425598447271943, + "grad_norm": 0.7263596653938293, + "learning_rate": 9.913274751321723e-06, + "loss": 0.8055, + "step": 4499 + }, + { + "epoch": 0.24261375889583783, + "grad_norm": 0.7232603430747986, + "learning_rate": 9.913235431758102e-06, + "loss": 0.8011, + "step": 4500 + }, + { + "epoch": 0.24266767306448134, + 
"grad_norm": 0.8140621781349182, + "learning_rate": 9.913196103361146e-06, + "loss": 0.8332, + "step": 4501 + }, + { + "epoch": 0.24272158723312487, + "grad_norm": 0.8474514484405518, + "learning_rate": 9.913156766130926e-06, + "loss": 0.8632, + "step": 4502 + }, + { + "epoch": 0.24277550140176837, + "grad_norm": 0.8690447211265564, + "learning_rate": 9.913117420067515e-06, + "loss": 0.8027, + "step": 4503 + }, + { + "epoch": 0.2428294155704119, + "grad_norm": 0.7381221652030945, + "learning_rate": 9.91307806517098e-06, + "loss": 0.7622, + "step": 4504 + }, + { + "epoch": 0.24288332973905544, + "grad_norm": 0.7889763712882996, + "learning_rate": 9.913038701441393e-06, + "loss": 0.7792, + "step": 4505 + }, + { + "epoch": 0.24293724390769894, + "grad_norm": 0.7800214886665344, + "learning_rate": 9.912999328878825e-06, + "loss": 0.7972, + "step": 4506 + }, + { + "epoch": 0.24299115807634247, + "grad_norm": 0.7379936575889587, + "learning_rate": 9.912959947483348e-06, + "loss": 0.7353, + "step": 4507 + }, + { + "epoch": 0.24304507224498598, + "grad_norm": 0.7070313692092896, + "learning_rate": 9.912920557255028e-06, + "loss": 0.7483, + "step": 4508 + }, + { + "epoch": 0.2430989864136295, + "grad_norm": 0.7230751514434814, + "learning_rate": 9.912881158193943e-06, + "loss": 0.7882, + "step": 4509 + }, + { + "epoch": 0.243152900582273, + "grad_norm": 0.8739690780639648, + "learning_rate": 9.91284175030016e-06, + "loss": 0.885, + "step": 4510 + }, + { + "epoch": 0.24320681475091654, + "grad_norm": 0.7954097986221313, + "learning_rate": 9.912802333573748e-06, + "loss": 0.8575, + "step": 4511 + }, + { + "epoch": 0.24326072891956005, + "grad_norm": 0.7602096796035767, + "learning_rate": 9.912762908014781e-06, + "loss": 0.7847, + "step": 4512 + }, + { + "epoch": 0.24331464308820358, + "grad_norm": 0.7269259691238403, + "learning_rate": 9.91272347362333e-06, + "loss": 0.7931, + "step": 4513 + }, + { + "epoch": 0.2433685572568471, + "grad_norm": 0.6849657297134399, + 
"learning_rate": 9.912684030399464e-06, + "loss": 0.7478, + "step": 4514 + }, + { + "epoch": 0.24342247142549062, + "grad_norm": 0.8350282907485962, + "learning_rate": 9.912644578343255e-06, + "loss": 0.829, + "step": 4515 + }, + { + "epoch": 0.24347638559413415, + "grad_norm": 0.7411940693855286, + "learning_rate": 9.912605117454772e-06, + "loss": 0.6513, + "step": 4516 + }, + { + "epoch": 0.24353029976277765, + "grad_norm": 0.73365718126297, + "learning_rate": 9.912565647734089e-06, + "loss": 0.7376, + "step": 4517 + }, + { + "epoch": 0.24358421393142118, + "grad_norm": 0.8144620060920715, + "learning_rate": 9.912526169181275e-06, + "loss": 0.745, + "step": 4518 + }, + { + "epoch": 0.2436381281000647, + "grad_norm": 0.7516615390777588, + "learning_rate": 9.912486681796403e-06, + "loss": 0.8864, + "step": 4519 + }, + { + "epoch": 0.24369204226870822, + "grad_norm": 0.8179273009300232, + "learning_rate": 9.91244718557954e-06, + "loss": 0.7763, + "step": 4520 + }, + { + "epoch": 0.24374595643735172, + "grad_norm": 0.7541390657424927, + "learning_rate": 9.912407680530762e-06, + "loss": 0.8565, + "step": 4521 + }, + { + "epoch": 0.24379987060599526, + "grad_norm": 0.7410699129104614, + "learning_rate": 9.912368166650137e-06, + "loss": 0.7938, + "step": 4522 + }, + { + "epoch": 0.2438537847746388, + "grad_norm": 0.840753972530365, + "learning_rate": 9.912328643937735e-06, + "loss": 0.8895, + "step": 4523 + }, + { + "epoch": 0.2439076989432823, + "grad_norm": 0.7780727744102478, + "learning_rate": 9.91228911239363e-06, + "loss": 0.8554, + "step": 4524 + }, + { + "epoch": 0.24396161311192582, + "grad_norm": 0.8156387805938721, + "learning_rate": 9.91224957201789e-06, + "loss": 0.7871, + "step": 4525 + }, + { + "epoch": 0.24401552728056933, + "grad_norm": 0.7830832004547119, + "learning_rate": 9.912210022810591e-06, + "loss": 0.729, + "step": 4526 + }, + { + "epoch": 0.24406944144921286, + "grad_norm": 0.9109267592430115, + "learning_rate": 9.912170464771799e-06, + 
"loss": 0.8281, + "step": 4527 + }, + { + "epoch": 0.24412335561785636, + "grad_norm": 0.7609542012214661, + "learning_rate": 9.912130897901587e-06, + "loss": 0.8907, + "step": 4528 + }, + { + "epoch": 0.2441772697864999, + "grad_norm": 0.8503179550170898, + "learning_rate": 9.912091322200025e-06, + "loss": 0.8337, + "step": 4529 + }, + { + "epoch": 0.2442311839551434, + "grad_norm": 0.8808969259262085, + "learning_rate": 9.912051737667188e-06, + "loss": 0.806, + "step": 4530 + }, + { + "epoch": 0.24428509812378693, + "grad_norm": 0.8438240885734558, + "learning_rate": 9.912012144303142e-06, + "loss": 0.8318, + "step": 4531 + }, + { + "epoch": 0.24433901229243046, + "grad_norm": 0.7944091558456421, + "learning_rate": 9.911972542107962e-06, + "loss": 0.764, + "step": 4532 + }, + { + "epoch": 0.24439292646107397, + "grad_norm": 0.7484297752380371, + "learning_rate": 9.911932931081718e-06, + "loss": 0.7677, + "step": 4533 + }, + { + "epoch": 0.2444468406297175, + "grad_norm": 0.9554882049560547, + "learning_rate": 9.911893311224479e-06, + "loss": 0.7226, + "step": 4534 + }, + { + "epoch": 0.244500754798361, + "grad_norm": 0.6818152070045471, + "learning_rate": 9.91185368253632e-06, + "loss": 0.6565, + "step": 4535 + }, + { + "epoch": 0.24455466896700453, + "grad_norm": 0.8960323333740234, + "learning_rate": 9.91181404501731e-06, + "loss": 0.6672, + "step": 4536 + }, + { + "epoch": 0.24460858313564804, + "grad_norm": 0.8440603017807007, + "learning_rate": 9.911774398667521e-06, + "loss": 0.8423, + "step": 4537 + }, + { + "epoch": 0.24466249730429157, + "grad_norm": 0.7892050743103027, + "learning_rate": 9.911734743487025e-06, + "loss": 0.8362, + "step": 4538 + }, + { + "epoch": 0.24471641147293507, + "grad_norm": 0.7640264630317688, + "learning_rate": 9.911695079475892e-06, + "loss": 0.7893, + "step": 4539 + }, + { + "epoch": 0.2447703256415786, + "grad_norm": 0.8862099051475525, + "learning_rate": 9.911655406634191e-06, + "loss": 0.8155, + "step": 4540 + }, + { + 
"epoch": 0.24482423981022214, + "grad_norm": 0.7623111009597778, + "learning_rate": 9.911615724961999e-06, + "loss": 0.8313, + "step": 4541 + }, + { + "epoch": 0.24487815397886564, + "grad_norm": 0.713832676410675, + "learning_rate": 9.911576034459385e-06, + "loss": 0.7635, + "step": 4542 + }, + { + "epoch": 0.24493206814750917, + "grad_norm": 0.8501582741737366, + "learning_rate": 9.911536335126417e-06, + "loss": 0.8201, + "step": 4543 + }, + { + "epoch": 0.24498598231615268, + "grad_norm": 0.7051424980163574, + "learning_rate": 9.911496626963171e-06, + "loss": 0.7257, + "step": 4544 + }, + { + "epoch": 0.2450398964847962, + "grad_norm": 0.8079765439033508, + "learning_rate": 9.911456909969716e-06, + "loss": 0.7397, + "step": 4545 + }, + { + "epoch": 0.24509381065343971, + "grad_norm": 0.9106319546699524, + "learning_rate": 9.911417184146124e-06, + "loss": 0.7966, + "step": 4546 + }, + { + "epoch": 0.24514772482208325, + "grad_norm": 0.9614812731742859, + "learning_rate": 9.911377449492465e-06, + "loss": 0.7727, + "step": 4547 + }, + { + "epoch": 0.24520163899072675, + "grad_norm": 0.8388345241546631, + "learning_rate": 9.911337706008813e-06, + "loss": 0.7328, + "step": 4548 + }, + { + "epoch": 0.24525555315937028, + "grad_norm": 0.782459020614624, + "learning_rate": 9.911297953695239e-06, + "loss": 0.7823, + "step": 4549 + }, + { + "epoch": 0.2453094673280138, + "grad_norm": 0.8531977534294128, + "learning_rate": 9.911258192551812e-06, + "loss": 0.7139, + "step": 4550 + }, + { + "epoch": 0.24536338149665732, + "grad_norm": 0.7864230871200562, + "learning_rate": 9.911218422578605e-06, + "loss": 0.8322, + "step": 4551 + }, + { + "epoch": 0.24541729566530085, + "grad_norm": 0.7742743492126465, + "learning_rate": 9.911178643775691e-06, + "loss": 0.6747, + "step": 4552 + }, + { + "epoch": 0.24547120983394435, + "grad_norm": 0.7385323643684387, + "learning_rate": 9.91113885614314e-06, + "loss": 0.7297, + "step": 4553 + }, + { + "epoch": 0.24552512400258789, + 
"grad_norm": 0.8086322546005249, + "learning_rate": 9.911099059681023e-06, + "loss": 0.8216, + "step": 4554 + }, + { + "epoch": 0.2455790381712314, + "grad_norm": 0.7630950808525085, + "learning_rate": 9.911059254389412e-06, + "loss": 0.7549, + "step": 4555 + }, + { + "epoch": 0.24563295233987492, + "grad_norm": 0.8294158577919006, + "learning_rate": 9.91101944026838e-06, + "loss": 0.7858, + "step": 4556 + }, + { + "epoch": 0.24568686650851843, + "grad_norm": 0.8100032210350037, + "learning_rate": 9.910979617317998e-06, + "loss": 0.8488, + "step": 4557 + }, + { + "epoch": 0.24574078067716196, + "grad_norm": 0.7359179258346558, + "learning_rate": 9.910939785538335e-06, + "loss": 0.8151, + "step": 4558 + }, + { + "epoch": 0.2457946948458055, + "grad_norm": 0.811253011226654, + "learning_rate": 9.910899944929465e-06, + "loss": 0.801, + "step": 4559 + }, + { + "epoch": 0.245848609014449, + "grad_norm": 0.7908209562301636, + "learning_rate": 9.91086009549146e-06, + "loss": 0.8618, + "step": 4560 + }, + { + "epoch": 0.24590252318309253, + "grad_norm": 0.7895631790161133, + "learning_rate": 9.91082023722439e-06, + "loss": 0.7601, + "step": 4561 + }, + { + "epoch": 0.24595643735173603, + "grad_norm": 0.7346864938735962, + "learning_rate": 9.910780370128328e-06, + "loss": 0.7725, + "step": 4562 + }, + { + "epoch": 0.24601035152037956, + "grad_norm": 0.6873648166656494, + "learning_rate": 9.910740494203346e-06, + "loss": 0.7597, + "step": 4563 + }, + { + "epoch": 0.24606426568902307, + "grad_norm": 0.8287232518196106, + "learning_rate": 9.910700609449514e-06, + "loss": 0.8514, + "step": 4564 + }, + { + "epoch": 0.2461181798576666, + "grad_norm": 0.9342181086540222, + "learning_rate": 9.910660715866904e-06, + "loss": 0.8839, + "step": 4565 + }, + { + "epoch": 0.2461720940263101, + "grad_norm": 0.7942633032798767, + "learning_rate": 9.91062081345559e-06, + "loss": 0.7975, + "step": 4566 + }, + { + "epoch": 0.24622600819495363, + "grad_norm": 0.8790503144264221, + 
"learning_rate": 9.910580902215641e-06, + "loss": 0.655, + "step": 4567 + }, + { + "epoch": 0.24627992236359716, + "grad_norm": 0.7399418354034424, + "learning_rate": 9.91054098214713e-06, + "loss": 0.7647, + "step": 4568 + }, + { + "epoch": 0.24633383653224067, + "grad_norm": 1.9217935800552368, + "learning_rate": 9.91050105325013e-06, + "loss": 0.8151, + "step": 4569 + }, + { + "epoch": 0.2463877507008842, + "grad_norm": 0.7717850804328918, + "learning_rate": 9.910461115524709e-06, + "loss": 0.7653, + "step": 4570 + }, + { + "epoch": 0.2464416648695277, + "grad_norm": 0.9564247131347656, + "learning_rate": 9.910421168970943e-06, + "loss": 0.8427, + "step": 4571 + }, + { + "epoch": 0.24649557903817124, + "grad_norm": 0.7386001348495483, + "learning_rate": 9.9103812135889e-06, + "loss": 0.7577, + "step": 4572 + }, + { + "epoch": 0.24654949320681474, + "grad_norm": 0.7440508008003235, + "learning_rate": 9.910341249378656e-06, + "loss": 0.7735, + "step": 4573 + }, + { + "epoch": 0.24660340737545827, + "grad_norm": 0.7204955220222473, + "learning_rate": 9.91030127634028e-06, + "loss": 0.7404, + "step": 4574 + }, + { + "epoch": 0.24665732154410178, + "grad_norm": 0.7932496666908264, + "learning_rate": 9.910261294473844e-06, + "loss": 0.7131, + "step": 4575 + }, + { + "epoch": 0.2467112357127453, + "grad_norm": 0.8415532112121582, + "learning_rate": 9.91022130377942e-06, + "loss": 0.889, + "step": 4576 + }, + { + "epoch": 0.24676514988138884, + "grad_norm": 0.7823799252510071, + "learning_rate": 9.91018130425708e-06, + "loss": 0.7589, + "step": 4577 + }, + { + "epoch": 0.24681906405003234, + "grad_norm": 0.6958774924278259, + "learning_rate": 9.910141295906898e-06, + "loss": 0.6957, + "step": 4578 + }, + { + "epoch": 0.24687297821867588, + "grad_norm": 0.7267159819602966, + "learning_rate": 9.910101278728944e-06, + "loss": 0.767, + "step": 4579 + }, + { + "epoch": 0.24692689238731938, + "grad_norm": 0.7345640659332275, + "learning_rate": 9.91006125272329e-06, + "loss": 
0.7756, + "step": 4580 + }, + { + "epoch": 0.2469808065559629, + "grad_norm": 0.8117407560348511, + "learning_rate": 9.910021217890007e-06, + "loss": 0.8028, + "step": 4581 + }, + { + "epoch": 0.24703472072460642, + "grad_norm": 0.7520045042037964, + "learning_rate": 9.90998117422917e-06, + "loss": 0.7801, + "step": 4582 + }, + { + "epoch": 0.24708863489324995, + "grad_norm": 0.791251003742218, + "learning_rate": 9.909941121740847e-06, + "loss": 0.8244, + "step": 4583 + }, + { + "epoch": 0.24714254906189345, + "grad_norm": 0.8434782028198242, + "learning_rate": 9.909901060425114e-06, + "loss": 0.8461, + "step": 4584 + }, + { + "epoch": 0.24719646323053698, + "grad_norm": 0.789013147354126, + "learning_rate": 9.909860990282038e-06, + "loss": 0.8655, + "step": 4585 + }, + { + "epoch": 0.24725037739918052, + "grad_norm": 0.7809332609176636, + "learning_rate": 9.909820911311697e-06, + "loss": 0.7963, + "step": 4586 + }, + { + "epoch": 0.24730429156782402, + "grad_norm": 0.7775362730026245, + "learning_rate": 9.909780823514159e-06, + "loss": 0.8098, + "step": 4587 + }, + { + "epoch": 0.24735820573646755, + "grad_norm": 0.7136217355728149, + "learning_rate": 9.909740726889498e-06, + "loss": 0.7454, + "step": 4588 + }, + { + "epoch": 0.24741211990511106, + "grad_norm": 0.7367640733718872, + "learning_rate": 9.909700621437786e-06, + "loss": 0.7732, + "step": 4589 + }, + { + "epoch": 0.2474660340737546, + "grad_norm": 0.8922567963600159, + "learning_rate": 9.909660507159093e-06, + "loss": 0.7173, + "step": 4590 + }, + { + "epoch": 0.2475199482423981, + "grad_norm": 0.7434333562850952, + "learning_rate": 9.909620384053494e-06, + "loss": 0.7255, + "step": 4591 + }, + { + "epoch": 0.24757386241104162, + "grad_norm": 0.7813223600387573, + "learning_rate": 9.909580252121057e-06, + "loss": 0.7583, + "step": 4592 + }, + { + "epoch": 0.24762777657968515, + "grad_norm": 0.699350118637085, + "learning_rate": 9.90954011136186e-06, + "loss": 0.7572, + "step": 4593 + }, + { + "epoch": 
0.24768169074832866, + "grad_norm": 0.8126040101051331, + "learning_rate": 9.90949996177597e-06, + "loss": 0.839, + "step": 4594 + }, + { + "epoch": 0.2477356049169722, + "grad_norm": 0.7475876808166504, + "learning_rate": 9.90945980336346e-06, + "loss": 0.7268, + "step": 4595 + }, + { + "epoch": 0.2477895190856157, + "grad_norm": 0.7833042740821838, + "learning_rate": 9.909419636124407e-06, + "loss": 0.825, + "step": 4596 + }, + { + "epoch": 0.24784343325425923, + "grad_norm": 0.7600408792495728, + "learning_rate": 9.909379460058877e-06, + "loss": 0.7598, + "step": 4597 + }, + { + "epoch": 0.24789734742290273, + "grad_norm": 0.7315041422843933, + "learning_rate": 9.909339275166946e-06, + "loss": 0.7671, + "step": 4598 + }, + { + "epoch": 0.24795126159154626, + "grad_norm": 0.8522780537605286, + "learning_rate": 9.909299081448685e-06, + "loss": 0.7847, + "step": 4599 + }, + { + "epoch": 0.24800517576018977, + "grad_norm": 0.8812578320503235, + "learning_rate": 9.909258878904166e-06, + "loss": 0.8141, + "step": 4600 + }, + { + "epoch": 0.2480590899288333, + "grad_norm": 0.7550300359725952, + "learning_rate": 9.909218667533463e-06, + "loss": 0.8522, + "step": 4601 + }, + { + "epoch": 0.24811300409747683, + "grad_norm": 0.7031952738761902, + "learning_rate": 9.909178447336644e-06, + "loss": 0.7793, + "step": 4602 + }, + { + "epoch": 0.24816691826612033, + "grad_norm": 0.7782654166221619, + "learning_rate": 9.909138218313788e-06, + "loss": 0.8185, + "step": 4603 + }, + { + "epoch": 0.24822083243476387, + "grad_norm": 0.7581482529640198, + "learning_rate": 9.909097980464961e-06, + "loss": 0.714, + "step": 4604 + }, + { + "epoch": 0.24827474660340737, + "grad_norm": 0.7732239365577698, + "learning_rate": 9.909057733790236e-06, + "loss": 0.7916, + "step": 4605 + }, + { + "epoch": 0.2483286607720509, + "grad_norm": 0.8440051674842834, + "learning_rate": 9.909017478289692e-06, + "loss": 0.6826, + "step": 4606 + }, + { + "epoch": 0.2483825749406944, + "grad_norm": 
0.8361368179321289, + "learning_rate": 9.908977213963394e-06, + "loss": 0.7922, + "step": 4607 + }, + { + "epoch": 0.24843648910933794, + "grad_norm": 0.7201125025749207, + "learning_rate": 9.908936940811418e-06, + "loss": 0.7285, + "step": 4608 + }, + { + "epoch": 0.24849040327798144, + "grad_norm": 0.7888527512550354, + "learning_rate": 9.908896658833836e-06, + "loss": 0.807, + "step": 4609 + }, + { + "epoch": 0.24854431744662497, + "grad_norm": 0.7935523390769958, + "learning_rate": 9.908856368030717e-06, + "loss": 0.7634, + "step": 4610 + }, + { + "epoch": 0.2485982316152685, + "grad_norm": 0.8482795357704163, + "learning_rate": 9.908816068402138e-06, + "loss": 0.7679, + "step": 4611 + }, + { + "epoch": 0.248652145783912, + "grad_norm": 0.8024162650108337, + "learning_rate": 9.908775759948171e-06, + "loss": 0.8348, + "step": 4612 + }, + { + "epoch": 0.24870605995255554, + "grad_norm": 1.1745551824569702, + "learning_rate": 9.908735442668886e-06, + "loss": 0.9002, + "step": 4613 + }, + { + "epoch": 0.24875997412119905, + "grad_norm": 0.7877936363220215, + "learning_rate": 9.908695116564356e-06, + "loss": 0.8618, + "step": 4614 + }, + { + "epoch": 0.24881388828984258, + "grad_norm": 0.7331380248069763, + "learning_rate": 9.908654781634656e-06, + "loss": 0.7798, + "step": 4615 + }, + { + "epoch": 0.24886780245848608, + "grad_norm": 0.7370942831039429, + "learning_rate": 9.908614437879856e-06, + "loss": 0.7355, + "step": 4616 + }, + { + "epoch": 0.2489217166271296, + "grad_norm": 0.7926658391952515, + "learning_rate": 9.908574085300029e-06, + "loss": 0.7758, + "step": 4617 + }, + { + "epoch": 0.24897563079577312, + "grad_norm": 0.7218267917633057, + "learning_rate": 9.908533723895247e-06, + "loss": 0.7218, + "step": 4618 + }, + { + "epoch": 0.24902954496441665, + "grad_norm": 0.7260599136352539, + "learning_rate": 9.908493353665584e-06, + "loss": 0.7298, + "step": 4619 + }, + { + "epoch": 0.24908345913306018, + "grad_norm": 0.7151805758476257, + "learning_rate": 
9.908452974611114e-06, + "loss": 0.8047, + "step": 4620 + }, + { + "epoch": 0.24913737330170369, + "grad_norm": 0.7485063076019287, + "learning_rate": 9.908412586731905e-06, + "loss": 0.8048, + "step": 4621 + }, + { + "epoch": 0.24919128747034722, + "grad_norm": 0.733971893787384, + "learning_rate": 9.908372190028033e-06, + "loss": 0.7345, + "step": 4622 + }, + { + "epoch": 0.24924520163899072, + "grad_norm": 0.7228642106056213, + "learning_rate": 9.90833178449957e-06, + "loss": 0.7076, + "step": 4623 + }, + { + "epoch": 0.24929911580763425, + "grad_norm": 0.7565811276435852, + "learning_rate": 9.908291370146588e-06, + "loss": 0.8207, + "step": 4624 + }, + { + "epoch": 0.24935302997627776, + "grad_norm": 0.7520995140075684, + "learning_rate": 9.90825094696916e-06, + "loss": 0.7815, + "step": 4625 + }, + { + "epoch": 0.2494069441449213, + "grad_norm": 0.8191807866096497, + "learning_rate": 9.908210514967358e-06, + "loss": 0.775, + "step": 4626 + }, + { + "epoch": 0.2494608583135648, + "grad_norm": 0.7196933031082153, + "learning_rate": 9.908170074141257e-06, + "loss": 0.8197, + "step": 4627 + }, + { + "epoch": 0.24951477248220832, + "grad_norm": 0.724298894405365, + "learning_rate": 9.908129624490928e-06, + "loss": 0.8882, + "step": 4628 + }, + { + "epoch": 0.24956868665085186, + "grad_norm": 0.7686057686805725, + "learning_rate": 9.908089166016444e-06, + "loss": 0.7896, + "step": 4629 + }, + { + "epoch": 0.24962260081949536, + "grad_norm": 0.7816513180732727, + "learning_rate": 9.908048698717877e-06, + "loss": 0.7487, + "step": 4630 + }, + { + "epoch": 0.2496765149881389, + "grad_norm": 0.7616474628448486, + "learning_rate": 9.9080082225953e-06, + "loss": 0.7335, + "step": 4631 + }, + { + "epoch": 0.2497304291567824, + "grad_norm": 0.923209011554718, + "learning_rate": 9.907967737648787e-06, + "loss": 0.7808, + "step": 4632 + }, + { + "epoch": 0.24978434332542593, + "grad_norm": 0.7830556035041809, + "learning_rate": 9.90792724387841e-06, + "loss": 0.6349, + 
"step": 4633 + }, + { + "epoch": 0.24983825749406943, + "grad_norm": 0.7756953835487366, + "learning_rate": 9.90788674128424e-06, + "loss": 0.7903, + "step": 4634 + }, + { + "epoch": 0.24989217166271296, + "grad_norm": 0.7644580006599426, + "learning_rate": 9.907846229866354e-06, + "loss": 0.8474, + "step": 4635 + }, + { + "epoch": 0.24994608583135647, + "grad_norm": 0.7665796875953674, + "learning_rate": 9.907805709624822e-06, + "loss": 0.8081, + "step": 4636 + }, + { + "epoch": 0.25, + "grad_norm": 0.823797881603241, + "learning_rate": 9.907765180559716e-06, + "loss": 0.734, + "step": 4637 + }, + { + "epoch": 0.25005391416864353, + "grad_norm": 0.7901148200035095, + "learning_rate": 9.907724642671111e-06, + "loss": 0.752, + "step": 4638 + }, + { + "epoch": 0.25010782833728706, + "grad_norm": 0.994473934173584, + "learning_rate": 9.90768409595908e-06, + "loss": 0.8103, + "step": 4639 + }, + { + "epoch": 0.25016174250593054, + "grad_norm": 0.7167239189147949, + "learning_rate": 9.907643540423692e-06, + "loss": 0.7288, + "step": 4640 + }, + { + "epoch": 0.25021565667457407, + "grad_norm": 0.8114840388298035, + "learning_rate": 9.907602976065025e-06, + "loss": 0.629, + "step": 4641 + }, + { + "epoch": 0.2502695708432176, + "grad_norm": 0.8481932282447815, + "learning_rate": 9.90756240288315e-06, + "loss": 0.8044, + "step": 4642 + }, + { + "epoch": 0.25032348501186114, + "grad_norm": 0.8757217526435852, + "learning_rate": 9.907521820878139e-06, + "loss": 0.8285, + "step": 4643 + }, + { + "epoch": 0.2503773991805046, + "grad_norm": 0.7892036437988281, + "learning_rate": 9.907481230050065e-06, + "loss": 0.7795, + "step": 4644 + }, + { + "epoch": 0.25043131334914814, + "grad_norm": 0.8281320333480835, + "learning_rate": 9.907440630399003e-06, + "loss": 0.8106, + "step": 4645 + }, + { + "epoch": 0.2504852275177917, + "grad_norm": 0.7743760943412781, + "learning_rate": 9.907400021925022e-06, + "loss": 0.8023, + "step": 4646 + }, + { + "epoch": 0.2505391416864352, + 
"grad_norm": 0.7882426977157593, + "learning_rate": 9.9073594046282e-06, + "loss": 0.7913, + "step": 4647 + }, + { + "epoch": 0.25059305585507874, + "grad_norm": 0.7276794910430908, + "learning_rate": 9.907318778508607e-06, + "loss": 0.7044, + "step": 4648 + }, + { + "epoch": 0.2506469700237222, + "grad_norm": 0.7869488596916199, + "learning_rate": 9.907278143566317e-06, + "loss": 0.8278, + "step": 4649 + }, + { + "epoch": 0.25070088419236575, + "grad_norm": 0.8069205284118652, + "learning_rate": 9.907237499801403e-06, + "loss": 0.7968, + "step": 4650 + }, + { + "epoch": 0.2507547983610093, + "grad_norm": 0.7453712224960327, + "learning_rate": 9.907196847213938e-06, + "loss": 0.7703, + "step": 4651 + }, + { + "epoch": 0.2508087125296528, + "grad_norm": 0.7574083209037781, + "learning_rate": 9.907156185803994e-06, + "loss": 0.7364, + "step": 4652 + }, + { + "epoch": 0.2508626266982963, + "grad_norm": 0.7393423318862915, + "learning_rate": 9.907115515571643e-06, + "loss": 0.7262, + "step": 4653 + }, + { + "epoch": 0.2509165408669398, + "grad_norm": 0.6861773133277893, + "learning_rate": 9.907074836516963e-06, + "loss": 0.6719, + "step": 4654 + }, + { + "epoch": 0.25097045503558335, + "grad_norm": 0.7770050764083862, + "learning_rate": 9.907034148640025e-06, + "loss": 0.833, + "step": 4655 + }, + { + "epoch": 0.2510243692042269, + "grad_norm": 0.8121877312660217, + "learning_rate": 9.9069934519409e-06, + "loss": 0.7668, + "step": 4656 + }, + { + "epoch": 0.2510782833728704, + "grad_norm": 0.7469497919082642, + "learning_rate": 9.906952746419662e-06, + "loss": 0.7414, + "step": 4657 + }, + { + "epoch": 0.2511321975415139, + "grad_norm": 0.7283838391304016, + "learning_rate": 9.906912032076385e-06, + "loss": 0.7748, + "step": 4658 + }, + { + "epoch": 0.2511861117101574, + "grad_norm": 0.7288998365402222, + "learning_rate": 9.906871308911143e-06, + "loss": 0.7462, + "step": 4659 + }, + { + "epoch": 0.25124002587880095, + "grad_norm": 0.7184773087501526, + 
"learning_rate": 9.906830576924007e-06, + "loss": 0.7055, + "step": 4660 + }, + { + "epoch": 0.2512939400474445, + "grad_norm": 0.7292659878730774, + "learning_rate": 9.906789836115051e-06, + "loss": 0.7817, + "step": 4661 + }, + { + "epoch": 0.25134785421608796, + "grad_norm": 0.8918725848197937, + "learning_rate": 9.906749086484351e-06, + "loss": 0.8216, + "step": 4662 + }, + { + "epoch": 0.2514017683847315, + "grad_norm": 0.8097497224807739, + "learning_rate": 9.906708328031977e-06, + "loss": 0.7242, + "step": 4663 + }, + { + "epoch": 0.251455682553375, + "grad_norm": 0.7008753418922424, + "learning_rate": 9.906667560758003e-06, + "loss": 0.6947, + "step": 4664 + }, + { + "epoch": 0.25150959672201856, + "grad_norm": 0.7514529228210449, + "learning_rate": 9.906626784662502e-06, + "loss": 0.7933, + "step": 4665 + }, + { + "epoch": 0.2515635108906621, + "grad_norm": 0.992230236530304, + "learning_rate": 9.906585999745547e-06, + "loss": 0.6778, + "step": 4666 + }, + { + "epoch": 0.25161742505930557, + "grad_norm": 0.6534571051597595, + "learning_rate": 9.906545206007214e-06, + "loss": 0.7024, + "step": 4667 + }, + { + "epoch": 0.2516713392279491, + "grad_norm": 0.7981176376342773, + "learning_rate": 9.906504403447573e-06, + "loss": 0.7208, + "step": 4668 + }, + { + "epoch": 0.25172525339659263, + "grad_norm": 0.7560659646987915, + "learning_rate": 9.906463592066699e-06, + "loss": 0.7456, + "step": 4669 + }, + { + "epoch": 0.25177916756523616, + "grad_norm": 1.1304062604904175, + "learning_rate": 9.906422771864666e-06, + "loss": 0.8277, + "step": 4670 + }, + { + "epoch": 0.25183308173387964, + "grad_norm": 0.7330453395843506, + "learning_rate": 9.906381942841546e-06, + "loss": 0.7689, + "step": 4671 + }, + { + "epoch": 0.25188699590252317, + "grad_norm": 0.7832298278808594, + "learning_rate": 9.906341104997412e-06, + "loss": 0.7738, + "step": 4672 + }, + { + "epoch": 0.2519409100711667, + "grad_norm": 0.8089982271194458, + "learning_rate": 9.90630025833234e-06, + 
"loss": 0.8649, + "step": 4673 + }, + { + "epoch": 0.25199482423981023, + "grad_norm": 0.7778360247612, + "learning_rate": 9.906259402846401e-06, + "loss": 0.8055, + "step": 4674 + }, + { + "epoch": 0.25204873840845377, + "grad_norm": 0.7771027684211731, + "learning_rate": 9.906218538539671e-06, + "loss": 0.6705, + "step": 4675 + }, + { + "epoch": 0.25210265257709724, + "grad_norm": 0.9576727151870728, + "learning_rate": 9.90617766541222e-06, + "loss": 0.8363, + "step": 4676 + }, + { + "epoch": 0.2521565667457408, + "grad_norm": 0.7581680417060852, + "learning_rate": 9.906136783464124e-06, + "loss": 0.778, + "step": 4677 + }, + { + "epoch": 0.2522104809143843, + "grad_norm": 0.8484781384468079, + "learning_rate": 9.906095892695455e-06, + "loss": 0.692, + "step": 4678 + }, + { + "epoch": 0.25226439508302784, + "grad_norm": 0.8313053846359253, + "learning_rate": 9.906054993106289e-06, + "loss": 0.8329, + "step": 4679 + }, + { + "epoch": 0.2523183092516713, + "grad_norm": 0.8454006314277649, + "learning_rate": 9.906014084696696e-06, + "loss": 0.8002, + "step": 4680 + }, + { + "epoch": 0.25237222342031485, + "grad_norm": 0.7415658235549927, + "learning_rate": 9.905973167466751e-06, + "loss": 0.7369, + "step": 4681 + }, + { + "epoch": 0.2524261375889584, + "grad_norm": 1.1871880292892456, + "learning_rate": 9.90593224141653e-06, + "loss": 0.8321, + "step": 4682 + }, + { + "epoch": 0.2524800517576019, + "grad_norm": 0.7169525623321533, + "learning_rate": 9.905891306546102e-06, + "loss": 0.7746, + "step": 4683 + }, + { + "epoch": 0.25253396592624544, + "grad_norm": 0.9533750414848328, + "learning_rate": 9.905850362855544e-06, + "loss": 0.7785, + "step": 4684 + }, + { + "epoch": 0.2525878800948889, + "grad_norm": 0.7524462342262268, + "learning_rate": 9.90580941034493e-06, + "loss": 0.7905, + "step": 4685 + }, + { + "epoch": 0.25264179426353245, + "grad_norm": 0.7788832187652588, + "learning_rate": 9.905768449014332e-06, + "loss": 0.8351, + "step": 4686 + }, + { + "epoch": 
0.252695708432176, + "grad_norm": 0.7439721822738647, + "learning_rate": 9.905727478863823e-06, + "loss": 0.8131, + "step": 4687 + }, + { + "epoch": 0.2527496226008195, + "grad_norm": 0.7753449082374573, + "learning_rate": 9.90568649989348e-06, + "loss": 0.8662, + "step": 4688 + }, + { + "epoch": 0.252803536769463, + "grad_norm": 0.7604972124099731, + "learning_rate": 9.90564551210337e-06, + "loss": 0.7539, + "step": 4689 + }, + { + "epoch": 0.2528574509381065, + "grad_norm": 0.7789442539215088, + "learning_rate": 9.905604515493574e-06, + "loss": 0.7488, + "step": 4690 + }, + { + "epoch": 0.25291136510675005, + "grad_norm": 0.7509225010871887, + "learning_rate": 9.905563510064162e-06, + "loss": 0.7889, + "step": 4691 + }, + { + "epoch": 0.2529652792753936, + "grad_norm": 0.7840915322303772, + "learning_rate": 9.905522495815208e-06, + "loss": 0.8808, + "step": 4692 + }, + { + "epoch": 0.2530191934440371, + "grad_norm": 0.7814779877662659, + "learning_rate": 9.905481472746787e-06, + "loss": 0.784, + "step": 4693 + }, + { + "epoch": 0.2530731076126806, + "grad_norm": 1.052604079246521, + "learning_rate": 9.905440440858973e-06, + "loss": 0.9627, + "step": 4694 + }, + { + "epoch": 0.2531270217813241, + "grad_norm": 0.8607435822486877, + "learning_rate": 9.905399400151836e-06, + "loss": 0.7937, + "step": 4695 + }, + { + "epoch": 0.25318093594996766, + "grad_norm": 0.7610926628112793, + "learning_rate": 9.905358350625453e-06, + "loss": 0.828, + "step": 4696 + }, + { + "epoch": 0.2532348501186112, + "grad_norm": 0.7309452295303345, + "learning_rate": 9.905317292279899e-06, + "loss": 0.7337, + "step": 4697 + }, + { + "epoch": 0.25328876428725466, + "grad_norm": 0.9021269083023071, + "learning_rate": 9.905276225115246e-06, + "loss": 0.7447, + "step": 4698 + }, + { + "epoch": 0.2533426784558982, + "grad_norm": 0.8152287006378174, + "learning_rate": 9.905235149131565e-06, + "loss": 0.7322, + "step": 4699 + }, + { + "epoch": 0.2533965926245417, + "grad_norm": 
0.8354026675224304, + "learning_rate": 9.905194064328935e-06, + "loss": 0.801, + "step": 4700 + }, + { + "epoch": 0.25345050679318526, + "grad_norm": 0.7649407982826233, + "learning_rate": 9.905152970707428e-06, + "loss": 0.8091, + "step": 4701 + }, + { + "epoch": 0.2535044209618288, + "grad_norm": 0.8044828176498413, + "learning_rate": 9.905111868267116e-06, + "loss": 0.8391, + "step": 4702 + }, + { + "epoch": 0.25355833513047227, + "grad_norm": 0.8590373992919922, + "learning_rate": 9.905070757008076e-06, + "loss": 0.8373, + "step": 4703 + }, + { + "epoch": 0.2536122492991158, + "grad_norm": 0.7771210670471191, + "learning_rate": 9.90502963693038e-06, + "loss": 0.7158, + "step": 4704 + }, + { + "epoch": 0.25366616346775933, + "grad_norm": 1.00150728225708, + "learning_rate": 9.904988508034102e-06, + "loss": 0.7809, + "step": 4705 + }, + { + "epoch": 0.25372007763640286, + "grad_norm": 0.7746372222900391, + "learning_rate": 9.904947370319316e-06, + "loss": 0.8337, + "step": 4706 + }, + { + "epoch": 0.25377399180504634, + "grad_norm": 0.796157717704773, + "learning_rate": 9.904906223786097e-06, + "loss": 0.7119, + "step": 4707 + }, + { + "epoch": 0.25382790597368987, + "grad_norm": 0.7384063601493835, + "learning_rate": 9.904865068434517e-06, + "loss": 0.7639, + "step": 4708 + }, + { + "epoch": 0.2538818201423334, + "grad_norm": 0.7987060546875, + "learning_rate": 9.904823904264651e-06, + "loss": 0.777, + "step": 4709 + }, + { + "epoch": 0.25393573431097693, + "grad_norm": 0.7243106365203857, + "learning_rate": 9.904782731276574e-06, + "loss": 0.796, + "step": 4710 + }, + { + "epoch": 0.25398964847962047, + "grad_norm": 0.9222633242607117, + "learning_rate": 9.904741549470358e-06, + "loss": 0.967, + "step": 4711 + }, + { + "epoch": 0.25404356264826394, + "grad_norm": 1.1910635232925415, + "learning_rate": 9.90470035884608e-06, + "loss": 0.8376, + "step": 4712 + }, + { + "epoch": 0.2540974768169075, + "grad_norm": 0.7156771421432495, + "learning_rate": 
9.904659159403811e-06, + "loss": 0.7112, + "step": 4713 + }, + { + "epoch": 0.254151390985551, + "grad_norm": 0.7093952894210815, + "learning_rate": 9.904617951143627e-06, + "loss": 0.7948, + "step": 4714 + }, + { + "epoch": 0.25420530515419454, + "grad_norm": 0.7801835536956787, + "learning_rate": 9.9045767340656e-06, + "loss": 0.6716, + "step": 4715 + }, + { + "epoch": 0.254259219322838, + "grad_norm": 0.7523871660232544, + "learning_rate": 9.904535508169807e-06, + "loss": 0.7212, + "step": 4716 + }, + { + "epoch": 0.25431313349148155, + "grad_norm": 0.7119418978691101, + "learning_rate": 9.90449427345632e-06, + "loss": 0.6655, + "step": 4717 + }, + { + "epoch": 0.2543670476601251, + "grad_norm": 0.7273330092430115, + "learning_rate": 9.904453029925214e-06, + "loss": 0.7697, + "step": 4718 + }, + { + "epoch": 0.2544209618287686, + "grad_norm": 0.9550130367279053, + "learning_rate": 9.904411777576564e-06, + "loss": 0.7441, + "step": 4719 + }, + { + "epoch": 0.25447487599741214, + "grad_norm": 0.7486676573753357, + "learning_rate": 9.90437051641044e-06, + "loss": 0.8171, + "step": 4720 + }, + { + "epoch": 0.2545287901660556, + "grad_norm": 0.8107298612594604, + "learning_rate": 9.904329246426923e-06, + "loss": 0.7311, + "step": 4721 + }, + { + "epoch": 0.25458270433469915, + "grad_norm": 0.679837167263031, + "learning_rate": 9.90428796762608e-06, + "loss": 0.7797, + "step": 4722 + }, + { + "epoch": 0.2546366185033427, + "grad_norm": 0.8209143877029419, + "learning_rate": 9.904246680007993e-06, + "loss": 0.8244, + "step": 4723 + }, + { + "epoch": 0.2546905326719862, + "grad_norm": 0.7561433911323547, + "learning_rate": 9.904205383572727e-06, + "loss": 0.86, + "step": 4724 + }, + { + "epoch": 0.2547444468406297, + "grad_norm": 0.8573929071426392, + "learning_rate": 9.904164078320363e-06, + "loss": 0.802, + "step": 4725 + }, + { + "epoch": 0.2547983610092732, + "grad_norm": 0.8191418051719666, + "learning_rate": 9.904122764250975e-06, + "loss": 0.8121, + "step": 4726 
+ }, + { + "epoch": 0.25485227517791675, + "grad_norm": 0.8703283667564392, + "learning_rate": 9.904081441364635e-06, + "loss": 0.7535, + "step": 4727 + }, + { + "epoch": 0.2549061893465603, + "grad_norm": 0.8311215043067932, + "learning_rate": 9.904040109661417e-06, + "loss": 0.8085, + "step": 4728 + }, + { + "epoch": 0.2549601035152038, + "grad_norm": 0.754145622253418, + "learning_rate": 9.903998769141397e-06, + "loss": 0.7151, + "step": 4729 + }, + { + "epoch": 0.2550140176838473, + "grad_norm": 0.6936500668525696, + "learning_rate": 9.903957419804648e-06, + "loss": 0.6923, + "step": 4730 + }, + { + "epoch": 0.2550679318524908, + "grad_norm": 0.7825912237167358, + "learning_rate": 9.903916061651245e-06, + "loss": 0.7544, + "step": 4731 + }, + { + "epoch": 0.25512184602113436, + "grad_norm": 0.8273274898529053, + "learning_rate": 9.903874694681264e-06, + "loss": 0.8099, + "step": 4732 + }, + { + "epoch": 0.2551757601897779, + "grad_norm": 0.7580922842025757, + "learning_rate": 9.903833318894776e-06, + "loss": 0.7341, + "step": 4733 + }, + { + "epoch": 0.25522967435842137, + "grad_norm": 0.78266441822052, + "learning_rate": 9.903791934291856e-06, + "loss": 0.8152, + "step": 4734 + }, + { + "epoch": 0.2552835885270649, + "grad_norm": 0.7813715934753418, + "learning_rate": 9.903750540872582e-06, + "loss": 0.7859, + "step": 4735 + }, + { + "epoch": 0.25533750269570843, + "grad_norm": 0.9143809080123901, + "learning_rate": 9.903709138637024e-06, + "loss": 0.8131, + "step": 4736 + }, + { + "epoch": 0.25539141686435196, + "grad_norm": 0.7642751932144165, + "learning_rate": 9.90366772758526e-06, + "loss": 0.6977, + "step": 4737 + }, + { + "epoch": 0.2554453310329955, + "grad_norm": 0.7250218391418457, + "learning_rate": 9.903626307717362e-06, + "loss": 0.7843, + "step": 4738 + }, + { + "epoch": 0.25549924520163897, + "grad_norm": 0.7458422780036926, + "learning_rate": 9.903584879033404e-06, + "loss": 0.7054, + "step": 4739 + }, + { + "epoch": 0.2555531593702825, + 
"grad_norm": 0.7256683707237244, + "learning_rate": 9.903543441533463e-06, + "loss": 0.7024, + "step": 4740 + }, + { + "epoch": 0.25560707353892603, + "grad_norm": 0.8649191856384277, + "learning_rate": 9.903501995217613e-06, + "loss": 0.7044, + "step": 4741 + }, + { + "epoch": 0.25566098770756956, + "grad_norm": 0.8472884297370911, + "learning_rate": 9.903460540085927e-06, + "loss": 0.722, + "step": 4742 + }, + { + "epoch": 0.2557149018762131, + "grad_norm": 0.7086893916130066, + "learning_rate": 9.90341907613848e-06, + "loss": 0.6744, + "step": 4743 + }, + { + "epoch": 0.2557688160448566, + "grad_norm": 0.9324516654014587, + "learning_rate": 9.903377603375346e-06, + "loss": 0.8765, + "step": 4744 + }, + { + "epoch": 0.2558227302135001, + "grad_norm": 0.8247219324111938, + "learning_rate": 9.903336121796601e-06, + "loss": 0.7104, + "step": 4745 + }, + { + "epoch": 0.25587664438214364, + "grad_norm": 0.7695756554603577, + "learning_rate": 9.90329463140232e-06, + "loss": 0.8399, + "step": 4746 + }, + { + "epoch": 0.25593055855078717, + "grad_norm": 0.8179047107696533, + "learning_rate": 9.903253132192577e-06, + "loss": 0.7531, + "step": 4747 + }, + { + "epoch": 0.25598447271943064, + "grad_norm": 0.7995123863220215, + "learning_rate": 9.903211624167444e-06, + "loss": 0.8248, + "step": 4748 + }, + { + "epoch": 0.2560383868880742, + "grad_norm": 0.7822200059890747, + "learning_rate": 9.903170107326997e-06, + "loss": 0.7224, + "step": 4749 + }, + { + "epoch": 0.2560923010567177, + "grad_norm": 0.9561625123023987, + "learning_rate": 9.903128581671315e-06, + "loss": 0.7307, + "step": 4750 + }, + { + "epoch": 0.25614621522536124, + "grad_norm": 0.8102663159370422, + "learning_rate": 9.903087047200468e-06, + "loss": 0.7958, + "step": 4751 + }, + { + "epoch": 0.25620012939400477, + "grad_norm": 0.8910477757453918, + "learning_rate": 9.90304550391453e-06, + "loss": 0.8577, + "step": 4752 + }, + { + "epoch": 0.25625404356264825, + "grad_norm": 0.9179983139038086, + 
"learning_rate": 9.903003951813579e-06, + "loss": 0.854, + "step": 4753 + }, + { + "epoch": 0.2563079577312918, + "grad_norm": 0.8993476629257202, + "learning_rate": 9.902962390897688e-06, + "loss": 0.7274, + "step": 4754 + }, + { + "epoch": 0.2563618718999353, + "grad_norm": 0.7873006463050842, + "learning_rate": 9.902920821166932e-06, + "loss": 0.7421, + "step": 4755 + }, + { + "epoch": 0.25641578606857884, + "grad_norm": 0.8410146236419678, + "learning_rate": 9.902879242621385e-06, + "loss": 0.852, + "step": 4756 + }, + { + "epoch": 0.2564697002372223, + "grad_norm": 0.7734405994415283, + "learning_rate": 9.902837655261123e-06, + "loss": 0.7485, + "step": 4757 + }, + { + "epoch": 0.25652361440586585, + "grad_norm": 0.7456048130989075, + "learning_rate": 9.90279605908622e-06, + "loss": 0.7223, + "step": 4758 + }, + { + "epoch": 0.2565775285745094, + "grad_norm": 0.8257940411567688, + "learning_rate": 9.90275445409675e-06, + "loss": 0.8324, + "step": 4759 + }, + { + "epoch": 0.2566314427431529, + "grad_norm": 0.9504823088645935, + "learning_rate": 9.90271284029279e-06, + "loss": 0.8872, + "step": 4760 + }, + { + "epoch": 0.25668535691179645, + "grad_norm": 0.7958370447158813, + "learning_rate": 9.902671217674413e-06, + "loss": 0.7605, + "step": 4761 + }, + { + "epoch": 0.2567392710804399, + "grad_norm": 0.7735753059387207, + "learning_rate": 9.902629586241694e-06, + "loss": 0.7682, + "step": 4762 + }, + { + "epoch": 0.25679318524908346, + "grad_norm": 0.8063069581985474, + "learning_rate": 9.902587945994709e-06, + "loss": 0.8126, + "step": 4763 + }, + { + "epoch": 0.256847099417727, + "grad_norm": 0.7964012026786804, + "learning_rate": 9.902546296933532e-06, + "loss": 0.7487, + "step": 4764 + }, + { + "epoch": 0.2569010135863705, + "grad_norm": 0.7407160997390747, + "learning_rate": 9.902504639058237e-06, + "loss": 0.8495, + "step": 4765 + }, + { + "epoch": 0.256954927755014, + "grad_norm": 0.7712891697883606, + "learning_rate": 9.9024629723689e-06, + "loss": 
0.71, + "step": 4766 + }, + { + "epoch": 0.2570088419236575, + "grad_norm": 0.708794355392456, + "learning_rate": 9.902421296865596e-06, + "loss": 0.8264, + "step": 4767 + }, + { + "epoch": 0.25706275609230106, + "grad_norm": 0.7903236746788025, + "learning_rate": 9.902379612548401e-06, + "loss": 0.7409, + "step": 4768 + }, + { + "epoch": 0.2571166702609446, + "grad_norm": 0.8012224435806274, + "learning_rate": 9.902337919417387e-06, + "loss": 0.8192, + "step": 4769 + }, + { + "epoch": 0.2571705844295881, + "grad_norm": 0.7412340044975281, + "learning_rate": 9.902296217472632e-06, + "loss": 0.7908, + "step": 4770 + }, + { + "epoch": 0.2572244985982316, + "grad_norm": 0.7860136032104492, + "learning_rate": 9.902254506714209e-06, + "loss": 0.8757, + "step": 4771 + }, + { + "epoch": 0.25727841276687513, + "grad_norm": 0.7804144620895386, + "learning_rate": 9.902212787142193e-06, + "loss": 0.7549, + "step": 4772 + }, + { + "epoch": 0.25733232693551866, + "grad_norm": 0.809959888458252, + "learning_rate": 9.90217105875666e-06, + "loss": 0.8345, + "step": 4773 + }, + { + "epoch": 0.2573862411041622, + "grad_norm": 0.7853354811668396, + "learning_rate": 9.902129321557685e-06, + "loss": 0.8185, + "step": 4774 + }, + { + "epoch": 0.25744015527280567, + "grad_norm": 0.7500307559967041, + "learning_rate": 9.902087575545341e-06, + "loss": 0.7888, + "step": 4775 + }, + { + "epoch": 0.2574940694414492, + "grad_norm": 0.7578644752502441, + "learning_rate": 9.902045820719705e-06, + "loss": 0.7489, + "step": 4776 + }, + { + "epoch": 0.25754798361009273, + "grad_norm": 0.8096863627433777, + "learning_rate": 9.902004057080854e-06, + "loss": 0.7409, + "step": 4777 + }, + { + "epoch": 0.25760189777873627, + "grad_norm": 0.837684154510498, + "learning_rate": 9.90196228462886e-06, + "loss": 0.8483, + "step": 4778 + }, + { + "epoch": 0.2576558119473798, + "grad_norm": 0.7905386686325073, + "learning_rate": 9.901920503363798e-06, + "loss": 0.7641, + "step": 4779 + }, + { + "epoch": 
0.2577097261160233, + "grad_norm": 0.750465452671051, + "learning_rate": 9.901878713285744e-06, + "loss": 0.7286, + "step": 4780 + }, + { + "epoch": 0.2577636402846668, + "grad_norm": 0.8911929726600647, + "learning_rate": 9.901836914394773e-06, + "loss": 0.9407, + "step": 4781 + }, + { + "epoch": 0.25781755445331034, + "grad_norm": 0.7831119894981384, + "learning_rate": 9.90179510669096e-06, + "loss": 0.7239, + "step": 4782 + }, + { + "epoch": 0.25787146862195387, + "grad_norm": 0.7694600820541382, + "learning_rate": 9.901753290174382e-06, + "loss": 0.8146, + "step": 4783 + }, + { + "epoch": 0.25792538279059735, + "grad_norm": 0.8094425797462463, + "learning_rate": 9.901711464845114e-06, + "loss": 0.8349, + "step": 4784 + }, + { + "epoch": 0.2579792969592409, + "grad_norm": 0.8766717314720154, + "learning_rate": 9.901669630703229e-06, + "loss": 0.8034, + "step": 4785 + }, + { + "epoch": 0.2580332111278844, + "grad_norm": 0.7051625847816467, + "learning_rate": 9.9016277877488e-06, + "loss": 0.7393, + "step": 4786 + }, + { + "epoch": 0.25808712529652794, + "grad_norm": 0.8611576557159424, + "learning_rate": 9.901585935981907e-06, + "loss": 0.8226, + "step": 4787 + }, + { + "epoch": 0.2581410394651715, + "grad_norm": 0.767514705657959, + "learning_rate": 9.901544075402624e-06, + "loss": 0.7877, + "step": 4788 + }, + { + "epoch": 0.25819495363381495, + "grad_norm": 0.7997928857803345, + "learning_rate": 9.901502206011027e-06, + "loss": 0.8712, + "step": 4789 + }, + { + "epoch": 0.2582488678024585, + "grad_norm": 0.9323418736457825, + "learning_rate": 9.901460327807189e-06, + "loss": 0.823, + "step": 4790 + }, + { + "epoch": 0.258302781971102, + "grad_norm": 0.8389249444007874, + "learning_rate": 9.901418440791186e-06, + "loss": 0.8592, + "step": 4791 + }, + { + "epoch": 0.25835669613974555, + "grad_norm": 0.6641879677772522, + "learning_rate": 9.901376544963094e-06, + "loss": 0.6147, + "step": 4792 + }, + { + "epoch": 0.258410610308389, + "grad_norm": 
0.8162431716918945, + "learning_rate": 9.901334640322989e-06, + "loss": 0.8057, + "step": 4793 + }, + { + "epoch": 0.25846452447703255, + "grad_norm": 0.7615718841552734, + "learning_rate": 9.901292726870943e-06, + "loss": 0.8446, + "step": 4794 + }, + { + "epoch": 0.2585184386456761, + "grad_norm": 0.764523983001709, + "learning_rate": 9.901250804607037e-06, + "loss": 0.8061, + "step": 4795 + }, + { + "epoch": 0.2585723528143196, + "grad_norm": 0.8213503360748291, + "learning_rate": 9.901208873531341e-06, + "loss": 0.7875, + "step": 4796 + }, + { + "epoch": 0.25862626698296315, + "grad_norm": 1.050784945487976, + "learning_rate": 9.901166933643933e-06, + "loss": 0.8412, + "step": 4797 + }, + { + "epoch": 0.2586801811516066, + "grad_norm": 0.7617695927619934, + "learning_rate": 9.901124984944886e-06, + "loss": 0.7946, + "step": 4798 + }, + { + "epoch": 0.25873409532025016, + "grad_norm": 0.8027677536010742, + "learning_rate": 9.90108302743428e-06, + "loss": 0.7967, + "step": 4799 + }, + { + "epoch": 0.2587880094888937, + "grad_norm": 0.7340978384017944, + "learning_rate": 9.901041061112186e-06, + "loss": 0.7771, + "step": 4800 + }, + { + "epoch": 0.2588419236575372, + "grad_norm": 0.7108075618743896, + "learning_rate": 9.900999085978682e-06, + "loss": 0.7182, + "step": 4801 + }, + { + "epoch": 0.2588958378261807, + "grad_norm": 0.8320378661155701, + "learning_rate": 9.90095710203384e-06, + "loss": 0.8903, + "step": 4802 + }, + { + "epoch": 0.25894975199482423, + "grad_norm": 0.7735534310340881, + "learning_rate": 9.900915109277743e-06, + "loss": 0.8604, + "step": 4803 + }, + { + "epoch": 0.25900366616346776, + "grad_norm": 0.9205079078674316, + "learning_rate": 9.900873107710458e-06, + "loss": 0.8451, + "step": 4804 + }, + { + "epoch": 0.2590575803321113, + "grad_norm": 0.8668771386146545, + "learning_rate": 9.900831097332066e-06, + "loss": 0.8566, + "step": 4805 + }, + { + "epoch": 0.2591114945007548, + "grad_norm": 0.8134620785713196, + "learning_rate": 
9.90078907814264e-06, + "loss": 0.7842, + "step": 4806 + }, + { + "epoch": 0.2591654086693983, + "grad_norm": 0.8436452746391296, + "learning_rate": 9.900747050142257e-06, + "loss": 0.8673, + "step": 4807 + }, + { + "epoch": 0.25921932283804183, + "grad_norm": 0.877737283706665, + "learning_rate": 9.90070501333099e-06, + "loss": 0.8668, + "step": 4808 + }, + { + "epoch": 0.25927323700668536, + "grad_norm": 0.7415887713432312, + "learning_rate": 9.900662967708917e-06, + "loss": 0.7148, + "step": 4809 + }, + { + "epoch": 0.2593271511753289, + "grad_norm": 0.6708645820617676, + "learning_rate": 9.900620913276114e-06, + "loss": 0.6428, + "step": 4810 + }, + { + "epoch": 0.2593810653439724, + "grad_norm": 0.7553024888038635, + "learning_rate": 9.900578850032655e-06, + "loss": 0.7812, + "step": 4811 + }, + { + "epoch": 0.2594349795126159, + "grad_norm": 0.7633180022239685, + "learning_rate": 9.900536777978615e-06, + "loss": 0.7481, + "step": 4812 + }, + { + "epoch": 0.25948889368125944, + "grad_norm": 0.8034750819206238, + "learning_rate": 9.900494697114072e-06, + "loss": 0.7981, + "step": 4813 + }, + { + "epoch": 0.25954280784990297, + "grad_norm": 0.7227773070335388, + "learning_rate": 9.9004526074391e-06, + "loss": 0.7233, + "step": 4814 + }, + { + "epoch": 0.2595967220185465, + "grad_norm": 0.8364164233207703, + "learning_rate": 9.900410508953775e-06, + "loss": 0.8677, + "step": 4815 + }, + { + "epoch": 0.25965063618719, + "grad_norm": 0.7321234941482544, + "learning_rate": 9.900368401658174e-06, + "loss": 0.6652, + "step": 4816 + }, + { + "epoch": 0.2597045503558335, + "grad_norm": 0.7887052893638611, + "learning_rate": 9.90032628555237e-06, + "loss": 0.8696, + "step": 4817 + }, + { + "epoch": 0.25975846452447704, + "grad_norm": 0.7807821035385132, + "learning_rate": 9.900284160636441e-06, + "loss": 0.8359, + "step": 4818 + }, + { + "epoch": 0.25981237869312057, + "grad_norm": 0.8123578429222107, + "learning_rate": 9.900242026910462e-06, + "loss": 0.8893, + "step": 
4819 + }, + { + "epoch": 0.25986629286176405, + "grad_norm": 0.7520090937614441, + "learning_rate": 9.900199884374508e-06, + "loss": 0.8037, + "step": 4820 + }, + { + "epoch": 0.2599202070304076, + "grad_norm": 0.8489886522293091, + "learning_rate": 9.900157733028656e-06, + "loss": 0.8827, + "step": 4821 + }, + { + "epoch": 0.2599741211990511, + "grad_norm": 0.8435912132263184, + "learning_rate": 9.900115572872981e-06, + "loss": 0.7468, + "step": 4822 + }, + { + "epoch": 0.26002803536769464, + "grad_norm": 0.7331469655036926, + "learning_rate": 9.90007340390756e-06, + "loss": 0.7421, + "step": 4823 + }, + { + "epoch": 0.2600819495363382, + "grad_norm": 0.8015231490135193, + "learning_rate": 9.900031226132469e-06, + "loss": 0.8709, + "step": 4824 + }, + { + "epoch": 0.26013586370498165, + "grad_norm": 0.8771700263023376, + "learning_rate": 9.89998903954778e-06, + "loss": 0.8313, + "step": 4825 + }, + { + "epoch": 0.2601897778736252, + "grad_norm": 0.752811074256897, + "learning_rate": 9.899946844153573e-06, + "loss": 0.7887, + "step": 4826 + }, + { + "epoch": 0.2602436920422687, + "grad_norm": 0.7526640295982361, + "learning_rate": 9.899904639949921e-06, + "loss": 0.8189, + "step": 4827 + }, + { + "epoch": 0.26029760621091225, + "grad_norm": 0.8185133337974548, + "learning_rate": 9.899862426936904e-06, + "loss": 0.8426, + "step": 4828 + }, + { + "epoch": 0.2603515203795557, + "grad_norm": 0.6737107038497925, + "learning_rate": 9.899820205114593e-06, + "loss": 0.7235, + "step": 4829 + }, + { + "epoch": 0.26040543454819925, + "grad_norm": 0.880402147769928, + "learning_rate": 9.899777974483068e-06, + "loss": 0.9, + "step": 4830 + }, + { + "epoch": 0.2604593487168428, + "grad_norm": 0.8077740669250488, + "learning_rate": 9.8997357350424e-06, + "loss": 0.7653, + "step": 4831 + }, + { + "epoch": 0.2605132628854863, + "grad_norm": 0.9043613076210022, + "learning_rate": 9.89969348679267e-06, + "loss": 0.7489, + "step": 4832 + }, + { + "epoch": 0.26056717705412985, + 
"grad_norm": 0.7480129599571228, + "learning_rate": 9.899651229733952e-06, + "loss": 0.804, + "step": 4833 + }, + { + "epoch": 0.2606210912227733, + "grad_norm": 0.8027556538581848, + "learning_rate": 9.899608963866322e-06, + "loss": 0.7484, + "step": 4834 + }, + { + "epoch": 0.26067500539141686, + "grad_norm": 0.7745609283447266, + "learning_rate": 9.899566689189855e-06, + "loss": 0.737, + "step": 4835 + }, + { + "epoch": 0.2607289195600604, + "grad_norm": 0.8240119218826294, + "learning_rate": 9.899524405704627e-06, + "loss": 0.861, + "step": 4836 + }, + { + "epoch": 0.2607828337287039, + "grad_norm": 0.7260393500328064, + "learning_rate": 9.899482113410718e-06, + "loss": 0.812, + "step": 4837 + }, + { + "epoch": 0.2608367478973474, + "grad_norm": 0.7049936652183533, + "learning_rate": 9.899439812308198e-06, + "loss": 0.7422, + "step": 4838 + }, + { + "epoch": 0.26089066206599093, + "grad_norm": 0.802170991897583, + "learning_rate": 9.899397502397148e-06, + "loss": 0.7852, + "step": 4839 + }, + { + "epoch": 0.26094457623463446, + "grad_norm": 0.7912299633026123, + "learning_rate": 9.899355183677642e-06, + "loss": 0.8151, + "step": 4840 + }, + { + "epoch": 0.260998490403278, + "grad_norm": 0.7643092274665833, + "learning_rate": 9.899312856149756e-06, + "loss": 0.7903, + "step": 4841 + }, + { + "epoch": 0.2610524045719215, + "grad_norm": 0.7583617568016052, + "learning_rate": 9.899270519813564e-06, + "loss": 0.8403, + "step": 4842 + }, + { + "epoch": 0.261106318740565, + "grad_norm": 0.8232578635215759, + "learning_rate": 9.899228174669146e-06, + "loss": 0.7994, + "step": 4843 + }, + { + "epoch": 0.26116023290920853, + "grad_norm": 0.7829787731170654, + "learning_rate": 9.899185820716576e-06, + "loss": 0.7586, + "step": 4844 + }, + { + "epoch": 0.26121414707785207, + "grad_norm": 0.8476693630218506, + "learning_rate": 9.899143457955933e-06, + "loss": 0.7687, + "step": 4845 + }, + { + "epoch": 0.2612680612464956, + "grad_norm": 0.7025540471076965, + "learning_rate": 
9.899101086387289e-06, + "loss": 0.7326, + "step": 4846 + }, + { + "epoch": 0.2613219754151391, + "grad_norm": 0.727745532989502, + "learning_rate": 9.899058706010723e-06, + "loss": 0.7813, + "step": 4847 + }, + { + "epoch": 0.2613758895837826, + "grad_norm": 0.7706053853034973, + "learning_rate": 9.89901631682631e-06, + "loss": 0.8251, + "step": 4848 + }, + { + "epoch": 0.26142980375242614, + "grad_norm": 0.8354002833366394, + "learning_rate": 9.898973918834123e-06, + "loss": 0.7891, + "step": 4849 + }, + { + "epoch": 0.26148371792106967, + "grad_norm": 0.970196545124054, + "learning_rate": 9.898931512034245e-06, + "loss": 0.812, + "step": 4850 + }, + { + "epoch": 0.2615376320897132, + "grad_norm": 0.7720034718513489, + "learning_rate": 9.898889096426748e-06, + "loss": 0.7794, + "step": 4851 + }, + { + "epoch": 0.2615915462583567, + "grad_norm": 1.2140640020370483, + "learning_rate": 9.89884667201171e-06, + "loss": 0.7144, + "step": 4852 + }, + { + "epoch": 0.2616454604270002, + "grad_norm": 0.8927225470542908, + "learning_rate": 9.898804238789206e-06, + "loss": 0.7906, + "step": 4853 + }, + { + "epoch": 0.26169937459564374, + "grad_norm": 0.886418342590332, + "learning_rate": 9.898761796759312e-06, + "loss": 0.7661, + "step": 4854 + }, + { + "epoch": 0.2617532887642873, + "grad_norm": 0.8143467903137207, + "learning_rate": 9.898719345922105e-06, + "loss": 0.8139, + "step": 4855 + }, + { + "epoch": 0.26180720293293075, + "grad_norm": 0.7952978014945984, + "learning_rate": 9.898676886277662e-06, + "loss": 0.7199, + "step": 4856 + }, + { + "epoch": 0.2618611171015743, + "grad_norm": 0.7782503962516785, + "learning_rate": 9.898634417826059e-06, + "loss": 0.7104, + "step": 4857 + }, + { + "epoch": 0.2619150312702178, + "grad_norm": 0.8419458866119385, + "learning_rate": 9.898591940567371e-06, + "loss": 0.6675, + "step": 4858 + }, + { + "epoch": 0.26196894543886134, + "grad_norm": 0.8036027550697327, + "learning_rate": 9.898549454501675e-06, + "loss": 0.8304, + "step": 
4859 + }, + { + "epoch": 0.2620228596075049, + "grad_norm": 0.8537300825119019, + "learning_rate": 9.898506959629049e-06, + "loss": 0.7559, + "step": 4860 + }, + { + "epoch": 0.26207677377614835, + "grad_norm": 0.8351823687553406, + "learning_rate": 9.898464455949565e-06, + "loss": 0.755, + "step": 4861 + }, + { + "epoch": 0.2621306879447919, + "grad_norm": 0.7771688103675842, + "learning_rate": 9.898421943463307e-06, + "loss": 0.7593, + "step": 4862 + }, + { + "epoch": 0.2621846021134354, + "grad_norm": 0.923363208770752, + "learning_rate": 9.898379422170344e-06, + "loss": 0.7514, + "step": 4863 + }, + { + "epoch": 0.26223851628207895, + "grad_norm": 0.695932924747467, + "learning_rate": 9.898336892070756e-06, + "loss": 0.691, + "step": 4864 + }, + { + "epoch": 0.2622924304507224, + "grad_norm": 0.8631780743598938, + "learning_rate": 9.89829435316462e-06, + "loss": 0.8582, + "step": 4865 + }, + { + "epoch": 0.26234634461936596, + "grad_norm": 0.7588357925415039, + "learning_rate": 9.89825180545201e-06, + "loss": 0.7677, + "step": 4866 + }, + { + "epoch": 0.2624002587880095, + "grad_norm": 0.858504056930542, + "learning_rate": 9.898209248933006e-06, + "loss": 0.8136, + "step": 4867 + }, + { + "epoch": 0.262454172956653, + "grad_norm": 0.7912299633026123, + "learning_rate": 9.898166683607683e-06, + "loss": 0.7603, + "step": 4868 + }, + { + "epoch": 0.26250808712529655, + "grad_norm": 0.7564625144004822, + "learning_rate": 9.898124109476113e-06, + "loss": 0.7402, + "step": 4869 + }, + { + "epoch": 0.26256200129394003, + "grad_norm": 0.7155072689056396, + "learning_rate": 9.89808152653838e-06, + "loss": 0.7288, + "step": 4870 + }, + { + "epoch": 0.26261591546258356, + "grad_norm": 0.7694748044013977, + "learning_rate": 9.898038934794554e-06, + "loss": 0.7637, + "step": 4871 + }, + { + "epoch": 0.2626698296312271, + "grad_norm": 0.7335909605026245, + "learning_rate": 9.897996334244717e-06, + "loss": 0.6838, + "step": 4872 + }, + { + "epoch": 0.2627237437998706, + 
"grad_norm": 0.651745080947876, + "learning_rate": 9.897953724888942e-06, + "loss": 0.6384, + "step": 4873 + }, + { + "epoch": 0.2627776579685141, + "grad_norm": 0.8076156377792358, + "learning_rate": 9.897911106727307e-06, + "loss": 0.783, + "step": 4874 + }, + { + "epoch": 0.26283157213715763, + "grad_norm": 0.74184650182724, + "learning_rate": 9.897868479759888e-06, + "loss": 0.787, + "step": 4875 + }, + { + "epoch": 0.26288548630580116, + "grad_norm": 0.7538748383522034, + "learning_rate": 9.897825843986763e-06, + "loss": 0.7606, + "step": 4876 + }, + { + "epoch": 0.2629394004744447, + "grad_norm": 0.7376627922058105, + "learning_rate": 9.897783199408006e-06, + "loss": 0.8512, + "step": 4877 + }, + { + "epoch": 0.2629933146430882, + "grad_norm": 0.7860908508300781, + "learning_rate": 9.897740546023697e-06, + "loss": 0.8811, + "step": 4878 + }, + { + "epoch": 0.2630472288117317, + "grad_norm": 0.8043631911277771, + "learning_rate": 9.897697883833912e-06, + "loss": 0.8369, + "step": 4879 + }, + { + "epoch": 0.26310114298037524, + "grad_norm": 0.8448672890663147, + "learning_rate": 9.897655212838724e-06, + "loss": 0.8011, + "step": 4880 + }, + { + "epoch": 0.26315505714901877, + "grad_norm": 0.7942283749580383, + "learning_rate": 9.897612533038214e-06, + "loss": 0.8357, + "step": 4881 + }, + { + "epoch": 0.2632089713176623, + "grad_norm": 0.8033713698387146, + "learning_rate": 9.897569844432458e-06, + "loss": 0.8054, + "step": 4882 + }, + { + "epoch": 0.2632628854863058, + "grad_norm": 0.842699408531189, + "learning_rate": 9.89752714702153e-06, + "loss": 0.8016, + "step": 4883 + }, + { + "epoch": 0.2633167996549493, + "grad_norm": 0.8190520405769348, + "learning_rate": 9.89748444080551e-06, + "loss": 0.7823, + "step": 4884 + }, + { + "epoch": 0.26337071382359284, + "grad_norm": 1.5263949632644653, + "learning_rate": 9.897441725784474e-06, + "loss": 0.8822, + "step": 4885 + }, + { + "epoch": 0.26342462799223637, + "grad_norm": 0.7523469924926758, + "learning_rate": 
9.897399001958496e-06, + "loss": 0.8486, + "step": 4886 + }, + { + "epoch": 0.2634785421608799, + "grad_norm": 0.8582022190093994, + "learning_rate": 9.897356269327659e-06, + "loss": 0.8655, + "step": 4887 + }, + { + "epoch": 0.2635324563295234, + "grad_norm": 0.9637673497200012, + "learning_rate": 9.897313527892032e-06, + "loss": 0.9027, + "step": 4888 + }, + { + "epoch": 0.2635863704981669, + "grad_norm": 0.7891300916671753, + "learning_rate": 9.897270777651698e-06, + "loss": 0.7856, + "step": 4889 + }, + { + "epoch": 0.26364028466681044, + "grad_norm": 0.7728479504585266, + "learning_rate": 9.897228018606731e-06, + "loss": 0.8606, + "step": 4890 + }, + { + "epoch": 0.263694198835454, + "grad_norm": 0.9174859523773193, + "learning_rate": 9.897185250757209e-06, + "loss": 0.7001, + "step": 4891 + }, + { + "epoch": 0.26374811300409745, + "grad_norm": 0.7392576932907104, + "learning_rate": 9.897142474103208e-06, + "loss": 0.7096, + "step": 4892 + }, + { + "epoch": 0.263802027172741, + "grad_norm": 0.7648600339889526, + "learning_rate": 9.897099688644804e-06, + "loss": 0.7899, + "step": 4893 + }, + { + "epoch": 0.2638559413413845, + "grad_norm": 0.7568668723106384, + "learning_rate": 9.897056894382077e-06, + "loss": 0.7595, + "step": 4894 + }, + { + "epoch": 0.26390985551002805, + "grad_norm": 0.800240695476532, + "learning_rate": 9.897014091315102e-06, + "loss": 0.8398, + "step": 4895 + }, + { + "epoch": 0.2639637696786716, + "grad_norm": 0.7847012281417847, + "learning_rate": 9.896971279443956e-06, + "loss": 0.7433, + "step": 4896 + }, + { + "epoch": 0.26401768384731505, + "grad_norm": 0.8086446523666382, + "learning_rate": 9.896928458768716e-06, + "loss": 0.7549, + "step": 4897 + }, + { + "epoch": 0.2640715980159586, + "grad_norm": 0.7179371118545532, + "learning_rate": 9.89688562928946e-06, + "loss": 0.6947, + "step": 4898 + }, + { + "epoch": 0.2641255121846021, + "grad_norm": 0.8114293217658997, + "learning_rate": 9.896842791006261e-06, + "loss": 0.7943, + 
"step": 4899 + }, + { + "epoch": 0.26417942635324565, + "grad_norm": 0.7791370749473572, + "learning_rate": 9.896799943919202e-06, + "loss": 0.6892, + "step": 4900 + }, + { + "epoch": 0.2642333405218891, + "grad_norm": 0.8667739629745483, + "learning_rate": 9.896757088028355e-06, + "loss": 0.8893, + "step": 4901 + }, + { + "epoch": 0.26428725469053266, + "grad_norm": 0.739639163017273, + "learning_rate": 9.8967142233338e-06, + "loss": 0.7566, + "step": 4902 + }, + { + "epoch": 0.2643411688591762, + "grad_norm": 0.7148702144622803, + "learning_rate": 9.896671349835616e-06, + "loss": 0.7915, + "step": 4903 + }, + { + "epoch": 0.2643950830278197, + "grad_norm": 0.7041117548942566, + "learning_rate": 9.896628467533875e-06, + "loss": 0.7123, + "step": 4904 + }, + { + "epoch": 0.26444899719646325, + "grad_norm": 0.7493545413017273, + "learning_rate": 9.896585576428655e-06, + "loss": 0.8255, + "step": 4905 + }, + { + "epoch": 0.26450291136510673, + "grad_norm": 0.802142322063446, + "learning_rate": 9.896542676520035e-06, + "loss": 0.8414, + "step": 4906 + }, + { + "epoch": 0.26455682553375026, + "grad_norm": 0.7283496260643005, + "learning_rate": 9.896499767808094e-06, + "loss": 0.745, + "step": 4907 + }, + { + "epoch": 0.2646107397023938, + "grad_norm": 0.7583940029144287, + "learning_rate": 9.896456850292907e-06, + "loss": 0.7771, + "step": 4908 + }, + { + "epoch": 0.2646646538710373, + "grad_norm": 0.7401677966117859, + "learning_rate": 9.896413923974548e-06, + "loss": 0.7648, + "step": 4909 + }, + { + "epoch": 0.2647185680396808, + "grad_norm": 0.7986511588096619, + "learning_rate": 9.896370988853099e-06, + "loss": 0.7145, + "step": 4910 + }, + { + "epoch": 0.26477248220832433, + "grad_norm": 0.6956211924552917, + "learning_rate": 9.896328044928634e-06, + "loss": 0.7786, + "step": 4911 + }, + { + "epoch": 0.26482639637696787, + "grad_norm": 0.8934255838394165, + "learning_rate": 9.896285092201231e-06, + "loss": 0.7156, + "step": 4912 + }, + { + "epoch": 
0.2648803105456114, + "grad_norm": 0.6990894079208374, + "learning_rate": 9.896242130670972e-06, + "loss": 0.753, + "step": 4913 + }, + { + "epoch": 0.26493422471425493, + "grad_norm": 0.79696124792099, + "learning_rate": 9.896199160337927e-06, + "loss": 0.8626, + "step": 4914 + }, + { + "epoch": 0.2649881388828984, + "grad_norm": 0.7954263091087341, + "learning_rate": 9.896156181202175e-06, + "loss": 0.7447, + "step": 4915 + }, + { + "epoch": 0.26504205305154194, + "grad_norm": 0.7960940003395081, + "learning_rate": 9.896113193263796e-06, + "loss": 0.7805, + "step": 4916 + }, + { + "epoch": 0.26509596722018547, + "grad_norm": 0.7872769236564636, + "learning_rate": 9.896070196522867e-06, + "loss": 0.8706, + "step": 4917 + }, + { + "epoch": 0.265149881388829, + "grad_norm": 0.8143740892410278, + "learning_rate": 9.896027190979462e-06, + "loss": 0.894, + "step": 4918 + }, + { + "epoch": 0.2652037955574725, + "grad_norm": 0.7195903062820435, + "learning_rate": 9.895984176633662e-06, + "loss": 0.7079, + "step": 4919 + }, + { + "epoch": 0.265257709726116, + "grad_norm": 1.2636377811431885, + "learning_rate": 9.895941153485541e-06, + "loss": 0.8019, + "step": 4920 + }, + { + "epoch": 0.26531162389475954, + "grad_norm": 0.9132199287414551, + "learning_rate": 9.895898121535182e-06, + "loss": 0.8137, + "step": 4921 + }, + { + "epoch": 0.2653655380634031, + "grad_norm": 0.7580793499946594, + "learning_rate": 9.895855080782655e-06, + "loss": 0.8015, + "step": 4922 + }, + { + "epoch": 0.2654194522320466, + "grad_norm": 0.764226496219635, + "learning_rate": 9.89581203122804e-06, + "loss": 0.7951, + "step": 4923 + }, + { + "epoch": 0.2654733664006901, + "grad_norm": 0.7804572582244873, + "learning_rate": 9.895768972871418e-06, + "loss": 0.8292, + "step": 4924 + }, + { + "epoch": 0.2655272805693336, + "grad_norm": 1.0945926904678345, + "learning_rate": 9.895725905712863e-06, + "loss": 0.916, + "step": 4925 + }, + { + "epoch": 0.26558119473797714, + "grad_norm": 
0.7809876203536987, + "learning_rate": 9.895682829752452e-06, + "loss": 0.8282, + "step": 4926 + }, + { + "epoch": 0.2656351089066207, + "grad_norm": 0.9589576721191406, + "learning_rate": 9.895639744990264e-06, + "loss": 0.7427, + "step": 4927 + }, + { + "epoch": 0.26568902307526415, + "grad_norm": 0.8494128584861755, + "learning_rate": 9.895596651426376e-06, + "loss": 0.8192, + "step": 4928 + }, + { + "epoch": 0.2657429372439077, + "grad_norm": 0.7642913460731506, + "learning_rate": 9.895553549060867e-06, + "loss": 0.7407, + "step": 4929 + }, + { + "epoch": 0.2657968514125512, + "grad_norm": 0.7758688926696777, + "learning_rate": 9.895510437893812e-06, + "loss": 0.8022, + "step": 4930 + }, + { + "epoch": 0.26585076558119475, + "grad_norm": 0.7677244544029236, + "learning_rate": 9.895467317925289e-06, + "loss": 0.7344, + "step": 4931 + }, + { + "epoch": 0.2659046797498383, + "grad_norm": 0.7520139217376709, + "learning_rate": 9.895424189155375e-06, + "loss": 0.8539, + "step": 4932 + }, + { + "epoch": 0.26595859391848176, + "grad_norm": 0.8028707504272461, + "learning_rate": 9.89538105158415e-06, + "loss": 0.7777, + "step": 4933 + }, + { + "epoch": 0.2660125080871253, + "grad_norm": 0.7818429470062256, + "learning_rate": 9.895337905211691e-06, + "loss": 0.7559, + "step": 4934 + }, + { + "epoch": 0.2660664222557688, + "grad_norm": 0.7150774002075195, + "learning_rate": 9.895294750038073e-06, + "loss": 0.7501, + "step": 4935 + }, + { + "epoch": 0.26612033642441235, + "grad_norm": 0.709414541721344, + "learning_rate": 9.895251586063376e-06, + "loss": 0.7232, + "step": 4936 + }, + { + "epoch": 0.26617425059305583, + "grad_norm": 0.8100318908691406, + "learning_rate": 9.895208413287677e-06, + "loss": 0.7702, + "step": 4937 + }, + { + "epoch": 0.26622816476169936, + "grad_norm": 0.6777253150939941, + "learning_rate": 9.895165231711052e-06, + "loss": 0.707, + "step": 4938 + }, + { + "epoch": 0.2662820789303429, + "grad_norm": 0.7034317851066589, + "learning_rate": 
9.895122041333583e-06, + "loss": 0.6021, + "step": 4939 + }, + { + "epoch": 0.2663359930989864, + "grad_norm": 0.8210963606834412, + "learning_rate": 9.895078842155343e-06, + "loss": 0.8198, + "step": 4940 + }, + { + "epoch": 0.26638990726762996, + "grad_norm": 0.7624147534370422, + "learning_rate": 9.89503563417641e-06, + "loss": 0.7677, + "step": 4941 + }, + { + "epoch": 0.26644382143627343, + "grad_norm": 0.735461413860321, + "learning_rate": 9.894992417396866e-06, + "loss": 0.839, + "step": 4942 + }, + { + "epoch": 0.26649773560491696, + "grad_norm": 0.7400258183479309, + "learning_rate": 9.894949191816786e-06, + "loss": 0.7904, + "step": 4943 + }, + { + "epoch": 0.2665516497735605, + "grad_norm": 0.7352719902992249, + "learning_rate": 9.894905957436244e-06, + "loss": 0.7283, + "step": 4944 + }, + { + "epoch": 0.266605563942204, + "grad_norm": 0.7771669626235962, + "learning_rate": 9.894862714255324e-06, + "loss": 0.863, + "step": 4945 + }, + { + "epoch": 0.2666594781108475, + "grad_norm": 0.8066530227661133, + "learning_rate": 9.8948194622741e-06, + "loss": 0.8065, + "step": 4946 + }, + { + "epoch": 0.26671339227949104, + "grad_norm": 0.7446811199188232, + "learning_rate": 9.894776201492651e-06, + "loss": 0.7539, + "step": 4947 + }, + { + "epoch": 0.26676730644813457, + "grad_norm": 0.787760317325592, + "learning_rate": 9.894732931911056e-06, + "loss": 0.8361, + "step": 4948 + }, + { + "epoch": 0.2668212206167781, + "grad_norm": 0.9865973591804504, + "learning_rate": 9.894689653529389e-06, + "loss": 0.8228, + "step": 4949 + }, + { + "epoch": 0.26687513478542163, + "grad_norm": 0.7901219129562378, + "learning_rate": 9.89464636634773e-06, + "loss": 0.8059, + "step": 4950 + }, + { + "epoch": 0.2669290489540651, + "grad_norm": 0.8485696911811829, + "learning_rate": 9.89460307036616e-06, + "loss": 0.781, + "step": 4951 + }, + { + "epoch": 0.26698296312270864, + "grad_norm": 0.8590619564056396, + "learning_rate": 9.89455976558475e-06, + "loss": 0.7428, + "step": 
4952 + }, + { + "epoch": 0.26703687729135217, + "grad_norm": 0.8802759051322937, + "learning_rate": 9.894516452003584e-06, + "loss": 0.7261, + "step": 4953 + }, + { + "epoch": 0.2670907914599957, + "grad_norm": 0.9600741267204285, + "learning_rate": 9.894473129622739e-06, + "loss": 0.8006, + "step": 4954 + }, + { + "epoch": 0.26714470562863923, + "grad_norm": 0.8588278889656067, + "learning_rate": 9.894429798442288e-06, + "loss": 0.7971, + "step": 4955 + }, + { + "epoch": 0.2671986197972827, + "grad_norm": 0.7204979658126831, + "learning_rate": 9.894386458462315e-06, + "loss": 0.7733, + "step": 4956 + }, + { + "epoch": 0.26725253396592624, + "grad_norm": 0.9327245354652405, + "learning_rate": 9.894343109682893e-06, + "loss": 0.7785, + "step": 4957 + }, + { + "epoch": 0.2673064481345698, + "grad_norm": 0.6946107745170593, + "learning_rate": 9.894299752104105e-06, + "loss": 0.7144, + "step": 4958 + }, + { + "epoch": 0.2673603623032133, + "grad_norm": 0.7115009427070618, + "learning_rate": 9.894256385726025e-06, + "loss": 0.7705, + "step": 4959 + }, + { + "epoch": 0.2674142764718568, + "grad_norm": 0.7661309242248535, + "learning_rate": 9.89421301054873e-06, + "loss": 0.6842, + "step": 4960 + }, + { + "epoch": 0.2674681906405003, + "grad_norm": 0.7183328866958618, + "learning_rate": 9.894169626572302e-06, + "loss": 0.8208, + "step": 4961 + }, + { + "epoch": 0.26752210480914385, + "grad_norm": 0.9643034338951111, + "learning_rate": 9.894126233796816e-06, + "loss": 0.8814, + "step": 4962 + }, + { + "epoch": 0.2675760189777874, + "grad_norm": 0.7522911429405212, + "learning_rate": 9.894082832222352e-06, + "loss": 0.7545, + "step": 4963 + }, + { + "epoch": 0.2676299331464309, + "grad_norm": 0.733444333076477, + "learning_rate": 9.894039421848988e-06, + "loss": 0.6791, + "step": 4964 + }, + { + "epoch": 0.2676838473150744, + "grad_norm": 0.7534430623054504, + "learning_rate": 9.8939960026768e-06, + "loss": 0.8344, + "step": 4965 + }, + { + "epoch": 0.2677377614837179, + 
"grad_norm": 0.7849922776222229, + "learning_rate": 9.893952574705867e-06, + "loss": 0.6955, + "step": 4966 + }, + { + "epoch": 0.26779167565236145, + "grad_norm": 0.7080478668212891, + "learning_rate": 9.893909137936268e-06, + "loss": 0.7518, + "step": 4967 + }, + { + "epoch": 0.267845589821005, + "grad_norm": 0.7007871270179749, + "learning_rate": 9.893865692368081e-06, + "loss": 0.7011, + "step": 4968 + }, + { + "epoch": 0.26789950398964846, + "grad_norm": 0.8561926484107971, + "learning_rate": 9.893822238001383e-06, + "loss": 0.7918, + "step": 4969 + }, + { + "epoch": 0.267953418158292, + "grad_norm": 0.9306691288948059, + "learning_rate": 9.893778774836251e-06, + "loss": 0.8572, + "step": 4970 + }, + { + "epoch": 0.2680073323269355, + "grad_norm": 0.8165447115898132, + "learning_rate": 9.893735302872767e-06, + "loss": 0.8634, + "step": 4971 + }, + { + "epoch": 0.26806124649557905, + "grad_norm": 0.7696943283081055, + "learning_rate": 9.893691822111005e-06, + "loss": 0.7597, + "step": 4972 + }, + { + "epoch": 0.2681151606642226, + "grad_norm": 0.821960985660553, + "learning_rate": 9.893648332551047e-06, + "loss": 0.8266, + "step": 4973 + }, + { + "epoch": 0.26816907483286606, + "grad_norm": 0.7997711300849915, + "learning_rate": 9.893604834192968e-06, + "loss": 0.738, + "step": 4974 + }, + { + "epoch": 0.2682229890015096, + "grad_norm": 0.7624261379241943, + "learning_rate": 9.893561327036847e-06, + "loss": 0.8676, + "step": 4975 + }, + { + "epoch": 0.2682769031701531, + "grad_norm": 0.8748223185539246, + "learning_rate": 9.893517811082764e-06, + "loss": 0.8396, + "step": 4976 + }, + { + "epoch": 0.26833081733879666, + "grad_norm": 0.9294693470001221, + "learning_rate": 9.893474286330797e-06, + "loss": 0.8869, + "step": 4977 + }, + { + "epoch": 0.26838473150744013, + "grad_norm": 0.7981976866722107, + "learning_rate": 9.893430752781021e-06, + "loss": 0.8176, + "step": 4978 + }, + { + "epoch": 0.26843864567608366, + "grad_norm": 0.8983638882637024, + 
"learning_rate": 9.893387210433518e-06, + "loss": 0.8181, + "step": 4979 + }, + { + "epoch": 0.2684925598447272, + "grad_norm": 0.7371122241020203, + "learning_rate": 9.893343659288364e-06, + "loss": 0.8004, + "step": 4980 + }, + { + "epoch": 0.26854647401337073, + "grad_norm": 0.8287851214408875, + "learning_rate": 9.893300099345639e-06, + "loss": 0.8249, + "step": 4981 + }, + { + "epoch": 0.26860038818201426, + "grad_norm": 0.7839323878288269, + "learning_rate": 9.89325653060542e-06, + "loss": 0.7561, + "step": 4982 + }, + { + "epoch": 0.26865430235065774, + "grad_norm": 0.7348718643188477, + "learning_rate": 9.893212953067784e-06, + "loss": 0.7693, + "step": 4983 + }, + { + "epoch": 0.26870821651930127, + "grad_norm": 0.7529023885726929, + "learning_rate": 9.893169366732814e-06, + "loss": 0.7874, + "step": 4984 + }, + { + "epoch": 0.2687621306879448, + "grad_norm": 0.8256911635398865, + "learning_rate": 9.893125771600583e-06, + "loss": 0.8646, + "step": 4985 + }, + { + "epoch": 0.26881604485658833, + "grad_norm": 0.8608624935150146, + "learning_rate": 9.893082167671172e-06, + "loss": 0.7953, + "step": 4986 + }, + { + "epoch": 0.2688699590252318, + "grad_norm": 0.7824952006340027, + "learning_rate": 9.893038554944661e-06, + "loss": 0.6885, + "step": 4987 + }, + { + "epoch": 0.26892387319387534, + "grad_norm": 0.8561933636665344, + "learning_rate": 9.892994933421125e-06, + "loss": 0.771, + "step": 4988 + }, + { + "epoch": 0.26897778736251887, + "grad_norm": 0.8238648176193237, + "learning_rate": 9.892951303100644e-06, + "loss": 0.7308, + "step": 4989 + }, + { + "epoch": 0.2690317015311624, + "grad_norm": 0.6714439392089844, + "learning_rate": 9.892907663983297e-06, + "loss": 0.6775, + "step": 4990 + }, + { + "epoch": 0.26908561569980594, + "grad_norm": 0.714019775390625, + "learning_rate": 9.892864016069162e-06, + "loss": 0.753, + "step": 4991 + }, + { + "epoch": 0.2691395298684494, + "grad_norm": 0.7529036402702332, + "learning_rate": 9.892820359358318e-06, + 
"loss": 0.8614, + "step": 4992 + }, + { + "epoch": 0.26919344403709294, + "grad_norm": 0.8602166771888733, + "learning_rate": 9.89277669385084e-06, + "loss": 0.8552, + "step": 4993 + }, + { + "epoch": 0.2692473582057365, + "grad_norm": 0.7607848048210144, + "learning_rate": 9.892733019546811e-06, + "loss": 0.7749, + "step": 4994 + }, + { + "epoch": 0.26930127237438, + "grad_norm": 0.664573609828949, + "learning_rate": 9.89268933644631e-06, + "loss": 0.7819, + "step": 4995 + }, + { + "epoch": 0.2693551865430235, + "grad_norm": 0.7218571901321411, + "learning_rate": 9.892645644549412e-06, + "loss": 0.7618, + "step": 4996 + }, + { + "epoch": 0.269409100711667, + "grad_norm": 0.7744899988174438, + "learning_rate": 9.892601943856198e-06, + "loss": 0.7899, + "step": 4997 + }, + { + "epoch": 0.26946301488031055, + "grad_norm": 0.866887629032135, + "learning_rate": 9.892558234366743e-06, + "loss": 0.7779, + "step": 4998 + }, + { + "epoch": 0.2695169290489541, + "grad_norm": 0.7656950354576111, + "learning_rate": 9.892514516081129e-06, + "loss": 0.8449, + "step": 4999 + }, + { + "epoch": 0.2695708432175976, + "grad_norm": 0.8089601397514343, + "learning_rate": 9.892470788999435e-06, + "loss": 0.7406, + "step": 5000 + }, + { + "epoch": 0.2696247573862411, + "grad_norm": 0.7319750189781189, + "learning_rate": 9.892427053121738e-06, + "loss": 0.7794, + "step": 5001 + }, + { + "epoch": 0.2696786715548846, + "grad_norm": 0.8019516468048096, + "learning_rate": 9.892383308448117e-06, + "loss": 0.7322, + "step": 5002 + }, + { + "epoch": 0.26973258572352815, + "grad_norm": 0.7320996522903442, + "learning_rate": 9.89233955497865e-06, + "loss": 0.7882, + "step": 5003 + }, + { + "epoch": 0.2697864998921717, + "grad_norm": 0.8075882792472839, + "learning_rate": 9.892295792713417e-06, + "loss": 0.8391, + "step": 5004 + }, + { + "epoch": 0.26984041406081516, + "grad_norm": 0.7340912222862244, + "learning_rate": 9.892252021652495e-06, + "loss": 0.7815, + "step": 5005 + }, + { + "epoch": 
0.2698943282294587, + "grad_norm": 0.8739588260650635, + "learning_rate": 9.892208241795965e-06, + "loss": 0.8287, + "step": 5006 + }, + { + "epoch": 0.2699482423981022, + "grad_norm": 0.7938231229782104, + "learning_rate": 9.892164453143904e-06, + "loss": 0.8494, + "step": 5007 + }, + { + "epoch": 0.27000215656674575, + "grad_norm": 0.7387966513633728, + "learning_rate": 9.892120655696391e-06, + "loss": 0.7465, + "step": 5008 + }, + { + "epoch": 0.2700560707353893, + "grad_norm": 0.7171775102615356, + "learning_rate": 9.892076849453504e-06, + "loss": 0.7227, + "step": 5009 + }, + { + "epoch": 0.27010998490403276, + "grad_norm": 0.7506486773490906, + "learning_rate": 9.892033034415324e-06, + "loss": 0.7606, + "step": 5010 + }, + { + "epoch": 0.2701638990726763, + "grad_norm": 0.833413302898407, + "learning_rate": 9.891989210581928e-06, + "loss": 0.7998, + "step": 5011 + }, + { + "epoch": 0.2702178132413198, + "grad_norm": 0.7675343155860901, + "learning_rate": 9.891945377953395e-06, + "loss": 0.7554, + "step": 5012 + }, + { + "epoch": 0.27027172740996336, + "grad_norm": 0.8682401180267334, + "learning_rate": 9.891901536529804e-06, + "loss": 0.8342, + "step": 5013 + }, + { + "epoch": 0.27032564157860683, + "grad_norm": 0.7674192190170288, + "learning_rate": 9.891857686311232e-06, + "loss": 0.7055, + "step": 5014 + }, + { + "epoch": 0.27037955574725037, + "grad_norm": 0.717960000038147, + "learning_rate": 9.891813827297762e-06, + "loss": 0.7939, + "step": 5015 + }, + { + "epoch": 0.2704334699158939, + "grad_norm": 0.8811343908309937, + "learning_rate": 9.89176995948947e-06, + "loss": 0.7987, + "step": 5016 + }, + { + "epoch": 0.27048738408453743, + "grad_norm": 0.9724238514900208, + "learning_rate": 9.891726082886436e-06, + "loss": 0.8342, + "step": 5017 + }, + { + "epoch": 0.27054129825318096, + "grad_norm": 0.7969245314598083, + "learning_rate": 9.891682197488737e-06, + "loss": 0.8937, + "step": 5018 + }, + { + "epoch": 0.27059521242182444, + "grad_norm": 
0.8564383387565613, + "learning_rate": 9.891638303296453e-06, + "loss": 0.7454, + "step": 5019 + }, + { + "epoch": 0.27064912659046797, + "grad_norm": 0.7879497408866882, + "learning_rate": 9.891594400309665e-06, + "loss": 0.7283, + "step": 5020 + }, + { + "epoch": 0.2707030407591115, + "grad_norm": 0.7248218059539795, + "learning_rate": 9.891550488528448e-06, + "loss": 0.7661, + "step": 5021 + }, + { + "epoch": 0.27075695492775503, + "grad_norm": 0.7548377513885498, + "learning_rate": 9.891506567952884e-06, + "loss": 0.8127, + "step": 5022 + }, + { + "epoch": 0.2708108690963985, + "grad_norm": 0.72477787733078, + "learning_rate": 9.891462638583051e-06, + "loss": 0.6732, + "step": 5023 + }, + { + "epoch": 0.27086478326504204, + "grad_norm": 0.7293525338172913, + "learning_rate": 9.891418700419026e-06, + "loss": 0.7547, + "step": 5024 + }, + { + "epoch": 0.2709186974336856, + "grad_norm": 0.6827152371406555, + "learning_rate": 9.891374753460893e-06, + "loss": 0.7069, + "step": 5025 + }, + { + "epoch": 0.2709726116023291, + "grad_norm": 0.8005618453025818, + "learning_rate": 9.891330797708726e-06, + "loss": 0.7789, + "step": 5026 + }, + { + "epoch": 0.27102652577097264, + "grad_norm": 0.8415570259094238, + "learning_rate": 9.891286833162606e-06, + "loss": 0.8397, + "step": 5027 + }, + { + "epoch": 0.2710804399396161, + "grad_norm": 0.7276983261108398, + "learning_rate": 9.891242859822612e-06, + "loss": 0.7051, + "step": 5028 + }, + { + "epoch": 0.27113435410825965, + "grad_norm": 0.7116531729698181, + "learning_rate": 9.891198877688824e-06, + "loss": 0.6909, + "step": 5029 + }, + { + "epoch": 0.2711882682769032, + "grad_norm": 0.7504072189331055, + "learning_rate": 9.891154886761319e-06, + "loss": 0.7552, + "step": 5030 + }, + { + "epoch": 0.2712421824455467, + "grad_norm": 0.7239630222320557, + "learning_rate": 9.891110887040177e-06, + "loss": 0.7546, + "step": 5031 + }, + { + "epoch": 0.2712960966141902, + "grad_norm": 0.7500813603401184, + "learning_rate": 
9.891066878525478e-06, + "loss": 0.7983, + "step": 5032 + }, + { + "epoch": 0.2713500107828337, + "grad_norm": 1.0069187879562378, + "learning_rate": 9.8910228612173e-06, + "loss": 0.8422, + "step": 5033 + }, + { + "epoch": 0.27140392495147725, + "grad_norm": 0.7656623721122742, + "learning_rate": 9.890978835115723e-06, + "loss": 0.7754, + "step": 5034 + }, + { + "epoch": 0.2714578391201208, + "grad_norm": 0.8915570974349976, + "learning_rate": 9.890934800220825e-06, + "loss": 0.8195, + "step": 5035 + }, + { + "epoch": 0.2715117532887643, + "grad_norm": 0.8333117961883545, + "learning_rate": 9.890890756532686e-06, + "loss": 0.8419, + "step": 5036 + }, + { + "epoch": 0.2715656674574078, + "grad_norm": 0.8374854922294617, + "learning_rate": 9.890846704051386e-06, + "loss": 0.7581, + "step": 5037 + }, + { + "epoch": 0.2716195816260513, + "grad_norm": 0.7093636989593506, + "learning_rate": 9.890802642777002e-06, + "loss": 0.6926, + "step": 5038 + }, + { + "epoch": 0.27167349579469485, + "grad_norm": 0.7575312852859497, + "learning_rate": 9.890758572709615e-06, + "loss": 0.802, + "step": 5039 + }, + { + "epoch": 0.2717274099633384, + "grad_norm": 0.902991771697998, + "learning_rate": 9.890714493849304e-06, + "loss": 0.9113, + "step": 5040 + }, + { + "epoch": 0.27178132413198186, + "grad_norm": 0.7198828458786011, + "learning_rate": 9.890670406196147e-06, + "loss": 0.7271, + "step": 5041 + }, + { + "epoch": 0.2718352383006254, + "grad_norm": 0.8525444269180298, + "learning_rate": 9.890626309750226e-06, + "loss": 0.7872, + "step": 5042 + }, + { + "epoch": 0.2718891524692689, + "grad_norm": 0.7253887057304382, + "learning_rate": 9.890582204511616e-06, + "loss": 0.7847, + "step": 5043 + }, + { + "epoch": 0.27194306663791246, + "grad_norm": 0.871543824672699, + "learning_rate": 9.890538090480402e-06, + "loss": 0.7855, + "step": 5044 + }, + { + "epoch": 0.271996980806556, + "grad_norm": 0.7563179731369019, + "learning_rate": 9.890493967656658e-06, + "loss": 0.765, + "step": 
5045 + }, + { + "epoch": 0.27205089497519946, + "grad_norm": 0.8132460713386536, + "learning_rate": 9.890449836040465e-06, + "loss": 0.815, + "step": 5046 + }, + { + "epoch": 0.272104809143843, + "grad_norm": 0.6690226197242737, + "learning_rate": 9.890405695631905e-06, + "loss": 0.679, + "step": 5047 + }, + { + "epoch": 0.27215872331248653, + "grad_norm": 0.7403889894485474, + "learning_rate": 9.890361546431052e-06, + "loss": 0.6578, + "step": 5048 + }, + { + "epoch": 0.27221263748113006, + "grad_norm": 0.7937926054000854, + "learning_rate": 9.89031738843799e-06, + "loss": 0.8178, + "step": 5049 + }, + { + "epoch": 0.27226655164977354, + "grad_norm": 0.7222248911857605, + "learning_rate": 9.890273221652798e-06, + "loss": 0.6765, + "step": 5050 + }, + { + "epoch": 0.27232046581841707, + "grad_norm": 0.7936972975730896, + "learning_rate": 9.890229046075553e-06, + "loss": 0.7552, + "step": 5051 + }, + { + "epoch": 0.2723743799870606, + "grad_norm": 0.7286278009414673, + "learning_rate": 9.890184861706336e-06, + "loss": 0.7409, + "step": 5052 + }, + { + "epoch": 0.27242829415570413, + "grad_norm": 0.7878450751304626, + "learning_rate": 9.890140668545226e-06, + "loss": 0.8493, + "step": 5053 + }, + { + "epoch": 0.27248220832434766, + "grad_norm": 0.7352455854415894, + "learning_rate": 9.890096466592303e-06, + "loss": 0.6574, + "step": 5054 + }, + { + "epoch": 0.27253612249299114, + "grad_norm": 0.7900424003601074, + "learning_rate": 9.890052255847646e-06, + "loss": 0.8187, + "step": 5055 + }, + { + "epoch": 0.27259003666163467, + "grad_norm": 0.8364367485046387, + "learning_rate": 9.890008036311334e-06, + "loss": 0.7423, + "step": 5056 + }, + { + "epoch": 0.2726439508302782, + "grad_norm": 0.7436595559120178, + "learning_rate": 9.889963807983447e-06, + "loss": 0.7412, + "step": 5057 + }, + { + "epoch": 0.27269786499892174, + "grad_norm": 0.7472354769706726, + "learning_rate": 9.889919570864066e-06, + "loss": 0.8264, + "step": 5058 + }, + { + "epoch": 
0.2727517791675652, + "grad_norm": 0.7758167386054993, + "learning_rate": 9.889875324953268e-06, + "loss": 0.7133, + "step": 5059 + }, + { + "epoch": 0.27280569333620874, + "grad_norm": 0.7223731875419617, + "learning_rate": 9.889831070251135e-06, + "loss": 0.7244, + "step": 5060 + }, + { + "epoch": 0.2728596075048523, + "grad_norm": 1.041771650314331, + "learning_rate": 9.889786806757743e-06, + "loss": 0.9429, + "step": 5061 + }, + { + "epoch": 0.2729135216734958, + "grad_norm": 0.8936665654182434, + "learning_rate": 9.889742534473174e-06, + "loss": 0.7424, + "step": 5062 + }, + { + "epoch": 0.27296743584213934, + "grad_norm": 0.8620690107345581, + "learning_rate": 9.88969825339751e-06, + "loss": 0.8211, + "step": 5063 + }, + { + "epoch": 0.2730213500107828, + "grad_norm": 0.8004252314567566, + "learning_rate": 9.889653963530826e-06, + "loss": 0.7296, + "step": 5064 + }, + { + "epoch": 0.27307526417942635, + "grad_norm": 0.7337127327919006, + "learning_rate": 9.889609664873203e-06, + "loss": 0.7898, + "step": 5065 + }, + { + "epoch": 0.2731291783480699, + "grad_norm": 1.7178047895431519, + "learning_rate": 9.889565357424722e-06, + "loss": 0.8032, + "step": 5066 + }, + { + "epoch": 0.2731830925167134, + "grad_norm": 0.871757984161377, + "learning_rate": 9.889521041185464e-06, + "loss": 0.8074, + "step": 5067 + }, + { + "epoch": 0.2732370066853569, + "grad_norm": 1.1161519289016724, + "learning_rate": 9.889476716155503e-06, + "loss": 0.8783, + "step": 5068 + }, + { + "epoch": 0.2732909208540004, + "grad_norm": 1.4781978130340576, + "learning_rate": 9.889432382334924e-06, + "loss": 0.8364, + "step": 5069 + }, + { + "epoch": 0.27334483502264395, + "grad_norm": 0.7921425700187683, + "learning_rate": 9.889388039723807e-06, + "loss": 0.7559, + "step": 5070 + }, + { + "epoch": 0.2733987491912875, + "grad_norm": 0.9014592170715332, + "learning_rate": 9.889343688322227e-06, + "loss": 0.8887, + "step": 5071 + }, + { + "epoch": 0.273452663359931, + "grad_norm": 
0.7558442950248718, + "learning_rate": 9.889299328130268e-06, + "loss": 0.7839, + "step": 5072 + }, + { + "epoch": 0.2735065775285745, + "grad_norm": 0.7945775985717773, + "learning_rate": 9.889254959148006e-06, + "loss": 0.8413, + "step": 5073 + }, + { + "epoch": 0.273560491697218, + "grad_norm": 0.8391217589378357, + "learning_rate": 9.889210581375526e-06, + "loss": 0.7617, + "step": 5074 + }, + { + "epoch": 0.27361440586586155, + "grad_norm": 0.8547251224517822, + "learning_rate": 9.889166194812903e-06, + "loss": 0.7955, + "step": 5075 + }, + { + "epoch": 0.2736683200345051, + "grad_norm": 0.8064761757850647, + "learning_rate": 9.88912179946022e-06, + "loss": 0.7557, + "step": 5076 + }, + { + "epoch": 0.27372223420314856, + "grad_norm": 0.7102752923965454, + "learning_rate": 9.889077395317553e-06, + "loss": 0.7526, + "step": 5077 + }, + { + "epoch": 0.2737761483717921, + "grad_norm": 0.8167790770530701, + "learning_rate": 9.889032982384986e-06, + "loss": 0.8245, + "step": 5078 + }, + { + "epoch": 0.2738300625404356, + "grad_norm": 0.7231212854385376, + "learning_rate": 9.888988560662597e-06, + "loss": 0.799, + "step": 5079 + }, + { + "epoch": 0.27388397670907916, + "grad_norm": 0.7393338084220886, + "learning_rate": 9.888944130150464e-06, + "loss": 0.8118, + "step": 5080 + }, + { + "epoch": 0.2739378908777227, + "grad_norm": 0.847621738910675, + "learning_rate": 9.888899690848673e-06, + "loss": 0.8174, + "step": 5081 + }, + { + "epoch": 0.27399180504636617, + "grad_norm": 0.9880374073982239, + "learning_rate": 9.888855242757296e-06, + "loss": 0.9501, + "step": 5082 + }, + { + "epoch": 0.2740457192150097, + "grad_norm": 0.7384204864501953, + "learning_rate": 9.888810785876416e-06, + "loss": 0.785, + "step": 5083 + }, + { + "epoch": 0.27409963338365323, + "grad_norm": 1.001950740814209, + "learning_rate": 9.888766320206118e-06, + "loss": 0.8439, + "step": 5084 + }, + { + "epoch": 0.27415354755229676, + "grad_norm": 0.8231346011161804, + "learning_rate": 
9.888721845746473e-06, + "loss": 0.8127, + "step": 5085 + }, + { + "epoch": 0.27420746172094024, + "grad_norm": 0.7128643989562988, + "learning_rate": 9.888677362497568e-06, + "loss": 0.6922, + "step": 5086 + }, + { + "epoch": 0.27426137588958377, + "grad_norm": 0.7206726670265198, + "learning_rate": 9.88863287045948e-06, + "loss": 0.7977, + "step": 5087 + }, + { + "epoch": 0.2743152900582273, + "grad_norm": 0.7943522334098816, + "learning_rate": 9.888588369632289e-06, + "loss": 0.7565, + "step": 5088 + }, + { + "epoch": 0.27436920422687083, + "grad_norm": 0.7610237002372742, + "learning_rate": 9.888543860016075e-06, + "loss": 0.7539, + "step": 5089 + }, + { + "epoch": 0.27442311839551436, + "grad_norm": 0.7111551761627197, + "learning_rate": 9.88849934161092e-06, + "loss": 0.6942, + "step": 5090 + }, + { + "epoch": 0.27447703256415784, + "grad_norm": 0.8590908050537109, + "learning_rate": 9.888454814416901e-06, + "loss": 0.8405, + "step": 5091 + }, + { + "epoch": 0.2745309467328014, + "grad_norm": 0.7120518684387207, + "learning_rate": 9.888410278434101e-06, + "loss": 0.7574, + "step": 5092 + }, + { + "epoch": 0.2745848609014449, + "grad_norm": 0.7736578583717346, + "learning_rate": 9.888365733662598e-06, + "loss": 0.7823, + "step": 5093 + }, + { + "epoch": 0.27463877507008844, + "grad_norm": 0.712278425693512, + "learning_rate": 9.888321180102472e-06, + "loss": 0.7657, + "step": 5094 + }, + { + "epoch": 0.2746926892387319, + "grad_norm": 0.7149209976196289, + "learning_rate": 9.888276617753804e-06, + "loss": 0.7515, + "step": 5095 + }, + { + "epoch": 0.27474660340737544, + "grad_norm": 0.8070907592773438, + "learning_rate": 9.888232046616676e-06, + "loss": 0.7541, + "step": 5096 + }, + { + "epoch": 0.274800517576019, + "grad_norm": 0.8107784390449524, + "learning_rate": 9.888187466691163e-06, + "loss": 0.896, + "step": 5097 + }, + { + "epoch": 0.2748544317446625, + "grad_norm": 0.7852044105529785, + "learning_rate": 9.888142877977349e-06, + "loss": 0.8934, + 
"step": 5098 + }, + { + "epoch": 0.27490834591330604, + "grad_norm": 0.8732671141624451, + "learning_rate": 9.888098280475315e-06, + "loss": 0.7711, + "step": 5099 + }, + { + "epoch": 0.2749622600819495, + "grad_norm": 0.8847461342811584, + "learning_rate": 9.888053674185138e-06, + "loss": 0.7291, + "step": 5100 + }, + { + "epoch": 0.27501617425059305, + "grad_norm": 0.8422223329544067, + "learning_rate": 9.8880090591069e-06, + "loss": 0.6604, + "step": 5101 + }, + { + "epoch": 0.2750700884192366, + "grad_norm": 0.6901240944862366, + "learning_rate": 9.887964435240681e-06, + "loss": 0.7411, + "step": 5102 + }, + { + "epoch": 0.2751240025878801, + "grad_norm": 0.7141496539115906, + "learning_rate": 9.887919802586561e-06, + "loss": 0.7647, + "step": 5103 + }, + { + "epoch": 0.2751779167565236, + "grad_norm": 0.7716993093490601, + "learning_rate": 9.88787516114462e-06, + "loss": 0.7541, + "step": 5104 + }, + { + "epoch": 0.2752318309251671, + "grad_norm": 0.7874771356582642, + "learning_rate": 9.88783051091494e-06, + "loss": 0.7992, + "step": 5105 + }, + { + "epoch": 0.27528574509381065, + "grad_norm": 0.7106810212135315, + "learning_rate": 9.8877858518976e-06, + "loss": 0.7383, + "step": 5106 + }, + { + "epoch": 0.2753396592624542, + "grad_norm": 0.7486706376075745, + "learning_rate": 9.88774118409268e-06, + "loss": 0.7741, + "step": 5107 + }, + { + "epoch": 0.2753935734310977, + "grad_norm": 0.8137489557266235, + "learning_rate": 9.887696507500259e-06, + "loss": 0.8238, + "step": 5108 + }, + { + "epoch": 0.2754474875997412, + "grad_norm": 0.8295445442199707, + "learning_rate": 9.88765182212042e-06, + "loss": 0.7718, + "step": 5109 + }, + { + "epoch": 0.2755014017683847, + "grad_norm": 0.8613603115081787, + "learning_rate": 9.887607127953243e-06, + "loss": 0.835, + "step": 5110 + }, + { + "epoch": 0.27555531593702826, + "grad_norm": 0.7091763019561768, + "learning_rate": 9.887562424998806e-06, + "loss": 0.7089, + "step": 5111 + }, + { + "epoch": 0.2756092301056718, + 
"grad_norm": 0.7690724730491638, + "learning_rate": 9.887517713257193e-06, + "loss": 0.7846, + "step": 5112 + }, + { + "epoch": 0.27566314427431526, + "grad_norm": 0.7905461192131042, + "learning_rate": 9.88747299272848e-06, + "loss": 0.7955, + "step": 5113 + }, + { + "epoch": 0.2757170584429588, + "grad_norm": 0.7611652612686157, + "learning_rate": 9.887428263412752e-06, + "loss": 0.7802, + "step": 5114 + }, + { + "epoch": 0.2757709726116023, + "grad_norm": 0.7323983311653137, + "learning_rate": 9.887383525310086e-06, + "loss": 0.8312, + "step": 5115 + }, + { + "epoch": 0.27582488678024586, + "grad_norm": 0.7839152216911316, + "learning_rate": 9.887338778420563e-06, + "loss": 0.7792, + "step": 5116 + }, + { + "epoch": 0.2758788009488894, + "grad_norm": 0.9436889886856079, + "learning_rate": 9.887294022744264e-06, + "loss": 0.8232, + "step": 5117 + }, + { + "epoch": 0.27593271511753287, + "grad_norm": 0.7726641893386841, + "learning_rate": 9.88724925828127e-06, + "loss": 0.7142, + "step": 5118 + }, + { + "epoch": 0.2759866292861764, + "grad_norm": 0.7798104286193848, + "learning_rate": 9.887204485031662e-06, + "loss": 0.7575, + "step": 5119 + }, + { + "epoch": 0.27604054345481993, + "grad_norm": 0.7332453727722168, + "learning_rate": 9.887159702995518e-06, + "loss": 0.7362, + "step": 5120 + }, + { + "epoch": 0.27609445762346346, + "grad_norm": 0.7793838381767273, + "learning_rate": 9.887114912172922e-06, + "loss": 0.8488, + "step": 5121 + }, + { + "epoch": 0.27614837179210694, + "grad_norm": 0.8711932301521301, + "learning_rate": 9.88707011256395e-06, + "loss": 0.714, + "step": 5122 + }, + { + "epoch": 0.27620228596075047, + "grad_norm": 0.747809886932373, + "learning_rate": 9.887025304168686e-06, + "loss": 0.7847, + "step": 5123 + }, + { + "epoch": 0.276256200129394, + "grad_norm": 0.7189614176750183, + "learning_rate": 9.88698048698721e-06, + "loss": 0.7773, + "step": 5124 + }, + { + "epoch": 0.27631011429803753, + "grad_norm": 0.745582640171051, + 
"learning_rate": 9.886935661019604e-06, + "loss": 0.7567, + "step": 5125 + }, + { + "epoch": 0.27636402846668107, + "grad_norm": 0.7648694515228271, + "learning_rate": 9.886890826265942e-06, + "loss": 0.7938, + "step": 5126 + }, + { + "epoch": 0.27641794263532454, + "grad_norm": 0.8848762512207031, + "learning_rate": 9.886845982726312e-06, + "loss": 0.7978, + "step": 5127 + }, + { + "epoch": 0.2764718568039681, + "grad_norm": 0.8495482206344604, + "learning_rate": 9.886801130400794e-06, + "loss": 0.8016, + "step": 5128 + }, + { + "epoch": 0.2765257709726116, + "grad_norm": 0.7696657180786133, + "learning_rate": 9.886756269289463e-06, + "loss": 0.8715, + "step": 5129 + }, + { + "epoch": 0.27657968514125514, + "grad_norm": 0.7655208110809326, + "learning_rate": 9.886711399392406e-06, + "loss": 0.7964, + "step": 5130 + }, + { + "epoch": 0.2766335993098986, + "grad_norm": 0.7606762051582336, + "learning_rate": 9.8866665207097e-06, + "loss": 0.7159, + "step": 5131 + }, + { + "epoch": 0.27668751347854215, + "grad_norm": 0.8046274781227112, + "learning_rate": 9.886621633241427e-06, + "loss": 0.8083, + "step": 5132 + }, + { + "epoch": 0.2767414276471857, + "grad_norm": 0.9933425784111023, + "learning_rate": 9.886576736987667e-06, + "loss": 0.8654, + "step": 5133 + }, + { + "epoch": 0.2767953418158292, + "grad_norm": 1.6594408750534058, + "learning_rate": 9.8865318319485e-06, + "loss": 0.9209, + "step": 5134 + }, + { + "epoch": 0.27684925598447274, + "grad_norm": 0.857893168926239, + "learning_rate": 9.88648691812401e-06, + "loss": 0.7785, + "step": 5135 + }, + { + "epoch": 0.2769031701531162, + "grad_norm": 0.8305732011795044, + "learning_rate": 9.886441995514275e-06, + "loss": 0.8565, + "step": 5136 + }, + { + "epoch": 0.27695708432175975, + "grad_norm": 0.7797301411628723, + "learning_rate": 9.886397064119375e-06, + "loss": 0.7577, + "step": 5137 + }, + { + "epoch": 0.2770109984904033, + "grad_norm": 0.8581737875938416, + "learning_rate": 9.886352123939393e-06, + "loss": 
0.8265, + "step": 5138 + }, + { + "epoch": 0.2770649126590468, + "grad_norm": 0.7265759110450745, + "learning_rate": 9.88630717497441e-06, + "loss": 0.7848, + "step": 5139 + }, + { + "epoch": 0.2771188268276903, + "grad_norm": 0.7873173952102661, + "learning_rate": 9.886262217224505e-06, + "loss": 0.8573, + "step": 5140 + }, + { + "epoch": 0.2771727409963338, + "grad_norm": 0.755599319934845, + "learning_rate": 9.886217250689758e-06, + "loss": 0.7217, + "step": 5141 + }, + { + "epoch": 0.27722665516497735, + "grad_norm": 0.8430512547492981, + "learning_rate": 9.886172275370254e-06, + "loss": 0.8689, + "step": 5142 + }, + { + "epoch": 0.2772805693336209, + "grad_norm": 0.8128552436828613, + "learning_rate": 9.88612729126607e-06, + "loss": 0.7929, + "step": 5143 + }, + { + "epoch": 0.2773344835022644, + "grad_norm": 0.7788698077201843, + "learning_rate": 9.886082298377287e-06, + "loss": 0.8285, + "step": 5144 + }, + { + "epoch": 0.2773883976709079, + "grad_norm": 0.8579205870628357, + "learning_rate": 9.886037296703987e-06, + "loss": 0.8288, + "step": 5145 + }, + { + "epoch": 0.2774423118395514, + "grad_norm": 0.767217755317688, + "learning_rate": 9.885992286246253e-06, + "loss": 0.7999, + "step": 5146 + }, + { + "epoch": 0.27749622600819496, + "grad_norm": 0.7575383186340332, + "learning_rate": 9.885947267004162e-06, + "loss": 0.7647, + "step": 5147 + }, + { + "epoch": 0.2775501401768385, + "grad_norm": 0.8674237132072449, + "learning_rate": 9.885902238977798e-06, + "loss": 0.6781, + "step": 5148 + }, + { + "epoch": 0.27760405434548197, + "grad_norm": 0.6494048833847046, + "learning_rate": 9.885857202167239e-06, + "loss": 0.6748, + "step": 5149 + }, + { + "epoch": 0.2776579685141255, + "grad_norm": 0.8333936333656311, + "learning_rate": 9.885812156572569e-06, + "loss": 0.8393, + "step": 5150 + }, + { + "epoch": 0.27771188268276903, + "grad_norm": 0.8702477812767029, + "learning_rate": 9.885767102193869e-06, + "loss": 0.6428, + "step": 5151 + }, + { + "epoch": 
0.27776579685141256, + "grad_norm": 0.8017061948776245, + "learning_rate": 9.885722039031217e-06, + "loss": 0.7933, + "step": 5152 + }, + { + "epoch": 0.2778197110200561, + "grad_norm": 0.7803055047988892, + "learning_rate": 9.885676967084696e-06, + "loss": 0.723, + "step": 5153 + }, + { + "epoch": 0.27787362518869957, + "grad_norm": 0.7224579453468323, + "learning_rate": 9.885631886354387e-06, + "loss": 0.74, + "step": 5154 + }, + { + "epoch": 0.2779275393573431, + "grad_norm": 1.1245145797729492, + "learning_rate": 9.885586796840369e-06, + "loss": 0.877, + "step": 5155 + }, + { + "epoch": 0.27798145352598663, + "grad_norm": 0.7135274410247803, + "learning_rate": 9.885541698542728e-06, + "loss": 0.775, + "step": 5156 + }, + { + "epoch": 0.27803536769463016, + "grad_norm": 0.7516048550605774, + "learning_rate": 9.885496591461541e-06, + "loss": 0.8302, + "step": 5157 + }, + { + "epoch": 0.27808928186327364, + "grad_norm": 0.8390230536460876, + "learning_rate": 9.885451475596887e-06, + "loss": 0.8098, + "step": 5158 + }, + { + "epoch": 0.2781431960319172, + "grad_norm": 0.7310529947280884, + "learning_rate": 9.885406350948854e-06, + "loss": 0.7605, + "step": 5159 + }, + { + "epoch": 0.2781971102005607, + "grad_norm": 0.7502579689025879, + "learning_rate": 9.885361217517517e-06, + "loss": 0.8413, + "step": 5160 + }, + { + "epoch": 0.27825102436920424, + "grad_norm": 0.7119940519332886, + "learning_rate": 9.885316075302963e-06, + "loss": 0.6954, + "step": 5161 + }, + { + "epoch": 0.27830493853784777, + "grad_norm": 0.7565783262252808, + "learning_rate": 9.885270924305266e-06, + "loss": 0.7479, + "step": 5162 + }, + { + "epoch": 0.27835885270649124, + "grad_norm": 0.7579078078269958, + "learning_rate": 9.885225764524511e-06, + "loss": 0.7976, + "step": 5163 + }, + { + "epoch": 0.2784127668751348, + "grad_norm": 0.7112993001937866, + "learning_rate": 9.885180595960779e-06, + "loss": 0.7153, + "step": 5164 + }, + { + "epoch": 0.2784666810437783, + "grad_norm": 
1.1651597023010254, + "learning_rate": 9.88513541861415e-06, + "loss": 0.7977, + "step": 5165 + }, + { + "epoch": 0.27852059521242184, + "grad_norm": 0.7818348407745361, + "learning_rate": 9.88509023248471e-06, + "loss": 0.7502, + "step": 5166 + }, + { + "epoch": 0.27857450938106537, + "grad_norm": 0.6622827053070068, + "learning_rate": 9.885045037572534e-06, + "loss": 0.6677, + "step": 5167 + }, + { + "epoch": 0.27862842354970885, + "grad_norm": 0.7490810751914978, + "learning_rate": 9.884999833877706e-06, + "loss": 0.8084, + "step": 5168 + }, + { + "epoch": 0.2786823377183524, + "grad_norm": 0.7105234861373901, + "learning_rate": 9.88495462140031e-06, + "loss": 0.7346, + "step": 5169 + }, + { + "epoch": 0.2787362518869959, + "grad_norm": 0.7885896563529968, + "learning_rate": 9.884909400140421e-06, + "loss": 0.7143, + "step": 5170 + }, + { + "epoch": 0.27879016605563944, + "grad_norm": 0.8720527291297913, + "learning_rate": 9.884864170098125e-06, + "loss": 0.7752, + "step": 5171 + }, + { + "epoch": 0.2788440802242829, + "grad_norm": 0.749433159828186, + "learning_rate": 9.884818931273501e-06, + "loss": 0.8073, + "step": 5172 + }, + { + "epoch": 0.27889799439292645, + "grad_norm": 0.784222424030304, + "learning_rate": 9.884773683666633e-06, + "loss": 0.8404, + "step": 5173 + }, + { + "epoch": 0.27895190856157, + "grad_norm": 0.7572906613349915, + "learning_rate": 9.8847284272776e-06, + "loss": 0.8205, + "step": 5174 + }, + { + "epoch": 0.2790058227302135, + "grad_norm": 0.793807327747345, + "learning_rate": 9.884683162106484e-06, + "loss": 0.7864, + "step": 5175 + }, + { + "epoch": 0.27905973689885705, + "grad_norm": 0.7129535675048828, + "learning_rate": 9.884637888153366e-06, + "loss": 0.8112, + "step": 5176 + }, + { + "epoch": 0.2791136510675005, + "grad_norm": 0.9506208896636963, + "learning_rate": 9.884592605418329e-06, + "loss": 0.8708, + "step": 5177 + }, + { + "epoch": 0.27916756523614406, + "grad_norm": 0.7119637727737427, + "learning_rate": 
9.884547313901452e-06, + "loss": 0.7684, + "step": 5178 + }, + { + "epoch": 0.2792214794047876, + "grad_norm": 0.7711455225944519, + "learning_rate": 9.88450201360282e-06, + "loss": 0.7419, + "step": 5179 + }, + { + "epoch": 0.2792753935734311, + "grad_norm": 0.7384727001190186, + "learning_rate": 9.88445670452251e-06, + "loss": 0.8021, + "step": 5180 + }, + { + "epoch": 0.2793293077420746, + "grad_norm": 0.8161928057670593, + "learning_rate": 9.884411386660606e-06, + "loss": 0.8036, + "step": 5181 + }, + { + "epoch": 0.2793832219107181, + "grad_norm": 0.7533312439918518, + "learning_rate": 9.88436606001719e-06, + "loss": 0.6964, + "step": 5182 + }, + { + "epoch": 0.27943713607936166, + "grad_norm": 0.7554582357406616, + "learning_rate": 9.884320724592342e-06, + "loss": 0.7167, + "step": 5183 + }, + { + "epoch": 0.2794910502480052, + "grad_norm": 0.8615080118179321, + "learning_rate": 9.884275380386143e-06, + "loss": 0.8091, + "step": 5184 + }, + { + "epoch": 0.2795449644166487, + "grad_norm": 0.7100309133529663, + "learning_rate": 9.884230027398676e-06, + "loss": 0.7496, + "step": 5185 + }, + { + "epoch": 0.2795988785852922, + "grad_norm": 0.7255486845970154, + "learning_rate": 9.884184665630024e-06, + "loss": 0.6666, + "step": 5186 + }, + { + "epoch": 0.27965279275393573, + "grad_norm": 0.8223450779914856, + "learning_rate": 9.884139295080264e-06, + "loss": 0.7526, + "step": 5187 + }, + { + "epoch": 0.27970670692257926, + "grad_norm": 0.7971575856208801, + "learning_rate": 9.884093915749483e-06, + "loss": 0.7606, + "step": 5188 + }, + { + "epoch": 0.2797606210912228, + "grad_norm": 0.749407947063446, + "learning_rate": 9.884048527637757e-06, + "loss": 0.7972, + "step": 5189 + }, + { + "epoch": 0.27981453525986627, + "grad_norm": 0.812382698059082, + "learning_rate": 9.884003130745172e-06, + "loss": 0.736, + "step": 5190 + }, + { + "epoch": 0.2798684494285098, + "grad_norm": 0.7022697925567627, + "learning_rate": 9.883957725071808e-06, + "loss": 0.7541, + "step": 
5191 + }, + { + "epoch": 0.27992236359715333, + "grad_norm": 0.8105473518371582, + "learning_rate": 9.883912310617747e-06, + "loss": 0.8424, + "step": 5192 + }, + { + "epoch": 0.27997627776579687, + "grad_norm": 0.7091902494430542, + "learning_rate": 9.883866887383072e-06, + "loss": 0.7669, + "step": 5193 + }, + { + "epoch": 0.2800301919344404, + "grad_norm": 1.0922960042953491, + "learning_rate": 9.88382145536786e-06, + "loss": 0.7833, + "step": 5194 + }, + { + "epoch": 0.2800841061030839, + "grad_norm": 0.6879577040672302, + "learning_rate": 9.883776014572197e-06, + "loss": 0.7359, + "step": 5195 + }, + { + "epoch": 0.2801380202717274, + "grad_norm": 0.7436100244522095, + "learning_rate": 9.883730564996164e-06, + "loss": 0.8516, + "step": 5196 + }, + { + "epoch": 0.28019193444037094, + "grad_norm": 0.6883706450462341, + "learning_rate": 9.88368510663984e-06, + "loss": 0.7517, + "step": 5197 + }, + { + "epoch": 0.28024584860901447, + "grad_norm": 0.7650019526481628, + "learning_rate": 9.88363963950331e-06, + "loss": 0.8363, + "step": 5198 + }, + { + "epoch": 0.28029976277765795, + "grad_norm": 0.7188605070114136, + "learning_rate": 9.883594163586657e-06, + "loss": 0.6951, + "step": 5199 + }, + { + "epoch": 0.2803536769463015, + "grad_norm": 0.8194975852966309, + "learning_rate": 9.883548678889956e-06, + "loss": 0.8482, + "step": 5200 + }, + { + "epoch": 0.280407591114945, + "grad_norm": 0.8289690017700195, + "learning_rate": 9.883503185413296e-06, + "loss": 0.779, + "step": 5201 + }, + { + "epoch": 0.28046150528358854, + "grad_norm": 0.7521582245826721, + "learning_rate": 9.883457683156755e-06, + "loss": 0.7972, + "step": 5202 + }, + { + "epoch": 0.2805154194522321, + "grad_norm": 0.788716733455658, + "learning_rate": 9.883412172120416e-06, + "loss": 0.7889, + "step": 5203 + }, + { + "epoch": 0.28056933362087555, + "grad_norm": 0.7104058265686035, + "learning_rate": 9.88336665230436e-06, + "loss": 0.8007, + "step": 5204 + }, + { + "epoch": 0.2806232477895191, + 
"grad_norm": 0.7565460801124573, + "learning_rate": 9.88332112370867e-06, + "loss": 0.7697, + "step": 5205 + }, + { + "epoch": 0.2806771619581626, + "grad_norm": 0.7092845439910889, + "learning_rate": 9.883275586333427e-06, + "loss": 0.8195, + "step": 5206 + }, + { + "epoch": 0.28073107612680614, + "grad_norm": 0.7442013621330261, + "learning_rate": 9.883230040178712e-06, + "loss": 0.7558, + "step": 5207 + }, + { + "epoch": 0.2807849902954496, + "grad_norm": 0.7580548524856567, + "learning_rate": 9.88318448524461e-06, + "loss": 0.8208, + "step": 5208 + }, + { + "epoch": 0.28083890446409315, + "grad_norm": 0.8129982948303223, + "learning_rate": 9.883138921531202e-06, + "loss": 0.7475, + "step": 5209 + }, + { + "epoch": 0.2808928186327367, + "grad_norm": 0.708234429359436, + "learning_rate": 9.883093349038567e-06, + "loss": 0.7157, + "step": 5210 + }, + { + "epoch": 0.2809467328013802, + "grad_norm": 0.8034481406211853, + "learning_rate": 9.883047767766786e-06, + "loss": 0.8209, + "step": 5211 + }, + { + "epoch": 0.28100064697002375, + "grad_norm": 0.7125903367996216, + "learning_rate": 9.883002177715948e-06, + "loss": 0.7216, + "step": 5212 + }, + { + "epoch": 0.2810545611386672, + "grad_norm": 0.7171017527580261, + "learning_rate": 9.882956578886128e-06, + "loss": 0.7364, + "step": 5213 + }, + { + "epoch": 0.28110847530731076, + "grad_norm": 0.8210889101028442, + "learning_rate": 9.882910971277413e-06, + "loss": 0.7802, + "step": 5214 + }, + { + "epoch": 0.2811623894759543, + "grad_norm": 0.8742465972900391, + "learning_rate": 9.88286535488988e-06, + "loss": 0.8323, + "step": 5215 + }, + { + "epoch": 0.2812163036445978, + "grad_norm": 0.7613638043403625, + "learning_rate": 9.882819729723616e-06, + "loss": 0.82, + "step": 5216 + }, + { + "epoch": 0.2812702178132413, + "grad_norm": 0.8171275854110718, + "learning_rate": 9.882774095778698e-06, + "loss": 0.7821, + "step": 5217 + }, + { + "epoch": 0.28132413198188483, + "grad_norm": 0.9041802287101746, + 
"learning_rate": 9.882728453055212e-06, + "loss": 0.7862, + "step": 5218 + }, + { + "epoch": 0.28137804615052836, + "grad_norm": 0.7604931592941284, + "learning_rate": 9.88268280155324e-06, + "loss": 0.7713, + "step": 5219 + }, + { + "epoch": 0.2814319603191719, + "grad_norm": 0.7445857524871826, + "learning_rate": 9.882637141272861e-06, + "loss": 0.7453, + "step": 5220 + }, + { + "epoch": 0.2814858744878154, + "grad_norm": 0.7988085150718689, + "learning_rate": 9.882591472214161e-06, + "loss": 0.7268, + "step": 5221 + }, + { + "epoch": 0.2815397886564589, + "grad_norm": 0.7210063934326172, + "learning_rate": 9.882545794377219e-06, + "loss": 0.7605, + "step": 5222 + }, + { + "epoch": 0.28159370282510243, + "grad_norm": 0.8361137509346008, + "learning_rate": 9.882500107762117e-06, + "loss": 0.8923, + "step": 5223 + }, + { + "epoch": 0.28164761699374596, + "grad_norm": 0.7608784437179565, + "learning_rate": 9.88245441236894e-06, + "loss": 0.7601, + "step": 5224 + }, + { + "epoch": 0.2817015311623895, + "grad_norm": 0.7668020725250244, + "learning_rate": 9.882408708197766e-06, + "loss": 0.7655, + "step": 5225 + }, + { + "epoch": 0.28175544533103297, + "grad_norm": 0.7516483068466187, + "learning_rate": 9.882362995248681e-06, + "loss": 0.7331, + "step": 5226 + }, + { + "epoch": 0.2818093594996765, + "grad_norm": 0.7915279865264893, + "learning_rate": 9.882317273521769e-06, + "loss": 0.8115, + "step": 5227 + }, + { + "epoch": 0.28186327366832004, + "grad_norm": 0.8899939656257629, + "learning_rate": 9.882271543017106e-06, + "loss": 0.7087, + "step": 5228 + }, + { + "epoch": 0.28191718783696357, + "grad_norm": 0.7095377445220947, + "learning_rate": 9.882225803734778e-06, + "loss": 0.6556, + "step": 5229 + }, + { + "epoch": 0.2819711020056071, + "grad_norm": 0.9443415403366089, + "learning_rate": 9.882180055674864e-06, + "loss": 0.7678, + "step": 5230 + }, + { + "epoch": 0.2820250161742506, + "grad_norm": 0.7427262663841248, + "learning_rate": 9.882134298837452e-06, + 
"loss": 0.7256, + "step": 5231 + }, + { + "epoch": 0.2820789303428941, + "grad_norm": 0.8854336142539978, + "learning_rate": 9.88208853322262e-06, + "loss": 0.7646, + "step": 5232 + }, + { + "epoch": 0.28213284451153764, + "grad_norm": 0.7270344495773315, + "learning_rate": 9.88204275883045e-06, + "loss": 0.7714, + "step": 5233 + }, + { + "epoch": 0.28218675868018117, + "grad_norm": 0.8252066969871521, + "learning_rate": 9.881996975661026e-06, + "loss": 0.835, + "step": 5234 + }, + { + "epoch": 0.28224067284882465, + "grad_norm": 0.803297221660614, + "learning_rate": 9.881951183714432e-06, + "loss": 0.758, + "step": 5235 + }, + { + "epoch": 0.2822945870174682, + "grad_norm": 0.7957141399383545, + "learning_rate": 9.881905382990746e-06, + "loss": 0.8464, + "step": 5236 + }, + { + "epoch": 0.2823485011861117, + "grad_norm": 0.7272878885269165, + "learning_rate": 9.881859573490054e-06, + "loss": 0.8062, + "step": 5237 + }, + { + "epoch": 0.28240241535475524, + "grad_norm": 0.7172011733055115, + "learning_rate": 9.881813755212434e-06, + "loss": 0.8349, + "step": 5238 + }, + { + "epoch": 0.2824563295233988, + "grad_norm": 0.9161372184753418, + "learning_rate": 9.881767928157976e-06, + "loss": 0.815, + "step": 5239 + }, + { + "epoch": 0.28251024369204225, + "grad_norm": 0.6740238666534424, + "learning_rate": 9.881722092326753e-06, + "loss": 0.7096, + "step": 5240 + }, + { + "epoch": 0.2825641578606858, + "grad_norm": 0.740080714225769, + "learning_rate": 9.881676247718855e-06, + "loss": 0.7848, + "step": 5241 + }, + { + "epoch": 0.2826180720293293, + "grad_norm": 0.7177533507347107, + "learning_rate": 9.88163039433436e-06, + "loss": 0.7406, + "step": 5242 + }, + { + "epoch": 0.28267198619797285, + "grad_norm": 0.6941720247268677, + "learning_rate": 9.881584532173352e-06, + "loss": 0.746, + "step": 5243 + }, + { + "epoch": 0.2827259003666163, + "grad_norm": 0.902332603931427, + "learning_rate": 9.881538661235914e-06, + "loss": 0.8925, + "step": 5244 + }, + { + "epoch": 
0.28277981453525985, + "grad_norm": 0.7620295882225037, + "learning_rate": 9.881492781522128e-06, + "loss": 0.7031, + "step": 5245 + }, + { + "epoch": 0.2828337287039034, + "grad_norm": 0.735544741153717, + "learning_rate": 9.881446893032077e-06, + "loss": 0.8363, + "step": 5246 + }, + { + "epoch": 0.2828876428725469, + "grad_norm": 0.7686198353767395, + "learning_rate": 9.881400995765843e-06, + "loss": 0.681, + "step": 5247 + }, + { + "epoch": 0.28294155704119045, + "grad_norm": 0.7868270874023438, + "learning_rate": 9.881355089723505e-06, + "loss": 0.7561, + "step": 5248 + }, + { + "epoch": 0.2829954712098339, + "grad_norm": 0.8680627942085266, + "learning_rate": 9.881309174905152e-06, + "loss": 0.7526, + "step": 5249 + }, + { + "epoch": 0.28304938537847746, + "grad_norm": 0.748607873916626, + "learning_rate": 9.881263251310862e-06, + "loss": 0.7898, + "step": 5250 + }, + { + "epoch": 0.283103299547121, + "grad_norm": 0.7534239292144775, + "learning_rate": 9.88121731894072e-06, + "loss": 0.7188, + "step": 5251 + }, + { + "epoch": 0.2831572137157645, + "grad_norm": 0.7027668952941895, + "learning_rate": 9.881171377794808e-06, + "loss": 0.8206, + "step": 5252 + }, + { + "epoch": 0.283211127884408, + "grad_norm": 0.7831504940986633, + "learning_rate": 9.881125427873206e-06, + "loss": 0.7974, + "step": 5253 + }, + { + "epoch": 0.28326504205305153, + "grad_norm": 0.657393753528595, + "learning_rate": 9.881079469176e-06, + "loss": 0.6525, + "step": 5254 + }, + { + "epoch": 0.28331895622169506, + "grad_norm": 0.7056339383125305, + "learning_rate": 9.881033501703272e-06, + "loss": 0.6854, + "step": 5255 + }, + { + "epoch": 0.2833728703903386, + "grad_norm": 0.7217456102371216, + "learning_rate": 9.880987525455105e-06, + "loss": 0.7541, + "step": 5256 + }, + { + "epoch": 0.2834267845589821, + "grad_norm": 0.7223145961761475, + "learning_rate": 9.880941540431579e-06, + "loss": 0.8176, + "step": 5257 + }, + { + "epoch": 0.2834806987276256, + "grad_norm": 0.6996636986732483, 
+ "learning_rate": 9.880895546632779e-06, + "loss": 0.7219, + "step": 5258 + }, + { + "epoch": 0.28353461289626913, + "grad_norm": 0.7340953946113586, + "learning_rate": 9.880849544058787e-06, + "loss": 0.8225, + "step": 5259 + }, + { + "epoch": 0.28358852706491267, + "grad_norm": 0.7698047757148743, + "learning_rate": 9.880803532709687e-06, + "loss": 0.8609, + "step": 5260 + }, + { + "epoch": 0.2836424412335562, + "grad_norm": 0.781949520111084, + "learning_rate": 9.880757512585558e-06, + "loss": 0.8634, + "step": 5261 + }, + { + "epoch": 0.2836963554021997, + "grad_norm": 0.681658923625946, + "learning_rate": 9.880711483686488e-06, + "loss": 0.6711, + "step": 5262 + }, + { + "epoch": 0.2837502695708432, + "grad_norm": 0.802488386631012, + "learning_rate": 9.880665446012553e-06, + "loss": 0.7737, + "step": 5263 + }, + { + "epoch": 0.28380418373948674, + "grad_norm": 0.8142992258071899, + "learning_rate": 9.880619399563844e-06, + "loss": 0.8235, + "step": 5264 + }, + { + "epoch": 0.28385809790813027, + "grad_norm": 0.7499324083328247, + "learning_rate": 9.880573344340438e-06, + "loss": 0.7042, + "step": 5265 + }, + { + "epoch": 0.2839120120767738, + "grad_norm": 0.768059253692627, + "learning_rate": 9.880527280342419e-06, + "loss": 0.7423, + "step": 5266 + }, + { + "epoch": 0.2839659262454173, + "grad_norm": 0.7405000329017639, + "learning_rate": 9.88048120756987e-06, + "loss": 0.6528, + "step": 5267 + }, + { + "epoch": 0.2840198404140608, + "grad_norm": 0.7251627445220947, + "learning_rate": 9.880435126022875e-06, + "loss": 0.7517, + "step": 5268 + }, + { + "epoch": 0.28407375458270434, + "grad_norm": 0.9095546007156372, + "learning_rate": 9.880389035701515e-06, + "loss": 0.8721, + "step": 5269 + }, + { + "epoch": 0.2841276687513479, + "grad_norm": 0.7784069776535034, + "learning_rate": 9.880342936605874e-06, + "loss": 0.8546, + "step": 5270 + }, + { + "epoch": 0.28418158291999135, + "grad_norm": 0.6824434995651245, + "learning_rate": 9.880296828736034e-06, + 
"loss": 0.7583, + "step": 5271 + }, + { + "epoch": 0.2842354970886349, + "grad_norm": 0.749785840511322, + "learning_rate": 9.88025071209208e-06, + "loss": 0.709, + "step": 5272 + }, + { + "epoch": 0.2842894112572784, + "grad_norm": 0.7068313360214233, + "learning_rate": 9.880204586674093e-06, + "loss": 0.699, + "step": 5273 + }, + { + "epoch": 0.28434332542592194, + "grad_norm": 0.7990247011184692, + "learning_rate": 9.880158452482155e-06, + "loss": 0.8137, + "step": 5274 + }, + { + "epoch": 0.2843972395945655, + "grad_norm": 0.821013867855072, + "learning_rate": 9.880112309516352e-06, + "loss": 0.7723, + "step": 5275 + }, + { + "epoch": 0.28445115376320895, + "grad_norm": 0.680288553237915, + "learning_rate": 9.880066157776764e-06, + "loss": 0.6754, + "step": 5276 + }, + { + "epoch": 0.2845050679318525, + "grad_norm": 0.7425721883773804, + "learning_rate": 9.880019997263477e-06, + "loss": 0.7894, + "step": 5277 + }, + { + "epoch": 0.284558982100496, + "grad_norm": 0.7550294995307922, + "learning_rate": 9.87997382797657e-06, + "loss": 0.732, + "step": 5278 + }, + { + "epoch": 0.28461289626913955, + "grad_norm": 0.8641289472579956, + "learning_rate": 9.87992764991613e-06, + "loss": 0.8209, + "step": 5279 + }, + { + "epoch": 0.284666810437783, + "grad_norm": 0.7044229507446289, + "learning_rate": 9.879881463082238e-06, + "loss": 0.7403, + "step": 5280 + }, + { + "epoch": 0.28472072460642656, + "grad_norm": 0.7343770861625671, + "learning_rate": 9.879835267474975e-06, + "loss": 0.7428, + "step": 5281 + }, + { + "epoch": 0.2847746387750701, + "grad_norm": 0.7690380215644836, + "learning_rate": 9.879789063094429e-06, + "loss": 0.8236, + "step": 5282 + }, + { + "epoch": 0.2848285529437136, + "grad_norm": 0.7682362198829651, + "learning_rate": 9.879742849940679e-06, + "loss": 0.7854, + "step": 5283 + }, + { + "epoch": 0.28488246711235715, + "grad_norm": 0.7253369688987732, + "learning_rate": 9.87969662801381e-06, + "loss": 0.8281, + "step": 5284 + }, + { + "epoch": 
0.28493638128100063, + "grad_norm": 0.7726433277130127, + "learning_rate": 9.879650397313905e-06, + "loss": 0.7586, + "step": 5285 + }, + { + "epoch": 0.28499029544964416, + "grad_norm": 0.804685115814209, + "learning_rate": 9.879604157841044e-06, + "loss": 0.8654, + "step": 5286 + }, + { + "epoch": 0.2850442096182877, + "grad_norm": 0.7872894406318665, + "learning_rate": 9.879557909595316e-06, + "loss": 0.7907, + "step": 5287 + }, + { + "epoch": 0.2850981237869312, + "grad_norm": 0.7489103078842163, + "learning_rate": 9.879511652576801e-06, + "loss": 0.7459, + "step": 5288 + }, + { + "epoch": 0.2851520379555747, + "grad_norm": 0.8003327250480652, + "learning_rate": 9.879465386785581e-06, + "loss": 0.7579, + "step": 5289 + }, + { + "epoch": 0.28520595212421823, + "grad_norm": 0.7461791634559631, + "learning_rate": 9.879419112221741e-06, + "loss": 0.7757, + "step": 5290 + }, + { + "epoch": 0.28525986629286176, + "grad_norm": 0.7338587641716003, + "learning_rate": 9.879372828885364e-06, + "loss": 0.7835, + "step": 5291 + }, + { + "epoch": 0.2853137804615053, + "grad_norm": 0.7397693395614624, + "learning_rate": 9.87932653677653e-06, + "loss": 0.7713, + "step": 5292 + }, + { + "epoch": 0.2853676946301488, + "grad_norm": 0.8379868865013123, + "learning_rate": 9.879280235895327e-06, + "loss": 0.8882, + "step": 5293 + }, + { + "epoch": 0.2854216087987923, + "grad_norm": 0.7283885478973389, + "learning_rate": 9.879233926241836e-06, + "loss": 0.7085, + "step": 5294 + }, + { + "epoch": 0.28547552296743584, + "grad_norm": 0.915597140789032, + "learning_rate": 9.879187607816141e-06, + "loss": 0.853, + "step": 5295 + }, + { + "epoch": 0.28552943713607937, + "grad_norm": 0.7851650714874268, + "learning_rate": 9.879141280618325e-06, + "loss": 0.8858, + "step": 5296 + }, + { + "epoch": 0.2855833513047229, + "grad_norm": 0.7895732522010803, + "learning_rate": 9.879094944648468e-06, + "loss": 0.8603, + "step": 5297 + }, + { + "epoch": 0.2856372654733664, + "grad_norm": 
0.9263603687286377, + "learning_rate": 9.87904859990666e-06, + "loss": 0.8225, + "step": 5298 + }, + { + "epoch": 0.2856911796420099, + "grad_norm": 0.8861474990844727, + "learning_rate": 9.879002246392979e-06, + "loss": 0.7079, + "step": 5299 + }, + { + "epoch": 0.28574509381065344, + "grad_norm": 0.7643340229988098, + "learning_rate": 9.87895588410751e-06, + "loss": 0.7841, + "step": 5300 + }, + { + "epoch": 0.28579900797929697, + "grad_norm": 0.746583878993988, + "learning_rate": 9.878909513050337e-06, + "loss": 0.8013, + "step": 5301 + }, + { + "epoch": 0.2858529221479405, + "grad_norm": 0.7135025262832642, + "learning_rate": 9.878863133221542e-06, + "loss": 0.7171, + "step": 5302 + }, + { + "epoch": 0.285906836316584, + "grad_norm": 0.7493758201599121, + "learning_rate": 9.878816744621209e-06, + "loss": 0.8217, + "step": 5303 + }, + { + "epoch": 0.2859607504852275, + "grad_norm": 0.8908335566520691, + "learning_rate": 9.878770347249423e-06, + "loss": 0.8303, + "step": 5304 + }, + { + "epoch": 0.28601466465387104, + "grad_norm": 0.7408186793327332, + "learning_rate": 9.878723941106263e-06, + "loss": 0.7275, + "step": 5305 + }, + { + "epoch": 0.2860685788225146, + "grad_norm": 0.8047646880149841, + "learning_rate": 9.878677526191818e-06, + "loss": 0.6659, + "step": 5306 + }, + { + "epoch": 0.28612249299115805, + "grad_norm": 0.7265205979347229, + "learning_rate": 9.878631102506168e-06, + "loss": 0.7725, + "step": 5307 + }, + { + "epoch": 0.2861764071598016, + "grad_norm": 0.98882657289505, + "learning_rate": 9.878584670049398e-06, + "loss": 0.8552, + "step": 5308 + }, + { + "epoch": 0.2862303213284451, + "grad_norm": 0.8431620001792908, + "learning_rate": 9.878538228821588e-06, + "loss": 0.8504, + "step": 5309 + }, + { + "epoch": 0.28628423549708865, + "grad_norm": 0.920662522315979, + "learning_rate": 9.878491778822828e-06, + "loss": 0.8216, + "step": 5310 + }, + { + "epoch": 0.2863381496657322, + "grad_norm": 0.7579310536384583, + "learning_rate": 
9.878445320053195e-06, + "loss": 0.7501, + "step": 5311 + }, + { + "epoch": 0.28639206383437565, + "grad_norm": 0.7596756219863892, + "learning_rate": 9.878398852512776e-06, + "loss": 0.815, + "step": 5312 + }, + { + "epoch": 0.2864459780030192, + "grad_norm": 0.8128134608268738, + "learning_rate": 9.878352376201654e-06, + "loss": 0.7782, + "step": 5313 + }, + { + "epoch": 0.2864998921716627, + "grad_norm": 0.7208645939826965, + "learning_rate": 9.878305891119913e-06, + "loss": 0.7444, + "step": 5314 + }, + { + "epoch": 0.28655380634030625, + "grad_norm": 0.8024547100067139, + "learning_rate": 9.878259397267635e-06, + "loss": 0.818, + "step": 5315 + }, + { + "epoch": 0.2866077205089497, + "grad_norm": 0.8033369183540344, + "learning_rate": 9.878212894644904e-06, + "loss": 0.777, + "step": 5316 + }, + { + "epoch": 0.28666163467759326, + "grad_norm": 0.7594527006149292, + "learning_rate": 9.878166383251805e-06, + "loss": 0.7681, + "step": 5317 + }, + { + "epoch": 0.2867155488462368, + "grad_norm": 0.6697728037834167, + "learning_rate": 9.878119863088421e-06, + "loss": 0.724, + "step": 5318 + }, + { + "epoch": 0.2867694630148803, + "grad_norm": 0.7886657118797302, + "learning_rate": 9.878073334154835e-06, + "loss": 0.8544, + "step": 5319 + }, + { + "epoch": 0.28682337718352385, + "grad_norm": 0.7841383218765259, + "learning_rate": 9.878026796451132e-06, + "loss": 0.7671, + "step": 5320 + }, + { + "epoch": 0.28687729135216733, + "grad_norm": 0.766963005065918, + "learning_rate": 9.877980249977393e-06, + "loss": 0.7516, + "step": 5321 + }, + { + "epoch": 0.28693120552081086, + "grad_norm": 0.7714352607727051, + "learning_rate": 9.877933694733705e-06, + "loss": 0.7246, + "step": 5322 + }, + { + "epoch": 0.2869851196894544, + "grad_norm": 0.7595851421356201, + "learning_rate": 9.87788713072015e-06, + "loss": 0.7312, + "step": 5323 + }, + { + "epoch": 0.2870390338580979, + "grad_norm": 0.8249819278717041, + "learning_rate": 9.877840557936811e-06, + "loss": 0.6534, + 
"step": 5324 + }, + { + "epoch": 0.2870929480267414, + "grad_norm": 0.8358021974563599, + "learning_rate": 9.877793976383772e-06, + "loss": 0.7759, + "step": 5325 + }, + { + "epoch": 0.28714686219538493, + "grad_norm": 0.7542338371276855, + "learning_rate": 9.877747386061118e-06, + "loss": 0.7497, + "step": 5326 + }, + { + "epoch": 0.28720077636402846, + "grad_norm": 0.6970787644386292, + "learning_rate": 9.877700786968932e-06, + "loss": 0.6836, + "step": 5327 + }, + { + "epoch": 0.287254690532672, + "grad_norm": 0.7709139585494995, + "learning_rate": 9.877654179107298e-06, + "loss": 0.7574, + "step": 5328 + }, + { + "epoch": 0.28730860470131553, + "grad_norm": 0.7152370810508728, + "learning_rate": 9.877607562476299e-06, + "loss": 0.7974, + "step": 5329 + }, + { + "epoch": 0.287362518869959, + "grad_norm": 1.1318089962005615, + "learning_rate": 9.877560937076021e-06, + "loss": 0.7187, + "step": 5330 + }, + { + "epoch": 0.28741643303860254, + "grad_norm": 0.66380774974823, + "learning_rate": 9.877514302906546e-06, + "loss": 0.6937, + "step": 5331 + }, + { + "epoch": 0.28747034720724607, + "grad_norm": 0.8609433770179749, + "learning_rate": 9.877467659967957e-06, + "loss": 0.8743, + "step": 5332 + }, + { + "epoch": 0.2875242613758896, + "grad_norm": 0.7391762733459473, + "learning_rate": 9.87742100826034e-06, + "loss": 0.6951, + "step": 5333 + }, + { + "epoch": 0.2875781755445331, + "grad_norm": 0.7332816123962402, + "learning_rate": 9.877374347783776e-06, + "loss": 0.7715, + "step": 5334 + }, + { + "epoch": 0.2876320897131766, + "grad_norm": 0.7669941782951355, + "learning_rate": 9.877327678538351e-06, + "loss": 0.7943, + "step": 5335 + }, + { + "epoch": 0.28768600388182014, + "grad_norm": 0.9585753679275513, + "learning_rate": 9.87728100052415e-06, + "loss": 0.8018, + "step": 5336 + }, + { + "epoch": 0.28773991805046367, + "grad_norm": 0.7633230686187744, + "learning_rate": 9.877234313741255e-06, + "loss": 0.7118, + "step": 5337 + }, + { + "epoch": 
0.2877938322191072, + "grad_norm": 0.7662307620048523, + "learning_rate": 9.877187618189751e-06, + "loss": 0.8159, + "step": 5338 + }, + { + "epoch": 0.2878477463877507, + "grad_norm": 0.8725135922431946, + "learning_rate": 9.877140913869722e-06, + "loss": 0.7314, + "step": 5339 + }, + { + "epoch": 0.2879016605563942, + "grad_norm": 0.7815779447555542, + "learning_rate": 9.87709420078125e-06, + "loss": 0.7562, + "step": 5340 + }, + { + "epoch": 0.28795557472503774, + "grad_norm": 0.7647536396980286, + "learning_rate": 9.877047478924421e-06, + "loss": 0.7642, + "step": 5341 + }, + { + "epoch": 0.2880094888936813, + "grad_norm": 0.7150182723999023, + "learning_rate": 9.87700074829932e-06, + "loss": 0.7773, + "step": 5342 + }, + { + "epoch": 0.28806340306232475, + "grad_norm": 0.7187753915786743, + "learning_rate": 9.876954008906026e-06, + "loss": 0.7776, + "step": 5343 + }, + { + "epoch": 0.2881173172309683, + "grad_norm": 0.7617197036743164, + "learning_rate": 9.876907260744628e-06, + "loss": 0.8818, + "step": 5344 + }, + { + "epoch": 0.2881712313996118, + "grad_norm": 0.7334546446800232, + "learning_rate": 9.876860503815208e-06, + "loss": 0.8162, + "step": 5345 + }, + { + "epoch": 0.28822514556825535, + "grad_norm": 0.8149188756942749, + "learning_rate": 9.876813738117852e-06, + "loss": 0.7801, + "step": 5346 + }, + { + "epoch": 0.2882790597368989, + "grad_norm": 0.8440023064613342, + "learning_rate": 9.876766963652642e-06, + "loss": 0.8394, + "step": 5347 + }, + { + "epoch": 0.28833297390554236, + "grad_norm": 0.7138864994049072, + "learning_rate": 9.876720180419664e-06, + "loss": 0.7316, + "step": 5348 + }, + { + "epoch": 0.2883868880741859, + "grad_norm": 0.7690035104751587, + "learning_rate": 9.876673388418999e-06, + "loss": 0.6458, + "step": 5349 + }, + { + "epoch": 0.2884408022428294, + "grad_norm": 0.844340980052948, + "learning_rate": 9.876626587650733e-06, + "loss": 0.8192, + "step": 5350 + }, + { + "epoch": 0.28849471641147295, + "grad_norm": 
0.7028863430023193, + "learning_rate": 9.87657977811495e-06, + "loss": 0.7469, + "step": 5351 + }, + { + "epoch": 0.2885486305801164, + "grad_norm": 0.6825146079063416, + "learning_rate": 9.876532959811735e-06, + "loss": 0.6672, + "step": 5352 + }, + { + "epoch": 0.28860254474875996, + "grad_norm": 0.928514838218689, + "learning_rate": 9.876486132741172e-06, + "loss": 0.9185, + "step": 5353 + }, + { + "epoch": 0.2886564589174035, + "grad_norm": 0.9195801615715027, + "learning_rate": 9.876439296903345e-06, + "loss": 0.887, + "step": 5354 + }, + { + "epoch": 0.288710373086047, + "grad_norm": 0.8025040030479431, + "learning_rate": 9.876392452298335e-06, + "loss": 0.7647, + "step": 5355 + }, + { + "epoch": 0.28876428725469055, + "grad_norm": 0.6811031699180603, + "learning_rate": 9.876345598926232e-06, + "loss": 0.7118, + "step": 5356 + }, + { + "epoch": 0.28881820142333403, + "grad_norm": 0.7687453031539917, + "learning_rate": 9.876298736787115e-06, + "loss": 0.8349, + "step": 5357 + }, + { + "epoch": 0.28887211559197756, + "grad_norm": 0.7131432890892029, + "learning_rate": 9.876251865881072e-06, + "loss": 0.7868, + "step": 5358 + }, + { + "epoch": 0.2889260297606211, + "grad_norm": 0.8985068202018738, + "learning_rate": 9.876204986208185e-06, + "loss": 0.8927, + "step": 5359 + }, + { + "epoch": 0.2889799439292646, + "grad_norm": 0.8284032344818115, + "learning_rate": 9.87615809776854e-06, + "loss": 0.9579, + "step": 5360 + }, + { + "epoch": 0.2890338580979081, + "grad_norm": 0.7818793058395386, + "learning_rate": 9.87611120056222e-06, + "loss": 0.8718, + "step": 5361 + }, + { + "epoch": 0.28908777226655163, + "grad_norm": 0.7686202526092529, + "learning_rate": 9.87606429458931e-06, + "loss": 0.7685, + "step": 5362 + }, + { + "epoch": 0.28914168643519517, + "grad_norm": 0.768067479133606, + "learning_rate": 9.876017379849892e-06, + "loss": 0.7785, + "step": 5363 + }, + { + "epoch": 0.2891956006038387, + "grad_norm": 0.846842885017395, + "learning_rate": 
9.875970456344055e-06, + "loss": 0.7418, + "step": 5364 + }, + { + "epoch": 0.28924951477248223, + "grad_norm": 0.800483226776123, + "learning_rate": 9.87592352407188e-06, + "loss": 0.8441, + "step": 5365 + }, + { + "epoch": 0.2893034289411257, + "grad_norm": 0.8230191469192505, + "learning_rate": 9.875876583033451e-06, + "loss": 0.8538, + "step": 5366 + }, + { + "epoch": 0.28935734310976924, + "grad_norm": 0.7700148224830627, + "learning_rate": 9.875829633228855e-06, + "loss": 0.7969, + "step": 5367 + }, + { + "epoch": 0.28941125727841277, + "grad_norm": 0.8188271522521973, + "learning_rate": 9.875782674658173e-06, + "loss": 0.8411, + "step": 5368 + }, + { + "epoch": 0.2894651714470563, + "grad_norm": 0.8774964809417725, + "learning_rate": 9.875735707321495e-06, + "loss": 0.9097, + "step": 5369 + }, + { + "epoch": 0.2895190856156998, + "grad_norm": 0.6922599077224731, + "learning_rate": 9.875688731218898e-06, + "loss": 0.7647, + "step": 5370 + }, + { + "epoch": 0.2895729997843433, + "grad_norm": 0.8296899795532227, + "learning_rate": 9.875641746350472e-06, + "loss": 0.8133, + "step": 5371 + }, + { + "epoch": 0.28962691395298684, + "grad_norm": 0.9972916841506958, + "learning_rate": 9.8755947527163e-06, + "loss": 0.9084, + "step": 5372 + }, + { + "epoch": 0.2896808281216304, + "grad_norm": 0.6791282892227173, + "learning_rate": 9.875547750316465e-06, + "loss": 0.6742, + "step": 5373 + }, + { + "epoch": 0.2897347422902739, + "grad_norm": 0.7278220057487488, + "learning_rate": 9.875500739151054e-06, + "loss": 0.7947, + "step": 5374 + }, + { + "epoch": 0.2897886564589174, + "grad_norm": 0.7634933590888977, + "learning_rate": 9.87545371922015e-06, + "loss": 0.8535, + "step": 5375 + }, + { + "epoch": 0.2898425706275609, + "grad_norm": 0.8038228750228882, + "learning_rate": 9.875406690523837e-06, + "loss": 0.8205, + "step": 5376 + }, + { + "epoch": 0.28989648479620445, + "grad_norm": 0.7821580767631531, + "learning_rate": 9.8753596530622e-06, + "loss": 0.7765, + "step": 
5377 + }, + { + "epoch": 0.289950398964848, + "grad_norm": 0.7491927742958069, + "learning_rate": 9.875312606835325e-06, + "loss": 0.7238, + "step": 5378 + }, + { + "epoch": 0.2900043131334915, + "grad_norm": 0.8357378840446472, + "learning_rate": 9.875265551843294e-06, + "loss": 0.8244, + "step": 5379 + }, + { + "epoch": 0.290058227302135, + "grad_norm": 0.792351484298706, + "learning_rate": 9.875218488086194e-06, + "loss": 0.7871, + "step": 5380 + }, + { + "epoch": 0.2901121414707785, + "grad_norm": 0.7484980225563049, + "learning_rate": 9.875171415564109e-06, + "loss": 0.7487, + "step": 5381 + }, + { + "epoch": 0.29016605563942205, + "grad_norm": 0.8140117526054382, + "learning_rate": 9.875124334277123e-06, + "loss": 0.7895, + "step": 5382 + }, + { + "epoch": 0.2902199698080656, + "grad_norm": 0.7369776964187622, + "learning_rate": 9.875077244225322e-06, + "loss": 0.7785, + "step": 5383 + }, + { + "epoch": 0.29027388397670906, + "grad_norm": 0.8499336242675781, + "learning_rate": 9.875030145408789e-06, + "loss": 0.8289, + "step": 5384 + }, + { + "epoch": 0.2903277981453526, + "grad_norm": 0.7209733724594116, + "learning_rate": 9.874983037827608e-06, + "loss": 0.6624, + "step": 5385 + }, + { + "epoch": 0.2903817123139961, + "grad_norm": 0.8489585518836975, + "learning_rate": 9.874935921481865e-06, + "loss": 0.8074, + "step": 5386 + }, + { + "epoch": 0.29043562648263965, + "grad_norm": 0.7765734195709229, + "learning_rate": 9.874888796371647e-06, + "loss": 0.7899, + "step": 5387 + }, + { + "epoch": 0.2904895406512832, + "grad_norm": 0.7301489114761353, + "learning_rate": 9.874841662497034e-06, + "loss": 0.6868, + "step": 5388 + }, + { + "epoch": 0.29054345481992666, + "grad_norm": 0.7872721552848816, + "learning_rate": 9.874794519858114e-06, + "loss": 0.8456, + "step": 5389 + }, + { + "epoch": 0.2905973689885702, + "grad_norm": 0.7796556949615479, + "learning_rate": 9.87474736845497e-06, + "loss": 0.7338, + "step": 5390 + }, + { + "epoch": 0.2906512831572137, + 
"grad_norm": 0.7958070635795593, + "learning_rate": 9.874700208287691e-06, + "loss": 0.773, + "step": 5391 + }, + { + "epoch": 0.29070519732585726, + "grad_norm": 0.8552476167678833, + "learning_rate": 9.874653039356356e-06, + "loss": 0.772, + "step": 5392 + }, + { + "epoch": 0.29075911149450073, + "grad_norm": 0.7346936464309692, + "learning_rate": 9.874605861661051e-06, + "loss": 0.7714, + "step": 5393 + }, + { + "epoch": 0.29081302566314426, + "grad_norm": 0.804050862789154, + "learning_rate": 9.874558675201864e-06, + "loss": 0.8539, + "step": 5394 + }, + { + "epoch": 0.2908669398317878, + "grad_norm": 0.7373083233833313, + "learning_rate": 9.874511479978879e-06, + "loss": 0.7483, + "step": 5395 + }, + { + "epoch": 0.29092085400043133, + "grad_norm": 0.8145542740821838, + "learning_rate": 9.874464275992177e-06, + "loss": 0.7697, + "step": 5396 + }, + { + "epoch": 0.29097476816907486, + "grad_norm": 0.6865667700767517, + "learning_rate": 9.874417063241848e-06, + "loss": 0.771, + "step": 5397 + }, + { + "epoch": 0.29102868233771834, + "grad_norm": 0.7204734086990356, + "learning_rate": 9.874369841727973e-06, + "loss": 0.7562, + "step": 5398 + }, + { + "epoch": 0.29108259650636187, + "grad_norm": 0.8261793851852417, + "learning_rate": 9.87432261145064e-06, + "loss": 0.8245, + "step": 5399 + }, + { + "epoch": 0.2911365106750054, + "grad_norm": 0.7563614845275879, + "learning_rate": 9.87427537240993e-06, + "loss": 0.8051, + "step": 5400 + }, + { + "epoch": 0.29119042484364893, + "grad_norm": 0.7967458367347717, + "learning_rate": 9.874228124605932e-06, + "loss": 0.8236, + "step": 5401 + }, + { + "epoch": 0.2912443390122924, + "grad_norm": 0.806373119354248, + "learning_rate": 9.874180868038729e-06, + "loss": 0.8202, + "step": 5402 + }, + { + "epoch": 0.29129825318093594, + "grad_norm": 0.6726234555244446, + "learning_rate": 9.874133602708406e-06, + "loss": 0.7128, + "step": 5403 + }, + { + "epoch": 0.29135216734957947, + "grad_norm": 0.7642708420753479, + 
"learning_rate": 9.874086328615047e-06, + "loss": 0.8134, + "step": 5404 + }, + { + "epoch": 0.291406081518223, + "grad_norm": 0.6992095708847046, + "learning_rate": 9.874039045758742e-06, + "loss": 0.6887, + "step": 5405 + }, + { + "epoch": 0.29145999568686654, + "grad_norm": 0.7869388461112976, + "learning_rate": 9.873991754139567e-06, + "loss": 0.8069, + "step": 5406 + }, + { + "epoch": 0.29151390985551, + "grad_norm": 0.7390547394752502, + "learning_rate": 9.873944453757616e-06, + "loss": 0.7591, + "step": 5407 + }, + { + "epoch": 0.29156782402415354, + "grad_norm": 0.6705611348152161, + "learning_rate": 9.873897144612968e-06, + "loss": 0.7474, + "step": 5408 + }, + { + "epoch": 0.2916217381927971, + "grad_norm": 0.7684745788574219, + "learning_rate": 9.873849826705711e-06, + "loss": 0.7477, + "step": 5409 + }, + { + "epoch": 0.2916756523614406, + "grad_norm": 0.7341989278793335, + "learning_rate": 9.87380250003593e-06, + "loss": 0.7634, + "step": 5410 + }, + { + "epoch": 0.2917295665300841, + "grad_norm": 0.7358923554420471, + "learning_rate": 9.873755164603708e-06, + "loss": 0.7, + "step": 5411 + }, + { + "epoch": 0.2917834806987276, + "grad_norm": 0.8319085836410522, + "learning_rate": 9.873707820409132e-06, + "loss": 0.859, + "step": 5412 + }, + { + "epoch": 0.29183739486737115, + "grad_norm": 0.8299946188926697, + "learning_rate": 9.873660467452288e-06, + "loss": 0.9912, + "step": 5413 + }, + { + "epoch": 0.2918913090360147, + "grad_norm": 0.7632084488868713, + "learning_rate": 9.87361310573326e-06, + "loss": 0.7579, + "step": 5414 + }, + { + "epoch": 0.2919452232046582, + "grad_norm": 0.8068237900733948, + "learning_rate": 9.873565735252131e-06, + "loss": 0.7249, + "step": 5415 + }, + { + "epoch": 0.2919991373733017, + "grad_norm": 0.8328914046287537, + "learning_rate": 9.873518356008988e-06, + "loss": 0.7903, + "step": 5416 + }, + { + "epoch": 0.2920530515419452, + "grad_norm": 0.7877300977706909, + "learning_rate": 9.873470968003917e-06, + "loss": 
0.8328, + "step": 5417 + }, + { + "epoch": 0.29210696571058875, + "grad_norm": 0.7755314111709595, + "learning_rate": 9.873423571237004e-06, + "loss": 0.6584, + "step": 5418 + }, + { + "epoch": 0.2921608798792323, + "grad_norm": 0.8157472014427185, + "learning_rate": 9.873376165708332e-06, + "loss": 0.6761, + "step": 5419 + }, + { + "epoch": 0.29221479404787576, + "grad_norm": 0.7559711933135986, + "learning_rate": 9.873328751417985e-06, + "loss": 0.8345, + "step": 5420 + }, + { + "epoch": 0.2922687082165193, + "grad_norm": 0.8466331958770752, + "learning_rate": 9.873281328366053e-06, + "loss": 0.7568, + "step": 5421 + }, + { + "epoch": 0.2923226223851628, + "grad_norm": 0.7468219995498657, + "learning_rate": 9.873233896552617e-06, + "loss": 0.7857, + "step": 5422 + }, + { + "epoch": 0.29237653655380635, + "grad_norm": 0.7857210040092468, + "learning_rate": 9.873186455977763e-06, + "loss": 0.7557, + "step": 5423 + }, + { + "epoch": 0.2924304507224499, + "grad_norm": 0.7680637240409851, + "learning_rate": 9.873139006641577e-06, + "loss": 0.7225, + "step": 5424 + }, + { + "epoch": 0.29248436489109336, + "grad_norm": 0.7393225431442261, + "learning_rate": 9.873091548544146e-06, + "loss": 0.7978, + "step": 5425 + }, + { + "epoch": 0.2925382790597369, + "grad_norm": 0.8140562176704407, + "learning_rate": 9.873044081685552e-06, + "loss": 0.8496, + "step": 5426 + }, + { + "epoch": 0.2925921932283804, + "grad_norm": 0.7890025973320007, + "learning_rate": 9.872996606065883e-06, + "loss": 0.7475, + "step": 5427 + }, + { + "epoch": 0.29264610739702396, + "grad_norm": 0.8253166079521179, + "learning_rate": 9.872949121685223e-06, + "loss": 0.8336, + "step": 5428 + }, + { + "epoch": 0.29270002156566743, + "grad_norm": 0.9723641276359558, + "learning_rate": 9.872901628543657e-06, + "loss": 0.83, + "step": 5429 + }, + { + "epoch": 0.29275393573431097, + "grad_norm": 0.884645938873291, + "learning_rate": 9.87285412664127e-06, + "loss": 0.8324, + "step": 5430 + }, + { + "epoch": 
0.2928078499029545, + "grad_norm": 0.7741670608520508, + "learning_rate": 9.872806615978152e-06, + "loss": 0.8724, + "step": 5431 + }, + { + "epoch": 0.29286176407159803, + "grad_norm": 0.6959695219993591, + "learning_rate": 9.872759096554383e-06, + "loss": 0.657, + "step": 5432 + }, + { + "epoch": 0.29291567824024156, + "grad_norm": 0.7823370695114136, + "learning_rate": 9.872711568370051e-06, + "loss": 0.7939, + "step": 5433 + }, + { + "epoch": 0.29296959240888504, + "grad_norm": 0.7705811858177185, + "learning_rate": 9.87266403142524e-06, + "loss": 0.7604, + "step": 5434 + }, + { + "epoch": 0.29302350657752857, + "grad_norm": 0.7560339570045471, + "learning_rate": 9.872616485720037e-06, + "loss": 0.7303, + "step": 5435 + }, + { + "epoch": 0.2930774207461721, + "grad_norm": 0.7380449771881104, + "learning_rate": 9.872568931254524e-06, + "loss": 0.6181, + "step": 5436 + }, + { + "epoch": 0.29313133491481563, + "grad_norm": 0.743810772895813, + "learning_rate": 9.872521368028794e-06, + "loss": 0.8403, + "step": 5437 + }, + { + "epoch": 0.2931852490834591, + "grad_norm": 0.7859793901443481, + "learning_rate": 9.872473796042924e-06, + "loss": 0.8448, + "step": 5438 + }, + { + "epoch": 0.29323916325210264, + "grad_norm": 0.7643007040023804, + "learning_rate": 9.872426215297003e-06, + "loss": 0.757, + "step": 5439 + }, + { + "epoch": 0.2932930774207462, + "grad_norm": 0.7227921485900879, + "learning_rate": 9.87237862579112e-06, + "loss": 0.7882, + "step": 5440 + }, + { + "epoch": 0.2933469915893897, + "grad_norm": 0.7416848540306091, + "learning_rate": 9.872331027525356e-06, + "loss": 0.7644, + "step": 5441 + }, + { + "epoch": 0.29340090575803324, + "grad_norm": 0.7258424758911133, + "learning_rate": 9.872283420499797e-06, + "loss": 0.6828, + "step": 5442 + }, + { + "epoch": 0.2934548199266767, + "grad_norm": 0.7854428291320801, + "learning_rate": 9.87223580471453e-06, + "loss": 0.795, + "step": 5443 + }, + { + "epoch": 0.29350873409532025, + "grad_norm": 
0.7590177655220032, + "learning_rate": 9.87218818016964e-06, + "loss": 0.7798, + "step": 5444 + }, + { + "epoch": 0.2935626482639638, + "grad_norm": 0.7291384339332581, + "learning_rate": 9.872140546865212e-06, + "loss": 0.7249, + "step": 5445 + }, + { + "epoch": 0.2936165624326073, + "grad_norm": 0.8444628119468689, + "learning_rate": 9.872092904801334e-06, + "loss": 0.824, + "step": 5446 + }, + { + "epoch": 0.2936704766012508, + "grad_norm": 0.7586516737937927, + "learning_rate": 9.87204525397809e-06, + "loss": 0.7716, + "step": 5447 + }, + { + "epoch": 0.2937243907698943, + "grad_norm": 0.7367489337921143, + "learning_rate": 9.871997594395565e-06, + "loss": 0.6108, + "step": 5448 + }, + { + "epoch": 0.29377830493853785, + "grad_norm": 0.8746148347854614, + "learning_rate": 9.871949926053845e-06, + "loss": 0.841, + "step": 5449 + }, + { + "epoch": 0.2938322191071814, + "grad_norm": 0.8738248944282532, + "learning_rate": 9.871902248953017e-06, + "loss": 0.7911, + "step": 5450 + }, + { + "epoch": 0.2938861332758249, + "grad_norm": 0.8541892766952515, + "learning_rate": 9.871854563093167e-06, + "loss": 0.8283, + "step": 5451 + }, + { + "epoch": 0.2939400474444684, + "grad_norm": 0.7325894832611084, + "learning_rate": 9.871806868474376e-06, + "loss": 0.6988, + "step": 5452 + }, + { + "epoch": 0.2939939616131119, + "grad_norm": 0.730920135974884, + "learning_rate": 9.871759165096735e-06, + "loss": 0.7696, + "step": 5453 + }, + { + "epoch": 0.29404787578175545, + "grad_norm": 0.8190314173698425, + "learning_rate": 9.871711452960329e-06, + "loss": 0.8021, + "step": 5454 + }, + { + "epoch": 0.294101789950399, + "grad_norm": 0.7794191241264343, + "learning_rate": 9.871663732065243e-06, + "loss": 0.7141, + "step": 5455 + }, + { + "epoch": 0.29415570411904246, + "grad_norm": 0.729831874370575, + "learning_rate": 9.871616002411561e-06, + "loss": 0.7142, + "step": 5456 + }, + { + "epoch": 0.294209618287686, + "grad_norm": 0.8393380641937256, + "learning_rate": 
9.871568263999371e-06, + "loss": 0.8494, + "step": 5457 + }, + { + "epoch": 0.2942635324563295, + "grad_norm": 0.7556251883506775, + "learning_rate": 9.87152051682876e-06, + "loss": 0.6954, + "step": 5458 + }, + { + "epoch": 0.29431744662497306, + "grad_norm": 0.7716967463493347, + "learning_rate": 9.87147276089981e-06, + "loss": 0.7139, + "step": 5459 + }, + { + "epoch": 0.2943713607936166, + "grad_norm": 0.7605961561203003, + "learning_rate": 9.871424996212611e-06, + "loss": 0.7788, + "step": 5460 + }, + { + "epoch": 0.29442527496226006, + "grad_norm": 0.7812150716781616, + "learning_rate": 9.871377222767245e-06, + "loss": 0.8462, + "step": 5461 + }, + { + "epoch": 0.2944791891309036, + "grad_norm": 0.7436057925224304, + "learning_rate": 9.8713294405638e-06, + "loss": 0.859, + "step": 5462 + }, + { + "epoch": 0.2945331032995471, + "grad_norm": 0.8104838132858276, + "learning_rate": 9.871281649602362e-06, + "loss": 0.8203, + "step": 5463 + }, + { + "epoch": 0.29458701746819066, + "grad_norm": 0.730912446975708, + "learning_rate": 9.871233849883018e-06, + "loss": 0.8419, + "step": 5464 + }, + { + "epoch": 0.29464093163683414, + "grad_norm": 0.7726290822029114, + "learning_rate": 9.871186041405852e-06, + "loss": 0.8276, + "step": 5465 + }, + { + "epoch": 0.29469484580547767, + "grad_norm": 0.7509479522705078, + "learning_rate": 9.871138224170949e-06, + "loss": 0.656, + "step": 5466 + }, + { + "epoch": 0.2947487599741212, + "grad_norm": 1.2936142683029175, + "learning_rate": 9.871090398178396e-06, + "loss": 0.7648, + "step": 5467 + }, + { + "epoch": 0.29480267414276473, + "grad_norm": 0.7731900215148926, + "learning_rate": 9.87104256342828e-06, + "loss": 0.8216, + "step": 5468 + }, + { + "epoch": 0.29485658831140826, + "grad_norm": 0.7106019258499146, + "learning_rate": 9.870994719920688e-06, + "loss": 0.6923, + "step": 5469 + }, + { + "epoch": 0.29491050248005174, + "grad_norm": 0.7590166926383972, + "learning_rate": 9.870946867655704e-06, + "loss": 0.7469, + 
"step": 5470 + }, + { + "epoch": 0.29496441664869527, + "grad_norm": 0.7591565847396851, + "learning_rate": 9.870899006633414e-06, + "loss": 0.8032, + "step": 5471 + }, + { + "epoch": 0.2950183308173388, + "grad_norm": 0.9401304125785828, + "learning_rate": 9.870851136853904e-06, + "loss": 0.7261, + "step": 5472 + }, + { + "epoch": 0.29507224498598233, + "grad_norm": 0.7991933822631836, + "learning_rate": 9.870803258317261e-06, + "loss": 0.7226, + "step": 5473 + }, + { + "epoch": 0.2951261591546258, + "grad_norm": 0.7324903011322021, + "learning_rate": 9.87075537102357e-06, + "loss": 0.7444, + "step": 5474 + }, + { + "epoch": 0.29518007332326934, + "grad_norm": 0.7185311317443848, + "learning_rate": 9.87070747497292e-06, + "loss": 0.7125, + "step": 5475 + }, + { + "epoch": 0.2952339874919129, + "grad_norm": 0.750343382358551, + "learning_rate": 9.870659570165393e-06, + "loss": 0.7458, + "step": 5476 + }, + { + "epoch": 0.2952879016605564, + "grad_norm": 0.8604345917701721, + "learning_rate": 9.870611656601077e-06, + "loss": 0.7445, + "step": 5477 + }, + { + "epoch": 0.29534181582919994, + "grad_norm": 0.7870331406593323, + "learning_rate": 9.870563734280059e-06, + "loss": 0.7116, + "step": 5478 + }, + { + "epoch": 0.2953957299978434, + "grad_norm": 0.6978838443756104, + "learning_rate": 9.870515803202424e-06, + "loss": 0.7563, + "step": 5479 + }, + { + "epoch": 0.29544964416648695, + "grad_norm": 1.5832971334457397, + "learning_rate": 9.870467863368258e-06, + "loss": 0.7095, + "step": 5480 + }, + { + "epoch": 0.2955035583351305, + "grad_norm": 0.7247046828269958, + "learning_rate": 9.870419914777646e-06, + "loss": 0.7586, + "step": 5481 + }, + { + "epoch": 0.295557472503774, + "grad_norm": 0.7100489735603333, + "learning_rate": 9.87037195743068e-06, + "loss": 0.804, + "step": 5482 + }, + { + "epoch": 0.2956113866724175, + "grad_norm": 0.78151935338974, + "learning_rate": 9.87032399132744e-06, + "loss": 0.7443, + "step": 5483 + }, + { + "epoch": 0.295665300841061, + 
"grad_norm": 0.7440445423126221, + "learning_rate": 9.870276016468013e-06, + "loss": 0.7476, + "step": 5484 + }, + { + "epoch": 0.29571921500970455, + "grad_norm": 0.7003461718559265, + "learning_rate": 9.870228032852489e-06, + "loss": 0.7401, + "step": 5485 + }, + { + "epoch": 0.2957731291783481, + "grad_norm": 0.7338505387306213, + "learning_rate": 9.87018004048095e-06, + "loss": 0.7757, + "step": 5486 + }, + { + "epoch": 0.2958270433469916, + "grad_norm": 0.7721376419067383, + "learning_rate": 9.870132039353484e-06, + "loss": 0.8646, + "step": 5487 + }, + { + "epoch": 0.2958809575156351, + "grad_norm": 0.7995434999465942, + "learning_rate": 9.870084029470179e-06, + "loss": 0.7917, + "step": 5488 + }, + { + "epoch": 0.2959348716842786, + "grad_norm": 0.8954901099205017, + "learning_rate": 9.87003601083112e-06, + "loss": 0.8308, + "step": 5489 + }, + { + "epoch": 0.29598878585292215, + "grad_norm": 0.7231770753860474, + "learning_rate": 9.86998798343639e-06, + "loss": 0.8126, + "step": 5490 + }, + { + "epoch": 0.2960427000215657, + "grad_norm": 0.8772289752960205, + "learning_rate": 9.869939947286081e-06, + "loss": 0.8513, + "step": 5491 + }, + { + "epoch": 0.29609661419020916, + "grad_norm": 0.726995050907135, + "learning_rate": 9.869891902380276e-06, + "loss": 0.6717, + "step": 5492 + }, + { + "epoch": 0.2961505283588527, + "grad_norm": 0.7519280910491943, + "learning_rate": 9.869843848719062e-06, + "loss": 0.7634, + "step": 5493 + }, + { + "epoch": 0.2962044425274962, + "grad_norm": 0.8302793502807617, + "learning_rate": 9.869795786302528e-06, + "loss": 0.845, + "step": 5494 + }, + { + "epoch": 0.29625835669613976, + "grad_norm": 0.9483422636985779, + "learning_rate": 9.869747715130756e-06, + "loss": 0.8187, + "step": 5495 + }, + { + "epoch": 0.2963122708647833, + "grad_norm": 0.808182418346405, + "learning_rate": 9.869699635203833e-06, + "loss": 0.8221, + "step": 5496 + }, + { + "epoch": 0.29636618503342677, + "grad_norm": 0.8152076601982117, + 
"learning_rate": 9.869651546521848e-06, + "loss": 0.8683, + "step": 5497 + }, + { + "epoch": 0.2964200992020703, + "grad_norm": 0.9072142243385315, + "learning_rate": 9.869603449084886e-06, + "loss": 0.8853, + "step": 5498 + }, + { + "epoch": 0.29647401337071383, + "grad_norm": 0.7798082828521729, + "learning_rate": 9.869555342893035e-06, + "loss": 0.84, + "step": 5499 + }, + { + "epoch": 0.29652792753935736, + "grad_norm": 0.7505926489830017, + "learning_rate": 9.869507227946378e-06, + "loss": 0.748, + "step": 5500 + }, + { + "epoch": 0.29658184170800084, + "grad_norm": 0.7643826007843018, + "learning_rate": 9.869459104245006e-06, + "loss": 0.7423, + "step": 5501 + }, + { + "epoch": 0.29663575587664437, + "grad_norm": 0.8993945717811584, + "learning_rate": 9.869410971789003e-06, + "loss": 0.7736, + "step": 5502 + }, + { + "epoch": 0.2966896700452879, + "grad_norm": 0.8132869005203247, + "learning_rate": 9.869362830578455e-06, + "loss": 0.8926, + "step": 5503 + }, + { + "epoch": 0.29674358421393143, + "grad_norm": 0.7741131782531738, + "learning_rate": 9.869314680613449e-06, + "loss": 0.7087, + "step": 5504 + }, + { + "epoch": 0.29679749838257496, + "grad_norm": 0.83815598487854, + "learning_rate": 9.869266521894073e-06, + "loss": 0.808, + "step": 5505 + }, + { + "epoch": 0.29685141255121844, + "grad_norm": 0.7051485180854797, + "learning_rate": 9.869218354420413e-06, + "loss": 0.695, + "step": 5506 + }, + { + "epoch": 0.296905326719862, + "grad_norm": 0.7514739036560059, + "learning_rate": 9.869170178192554e-06, + "loss": 0.7496, + "step": 5507 + }, + { + "epoch": 0.2969592408885055, + "grad_norm": 0.8005251288414001, + "learning_rate": 9.869121993210582e-06, + "loss": 0.8144, + "step": 5508 + }, + { + "epoch": 0.29701315505714904, + "grad_norm": 0.7894544005393982, + "learning_rate": 9.86907379947459e-06, + "loss": 0.7345, + "step": 5509 + }, + { + "epoch": 0.2970670692257925, + "grad_norm": 0.7498524785041809, + "learning_rate": 9.869025596984655e-06, + "loss": 
0.6906, + "step": 5510 + }, + { + "epoch": 0.29712098339443604, + "grad_norm": 0.7346488237380981, + "learning_rate": 9.868977385740873e-06, + "loss": 0.7512, + "step": 5511 + }, + { + "epoch": 0.2971748975630796, + "grad_norm": 0.8185198307037354, + "learning_rate": 9.868929165743323e-06, + "loss": 0.847, + "step": 5512 + }, + { + "epoch": 0.2972288117317231, + "grad_norm": 0.7798783183097839, + "learning_rate": 9.868880936992095e-06, + "loss": 0.8442, + "step": 5513 + }, + { + "epoch": 0.29728272590036664, + "grad_norm": 0.862074077129364, + "learning_rate": 9.868832699487279e-06, + "loss": 0.8357, + "step": 5514 + }, + { + "epoch": 0.2973366400690101, + "grad_norm": 0.7395896911621094, + "learning_rate": 9.868784453228957e-06, + "loss": 0.7449, + "step": 5515 + }, + { + "epoch": 0.29739055423765365, + "grad_norm": 0.7291044592857361, + "learning_rate": 9.868736198217215e-06, + "loss": 0.7686, + "step": 5516 + }, + { + "epoch": 0.2974444684062972, + "grad_norm": 1.070936918258667, + "learning_rate": 9.868687934452143e-06, + "loss": 0.8639, + "step": 5517 + }, + { + "epoch": 0.2974983825749407, + "grad_norm": 0.7176975607872009, + "learning_rate": 9.868639661933828e-06, + "loss": 0.7579, + "step": 5518 + }, + { + "epoch": 0.2975522967435842, + "grad_norm": 0.7830207943916321, + "learning_rate": 9.868591380662356e-06, + "loss": 0.7744, + "step": 5519 + }, + { + "epoch": 0.2976062109122277, + "grad_norm": 1.0292960405349731, + "learning_rate": 9.868543090637812e-06, + "loss": 0.6333, + "step": 5520 + }, + { + "epoch": 0.29766012508087125, + "grad_norm": 0.7741127014160156, + "learning_rate": 9.868494791860285e-06, + "loss": 0.7859, + "step": 5521 + }, + { + "epoch": 0.2977140392495148, + "grad_norm": 0.8201294541358948, + "learning_rate": 9.86844648432986e-06, + "loss": 0.8045, + "step": 5522 + }, + { + "epoch": 0.2977679534181583, + "grad_norm": 0.7732555866241455, + "learning_rate": 9.868398168046625e-06, + "loss": 0.7253, + "step": 5523 + }, + { + "epoch": 
0.2978218675868018, + "grad_norm": 0.727921724319458, + "learning_rate": 9.868349843010668e-06, + "loss": 0.7155, + "step": 5524 + }, + { + "epoch": 0.2978757817554453, + "grad_norm": 0.7359254360198975, + "learning_rate": 9.868301509222072e-06, + "loss": 0.894, + "step": 5525 + }, + { + "epoch": 0.29792969592408886, + "grad_norm": 0.8356531858444214, + "learning_rate": 9.868253166680927e-06, + "loss": 0.7506, + "step": 5526 + }, + { + "epoch": 0.2979836100927324, + "grad_norm": 0.8150777816772461, + "learning_rate": 9.868204815387321e-06, + "loss": 0.7737, + "step": 5527 + }, + { + "epoch": 0.29803752426137586, + "grad_norm": 0.7688710689544678, + "learning_rate": 9.86815645534134e-06, + "loss": 0.7656, + "step": 5528 + }, + { + "epoch": 0.2980914384300194, + "grad_norm": 0.7309591174125671, + "learning_rate": 9.868108086543069e-06, + "loss": 0.7655, + "step": 5529 + }, + { + "epoch": 0.2981453525986629, + "grad_norm": 0.9307131767272949, + "learning_rate": 9.868059708992595e-06, + "loss": 0.742, + "step": 5530 + }, + { + "epoch": 0.29819926676730646, + "grad_norm": 0.7241950631141663, + "learning_rate": 9.868011322690008e-06, + "loss": 0.7113, + "step": 5531 + }, + { + "epoch": 0.29825318093595, + "grad_norm": 0.8070489168167114, + "learning_rate": 9.867962927635393e-06, + "loss": 0.7835, + "step": 5532 + }, + { + "epoch": 0.29830709510459347, + "grad_norm": 0.6972863078117371, + "learning_rate": 9.867914523828836e-06, + "loss": 0.6914, + "step": 5533 + }, + { + "epoch": 0.298361009273237, + "grad_norm": 0.8001635670661926, + "learning_rate": 9.867866111270425e-06, + "loss": 0.82, + "step": 5534 + }, + { + "epoch": 0.29841492344188053, + "grad_norm": 0.7933236956596375, + "learning_rate": 9.867817689960249e-06, + "loss": 0.8148, + "step": 5535 + }, + { + "epoch": 0.29846883761052406, + "grad_norm": 0.7881083488464355, + "learning_rate": 9.867769259898393e-06, + "loss": 0.8342, + "step": 5536 + }, + { + "epoch": 0.29852275177916754, + "grad_norm": 
0.7492312788963318, + "learning_rate": 9.867720821084943e-06, + "loss": 0.7736, + "step": 5537 + }, + { + "epoch": 0.29857666594781107, + "grad_norm": 0.7429683804512024, + "learning_rate": 9.86767237351999e-06, + "loss": 0.7602, + "step": 5538 + }, + { + "epoch": 0.2986305801164546, + "grad_norm": 0.7982121109962463, + "learning_rate": 9.867623917203618e-06, + "loss": 0.9007, + "step": 5539 + }, + { + "epoch": 0.29868449428509813, + "grad_norm": 0.77519291639328, + "learning_rate": 9.867575452135911e-06, + "loss": 0.7136, + "step": 5540 + }, + { + "epoch": 0.29873840845374167, + "grad_norm": 0.8341544270515442, + "learning_rate": 9.867526978316963e-06, + "loss": 0.852, + "step": 5541 + }, + { + "epoch": 0.29879232262238514, + "grad_norm": 0.8006002306938171, + "learning_rate": 9.867478495746859e-06, + "loss": 0.7557, + "step": 5542 + }, + { + "epoch": 0.2988462367910287, + "grad_norm": 0.7797364592552185, + "learning_rate": 9.867430004425683e-06, + "loss": 0.7776, + "step": 5543 + }, + { + "epoch": 0.2989001509596722, + "grad_norm": 0.8187147378921509, + "learning_rate": 9.867381504353525e-06, + "loss": 0.7871, + "step": 5544 + }, + { + "epoch": 0.29895406512831574, + "grad_norm": 0.8447971343994141, + "learning_rate": 9.867332995530471e-06, + "loss": 0.7563, + "step": 5545 + }, + { + "epoch": 0.2990079792969592, + "grad_norm": 0.7229753136634827, + "learning_rate": 9.867284477956608e-06, + "loss": 0.7227, + "step": 5546 + }, + { + "epoch": 0.29906189346560275, + "grad_norm": 0.7300926446914673, + "learning_rate": 9.867235951632026e-06, + "loss": 0.834, + "step": 5547 + }, + { + "epoch": 0.2991158076342463, + "grad_norm": 0.873554527759552, + "learning_rate": 9.86718741655681e-06, + "loss": 0.8454, + "step": 5548 + }, + { + "epoch": 0.2991697218028898, + "grad_norm": 0.7391233444213867, + "learning_rate": 9.867138872731047e-06, + "loss": 0.7505, + "step": 5549 + }, + { + "epoch": 0.29922363597153334, + "grad_norm": 0.7330740690231323, + "learning_rate": 
9.867090320154824e-06, + "loss": 0.776, + "step": 5550 + }, + { + "epoch": 0.2992775501401768, + "grad_norm": 0.7050237655639648, + "learning_rate": 9.867041758828231e-06, + "loss": 0.7063, + "step": 5551 + }, + { + "epoch": 0.29933146430882035, + "grad_norm": 0.7757040858268738, + "learning_rate": 9.86699318875135e-06, + "loss": 0.757, + "step": 5552 + }, + { + "epoch": 0.2993853784774639, + "grad_norm": 0.7693188190460205, + "learning_rate": 9.866944609924274e-06, + "loss": 0.8058, + "step": 5553 + }, + { + "epoch": 0.2994392926461074, + "grad_norm": 0.8201676607131958, + "learning_rate": 9.866896022347088e-06, + "loss": 0.8317, + "step": 5554 + }, + { + "epoch": 0.2994932068147509, + "grad_norm": 0.768905758857727, + "learning_rate": 9.866847426019878e-06, + "loss": 0.705, + "step": 5555 + }, + { + "epoch": 0.2995471209833944, + "grad_norm": 0.7787859439849854, + "learning_rate": 9.866798820942735e-06, + "loss": 0.84, + "step": 5556 + }, + { + "epoch": 0.29960103515203795, + "grad_norm": 0.7377595901489258, + "learning_rate": 9.866750207115742e-06, + "loss": 0.7687, + "step": 5557 + }, + { + "epoch": 0.2996549493206815, + "grad_norm": 0.7098401784896851, + "learning_rate": 9.86670158453899e-06, + "loss": 0.6778, + "step": 5558 + }, + { + "epoch": 0.299708863489325, + "grad_norm": 0.7346776723861694, + "learning_rate": 9.866652953212563e-06, + "loss": 0.7201, + "step": 5559 + }, + { + "epoch": 0.2997627776579685, + "grad_norm": 0.9845607280731201, + "learning_rate": 9.866604313136551e-06, + "loss": 0.784, + "step": 5560 + }, + { + "epoch": 0.299816691826612, + "grad_norm": 0.7436274886131287, + "learning_rate": 9.866555664311042e-06, + "loss": 0.7904, + "step": 5561 + }, + { + "epoch": 0.29987060599525556, + "grad_norm": 0.712096095085144, + "learning_rate": 9.866507006736123e-06, + "loss": 0.6796, + "step": 5562 + }, + { + "epoch": 0.2999245201638991, + "grad_norm": 0.7324764132499695, + "learning_rate": 9.86645834041188e-06, + "loss": 0.7305, + "step": 5563 + 
}, + { + "epoch": 0.29997843433254257, + "grad_norm": 0.7124375104904175, + "learning_rate": 9.866409665338399e-06, + "loss": 0.7425, + "step": 5564 + }, + { + "epoch": 0.3000323485011861, + "grad_norm": 0.8470612168312073, + "learning_rate": 9.866360981515772e-06, + "loss": 0.7878, + "step": 5565 + }, + { + "epoch": 0.30008626266982963, + "grad_norm": 0.7153372168540955, + "learning_rate": 9.866312288944084e-06, + "loss": 0.8024, + "step": 5566 + }, + { + "epoch": 0.30014017683847316, + "grad_norm": 0.8719370365142822, + "learning_rate": 9.866263587623421e-06, + "loss": 0.8848, + "step": 5567 + }, + { + "epoch": 0.3001940910071167, + "grad_norm": 0.6834362745285034, + "learning_rate": 9.866214877553874e-06, + "loss": 0.7227, + "step": 5568 + }, + { + "epoch": 0.30024800517576017, + "grad_norm": 0.7608352899551392, + "learning_rate": 9.86616615873553e-06, + "loss": 0.7771, + "step": 5569 + }, + { + "epoch": 0.3003019193444037, + "grad_norm": 0.7039864659309387, + "learning_rate": 9.866117431168475e-06, + "loss": 0.6952, + "step": 5570 + }, + { + "epoch": 0.30035583351304723, + "grad_norm": 0.9486655592918396, + "learning_rate": 9.866068694852795e-06, + "loss": 0.7245, + "step": 5571 + }, + { + "epoch": 0.30040974768169076, + "grad_norm": 0.817467987537384, + "learning_rate": 9.866019949788583e-06, + "loss": 0.8019, + "step": 5572 + }, + { + "epoch": 0.30046366185033424, + "grad_norm": 0.9697426557540894, + "learning_rate": 9.865971195975922e-06, + "loss": 0.819, + "step": 5573 + }, + { + "epoch": 0.3005175760189778, + "grad_norm": 0.9181515574455261, + "learning_rate": 9.865922433414899e-06, + "loss": 0.6899, + "step": 5574 + }, + { + "epoch": 0.3005714901876213, + "grad_norm": 0.8325493335723877, + "learning_rate": 9.865873662105607e-06, + "loss": 0.7534, + "step": 5575 + }, + { + "epoch": 0.30062540435626484, + "grad_norm": 0.7544535994529724, + "learning_rate": 9.865824882048127e-06, + "loss": 0.7713, + "step": 5576 + }, + { + "epoch": 0.30067931852490837, + 
"grad_norm": 0.865722119808197, + "learning_rate": 9.865776093242553e-06, + "loss": 0.7043, + "step": 5577 + }, + { + "epoch": 0.30073323269355184, + "grad_norm": 0.8197489976882935, + "learning_rate": 9.86572729568897e-06, + "loss": 0.8126, + "step": 5578 + }, + { + "epoch": 0.3007871468621954, + "grad_norm": 0.8265332579612732, + "learning_rate": 9.865678489387464e-06, + "loss": 0.8225, + "step": 5579 + }, + { + "epoch": 0.3008410610308389, + "grad_norm": 0.7778556942939758, + "learning_rate": 9.865629674338124e-06, + "loss": 0.7358, + "step": 5580 + }, + { + "epoch": 0.30089497519948244, + "grad_norm": 0.821940004825592, + "learning_rate": 9.865580850541039e-06, + "loss": 0.8311, + "step": 5581 + }, + { + "epoch": 0.3009488893681259, + "grad_norm": 0.7767245769500732, + "learning_rate": 9.865532017996296e-06, + "loss": 0.7822, + "step": 5582 + }, + { + "epoch": 0.30100280353676945, + "grad_norm": 0.7898547053337097, + "learning_rate": 9.865483176703982e-06, + "loss": 0.7554, + "step": 5583 + }, + { + "epoch": 0.301056717705413, + "grad_norm": 0.7359830141067505, + "learning_rate": 9.865434326664185e-06, + "loss": 0.7927, + "step": 5584 + }, + { + "epoch": 0.3011106318740565, + "grad_norm": 0.8576635718345642, + "learning_rate": 9.865385467876996e-06, + "loss": 0.7179, + "step": 5585 + }, + { + "epoch": 0.30116454604270004, + "grad_norm": 0.8060041069984436, + "learning_rate": 9.865336600342496e-06, + "loss": 0.8445, + "step": 5586 + }, + { + "epoch": 0.3012184602113435, + "grad_norm": 0.8457094430923462, + "learning_rate": 9.86528772406078e-06, + "loss": 0.7395, + "step": 5587 + }, + { + "epoch": 0.30127237437998705, + "grad_norm": 0.8461765646934509, + "learning_rate": 9.865238839031932e-06, + "loss": 0.7613, + "step": 5588 + }, + { + "epoch": 0.3013262885486306, + "grad_norm": 0.7543842792510986, + "learning_rate": 9.865189945256041e-06, + "loss": 0.7798, + "step": 5589 + }, + { + "epoch": 0.3013802027172741, + "grad_norm": 0.7907589077949524, + 
"learning_rate": 9.865141042733195e-06, + "loss": 0.7322, + "step": 5590 + }, + { + "epoch": 0.30143411688591765, + "grad_norm": 0.8245344758033752, + "learning_rate": 9.86509213146348e-06, + "loss": 0.8101, + "step": 5591 + }, + { + "epoch": 0.3014880310545611, + "grad_norm": 0.8601032495498657, + "learning_rate": 9.865043211446987e-06, + "loss": 0.7518, + "step": 5592 + }, + { + "epoch": 0.30154194522320465, + "grad_norm": 0.7512524724006653, + "learning_rate": 9.864994282683802e-06, + "loss": 0.6719, + "step": 5593 + }, + { + "epoch": 0.3015958593918482, + "grad_norm": 0.7544498443603516, + "learning_rate": 9.864945345174013e-06, + "loss": 0.7199, + "step": 5594 + }, + { + "epoch": 0.3016497735604917, + "grad_norm": 0.7920962572097778, + "learning_rate": 9.864896398917709e-06, + "loss": 0.8057, + "step": 5595 + }, + { + "epoch": 0.3017036877291352, + "grad_norm": 0.8171366453170776, + "learning_rate": 9.864847443914978e-06, + "loss": 0.6848, + "step": 5596 + }, + { + "epoch": 0.3017576018977787, + "grad_norm": 1.0175178050994873, + "learning_rate": 9.864798480165905e-06, + "loss": 0.9037, + "step": 5597 + }, + { + "epoch": 0.30181151606642226, + "grad_norm": 0.7526590824127197, + "learning_rate": 9.864749507670584e-06, + "loss": 0.7384, + "step": 5598 + }, + { + "epoch": 0.3018654302350658, + "grad_norm": 0.8390552997589111, + "learning_rate": 9.864700526429097e-06, + "loss": 0.8319, + "step": 5599 + }, + { + "epoch": 0.3019193444037093, + "grad_norm": 0.7716279625892639, + "learning_rate": 9.864651536441534e-06, + "loss": 0.7846, + "step": 5600 + }, + { + "epoch": 0.3019732585723528, + "grad_norm": 0.8819800019264221, + "learning_rate": 9.864602537707986e-06, + "loss": 0.9081, + "step": 5601 + }, + { + "epoch": 0.30202717274099633, + "grad_norm": 0.7664394974708557, + "learning_rate": 9.864553530228537e-06, + "loss": 0.8134, + "step": 5602 + }, + { + "epoch": 0.30208108690963986, + "grad_norm": 0.7210355997085571, + "learning_rate": 9.864504514003277e-06, + 
"loss": 0.7239, + "step": 5603 + }, + { + "epoch": 0.3021350010782834, + "grad_norm": 0.7813848257064819, + "learning_rate": 9.864455489032294e-06, + "loss": 0.8045, + "step": 5604 + }, + { + "epoch": 0.30218891524692687, + "grad_norm": 1.2233554124832153, + "learning_rate": 9.864406455315676e-06, + "loss": 0.809, + "step": 5605 + }, + { + "epoch": 0.3022428294155704, + "grad_norm": 0.6908262968063354, + "learning_rate": 9.864357412853512e-06, + "loss": 0.6992, + "step": 5606 + }, + { + "epoch": 0.30229674358421393, + "grad_norm": 0.7175195217132568, + "learning_rate": 9.864308361645889e-06, + "loss": 0.6696, + "step": 5607 + }, + { + "epoch": 0.30235065775285747, + "grad_norm": 0.7976323366165161, + "learning_rate": 9.864259301692896e-06, + "loss": 0.79, + "step": 5608 + }, + { + "epoch": 0.302404571921501, + "grad_norm": 0.7869724035263062, + "learning_rate": 9.86421023299462e-06, + "loss": 0.7904, + "step": 5609 + }, + { + "epoch": 0.3024584860901445, + "grad_norm": 0.7077897191047668, + "learning_rate": 9.864161155551151e-06, + "loss": 0.6626, + "step": 5610 + }, + { + "epoch": 0.302512400258788, + "grad_norm": 0.7350404262542725, + "learning_rate": 9.864112069362576e-06, + "loss": 0.7935, + "step": 5611 + }, + { + "epoch": 0.30256631442743154, + "grad_norm": 0.8462132215499878, + "learning_rate": 9.864062974428983e-06, + "loss": 0.77, + "step": 5612 + }, + { + "epoch": 0.30262022859607507, + "grad_norm": 0.7448462247848511, + "learning_rate": 9.864013870750462e-06, + "loss": 0.7759, + "step": 5613 + }, + { + "epoch": 0.30267414276471855, + "grad_norm": 0.6589133143424988, + "learning_rate": 9.863964758327099e-06, + "loss": 0.655, + "step": 5614 + }, + { + "epoch": 0.3027280569333621, + "grad_norm": 0.8155136108398438, + "learning_rate": 9.863915637158983e-06, + "loss": 0.7986, + "step": 5615 + }, + { + "epoch": 0.3027819711020056, + "grad_norm": 0.683279275894165, + "learning_rate": 9.863866507246203e-06, + "loss": 0.7509, + "step": 5616 + }, + { + "epoch": 
0.30283588527064914, + "grad_norm": 0.7811158299446106, + "learning_rate": 9.863817368588847e-06, + "loss": 0.7489, + "step": 5617 + }, + { + "epoch": 0.3028897994392927, + "grad_norm": 0.7594341039657593, + "learning_rate": 9.863768221187004e-06, + "loss": 0.8449, + "step": 5618 + }, + { + "epoch": 0.30294371360793615, + "grad_norm": 0.8166807889938354, + "learning_rate": 9.86371906504076e-06, + "loss": 0.7537, + "step": 5619 + }, + { + "epoch": 0.3029976277765797, + "grad_norm": 0.7390779256820679, + "learning_rate": 9.863669900150208e-06, + "loss": 0.7984, + "step": 5620 + }, + { + "epoch": 0.3030515419452232, + "grad_norm": 0.7977797389030457, + "learning_rate": 9.863620726515432e-06, + "loss": 0.7444, + "step": 5621 + }, + { + "epoch": 0.30310545611386674, + "grad_norm": 0.7973284721374512, + "learning_rate": 9.86357154413652e-06, + "loss": 0.7606, + "step": 5622 + }, + { + "epoch": 0.3031593702825102, + "grad_norm": 0.7334224581718445, + "learning_rate": 9.863522353013566e-06, + "loss": 0.7224, + "step": 5623 + }, + { + "epoch": 0.30321328445115375, + "grad_norm": 0.6981216073036194, + "learning_rate": 9.863473153146653e-06, + "loss": 0.7217, + "step": 5624 + }, + { + "epoch": 0.3032671986197973, + "grad_norm": 1.1201540231704712, + "learning_rate": 9.863423944535871e-06, + "loss": 0.9198, + "step": 5625 + }, + { + "epoch": 0.3033211127884408, + "grad_norm": 0.8230857849121094, + "learning_rate": 9.863374727181307e-06, + "loss": 0.7921, + "step": 5626 + }, + { + "epoch": 0.30337502695708435, + "grad_norm": 0.6343400478363037, + "learning_rate": 9.863325501083054e-06, + "loss": 0.6828, + "step": 5627 + }, + { + "epoch": 0.3034289411257278, + "grad_norm": 0.6875293254852295, + "learning_rate": 9.863276266241197e-06, + "loss": 0.7533, + "step": 5628 + }, + { + "epoch": 0.30348285529437136, + "grad_norm": 0.845723569393158, + "learning_rate": 9.863227022655824e-06, + "loss": 0.8772, + "step": 5629 + }, + { + "epoch": 0.3035367694630149, + "grad_norm": 
0.7609561085700989, + "learning_rate": 9.863177770327025e-06, + "loss": 0.6915, + "step": 5630 + }, + { + "epoch": 0.3035906836316584, + "grad_norm": 0.8494479656219482, + "learning_rate": 9.863128509254889e-06, + "loss": 0.8361, + "step": 5631 + }, + { + "epoch": 0.3036445978003019, + "grad_norm": 0.8684791326522827, + "learning_rate": 9.863079239439504e-06, + "loss": 0.8718, + "step": 5632 + }, + { + "epoch": 0.30369851196894543, + "grad_norm": 0.7694339752197266, + "learning_rate": 9.863029960880958e-06, + "loss": 0.9112, + "step": 5633 + }, + { + "epoch": 0.30375242613758896, + "grad_norm": 0.6828801035881042, + "learning_rate": 9.86298067357934e-06, + "loss": 0.7093, + "step": 5634 + }, + { + "epoch": 0.3038063403062325, + "grad_norm": 0.6944296956062317, + "learning_rate": 9.86293137753474e-06, + "loss": 0.7028, + "step": 5635 + }, + { + "epoch": 0.303860254474876, + "grad_norm": 0.6979078650474548, + "learning_rate": 9.862882072747243e-06, + "loss": 0.7193, + "step": 5636 + }, + { + "epoch": 0.3039141686435195, + "grad_norm": 0.6764166951179504, + "learning_rate": 9.862832759216941e-06, + "loss": 0.5988, + "step": 5637 + }, + { + "epoch": 0.30396808281216303, + "grad_norm": 0.8332488536834717, + "learning_rate": 9.86278343694392e-06, + "loss": 0.841, + "step": 5638 + }, + { + "epoch": 0.30402199698080656, + "grad_norm": 0.6636878848075867, + "learning_rate": 9.862734105928271e-06, + "loss": 0.6478, + "step": 5639 + }, + { + "epoch": 0.3040759111494501, + "grad_norm": 0.724332332611084, + "learning_rate": 9.862684766170084e-06, + "loss": 0.7388, + "step": 5640 + }, + { + "epoch": 0.30412982531809357, + "grad_norm": 0.8727566599845886, + "learning_rate": 9.862635417669443e-06, + "loss": 0.7867, + "step": 5641 + }, + { + "epoch": 0.3041837394867371, + "grad_norm": 0.7513052225112915, + "learning_rate": 9.86258606042644e-06, + "loss": 0.7802, + "step": 5642 + }, + { + "epoch": 0.30423765365538064, + "grad_norm": 0.7201392650604248, + "learning_rate": 
9.862536694441164e-06, + "loss": 0.7338, + "step": 5643 + }, + { + "epoch": 0.30429156782402417, + "grad_norm": 0.7723386883735657, + "learning_rate": 9.862487319713703e-06, + "loss": 0.7175, + "step": 5644 + }, + { + "epoch": 0.3043454819926677, + "grad_norm": 0.7148961424827576, + "learning_rate": 9.862437936244145e-06, + "loss": 0.7965, + "step": 5645 + }, + { + "epoch": 0.3043993961613112, + "grad_norm": 0.7666732668876648, + "learning_rate": 9.862388544032578e-06, + "loss": 0.8061, + "step": 5646 + }, + { + "epoch": 0.3044533103299547, + "grad_norm": 0.7971140742301941, + "learning_rate": 9.862339143079093e-06, + "loss": 0.7624, + "step": 5647 + }, + { + "epoch": 0.30450722449859824, + "grad_norm": 0.8448652625083923, + "learning_rate": 9.862289733383779e-06, + "loss": 0.8578, + "step": 5648 + }, + { + "epoch": 0.30456113866724177, + "grad_norm": 0.8290865421295166, + "learning_rate": 9.862240314946724e-06, + "loss": 0.8462, + "step": 5649 + }, + { + "epoch": 0.30461505283588525, + "grad_norm": 0.7474130988121033, + "learning_rate": 9.862190887768015e-06, + "loss": 0.7996, + "step": 5650 + }, + { + "epoch": 0.3046689670045288, + "grad_norm": 0.8694010376930237, + "learning_rate": 9.862141451847744e-06, + "loss": 0.8637, + "step": 5651 + }, + { + "epoch": 0.3047228811731723, + "grad_norm": 0.8624962568283081, + "learning_rate": 9.862092007185999e-06, + "loss": 0.884, + "step": 5652 + }, + { + "epoch": 0.30477679534181584, + "grad_norm": 0.7829890847206116, + "learning_rate": 9.862042553782866e-06, + "loss": 0.9021, + "step": 5653 + }, + { + "epoch": 0.3048307095104594, + "grad_norm": 0.7834928631782532, + "learning_rate": 9.861993091638437e-06, + "loss": 0.79, + "step": 5654 + }, + { + "epoch": 0.30488462367910285, + "grad_norm": 0.8633791208267212, + "learning_rate": 9.861943620752802e-06, + "loss": 0.8753, + "step": 5655 + }, + { + "epoch": 0.3049385378477464, + "grad_norm": 0.7444242238998413, + "learning_rate": 9.861894141126046e-06, + "loss": 0.7671, + 
"step": 5656 + }, + { + "epoch": 0.3049924520163899, + "grad_norm": 0.8170194625854492, + "learning_rate": 9.861844652758261e-06, + "loss": 0.8036, + "step": 5657 + }, + { + "epoch": 0.30504636618503345, + "grad_norm": 0.6889896392822266, + "learning_rate": 9.861795155649537e-06, + "loss": 0.6347, + "step": 5658 + }, + { + "epoch": 0.3051002803536769, + "grad_norm": 0.7670578360557556, + "learning_rate": 9.861745649799957e-06, + "loss": 0.8125, + "step": 5659 + }, + { + "epoch": 0.30515419452232045, + "grad_norm": 0.7764411568641663, + "learning_rate": 9.861696135209616e-06, + "loss": 0.8287, + "step": 5660 + }, + { + "epoch": 0.305208108690964, + "grad_norm": 0.8411297798156738, + "learning_rate": 9.861646611878601e-06, + "loss": 0.8475, + "step": 5661 + }, + { + "epoch": 0.3052620228596075, + "grad_norm": 0.9569882154464722, + "learning_rate": 9.861597079807001e-06, + "loss": 0.8466, + "step": 5662 + }, + { + "epoch": 0.30531593702825105, + "grad_norm": 0.7936221361160278, + "learning_rate": 9.861547538994905e-06, + "loss": 0.7822, + "step": 5663 + }, + { + "epoch": 0.3053698511968945, + "grad_norm": 0.7625929117202759, + "learning_rate": 9.861497989442403e-06, + "loss": 0.7977, + "step": 5664 + }, + { + "epoch": 0.30542376536553806, + "grad_norm": 0.8340699076652527, + "learning_rate": 9.861448431149584e-06, + "loss": 0.8711, + "step": 5665 + }, + { + "epoch": 0.3054776795341816, + "grad_norm": 0.8672143220901489, + "learning_rate": 9.861398864116534e-06, + "loss": 0.7653, + "step": 5666 + }, + { + "epoch": 0.3055315937028251, + "grad_norm": 0.7499459385871887, + "learning_rate": 9.861349288343345e-06, + "loss": 0.7784, + "step": 5667 + }, + { + "epoch": 0.3055855078714686, + "grad_norm": 0.8266206383705139, + "learning_rate": 9.861299703830107e-06, + "loss": 0.7215, + "step": 5668 + }, + { + "epoch": 0.30563942204011213, + "grad_norm": 0.7902846336364746, + "learning_rate": 9.861250110576907e-06, + "loss": 0.8172, + "step": 5669 + }, + { + "epoch": 
0.30569333620875566, + "grad_norm": 0.7668362259864807, + "learning_rate": 9.861200508583835e-06, + "loss": 0.787, + "step": 5670 + }, + { + "epoch": 0.3057472503773992, + "grad_norm": 0.7463662028312683, + "learning_rate": 9.86115089785098e-06, + "loss": 0.7746, + "step": 5671 + }, + { + "epoch": 0.3058011645460427, + "grad_norm": 0.8604795932769775, + "learning_rate": 9.86110127837843e-06, + "loss": 0.8125, + "step": 5672 + }, + { + "epoch": 0.3058550787146862, + "grad_norm": 0.7146342396736145, + "learning_rate": 9.861051650166279e-06, + "loss": 0.7107, + "step": 5673 + }, + { + "epoch": 0.30590899288332973, + "grad_norm": 0.7614452838897705, + "learning_rate": 9.861002013214608e-06, + "loss": 0.657, + "step": 5674 + }, + { + "epoch": 0.30596290705197327, + "grad_norm": 0.823521077632904, + "learning_rate": 9.860952367523514e-06, + "loss": 0.7704, + "step": 5675 + }, + { + "epoch": 0.3060168212206168, + "grad_norm": 0.7902831435203552, + "learning_rate": 9.860902713093084e-06, + "loss": 0.8077, + "step": 5676 + }, + { + "epoch": 0.3060707353892603, + "grad_norm": 0.7902441024780273, + "learning_rate": 9.860853049923404e-06, + "loss": 0.7821, + "step": 5677 + }, + { + "epoch": 0.3061246495579038, + "grad_norm": 0.7113842368125916, + "learning_rate": 9.860803378014569e-06, + "loss": 0.741, + "step": 5678 + }, + { + "epoch": 0.30617856372654734, + "grad_norm": 0.79274982213974, + "learning_rate": 9.860753697366663e-06, + "loss": 0.7908, + "step": 5679 + }, + { + "epoch": 0.30623247789519087, + "grad_norm": 0.8727068305015564, + "learning_rate": 9.860704007979778e-06, + "loss": 0.7392, + "step": 5680 + }, + { + "epoch": 0.3062863920638344, + "grad_norm": 0.7981608510017395, + "learning_rate": 9.860654309854001e-06, + "loss": 0.7663, + "step": 5681 + }, + { + "epoch": 0.3063403062324779, + "grad_norm": 0.792843222618103, + "learning_rate": 9.860604602989426e-06, + "loss": 0.7546, + "step": 5682 + }, + { + "epoch": 0.3063942204011214, + "grad_norm": 
0.7595586180686951, + "learning_rate": 9.860554887386138e-06, + "loss": 0.7564, + "step": 5683 + }, + { + "epoch": 0.30644813456976494, + "grad_norm": 0.7474101185798645, + "learning_rate": 9.860505163044227e-06, + "loss": 0.7371, + "step": 5684 + }, + { + "epoch": 0.3065020487384085, + "grad_norm": 0.8055018186569214, + "learning_rate": 9.860455429963785e-06, + "loss": 0.7878, + "step": 5685 + }, + { + "epoch": 0.30655596290705195, + "grad_norm": 0.7451410889625549, + "learning_rate": 9.860405688144898e-06, + "loss": 0.739, + "step": 5686 + }, + { + "epoch": 0.3066098770756955, + "grad_norm": 0.8545166254043579, + "learning_rate": 9.860355937587657e-06, + "loss": 0.8017, + "step": 5687 + }, + { + "epoch": 0.306663791244339, + "grad_norm": 0.7120118737220764, + "learning_rate": 9.860306178292154e-06, + "loss": 0.6558, + "step": 5688 + }, + { + "epoch": 0.30671770541298254, + "grad_norm": 0.7577475905418396, + "learning_rate": 9.860256410258472e-06, + "loss": 0.8263, + "step": 5689 + }, + { + "epoch": 0.3067716195816261, + "grad_norm": 0.8143375515937805, + "learning_rate": 9.86020663348671e-06, + "loss": 0.6725, + "step": 5690 + }, + { + "epoch": 0.30682553375026955, + "grad_norm": 0.8326741456985474, + "learning_rate": 9.860156847976947e-06, + "loss": 0.7125, + "step": 5691 + }, + { + "epoch": 0.3068794479189131, + "grad_norm": 0.8200056552886963, + "learning_rate": 9.86010705372928e-06, + "loss": 0.7883, + "step": 5692 + }, + { + "epoch": 0.3069333620875566, + "grad_norm": 0.8206968903541565, + "learning_rate": 9.860057250743795e-06, + "loss": 0.8162, + "step": 5693 + }, + { + "epoch": 0.30698727625620015, + "grad_norm": 0.8901194930076599, + "learning_rate": 9.860007439020583e-06, + "loss": 0.8107, + "step": 5694 + }, + { + "epoch": 0.3070411904248436, + "grad_norm": 0.7782232761383057, + "learning_rate": 9.859957618559733e-06, + "loss": 0.7642, + "step": 5695 + }, + { + "epoch": 0.30709510459348716, + "grad_norm": 0.7663282752037048, + "learning_rate": 
9.859907789361335e-06, + "loss": 0.7585, + "step": 5696 + }, + { + "epoch": 0.3071490187621307, + "grad_norm": 1.1563618183135986, + "learning_rate": 9.859857951425478e-06, + "loss": 0.7923, + "step": 5697 + }, + { + "epoch": 0.3072029329307742, + "grad_norm": 1.0483899116516113, + "learning_rate": 9.859808104752251e-06, + "loss": 0.8818, + "step": 5698 + }, + { + "epoch": 0.30725684709941775, + "grad_norm": 0.8558086156845093, + "learning_rate": 9.859758249341746e-06, + "loss": 0.8112, + "step": 5699 + }, + { + "epoch": 0.3073107612680612, + "grad_norm": 0.7713198661804199, + "learning_rate": 9.85970838519405e-06, + "loss": 0.7433, + "step": 5700 + }, + { + "epoch": 0.30736467543670476, + "grad_norm": 0.7688874006271362, + "learning_rate": 9.859658512309254e-06, + "loss": 0.7881, + "step": 5701 + }, + { + "epoch": 0.3074185896053483, + "grad_norm": 0.8336641788482666, + "learning_rate": 9.859608630687447e-06, + "loss": 0.7735, + "step": 5702 + }, + { + "epoch": 0.3074725037739918, + "grad_norm": 0.7127462029457092, + "learning_rate": 9.85955874032872e-06, + "loss": 0.7952, + "step": 5703 + }, + { + "epoch": 0.3075264179426353, + "grad_norm": 0.9315282702445984, + "learning_rate": 9.859508841233161e-06, + "loss": 0.8251, + "step": 5704 + }, + { + "epoch": 0.30758033211127883, + "grad_norm": 0.8110449910163879, + "learning_rate": 9.859458933400861e-06, + "loss": 0.8044, + "step": 5705 + }, + { + "epoch": 0.30763424627992236, + "grad_norm": 0.6779415607452393, + "learning_rate": 9.85940901683191e-06, + "loss": 0.6338, + "step": 5706 + }, + { + "epoch": 0.3076881604485659, + "grad_norm": 0.839794933795929, + "learning_rate": 9.859359091526396e-06, + "loss": 0.8493, + "step": 5707 + }, + { + "epoch": 0.3077420746172094, + "grad_norm": 0.7848679423332214, + "learning_rate": 9.85930915748441e-06, + "loss": 0.7008, + "step": 5708 + }, + { + "epoch": 0.3077959887858529, + "grad_norm": 0.8231180906295776, + "learning_rate": 9.85925921470604e-06, + "loss": 0.9246, + "step": 
5709 + }, + { + "epoch": 0.30784990295449643, + "grad_norm": 0.6748291254043579, + "learning_rate": 9.85920926319138e-06, + "loss": 0.7293, + "step": 5710 + }, + { + "epoch": 0.30790381712313997, + "grad_norm": 0.9213640093803406, + "learning_rate": 9.859159302940515e-06, + "loss": 0.8215, + "step": 5711 + }, + { + "epoch": 0.3079577312917835, + "grad_norm": 0.7676255702972412, + "learning_rate": 9.859109333953539e-06, + "loss": 0.7937, + "step": 5712 + }, + { + "epoch": 0.308011645460427, + "grad_norm": 0.7422990202903748, + "learning_rate": 9.859059356230538e-06, + "loss": 0.7334, + "step": 5713 + }, + { + "epoch": 0.3080655596290705, + "grad_norm": 0.8571780920028687, + "learning_rate": 9.859009369771604e-06, + "loss": 0.823, + "step": 5714 + }, + { + "epoch": 0.30811947379771404, + "grad_norm": 0.7396836876869202, + "learning_rate": 9.858959374576827e-06, + "loss": 0.7581, + "step": 5715 + }, + { + "epoch": 0.30817338796635757, + "grad_norm": 0.7429798245429993, + "learning_rate": 9.858909370646296e-06, + "loss": 0.8114, + "step": 5716 + }, + { + "epoch": 0.3082273021350011, + "grad_norm": 0.6987196207046509, + "learning_rate": 9.8588593579801e-06, + "loss": 0.723, + "step": 5717 + }, + { + "epoch": 0.3082812163036446, + "grad_norm": 0.7034950256347656, + "learning_rate": 9.858809336578332e-06, + "loss": 0.7232, + "step": 5718 + }, + { + "epoch": 0.3083351304722881, + "grad_norm": 0.7538275718688965, + "learning_rate": 9.85875930644108e-06, + "loss": 0.7004, + "step": 5719 + }, + { + "epoch": 0.30838904464093164, + "grad_norm": 0.7592359185218811, + "learning_rate": 9.858709267568434e-06, + "loss": 0.8044, + "step": 5720 + }, + { + "epoch": 0.3084429588095752, + "grad_norm": 0.8540379405021667, + "learning_rate": 9.858659219960483e-06, + "loss": 0.843, + "step": 5721 + }, + { + "epoch": 0.30849687297821865, + "grad_norm": 0.72111976146698, + "learning_rate": 9.858609163617318e-06, + "loss": 0.6721, + "step": 5722 + }, + { + "epoch": 0.3085507871468622, + 
"grad_norm": 0.7123001217842102, + "learning_rate": 9.858559098539031e-06, + "loss": 0.7271, + "step": 5723 + }, + { + "epoch": 0.3086047013155057, + "grad_norm": 0.7733619809150696, + "learning_rate": 9.858509024725708e-06, + "loss": 0.6701, + "step": 5724 + }, + { + "epoch": 0.30865861548414925, + "grad_norm": 0.7647392153739929, + "learning_rate": 9.858458942177442e-06, + "loss": 0.7992, + "step": 5725 + }, + { + "epoch": 0.3087125296527928, + "grad_norm": 0.7586882710456848, + "learning_rate": 9.85840885089432e-06, + "loss": 0.8324, + "step": 5726 + }, + { + "epoch": 0.30876644382143625, + "grad_norm": 0.7589958310127258, + "learning_rate": 9.858358750876438e-06, + "loss": 0.8271, + "step": 5727 + }, + { + "epoch": 0.3088203579900798, + "grad_norm": 0.8030262589454651, + "learning_rate": 9.85830864212388e-06, + "loss": 0.7775, + "step": 5728 + }, + { + "epoch": 0.3088742721587233, + "grad_norm": 0.7601757049560547, + "learning_rate": 9.858258524636738e-06, + "loss": 0.8273, + "step": 5729 + }, + { + "epoch": 0.30892818632736685, + "grad_norm": 0.7365875840187073, + "learning_rate": 9.858208398415103e-06, + "loss": 0.7818, + "step": 5730 + }, + { + "epoch": 0.3089821004960103, + "grad_norm": 0.9204131364822388, + "learning_rate": 9.858158263459065e-06, + "loss": 0.6819, + "step": 5731 + }, + { + "epoch": 0.30903601466465386, + "grad_norm": 0.7195471525192261, + "learning_rate": 9.858108119768712e-06, + "loss": 0.801, + "step": 5732 + }, + { + "epoch": 0.3090899288332974, + "grad_norm": 0.9012541174888611, + "learning_rate": 9.858057967344137e-06, + "loss": 0.8263, + "step": 5733 + }, + { + "epoch": 0.3091438430019409, + "grad_norm": 0.6790168285369873, + "learning_rate": 9.858007806185429e-06, + "loss": 0.6725, + "step": 5734 + }, + { + "epoch": 0.30919775717058445, + "grad_norm": 0.9280243515968323, + "learning_rate": 9.857957636292678e-06, + "loss": 0.7817, + "step": 5735 + }, + { + "epoch": 0.30925167133922793, + "grad_norm": 1.1063090562820435, + 
"learning_rate": 9.857907457665974e-06, + "loss": 0.921, + "step": 5736 + }, + { + "epoch": 0.30930558550787146, + "grad_norm": 0.7667673826217651, + "learning_rate": 9.857857270305408e-06, + "loss": 0.7904, + "step": 5737 + }, + { + "epoch": 0.309359499676515, + "grad_norm": 0.8255706429481506, + "learning_rate": 9.85780707421107e-06, + "loss": 0.8041, + "step": 5738 + }, + { + "epoch": 0.3094134138451585, + "grad_norm": 0.742287814617157, + "learning_rate": 9.85775686938305e-06, + "loss": 0.726, + "step": 5739 + }, + { + "epoch": 0.309467328013802, + "grad_norm": 0.8331649899482727, + "learning_rate": 9.857706655821438e-06, + "loss": 0.7952, + "step": 5740 + }, + { + "epoch": 0.30952124218244553, + "grad_norm": 0.7530134916305542, + "learning_rate": 9.857656433526325e-06, + "loss": 0.7413, + "step": 5741 + }, + { + "epoch": 0.30957515635108906, + "grad_norm": 0.9277242422103882, + "learning_rate": 9.8576062024978e-06, + "loss": 0.8895, + "step": 5742 + }, + { + "epoch": 0.3096290705197326, + "grad_norm": 0.7589628100395203, + "learning_rate": 9.857555962735956e-06, + "loss": 0.8385, + "step": 5743 + }, + { + "epoch": 0.30968298468837613, + "grad_norm": 0.7948117256164551, + "learning_rate": 9.85750571424088e-06, + "loss": 0.846, + "step": 5744 + }, + { + "epoch": 0.3097368988570196, + "grad_norm": 0.775082528591156, + "learning_rate": 9.857455457012663e-06, + "loss": 0.7357, + "step": 5745 + }, + { + "epoch": 0.30979081302566314, + "grad_norm": 0.7113924622535706, + "learning_rate": 9.857405191051399e-06, + "loss": 0.778, + "step": 5746 + }, + { + "epoch": 0.30984472719430667, + "grad_norm": 0.7900906801223755, + "learning_rate": 9.857354916357174e-06, + "loss": 0.7584, + "step": 5747 + }, + { + "epoch": 0.3098986413629502, + "grad_norm": 0.7182400226593018, + "learning_rate": 9.85730463293008e-06, + "loss": 0.814, + "step": 5748 + }, + { + "epoch": 0.3099525555315937, + "grad_norm": 0.7115117907524109, + "learning_rate": 9.857254340770207e-06, + "loss": 0.7648, 
+ "step": 5749 + }, + { + "epoch": 0.3100064697002372, + "grad_norm": 0.8024550676345825, + "learning_rate": 9.857204039877646e-06, + "loss": 0.7196, + "step": 5750 + }, + { + "epoch": 0.31006038386888074, + "grad_norm": 0.7853803634643555, + "learning_rate": 9.857153730252487e-06, + "loss": 0.7293, + "step": 5751 + }, + { + "epoch": 0.31011429803752427, + "grad_norm": 0.7588229179382324, + "learning_rate": 9.857103411894822e-06, + "loss": 0.8404, + "step": 5752 + }, + { + "epoch": 0.3101682122061678, + "grad_norm": 0.7723760008811951, + "learning_rate": 9.857053084804738e-06, + "loss": 0.815, + "step": 5753 + }, + { + "epoch": 0.3102221263748113, + "grad_norm": 0.7242899537086487, + "learning_rate": 9.857002748982329e-06, + "loss": 0.7752, + "step": 5754 + }, + { + "epoch": 0.3102760405434548, + "grad_norm": 0.7757428288459778, + "learning_rate": 9.856952404427684e-06, + "loss": 0.7938, + "step": 5755 + }, + { + "epoch": 0.31032995471209834, + "grad_norm": 0.7425571084022522, + "learning_rate": 9.856902051140892e-06, + "loss": 0.7238, + "step": 5756 + }, + { + "epoch": 0.3103838688807419, + "grad_norm": 0.7900612354278564, + "learning_rate": 9.856851689122048e-06, + "loss": 0.8129, + "step": 5757 + }, + { + "epoch": 0.31043778304938535, + "grad_norm": 0.8878443241119385, + "learning_rate": 9.856801318371238e-06, + "loss": 0.7986, + "step": 5758 + }, + { + "epoch": 0.3104916972180289, + "grad_norm": 0.7342200875282288, + "learning_rate": 9.856750938888554e-06, + "loss": 0.821, + "step": 5759 + }, + { + "epoch": 0.3105456113866724, + "grad_norm": 1.700501799583435, + "learning_rate": 9.856700550674088e-06, + "loss": 0.8207, + "step": 5760 + }, + { + "epoch": 0.31059952555531595, + "grad_norm": 0.8365451097488403, + "learning_rate": 9.85665015372793e-06, + "loss": 0.7258, + "step": 5761 + }, + { + "epoch": 0.3106534397239595, + "grad_norm": 0.908216655254364, + "learning_rate": 9.856599748050168e-06, + "loss": 0.8017, + "step": 5762 + }, + { + "epoch": 
0.31070735389260296, + "grad_norm": 0.7779913544654846, + "learning_rate": 9.856549333640896e-06, + "loss": 0.7998, + "step": 5763 + }, + { + "epoch": 0.3107612680612465, + "grad_norm": 0.8979871273040771, + "learning_rate": 9.856498910500204e-06, + "loss": 0.8191, + "step": 5764 + }, + { + "epoch": 0.31081518222989, + "grad_norm": 0.8549361228942871, + "learning_rate": 9.85644847862818e-06, + "loss": 0.7747, + "step": 5765 + }, + { + "epoch": 0.31086909639853355, + "grad_norm": 0.7997828125953674, + "learning_rate": 9.856398038024917e-06, + "loss": 0.7387, + "step": 5766 + }, + { + "epoch": 0.310923010567177, + "grad_norm": 0.80763840675354, + "learning_rate": 9.856347588690507e-06, + "loss": 0.7336, + "step": 5767 + }, + { + "epoch": 0.31097692473582056, + "grad_norm": 0.9658893346786499, + "learning_rate": 9.856297130625038e-06, + "loss": 0.7409, + "step": 5768 + }, + { + "epoch": 0.3110308389044641, + "grad_norm": 0.8522310853004456, + "learning_rate": 9.8562466638286e-06, + "loss": 0.7911, + "step": 5769 + }, + { + "epoch": 0.3110847530731076, + "grad_norm": 0.7911937236785889, + "learning_rate": 9.856196188301289e-06, + "loss": 0.7275, + "step": 5770 + }, + { + "epoch": 0.31113866724175115, + "grad_norm": 0.7432350516319275, + "learning_rate": 9.85614570404319e-06, + "loss": 0.8182, + "step": 5771 + }, + { + "epoch": 0.31119258141039463, + "grad_norm": 0.8038663864135742, + "learning_rate": 9.856095211054397e-06, + "loss": 0.8021, + "step": 5772 + }, + { + "epoch": 0.31124649557903816, + "grad_norm": 0.7536031007766724, + "learning_rate": 9.856044709334998e-06, + "loss": 0.8136, + "step": 5773 + }, + { + "epoch": 0.3113004097476817, + "grad_norm": 0.7866044044494629, + "learning_rate": 9.855994198885086e-06, + "loss": 0.7574, + "step": 5774 + }, + { + "epoch": 0.3113543239163252, + "grad_norm": 0.6577789783477783, + "learning_rate": 9.855943679704752e-06, + "loss": 0.707, + "step": 5775 + }, + { + "epoch": 0.3114082380849687, + "grad_norm": 
0.6679837703704834, + "learning_rate": 9.855893151794087e-06, + "loss": 0.718, + "step": 5776 + }, + { + "epoch": 0.31146215225361223, + "grad_norm": 0.7168986797332764, + "learning_rate": 9.85584261515318e-06, + "loss": 0.7349, + "step": 5777 + }, + { + "epoch": 0.31151606642225577, + "grad_norm": 0.7366868257522583, + "learning_rate": 9.855792069782123e-06, + "loss": 0.766, + "step": 5778 + }, + { + "epoch": 0.3115699805908993, + "grad_norm": 0.6968795657157898, + "learning_rate": 9.855741515681007e-06, + "loss": 0.7344, + "step": 5779 + }, + { + "epoch": 0.31162389475954283, + "grad_norm": 0.7138608694076538, + "learning_rate": 9.855690952849921e-06, + "loss": 0.7845, + "step": 5780 + }, + { + "epoch": 0.3116778089281863, + "grad_norm": 0.7169587016105652, + "learning_rate": 9.85564038128896e-06, + "loss": 0.6987, + "step": 5781 + }, + { + "epoch": 0.31173172309682984, + "grad_norm": 1.0132629871368408, + "learning_rate": 9.85558980099821e-06, + "loss": 0.8272, + "step": 5782 + }, + { + "epoch": 0.31178563726547337, + "grad_norm": 0.819661021232605, + "learning_rate": 9.855539211977767e-06, + "loss": 0.8121, + "step": 5783 + }, + { + "epoch": 0.3118395514341169, + "grad_norm": 0.8109739422798157, + "learning_rate": 9.855488614227718e-06, + "loss": 0.7549, + "step": 5784 + }, + { + "epoch": 0.3118934656027604, + "grad_norm": 0.7935467958450317, + "learning_rate": 9.855438007748155e-06, + "loss": 0.7668, + "step": 5785 + }, + { + "epoch": 0.3119473797714039, + "grad_norm": 0.742849588394165, + "learning_rate": 9.855387392539171e-06, + "loss": 0.7784, + "step": 5786 + }, + { + "epoch": 0.31200129394004744, + "grad_norm": 0.6716766953468323, + "learning_rate": 9.855336768600855e-06, + "loss": 0.7054, + "step": 5787 + }, + { + "epoch": 0.312055208108691, + "grad_norm": 0.7886252403259277, + "learning_rate": 9.855286135933298e-06, + "loss": 0.7507, + "step": 5788 + }, + { + "epoch": 0.3121091222773345, + "grad_norm": 0.8030907511711121, + "learning_rate": 
9.85523549453659e-06, + "loss": 0.759, + "step": 5789 + }, + { + "epoch": 0.312163036445978, + "grad_norm": 0.8043600916862488, + "learning_rate": 9.855184844410823e-06, + "loss": 0.7511, + "step": 5790 + }, + { + "epoch": 0.3122169506146215, + "grad_norm": 0.7559619545936584, + "learning_rate": 9.85513418555609e-06, + "loss": 0.6483, + "step": 5791 + }, + { + "epoch": 0.31227086478326505, + "grad_norm": 0.752955436706543, + "learning_rate": 9.855083517972482e-06, + "loss": 0.7642, + "step": 5792 + }, + { + "epoch": 0.3123247789519086, + "grad_norm": 0.8848125338554382, + "learning_rate": 9.855032841660086e-06, + "loss": 0.7942, + "step": 5793 + }, + { + "epoch": 0.31237869312055205, + "grad_norm": 0.7342360019683838, + "learning_rate": 9.854982156618998e-06, + "loss": 0.7871, + "step": 5794 + }, + { + "epoch": 0.3124326072891956, + "grad_norm": 0.8534268140792847, + "learning_rate": 9.854931462849305e-06, + "loss": 0.845, + "step": 5795 + }, + { + "epoch": 0.3124865214578391, + "grad_norm": 1.071186900138855, + "learning_rate": 9.8548807603511e-06, + "loss": 0.8058, + "step": 5796 + }, + { + "epoch": 0.31254043562648265, + "grad_norm": 0.9060416221618652, + "learning_rate": 9.854830049124475e-06, + "loss": 0.8609, + "step": 5797 + }, + { + "epoch": 0.3125943497951262, + "grad_norm": 0.7898332476615906, + "learning_rate": 9.85477932916952e-06, + "loss": 0.8355, + "step": 5798 + }, + { + "epoch": 0.31264826396376966, + "grad_norm": 0.7644670009613037, + "learning_rate": 9.854728600486326e-06, + "loss": 0.8266, + "step": 5799 + }, + { + "epoch": 0.3127021781324132, + "grad_norm": 0.719078004360199, + "learning_rate": 9.854677863074987e-06, + "loss": 0.6484, + "step": 5800 + }, + { + "epoch": 0.3127560923010567, + "grad_norm": 0.7221580743789673, + "learning_rate": 9.85462711693559e-06, + "loss": 0.6918, + "step": 5801 + }, + { + "epoch": 0.31281000646970025, + "grad_norm": 0.7351505160331726, + "learning_rate": 9.854576362068228e-06, + "loss": 0.7317, + "step": 5802 
+ }, + { + "epoch": 0.3128639206383438, + "grad_norm": 0.8377266526222229, + "learning_rate": 9.854525598472993e-06, + "loss": 0.755, + "step": 5803 + }, + { + "epoch": 0.31291783480698726, + "grad_norm": 0.7597038149833679, + "learning_rate": 9.854474826149976e-06, + "loss": 0.8052, + "step": 5804 + }, + { + "epoch": 0.3129717489756308, + "grad_norm": 0.674359142780304, + "learning_rate": 9.854424045099267e-06, + "loss": 0.6435, + "step": 5805 + }, + { + "epoch": 0.3130256631442743, + "grad_norm": 0.6617887020111084, + "learning_rate": 9.85437325532096e-06, + "loss": 0.7145, + "step": 5806 + }, + { + "epoch": 0.31307957731291786, + "grad_norm": 0.6445237994194031, + "learning_rate": 9.854322456815141e-06, + "loss": 0.6446, + "step": 5807 + }, + { + "epoch": 0.31313349148156133, + "grad_norm": 0.7854598760604858, + "learning_rate": 9.854271649581907e-06, + "loss": 0.7758, + "step": 5808 + }, + { + "epoch": 0.31318740565020486, + "grad_norm": 0.8278177380561829, + "learning_rate": 9.85422083362135e-06, + "loss": 0.7925, + "step": 5809 + }, + { + "epoch": 0.3132413198188484, + "grad_norm": 0.7923195958137512, + "learning_rate": 9.854170008933556e-06, + "loss": 0.8004, + "step": 5810 + }, + { + "epoch": 0.31329523398749193, + "grad_norm": 0.7870082855224609, + "learning_rate": 9.85411917551862e-06, + "loss": 0.6756, + "step": 5811 + }, + { + "epoch": 0.31334914815613546, + "grad_norm": 0.6959754824638367, + "learning_rate": 9.85406833337663e-06, + "loss": 0.672, + "step": 5812 + }, + { + "epoch": 0.31340306232477894, + "grad_norm": 0.8121528625488281, + "learning_rate": 9.854017482507682e-06, + "loss": 0.7581, + "step": 5813 + }, + { + "epoch": 0.31345697649342247, + "grad_norm": 0.754766583442688, + "learning_rate": 9.853966622911866e-06, + "loss": 0.7252, + "step": 5814 + }, + { + "epoch": 0.313510890662066, + "grad_norm": 0.7755377888679504, + "learning_rate": 9.85391575458927e-06, + "loss": 0.8181, + "step": 5815 + }, + { + "epoch": 0.31356480483070953, + 
"grad_norm": 0.784403383731842, + "learning_rate": 9.85386487753999e-06, + "loss": 0.8192, + "step": 5816 + }, + { + "epoch": 0.313618718999353, + "grad_norm": 0.7509371638298035, + "learning_rate": 9.853813991764116e-06, + "loss": 0.7961, + "step": 5817 + }, + { + "epoch": 0.31367263316799654, + "grad_norm": 0.8024166226387024, + "learning_rate": 9.853763097261736e-06, + "loss": 0.8147, + "step": 5818 + }, + { + "epoch": 0.31372654733664007, + "grad_norm": 0.8381186127662659, + "learning_rate": 9.853712194032949e-06, + "loss": 0.8171, + "step": 5819 + }, + { + "epoch": 0.3137804615052836, + "grad_norm": 0.7710514664649963, + "learning_rate": 9.85366128207784e-06, + "loss": 0.8022, + "step": 5820 + }, + { + "epoch": 0.31383437567392714, + "grad_norm": 0.6437869071960449, + "learning_rate": 9.853610361396501e-06, + "loss": 0.6448, + "step": 5821 + }, + { + "epoch": 0.3138882898425706, + "grad_norm": 0.8716824650764465, + "learning_rate": 9.853559431989029e-06, + "loss": 0.8177, + "step": 5822 + }, + { + "epoch": 0.31394220401121414, + "grad_norm": 0.7774370908737183, + "learning_rate": 9.853508493855507e-06, + "loss": 0.7382, + "step": 5823 + }, + { + "epoch": 0.3139961181798577, + "grad_norm": 0.9840212464332581, + "learning_rate": 9.853457546996034e-06, + "loss": 0.7965, + "step": 5824 + }, + { + "epoch": 0.3140500323485012, + "grad_norm": 0.8395524621009827, + "learning_rate": 9.853406591410699e-06, + "loss": 0.7582, + "step": 5825 + }, + { + "epoch": 0.3141039465171447, + "grad_norm": 0.774830162525177, + "learning_rate": 9.853355627099594e-06, + "loss": 0.7806, + "step": 5826 + }, + { + "epoch": 0.3141578606857882, + "grad_norm": 0.746880829334259, + "learning_rate": 9.85330465406281e-06, + "loss": 0.8041, + "step": 5827 + }, + { + "epoch": 0.31421177485443175, + "grad_norm": 0.7635205984115601, + "learning_rate": 9.853253672300437e-06, + "loss": 0.7938, + "step": 5828 + }, + { + "epoch": 0.3142656890230753, + "grad_norm": 0.8529419302940369, + "learning_rate": 
9.85320268181257e-06, + "loss": 0.7793, + "step": 5829 + }, + { + "epoch": 0.3143196031917188, + "grad_norm": 0.7115237712860107, + "learning_rate": 9.853151682599298e-06, + "loss": 0.7151, + "step": 5830 + }, + { + "epoch": 0.3143735173603623, + "grad_norm": 0.784304141998291, + "learning_rate": 9.853100674660716e-06, + "loss": 0.7556, + "step": 5831 + }, + { + "epoch": 0.3144274315290058, + "grad_norm": 0.8274726271629333, + "learning_rate": 9.85304965799691e-06, + "loss": 0.8457, + "step": 5832 + }, + { + "epoch": 0.31448134569764935, + "grad_norm": 0.9436095952987671, + "learning_rate": 9.85299863260798e-06, + "loss": 0.7479, + "step": 5833 + }, + { + "epoch": 0.3145352598662929, + "grad_norm": 0.7544522881507874, + "learning_rate": 9.85294759849401e-06, + "loss": 0.7865, + "step": 5834 + }, + { + "epoch": 0.31458917403493636, + "grad_norm": 0.7091339230537415, + "learning_rate": 9.852896555655095e-06, + "loss": 0.7208, + "step": 5835 + }, + { + "epoch": 0.3146430882035799, + "grad_norm": 0.8669036626815796, + "learning_rate": 9.852845504091326e-06, + "loss": 0.8044, + "step": 5836 + }, + { + "epoch": 0.3146970023722234, + "grad_norm": 0.6712572574615479, + "learning_rate": 9.852794443802796e-06, + "loss": 0.727, + "step": 5837 + }, + { + "epoch": 0.31475091654086695, + "grad_norm": 0.7483683824539185, + "learning_rate": 9.852743374789596e-06, + "loss": 0.7588, + "step": 5838 + }, + { + "epoch": 0.3148048307095105, + "grad_norm": 0.8184488415718079, + "learning_rate": 9.852692297051818e-06, + "loss": 0.7595, + "step": 5839 + }, + { + "epoch": 0.31485874487815396, + "grad_norm": 0.8620066046714783, + "learning_rate": 9.852641210589553e-06, + "loss": 0.7901, + "step": 5840 + }, + { + "epoch": 0.3149126590467975, + "grad_norm": 0.7897535562515259, + "learning_rate": 9.852590115402895e-06, + "loss": 0.7587, + "step": 5841 + }, + { + "epoch": 0.314966573215441, + "grad_norm": 0.9280666708946228, + "learning_rate": 9.852539011491932e-06, + "loss": 0.8399, + "step": 
5842 + }, + { + "epoch": 0.31502048738408456, + "grad_norm": 0.8631352782249451, + "learning_rate": 9.852487898856761e-06, + "loss": 0.8913, + "step": 5843 + }, + { + "epoch": 0.31507440155272803, + "grad_norm": 0.7954910397529602, + "learning_rate": 9.85243677749747e-06, + "loss": 0.7752, + "step": 5844 + }, + { + "epoch": 0.31512831572137157, + "grad_norm": 0.7695401310920715, + "learning_rate": 9.852385647414153e-06, + "loss": 0.7503, + "step": 5845 + }, + { + "epoch": 0.3151822298900151, + "grad_norm": 0.8069831132888794, + "learning_rate": 9.8523345086069e-06, + "loss": 0.8466, + "step": 5846 + }, + { + "epoch": 0.31523614405865863, + "grad_norm": 0.8643277883529663, + "learning_rate": 9.852283361075806e-06, + "loss": 0.7863, + "step": 5847 + }, + { + "epoch": 0.31529005822730216, + "grad_norm": 0.9699618816375732, + "learning_rate": 9.852232204820957e-06, + "loss": 0.7837, + "step": 5848 + }, + { + "epoch": 0.31534397239594564, + "grad_norm": 0.9122641086578369, + "learning_rate": 9.852181039842453e-06, + "loss": 0.7672, + "step": 5849 + }, + { + "epoch": 0.31539788656458917, + "grad_norm": 0.7795725464820862, + "learning_rate": 9.852129866140379e-06, + "loss": 0.8322, + "step": 5850 + }, + { + "epoch": 0.3154518007332327, + "grad_norm": 0.772454559803009, + "learning_rate": 9.85207868371483e-06, + "loss": 0.8068, + "step": 5851 + }, + { + "epoch": 0.31550571490187623, + "grad_norm": 0.7837170958518982, + "learning_rate": 9.8520274925659e-06, + "loss": 0.8069, + "step": 5852 + }, + { + "epoch": 0.3155596290705197, + "grad_norm": 0.7513681650161743, + "learning_rate": 9.851976292693677e-06, + "loss": 0.7305, + "step": 5853 + }, + { + "epoch": 0.31561354323916324, + "grad_norm": 0.814034640789032, + "learning_rate": 9.851925084098257e-06, + "loss": 0.8766, + "step": 5854 + }, + { + "epoch": 0.3156674574078068, + "grad_norm": 0.7257975339889526, + "learning_rate": 9.851873866779729e-06, + "loss": 0.7512, + "step": 5855 + }, + { + "epoch": 0.3157213715764503, + 
"grad_norm": 0.7528879046440125, + "learning_rate": 9.851822640738186e-06, + "loss": 0.7858, + "step": 5856 + }, + { + "epoch": 0.31577528574509384, + "grad_norm": 0.8108073472976685, + "learning_rate": 9.85177140597372e-06, + "loss": 0.821, + "step": 5857 + }, + { + "epoch": 0.3158291999137373, + "grad_norm": 0.6840653419494629, + "learning_rate": 9.851720162486424e-06, + "loss": 0.6982, + "step": 5858 + }, + { + "epoch": 0.31588311408238084, + "grad_norm": 0.7130960822105408, + "learning_rate": 9.851668910276388e-06, + "loss": 0.7523, + "step": 5859 + }, + { + "epoch": 0.3159370282510244, + "grad_norm": 0.9361156821250916, + "learning_rate": 9.851617649343708e-06, + "loss": 0.7693, + "step": 5860 + }, + { + "epoch": 0.3159909424196679, + "grad_norm": 0.7786045074462891, + "learning_rate": 9.851566379688473e-06, + "loss": 0.8661, + "step": 5861 + }, + { + "epoch": 0.3160448565883114, + "grad_norm": 0.7253924608230591, + "learning_rate": 9.851515101310776e-06, + "loss": 0.7806, + "step": 5862 + }, + { + "epoch": 0.3160987707569549, + "grad_norm": 0.70335853099823, + "learning_rate": 9.85146381421071e-06, + "loss": 0.8002, + "step": 5863 + }, + { + "epoch": 0.31615268492559845, + "grad_norm": 0.6935878396034241, + "learning_rate": 9.851412518388363e-06, + "loss": 0.7306, + "step": 5864 + }, + { + "epoch": 0.316206599094242, + "grad_norm": 0.840141236782074, + "learning_rate": 9.851361213843834e-06, + "loss": 0.8375, + "step": 5865 + }, + { + "epoch": 0.3162605132628855, + "grad_norm": 0.7647420167922974, + "learning_rate": 9.851309900577211e-06, + "loss": 0.7402, + "step": 5866 + }, + { + "epoch": 0.316314427431529, + "grad_norm": 0.8464421629905701, + "learning_rate": 9.851258578588586e-06, + "loss": 0.7871, + "step": 5867 + }, + { + "epoch": 0.3163683416001725, + "grad_norm": 0.7676255702972412, + "learning_rate": 9.851207247878053e-06, + "loss": 0.706, + "step": 5868 + }, + { + "epoch": 0.31642225576881605, + "grad_norm": 0.6883916258811951, + "learning_rate": 
9.851155908445704e-06, + "loss": 0.7738, + "step": 5869 + }, + { + "epoch": 0.3164761699374596, + "grad_norm": 0.687218427658081, + "learning_rate": 9.851104560291631e-06, + "loss": 0.6875, + "step": 5870 + }, + { + "epoch": 0.31653008410610306, + "grad_norm": 0.7811813354492188, + "learning_rate": 9.851053203415926e-06, + "loss": 0.6922, + "step": 5871 + }, + { + "epoch": 0.3165839982747466, + "grad_norm": 0.85787034034729, + "learning_rate": 9.851001837818681e-06, + "loss": 0.8895, + "step": 5872 + }, + { + "epoch": 0.3166379124433901, + "grad_norm": 0.9360331296920776, + "learning_rate": 9.85095046349999e-06, + "loss": 0.878, + "step": 5873 + }, + { + "epoch": 0.31669182661203366, + "grad_norm": 0.8196890950202942, + "learning_rate": 9.850899080459944e-06, + "loss": 0.8456, + "step": 5874 + }, + { + "epoch": 0.3167457407806772, + "grad_norm": 0.7880243062973022, + "learning_rate": 9.850847688698634e-06, + "loss": 0.7871, + "step": 5875 + }, + { + "epoch": 0.31679965494932066, + "grad_norm": 0.8372710943222046, + "learning_rate": 9.850796288216158e-06, + "loss": 0.8296, + "step": 5876 + }, + { + "epoch": 0.3168535691179642, + "grad_norm": 0.735093355178833, + "learning_rate": 9.8507448790126e-06, + "loss": 0.6431, + "step": 5877 + }, + { + "epoch": 0.3169074832866077, + "grad_norm": 0.840723991394043, + "learning_rate": 9.850693461088061e-06, + "loss": 0.8992, + "step": 5878 + }, + { + "epoch": 0.31696139745525126, + "grad_norm": 0.7168570756912231, + "learning_rate": 9.850642034442628e-06, + "loss": 0.7286, + "step": 5879 + }, + { + "epoch": 0.31701531162389474, + "grad_norm": 0.7975905537605286, + "learning_rate": 9.850590599076393e-06, + "loss": 0.8534, + "step": 5880 + }, + { + "epoch": 0.31706922579253827, + "grad_norm": 0.7787801027297974, + "learning_rate": 9.850539154989452e-06, + "loss": 0.7506, + "step": 5881 + }, + { + "epoch": 0.3171231399611818, + "grad_norm": 0.8750953674316406, + "learning_rate": 9.850487702181895e-06, + "loss": 0.8317, + "step": 
5882 + }, + { + "epoch": 0.31717705412982533, + "grad_norm": 0.7858786582946777, + "learning_rate": 9.850436240653815e-06, + "loss": 0.7943, + "step": 5883 + }, + { + "epoch": 0.31723096829846886, + "grad_norm": 0.6804238557815552, + "learning_rate": 9.850384770405306e-06, + "loss": 0.6855, + "step": 5884 + }, + { + "epoch": 0.31728488246711234, + "grad_norm": 0.7625332474708557, + "learning_rate": 9.850333291436461e-06, + "loss": 0.7748, + "step": 5885 + }, + { + "epoch": 0.31733879663575587, + "grad_norm": 0.7445969581604004, + "learning_rate": 9.850281803747367e-06, + "loss": 0.6449, + "step": 5886 + }, + { + "epoch": 0.3173927108043994, + "grad_norm": 0.7251415848731995, + "learning_rate": 9.850230307338122e-06, + "loss": 0.7658, + "step": 5887 + }, + { + "epoch": 0.31744662497304293, + "grad_norm": 0.9136094450950623, + "learning_rate": 9.850178802208818e-06, + "loss": 0.8192, + "step": 5888 + }, + { + "epoch": 0.3175005391416864, + "grad_norm": 0.6731426119804382, + "learning_rate": 9.850127288359546e-06, + "loss": 0.6744, + "step": 5889 + }, + { + "epoch": 0.31755445331032994, + "grad_norm": 0.6679263114929199, + "learning_rate": 9.8500757657904e-06, + "loss": 0.6734, + "step": 5890 + }, + { + "epoch": 0.3176083674789735, + "grad_norm": 0.7872502207756042, + "learning_rate": 9.850024234501469e-06, + "loss": 0.735, + "step": 5891 + }, + { + "epoch": 0.317662281647617, + "grad_norm": 0.806593656539917, + "learning_rate": 9.849972694492851e-06, + "loss": 0.8145, + "step": 5892 + }, + { + "epoch": 0.31771619581626054, + "grad_norm": 0.6743435859680176, + "learning_rate": 9.849921145764637e-06, + "loss": 0.7039, + "step": 5893 + }, + { + "epoch": 0.317770109984904, + "grad_norm": 0.7740095853805542, + "learning_rate": 9.849869588316919e-06, + "loss": 0.725, + "step": 5894 + }, + { + "epoch": 0.31782402415354755, + "grad_norm": 0.7239527702331543, + "learning_rate": 9.849818022149787e-06, + "loss": 0.7544, + "step": 5895 + }, + { + "epoch": 0.3178779383221911, + 
"grad_norm": 0.7768372297286987, + "learning_rate": 9.849766447263338e-06, + "loss": 0.7481, + "step": 5896 + }, + { + "epoch": 0.3179318524908346, + "grad_norm": 0.7212410569190979, + "learning_rate": 9.849714863657663e-06, + "loss": 0.7734, + "step": 5897 + }, + { + "epoch": 0.3179857666594781, + "grad_norm": 0.7393351793289185, + "learning_rate": 9.849663271332854e-06, + "loss": 0.7746, + "step": 5898 + }, + { + "epoch": 0.3180396808281216, + "grad_norm": 0.8589115142822266, + "learning_rate": 9.849611670289006e-06, + "loss": 0.8, + "step": 5899 + }, + { + "epoch": 0.31809359499676515, + "grad_norm": 0.7275830507278442, + "learning_rate": 9.849560060526208e-06, + "loss": 0.751, + "step": 5900 + }, + { + "epoch": 0.3181475091654087, + "grad_norm": 0.7192492485046387, + "learning_rate": 9.849508442044557e-06, + "loss": 0.79, + "step": 5901 + }, + { + "epoch": 0.3182014233340522, + "grad_norm": 0.7314275503158569, + "learning_rate": 9.849456814844143e-06, + "loss": 0.7547, + "step": 5902 + }, + { + "epoch": 0.3182553375026957, + "grad_norm": 0.8149042725563049, + "learning_rate": 9.84940517892506e-06, + "loss": 0.8646, + "step": 5903 + }, + { + "epoch": 0.3183092516713392, + "grad_norm": 0.8278919458389282, + "learning_rate": 9.8493535342874e-06, + "loss": 0.8001, + "step": 5904 + }, + { + "epoch": 0.31836316583998275, + "grad_norm": 0.6453657746315002, + "learning_rate": 9.849301880931257e-06, + "loss": 0.6624, + "step": 5905 + }, + { + "epoch": 0.3184170800086263, + "grad_norm": 0.7421174049377441, + "learning_rate": 9.849250218856722e-06, + "loss": 0.7438, + "step": 5906 + }, + { + "epoch": 0.31847099417726976, + "grad_norm": 0.7499041557312012, + "learning_rate": 9.849198548063892e-06, + "loss": 0.7257, + "step": 5907 + }, + { + "epoch": 0.3185249083459133, + "grad_norm": 0.831326961517334, + "learning_rate": 9.849146868552854e-06, + "loss": 0.8768, + "step": 5908 + }, + { + "epoch": 0.3185788225145568, + "grad_norm": 0.8465683460235596, + "learning_rate": 
9.849095180323706e-06, + "loss": 0.7383, + "step": 5909 + }, + { + "epoch": 0.31863273668320036, + "grad_norm": 0.7447533011436462, + "learning_rate": 9.849043483376537e-06, + "loss": 0.7728, + "step": 5910 + }, + { + "epoch": 0.3186866508518439, + "grad_norm": 0.6588696241378784, + "learning_rate": 9.848991777711443e-06, + "loss": 0.6385, + "step": 5911 + }, + { + "epoch": 0.31874056502048737, + "grad_norm": 0.7334310412406921, + "learning_rate": 9.848940063328516e-06, + "loss": 0.7306, + "step": 5912 + }, + { + "epoch": 0.3187944791891309, + "grad_norm": 0.7731287479400635, + "learning_rate": 9.84888834022785e-06, + "loss": 0.739, + "step": 5913 + }, + { + "epoch": 0.31884839335777443, + "grad_norm": 0.7623046040534973, + "learning_rate": 9.848836608409534e-06, + "loss": 0.7694, + "step": 5914 + }, + { + "epoch": 0.31890230752641796, + "grad_norm": 0.8255560398101807, + "learning_rate": 9.848784867873664e-06, + "loss": 0.7706, + "step": 5915 + }, + { + "epoch": 0.31895622169506144, + "grad_norm": 0.7074644565582275, + "learning_rate": 9.848733118620333e-06, + "loss": 0.7507, + "step": 5916 + }, + { + "epoch": 0.31901013586370497, + "grad_norm": 0.7689027190208435, + "learning_rate": 9.848681360649635e-06, + "loss": 0.8385, + "step": 5917 + }, + { + "epoch": 0.3190640500323485, + "grad_norm": 0.8051035404205322, + "learning_rate": 9.848629593961661e-06, + "loss": 0.8275, + "step": 5918 + }, + { + "epoch": 0.31911796420099203, + "grad_norm": 0.8028756976127625, + "learning_rate": 9.848577818556506e-06, + "loss": 0.7473, + "step": 5919 + }, + { + "epoch": 0.31917187836963556, + "grad_norm": 0.717805802822113, + "learning_rate": 9.848526034434261e-06, + "loss": 0.7486, + "step": 5920 + }, + { + "epoch": 0.31922579253827904, + "grad_norm": 0.8375559449195862, + "learning_rate": 9.848474241595021e-06, + "loss": 0.6865, + "step": 5921 + }, + { + "epoch": 0.3192797067069226, + "grad_norm": 0.7154849767684937, + "learning_rate": 9.848422440038877e-06, + "loss": 0.6794, + 
"step": 5922 + }, + { + "epoch": 0.3193336208755661, + "grad_norm": 0.7213690280914307, + "learning_rate": 9.848370629765923e-06, + "loss": 0.814, + "step": 5923 + }, + { + "epoch": 0.31938753504420964, + "grad_norm": 1.3015811443328857, + "learning_rate": 9.848318810776254e-06, + "loss": 0.825, + "step": 5924 + }, + { + "epoch": 0.3194414492128531, + "grad_norm": 0.8018954992294312, + "learning_rate": 9.848266983069961e-06, + "loss": 0.8223, + "step": 5925 + }, + { + "epoch": 0.31949536338149664, + "grad_norm": 0.8266699910163879, + "learning_rate": 9.848215146647137e-06, + "loss": 0.7321, + "step": 5926 + }, + { + "epoch": 0.3195492775501402, + "grad_norm": 0.6959900259971619, + "learning_rate": 9.848163301507878e-06, + "loss": 0.6819, + "step": 5927 + }, + { + "epoch": 0.3196031917187837, + "grad_norm": 0.8597580194473267, + "learning_rate": 9.848111447652272e-06, + "loss": 0.81, + "step": 5928 + }, + { + "epoch": 0.31965710588742724, + "grad_norm": 0.7409177422523499, + "learning_rate": 9.84805958508042e-06, + "loss": 0.7705, + "step": 5929 + }, + { + "epoch": 0.3197110200560707, + "grad_norm": 0.6678770780563354, + "learning_rate": 9.848007713792409e-06, + "loss": 0.7095, + "step": 5930 + }, + { + "epoch": 0.31976493422471425, + "grad_norm": 0.6633094549179077, + "learning_rate": 9.847955833788332e-06, + "loss": 0.6253, + "step": 5931 + }, + { + "epoch": 0.3198188483933578, + "grad_norm": 0.7608556151390076, + "learning_rate": 9.847903945068286e-06, + "loss": 0.805, + "step": 5932 + }, + { + "epoch": 0.3198727625620013, + "grad_norm": 0.790964663028717, + "learning_rate": 9.847852047632362e-06, + "loss": 0.7435, + "step": 5933 + }, + { + "epoch": 0.3199266767306448, + "grad_norm": 0.7132748961448669, + "learning_rate": 9.847800141480654e-06, + "loss": 0.7985, + "step": 5934 + }, + { + "epoch": 0.3199805908992883, + "grad_norm": 0.6799381971359253, + "learning_rate": 9.847748226613255e-06, + "loss": 0.6708, + "step": 5935 + }, + { + "epoch": 
0.32003450506793185, + "grad_norm": 0.7694737911224365, + "learning_rate": 9.847696303030258e-06, + "loss": 0.7736, + "step": 5936 + }, + { + "epoch": 0.3200884192365754, + "grad_norm": 0.7217262983322144, + "learning_rate": 9.84764437073176e-06, + "loss": 0.6892, + "step": 5937 + }, + { + "epoch": 0.3201423334052189, + "grad_norm": 0.7349720001220703, + "learning_rate": 9.847592429717848e-06, + "loss": 0.854, + "step": 5938 + }, + { + "epoch": 0.3201962475738624, + "grad_norm": 0.7681494951248169, + "learning_rate": 9.847540479988619e-06, + "loss": 0.7571, + "step": 5939 + }, + { + "epoch": 0.3202501617425059, + "grad_norm": 0.8056629300117493, + "learning_rate": 9.847488521544166e-06, + "loss": 0.8339, + "step": 5940 + }, + { + "epoch": 0.32030407591114946, + "grad_norm": 0.7394456267356873, + "learning_rate": 9.847436554384584e-06, + "loss": 0.7404, + "step": 5941 + }, + { + "epoch": 0.320357990079793, + "grad_norm": 0.7189937829971313, + "learning_rate": 9.847384578509962e-06, + "loss": 0.7913, + "step": 5942 + }, + { + "epoch": 0.32041190424843646, + "grad_norm": 0.7788477540016174, + "learning_rate": 9.847332593920398e-06, + "loss": 0.7626, + "step": 5943 + }, + { + "epoch": 0.32046581841708, + "grad_norm": 0.7758776545524597, + "learning_rate": 9.847280600615986e-06, + "loss": 0.7319, + "step": 5944 + }, + { + "epoch": 0.3205197325857235, + "grad_norm": 0.7239378690719604, + "learning_rate": 9.847228598596813e-06, + "loss": 0.7236, + "step": 5945 + }, + { + "epoch": 0.32057364675436706, + "grad_norm": 0.6927216649055481, + "learning_rate": 9.847176587862979e-06, + "loss": 0.6876, + "step": 5946 + }, + { + "epoch": 0.3206275609230106, + "grad_norm": 0.6984835863113403, + "learning_rate": 9.847124568414575e-06, + "loss": 0.7333, + "step": 5947 + }, + { + "epoch": 0.32068147509165407, + "grad_norm": 0.7435864806175232, + "learning_rate": 9.847072540251694e-06, + "loss": 0.8024, + "step": 5948 + }, + { + "epoch": 0.3207353892602976, + "grad_norm": 
0.7117223143577576, + "learning_rate": 9.847020503374432e-06, + "loss": 0.7618, + "step": 5949 + }, + { + "epoch": 0.32078930342894113, + "grad_norm": 0.9263757467269897, + "learning_rate": 9.84696845778288e-06, + "loss": 0.8615, + "step": 5950 + }, + { + "epoch": 0.32084321759758466, + "grad_norm": 0.7057978510856628, + "learning_rate": 9.846916403477132e-06, + "loss": 0.7523, + "step": 5951 + }, + { + "epoch": 0.32089713176622814, + "grad_norm": 0.7646591067314148, + "learning_rate": 9.846864340457282e-06, + "loss": 0.795, + "step": 5952 + }, + { + "epoch": 0.32095104593487167, + "grad_norm": 0.7106613516807556, + "learning_rate": 9.846812268723423e-06, + "loss": 0.7632, + "step": 5953 + }, + { + "epoch": 0.3210049601035152, + "grad_norm": 0.7975527048110962, + "learning_rate": 9.84676018827565e-06, + "loss": 0.7104, + "step": 5954 + }, + { + "epoch": 0.32105887427215873, + "grad_norm": 0.6670999526977539, + "learning_rate": 9.846708099114058e-06, + "loss": 0.703, + "step": 5955 + }, + { + "epoch": 0.32111278844080227, + "grad_norm": 0.8072288632392883, + "learning_rate": 9.846656001238735e-06, + "loss": 0.704, + "step": 5956 + }, + { + "epoch": 0.32116670260944574, + "grad_norm": 0.7998098134994507, + "learning_rate": 9.84660389464978e-06, + "loss": 0.8175, + "step": 5957 + }, + { + "epoch": 0.3212206167780893, + "grad_norm": 0.821967363357544, + "learning_rate": 9.846551779347284e-06, + "loss": 0.8373, + "step": 5958 + }, + { + "epoch": 0.3212745309467328, + "grad_norm": 0.6980282068252563, + "learning_rate": 9.846499655331343e-06, + "loss": 0.6926, + "step": 5959 + }, + { + "epoch": 0.32132844511537634, + "grad_norm": 0.778827965259552, + "learning_rate": 9.846447522602047e-06, + "loss": 0.7612, + "step": 5960 + }, + { + "epoch": 0.3213823592840198, + "grad_norm": 0.8421810269355774, + "learning_rate": 9.846395381159494e-06, + "loss": 0.8181, + "step": 5961 + }, + { + "epoch": 0.32143627345266335, + "grad_norm": 0.7123310565948486, + "learning_rate": 
9.846343231003776e-06, + "loss": 0.748, + "step": 5962 + }, + { + "epoch": 0.3214901876213069, + "grad_norm": 0.7417185306549072, + "learning_rate": 9.846291072134984e-06, + "loss": 0.7674, + "step": 5963 + }, + { + "epoch": 0.3215441017899504, + "grad_norm": 0.6777938604354858, + "learning_rate": 9.846238904553216e-06, + "loss": 0.7341, + "step": 5964 + }, + { + "epoch": 0.32159801595859394, + "grad_norm": 0.7674862146377563, + "learning_rate": 9.846186728258564e-06, + "loss": 0.7622, + "step": 5965 + }, + { + "epoch": 0.3216519301272374, + "grad_norm": 0.751640796661377, + "learning_rate": 9.846134543251122e-06, + "loss": 0.8033, + "step": 5966 + }, + { + "epoch": 0.32170584429588095, + "grad_norm": 0.8294724225997925, + "learning_rate": 9.846082349530983e-06, + "loss": 0.8026, + "step": 5967 + }, + { + "epoch": 0.3217597584645245, + "grad_norm": 0.7189640998840332, + "learning_rate": 9.846030147098243e-06, + "loss": 0.7441, + "step": 5968 + }, + { + "epoch": 0.321813672633168, + "grad_norm": 0.821915864944458, + "learning_rate": 9.845977935952993e-06, + "loss": 0.7551, + "step": 5969 + }, + { + "epoch": 0.3218675868018115, + "grad_norm": 0.7824541926383972, + "learning_rate": 9.84592571609533e-06, + "loss": 0.8196, + "step": 5970 + }, + { + "epoch": 0.321921500970455, + "grad_norm": 0.6421594619750977, + "learning_rate": 9.845873487525343e-06, + "loss": 0.6157, + "step": 5971 + }, + { + "epoch": 0.32197541513909855, + "grad_norm": 0.7021391987800598, + "learning_rate": 9.845821250243132e-06, + "loss": 0.6678, + "step": 5972 + }, + { + "epoch": 0.3220293293077421, + "grad_norm": 0.7680091261863708, + "learning_rate": 9.845769004248787e-06, + "loss": 0.7955, + "step": 5973 + }, + { + "epoch": 0.3220832434763856, + "grad_norm": 0.8046531081199646, + "learning_rate": 9.845716749542403e-06, + "loss": 0.7923, + "step": 5974 + }, + { + "epoch": 0.3221371576450291, + "grad_norm": 0.7487708330154419, + "learning_rate": 9.845664486124073e-06, + "loss": 0.8082, + "step": 
5975 + }, + { + "epoch": 0.3221910718136726, + "grad_norm": 0.9088075757026672, + "learning_rate": 9.845612213993891e-06, + "loss": 0.8243, + "step": 5976 + }, + { + "epoch": 0.32224498598231616, + "grad_norm": 0.7386162877082825, + "learning_rate": 9.845559933151953e-06, + "loss": 0.7488, + "step": 5977 + }, + { + "epoch": 0.3222989001509597, + "grad_norm": 0.7410885691642761, + "learning_rate": 9.845507643598352e-06, + "loss": 0.7584, + "step": 5978 + }, + { + "epoch": 0.32235281431960316, + "grad_norm": 0.7176096439361572, + "learning_rate": 9.84545534533318e-06, + "loss": 0.6489, + "step": 5979 + }, + { + "epoch": 0.3224067284882467, + "grad_norm": 0.7185577154159546, + "learning_rate": 9.845403038356536e-06, + "loss": 0.8144, + "step": 5980 + }, + { + "epoch": 0.32246064265689023, + "grad_norm": 0.7305336594581604, + "learning_rate": 9.84535072266851e-06, + "loss": 0.8169, + "step": 5981 + }, + { + "epoch": 0.32251455682553376, + "grad_norm": 0.692741870880127, + "learning_rate": 9.845298398269196e-06, + "loss": 0.7864, + "step": 5982 + }, + { + "epoch": 0.3225684709941773, + "grad_norm": 0.7505115270614624, + "learning_rate": 9.845246065158688e-06, + "loss": 0.6488, + "step": 5983 + }, + { + "epoch": 0.32262238516282077, + "grad_norm": 0.7783107757568359, + "learning_rate": 9.845193723337083e-06, + "loss": 0.8142, + "step": 5984 + }, + { + "epoch": 0.3226762993314643, + "grad_norm": 0.8554279804229736, + "learning_rate": 9.845141372804473e-06, + "loss": 0.8846, + "step": 5985 + }, + { + "epoch": 0.32273021350010783, + "grad_norm": 0.6922138333320618, + "learning_rate": 9.845089013560952e-06, + "loss": 0.7624, + "step": 5986 + }, + { + "epoch": 0.32278412766875136, + "grad_norm": 0.7180864214897156, + "learning_rate": 9.845036645606613e-06, + "loss": 0.7669, + "step": 5987 + }, + { + "epoch": 0.32283804183739484, + "grad_norm": 0.9408607482910156, + "learning_rate": 9.844984268941552e-06, + "loss": 0.8211, + "step": 5988 + }, + { + "epoch": 
0.32289195600603837, + "grad_norm": 0.7381359934806824, + "learning_rate": 9.844931883565862e-06, + "loss": 0.7911, + "step": 5989 + }, + { + "epoch": 0.3229458701746819, + "grad_norm": 0.9394528865814209, + "learning_rate": 9.84487948947964e-06, + "loss": 0.711, + "step": 5990 + }, + { + "epoch": 0.32299978434332544, + "grad_norm": 0.7220025062561035, + "learning_rate": 9.844827086682978e-06, + "loss": 0.7351, + "step": 5991 + }, + { + "epoch": 0.32305369851196897, + "grad_norm": 0.7684987187385559, + "learning_rate": 9.84477467517597e-06, + "loss": 0.7449, + "step": 5992 + }, + { + "epoch": 0.32310761268061244, + "grad_norm": 0.8644974231719971, + "learning_rate": 9.84472225495871e-06, + "loss": 0.6884, + "step": 5993 + }, + { + "epoch": 0.323161526849256, + "grad_norm": 0.7389580011367798, + "learning_rate": 9.844669826031291e-06, + "loss": 0.8052, + "step": 5994 + }, + { + "epoch": 0.3232154410178995, + "grad_norm": 0.8581752777099609, + "learning_rate": 9.844617388393812e-06, + "loss": 0.7494, + "step": 5995 + }, + { + "epoch": 0.32326935518654304, + "grad_norm": 0.6825373768806458, + "learning_rate": 9.844564942046365e-06, + "loss": 0.6883, + "step": 5996 + }, + { + "epoch": 0.3233232693551865, + "grad_norm": 0.6962830424308777, + "learning_rate": 9.84451248698904e-06, + "loss": 0.7378, + "step": 5997 + }, + { + "epoch": 0.32337718352383005, + "grad_norm": 0.9575344920158386, + "learning_rate": 9.844460023221938e-06, + "loss": 0.8532, + "step": 5998 + }, + { + "epoch": 0.3234310976924736, + "grad_norm": 0.775353729724884, + "learning_rate": 9.844407550745148e-06, + "loss": 0.8116, + "step": 5999 + }, + { + "epoch": 0.3234850118611171, + "grad_norm": 0.780318558216095, + "learning_rate": 9.844355069558768e-06, + "loss": 0.8569, + "step": 6000 + }, + { + "epoch": 0.32353892602976064, + "grad_norm": 0.7270397543907166, + "learning_rate": 9.84430257966289e-06, + "loss": 0.7962, + "step": 6001 + }, + { + "epoch": 0.3235928401984041, + "grad_norm": 
0.7463461756706238, + "learning_rate": 9.844250081057612e-06, + "loss": 0.8462, + "step": 6002 + }, + { + "epoch": 0.32364675436704765, + "grad_norm": 1.2405445575714111, + "learning_rate": 9.844197573743022e-06, + "loss": 0.8676, + "step": 6003 + }, + { + "epoch": 0.3237006685356912, + "grad_norm": 0.7673426866531372, + "learning_rate": 9.84414505771922e-06, + "loss": 0.7668, + "step": 6004 + }, + { + "epoch": 0.3237545827043347, + "grad_norm": 0.9063783884048462, + "learning_rate": 9.844092532986298e-06, + "loss": 0.8576, + "step": 6005 + }, + { + "epoch": 0.3238084968729782, + "grad_norm": 0.8285248279571533, + "learning_rate": 9.844039999544351e-06, + "loss": 0.775, + "step": 6006 + }, + { + "epoch": 0.3238624110416217, + "grad_norm": 0.8698264956474304, + "learning_rate": 9.843987457393474e-06, + "loss": 0.8259, + "step": 6007 + }, + { + "epoch": 0.32391632521026525, + "grad_norm": 0.7413813471794128, + "learning_rate": 9.843934906533761e-06, + "loss": 0.7364, + "step": 6008 + }, + { + "epoch": 0.3239702393789088, + "grad_norm": 0.7562968134880066, + "learning_rate": 9.843882346965305e-06, + "loss": 0.7725, + "step": 6009 + }, + { + "epoch": 0.3240241535475523, + "grad_norm": 0.7958462238311768, + "learning_rate": 9.843829778688203e-06, + "loss": 0.8649, + "step": 6010 + }, + { + "epoch": 0.3240780677161958, + "grad_norm": 0.7941526174545288, + "learning_rate": 9.843777201702547e-06, + "loss": 0.7849, + "step": 6011 + }, + { + "epoch": 0.3241319818848393, + "grad_norm": 0.7362817525863647, + "learning_rate": 9.843724616008434e-06, + "loss": 0.7663, + "step": 6012 + }, + { + "epoch": 0.32418589605348286, + "grad_norm": 0.7165996432304382, + "learning_rate": 9.84367202160596e-06, + "loss": 0.7215, + "step": 6013 + }, + { + "epoch": 0.3242398102221264, + "grad_norm": 0.7727562785148621, + "learning_rate": 9.843619418495212e-06, + "loss": 0.8027, + "step": 6014 + }, + { + "epoch": 0.3242937243907699, + "grad_norm": 0.7132217288017273, + "learning_rate": 
9.843566806676292e-06, + "loss": 0.6801, + "step": 6015 + }, + { + "epoch": 0.3243476385594134, + "grad_norm": 0.7188240885734558, + "learning_rate": 9.843514186149292e-06, + "loss": 0.7477, + "step": 6016 + }, + { + "epoch": 0.32440155272805693, + "grad_norm": 0.7331006526947021, + "learning_rate": 9.843461556914307e-06, + "loss": 0.7364, + "step": 6017 + }, + { + "epoch": 0.32445546689670046, + "grad_norm": 0.8003280162811279, + "learning_rate": 9.843408918971432e-06, + "loss": 0.7866, + "step": 6018 + }, + { + "epoch": 0.324509381065344, + "grad_norm": 0.7623522281646729, + "learning_rate": 9.843356272320758e-06, + "loss": 0.7346, + "step": 6019 + }, + { + "epoch": 0.32456329523398747, + "grad_norm": 0.735309362411499, + "learning_rate": 9.843303616962386e-06, + "loss": 0.7657, + "step": 6020 + }, + { + "epoch": 0.324617209402631, + "grad_norm": 0.7945713996887207, + "learning_rate": 9.843250952896407e-06, + "loss": 0.7369, + "step": 6021 + }, + { + "epoch": 0.32467112357127453, + "grad_norm": 0.717688798904419, + "learning_rate": 9.843198280122914e-06, + "loss": 0.7384, + "step": 6022 + }, + { + "epoch": 0.32472503773991807, + "grad_norm": 0.7865655422210693, + "learning_rate": 9.843145598642005e-06, + "loss": 0.774, + "step": 6023 + }, + { + "epoch": 0.3247789519085616, + "grad_norm": 0.7063577175140381, + "learning_rate": 9.843092908453773e-06, + "loss": 0.7493, + "step": 6024 + }, + { + "epoch": 0.3248328660772051, + "grad_norm": 1.2190371751785278, + "learning_rate": 9.843040209558313e-06, + "loss": 0.7435, + "step": 6025 + }, + { + "epoch": 0.3248867802458486, + "grad_norm": 0.810580849647522, + "learning_rate": 9.842987501955719e-06, + "loss": 0.8294, + "step": 6026 + }, + { + "epoch": 0.32494069441449214, + "grad_norm": 0.8750993609428406, + "learning_rate": 9.842934785646088e-06, + "loss": 0.7891, + "step": 6027 + }, + { + "epoch": 0.32499460858313567, + "grad_norm": 0.7133095860481262, + "learning_rate": 9.842882060629513e-06, + "loss": 0.8087, + 
"step": 6028 + }, + { + "epoch": 0.32504852275177915, + "grad_norm": 0.781443178653717, + "learning_rate": 9.842829326906089e-06, + "loss": 0.8137, + "step": 6029 + }, + { + "epoch": 0.3251024369204227, + "grad_norm": 0.789086639881134, + "learning_rate": 9.842776584475913e-06, + "loss": 0.8163, + "step": 6030 + }, + { + "epoch": 0.3251563510890662, + "grad_norm": 0.8399695158004761, + "learning_rate": 9.842723833339077e-06, + "loss": 0.7431, + "step": 6031 + }, + { + "epoch": 0.32521026525770974, + "grad_norm": 0.7760040163993835, + "learning_rate": 9.842671073495675e-06, + "loss": 0.8415, + "step": 6032 + }, + { + "epoch": 0.3252641794263533, + "grad_norm": 0.7111086249351501, + "learning_rate": 9.842618304945804e-06, + "loss": 0.7675, + "step": 6033 + }, + { + "epoch": 0.32531809359499675, + "grad_norm": 0.7499242424964905, + "learning_rate": 9.842565527689558e-06, + "loss": 0.7134, + "step": 6034 + }, + { + "epoch": 0.3253720077636403, + "grad_norm": 0.7601577639579773, + "learning_rate": 9.842512741727035e-06, + "loss": 0.7921, + "step": 6035 + }, + { + "epoch": 0.3254259219322838, + "grad_norm": 0.7153837084770203, + "learning_rate": 9.842459947058325e-06, + "loss": 0.7351, + "step": 6036 + }, + { + "epoch": 0.32547983610092734, + "grad_norm": 1.6180272102355957, + "learning_rate": 9.842407143683525e-06, + "loss": 0.8145, + "step": 6037 + }, + { + "epoch": 0.3255337502695708, + "grad_norm": 0.6792227625846863, + "learning_rate": 9.84235433160273e-06, + "loss": 0.6771, + "step": 6038 + }, + { + "epoch": 0.32558766443821435, + "grad_norm": 0.8405896425247192, + "learning_rate": 9.842301510816036e-06, + "loss": 0.8286, + "step": 6039 + }, + { + "epoch": 0.3256415786068579, + "grad_norm": 0.7823806405067444, + "learning_rate": 9.842248681323536e-06, + "loss": 0.8401, + "step": 6040 + }, + { + "epoch": 0.3256954927755014, + "grad_norm": 0.7300710678100586, + "learning_rate": 9.842195843125327e-06, + "loss": 0.7567, + "step": 6041 + }, + { + "epoch": 
0.32574940694414495, + "grad_norm": 0.8341587781906128, + "learning_rate": 9.842142996221502e-06, + "loss": 0.8506, + "step": 6042 + }, + { + "epoch": 0.3258033211127884, + "grad_norm": 0.8645915985107422, + "learning_rate": 9.842090140612158e-06, + "loss": 0.9672, + "step": 6043 + }, + { + "epoch": 0.32585723528143196, + "grad_norm": 0.6935297250747681, + "learning_rate": 9.84203727629739e-06, + "loss": 0.783, + "step": 6044 + }, + { + "epoch": 0.3259111494500755, + "grad_norm": 0.728500485420227, + "learning_rate": 9.84198440327729e-06, + "loss": 0.831, + "step": 6045 + }, + { + "epoch": 0.325965063618719, + "grad_norm": 0.8680667877197266, + "learning_rate": 9.841931521551955e-06, + "loss": 0.7624, + "step": 6046 + }, + { + "epoch": 0.3260189777873625, + "grad_norm": 0.7493764162063599, + "learning_rate": 9.84187863112148e-06, + "loss": 0.7204, + "step": 6047 + }, + { + "epoch": 0.32607289195600603, + "grad_norm": 0.7388346195220947, + "learning_rate": 9.84182573198596e-06, + "loss": 0.7824, + "step": 6048 + }, + { + "epoch": 0.32612680612464956, + "grad_norm": 0.76985764503479, + "learning_rate": 9.841772824145493e-06, + "loss": 0.7255, + "step": 6049 + }, + { + "epoch": 0.3261807202932931, + "grad_norm": 0.9086965322494507, + "learning_rate": 9.841719907600168e-06, + "loss": 0.7941, + "step": 6050 + }, + { + "epoch": 0.3262346344619366, + "grad_norm": 0.7744296789169312, + "learning_rate": 9.841666982350087e-06, + "loss": 0.7908, + "step": 6051 + }, + { + "epoch": 0.3262885486305801, + "grad_norm": 0.7631317973136902, + "learning_rate": 9.84161404839534e-06, + "loss": 0.8129, + "step": 6052 + }, + { + "epoch": 0.32634246279922363, + "grad_norm": 0.8645358085632324, + "learning_rate": 9.841561105736023e-06, + "loss": 0.8719, + "step": 6053 + }, + { + "epoch": 0.32639637696786716, + "grad_norm": 0.8196138143539429, + "learning_rate": 9.841508154372233e-06, + "loss": 0.7742, + "step": 6054 + }, + { + "epoch": 0.3264502911365107, + "grad_norm": 0.7668523788452148, 
+ "learning_rate": 9.841455194304065e-06, + "loss": 0.7545, + "step": 6055 + }, + { + "epoch": 0.32650420530515417, + "grad_norm": 0.7486326694488525, + "learning_rate": 9.841402225531613e-06, + "loss": 0.8312, + "step": 6056 + }, + { + "epoch": 0.3265581194737977, + "grad_norm": 0.7360913157463074, + "learning_rate": 9.841349248054972e-06, + "loss": 0.7447, + "step": 6057 + }, + { + "epoch": 0.32661203364244124, + "grad_norm": 0.6595920324325562, + "learning_rate": 9.84129626187424e-06, + "loss": 0.6934, + "step": 6058 + }, + { + "epoch": 0.32666594781108477, + "grad_norm": 0.7957530617713928, + "learning_rate": 9.841243266989509e-06, + "loss": 0.7758, + "step": 6059 + }, + { + "epoch": 0.3267198619797283, + "grad_norm": 0.6686688661575317, + "learning_rate": 9.841190263400874e-06, + "loss": 0.7071, + "step": 6060 + }, + { + "epoch": 0.3267737761483718, + "grad_norm": 0.7522386312484741, + "learning_rate": 9.841137251108434e-06, + "loss": 0.7658, + "step": 6061 + }, + { + "epoch": 0.3268276903170153, + "grad_norm": 0.7338777780532837, + "learning_rate": 9.841084230112281e-06, + "loss": 0.7519, + "step": 6062 + }, + { + "epoch": 0.32688160448565884, + "grad_norm": 0.7533041834831238, + "learning_rate": 9.841031200412513e-06, + "loss": 0.7219, + "step": 6063 + }, + { + "epoch": 0.32693551865430237, + "grad_norm": 0.7255193591117859, + "learning_rate": 9.840978162009223e-06, + "loss": 0.6612, + "step": 6064 + }, + { + "epoch": 0.32698943282294585, + "grad_norm": 0.8155741095542908, + "learning_rate": 9.840925114902507e-06, + "loss": 0.9107, + "step": 6065 + }, + { + "epoch": 0.3270433469915894, + "grad_norm": 0.7890196442604065, + "learning_rate": 9.84087205909246e-06, + "loss": 0.7467, + "step": 6066 + }, + { + "epoch": 0.3270972611602329, + "grad_norm": 0.8308191299438477, + "learning_rate": 9.840818994579178e-06, + "loss": 0.8076, + "step": 6067 + }, + { + "epoch": 0.32715117532887644, + "grad_norm": 0.7396252155303955, + "learning_rate": 9.840765921362756e-06, + 
"loss": 0.7616, + "step": 6068 + }, + { + "epoch": 0.32720508949752, + "grad_norm": 0.7735585570335388, + "learning_rate": 9.840712839443291e-06, + "loss": 0.761, + "step": 6069 + }, + { + "epoch": 0.32725900366616345, + "grad_norm": 0.8124379515647888, + "learning_rate": 9.840659748820878e-06, + "loss": 0.8952, + "step": 6070 + }, + { + "epoch": 0.327312917834807, + "grad_norm": 0.7297885417938232, + "learning_rate": 9.84060664949561e-06, + "loss": 0.7946, + "step": 6071 + }, + { + "epoch": 0.3273668320034505, + "grad_norm": 0.7874012589454651, + "learning_rate": 9.840553541467584e-06, + "loss": 0.8265, + "step": 6072 + }, + { + "epoch": 0.32742074617209405, + "grad_norm": 0.7697011828422546, + "learning_rate": 9.840500424736896e-06, + "loss": 0.7347, + "step": 6073 + }, + { + "epoch": 0.3274746603407375, + "grad_norm": 0.6917446851730347, + "learning_rate": 9.840447299303642e-06, + "loss": 0.7885, + "step": 6074 + }, + { + "epoch": 0.32752857450938105, + "grad_norm": 1.3732929229736328, + "learning_rate": 9.840394165167915e-06, + "loss": 0.8814, + "step": 6075 + }, + { + "epoch": 0.3275824886780246, + "grad_norm": 0.7249336242675781, + "learning_rate": 9.840341022329813e-06, + "loss": 0.7601, + "step": 6076 + }, + { + "epoch": 0.3276364028466681, + "grad_norm": 0.7332364320755005, + "learning_rate": 9.84028787078943e-06, + "loss": 0.7835, + "step": 6077 + }, + { + "epoch": 0.32769031701531165, + "grad_norm": 0.7836764454841614, + "learning_rate": 9.840234710546863e-06, + "loss": 0.7614, + "step": 6078 + }, + { + "epoch": 0.3277442311839551, + "grad_norm": 0.8428083658218384, + "learning_rate": 9.840181541602207e-06, + "loss": 0.8581, + "step": 6079 + }, + { + "epoch": 0.32779814535259866, + "grad_norm": 0.7642959952354431, + "learning_rate": 9.840128363955557e-06, + "loss": 0.7763, + "step": 6080 + }, + { + "epoch": 0.3278520595212422, + "grad_norm": 0.8178489804267883, + "learning_rate": 9.840075177607008e-06, + "loss": 0.7877, + "step": 6081 + }, + { + "epoch": 
0.3279059736898857, + "grad_norm": 0.7757980823516846, + "learning_rate": 9.840021982556658e-06, + "loss": 0.7862, + "step": 6082 + }, + { + "epoch": 0.3279598878585292, + "grad_norm": 0.7946853637695312, + "learning_rate": 9.8399687788046e-06, + "loss": 0.7318, + "step": 6083 + }, + { + "epoch": 0.32801380202717273, + "grad_norm": 0.7144894003868103, + "learning_rate": 9.839915566350931e-06, + "loss": 0.7206, + "step": 6084 + }, + { + "epoch": 0.32806771619581626, + "grad_norm": 0.7347074151039124, + "learning_rate": 9.839862345195748e-06, + "loss": 0.7521, + "step": 6085 + }, + { + "epoch": 0.3281216303644598, + "grad_norm": 0.7249374985694885, + "learning_rate": 9.839809115339145e-06, + "loss": 0.8403, + "step": 6086 + }, + { + "epoch": 0.3281755445331033, + "grad_norm": 0.7791916131973267, + "learning_rate": 9.839755876781216e-06, + "loss": 0.8198, + "step": 6087 + }, + { + "epoch": 0.3282294587017468, + "grad_norm": 0.9882380366325378, + "learning_rate": 9.839702629522061e-06, + "loss": 0.7433, + "step": 6088 + }, + { + "epoch": 0.32828337287039033, + "grad_norm": 0.8705452680587769, + "learning_rate": 9.839649373561773e-06, + "loss": 0.7746, + "step": 6089 + }, + { + "epoch": 0.32833728703903386, + "grad_norm": 0.7160933017730713, + "learning_rate": 9.839596108900446e-06, + "loss": 0.7119, + "step": 6090 + }, + { + "epoch": 0.3283912012076774, + "grad_norm": 0.773858368396759, + "learning_rate": 9.839542835538181e-06, + "loss": 0.8502, + "step": 6091 + }, + { + "epoch": 0.3284451153763209, + "grad_norm": 0.7210254669189453, + "learning_rate": 9.83948955347507e-06, + "loss": 0.7956, + "step": 6092 + }, + { + "epoch": 0.3284990295449644, + "grad_norm": 0.7189142107963562, + "learning_rate": 9.839436262711207e-06, + "loss": 0.7212, + "step": 6093 + }, + { + "epoch": 0.32855294371360794, + "grad_norm": 0.8489739298820496, + "learning_rate": 9.839382963246693e-06, + "loss": 0.7717, + "step": 6094 + }, + { + "epoch": 0.32860685788225147, + "grad_norm": 
0.8385297060012817, + "learning_rate": 9.83932965508162e-06, + "loss": 0.7303, + "step": 6095 + }, + { + "epoch": 0.328660772050895, + "grad_norm": 0.7927126884460449, + "learning_rate": 9.839276338216084e-06, + "loss": 0.7268, + "step": 6096 + }, + { + "epoch": 0.3287146862195385, + "grad_norm": 0.7500227689743042, + "learning_rate": 9.839223012650183e-06, + "loss": 0.7703, + "step": 6097 + }, + { + "epoch": 0.328768600388182, + "grad_norm": 0.8327365517616272, + "learning_rate": 9.839169678384013e-06, + "loss": 0.7643, + "step": 6098 + }, + { + "epoch": 0.32882251455682554, + "grad_norm": 0.8245545625686646, + "learning_rate": 9.839116335417667e-06, + "loss": 0.8976, + "step": 6099 + }, + { + "epoch": 0.32887642872546907, + "grad_norm": 0.7789725661277771, + "learning_rate": 9.839062983751242e-06, + "loss": 0.8116, + "step": 6100 + }, + { + "epoch": 0.32893034289411255, + "grad_norm": 0.7301298975944519, + "learning_rate": 9.839009623384834e-06, + "loss": 0.8022, + "step": 6101 + }, + { + "epoch": 0.3289842570627561, + "grad_norm": 0.6957124471664429, + "learning_rate": 9.838956254318542e-06, + "loss": 0.7127, + "step": 6102 + }, + { + "epoch": 0.3290381712313996, + "grad_norm": 0.7631757259368896, + "learning_rate": 9.838902876552456e-06, + "loss": 0.6973, + "step": 6103 + }, + { + "epoch": 0.32909208540004314, + "grad_norm": 0.7194784283638, + "learning_rate": 9.838849490086679e-06, + "loss": 0.8004, + "step": 6104 + }, + { + "epoch": 0.3291459995686867, + "grad_norm": 0.7119805812835693, + "learning_rate": 9.838796094921301e-06, + "loss": 0.784, + "step": 6105 + }, + { + "epoch": 0.32919991373733015, + "grad_norm": 0.6293159127235413, + "learning_rate": 9.83874269105642e-06, + "loss": 0.7562, + "step": 6106 + }, + { + "epoch": 0.3292538279059737, + "grad_norm": 0.7770401835441589, + "learning_rate": 9.838689278492134e-06, + "loss": 0.783, + "step": 6107 + }, + { + "epoch": 0.3293077420746172, + "grad_norm": 0.7125090956687927, + "learning_rate": 
9.838635857228534e-06, + "loss": 0.7803, + "step": 6108 + }, + { + "epoch": 0.32936165624326075, + "grad_norm": 0.9537239670753479, + "learning_rate": 9.838582427265721e-06, + "loss": 0.8406, + "step": 6109 + }, + { + "epoch": 0.3294155704119042, + "grad_norm": 0.7614902853965759, + "learning_rate": 9.83852898860379e-06, + "loss": 0.7233, + "step": 6110 + }, + { + "epoch": 0.32946948458054776, + "grad_norm": 0.8114527463912964, + "learning_rate": 9.838475541242836e-06, + "loss": 0.7547, + "step": 6111 + }, + { + "epoch": 0.3295233987491913, + "grad_norm": 0.7188670635223389, + "learning_rate": 9.838422085182955e-06, + "loss": 0.7121, + "step": 6112 + }, + { + "epoch": 0.3295773129178348, + "grad_norm": 0.7254597544670105, + "learning_rate": 9.838368620424244e-06, + "loss": 0.8287, + "step": 6113 + }, + { + "epoch": 0.32963122708647835, + "grad_norm": 0.7403092384338379, + "learning_rate": 9.8383151469668e-06, + "loss": 0.7268, + "step": 6114 + }, + { + "epoch": 0.3296851412551218, + "grad_norm": 1.0679517984390259, + "learning_rate": 9.838261664810716e-06, + "loss": 0.8475, + "step": 6115 + }, + { + "epoch": 0.32973905542376536, + "grad_norm": 0.9359924793243408, + "learning_rate": 9.838208173956092e-06, + "loss": 0.7505, + "step": 6116 + }, + { + "epoch": 0.3297929695924089, + "grad_norm": 0.7238296270370483, + "learning_rate": 9.83815467440302e-06, + "loss": 0.741, + "step": 6117 + }, + { + "epoch": 0.3298468837610524, + "grad_norm": 0.8017160892486572, + "learning_rate": 9.838101166151602e-06, + "loss": 0.6872, + "step": 6118 + }, + { + "epoch": 0.3299007979296959, + "grad_norm": 0.8161622285842896, + "learning_rate": 9.838047649201928e-06, + "loss": 0.7786, + "step": 6119 + }, + { + "epoch": 0.32995471209833943, + "grad_norm": 0.8322761654853821, + "learning_rate": 9.837994123554096e-06, + "loss": 0.8856, + "step": 6120 + }, + { + "epoch": 0.33000862626698296, + "grad_norm": 0.6873301863670349, + "learning_rate": 9.837940589208204e-06, + "loss": 0.6405, + 
"step": 6121 + }, + { + "epoch": 0.3300625404356265, + "grad_norm": 0.9589494466781616, + "learning_rate": 9.837887046164347e-06, + "loss": 0.866, + "step": 6122 + }, + { + "epoch": 0.33011645460427, + "grad_norm": 0.8298083543777466, + "learning_rate": 9.837833494422623e-06, + "loss": 0.7266, + "step": 6123 + }, + { + "epoch": 0.3301703687729135, + "grad_norm": 0.8999565243721008, + "learning_rate": 9.837779933983124e-06, + "loss": 0.7482, + "step": 6124 + }, + { + "epoch": 0.33022428294155703, + "grad_norm": 0.7154240012168884, + "learning_rate": 9.837726364845952e-06, + "loss": 0.7786, + "step": 6125 + }, + { + "epoch": 0.33027819711020057, + "grad_norm": 0.8142261505126953, + "learning_rate": 9.8376727870112e-06, + "loss": 0.795, + "step": 6126 + }, + { + "epoch": 0.3303321112788441, + "grad_norm": 0.8122156262397766, + "learning_rate": 9.837619200478966e-06, + "loss": 0.7307, + "step": 6127 + }, + { + "epoch": 0.3303860254474876, + "grad_norm": 0.6999771595001221, + "learning_rate": 9.837565605249342e-06, + "loss": 0.7447, + "step": 6128 + }, + { + "epoch": 0.3304399396161311, + "grad_norm": 0.8703051209449768, + "learning_rate": 9.837512001322428e-06, + "loss": 0.7271, + "step": 6129 + }, + { + "epoch": 0.33049385378477464, + "grad_norm": 0.7183845043182373, + "learning_rate": 9.837458388698322e-06, + "loss": 0.7394, + "step": 6130 + }, + { + "epoch": 0.33054776795341817, + "grad_norm": 0.7363807559013367, + "learning_rate": 9.837404767377116e-06, + "loss": 0.7132, + "step": 6131 + }, + { + "epoch": 0.3306016821220617, + "grad_norm": 0.8021197319030762, + "learning_rate": 9.837351137358911e-06, + "loss": 0.8376, + "step": 6132 + }, + { + "epoch": 0.3306555962907052, + "grad_norm": 0.8289563059806824, + "learning_rate": 9.837297498643798e-06, + "loss": 0.8226, + "step": 6133 + }, + { + "epoch": 0.3307095104593487, + "grad_norm": 0.6811582446098328, + "learning_rate": 9.83724385123188e-06, + "loss": 0.6755, + "step": 6134 + }, + { + "epoch": 
0.33076342462799224, + "grad_norm": 0.8314020037651062, + "learning_rate": 9.837190195123247e-06, + "loss": 0.8538, + "step": 6135 + }, + { + "epoch": 0.3308173387966358, + "grad_norm": 0.6725255846977234, + "learning_rate": 9.837136530318e-06, + "loss": 0.7735, + "step": 6136 + }, + { + "epoch": 0.33087125296527925, + "grad_norm": 0.685630738735199, + "learning_rate": 9.837082856816234e-06, + "loss": 0.6806, + "step": 6137 + }, + { + "epoch": 0.3309251671339228, + "grad_norm": 1.1149775981903076, + "learning_rate": 9.837029174618045e-06, + "loss": 0.7402, + "step": 6138 + }, + { + "epoch": 0.3309790813025663, + "grad_norm": 0.7758622169494629, + "learning_rate": 9.83697548372353e-06, + "loss": 0.8006, + "step": 6139 + }, + { + "epoch": 0.33103299547120985, + "grad_norm": 0.6982478499412537, + "learning_rate": 9.836921784132785e-06, + "loss": 0.7239, + "step": 6140 + }, + { + "epoch": 0.3310869096398534, + "grad_norm": 1.1207581758499146, + "learning_rate": 9.836868075845907e-06, + "loss": 0.7628, + "step": 6141 + }, + { + "epoch": 0.33114082380849685, + "grad_norm": 0.8356226086616516, + "learning_rate": 9.836814358862993e-06, + "loss": 0.7945, + "step": 6142 + }, + { + "epoch": 0.3311947379771404, + "grad_norm": 0.7442872524261475, + "learning_rate": 9.83676063318414e-06, + "loss": 0.7435, + "step": 6143 + }, + { + "epoch": 0.3312486521457839, + "grad_norm": 0.7981832027435303, + "learning_rate": 9.836706898809442e-06, + "loss": 0.7811, + "step": 6144 + }, + { + "epoch": 0.33130256631442745, + "grad_norm": 0.740649938583374, + "learning_rate": 9.836653155738998e-06, + "loss": 0.7763, + "step": 6145 + }, + { + "epoch": 0.3313564804830709, + "grad_norm": 0.8768689632415771, + "learning_rate": 9.836599403972903e-06, + "loss": 0.6948, + "step": 6146 + }, + { + "epoch": 0.33141039465171446, + "grad_norm": 0.8029646873474121, + "learning_rate": 9.836545643511257e-06, + "loss": 0.8064, + "step": 6147 + }, + { + "epoch": 0.331464308820358, + "grad_norm": 
0.857420027256012, + "learning_rate": 9.836491874354151e-06, + "loss": 0.7761, + "step": 6148 + }, + { + "epoch": 0.3315182229890015, + "grad_norm": 0.7678744792938232, + "learning_rate": 9.836438096501688e-06, + "loss": 0.856, + "step": 6149 + }, + { + "epoch": 0.33157213715764505, + "grad_norm": 0.694432258605957, + "learning_rate": 9.836384309953959e-06, + "loss": 0.714, + "step": 6150 + }, + { + "epoch": 0.33162605132628853, + "grad_norm": 0.776814877986908, + "learning_rate": 9.836330514711066e-06, + "loss": 0.7933, + "step": 6151 + }, + { + "epoch": 0.33167996549493206, + "grad_norm": 0.9273865818977356, + "learning_rate": 9.8362767107731e-06, + "loss": 0.7308, + "step": 6152 + }, + { + "epoch": 0.3317338796635756, + "grad_norm": 0.7457488179206848, + "learning_rate": 9.836222898140163e-06, + "loss": 0.7234, + "step": 6153 + }, + { + "epoch": 0.3317877938322191, + "grad_norm": 0.657256007194519, + "learning_rate": 9.83616907681235e-06, + "loss": 0.7182, + "step": 6154 + }, + { + "epoch": 0.3318417080008626, + "grad_norm": 0.7859936356544495, + "learning_rate": 9.836115246789754e-06, + "loss": 0.6805, + "step": 6155 + }, + { + "epoch": 0.33189562216950613, + "grad_norm": 0.6827487349510193, + "learning_rate": 9.836061408072478e-06, + "loss": 0.7048, + "step": 6156 + }, + { + "epoch": 0.33194953633814966, + "grad_norm": 0.7208519577980042, + "learning_rate": 9.836007560660616e-06, + "loss": 0.7636, + "step": 6157 + }, + { + "epoch": 0.3320034505067932, + "grad_norm": 0.8533946871757507, + "learning_rate": 9.835953704554262e-06, + "loss": 0.6757, + "step": 6158 + }, + { + "epoch": 0.33205736467543673, + "grad_norm": 0.7826882004737854, + "learning_rate": 9.835899839753518e-06, + "loss": 0.7241, + "step": 6159 + }, + { + "epoch": 0.3321112788440802, + "grad_norm": 0.7297992706298828, + "learning_rate": 9.835845966258477e-06, + "loss": 0.8083, + "step": 6160 + }, + { + "epoch": 0.33216519301272374, + "grad_norm": 0.8451754450798035, + "learning_rate": 
9.835792084069238e-06, + "loss": 0.7721, + "step": 6161 + }, + { + "epoch": 0.33221910718136727, + "grad_norm": 0.8423025608062744, + "learning_rate": 9.835738193185895e-06, + "loss": 0.8068, + "step": 6162 + }, + { + "epoch": 0.3322730213500108, + "grad_norm": 0.756104052066803, + "learning_rate": 9.83568429360855e-06, + "loss": 0.7249, + "step": 6163 + }, + { + "epoch": 0.3323269355186543, + "grad_norm": 0.7540079355239868, + "learning_rate": 9.835630385337295e-06, + "loss": 0.7699, + "step": 6164 + }, + { + "epoch": 0.3323808496872978, + "grad_norm": 0.7744835019111633, + "learning_rate": 9.835576468372229e-06, + "loss": 0.7329, + "step": 6165 + }, + { + "epoch": 0.33243476385594134, + "grad_norm": 0.7503830194473267, + "learning_rate": 9.83552254271345e-06, + "loss": 0.7517, + "step": 6166 + }, + { + "epoch": 0.33248867802458487, + "grad_norm": 0.7132351398468018, + "learning_rate": 9.83546860836105e-06, + "loss": 0.7224, + "step": 6167 + }, + { + "epoch": 0.3325425921932284, + "grad_norm": 0.7316455245018005, + "learning_rate": 9.835414665315133e-06, + "loss": 0.6782, + "step": 6168 + }, + { + "epoch": 0.3325965063618719, + "grad_norm": 0.7445632815361023, + "learning_rate": 9.835360713575793e-06, + "loss": 0.8118, + "step": 6169 + }, + { + "epoch": 0.3326504205305154, + "grad_norm": 0.7687620520591736, + "learning_rate": 9.835306753143123e-06, + "loss": 0.694, + "step": 6170 + }, + { + "epoch": 0.33270433469915894, + "grad_norm": 0.689497172832489, + "learning_rate": 9.835252784017226e-06, + "loss": 0.7345, + "step": 6171 + }, + { + "epoch": 0.3327582488678025, + "grad_norm": 0.8443883061408997, + "learning_rate": 9.835198806198197e-06, + "loss": 0.7457, + "step": 6172 + }, + { + "epoch": 0.33281216303644595, + "grad_norm": 0.7037844061851501, + "learning_rate": 9.835144819686132e-06, + "loss": 0.7364, + "step": 6173 + }, + { + "epoch": 0.3328660772050895, + "grad_norm": 0.7182630896568298, + "learning_rate": 9.835090824481128e-06, + "loss": 0.7661, + "step": 
6174 + }, + { + "epoch": 0.332919991373733, + "grad_norm": 1.086168885231018, + "learning_rate": 9.835036820583283e-06, + "loss": 0.7479, + "step": 6175 + }, + { + "epoch": 0.33297390554237655, + "grad_norm": 0.9191387295722961, + "learning_rate": 9.834982807992696e-06, + "loss": 0.7792, + "step": 6176 + }, + { + "epoch": 0.3330278197110201, + "grad_norm": 0.7431225776672363, + "learning_rate": 9.83492878670946e-06, + "loss": 0.7399, + "step": 6177 + }, + { + "epoch": 0.33308173387966356, + "grad_norm": 0.7455822229385376, + "learning_rate": 9.834874756733674e-06, + "loss": 0.8112, + "step": 6178 + }, + { + "epoch": 0.3331356480483071, + "grad_norm": 0.7717078328132629, + "learning_rate": 9.834820718065436e-06, + "loss": 0.7559, + "step": 6179 + }, + { + "epoch": 0.3331895622169506, + "grad_norm": 0.7372557520866394, + "learning_rate": 9.834766670704843e-06, + "loss": 0.7576, + "step": 6180 + }, + { + "epoch": 0.33324347638559415, + "grad_norm": 0.7553979754447937, + "learning_rate": 9.834712614651991e-06, + "loss": 0.7818, + "step": 6181 + }, + { + "epoch": 0.3332973905542376, + "grad_norm": 0.7095852494239807, + "learning_rate": 9.834658549906977e-06, + "loss": 0.7546, + "step": 6182 + }, + { + "epoch": 0.33335130472288116, + "grad_norm": 0.7873508930206299, + "learning_rate": 9.8346044764699e-06, + "loss": 0.854, + "step": 6183 + }, + { + "epoch": 0.3334052188915247, + "grad_norm": 0.7784111499786377, + "learning_rate": 9.834550394340856e-06, + "loss": 0.8939, + "step": 6184 + }, + { + "epoch": 0.3334591330601682, + "grad_norm": 0.7377014756202698, + "learning_rate": 9.834496303519943e-06, + "loss": 0.7838, + "step": 6185 + }, + { + "epoch": 0.33351304722881175, + "grad_norm": 0.6974765658378601, + "learning_rate": 9.834442204007255e-06, + "loss": 0.7851, + "step": 6186 + }, + { + "epoch": 0.33356696139745523, + "grad_norm": 0.8516370058059692, + "learning_rate": 9.834388095802895e-06, + "loss": 0.7406, + "step": 6187 + }, + { + "epoch": 0.33362087556609876, + 
"grad_norm": 0.8454179167747498, + "learning_rate": 9.834333978906957e-06, + "loss": 0.8128, + "step": 6188 + }, + { + "epoch": 0.3336747897347423, + "grad_norm": 0.9424523711204529, + "learning_rate": 9.834279853319537e-06, + "loss": 0.8216, + "step": 6189 + }, + { + "epoch": 0.3337287039033858, + "grad_norm": 0.7633955478668213, + "learning_rate": 9.834225719040734e-06, + "loss": 0.8709, + "step": 6190 + }, + { + "epoch": 0.3337826180720293, + "grad_norm": 0.8300744891166687, + "learning_rate": 9.834171576070645e-06, + "loss": 0.8924, + "step": 6191 + }, + { + "epoch": 0.33383653224067283, + "grad_norm": 0.7658351063728333, + "learning_rate": 9.834117424409368e-06, + "loss": 0.7132, + "step": 6192 + }, + { + "epoch": 0.33389044640931637, + "grad_norm": 0.7253869771957397, + "learning_rate": 9.834063264057e-06, + "loss": 0.741, + "step": 6193 + }, + { + "epoch": 0.3339443605779599, + "grad_norm": 0.7782844305038452, + "learning_rate": 9.83400909501364e-06, + "loss": 0.7408, + "step": 6194 + }, + { + "epoch": 0.33399827474660343, + "grad_norm": 0.7073631882667542, + "learning_rate": 9.83395491727938e-06, + "loss": 0.7343, + "step": 6195 + }, + { + "epoch": 0.3340521889152469, + "grad_norm": 0.8233753442764282, + "learning_rate": 9.833900730854324e-06, + "loss": 0.8408, + "step": 6196 + }, + { + "epoch": 0.33410610308389044, + "grad_norm": 0.813093364238739, + "learning_rate": 9.833846535738564e-06, + "loss": 0.8106, + "step": 6197 + }, + { + "epoch": 0.33416001725253397, + "grad_norm": 0.7061467170715332, + "learning_rate": 9.833792331932202e-06, + "loss": 0.6583, + "step": 6198 + }, + { + "epoch": 0.3342139314211775, + "grad_norm": 0.7652360200881958, + "learning_rate": 9.833738119435333e-06, + "loss": 0.8283, + "step": 6199 + }, + { + "epoch": 0.334267845589821, + "grad_norm": 0.7208247780799866, + "learning_rate": 9.833683898248053e-06, + "loss": 0.7678, + "step": 6200 + }, + { + "epoch": 0.3343217597584645, + "grad_norm": 0.8003979921340942, + "learning_rate": 
9.833629668370462e-06, + "loss": 0.8435, + "step": 6201 + }, + { + "epoch": 0.33437567392710804, + "grad_norm": 0.7712222933769226, + "learning_rate": 9.833575429802658e-06, + "loss": 0.7659, + "step": 6202 + }, + { + "epoch": 0.3344295880957516, + "grad_norm": 0.7202783226966858, + "learning_rate": 9.833521182544737e-06, + "loss": 0.7468, + "step": 6203 + }, + { + "epoch": 0.3344835022643951, + "grad_norm": 0.675953209400177, + "learning_rate": 9.833466926596795e-06, + "loss": 0.7468, + "step": 6204 + }, + { + "epoch": 0.3345374164330386, + "grad_norm": 0.800348162651062, + "learning_rate": 9.833412661958933e-06, + "loss": 0.802, + "step": 6205 + }, + { + "epoch": 0.3345913306016821, + "grad_norm": 0.7532010674476624, + "learning_rate": 9.833358388631247e-06, + "loss": 0.7514, + "step": 6206 + }, + { + "epoch": 0.33464524477032564, + "grad_norm": 0.686203122138977, + "learning_rate": 9.833304106613835e-06, + "loss": 0.6874, + "step": 6207 + }, + { + "epoch": 0.3346991589389692, + "grad_norm": 0.8788251280784607, + "learning_rate": 9.833249815906792e-06, + "loss": 0.8594, + "step": 6208 + }, + { + "epoch": 0.33475307310761265, + "grad_norm": 0.7813701629638672, + "learning_rate": 9.83319551651022e-06, + "loss": 0.7494, + "step": 6209 + }, + { + "epoch": 0.3348069872762562, + "grad_norm": 0.7381582260131836, + "learning_rate": 9.833141208424213e-06, + "loss": 0.7547, + "step": 6210 + }, + { + "epoch": 0.3348609014448997, + "grad_norm": 0.8589098453521729, + "learning_rate": 9.833086891648869e-06, + "loss": 0.7283, + "step": 6211 + }, + { + "epoch": 0.33491481561354325, + "grad_norm": 0.7888010740280151, + "learning_rate": 9.83303256618429e-06, + "loss": 0.7956, + "step": 6212 + }, + { + "epoch": 0.3349687297821868, + "grad_norm": 0.7753561735153198, + "learning_rate": 9.832978232030566e-06, + "loss": 0.8327, + "step": 6213 + }, + { + "epoch": 0.33502264395083026, + "grad_norm": 1.0800096988677979, + "learning_rate": 9.832923889187802e-06, + "loss": 0.8246, + "step": 
6214 + }, + { + "epoch": 0.3350765581194738, + "grad_norm": 0.7646483778953552, + "learning_rate": 9.832869537656092e-06, + "loss": 0.8405, + "step": 6215 + }, + { + "epoch": 0.3351304722881173, + "grad_norm": 1.0635271072387695, + "learning_rate": 9.832815177435533e-06, + "loss": 0.7119, + "step": 6216 + }, + { + "epoch": 0.33518438645676085, + "grad_norm": 0.7924190759658813, + "learning_rate": 9.832760808526225e-06, + "loss": 0.6712, + "step": 6217 + }, + { + "epoch": 0.3352383006254044, + "grad_norm": 0.8422202467918396, + "learning_rate": 9.832706430928266e-06, + "loss": 0.6349, + "step": 6218 + }, + { + "epoch": 0.33529221479404786, + "grad_norm": 0.7442725896835327, + "learning_rate": 9.83265204464175e-06, + "loss": 0.7698, + "step": 6219 + }, + { + "epoch": 0.3353461289626914, + "grad_norm": 0.9910275340080261, + "learning_rate": 9.83259764966678e-06, + "loss": 0.8084, + "step": 6220 + }, + { + "epoch": 0.3354000431313349, + "grad_norm": 0.7336423397064209, + "learning_rate": 9.832543246003449e-06, + "loss": 0.8095, + "step": 6221 + }, + { + "epoch": 0.33545395729997846, + "grad_norm": 0.7493695020675659, + "learning_rate": 9.832488833651858e-06, + "loss": 0.7298, + "step": 6222 + }, + { + "epoch": 0.33550787146862193, + "grad_norm": 0.6641638875007629, + "learning_rate": 9.832434412612106e-06, + "loss": 0.659, + "step": 6223 + }, + { + "epoch": 0.33556178563726546, + "grad_norm": 0.7921575903892517, + "learning_rate": 9.832379982884286e-06, + "loss": 0.8049, + "step": 6224 + }, + { + "epoch": 0.335615699805909, + "grad_norm": 0.7988188862800598, + "learning_rate": 9.8323255444685e-06, + "loss": 0.7573, + "step": 6225 + }, + { + "epoch": 0.3356696139745525, + "grad_norm": 0.7328951954841614, + "learning_rate": 9.832271097364843e-06, + "loss": 0.7031, + "step": 6226 + }, + { + "epoch": 0.33572352814319606, + "grad_norm": 0.7667033672332764, + "learning_rate": 9.832216641573416e-06, + "loss": 0.7803, + "step": 6227 + }, + { + "epoch": 0.33577744231183954, + 
"grad_norm": 0.6491816639900208, + "learning_rate": 9.832162177094313e-06, + "loss": 0.7226, + "step": 6228 + }, + { + "epoch": 0.33583135648048307, + "grad_norm": 0.7009122967720032, + "learning_rate": 9.832107703927637e-06, + "loss": 0.7414, + "step": 6229 + }, + { + "epoch": 0.3358852706491266, + "grad_norm": 0.8263521194458008, + "learning_rate": 9.83205322207348e-06, + "loss": 0.7617, + "step": 6230 + }, + { + "epoch": 0.33593918481777013, + "grad_norm": 0.7571332454681396, + "learning_rate": 9.831998731531944e-06, + "loss": 0.7414, + "step": 6231 + }, + { + "epoch": 0.3359930989864136, + "grad_norm": 0.722075343132019, + "learning_rate": 9.831944232303126e-06, + "loss": 0.6649, + "step": 6232 + }, + { + "epoch": 0.33604701315505714, + "grad_norm": 0.8078354597091675, + "learning_rate": 9.831889724387125e-06, + "loss": 0.7423, + "step": 6233 + }, + { + "epoch": 0.33610092732370067, + "grad_norm": 0.7287851572036743, + "learning_rate": 9.831835207784037e-06, + "loss": 0.7391, + "step": 6234 + }, + { + "epoch": 0.3361548414923442, + "grad_norm": 0.8404607176780701, + "learning_rate": 9.831780682493961e-06, + "loss": 0.8091, + "step": 6235 + }, + { + "epoch": 0.33620875566098773, + "grad_norm": 0.8004715442657471, + "learning_rate": 9.831726148516996e-06, + "loss": 0.7325, + "step": 6236 + }, + { + "epoch": 0.3362626698296312, + "grad_norm": 0.7508640289306641, + "learning_rate": 9.831671605853238e-06, + "loss": 0.7273, + "step": 6237 + }, + { + "epoch": 0.33631658399827474, + "grad_norm": 0.6869575381278992, + "learning_rate": 9.831617054502786e-06, + "loss": 0.7295, + "step": 6238 + }, + { + "epoch": 0.3363704981669183, + "grad_norm": 0.7840809226036072, + "learning_rate": 9.831562494465739e-06, + "loss": 0.7622, + "step": 6239 + }, + { + "epoch": 0.3364244123355618, + "grad_norm": 0.7323046922683716, + "learning_rate": 9.831507925742194e-06, + "loss": 0.782, + "step": 6240 + }, + { + "epoch": 0.3364783265042053, + "grad_norm": 0.7710634469985962, + 
"learning_rate": 9.831453348332249e-06, + "loss": 0.8266, + "step": 6241 + }, + { + "epoch": 0.3365322406728488, + "grad_norm": 0.7900035977363586, + "learning_rate": 9.831398762236001e-06, + "loss": 0.8101, + "step": 6242 + }, + { + "epoch": 0.33658615484149235, + "grad_norm": 0.6933030486106873, + "learning_rate": 9.831344167453552e-06, + "loss": 0.7993, + "step": 6243 + }, + { + "epoch": 0.3366400690101359, + "grad_norm": 0.7265595197677612, + "learning_rate": 9.831289563984997e-06, + "loss": 0.795, + "step": 6244 + }, + { + "epoch": 0.3366939831787794, + "grad_norm": 0.7805317640304565, + "learning_rate": 9.831234951830435e-06, + "loss": 0.7251, + "step": 6245 + }, + { + "epoch": 0.3367478973474229, + "grad_norm": 0.7463899850845337, + "learning_rate": 9.831180330989964e-06, + "loss": 0.7757, + "step": 6246 + }, + { + "epoch": 0.3368018115160664, + "grad_norm": 0.785297155380249, + "learning_rate": 9.831125701463684e-06, + "loss": 0.6769, + "step": 6247 + }, + { + "epoch": 0.33685572568470995, + "grad_norm": 0.6809384226799011, + "learning_rate": 9.83107106325169e-06, + "loss": 0.6836, + "step": 6248 + }, + { + "epoch": 0.3369096398533535, + "grad_norm": 0.7194486856460571, + "learning_rate": 9.831016416354082e-06, + "loss": 0.8119, + "step": 6249 + }, + { + "epoch": 0.33696355402199696, + "grad_norm": 0.9592674374580383, + "learning_rate": 9.830961760770959e-06, + "loss": 0.8448, + "step": 6250 + }, + { + "epoch": 0.3370174681906405, + "grad_norm": 0.7490488886833191, + "learning_rate": 9.830907096502416e-06, + "loss": 0.8008, + "step": 6251 + }, + { + "epoch": 0.337071382359284, + "grad_norm": 0.8315609693527222, + "learning_rate": 9.830852423548556e-06, + "loss": 0.733, + "step": 6252 + }, + { + "epoch": 0.33712529652792755, + "grad_norm": 0.8040405511856079, + "learning_rate": 9.830797741909473e-06, + "loss": 0.7952, + "step": 6253 + }, + { + "epoch": 0.3371792106965711, + "grad_norm": 0.8686261177062988, + "learning_rate": 9.830743051585267e-06, + "loss": 
0.8077, + "step": 6254 + }, + { + "epoch": 0.33723312486521456, + "grad_norm": 0.7632514238357544, + "learning_rate": 9.830688352576037e-06, + "loss": 0.772, + "step": 6255 + }, + { + "epoch": 0.3372870390338581, + "grad_norm": 0.7437588572502136, + "learning_rate": 9.830633644881882e-06, + "loss": 0.7873, + "step": 6256 + }, + { + "epoch": 0.3373409532025016, + "grad_norm": 0.8499639630317688, + "learning_rate": 9.830578928502899e-06, + "loss": 0.801, + "step": 6257 + }, + { + "epoch": 0.33739486737114516, + "grad_norm": 0.6750261783599854, + "learning_rate": 9.830524203439185e-06, + "loss": 0.7145, + "step": 6258 + }, + { + "epoch": 0.33744878153978863, + "grad_norm": 0.8233493566513062, + "learning_rate": 9.830469469690842e-06, + "loss": 0.8149, + "step": 6259 + }, + { + "epoch": 0.33750269570843217, + "grad_norm": 0.7706056833267212, + "learning_rate": 9.830414727257965e-06, + "loss": 0.8053, + "step": 6260 + }, + { + "epoch": 0.3375566098770757, + "grad_norm": 0.8783419132232666, + "learning_rate": 9.830359976140656e-06, + "loss": 0.8259, + "step": 6261 + }, + { + "epoch": 0.33761052404571923, + "grad_norm": 0.780724823474884, + "learning_rate": 9.83030521633901e-06, + "loss": 0.8358, + "step": 6262 + }, + { + "epoch": 0.33766443821436276, + "grad_norm": 0.7131387591362, + "learning_rate": 9.830250447853124e-06, + "loss": 0.8177, + "step": 6263 + }, + { + "epoch": 0.33771835238300624, + "grad_norm": 0.7248073220252991, + "learning_rate": 9.8301956706831e-06, + "loss": 0.7043, + "step": 6264 + }, + { + "epoch": 0.33777226655164977, + "grad_norm": 0.7510526776313782, + "learning_rate": 9.830140884829038e-06, + "loss": 0.8179, + "step": 6265 + }, + { + "epoch": 0.3378261807202933, + "grad_norm": 0.8340782523155212, + "learning_rate": 9.830086090291033e-06, + "loss": 0.8432, + "step": 6266 + }, + { + "epoch": 0.33788009488893683, + "grad_norm": 0.6900983452796936, + "learning_rate": 9.830031287069187e-06, + "loss": 0.7096, + "step": 6267 + }, + { + "epoch": 
0.3379340090575803, + "grad_norm": 0.8552762269973755, + "learning_rate": 9.829976475163591e-06, + "loss": 0.8018, + "step": 6268 + }, + { + "epoch": 0.33798792322622384, + "grad_norm": 0.7504406571388245, + "learning_rate": 9.829921654574352e-06, + "loss": 0.7504, + "step": 6269 + }, + { + "epoch": 0.3380418373948674, + "grad_norm": 0.6848222017288208, + "learning_rate": 9.829866825301566e-06, + "loss": 0.7203, + "step": 6270 + }, + { + "epoch": 0.3380957515635109, + "grad_norm": 0.7247674465179443, + "learning_rate": 9.829811987345331e-06, + "loss": 0.7348, + "step": 6271 + }, + { + "epoch": 0.33814966573215444, + "grad_norm": 0.7426888942718506, + "learning_rate": 9.829757140705743e-06, + "loss": 0.7907, + "step": 6272 + }, + { + "epoch": 0.3382035799007979, + "grad_norm": 0.7520292401313782, + "learning_rate": 9.829702285382905e-06, + "loss": 0.8663, + "step": 6273 + }, + { + "epoch": 0.33825749406944144, + "grad_norm": 0.7866230010986328, + "learning_rate": 9.829647421376913e-06, + "loss": 0.7555, + "step": 6274 + }, + { + "epoch": 0.338311408238085, + "grad_norm": 0.7156290411949158, + "learning_rate": 9.829592548687865e-06, + "loss": 0.8231, + "step": 6275 + }, + { + "epoch": 0.3383653224067285, + "grad_norm": 0.7172961235046387, + "learning_rate": 9.829537667315862e-06, + "loss": 0.8065, + "step": 6276 + }, + { + "epoch": 0.338419236575372, + "grad_norm": 0.7039121985435486, + "learning_rate": 9.829482777261002e-06, + "loss": 0.7558, + "step": 6277 + }, + { + "epoch": 0.3384731507440155, + "grad_norm": 0.8979704976081848, + "learning_rate": 9.829427878523382e-06, + "loss": 0.8858, + "step": 6278 + }, + { + "epoch": 0.33852706491265905, + "grad_norm": 0.7325716614723206, + "learning_rate": 9.829372971103106e-06, + "loss": 0.7626, + "step": 6279 + }, + { + "epoch": 0.3385809790813026, + "grad_norm": 0.694421112537384, + "learning_rate": 9.829318055000265e-06, + "loss": 0.7445, + "step": 6280 + }, + { + "epoch": 0.3386348932499461, + "grad_norm": 
0.7471193671226501, + "learning_rate": 9.829263130214962e-06, + "loss": 0.7206, + "step": 6281 + }, + { + "epoch": 0.3386888074185896, + "grad_norm": 0.6921202540397644, + "learning_rate": 9.829208196747296e-06, + "loss": 0.7267, + "step": 6282 + }, + { + "epoch": 0.3387427215872331, + "grad_norm": 0.758085310459137, + "learning_rate": 9.829153254597364e-06, + "loss": 0.8207, + "step": 6283 + }, + { + "epoch": 0.33879663575587665, + "grad_norm": 0.7711400389671326, + "learning_rate": 9.829098303765266e-06, + "loss": 0.832, + "step": 6284 + }, + { + "epoch": 0.3388505499245202, + "grad_norm": 0.7252094149589539, + "learning_rate": 9.829043344251101e-06, + "loss": 0.7601, + "step": 6285 + }, + { + "epoch": 0.33890446409316366, + "grad_norm": 0.9912717938423157, + "learning_rate": 9.828988376054969e-06, + "loss": 0.7719, + "step": 6286 + }, + { + "epoch": 0.3389583782618072, + "grad_norm": 0.7227590680122375, + "learning_rate": 9.828933399176964e-06, + "loss": 0.7138, + "step": 6287 + }, + { + "epoch": 0.3390122924304507, + "grad_norm": 0.7515584826469421, + "learning_rate": 9.82887841361719e-06, + "loss": 0.8169, + "step": 6288 + }, + { + "epoch": 0.33906620659909426, + "grad_norm": 0.788090169429779, + "learning_rate": 9.828823419375744e-06, + "loss": 0.8469, + "step": 6289 + }, + { + "epoch": 0.3391201207677378, + "grad_norm": 0.7734285593032837, + "learning_rate": 9.828768416452723e-06, + "loss": 0.8578, + "step": 6290 + }, + { + "epoch": 0.33917403493638126, + "grad_norm": 0.7342573404312134, + "learning_rate": 9.828713404848228e-06, + "loss": 0.7452, + "step": 6291 + }, + { + "epoch": 0.3392279491050248, + "grad_norm": 0.7694045305252075, + "learning_rate": 9.82865838456236e-06, + "loss": 0.7877, + "step": 6292 + }, + { + "epoch": 0.3392818632736683, + "grad_norm": 0.7270126342773438, + "learning_rate": 9.828603355595213e-06, + "loss": 0.768, + "step": 6293 + }, + { + "epoch": 0.33933577744231186, + "grad_norm": 0.7833864688873291, + "learning_rate": 
9.828548317946889e-06, + "loss": 0.7288, + "step": 6294 + }, + { + "epoch": 0.33938969161095534, + "grad_norm": 0.7528559565544128, + "learning_rate": 9.828493271617488e-06, + "loss": 0.8174, + "step": 6295 + }, + { + "epoch": 0.33944360577959887, + "grad_norm": 0.7197186350822449, + "learning_rate": 9.828438216607105e-06, + "loss": 0.6974, + "step": 6296 + }, + { + "epoch": 0.3394975199482424, + "grad_norm": 0.6839184165000916, + "learning_rate": 9.828383152915841e-06, + "loss": 0.6483, + "step": 6297 + }, + { + "epoch": 0.33955143411688593, + "grad_norm": 0.7436637878417969, + "learning_rate": 9.828328080543796e-06, + "loss": 0.6868, + "step": 6298 + }, + { + "epoch": 0.33960534828552946, + "grad_norm": 0.7180050611495972, + "learning_rate": 9.82827299949107e-06, + "loss": 0.8141, + "step": 6299 + }, + { + "epoch": 0.33965926245417294, + "grad_norm": 0.7652395367622375, + "learning_rate": 9.82821790975776e-06, + "loss": 0.6568, + "step": 6300 + }, + { + "epoch": 0.33971317662281647, + "grad_norm": 0.7141926884651184, + "learning_rate": 9.828162811343964e-06, + "loss": 0.694, + "step": 6301 + }, + { + "epoch": 0.33976709079146, + "grad_norm": 0.7092131972312927, + "learning_rate": 9.828107704249783e-06, + "loss": 0.6624, + "step": 6302 + }, + { + "epoch": 0.33982100496010353, + "grad_norm": 0.7080019116401672, + "learning_rate": 9.828052588475314e-06, + "loss": 0.7819, + "step": 6303 + }, + { + "epoch": 0.339874919128747, + "grad_norm": 0.803486168384552, + "learning_rate": 9.82799746402066e-06, + "loss": 0.8708, + "step": 6304 + }, + { + "epoch": 0.33992883329739054, + "grad_norm": 0.7407463788986206, + "learning_rate": 9.827942330885917e-06, + "loss": 0.71, + "step": 6305 + }, + { + "epoch": 0.3399827474660341, + "grad_norm": 0.8360207080841064, + "learning_rate": 9.827887189071183e-06, + "loss": 0.7488, + "step": 6306 + }, + { + "epoch": 0.3400366616346776, + "grad_norm": 0.7429782748222351, + "learning_rate": 9.82783203857656e-06, + "loss": 0.7893, + "step": 
6307 + }, + { + "epoch": 0.34009057580332114, + "grad_norm": 0.7807549238204956, + "learning_rate": 9.827776879402149e-06, + "loss": 0.844, + "step": 6308 + }, + { + "epoch": 0.3401444899719646, + "grad_norm": 0.7305359244346619, + "learning_rate": 9.827721711548043e-06, + "loss": 0.7695, + "step": 6309 + }, + { + "epoch": 0.34019840414060815, + "grad_norm": 0.7343246936798096, + "learning_rate": 9.827666535014346e-06, + "loss": 0.7405, + "step": 6310 + }, + { + "epoch": 0.3402523183092517, + "grad_norm": 0.8299920558929443, + "learning_rate": 9.827611349801155e-06, + "loss": 0.8252, + "step": 6311 + }, + { + "epoch": 0.3403062324778952, + "grad_norm": 0.7973533868789673, + "learning_rate": 9.827556155908569e-06, + "loss": 0.8182, + "step": 6312 + }, + { + "epoch": 0.3403601466465387, + "grad_norm": 0.7138605713844299, + "learning_rate": 9.82750095333669e-06, + "loss": 0.7727, + "step": 6313 + }, + { + "epoch": 0.3404140608151822, + "grad_norm": 0.7919808030128479, + "learning_rate": 9.827445742085616e-06, + "loss": 0.7954, + "step": 6314 + }, + { + "epoch": 0.34046797498382575, + "grad_norm": 0.7648377418518066, + "learning_rate": 9.827390522155442e-06, + "loss": 0.7858, + "step": 6315 + }, + { + "epoch": 0.3405218891524693, + "grad_norm": 0.8675017356872559, + "learning_rate": 9.827335293546274e-06, + "loss": 0.8283, + "step": 6316 + }, + { + "epoch": 0.3405758033211128, + "grad_norm": 1.1577383279800415, + "learning_rate": 9.827280056258207e-06, + "loss": 0.8209, + "step": 6317 + }, + { + "epoch": 0.3406297174897563, + "grad_norm": 0.8136908411979675, + "learning_rate": 9.827224810291342e-06, + "loss": 0.8793, + "step": 6318 + }, + { + "epoch": 0.3406836316583998, + "grad_norm": 0.8253350853919983, + "learning_rate": 9.827169555645777e-06, + "loss": 0.8411, + "step": 6319 + }, + { + "epoch": 0.34073754582704335, + "grad_norm": 0.7451859712600708, + "learning_rate": 9.827114292321613e-06, + "loss": 0.8493, + "step": 6320 + }, + { + "epoch": 0.3407914599956869, + 
"grad_norm": 1.0527219772338867, + "learning_rate": 9.827059020318949e-06, + "loss": 0.798, + "step": 6321 + }, + { + "epoch": 0.34084537416433036, + "grad_norm": 1.096200942993164, + "learning_rate": 9.827003739637883e-06, + "loss": 0.7964, + "step": 6322 + }, + { + "epoch": 0.3408992883329739, + "grad_norm": 0.7498440146446228, + "learning_rate": 9.826948450278516e-06, + "loss": 0.7008, + "step": 6323 + }, + { + "epoch": 0.3409532025016174, + "grad_norm": 0.7353782653808594, + "learning_rate": 9.826893152240947e-06, + "loss": 0.7594, + "step": 6324 + }, + { + "epoch": 0.34100711667026096, + "grad_norm": 0.7682814002037048, + "learning_rate": 9.826837845525273e-06, + "loss": 0.767, + "step": 6325 + }, + { + "epoch": 0.3410610308389045, + "grad_norm": 0.7614009976387024, + "learning_rate": 9.826782530131597e-06, + "loss": 0.7943, + "step": 6326 + }, + { + "epoch": 0.34111494500754796, + "grad_norm": 0.7521767616271973, + "learning_rate": 9.826727206060017e-06, + "loss": 0.8057, + "step": 6327 + }, + { + "epoch": 0.3411688591761915, + "grad_norm": 0.7462873458862305, + "learning_rate": 9.826671873310631e-06, + "loss": 0.818, + "step": 6328 + }, + { + "epoch": 0.34122277334483503, + "grad_norm": 0.665500283241272, + "learning_rate": 9.826616531883542e-06, + "loss": 0.746, + "step": 6329 + }, + { + "epoch": 0.34127668751347856, + "grad_norm": 0.7449725270271301, + "learning_rate": 9.826561181778846e-06, + "loss": 0.7853, + "step": 6330 + }, + { + "epoch": 0.34133060168212204, + "grad_norm": 0.7842909693717957, + "learning_rate": 9.826505822996646e-06, + "loss": 0.7391, + "step": 6331 + }, + { + "epoch": 0.34138451585076557, + "grad_norm": 0.6937515735626221, + "learning_rate": 9.826450455537038e-06, + "loss": 0.7602, + "step": 6332 + }, + { + "epoch": 0.3414384300194091, + "grad_norm": 0.7926383018493652, + "learning_rate": 9.826395079400124e-06, + "loss": 0.8436, + "step": 6333 + }, + { + "epoch": 0.34149234418805263, + "grad_norm": 0.8494393229484558, + 
"learning_rate": 9.826339694586002e-06, + "loss": 0.8413, + "step": 6334 + }, + { + "epoch": 0.34154625835669616, + "grad_norm": 0.9613466858863831, + "learning_rate": 9.82628430109477e-06, + "loss": 0.7633, + "step": 6335 + }, + { + "epoch": 0.34160017252533964, + "grad_norm": 0.7245813012123108, + "learning_rate": 9.826228898926533e-06, + "loss": 0.7895, + "step": 6336 + }, + { + "epoch": 0.34165408669398317, + "grad_norm": 0.7161980271339417, + "learning_rate": 9.826173488081386e-06, + "loss": 0.7412, + "step": 6337 + }, + { + "epoch": 0.3417080008626267, + "grad_norm": 0.7117916941642761, + "learning_rate": 9.82611806855943e-06, + "loss": 0.7183, + "step": 6338 + }, + { + "epoch": 0.34176191503127024, + "grad_norm": 0.8033546805381775, + "learning_rate": 9.826062640360766e-06, + "loss": 0.6853, + "step": 6339 + }, + { + "epoch": 0.3418158291999137, + "grad_norm": 0.8144798874855042, + "learning_rate": 9.82600720348549e-06, + "loss": 0.7859, + "step": 6340 + }, + { + "epoch": 0.34186974336855724, + "grad_norm": 0.7945190072059631, + "learning_rate": 9.825951757933705e-06, + "loss": 0.7794, + "step": 6341 + }, + { + "epoch": 0.3419236575372008, + "grad_norm": 0.7182852625846863, + "learning_rate": 9.825896303705509e-06, + "loss": 0.7534, + "step": 6342 + }, + { + "epoch": 0.3419775717058443, + "grad_norm": 0.7478010654449463, + "learning_rate": 9.825840840801002e-06, + "loss": 0.7553, + "step": 6343 + }, + { + "epoch": 0.34203148587448784, + "grad_norm": 0.7231805920600891, + "learning_rate": 9.825785369220285e-06, + "loss": 0.7548, + "step": 6344 + }, + { + "epoch": 0.3420854000431313, + "grad_norm": 0.8229427933692932, + "learning_rate": 9.825729888963457e-06, + "loss": 0.84, + "step": 6345 + }, + { + "epoch": 0.34213931421177485, + "grad_norm": 0.7435299754142761, + "learning_rate": 9.825674400030617e-06, + "loss": 0.7605, + "step": 6346 + }, + { + "epoch": 0.3421932283804184, + "grad_norm": 0.7611926198005676, + "learning_rate": 9.825618902421865e-06, + 
"loss": 0.7439, + "step": 6347 + }, + { + "epoch": 0.3422471425490619, + "grad_norm": 0.7303493022918701, + "learning_rate": 9.825563396137301e-06, + "loss": 0.7956, + "step": 6348 + }, + { + "epoch": 0.3423010567177054, + "grad_norm": 0.7439278960227966, + "learning_rate": 9.825507881177025e-06, + "loss": 0.7998, + "step": 6349 + }, + { + "epoch": 0.3423549708863489, + "grad_norm": 0.7513930797576904, + "learning_rate": 9.825452357541138e-06, + "loss": 0.7717, + "step": 6350 + }, + { + "epoch": 0.34240888505499245, + "grad_norm": 0.6386781930923462, + "learning_rate": 9.825396825229736e-06, + "loss": 0.6949, + "step": 6351 + }, + { + "epoch": 0.342462799223636, + "grad_norm": 1.0100027322769165, + "learning_rate": 9.825341284242925e-06, + "loss": 0.8, + "step": 6352 + }, + { + "epoch": 0.3425167133922795, + "grad_norm": 0.8025746941566467, + "learning_rate": 9.825285734580797e-06, + "loss": 0.8337, + "step": 6353 + }, + { + "epoch": 0.342570627560923, + "grad_norm": 0.6919582486152649, + "learning_rate": 9.825230176243459e-06, + "loss": 0.7057, + "step": 6354 + }, + { + "epoch": 0.3426245417295665, + "grad_norm": 0.73997563123703, + "learning_rate": 9.825174609231006e-06, + "loss": 0.7231, + "step": 6355 + }, + { + "epoch": 0.34267845589821005, + "grad_norm": 0.943370521068573, + "learning_rate": 9.82511903354354e-06, + "loss": 0.7959, + "step": 6356 + }, + { + "epoch": 0.3427323700668536, + "grad_norm": 0.83916175365448, + "learning_rate": 9.825063449181163e-06, + "loss": 0.7588, + "step": 6357 + }, + { + "epoch": 0.34278628423549706, + "grad_norm": 0.7405370473861694, + "learning_rate": 9.82500785614397e-06, + "loss": 0.7877, + "step": 6358 + }, + { + "epoch": 0.3428401984041406, + "grad_norm": 0.8066117167472839, + "learning_rate": 9.824952254432065e-06, + "loss": 0.7505, + "step": 6359 + }, + { + "epoch": 0.3428941125727841, + "grad_norm": 0.8027492761611938, + "learning_rate": 9.824896644045546e-06, + "loss": 0.7675, + "step": 6360 + }, + { + "epoch": 
0.34294802674142766, + "grad_norm": 0.7610853314399719, + "learning_rate": 9.824841024984514e-06, + "loss": 0.7627, + "step": 6361 + }, + { + "epoch": 0.3430019409100712, + "grad_norm": 0.6604021787643433, + "learning_rate": 9.824785397249067e-06, + "loss": 0.7011, + "step": 6362 + }, + { + "epoch": 0.34305585507871467, + "grad_norm": 0.7737109065055847, + "learning_rate": 9.824729760839309e-06, + "loss": 0.7177, + "step": 6363 + }, + { + "epoch": 0.3431097692473582, + "grad_norm": 0.8075612187385559, + "learning_rate": 9.824674115755338e-06, + "loss": 0.8006, + "step": 6364 + }, + { + "epoch": 0.34316368341600173, + "grad_norm": 0.7853705883026123, + "learning_rate": 9.82461846199725e-06, + "loss": 0.7979, + "step": 6365 + }, + { + "epoch": 0.34321759758464526, + "grad_norm": 0.8715709447860718, + "learning_rate": 9.824562799565152e-06, + "loss": 0.7461, + "step": 6366 + }, + { + "epoch": 0.34327151175328874, + "grad_norm": 0.7453888654708862, + "learning_rate": 9.82450712845914e-06, + "loss": 0.8291, + "step": 6367 + }, + { + "epoch": 0.34332542592193227, + "grad_norm": 0.7173608541488647, + "learning_rate": 9.824451448679313e-06, + "loss": 0.7616, + "step": 6368 + }, + { + "epoch": 0.3433793400905758, + "grad_norm": 1.02524995803833, + "learning_rate": 9.824395760225775e-06, + "loss": 0.7431, + "step": 6369 + }, + { + "epoch": 0.34343325425921933, + "grad_norm": 0.8196084499359131, + "learning_rate": 9.824340063098625e-06, + "loss": 0.8438, + "step": 6370 + }, + { + "epoch": 0.34348716842786287, + "grad_norm": 0.7988221049308777, + "learning_rate": 9.82428435729796e-06, + "loss": 0.8484, + "step": 6371 + }, + { + "epoch": 0.34354108259650634, + "grad_norm": 1.0993025302886963, + "learning_rate": 9.824228642823883e-06, + "loss": 0.7559, + "step": 6372 + }, + { + "epoch": 0.3435949967651499, + "grad_norm": 0.7754915952682495, + "learning_rate": 9.824172919676493e-06, + "loss": 0.757, + "step": 6373 + }, + { + "epoch": 0.3436489109337934, + "grad_norm": 
1.6397086381912231, + "learning_rate": 9.824117187855893e-06, + "loss": 1.0692, + "step": 6374 + }, + { + "epoch": 0.34370282510243694, + "grad_norm": 0.9474613070487976, + "learning_rate": 9.824061447362179e-06, + "loss": 0.6859, + "step": 6375 + }, + { + "epoch": 0.3437567392710804, + "grad_norm": 0.7551336884498596, + "learning_rate": 9.824005698195453e-06, + "loss": 0.8719, + "step": 6376 + }, + { + "epoch": 0.34381065343972395, + "grad_norm": 0.9159377217292786, + "learning_rate": 9.823949940355815e-06, + "loss": 0.8659, + "step": 6377 + }, + { + "epoch": 0.3438645676083675, + "grad_norm": 0.7486509680747986, + "learning_rate": 9.823894173843366e-06, + "loss": 0.8277, + "step": 6378 + }, + { + "epoch": 0.343918481777011, + "grad_norm": 0.6845034956932068, + "learning_rate": 9.823838398658208e-06, + "loss": 0.6711, + "step": 6379 + }, + { + "epoch": 0.34397239594565454, + "grad_norm": 0.7883821725845337, + "learning_rate": 9.823782614800437e-06, + "loss": 0.7357, + "step": 6380 + }, + { + "epoch": 0.344026310114298, + "grad_norm": 0.6626349687576294, + "learning_rate": 9.823726822270156e-06, + "loss": 0.7286, + "step": 6381 + }, + { + "epoch": 0.34408022428294155, + "grad_norm": 0.7307568192481995, + "learning_rate": 9.823671021067464e-06, + "loss": 0.7779, + "step": 6382 + }, + { + "epoch": 0.3441341384515851, + "grad_norm": 0.9291589856147766, + "learning_rate": 9.823615211192464e-06, + "loss": 0.9077, + "step": 6383 + }, + { + "epoch": 0.3441880526202286, + "grad_norm": 0.7208368182182312, + "learning_rate": 9.823559392645253e-06, + "loss": 0.7873, + "step": 6384 + }, + { + "epoch": 0.3442419667888721, + "grad_norm": 0.784195601940155, + "learning_rate": 9.823503565425934e-06, + "loss": 0.8665, + "step": 6385 + }, + { + "epoch": 0.3442958809575156, + "grad_norm": 0.80647212266922, + "learning_rate": 9.823447729534604e-06, + "loss": 0.7465, + "step": 6386 + }, + { + "epoch": 0.34434979512615915, + "grad_norm": 0.7908110618591309, + "learning_rate": 
9.823391884971367e-06, + "loss": 0.7539, + "step": 6387 + }, + { + "epoch": 0.3444037092948027, + "grad_norm": 0.7687129974365234, + "learning_rate": 9.823336031736322e-06, + "loss": 0.7856, + "step": 6388 + }, + { + "epoch": 0.3444576234634462, + "grad_norm": 0.7525848746299744, + "learning_rate": 9.823280169829567e-06, + "loss": 0.6993, + "step": 6389 + }, + { + "epoch": 0.3445115376320897, + "grad_norm": 0.8074336647987366, + "learning_rate": 9.82322429925121e-06, + "loss": 0.7925, + "step": 6390 + }, + { + "epoch": 0.3445654518007332, + "grad_norm": 0.8567779660224915, + "learning_rate": 9.823168420001341e-06, + "loss": 0.862, + "step": 6391 + }, + { + "epoch": 0.34461936596937676, + "grad_norm": 0.9519703984260559, + "learning_rate": 9.823112532080068e-06, + "loss": 0.8414, + "step": 6392 + }, + { + "epoch": 0.3446732801380203, + "grad_norm": 0.9044995307922363, + "learning_rate": 9.823056635487489e-06, + "loss": 0.7634, + "step": 6393 + }, + { + "epoch": 0.34472719430666376, + "grad_norm": 0.6782298684120178, + "learning_rate": 9.823000730223704e-06, + "loss": 0.7309, + "step": 6394 + }, + { + "epoch": 0.3447811084753073, + "grad_norm": 0.7118787169456482, + "learning_rate": 9.822944816288815e-06, + "loss": 0.7441, + "step": 6395 + }, + { + "epoch": 0.34483502264395083, + "grad_norm": 0.7205974459648132, + "learning_rate": 9.82288889368292e-06, + "loss": 0.7926, + "step": 6396 + }, + { + "epoch": 0.34488893681259436, + "grad_norm": 0.7549376487731934, + "learning_rate": 9.822832962406124e-06, + "loss": 0.7748, + "step": 6397 + }, + { + "epoch": 0.3449428509812379, + "grad_norm": 0.7076659798622131, + "learning_rate": 9.822777022458524e-06, + "loss": 0.7198, + "step": 6398 + }, + { + "epoch": 0.34499676514988137, + "grad_norm": 0.9088957905769348, + "learning_rate": 9.822721073840218e-06, + "loss": 0.842, + "step": 6399 + }, + { + "epoch": 0.3450506793185249, + "grad_norm": 0.8088563680648804, + "learning_rate": 9.822665116551313e-06, + "loss": 0.7791, + 
"step": 6400 + }, + { + "epoch": 0.34510459348716843, + "grad_norm": 0.8280682563781738, + "learning_rate": 9.822609150591907e-06, + "loss": 0.7146, + "step": 6401 + }, + { + "epoch": 0.34515850765581196, + "grad_norm": 0.7762987017631531, + "learning_rate": 9.822553175962099e-06, + "loss": 0.654, + "step": 6402 + }, + { + "epoch": 0.34521242182445544, + "grad_norm": 0.8811458945274353, + "learning_rate": 9.82249719266199e-06, + "loss": 0.8532, + "step": 6403 + }, + { + "epoch": 0.34526633599309897, + "grad_norm": 0.7698847651481628, + "learning_rate": 9.822441200691683e-06, + "loss": 0.6643, + "step": 6404 + }, + { + "epoch": 0.3453202501617425, + "grad_norm": 0.7481468915939331, + "learning_rate": 9.822385200051278e-06, + "loss": 0.8127, + "step": 6405 + }, + { + "epoch": 0.34537416433038604, + "grad_norm": 0.6996219754219055, + "learning_rate": 9.822329190740873e-06, + "loss": 0.7479, + "step": 6406 + }, + { + "epoch": 0.34542807849902957, + "grad_norm": 0.8087040781974792, + "learning_rate": 9.82227317276057e-06, + "loss": 0.8361, + "step": 6407 + }, + { + "epoch": 0.34548199266767304, + "grad_norm": 0.6825314164161682, + "learning_rate": 9.822217146110472e-06, + "loss": 0.7701, + "step": 6408 + }, + { + "epoch": 0.3455359068363166, + "grad_norm": 0.696908712387085, + "learning_rate": 9.822161110790678e-06, + "loss": 0.7207, + "step": 6409 + }, + { + "epoch": 0.3455898210049601, + "grad_norm": 0.739766538143158, + "learning_rate": 9.822105066801288e-06, + "loss": 0.7551, + "step": 6410 + }, + { + "epoch": 0.34564373517360364, + "grad_norm": 0.7141444087028503, + "learning_rate": 9.822049014142403e-06, + "loss": 0.8257, + "step": 6411 + }, + { + "epoch": 0.3456976493422471, + "grad_norm": 0.7097259163856506, + "learning_rate": 9.821992952814125e-06, + "loss": 0.7963, + "step": 6412 + }, + { + "epoch": 0.34575156351089065, + "grad_norm": 0.9131011366844177, + "learning_rate": 9.821936882816552e-06, + "loss": 0.8273, + "step": 6413 + }, + { + "epoch": 
0.3458054776795342, + "grad_norm": 0.776439368724823, + "learning_rate": 9.821880804149789e-06, + "loss": 0.7506, + "step": 6414 + }, + { + "epoch": 0.3458593918481777, + "grad_norm": 0.7044750452041626, + "learning_rate": 9.821824716813934e-06, + "loss": 0.8232, + "step": 6415 + }, + { + "epoch": 0.34591330601682124, + "grad_norm": 0.6936453580856323, + "learning_rate": 9.821768620809089e-06, + "loss": 0.704, + "step": 6416 + }, + { + "epoch": 0.3459672201854647, + "grad_norm": 0.7428335547447205, + "learning_rate": 9.821712516135353e-06, + "loss": 0.7605, + "step": 6417 + }, + { + "epoch": 0.34602113435410825, + "grad_norm": 0.7981945872306824, + "learning_rate": 9.821656402792829e-06, + "loss": 0.7708, + "step": 6418 + }, + { + "epoch": 0.3460750485227518, + "grad_norm": 0.7747992277145386, + "learning_rate": 9.821600280781618e-06, + "loss": 0.8238, + "step": 6419 + }, + { + "epoch": 0.3461289626913953, + "grad_norm": 1.3347132205963135, + "learning_rate": 9.821544150101819e-06, + "loss": 0.834, + "step": 6420 + }, + { + "epoch": 0.3461828768600388, + "grad_norm": 0.7775027751922607, + "learning_rate": 9.821488010753533e-06, + "loss": 0.8241, + "step": 6421 + }, + { + "epoch": 0.3462367910286823, + "grad_norm": 0.6745424866676331, + "learning_rate": 9.821431862736864e-06, + "loss": 0.7281, + "step": 6422 + }, + { + "epoch": 0.34629070519732585, + "grad_norm": 0.7588618397712708, + "learning_rate": 9.82137570605191e-06, + "loss": 0.8076, + "step": 6423 + }, + { + "epoch": 0.3463446193659694, + "grad_norm": 0.7134720683097839, + "learning_rate": 9.821319540698771e-06, + "loss": 0.7126, + "step": 6424 + }, + { + "epoch": 0.3463985335346129, + "grad_norm": 0.8626348376274109, + "learning_rate": 9.821263366677552e-06, + "loss": 0.8379, + "step": 6425 + }, + { + "epoch": 0.3464524477032564, + "grad_norm": 0.7033327221870422, + "learning_rate": 9.82120718398835e-06, + "loss": 0.7322, + "step": 6426 + }, + { + "epoch": 0.3465063618718999, + "grad_norm": 
0.753189206123352, + "learning_rate": 9.821150992631268e-06, + "loss": 0.7893, + "step": 6427 + }, + { + "epoch": 0.34656027604054346, + "grad_norm": 0.7654975652694702, + "learning_rate": 9.821094792606408e-06, + "loss": 0.8505, + "step": 6428 + }, + { + "epoch": 0.346614190209187, + "grad_norm": 0.8966901898384094, + "learning_rate": 9.821038583913868e-06, + "loss": 0.8254, + "step": 6429 + }, + { + "epoch": 0.3466681043778305, + "grad_norm": 0.7760552167892456, + "learning_rate": 9.820982366553752e-06, + "loss": 0.7356, + "step": 6430 + }, + { + "epoch": 0.346722018546474, + "grad_norm": 0.7150897979736328, + "learning_rate": 9.820926140526161e-06, + "loss": 0.7497, + "step": 6431 + }, + { + "epoch": 0.34677593271511753, + "grad_norm": 0.7337846159934998, + "learning_rate": 9.820869905831194e-06, + "loss": 0.6844, + "step": 6432 + }, + { + "epoch": 0.34682984688376106, + "grad_norm": 0.8700940012931824, + "learning_rate": 9.820813662468954e-06, + "loss": 0.9393, + "step": 6433 + }, + { + "epoch": 0.3468837610524046, + "grad_norm": 0.7758312821388245, + "learning_rate": 9.820757410439538e-06, + "loss": 0.8055, + "step": 6434 + }, + { + "epoch": 0.34693767522104807, + "grad_norm": 0.7520482540130615, + "learning_rate": 9.820701149743053e-06, + "loss": 0.7825, + "step": 6435 + }, + { + "epoch": 0.3469915893896916, + "grad_norm": 0.7819029688835144, + "learning_rate": 9.820644880379597e-06, + "loss": 0.8002, + "step": 6436 + }, + { + "epoch": 0.34704550355833513, + "grad_norm": 0.7956332564353943, + "learning_rate": 9.820588602349272e-06, + "loss": 0.8215, + "step": 6437 + }, + { + "epoch": 0.34709941772697867, + "grad_norm": 0.695021390914917, + "learning_rate": 9.820532315652179e-06, + "loss": 0.7227, + "step": 6438 + }, + { + "epoch": 0.3471533318956222, + "grad_norm": 0.9836909770965576, + "learning_rate": 9.820476020288417e-06, + "loss": 0.7617, + "step": 6439 + }, + { + "epoch": 0.3472072460642657, + "grad_norm": 0.7812206745147705, + "learning_rate": 
9.820419716258091e-06, + "loss": 0.8567, + "step": 6440 + }, + { + "epoch": 0.3472611602329092, + "grad_norm": 0.719775378704071, + "learning_rate": 9.820363403561301e-06, + "loss": 0.7171, + "step": 6441 + }, + { + "epoch": 0.34731507440155274, + "grad_norm": 0.6919370293617249, + "learning_rate": 9.820307082198146e-06, + "loss": 0.7174, + "step": 6442 + }, + { + "epoch": 0.34736898857019627, + "grad_norm": 0.9075651168823242, + "learning_rate": 9.820250752168731e-06, + "loss": 0.69, + "step": 6443 + }, + { + "epoch": 0.34742290273883975, + "grad_norm": 0.8470256328582764, + "learning_rate": 9.820194413473155e-06, + "loss": 0.7925, + "step": 6444 + }, + { + "epoch": 0.3474768169074833, + "grad_norm": 0.8116269707679749, + "learning_rate": 9.820138066111518e-06, + "loss": 0.8259, + "step": 6445 + }, + { + "epoch": 0.3475307310761268, + "grad_norm": 0.7773719429969788, + "learning_rate": 9.820081710083922e-06, + "loss": 0.8785, + "step": 6446 + }, + { + "epoch": 0.34758464524477034, + "grad_norm": 0.9721434712409973, + "learning_rate": 9.820025345390472e-06, + "loss": 0.6889, + "step": 6447 + }, + { + "epoch": 0.3476385594134139, + "grad_norm": 0.7448714971542358, + "learning_rate": 9.819968972031265e-06, + "loss": 0.8389, + "step": 6448 + }, + { + "epoch": 0.34769247358205735, + "grad_norm": 0.7153246402740479, + "learning_rate": 9.819912590006403e-06, + "loss": 0.7757, + "step": 6449 + }, + { + "epoch": 0.3477463877507009, + "grad_norm": 0.7386745810508728, + "learning_rate": 9.81985619931599e-06, + "loss": 0.7906, + "step": 6450 + }, + { + "epoch": 0.3478003019193444, + "grad_norm": 0.7068947553634644, + "learning_rate": 9.819799799960126e-06, + "loss": 0.7316, + "step": 6451 + }, + { + "epoch": 0.34785421608798794, + "grad_norm": 0.6804941296577454, + "learning_rate": 9.81974339193891e-06, + "loss": 0.7119, + "step": 6452 + }, + { + "epoch": 0.3479081302566314, + "grad_norm": 0.7537106275558472, + "learning_rate": 9.819686975252446e-06, + "loss": 0.717, + 
"step": 6453 + }, + { + "epoch": 0.34796204442527495, + "grad_norm": 0.817944347858429, + "learning_rate": 9.819630549900835e-06, + "loss": 0.7877, + "step": 6454 + }, + { + "epoch": 0.3480159585939185, + "grad_norm": 0.8272240161895752, + "learning_rate": 9.81957411588418e-06, + "loss": 0.8101, + "step": 6455 + }, + { + "epoch": 0.348069872762562, + "grad_norm": 0.7055686712265015, + "learning_rate": 9.819517673202579e-06, + "loss": 0.6301, + "step": 6456 + }, + { + "epoch": 0.34812378693120555, + "grad_norm": 0.8163101077079773, + "learning_rate": 9.819461221856136e-06, + "loss": 0.7508, + "step": 6457 + }, + { + "epoch": 0.348177701099849, + "grad_norm": 0.8285984396934509, + "learning_rate": 9.81940476184495e-06, + "loss": 0.7562, + "step": 6458 + }, + { + "epoch": 0.34823161526849256, + "grad_norm": 0.7238637804985046, + "learning_rate": 9.819348293169127e-06, + "loss": 0.7456, + "step": 6459 + }, + { + "epoch": 0.3482855294371361, + "grad_norm": 0.8180177807807922, + "learning_rate": 9.819291815828765e-06, + "loss": 0.7584, + "step": 6460 + }, + { + "epoch": 0.3483394436057796, + "grad_norm": 0.6947215795516968, + "learning_rate": 9.819235329823964e-06, + "loss": 0.7253, + "step": 6461 + }, + { + "epoch": 0.3483933577744231, + "grad_norm": 0.8604253530502319, + "learning_rate": 9.81917883515483e-06, + "loss": 0.8658, + "step": 6462 + }, + { + "epoch": 0.3484472719430666, + "grad_norm": 0.7379468679428101, + "learning_rate": 9.819122331821463e-06, + "loss": 0.7911, + "step": 6463 + }, + { + "epoch": 0.34850118611171016, + "grad_norm": 0.687770426273346, + "learning_rate": 9.81906581982396e-06, + "loss": 0.7716, + "step": 6464 + }, + { + "epoch": 0.3485551002803537, + "grad_norm": 0.784186601638794, + "learning_rate": 9.819009299162432e-06, + "loss": 0.7243, + "step": 6465 + }, + { + "epoch": 0.3486090144489972, + "grad_norm": 0.6983310580253601, + "learning_rate": 9.81895276983697e-06, + "loss": 0.7176, + "step": 6466 + }, + { + "epoch": 0.3486629286176407, + 
"grad_norm": 0.757946789264679, + "learning_rate": 9.818896231847686e-06, + "loss": 0.8331, + "step": 6467 + }, + { + "epoch": 0.34871684278628423, + "grad_norm": 0.7098625302314758, + "learning_rate": 9.818839685194672e-06, + "loss": 0.7063, + "step": 6468 + }, + { + "epoch": 0.34877075695492776, + "grad_norm": 0.7717018127441406, + "learning_rate": 9.818783129878039e-06, + "loss": 0.7261, + "step": 6469 + }, + { + "epoch": 0.3488246711235713, + "grad_norm": 0.7872465252876282, + "learning_rate": 9.81872656589788e-06, + "loss": 0.8372, + "step": 6470 + }, + { + "epoch": 0.34887858529221477, + "grad_norm": 0.8801752924919128, + "learning_rate": 9.8186699932543e-06, + "loss": 0.7178, + "step": 6471 + }, + { + "epoch": 0.3489324994608583, + "grad_norm": 0.8796175718307495, + "learning_rate": 9.818613411947405e-06, + "loss": 0.7526, + "step": 6472 + }, + { + "epoch": 0.34898641362950183, + "grad_norm": 0.7068731188774109, + "learning_rate": 9.81855682197729e-06, + "loss": 0.7864, + "step": 6473 + }, + { + "epoch": 0.34904032779814537, + "grad_norm": 0.7311115264892578, + "learning_rate": 9.81850022334406e-06, + "loss": 0.6956, + "step": 6474 + }, + { + "epoch": 0.3490942419667889, + "grad_norm": 0.6934844851493835, + "learning_rate": 9.818443616047816e-06, + "loss": 0.652, + "step": 6475 + }, + { + "epoch": 0.3491481561354324, + "grad_norm": 0.7721162438392639, + "learning_rate": 9.81838700008866e-06, + "loss": 0.7976, + "step": 6476 + }, + { + "epoch": 0.3492020703040759, + "grad_norm": 0.7528020143508911, + "learning_rate": 9.818330375466698e-06, + "loss": 0.7778, + "step": 6477 + }, + { + "epoch": 0.34925598447271944, + "grad_norm": 0.7696189284324646, + "learning_rate": 9.818273742182023e-06, + "loss": 0.8416, + "step": 6478 + }, + { + "epoch": 0.34930989864136297, + "grad_norm": 0.7881112694740295, + "learning_rate": 9.818217100234745e-06, + "loss": 0.8089, + "step": 6479 + }, + { + "epoch": 0.34936381281000645, + "grad_norm": 0.7639322876930237, + 
"learning_rate": 9.81816044962496e-06, + "loss": 0.8362, + "step": 6480 + }, + { + "epoch": 0.34941772697865, + "grad_norm": 0.8432510495185852, + "learning_rate": 9.818103790352774e-06, + "loss": 0.8718, + "step": 6481 + }, + { + "epoch": 0.3494716411472935, + "grad_norm": 0.752714216709137, + "learning_rate": 9.818047122418287e-06, + "loss": 0.8021, + "step": 6482 + }, + { + "epoch": 0.34952555531593704, + "grad_norm": 0.8005210757255554, + "learning_rate": 9.817990445821601e-06, + "loss": 0.7351, + "step": 6483 + }, + { + "epoch": 0.3495794694845806, + "grad_norm": 0.7862645983695984, + "learning_rate": 9.817933760562818e-06, + "loss": 0.8125, + "step": 6484 + }, + { + "epoch": 0.34963338365322405, + "grad_norm": 0.7842736840248108, + "learning_rate": 9.817877066642038e-06, + "loss": 0.8261, + "step": 6485 + }, + { + "epoch": 0.3496872978218676, + "grad_norm": 0.8234707713127136, + "learning_rate": 9.817820364059368e-06, + "loss": 0.7099, + "step": 6486 + }, + { + "epoch": 0.3497412119905111, + "grad_norm": 0.7278059124946594, + "learning_rate": 9.817763652814904e-06, + "loss": 0.728, + "step": 6487 + }, + { + "epoch": 0.34979512615915465, + "grad_norm": 0.7479499578475952, + "learning_rate": 9.817706932908753e-06, + "loss": 0.7856, + "step": 6488 + }, + { + "epoch": 0.3498490403277981, + "grad_norm": 0.9248040318489075, + "learning_rate": 9.817650204341014e-06, + "loss": 0.8408, + "step": 6489 + }, + { + "epoch": 0.34990295449644165, + "grad_norm": 0.7217525839805603, + "learning_rate": 9.81759346711179e-06, + "loss": 0.7345, + "step": 6490 + }, + { + "epoch": 0.3499568686650852, + "grad_norm": 0.747064471244812, + "learning_rate": 9.81753672122118e-06, + "loss": 0.7473, + "step": 6491 + }, + { + "epoch": 0.3500107828337287, + "grad_norm": 0.8962185382843018, + "learning_rate": 9.817479966669292e-06, + "loss": 0.6928, + "step": 6492 + }, + { + "epoch": 0.35006469700237225, + "grad_norm": 0.8055808544158936, + "learning_rate": 9.817423203456224e-06, + "loss": 
0.7914, + "step": 6493 + }, + { + "epoch": 0.3501186111710157, + "grad_norm": 0.7093660235404968, + "learning_rate": 9.817366431582078e-06, + "loss": 0.7692, + "step": 6494 + }, + { + "epoch": 0.35017252533965926, + "grad_norm": 0.7229071855545044, + "learning_rate": 9.817309651046958e-06, + "loss": 0.7111, + "step": 6495 + }, + { + "epoch": 0.3502264395083028, + "grad_norm": 0.7764929533004761, + "learning_rate": 9.817252861850964e-06, + "loss": 0.7178, + "step": 6496 + }, + { + "epoch": 0.3502803536769463, + "grad_norm": 0.7664844989776611, + "learning_rate": 9.817196063994199e-06, + "loss": 0.8367, + "step": 6497 + }, + { + "epoch": 0.3503342678455898, + "grad_norm": 0.7622857093811035, + "learning_rate": 9.817139257476765e-06, + "loss": 0.8408, + "step": 6498 + }, + { + "epoch": 0.35038818201423333, + "grad_norm": 0.7263977527618408, + "learning_rate": 9.817082442298764e-06, + "loss": 0.7658, + "step": 6499 + }, + { + "epoch": 0.35044209618287686, + "grad_norm": 0.7250953912734985, + "learning_rate": 9.817025618460301e-06, + "loss": 0.7677, + "step": 6500 + }, + { + "epoch": 0.3504960103515204, + "grad_norm": 0.7612128257751465, + "learning_rate": 9.816968785961474e-06, + "loss": 0.7137, + "step": 6501 + }, + { + "epoch": 0.3505499245201639, + "grad_norm": 0.7602818012237549, + "learning_rate": 9.816911944802385e-06, + "loss": 0.7779, + "step": 6502 + }, + { + "epoch": 0.3506038386888074, + "grad_norm": 0.7841636538505554, + "learning_rate": 9.81685509498314e-06, + "loss": 0.8384, + "step": 6503 + }, + { + "epoch": 0.35065775285745093, + "grad_norm": 0.7339578866958618, + "learning_rate": 9.816798236503839e-06, + "loss": 0.7374, + "step": 6504 + }, + { + "epoch": 0.35071166702609446, + "grad_norm": 0.7405341863632202, + "learning_rate": 9.816741369364584e-06, + "loss": 0.7736, + "step": 6505 + }, + { + "epoch": 0.350765581194738, + "grad_norm": 0.7137883901596069, + "learning_rate": 9.816684493565478e-06, + "loss": 0.6616, + "step": 6506 + }, + { + "epoch": 
0.3508194953633815, + "grad_norm": 0.9714195728302002, + "learning_rate": 9.816627609106623e-06, + "loss": 0.7016, + "step": 6507 + }, + { + "epoch": 0.350873409532025, + "grad_norm": 0.7854532599449158, + "learning_rate": 9.81657071598812e-06, + "loss": 0.8525, + "step": 6508 + }, + { + "epoch": 0.35092732370066854, + "grad_norm": 0.7967653870582581, + "learning_rate": 9.816513814210074e-06, + "loss": 0.7925, + "step": 6509 + }, + { + "epoch": 0.35098123786931207, + "grad_norm": 0.7127383351325989, + "learning_rate": 9.816456903772584e-06, + "loss": 0.6582, + "step": 6510 + }, + { + "epoch": 0.3510351520379556, + "grad_norm": 0.8363492488861084, + "learning_rate": 9.816399984675756e-06, + "loss": 0.8089, + "step": 6511 + }, + { + "epoch": 0.3510890662065991, + "grad_norm": 0.7839988470077515, + "learning_rate": 9.816343056919691e-06, + "loss": 0.803, + "step": 6512 + }, + { + "epoch": 0.3511429803752426, + "grad_norm": 0.8152570724487305, + "learning_rate": 9.816286120504489e-06, + "loss": 0.7915, + "step": 6513 + }, + { + "epoch": 0.35119689454388614, + "grad_norm": 0.8126181364059448, + "learning_rate": 9.816229175430254e-06, + "loss": 0.8071, + "step": 6514 + }, + { + "epoch": 0.35125080871252967, + "grad_norm": 0.8739742040634155, + "learning_rate": 9.816172221697089e-06, + "loss": 0.7142, + "step": 6515 + }, + { + "epoch": 0.35130472288117315, + "grad_norm": 0.7542504668235779, + "learning_rate": 9.816115259305097e-06, + "loss": 0.7483, + "step": 6516 + }, + { + "epoch": 0.3513586370498167, + "grad_norm": 0.8625519871711731, + "learning_rate": 9.816058288254378e-06, + "loss": 0.6611, + "step": 6517 + }, + { + "epoch": 0.3514125512184602, + "grad_norm": 0.7368396520614624, + "learning_rate": 9.816001308545036e-06, + "loss": 0.7498, + "step": 6518 + }, + { + "epoch": 0.35146646538710374, + "grad_norm": 0.7155764698982239, + "learning_rate": 9.815944320177172e-06, + "loss": 0.5973, + "step": 6519 + }, + { + "epoch": 0.3515203795557473, + "grad_norm": 
0.8611487150192261, + "learning_rate": 9.81588732315089e-06, + "loss": 0.8191, + "step": 6520 + }, + { + "epoch": 0.35157429372439075, + "grad_norm": 0.7063427567481995, + "learning_rate": 9.815830317466294e-06, + "loss": 0.8023, + "step": 6521 + }, + { + "epoch": 0.3516282078930343, + "grad_norm": 0.7668420076370239, + "learning_rate": 9.815773303123484e-06, + "loss": 0.7215, + "step": 6522 + }, + { + "epoch": 0.3516821220616778, + "grad_norm": 0.7419091463088989, + "learning_rate": 9.815716280122563e-06, + "loss": 0.7643, + "step": 6523 + }, + { + "epoch": 0.35173603623032135, + "grad_norm": 0.7184603214263916, + "learning_rate": 9.815659248463633e-06, + "loss": 0.7832, + "step": 6524 + }, + { + "epoch": 0.3517899503989648, + "grad_norm": 0.7812091112136841, + "learning_rate": 9.815602208146797e-06, + "loss": 0.7902, + "step": 6525 + }, + { + "epoch": 0.35184386456760836, + "grad_norm": 1.1969261169433594, + "learning_rate": 9.815545159172157e-06, + "loss": 0.7611, + "step": 6526 + }, + { + "epoch": 0.3518977787362519, + "grad_norm": 0.8125418424606323, + "learning_rate": 9.815488101539818e-06, + "loss": 0.7149, + "step": 6527 + }, + { + "epoch": 0.3519516929048954, + "grad_norm": 0.9524067044258118, + "learning_rate": 9.81543103524988e-06, + "loss": 0.7406, + "step": 6528 + }, + { + "epoch": 0.35200560707353895, + "grad_norm": 0.7527855634689331, + "learning_rate": 9.815373960302447e-06, + "loss": 0.732, + "step": 6529 + }, + { + "epoch": 0.3520595212421824, + "grad_norm": 0.8080143928527832, + "learning_rate": 9.815316876697621e-06, + "loss": 0.8302, + "step": 6530 + }, + { + "epoch": 0.35211343541082596, + "grad_norm": 0.7470413446426392, + "learning_rate": 9.815259784435505e-06, + "loss": 0.7261, + "step": 6531 + }, + { + "epoch": 0.3521673495794695, + "grad_norm": 1.378711462020874, + "learning_rate": 9.8152026835162e-06, + "loss": 0.962, + "step": 6532 + }, + { + "epoch": 0.352221263748113, + "grad_norm": 0.8135038614273071, + "learning_rate": 
9.815145573939811e-06, + "loss": 0.7983, + "step": 6533 + }, + { + "epoch": 0.3522751779167565, + "grad_norm": 0.809755265712738, + "learning_rate": 9.81508845570644e-06, + "loss": 0.7814, + "step": 6534 + }, + { + "epoch": 0.35232909208540003, + "grad_norm": 0.7261335849761963, + "learning_rate": 9.81503132881619e-06, + "loss": 0.6884, + "step": 6535 + }, + { + "epoch": 0.35238300625404356, + "grad_norm": 0.7459113001823425, + "learning_rate": 9.81497419326916e-06, + "loss": 0.701, + "step": 6536 + }, + { + "epoch": 0.3524369204226871, + "grad_norm": 0.727853536605835, + "learning_rate": 9.814917049065457e-06, + "loss": 0.7754, + "step": 6537 + }, + { + "epoch": 0.3524908345913306, + "grad_norm": 0.7328651547431946, + "learning_rate": 9.814859896205184e-06, + "loss": 0.6909, + "step": 6538 + }, + { + "epoch": 0.3525447487599741, + "grad_norm": 0.7301684617996216, + "learning_rate": 9.814802734688442e-06, + "loss": 0.7665, + "step": 6539 + }, + { + "epoch": 0.35259866292861763, + "grad_norm": 0.8859847187995911, + "learning_rate": 9.814745564515334e-06, + "loss": 0.7567, + "step": 6540 + }, + { + "epoch": 0.35265257709726117, + "grad_norm": 0.6668254733085632, + "learning_rate": 9.814688385685963e-06, + "loss": 0.6945, + "step": 6541 + }, + { + "epoch": 0.3527064912659047, + "grad_norm": 0.8477579951286316, + "learning_rate": 9.81463119820043e-06, + "loss": 0.7801, + "step": 6542 + }, + { + "epoch": 0.3527604054345482, + "grad_norm": 0.7669034600257874, + "learning_rate": 9.81457400205884e-06, + "loss": 0.8516, + "step": 6543 + }, + { + "epoch": 0.3528143196031917, + "grad_norm": 0.7083528637886047, + "learning_rate": 9.814516797261297e-06, + "loss": 0.6961, + "step": 6544 + }, + { + "epoch": 0.35286823377183524, + "grad_norm": 0.7594015598297119, + "learning_rate": 9.814459583807901e-06, + "loss": 0.8094, + "step": 6545 + }, + { + "epoch": 0.35292214794047877, + "grad_norm": 0.7847710847854614, + "learning_rate": 9.814402361698756e-06, + "loss": 0.7982, + "step": 
6546 + }, + { + "epoch": 0.3529760621091223, + "grad_norm": 0.8393752574920654, + "learning_rate": 9.814345130933965e-06, + "loss": 0.8782, + "step": 6547 + }, + { + "epoch": 0.3530299762777658, + "grad_norm": 0.8381170034408569, + "learning_rate": 9.81428789151363e-06, + "loss": 0.7444, + "step": 6548 + }, + { + "epoch": 0.3530838904464093, + "grad_norm": 0.8959423303604126, + "learning_rate": 9.814230643437857e-06, + "loss": 0.765, + "step": 6549 + }, + { + "epoch": 0.35313780461505284, + "grad_norm": 0.7836467027664185, + "learning_rate": 9.814173386706744e-06, + "loss": 0.8299, + "step": 6550 + }, + { + "epoch": 0.3531917187836964, + "grad_norm": 0.7173193693161011, + "learning_rate": 9.814116121320397e-06, + "loss": 0.7545, + "step": 6551 + }, + { + "epoch": 0.35324563295233985, + "grad_norm": 0.6796035766601562, + "learning_rate": 9.814058847278919e-06, + "loss": 0.7295, + "step": 6552 + }, + { + "epoch": 0.3532995471209834, + "grad_norm": 0.8881590962409973, + "learning_rate": 9.814001564582412e-06, + "loss": 0.8548, + "step": 6553 + }, + { + "epoch": 0.3533534612896269, + "grad_norm": 0.7311136722564697, + "learning_rate": 9.81394427323098e-06, + "loss": 0.7594, + "step": 6554 + }, + { + "epoch": 0.35340737545827045, + "grad_norm": 0.7221459150314331, + "learning_rate": 9.813886973224725e-06, + "loss": 0.7156, + "step": 6555 + }, + { + "epoch": 0.353461289626914, + "grad_norm": 0.7357953190803528, + "learning_rate": 9.813829664563751e-06, + "loss": 0.7286, + "step": 6556 + }, + { + "epoch": 0.35351520379555745, + "grad_norm": 0.7420037388801575, + "learning_rate": 9.813772347248161e-06, + "loss": 0.7189, + "step": 6557 + }, + { + "epoch": 0.353569117964201, + "grad_norm": 0.8052580952644348, + "learning_rate": 9.813715021278057e-06, + "loss": 0.7638, + "step": 6558 + }, + { + "epoch": 0.3536230321328445, + "grad_norm": 0.7177057266235352, + "learning_rate": 9.813657686653542e-06, + "loss": 0.7218, + "step": 6559 + }, + { + "epoch": 0.35367694630148805, + 
"grad_norm": 0.7287971377372742, + "learning_rate": 9.813600343374721e-06, + "loss": 0.7631, + "step": 6560 + }, + { + "epoch": 0.3537308604701315, + "grad_norm": 0.7643932104110718, + "learning_rate": 9.813542991441694e-06, + "loss": 0.7679, + "step": 6561 + }, + { + "epoch": 0.35378477463877506, + "grad_norm": 0.7828801274299622, + "learning_rate": 9.813485630854566e-06, + "loss": 0.7748, + "step": 6562 + }, + { + "epoch": 0.3538386888074186, + "grad_norm": 0.6876081228256226, + "learning_rate": 9.813428261613442e-06, + "loss": 0.7188, + "step": 6563 + }, + { + "epoch": 0.3538926029760621, + "grad_norm": 0.8935626149177551, + "learning_rate": 9.813370883718421e-06, + "loss": 0.7503, + "step": 6564 + }, + { + "epoch": 0.35394651714470565, + "grad_norm": 0.7178196907043457, + "learning_rate": 9.81331349716961e-06, + "loss": 0.6179, + "step": 6565 + }, + { + "epoch": 0.35400043131334913, + "grad_norm": 0.7489269971847534, + "learning_rate": 9.81325610196711e-06, + "loss": 0.7694, + "step": 6566 + }, + { + "epoch": 0.35405434548199266, + "grad_norm": 1.1973050832748413, + "learning_rate": 9.813198698111024e-06, + "loss": 0.7318, + "step": 6567 + }, + { + "epoch": 0.3541082596506362, + "grad_norm": 0.7678642868995667, + "learning_rate": 9.813141285601458e-06, + "loss": 0.7164, + "step": 6568 + }, + { + "epoch": 0.3541621738192797, + "grad_norm": 0.8079233169555664, + "learning_rate": 9.813083864438512e-06, + "loss": 0.7446, + "step": 6569 + }, + { + "epoch": 0.3542160879879232, + "grad_norm": 0.9621824622154236, + "learning_rate": 9.813026434622289e-06, + "loss": 0.7641, + "step": 6570 + }, + { + "epoch": 0.35427000215656673, + "grad_norm": 0.850386917591095, + "learning_rate": 9.812968996152894e-06, + "loss": 0.7695, + "step": 6571 + }, + { + "epoch": 0.35432391632521026, + "grad_norm": 0.7684780955314636, + "learning_rate": 9.812911549030431e-06, + "loss": 0.8356, + "step": 6572 + }, + { + "epoch": 0.3543778304938538, + "grad_norm": 0.7108227610588074, + 
"learning_rate": 9.812854093255002e-06, + "loss": 0.732, + "step": 6573 + }, + { + "epoch": 0.35443174466249733, + "grad_norm": 0.709865927696228, + "learning_rate": 9.812796628826709e-06, + "loss": 0.8039, + "step": 6574 + }, + { + "epoch": 0.3544856588311408, + "grad_norm": 0.9866155982017517, + "learning_rate": 9.812739155745659e-06, + "loss": 0.8184, + "step": 6575 + }, + { + "epoch": 0.35453957299978434, + "grad_norm": 0.8751039505004883, + "learning_rate": 9.812681674011951e-06, + "loss": 0.7778, + "step": 6576 + }, + { + "epoch": 0.35459348716842787, + "grad_norm": 0.6718238592147827, + "learning_rate": 9.812624183625691e-06, + "loss": 0.6981, + "step": 6577 + }, + { + "epoch": 0.3546474013370714, + "grad_norm": 0.7232449054718018, + "learning_rate": 9.812566684586983e-06, + "loss": 0.7309, + "step": 6578 + }, + { + "epoch": 0.3547013155057149, + "grad_norm": 0.7263187766075134, + "learning_rate": 9.812509176895929e-06, + "loss": 0.6814, + "step": 6579 + }, + { + "epoch": 0.3547552296743584, + "grad_norm": 0.7074307799339294, + "learning_rate": 9.81245166055263e-06, + "loss": 0.6852, + "step": 6580 + }, + { + "epoch": 0.35480914384300194, + "grad_norm": 0.8384448885917664, + "learning_rate": 9.812394135557194e-06, + "loss": 0.7921, + "step": 6581 + }, + { + "epoch": 0.35486305801164547, + "grad_norm": 0.7591973543167114, + "learning_rate": 9.812336601909723e-06, + "loss": 0.7831, + "step": 6582 + }, + { + "epoch": 0.354916972180289, + "grad_norm": 0.7600874304771423, + "learning_rate": 9.81227905961032e-06, + "loss": 0.7264, + "step": 6583 + }, + { + "epoch": 0.3549708863489325, + "grad_norm": 0.8201075792312622, + "learning_rate": 9.812221508659085e-06, + "loss": 0.8631, + "step": 6584 + }, + { + "epoch": 0.355024800517576, + "grad_norm": 0.7076126337051392, + "learning_rate": 9.812163949056129e-06, + "loss": 0.7747, + "step": 6585 + }, + { + "epoch": 0.35507871468621954, + "grad_norm": 0.7350928783416748, + "learning_rate": 9.81210638080155e-06, + "loss": 
0.7043, + "step": 6586 + }, + { + "epoch": 0.3551326288548631, + "grad_norm": 0.7440031170845032, + "learning_rate": 9.812048803895451e-06, + "loss": 0.7828, + "step": 6587 + }, + { + "epoch": 0.35518654302350655, + "grad_norm": 0.7375351190567017, + "learning_rate": 9.811991218337937e-06, + "loss": 0.7492, + "step": 6588 + }, + { + "epoch": 0.3552404571921501, + "grad_norm": 0.7286681532859802, + "learning_rate": 9.811933624129114e-06, + "loss": 0.7795, + "step": 6589 + }, + { + "epoch": 0.3552943713607936, + "grad_norm": 0.882192075252533, + "learning_rate": 9.811876021269083e-06, + "loss": 0.8368, + "step": 6590 + }, + { + "epoch": 0.35534828552943715, + "grad_norm": 0.9999831914901733, + "learning_rate": 9.811818409757947e-06, + "loss": 0.8699, + "step": 6591 + }, + { + "epoch": 0.3554021996980807, + "grad_norm": 0.7052724957466125, + "learning_rate": 9.81176078959581e-06, + "loss": 0.6705, + "step": 6592 + }, + { + "epoch": 0.35545611386672415, + "grad_norm": 0.8101418614387512, + "learning_rate": 9.811703160782777e-06, + "loss": 0.8159, + "step": 6593 + }, + { + "epoch": 0.3555100280353677, + "grad_norm": 0.7498278617858887, + "learning_rate": 9.811645523318951e-06, + "loss": 0.8097, + "step": 6594 + }, + { + "epoch": 0.3555639422040112, + "grad_norm": 1.0161553621292114, + "learning_rate": 9.811587877204434e-06, + "loss": 0.7872, + "step": 6595 + }, + { + "epoch": 0.35561785637265475, + "grad_norm": 0.7806850671768188, + "learning_rate": 9.811530222439332e-06, + "loss": 0.8255, + "step": 6596 + }, + { + "epoch": 0.3556717705412982, + "grad_norm": 0.671782374382019, + "learning_rate": 9.811472559023748e-06, + "loss": 0.7159, + "step": 6597 + }, + { + "epoch": 0.35572568470994176, + "grad_norm": 0.7299701571464539, + "learning_rate": 9.811414886957785e-06, + "loss": 0.7438, + "step": 6598 + }, + { + "epoch": 0.3557795988785853, + "grad_norm": 0.8439094424247742, + "learning_rate": 9.811357206241546e-06, + "loss": 0.9141, + "step": 6599 + }, + { + "epoch": 
0.3558335130472288, + "grad_norm": 0.7000816464424133, + "learning_rate": 9.811299516875137e-06, + "loss": 0.8018, + "step": 6600 + }, + { + "epoch": 0.35588742721587235, + "grad_norm": 0.7904139161109924, + "learning_rate": 9.811241818858659e-06, + "loss": 0.8672, + "step": 6601 + }, + { + "epoch": 0.35594134138451583, + "grad_norm": 0.7034594416618347, + "learning_rate": 9.811184112192217e-06, + "loss": 0.7041, + "step": 6602 + }, + { + "epoch": 0.35599525555315936, + "grad_norm": 0.7331507802009583, + "learning_rate": 9.811126396875916e-06, + "loss": 0.7938, + "step": 6603 + }, + { + "epoch": 0.3560491697218029, + "grad_norm": 1.2808091640472412, + "learning_rate": 9.811068672909859e-06, + "loss": 0.8071, + "step": 6604 + }, + { + "epoch": 0.3561030838904464, + "grad_norm": 0.7333995699882507, + "learning_rate": 9.811010940294148e-06, + "loss": 0.7391, + "step": 6605 + }, + { + "epoch": 0.3561569980590899, + "grad_norm": 1.101427435874939, + "learning_rate": 9.810953199028888e-06, + "loss": 0.8133, + "step": 6606 + }, + { + "epoch": 0.35621091222773343, + "grad_norm": 0.9040619730949402, + "learning_rate": 9.810895449114185e-06, + "loss": 0.7745, + "step": 6607 + }, + { + "epoch": 0.35626482639637697, + "grad_norm": 0.8693681359291077, + "learning_rate": 9.81083769055014e-06, + "loss": 0.7925, + "step": 6608 + }, + { + "epoch": 0.3563187405650205, + "grad_norm": 0.7265763282775879, + "learning_rate": 9.810779923336857e-06, + "loss": 0.7465, + "step": 6609 + }, + { + "epoch": 0.35637265473366403, + "grad_norm": 0.7850440740585327, + "learning_rate": 9.810722147474441e-06, + "loss": 0.7184, + "step": 6610 + }, + { + "epoch": 0.3564265689023075, + "grad_norm": 0.7340103387832642, + "learning_rate": 9.810664362962993e-06, + "loss": 0.7494, + "step": 6611 + }, + { + "epoch": 0.35648048307095104, + "grad_norm": 0.7261168360710144, + "learning_rate": 9.810606569802623e-06, + "loss": 0.7964, + "step": 6612 + }, + { + "epoch": 0.35653439723959457, + "grad_norm": 
0.778197705745697, + "learning_rate": 9.810548767993428e-06, + "loss": 0.8634, + "step": 6613 + }, + { + "epoch": 0.3565883114082381, + "grad_norm": 0.6873172521591187, + "learning_rate": 9.810490957535518e-06, + "loss": 0.7457, + "step": 6614 + }, + { + "epoch": 0.3566422255768816, + "grad_norm": 0.7205674052238464, + "learning_rate": 9.810433138428992e-06, + "loss": 0.7412, + "step": 6615 + }, + { + "epoch": 0.3566961397455251, + "grad_norm": 0.6810207366943359, + "learning_rate": 9.810375310673958e-06, + "loss": 0.7463, + "step": 6616 + }, + { + "epoch": 0.35675005391416864, + "grad_norm": 0.8870421648025513, + "learning_rate": 9.810317474270516e-06, + "loss": 0.7757, + "step": 6617 + }, + { + "epoch": 0.3568039680828122, + "grad_norm": 0.7103554010391235, + "learning_rate": 9.810259629218773e-06, + "loss": 0.6719, + "step": 6618 + }, + { + "epoch": 0.3568578822514557, + "grad_norm": 0.7561456561088562, + "learning_rate": 9.810201775518832e-06, + "loss": 0.7192, + "step": 6619 + }, + { + "epoch": 0.3569117964200992, + "grad_norm": 0.7545101642608643, + "learning_rate": 9.810143913170796e-06, + "loss": 0.7433, + "step": 6620 + }, + { + "epoch": 0.3569657105887427, + "grad_norm": 0.652327835559845, + "learning_rate": 9.810086042174771e-06, + "loss": 0.7311, + "step": 6621 + }, + { + "epoch": 0.35701962475738624, + "grad_norm": 0.789463460445404, + "learning_rate": 9.81002816253086e-06, + "loss": 0.7556, + "step": 6622 + }, + { + "epoch": 0.3570735389260298, + "grad_norm": 0.7785344123840332, + "learning_rate": 9.809970274239167e-06, + "loss": 0.7571, + "step": 6623 + }, + { + "epoch": 0.35712745309467325, + "grad_norm": 0.7134326100349426, + "learning_rate": 9.809912377299795e-06, + "loss": 0.8414, + "step": 6624 + }, + { + "epoch": 0.3571813672633168, + "grad_norm": 0.8180043697357178, + "learning_rate": 9.809854471712851e-06, + "loss": 0.7321, + "step": 6625 + }, + { + "epoch": 0.3572352814319603, + "grad_norm": 0.7174314260482788, + "learning_rate": 
9.809796557478437e-06, + "loss": 0.7429, + "step": 6626 + }, + { + "epoch": 0.35728919560060385, + "grad_norm": 0.7083105444908142, + "learning_rate": 9.809738634596656e-06, + "loss": 0.764, + "step": 6627 + }, + { + "epoch": 0.3573431097692474, + "grad_norm": 0.808198869228363, + "learning_rate": 9.809680703067616e-06, + "loss": 0.8322, + "step": 6628 + }, + { + "epoch": 0.35739702393789086, + "grad_norm": 0.7731208801269531, + "learning_rate": 9.809622762891417e-06, + "loss": 0.8247, + "step": 6629 + }, + { + "epoch": 0.3574509381065344, + "grad_norm": 0.8366483449935913, + "learning_rate": 9.809564814068165e-06, + "loss": 0.7537, + "step": 6630 + }, + { + "epoch": 0.3575048522751779, + "grad_norm": 0.7587538361549377, + "learning_rate": 9.809506856597965e-06, + "loss": 0.7755, + "step": 6631 + }, + { + "epoch": 0.35755876644382145, + "grad_norm": 0.7224121689796448, + "learning_rate": 9.809448890480919e-06, + "loss": 0.708, + "step": 6632 + }, + { + "epoch": 0.35761268061246493, + "grad_norm": 0.8257870674133301, + "learning_rate": 9.809390915717133e-06, + "loss": 0.8425, + "step": 6633 + }, + { + "epoch": 0.35766659478110846, + "grad_norm": 0.7082727551460266, + "learning_rate": 9.809332932306712e-06, + "loss": 0.6904, + "step": 6634 + }, + { + "epoch": 0.357720508949752, + "grad_norm": 0.8497577905654907, + "learning_rate": 9.809274940249758e-06, + "loss": 0.6639, + "step": 6635 + }, + { + "epoch": 0.3577744231183955, + "grad_norm": 0.7666820883750916, + "learning_rate": 9.809216939546378e-06, + "loss": 0.8179, + "step": 6636 + }, + { + "epoch": 0.35782833728703906, + "grad_norm": 0.7550464868545532, + "learning_rate": 9.80915893019667e-06, + "loss": 0.7934, + "step": 6637 + }, + { + "epoch": 0.35788225145568253, + "grad_norm": 0.9429128766059875, + "learning_rate": 9.809100912200749e-06, + "loss": 0.8682, + "step": 6638 + }, + { + "epoch": 0.35793616562432606, + "grad_norm": 0.7584381103515625, + "learning_rate": 9.809042885558708e-06, + "loss": 0.7063, + 
"step": 6639 + }, + { + "epoch": 0.3579900797929696, + "grad_norm": 0.7465893626213074, + "learning_rate": 9.80898485027066e-06, + "loss": 0.7769, + "step": 6640 + }, + { + "epoch": 0.3580439939616131, + "grad_norm": 0.834865152835846, + "learning_rate": 9.808926806336703e-06, + "loss": 0.7675, + "step": 6641 + }, + { + "epoch": 0.35809790813025666, + "grad_norm": 0.6598499417304993, + "learning_rate": 9.808868753756947e-06, + "loss": 0.6834, + "step": 6642 + }, + { + "epoch": 0.35815182229890014, + "grad_norm": 0.9130172729492188, + "learning_rate": 9.808810692531492e-06, + "loss": 0.8214, + "step": 6643 + }, + { + "epoch": 0.35820573646754367, + "grad_norm": 1.0007606744766235, + "learning_rate": 9.808752622660444e-06, + "loss": 0.8704, + "step": 6644 + }, + { + "epoch": 0.3582596506361872, + "grad_norm": 0.7705127000808716, + "learning_rate": 9.808694544143908e-06, + "loss": 0.7484, + "step": 6645 + }, + { + "epoch": 0.35831356480483073, + "grad_norm": 0.7216114401817322, + "learning_rate": 9.808636456981986e-06, + "loss": 0.7798, + "step": 6646 + }, + { + "epoch": 0.3583674789734742, + "grad_norm": 0.7486965656280518, + "learning_rate": 9.808578361174785e-06, + "loss": 0.7766, + "step": 6647 + }, + { + "epoch": 0.35842139314211774, + "grad_norm": 0.7999522686004639, + "learning_rate": 9.808520256722409e-06, + "loss": 0.7564, + "step": 6648 + }, + { + "epoch": 0.35847530731076127, + "grad_norm": 0.7279207110404968, + "learning_rate": 9.808462143624964e-06, + "loss": 0.7611, + "step": 6649 + }, + { + "epoch": 0.3585292214794048, + "grad_norm": 0.8348183035850525, + "learning_rate": 9.808404021882549e-06, + "loss": 0.6852, + "step": 6650 + }, + { + "epoch": 0.35858313564804833, + "grad_norm": 0.8778795003890991, + "learning_rate": 9.808345891495274e-06, + "loss": 0.7929, + "step": 6651 + }, + { + "epoch": 0.3586370498166918, + "grad_norm": 0.7209139466285706, + "learning_rate": 9.808287752463242e-06, + "loss": 0.6688, + "step": 6652 + }, + { + "epoch": 
0.35869096398533534, + "grad_norm": 0.7195870280265808, + "learning_rate": 9.808229604786557e-06, + "loss": 0.7002, + "step": 6653 + }, + { + "epoch": 0.3587448781539789, + "grad_norm": 0.709906816482544, + "learning_rate": 9.808171448465322e-06, + "loss": 0.73, + "step": 6654 + }, + { + "epoch": 0.3587987923226224, + "grad_norm": 0.8573802709579468, + "learning_rate": 9.808113283499645e-06, + "loss": 0.7699, + "step": 6655 + }, + { + "epoch": 0.3588527064912659, + "grad_norm": 0.9519971013069153, + "learning_rate": 9.808055109889627e-06, + "loss": 0.7054, + "step": 6656 + }, + { + "epoch": 0.3589066206599094, + "grad_norm": 0.70480877161026, + "learning_rate": 9.807996927635377e-06, + "loss": 0.7953, + "step": 6657 + }, + { + "epoch": 0.35896053482855295, + "grad_norm": 0.7647779583930969, + "learning_rate": 9.807938736736995e-06, + "loss": 0.7807, + "step": 6658 + }, + { + "epoch": 0.3590144489971965, + "grad_norm": 0.7367976903915405, + "learning_rate": 9.807880537194587e-06, + "loss": 0.8175, + "step": 6659 + }, + { + "epoch": 0.35906836316584, + "grad_norm": 0.700531542301178, + "learning_rate": 9.80782232900826e-06, + "loss": 0.8433, + "step": 6660 + }, + { + "epoch": 0.3591222773344835, + "grad_norm": 0.7186942100524902, + "learning_rate": 9.807764112178117e-06, + "loss": 0.7228, + "step": 6661 + }, + { + "epoch": 0.359176191503127, + "grad_norm": 1.3057339191436768, + "learning_rate": 9.807705886704262e-06, + "loss": 0.714, + "step": 6662 + }, + { + "epoch": 0.35923010567177055, + "grad_norm": 0.9423549771308899, + "learning_rate": 9.8076476525868e-06, + "loss": 0.7907, + "step": 6663 + }, + { + "epoch": 0.3592840198404141, + "grad_norm": 0.9193512797355652, + "learning_rate": 9.807589409825835e-06, + "loss": 0.8062, + "step": 6664 + }, + { + "epoch": 0.35933793400905756, + "grad_norm": 0.7547391057014465, + "learning_rate": 9.807531158421474e-06, + "loss": 0.7393, + "step": 6665 + }, + { + "epoch": 0.3593918481777011, + "grad_norm": 0.7634474039077759, + 
"learning_rate": 9.80747289837382e-06, + "loss": 0.6765, + "step": 6666 + }, + { + "epoch": 0.3594457623463446, + "grad_norm": 0.7682304978370667, + "learning_rate": 9.807414629682977e-06, + "loss": 0.716, + "step": 6667 + }, + { + "epoch": 0.35949967651498815, + "grad_norm": 0.8408803343772888, + "learning_rate": 9.807356352349053e-06, + "loss": 0.9078, + "step": 6668 + }, + { + "epoch": 0.3595535906836317, + "grad_norm": 0.8683525323867798, + "learning_rate": 9.807298066372148e-06, + "loss": 0.7891, + "step": 6669 + }, + { + "epoch": 0.35960750485227516, + "grad_norm": 0.6413837671279907, + "learning_rate": 9.807239771752372e-06, + "loss": 0.5898, + "step": 6670 + }, + { + "epoch": 0.3596614190209187, + "grad_norm": 0.8290123343467712, + "learning_rate": 9.807181468489826e-06, + "loss": 0.7629, + "step": 6671 + }, + { + "epoch": 0.3597153331895622, + "grad_norm": 0.8880342841148376, + "learning_rate": 9.807123156584617e-06, + "loss": 0.8109, + "step": 6672 + }, + { + "epoch": 0.35976924735820576, + "grad_norm": 0.7290730476379395, + "learning_rate": 9.807064836036848e-06, + "loss": 0.6863, + "step": 6673 + }, + { + "epoch": 0.35982316152684923, + "grad_norm": 0.8520159125328064, + "learning_rate": 9.807006506846625e-06, + "loss": 0.7541, + "step": 6674 + }, + { + "epoch": 0.35987707569549277, + "grad_norm": 0.7564545273780823, + "learning_rate": 9.806948169014053e-06, + "loss": 0.8383, + "step": 6675 + }, + { + "epoch": 0.3599309898641363, + "grad_norm": 0.8367358446121216, + "learning_rate": 9.806889822539235e-06, + "loss": 0.939, + "step": 6676 + }, + { + "epoch": 0.35998490403277983, + "grad_norm": 0.8862128853797913, + "learning_rate": 9.80683146742228e-06, + "loss": 0.7679, + "step": 6677 + }, + { + "epoch": 0.36003881820142336, + "grad_norm": 0.7538912296295166, + "learning_rate": 9.806773103663287e-06, + "loss": 0.7735, + "step": 6678 + }, + { + "epoch": 0.36009273237006684, + "grad_norm": 0.8559550046920776, + "learning_rate": 9.806714731262368e-06, + 
"loss": 0.8092, + "step": 6679 + }, + { + "epoch": 0.36014664653871037, + "grad_norm": 0.8111431002616882, + "learning_rate": 9.806656350219621e-06, + "loss": 0.7776, + "step": 6680 + }, + { + "epoch": 0.3602005607073539, + "grad_norm": 0.8416717052459717, + "learning_rate": 9.806597960535156e-06, + "loss": 0.7725, + "step": 6681 + }, + { + "epoch": 0.36025447487599743, + "grad_norm": 0.6742260456085205, + "learning_rate": 9.806539562209076e-06, + "loss": 0.6623, + "step": 6682 + }, + { + "epoch": 0.3603083890446409, + "grad_norm": 0.7117223143577576, + "learning_rate": 9.806481155241484e-06, + "loss": 0.7049, + "step": 6683 + }, + { + "epoch": 0.36036230321328444, + "grad_norm": 0.7776613235473633, + "learning_rate": 9.80642273963249e-06, + "loss": 0.8345, + "step": 6684 + }, + { + "epoch": 0.360416217381928, + "grad_norm": 0.7228115797042847, + "learning_rate": 9.806364315382196e-06, + "loss": 0.6476, + "step": 6685 + }, + { + "epoch": 0.3604701315505715, + "grad_norm": 0.9165751934051514, + "learning_rate": 9.806305882490705e-06, + "loss": 0.8387, + "step": 6686 + }, + { + "epoch": 0.36052404571921504, + "grad_norm": 0.6338438391685486, + "learning_rate": 9.806247440958125e-06, + "loss": 0.6986, + "step": 6687 + }, + { + "epoch": 0.3605779598878585, + "grad_norm": 0.7140188217163086, + "learning_rate": 9.80618899078456e-06, + "loss": 0.6949, + "step": 6688 + }, + { + "epoch": 0.36063187405650204, + "grad_norm": 0.8785191774368286, + "learning_rate": 9.806130531970118e-06, + "loss": 0.8812, + "step": 6689 + }, + { + "epoch": 0.3606857882251456, + "grad_norm": 0.6468793749809265, + "learning_rate": 9.806072064514899e-06, + "loss": 0.6855, + "step": 6690 + }, + { + "epoch": 0.3607397023937891, + "grad_norm": 0.8219329714775085, + "learning_rate": 9.80601358841901e-06, + "loss": 0.922, + "step": 6691 + }, + { + "epoch": 0.3607936165624326, + "grad_norm": 0.7951709032058716, + "learning_rate": 9.80595510368256e-06, + "loss": 0.7265, + "step": 6692 + }, + { + "epoch": 
0.3608475307310761, + "grad_norm": 0.863088846206665, + "learning_rate": 9.805896610305649e-06, + "loss": 0.8505, + "step": 6693 + }, + { + "epoch": 0.36090144489971965, + "grad_norm": 0.8640897870063782, + "learning_rate": 9.805838108288384e-06, + "loss": 0.7652, + "step": 6694 + }, + { + "epoch": 0.3609553590683632, + "grad_norm": 0.8764123320579529, + "learning_rate": 9.805779597630868e-06, + "loss": 0.7574, + "step": 6695 + }, + { + "epoch": 0.3610092732370067, + "grad_norm": 0.7477309107780457, + "learning_rate": 9.805721078333213e-06, + "loss": 0.7466, + "step": 6696 + }, + { + "epoch": 0.3610631874056502, + "grad_norm": 0.7052176594734192, + "learning_rate": 9.805662550395517e-06, + "loss": 0.7885, + "step": 6697 + }, + { + "epoch": 0.3611171015742937, + "grad_norm": 0.7080042362213135, + "learning_rate": 9.805604013817888e-06, + "loss": 0.7044, + "step": 6698 + }, + { + "epoch": 0.36117101574293725, + "grad_norm": 0.7068271636962891, + "learning_rate": 9.80554546860043e-06, + "loss": 0.7572, + "step": 6699 + }, + { + "epoch": 0.3612249299115808, + "grad_norm": 0.8425297141075134, + "learning_rate": 9.805486914743251e-06, + "loss": 0.7911, + "step": 6700 + }, + { + "epoch": 0.36127884408022426, + "grad_norm": 1.0098155736923218, + "learning_rate": 9.805428352246455e-06, + "loss": 0.8813, + "step": 6701 + }, + { + "epoch": 0.3613327582488678, + "grad_norm": 0.880401074886322, + "learning_rate": 9.805369781110146e-06, + "loss": 0.7312, + "step": 6702 + }, + { + "epoch": 0.3613866724175113, + "grad_norm": 0.7234327793121338, + "learning_rate": 9.805311201334432e-06, + "loss": 0.7012, + "step": 6703 + }, + { + "epoch": 0.36144058658615486, + "grad_norm": 0.8709098100662231, + "learning_rate": 9.805252612919413e-06, + "loss": 0.7855, + "step": 6704 + }, + { + "epoch": 0.3614945007547984, + "grad_norm": 0.7029942870140076, + "learning_rate": 9.8051940158652e-06, + "loss": 0.731, + "step": 6705 + }, + { + "epoch": 0.36154841492344186, + "grad_norm": 
0.9543294310569763, + "learning_rate": 9.805135410171898e-06, + "loss": 0.8143, + "step": 6706 + }, + { + "epoch": 0.3616023290920854, + "grad_norm": 0.6950708031654358, + "learning_rate": 9.805076795839607e-06, + "loss": 0.6888, + "step": 6707 + }, + { + "epoch": 0.3616562432607289, + "grad_norm": 0.81794273853302, + "learning_rate": 9.80501817286844e-06, + "loss": 0.8678, + "step": 6708 + }, + { + "epoch": 0.36171015742937246, + "grad_norm": 0.8532668948173523, + "learning_rate": 9.804959541258495e-06, + "loss": 0.7957, + "step": 6709 + }, + { + "epoch": 0.36176407159801593, + "grad_norm": 0.7973982095718384, + "learning_rate": 9.804900901009881e-06, + "loss": 0.7009, + "step": 6710 + }, + { + "epoch": 0.36181798576665947, + "grad_norm": 0.7889378666877747, + "learning_rate": 9.804842252122706e-06, + "loss": 0.6762, + "step": 6711 + }, + { + "epoch": 0.361871899935303, + "grad_norm": 0.7479483485221863, + "learning_rate": 9.80478359459707e-06, + "loss": 0.7105, + "step": 6712 + }, + { + "epoch": 0.36192581410394653, + "grad_norm": 0.7120743989944458, + "learning_rate": 9.804724928433083e-06, + "loss": 0.7287, + "step": 6713 + }, + { + "epoch": 0.36197972827259006, + "grad_norm": 0.7907229065895081, + "learning_rate": 9.804666253630846e-06, + "loss": 0.7967, + "step": 6714 + }, + { + "epoch": 0.36203364244123354, + "grad_norm": 0.6709898710250854, + "learning_rate": 9.80460757019047e-06, + "loss": 0.7035, + "step": 6715 + }, + { + "epoch": 0.36208755660987707, + "grad_norm": 0.7738235592842102, + "learning_rate": 9.804548878112055e-06, + "loss": 0.7682, + "step": 6716 + }, + { + "epoch": 0.3621414707785206, + "grad_norm": 0.7634631991386414, + "learning_rate": 9.804490177395711e-06, + "loss": 0.7391, + "step": 6717 + }, + { + "epoch": 0.36219538494716413, + "grad_norm": 0.8247049450874329, + "learning_rate": 9.80443146804154e-06, + "loss": 0.7531, + "step": 6718 + }, + { + "epoch": 0.3622492991158076, + "grad_norm": 0.7538613677024841, + "learning_rate": 
9.80437275004965e-06, + "loss": 0.7635, + "step": 6719 + }, + { + "epoch": 0.36230321328445114, + "grad_norm": 0.6676668524742126, + "learning_rate": 9.804314023420148e-06, + "loss": 0.7272, + "step": 6720 + }, + { + "epoch": 0.3623571274530947, + "grad_norm": 0.8900763988494873, + "learning_rate": 9.804255288153133e-06, + "loss": 0.7631, + "step": 6721 + }, + { + "epoch": 0.3624110416217382, + "grad_norm": 0.7756088972091675, + "learning_rate": 9.804196544248719e-06, + "loss": 0.81, + "step": 6722 + }, + { + "epoch": 0.36246495579038174, + "grad_norm": 0.6948640942573547, + "learning_rate": 9.804137791707004e-06, + "loss": 0.693, + "step": 6723 + }, + { + "epoch": 0.3625188699590252, + "grad_norm": 0.7438804507255554, + "learning_rate": 9.8040790305281e-06, + "loss": 0.7986, + "step": 6724 + }, + { + "epoch": 0.36257278412766875, + "grad_norm": 0.7065889835357666, + "learning_rate": 9.804020260712109e-06, + "loss": 0.6949, + "step": 6725 + }, + { + "epoch": 0.3626266982963123, + "grad_norm": 0.8189884424209595, + "learning_rate": 9.803961482259136e-06, + "loss": 0.7448, + "step": 6726 + }, + { + "epoch": 0.3626806124649558, + "grad_norm": 0.8585190176963806, + "learning_rate": 9.80390269516929e-06, + "loss": 0.8617, + "step": 6727 + }, + { + "epoch": 0.3627345266335993, + "grad_norm": 0.911480188369751, + "learning_rate": 9.803843899442674e-06, + "loss": 0.7426, + "step": 6728 + }, + { + "epoch": 0.3627884408022428, + "grad_norm": 0.8193525075912476, + "learning_rate": 9.803785095079396e-06, + "loss": 0.811, + "step": 6729 + }, + { + "epoch": 0.36284235497088635, + "grad_norm": 0.8435998558998108, + "learning_rate": 9.80372628207956e-06, + "loss": 0.7132, + "step": 6730 + }, + { + "epoch": 0.3628962691395299, + "grad_norm": 0.7402483224868774, + "learning_rate": 9.803667460443268e-06, + "loss": 0.8113, + "step": 6731 + }, + { + "epoch": 0.3629501833081734, + "grad_norm": 0.9357480406761169, + "learning_rate": 9.803608630170635e-06, + "loss": 0.8954, + "step": 6732 
+ }, + { + "epoch": 0.3630040974768169, + "grad_norm": 0.845879852771759, + "learning_rate": 9.80354979126176e-06, + "loss": 0.8002, + "step": 6733 + }, + { + "epoch": 0.3630580116454604, + "grad_norm": 0.7685959339141846, + "learning_rate": 9.803490943716748e-06, + "loss": 0.7198, + "step": 6734 + }, + { + "epoch": 0.36311192581410395, + "grad_norm": 0.7155508995056152, + "learning_rate": 9.80343208753571e-06, + "loss": 0.7235, + "step": 6735 + }, + { + "epoch": 0.3631658399827475, + "grad_norm": 0.7680899500846863, + "learning_rate": 9.803373222718748e-06, + "loss": 0.7864, + "step": 6736 + }, + { + "epoch": 0.36321975415139096, + "grad_norm": 0.6764048337936401, + "learning_rate": 9.803314349265968e-06, + "loss": 0.6931, + "step": 6737 + }, + { + "epoch": 0.3632736683200345, + "grad_norm": 0.6954395771026611, + "learning_rate": 9.803255467177476e-06, + "loss": 0.6097, + "step": 6738 + }, + { + "epoch": 0.363327582488678, + "grad_norm": 0.6562077403068542, + "learning_rate": 9.803196576453382e-06, + "loss": 0.7381, + "step": 6739 + }, + { + "epoch": 0.36338149665732156, + "grad_norm": 0.6876485347747803, + "learning_rate": 9.803137677093786e-06, + "loss": 0.723, + "step": 6740 + }, + { + "epoch": 0.3634354108259651, + "grad_norm": 0.7708773016929626, + "learning_rate": 9.803078769098797e-06, + "loss": 0.7602, + "step": 6741 + }, + { + "epoch": 0.36348932499460856, + "grad_norm": 1.038544774055481, + "learning_rate": 9.803019852468518e-06, + "loss": 0.935, + "step": 6742 + }, + { + "epoch": 0.3635432391632521, + "grad_norm": 0.7528790831565857, + "learning_rate": 9.80296092720306e-06, + "loss": 0.821, + "step": 6743 + }, + { + "epoch": 0.36359715333189563, + "grad_norm": 0.651340126991272, + "learning_rate": 9.802901993302525e-06, + "loss": 0.6782, + "step": 6744 + }, + { + "epoch": 0.36365106750053916, + "grad_norm": 0.762054979801178, + "learning_rate": 9.80284305076702e-06, + "loss": 0.824, + "step": 6745 + }, + { + "epoch": 0.36370498166918264, + "grad_norm": 
0.7798411846160889, + "learning_rate": 9.80278409959665e-06, + "loss": 0.8258, + "step": 6746 + }, + { + "epoch": 0.36375889583782617, + "grad_norm": 0.7562406063079834, + "learning_rate": 9.802725139791523e-06, + "loss": 0.6716, + "step": 6747 + }, + { + "epoch": 0.3638128100064697, + "grad_norm": 0.7402334213256836, + "learning_rate": 9.802666171351742e-06, + "loss": 0.7661, + "step": 6748 + }, + { + "epoch": 0.36386672417511323, + "grad_norm": 0.8259956240653992, + "learning_rate": 9.802607194277417e-06, + "loss": 0.8572, + "step": 6749 + }, + { + "epoch": 0.36392063834375676, + "grad_norm": 1.0304851531982422, + "learning_rate": 9.802548208568652e-06, + "loss": 0.6969, + "step": 6750 + }, + { + "epoch": 0.36397455251240024, + "grad_norm": 0.6883974075317383, + "learning_rate": 9.802489214225552e-06, + "loss": 0.7219, + "step": 6751 + }, + { + "epoch": 0.36402846668104377, + "grad_norm": 0.6879093647003174, + "learning_rate": 9.802430211248225e-06, + "loss": 0.7, + "step": 6752 + }, + { + "epoch": 0.3640823808496873, + "grad_norm": 0.7767845988273621, + "learning_rate": 9.802371199636775e-06, + "loss": 0.7676, + "step": 6753 + }, + { + "epoch": 0.36413629501833084, + "grad_norm": 0.7344959378242493, + "learning_rate": 9.802312179391311e-06, + "loss": 0.7944, + "step": 6754 + }, + { + "epoch": 0.3641902091869743, + "grad_norm": 0.7712574005126953, + "learning_rate": 9.802253150511936e-06, + "loss": 0.7423, + "step": 6755 + }, + { + "epoch": 0.36424412335561784, + "grad_norm": 0.6754806637763977, + "learning_rate": 9.802194112998757e-06, + "loss": 0.6918, + "step": 6756 + }, + { + "epoch": 0.3642980375242614, + "grad_norm": 0.7157537937164307, + "learning_rate": 9.802135066851884e-06, + "loss": 0.6972, + "step": 6757 + }, + { + "epoch": 0.3643519516929049, + "grad_norm": 0.8084623217582703, + "learning_rate": 9.802076012071415e-06, + "loss": 0.8669, + "step": 6758 + }, + { + "epoch": 0.36440586586154844, + "grad_norm": 0.7006164193153381, + "learning_rate": 
9.802016948657464e-06, + "loss": 0.7662, + "step": 6759 + }, + { + "epoch": 0.3644597800301919, + "grad_norm": 0.7667980790138245, + "learning_rate": 9.801957876610134e-06, + "loss": 0.7805, + "step": 6760 + }, + { + "epoch": 0.36451369419883545, + "grad_norm": 1.0004878044128418, + "learning_rate": 9.801898795929532e-06, + "loss": 0.7942, + "step": 6761 + }, + { + "epoch": 0.364567608367479, + "grad_norm": 0.778765082359314, + "learning_rate": 9.80183970661576e-06, + "loss": 0.7959, + "step": 6762 + }, + { + "epoch": 0.3646215225361225, + "grad_norm": 0.7090725302696228, + "learning_rate": 9.801780608668931e-06, + "loss": 0.8208, + "step": 6763 + }, + { + "epoch": 0.364675436704766, + "grad_norm": 0.6817572712898254, + "learning_rate": 9.80172150208915e-06, + "loss": 0.7114, + "step": 6764 + }, + { + "epoch": 0.3647293508734095, + "grad_norm": 0.798346757888794, + "learning_rate": 9.801662386876518e-06, + "loss": 0.8291, + "step": 6765 + }, + { + "epoch": 0.36478326504205305, + "grad_norm": 0.7163029313087463, + "learning_rate": 9.801603263031145e-06, + "loss": 0.7279, + "step": 6766 + }, + { + "epoch": 0.3648371792106966, + "grad_norm": 0.7624267935752869, + "learning_rate": 9.801544130553139e-06, + "loss": 0.8105, + "step": 6767 + }, + { + "epoch": 0.3648910933793401, + "grad_norm": 0.7381821274757385, + "learning_rate": 9.801484989442602e-06, + "loss": 0.7986, + "step": 6768 + }, + { + "epoch": 0.3649450075479836, + "grad_norm": 0.7259721159934998, + "learning_rate": 9.801425839699642e-06, + "loss": 0.7807, + "step": 6769 + }, + { + "epoch": 0.3649989217166271, + "grad_norm": 0.8212186694145203, + "learning_rate": 9.801366681324367e-06, + "loss": 0.777, + "step": 6770 + }, + { + "epoch": 0.36505283588527065, + "grad_norm": 0.6906125545501709, + "learning_rate": 9.801307514316882e-06, + "loss": 0.6944, + "step": 6771 + }, + { + "epoch": 0.3651067500539142, + "grad_norm": 0.7668260931968689, + "learning_rate": 9.801248338677294e-06, + "loss": 0.8669, + "step": 
6772 + }, + { + "epoch": 0.36516066422255766, + "grad_norm": 0.8373915553092957, + "learning_rate": 9.80118915440571e-06, + "loss": 0.6735, + "step": 6773 + }, + { + "epoch": 0.3652145783912012, + "grad_norm": 0.7345216870307922, + "learning_rate": 9.801129961502234e-06, + "loss": 0.6516, + "step": 6774 + }, + { + "epoch": 0.3652684925598447, + "grad_norm": 0.7207350730895996, + "learning_rate": 9.801070759966976e-06, + "loss": 0.7419, + "step": 6775 + }, + { + "epoch": 0.36532240672848826, + "grad_norm": 0.697950005531311, + "learning_rate": 9.801011549800036e-06, + "loss": 0.7194, + "step": 6776 + }, + { + "epoch": 0.3653763208971318, + "grad_norm": 0.6933177709579468, + "learning_rate": 9.800952331001528e-06, + "loss": 0.7838, + "step": 6777 + }, + { + "epoch": 0.36543023506577527, + "grad_norm": 0.730925977230072, + "learning_rate": 9.800893103571556e-06, + "loss": 0.7028, + "step": 6778 + }, + { + "epoch": 0.3654841492344188, + "grad_norm": 0.6382230520248413, + "learning_rate": 9.800833867510224e-06, + "loss": 0.7089, + "step": 6779 + }, + { + "epoch": 0.36553806340306233, + "grad_norm": 0.7513541579246521, + "learning_rate": 9.80077462281764e-06, + "loss": 0.8282, + "step": 6780 + }, + { + "epoch": 0.36559197757170586, + "grad_norm": 0.7457998394966125, + "learning_rate": 9.800715369493912e-06, + "loss": 0.7631, + "step": 6781 + }, + { + "epoch": 0.36564589174034934, + "grad_norm": 0.7229024171829224, + "learning_rate": 9.800656107539144e-06, + "loss": 0.7672, + "step": 6782 + }, + { + "epoch": 0.36569980590899287, + "grad_norm": 0.6779196858406067, + "learning_rate": 9.800596836953445e-06, + "loss": 0.7193, + "step": 6783 + }, + { + "epoch": 0.3657537200776364, + "grad_norm": 0.7451487183570862, + "learning_rate": 9.800537557736918e-06, + "loss": 0.7206, + "step": 6784 + }, + { + "epoch": 0.36580763424627993, + "grad_norm": 0.762353241443634, + "learning_rate": 9.800478269889675e-06, + "loss": 0.8719, + "step": 6785 + }, + { + "epoch": 0.36586154841492347, 
+ "grad_norm": 0.6893555521965027, + "learning_rate": 9.800418973411818e-06, + "loss": 0.7471, + "step": 6786 + }, + { + "epoch": 0.36591546258356694, + "grad_norm": 0.6815069913864136, + "learning_rate": 9.800359668303454e-06, + "loss": 0.7639, + "step": 6787 + }, + { + "epoch": 0.3659693767522105, + "grad_norm": 0.854435384273529, + "learning_rate": 9.800300354564692e-06, + "loss": 0.7386, + "step": 6788 + }, + { + "epoch": 0.366023290920854, + "grad_norm": 0.78327476978302, + "learning_rate": 9.800241032195638e-06, + "loss": 0.7488, + "step": 6789 + }, + { + "epoch": 0.36607720508949754, + "grad_norm": 0.8019092082977295, + "learning_rate": 9.800181701196396e-06, + "loss": 0.8677, + "step": 6790 + }, + { + "epoch": 0.366131119258141, + "grad_norm": 0.7387152314186096, + "learning_rate": 9.800122361567077e-06, + "loss": 0.7428, + "step": 6791 + }, + { + "epoch": 0.36618503342678455, + "grad_norm": 0.6695654392242432, + "learning_rate": 9.800063013307784e-06, + "loss": 0.6293, + "step": 6792 + }, + { + "epoch": 0.3662389475954281, + "grad_norm": 0.7722582221031189, + "learning_rate": 9.800003656418626e-06, + "loss": 0.7132, + "step": 6793 + }, + { + "epoch": 0.3662928617640716, + "grad_norm": 0.7223978042602539, + "learning_rate": 9.799944290899708e-06, + "loss": 0.8326, + "step": 6794 + }, + { + "epoch": 0.36634677593271514, + "grad_norm": 0.8613165020942688, + "learning_rate": 9.799884916751139e-06, + "loss": 0.8471, + "step": 6795 + }, + { + "epoch": 0.3664006901013586, + "grad_norm": 0.74866783618927, + "learning_rate": 9.799825533973021e-06, + "loss": 0.7485, + "step": 6796 + }, + { + "epoch": 0.36645460427000215, + "grad_norm": 0.7431685924530029, + "learning_rate": 9.799766142565468e-06, + "loss": 0.8062, + "step": 6797 + }, + { + "epoch": 0.3665085184386457, + "grad_norm": 0.9503677487373352, + "learning_rate": 9.79970674252858e-06, + "loss": 0.8372, + "step": 6798 + }, + { + "epoch": 0.3665624326072892, + "grad_norm": 0.8219765424728394, + 
"learning_rate": 9.799647333862468e-06, + "loss": 0.8167, + "step": 6799 + }, + { + "epoch": 0.3666163467759327, + "grad_norm": 0.8645057678222656, + "learning_rate": 9.799587916567237e-06, + "loss": 0.7166, + "step": 6800 + }, + { + "epoch": 0.3666702609445762, + "grad_norm": 0.7951784729957581, + "learning_rate": 9.799528490642993e-06, + "loss": 0.7416, + "step": 6801 + }, + { + "epoch": 0.36672417511321975, + "grad_norm": 0.7034339904785156, + "learning_rate": 9.799469056089846e-06, + "loss": 0.748, + "step": 6802 + }, + { + "epoch": 0.3667780892818633, + "grad_norm": 0.6551966071128845, + "learning_rate": 9.7994096129079e-06, + "loss": 0.6962, + "step": 6803 + }, + { + "epoch": 0.3668320034505068, + "grad_norm": 0.7568443417549133, + "learning_rate": 9.799350161097264e-06, + "loss": 0.7421, + "step": 6804 + }, + { + "epoch": 0.3668859176191503, + "grad_norm": 0.6819575428962708, + "learning_rate": 9.799290700658041e-06, + "loss": 0.7346, + "step": 6805 + }, + { + "epoch": 0.3669398317877938, + "grad_norm": 0.6802785992622375, + "learning_rate": 9.799231231590343e-06, + "loss": 0.6852, + "step": 6806 + }, + { + "epoch": 0.36699374595643736, + "grad_norm": 0.9512805342674255, + "learning_rate": 9.799171753894275e-06, + "loss": 0.6612, + "step": 6807 + }, + { + "epoch": 0.3670476601250809, + "grad_norm": 0.8041424751281738, + "learning_rate": 9.799112267569943e-06, + "loss": 0.8658, + "step": 6808 + }, + { + "epoch": 0.36710157429372436, + "grad_norm": 0.8485898375511169, + "learning_rate": 9.799052772617452e-06, + "loss": 0.8528, + "step": 6809 + }, + { + "epoch": 0.3671554884623679, + "grad_norm": 0.7696205377578735, + "learning_rate": 9.798993269036913e-06, + "loss": 0.6882, + "step": 6810 + }, + { + "epoch": 0.36720940263101143, + "grad_norm": 0.6709177494049072, + "learning_rate": 9.798933756828433e-06, + "loss": 0.7186, + "step": 6811 + }, + { + "epoch": 0.36726331679965496, + "grad_norm": 0.7012826800346375, + "learning_rate": 9.798874235992115e-06, + 
"loss": 0.7671, + "step": 6812 + }, + { + "epoch": 0.3673172309682985, + "grad_norm": 0.7008756399154663, + "learning_rate": 9.79881470652807e-06, + "loss": 0.7607, + "step": 6813 + }, + { + "epoch": 0.36737114513694197, + "grad_norm": 0.9492394924163818, + "learning_rate": 9.798755168436402e-06, + "loss": 0.846, + "step": 6814 + }, + { + "epoch": 0.3674250593055855, + "grad_norm": 0.6980317831039429, + "learning_rate": 9.798695621717221e-06, + "loss": 0.7472, + "step": 6815 + }, + { + "epoch": 0.36747897347422903, + "grad_norm": 0.7253183722496033, + "learning_rate": 9.79863606637063e-06, + "loss": 0.7287, + "step": 6816 + }, + { + "epoch": 0.36753288764287256, + "grad_norm": 0.7307718992233276, + "learning_rate": 9.798576502396741e-06, + "loss": 0.8166, + "step": 6817 + }, + { + "epoch": 0.36758680181151604, + "grad_norm": 0.7275675535202026, + "learning_rate": 9.798516929795656e-06, + "loss": 0.8208, + "step": 6818 + }, + { + "epoch": 0.36764071598015957, + "grad_norm": 0.7654398083686829, + "learning_rate": 9.798457348567487e-06, + "loss": 0.7794, + "step": 6819 + }, + { + "epoch": 0.3676946301488031, + "grad_norm": 0.7827500104904175, + "learning_rate": 9.798397758712338e-06, + "loss": 0.7793, + "step": 6820 + }, + { + "epoch": 0.36774854431744664, + "grad_norm": 0.794808566570282, + "learning_rate": 9.798338160230319e-06, + "loss": 0.8406, + "step": 6821 + }, + { + "epoch": 0.36780245848609017, + "grad_norm": 1.1487492322921753, + "learning_rate": 9.798278553121533e-06, + "loss": 0.8149, + "step": 6822 + }, + { + "epoch": 0.36785637265473364, + "grad_norm": 0.8011932373046875, + "learning_rate": 9.798218937386089e-06, + "loss": 0.7724, + "step": 6823 + }, + { + "epoch": 0.3679102868233772, + "grad_norm": 0.7093950510025024, + "learning_rate": 9.798159313024095e-06, + "loss": 0.7318, + "step": 6824 + }, + { + "epoch": 0.3679642009920207, + "grad_norm": 0.7179621458053589, + "learning_rate": 9.798099680035657e-06, + "loss": 0.7203, + "step": 6825 + }, + { + 
"epoch": 0.36801811516066424, + "grad_norm": 0.7556604743003845, + "learning_rate": 9.798040038420884e-06, + "loss": 0.7931, + "step": 6826 + }, + { + "epoch": 0.3680720293293077, + "grad_norm": 0.6847818493843079, + "learning_rate": 9.797980388179882e-06, + "loss": 0.655, + "step": 6827 + }, + { + "epoch": 0.36812594349795125, + "grad_norm": 0.7528108358383179, + "learning_rate": 9.797920729312758e-06, + "loss": 0.7067, + "step": 6828 + }, + { + "epoch": 0.3681798576665948, + "grad_norm": 0.7894001007080078, + "learning_rate": 9.79786106181962e-06, + "loss": 0.7867, + "step": 6829 + }, + { + "epoch": 0.3682337718352383, + "grad_norm": 0.7251179814338684, + "learning_rate": 9.797801385700575e-06, + "loss": 0.6972, + "step": 6830 + }, + { + "epoch": 0.36828768600388184, + "grad_norm": 0.7494256496429443, + "learning_rate": 9.79774170095573e-06, + "loss": 0.7847, + "step": 6831 + }, + { + "epoch": 0.3683416001725253, + "grad_norm": 0.7221285700798035, + "learning_rate": 9.797682007585192e-06, + "loss": 0.7277, + "step": 6832 + }, + { + "epoch": 0.36839551434116885, + "grad_norm": 0.8274533152580261, + "learning_rate": 9.79762230558907e-06, + "loss": 0.7224, + "step": 6833 + }, + { + "epoch": 0.3684494285098124, + "grad_norm": 0.6401543617248535, + "learning_rate": 9.797562594967469e-06, + "loss": 0.64, + "step": 6834 + }, + { + "epoch": 0.3685033426784559, + "grad_norm": 0.7221955060958862, + "learning_rate": 9.797502875720497e-06, + "loss": 0.7074, + "step": 6835 + }, + { + "epoch": 0.3685572568470994, + "grad_norm": 0.7283970713615417, + "learning_rate": 9.797443147848264e-06, + "loss": 0.7548, + "step": 6836 + }, + { + "epoch": 0.3686111710157429, + "grad_norm": 0.8405442833900452, + "learning_rate": 9.797383411350874e-06, + "loss": 0.7408, + "step": 6837 + }, + { + "epoch": 0.36866508518438645, + "grad_norm": 0.8040479421615601, + "learning_rate": 9.797323666228435e-06, + "loss": 0.7602, + "step": 6838 + }, + { + "epoch": 0.36871899935303, + "grad_norm": 
0.7866907119750977, + "learning_rate": 9.797263912481056e-06, + "loss": 0.7683, + "step": 6839 + }, + { + "epoch": 0.3687729135216735, + "grad_norm": 0.8023240566253662, + "learning_rate": 9.797204150108844e-06, + "loss": 0.8177, + "step": 6840 + }, + { + "epoch": 0.368826827690317, + "grad_norm": 0.8296356201171875, + "learning_rate": 9.797144379111903e-06, + "loss": 0.7478, + "step": 6841 + }, + { + "epoch": 0.3688807418589605, + "grad_norm": 0.720715343952179, + "learning_rate": 9.797084599490346e-06, + "loss": 0.6901, + "step": 6842 + }, + { + "epoch": 0.36893465602760406, + "grad_norm": 0.7493615746498108, + "learning_rate": 9.797024811244277e-06, + "loss": 0.7772, + "step": 6843 + }, + { + "epoch": 0.3689885701962476, + "grad_norm": 0.6748061180114746, + "learning_rate": 9.796965014373806e-06, + "loss": 0.6819, + "step": 6844 + }, + { + "epoch": 0.36904248436489107, + "grad_norm": 0.7307026386260986, + "learning_rate": 9.796905208879038e-06, + "loss": 0.7534, + "step": 6845 + }, + { + "epoch": 0.3690963985335346, + "grad_norm": 0.8521978259086609, + "learning_rate": 9.79684539476008e-06, + "loss": 0.7728, + "step": 6846 + }, + { + "epoch": 0.36915031270217813, + "grad_norm": 0.806467592716217, + "learning_rate": 9.796785572017043e-06, + "loss": 0.8593, + "step": 6847 + }, + { + "epoch": 0.36920422687082166, + "grad_norm": 0.7701194286346436, + "learning_rate": 9.796725740650031e-06, + "loss": 0.7665, + "step": 6848 + }, + { + "epoch": 0.3692581410394652, + "grad_norm": 0.6880419850349426, + "learning_rate": 9.796665900659152e-06, + "loss": 0.7103, + "step": 6849 + }, + { + "epoch": 0.36931205520810867, + "grad_norm": 0.7667257785797119, + "learning_rate": 9.796606052044517e-06, + "loss": 0.7431, + "step": 6850 + }, + { + "epoch": 0.3693659693767522, + "grad_norm": 0.7349566221237183, + "learning_rate": 9.796546194806232e-06, + "loss": 0.6624, + "step": 6851 + }, + { + "epoch": 0.36941988354539573, + "grad_norm": 0.8275545239448547, + "learning_rate": 
9.796486328944402e-06, + "loss": 0.6743, + "step": 6852 + }, + { + "epoch": 0.36947379771403926, + "grad_norm": 0.9421560764312744, + "learning_rate": 9.796426454459135e-06, + "loss": 0.7725, + "step": 6853 + }, + { + "epoch": 0.3695277118826828, + "grad_norm": 0.6952826380729675, + "learning_rate": 9.796366571350542e-06, + "loss": 0.7523, + "step": 6854 + }, + { + "epoch": 0.3695816260513263, + "grad_norm": 0.7041664123535156, + "learning_rate": 9.79630667961873e-06, + "loss": 0.6987, + "step": 6855 + }, + { + "epoch": 0.3696355402199698, + "grad_norm": 0.8537763953208923, + "learning_rate": 9.796246779263803e-06, + "loss": 0.7215, + "step": 6856 + }, + { + "epoch": 0.36968945438861334, + "grad_norm": 0.7674410343170166, + "learning_rate": 9.796186870285873e-06, + "loss": 0.8097, + "step": 6857 + }, + { + "epoch": 0.36974336855725687, + "grad_norm": 0.6804946660995483, + "learning_rate": 9.796126952685044e-06, + "loss": 0.7652, + "step": 6858 + }, + { + "epoch": 0.36979728272590034, + "grad_norm": 0.7157647013664246, + "learning_rate": 9.796067026461427e-06, + "loss": 0.7514, + "step": 6859 + }, + { + "epoch": 0.3698511968945439, + "grad_norm": 0.6757988929748535, + "learning_rate": 9.79600709161513e-06, + "loss": 0.7187, + "step": 6860 + }, + { + "epoch": 0.3699051110631874, + "grad_norm": 0.7528148889541626, + "learning_rate": 9.795947148146255e-06, + "loss": 0.7237, + "step": 6861 + }, + { + "epoch": 0.36995902523183094, + "grad_norm": 0.8383325934410095, + "learning_rate": 9.795887196054917e-06, + "loss": 0.8799, + "step": 6862 + }, + { + "epoch": 0.37001293940047447, + "grad_norm": 0.7268112897872925, + "learning_rate": 9.79582723534122e-06, + "loss": 0.7667, + "step": 6863 + }, + { + "epoch": 0.37006685356911795, + "grad_norm": 0.7768090963363647, + "learning_rate": 9.795767266005271e-06, + "loss": 0.9178, + "step": 6864 + }, + { + "epoch": 0.3701207677377615, + "grad_norm": 0.7708846926689148, + "learning_rate": 9.79570728804718e-06, + "loss": 0.6583, + 
"step": 6865 + }, + { + "epoch": 0.370174681906405, + "grad_norm": 0.8274603486061096, + "learning_rate": 9.795647301467054e-06, + "loss": 0.7253, + "step": 6866 + }, + { + "epoch": 0.37022859607504854, + "grad_norm": 0.8143756985664368, + "learning_rate": 9.795587306265002e-06, + "loss": 0.6821, + "step": 6867 + }, + { + "epoch": 0.370282510243692, + "grad_norm": 0.7719526886940002, + "learning_rate": 9.79552730244113e-06, + "loss": 0.6409, + "step": 6868 + }, + { + "epoch": 0.37033642441233555, + "grad_norm": 0.8253846764564514, + "learning_rate": 9.795467289995549e-06, + "loss": 0.7624, + "step": 6869 + }, + { + "epoch": 0.3703903385809791, + "grad_norm": 0.9532812237739563, + "learning_rate": 9.795407268928362e-06, + "loss": 0.7441, + "step": 6870 + }, + { + "epoch": 0.3704442527496226, + "grad_norm": 0.7846934795379639, + "learning_rate": 9.795347239239678e-06, + "loss": 0.8205, + "step": 6871 + }, + { + "epoch": 0.37049816691826615, + "grad_norm": 0.8254940509796143, + "learning_rate": 9.795287200929609e-06, + "loss": 0.8532, + "step": 6872 + }, + { + "epoch": 0.3705520810869096, + "grad_norm": 1.0097715854644775, + "learning_rate": 9.79522715399826e-06, + "loss": 0.7086, + "step": 6873 + }, + { + "epoch": 0.37060599525555316, + "grad_norm": 0.7541647553443909, + "learning_rate": 9.795167098445739e-06, + "loss": 0.8013, + "step": 6874 + }, + { + "epoch": 0.3706599094241967, + "grad_norm": 0.7585844397544861, + "learning_rate": 9.795107034272154e-06, + "loss": 0.6829, + "step": 6875 + }, + { + "epoch": 0.3707138235928402, + "grad_norm": 0.6726213693618774, + "learning_rate": 9.795046961477615e-06, + "loss": 0.7169, + "step": 6876 + }, + { + "epoch": 0.3707677377614837, + "grad_norm": 0.7394425272941589, + "learning_rate": 9.794986880062225e-06, + "loss": 0.6868, + "step": 6877 + }, + { + "epoch": 0.3708216519301272, + "grad_norm": 1.1811943054199219, + "learning_rate": 9.794926790026098e-06, + "loss": 0.7089, + "step": 6878 + }, + { + "epoch": 
0.37087556609877076, + "grad_norm": 0.7064139246940613, + "learning_rate": 9.794866691369337e-06, + "loss": 0.7787, + "step": 6879 + }, + { + "epoch": 0.3709294802674143, + "grad_norm": 0.7243874073028564, + "learning_rate": 9.794806584092056e-06, + "loss": 0.7308, + "step": 6880 + }, + { + "epoch": 0.3709833944360578, + "grad_norm": 1.0770469903945923, + "learning_rate": 9.794746468194357e-06, + "loss": 0.7223, + "step": 6881 + }, + { + "epoch": 0.3710373086047013, + "grad_norm": 0.7796792387962341, + "learning_rate": 9.794686343676349e-06, + "loss": 0.6869, + "step": 6882 + }, + { + "epoch": 0.37109122277334483, + "grad_norm": 0.7601329684257507, + "learning_rate": 9.794626210538146e-06, + "loss": 0.7921, + "step": 6883 + }, + { + "epoch": 0.37114513694198836, + "grad_norm": 0.7816028594970703, + "learning_rate": 9.794566068779848e-06, + "loss": 0.8324, + "step": 6884 + }, + { + "epoch": 0.3711990511106319, + "grad_norm": 0.7331908345222473, + "learning_rate": 9.794505918401568e-06, + "loss": 0.7392, + "step": 6885 + }, + { + "epoch": 0.37125296527927537, + "grad_norm": 0.8185816407203674, + "learning_rate": 9.794445759403413e-06, + "loss": 0.8046, + "step": 6886 + }, + { + "epoch": 0.3713068794479189, + "grad_norm": 0.8189926147460938, + "learning_rate": 9.79438559178549e-06, + "loss": 0.7273, + "step": 6887 + }, + { + "epoch": 0.37136079361656243, + "grad_norm": 0.8290352821350098, + "learning_rate": 9.79432541554791e-06, + "loss": 0.8448, + "step": 6888 + }, + { + "epoch": 0.37141470778520597, + "grad_norm": 0.771603524684906, + "learning_rate": 9.79426523069078e-06, + "loss": 0.8165, + "step": 6889 + }, + { + "epoch": 0.3714686219538495, + "grad_norm": 0.6970923542976379, + "learning_rate": 9.794205037214207e-06, + "loss": 0.7904, + "step": 6890 + }, + { + "epoch": 0.371522536122493, + "grad_norm": 0.7284457087516785, + "learning_rate": 9.7941448351183e-06, + "loss": 0.8258, + "step": 6891 + }, + { + "epoch": 0.3715764502911365, + "grad_norm": 
0.7594045996665955, + "learning_rate": 9.794084624403166e-06, + "loss": 0.7696, + "step": 6892 + }, + { + "epoch": 0.37163036445978004, + "grad_norm": 0.686918318271637, + "learning_rate": 9.794024405068915e-06, + "loss": 0.6909, + "step": 6893 + }, + { + "epoch": 0.37168427862842357, + "grad_norm": 0.6950833201408386, + "learning_rate": 9.793964177115654e-06, + "loss": 0.7893, + "step": 6894 + }, + { + "epoch": 0.37173819279706705, + "grad_norm": 0.70562344789505, + "learning_rate": 9.793903940543493e-06, + "loss": 0.7546, + "step": 6895 + }, + { + "epoch": 0.3717921069657106, + "grad_norm": 0.806863009929657, + "learning_rate": 9.79384369535254e-06, + "loss": 0.6723, + "step": 6896 + }, + { + "epoch": 0.3718460211343541, + "grad_norm": 0.7545826435089111, + "learning_rate": 9.793783441542901e-06, + "loss": 0.7913, + "step": 6897 + }, + { + "epoch": 0.37189993530299764, + "grad_norm": 0.753429651260376, + "learning_rate": 9.793723179114687e-06, + "loss": 0.8453, + "step": 6898 + }, + { + "epoch": 0.3719538494716412, + "grad_norm": 0.7044295072555542, + "learning_rate": 9.793662908068005e-06, + "loss": 0.6852, + "step": 6899 + }, + { + "epoch": 0.37200776364028465, + "grad_norm": 0.7389166355133057, + "learning_rate": 9.793602628402964e-06, + "loss": 0.7612, + "step": 6900 + }, + { + "epoch": 0.3720616778089282, + "grad_norm": 0.6807994842529297, + "learning_rate": 9.79354234011967e-06, + "loss": 0.643, + "step": 6901 + }, + { + "epoch": 0.3721155919775717, + "grad_norm": 0.8061318397521973, + "learning_rate": 9.793482043218236e-06, + "loss": 0.7682, + "step": 6902 + }, + { + "epoch": 0.37216950614621525, + "grad_norm": 0.7490041851997375, + "learning_rate": 9.793421737698767e-06, + "loss": 0.6814, + "step": 6903 + }, + { + "epoch": 0.3722234203148587, + "grad_norm": 0.8781059980392456, + "learning_rate": 9.793361423561372e-06, + "loss": 0.9454, + "step": 6904 + }, + { + "epoch": 0.37227733448350225, + "grad_norm": 0.7331268787384033, + "learning_rate": 
9.793301100806158e-06, + "loss": 0.733, + "step": 6905 + }, + { + "epoch": 0.3723312486521458, + "grad_norm": 0.6957118511199951, + "learning_rate": 9.793240769433238e-06, + "loss": 0.7437, + "step": 6906 + }, + { + "epoch": 0.3723851628207893, + "grad_norm": 0.7501729130744934, + "learning_rate": 9.793180429442716e-06, + "loss": 0.8469, + "step": 6907 + }, + { + "epoch": 0.37243907698943285, + "grad_norm": 0.7289294600486755, + "learning_rate": 9.793120080834701e-06, + "loss": 0.7343, + "step": 6908 + }, + { + "epoch": 0.3724929911580763, + "grad_norm": 0.8165875673294067, + "learning_rate": 9.793059723609306e-06, + "loss": 0.7658, + "step": 6909 + }, + { + "epoch": 0.37254690532671986, + "grad_norm": 0.7840486168861389, + "learning_rate": 9.792999357766632e-06, + "loss": 0.8269, + "step": 6910 + }, + { + "epoch": 0.3726008194953634, + "grad_norm": 0.7197390794754028, + "learning_rate": 9.792938983306794e-06, + "loss": 0.8034, + "step": 6911 + }, + { + "epoch": 0.3726547336640069, + "grad_norm": 0.7720826268196106, + "learning_rate": 9.792878600229898e-06, + "loss": 0.7453, + "step": 6912 + }, + { + "epoch": 0.3727086478326504, + "grad_norm": 0.7407202124595642, + "learning_rate": 9.792818208536052e-06, + "loss": 0.6816, + "step": 6913 + }, + { + "epoch": 0.37276256200129393, + "grad_norm": 0.7115561962127686, + "learning_rate": 9.792757808225367e-06, + "loss": 0.7387, + "step": 6914 + }, + { + "epoch": 0.37281647616993746, + "grad_norm": 0.7147287726402283, + "learning_rate": 9.792697399297947e-06, + "loss": 0.7041, + "step": 6915 + }, + { + "epoch": 0.372870390338581, + "grad_norm": 0.7222192883491516, + "learning_rate": 9.792636981753905e-06, + "loss": 0.7919, + "step": 6916 + }, + { + "epoch": 0.3729243045072245, + "grad_norm": 0.7770732045173645, + "learning_rate": 9.792576555593346e-06, + "loss": 0.8133, + "step": 6917 + }, + { + "epoch": 0.372978218675868, + "grad_norm": 0.7901696562767029, + "learning_rate": 9.792516120816384e-06, + "loss": 0.6469, + 
"step": 6918 + }, + { + "epoch": 0.37303213284451153, + "grad_norm": 0.7730213403701782, + "learning_rate": 9.792455677423122e-06, + "loss": 0.7771, + "step": 6919 + }, + { + "epoch": 0.37308604701315506, + "grad_norm": 0.7911192178726196, + "learning_rate": 9.792395225413673e-06, + "loss": 0.7814, + "step": 6920 + }, + { + "epoch": 0.3731399611817986, + "grad_norm": 0.6789551377296448, + "learning_rate": 9.792334764788143e-06, + "loss": 0.7014, + "step": 6921 + }, + { + "epoch": 0.3731938753504421, + "grad_norm": 0.7136116027832031, + "learning_rate": 9.792274295546641e-06, + "loss": 0.7472, + "step": 6922 + }, + { + "epoch": 0.3732477895190856, + "grad_norm": 0.6979206204414368, + "learning_rate": 9.792213817689276e-06, + "loss": 0.6595, + "step": 6923 + }, + { + "epoch": 0.37330170368772914, + "grad_norm": 0.9858417510986328, + "learning_rate": 9.792153331216158e-06, + "loss": 0.8843, + "step": 6924 + }, + { + "epoch": 0.37335561785637267, + "grad_norm": 0.7235552668571472, + "learning_rate": 9.792092836127393e-06, + "loss": 0.6897, + "step": 6925 + }, + { + "epoch": 0.3734095320250162, + "grad_norm": 0.6932023763656616, + "learning_rate": 9.792032332423094e-06, + "loss": 0.8096, + "step": 6926 + }, + { + "epoch": 0.3734634461936597, + "grad_norm": 0.7130481004714966, + "learning_rate": 9.791971820103365e-06, + "loss": 0.7245, + "step": 6927 + }, + { + "epoch": 0.3735173603623032, + "grad_norm": 1.0221943855285645, + "learning_rate": 9.791911299168317e-06, + "loss": 0.8183, + "step": 6928 + }, + { + "epoch": 0.37357127453094674, + "grad_norm": 0.6936571002006531, + "learning_rate": 9.79185076961806e-06, + "loss": 0.709, + "step": 6929 + }, + { + "epoch": 0.37362518869959027, + "grad_norm": 0.6904107332229614, + "learning_rate": 9.7917902314527e-06, + "loss": 0.8128, + "step": 6930 + }, + { + "epoch": 0.37367910286823375, + "grad_norm": 0.8130210041999817, + "learning_rate": 9.791729684672348e-06, + "loss": 0.8228, + "step": 6931 + }, + { + "epoch": 
0.3737330170368773, + "grad_norm": 0.7137600779533386, + "learning_rate": 9.791669129277113e-06, + "loss": 0.7597, + "step": 6932 + }, + { + "epoch": 0.3737869312055208, + "grad_norm": 0.7492494583129883, + "learning_rate": 9.791608565267103e-06, + "loss": 0.8269, + "step": 6933 + }, + { + "epoch": 0.37384084537416434, + "grad_norm": 0.7429704666137695, + "learning_rate": 9.791547992642427e-06, + "loss": 0.778, + "step": 6934 + }, + { + "epoch": 0.3738947595428079, + "grad_norm": 0.6476424336433411, + "learning_rate": 9.791487411403193e-06, + "loss": 0.6519, + "step": 6935 + }, + { + "epoch": 0.37394867371145135, + "grad_norm": 0.7774726152420044, + "learning_rate": 9.791426821549512e-06, + "loss": 0.6967, + "step": 6936 + }, + { + "epoch": 0.3740025878800949, + "grad_norm": 0.6981332898139954, + "learning_rate": 9.791366223081491e-06, + "loss": 0.7911, + "step": 6937 + }, + { + "epoch": 0.3740565020487384, + "grad_norm": 0.7180396318435669, + "learning_rate": 9.79130561599924e-06, + "loss": 0.799, + "step": 6938 + }, + { + "epoch": 0.37411041621738195, + "grad_norm": 0.7640271782875061, + "learning_rate": 9.791245000302867e-06, + "loss": 0.7667, + "step": 6939 + }, + { + "epoch": 0.3741643303860254, + "grad_norm": 0.7280723452568054, + "learning_rate": 9.791184375992482e-06, + "loss": 0.7194, + "step": 6940 + }, + { + "epoch": 0.37421824455466896, + "grad_norm": 0.7007558345794678, + "learning_rate": 9.791123743068196e-06, + "loss": 0.7645, + "step": 6941 + }, + { + "epoch": 0.3742721587233125, + "grad_norm": 0.7428101897239685, + "learning_rate": 9.791063101530113e-06, + "loss": 0.8518, + "step": 6942 + }, + { + "epoch": 0.374326072891956, + "grad_norm": 0.679993212223053, + "learning_rate": 9.791002451378347e-06, + "loss": 0.6898, + "step": 6943 + }, + { + "epoch": 0.37437998706059955, + "grad_norm": 0.713161051273346, + "learning_rate": 9.790941792613002e-06, + "loss": 0.7229, + "step": 6944 + }, + { + "epoch": 0.374433901229243, + "grad_norm": 
0.7696486115455627, + "learning_rate": 9.79088112523419e-06, + "loss": 0.8105, + "step": 6945 + }, + { + "epoch": 0.37448781539788656, + "grad_norm": 0.8551701903343201, + "learning_rate": 9.790820449242022e-06, + "loss": 0.7852, + "step": 6946 + }, + { + "epoch": 0.3745417295665301, + "grad_norm": 0.7431685328483582, + "learning_rate": 9.790759764636603e-06, + "loss": 0.8441, + "step": 6947 + }, + { + "epoch": 0.3745956437351736, + "grad_norm": 0.7315366268157959, + "learning_rate": 9.790699071418045e-06, + "loss": 0.8418, + "step": 6948 + }, + { + "epoch": 0.3746495579038171, + "grad_norm": 0.6586199402809143, + "learning_rate": 9.790638369586458e-06, + "loss": 0.7294, + "step": 6949 + }, + { + "epoch": 0.37470347207246063, + "grad_norm": 0.7396568655967712, + "learning_rate": 9.790577659141947e-06, + "loss": 0.695, + "step": 6950 + }, + { + "epoch": 0.37475738624110416, + "grad_norm": 0.8480649590492249, + "learning_rate": 9.790516940084625e-06, + "loss": 0.7373, + "step": 6951 + }, + { + "epoch": 0.3748113004097477, + "grad_norm": 0.7321200370788574, + "learning_rate": 9.790456212414598e-06, + "loss": 0.7213, + "step": 6952 + }, + { + "epoch": 0.3748652145783912, + "grad_norm": 0.8196578025817871, + "learning_rate": 9.790395476131978e-06, + "loss": 0.7353, + "step": 6953 + }, + { + "epoch": 0.3749191287470347, + "grad_norm": 0.8583417534828186, + "learning_rate": 9.790334731236872e-06, + "loss": 0.7872, + "step": 6954 + }, + { + "epoch": 0.37497304291567823, + "grad_norm": 0.761122465133667, + "learning_rate": 9.79027397772939e-06, + "loss": 0.7299, + "step": 6955 + }, + { + "epoch": 0.37502695708432177, + "grad_norm": 0.8275685906410217, + "learning_rate": 9.790213215609644e-06, + "loss": 0.8439, + "step": 6956 + }, + { + "epoch": 0.3750808712529653, + "grad_norm": 0.8194490075111389, + "learning_rate": 9.790152444877737e-06, + "loss": 0.7939, + "step": 6957 + }, + { + "epoch": 0.3751347854216088, + "grad_norm": 0.7357431054115295, + "learning_rate": 
9.790091665533785e-06, + "loss": 0.7396, + "step": 6958 + }, + { + "epoch": 0.3751886995902523, + "grad_norm": 0.7256408929824829, + "learning_rate": 9.790030877577892e-06, + "loss": 0.6916, + "step": 6959 + }, + { + "epoch": 0.37524261375889584, + "grad_norm": 0.8127861022949219, + "learning_rate": 9.78997008101017e-06, + "loss": 0.8467, + "step": 6960 + }, + { + "epoch": 0.37529652792753937, + "grad_norm": 0.7045642137527466, + "learning_rate": 9.789909275830728e-06, + "loss": 0.7533, + "step": 6961 + }, + { + "epoch": 0.3753504420961829, + "grad_norm": 0.7065895199775696, + "learning_rate": 9.789848462039676e-06, + "loss": 0.7056, + "step": 6962 + }, + { + "epoch": 0.3754043562648264, + "grad_norm": 0.7797097563743591, + "learning_rate": 9.789787639637123e-06, + "loss": 0.8228, + "step": 6963 + }, + { + "epoch": 0.3754582704334699, + "grad_norm": 0.7362637519836426, + "learning_rate": 9.789726808623176e-06, + "loss": 0.7697, + "step": 6964 + }, + { + "epoch": 0.37551218460211344, + "grad_norm": 0.7192915081977844, + "learning_rate": 9.789665968997948e-06, + "loss": 0.7333, + "step": 6965 + }, + { + "epoch": 0.375566098770757, + "grad_norm": 0.7071816921234131, + "learning_rate": 9.789605120761545e-06, + "loss": 0.7954, + "step": 6966 + }, + { + "epoch": 0.37562001293940045, + "grad_norm": 0.7744930386543274, + "learning_rate": 9.78954426391408e-06, + "loss": 0.7834, + "step": 6967 + }, + { + "epoch": 0.375673927108044, + "grad_norm": 0.6952130198478699, + "learning_rate": 9.789483398455657e-06, + "loss": 0.7256, + "step": 6968 + }, + { + "epoch": 0.3757278412766875, + "grad_norm": 0.6868245601654053, + "learning_rate": 9.789422524386392e-06, + "loss": 0.7377, + "step": 6969 + }, + { + "epoch": 0.37578175544533104, + "grad_norm": 0.7161628603935242, + "learning_rate": 9.789361641706389e-06, + "loss": 0.776, + "step": 6970 + }, + { + "epoch": 0.3758356696139746, + "grad_norm": 0.6631112098693848, + "learning_rate": 9.789300750415763e-06, + "loss": 0.7588, + 
"step": 6971 + }, + { + "epoch": 0.37588958378261805, + "grad_norm": 0.8051118850708008, + "learning_rate": 9.789239850514616e-06, + "loss": 0.8193, + "step": 6972 + }, + { + "epoch": 0.3759434979512616, + "grad_norm": 0.7175277471542358, + "learning_rate": 9.789178942003063e-06, + "loss": 0.8385, + "step": 6973 + }, + { + "epoch": 0.3759974121199051, + "grad_norm": 0.6904747486114502, + "learning_rate": 9.789118024881215e-06, + "loss": 0.7602, + "step": 6974 + }, + { + "epoch": 0.37605132628854865, + "grad_norm": 0.7528780698776245, + "learning_rate": 9.789057099149176e-06, + "loss": 0.7419, + "step": 6975 + }, + { + "epoch": 0.3761052404571921, + "grad_norm": 0.7037287950515747, + "learning_rate": 9.788996164807058e-06, + "loss": 0.7707, + "step": 6976 + }, + { + "epoch": 0.37615915462583566, + "grad_norm": 0.8184666633605957, + "learning_rate": 9.788935221854972e-06, + "loss": 0.7773, + "step": 6977 + }, + { + "epoch": 0.3762130687944792, + "grad_norm": 0.6951115727424622, + "learning_rate": 9.788874270293026e-06, + "loss": 0.7398, + "step": 6978 + }, + { + "epoch": 0.3762669829631227, + "grad_norm": 0.7333779335021973, + "learning_rate": 9.78881331012133e-06, + "loss": 0.6817, + "step": 6979 + }, + { + "epoch": 0.37632089713176625, + "grad_norm": 0.8382815718650818, + "learning_rate": 9.788752341339994e-06, + "loss": 0.7949, + "step": 6980 + }, + { + "epoch": 0.37637481130040973, + "grad_norm": 0.7410640716552734, + "learning_rate": 9.788691363949126e-06, + "loss": 0.834, + "step": 6981 + }, + { + "epoch": 0.37642872546905326, + "grad_norm": 0.6883760094642639, + "learning_rate": 9.788630377948835e-06, + "loss": 0.6931, + "step": 6982 + }, + { + "epoch": 0.3764826396376968, + "grad_norm": 0.6875273585319519, + "learning_rate": 9.788569383339236e-06, + "loss": 0.732, + "step": 6983 + }, + { + "epoch": 0.3765365538063403, + "grad_norm": 0.7382376790046692, + "learning_rate": 9.788508380120434e-06, + "loss": 0.797, + "step": 6984 + }, + { + "epoch": 
0.3765904679749838, + "grad_norm": 0.7307778000831604, + "learning_rate": 9.788447368292539e-06, + "loss": 0.7108, + "step": 6985 + }, + { + "epoch": 0.37664438214362733, + "grad_norm": 0.7628811001777649, + "learning_rate": 9.788386347855662e-06, + "loss": 0.6364, + "step": 6986 + }, + { + "epoch": 0.37669829631227086, + "grad_norm": 0.734599232673645, + "learning_rate": 9.788325318809912e-06, + "loss": 0.7229, + "step": 6987 + }, + { + "epoch": 0.3767522104809144, + "grad_norm": 0.7767276763916016, + "learning_rate": 9.7882642811554e-06, + "loss": 0.8374, + "step": 6988 + }, + { + "epoch": 0.3768061246495579, + "grad_norm": 0.9289737939834595, + "learning_rate": 9.788203234892233e-06, + "loss": 0.7133, + "step": 6989 + }, + { + "epoch": 0.3768600388182014, + "grad_norm": 0.7036685347557068, + "learning_rate": 9.788142180020524e-06, + "loss": 0.7752, + "step": 6990 + }, + { + "epoch": 0.37691395298684494, + "grad_norm": 0.6594575047492981, + "learning_rate": 9.788081116540378e-06, + "loss": 0.7445, + "step": 6991 + }, + { + "epoch": 0.37696786715548847, + "grad_norm": 0.7152668833732605, + "learning_rate": 9.788020044451911e-06, + "loss": 0.7338, + "step": 6992 + }, + { + "epoch": 0.377021781324132, + "grad_norm": 0.7223868370056152, + "learning_rate": 9.787958963755228e-06, + "loss": 0.6079, + "step": 6993 + }, + { + "epoch": 0.3770756954927755, + "grad_norm": 0.8862934708595276, + "learning_rate": 9.787897874450443e-06, + "loss": 0.7908, + "step": 6994 + }, + { + "epoch": 0.377129609661419, + "grad_norm": 0.7505145072937012, + "learning_rate": 9.787836776537661e-06, + "loss": 0.7758, + "step": 6995 + }, + { + "epoch": 0.37718352383006254, + "grad_norm": 0.6852383613586426, + "learning_rate": 9.787775670016995e-06, + "loss": 0.7495, + "step": 6996 + }, + { + "epoch": 0.37723743799870607, + "grad_norm": 0.8175929188728333, + "learning_rate": 9.787714554888554e-06, + "loss": 0.8116, + "step": 6997 + }, + { + "epoch": 0.3772913521673496, + "grad_norm": 
0.7715110182762146, + "learning_rate": 9.787653431152448e-06, + "loss": 0.7824, + "step": 6998 + }, + { + "epoch": 0.3773452663359931, + "grad_norm": 0.7201720476150513, + "learning_rate": 9.787592298808786e-06, + "loss": 0.817, + "step": 6999 + }, + { + "epoch": 0.3773991805046366, + "grad_norm": 0.751273512840271, + "learning_rate": 9.787531157857679e-06, + "loss": 0.7845, + "step": 7000 + }, + { + "epoch": 0.37745309467328014, + "grad_norm": 0.7723568677902222, + "learning_rate": 9.787470008299238e-06, + "loss": 0.7953, + "step": 7001 + }, + { + "epoch": 0.3775070088419237, + "grad_norm": 0.6954260468482971, + "learning_rate": 9.787408850133571e-06, + "loss": 0.7847, + "step": 7002 + }, + { + "epoch": 0.37756092301056715, + "grad_norm": 0.766312837600708, + "learning_rate": 9.787347683360788e-06, + "loss": 0.7249, + "step": 7003 + }, + { + "epoch": 0.3776148371792107, + "grad_norm": 0.8782551884651184, + "learning_rate": 9.787286507981e-06, + "loss": 0.8922, + "step": 7004 + }, + { + "epoch": 0.3776687513478542, + "grad_norm": 0.8427724838256836, + "learning_rate": 9.787225323994316e-06, + "loss": 0.8694, + "step": 7005 + }, + { + "epoch": 0.37772266551649775, + "grad_norm": 0.6549016833305359, + "learning_rate": 9.787164131400846e-06, + "loss": 0.6598, + "step": 7006 + }, + { + "epoch": 0.3777765796851413, + "grad_norm": 0.7378600835800171, + "learning_rate": 9.787102930200702e-06, + "loss": 0.7691, + "step": 7007 + }, + { + "epoch": 0.37783049385378475, + "grad_norm": 0.7623856663703918, + "learning_rate": 9.787041720393993e-06, + "loss": 0.8026, + "step": 7008 + }, + { + "epoch": 0.3778844080224283, + "grad_norm": 0.85904461145401, + "learning_rate": 9.786980501980827e-06, + "loss": 0.8413, + "step": 7009 + }, + { + "epoch": 0.3779383221910718, + "grad_norm": 0.7900813221931458, + "learning_rate": 9.786919274961318e-06, + "loss": 0.7901, + "step": 7010 + }, + { + "epoch": 0.37799223635971535, + "grad_norm": 0.7434378862380981, + "learning_rate": 
9.786858039335571e-06, + "loss": 0.7645, + "step": 7011 + }, + { + "epoch": 0.3780461505283588, + "grad_norm": 0.6656724214553833, + "learning_rate": 9.786796795103698e-06, + "loss": 0.6802, + "step": 7012 + }, + { + "epoch": 0.37810006469700236, + "grad_norm": 0.8180235028266907, + "learning_rate": 9.786735542265814e-06, + "loss": 0.7637, + "step": 7013 + }, + { + "epoch": 0.3781539788656459, + "grad_norm": 0.7488604187965393, + "learning_rate": 9.786674280822023e-06, + "loss": 0.7837, + "step": 7014 + }, + { + "epoch": 0.3782078930342894, + "grad_norm": 0.7408158183097839, + "learning_rate": 9.786613010772437e-06, + "loss": 0.7832, + "step": 7015 + }, + { + "epoch": 0.37826180720293295, + "grad_norm": 0.6865716576576233, + "learning_rate": 9.786551732117166e-06, + "loss": 0.735, + "step": 7016 + }, + { + "epoch": 0.37831572137157643, + "grad_norm": 0.7515228390693665, + "learning_rate": 9.786490444856321e-06, + "loss": 0.6537, + "step": 7017 + }, + { + "epoch": 0.37836963554021996, + "grad_norm": 0.8293788433074951, + "learning_rate": 9.786429148990012e-06, + "loss": 0.7292, + "step": 7018 + }, + { + "epoch": 0.3784235497088635, + "grad_norm": 0.9012221097946167, + "learning_rate": 9.78636784451835e-06, + "loss": 0.7987, + "step": 7019 + }, + { + "epoch": 0.378477463877507, + "grad_norm": 0.6629627346992493, + "learning_rate": 9.786306531441442e-06, + "loss": 0.6516, + "step": 7020 + }, + { + "epoch": 0.3785313780461505, + "grad_norm": 0.7131246328353882, + "learning_rate": 9.786245209759401e-06, + "loss": 0.7352, + "step": 7021 + }, + { + "epoch": 0.37858529221479403, + "grad_norm": 0.815075159072876, + "learning_rate": 9.786183879472337e-06, + "loss": 0.792, + "step": 7022 + }, + { + "epoch": 0.37863920638343757, + "grad_norm": 0.7357726693153381, + "learning_rate": 9.78612254058036e-06, + "loss": 0.7674, + "step": 7023 + }, + { + "epoch": 0.3786931205520811, + "grad_norm": 0.815360963344574, + "learning_rate": 9.78606119308358e-06, + "loss": 0.7803, + "step": 
7024 + }, + { + "epoch": 0.37874703472072463, + "grad_norm": 0.7957213521003723, + "learning_rate": 9.785999836982108e-06, + "loss": 0.7418, + "step": 7025 + }, + { + "epoch": 0.3788009488893681, + "grad_norm": 0.8183251619338989, + "learning_rate": 9.785938472276054e-06, + "loss": 0.7743, + "step": 7026 + }, + { + "epoch": 0.37885486305801164, + "grad_norm": 0.7750178575515747, + "learning_rate": 9.785877098965526e-06, + "loss": 0.6911, + "step": 7027 + }, + { + "epoch": 0.37890877722665517, + "grad_norm": 0.7593466639518738, + "learning_rate": 9.785815717050639e-06, + "loss": 0.8251, + "step": 7028 + }, + { + "epoch": 0.3789626913952987, + "grad_norm": 0.878562331199646, + "learning_rate": 9.785754326531499e-06, + "loss": 0.8174, + "step": 7029 + }, + { + "epoch": 0.3790166055639422, + "grad_norm": 0.7743996977806091, + "learning_rate": 9.785692927408219e-06, + "loss": 0.7118, + "step": 7030 + }, + { + "epoch": 0.3790705197325857, + "grad_norm": 0.7831797003746033, + "learning_rate": 9.785631519680908e-06, + "loss": 0.7661, + "step": 7031 + }, + { + "epoch": 0.37912443390122924, + "grad_norm": 0.7243192195892334, + "learning_rate": 9.785570103349678e-06, + "loss": 0.6711, + "step": 7032 + }, + { + "epoch": 0.3791783480698728, + "grad_norm": 0.704127311706543, + "learning_rate": 9.785508678414637e-06, + "loss": 0.7574, + "step": 7033 + }, + { + "epoch": 0.3792322622385163, + "grad_norm": 0.810826301574707, + "learning_rate": 9.785447244875898e-06, + "loss": 0.8035, + "step": 7034 + }, + { + "epoch": 0.3792861764071598, + "grad_norm": 0.6859053373336792, + "learning_rate": 9.78538580273357e-06, + "loss": 0.7195, + "step": 7035 + }, + { + "epoch": 0.3793400905758033, + "grad_norm": 0.6804265975952148, + "learning_rate": 9.785324351987763e-06, + "loss": 0.6705, + "step": 7036 + }, + { + "epoch": 0.37939400474444684, + "grad_norm": 0.7749894857406616, + "learning_rate": 9.785262892638589e-06, + "loss": 0.7768, + "step": 7037 + }, + { + "epoch": 0.3794479189130904, + 
"grad_norm": 1.082223653793335, + "learning_rate": 9.785201424686157e-06, + "loss": 0.6538, + "step": 7038 + }, + { + "epoch": 0.37950183308173385, + "grad_norm": 0.645409345626831, + "learning_rate": 9.78513994813058e-06, + "loss": 0.6847, + "step": 7039 + }, + { + "epoch": 0.3795557472503774, + "grad_norm": 0.6922236084938049, + "learning_rate": 9.785078462971965e-06, + "loss": 0.7456, + "step": 7040 + }, + { + "epoch": 0.3796096614190209, + "grad_norm": 0.7990418076515198, + "learning_rate": 9.785016969210425e-06, + "loss": 0.7988, + "step": 7041 + }, + { + "epoch": 0.37966357558766445, + "grad_norm": 0.7243112325668335, + "learning_rate": 9.784955466846069e-06, + "loss": 0.7619, + "step": 7042 + }, + { + "epoch": 0.379717489756308, + "grad_norm": 0.7268019914627075, + "learning_rate": 9.78489395587901e-06, + "loss": 0.7163, + "step": 7043 + }, + { + "epoch": 0.37977140392495146, + "grad_norm": 0.6868086457252502, + "learning_rate": 9.784832436309355e-06, + "loss": 0.7547, + "step": 7044 + }, + { + "epoch": 0.379825318093595, + "grad_norm": 0.7140119075775146, + "learning_rate": 9.784770908137217e-06, + "loss": 0.7881, + "step": 7045 + }, + { + "epoch": 0.3798792322622385, + "grad_norm": 0.7873802185058594, + "learning_rate": 9.784709371362708e-06, + "loss": 0.8094, + "step": 7046 + }, + { + "epoch": 0.37993314643088205, + "grad_norm": 0.6661947965621948, + "learning_rate": 9.784647825985935e-06, + "loss": 0.7504, + "step": 7047 + }, + { + "epoch": 0.37998706059952553, + "grad_norm": 0.7648390531539917, + "learning_rate": 9.784586272007012e-06, + "loss": 0.7529, + "step": 7048 + }, + { + "epoch": 0.38004097476816906, + "grad_norm": 0.7275189757347107, + "learning_rate": 9.784524709426046e-06, + "loss": 0.7164, + "step": 7049 + }, + { + "epoch": 0.3800948889368126, + "grad_norm": 0.7488008141517639, + "learning_rate": 9.784463138243153e-06, + "loss": 0.7956, + "step": 7050 + }, + { + "epoch": 0.3801488031054561, + "grad_norm": 0.7896369695663452, + 
"learning_rate": 9.784401558458438e-06, + "loss": 0.8353, + "step": 7051 + }, + { + "epoch": 0.38020271727409966, + "grad_norm": 0.6925151348114014, + "learning_rate": 9.784339970072015e-06, + "loss": 0.6538, + "step": 7052 + }, + { + "epoch": 0.38025663144274313, + "grad_norm": 0.7564902901649475, + "learning_rate": 9.784278373083995e-06, + "loss": 0.7738, + "step": 7053 + }, + { + "epoch": 0.38031054561138666, + "grad_norm": 0.7728143334388733, + "learning_rate": 9.784216767494486e-06, + "loss": 0.7377, + "step": 7054 + }, + { + "epoch": 0.3803644597800302, + "grad_norm": 0.8956003785133362, + "learning_rate": 9.784155153303601e-06, + "loss": 0.8548, + "step": 7055 + }, + { + "epoch": 0.3804183739486737, + "grad_norm": 0.7866317629814148, + "learning_rate": 9.784093530511452e-06, + "loss": 0.7465, + "step": 7056 + }, + { + "epoch": 0.3804722881173172, + "grad_norm": 0.7851076722145081, + "learning_rate": 9.784031899118146e-06, + "loss": 0.7832, + "step": 7057 + }, + { + "epoch": 0.38052620228596074, + "grad_norm": 0.7084400653839111, + "learning_rate": 9.783970259123797e-06, + "loss": 0.7861, + "step": 7058 + }, + { + "epoch": 0.38058011645460427, + "grad_norm": 0.7742816805839539, + "learning_rate": 9.783908610528514e-06, + "loss": 0.7276, + "step": 7059 + }, + { + "epoch": 0.3806340306232478, + "grad_norm": 0.799237847328186, + "learning_rate": 9.78384695333241e-06, + "loss": 0.9005, + "step": 7060 + }, + { + "epoch": 0.38068794479189133, + "grad_norm": 0.6975693702697754, + "learning_rate": 9.783785287535592e-06, + "loss": 0.7836, + "step": 7061 + }, + { + "epoch": 0.3807418589605348, + "grad_norm": 0.7811087369918823, + "learning_rate": 9.783723613138174e-06, + "loss": 0.8322, + "step": 7062 + }, + { + "epoch": 0.38079577312917834, + "grad_norm": 0.7736833095550537, + "learning_rate": 9.783661930140267e-06, + "loss": 0.8224, + "step": 7063 + }, + { + "epoch": 0.38084968729782187, + "grad_norm": 0.7396823167800903, + "learning_rate": 9.78360023854198e-06, + 
"loss": 0.6992, + "step": 7064 + }, + { + "epoch": 0.3809036014664654, + "grad_norm": 0.7272439002990723, + "learning_rate": 9.783538538343426e-06, + "loss": 0.7133, + "step": 7065 + }, + { + "epoch": 0.38095751563510893, + "grad_norm": 0.6347727179527283, + "learning_rate": 9.783476829544713e-06, + "loss": 0.6264, + "step": 7066 + }, + { + "epoch": 0.3810114298037524, + "grad_norm": 0.6885404586791992, + "learning_rate": 9.783415112145956e-06, + "loss": 0.7112, + "step": 7067 + }, + { + "epoch": 0.38106534397239594, + "grad_norm": 0.8151293992996216, + "learning_rate": 9.783353386147264e-06, + "loss": 0.7618, + "step": 7068 + }, + { + "epoch": 0.3811192581410395, + "grad_norm": 0.8563313484191895, + "learning_rate": 9.783291651548745e-06, + "loss": 0.8474, + "step": 7069 + }, + { + "epoch": 0.381173172309683, + "grad_norm": 0.7840670347213745, + "learning_rate": 9.783229908350515e-06, + "loss": 0.8037, + "step": 7070 + }, + { + "epoch": 0.3812270864783265, + "grad_norm": 0.8969873785972595, + "learning_rate": 9.783168156552682e-06, + "loss": 0.7194, + "step": 7071 + }, + { + "epoch": 0.38128100064697, + "grad_norm": 0.7913740873336792, + "learning_rate": 9.783106396155357e-06, + "loss": 0.8437, + "step": 7072 + }, + { + "epoch": 0.38133491481561355, + "grad_norm": 0.7702563405036926, + "learning_rate": 9.783044627158654e-06, + "loss": 0.7086, + "step": 7073 + }, + { + "epoch": 0.3813888289842571, + "grad_norm": 0.7691439390182495, + "learning_rate": 9.78298284956268e-06, + "loss": 0.8479, + "step": 7074 + }, + { + "epoch": 0.3814427431529006, + "grad_norm": 0.918720543384552, + "learning_rate": 9.78292106336755e-06, + "loss": 0.8995, + "step": 7075 + }, + { + "epoch": 0.3814966573215441, + "grad_norm": 0.7640904784202576, + "learning_rate": 9.78285926857337e-06, + "loss": 0.6897, + "step": 7076 + }, + { + "epoch": 0.3815505714901876, + "grad_norm": 0.7212051153182983, + "learning_rate": 9.782797465180256e-06, + "loss": 0.6184, + "step": 7077 + }, + { + "epoch": 
0.38160448565883115, + "grad_norm": 0.6944488286972046, + "learning_rate": 9.782735653188317e-06, + "loss": 0.7918, + "step": 7078 + }, + { + "epoch": 0.3816583998274747, + "grad_norm": 0.6525841355323792, + "learning_rate": 9.782673832597664e-06, + "loss": 0.6701, + "step": 7079 + }, + { + "epoch": 0.38171231399611816, + "grad_norm": 0.8735437989234924, + "learning_rate": 9.782612003408408e-06, + "loss": 0.7685, + "step": 7080 + }, + { + "epoch": 0.3817662281647617, + "grad_norm": 0.6927170157432556, + "learning_rate": 9.78255016562066e-06, + "loss": 0.7491, + "step": 7081 + }, + { + "epoch": 0.3818201423334052, + "grad_norm": 0.7590362429618835, + "learning_rate": 9.782488319234536e-06, + "loss": 0.7758, + "step": 7082 + }, + { + "epoch": 0.38187405650204875, + "grad_norm": 0.7485416531562805, + "learning_rate": 9.782426464250138e-06, + "loss": 0.7282, + "step": 7083 + }, + { + "epoch": 0.3819279706706923, + "grad_norm": 0.8229808211326599, + "learning_rate": 9.782364600667585e-06, + "loss": 0.7586, + "step": 7084 + }, + { + "epoch": 0.38198188483933576, + "grad_norm": 0.6965273022651672, + "learning_rate": 9.782302728486985e-06, + "loss": 0.7807, + "step": 7085 + }, + { + "epoch": 0.3820357990079793, + "grad_norm": 0.782135546207428, + "learning_rate": 9.78224084770845e-06, + "loss": 0.7857, + "step": 7086 + }, + { + "epoch": 0.3820897131766228, + "grad_norm": 0.7080146670341492, + "learning_rate": 9.78217895833209e-06, + "loss": 0.6938, + "step": 7087 + }, + { + "epoch": 0.38214362734526636, + "grad_norm": 0.6996012330055237, + "learning_rate": 9.782117060358017e-06, + "loss": 0.7202, + "step": 7088 + }, + { + "epoch": 0.38219754151390983, + "grad_norm": 0.7857486009597778, + "learning_rate": 9.782055153786342e-06, + "loss": 0.8207, + "step": 7089 + }, + { + "epoch": 0.38225145568255336, + "grad_norm": 0.7938824892044067, + "learning_rate": 9.78199323861718e-06, + "loss": 0.793, + "step": 7090 + }, + { + "epoch": 0.3823053698511969, + "grad_norm": 
0.7517973780632019, + "learning_rate": 9.781931314850637e-06, + "loss": 0.8071, + "step": 7091 + }, + { + "epoch": 0.38235928401984043, + "grad_norm": 0.7473757863044739, + "learning_rate": 9.781869382486827e-06, + "loss": 0.8683, + "step": 7092 + }, + { + "epoch": 0.38241319818848396, + "grad_norm": 0.6528701186180115, + "learning_rate": 9.781807441525859e-06, + "loss": 0.702, + "step": 7093 + }, + { + "epoch": 0.38246711235712744, + "grad_norm": 0.7527685165405273, + "learning_rate": 9.781745491967847e-06, + "loss": 0.7458, + "step": 7094 + }, + { + "epoch": 0.38252102652577097, + "grad_norm": 0.8190217614173889, + "learning_rate": 9.781683533812903e-06, + "loss": 0.844, + "step": 7095 + }, + { + "epoch": 0.3825749406944145, + "grad_norm": 0.7233890891075134, + "learning_rate": 9.781621567061135e-06, + "loss": 0.7877, + "step": 7096 + }, + { + "epoch": 0.38262885486305803, + "grad_norm": 0.9560993909835815, + "learning_rate": 9.781559591712657e-06, + "loss": 0.7122, + "step": 7097 + }, + { + "epoch": 0.3826827690317015, + "grad_norm": 1.2268224954605103, + "learning_rate": 9.781497607767581e-06, + "loss": 0.7091, + "step": 7098 + }, + { + "epoch": 0.38273668320034504, + "grad_norm": 0.7089884281158447, + "learning_rate": 9.781435615226015e-06, + "loss": 0.7065, + "step": 7099 + }, + { + "epoch": 0.38279059736898857, + "grad_norm": 0.814584493637085, + "learning_rate": 9.781373614088074e-06, + "loss": 0.7798, + "step": 7100 + }, + { + "epoch": 0.3828445115376321, + "grad_norm": 1.0395959615707397, + "learning_rate": 9.781311604353867e-06, + "loss": 0.808, + "step": 7101 + }, + { + "epoch": 0.38289842570627564, + "grad_norm": 0.7653658390045166, + "learning_rate": 9.781249586023508e-06, + "loss": 0.6993, + "step": 7102 + }, + { + "epoch": 0.3829523398749191, + "grad_norm": 1.231651782989502, + "learning_rate": 9.781187559097105e-06, + "loss": 0.6384, + "step": 7103 + }, + { + "epoch": 0.38300625404356264, + "grad_norm": 0.8316325545310974, + "learning_rate": 
9.781125523574775e-06, + "loss": 0.7548, + "step": 7104 + }, + { + "epoch": 0.3830601682122062, + "grad_norm": 0.9582620859146118, + "learning_rate": 9.781063479456622e-06, + "loss": 0.8951, + "step": 7105 + }, + { + "epoch": 0.3831140823808497, + "grad_norm": 0.6989379525184631, + "learning_rate": 9.781001426742766e-06, + "loss": 0.6727, + "step": 7106 + }, + { + "epoch": 0.3831679965494932, + "grad_norm": 0.8801706433296204, + "learning_rate": 9.78093936543331e-06, + "loss": 0.8299, + "step": 7107 + }, + { + "epoch": 0.3832219107181367, + "grad_norm": 1.0359632968902588, + "learning_rate": 9.780877295528374e-06, + "loss": 0.8703, + "step": 7108 + }, + { + "epoch": 0.38327582488678025, + "grad_norm": 0.6963036060333252, + "learning_rate": 9.780815217028063e-06, + "loss": 0.723, + "step": 7109 + }, + { + "epoch": 0.3833297390554238, + "grad_norm": 0.9630826711654663, + "learning_rate": 9.780753129932492e-06, + "loss": 0.7917, + "step": 7110 + }, + { + "epoch": 0.3833836532240673, + "grad_norm": 0.7008663415908813, + "learning_rate": 9.78069103424177e-06, + "loss": 0.7417, + "step": 7111 + }, + { + "epoch": 0.3834375673927108, + "grad_norm": 1.047590732574463, + "learning_rate": 9.780628929956012e-06, + "loss": 0.7374, + "step": 7112 + }, + { + "epoch": 0.3834914815613543, + "grad_norm": 0.8497990965843201, + "learning_rate": 9.780566817075328e-06, + "loss": 0.7152, + "step": 7113 + }, + { + "epoch": 0.38354539572999785, + "grad_norm": 0.7189215421676636, + "learning_rate": 9.780504695599828e-06, + "loss": 0.7391, + "step": 7114 + }, + { + "epoch": 0.3835993098986414, + "grad_norm": 0.7899144887924194, + "learning_rate": 9.780442565529627e-06, + "loss": 0.8038, + "step": 7115 + }, + { + "epoch": 0.38365322406728486, + "grad_norm": 0.7165384888648987, + "learning_rate": 9.780380426864833e-06, + "loss": 0.7152, + "step": 7116 + }, + { + "epoch": 0.3837071382359284, + "grad_norm": 0.7541099786758423, + "learning_rate": 9.78031827960556e-06, + "loss": 0.6174, + "step": 
7117 + }, + { + "epoch": 0.3837610524045719, + "grad_norm": 0.7759084105491638, + "learning_rate": 9.780256123751922e-06, + "loss": 0.7956, + "step": 7118 + }, + { + "epoch": 0.38381496657321545, + "grad_norm": 0.7828801274299622, + "learning_rate": 9.780193959304026e-06, + "loss": 0.8298, + "step": 7119 + }, + { + "epoch": 0.383868880741859, + "grad_norm": 0.7594059109687805, + "learning_rate": 9.780131786261987e-06, + "loss": 0.7468, + "step": 7120 + }, + { + "epoch": 0.38392279491050246, + "grad_norm": 0.8509312272071838, + "learning_rate": 9.780069604625914e-06, + "loss": 0.8449, + "step": 7121 + }, + { + "epoch": 0.383976709079146, + "grad_norm": 0.814454972743988, + "learning_rate": 9.780007414395923e-06, + "loss": 0.8111, + "step": 7122 + }, + { + "epoch": 0.3840306232477895, + "grad_norm": 0.8145062327384949, + "learning_rate": 9.779945215572122e-06, + "loss": 0.8345, + "step": 7123 + }, + { + "epoch": 0.38408453741643306, + "grad_norm": 0.909447968006134, + "learning_rate": 9.779883008154625e-06, + "loss": 0.9282, + "step": 7124 + }, + { + "epoch": 0.38413845158507653, + "grad_norm": 0.7273494601249695, + "learning_rate": 9.779820792143542e-06, + "loss": 0.7642, + "step": 7125 + }, + { + "epoch": 0.38419236575372007, + "grad_norm": 0.8186411261558533, + "learning_rate": 9.779758567538986e-06, + "loss": 0.8227, + "step": 7126 + }, + { + "epoch": 0.3842462799223636, + "grad_norm": 0.7517033815383911, + "learning_rate": 9.779696334341068e-06, + "loss": 0.6771, + "step": 7127 + }, + { + "epoch": 0.38430019409100713, + "grad_norm": 0.7327735424041748, + "learning_rate": 9.779634092549903e-06, + "loss": 0.8404, + "step": 7128 + }, + { + "epoch": 0.38435410825965066, + "grad_norm": 0.7326714396476746, + "learning_rate": 9.779571842165599e-06, + "loss": 0.752, + "step": 7129 + }, + { + "epoch": 0.38440802242829414, + "grad_norm": 0.7145928740501404, + "learning_rate": 9.77950958318827e-06, + "loss": 0.7336, + "step": 7130 + }, + { + "epoch": 0.38446193659693767, + 
"grad_norm": 0.9827373623847961, + "learning_rate": 9.779447315618027e-06, + "loss": 0.8231, + "step": 7131 + }, + { + "epoch": 0.3845158507655812, + "grad_norm": 0.8853504657745361, + "learning_rate": 9.779385039454983e-06, + "loss": 0.8779, + "step": 7132 + }, + { + "epoch": 0.38456976493422473, + "grad_norm": 0.7579771280288696, + "learning_rate": 9.779322754699248e-06, + "loss": 0.6818, + "step": 7133 + }, + { + "epoch": 0.3846236791028682, + "grad_norm": 0.7155663371086121, + "learning_rate": 9.779260461350937e-06, + "loss": 0.7575, + "step": 7134 + }, + { + "epoch": 0.38467759327151174, + "grad_norm": 0.7029743194580078, + "learning_rate": 9.779198159410158e-06, + "loss": 0.8218, + "step": 7135 + }, + { + "epoch": 0.3847315074401553, + "grad_norm": 0.773006021976471, + "learning_rate": 9.779135848877026e-06, + "loss": 0.8279, + "step": 7136 + }, + { + "epoch": 0.3847854216087988, + "grad_norm": 0.8574584126472473, + "learning_rate": 9.779073529751653e-06, + "loss": 0.7889, + "step": 7137 + }, + { + "epoch": 0.38483933577744234, + "grad_norm": 0.6678891777992249, + "learning_rate": 9.77901120203415e-06, + "loss": 0.6788, + "step": 7138 + }, + { + "epoch": 0.3848932499460858, + "grad_norm": 0.7358391284942627, + "learning_rate": 9.77894886572463e-06, + "loss": 0.8491, + "step": 7139 + }, + { + "epoch": 0.38494716411472935, + "grad_norm": 0.9488425254821777, + "learning_rate": 9.778886520823204e-06, + "loss": 0.8, + "step": 7140 + }, + { + "epoch": 0.3850010782833729, + "grad_norm": 0.6843715906143188, + "learning_rate": 9.778824167329986e-06, + "loss": 0.7513, + "step": 7141 + }, + { + "epoch": 0.3850549924520164, + "grad_norm": 0.7813382148742676, + "learning_rate": 9.778761805245084e-06, + "loss": 0.844, + "step": 7142 + }, + { + "epoch": 0.3851089066206599, + "grad_norm": 0.8355675339698792, + "learning_rate": 9.778699434568614e-06, + "loss": 0.8253, + "step": 7143 + }, + { + "epoch": 0.3851628207893034, + "grad_norm": 0.7164924144744873, + "learning_rate": 
9.778637055300687e-06, + "loss": 0.8254, + "step": 7144 + }, + { + "epoch": 0.38521673495794695, + "grad_norm": 0.7583678364753723, + "learning_rate": 9.778574667441416e-06, + "loss": 0.7364, + "step": 7145 + }, + { + "epoch": 0.3852706491265905, + "grad_norm": 0.7031839489936829, + "learning_rate": 9.77851227099091e-06, + "loss": 0.7179, + "step": 7146 + }, + { + "epoch": 0.385324563295234, + "grad_norm": 0.7389762997627258, + "learning_rate": 9.778449865949284e-06, + "loss": 0.7219, + "step": 7147 + }, + { + "epoch": 0.3853784774638775, + "grad_norm": 0.788447916507721, + "learning_rate": 9.778387452316651e-06, + "loss": 0.7282, + "step": 7148 + }, + { + "epoch": 0.385432391632521, + "grad_norm": 0.8027471899986267, + "learning_rate": 9.778325030093122e-06, + "loss": 0.7223, + "step": 7149 + }, + { + "epoch": 0.38548630580116455, + "grad_norm": 0.6544430255889893, + "learning_rate": 9.778262599278807e-06, + "loss": 0.6405, + "step": 7150 + }, + { + "epoch": 0.3855402199698081, + "grad_norm": 0.754837155342102, + "learning_rate": 9.778200159873822e-06, + "loss": 0.7927, + "step": 7151 + }, + { + "epoch": 0.38559413413845156, + "grad_norm": 0.8215610384941101, + "learning_rate": 9.778137711878278e-06, + "loss": 0.8136, + "step": 7152 + }, + { + "epoch": 0.3856480483070951, + "grad_norm": 0.6234309673309326, + "learning_rate": 9.778075255292285e-06, + "loss": 0.6496, + "step": 7153 + }, + { + "epoch": 0.3857019624757386, + "grad_norm": 0.7744494080543518, + "learning_rate": 9.778012790115958e-06, + "loss": 0.6882, + "step": 7154 + }, + { + "epoch": 0.38575587664438216, + "grad_norm": 0.7534972429275513, + "learning_rate": 9.777950316349408e-06, + "loss": 0.8306, + "step": 7155 + }, + { + "epoch": 0.3858097908130257, + "grad_norm": 0.7850611209869385, + "learning_rate": 9.777887833992747e-06, + "loss": 0.6157, + "step": 7156 + }, + { + "epoch": 0.38586370498166916, + "grad_norm": 0.681824803352356, + "learning_rate": 9.77782534304609e-06, + "loss": 0.7417, + "step": 
7157 + }, + { + "epoch": 0.3859176191503127, + "grad_norm": 1.020413875579834, + "learning_rate": 9.777762843509547e-06, + "loss": 0.704, + "step": 7158 + }, + { + "epoch": 0.38597153331895623, + "grad_norm": 0.8840301632881165, + "learning_rate": 9.77770033538323e-06, + "loss": 0.839, + "step": 7159 + }, + { + "epoch": 0.38602544748759976, + "grad_norm": 0.7336562871932983, + "learning_rate": 9.777637818667253e-06, + "loss": 0.8295, + "step": 7160 + }, + { + "epoch": 0.38607936165624324, + "grad_norm": 0.8236925005912781, + "learning_rate": 9.777575293361727e-06, + "loss": 0.8271, + "step": 7161 + }, + { + "epoch": 0.38613327582488677, + "grad_norm": 0.7166433334350586, + "learning_rate": 9.777512759466764e-06, + "loss": 0.7598, + "step": 7162 + }, + { + "epoch": 0.3861871899935303, + "grad_norm": 0.7700833678245544, + "learning_rate": 9.777450216982481e-06, + "loss": 0.8275, + "step": 7163 + }, + { + "epoch": 0.38624110416217383, + "grad_norm": 0.7442660331726074, + "learning_rate": 9.777387665908983e-06, + "loss": 0.7664, + "step": 7164 + }, + { + "epoch": 0.38629501833081736, + "grad_norm": 0.7203068137168884, + "learning_rate": 9.777325106246387e-06, + "loss": 0.7887, + "step": 7165 + }, + { + "epoch": 0.38634893249946084, + "grad_norm": 0.7156651020050049, + "learning_rate": 9.777262537994806e-06, + "loss": 0.8015, + "step": 7166 + }, + { + "epoch": 0.38640284666810437, + "grad_norm": 0.7667158842086792, + "learning_rate": 9.777199961154351e-06, + "loss": 0.7798, + "step": 7167 + }, + { + "epoch": 0.3864567608367479, + "grad_norm": 0.8424071669578552, + "learning_rate": 9.777137375725134e-06, + "loss": 0.7846, + "step": 7168 + }, + { + "epoch": 0.38651067500539144, + "grad_norm": 0.8547120094299316, + "learning_rate": 9.777074781707268e-06, + "loss": 0.7722, + "step": 7169 + }, + { + "epoch": 0.3865645891740349, + "grad_norm": 0.6703792214393616, + "learning_rate": 9.777012179100867e-06, + "loss": 0.6888, + "step": 7170 + }, + { + "epoch": 
0.38661850334267844, + "grad_norm": 0.7104577422142029, + "learning_rate": 9.776949567906042e-06, + "loss": 0.7484, + "step": 7171 + }, + { + "epoch": 0.386672417511322, + "grad_norm": 0.6734687685966492, + "learning_rate": 9.776886948122905e-06, + "loss": 0.7067, + "step": 7172 + }, + { + "epoch": 0.3867263316799655, + "grad_norm": 0.7321043610572815, + "learning_rate": 9.776824319751571e-06, + "loss": 0.7549, + "step": 7173 + }, + { + "epoch": 0.38678024584860904, + "grad_norm": 0.7712130546569824, + "learning_rate": 9.77676168279215e-06, + "loss": 0.7487, + "step": 7174 + }, + { + "epoch": 0.3868341600172525, + "grad_norm": 0.7074081897735596, + "learning_rate": 9.776699037244757e-06, + "loss": 0.7587, + "step": 7175 + }, + { + "epoch": 0.38688807418589605, + "grad_norm": 0.8595475554466248, + "learning_rate": 9.776636383109503e-06, + "loss": 0.8593, + "step": 7176 + }, + { + "epoch": 0.3869419883545396, + "grad_norm": 0.7398093342781067, + "learning_rate": 9.7765737203865e-06, + "loss": 0.7985, + "step": 7177 + }, + { + "epoch": 0.3869959025231831, + "grad_norm": 0.7544327974319458, + "learning_rate": 9.776511049075863e-06, + "loss": 0.71, + "step": 7178 + }, + { + "epoch": 0.3870498166918266, + "grad_norm": 0.7162982821464539, + "learning_rate": 9.776448369177702e-06, + "loss": 0.7149, + "step": 7179 + }, + { + "epoch": 0.3871037308604701, + "grad_norm": 0.7425058484077454, + "learning_rate": 9.776385680692132e-06, + "loss": 0.7674, + "step": 7180 + }, + { + "epoch": 0.38715764502911365, + "grad_norm": 0.7449594736099243, + "learning_rate": 9.776322983619265e-06, + "loss": 0.7787, + "step": 7181 + }, + { + "epoch": 0.3872115591977572, + "grad_norm": 0.7404816746711731, + "learning_rate": 9.776260277959214e-06, + "loss": 0.7568, + "step": 7182 + }, + { + "epoch": 0.3872654733664007, + "grad_norm": 0.8262953758239746, + "learning_rate": 9.776197563712088e-06, + "loss": 0.8439, + "step": 7183 + }, + { + "epoch": 0.3873193875350442, + "grad_norm": 
0.7689031958580017, + "learning_rate": 9.776134840878005e-06, + "loss": 0.7875, + "step": 7184 + }, + { + "epoch": 0.3873733017036877, + "grad_norm": 0.7427681088447571, + "learning_rate": 9.776072109457075e-06, + "loss": 0.7571, + "step": 7185 + }, + { + "epoch": 0.38742721587233125, + "grad_norm": 0.7441726326942444, + "learning_rate": 9.776009369449412e-06, + "loss": 0.6062, + "step": 7186 + }, + { + "epoch": 0.3874811300409748, + "grad_norm": 0.7031913995742798, + "learning_rate": 9.77594662085513e-06, + "loss": 0.7333, + "step": 7187 + }, + { + "epoch": 0.38753504420961826, + "grad_norm": 0.7519451379776001, + "learning_rate": 9.775883863674338e-06, + "loss": 0.7846, + "step": 7188 + }, + { + "epoch": 0.3875889583782618, + "grad_norm": 0.8085461854934692, + "learning_rate": 9.775821097907152e-06, + "loss": 0.6919, + "step": 7189 + }, + { + "epoch": 0.3876428725469053, + "grad_norm": 0.7484344840049744, + "learning_rate": 9.775758323553683e-06, + "loss": 0.7392, + "step": 7190 + }, + { + "epoch": 0.38769678671554886, + "grad_norm": 0.7392482161521912, + "learning_rate": 9.775695540614045e-06, + "loss": 0.7008, + "step": 7191 + }, + { + "epoch": 0.3877507008841924, + "grad_norm": 0.8023311495780945, + "learning_rate": 9.77563274908835e-06, + "loss": 0.806, + "step": 7192 + }, + { + "epoch": 0.38780461505283587, + "grad_norm": 0.7499966025352478, + "learning_rate": 9.775569948976714e-06, + "loss": 0.7355, + "step": 7193 + }, + { + "epoch": 0.3878585292214794, + "grad_norm": 0.7906792759895325, + "learning_rate": 9.775507140279243e-06, + "loss": 0.8647, + "step": 7194 + }, + { + "epoch": 0.38791244339012293, + "grad_norm": 0.7196068167686462, + "learning_rate": 9.775444322996057e-06, + "loss": 0.8203, + "step": 7195 + }, + { + "epoch": 0.38796635755876646, + "grad_norm": 0.6494006514549255, + "learning_rate": 9.775381497127266e-06, + "loss": 0.7183, + "step": 7196 + }, + { + "epoch": 0.38802027172740994, + "grad_norm": 0.6549767255783081, + "learning_rate": 
9.775318662672984e-06, + "loss": 0.7368, + "step": 7197 + }, + { + "epoch": 0.38807418589605347, + "grad_norm": 0.8048344254493713, + "learning_rate": 9.77525581963332e-06, + "loss": 0.8634, + "step": 7198 + }, + { + "epoch": 0.388128100064697, + "grad_norm": 0.7775405645370483, + "learning_rate": 9.775192968008394e-06, + "loss": 0.7856, + "step": 7199 + }, + { + "epoch": 0.38818201423334053, + "grad_norm": 0.7218417525291443, + "learning_rate": 9.775130107798311e-06, + "loss": 0.6913, + "step": 7200 + }, + { + "epoch": 0.38823592840198407, + "grad_norm": 0.8354431986808777, + "learning_rate": 9.775067239003191e-06, + "loss": 0.7491, + "step": 7201 + }, + { + "epoch": 0.38828984257062754, + "grad_norm": 0.8344469666481018, + "learning_rate": 9.775004361623144e-06, + "loss": 0.8513, + "step": 7202 + }, + { + "epoch": 0.3883437567392711, + "grad_norm": 0.7339980006217957, + "learning_rate": 9.774941475658281e-06, + "loss": 0.6933, + "step": 7203 + }, + { + "epoch": 0.3883976709079146, + "grad_norm": 0.7363070845603943, + "learning_rate": 9.77487858110872e-06, + "loss": 0.8156, + "step": 7204 + }, + { + "epoch": 0.38845158507655814, + "grad_norm": 0.7072883248329163, + "learning_rate": 9.774815677974569e-06, + "loss": 0.712, + "step": 7205 + }, + { + "epoch": 0.3885054992452016, + "grad_norm": 0.7575790286064148, + "learning_rate": 9.774752766255944e-06, + "loss": 0.8267, + "step": 7206 + }, + { + "epoch": 0.38855941341384514, + "grad_norm": 1.1160898208618164, + "learning_rate": 9.774689845952958e-06, + "loss": 0.7346, + "step": 7207 + }, + { + "epoch": 0.3886133275824887, + "grad_norm": 0.672379195690155, + "learning_rate": 9.774626917065724e-06, + "loss": 0.6814, + "step": 7208 + }, + { + "epoch": 0.3886672417511322, + "grad_norm": 0.8016130328178406, + "learning_rate": 9.774563979594354e-06, + "loss": 0.8384, + "step": 7209 + }, + { + "epoch": 0.38872115591977574, + "grad_norm": 0.7499125003814697, + "learning_rate": 9.774501033538961e-06, + "loss": 0.7583, + 
"step": 7210 + }, + { + "epoch": 0.3887750700884192, + "grad_norm": 0.6985812187194824, + "learning_rate": 9.77443807889966e-06, + "loss": 0.676, + "step": 7211 + }, + { + "epoch": 0.38882898425706275, + "grad_norm": 0.7313497066497803, + "learning_rate": 9.774375115676565e-06, + "loss": 0.7817, + "step": 7212 + }, + { + "epoch": 0.3888828984257063, + "grad_norm": 0.6926987767219543, + "learning_rate": 9.774312143869785e-06, + "loss": 0.7017, + "step": 7213 + }, + { + "epoch": 0.3889368125943498, + "grad_norm": 0.7496188879013062, + "learning_rate": 9.774249163479437e-06, + "loss": 0.7478, + "step": 7214 + }, + { + "epoch": 0.3889907267629933, + "grad_norm": 0.732998788356781, + "learning_rate": 9.774186174505632e-06, + "loss": 0.7039, + "step": 7215 + }, + { + "epoch": 0.3890446409316368, + "grad_norm": 0.6698726415634155, + "learning_rate": 9.774123176948484e-06, + "loss": 0.7522, + "step": 7216 + }, + { + "epoch": 0.38909855510028035, + "grad_norm": 0.9169347286224365, + "learning_rate": 9.774060170808108e-06, + "loss": 0.8933, + "step": 7217 + }, + { + "epoch": 0.3891524692689239, + "grad_norm": 0.7003192901611328, + "learning_rate": 9.773997156084615e-06, + "loss": 0.7744, + "step": 7218 + }, + { + "epoch": 0.3892063834375674, + "grad_norm": 0.7086424231529236, + "learning_rate": 9.773934132778118e-06, + "loss": 0.7167, + "step": 7219 + }, + { + "epoch": 0.3892602976062109, + "grad_norm": 1.0899872779846191, + "learning_rate": 9.773871100888733e-06, + "loss": 0.7447, + "step": 7220 + }, + { + "epoch": 0.3893142117748544, + "grad_norm": 0.6036999225616455, + "learning_rate": 9.77380806041657e-06, + "loss": 0.6182, + "step": 7221 + }, + { + "epoch": 0.38936812594349796, + "grad_norm": 0.7723484039306641, + "learning_rate": 9.773745011361743e-06, + "loss": 0.7982, + "step": 7222 + }, + { + "epoch": 0.3894220401121415, + "grad_norm": 0.7311958074569702, + "learning_rate": 9.77368195372437e-06, + "loss": 0.8198, + "step": 7223 + }, + { + "epoch": 
0.38947595428078496, + "grad_norm": 0.7924654483795166, + "learning_rate": 9.773618887504558e-06, + "loss": 0.7702, + "step": 7224 + }, + { + "epoch": 0.3895298684494285, + "grad_norm": 0.9057634472846985, + "learning_rate": 9.773555812702423e-06, + "loss": 0.8835, + "step": 7225 + }, + { + "epoch": 0.389583782618072, + "grad_norm": 0.6983116865158081, + "learning_rate": 9.773492729318079e-06, + "loss": 0.7243, + "step": 7226 + }, + { + "epoch": 0.38963769678671556, + "grad_norm": 0.7721404433250427, + "learning_rate": 9.773429637351638e-06, + "loss": 0.8214, + "step": 7227 + }, + { + "epoch": 0.3896916109553591, + "grad_norm": 0.7740369439125061, + "learning_rate": 9.773366536803218e-06, + "loss": 0.8336, + "step": 7228 + }, + { + "epoch": 0.38974552512400257, + "grad_norm": 0.8241497874259949, + "learning_rate": 9.773303427672924e-06, + "loss": 0.7732, + "step": 7229 + }, + { + "epoch": 0.3897994392926461, + "grad_norm": 0.7673742771148682, + "learning_rate": 9.773240309960876e-06, + "loss": 0.785, + "step": 7230 + }, + { + "epoch": 0.38985335346128963, + "grad_norm": 0.695315957069397, + "learning_rate": 9.773177183667186e-06, + "loss": 0.7008, + "step": 7231 + }, + { + "epoch": 0.38990726762993316, + "grad_norm": 0.8540239930152893, + "learning_rate": 9.773114048791967e-06, + "loss": 0.8269, + "step": 7232 + }, + { + "epoch": 0.38996118179857664, + "grad_norm": 0.8529762029647827, + "learning_rate": 9.773050905335334e-06, + "loss": 0.769, + "step": 7233 + }, + { + "epoch": 0.39001509596722017, + "grad_norm": 0.8246963024139404, + "learning_rate": 9.772987753297399e-06, + "loss": 0.8003, + "step": 7234 + }, + { + "epoch": 0.3900690101358637, + "grad_norm": 0.6554851531982422, + "learning_rate": 9.772924592678274e-06, + "loss": 0.6151, + "step": 7235 + }, + { + "epoch": 0.39012292430450723, + "grad_norm": 0.7284125685691833, + "learning_rate": 9.772861423478075e-06, + "loss": 0.7704, + "step": 7236 + }, + { + "epoch": 0.39017683847315077, + "grad_norm": 
0.6979566812515259, + "learning_rate": 9.772798245696914e-06, + "loss": 0.6634, + "step": 7237 + }, + { + "epoch": 0.39023075264179424, + "grad_norm": 0.6561529040336609, + "learning_rate": 9.772735059334907e-06, + "loss": 0.7215, + "step": 7238 + }, + { + "epoch": 0.3902846668104378, + "grad_norm": 1.4794889688491821, + "learning_rate": 9.772671864392165e-06, + "loss": 0.6525, + "step": 7239 + }, + { + "epoch": 0.3903385809790813, + "grad_norm": 0.7443854808807373, + "learning_rate": 9.772608660868802e-06, + "loss": 0.7975, + "step": 7240 + }, + { + "epoch": 0.39039249514772484, + "grad_norm": 0.7808144092559814, + "learning_rate": 9.772545448764935e-06, + "loss": 0.8145, + "step": 7241 + }, + { + "epoch": 0.3904464093163683, + "grad_norm": 0.6567608118057251, + "learning_rate": 9.772482228080673e-06, + "loss": 0.7462, + "step": 7242 + }, + { + "epoch": 0.39050032348501185, + "grad_norm": 0.7482529282569885, + "learning_rate": 9.772418998816133e-06, + "loss": 0.8116, + "step": 7243 + }, + { + "epoch": 0.3905542376536554, + "grad_norm": 0.6860970854759216, + "learning_rate": 9.772355760971425e-06, + "loss": 0.748, + "step": 7244 + }, + { + "epoch": 0.3906081518222989, + "grad_norm": 0.711781919002533, + "learning_rate": 9.772292514546667e-06, + "loss": 0.7699, + "step": 7245 + }, + { + "epoch": 0.39066206599094244, + "grad_norm": 0.7066224217414856, + "learning_rate": 9.77222925954197e-06, + "loss": 0.7359, + "step": 7246 + }, + { + "epoch": 0.3907159801595859, + "grad_norm": 0.8418669104576111, + "learning_rate": 9.772165995957449e-06, + "loss": 0.8782, + "step": 7247 + }, + { + "epoch": 0.39076989432822945, + "grad_norm": 0.7255772948265076, + "learning_rate": 9.772102723793216e-06, + "loss": 0.8685, + "step": 7248 + }, + { + "epoch": 0.390823808496873, + "grad_norm": 0.6516369581222534, + "learning_rate": 9.772039443049386e-06, + "loss": 0.7099, + "step": 7249 + }, + { + "epoch": 0.3908777226655165, + "grad_norm": 0.7248709201812744, + "learning_rate": 
9.771976153726073e-06, + "loss": 0.8036, + "step": 7250 + }, + { + "epoch": 0.39093163683416, + "grad_norm": 1.0460762977600098, + "learning_rate": 9.77191285582339e-06, + "loss": 0.7839, + "step": 7251 + }, + { + "epoch": 0.3909855510028035, + "grad_norm": 0.7196239233016968, + "learning_rate": 9.771849549341454e-06, + "loss": 0.7106, + "step": 7252 + }, + { + "epoch": 0.39103946517144705, + "grad_norm": 0.7477738261222839, + "learning_rate": 9.771786234280374e-06, + "loss": 0.8581, + "step": 7253 + }, + { + "epoch": 0.3910933793400906, + "grad_norm": 0.8589550852775574, + "learning_rate": 9.771722910640265e-06, + "loss": 0.7366, + "step": 7254 + }, + { + "epoch": 0.3911472935087341, + "grad_norm": 0.7551881670951843, + "learning_rate": 9.771659578421244e-06, + "loss": 0.793, + "step": 7255 + }, + { + "epoch": 0.3912012076773776, + "grad_norm": 0.8437439203262329, + "learning_rate": 9.77159623762342e-06, + "loss": 0.7742, + "step": 7256 + }, + { + "epoch": 0.3912551218460211, + "grad_norm": 0.8025644421577454, + "learning_rate": 9.771532888246911e-06, + "loss": 0.8448, + "step": 7257 + }, + { + "epoch": 0.39130903601466466, + "grad_norm": 0.7572808861732483, + "learning_rate": 9.771469530291829e-06, + "loss": 0.8255, + "step": 7258 + }, + { + "epoch": 0.3913629501833082, + "grad_norm": 0.7170048356056213, + "learning_rate": 9.77140616375829e-06, + "loss": 0.7403, + "step": 7259 + }, + { + "epoch": 0.39141686435195167, + "grad_norm": 0.7780778408050537, + "learning_rate": 9.771342788646404e-06, + "loss": 0.7593, + "step": 7260 + }, + { + "epoch": 0.3914707785205952, + "grad_norm": 0.697472095489502, + "learning_rate": 9.77127940495629e-06, + "loss": 0.6738, + "step": 7261 + }, + { + "epoch": 0.39152469268923873, + "grad_norm": 0.6877736449241638, + "learning_rate": 9.771216012688055e-06, + "loss": 0.7371, + "step": 7262 + }, + { + "epoch": 0.39157860685788226, + "grad_norm": 0.8481675982475281, + "learning_rate": 9.77115261184182e-06, + "loss": 0.8472, + "step": 
7263 + }, + { + "epoch": 0.3916325210265258, + "grad_norm": 0.880743682384491, + "learning_rate": 9.771089202417695e-06, + "loss": 0.8487, + "step": 7264 + }, + { + "epoch": 0.39168643519516927, + "grad_norm": 0.7655490636825562, + "learning_rate": 9.771025784415795e-06, + "loss": 0.8606, + "step": 7265 + }, + { + "epoch": 0.3917403493638128, + "grad_norm": 0.7072889804840088, + "learning_rate": 9.770962357836234e-06, + "loss": 0.7341, + "step": 7266 + }, + { + "epoch": 0.39179426353245633, + "grad_norm": 0.713034451007843, + "learning_rate": 9.770898922679126e-06, + "loss": 0.7216, + "step": 7267 + }, + { + "epoch": 0.39184817770109986, + "grad_norm": 0.7493498921394348, + "learning_rate": 9.770835478944587e-06, + "loss": 0.7723, + "step": 7268 + }, + { + "epoch": 0.39190209186974334, + "grad_norm": 0.7712430357933044, + "learning_rate": 9.770772026632728e-06, + "loss": 0.8066, + "step": 7269 + }, + { + "epoch": 0.3919560060383869, + "grad_norm": 0.9837167859077454, + "learning_rate": 9.770708565743664e-06, + "loss": 0.7815, + "step": 7270 + }, + { + "epoch": 0.3920099202070304, + "grad_norm": 0.7358155846595764, + "learning_rate": 9.770645096277511e-06, + "loss": 0.7121, + "step": 7271 + }, + { + "epoch": 0.39206383437567394, + "grad_norm": 0.7458567023277283, + "learning_rate": 9.77058161823438e-06, + "loss": 0.6925, + "step": 7272 + }, + { + "epoch": 0.39211774854431747, + "grad_norm": 0.6907920241355896, + "learning_rate": 9.770518131614387e-06, + "loss": 0.7196, + "step": 7273 + }, + { + "epoch": 0.39217166271296094, + "grad_norm": 0.7684288024902344, + "learning_rate": 9.770454636417646e-06, + "loss": 0.8098, + "step": 7274 + }, + { + "epoch": 0.3922255768816045, + "grad_norm": 0.8313829302787781, + "learning_rate": 9.77039113264427e-06, + "loss": 0.6774, + "step": 7275 + }, + { + "epoch": 0.392279491050248, + "grad_norm": 0.6937968730926514, + "learning_rate": 9.770327620294375e-06, + "loss": 0.7417, + "step": 7276 + }, + { + "epoch": 0.39233340521889154, + 
"grad_norm": 0.8188153505325317, + "learning_rate": 9.770264099368075e-06, + "loss": 0.6879, + "step": 7277 + }, + { + "epoch": 0.39238731938753507, + "grad_norm": 0.8670568466186523, + "learning_rate": 9.770200569865483e-06, + "loss": 0.7743, + "step": 7278 + }, + { + "epoch": 0.39244123355617855, + "grad_norm": 0.702637255191803, + "learning_rate": 9.770137031786713e-06, + "loss": 0.7489, + "step": 7279 + }, + { + "epoch": 0.3924951477248221, + "grad_norm": 0.7256435751914978, + "learning_rate": 9.77007348513188e-06, + "loss": 0.6802, + "step": 7280 + }, + { + "epoch": 0.3925490618934656, + "grad_norm": 0.7279376983642578, + "learning_rate": 9.770009929901099e-06, + "loss": 0.759, + "step": 7281 + }, + { + "epoch": 0.39260297606210914, + "grad_norm": 0.705956757068634, + "learning_rate": 9.769946366094484e-06, + "loss": 0.7623, + "step": 7282 + }, + { + "epoch": 0.3926568902307526, + "grad_norm": 0.7047564387321472, + "learning_rate": 9.769882793712147e-06, + "loss": 0.7259, + "step": 7283 + }, + { + "epoch": 0.39271080439939615, + "grad_norm": 0.7247083187103271, + "learning_rate": 9.769819212754206e-06, + "loss": 0.7303, + "step": 7284 + }, + { + "epoch": 0.3927647185680397, + "grad_norm": 0.752776563167572, + "learning_rate": 9.769755623220772e-06, + "loss": 0.8773, + "step": 7285 + }, + { + "epoch": 0.3928186327366832, + "grad_norm": 0.8147323727607727, + "learning_rate": 9.76969202511196e-06, + "loss": 0.809, + "step": 7286 + }, + { + "epoch": 0.39287254690532675, + "grad_norm": 0.7553462982177734, + "learning_rate": 9.769628418427886e-06, + "loss": 0.7689, + "step": 7287 + }, + { + "epoch": 0.3929264610739702, + "grad_norm": 0.7849236726760864, + "learning_rate": 9.769564803168665e-06, + "loss": 0.6951, + "step": 7288 + }, + { + "epoch": 0.39298037524261376, + "grad_norm": 0.8374189138412476, + "learning_rate": 9.769501179334408e-06, + "loss": 0.8156, + "step": 7289 + }, + { + "epoch": 0.3930342894112573, + "grad_norm": 0.6603876352310181, + 
"learning_rate": 9.769437546925232e-06, + "loss": 0.6556, + "step": 7290 + }, + { + "epoch": 0.3930882035799008, + "grad_norm": 0.8605456948280334, + "learning_rate": 9.769373905941249e-06, + "loss": 0.7491, + "step": 7291 + }, + { + "epoch": 0.3931421177485443, + "grad_norm": 1.1489166021347046, + "learning_rate": 9.769310256382576e-06, + "loss": 0.7826, + "step": 7292 + }, + { + "epoch": 0.3931960319171878, + "grad_norm": 0.666865348815918, + "learning_rate": 9.769246598249325e-06, + "loss": 0.6825, + "step": 7293 + }, + { + "epoch": 0.39324994608583136, + "grad_norm": 0.7862495183944702, + "learning_rate": 9.769182931541614e-06, + "loss": 0.8041, + "step": 7294 + }, + { + "epoch": 0.3933038602544749, + "grad_norm": 0.6634209156036377, + "learning_rate": 9.769119256259554e-06, + "loss": 0.7457, + "step": 7295 + }, + { + "epoch": 0.3933577744231184, + "grad_norm": 0.905255138874054, + "learning_rate": 9.769055572403261e-06, + "loss": 0.8636, + "step": 7296 + }, + { + "epoch": 0.3934116885917619, + "grad_norm": 0.7580832242965698, + "learning_rate": 9.76899187997285e-06, + "loss": 0.722, + "step": 7297 + }, + { + "epoch": 0.39346560276040543, + "grad_norm": 0.8241808414459229, + "learning_rate": 9.768928178968435e-06, + "loss": 0.8048, + "step": 7298 + }, + { + "epoch": 0.39351951692904896, + "grad_norm": 0.7899685502052307, + "learning_rate": 9.768864469390128e-06, + "loss": 0.7474, + "step": 7299 + }, + { + "epoch": 0.3935734310976925, + "grad_norm": 0.6969719529151917, + "learning_rate": 9.768800751238048e-06, + "loss": 0.7111, + "step": 7300 + }, + { + "epoch": 0.39362734526633597, + "grad_norm": 0.7246440649032593, + "learning_rate": 9.768737024512306e-06, + "loss": 0.7172, + "step": 7301 + }, + { + "epoch": 0.3936812594349795, + "grad_norm": 0.8120426535606384, + "learning_rate": 9.76867328921302e-06, + "loss": 0.7665, + "step": 7302 + }, + { + "epoch": 0.39373517360362303, + "grad_norm": 0.8631522059440613, + "learning_rate": 9.768609545340302e-06, + "loss": 
0.7708, + "step": 7303 + }, + { + "epoch": 0.39378908777226657, + "grad_norm": 0.7484616041183472, + "learning_rate": 9.768545792894267e-06, + "loss": 0.8757, + "step": 7304 + }, + { + "epoch": 0.3938430019409101, + "grad_norm": 0.6609781980514526, + "learning_rate": 9.768482031875028e-06, + "loss": 0.7447, + "step": 7305 + }, + { + "epoch": 0.3938969161095536, + "grad_norm": 0.7116879224777222, + "learning_rate": 9.768418262282704e-06, + "loss": 0.7758, + "step": 7306 + }, + { + "epoch": 0.3939508302781971, + "grad_norm": 0.843863844871521, + "learning_rate": 9.768354484117406e-06, + "loss": 0.8307, + "step": 7307 + }, + { + "epoch": 0.39400474444684064, + "grad_norm": 0.8119161128997803, + "learning_rate": 9.76829069737925e-06, + "loss": 0.7883, + "step": 7308 + }, + { + "epoch": 0.39405865861548417, + "grad_norm": 0.6922174096107483, + "learning_rate": 9.768226902068349e-06, + "loss": 0.6898, + "step": 7309 + }, + { + "epoch": 0.39411257278412765, + "grad_norm": 0.7575286030769348, + "learning_rate": 9.76816309818482e-06, + "loss": 0.8155, + "step": 7310 + }, + { + "epoch": 0.3941664869527712, + "grad_norm": 0.8841058611869812, + "learning_rate": 9.768099285728777e-06, + "loss": 0.8113, + "step": 7311 + }, + { + "epoch": 0.3942204011214147, + "grad_norm": 0.7096531987190247, + "learning_rate": 9.768035464700335e-06, + "loss": 0.7892, + "step": 7312 + }, + { + "epoch": 0.39427431529005824, + "grad_norm": 0.7000054717063904, + "learning_rate": 9.767971635099608e-06, + "loss": 0.7566, + "step": 7313 + }, + { + "epoch": 0.3943282294587018, + "grad_norm": 0.7348666787147522, + "learning_rate": 9.76790779692671e-06, + "loss": 0.81, + "step": 7314 + }, + { + "epoch": 0.39438214362734525, + "grad_norm": 0.7042277455329895, + "learning_rate": 9.76784395018176e-06, + "loss": 0.8269, + "step": 7315 + }, + { + "epoch": 0.3944360577959888, + "grad_norm": 0.7349006533622742, + "learning_rate": 9.767780094864866e-06, + "loss": 0.7989, + "step": 7316 + }, + { + "epoch": 
0.3944899719646323, + "grad_norm": 0.716917872428894, + "learning_rate": 9.767716230976147e-06, + "loss": 0.7832, + "step": 7317 + }, + { + "epoch": 0.39454388613327585, + "grad_norm": 0.6973740458488464, + "learning_rate": 9.76765235851572e-06, + "loss": 0.8214, + "step": 7318 + }, + { + "epoch": 0.3945978003019193, + "grad_norm": 0.7072700262069702, + "learning_rate": 9.767588477483694e-06, + "loss": 0.723, + "step": 7319 + }, + { + "epoch": 0.39465171447056285, + "grad_norm": 0.7606068849563599, + "learning_rate": 9.767524587880188e-06, + "loss": 0.8075, + "step": 7320 + }, + { + "epoch": 0.3947056286392064, + "grad_norm": 0.671668291091919, + "learning_rate": 9.767460689705315e-06, + "loss": 0.7338, + "step": 7321 + }, + { + "epoch": 0.3947595428078499, + "grad_norm": 1.1946953535079956, + "learning_rate": 9.767396782959194e-06, + "loss": 0.8632, + "step": 7322 + }, + { + "epoch": 0.39481345697649345, + "grad_norm": 0.7443973422050476, + "learning_rate": 9.767332867641933e-06, + "loss": 0.8716, + "step": 7323 + }, + { + "epoch": 0.3948673711451369, + "grad_norm": 0.7200720310211182, + "learning_rate": 9.767268943753652e-06, + "loss": 0.7557, + "step": 7324 + }, + { + "epoch": 0.39492128531378046, + "grad_norm": 0.7980144023895264, + "learning_rate": 9.767205011294463e-06, + "loss": 0.759, + "step": 7325 + }, + { + "epoch": 0.394975199482424, + "grad_norm": 0.6510097980499268, + "learning_rate": 9.767141070264484e-06, + "loss": 0.7418, + "step": 7326 + }, + { + "epoch": 0.3950291136510675, + "grad_norm": 0.6695359945297241, + "learning_rate": 9.767077120663827e-06, + "loss": 0.67, + "step": 7327 + }, + { + "epoch": 0.395083027819711, + "grad_norm": 0.7033188343048096, + "learning_rate": 9.767013162492609e-06, + "loss": 0.7431, + "step": 7328 + }, + { + "epoch": 0.39513694198835453, + "grad_norm": 0.7332863807678223, + "learning_rate": 9.766949195750944e-06, + "loss": 0.8169, + "step": 7329 + }, + { + "epoch": 0.39519085615699806, + "grad_norm": 
0.6246016025543213, + "learning_rate": 9.766885220438948e-06, + "loss": 0.6674, + "step": 7330 + }, + { + "epoch": 0.3952447703256416, + "grad_norm": 0.8020443320274353, + "learning_rate": 9.766821236556734e-06, + "loss": 0.8112, + "step": 7331 + }, + { + "epoch": 0.3952986844942851, + "grad_norm": 0.6486284136772156, + "learning_rate": 9.76675724410442e-06, + "loss": 0.6272, + "step": 7332 + }, + { + "epoch": 0.3953525986629286, + "grad_norm": 0.6875466704368591, + "learning_rate": 9.766693243082117e-06, + "loss": 0.7294, + "step": 7333 + }, + { + "epoch": 0.39540651283157213, + "grad_norm": 0.7521031498908997, + "learning_rate": 9.766629233489944e-06, + "loss": 0.7395, + "step": 7334 + }, + { + "epoch": 0.39546042700021566, + "grad_norm": 0.7039968371391296, + "learning_rate": 9.766565215328015e-06, + "loss": 0.792, + "step": 7335 + }, + { + "epoch": 0.3955143411688592, + "grad_norm": 0.7921715974807739, + "learning_rate": 9.766501188596444e-06, + "loss": 0.6848, + "step": 7336 + }, + { + "epoch": 0.39556825533750267, + "grad_norm": 0.7686671614646912, + "learning_rate": 9.766437153295347e-06, + "loss": 0.692, + "step": 7337 + }, + { + "epoch": 0.3956221695061462, + "grad_norm": 0.8454588651657104, + "learning_rate": 9.766373109424839e-06, + "loss": 0.7273, + "step": 7338 + }, + { + "epoch": 0.39567608367478974, + "grad_norm": 0.8625504970550537, + "learning_rate": 9.766309056985034e-06, + "loss": 0.8249, + "step": 7339 + }, + { + "epoch": 0.39572999784343327, + "grad_norm": 0.7065198421478271, + "learning_rate": 9.76624499597605e-06, + "loss": 0.7717, + "step": 7340 + }, + { + "epoch": 0.3957839120120768, + "grad_norm": 0.736350953578949, + "learning_rate": 9.766180926397996e-06, + "loss": 0.791, + "step": 7341 + }, + { + "epoch": 0.3958378261807203, + "grad_norm": 0.7497819066047668, + "learning_rate": 9.766116848250994e-06, + "loss": 0.7313, + "step": 7342 + }, + { + "epoch": 0.3958917403493638, + "grad_norm": 0.760215699672699, + "learning_rate": 
9.76605276153516e-06, + "loss": 0.7423, + "step": 7343 + }, + { + "epoch": 0.39594565451800734, + "grad_norm": 0.8288607597351074, + "learning_rate": 9.765988666250602e-06, + "loss": 0.8228, + "step": 7344 + }, + { + "epoch": 0.39599956868665087, + "grad_norm": 0.7401853203773499, + "learning_rate": 9.76592456239744e-06, + "loss": 0.7624, + "step": 7345 + }, + { + "epoch": 0.39605348285529435, + "grad_norm": 0.6871310472488403, + "learning_rate": 9.765860449975789e-06, + "loss": 0.7691, + "step": 7346 + }, + { + "epoch": 0.3961073970239379, + "grad_norm": 0.7931047081947327, + "learning_rate": 9.765796328985763e-06, + "loss": 0.8657, + "step": 7347 + }, + { + "epoch": 0.3961613111925814, + "grad_norm": 0.7092416286468506, + "learning_rate": 9.76573219942748e-06, + "loss": 0.7098, + "step": 7348 + }, + { + "epoch": 0.39621522536122494, + "grad_norm": 0.732279360294342, + "learning_rate": 9.76566806130105e-06, + "loss": 0.8873, + "step": 7349 + }, + { + "epoch": 0.3962691395298685, + "grad_norm": 0.6804463863372803, + "learning_rate": 9.765603914606595e-06, + "loss": 0.7326, + "step": 7350 + }, + { + "epoch": 0.39632305369851195, + "grad_norm": 0.7748779058456421, + "learning_rate": 9.765539759344224e-06, + "loss": 0.7429, + "step": 7351 + }, + { + "epoch": 0.3963769678671555, + "grad_norm": 0.7442399263381958, + "learning_rate": 9.765475595514055e-06, + "loss": 0.7894, + "step": 7352 + }, + { + "epoch": 0.396430882035799, + "grad_norm": 0.7418267130851746, + "learning_rate": 9.765411423116206e-06, + "loss": 0.7107, + "step": 7353 + }, + { + "epoch": 0.39648479620444255, + "grad_norm": 1.14374577999115, + "learning_rate": 9.765347242150788e-06, + "loss": 0.6983, + "step": 7354 + }, + { + "epoch": 0.396538710373086, + "grad_norm": 0.7526988387107849, + "learning_rate": 9.76528305261792e-06, + "loss": 0.7474, + "step": 7355 + }, + { + "epoch": 0.39659262454172955, + "grad_norm": 0.9308310747146606, + "learning_rate": 9.765218854517715e-06, + "loss": 0.8172, + "step": 
7356 + }, + { + "epoch": 0.3966465387103731, + "grad_norm": 0.8375173807144165, + "learning_rate": 9.76515464785029e-06, + "loss": 0.7659, + "step": 7357 + }, + { + "epoch": 0.3967004528790166, + "grad_norm": 2.2292466163635254, + "learning_rate": 9.765090432615757e-06, + "loss": 0.8999, + "step": 7358 + }, + { + "epoch": 0.39675436704766015, + "grad_norm": 0.7127243280410767, + "learning_rate": 9.765026208814237e-06, + "loss": 0.6924, + "step": 7359 + }, + { + "epoch": 0.3968082812163036, + "grad_norm": 0.7080599069595337, + "learning_rate": 9.764961976445842e-06, + "loss": 0.7141, + "step": 7360 + }, + { + "epoch": 0.39686219538494716, + "grad_norm": 0.8234174847602844, + "learning_rate": 9.764897735510687e-06, + "loss": 0.7984, + "step": 7361 + }, + { + "epoch": 0.3969161095535907, + "grad_norm": 0.684299647808075, + "learning_rate": 9.76483348600889e-06, + "loss": 0.7044, + "step": 7362 + }, + { + "epoch": 0.3969700237222342, + "grad_norm": 0.6945247650146484, + "learning_rate": 9.764769227940564e-06, + "loss": 0.7601, + "step": 7363 + }, + { + "epoch": 0.3970239378908777, + "grad_norm": 0.7135429382324219, + "learning_rate": 9.764704961305824e-06, + "loss": 0.7875, + "step": 7364 + }, + { + "epoch": 0.39707785205952123, + "grad_norm": 0.8100901246070862, + "learning_rate": 9.764640686104789e-06, + "loss": 0.862, + "step": 7365 + }, + { + "epoch": 0.39713176622816476, + "grad_norm": 0.8199816346168518, + "learning_rate": 9.764576402337573e-06, + "loss": 0.837, + "step": 7366 + }, + { + "epoch": 0.3971856803968083, + "grad_norm": 0.6950012445449829, + "learning_rate": 9.76451211000429e-06, + "loss": 0.6764, + "step": 7367 + }, + { + "epoch": 0.3972395945654518, + "grad_norm": 0.6791244149208069, + "learning_rate": 9.764447809105058e-06, + "loss": 0.6491, + "step": 7368 + }, + { + "epoch": 0.3972935087340953, + "grad_norm": 0.9352306127548218, + "learning_rate": 9.764383499639991e-06, + "loss": 0.824, + "step": 7369 + }, + { + "epoch": 0.39734742290273883, + 
"grad_norm": 0.8357742428779602, + "learning_rate": 9.764319181609205e-06, + "loss": 0.7885, + "step": 7370 + }, + { + "epoch": 0.39740133707138237, + "grad_norm": 0.8630572557449341, + "learning_rate": 9.764254855012816e-06, + "loss": 0.8129, + "step": 7371 + }, + { + "epoch": 0.3974552512400259, + "grad_norm": 0.7412340044975281, + "learning_rate": 9.764190519850938e-06, + "loss": 0.7661, + "step": 7372 + }, + { + "epoch": 0.3975091654086694, + "grad_norm": 0.855338990688324, + "learning_rate": 9.76412617612369e-06, + "loss": 0.6961, + "step": 7373 + }, + { + "epoch": 0.3975630795773129, + "grad_norm": 0.6861628293991089, + "learning_rate": 9.764061823831184e-06, + "loss": 0.6054, + "step": 7374 + }, + { + "epoch": 0.39761699374595644, + "grad_norm": 0.7501578330993652, + "learning_rate": 9.763997462973537e-06, + "loss": 0.7922, + "step": 7375 + }, + { + "epoch": 0.39767090791459997, + "grad_norm": 0.9399696588516235, + "learning_rate": 9.763933093550866e-06, + "loss": 1.0283, + "step": 7376 + }, + { + "epoch": 0.3977248220832435, + "grad_norm": 0.837867021560669, + "learning_rate": 9.763868715563285e-06, + "loss": 0.7026, + "step": 7377 + }, + { + "epoch": 0.397778736251887, + "grad_norm": 0.7877710461616516, + "learning_rate": 9.763804329010913e-06, + "loss": 0.7742, + "step": 7378 + }, + { + "epoch": 0.3978326504205305, + "grad_norm": 0.7550365924835205, + "learning_rate": 9.763739933893861e-06, + "loss": 0.7672, + "step": 7379 + }, + { + "epoch": 0.39788656458917404, + "grad_norm": 0.6531806588172913, + "learning_rate": 9.763675530212246e-06, + "loss": 0.6251, + "step": 7380 + }, + { + "epoch": 0.3979404787578176, + "grad_norm": 0.7789273262023926, + "learning_rate": 9.763611117966188e-06, + "loss": 0.848, + "step": 7381 + }, + { + "epoch": 0.39799439292646105, + "grad_norm": 0.9140807390213013, + "learning_rate": 9.763546697155798e-06, + "loss": 0.7929, + "step": 7382 + }, + { + "epoch": 0.3980483070951046, + "grad_norm": 0.6922179460525513, + 
"learning_rate": 9.763482267781194e-06, + "loss": 0.6873, + "step": 7383 + }, + { + "epoch": 0.3981022212637481, + "grad_norm": 0.7744159698486328, + "learning_rate": 9.76341782984249e-06, + "loss": 0.826, + "step": 7384 + }, + { + "epoch": 0.39815613543239164, + "grad_norm": 0.7017830014228821, + "learning_rate": 9.763353383339805e-06, + "loss": 0.7118, + "step": 7385 + }, + { + "epoch": 0.3982100496010352, + "grad_norm": 0.7147089242935181, + "learning_rate": 9.763288928273254e-06, + "loss": 0.7416, + "step": 7386 + }, + { + "epoch": 0.39826396376967865, + "grad_norm": 0.7732849717140198, + "learning_rate": 9.763224464642948e-06, + "loss": 0.7861, + "step": 7387 + }, + { + "epoch": 0.3983178779383222, + "grad_norm": 0.7319730520248413, + "learning_rate": 9.76315999244901e-06, + "loss": 0.7648, + "step": 7388 + }, + { + "epoch": 0.3983717921069657, + "grad_norm": 0.7354785799980164, + "learning_rate": 9.76309551169155e-06, + "loss": 0.7933, + "step": 7389 + }, + { + "epoch": 0.39842570627560925, + "grad_norm": 0.6598151922225952, + "learning_rate": 9.76303102237069e-06, + "loss": 0.7448, + "step": 7390 + }, + { + "epoch": 0.3984796204442527, + "grad_norm": 0.8648874759674072, + "learning_rate": 9.762966524486541e-06, + "loss": 0.7273, + "step": 7391 + }, + { + "epoch": 0.39853353461289626, + "grad_norm": 0.9220259785652161, + "learning_rate": 9.762902018039222e-06, + "loss": 0.7899, + "step": 7392 + }, + { + "epoch": 0.3985874487815398, + "grad_norm": 0.7293949127197266, + "learning_rate": 9.762837503028846e-06, + "loss": 0.7769, + "step": 7393 + }, + { + "epoch": 0.3986413629501833, + "grad_norm": 0.7581174373626709, + "learning_rate": 9.762772979455531e-06, + "loss": 0.8273, + "step": 7394 + }, + { + "epoch": 0.39869527711882685, + "grad_norm": 0.6532285213470459, + "learning_rate": 9.762708447319391e-06, + "loss": 0.7079, + "step": 7395 + }, + { + "epoch": 0.39874919128747033, + "grad_norm": 0.8002683520317078, + "learning_rate": 9.762643906620546e-06, + 
"loss": 0.7068, + "step": 7396 + }, + { + "epoch": 0.39880310545611386, + "grad_norm": 0.6997725963592529, + "learning_rate": 9.762579357359107e-06, + "loss": 0.7344, + "step": 7397 + }, + { + "epoch": 0.3988570196247574, + "grad_norm": 0.7591243386268616, + "learning_rate": 9.762514799535194e-06, + "loss": 0.7233, + "step": 7398 + }, + { + "epoch": 0.3989109337934009, + "grad_norm": 0.776656448841095, + "learning_rate": 9.762450233148924e-06, + "loss": 0.8609, + "step": 7399 + }, + { + "epoch": 0.3989648479620444, + "grad_norm": 0.7420164346694946, + "learning_rate": 9.762385658200406e-06, + "loss": 0.8023, + "step": 7400 + }, + { + "epoch": 0.39901876213068793, + "grad_norm": 0.7199655771255493, + "learning_rate": 9.762321074689765e-06, + "loss": 0.8014, + "step": 7401 + }, + { + "epoch": 0.39907267629933146, + "grad_norm": 0.8053634166717529, + "learning_rate": 9.762256482617111e-06, + "loss": 0.8514, + "step": 7402 + }, + { + "epoch": 0.399126590467975, + "grad_norm": 0.9103530645370483, + "learning_rate": 9.762191881982563e-06, + "loss": 0.7841, + "step": 7403 + }, + { + "epoch": 0.3991805046366185, + "grad_norm": 0.8359214663505554, + "learning_rate": 9.762127272786235e-06, + "loss": 0.8384, + "step": 7404 + }, + { + "epoch": 0.399234418805262, + "grad_norm": 0.7621626257896423, + "learning_rate": 9.762062655028246e-06, + "loss": 0.788, + "step": 7405 + }, + { + "epoch": 0.39928833297390554, + "grad_norm": 0.7081224918365479, + "learning_rate": 9.76199802870871e-06, + "loss": 0.785, + "step": 7406 + }, + { + "epoch": 0.39934224714254907, + "grad_norm": 0.7636391520500183, + "learning_rate": 9.761933393827744e-06, + "loss": 0.86, + "step": 7407 + }, + { + "epoch": 0.3993961613111926, + "grad_norm": 0.6978710889816284, + "learning_rate": 9.761868750385464e-06, + "loss": 0.7681, + "step": 7408 + }, + { + "epoch": 0.3994500754798361, + "grad_norm": 0.7182696461677551, + "learning_rate": 9.761804098381987e-06, + "loss": 0.7212, + "step": 7409 + }, + { + "epoch": 
0.3995039896484796, + "grad_norm": 0.8285101652145386, + "learning_rate": 9.761739437817426e-06, + "loss": 0.8119, + "step": 7410 + }, + { + "epoch": 0.39955790381712314, + "grad_norm": 0.7303835153579712, + "learning_rate": 9.7616747686919e-06, + "loss": 0.7873, + "step": 7411 + }, + { + "epoch": 0.39961181798576667, + "grad_norm": 0.7284088730812073, + "learning_rate": 9.761610091005527e-06, + "loss": 0.8272, + "step": 7412 + }, + { + "epoch": 0.3996657321544102, + "grad_norm": 0.8736691474914551, + "learning_rate": 9.761545404758422e-06, + "loss": 0.6722, + "step": 7413 + }, + { + "epoch": 0.3997196463230537, + "grad_norm": 0.7206536531448364, + "learning_rate": 9.761480709950697e-06, + "loss": 0.7323, + "step": 7414 + }, + { + "epoch": 0.3997735604916972, + "grad_norm": 0.760828971862793, + "learning_rate": 9.761416006582474e-06, + "loss": 0.7487, + "step": 7415 + }, + { + "epoch": 0.39982747466034074, + "grad_norm": 0.7757959961891174, + "learning_rate": 9.761351294653867e-06, + "loss": 0.7831, + "step": 7416 + }, + { + "epoch": 0.3998813888289843, + "grad_norm": 0.7302231788635254, + "learning_rate": 9.761286574164993e-06, + "loss": 0.7916, + "step": 7417 + }, + { + "epoch": 0.39993530299762775, + "grad_norm": 0.7968741655349731, + "learning_rate": 9.761221845115967e-06, + "loss": 0.7864, + "step": 7418 + }, + { + "epoch": 0.3999892171662713, + "grad_norm": 0.8659828901290894, + "learning_rate": 9.761157107506907e-06, + "loss": 0.8256, + "step": 7419 + }, + { + "epoch": 0.4000431313349148, + "grad_norm": 0.7657403349876404, + "learning_rate": 9.761092361337928e-06, + "loss": 0.7287, + "step": 7420 + } + ], + "logging_steps": 1, + "max_steps": 74192, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 1855, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + 
"total_flos": 2.1896838931902628e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-7420/training_args.bin b/checkpoint-7420/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..736549377f516c4bc25a43293c6f37ec549a9a60 --- /dev/null +++ b/checkpoint-7420/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb97268504007caea6a1175a54f08b974d7fa47a1a5fb4547021d5b9d223b4a4 +size 7928 diff --git a/checkpoint-7420/zero_to_fp32.py b/checkpoint-7420/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-7420/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
from deepspeed.utils import logger
from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)


@dataclass
class zero_model_state:
    """Fields extracted from one ``*_model_states.pt`` file by ``parse_model_states``."""
    # buffer name -> tensor converted with ``.float()``
    buffers: dict
    # list of ``{param_name: shape}`` dicts (iterated per entry by the merge helpers)
    param_shapes: list
    # list of ``[key, value]`` pairs taken from the checkpoint's "shared_params" mapping
    shared_params: list
    # value stored under DS_VERSION, or None when the key is absent
    ds_version: object
    # ``{param_name: shape}`` for frozen parameters, or None when the key is absent
    frozen_param_shapes: dict
    # ``{param_name: tensor}`` for frozen parameters, or None when the key is absent
    frozen_param_fragments: dict


# Verbosity switch; the command-line entry point overwrites it with ``--debug``.
debug = 0

# load to cpu
device = torch.device('cpu')


def atoi(text):
    """Return ``int(text)`` when ``text`` consists only of digits, otherwise ``text`` unchanged."""
    return int(text) if text.isdigit() else text


def natural_keys(text):
    '''
    alist.sort(key=natural_keys) sorts in human order
    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)
    '''
    return [atoi(c) for c in re.split(r'(\d+)', text)]


def get_model_state_file(checkpoint_dir, zero_stage):
    """
    Return the path of the rank-0 model states file inside ``checkpoint_dir``.

    Args:
        - ``checkpoint_dir``: directory that holds the ``*_model_states.pt`` files
        - ``zero_stage``: ZeRO stage of the checkpoint; it selects the file name

    Raises:
        - ``FileNotFoundError``: the directory or the expected file is missing
        - ``ValueError``: ``zero_stage`` is neither ``<= 2`` nor ``3``
    """
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")

    # there should be only one file
    if zero_stage <= 2:
        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
    elif zero_stage == 3:
        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
    else:
        # without this branch ``file`` would be unbound below
        raise ValueError(f"unknown zero stage {zero_stage}")

    if not os.path.exists(file):
        raise FileNotFoundError(f"can't find model states file at '{file}'")

    return file


def get_checkpoint_files(checkpoint_dir, glob_pattern):
    """
    Return the files in ``checkpoint_dir`` matching ``glob_pattern``, sorted with ``natural_keys``.

    Raises:
        - ``FileNotFoundError``: nothing matches the pattern
    """
    # XXX: need to test that this simple glob rule works for multi-node setup too
    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)

    if len(ckpt_files) == 0:
        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")

    return ckpt_files


def get_optim_files(checkpoint_dir):
    """Return the naturally sorted ``*_optim_states.pt`` files found in ``checkpoint_dir``."""
    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")


def get_model_state_files(checkpoint_dir):
    """Return the naturally sorted ``*_model_states.pt`` files found in ``checkpoint_dir``."""
    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")


def parse_model_states(files):
    """
    Load every model states file and return one ``zero_model_state`` per file, in order.

    Raises:
        - ``ValueError``: a file does not contain the ``BUFFER_NAMES`` key
    """
    zero_model_states = []
    for file in files:
        # NOTE: torch.load unpickles the file; only run this on checkpoints you trust.
        state_dict = torch.load(file, map_location=device)

        if BUFFER_NAMES not in state_dict:
            raise ValueError(f"{file} is not a model state checkpoint")
        buffer_names = state_dict[BUFFER_NAMES]
        if debug:
            print("Found buffers:", buffer_names)

        # recover just the buffers while restoring them to fp32 if they were saved in fp16
        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
        param_shapes = state_dict[PARAM_SHAPES]

        # frozen parameters are optional in the checkpoint
        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
        if frozen_param_shapes is not None and debug:
            print(f"Found frozen_param_shapes: {frozen_param_shapes}")

        # handle shared params
        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]

        ds_version = state_dict.get(DS_VERSION, None)

        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)

        z_model_state = zero_model_state(buffers=buffers,
                                         param_shapes=param_shapes,
                                         shared_params=shared_params,
                                         ds_version=ds_version,
                                         frozen_param_shapes=frozen_param_shapes,
                                         frozen_param_fragments=frozen_param_fragments)
        zero_model_states.append(z_model_state)

    return zero_model_states


def parse_optim_states(files, ds_checkpoint_dir):
    """
    Load the optimizer state files and pull out what is needed to rebuild the fp32 weights.

    Args:
        - ``files``: paths of the ``*_optim_states.pt`` files
        - ``ds_checkpoint_dir``: checkpoint folder, used only in the error message

    Returns:
        - ``(zero_stage, world_size, fp32_flat_groups)`` where ``fp32_flat_groups`` has one entry per file:
          the stored list of groups for stage <= 2, or a single concatenated tensor for stage 3

    Raises:
        - ``ValueError``: the first file lacks ``ZERO_STAGE``, the number of files differs from the
          recorded partition count, or the stage is neither ``<= 2`` nor ``3``
    """
    total_files = len(files)
    state_dicts = []
    for f in files:
        # NOTE: torch.load unpickles the file; only run this on checkpoints you trust.
        state_dict = torch.load(f, map_location=device)
        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
        # and also handle the case where it was already removed by another helper script
        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
        state_dicts.append(state_dict)

    if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]

    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.
    if isinstance(world_size, list):
        world_size = max(world_size)

    if world_size != total_files:
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )

    # the groups are named differently in each stage
    if zero_stage <= 2:
        fp32_flat_groups = [sd[OPTIMIZER_STATE_DICT][SINGLE_PARTITION_OF_FP32_GROUPS] for sd in state_dicts]
    elif zero_stage == 3:
        # if there is more than one param group, there will be multiple flattened tensors - one
        # flattened tensor per group - for simplicity merge them into a single tensor
        #
        # XXX: could make the script more memory efficient for when there are multiple groups - it
        # will require matching the sub-lists of param_shapes for each param group flattened tensor
        fp32_flat_groups = [torch.cat(sd[OPTIMIZER_STATE_DICT][FP32_FLAT_GROUPS], 0) for sd in state_dicts]
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    return zero_stage, world_size, fp32_flat_groups


def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
    """
    Returns fp32 state_dict reconstructed from ds checkpoint

    Args:
        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
        - ``exclude_frozen_parameters``: when true, frozen parameters are not merged into the result

    """
    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")

    optim_files = get_optim_files(ds_checkpoint_dir)
    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")

    model_files = get_model_state_files(ds_checkpoint_dir)

    zero_model_states = parse_model_states(model_files)
    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')

    # parse_optim_states has already rejected any stage other than <= 2 or 3
    if zero_stage <= 2:
        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                                          exclude_frozen_parameters)
    elif zero_stage == 3:
        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                                          exclude_frozen_parameters)


def _zero2_merge_frozen_params(state_dict, zero_model_states):
    """Copy rank 0's frozen parameter tensors into ``state_dict``; a no-op when none are recorded."""
    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
    if frozen_param_shapes is None or len(frozen_param_shapes) == 0:
        return

    frozen_param_fragments = zero_model_states[0].frozen_param_fragments

    if debug:
        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

    wanted_params = len(frozen_param_shapes)
    wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
    avail_numel = sum(p.numel() for p in frozen_param_fragments.values())
    print(f'Frozen params: Have {avail_numel} numels to process.')
    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in frozen_param_shapes.items():
        total_params += 1
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel

        # the rank-0 fragment is stored as-is, without any concatenation
        state_dict[name] = frozen_param_fragments[name]

        if debug:
            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")


def _has_callable(obj, fn):
    """Return True when ``obj`` has an attribute named ``fn`` that is callable."""
    attr = getattr(obj, fn, None)
    return callable(attr)


def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """
    Rebuild every trainable parameter of a stage <= 2 checkpoint and store it in ``state_dict``.

    For each param group the per-rank partitions are concatenated into one flat vector, then each
    parameter is cut out of that vector in ``param_shapes`` order.

    Raises:
        - ``ValueError``: after alignment to ``2 * world_size`` the consumed element count of a group
          differs from the number of elements available in it
    """
    param_shapes = zero_model_states[0].param_shapes

    # Reconstruction protocol:
    #
    # XXX: document this

    if debug:
        for i in range(world_size):
            for j in range(len(fp32_flat_groups[0])):
                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")

    # XXX: memory usage doubles here (zero2)
    num_param_groups = len(fp32_flat_groups[0])
    merged_single_partition_of_fp32_groups = []
    for i in range(num_param_groups):
        merged_partitions = [sd[i] for sd in fp32_flat_groups]
        merged_single_partition_of_fp32_groups.append(torch.cat(merged_partitions, 0))
    avail_numel = sum(flat_vector.numel() for flat_vector in merged_single_partition_of_fp32_groups)

    if debug:
        wanted_params = sum(len(shapes) for shapes in param_shapes)
        wanted_numel = sum(sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes)
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
    # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
    # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
    # live optimizer object, so we are checking that the numbers are within the right range
    align_to = 2 * world_size

    def zero2_align(x):
        return align_to * math.ceil(x / align_to)

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    total_numel = 0
    total_params = 0
    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
        offset = 0
        avail_numel = full_single_fp32_vector.numel()
        for name, shape in shapes.items():

            # shapes that do not expose ``numel()`` are treated as plain sequences of dimensions
            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
            total_numel += unpartitioned_numel
            total_params += 1

            if debug:
                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
            offset += unpartitioned_numel

        if debug:
            print(f"original offset={offset}, avail_numel={avail_numel}")

        offset = zero2_align(offset)
        avail_numel = zero2_align(avail_numel)

        if debug:
            print(f"aligned offset={offset}, avail_numel={avail_numel}")

        # Sanity check
        if offset != avail_numel:
            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")


def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """
    Assemble the consolidated state dict of a stage <= 2 checkpoint.

    Entries are added in this order: buffers, frozen parameters (unless excluded), trainable
    parameters, then aliases for shared parameters whose target is present.
    """
    state_dict = OrderedDict()

    # buffers
    buffers = zero_model_states[0].buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero2_merge_frozen_params(state_dict, zero_model_states)

    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters
    for pair in zero_model_states[0].shared_params:
        if pair[1] in state_dict:
            state_dict[pair[0]] = state_dict[pair[1]]

    return state_dict


def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """
    Return ``(partitioned_numel, padding_numel)`` for a parameter split across ``world_size`` ranks.

    ``partitioned_numel`` is ``ceil(unpartitioned_numel / world_size)`` and ``padding_numel`` is the
    number of elements needed to round ``unpartitioned_numel`` up to a multiple of ``world_size``.
    """
    remainder = unpartitioned_numel % world_size
    padding_numel = (world_size - remainder) if remainder else 0
    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
    return partitioned_numel, padding_numel


def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
    """
    Rebuild the frozen parameters of a stage 3 checkpoint and store them in ``state_dict``.

    Each parameter is the concatenation of its fragment from every model state, trimmed to the
    unpartitioned element count and viewed with the recorded shape. A no-op when rank 0 records no
    frozen parameters.
    """
    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
    if frozen_param_shapes is None or len(frozen_param_shapes) == 0:
        return

    if debug:
        for i in range(world_size):
            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

    wanted_params = len(frozen_param_shapes)
    wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
    avail_numel = sum(p.numel() for p in zero_model_states[0].frozen_param_fragments.values()) * world_size
    print(f'Frozen params: Have {avail_numel} numels to process.')
    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in frozen_param_shapes.items():
        total_params += 1
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel

        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
        # narrow() drops whatever trails the real elements after concatenation
        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")


def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """
    Rebuild every trainable parameter of a stage 3 checkpoint and store it in ``state_dict``.

    Raises:
        - ``ValueError``: the number of consumed elements differs from
          ``fp32_flat_groups[0].numel() * world_size``
    """
    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
    # param, re-consolidating each param, while dealing with padding if any

    # merge list of dicts, preserving order
    param_shapes = {k: v for d in zero_model_states[0].param_shapes for k, v in d.items()}

    if debug:
        for i in range(world_size):
            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")

    wanted_params = len(param_shapes)
    wanted_numel = sum(shape.numel() for shape in param_shapes.values())
    # not asserting if there is a mismatch due to possible padding
    avail_numel = fp32_flat_groups[0].numel() * world_size
    print(f"Trainable params: Have {avail_numel} numels to process.")
    print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    offset = 0
    total_numel = 0
    total_params = 0
    for name, shape in param_shapes.items():

        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel
        total_params += 1

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

        # XXX: memory usage doubles here
        state_dict[name] = torch.cat(
            tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
            0).narrow(0, 0, unpartitioned_numel).view(shape)
        offset += partitioned_numel

    # ``offset`` counted elements per rank; scale it to compare against the total
    offset *= world_size

    # Sanity check
    if offset != avail_numel:
        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")


def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """
    Assemble the consolidated state dict of a stage 3 checkpoint.

    Entries are added in this order: buffers, frozen parameters (unless excluded), trainable
    parameters, then aliases for shared parameters whose target is present.
    """
    state_dict = OrderedDict()

    # buffers
    buffers = zero_model_states[0].buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)

    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters
    for pair in zero_model_states[0].shared_params:
        if pair[1] in state_dict:
            state_dict[pair[0]] = state_dict[pair[1]]

    return state_dict


def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
    """
    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
    via a model hub.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
        - ``exclude_frozen_parameters``: exclude frozen parameters

    Returns:
        - pytorch ``state_dict``

    Raises:
        - ``ValueError``: ``tag`` is None and there is no ``latest`` file in ``checkpoint_dir``
        - ``FileNotFoundError``: the ``checkpoint_dir/tag`` folder doesn't exist

    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
    the checkpoint.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        # do the training and checkpoint saving
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
        model = model.cpu() # move to cpu
        model.load_state_dict(state_dict)
        # submit to model hub or save the model to share with others

    In this example the ``model`` will no longer be usable in the deepspeed context of the same
    application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.

    """
    if tag is None:
        latest_path = os.path.join(checkpoint_dir, 'latest')
        if not os.path.isfile(latest_path):
            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
        with open(latest_path, 'r') as fd:
            tag = fd.read().strip()

    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)

    if not os.path.isdir(ds_checkpoint_dir):
        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")

    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)


def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
    """
    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
        - ``exclude_frozen_parameters``: exclude frozen parameters
    """

    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
    print(f"Saving fp32 state dict to {output_file}")
    torch.save(state_dict, output_file)


def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
    """
    1. Put the provided model to cpu
    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
    3. Load it into the provided model

    Args:
        - ``model``: the model object to update
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``

    Returns:
        - ``model``: modified model

    Make sure you have plenty of CPU memory available before you call this function. If you don't
    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
    conveniently placed for you in the checkpoint folder.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
        # submit to model hub or save the model to share with others

    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    """
    logger.info("Extracting fp32 weights")
    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    logger.info("Overwriting model with fp32 weights")
    model = model.cpu()
    # strict=False: keys that don't match between the model and the state dict are not an error
    model.load_state_dict(state_dict, strict=False)

    return model


if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("checkpoint_dir",
                        type=str,
                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
    parser.add_argument(
        "output_file",
        type=str,
        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
    parser.add_argument("-t",
                        "--tag",
                        type=str,
                        default=None,
                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
    args = parser.parse_args()

    # module-level switch read by the helpers above
    debug = args.debug

    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
                                               args.output_file,
                                               tag=args.tag,
                                               exclude_frozen_parameters=args.exclude_frozen_parameters)